Go to the documentation of this file.
/* Declares the prototypes of the two Hadamard-difference entry points for one
 * CPU flavour: ff_hadamard8_diff_<cpu> and ff_hadamard8_diff16_<cpu>.
 * The cpu argument is token-pasted onto the function names; both functions take
 * a context pointer, two const pixel pointers, a stride and a height, and
 * return int. Only declarations are produced here, no definitions. */
80 #define hadamard_func(cpu) \
81 int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, const uint8_t *src1, \
82 const uint8_t *src2, ptrdiff_t stride, int h); \
83 int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, const uint8_t *src1, \
84 const uint8_t *src2, ptrdiff_t stride, int h);
/* nsse16_mmx: combines two scores into one int result.
 * NOTE(review): this listing is incomplete - the end of the parameter list,
 * the computation of score2 and the condition choosing between the two return
 * statements are not visible here; confirm against the full file. */
91 static int nsse16_mmx(
MpegEncContext *
c,
const uint8_t *pix1,
const uint8_t *pix2,
/* score1 is whatever the sse[0] comparison function stored in c->mecc returns
 * for the same context, pixel pointers, stride and height. */
97 score1 =
c->mecc.sse[0](
c, pix1, pix2,
stride,
h);
/* First return: |score2| is weighted by nsse_weight read from c->avctx. */
104 return score1 +
FFABS(score2) *
c->avctx->nsse_weight;
/* Second return: |score2| is weighted by the fixed factor 8 instead
 * (presumably the path taken when no context is available - TODO confirm). */
106 return score1 +
FFABS(score2) * 8;
/* nsse8_mmx: returns score1 plus a weighted |score2|, with the same two return
 * forms as nsse16_mmx.
 * NOTE(review): the computation of score1 and score2 and the condition that
 * selects the return statement are not visible in this listing. */
109 static int nsse8_mmx(
MpegEncContext *
c,
const uint8_t *pix1,
const uint8_t *pix2,
/* Weighted by nsse_weight read from c->avctx. */
117 return score1 +
FFABS(score2) *
c->avctx->nsse_weight;
/* Weighted by the fixed factor 8. */
119 return score1 +
FFABS(score2) * 8;
/* Three 64-bit constants; each one repeats the same 16-bit value (0, 1, 2) in
 * all four 16-bit lanes. NOTE(review): the array declaration is not visible in
 * this listing; presumably this is the round_tab table whose element [2] is
 * passed to the asm in sad8_4_mmx - confirm. */
127 0x0000000000000000ULL,
128 0x0001000100010001ULL,
129 0x0002000200020002ULL,
/* sad8_4_mmx: MMX inline-asm kernel that accumulates, into mm6, the absolute
 * byte differences between an 8-byte row loaded via operand %3 and the rounded
 * four-sample average built from two 8-byte rows (operands %1 and %2) and their
 * neighbours one byte further.
 * The unpack instructions use mm7 as the zero extension source and results are
 * added to mm6; both registers are cleared by the sad*_xy2 wrappers before
 * this function is called.
 * NOTE(review): the remaining parameters, the asm setup, the loop control and
 * most of the operand list are missing from this listing; operand roles below
 * are read off the visible instructions only. */
132 static inline void sad8_4_mmx(
const uint8_t *blk1,
const uint8_t *blk2,
/* Row A: load 8 bytes at offset 0 and 8 bytes at offset 1 from operand %1. */
137 "movq (%1, %%"FF_REG_a
"), %%mm0\n\t"
138 "movq 1(%1, %%"FF_REG_a
"), %%mm2\n\t"
139 "movq %%mm0, %%mm1 \n\t"
140 "movq %%mm2, %%mm3 \n\t"
/* Widen both loads to 16-bit words (low half / high half) using mm7. */
141 "punpcklbw %%mm7, %%mm0 \n\t"
142 "punpckhbw %%mm7, %%mm1 \n\t"
143 "punpcklbw %%mm7, %%mm2 \n\t"
144 "punpckhbw %%mm7, %%mm3 \n\t"
/* mm0/mm1 = per-column sum of each byte and its right-hand neighbour. */
145 "paddw %%mm2, %%mm0 \n\t"
146 "paddw %%mm3, %%mm1 \n\t"
/* Row B: same load/widen/add sequence for operand %2, result in mm2/mm3. */
149 "movq (%2, %%"FF_REG_a
"), %%mm2\n\t"
150 "movq 1(%2, %%"FF_REG_a
"), %%mm4\n\t"
151 "movq %%mm2, %%mm3 \n\t"
152 "movq %%mm4, %%mm5 \n\t"
153 "punpcklbw %%mm7, %%mm2 \n\t"
154 "punpckhbw %%mm7, %%mm3 \n\t"
155 "punpcklbw %%mm7, %%mm4 \n\t"
156 "punpckhbw %%mm7, %%mm5 \n\t"
157 "paddw %%mm4, %%mm2 \n\t"
158 "paddw %%mm5, %%mm3 \n\t"
/* Add row A + row B sums plus the constant loaded from operand %5
 * (presumably round_tab[2], the last operand listed - confirm numbering). */
159 "movq %5, %%mm5 \n\t"
160 "paddw %%mm2, %%mm0 \n\t"
161 "paddw %%mm3, %%mm1 \n\t"
162 "paddw %%mm5, %%mm0 \n\t"
163 "paddw %%mm5, %%mm1 \n\t"
/* Load the 8 comparison bytes from operand %3 twice (mm4 and mm5), because
 * the two saturating subtractions below each destroy one operand. */
164 "movq (%3, %%"FF_REG_a
"), %%mm4 \n\t"
165 "movq (%3, %%"FF_REG_a
"), %%mm5 \n\t"
/* Divide the four-sample sums by 4 and pack back to 8 unsigned bytes. */
166 "psrlw $2, %%mm0 \n\t"
167 "psrlw $2, %%mm1 \n\t"
168 "packuswb %%mm1, %%mm0 \n\t"
/* |a - b| per byte: saturating subtract in both directions, then OR. */
169 "psubusb %%mm0, %%mm4 \n\t"
170 "psubusb %%mm5, %%mm0 \n\t"
171 "por %%mm4, %%mm0 \n\t"
/* Widen the absolute differences to words and accumulate them in mm6. */
172 "movq %%mm0, %%mm4 \n\t"
173 "punpcklbw %%mm7, %%mm0 \n\t"
174 "punpckhbw %%mm7, %%mm4 \n\t"
175 "paddw %%mm0, %%mm6 \n\t"
176 "paddw %%mm4, %%mm6 \n\t"
/* Keep row B's pair sums in mm0/mm1 (presumably so the next iteration can
 * reuse them as row A - the loop branch is not visible here). */
177 "movq %%mm2, %%mm0 \n\t"
178 "movq %%mm3, %%mm1 \n\t"
/* Advance the running offset register by operand %4. */
179 "add %4, %%"FF_REG_a
" \n\t"
/* Visible tail of the operand list: stride in a register and round_tab[2]
 * as a memory operand. */
183 "r" (
stride),
"m" (round_tab[2]));
/* sum_mmx: folds the four 16-bit lanes of the mm6 accumulator together and
 * moves the low 32 bits of mm6 to asm output operand %0.
 * After the two shift-and-add steps the low 16-bit lane holds the sum of all
 * four lanes (modulo 2^16); the bits above it are partial sums.
 * NOTE(review): the declaration of the output variable and the return
 * statement are not visible in this listing, so whether the result is masked
 * to 16 bits cannot be confirmed here. */
186 static inline int sum_mmx(
void)
/* Add the upper 32 bits of mm6 onto the lower 32 bits, lane by lane. */
190 "movq %%mm6, %%mm0 \n\t"
191 "psrlq $32, %%mm6 \n\t"
192 "paddw %%mm0, %%mm6 \n\t"
/* Add lane 1 onto lane 0. */
193 "movq %%mm6, %%mm0 \n\t"
194 "psrlq $16, %%mm6 \n\t"
195 "paddw %%mm0, %%mm6 \n\t"
/* Copy the low 32 bits of mm6 to the output operand. */
196 "movd %%mm6, %0 \n\t"
/* PIX_SADXY(suf): generates sad8_xy2_<suf> and sad16_xy2_<suf>.
 * Each generated function clears mm7 and mm6 with pxor, runs the
 * sad8_4_<suf> kernel and returns sum_<suf>(). The 8-wide variant calls the
 * kernel once; the 16-wide variant calls it a second time with both block
 * pointers advanced by 8.
 * Note the parameter order: the signatures take blk2 before blk1, while the
 * kernel is called with (blk1, blk2, stride, h). The context parameter v is
 * not used in the visible lines.
 * NOTE(review): braces and the asm statement wrappers are missing from this
 * listing. */
201 #define PIX_SADXY(suf) \
202 static int sad8_xy2_ ## suf(MpegEncContext *v, const uint8_t *blk2, \
203 const uint8_t *blk1, ptrdiff_t stride, int h) \
206 "pxor %%mm7, %%mm7 \n\t" \
207 "pxor %%mm6, %%mm6 \n\t" \
210 sad8_4_ ## suf(blk1, blk2, stride, h); \
212 return sum_ ## suf(); \
215 static int sad16_xy2_ ## suf(MpegEncContext *v, const uint8_t *blk2, \
216 const uint8_t *blk1, ptrdiff_t stride, int h) \
219 "pxor %%mm7, %%mm7 \n\t" \
220 "pxor %%mm6, %%mm6 \n\t" \
223 sad8_4_ ## suf(blk1, blk2, stride, h); \
224 sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
226 return sum_ ## suf(); \
/* Function-pointer assignments into the context c.
 * NOTE(review): the enclosing function header and the CPU-flag conditions
 * guarding each group are not visible in this listing; presumably this is the
 * body of ff_me_cmp_init_x86 - confirm against the full file. */
/* Slot [3] of pix_abs: index 0 gets the 16-wide xy2 SAD, index 1 the 8-wide. */
239 c->pix_abs[0][3] = sad16_xy2_mmx;
240 c->pix_abs[1][3] = sad8_xy2_mmx;
/* nsse: index 0 gets the 16-wide function, index 1 the 8-wide one. */
248 c->nsse[0] = nsse16_mmx;
249 c->nsse[1] = nsse8_mmx;
/* The MMXEXT Hadamard functions are installed only when the build does NOT
 * have an aligned stack ... */
254 #if !HAVE_ALIGNED_STACK
255 c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
256 c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
/* ... while the SSE2 and SSSE3 Hadamard functions require an aligned stack. */
285 #if HAVE_ALIGNED_STACK
286 c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
287 c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
305 #if HAVE_ALIGNED_STACK
306 c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
307 c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#define INLINE_MMX(flags)
int ff_sad8_y2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sse16_mmx(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
int ff_vsad_intra16_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad8_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
int ff_vsad_intra8_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
#define DECLARE_ASM_CONST(n, t, v)
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
int ff_hf_noise8_mmx(const uint8_t *pix1, ptrdiff_t stride, int h)
static atomic_int cpu_flags
int ff_sum_abs_dctelem_ssse3(const int16_t *block)
int ff_vsad16_approx_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
int flags
AV_CODEC_FLAG_*.
int ff_hf_noise16_mmx(const uint8_t *pix1, ptrdiff_t stride, int h)
int ff_vsad16_approx_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
#define FFABS(a)
Absolute value. Note: INT_MIN / INT64_MIN result in undefined behavior, as they are not representable as absolute values of their type.
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, accessing outside allocated space, ... Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem nit-picking, but optimizing compilers have in fact optimized code on the assumption that no Undefined Behavior occurs. Optimizing code based on wrong assumptions can lead, and in some cases has led, to effects beyond the output of computations. The signed integer overflow problem in speed-critical code: Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c... [text truncated in the source]
int ff_sad16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
int ff_vsad8_approx_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
#define AV_CPU_FLAG_SSE2SLOW
SSE2 supported, but usually not faster.
#define EXTERNAL_SSE2(flags)
int ff_sad16_y2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_x2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sse8_mmx(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
int ff_vsad_intra16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_y2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
main external API structure.
int ff_sum_abs_dctelem_sse2(const int16_t *block)
int ff_sad8_x2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
#define AV_CODEC_FLAG_BITEXACT
Use only bitexact stuff (except (I)DCT).
#define hadamard_func(cpu)
The exact code depends on how similar the blocks are and how related they are to the block
int ff_sad16_x2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
#define EXTERNAL_SSSE3(flags)
#define EXTERNAL_MMX(flags)
#define EXTERNAL_MMXEXT(flags)
int ff_sse16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)