/*
 * hadamard_func(cpu): declare the prototypes of two routines for one CPU
 * flavour. The `cpu` argument is token-pasted onto the function names, so
 * hadamard_func(x) declares
 *     ff_hadamard8_diff_x()   and
 *     ff_hadamard8_diff16_x().
 * Both return int and take the same parameter list: an MpegEncContext
 * pointer, two uint8_t pointers (src1, src2), a ptrdiff_t stride and an
 * int h.
 *
 * The macro emits declarations only; the matching definitions are not
 * visible in this part of the file (presumably provided by external
 * assembly -- TODO confirm against the build).
 * The expansion already ends with ';', so an invocation does not need a
 * trailing semicolon of its own.
 */
81 #define hadamard_func(cpu) \
82 int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
83 uint8_t *src2, ptrdiff_t stride, int h); \
84 int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
85 uint8_t *src2, ptrdiff_t stride, int h);
108 return score1 +
FFABS(score2) * 8;
114 int score1 =
ff_sse8_mmx(c, pix1, pix2, stride, h);
121 return score1 +
FFABS(score2) * 8;
129 ptrdiff_t stride,
int h)
136 #define SUM(in0, in1, out0, out1) \
137 "movq (%0), %%mm2\n" \
138 "movq 8(%0), %%mm3\n" \
140 "movq %%mm2, " #out0 "\n" \
141 "movq %%mm3, " #out1 "\n" \
142 "psubusb " #in0 ", %%mm2\n" \
143 "psubusb " #in1 ", %%mm3\n" \
144 "psubusb " #out0 ", " #in0 "\n" \
145 "psubusb " #out1 ", " #in1 "\n" \
146 "por %%mm2, " #in0 "\n" \
147 "por %%mm3, " #in1 "\n" \
148 "movq " #in0 ", %%mm2\n" \
149 "movq " #in1 ", %%mm3\n" \
150 "punpcklbw %%mm7, " #in0 "\n" \
151 "punpcklbw %%mm7, " #in1 "\n" \
152 "punpckhbw %%mm7, %%mm2\n" \
153 "punpckhbw %%mm7, %%mm3\n" \
154 "paddw " #in1 ", " #in0 "\n" \
155 "paddw %%mm3, %%mm2\n" \
156 "paddw %%mm2, " #in0 "\n" \
157 "paddw " #in0 ", %%mm6\n"
162 "pxor %%mm6, %%mm6\n"
163 "pxor %%mm7, %%mm7\n"
165 "movq 8(%0), %%mm1\n"
170 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
172 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
177 "movq %%mm6, %%mm0\n"
179 "paddw %%mm6, %%mm0\n"
180 "movq %%mm0, %%mm6\n"
182 "paddw %%mm6, %%mm0\n"
184 :
"+r" (pix),
"=r" (tmp)
193 ptrdiff_t stride,
int h)
201 #define SUM(in0, in1, out0, out1) \
202 "movq (%0), %%mm2\n" \
203 "movq (%1), " #out0 "\n" \
204 "movq 8(%0), %%mm3\n" \
205 "movq 8(%1), " #out1 "\n" \
208 "psubb " #out0 ", %%mm2\n" \
209 "psubb " #out1 ", %%mm3\n" \
210 "pxor %%mm7, %%mm2\n" \
211 "pxor %%mm7, %%mm3\n" \
212 "movq %%mm2, " #out0 "\n" \
213 "movq %%mm3, " #out1 "\n" \
214 "psubusb " #in0 ", %%mm2\n" \
215 "psubusb " #in1 ", %%mm3\n" \
216 "psubusb " #out0 ", " #in0 "\n" \
217 "psubusb " #out1 ", " #in1 "\n" \
218 "por %%mm2, " #in0 "\n" \
219 "por %%mm3, " #in1 "\n" \
220 "movq " #in0 ", %%mm2\n" \
221 "movq " #in1 ", %%mm3\n" \
222 "punpcklbw %%mm7, " #in0 "\n" \
223 "punpcklbw %%mm7, " #in1 "\n" \
224 "punpckhbw %%mm7, %%mm2\n" \
225 "punpckhbw %%mm7, %%mm3\n" \
226 "paddw " #in1 ", " #in0 "\n" \
227 "paddw %%mm3, %%mm2\n" \
228 "paddw %%mm2, " #in0 "\n" \
229 "paddw " #in0 ", %%mm6\n"
234 "pxor %%mm6, %%mm6\n"
235 "pcmpeqw %%mm7, %%mm7\n"
237 "packsswb %%mm7, %%mm7\n"
240 "movq 8(%0), %%mm1\n"
241 "movq 8(%1), %%mm3\n"
244 "psubb %%mm2, %%mm0\n"
245 "psubb %%mm3, %%mm1\n"
246 "pxor %%mm7, %%mm0\n"
247 "pxor %%mm7, %%mm1\n"
251 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
253 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
258 "movq %%mm6, %%mm0\n"
260 "paddw %%mm6, %%mm0\n"
261 "movq %%mm0, %%mm6\n"
263 "paddw %%mm6, %%mm0\n"
265 :
"+r" (pix1),
"+r" (pix2),
"=r" (
tmp)
266 :
"r" (stride),
"m" (
h)
274 0x0000000000000000ULL,
275 0x0001000100010001ULL,
276 0x0002000200020002ULL,
280 ptrdiff_t stride,
int h)
286 "movq (%1, %%"FF_REG_a
"), %%mm0 \n\t"
287 "movq (%2, %%"FF_REG_a
"), %%mm2 \n\t"
288 "movq (%2, %%"FF_REG_a
"), %%mm4 \n\t"
289 "add %3, %%"FF_REG_a
" \n\t"
290 "psubusb %%mm0, %%mm2 \n\t"
291 "psubusb %%mm4, %%mm0 \n\t"
292 "movq (%1, %%"FF_REG_a
"), %%mm1 \n\t"
293 "movq (%2, %%"FF_REG_a
"), %%mm3 \n\t"
294 "movq (%2, %%"FF_REG_a
"), %%mm5 \n\t"
295 "psubusb %%mm1, %%mm3 \n\t"
296 "psubusb %%mm5, %%mm1 \n\t"
297 "por %%mm2, %%mm0 \n\t"
298 "por %%mm1, %%mm3 \n\t"
299 "movq %%mm0, %%mm1 \n\t"
300 "movq %%mm3, %%mm2 \n\t"
301 "punpcklbw %%mm7, %%mm0 \n\t"
302 "punpckhbw %%mm7, %%mm1 \n\t"
303 "punpcklbw %%mm7, %%mm3 \n\t"
304 "punpckhbw %%mm7, %%mm2 \n\t"
305 "paddw %%mm1, %%mm0 \n\t"
306 "paddw %%mm3, %%mm2 \n\t"
307 "paddw %%mm2, %%mm0 \n\t"
308 "paddw %%mm0, %%mm6 \n\t"
309 "add %3, %%"FF_REG_a
" \n\t"
312 :
"r" (blk1 - len),
"r" (blk2 -
len),
"r" (stride));
316 ptrdiff_t stride,
int h)
322 "movq (%1, %%"FF_REG_a
"), %%mm0 \n\t"
323 "movq (%2, %%"FF_REG_a
"), %%mm1 \n\t"
324 "movq (%1, %%"FF_REG_a
"), %%mm2 \n\t"
325 "movq (%2, %%"FF_REG_a
"), %%mm3 \n\t"
326 "punpcklbw %%mm7, %%mm0 \n\t"
327 "punpcklbw %%mm7, %%mm1 \n\t"
328 "punpckhbw %%mm7, %%mm2 \n\t"
329 "punpckhbw %%mm7, %%mm3 \n\t"
330 "paddw %%mm0, %%mm1 \n\t"
331 "paddw %%mm2, %%mm3 \n\t"
332 "movq (%3, %%"FF_REG_a
"), %%mm4 \n\t"
333 "movq (%3, %%"FF_REG_a
"), %%mm2 \n\t"
334 "paddw %%mm5, %%mm1 \n\t"
335 "paddw %%mm5, %%mm3 \n\t"
336 "psrlw $1, %%mm1 \n\t"
337 "psrlw $1, %%mm3 \n\t"
338 "packuswb %%mm3, %%mm1 \n\t"
339 "psubusb %%mm1, %%mm4 \n\t"
340 "psubusb %%mm2, %%mm1 \n\t"
341 "por %%mm4, %%mm1 \n\t"
342 "movq %%mm1, %%mm0 \n\t"
343 "punpcklbw %%mm7, %%mm0 \n\t"
344 "punpckhbw %%mm7, %%mm1 \n\t"
345 "paddw %%mm1, %%mm0 \n\t"
346 "paddw %%mm0, %%mm6 \n\t"
347 "add %4, %%"FF_REG_a
" \n\t"
350 :
"r" (blk1a - len),
"r" (blk1b -
len),
"r" (blk2 - len),
355 ptrdiff_t stride,
int h)
359 "movq (%1, %%"FF_REG_a
"), %%mm0\n\t"
360 "movq 1(%1, %%"FF_REG_a
"), %%mm2\n\t"
361 "movq %%mm0, %%mm1 \n\t"
362 "movq %%mm2, %%mm3 \n\t"
363 "punpcklbw %%mm7, %%mm0 \n\t"
364 "punpckhbw %%mm7, %%mm1 \n\t"
365 "punpcklbw %%mm7, %%mm2 \n\t"
366 "punpckhbw %%mm7, %%mm3 \n\t"
367 "paddw %%mm2, %%mm0 \n\t"
368 "paddw %%mm3, %%mm1 \n\t"
371 "movq (%2, %%"FF_REG_a
"), %%mm2\n\t"
372 "movq 1(%2, %%"FF_REG_a
"), %%mm4\n\t"
373 "movq %%mm2, %%mm3 \n\t"
374 "movq %%mm4, %%mm5 \n\t"
375 "punpcklbw %%mm7, %%mm2 \n\t"
376 "punpckhbw %%mm7, %%mm3 \n\t"
377 "punpcklbw %%mm7, %%mm4 \n\t"
378 "punpckhbw %%mm7, %%mm5 \n\t"
379 "paddw %%mm4, %%mm2 \n\t"
380 "paddw %%mm5, %%mm3 \n\t"
381 "movq %5, %%mm5 \n\t"
382 "paddw %%mm2, %%mm0 \n\t"
383 "paddw %%mm3, %%mm1 \n\t"
384 "paddw %%mm5, %%mm0 \n\t"
385 "paddw %%mm5, %%mm1 \n\t"
386 "movq (%3, %%"FF_REG_a
"), %%mm4 \n\t"
387 "movq (%3, %%"FF_REG_a
"), %%mm5 \n\t"
388 "psrlw $2, %%mm0 \n\t"
389 "psrlw $2, %%mm1 \n\t"
390 "packuswb %%mm1, %%mm0 \n\t"
391 "psubusb %%mm0, %%mm4 \n\t"
392 "psubusb %%mm5, %%mm0 \n\t"
393 "por %%mm4, %%mm0 \n\t"
394 "movq %%mm0, %%mm4 \n\t"
395 "punpcklbw %%mm7, %%mm0 \n\t"
396 "punpckhbw %%mm7, %%mm4 \n\t"
397 "paddw %%mm0, %%mm6 \n\t"
398 "paddw %%mm4, %%mm6 \n\t"
399 "movq %%mm2, %%mm0 \n\t"
400 "movq %%mm3, %%mm1 \n\t"
401 "add %4, %%"FF_REG_a
" \n\t"
404 :
"r" (blk1 - len),
"r" (blk1 - len +
stride),
"r" (blk2 - len),
405 "r" (
stride),
"m" (round_tab[2]));
408 static inline int sum_mmx(
void)
412 "movq %%mm6, %%mm0 \n\t"
413 "psrlq $32, %%mm6 \n\t"
414 "paddw %%mm0, %%mm6 \n\t"
415 "movq %%mm6, %%mm0 \n\t"
416 "psrlq $16, %%mm6 \n\t"
417 "paddw %%mm0, %%mm6 \n\t"
418 "movd %%mm6, %0 \n\t"
424 ptrdiff_t stride,
int h)
426 sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
430 ptrdiff_t stride,
int h)
432 sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
435 #define PIX_SAD(suf) \
436 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
437 uint8_t *blk1, ptrdiff_t stride, int h) \
439 av_assert2(h == 8); \
441 "pxor %%mm7, %%mm7 \n\t" \
442 "pxor %%mm6, %%mm6 \n\t" \
445 sad8_1_ ## suf(blk1, blk2, stride, 8); \
447 return sum_ ## suf(); \
450 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
451 uint8_t *blk1, ptrdiff_t stride, int h) \
453 av_assert2(h == 8); \
455 "pxor %%mm7, %%mm7 \n\t" \
456 "pxor %%mm6, %%mm6 \n\t" \
457 "movq %0, %%mm5 \n\t" \
458 :: "m" (round_tab[1])); \
460 sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
462 return sum_ ## suf(); \
465 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
466 uint8_t *blk1, ptrdiff_t stride, int h) \
468 av_assert2(h == 8); \
470 "pxor %%mm7, %%mm7 \n\t" \
471 "pxor %%mm6, %%mm6 \n\t" \
472 "movq %0, %%mm5 \n\t" \
473 :: "m" (round_tab[1])); \
475 sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
477 return sum_ ## suf(); \
480 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
481 uint8_t *blk1, ptrdiff_t stride, int h) \
483 av_assert2(h == 8); \
485 "pxor %%mm7, %%mm7 \n\t" \
486 "pxor %%mm6, %%mm6 \n\t" \
489 sad8_4_ ## suf(blk1, blk2, stride, 8); \
491 return sum_ ## suf(); \
494 static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
495 uint8_t *blk1, ptrdiff_t stride, int h) \
498 "pxor %%mm7, %%mm7 \n\t" \
499 "pxor %%mm6, %%mm6 \n\t" \
502 sad8_1_ ## suf(blk1, blk2, stride, h); \
503 sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
505 return sum_ ## suf(); \
508 static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
509 uint8_t *blk1, ptrdiff_t stride, int h) \
512 "pxor %%mm7, %%mm7 \n\t" \
513 "pxor %%mm6, %%mm6 \n\t" \
514 "movq %0, %%mm5 \n\t" \
515 :: "m" (round_tab[1])); \
517 sad8_x2a_ ## suf(blk1, blk2, stride, h); \
518 sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
520 return sum_ ## suf(); \
523 static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
524 uint8_t *blk1, ptrdiff_t stride, int h) \
527 "pxor %%mm7, %%mm7 \n\t" \
528 "pxor %%mm6, %%mm6 \n\t" \
529 "movq %0, %%mm5 \n\t" \
530 :: "m" (round_tab[1])); \
532 sad8_y2a_ ## suf(blk1, blk2, stride, h); \
533 sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
535 return sum_ ## suf(); \
538 static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
539 uint8_t *blk1, ptrdiff_t stride, int h) \
542 "pxor %%mm7, %%mm7 \n\t" \
543 "pxor %%mm6, %%mm6 \n\t" \
546 sad8_4_ ## suf(blk1, blk2, stride, h); \
547 sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
549 return sum_ ## suf(); \
563 c->
pix_abs[0][1] = sad16_x2_mmx;
564 c->
pix_abs[0][2] = sad16_y2_mmx;
565 c->
pix_abs[0][3] = sad16_xy2_mmx;
567 c->
pix_abs[1][1] = sad8_x2_mmx;
568 c->
pix_abs[1][2] = sad8_y2_mmx;
569 c->
pix_abs[1][3] = sad8_xy2_mmx;
571 c->
sad[0] = sad16_mmx;
572 c->
sad[1] = sad8_mmx;
574 c->
vsad[4] = vsad_intra16_mmx;
577 c->
vsad[0] = vsad16_mmx;
590 c->
nsse[0] = nsse16_mmx;
591 c->
nsse[1] = nsse8_mmx;
626 #if HAVE_ALIGNED_STACK
646 #if HAVE_ALIGNED_STACK
#define EXTERNAL_MMX(flags)
int ff_sum_abs_dctelem_mmx(int16_t *block)
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
static atomic_int cpu_flags
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Macro definitions for various function/variable attributes.
me_cmp_func hadamard8_diff[6]
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define av_assert2(cond)
assert() equivalent for assertions that lie in speed-critical code.
#define hadamard_func(cpu)
int ff_sum_abs_dctelem_sse2(int16_t *block)
#define AV_CPU_FLAG_SSE2SLOW
SSE2 supported, but usually not faster.
#define EXTERNAL_SSE2(flags)
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sum_abs_dctelem_ssse3(int16_t *block)
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define INLINE_MMX(flags)
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int flags
AV_CODEC_FLAG_*.
me_cmp_func pix_abs[2][4]
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define DECLARE_ASM_CONST(n, t, v)
Declare a static constant aligned variable appropriate for use in inline assembly code...
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define AV_CODEC_FLAG_BITEXACT
Use only bitexact stuff (except (I)DCT).
int ff_sum_abs_dctelem_mmxext(int16_t *block)
#define FFABS(a)
Absolute value. Note: INT_MIN / INT64_MIN result in undefined behavior, as they are not representable ...
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
main external API structure.
#define EXTERNAL_SSSE3(flags)
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
#define EXTERNAL_MMXEXT(flags)
struct AVCodecContext * avctx
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
GLint GLenum GLboolean GLsizei stride
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int(* sum_abs_dctelem)(int16_t *block)
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)