#define hadamard_func(cpu)                                                  \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
                                  uint8_t *src2, ptrdiff_t stride, int h);  \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,       \
                                    uint8_t *src2, ptrdiff_t stride, int h);
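/* Editor's note: each hadamard_func() invocation declares the two prototypes
 * for one SIMD flavor. As an illustrative sketch (not part of the original
 * source), hadamard_func(mmx) expands to: */
#if 0
int ff_hadamard8_diff_mmx(MpegEncContext *s, uint8_t *src1,
                          uint8_t *src2, ptrdiff_t stride, int h);
int ff_hadamard8_diff16_mmx(MpegEncContext *s, uint8_t *src1,
                            uint8_t *src2, ptrdiff_t stride, int h);
#endif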
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int score1, score2;

    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
    score2 = ff_hf_noise16_mmx(pix1, stride, h) -
             ff_hf_noise16_mmx(pix2, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
    int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
                 ff_hf_noise8_mmx(pix2, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}
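/* Editor's note: every comparator installed below follows the me_cmp_func
 * signature, so callers pick a block size by index and call through the
 * MECmpContext table. A hedged usage sketch; the helper name and the
 * variables are illustrative, not from this file: */
static int nsse16_cost_sketch(MpegEncContext *ctx, uint8_t *cur, uint8_t *ref,
                              ptrdiff_t stride)
{
    /* [0] selects the 16x16 variant; [1] would be the 8x8 one */
    return ctx->mecc.nsse[0](ctx, cur, ref, stride, 16);
}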
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            ptrdiff_t stride, int h)
{
    int tmp;

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), %%mm2\n"                        \
    "movq 8(%0), %%mm3\n"                       \
    "add %2,%0\n"                               \
    "movq %%mm2, " #out0 "\n"                   \
    "movq %%mm3, " #out1 "\n"                   \
    "psubusb " #in0 ", %%mm2\n"                 \
    "psubusb " #in1 ", %%mm3\n"                 \
    "psubusb " #out0 ", " #in0 "\n"             \
    "psubusb " #out1 ", " #in1 "\n"             \
    "por %%mm2, " #in0 "\n"                     \
    "por %%mm3, " #in1 "\n"                     \
    "movq " #in0 ", %%mm2\n"                    \
    "movq " #in1 ", %%mm3\n"                    \
    "punpcklbw %%mm7, " #in0 "\n"               \
    "punpcklbw %%mm7, " #in1 "\n"               \
    "punpckhbw %%mm7, %%mm2\n"                  \
    "punpckhbw %%mm7, %%mm3\n"                  \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw %%mm3, %%mm2\n"                      \
    "paddw %%mm2, " #in0 "\n"                   \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM
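/* Editor's note: a scalar model of what the loop above computes -- the sum
 * of absolute differences between vertically adjacent rows of one block
 * ("intra" vertical SAD). Sketch only (up to loop-boundary details), not
 * part of the original file: */
static int vsad_intra16_ref(const uint8_t *pix, ptrdiff_t stride, int h)
{
    int sum = 0;
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < 16; x++)
            sum += FFABS(pix[x] - pix[x + stride]);
        pix += stride;
    }
    return sum;
}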
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int tmp;

#define SUM(in0, in1, out0, out1)       \
    "movq (%0), %%mm2\n"                \
    "movq (%1), " #out0 "\n"            \
    "movq 8(%0), %%mm3\n"               \
    "movq 8(%1), " #out1 "\n"           \
    "add %3, %0\n"                      \
    "add %3, %1\n"                      \
    "psubb " #out0 ", %%mm2\n"          \
    "psubb " #out1 ", %%mm3\n"          \
    "pxor %%mm7, %%mm2\n"               \
    "pxor %%mm7, %%mm3\n"               \
    "movq %%mm2, " #out0 "\n"           \
    "movq %%mm3, " #out1 "\n"           \
    "psubusb " #in0 ", %%mm2\n"         \
    "psubusb " #in1 ", %%mm3\n"         \
    "psubusb " #out0 ", " #in0 "\n"     \
    "psubusb " #out1 ", " #in1 "\n"     \
    "por %%mm2, " #in0 "\n"             \
    "por %%mm3, " #in1 "\n"             \
    "movq " #in0 ", %%mm2\n"            \
    "movq " #in1 ", %%mm3\n"            \
    "punpcklbw %%mm7, " #in0 "\n"       \
    "punpcklbw %%mm7, " #in1 "\n"       \
    "punpckhbw %%mm7, %%mm2\n"          \
    "punpckhbw %%mm7, %%mm3\n"          \
    "paddw " #in1 ", " #in0 "\n"        \
    "paddw %%mm3, %%mm2\n"              \
    "paddw %%mm2, " #in0 "\n"           \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM
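/* Editor's note: scalar sketch of the inter variant above -- the vertical
 * activity of the difference signal d = pix1 - pix2. The 0x80.. constant
 * built by pcmpeqw/psllw/packsswb biases signed byte differences so the
 * unsigned-saturation |a-b| idiom still applies. Sketch only: */
static int vsad16_ref(const uint8_t *pix1, const uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int sum = 0;
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < 16; x++) {
            int d0 = pix1[x] - pix2[x];
            int d1 = pix1[x + stride] - pix2[x + stride];
            sum += FFABS(d0 - d1);
        }
        pix1 += stride;
        pix2 += stride;
    }
    return sum;
}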
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};
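/* Editor's note: the packed-word constants are the rounding biases for the
 * sub-pel averages: round_tab[1] is the +1 of a rounded halve (x2/y2
 * half-pel), round_tab[2] the +2 of a rounded divide-by-4 (xy2). Scalar
 * meaning, as a sketch: */
static inline int avg2(int a, int b)
{
    return (a + b + 1) >> 1;            /* paddw round_tab[1]; psrlw $1 */
}
static inline int avg4(int a, int b, int c, int d)
{
    return (a + b + c + d + 2) >> 2;    /* paddw round_tab[2]; psrlw $2 */
}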
286 "movq (%1, %%"FF_REG_a
"), %%mm0 \n\t"
287 "movq (%2, %%"FF_REG_a
"), %%mm2 \n\t"
288 "movq (%2, %%"FF_REG_a
"), %%mm4 \n\t"
289 "add %3, %%"FF_REG_a
" \n\t"
290 "psubusb %%mm0, %%mm2 \n\t"
291 "psubusb %%mm4, %%mm0 \n\t"
292 "movq (%1, %%"FF_REG_a
"), %%mm1 \n\t"
293 "movq (%2, %%"FF_REG_a
"), %%mm3 \n\t"
294 "movq (%2, %%"FF_REG_a
"), %%mm5 \n\t"
295 "psubusb %%mm1, %%mm3 \n\t"
296 "psubusb %%mm5, %%mm1 \n\t"
297 "por %%mm2, %%mm0 \n\t"
298 "por %%mm1, %%mm3 \n\t"
299 "movq %%mm0, %%mm1 \n\t"
300 "movq %%mm3, %%mm2 \n\t"
301 "punpcklbw %%mm7, %%mm0 \n\t"
302 "punpckhbw %%mm7, %%mm1 \n\t"
303 "punpcklbw %%mm7, %%mm3 \n\t"
304 "punpckhbw %%mm7, %%mm2 \n\t"
305 "paddw %%mm1, %%mm0 \n\t"
306 "paddw %%mm3, %%mm2 \n\t"
307 "paddw %%mm2, %%mm0 \n\t"
308 "paddw %%mm0, %%mm6 \n\t"
309 "add %3, %%"FF_REG_a
" \n\t"
322 "movq (%1, %%"FF_REG_a
"), %%mm0 \n\t"
323 "movq (%2, %%"FF_REG_a
"), %%mm1 \n\t"
324 "movq (%1, %%"FF_REG_a
"), %%mm2 \n\t"
325 "movq (%2, %%"FF_REG_a
"), %%mm3 \n\t"
326 "punpcklbw %%mm7, %%mm0 \n\t"
327 "punpcklbw %%mm7, %%mm1 \n\t"
328 "punpckhbw %%mm7, %%mm2 \n\t"
329 "punpckhbw %%mm7, %%mm3 \n\t"
330 "paddw %%mm0, %%mm1 \n\t"
331 "paddw %%mm2, %%mm3 \n\t"
332 "movq (%3, %%"FF_REG_a
"), %%mm4 \n\t"
333 "movq (%3, %%"FF_REG_a
"), %%mm2 \n\t"
334 "paddw %%mm5, %%mm1 \n\t"
335 "paddw %%mm5, %%mm3 \n\t"
336 "psrlw $1, %%mm1 \n\t"
337 "psrlw $1, %%mm3 \n\t"
338 "packuswb %%mm3, %%mm1 \n\t"
339 "psubusb %%mm1, %%mm4 \n\t"
340 "psubusb %%mm2, %%mm1 \n\t"
341 "por %%mm4, %%mm1 \n\t"
342 "movq %%mm1, %%mm0 \n\t"
343 "punpcklbw %%mm7, %%mm0 \n\t"
344 "punpckhbw %%mm7, %%mm1 \n\t"
345 "paddw %%mm1, %%mm0 \n\t"
346 "paddw %%mm0, %%mm6 \n\t"
347 "add %4, %%"FF_REG_a
" \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" (stride));
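/* Editor's note: scalar sketch of the averaged SAD computed above. blk1a
 * and blk1b are the two half-pel taps (for x2 they are blk1 and blk1 + 1,
 * for y2 blk1 and blk1 + stride); blk2 is the block being matched: */
static int sad8_2_ref(const uint8_t *blk1a, const uint8_t *blk1b,
                      const uint8_t *blk2, ptrdiff_t stride, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            sum += FFABS(((blk1a[x] + blk1b[x] + 1) >> 1) - blk2[x]);
        blk1a += stride;
        blk1b += stride;
        blk2  += stride;
    }
    return sum;
}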
359 "movq (%1, %%"FF_REG_a
"), %%mm0\n\t"
360 "movq 1(%1, %%"FF_REG_a
"), %%mm2\n\t"
361 "movq %%mm0, %%mm1 \n\t"
362 "movq %%mm2, %%mm3 \n\t"
363 "punpcklbw %%mm7, %%mm0 \n\t"
364 "punpckhbw %%mm7, %%mm1 \n\t"
365 "punpcklbw %%mm7, %%mm2 \n\t"
366 "punpckhbw %%mm7, %%mm3 \n\t"
367 "paddw %%mm2, %%mm0 \n\t"
368 "paddw %%mm3, %%mm1 \n\t"
371 "movq (%2, %%"FF_REG_a
"), %%mm2\n\t"
372 "movq 1(%2, %%"FF_REG_a
"), %%mm4\n\t"
373 "movq %%mm2, %%mm3 \n\t"
374 "movq %%mm4, %%mm5 \n\t"
375 "punpcklbw %%mm7, %%mm2 \n\t"
376 "punpckhbw %%mm7, %%mm3 \n\t"
377 "punpcklbw %%mm7, %%mm4 \n\t"
378 "punpckhbw %%mm7, %%mm5 \n\t"
379 "paddw %%mm4, %%mm2 \n\t"
380 "paddw %%mm5, %%mm3 \n\t"
381 "movq %5, %%mm5 \n\t"
382 "paddw %%mm2, %%mm0 \n\t"
383 "paddw %%mm3, %%mm1 \n\t"
384 "paddw %%mm5, %%mm0 \n\t"
385 "paddw %%mm5, %%mm1 \n\t"
386 "movq (%3, %%"FF_REG_a
"), %%mm4 \n\t"
387 "movq (%3, %%"FF_REG_a
"), %%mm5 \n\t"
388 "psrlw $2, %%mm0 \n\t"
389 "psrlw $2, %%mm1 \n\t"
390 "packuswb %%mm1, %%mm0 \n\t"
391 "psubusb %%mm0, %%mm4 \n\t"
392 "psubusb %%mm5, %%mm0 \n\t"
393 "por %%mm4, %%mm0 \n\t"
394 "movq %%mm0, %%mm4 \n\t"
395 "punpcklbw %%mm7, %%mm0 \n\t"
396 "punpckhbw %%mm7, %%mm4 \n\t"
397 "paddw %%mm0, %%mm6 \n\t"
398 "paddw %%mm4, %%mm6 \n\t"
399 "movq %%mm2, %%mm0 \n\t"
400 "movq %%mm3, %%mm1 \n\t"
401 "add %4, %%"FF_REG_a
" \n\t"
405 "r" (
stride),
"m" (round_tab[2]));
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0  \n\t"
        "psrlq $32, %%mm6   \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm6, %%mm0  \n\t"
        "psrlq $16, %%mm6   \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movd %%mm6, %0     \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}
#define PIX_SAD(suf)                                                   \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
                        uint8_t *blk1, ptrdiff_t stride, int h)        \
{                                                                      \
    av_assert2(h == 8);                                                \
    __asm__ volatile (                                                 \
        "pxor %%mm7, %%mm7 \n\t"                                       \
        "pxor %%mm6, %%mm6 \n\t"                                       \
        :);                                                            \
                                                                       \
    sad8_1_ ## suf(blk1, blk2, stride, 8);                             \
                                                                       \
    return sum_ ## suf();                                              \
}                                                                      \
                                                                       \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                           uint8_t *blk1, ptrdiff_t stride, int h)     \
{                                                                      \
    av_assert2(h == 8);                                                \
    __asm__ volatile (                                                 \
        "pxor %%mm7, %%mm7 \n\t"                                       \
        "pxor %%mm6, %%mm6 \n\t"                                       \
        "movq %0, %%mm5    \n\t"                                       \
        :: "m" (round_tab[1]));                                        \
                                                                       \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                           \
                                                                       \
    return sum_ ## suf();                                              \
}                                                                      \
                                                                       \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                           uint8_t *blk1, ptrdiff_t stride, int h)     \
{                                                                      \
    av_assert2(h == 8);                                                \
    __asm__ volatile (                                                 \
        "pxor %%mm7, %%mm7 \n\t"                                       \
        "pxor %%mm6, %%mm6 \n\t"                                       \
        "movq %0, %%mm5    \n\t"                                       \
        :: "m" (round_tab[1]));                                        \
                                                                       \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                           \
                                                                       \
    return sum_ ## suf();                                              \
}                                                                      \
                                                                       \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                            uint8_t *blk1, ptrdiff_t stride, int h)    \
{                                                                      \
    av_assert2(h == 8);                                                \
    __asm__ volatile (                                                 \
        "pxor %%mm7, %%mm7 \n\t"                                       \
        "pxor %%mm6, %%mm6 \n\t"                                       \
        :);                                                            \
                                                                       \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                             \
                                                                       \
    return sum_ ## suf();                                              \
}                                                                      \
                                                                       \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,             \
                         uint8_t *blk1, ptrdiff_t stride, int h)       \
{                                                                      \
    __asm__ volatile (                                                 \
        "pxor %%mm7, %%mm7 \n\t"                                       \
        "pxor %%mm6, %%mm6 \n\t"                                       \
        :);                                                            \
                                                                       \
    sad8_1_ ## suf(blk1,     blk2,     stride, h);                     \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                     \
                                                                       \
    return sum_ ## suf();                                              \
}                                                                      \
                                                                       \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                            uint8_t *blk1, ptrdiff_t stride, int h)    \
{                                                                      \
    __asm__ volatile (                                                 \
        "pxor %%mm7, %%mm7 \n\t"                                       \
        "pxor %%mm6, %%mm6 \n\t"                                       \
        "movq %0, %%mm5    \n\t"                                       \
        :: "m" (round_tab[1]));                                        \
                                                                       \
    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                   \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                   \
                                                                       \
    return sum_ ## suf();                                              \
}                                                                      \
                                                                       \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                            uint8_t *blk1, ptrdiff_t stride, int h)    \
{                                                                      \
    __asm__ volatile (                                                 \
        "pxor %%mm7, %%mm7 \n\t"                                       \
        "pxor %%mm6, %%mm6 \n\t"                                       \
        "movq %0, %%mm5    \n\t"                                       \
        :: "m" (round_tab[1]));                                        \
                                                                       \
    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                   \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                   \
                                                                       \
    return sum_ ## suf();                                              \
}                                                                      \
                                                                       \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,         \
                             uint8_t *blk1, ptrdiff_t stride, int h)   \
{                                                                      \
    __asm__ volatile (                                                 \
        "pxor %%mm7, %%mm7 \n\t"                                       \
        "pxor %%mm6, %%mm6 \n\t"                                       \
        :);                                                            \
                                                                       \
    sad8_4_ ## suf(blk1,     blk2,     stride, h);                     \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                     \
                                                                       \
    return sum_ ## suf();                                              \
}
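/* Editor's note: the macro is instantiated once for the inline-asm flavor;
 * this single line is how all eight sad8_*_mmx / sad16_*_mmx functions
 * referenced by the init code below come into existence. */
PIX_SAD(mmx)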
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        c->vsad[0] = vsad16_mmx;

        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;

        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif /* HAVE_ALIGNED_STACK */
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif /* HAVE_ALIGNED_STACK */
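/* Editor's note: a condensed sketch of the dispatch pattern used by
 * ff_me_cmp_init_x86(): probe the CPU once, then install the fastest
 * available implementations. INLINE_MMX()/EXTERNAL_SSE2() are the real
 * gating macros from libavutil/x86/cpu.h; the function name below is
 * hypothetical and the body is abridged. */
static void me_cmp_init_x86_sketch(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

    if (INLINE_MMX(cpu_flags)) {        /* inline-asm MMX paths         */
        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {     /* external-asm SSE2 paths      */
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
    }
}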