FFmpeg
me_cmp_init.c
Go to the documentation of this file.
1 /*
2  * SIMD-optimized motion estimation
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24 
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavutil/x86/cpu.h"
29 #include "libavcodec/me_cmp.h"
30 #include "libavcodec/mpegvideo.h"
31 
32 int ff_sum_abs_dctelem_mmx(int16_t *block);
33 int ff_sum_abs_dctelem_mmxext(int16_t *block);
34 int ff_sum_abs_dctelem_sse2(int16_t *block);
35 int ff_sum_abs_dctelem_ssse3(int16_t *block);
36 int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
37  ptrdiff_t stride, int h);
38 int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
39  ptrdiff_t stride, int h);
40 int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
41  ptrdiff_t stride, int h);
42 int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
43 int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
44 int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
45  ptrdiff_t stride, int h);
46 int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
47  ptrdiff_t stride, int h);
48 int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
49  ptrdiff_t stride, int h);
50 int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
51  ptrdiff_t stride, int h);
53  ptrdiff_t stride, int h);
54 int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
55  ptrdiff_t stride, int h);
56 int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
57  ptrdiff_t stride, int h);
59  ptrdiff_t stride, int h);
60 int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
61  ptrdiff_t stride, int h);
63  ptrdiff_t stride, int h);
65  ptrdiff_t stride, int h);
67  ptrdiff_t stride, int h);
69  ptrdiff_t stride, int h);
71  ptrdiff_t stride, int h);
73  ptrdiff_t stride, int h);
75  ptrdiff_t stride, int h);
77  ptrdiff_t stride, int h);
79  ptrdiff_t stride, int h);
80 
81 #define hadamard_func(cpu) \
82  int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
83  uint8_t *src2, ptrdiff_t stride, int h); \
84  int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
85  uint8_t *src2, ptrdiff_t stride, int h);
86 
88 hadamard_func(mmxext)
89 hadamard_func(sse2)
90 hadamard_func(ssse3)
91 
92 #if HAVE_X86ASM
93 static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
94  ptrdiff_t stride, int h)
95 {
96  int score1, score2;
97 
98  if (c)
99  score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
100  else
101  score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
102  score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
103  - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
104 
105  if (c)
106  return score1 + FFABS(score2) * c->avctx->nsse_weight;
107  else
108  return score1 + FFABS(score2) * 8;
109 }
110 
111 static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
112  ptrdiff_t stride, int h)
113 {
114  int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
115  int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
116  ff_hf_noise8_mmx(pix2, stride, h);
117 
118  if (c)
119  return score1 + FFABS(score2) * c->avctx->nsse_weight;
120  else
121  return score1 + FFABS(score2) * 8;
122 }
123 
124 #endif /* HAVE_X86ASM */
125 
126 #if HAVE_INLINE_ASM
127 
128 static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
129  ptrdiff_t stride, int h)
130 {
131  int tmp;
132 
133  av_assert2((((int) pix) & 7) == 0);
134  av_assert2((stride & 7) == 0);
135 
136 #define SUM(in0, in1, out0, out1) \
137  "movq (%0), %%mm2\n" \
138  "movq 8(%0), %%mm3\n" \
139  "add %2,%0\n" \
140  "movq %%mm2, " #out0 "\n" \
141  "movq %%mm3, " #out1 "\n" \
142  "psubusb " #in0 ", %%mm2\n" \
143  "psubusb " #in1 ", %%mm3\n" \
144  "psubusb " #out0 ", " #in0 "\n" \
145  "psubusb " #out1 ", " #in1 "\n" \
146  "por %%mm2, " #in0 "\n" \
147  "por %%mm3, " #in1 "\n" \
148  "movq " #in0 ", %%mm2\n" \
149  "movq " #in1 ", %%mm3\n" \
150  "punpcklbw %%mm7, " #in0 "\n" \
151  "punpcklbw %%mm7, " #in1 "\n" \
152  "punpckhbw %%mm7, %%mm2\n" \
153  "punpckhbw %%mm7, %%mm3\n" \
154  "paddw " #in1 ", " #in0 "\n" \
155  "paddw %%mm3, %%mm2\n" \
156  "paddw %%mm2, " #in0 "\n" \
157  "paddw " #in0 ", %%mm6\n"
158 
159 
160  __asm__ volatile (
161  "movl %3, %%ecx\n"
162  "pxor %%mm6, %%mm6\n"
163  "pxor %%mm7, %%mm7\n"
164  "movq (%0), %%mm0\n"
165  "movq 8(%0), %%mm1\n"
166  "add %2, %0\n"
167  "jmp 2f\n"
168  "1:\n"
169 
170  SUM(%%mm4, %%mm5, %%mm0, %%mm1)
171  "2:\n"
172  SUM(%%mm0, %%mm1, %%mm4, %%mm5)
173 
174  "subl $2, %%ecx\n"
175  "jnz 1b\n"
176 
177  "movq %%mm6, %%mm0\n"
178  "psrlq $32, %%mm6\n"
179  "paddw %%mm6, %%mm0\n"
180  "movq %%mm0, %%mm6\n"
181  "psrlq $16, %%mm0\n"
182  "paddw %%mm6, %%mm0\n"
183  "movd %%mm0, %1\n"
184  : "+r" (pix), "=r" (tmp)
185  : "r" (stride), "m" (h)
186  : "%ecx");
187 
188  return tmp & 0xFFFF;
189 }
190 #undef SUM
191 
192 static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
193  ptrdiff_t stride, int h)
194 {
195  int tmp;
196 
197  av_assert2((((int) pix1) & 7) == 0);
198  av_assert2((((int) pix2) & 7) == 0);
199  av_assert2((stride & 7) == 0);
200 
201 #define SUM(in0, in1, out0, out1) \
202  "movq (%0), %%mm2\n" \
203  "movq (%1), " #out0 "\n" \
204  "movq 8(%0), %%mm3\n" \
205  "movq 8(%1), " #out1 "\n" \
206  "add %3, %0\n" \
207  "add %3, %1\n" \
208  "psubb " #out0 ", %%mm2\n" \
209  "psubb " #out1 ", %%mm3\n" \
210  "pxor %%mm7, %%mm2\n" \
211  "pxor %%mm7, %%mm3\n" \
212  "movq %%mm2, " #out0 "\n" \
213  "movq %%mm3, " #out1 "\n" \
214  "psubusb " #in0 ", %%mm2\n" \
215  "psubusb " #in1 ", %%mm3\n" \
216  "psubusb " #out0 ", " #in0 "\n" \
217  "psubusb " #out1 ", " #in1 "\n" \
218  "por %%mm2, " #in0 "\n" \
219  "por %%mm3, " #in1 "\n" \
220  "movq " #in0 ", %%mm2\n" \
221  "movq " #in1 ", %%mm3\n" \
222  "punpcklbw %%mm7, " #in0 "\n" \
223  "punpcklbw %%mm7, " #in1 "\n" \
224  "punpckhbw %%mm7, %%mm2\n" \
225  "punpckhbw %%mm7, %%mm3\n" \
226  "paddw " #in1 ", " #in0 "\n" \
227  "paddw %%mm3, %%mm2\n" \
228  "paddw %%mm2, " #in0 "\n" \
229  "paddw " #in0 ", %%mm6\n"
230 
231 
232  __asm__ volatile (
233  "movl %4, %%ecx\n"
234  "pxor %%mm6, %%mm6\n"
235  "pcmpeqw %%mm7, %%mm7\n"
236  "psllw $15, %%mm7\n"
237  "packsswb %%mm7, %%mm7\n"
238  "movq (%0), %%mm0\n"
239  "movq (%1), %%mm2\n"
240  "movq 8(%0), %%mm1\n"
241  "movq 8(%1), %%mm3\n"
242  "add %3, %0\n"
243  "add %3, %1\n"
244  "psubb %%mm2, %%mm0\n"
245  "psubb %%mm3, %%mm1\n"
246  "pxor %%mm7, %%mm0\n"
247  "pxor %%mm7, %%mm1\n"
248  "jmp 2f\n"
249  "1:\n"
250 
251  SUM(%%mm4, %%mm5, %%mm0, %%mm1)
252  "2:\n"
253  SUM(%%mm0, %%mm1, %%mm4, %%mm5)
254 
255  "subl $2, %%ecx\n"
256  "jnz 1b\n"
257 
258  "movq %%mm6, %%mm0\n"
259  "psrlq $32, %%mm6\n"
260  "paddw %%mm6, %%mm0\n"
261  "movq %%mm0, %%mm6\n"
262  "psrlq $16, %%mm0\n"
263  "paddw %%mm6, %%mm0\n"
264  "movd %%mm0, %2\n"
265  : "+r" (pix1), "+r" (pix2), "=r" (tmp)
266  : "r" (stride), "m" (h)
267  : "%ecx");
268 
269  return tmp & 0x7FFF;
270 }
271 #undef SUM
272 
273 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
274  0x0000000000000000ULL,
275  0x0001000100010001ULL,
276  0x0002000200020002ULL,
277 };
278 
/*
 * Add the sum of absolute differences between two 8-pixel-wide blocks to
 * the word accumulator in mm6.
 *
 * The function returns nothing: the result stays in mm6, and mm7 is used as
 * the zero source when widening bytes to words.  The PIX_SAD wrappers in
 * this file clear both registers before calling it.  mm0-mm5 are used as
 * scratch.
 *
 * Addressing: len starts at -stride * h and both pointers are passed biased
 * by -len, so (ptr, len) walks from the first row upwards; the loop runs
 * while len is still negative.  Two rows are handled per iteration.
 */
279 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
280  ptrdiff_t stride, int h)
281 {
282  x86_reg len = -stride * h;
283  __asm__ volatile (
284  ".p2align 4 \n\t"
285  "1: \n\t"
 /* first row: blk1 in mm0, blk2 in both mm2 and mm4 */
286  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
287  "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
288  "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
289  "add %3, %%"FF_REG_a" \n\t"
 /* saturating differences in both directions; OR-ed below into |a - b| */
290  "psubusb %%mm0, %%mm2 \n\t"
291  "psubusb %%mm4, %%mm0 \n\t"
 /* second row: blk1 in mm1, blk2 in both mm3 and mm5 */
292  "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
293  "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
294  "movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
295  "psubusb %%mm1, %%mm3 \n\t"
296  "psubusb %%mm5, %%mm1 \n\t"
297  "por %%mm2, %%mm0 \n\t"
298  "por %%mm1, %%mm3 \n\t"
 /* widen the two rows of absolute differences to words and accumulate */
299  "movq %%mm0, %%mm1 \n\t"
300  "movq %%mm3, %%mm2 \n\t"
301  "punpcklbw %%mm7, %%mm0 \n\t"
302  "punpckhbw %%mm7, %%mm1 \n\t"
303  "punpcklbw %%mm7, %%mm3 \n\t"
304  "punpckhbw %%mm7, %%mm2 \n\t"
305  "paddw %%mm1, %%mm0 \n\t"
306  "paddw %%mm3, %%mm2 \n\t"
307  "paddw %%mm2, %%mm0 \n\t"
308  "paddw %%mm0, %%mm6 \n\t"
309  "add %3, %%"FF_REG_a" \n\t"
310  " js 1b \n\t"
311  : "+a" (len)
312  : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
313 }
314 
/*
 * Add to mm6 the sum of absolute differences between blk2 and the per-byte
 * average of blk1a and blk1b, for an 8-pixel-wide block.
 *
 * The average is formed in 16-bit lanes as (a + b + mm5) >> 1, so mm5 must
 * already hold the rounding term; the x2/y2 PIX_SAD wrappers in this file
 * load round_tab[1] into it.  mm7 is used as the zero source for widening
 * and mm6 as the accumulator.  One row is handled per iteration; the loop
 * runs while the negative offset len has not yet reached zero.
 */
315 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
316  ptrdiff_t stride, int h)
317 {
318  x86_reg len = -stride * h;
319  __asm__ volatile (
320  ".p2align 4 \n\t"
321  "1: \n\t"
 /* load blk1a (mm0/mm2) and blk1b (mm1/mm3), widen low/high halves */
322  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
323  "movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
324  "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
325  "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
326  "punpcklbw %%mm7, %%mm0 \n\t"
327  "punpcklbw %%mm7, %%mm1 \n\t"
328  "punpckhbw %%mm7, %%mm2 \n\t"
329  "punpckhbw %%mm7, %%mm3 \n\t"
330  "paddw %%mm0, %%mm1 \n\t"
331  "paddw %%mm2, %%mm3 \n\t"
 /* blk2 row, kept twice for the two saturating subtractions */
332  "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
333  "movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
 /* add the rounding term, halve, and pack back to bytes */
334  "paddw %%mm5, %%mm1 \n\t"
335  "paddw %%mm5, %%mm3 \n\t"
336  "psrlw $1, %%mm1 \n\t"
337  "psrlw $1, %%mm3 \n\t"
338  "packuswb %%mm3, %%mm1 \n\t"
 /* |average - blk2| as the OR of both saturating differences */
339  "psubusb %%mm1, %%mm4 \n\t"
340  "psubusb %%mm2, %%mm1 \n\t"
341  "por %%mm4, %%mm1 \n\t"
342  "movq %%mm1, %%mm0 \n\t"
343  "punpcklbw %%mm7, %%mm0 \n\t"
344  "punpckhbw %%mm7, %%mm1 \n\t"
345  "paddw %%mm1, %%mm0 \n\t"
346  "paddw %%mm0, %%mm6 \n\t"
347  "add %4, %%"FF_REG_a" \n\t"
348  " js 1b \n\t"
349  : "+a" (len)
350  : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
351  "r" (stride));
352 }
353 
/*
 * Add to mm6 the sum of absolute differences between blk2 and a four-sample
 * average of blk1, for an 8-pixel-wide block.
 *
 * Each averaged byte is (p[x] + p[x+1] + q[x] + q[x+1] + 2) >> 2, where p is
 * the current blk1 row and q the row below it; the rounding term comes from
 * round_tab[2].  The code therefore reads one byte past the 8-pixel width
 * and one row past h rows of blk1.
 *
 * The horizontal pair sums of the current row are carried in mm0/mm1 from
 * one iteration to the next, so every blk1 row is loaded only once.  mm7 is
 * used as the zero source for widening and mm6 as the accumulator.
 */
354 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
355  ptrdiff_t stride, int h)
356 {
357  x86_reg len = -stride * h;
358  __asm__ volatile (
 /* prologue: pair sums p[x] + p[x+1] of the first row into mm0/mm1 */
359  "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
360  "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
361  "movq %%mm0, %%mm1 \n\t"
362  "movq %%mm2, %%mm3 \n\t"
363  "punpcklbw %%mm7, %%mm0 \n\t"
364  "punpckhbw %%mm7, %%mm1 \n\t"
365  "punpcklbw %%mm7, %%mm2 \n\t"
366  "punpckhbw %%mm7, %%mm3 \n\t"
367  "paddw %%mm2, %%mm0 \n\t"
368  "paddw %%mm3, %%mm1 \n\t"
369  ".p2align 4 \n\t"
370  "1: \n\t"
 /* pair sums of the next row (operand %2 is blk1 one row down) into mm2/mm3 */
371  "movq (%2, %%"FF_REG_a"), %%mm2\n\t"
372  "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
373  "movq %%mm2, %%mm3 \n\t"
374  "movq %%mm4, %%mm5 \n\t"
375  "punpcklbw %%mm7, %%mm2 \n\t"
376  "punpckhbw %%mm7, %%mm3 \n\t"
377  "punpcklbw %%mm7, %%mm4 \n\t"
378  "punpckhbw %%mm7, %%mm5 \n\t"
379  "paddw %%mm4, %%mm2 \n\t"
380  "paddw %%mm5, %%mm3 \n\t"
 /* combine both rows, add the rounding term, divide by 4 */
381  "movq %5, %%mm5 \n\t"
382  "paddw %%mm2, %%mm0 \n\t"
383  "paddw %%mm3, %%mm1 \n\t"
384  "paddw %%mm5, %%mm0 \n\t"
385  "paddw %%mm5, %%mm1 \n\t"
386  "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
387  "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
388  "psrlw $2, %%mm0 \n\t"
389  "psrlw $2, %%mm1 \n\t"
390  "packuswb %%mm1, %%mm0 \n\t"
 /* |average - blk2|, widened and accumulated */
391  "psubusb %%mm0, %%mm4 \n\t"
392  "psubusb %%mm5, %%mm0 \n\t"
393  "por %%mm4, %%mm0 \n\t"
394  "movq %%mm0, %%mm4 \n\t"
395  "punpcklbw %%mm7, %%mm0 \n\t"
396  "punpckhbw %%mm7, %%mm4 \n\t"
397  "paddw %%mm0, %%mm6 \n\t"
398  "paddw %%mm4, %%mm6 \n\t"
 /* the row just summed becomes the "current" row of the next iteration */
399  "movq %%mm2, %%mm0 \n\t"
400  "movq %%mm3, %%mm1 \n\t"
401  "add %4, %%"FF_REG_a" \n\t"
402  " js 1b \n\t"
403  : "+a" (len)
404  : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
405  "r" (stride), "m" (round_tab[2]));
406 }
407 
/**
 * Fold the four 16-bit lanes of the mm6 accumulator into a single value.
 *
 * mm6 is expected to hold the partial sums left there by the sad8_*_mmx
 * helpers; mm0 and mm6 are overwritten.
 *
 * @return the low 16 bits of the folded sum
 */
static inline int sum_mmx(void)
{
    int folded;

    __asm__ volatile (
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        : "=r" (folded));

    /* movd transfers 32 bits; only the low word is the folded sum. */
    return folded & 0xFFFF;
}
422 
/*
 * Horizontal half-pel variant: compare blk2 against the average of each
 * blk1 byte and its right-hand neighbour.  The result is accumulated by
 * sad8_2_mmx().
 */
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    uint8_t *const right_neighbour = blk1 + 1;

    sad8_2_mmx(blk1, right_neighbour, blk2, stride, h);
}
428 
/*
 * Vertical half-pel variant: compare blk2 against the average of each blk1
 * byte and the byte one row below it.  The result is accumulated by
 * sad8_2_mmx().
 */
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    uint8_t *const row_below = blk1 + stride;

    sad8_2_mmx(blk1, row_below, blk2, stride, h);
}
434 
/*
 * PIX_SAD(suf) generates the eight SAD entry points for one helper family:
 *
 *   sad8_<suf>,  sad8_x2_<suf>,  sad8_y2_<suf>,  sad8_xy2_<suf>
 *   sad16_<suf>, sad16_x2_<suf>, sad16_y2_<suf>, sad16_xy2_<suf>
 *
 * Every generated function
 *   - clears mm7 (zero source for widening) and mm6 (accumulator),
 *   - for the x2/y2 variants also loads round_tab[1] into mm5,
 *   - calls the matching sad8_*_<suf> helper (once for the 8-wide variants,
 *     twice at byte offsets 0 and 8 for the 16-wide ones),
 *   - returns sum_<suf>(), which folds the accumulator.
 *
 * Note that the entry points name their second argument blk2 and their
 * third blk1, and hand them to the helpers as (blk1, blk2).  The 8-wide
 * variants assert h == 8 and pass the literal 8 on; the 16-wide variants
 * forward h unchanged.  The MpegEncContext argument is not used.
 */
435 #define PIX_SAD(suf) \
436 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
437  uint8_t *blk1, ptrdiff_t stride, int h) \
438 { \
439  av_assert2(h == 8); \
440  __asm__ volatile ( \
441  "pxor %%mm7, %%mm7 \n\t" \
442  "pxor %%mm6, %%mm6 \n\t" \
443  :); \
444  \
445  sad8_1_ ## suf(blk1, blk2, stride, 8); \
446  \
447  return sum_ ## suf(); \
448 } \
449  \
450 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
451  uint8_t *blk1, ptrdiff_t stride, int h) \
452 { \
453  av_assert2(h == 8); \
454  __asm__ volatile ( \
455  "pxor %%mm7, %%mm7 \n\t" \
456  "pxor %%mm6, %%mm6 \n\t" \
457  "movq %0, %%mm5 \n\t" \
458  :: "m" (round_tab[1])); \
459  \
460  sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
461  \
462  return sum_ ## suf(); \
463 } \
464  \
465 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
466  uint8_t *blk1, ptrdiff_t stride, int h) \
467 { \
468  av_assert2(h == 8); \
469  __asm__ volatile ( \
470  "pxor %%mm7, %%mm7 \n\t" \
471  "pxor %%mm6, %%mm6 \n\t" \
472  "movq %0, %%mm5 \n\t" \
473  :: "m" (round_tab[1])); \
474  \
475  sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
476  \
477  return sum_ ## suf(); \
478 } \
479  \
480 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
481  uint8_t *blk1, ptrdiff_t stride, int h) \
482 { \
483  av_assert2(h == 8); \
484  __asm__ volatile ( \
485  "pxor %%mm7, %%mm7 \n\t" \
486  "pxor %%mm6, %%mm6 \n\t" \
487  ::); \
488  \
489  sad8_4_ ## suf(blk1, blk2, stride, 8); \
490  \
491  return sum_ ## suf(); \
492 } \
493  \
494 static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
495  uint8_t *blk1, ptrdiff_t stride, int h) \
496 { \
497  __asm__ volatile ( \
498  "pxor %%mm7, %%mm7 \n\t" \
499  "pxor %%mm6, %%mm6 \n\t" \
500  :); \
501  \
502  sad8_1_ ## suf(blk1, blk2, stride, h); \
503  sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
504  \
505  return sum_ ## suf(); \
506 } \
507  \
508 static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
509  uint8_t *blk1, ptrdiff_t stride, int h) \
510 { \
511  __asm__ volatile ( \
512  "pxor %%mm7, %%mm7 \n\t" \
513  "pxor %%mm6, %%mm6 \n\t" \
514  "movq %0, %%mm5 \n\t" \
515  :: "m" (round_tab[1])); \
516  \
517  sad8_x2a_ ## suf(blk1, blk2, stride, h); \
518  sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
519  \
520  return sum_ ## suf(); \
521 } \
522  \
523 static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
524  uint8_t *blk1, ptrdiff_t stride, int h) \
525 { \
526  __asm__ volatile ( \
527  "pxor %%mm7, %%mm7 \n\t" \
528  "pxor %%mm6, %%mm6 \n\t" \
529  "movq %0, %%mm5 \n\t" \
530  :: "m" (round_tab[1])); \
531  \
532  sad8_y2a_ ## suf(blk1, blk2, stride, h); \
533  sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
534  \
535  return sum_ ## suf(); \
536 } \
537  \
538 static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
539  uint8_t *blk1, ptrdiff_t stride, int h) \
540 { \
541  __asm__ volatile ( \
542  "pxor %%mm7, %%mm7 \n\t" \
543  "pxor %%mm6, %%mm6 \n\t" \
544  ::); \
545  \
546  sad8_4_ ## suf(blk1, blk2, stride, h); \
547  sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
548  \
549  return sum_ ## suf(); \
550 } \
551 
/* Instantiate the plain-MMX family (sad8_mmx ... sad16_xy2_mmx). */
552 PIX_SAD(mmx)
553 
554 #endif /* HAVE_INLINE_ASM */
555 
557 {
558  int cpu_flags = av_get_cpu_flags();
559 
560 #if HAVE_INLINE_ASM
561  if (INLINE_MMX(cpu_flags)) {
562  c->pix_abs[0][0] = sad16_mmx;
563  c->pix_abs[0][1] = sad16_x2_mmx;
564  c->pix_abs[0][2] = sad16_y2_mmx;
565  c->pix_abs[0][3] = sad16_xy2_mmx;
566  c->pix_abs[1][0] = sad8_mmx;
567  c->pix_abs[1][1] = sad8_x2_mmx;
568  c->pix_abs[1][2] = sad8_y2_mmx;
569  c->pix_abs[1][3] = sad8_xy2_mmx;
570 
571  c->sad[0] = sad16_mmx;
572  c->sad[1] = sad8_mmx;
573 
574  c->vsad[4] = vsad_intra16_mmx;
575 
576  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
577  c->vsad[0] = vsad16_mmx;
578  }
579  }
580 
581 #endif /* HAVE_INLINE_ASM */
582 
583  if (EXTERNAL_MMX(cpu_flags)) {
584  c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
585  c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
586  c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
587  c->sse[0] = ff_sse16_mmx;
588  c->sse[1] = ff_sse8_mmx;
589 #if HAVE_X86ASM
590  c->nsse[0] = nsse16_mmx;
591  c->nsse[1] = nsse8_mmx;
592 #endif
593  }
594 
595  if (EXTERNAL_MMXEXT(cpu_flags)) {
596  c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
597  c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
598  c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
599 
600  c->sad[0] = ff_sad16_mmxext;
601  c->sad[1] = ff_sad8_mmxext;
602 
603  c->pix_abs[0][0] = ff_sad16_mmxext;
604  c->pix_abs[0][1] = ff_sad16_x2_mmxext;
605  c->pix_abs[0][2] = ff_sad16_y2_mmxext;
606  c->pix_abs[1][0] = ff_sad8_mmxext;
607  c->pix_abs[1][1] = ff_sad8_x2_mmxext;
608  c->pix_abs[1][2] = ff_sad8_y2_mmxext;
609 
610  c->vsad[4] = ff_vsad_intra16_mmxext;
611  c->vsad[5] = ff_vsad_intra8_mmxext;
612 
613  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
614  c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
615  c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
616 
617  c->vsad[0] = ff_vsad16_approx_mmxext;
618  c->vsad[1] = ff_vsad8_approx_mmxext;
619  }
620  }
621 
622  if (EXTERNAL_SSE2(cpu_flags)) {
623  c->sse[0] = ff_sse16_sse2;
624  c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
625 
626 #if HAVE_ALIGNED_STACK
627  c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
628  c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
629 #endif
631  c->sad[0] = ff_sad16_sse2;
632  c->pix_abs[0][0] = ff_sad16_sse2;
633  c->pix_abs[0][1] = ff_sad16_x2_sse2;
634  c->pix_abs[0][2] = ff_sad16_y2_sse2;
635 
636  c->vsad[4] = ff_vsad_intra16_sse2;
637  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
638  c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
639  c->vsad[0] = ff_vsad16_approx_sse2;
640  }
641  }
642  }
643 
644  if (EXTERNAL_SSSE3(cpu_flags)) {
645  c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
646 #if HAVE_ALIGNED_STACK
647  c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
648  c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
649 #endif
650  }
651 }
INLINE_MMX
#define INLINE_MMX(flags)
Definition: cpu.h:86
stride
int stride
Definition: mace.c:144
cpu.h
ff_vsad16_approx_mmxext
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
ff_sad8_y2_mmxext
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
DECLARE_ASM_CONST
#define DECLARE_ASM_CONST(n, t, v)
Definition: mem.h:114
ff_sad16_y2_mmxext
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:26
ff_sse16_mmx
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
ff_me_cmp_init_x86
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
mpegvideo.h
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:93
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:50
ff_vsad16_approx_sse2
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
AVCodecContext::flags
int flags
AV_CODEC_FLAG_*.
Definition: avcodec.h:606
ff_sum_abs_dctelem_sse2
int ff_sum_abs_dctelem_sse2(int16_t *block)
ff_sad16_y2_sse2
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
av_cold
#define av_cold
Definition: attributes.h:90
ff_sad16_approx_xy2_sse2
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
ff_sad8_mmxext
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
AVCodecContext::codec_id
enum AVCodecID codec_id
Definition: avcodec.h:536
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:72
ff_vsad_intra16_mmxext
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
MECmpContext
Definition: me_cmp.h:53
ff_sum_abs_dctelem_ssse3
int ff_sum_abs_dctelem_ssse3(int16_t *block)
ff_sse16_sse2
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
ff_vsad_intra16_sse2
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
ff_hf_noise16_mmx
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
ff_sad16_sse2
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
cpu.h
AV_CPU_FLAG_SSE2SLOW
#define AV_CPU_FLAG_SSE2SLOW
SSE2 supported, but usually not faster.
Definition: cpu.h:37
asm.h
ff_sad16_x2_mmxext
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
attributes.h
ff_sad8_approx_xy2_mmxext
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
EXTERNAL_SSE2
#define EXTERNAL_SSE2(flags)
Definition: cpu.h:59
ff_vsad_intra8_mmxext
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
ff_sad16_mmxext
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
av_assert2
#define av_assert2(cond)
assert() equivalent, that does lie in speed critical code.
Definition: avassert.h:64
uint8_t
uint8_t
Definition: audio_convert.c:194
len
int len
Definition: vorbis_enc_data.h:452
ff_sad8_x2_mmxext
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
ret
ret
Definition: filter_design.txt:187
me_cmp.h
AVCodecContext
main external API structure.
Definition: avcodec.h:526
AV_CODEC_ID_SNOW
@ AV_CODEC_ID_SNOW
Definition: codec_id.h:257
dummy
int dummy
Definition: motion.c:64
ff_sum_abs_dctelem_mmx
int ff_sum_abs_dctelem_mmx(int16_t *block)
ff_vsad8_approx_mmxext
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
AV_CODEC_FLAG_BITEXACT
#define AV_CODEC_FLAG_BITEXACT
Use only bitexact stuff (except (I)DCT).
Definition: avcodec.h:333
hadamard_func
#define hadamard_func(cpu)
Definition: me_cmp_init.c:81
ff_hf_noise8_mmx
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
ff_sad16_x2_sse2
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
ff_sad16_approx_xy2_mmxext
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
x86_reg
int x86_reg
Definition: asm.h:72
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
h
h
Definition: vp9dsp_template.c:2038
ff_sum_abs_dctelem_mmxext
int ff_sum_abs_dctelem_mmxext(int16_t *block)
EXTERNAL_SSSE3
#define EXTERNAL_SSSE3(flags)
Definition: cpu.h:65
EXTERNAL_MMX
#define EXTERNAL_MMX(flags)
Definition: cpu.h:56
MpegEncContext
MpegEncContext.
Definition: mpegvideo.h:81
EXTERNAL_MMXEXT
#define EXTERNAL_MMXEXT(flags)
Definition: cpu.h:57
ff_sse8_mmx
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)