FFmpeg
me_cmp_init.c
Go to the documentation of this file.
1 /*
2  * SIMD-optimized motion estimation
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24 
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/mem_internal.h"
28 #include "libavutil/x86/asm.h"
29 #include "libavutil/x86/cpu.h"
30 #include "libavcodec/me_cmp.h"
31 #include "libavcodec/mpegvideo.h"
32 
33 int ff_sum_abs_dctelem_sse2(const int16_t *block);
34 int ff_sum_abs_dctelem_ssse3(const int16_t *block);
35 int ff_sse8_mmx(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
36  ptrdiff_t stride, int h);
37 int ff_sse16_mmx(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
38  ptrdiff_t stride, int h);
39 int ff_sse16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
40  ptrdiff_t stride, int h);
41 int ff_hf_noise8_mmx(const uint8_t *pix1, ptrdiff_t stride, int h);
42 int ff_hf_noise16_mmx(const uint8_t *pix1, ptrdiff_t stride, int h);
43 int ff_sad8_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
44  ptrdiff_t stride, int h);
45 int ff_sad16_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
46  ptrdiff_t stride, int h);
47 int ff_sad16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
48  ptrdiff_t stride, int h);
49 int ff_sad8_x2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
50  ptrdiff_t stride, int h);
51 int ff_sad16_x2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
52  ptrdiff_t stride, int h);
53 int ff_sad16_x2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
54  ptrdiff_t stride, int h);
55 int ff_sad8_y2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
56  ptrdiff_t stride, int h);
57 int ff_sad16_y2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
58  ptrdiff_t stride, int h);
59 int ff_sad16_y2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
60  ptrdiff_t stride, int h);
61 int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
62  ptrdiff_t stride, int h);
63 int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
64  ptrdiff_t stride, int h);
65 int ff_sad16_approx_xy2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
66  ptrdiff_t stride, int h);
67 int ff_vsad_intra8_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
68  ptrdiff_t stride, int h);
69 int ff_vsad_intra16_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
70  ptrdiff_t stride, int h);
71 int ff_vsad_intra16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
72  ptrdiff_t stride, int h);
73 int ff_vsad8_approx_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
74  ptrdiff_t stride, int h);
75 int ff_vsad16_approx_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
76  ptrdiff_t stride, int h);
77 int ff_vsad16_approx_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
78  ptrdiff_t stride, int h);
79 
80 #define hadamard_func(cpu) \
81  int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, const uint8_t *src1, \
82  const uint8_t *src2, ptrdiff_t stride, int h); \
83  int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, const uint8_t *src1, \
84  const uint8_t *src2, ptrdiff_t stride, int h);
85 
87 hadamard_func(sse2)
88 hadamard_func(ssse3)
89 
90 #if HAVE_X86ASM
91 static int nsse16_mmx(MpegEncContext *c, const uint8_t *pix1, const uint8_t *pix2,
92  ptrdiff_t stride, int h)
93 {
94  int score1, score2;
95 
96  if (c)
97  score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
98  else
99  score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
100  score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
101  - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
102 
103  if (c)
104  return score1 + FFABS(score2) * c->avctx->nsse_weight;
105  else
106  return score1 + FFABS(score2) * 8;
107 }
108 
109 static int nsse8_mmx(MpegEncContext *c, const uint8_t *pix1, const uint8_t *pix2,
110  ptrdiff_t stride, int h)
111 {
112  int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
113  int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
114  ff_hf_noise8_mmx(pix2, stride, h);
115 
116  if (c)
117  return score1 + FFABS(score2) * c->avctx->nsse_weight;
118  else
119  return score1 + FFABS(score2) * 8;
120 }
121 
122 #endif /* HAVE_X86ASM */
123 
124 #if HAVE_INLINE_ASM
125 
126 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
127  0x0000000000000000ULL,
128  0x0001000100010001ULL,
129  0x0002000200020002ULL,
130 };
131 
132 static inline void sad8_4_mmx(const uint8_t *blk1, const uint8_t *blk2,
133  ptrdiff_t stride, int h)
134 {
135  x86_reg len = -stride * h;
136  __asm__ volatile (
137  "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
138  "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
139  "movq %%mm0, %%mm1 \n\t"
140  "movq %%mm2, %%mm3 \n\t"
141  "punpcklbw %%mm7, %%mm0 \n\t"
142  "punpckhbw %%mm7, %%mm1 \n\t"
143  "punpcklbw %%mm7, %%mm2 \n\t"
144  "punpckhbw %%mm7, %%mm3 \n\t"
145  "paddw %%mm2, %%mm0 \n\t"
146  "paddw %%mm3, %%mm1 \n\t"
147  ".p2align 4 \n\t"
148  "1: \n\t"
149  "movq (%2, %%"FF_REG_a"), %%mm2\n\t"
150  "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
151  "movq %%mm2, %%mm3 \n\t"
152  "movq %%mm4, %%mm5 \n\t"
153  "punpcklbw %%mm7, %%mm2 \n\t"
154  "punpckhbw %%mm7, %%mm3 \n\t"
155  "punpcklbw %%mm7, %%mm4 \n\t"
156  "punpckhbw %%mm7, %%mm5 \n\t"
157  "paddw %%mm4, %%mm2 \n\t"
158  "paddw %%mm5, %%mm3 \n\t"
159  "movq %5, %%mm5 \n\t"
160  "paddw %%mm2, %%mm0 \n\t"
161  "paddw %%mm3, %%mm1 \n\t"
162  "paddw %%mm5, %%mm0 \n\t"
163  "paddw %%mm5, %%mm1 \n\t"
164  "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
165  "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
166  "psrlw $2, %%mm0 \n\t"
167  "psrlw $2, %%mm1 \n\t"
168  "packuswb %%mm1, %%mm0 \n\t"
169  "psubusb %%mm0, %%mm4 \n\t"
170  "psubusb %%mm5, %%mm0 \n\t"
171  "por %%mm4, %%mm0 \n\t"
172  "movq %%mm0, %%mm4 \n\t"
173  "punpcklbw %%mm7, %%mm0 \n\t"
174  "punpckhbw %%mm7, %%mm4 \n\t"
175  "paddw %%mm0, %%mm6 \n\t"
176  "paddw %%mm4, %%mm6 \n\t"
177  "movq %%mm2, %%mm0 \n\t"
178  "movq %%mm3, %%mm1 \n\t"
179  "add %4, %%"FF_REG_a" \n\t"
180  " js 1b \n\t"
181  : "+a" (len)
182  : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
183  "r" (stride), "m" (round_tab[2]));
184 }
185 
/**
 * Fold the four 16-bit partial sums held in MMX register mm6 into one value.
 *
 * mm6 is expected to have been filled by earlier asm statements (it is not
 * listed in the constraints); mm0 and mm6 are overwritten.
 *
 * @return the low 16 bits of the sum of the four lanes; the additions are
 *         16-bit lane adds, so the total wraps modulo 65536
 */
static inline int sum_mmx(void)
{
    int folded;

    __asm__ volatile (
        /* Add the upper two lanes onto the lower two ... */
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        /* ... then lane 1 onto lane 0, leaving the total in lane 0. */
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        : "=r" (folded));

    /* movd copies 32 bits; only the low lane is the result. */
    return folded & 0xFFFF;
}
200 
201 #define PIX_SADXY(suf) \
202 static int sad8_xy2_ ## suf(MpegEncContext *v, const uint8_t *blk2, \
203  const uint8_t *blk1, ptrdiff_t stride, int h) \
204 { \
205  __asm__ volatile ( \
206  "pxor %%mm7, %%mm7 \n\t" \
207  "pxor %%mm6, %%mm6 \n\t" \
208  ::); \
209  \
210  sad8_4_ ## suf(blk1, blk2, stride, h); \
211  \
212  return sum_ ## suf(); \
213 } \
214  \
215 static int sad16_xy2_ ## suf(MpegEncContext *v, const uint8_t *blk2, \
216  const uint8_t *blk1, ptrdiff_t stride, int h) \
217 { \
218  __asm__ volatile ( \
219  "pxor %%mm7, %%mm7 \n\t" \
220  "pxor %%mm6, %%mm6 \n\t" \
221  ::); \
222  \
223  sad8_4_ ## suf(blk1, blk2, stride, h); \
224  sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
225  \
226  return sum_ ## suf(); \
227 } \
228 
229 PIX_SADXY(mmx)
230 
231 #endif /* HAVE_INLINE_ASM */
232 
234 {
235  int cpu_flags = av_get_cpu_flags();
236 
237 #if HAVE_INLINE_ASM
238  if (INLINE_MMX(cpu_flags)) {
239  c->pix_abs[0][3] = sad16_xy2_mmx;
240  c->pix_abs[1][3] = sad8_xy2_mmx;
241  }
242 
243 #endif /* HAVE_INLINE_ASM */
244 
245  if (EXTERNAL_MMX(cpu_flags)) {
246  c->sse[1] = ff_sse8_mmx;
247 #if HAVE_X86ASM
248  c->nsse[0] = nsse16_mmx;
249  c->nsse[1] = nsse8_mmx;
250 #endif
251  }
252 
253  if (EXTERNAL_MMXEXT(cpu_flags)) {
254 #if !HAVE_ALIGNED_STACK
255  c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
256  c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
257 #endif
258 
259  c->sad[0] = ff_sad16_mmxext;
260  c->sad[1] = ff_sad8_mmxext;
261 
262  c->pix_abs[0][0] = ff_sad16_mmxext;
263  c->pix_abs[0][1] = ff_sad16_x2_mmxext;
264  c->pix_abs[0][2] = ff_sad16_y2_mmxext;
265  c->pix_abs[1][0] = ff_sad8_mmxext;
266  c->pix_abs[1][1] = ff_sad8_x2_mmxext;
267  c->pix_abs[1][2] = ff_sad8_y2_mmxext;
268 
269  c->vsad[4] = ff_vsad_intra16_mmxext;
270  c->vsad[5] = ff_vsad_intra8_mmxext;
271 
272  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
273  c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
274  c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
275 
276  c->vsad[0] = ff_vsad16_approx_mmxext;
277  c->vsad[1] = ff_vsad8_approx_mmxext;
278  }
279  }
280 
281  if (EXTERNAL_SSE2(cpu_flags)) {
282  c->sse[0] = ff_sse16_sse2;
283  c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
284 
285 #if HAVE_ALIGNED_STACK
286  c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
287  c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
288 #endif
290  c->sad[0] = ff_sad16_sse2;
291  c->pix_abs[0][0] = ff_sad16_sse2;
292  c->pix_abs[0][1] = ff_sad16_x2_sse2;
293  c->pix_abs[0][2] = ff_sad16_y2_sse2;
294 
295  c->vsad[4] = ff_vsad_intra16_sse2;
296  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
297  c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
298  c->vsad[0] = ff_vsad16_approx_sse2;
299  }
300  }
301  }
302 
303  if (EXTERNAL_SSSE3(cpu_flags)) {
304  c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
305 #if HAVE_ALIGNED_STACK
306  c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
307  c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
308 #endif
309  }
310 }
INLINE_MMX
#define INLINE_MMX(flags)
Definition: cpu.h:87
cpu.h
ff_sad8_y2_mmxext
int ff_sad8_y2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
mem_internal.h
ff_sse16_mmx
int ff_sse16_mmx(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
ff_sad8_approx_xy2_mmxext
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
ff_sad16_approx_xy2_sse2
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
ff_vsad_intra16_mmxext
int ff_vsad_intra16_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
ff_sad8_mmxext
int ff_sad8_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
ff_vsad_intra8_mmxext
int ff_vsad_intra8_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
ff_me_cmp_init_x86
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
DECLARE_ASM_CONST
#define DECLARE_ASM_CONST(n, t, v)
Definition: mem_internal.h:89
mpegvideo.h
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:103
ff_hf_noise8_mmx
int ff_hf_noise8_mmx(const uint8_t *pix1, ptrdiff_t stride, int h)
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:52
ff_sum_abs_dctelem_ssse3
int ff_sum_abs_dctelem_ssse3(const int16_t *block)
ff_vsad16_approx_mmxext
int ff_vsad16_approx_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
AVCodecContext::flags
int flags
AV_CODEC_FLAG_*.
Definition: avcodec.h:506
ff_hf_noise16_mmx
int ff_hf_noise16_mmx(const uint8_t *pix1, ptrdiff_t stride, int h)
av_cold
#define av_cold
Definition: attributes.h:90
ff_vsad16_approx_sse2
int ff_vsad16_approx_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
AVCodecContext::codec_id
enum AVCodecID codec_id
Definition: avcodec.h:436
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:64
MECmpContext
Definition: me_cmp.h:55
ff_sad16_approx_xy2_mmxext
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
ff_sad16_sse2
int ff_sad16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
cpu.h
ff_vsad8_approx_mmxext
int ff_vsad8_approx_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
AV_CPU_FLAG_SSE2SLOW
#define AV_CPU_FLAG_SSE2SLOW
SSE2 supported, but usually not faster.
Definition: cpu.h:35
asm.h
attributes.h
EXTERNAL_SSE2
#define EXTERNAL_SSE2(flags)
Definition: cpu.h:59
ff_sad16_y2_sse2
int ff_sad16_y2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
ff_sad16_mmxext
int ff_sad16_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
ff_sad16_x2_mmxext
int ff_sad16_x2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
len
int len
Definition: vorbis_enc_data.h:426
stride
#define stride
Definition: h264pred_template.c:537
ff_sse8_mmx
int ff_sse8_mmx(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
ret
ret
Definition: filter_design.txt:187
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
ff_vsad_intra16_sse2
int ff_vsad_intra16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
me_cmp.h
ff_sad16_y2_mmxext
int ff_sad16_y2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
AVCodecContext
main external API structure.
Definition: avcodec.h:426
ff_sum_abs_dctelem_sse2
int ff_sum_abs_dctelem_sse2(const int16_t *block)
AV_CODEC_ID_SNOW
@ AV_CODEC_ID_SNOW
Definition: codec_id.h:266
ff_sad8_x2_mmxext
int ff_sad8_x2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
AV_CODEC_FLAG_BITEXACT
#define AV_CODEC_FLAG_BITEXACT
Use only bitexact stuff (except (I)DCT).
Definition: avcodec.h:321
hadamard_func
#define hadamard_func(cpu)
Definition: me_cmp_init.c:80
x86_reg
int x86_reg
Definition: asm.h:72
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
h
h
Definition: vp9dsp_template.c:2038
ff_sad16_x2_sse2
int ff_sad16_x2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
EXTERNAL_SSSE3
#define EXTERNAL_SSSE3(flags)
Definition: cpu.h:65
EXTERNAL_MMX
#define EXTERNAL_MMX(flags)
Definition: cpu.h:56
MpegEncContext
MpegEncContext.
Definition: mpegvideo.h:67
EXTERNAL_MMXEXT
#define EXTERNAL_MMXEXT(flags)
Definition: cpu.h:57
ff_sse16_sse2
int ff_sse16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)