FFmpeg
hscale_fast_bilinear_simd.c
/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../swscale_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/mem_internal.h"

#define RET 0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"

#if HAVE_INLINE_ASM
av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
                                   int16_t *filter, int32_t *filterPos,
                                   int numSplits)
{
    uint8_t *fragmentA;
    x86_reg imm8OfPShufW1A;
    x86_reg imm8OfPShufW2A;
    x86_reg fragmentLengthA;
    uint8_t *fragmentB;
    x86_reg imm8OfPShufW1B;
    x86_reg imm8OfPShufW2B;
    x86_reg fragmentLengthB;
    int fragmentPos;

    int xpos, i;

    // create an optimized horizontal scaling routine
    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
     * pshufw instructions. For every four output pixels, if four input pixels
     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
     * used. If five input pixels are needed, then a chunk of fragmentA is used.
     */
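    /* For example (illustrative numbers): with xInc = 0xC000 (a 4:3 upscale)
     * the first four output pixels read input offsets 0, 0, 1 and 2 relative
     * to xx, so the four-pixel fragmentB suffices; at xInc = 0x10000 (1:1)
     * the offsets are 0, 1, 2, 3 and the last output pixel also interpolates
     * toward input pixel 4, which is why the five-pixel fragmentA exists. */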

    // code fragment
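    /* Note that the two asm blocks below are never executed from here: the
     * leading "jmp 9f" skips straight over the fragment. They act purely as
     * templates. The lea/dec/sub arithmetic after label 9 reports the
     * fragment's start address, the offsets of the two pshufw immediate
     * bytes (the byte just before labels 1 and 2, hence the "dec"), and the
     * fragment length, so the loop further down can memcpy and patch them. */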

    __asm__ volatile (
        "jmp 9f                                    \n\t"
        // Begin
        "0:                                        \n\t"
        "movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3  \n\t"
        "movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0  \n\t"
        "movd 1(%%"FF_REG_c", %%"FF_REG_S"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm1                    \n\t"
        "punpcklbw %%mm7, %%mm0                    \n\t"
        "pshufw $0xFF, %%mm1, %%mm1                \n\t"
        "1:                                        \n\t"
        "pshufw $0xFF, %%mm0, %%mm0                \n\t"
        "2:                                        \n\t"
        "psubw %%mm1, %%mm0                        \n\t"
        "movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0                       \n\t"
        "psllw $7, %%mm1                           \n\t"
        "paddw %%mm1, %%mm0                        \n\t"

        "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t"

        "add $8, %%"FF_REG_a"                      \n\t"
        // End
        "9:                                        \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0              \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1              \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2              \n\t"
        "dec %1                                    \n\t"
        "dec %2                                    \n\t"
        "sub %0, %1                                \n\t"
        "sub %0, %2                                \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3              \n\t"
        "sub %0, %3                                \n\t"


        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
          "=r" (fragmentLengthA)
    );

    __asm__ volatile (
        "jmp 9f                                    \n\t"
        // Begin
        "0:                                        \n\t"
        "movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3  \n\t"
        "movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0  \n\t"
        "punpcklbw %%mm7, %%mm0                    \n\t"
        "pshufw $0xFF, %%mm0, %%mm1                \n\t"
        "1:                                        \n\t"
        "pshufw $0xFF, %%mm0, %%mm0                \n\t"
        "2:                                        \n\t"
        "psubw %%mm1, %%mm0                        \n\t"
        "movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0                       \n\t"
        "psllw $7, %%mm1                           \n\t"
        "paddw %%mm1, %%mm0                        \n\t"

        "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t"

        "add $8, %%"FF_REG_a"                      \n\t"
        // End
        "9:                                        \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0              \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1              \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2              \n\t"
        "dec %1                                    \n\t"
        "dec %2                                    \n\t"
        "sub %0, %1                                \n\t"
        "sub %0, %2                                \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3              \n\t"
        "sub %0, %3                                \n\t"


        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
          "=r" (fragmentLengthB)
    );

    xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos = 0;

    for (i = 0; i < dstW / numSplits; i++) {
        int xx = xpos >> 16;

        if ((i & 3) == 0) {
            int a = 0;
            int b = ((xpos + xInc) >> 16) - xx;
            int c = ((xpos + xInc * 2) >> 16) - xx;
            int d = ((xpos + xInc * 3) >> 16) - xx;
            int inc = (d + 1 < 4);
            uint8_t *fragment = inc ? fragmentB : fragmentA;
            x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
            x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
            int maxShift = 3 - (d + inc);
            int shift = 0;
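            /* a..d are the input-pixel offsets, relative to xx, read by the
             * four output pixels of this group; inc selects fragmentB when
             * all of them fit into a single four-pixel movd load, and biases
             * the first shuffle by one because fragmentB extracts the "next
             * pixel" from that same load instead of a second one. */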

            if (filterCode) {
                filter[i]     = ((xpos              & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 1] = (((xpos + xInc)     & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
                filterPos[i / 2] = xx;
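                /* Each coefficient is the 16-bit fractional source position,
                 * inverted and cut down to 7 bits, i.e. roughly
                 * 128 * (1 - frac). The fragments above then compute
                 *     next*128 + (cur - next)*coeff
                 *         == cur*coeff + next*(128 - coeff),
                 * a bilinear blend in the 7-bit fixed point implied by
                 * their "psllw $7". */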

                memcpy(filterCode + fragmentPos, fragment, fragmentLength);

                filterCode[fragmentPos + imm8OfPShufW1] = (a + inc)        |
                                                          ((b + inc) << 2) |
                                                          ((c + inc) << 4) |
                                                          ((d + inc) << 6);
                filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
                                                          (c << 4)    |
                                                          (d << 6);
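                /* Worked example (illustrative): offsets a=0, b=0, c=1, d=2
                 * pack into the second immediate as 0 | 0<<2 | 1<<4 | 2<<6
                 * = 0x90, making pshufw replicate words 0, 0, 1 and 2 of the
                 * loaded quad into the four lanes; the first immediate is
                 * the same pattern offset by inc. */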

                if (i + 4 - inc >= dstW)
                    shift = maxShift; // avoid overread
                else if ((filterPos[i / 2] & 3) <= maxShift)
                    shift = filterPos[i / 2] & 3; // align

                if (shift && i >= shift) {
                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
                    filterPos[i / 2] -= shift;
                }
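                /* 0x55 is 0b01010101, so adding 0x55*shift bumps each of the
                 * four 2-bit pshufw fields by shift: moving filterPos back by
                 * shift bytes while shifting every shuffle index forward
                 * reads the same pixels from an earlier, alignment-friendly
                 * (and overread-safe) load address. */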
            }

            fragmentPos += fragmentLength;

            if (filterCode)
                filterCode[fragmentPos] = RET;
        }
        xpos += xInc;
    }
    if (filterCode)
        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part

    return fragmentPos + 1;
}

void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
                            int dstWidth, const uint8_t *src,
                            int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    void *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;
#if ARCH_X86_64
    uint64_t retsave;
#else
#if !HAVE_EBX_AVAILABLE
    uint64_t ebxsave;
#endif
#endif

    __asm__ volatile(
#if ARCH_X86_64
        "mov -8(%%rsp), %%"FF_REG_a"    \n\t"
        "mov %%"FF_REG_a", %5           \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov %%"FF_REG_b", %5           \n\t" // ebxsave
#endif
#endif
        "pxor %%mm7, %%mm7              \n\t"
        "mov %0, %%"FF_REG_c"           \n\t"
        "mov %1, %%"FF_REG_D"           \n\t"
        "mov %2, %%"FF_REG_d"           \n\t"
        "mov %3, %%"FF_REG_b"           \n\t"
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
        PREFETCH" (%%"FF_REG_c")        \n\t"
        PREFETCH" 32(%%"FF_REG_c")      \n\t"
        PREFETCH" 64(%%"FF_REG_c")      \n\t"

#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
    "movl (%%"FF_REG_b"), %%esi               \n\t"\
    "call *%4                                 \n\t"\
    "movl (%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"\
    "add %%"FF_REG_S", %%"FF_REG_c"           \n\t"\
    "add %%"FF_REG_a", %%"FF_REG_D"           \n\t"\
    "xor %%"FF_REG_a", %%"FF_REG_a"           \n\t"\

#else
#define CALL_MMXEXT_FILTER_CODE \
    "movl (%%"FF_REG_b"), %%esi                      \n\t"\
    "call *%4                                        \n\t"\
    "addl (%%"FF_REG_b", %%"FF_REG_a"), %%"FF_REG_c" \n\t"\
    "add %%"FF_REG_a", %%"FF_REG_D"                  \n\t"\
    "xor %%"FF_REG_a", %%"FF_REG_a"                  \n\t"\

#endif /* ARCH_X86_64 */
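    /* Each CALL_MMXEXT_FILTER_CODE below runs the generated buffer once (it
     * ends in RET), covering one split of the output: esi is seeded with a
     * filterPos entry, and after the call the source pointer is advanced,
     * the destination pointer moves by the bytes just written, and the pixel
     * counter is cleared so the next call continues where this one stopped.
     * The eight calls here presumably match a numSplits of 8 used when
     * generating the luma code; the chroma function below makes four calls
     * per plane. */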

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov %5, %%"FF_REG_a"        \n\t"
        "mov %%"FF_REG_a", -8(%%rsp) \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov %5, %%"FF_REG_b"        \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if ARCH_X86_64
          ,"m" (retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
         ,"%"FF_REG_b
#endif
    );

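    /* Output pixels whose source position lands on or beyond srcW-1 cannot
     * be produced safely by the generated code, so they are redone here by
     * replicating the edge pixel, scaled by 128 to match the 7-bit
     * fixed-point intermediate format. */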
    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;
}

void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
                            int dstWidth, const uint8_t *src1,
                            const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    void *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;
#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);
#else
#if !HAVE_EBX_AVAILABLE
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#endif
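    /* Chroma reuses one generated code buffer for both planes: four calls
     * for src1/dst1 below, then the pointers and counter are reset and the
     * same four chunks run again for src2/dst2. */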
    __asm__ volatile(
#if ARCH_X86_64
        "mov -8(%%rsp), %%"FF_REG_a"    \n\t"
        "mov %%"FF_REG_a", %7           \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov %%"FF_REG_b", %7           \n\t" // ebxsave
#endif
#endif
        "pxor %%mm7, %%mm7              \n\t"
        "mov %0, %%"FF_REG_c"           \n\t"
        "mov %1, %%"FF_REG_D"           \n\t"
        "mov %2, %%"FF_REG_d"           \n\t"
        "mov %3, %%"FF_REG_b"           \n\t"
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
        PREFETCH" (%%"FF_REG_c")        \n\t"
        PREFETCH" 32(%%"FF_REG_c")      \n\t"
        PREFETCH" 64(%%"FF_REG_c")      \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
        "mov %5, %%"FF_REG_c"           \n\t" // src2
        "mov %6, %%"FF_REG_D"           \n\t" // dst2
        PREFETCH" (%%"FF_REG_c")        \n\t"
        PREFETCH" 32(%%"FF_REG_c")      \n\t"
        PREFETCH" 64(%%"FF_REG_c")      \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov %7, %%"FF_REG_a"        \n\t"
        "mov %%"FF_REG_a", -8(%%rsp) \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov %7, %%"FF_REG_b"        \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m" (dst2)
#if ARCH_X86_64
          ,"m" (retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
         ,"%"FF_REG_b
#endif
    );

    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW - 1] * 128;
        dst2[i] = src2[srcW - 1] * 128;
    }
}
#endif // HAVE_INLINE_ASM
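
/* Usage sketch (illustrative, not part of the original file): the swscale
 * init code sizes the buffer with a NULL filterCode pass, then generates
 * into executable memory. The exact allocation logic lives in
 * libswscale/utils.c and may differ from this sketch.
 *
 *     // pass 1: filterCode == NULL, nothing is written; the return value
 *     // is the number of bytes of code that would be generated
 *     int size = ff_init_hscaler_mmxext(dstW, xInc, NULL, NULL, NULL, 8);
 *
 *     // pass 2: emit code + coefficients, then make the buffer executable
 *     uint8_t *code = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *     ff_init_hscaler_mmxext(dstW, xInc, code, hLumFilter, hLumFilterPos, 8);
 *     mprotect(code, size, PROT_READ | PROT_EXEC);
 */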