FFmpeg
swscale_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <stdint.h>
22 
23 #include "libavutil/x86/asm.h"
25 
/*
 * Store-instruction selection for this template instantiation.
 *
 * When COMPILE_TEMPLATE_MMXEXT is non-zero the store macros expand to the
 * "movntq" mnemonic, otherwise to a plain "movq".
 *   REAL_MOVNTQ(a,b) - stringifies both operands into one instruction line.
 *   MOVNTQ(a,b)      - indirection through REAL_MOVNTQ so that macro
 *                      arguments are expanded before being stringified.
 *   MOVNTQ2          - only the mnemonic; the caller appends the operands.
 * Earlier definitions are dropped first. PREFETCH is undefined here as well
 * but is not redefined anywhere in the visible part of the file.
 * NOTE(review): the #undef lines suggest this template is included more than
 * once with different COMPILE_TEMPLATE_* settings - confirm in the includer.
 */
26 #undef REAL_MOVNTQ
27 #undef MOVNTQ
28 #undef MOVNTQ2
29 #undef PREFETCH
30 
31 
32 #if COMPILE_TEMPLATE_MMXEXT
33 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
34 #define MOVNTQ2 "movntq "
35 #else
36 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
37 #define MOVNTQ2 "movq "
38 #endif
39 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
40 
/*
 * dither_8to16: load 8 dither bytes and widen them to 16-bit words in MMX
 * registers. Only compiled when COMPILE_TEMPLATE_MMXEXT is zero.
 *
 * srcDither  address of the 8 bytes that are read with a single movq.
 * rot        when non-zero, the 64-bit value is rotated right by 24 bits
 *            (psrlq 24 | psllq 40) before it is widened.
 *
 * On exit mm0 is zero, mm3 holds the low four bytes zero-extended to words
 * and mm4 the high four bytes. The asm statements declare no outputs and no
 * clobbers, so the result is handed over purely through register state.
 * NOTE(review): presumably consumed by the asm in yuv2yuvX, which reads
 * mm3/mm4 without initialising them - confirm against the caller.
 */
41 #if !COMPILE_TEMPLATE_MMXEXT
42 static av_always_inline void
43 dither_8to16(const uint8_t *srcDither, int rot)
44 {
45  if (rot) {
 /* rotated variant: mm3 = (v >> 24) | (v << 40), then widen */
46  __asm__ volatile("pxor %%mm0, %%mm0\n\t"
47  "movq (%0), %%mm3\n\t"
48  "movq %%mm3, %%mm4\n\t"
49  "psrlq $24, %%mm3\n\t"
50  "psllq $40, %%mm4\n\t"
51  "por %%mm4, %%mm3\n\t"
52  "movq %%mm3, %%mm4\n\t"
53  "punpcklbw %%mm0, %%mm3\n\t"
54  "punpckhbw %%mm0, %%mm4\n\t"
55  :: "r"(srcDither)
56  );
57  } else {
 /* plain variant: widen the 8 bytes as loaded */
58  __asm__ volatile("pxor %%mm0, %%mm0\n\t"
59  "movq (%0), %%mm3\n\t"
60  "movq %%mm3, %%mm4\n\t"
61  "punpcklbw %%mm0, %%mm3\n\t"
62  "punpckhbw %%mm0, %%mm4\n\t"
63  :: "r"(srcDither)
64  );
65  }
66 }
67 #endif
68 
/*
 * yuv2yuvX: vertical multi-tap filter producing one row of 8-bit samples.
 *
 * filter      walked by the asm as a list of 16-byte entries: a source row
 *             pointer at offset 0 and the packed coefficient at offset 8;
 *             the list ends at the first entry whose pointer is NULL.
 * filterSize  number of taps; only used (after being decremented) to adjust
 *             the rounding/dither words held in mm3/mm4.
 * src         not referenced in the visible body.
 * dest        output row; 8 bytes are stored per outer iteration.
 * dstW        output width; the loop runs while the index is below
 *             dstW + offset (unsigned compare).
 * dither      not referenced in the visible body (see note below).
 * offset      initial value of the index register; dest is biased by
 *             -offset so the first store still lands on dest[0].
 *
 * The first asm statement reads mm3/mm4 without loading them, so they must
 * have been set up beforehand.
 * NOTE(review): the embedded listing numbering skips 73 right after the
 * opening brace; presumably that line is the dither_8to16(dither, offset)
 * call which fills mm3/mm4 - confirm against the upstream file.
 * NOTE(review): the second asm statement stores to dest but declares no
 * output operand or "memory" clobber.
 */
69 static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
70  const int16_t **src, uint8_t *dest, int dstW,
71  const uint8_t *dither, int offset)
72 {
74  filterSize--;
 /* mm1 = (filterSize - 1) * 8 in all four words; add it to the values in
  * mm3/mm4 and shift them right by 4 to form the accumulator start value */
75  __asm__ volatile(
76  "movd %0, %%mm1\n\t"
77  "punpcklwd %%mm1, %%mm1\n\t"
78  "punpckldq %%mm1, %%mm1\n\t"
79  "psllw $3, %%mm1\n\t"
80  "paddw %%mm1, %%mm3\n\t"
81  "paddw %%mm1, %%mm4\n\t"
82  "psraw $4, %%mm3\n\t"
83  "psraw $4, %%mm4\n\t"
84  ::"m"(filterSize)
85  );
86 
 /* mm6/mm7 keep a copy of the start value so that mm3/mm4 can be reset
  * after every group of 8 output samples */
87  __asm__ volatile(\
88  "movq %%mm3, %%mm6\n\t"
89  "movq %%mm4, %%mm7\n\t"
90  "movl %3, %%ecx\n\t"
91  "mov %0, %%"FF_REG_d" \n\t"\
92  "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
93  ".p2align 4 \n\t" /* FIXME Unroll? */\
94  "1: \n\t"\
95  "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\
96  "movq (%%"FF_REG_S", %%"FF_REG_c", 2), %%mm2 \n\t" /* srcData */\
97  "movq 8(%%"FF_REG_S", %%"FF_REG_c", 2), %%mm5 \n\t" /* srcData */\
98  "add $16, %%"FF_REG_d" \n\t"\
99  "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
100  "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
101  "pmulhw %%mm0, %%mm2 \n\t"\
102  "pmulhw %%mm0, %%mm5 \n\t"\
103  "paddw %%mm2, %%mm3 \n\t"\
104  "paddw %%mm5, %%mm4 \n\t"\
105  " jnz 1b \n\t"\
106  "psraw $3, %%mm3 \n\t"\
107  "psraw $3, %%mm4 \n\t"\
108  "packuswb %%mm4, %%mm3 \n\t"
109  MOVNTQ2 " %%mm3, (%1, %%"FF_REG_c")\n\t"
110  "add $8, %%"FF_REG_c" \n\t"\
111  "cmp %2, %%"FF_REG_c" \n\t"\
112  "movq %%mm6, %%mm3\n\t"
113  "movq %%mm7, %%mm4\n\t"
114  "mov %0, %%"FF_REG_d" \n\t"\
115  "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
116  "jb 1b \n\t"\
117  :: "g" (filter),
118  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
119  : "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
120  );
121 }
122 
/*
 * YSCALEYUV2PACKEDX_UV: opens an __asm__ volatile( statement and emits the
 * chroma half of the vertical filter. The statement is left open; it is
 * meant to be followed by more instruction strings and closed by an operand
 * list such as YSCALEYUV2PACKEDX_END.
 *
 * Operand use: %0 is the base for the *_OFFSET displacements, %6 is added to
 * the source pointer to get from the U data to the V data.
 * Register use: REG_a is zeroed and serves as the pixel index (label 1 is
 * the per-8-pixel loop head that the WRITE* macros jump back to), REG_d
 * walks 16-byte filter entries (pointer at 0, coefficient at 8) starting at
 * CHR_MMX_FILTER_OFFSET(%0), REG_S is the current source pointer; the inner
 * loop (label 2) stops at a NULL pointer.
 * Result: mm3 = filtered U, mm4 = filtered V, both seeded from
 * VROUNDER_OFFSET(%0).
 */
123 #define YSCALEYUV2PACKEDX_UV \
124  __asm__ volatile(\
125  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
126  ".p2align 4 \n\t"\
127  "nop \n\t"\
128  "1: \n\t"\
129  "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
130  "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
131  "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
132  "movq %%mm3, %%mm4 \n\t"\
133  ".p2align 4 \n\t"\
134  "2: \n\t"\
135  "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\
136  "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* UsrcData */\
137  "add %6, %%"FF_REG_S" \n\t" \
138  "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" /* VsrcData */\
139  "add $16, %%"FF_REG_d" \n\t"\
140  "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
141  "pmulhw %%mm0, %%mm2 \n\t"\
142  "pmulhw %%mm0, %%mm5 \n\t"\
143  "paddw %%mm2, %%mm3 \n\t"\
144  "paddw %%mm5, %%mm4 \n\t"\
145  "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
146  " jnz 2b \n\t"\
147 
/*
 * YSCALEYUV2PACKEDX_YA: vertical filter for one full-resolution plane
 * (used in this file for luma and for alpha), to be placed inside an asm
 * statement that already set up REG_a as the pixel index.
 *
 * offset      string displacement of the filter list relative to %0
 * coeff       MMX register receiving the coefficient of the current tap
 * src1, src2  scratch MMX registers for the two groups of four samples
 *             loaded at (REG_S, REG_a, 2) and 8(REG_S, REG_a, 2)
 * dst1, dst2  accumulators; both start from VROUNDER_OFFSET(%0)
 *
 * The filter list has the same 16-byte entry layout and NULL terminator as
 * in YSCALEYUV2PACKEDX_UV. Clobbers REG_d and REG_S and reuses label 2.
 */
148 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
149  "lea "offset"(%0), %%"FF_REG_d" \n\t"\
150  "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
151  "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
152  "movq "#dst1", "#dst2" \n\t"\
153  ".p2align 4 \n\t"\
154  "2: \n\t"\
155  "movq 8(%%"FF_REG_d"), "#coeff" \n\t" /* filterCoeff */\
156  "movq (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" /* Y1srcData */\
157  "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" /* Y2srcData */\
158  "add $16, %%"FF_REG_d" \n\t"\
159  "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
160  "pmulhw "#coeff", "#src1" \n\t"\
161  "pmulhw "#coeff", "#src2" \n\t"\
162  "paddw "#src1", "#dst1" \n\t"\
163  "paddw "#src2", "#dst2" \n\t"\
164  "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
165  " jnz 2b \n\t"\
166 
/*
 * YSCALEYUV2PACKEDX: chroma filter followed by the luma filter.
 * After expansion mm3 = U, mm4 = V, mm1 = first four luma samples and
 * mm7 = next four luma samples; mm0, mm2 and mm5 are used as scratch.
 * The asm statement opened by YSCALEYUV2PACKEDX_UV is still open.
 */
167 #define YSCALEYUV2PACKEDX \
168  YSCALEYUV2PACKEDX_UV \
169  YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
170 
/*
 * YSCALEYUV2PACKEDX_END: operand list, clobber list and closing ");" for an
 * asm statement opened by one of the YSCALEYUV2PACKEDX* macros.
 * Operand numbering it establishes:
 *   %0 = &c->redDither (base for all *_OFFSET displacements)
 *   %1..%3 = dummy     (placeholders keeping the later numbers fixed)
 *   %4 = dest, %5 = dstW_reg, %6 = uv_off
 * It expects c, dummy, dest, dstW_reg and uv_off to exist in the expanding
 * function. bF8 and bFC are passed through NAMED_CONSTRAINTS_ADD.
 * REG_a, REG_d and REG_S are declared clobbered.
 */
171 #define YSCALEYUV2PACKEDX_END \
172  :: "r" (&c->redDither), \
173  "m" (dummy), "m" (dummy), "m" (dummy),\
174  "r" (dest), "m" (dstW_reg), "m"(uv_off) \
175  NAMED_CONSTRAINTS_ADD(bF8,bFC) \
176  : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \
177  );
178 
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV: chroma vertical filter that accumulates in
 * 32 bits. Opens an __asm__ volatile( statement and leaves it open.
 *
 * Each iteration of the inner loop (label 2) consumes two taps at once:
 * samples from the row pointer at offset 0 and from the row pointer at
 * APCK_PTR2 of the current entry are interleaved word-wise and multiplied
 * with the coefficient pair at APCK_COEF using pmaddwd. Entries are
 * APCK_SIZE bytes apart and the loop ends when the pointer at the start of
 * the next entry is NULL. %6 is added to a row pointer to reach the V data.
 *
 * Accumulators: mm4/mm5 = U (low/high two pixels), mm6/mm7 = V.
 * After the loop the sums are shifted right by 16, packed to words, the
 * value at VROUNDER_OFFSET(%0) is added, and the results are stored to
 * U_TEMP(%0) and V_TEMP(%0). REG_a is zeroed as the pixel index and label 1
 * marks the per-8-pixel loop head.
 */
179 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
180  __asm__ volatile(\
181  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
182  ".p2align 4 \n\t"\
183  "nop \n\t"\
184  "1: \n\t"\
185  "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
186  "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
187  "pxor %%mm4, %%mm4 \n\t"\
188  "pxor %%mm5, %%mm5 \n\t"\
189  "pxor %%mm6, %%mm6 \n\t"\
190  "pxor %%mm7, %%mm7 \n\t"\
191  ".p2align 4 \n\t"\
192  "2: \n\t"\
193  "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" /* UsrcData */\
194  "add %6, %%"FF_REG_S" \n\t" \
195  "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* VsrcData */\
196  "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
197  "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" /* UsrcData */\
198  "movq %%mm0, %%mm3 \n\t"\
199  "punpcklwd %%mm1, %%mm0 \n\t"\
200  "punpckhwd %%mm1, %%mm3 \n\t"\
201  "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1 \n\t" /* filterCoeff */\
202  "pmaddwd %%mm1, %%mm0 \n\t"\
203  "pmaddwd %%mm1, %%mm3 \n\t"\
204  "paddd %%mm0, %%mm4 \n\t"\
205  "paddd %%mm3, %%mm5 \n\t"\
206  "add %6, %%"FF_REG_S" \n\t" \
207  "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" /* VsrcData */\
208  "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
209  "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
210  "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
211  "movq %%mm2, %%mm0 \n\t"\
212  "punpcklwd %%mm3, %%mm2 \n\t"\
213  "punpckhwd %%mm3, %%mm0 \n\t"\
214  "pmaddwd %%mm1, %%mm2 \n\t"\
215  "pmaddwd %%mm1, %%mm0 \n\t"\
216  "paddd %%mm2, %%mm6 \n\t"\
217  "paddd %%mm0, %%mm7 \n\t"\
218  " jnz 2b \n\t"\
219  "psrad $16, %%mm4 \n\t"\
220  "psrad $16, %%mm5 \n\t"\
221  "psrad $16, %%mm6 \n\t"\
222  "psrad $16, %%mm7 \n\t"\
223  "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
224  "packssdw %%mm5, %%mm4 \n\t"\
225  "packssdw %%mm7, %%mm6 \n\t"\
226  "paddw %%mm0, %%mm4 \n\t"\
227  "paddw %%mm0, %%mm6 \n\t"\
228  "movq %%mm4, "U_TEMP"(%0) \n\t"\
229  "movq %%mm6, "V_TEMP"(%0) \n\t"\
230 
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset): 32-bit accumulating vertical
 * filter for a full-resolution plane, continuing an already open asm
 * statement in which REG_a is the pixel index.
 *
 * offset  string displacement (relative to %0) of the filter list, which
 *         uses the APCK_PTR2 / APCK_COEF / APCK_SIZE entry layout and is
 *         terminated by a NULL pointer. Two taps are combined per
 *         iteration with pmaddwd.
 *
 * Accumulators: mm1/mm5 for the first four samples, mm7/mm6 for the next
 * four. After the loop they are shifted right by 16, packed and biased with
 * VROUNDER_OFFSET(%0), leaving the results in mm1 and mm7. Finally mm3 and
 * mm4 are reloaded from U_TEMP(%0) and V_TEMP(%0).
 * Clobbers mm0, mm2, mm5, mm6, REG_d and REG_S; reuses label 2.
 */
231 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
232  "lea "offset"(%0), %%"FF_REG_d" \n\t"\
233  "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
234  "pxor %%mm1, %%mm1 \n\t"\
235  "pxor %%mm5, %%mm5 \n\t"\
236  "pxor %%mm7, %%mm7 \n\t"\
237  "pxor %%mm6, %%mm6 \n\t"\
238  ".p2align 4 \n\t"\
239  "2: \n\t"\
240  "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
241  "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
242  "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
243  "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
244  "movq %%mm0, %%mm3 \n\t"\
245  "punpcklwd %%mm4, %%mm0 \n\t"\
246  "punpckhwd %%mm4, %%mm3 \n\t"\
247  "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" /* filterCoeff */\
248  "pmaddwd %%mm4, %%mm0 \n\t"\
249  "pmaddwd %%mm4, %%mm3 \n\t"\
250  "paddd %%mm0, %%mm1 \n\t"\
251  "paddd %%mm3, %%mm5 \n\t"\
252  "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
253  "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
254  "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
255  "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
256  "movq %%mm2, %%mm0 \n\t"\
257  "punpcklwd %%mm3, %%mm2 \n\t"\
258  "punpckhwd %%mm3, %%mm0 \n\t"\
259  "pmaddwd %%mm4, %%mm2 \n\t"\
260  "pmaddwd %%mm4, %%mm0 \n\t"\
261  "paddd %%mm2, %%mm7 \n\t"\
262  "paddd %%mm0, %%mm6 \n\t"\
263  " jnz 2b \n\t"\
264  "psrad $16, %%mm1 \n\t"\
265  "psrad $16, %%mm5 \n\t"\
266  "psrad $16, %%mm7 \n\t"\
267  "psrad $16, %%mm6 \n\t"\
268  "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
269  "packssdw %%mm5, %%mm1 \n\t"\
270  "packssdw %%mm6, %%mm7 \n\t"\
271  "paddw %%mm0, %%mm1 \n\t"\
272  "paddw %%mm0, %%mm7 \n\t"\
273  "movq "U_TEMP"(%0), %%mm3 \n\t"\
274  "movq "V_TEMP"(%0), %%mm4 \n\t"\
275 
/*
 * YSCALEYUV2PACKEDX_ACCURATE: 32-bit accumulating chroma filter followed by
 * the matching luma filter. Leaves mm3 = U, mm4 = V (reloaded from the
 * temporaries), mm1 and mm7 = luma, with the asm statement still open.
 */
276 #define YSCALEYUV2PACKEDX_ACCURATE \
277  YSCALEYUV2PACKEDX_ACCURATE_UV \
278  YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
279 
/*
 * YSCALEYUV2RGBX: YUV to RGB conversion on eight pixels, using the offsets
 * and coefficients stored relative to operand %0.
 *
 * In:  mm3 = U, mm4 = V (four words each), mm1 = Y for the first four
 *      pixels, mm7 = Y for the next four.
 * Out: mm2 = B, mm4 = G, mm5 = R as eight unsigned-saturated bytes each
 *      (per the register notes inside the macro and the packuswb at the
 *      end). mm0, mm3 and mm6 are used as scratch.
 * Each chroma word is duplicated with punpck{l,h}wd so that one chroma
 * sample is applied to two neighbouring luma samples.
 */
280 #define YSCALEYUV2RGBX \
281  "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
282  "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
283  "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
284  "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
285  "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
286  "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
287  /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
288  "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
289  "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
290  "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
291  "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
292  "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
293  "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
294  /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
295  "paddw %%mm3, %%mm4 \n\t"\
296  "movq %%mm2, %%mm0 \n\t"\
297  "movq %%mm5, %%mm6 \n\t"\
298  "movq %%mm4, %%mm3 \n\t"\
299  "punpcklwd %%mm2, %%mm2 \n\t"\
300  "punpcklwd %%mm5, %%mm5 \n\t"\
301  "punpcklwd %%mm4, %%mm4 \n\t"\
302  "paddw %%mm1, %%mm2 \n\t"\
303  "paddw %%mm1, %%mm5 \n\t"\
304  "paddw %%mm1, %%mm4 \n\t"\
305  "punpckhwd %%mm0, %%mm0 \n\t"\
306  "punpckhwd %%mm6, %%mm6 \n\t"\
307  "punpckhwd %%mm3, %%mm3 \n\t"\
308  "paddw %%mm7, %%mm0 \n\t"\
309  "paddw %%mm7, %%mm6 \n\t"\
310  "paddw %%mm7, %%mm3 \n\t"\
311  /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
312  "packuswb %%mm0, %%mm2 \n\t"\
313  "packuswb %%mm6, %%mm5 \n\t"\
314  "packuswb %%mm3, %%mm4 \n\t"\
315 
/*
 * REAL_WRITEBGR32 / WRITEBGR32: interleave four byte planes into eight
 * 4-byte pixels, store them and close the per-8-pixel loop.
 *
 * dst    base register of the output row
 * dstw   string operand compared against index (loop bound)
 * index  pixel index register; stores go to dst + index*4 (+0/8/16/24),
 *        then index is advanced by 8
 * b,g,r,a  MMX registers holding eight bytes per channel; in memory order
 *        each pixel is b, g, r, a. b and r are overwritten.
 * q0,q2,q3,t  MMX scratch registers
 *
 * Ends with "jb 1b": jumps back to label 1 while index is below dstw
 * (unsigned compare). WRITEBGR32 only forwards to REAL_WRITEBGR32 so that
 * macro arguments are expanded before they are stringified.
 */
316 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
317  "movq "#b", "#q2" \n\t" /* B */\
318  "movq "#r", "#t" \n\t" /* R */\
319  "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
320  "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
321  "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
322  "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
323  "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
324  "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
325  "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
326  "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
327  "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
328  "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
329 \
330  MOVNTQ( q0, (dst, index, 4))\
331  MOVNTQ( b, 8(dst, index, 4))\
332  MOVNTQ( q2, 16(dst, index, 4))\
333  MOVNTQ( q3, 24(dst, index, 4))\
334 \
335  "add $8, "#index" \n\t"\
336  "cmp "dstw", "#index" \n\t"\
337  " jb 1b \n\t"
338 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
339 
/*
 * yuv2rgb32_X_ar: multi-tap vertical scale to a 32-bit packed output row.
 *
 * The visible body sets up the locals expected by YSCALEYUV2PACKEDX_END
 * (dummy, dstW_reg, uv_off = c->uv_offx2) and then branches on
 * CONFIG_SWSCALE_ALPHA && c->needAlpha:
 *  - alpha branch: mm2/mm4/mm5 are spilled to U_TEMP/V_TEMP/Y_TEMP(%0),
 *    mm5 is reloaded from Y_TEMP, mm1/mm7 are shifted right by 3 and packed
 *    into eight alpha bytes in mm1, and WRITEBGR32 is fed b=mm3, g=mm4,
 *    r=mm5, a=mm1.
 *  - otherwise: mm7 is set to all ones (alpha bytes 0xFF) and WRITEBGR32 is
 *    fed b=mm2, g=mm4, r=mm5, a=mm7.
 *
 * NOTE(review): the embedded listing numbers skip lines inside both
 * branches, so the lines that open and close the asm statements and invoke
 * the scaling/conversion macros are not visible in this extract. The "_ar"
 * suffix presumably selects the YSCALEYUV2PACKEDX_ACCURATE path - confirm
 * against the upstream file before editing.
 */
340 static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
341  const int16_t **lumSrc, int lumFilterSize,
342  const int16_t *chrFilter, const int16_t **chrUSrc,
343  const int16_t **chrVSrc,
344  int chrFilterSize, const int16_t **alpSrc,
345  uint8_t *dest, int dstW, int dstY)
346 {
347  x86_reg dummy=0;
348  x86_reg dstW_reg = dstW;
349  x86_reg uv_off = c->uv_offx2;
350 
351  if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
354  "movq %%mm2, "U_TEMP"(%0) \n\t"
355  "movq %%mm4, "V_TEMP"(%0) \n\t"
356  "movq %%mm5, "Y_TEMP"(%0) \n\t"
358  "movq "Y_TEMP"(%0), %%mm5 \n\t"
359  "psraw $3, %%mm1 \n\t"
360  "psraw $3, %%mm7 \n\t"
361  "packuswb %%mm7, %%mm1 \n\t"
362  WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
364  } else {
367  "pcmpeqd %%mm7, %%mm7 \n\t"
368  WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
370  }
371 }
372 
/*
 * yuv2rgb32_X: multi-tap vertical scale to a 32-bit packed output row.
 *
 * Locals dummy, dstW_reg and uv_off (= c->uv_offx2) are the names consumed
 * by YSCALEYUV2PACKEDX_END. The body branches on
 * CONFIG_SWSCALE_ALPHA && c->needAlpha:
 *  - alpha branch: the alpha plane is filtered with YSCALEYUV2PACKEDX_YA
 *    using ALP_MMX_FILTER_OFFSET into mm1/mm7, shifted right by 3, packed
 *    into mm1 and written with WRITEBGR32(b=mm2, g=mm4, r=mm5, a=mm1).
 *  - otherwise: mm7 is set to all ones and used as the alpha channel.
 *
 * NOTE(review): the embedded listing numbers skip lines inside both
 * branches; the lines that open/close the asm statements and produce
 * mm2/mm4/mm5 are not visible in this extract - confirm against upstream.
 */
373 static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
374  const int16_t **lumSrc, int lumFilterSize,
375  const int16_t *chrFilter, const int16_t **chrUSrc,
376  const int16_t **chrVSrc,
377  int chrFilterSize, const int16_t **alpSrc,
378  uint8_t *dest, int dstW, int dstY)
379 {
380  x86_reg dummy=0;
381  x86_reg dstW_reg = dstW;
382  x86_reg uv_off = c->uv_offx2;
383 
384  if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
387  YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
388  "psraw $3, %%mm1 \n\t"
389  "psraw $3, %%mm7 \n\t"
390  "packuswb %%mm7, %%mm1 \n\t"
391  WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
393  } else {
396  "pcmpeqd %%mm7, %%mm7 \n\t"
397  WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
399  }
400 }
401 
/*
 * yuv2bgr32_X: same structure as yuv2rgb32_X, but WRITEBGR32 is invoked
 * with the first and third colour registers exchanged (b=mm5, r=mm2), so
 * the two outer colour bytes of every pixel are stored in the opposite
 * order.
 *
 * Alpha handling: when CONFIG_SWSCALE_ALPHA && c->needAlpha, the alpha
 * plane is filtered via YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, ...)
 * into mm1/mm7, shifted right by 3 and packed into mm1; otherwise mm7 is
 * set to all ones and used as alpha.
 *
 * NOTE(review): the embedded listing numbers skip lines inside both
 * branches; the asm statement open/close lines and the macros that produce
 * mm2/mm4/mm5 are not visible in this extract - confirm against upstream.
 */
402 static void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter,
403  const int16_t **lumSrc, int lumFilterSize,
404  const int16_t *chrFilter, const int16_t **chrUSrc,
405  const int16_t **chrVSrc,
406  int chrFilterSize, const int16_t **alpSrc,
407  uint8_t *dest, int dstW, int dstY)
408 {
409  x86_reg dummy=0;
410  x86_reg dstW_reg = dstW;
411  x86_reg uv_off = c->uv_offx2;
412 
413  if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
416  YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
417  "psraw $3, %%mm1 \n\t"
418  "psraw $3, %%mm7 \n\t"
419  "packuswb %%mm7, %%mm1 \n\t"
420  WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
422  } else {
425  "pcmpeqd %%mm7, %%mm7 \n\t"
426  WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
428  }
429 }
430 
/*
 * REAL_WRITERGB16 / WRITERGB16: pack eight pixels to 16 bits each (5 bits
 * from mm5 in the top, 6 bits from mm4 in the middle, 5 bits from mm2 at
 * the bottom), store them and close the per-8-pixel loop.
 *
 * In:  mm2 = B, mm4 = G, mm5 = R (eight bytes each), mm7 = 0.
 * dst/index: stores go to dst + index*2 (+0 and +8); index advances by 8.
 * dstw: string operand compared against index; "jb 1b" loops while index
 *       is below it (unsigned compare).
 * The channel masks are the bF8 and bFC constants referenced via MANGLE.
 * Clobbers mm1..mm5. WRITERGB16 only forwards to REAL_WRITERGB16 so that
 * macro arguments are expanded before being stringified.
 */
431 #define REAL_WRITERGB16(dst, dstw, index) \
432  "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
433  "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
434  "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
435  "psrlq $3, %%mm2 \n\t"\
436 \
437  "movq %%mm2, %%mm1 \n\t"\
438  "movq %%mm4, %%mm3 \n\t"\
439 \
440  "punpcklbw %%mm7, %%mm3 \n\t"\
441  "punpcklbw %%mm5, %%mm2 \n\t"\
442  "punpckhbw %%mm7, %%mm4 \n\t"\
443  "punpckhbw %%mm5, %%mm1 \n\t"\
444 \
445  "psllq $3, %%mm3 \n\t"\
446  "psllq $3, %%mm4 \n\t"\
447 \
448  "por %%mm3, %%mm2 \n\t"\
449  "por %%mm4, %%mm1 \n\t"\
450 \
451  MOVNTQ(%%mm2, (dst, index, 2))\
452  MOVNTQ(%%mm1, 8(dst, index, 2))\
453 \
454  "add $8, "#index" \n\t"\
455  "cmp "dstw", "#index" \n\t"\
456  " jb 1b \n\t"
457 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
458 
/*
 * yuv2rgb565_X_ar: multi-tap vertical scale to a 16-bit (5-6-5) output row.
 *
 * Visible steps: set up dummy/dstW_reg/uv_off (the names consumed by
 * YSCALEYUV2PACKEDX_END), clear mm7, optionally add the per-channel dither
 * bytes stored at BLUE_DITHER/GREEN_DITHER/RED_DITHER(%0) with unsigned
 * saturation when DITHER1XBPP is defined, then pack and store through
 * WRITERGB16 with dst=%4, bound=%5 and REG_a as index.
 *
 * NOTE(review): the embedded listing numbers skip lines before the pxor
 * and after WRITERGB16; the lines that open the asm statement, run the
 * scaling/conversion macros and supply the operand list are not visible in
 * this extract. The "_ar" suffix presumably selects the
 * YSCALEYUV2PACKEDX_ACCURATE path - confirm against upstream.
 */
459 static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
460  const int16_t **lumSrc, int lumFilterSize,
461  const int16_t *chrFilter, const int16_t **chrUSrc,
462  const int16_t **chrVSrc,
463  int chrFilterSize, const int16_t **alpSrc,
464  uint8_t *dest, int dstW, int dstY)
465 {
466  x86_reg dummy=0;
467  x86_reg dstW_reg = dstW;
468  x86_reg uv_off = c->uv_offx2;
469 
472  "pxor %%mm7, %%mm7 \n\t"
473  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
474 #ifdef DITHER1XBPP
475  "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
476  "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
477  "paddusb "RED_DITHER"(%0), %%mm5\n\t"
478 #endif
479  WRITERGB16(%4, "%5", %%FF_REGa)
481 }
482 
/*
 * yuv2rgb565_X: multi-tap vertical scale to a 16-bit (5-6-5) output row.
 *
 * Visible steps: set up dummy/dstW_reg/uv_off (the names consumed by
 * YSCALEYUV2PACKEDX_END), clear mm7, optionally add the dither bytes at
 * BLUE_DITHER/GREEN_DITHER/RED_DITHER(%0) with unsigned saturation when
 * DITHER1XBPP is defined, then pack and store through WRITERGB16 with
 * dst=%4, bound=%5 and REG_a as index.
 *
 * NOTE(review): the embedded listing numbers skip lines before the pxor
 * and after WRITERGB16; the asm statement open/close lines and the macros
 * that produce mm2/mm4/mm5 are not visible in this extract - confirm
 * against upstream.
 */
483 static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
484  const int16_t **lumSrc, int lumFilterSize,
485  const int16_t *chrFilter, const int16_t **chrUSrc,
486  const int16_t **chrVSrc,
487  int chrFilterSize, const int16_t **alpSrc,
488  uint8_t *dest, int dstW, int dstY)
489 {
490  x86_reg dummy=0;
491  x86_reg dstW_reg = dstW;
492  x86_reg uv_off = c->uv_offx2;
493 
496  "pxor %%mm7, %%mm7 \n\t"
497  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
498 #ifdef DITHER1XBPP
499  "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
500  "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
501  "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
502 #endif
503  WRITERGB16(%4, "%5", %%FF_REGa)
505 }
506 
/*
 * REAL_WRITERGB15 / WRITERGB15: pack eight pixels to 15 bits each (5 bits
 * per channel: mm5 on top, mm4 in the middle, mm2 at the bottom), store
 * them and close the per-8-pixel loop.
 *
 * Differs from REAL_WRITERGB16 in that all three channels are masked with
 * bF8, mm5 is additionally shifted right by 1 and the middle channel is
 * shifted left by 2 instead of 3.
 *
 * In:  mm2 = B, mm4 = G, mm5 = R (eight bytes each), mm7 = 0.
 * dst/index: stores go to dst + index*2 (+0 and +8); index advances by 8.
 * dstw: string operand compared against index; "jb 1b" loops while index
 *       is below it (unsigned compare).
 * Clobbers mm1..mm5.
 */
507 #define REAL_WRITERGB15(dst, dstw, index) \
508  "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
509  "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
510  "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
511  "psrlq $3, %%mm2 \n\t"\
512  "psrlq $1, %%mm5 \n\t"\
513 \
514  "movq %%mm2, %%mm1 \n\t"\
515  "movq %%mm4, %%mm3 \n\t"\
516 \
517  "punpcklbw %%mm7, %%mm3 \n\t"\
518  "punpcklbw %%mm5, %%mm2 \n\t"\
519  "punpckhbw %%mm7, %%mm4 \n\t"\
520  "punpckhbw %%mm5, %%mm1 \n\t"\
521 \
522  "psllq $2, %%mm3 \n\t"\
523  "psllq $2, %%mm4 \n\t"\
524 \
525  "por %%mm3, %%mm2 \n\t"\
526  "por %%mm4, %%mm1 \n\t"\
527 \
528  MOVNTQ(%%mm2, (dst, index, 2))\
529  MOVNTQ(%%mm1, 8(dst, index, 2))\
530 \
531  "add $8, "#index" \n\t"\
532  "cmp "dstw", "#index" \n\t"\
533  " jb 1b \n\t"
534 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
535 
/*
 * yuv2rgb555_X_ar: multi-tap vertical scale to a 15-bit (5-5-5) output row.
 *
 * Visible steps: set up dummy/dstW_reg/uv_off (the names consumed by
 * YSCALEYUV2PACKEDX_END), clear mm7, optionally add the dither bytes at
 * BLUE_DITHER/GREEN_DITHER/RED_DITHER(%0) with unsigned saturation when
 * DITHER1XBPP is defined, then pack and store through WRITERGB15 with
 * dst=%4, bound=%5 and REG_a as index.
 *
 * NOTE(review): the embedded listing numbers skip lines before the pxor
 * and after WRITERGB15; the asm statement open/close lines and the macros
 * that produce mm2/mm4/mm5 are not visible in this extract. The "_ar"
 * suffix presumably selects the YSCALEYUV2PACKEDX_ACCURATE path - confirm
 * against upstream.
 */
536 static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
537  const int16_t **lumSrc, int lumFilterSize,
538  const int16_t *chrFilter, const int16_t **chrUSrc,
539  const int16_t **chrVSrc,
540  int chrFilterSize, const int16_t **alpSrc,
541  uint8_t *dest, int dstW, int dstY)
542 {
543  x86_reg dummy=0;
544  x86_reg dstW_reg = dstW;
545  x86_reg uv_off = c->uv_offx2;
546 
549  "pxor %%mm7, %%mm7 \n\t"
550  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
551 #ifdef DITHER1XBPP
552  "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
553  "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
554  "paddusb "RED_DITHER"(%0), %%mm5\n\t"
555 #endif
556  WRITERGB15(%4, "%5", %%FF_REGa)
558 }
559 
/*
 * yuv2rgb555_X: multi-tap vertical scale to a 15-bit (5-5-5) output row.
 *
 * Visible steps: set up dummy/dstW_reg/uv_off (the names consumed by
 * YSCALEYUV2PACKEDX_END), clear mm7, optionally add the dither bytes at
 * BLUE_DITHER/GREEN_DITHER/RED_DITHER(%0) with unsigned saturation when
 * DITHER1XBPP is defined, then pack and store through WRITERGB15 with
 * dst=%4, bound=%5 and REG_a as index.
 *
 * NOTE(review): the embedded listing numbers skip lines before the pxor
 * and after WRITERGB15; the asm statement open/close lines and the macros
 * that produce mm2/mm4/mm5 are not visible in this extract - confirm
 * against upstream.
 */
560 static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
561  const int16_t **lumSrc, int lumFilterSize,
562  const int16_t *chrFilter, const int16_t **chrUSrc,
563  const int16_t **chrVSrc,
564  int chrFilterSize, const int16_t **alpSrc,
565  uint8_t *dest, int dstW, int dstY)
566 {
567  x86_reg dummy=0;
568  x86_reg dstW_reg = dstW;
569  x86_reg uv_off = c->uv_offx2;
570 
573  "pxor %%mm7, %%mm7 \n\t"
574  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
575 #ifdef DITHER1XBPP
576  "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
577  "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
578  "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
579 #endif
580  WRITERGB15(%4, "%5", %%FF_REGa)
582 }
583 
/*
 * WRITEBGR24MMX: pack eight pixels into 24 bytes (3 bytes per pixel) using
 * only baseline MMX shifts and unpacks, store them and close the
 * per-8-pixel loop.
 *
 * In:  mm2 = B, mm4 = G, mm5 = R (eight bytes each), mm7 = 0.
 * dst    register holding the current output address; three 8-byte stores
 *        go to (dst), 8(dst) and 16(dst), then dst is advanced by 24.
 * index  pixel index register, advanced by 8.
 * dstw   string operand compared against index; "jb 1b" loops while index
 *        is below it (unsigned compare).
 * All of mm0..mm7 are overwritten.
 */
584 #define WRITEBGR24MMX(dst, dstw, index) \
585  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
586  "movq %%mm2, %%mm1 \n\t" /* B */\
587  "movq %%mm5, %%mm6 \n\t" /* R */\
588  "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
589  "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
590  "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
591  "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
592  "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
593  "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
594  "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
595  "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
596  "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
597  "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
598 \
599  "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
600  "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
601  "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
602  "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
603 \
604  "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
605  "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
606  "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
607  "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
608 \
609  "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
610  "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
611  "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
612  "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
613 \
614  "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
615  "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
616  "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
617  "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
618  MOVNTQ(%%mm0, (dst))\
619 \
620  "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
621  "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
622  "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
623  "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
624  MOVNTQ(%%mm6, 8(dst))\
625 \
626  "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
627  "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
628  "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
629  MOVNTQ(%%mm5, 16(dst))\
630 \
631  "add $24, "#dst" \n\t"\
632 \
633  "add $8, "#index" \n\t"\
634  "cmp "dstw", "#index" \n\t"\
635  " jb 1b \n\t"
636 
/*
 * WRITEBGR24MMXEXT: pack eight pixels into 24 bytes (3 bytes per pixel)
 * using pshufw and the byte masks ff_M24A, ff_M24B and ff_M24C, store them
 * and close the per-8-pixel loop.
 *
 * In:  mm2 = B, mm4 = G, mm5 = R (eight bytes each). mm7 is overwritten
 *      with ff_M24C at once, so its incoming value is not used here.
 * dst    register holding the current output address; three 8-byte stores
 *        go to (dst), 8(dst) and 16(dst), then dst is advanced by 24.
 * index  pixel index register, advanced by 8.
 * dstw   string operand compared against index; "jb 1b" loops while index
 *        is below it (unsigned compare).
 * Clobbers mm0, mm1, mm3, mm4, mm6 and mm7.
 */
637 #define WRITEBGR24MMXEXT(dst, dstw, index) \
638  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
639  "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
640  "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
641  "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
642  "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
643  "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
644 \
645  "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
646  "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
647  "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
648 \
649  "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
650  "por %%mm1, %%mm6 \n\t"\
651  "por %%mm3, %%mm6 \n\t"\
652  MOVNTQ(%%mm6, (dst))\
653 \
654  "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
655  "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
656  "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
657  "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
658 \
659  "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
660  "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
661  "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
662 \
663  "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
664  "por %%mm3, %%mm6 \n\t"\
665  MOVNTQ(%%mm6, 8(dst))\
666 \
667  "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
668  "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
669  "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
670 \
671  "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
672  "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
673  "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
674 \
675  "por %%mm1, %%mm3 \n\t"\
676  "por %%mm3, %%mm6 \n\t"\
677  MOVNTQ(%%mm6, 16(dst))\
678 \
679  "add $24, "#dst" \n\t"\
680 \
681  "add $8, "#index" \n\t"\
682  "cmp "dstw", "#index" \n\t"\
683  " jb 1b \n\t"
684 
/*
 * WRITEBGR24 resolves to the pshufw-based writer when
 * COMPILE_TEMPLATE_MMXEXT is non-zero and to the baseline MMX writer
 * otherwise. Any earlier definition is dropped first in both branches.
 */
685 #if COMPILE_TEMPLATE_MMXEXT
686 #undef WRITEBGR24
687 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
688 #else
689 #undef WRITEBGR24
690 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
691 #endif
692 
693 #if HAVE_6REGS
/*
 * yuv2bgr24_X_ar: multi-tap vertical scale to a 24-bit packed output row.
 * Only compiled when HAVE_6REGS is set (it needs REG_c in addition to
 * REG_a, REG_d and REG_S).
 *
 * Visible steps: clear mm7, compute REG_c = REG_a * 3 and add operand %4
 * (dest) so that REG_c is the byte address of the current pixel group,
 * then hand REG_c, "%5" (dstW_reg) and REG_a to WRITEBGR24.
 * The operand list is spelled out here instead of using
 * YSCALEYUV2PACKEDX_END because it names ff_M24A/ff_M24C/ff_M24B through
 * NAMED_CONSTRAINTS_ADD and adds REG_c to the clobbers; the operand
 * numbering (%0 = &c->redDither, %4 = dest, %5 = dstW_reg, %6 = uv_off) is
 * the same.
 *
 * NOTE(review): the embedded listing numbers skip lines before the pxor;
 * the line opening the asm statement and the scaling/conversion macros are
 * not visible in this extract. The "_ar" suffix presumably selects the
 * YSCALEYUV2PACKEDX_ACCURATE path - confirm against upstream.
 */
694 static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
695  const int16_t **lumSrc, int lumFilterSize,
696  const int16_t *chrFilter, const int16_t **chrUSrc,
697  const int16_t **chrVSrc,
698  int chrFilterSize, const int16_t **alpSrc,
699  uint8_t *dest, int dstW, int dstY)
700 {
701  x86_reg dummy=0;
702  x86_reg dstW_reg = dstW;
703  x86_reg uv_off = c->uv_offx2;
704 
707  "pxor %%mm7, %%mm7 \n\t"
708  "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t" //FIXME optimize
709  "add %4, %%"FF_REG_c" \n\t"
710  WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
711  :: "r" (&c->redDither),
712  "m" (dummy), "m" (dummy), "m" (dummy),
713  "r" (dest), "m" (dstW_reg), "m"(uv_off)
714  NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
715  : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
716  );
717 }
718 
/*
 * yuv2bgr24_X: multi-tap vertical scale to a 24-bit packed output row.
 * Only compiled when HAVE_6REGS is set (it needs REG_c in addition to
 * REG_a, REG_d and REG_S).
 *
 * Visible steps: clear mm7, compute REG_c = REG_a * 3 and add operand %4
 * (dest) so that REG_c is the byte address of the current pixel group,
 * then hand REG_c, "%5" (dstW_reg) and REG_a to WRITEBGR24.
 * Operand numbering: %0 = &c->redDither, %1..%3 = dummy, %4 = dest,
 * %5 = dstW_reg, %6 = uv_off; ff_M24A/ff_M24C/ff_M24B are passed through
 * NAMED_CONSTRAINTS_ADD.
 *
 * NOTE(review): the embedded listing numbers skip lines before the pxor;
 * the line opening the asm statement and the macros that produce
 * mm2/mm4/mm5 are not visible in this extract - confirm against upstream.
 */
719 static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
720  const int16_t **lumSrc, int lumFilterSize,
721  const int16_t *chrFilter, const int16_t **chrUSrc,
722  const int16_t **chrVSrc,
723  int chrFilterSize, const int16_t **alpSrc,
724  uint8_t *dest, int dstW, int dstY)
725 {
726  x86_reg dummy=0;
727  x86_reg dstW_reg = dstW;
728  x86_reg uv_off = c->uv_offx2;
729 
732  "pxor %%mm7, %%mm7 \n\t"
733  "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" //FIXME optimize
734  "add %4, %%"FF_REG_c" \n\t"
735  WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
736  :: "r" (&c->redDither),
737  "m" (dummy), "m" (dummy), "m" (dummy),
738  "r" (dest), "m" (dstW_reg), "m"(uv_off)
739  NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
740  : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
741  );
742 }
743 #endif /* HAVE_6REGS */
744 
/*
 * REAL_WRITEYUY2 / WRITEYUY2: pack eight pixels into 16 bytes of
 * interleaved output, store them and close the per-8-pixel loop.
 *
 * In:  mm1 and mm7 = eight 16-bit samples of the full-resolution plane,
 *      mm3 and mm4 = four 16-bit samples each of the two half-resolution
 *      planes. All are narrowed to bytes with unsigned saturation.
 * The byte order written is: sample from mm1/mm7, sample from mm3, next
 * sample from mm1/mm7, sample from mm4, and so on.
 * dst/index: stores go to dst + index*2 (+0 and +8); index advances by 8.
 * dstw: string operand compared against index; "jb 1b" loops while index
 *       is below it (unsigned compare).
 * WRITEYUY2 only forwards to REAL_WRITEYUY2 so that macro arguments are
 * expanded before being stringified.
 */
745 #define REAL_WRITEYUY2(dst, dstw, index) \
746  "packuswb %%mm3, %%mm3 \n\t"\
747  "packuswb %%mm4, %%mm4 \n\t"\
748  "packuswb %%mm7, %%mm1 \n\t"\
749  "punpcklbw %%mm4, %%mm3 \n\t"\
750  "movq %%mm1, %%mm7 \n\t"\
751  "punpcklbw %%mm3, %%mm1 \n\t"\
752  "punpckhbw %%mm3, %%mm7 \n\t"\
753 \
754  MOVNTQ(%%mm1, (dst, index, 2))\
755  MOVNTQ(%%mm7, 8(dst, index, 2))\
756 \
757  "add $8, "#index" \n\t"\
758  "cmp "dstw", "#index" \n\t"\
759  " jb 1b \n\t"
760 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
761 
/*
 * yuv2yuyv422_X_ar: multi-tap vertical scale to a packed 4:2:2 output row.
 *
 * Visible steps: set up dummy/dstW_reg/uv_off (the names consumed by
 * YSCALEYUV2PACKEDX_END), shift mm3, mm4, mm1 and mm7 right by 3 and pass
 * them to WRITEYUY2 with dst=%4, bound=%5 and REG_a as index. No colour
 * conversion takes place on this path.
 *
 * NOTE(review): the embedded listing numbers skip lines before the first
 * psraw and after WRITEYUY2; the asm statement open/close lines and the
 * scaling macro are not visible in this extract. The "_ar" suffix
 * presumably selects the YSCALEYUV2PACKEDX_ACCURATE path - confirm against
 * upstream.
 */
762 static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
763  const int16_t **lumSrc, int lumFilterSize,
764  const int16_t *chrFilter, const int16_t **chrUSrc,
765  const int16_t **chrVSrc,
766  int chrFilterSize, const int16_t **alpSrc,
767  uint8_t *dest, int dstW, int dstY)
768 {
769  x86_reg dummy=0;
770  x86_reg dstW_reg = dstW;
771  x86_reg uv_off = c->uv_offx2;
772 
774  /* mm1/mm7 = luma, mm3 = U, mm4 = V as left by the YSCALEYUV2PACKEDX* macros (the original B/G/R remark did not apply here) */
775  "psraw $3, %%mm3 \n\t"
776  "psraw $3, %%mm4 \n\t"
777  "psraw $3, %%mm1 \n\t"
778  "psraw $3, %%mm7 \n\t"
779  WRITEYUY2(%4, "%5", %%FF_REGa)
781 }
782 
/*
 * yuv2yuyv422_X: multi-tap vertical scale to a packed 4:2:2 output row.
 *
 * Visible steps: set up dummy/dstW_reg/uv_off (the names consumed by
 * YSCALEYUV2PACKEDX_END), shift mm3, mm4, mm1 and mm7 right by 3 and pass
 * them to WRITEYUY2 with dst=%4, bound=%5 and REG_a as index. No colour
 * conversion takes place on this path.
 *
 * NOTE(review): the embedded listing numbers skip lines before the first
 * psraw and after WRITEYUY2; the asm statement open/close lines and the
 * scaling macro are not visible in this extract - confirm against
 * upstream.
 */
783 static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
784  const int16_t **lumSrc, int lumFilterSize,
785  const int16_t *chrFilter, const int16_t **chrUSrc,
786  const int16_t **chrVSrc,
787  int chrFilterSize, const int16_t **alpSrc,
788  uint8_t *dest, int dstW, int dstY)
789 {
790  x86_reg dummy=0;
791  x86_reg dstW_reg = dstW;
792  x86_reg uv_off = c->uv_offx2;
793 
795  /* mm1/mm7 = luma, mm3 = U, mm4 = V as left by the YSCALEYUV2PACKEDX* macros (the original B/G/R remark did not apply here) */
796  "psraw $3, %%mm3 \n\t"
797  "psraw $3, %%mm4 \n\t"
798  "psraw $3, %%mm1 \n\t"
799  "psraw $3, %%mm7 \n\t"
800  WRITEYUY2(%4, "%5", %%FF_REGa)
802 }
803 
/*
 * REAL_YSCALEYUV2RGB_UV(index, c): chroma part of the two-row vertical
 * blend followed by the first chroma multiplications of the RGB
 * conversion.
 *
 * index  register used as pixel index; it is zeroed here and label 1
 *        marks the head of the per-8-pixel loop.
 * c      register holding the base address for the *_OFFSET / *_COEFF
 *        displacements.
 * Operands %2 and %3 are the two chroma row pointers. The second chroma
 * component is read at index + UV_OFF_BYTE(c); the "+2048" in the inline
 * comments refers to that offset.
 *
 * The blend is computed as row1>>4 + ((row0 - row1) * coeff >> 16), with
 * the coefficient taken from CHR_MMX_FILTER_OFFSET+8(c).
 * Out: mm2 = U minus offset, mm5 = V minus offset, mm3 = U * UG_COEFF,
 *      mm4 = V * VG_COEFF (high words of the products). mm0 is scratch.
 */
804 #define REAL_YSCALEYUV2RGB_UV(index, c) \
805  "xor "#index", "#index" \n\t"\
806  ".p2align 4 \n\t"\
807  "1: \n\t"\
808  "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
809  "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
810  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
811  "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
812  "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
813  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
814  "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
815  "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
816  "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
817  "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
818  "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
819  "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
820  "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
821  "paddw %%mm2, %%mm3 \n\t" /* uvbuf1[eax]>>4 + (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
822  "paddw %%mm5, %%mm4 \n\t" /* uvbuf1[eax+2048]>>4 + (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
823  "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
824  "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
825  "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
826  "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
827  "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
828  "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
829  /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
830 
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2): two-row vertical blend of a
 * full-resolution plane (used in this file for luma and for alpha).
 *
 * index   pixel index register; eight 16-bit samples are read from each
 *         row at (row, index, 2) and 8(row, index, 2)
 * c       base register for LUM_MMX_FILTER_OFFSET+8, the blend coefficient
 * b1, b2  registers holding the two row pointers
 *
 * Computes b2>>4 + ((b1 - b2) * coeff >> 16).
 * Out: mm1 = first four blended samples, mm7 = next four.
 * Clobbers mm0 and mm6.
 */
831 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
832  "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
833  "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
834  "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
835  "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
836  "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
837  "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
838  "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
839  "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
840  "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
841  "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
842  "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
843  "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
844 
/*
 * REAL_YSCALEYUV2RGB_COEFF(c): remaining part of the YUV to RGB
 * conversion, with the coefficient table addressed through register c.
 *
 * In:  mm2 = U minus offset, mm5 = V minus offset, mm3 = ug, mm4 = vg
 *      (as left by REAL_YSCALEYUV2RGB_UV), mm1 / mm7 = luma for the first
 *      and next four pixels.
 * Out: mm2 = B, mm4 = G, mm5 = R as eight unsigned-saturated bytes each
 *      (per the register notes inside the macro and the final packuswb).
 * mm0, mm3 and mm6 are used as scratch. Each chroma word is duplicated
 * with punpck{l,h}wd so one chroma sample serves two luma samples.
 */
845 #define REAL_YSCALEYUV2RGB_COEFF(c) \
846  "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
847  "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
848  "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
849  "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
850  "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
851  "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
852  /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
853  "paddw %%mm3, %%mm4 \n\t"\
854  "movq %%mm2, %%mm0 \n\t"\
855  "movq %%mm5, %%mm6 \n\t"\
856  "movq %%mm4, %%mm3 \n\t"\
857  "punpcklwd %%mm2, %%mm2 \n\t"\
858  "punpcklwd %%mm5, %%mm5 \n\t"\
859  "punpcklwd %%mm4, %%mm4 \n\t"\
860  "paddw %%mm1, %%mm2 \n\t"\
861  "paddw %%mm1, %%mm5 \n\t"\
862  "paddw %%mm1, %%mm4 \n\t"\
863  "punpckhwd %%mm0, %%mm0 \n\t"\
864  "punpckhwd %%mm6, %%mm6 \n\t"\
865  "punpckhwd %%mm3, %%mm3 \n\t"\
866  "paddw %%mm7, %%mm0 \n\t"\
867  "paddw %%mm7, %%mm6 \n\t"\
868  "paddw %%mm7, %%mm3 \n\t"\
869  /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
870  "packuswb %%mm0, %%mm2 \n\t"\
871  "packuswb %%mm6, %%mm5 \n\t"\
872  "packuswb %%mm3, %%mm4 \n\t"\
873 
/*
 * YSCALEYUV2RGB_YA forwards to REAL_YSCALEYUV2RGB_YA so that its arguments
 * are macro-expanded before being stringified.
 *
 * YSCALEYUV2RGB(index, c) chains the three stages of the two-row path:
 * chroma blend, luma blend (rows taken from operands %0 and %1) and the
 * coefficient stage, leaving mm2 = B, mm4 = G, mm5 = R.
 */
874 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
875 
876 #define YSCALEYUV2RGB(index, c) \
877  REAL_YSCALEYUV2RGB_UV(index, c) \
878  REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
879  REAL_YSCALEYUV2RGB_COEFF(c)
880 
881 /**
882  * vertical bilinear scale YV12 to RGB
883  */
/*
 * yuv2rgb32_2: blend two input rows vertically and write one 32-bit packed
 * output row.
 *
 * c       context; &c->redDither is the base address for all *_OFFSET
 *         displacements, and c->u_temp / c->v_temp are used as scratch on
 *         the 32-bit alpha path.
 * buf     two luma rows (buf[0], buf[1])
 * ubuf    two chroma rows (ubuf[0], ubuf[1])
 * abuf    two alpha rows, only read when CONFIG_SWSCALE_ALPHA is set and
 *         c->needAlpha is non-zero
 * dest    output row
 * vbuf, dstW, yalpha, uvalpha, y are not referenced directly in the body;
 * the loop bound is read from memory at DSTW_OFFSET relative to the
 * context base.
 *
 * Operand numbering in every asm statement: %0 = buf0 (REG c),
 * %1 = buf1 (REG d), %2 = ubuf0 (REG S), %3 = ubuf1 (REG D), %4 = dest,
 * %5 = &c->redDither (REG a); the x86-64 alpha variant adds %6 = abuf0 and
 * %7 = abuf1.
 *
 * Three variants:
 *  - alpha, x86-64: r8 is the pixel index and is declared clobbered.
 *  - alpha, 32-bit: REG_b is saved to ESP_OFFSET(%5) and loaded with dest,
 *    REG_BP is pushed and used as the pixel index; %0/%1 are pushed and
 *    temporarily replaced by the alpha row pointers taken from
 *    U_TEMP/V_TEMP(%5), then popped again before the store.
 *  - no alpha: same register juggling as the 32-bit alpha variant, with
 *    mm7 set to all ones as the alpha channel.
 * REG_b and REG_BP are saved and restored by hand instead of being listed
 * as clobbers.
 */
884 static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
885  const int16_t *ubuf[2], const int16_t *vbuf[2],
886  const int16_t *abuf[2], uint8_t *dest,
887  int dstW, int yalpha, int uvalpha, int y)
888 {
889  const int16_t *buf0 = buf[0], *buf1 = buf[1],
890  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
891 
892  if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
893  const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
894 #if ARCH_X86_64
895  __asm__ volatile(
896  YSCALEYUV2RGB(%%r8, %5)
897  YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
898  "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
899  "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
900  "packuswb %%mm7, %%mm1 \n\t"
901  WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
902  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
903  "a" (&c->redDither),
904  "r" (abuf0), "r" (abuf1)
905  : "%r8"
906  );
907 #else
 /* park the alpha row pointers in the context so the asm can fetch them
  * through U_TEMP/V_TEMP(%5) */
908  c->u_temp=(intptr_t)abuf0;
909  c->v_temp=(intptr_t)abuf1;
910  __asm__ volatile(
911  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
912  "mov %4, %%"FF_REG_b" \n\t"
913  "push %%"FF_REG_BP" \n\t"
914  YSCALEYUV2RGB(%%FF_REGBP, %5)
915  "push %0 \n\t"
916  "push %1 \n\t"
917  "mov "U_TEMP"(%5), %0 \n\t"
918  "mov "V_TEMP"(%5), %1 \n\t"
919  YSCALEYUV2RGB_YA(%%FF_REGBP, %5, %0, %1)
920  "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
921  "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
922  "packuswb %%mm7, %%mm1 \n\t"
923  "pop %1 \n\t"
924  "pop %0 \n\t"
925  WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
926  "pop %%"FF_REG_BP" \n\t"
927  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
928  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
929  "a" (&c->redDither)
930  );
931 #endif
932  } else {
933  __asm__ volatile(
934  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
935  "mov %4, %%"FF_REG_b" \n\t"
936  "push %%"FF_REG_BP" \n\t"
937  YSCALEYUV2RGB(%%FF_REGBP, %5)
938  "pcmpeqd %%mm7, %%mm7 \n\t"
939  WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
940  "pop %%"FF_REG_BP" \n\t"
941  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
942  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
943  "a" (&c->redDither)
944  );
945  }
946 }
947 
/**
 * Two-line YUV to BGR24 output: YSCALEYUV2RGB followed by WRITEBGR24.
 *
 * Asm operands: %0 = buf[0], %1 = buf[1], %2 = ubuf[0], %3 = ubuf[1],
 * %4 = dest, %5 = &c->redDither (base for the context-relative displacements).
 * The constants ff_M24A, ff_M24C and ff_M24B are added as named constraints.
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced here.
 */
948 static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
949  const int16_t *ubuf[2], const int16_t *vbuf[2],
950  const int16_t *abuf[2], uint8_t *dest,
951  int dstW, int yalpha, int uvalpha, int y)
952 {
953  const int16_t *buf0 = buf[0], *buf1 = buf[1],
954  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
955 
956  __asm__ volatile(
957  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save FF_REG_b in the context */
958  "mov %4, %%"FF_REG_b" \n\t" /* FF_REG_b = dest */
959  "push %%"FF_REG_BP" \n\t" /* FF_REG_BP becomes the index register */
960  YSCALEYUV2RGB(%%FF_REGBP, %5)
961  "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 */
962  WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
963  "pop %%"FF_REG_BP" \n\t"
964  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore FF_REG_b */
965  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
966  "a" (&c->redDither)
967  NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
968  );
969 }
970 
/**
 * Two-line YUV to RGB555 output: YSCALEYUV2RGB, optional dithering, then
 * WRITERGB15.
 *
 * Asm operands: %0 = buf[0], %1 = buf[1], %2 = ubuf[0], %3 = ubuf[1],
 * %4 = dest, %5 = &c->redDither (base for the context-relative displacements).
 * With DITHER1XBPP defined, the blue/green/red dither values stored in the
 * context are added with unsigned saturation before the pixels are written.
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced here.
 *
 * The bF8 constant is declared through NAMED_CONSTRAINTS_ADD, as
 * yuv2rgb565_2 does for bF8/bFC, so that the asm can refer to it as a named
 * operand on builds that cannot reference symbols directly.
 * NOTE(review): WRITERGB15 is expected to use the bF8 mask -- confirm against
 * its definition earlier in the file.
 */
971 static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
972  const int16_t *ubuf[2], const int16_t *vbuf[2],
973  const int16_t *abuf[2], uint8_t *dest,
974  int dstW, int yalpha, int uvalpha, int y)
975 {
976  const int16_t *buf0 = buf[0], *buf1 = buf[1],
977  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
978 
979  __asm__ volatile(
980  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save FF_REG_b in the context */
981  "mov %4, %%"FF_REG_b" \n\t" /* FF_REG_b = dest */
982  "push %%"FF_REG_BP" \n\t" /* FF_REG_BP becomes the index register */
983  YSCALEYUV2RGB(%%FF_REGBP, %5)
984  "pxor %%mm7, %%mm7 \n\t"
985  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
986 #ifdef DITHER1XBPP
987  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
988  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
989  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
990 #endif
991  WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
992  "pop %%"FF_REG_BP" \n\t"
993  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore FF_REG_b */
994  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
995  "a" (&c->redDither)
996  NAMED_CONSTRAINTS_ADD(bF8)
997  );
998 }
999 
/**
 * Two-line YUV to RGB565 output: YSCALEYUV2RGB, optional dithering, then
 * WRITERGB16.
 *
 * Asm operands: %0 = buf[0], %1 = buf[1], %2 = ubuf[0], %3 = ubuf[1],
 * %4 = dest, %5 = &c->redDither (base for the context-relative displacements).
 * With DITHER1XBPP defined, the blue/green/red dither values stored in the
 * context are added with unsigned saturation before the pixels are written.
 * The constants bF8 and bFC are added as named constraints.
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced here.
 */
1000 static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
1001  const int16_t *ubuf[2], const int16_t *vbuf[2],
1002  const int16_t *abuf[2], uint8_t *dest,
1003  int dstW, int yalpha, int uvalpha, int y)
1004 {
1005  const int16_t *buf0 = buf[0], *buf1 = buf[1],
1006  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1007 
1008  __asm__ volatile(
1009  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save FF_REG_b in the context */
1010  "mov %4, %%"FF_REG_b" \n\t" /* FF_REG_b = dest */
1011  "push %%"FF_REG_BP" \n\t" /* FF_REG_BP becomes the index register */
1012  YSCALEYUV2RGB(%%FF_REGBP, %5)
1013  "pxor %%mm7, %%mm7 \n\t"
1014  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1015 #ifdef DITHER1XBPP
1016  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1017  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1018  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1019 #endif
1020  WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1021  "pop %%"FF_REG_BP" \n\t"
1022  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore FF_REG_b */
1023  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1024  "a" (&c->redDither)
1025  NAMED_CONSTRAINTS_ADD(bF8,bFC)
1026  );
1027 }
1028 
/*
 * Two-line blend of luma and chroma for packed YUV output.
 *
 * Before the loop, the words at CHR_MMX_FILTER_OFFSET+8 and
 * LUM_MMX_FILTER_OFFSET+8 (relative to c) are shifted right by 3 and written
 * back, i.e. the context data is modified in place on every use.
 * The macro then zeroes the index register and opens the local label "1:";
 * it contains no jump back. NOTE(review): the loop is presumably closed by
 * the WRITE* macro that follows it in the asm statement -- confirm.
 *
 * Expected asm operands: %0/%1 = the two luma lines, %2/%3 = the two chroma
 * lines; the second chroma plane is read UV_OFF_BYTE(c) bytes further on.
 * Per iteration the results are left in mm1/mm7 (luma) and mm3/mm4 (chroma),
 * each computed as (line1 >> 7) + (((line0 - line1) * coefficient) >> 16).
 */
1029 #define REAL_YSCALEYUV2PACKED(index, c) \
1030  "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
1031  "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
1032  "psraw $3, %%mm0 \n\t"\
1033  "psraw $3, %%mm1 \n\t"\
1034  "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
1035  "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
1036  "xor "#index", "#index" \n\t"\
1037  ".p2align 4 \n\t"\
1038  "1: \n\t"\
1039  "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1040  "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1041  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1042  "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1043  "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1044  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1045  "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
1046  "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
1047  "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
1048  "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
1049  "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
1050  "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
1051  "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
1052  "paddw %%mm2, %%mm3 \n\t" /* (uvbuf1[eax]>>7) + ((uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16)*/\
1053  "paddw %%mm5, %%mm4 \n\t" /* (uvbuf1[eax+2048]>>7) + ((uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16)*/\
1054  "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
1055  "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
1056  "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax], next 8 bytes*/\
1057  "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax], next 8 bytes*/\
1058  "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
1059  "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
1060  "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
1061  "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
1062  "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
1063  "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
1064  "paddw %%mm0, %%mm1 \n\t" /* (buf1[eax]>>7) + ((buf0[eax] - buf1[eax])yalpha1>>16)*/\
1065  "paddw %%mm6, %%mm7 \n\t" /* (buf1[eax]>>7) + ((buf0[eax] - buf1[eax])yalpha1>>16)*/\
1066 
/* Forwarding wrapper: expands its arguments before REAL_YSCALEYUV2PACKED stringifies them. */
1067 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
1068 
/**
 * Two-line output to packed YUYV422: YSCALEYUV2PACKED followed by WRITEYUY2.
 *
 * Asm operands: %0 = buf[0], %1 = buf[1], %2 = ubuf[0], %3 = ubuf[1],
 * %4 = dest, %5 = &c->redDither (base for the context-relative displacements).
 * YSCALEYUV2PACKED rewrites filter data stored in the context, so c is
 * modified by this call.
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced here.
 */
1069 static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
1070  const int16_t *ubuf[2], const int16_t *vbuf[2],
1071  const int16_t *abuf[2], uint8_t *dest,
1072  int dstW, int yalpha, int uvalpha, int y)
1073 {
1074  const int16_t *buf0 = buf[0], *buf1 = buf[1],
1075  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1076 
1077  __asm__ volatile(
1078  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save FF_REG_b in the context */
1079  "mov %4, %%"FF_REG_b" \n\t" /* FF_REG_b = dest */
1080  "push %%"FF_REG_BP" \n\t" /* FF_REG_BP becomes the index register */
1081  YSCALEYUV2PACKED(%%FF_REGBP, %5)
1082  WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1083  "pop %%"FF_REG_BP" \n\t"
1084  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore FF_REG_b */
1085  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1086  "a" (&c->redDither)
1087  );
1088 }
1089 
/*
 * Single-line YUV->RGB conversion: only the luma line in %0 and the chroma
 * line in %2 are read (no second line is blended in). The second chroma
 * plane is read UV_OFF_BYTE(c) bytes after the first.
 *
 * The macro zeroes the index register and opens the local label "1:"; it
 * contains no jump back. NOTE(review): the loop is presumably closed by the
 * WRITE* macro that follows it in the asm statement -- confirm.
 * Results per iteration are packed bytes in mm2 (B), mm4 (G) and mm5 (R),
 * the assignment the callers' comments state.
 */
1090 #define REAL_YSCALEYUV2RGB1(index, c) \
1091  "xor "#index", "#index" \n\t"\
1092  ".p2align 4 \n\t"\
1093  "1: \n\t"\
1094  "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1095  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1096  "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1097  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1098  "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
1099  "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
1100  "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1101  "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1102  "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1103  "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1104  "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1105  "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1106  /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1107  "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1108  "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax], next 8 bytes*/\
1109  "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
1110  "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
1111  "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1112  "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1113  "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1114  "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1115  "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1116  "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1117  /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1118  "paddw %%mm3, %%mm4 \n\t"\
1119  "movq %%mm2, %%mm0 \n\t"\
1120  "movq %%mm5, %%mm6 \n\t"\
1121  "movq %%mm4, %%mm3 \n\t"\
1122  "punpcklwd %%mm2, %%mm2 \n\t"\
1123  "punpcklwd %%mm5, %%mm5 \n\t"\
1124  "punpcklwd %%mm4, %%mm4 \n\t"\
1125  "paddw %%mm1, %%mm2 \n\t"\
1126  "paddw %%mm1, %%mm5 \n\t"\
1127  "paddw %%mm1, %%mm4 \n\t"\
1128  "punpckhwd %%mm0, %%mm0 \n\t"\
1129  "punpckhwd %%mm6, %%mm6 \n\t"\
1130  "punpckhwd %%mm3, %%mm3 \n\t"\
1131  "paddw %%mm7, %%mm0 \n\t"\
1132  "paddw %%mm7, %%mm6 \n\t"\
1133  "paddw %%mm7, %%mm3 \n\t"\
1134  /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1135  "packuswb %%mm0, %%mm2 \n\t"\
1136  "packuswb %%mm6, %%mm5 \n\t"\
1137  "packuswb %%mm3, %%mm4 \n\t"\
1138 
/* Forwarding wrapper: expands its arguments before REAL_YSCALEYUV2RGB1 stringifies them. */
1139 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
1140 
1141 // do vertical chrominance interpolation
/*
 * Same conversion as REAL_YSCALEYUV2RGB1, except that the chroma of the two
 * lines in %2 and %3 is added and the sum shifted right by 5 (logical shift,
 * see the FIXME below) before the offsets are subtracted. Luma still comes
 * from the single line in %0.
 *
 * The macro zeroes the index register and opens the local label "1:"; it
 * contains no jump back. NOTE(review): the loop is presumably closed by the
 * WRITE* macro that follows it in the asm statement -- confirm.
 */
1142 #define REAL_YSCALEYUV2RGB1b(index, c) \
1143  "xor "#index", "#index" \n\t"\
1144  ".p2align 4 \n\t"\
1145  "1: \n\t"\
1146  "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1147  "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1148  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1149  "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1150  "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1151  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1152  "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1153  "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1154  "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
1155  "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
1156  "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1157  "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1158  "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1159  "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1160  "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1161  "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1162  /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1163  "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1164  "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax], next 8 bytes*/\
1165  "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
1166  "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
1167  "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1168  "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1169  "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1170  "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1171  "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1172  "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1173  /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1174  "paddw %%mm3, %%mm4 \n\t"\
1175  "movq %%mm2, %%mm0 \n\t"\
1176  "movq %%mm5, %%mm6 \n\t"\
1177  "movq %%mm4, %%mm3 \n\t"\
1178  "punpcklwd %%mm2, %%mm2 \n\t"\
1179  "punpcklwd %%mm5, %%mm5 \n\t"\
1180  "punpcklwd %%mm4, %%mm4 \n\t"\
1181  "paddw %%mm1, %%mm2 \n\t"\
1182  "paddw %%mm1, %%mm5 \n\t"\
1183  "paddw %%mm1, %%mm4 \n\t"\
1184  "punpckhwd %%mm0, %%mm0 \n\t"\
1185  "punpckhwd %%mm6, %%mm6 \n\t"\
1186  "punpckhwd %%mm3, %%mm3 \n\t"\
1187  "paddw %%mm7, %%mm0 \n\t"\
1188  "paddw %%mm7, %%mm6 \n\t"\
1189  "paddw %%mm7, %%mm3 \n\t"\
1190  /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1191  "packuswb %%mm0, %%mm2 \n\t"\
1192  "packuswb %%mm6, %%mm5 \n\t"\
1193  "packuswb %%mm3, %%mm4 \n\t"\
1194 
/* Forwarding wrapper: expands its arguments before REAL_YSCALEYUV2RGB1b stringifies them. */
1195 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
1196 
/*
 * Loads 16 bytes (two quadwords of 16-bit samples) of the alpha line
 * addressed by asm operand %1 at index*2, shifts each word right by 7
 * (arithmetic) and packs the result to unsigned bytes in mm7.
 * mm1 is used as scratch. The callers in this file bind %1 to abuf0.
 */
1197 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1198  "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
1199  "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
1200  "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
1201  "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
1202  "packuswb %%mm1, %%mm7 \n\t"
/* Forwarding wrapper: expands its argument before REAL_YSCALEYUV2RGB1_ALPHA stringifies it. */
1203 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1204 
1205 /**
 * YV12 to RGB without scaling or interpolating (32-bit output, one luma line)
 *
 * Chroma handling depends on uvalpha:
 *  - uvalpha < 2048: only ubuf[0] is used (YSCALEYUV2RGB1);
 *  - otherwise: ubuf[0] and ubuf[1] are combined (YSCALEYUV2RGB1b).
 *
 * Asm operands: %0 = buf0, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither. Operand %1 is abuf0 when alpha is written (it is read
 * by YSCALEYUV2RGB1_ALPHA) and a copy of buf0 otherwise. Without alpha, mm7
 * is set to all ones and passed as the alpha argument of WRITEBGR32.
 * vbuf, dstW and y are not referenced here.
 *
 * @param c       scaler context
 * @param buf0    luma line
 * @param ubuf    chroma lines
 * @param abuf0   alpha line, read only if CONFIG_SWSCALE_ALPHA is set and
 *                c->needAlpha is non-zero
 * @param dest    output line
 * @param uvalpha selects the chroma path as described above
 */
1208 static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
1209  const int16_t *ubuf[2], const int16_t *vbuf[2],
1210  const int16_t *abuf0, uint8_t *dest,
1211  int dstW, int uvalpha, int y)
1212 {
1213  const int16_t *ubuf0 = ubuf[0];
1214  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1215 
1216  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1217  const int16_t *ubuf1 = ubuf[0];
1218  if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
1219  __asm__ volatile(
1220  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save FF_REG_b in the context */
1221  "mov %4, %%"FF_REG_b" \n\t" /* FF_REG_b = dest */
1222  "push %%"FF_REG_BP" \n\t" /* FF_REG_BP becomes the index register */
1223  YSCALEYUV2RGB1(%%FF_REGBP, %5)
1224  YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
1225  WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1226  "pop %%"FF_REG_BP" \n\t"
1227  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore FF_REG_b */
1228  :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1229  "a" (&c->redDither)
1230  );
1231  } else {
1232  __asm__ volatile(
1233  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1234  "mov %4, %%"FF_REG_b" \n\t"
1235  "push %%"FF_REG_BP" \n\t"
1236  YSCALEYUV2RGB1(%%FF_REGBP, %5)
1237  "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones */
1238  WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1239  "pop %%"FF_REG_BP" \n\t"
1240  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1241  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1242  "a" (&c->redDither)
1243  );
1244  }
1245  } else {
1246  const int16_t *ubuf1 = ubuf[1];
1247  if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
1248  __asm__ volatile(
1249  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1250  "mov %4, %%"FF_REG_b" \n\t"
1251  "push %%"FF_REG_BP" \n\t"
1252  YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1253  YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
1254  WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1255  "pop %%"FF_REG_BP" \n\t"
1256  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1257  :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1258  "a" (&c->redDither)
1259  );
1260  } else {
1261  __asm__ volatile(
1262  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1263  "mov %4, %%"FF_REG_b" \n\t"
1264  "push %%"FF_REG_BP" \n\t"
1265  YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1266  "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones */
1267  WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1268  "pop %%"FF_REG_BP" \n\t"
1269  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1270  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1271  "a" (&c->redDither)
1272  );
1273  }
1274  }
1275 }
1276 
/**
 * Single-luma-line YUV to BGR24 output.
 *
 * uvalpha < 2048 uses only ubuf[0] (YSCALEYUV2RGB1); otherwise ubuf[0] and
 * ubuf[1] are combined (YSCALEYUV2RGB1b). Both paths finish with WRITEBGR24.
 *
 * Asm operands: %0 = buf0, %1 = a copy of buf0, %2 = ubuf0, %3 = ubuf1,
 * %4 = dest, %5 = &c->redDither. The constants ff_M24A, ff_M24C and ff_M24B
 * are added as named constraints.
 * vbuf, abuf0, dstW and y are not referenced here.
 */
1277 static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
1278  const int16_t *ubuf[2], const int16_t *vbuf[2],
1279  const int16_t *abuf0, uint8_t *dest,
1280  int dstW, int uvalpha, int y)
1281 {
1282  const int16_t *ubuf0 = ubuf[0];
1283  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1284 
1285  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1286  const int16_t *ubuf1 = ubuf[0];
1287  __asm__ volatile(
1288  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save FF_REG_b in the context */
1289  "mov %4, %%"FF_REG_b" \n\t" /* FF_REG_b = dest */
1290  "push %%"FF_REG_BP" \n\t" /* FF_REG_BP becomes the index register */
1291  YSCALEYUV2RGB1(%%FF_REGBP, %5)
1292  "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 */
1293  WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1294  "pop %%"FF_REG_BP" \n\t"
1295  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore FF_REG_b */
1296  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1297  "a" (&c->redDither)
1298  NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
1299  );
1300  } else {
1301  const int16_t *ubuf1 = ubuf[1];
1302  __asm__ volatile(
1303  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1304  "mov %4, %%"FF_REG_b" \n\t"
1305  "push %%"FF_REG_BP" \n\t"
1306  YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1307  "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 */
1308  WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1309  "pop %%"FF_REG_BP" \n\t"
1310  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1311  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1312  "a" (&c->redDither)
1313  NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
1314  );
1315  }
1316 }
1317 
/**
 * Single-luma-line YUV to RGB555 output.
 *
 * uvalpha < 2048 uses only ubuf[0] (YSCALEYUV2RGB1); otherwise ubuf[0] and
 * ubuf[1] are combined (YSCALEYUV2RGB1b). Both paths optionally add the
 * context's dither values (DITHER1XBPP) and finish with WRITERGB15.
 *
 * Asm operands: %0 = buf0, %1 = a copy of buf0, %2 = ubuf0, %3 = ubuf1,
 * %4 = dest, %5 = &c->redDither.
 * vbuf, abuf0, dstW and y are not referenced here.
 *
 * The bF8 constant is declared through NAMED_CONSTRAINTS_ADD in both asm
 * statements, as yuv2rgb565_1 does for bF8/bFC, so that the asm can refer to
 * it as a named operand on builds that cannot reference symbols directly.
 * NOTE(review): WRITERGB15 is expected to use the bF8 mask -- confirm against
 * its definition earlier in the file.
 */
1318 static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
1319  const int16_t *ubuf[2], const int16_t *vbuf[2],
1320  const int16_t *abuf0, uint8_t *dest,
1321  int dstW, int uvalpha, int y)
1322 {
1323  const int16_t *ubuf0 = ubuf[0];
1324  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1325 
1326  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1327  const int16_t *ubuf1 = ubuf[0];
1328  __asm__ volatile(
1329  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save FF_REG_b in the context */
1330  "mov %4, %%"FF_REG_b" \n\t" /* FF_REG_b = dest */
1331  "push %%"FF_REG_BP" \n\t" /* FF_REG_BP becomes the index register */
1332  YSCALEYUV2RGB1(%%FF_REGBP, %5)
1333  "pxor %%mm7, %%mm7 \n\t"
1334  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1335 #ifdef DITHER1XBPP
1336  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1337  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1338  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1339 #endif
1340  WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1341  "pop %%"FF_REG_BP" \n\t"
1342  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore FF_REG_b */
1343  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1344  "a" (&c->redDither)
1345  NAMED_CONSTRAINTS_ADD(bF8)
1346  );
1347  } else {
1348  const int16_t *ubuf1 = ubuf[1];
1349  __asm__ volatile(
1350  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1351  "mov %4, %%"FF_REG_b" \n\t"
1352  "push %%"FF_REG_BP" \n\t"
1353  YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1354  "pxor %%mm7, %%mm7 \n\t"
1355  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1356 #ifdef DITHER1XBPP
1357  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1358  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1359  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1360 #endif
1361  WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1362  "pop %%"FF_REG_BP" \n\t"
1363  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1364  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1365  "a" (&c->redDither)
1366  NAMED_CONSTRAINTS_ADD(bF8)
1367  );
1368  }
1369 }
1370 
/**
 * Single-luma-line YUV to RGB565 output.
 *
 * uvalpha < 2048 uses only ubuf[0] (YSCALEYUV2RGB1); otherwise ubuf[0] and
 * ubuf[1] are combined (YSCALEYUV2RGB1b). Both paths optionally add the
 * context's dither values (DITHER1XBPP) and finish with WRITERGB16.
 *
 * Asm operands: %0 = buf0, %1 = a copy of buf0, %2 = ubuf0, %3 = ubuf1,
 * %4 = dest, %5 = &c->redDither. The constants bF8 and bFC are added as
 * named constraints.
 * vbuf, abuf0, dstW and y are not referenced here.
 */
1371 static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
1372  const int16_t *ubuf[2], const int16_t *vbuf[2],
1373  const int16_t *abuf0, uint8_t *dest,
1374  int dstW, int uvalpha, int y)
1375 {
1376  const int16_t *ubuf0 = ubuf[0];
1377  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1378 
1379  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1380  const int16_t *ubuf1 = ubuf[0];
1381  __asm__ volatile(
1382  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save FF_REG_b in the context */
1383  "mov %4, %%"FF_REG_b" \n\t" /* FF_REG_b = dest */
1384  "push %%"FF_REG_BP" \n\t" /* FF_REG_BP becomes the index register */
1385  YSCALEYUV2RGB1(%%FF_REGBP, %5)
1386  "pxor %%mm7, %%mm7 \n\t"
1387  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1388 #ifdef DITHER1XBPP
1389  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1390  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1391  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1392 #endif
1393  WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1394  "pop %%"FF_REG_BP" \n\t"
1395  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore FF_REG_b */
1396  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1397  "a" (&c->redDither)
1398  NAMED_CONSTRAINTS_ADD(bF8,bFC)
1399  );
1400  } else {
1401  const int16_t *ubuf1 = ubuf[1];
1402  __asm__ volatile(
1403  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1404  "mov %4, %%"FF_REG_b" \n\t"
1405  "push %%"FF_REG_BP" \n\t"
1406  YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1407  "pxor %%mm7, %%mm7 \n\t"
1408  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1409 #ifdef DITHER1XBPP
1410  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1411  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1412  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1413 #endif
1414  WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1415  "pop %%"FF_REG_BP" \n\t"
1416  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1417  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1418  "a" (&c->redDither)
1419  NAMED_CONSTRAINTS_ADD(bF8,bFC)
1420  );
1421  }
1422 }
1423 
/*
 * Single-line load for packed YUV output: reads the chroma line in %2 (second
 * plane UV_OFF_BYTE(c) bytes further on) into mm3/mm4 and the luma line in %0
 * into mm1/mm7, shifting every 16-bit sample right by 7 (arithmetic).
 *
 * The macro zeroes the index register and opens the local label "1:"; it
 * contains no jump back. NOTE(review): the loop is presumably closed by the
 * WRITE* macro that follows it in the asm statement -- confirm.
 */
1424 #define REAL_YSCALEYUV2PACKED1(index, c) \
1425  "xor "#index", "#index" \n\t"\
1426  ".p2align 4 \n\t"\
1427  "1: \n\t"\
1428  "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1429  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1430  "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1431  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1432  "psraw $7, %%mm3 \n\t" \
1433  "psraw $7, %%mm4 \n\t" \
1434  "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1435  "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1436  "psraw $7, %%mm1 \n\t" \
1437  "psraw $7, %%mm7 \n\t" \
1438 
/* Forwarding wrapper: expands its arguments before REAL_YSCALEYUV2PACKED1 stringifies them. */
1439 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
1440 
/*
 * Variant of REAL_YSCALEYUV2PACKED1 that combines two chroma lines: the
 * samples from %2 and %3 are added and the sum is shifted right by 8
 * (logical shift) into mm3/mm4. Luma is still read from the single line in
 * %0 and shifted right by 7 (arithmetic) into mm1/mm7.
 *
 * The macro zeroes the index register and opens the local label "1:"; it
 * contains no jump back. NOTE(review): the loop is presumably closed by the
 * WRITE* macro that follows it in the asm statement -- confirm.
 */
1441 #define REAL_YSCALEYUV2PACKED1b(index, c) \
1442  "xor "#index", "#index" \n\t"\
1443  ".p2align 4 \n\t"\
1444  "1: \n\t"\
1445  "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1446  "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1447  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1448  "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1449  "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1450  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1451  "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1452  "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1453  "psrlw $8, %%mm3 \n\t" \
1454  "psrlw $8, %%mm4 \n\t" \
1455  "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1456  "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1457  "psraw $7, %%mm1 \n\t" \
1458  "psraw $7, %%mm7 \n\t"
/* Forwarding wrapper: expands its arguments before REAL_YSCALEYUV2PACKED1b stringifies them. */
1459 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
1460 
/**
 * Single-luma-line output to packed YUYV422.
 *
 * uvalpha < 2048 uses only ubuf[0] (YSCALEYUV2PACKED1); otherwise ubuf[0]
 * and ubuf[1] are combined (YSCALEYUV2PACKED1b). Both paths finish with
 * WRITEYUY2.
 *
 * Asm operands: %0 = buf0, %1 = a copy of buf0, %2 = ubuf0, %3 = ubuf1,
 * %4 = dest, %5 = &c->redDither.
 * vbuf, abuf0, dstW and y are not referenced here.
 */
1461 static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
1462  const int16_t *ubuf[2], const int16_t *vbuf[2],
1463  const int16_t *abuf0, uint8_t *dest,
1464  int dstW, int uvalpha, int y)
1465 {
1466  const int16_t *ubuf0 = ubuf[0];
1467  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1468 
1469  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1470  const int16_t *ubuf1 = ubuf[0];
1471  __asm__ volatile(
1472  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save FF_REG_b in the context */
1473  "mov %4, %%"FF_REG_b" \n\t" /* FF_REG_b = dest */
1474  "push %%"FF_REG_BP" \n\t" /* FF_REG_BP becomes the index register */
1475  YSCALEYUV2PACKED1(%%FF_REGBP, %5)
1476  WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1477  "pop %%"FF_REG_BP" \n\t"
1478  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore FF_REG_b */
1479  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1480  "a" (&c->redDither)
1481  );
1482  } else {
1483  const int16_t *ubuf1 = ubuf[1];
1484  __asm__ volatile(
1485  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1486  "mov %4, %%"FF_REG_b" \n\t"
1487  "push %%"FF_REG_BP" \n\t"
1488  YSCALEYUV2PACKED1b(%%FF_REGBP, %5)
1489  WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1490  "pop %%"FF_REG_BP" \n\t"
1491  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1492  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1493  "a" (&c->redDither)
1494  );
1495  }
1496 }
1498 {
 /*
  * Installs this template's output and horizontal-scaling function pointers
  * into the context c, depending on c->dstFormat, c->flags and the
  * source/destination bit depths.
  * NOTE(review): the line carrying this function's signature is not present
  * in this listing; c is presumably the SwsContext being initialised --
  * confirm.
  */
1499  enum AVPixelFormat dstFormat = c->dstFormat;
1500 
1501  c->use_mmx_vfilter= 0;
 /* The vertical/packed output functions are only installed for destination
  * formats that are not 16-bit, not N-bit, not semi-planar and not float
  * gray, and only when SWS_BITEXACT is not requested. */
1502  if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat)
1503  && dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE
1504  && !(c->flags & SWS_BITEXACT)) {
1505  if (c->flags & SWS_ACCURATE_RND) {
 /* Accurate rounding: only the packed *_X_ar writers are installed;
  * yuv2planeX and use_mmx_vfilter are left untouched. */
1506  if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1507  switch (c->dstFormat) {
1508  case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
1509 #if HAVE_6REGS
1510  case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
1511 #endif
1512  case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
1513  case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
1514  case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
1515  default: break;
1516  }
1517  }
1518  } else {
 /* Default rounding: enable the MMX vertical filter for planar output
  * and install the packed *_X writers. */
1519  c->use_mmx_vfilter= 1;
1520  c->yuv2planeX = RENAME(yuv2yuvX );
1521  if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1522  switch (c->dstFormat) {
1523  case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
1524  case AV_PIX_FMT_BGR32: c->yuv2packedX = RENAME(yuv2bgr32_X); break;
1525 #if HAVE_6REGS
1526  case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
1527 #endif
1528  case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
1529  case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
1530  case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
1531  default: break;
1532  }
1533  }
1534  }
 /* One-line (yuv2packed1) and two-line (yuv2packed2) writers, independent
  * of the rounding mode; skipped when SWS_FULL_CHR_H_INT is set. */
1535  if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1536  switch (c->dstFormat) {
1537  case AV_PIX_FMT_RGB32:
1538  c->yuv2packed1 = RENAME(yuv2rgb32_1);
1539  c->yuv2packed2 = RENAME(yuv2rgb32_2);
1540  break;
1541  case AV_PIX_FMT_BGR24:
1542  c->yuv2packed1 = RENAME(yuv2bgr24_1);
1543  c->yuv2packed2 = RENAME(yuv2bgr24_2);
1544  break;
1545  case AV_PIX_FMT_RGB555:
1546  c->yuv2packed1 = RENAME(yuv2rgb555_1);
1547  c->yuv2packed2 = RENAME(yuv2rgb555_2);
1548  break;
1549  case AV_PIX_FMT_RGB565:
1550  c->yuv2packed1 = RENAME(yuv2rgb565_1);
1551  c->yuv2packed2 = RENAME(yuv2rgb565_2);
1552  break;
1553  case AV_PIX_FMT_YUYV422:
1554  c->yuv2packed1 = RENAME(yuv2yuyv422_1);
1555  c->yuv2packed2 = RENAME(yuv2yuyv422_2);
1556  break;
1557  default:
1558  break;
1559  }
1560  }
1561  }
1562 
 /* Fast horizontal scalers: for 8-bit sources with at most 14-bit
  * destinations, the MMXEXT routines are selected only in the MMXEXT
  * template when SWS_FAST_BILINEAR is set and c->canMMXEXTBeUsed is
  * non-zero; in every other case both pointers are reset to NULL. */
1563  if (c->srcBpc == 8 && c->dstBpc <= 14) {
1564  // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
1565 #if COMPILE_TEMPLATE_MMXEXT
1566  if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
1567  c->hyscale_fast = ff_hyscale_fast_mmxext;
1568  c->hcscale_fast = ff_hcscale_fast_mmxext;
1569  } else {
1570 #endif /* COMPILE_TEMPLATE_MMXEXT */
1571  c->hyscale_fast = NULL;
1572  c->hcscale_fast = NULL;
1573 #if COMPILE_TEMPLATE_MMXEXT
1574  }
1575 #endif /* COMPILE_TEMPLATE_MMXEXT */
1576  }
1577 }
WRITEBGR32
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
Definition: swscale_template.c:338
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:64
YSCALEYUV2PACKEDX_ACCURATE
#define YSCALEYUV2PACKEDX_ACCURATE
Definition: swscale_template.c:276
ALP_MMX_FILTER_OFFSET
#define ALP_MMX_FILTER_OFFSET
Definition: swscale_internal.h:459
YSCALEYUV2RGB1
#define YSCALEYUV2RGB1(index, c)
Definition: swscale_template.c:1139
YSCALEYUV2PACKEDX_YA
#define YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
Definition: swscale_template.c:148
AV_PIX_FMT_BGR32
#define AV_PIX_FMT_BGR32
Definition: pixfmt.h:362
NAMED_CONSTRAINTS_ADD
#define NAMED_CONSTRAINTS_ADD(...)
Definition: asm.h:145
YSCALEYUV2RGB
#define YSCALEYUV2RGB(index, c)
Definition: swscale_template.c:876
AV_PIX_FMT_BGR24
@ AV_PIX_FMT_BGR24
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:69
filter
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
Definition: filter_design.txt:228
AV_PIX_FMT_GRAYF32LE
@ AV_PIX_FMT_GRAYF32LE
IEEE-754 single precision Y, 32bpp, little-endian.
Definition: pixfmt.h:341
SWS_FAST_BILINEAR
#define SWS_FAST_BILINEAR
Definition: swscale.h:58
is16BPS
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:646
SWS_BITEXACT
#define SWS_BITEXACT
Definition: swscale.h:84
DSTW_OFFSET
#define DSTW_OFFSET
Definition: swscale_internal.h:453
isNBPS
static av_always_inline int isNBPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:653
src
#define src
Definition: vp8dsp.c:254
ff_hcscale_fast_mmxext
void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth, const uint8_t *src1, const uint8_t *src2, int srcW, int xInc)
buf
void * buf
Definition: avisynth_c.h:766
av_cold
#define av_cold
Definition: attributes.h:84
sws_init_swscale
static av_cold void sws_init_swscale(SwsContext *c)
Definition: swscale.c:565
BLUE_DITHER
#define BLUE_DITHER
Definition: swscale_internal.h:442
YSCALEYUV2RGB1b
#define YSCALEYUV2RGB1b(index, c)
Definition: swscale_template.c:1195
WRITERGB15
#define WRITERGB15(dst, dstw, index)
Definition: swscale_template.c:534
WRITEBGR24
#define WRITEBGR24(dst, dstw, index)
Definition: swscale_template.c:690
isSemiPlanarYUV
static av_always_inline int isSemiPlanarYUV(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:685
NULL
#define NULL
Definition: coverity.c:32
YSCALEYUV2PACKEDX
#define YSCALEYUV2PACKEDX
Definition: swscale_template.c:167
AV_PIX_FMT_YUYV422
@ AV_PIX_FMT_YUYV422
packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
Definition: pixfmt.h:67
U_TEMP
#define U_TEMP
Definition: swscale_internal.h:456
GREEN_DITHER
#define GREEN_DITHER
Definition: swscale_internal.h:441
YSCALEYUV2RGB1_ALPHA
#define YSCALEYUV2RGB1_ALPHA(index)
Definition: swscale_template.c:1203
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
SWS_FULL_CHR_H_INT
#define SWS_FULL_CHR_H_INT
Definition: swscale.h:79
asm.h
MOVNTQ2
#define MOVNTQ2
Definition: swscale_template.c:37
RED_DITHER
#define RED_DITHER
Definition: swscale_internal.h:440
AV_PIX_FMT_RGB32
#define AV_PIX_FMT_RGB32
Definition: pixfmt.h:360
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
SWS_ACCURATE_RND
#define SWS_ACCURATE_RND
Definition: swscale.h:83
YSCALEYUV2RGB_YA
#define YSCALEYUV2RGB_YA(index, c, b1, b2)
Definition: swscale_template.c:874
RENAME
#define RENAME(name)
Definition: ffv1.h:197
AV_PIX_FMT_RGB555
#define AV_PIX_FMT_RGB555
Definition: pixfmt.h:375
av_always_inline
#define av_always_inline
Definition: attributes.h:43
swscale_internal.h
uint8_t
uint8_t
Definition: audio_convert.c:194
YSCALEYUV2PACKED
#define YSCALEYUV2PACKED(index, c)
Definition: swscale_template.c:1067
AV_PIX_FMT_RGB565
#define AV_PIX_FMT_RGB565
Definition: pixfmt.h:374
V_TEMP
#define V_TEMP
Definition: swscale_internal.h:457
AV_PIX_FMT_GRAYF32BE
@ AV_PIX_FMT_GRAYF32BE
IEEE-754 single precision Y, 32bpp, big-endian.
Definition: pixfmt.h:340
WRITEYUY2
#define WRITEYUY2(dst, dstw, index)
Definition: swscale_template.c:760
dummy
int dummy
Definition: motion.c:64
Y_TEMP
#define Y_TEMP
Definition: swscale_internal.h:458
YSCALEYUV2PACKEDX_ACCURATE_YA
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
Definition: swscale_template.c:231
YSCALEYUV2RGBX
#define YSCALEYUV2RGBX
Definition: swscale_template.c:280
ff_hyscale_fast_mmxext
void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst, int dstWidth, const uint8_t *src, int srcW, int xInc)
ESP_OFFSET
#define ESP_OFFSET
Definition: swscale_internal.h:454
dither_8to16
static av_always_inline void dither_8to16(const uint8_t *srcDither, int rot)
Definition: swscale_template.c:43
x86_reg
int x86_reg
Definition: asm.h:72
YSCALEYUV2PACKEDX_END
#define YSCALEYUV2PACKEDX_END
Definition: swscale_template.c:171
WRITERGB16
#define WRITERGB16(dst, dstw, index)
Definition: swscale_template.c:457
YSCALEYUV2PACKED1
#define YSCALEYUV2PACKED1(index, c)
Definition: swscale_template.c:1439
SwsContext
Definition: swscale_internal.h:280
YSCALEYUV2PACKED1b
#define YSCALEYUV2PACKED1b(index, c)
Definition: swscale_template.c:1459
dither
static const uint8_t dither[8][8]
Definition: vf_fspp.c:57