/*
 * software RGB to RGB converter
 * software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "libavutil/mem_internal.h"

#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"

#if HAVE_INLINE_ASM
#include "libavutil/x86/asm.h"

DECLARE_ASM_CONST(8, uint64_t, mmx_ff)       = 0x00000000000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_null)     = 0x0000000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32a)      = 0xFF000000FF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216br)   = 0x00F800F800F800F8ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216g)    = 0x0000FC000000FC00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3215g)    = 0x0000F8000000F800ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3216)      = 0x2000000420000004ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3215)      = 0x2000000820000008ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24b)      = 0x00FF0000FF0000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24g)      = 0xFF0000FF0000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24r)      = 0x0000FF0000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24l)      = 0x0000000000FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24h)      = 0x0000FFFFFF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15b)      = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
DECLARE_ASM_CONST(8, uint64_t, mask15rg)     = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
DECLARE_ASM_CONST(8, uint64_t, mask15s)      = 0xFFE0FFE0FFE0FFE0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15g)      = 0x03E003E003E003E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15r)      = 0x7C007C007C007C00ULL;
#define mask16b mask15b
DECLARE_ASM_CONST(8, uint64_t, mask16g)      = 0x07E007E007E007E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask16r)      = 0xF800F800F800F800ULL;
DECLARE_ASM_CONST(8, uint64_t, red_16mask)   = 0x0000f8000000f800ULL;
DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_16mask)  = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, red_15mask)   = 0x00007c0000007c00ULL;
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_mid)    = 0x4200420042004200ULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_hi)     = 0x0210021002100210ULL;
DECLARE_ASM_CONST(8, uint64_t, mul16_mid)    = 0x2080208020802080ULL;

DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2YOffset);
DECLARE_ALIGNED(8, extern const uint64_t, ff_w1111);
DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);

#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))

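/*
 * Illustration (not part of the original file): the macros above are the
 * BT.601 limited-range RGB->YUV weights, scaled to fixed point by
 * 1 << RGB2YUV_SHIFT with +0.5 for rounding.  A minimal scalar sketch of
 * their intended use; the function name is hypothetical and RGB2YUV_SHIFT
 * is assumed to come from swscale_internal.h.
 */
#if 0
static void rgb2yuv_pixel_example(int r, int g, int b,
                                  uint8_t *y, uint8_t *u, uint8_t *v)
{
    const int half = 1 << (RGB2YUV_SHIFT - 1); /* rounding term */
    *y = ((RY * r + GY * g + BY * b + half) >> RGB2YUV_SHIFT) +  16;
    *u = ((RU * r + GU * g + BU * b + half) >> RGB2YUV_SHIFT) + 128;
    *v = ((RV * r + GV * g + BV * b + half) >> RGB2YUV_SHIFT) + 128;
}
#endif
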
// MMXEXT versions
#define PREFETCH "prefetchnta"
#define PAVGB    "pavgb"
#define MOVNTQ   "movntq"
#define SFENCE   "sfence"

#define EMMS     "emms"

static inline void rgb24tobgr32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "punpckldq 3(%1), %%mm0 \n\t"
            "movd 6(%1), %%mm1 \n\t"
            "punpckldq 9(%1), %%mm1 \n\t"
            "movd 12(%1), %%mm2 \n\t"
            "punpckldq 15(%1), %%mm2 \n\t"
            "movd 18(%1), %%mm3 \n\t"
            "punpckldq 21(%1), %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm1, 8(%0) \n\t"
            MOVNTQ" %%mm2, 16(%0) \n\t"
            MOVNTQ" %%mm3, 24(%0)"
            :: "r"(dest), "r"(s)
            : "memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    }
}
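
/*
 * Note on rgb24tobgr32 above (explanatory comment, not part of the
 * original source): each 3-byte pixel is widened to 4 bytes and the
 * fourth (alpha) byte is forced to 0xFF by OR-ing with mask32a,
 * matching the "*dest++ = 255" in the scalar tail; the component
 * order itself is left untouched, the RGB/BGR swap lives in the
 * naming convention.
 */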

#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0 \n\t" \
    "pand "MANGLE(mask24l)", %%mm1 \n\t" \
    "pand "MANGLE(mask24l)", %%mm4 \n\t" \
    "pand "MANGLE(mask24l)", %%mm5 \n\t" \
    "pand "MANGLE(mask24h)", %%mm2 \n\t" \
    "pand "MANGLE(mask24h)", %%mm3 \n\t" \
    "pand "MANGLE(mask24h)", %%mm6 \n\t" \
    "pand "MANGLE(mask24h)", %%mm7 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm1, 8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"


static inline void rgb32tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 16(%1), %%mm4 \n\t"
            "movq 24(%1), %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :: "r"(dest), "r"(s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            : "memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}

/*
 * Original by Strepto/Astral,
 * ported to gcc & bugfixed: A'rpi,
 * MMXEXT, 3DNOW optimization by Nick Kurshev,
 * 32-bit C version and the and&add trick by Michael Niedermayer.
 */
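
/*
 * The "and&add" trick used in rgb15to16 below (explanatory note, not
 * part of the original source): the R/G fields of a 15-bit pixel sit
 * in bits 5-14 and must move up one bit for 16-bit, while B stays in
 * bits 0-4.  x + (x & 0x7FE0) doubles exactly the R/G field, i.e.
 * shifts it left one bit without touching B; masking x with 0x7FFF
 * first keeps the sum from carrying into the neighbouring pixel when
 * two pixels are processed per 32-bit word.  rgb16to15 below does the
 * inverse with one shift and two masks.
 */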
static inline void rgb15to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t *s = src;
    register uint8_t *d = dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d += 16;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x = *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d += 4;
        s += 4;
    }
    if (s < end) {
        register unsigned short x = *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void rgb16to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t *s = src;
    register uint8_t *d = dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d += 16;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x = *((const uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s += 4;
        d += 4;
    }
    if (s < end) {
        register uint16_t x = *((const uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}

static inline void rgb32to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
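
/*
 * Note on the loop above (explanatory comment, not part of the
 * original source): blue and red are isolated with mask3216br and
 * merged into their 565 positions in one step by pmaddwd with mul3216
 * (per-lane factors 0x0004 and 0x2000); green is masked with mask3216g
 * and OR'ed in, and the final psrld $5 (pslld $11 for the pair going
 * to the high halves) drops everything into place, matching the
 * scalar tail ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8).
 */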

static inline void rgb32tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

static inline void rgb32to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

static inline void rgb32tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

static inline void rgb24tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void rgb24to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void rgb24tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void rgb24to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void rgb15tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mmx_null)
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
    }
}
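
/*
 * Note on the 15/16-bit expansions above and below (explanatory
 * comment, not part of the original source): once each 5-bit field is
 * shifted into place, pmulhw with mul15_mid/mul15_hi (and mul16_mid
 * for 6-bit green) scales it to 8 bits.  For blue, for example,
 * ((b << 5) * 0x4200) >> 16 == b*8 + b/4, the same value as the
 * (b << 3) | (b >> 2) bit replication used in the scalar tails.
 */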

static inline void rgb16tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
    }
}

/*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm3, 8(%0) \n\t" \

static inline void rgb15to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw %5, %%mm0 \n\t"
            "pmulhw %5, %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mul15_mid)
              NAMED_CONSTRAINTS_ADD(mul15_hi)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
        *d++ = 255;
    }
}

static inline void rgb16to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw %5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
              NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
        *d++ = 255;
    }
}

static inline void rgb24tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    x86_reg mmx_size = 23 - src_size;
    __asm__ volatile (
        "test %%"FF_REG_a", %%"FF_REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%"FF_REG_a") \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"FF_REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"FF_REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"FF_REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"FF_REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"FF_REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"FF_REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"FF_REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"FF_REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"FF_REG_a") \n\t"
        "add $24, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
          NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size == 23)
        return; // finished, was multiple of 8

    src += src_size;
    dst += src_size;
    src_size = 23 - mmx_size;
    src -= src_size;
    dst -= src_size;
    for (unsigned i = 0; i < src_size; i += 3) {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
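
/*
 * Note on rgb24tobgr24 above (explanatory comment, not part of the
 * original source): 8 pixels (24 bytes) are processed per iteration
 * from qwords loaded at three overlapping offsets; the rotating
 * mask24r/mask24g/mask24b masks pick, at each of the three possible
 * alignments of a 3-byte pixel within a qword, the bytes whose R and
 * B components must swap, and the scalar loop at the end swaps the
 * remaining pixels byte by byte.
 */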

static inline void yuvPlanartoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                          int width, int height,
                                          int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    const x86_reg chromWidth = width>>1;
    for (int y = 0; y < height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
            PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
            "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"

            "add $8, %%"FF_REG_a" \n\t"
            "cmp %4, %%"FF_REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"FF_REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void yv12toyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                     int width, int height,
                                     int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
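
/*
 * Illustration (not part of the original file): a plain-C reference for
 * the packing that yuvPlanartoyuy2 performs with vertLumPerChroma == 2,
 * i.e. YV12 -> YUY2 (Y0 U Y1 V per pixel pair, each chroma row reused
 * for two luma rows).  The function name is hypothetical.
 */
#if 0
static void yv12toyuy2_c_example(const uint8_t *ysrc, const uint8_t *usrc,
                                 const uint8_t *vsrc, uint8_t *dst,
                                 int width, int height,
                                 int lumStride, int chromStride, int dstStride)
{
    for (int y = 0; y < height; y++) {
        const uint8_t *yp = ysrc + y * lumStride;
        const uint8_t *up = usrc + (y >> 1) * chromStride;
        const uint8_t *vp = vsrc + (y >> 1) * chromStride;
        uint8_t *d = dst + y * dstStride;
        for (int x = 0; x < width / 2; x++) {
            *d++ = yp[2 * x];     /* Y0 */
            *d++ = up[x];         /* U  */
            *d++ = yp[2 * x + 1]; /* Y1 */
            *d++ = vp[x];         /* V  */
        }
    }
}
#endif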

static inline void yuvPlanartouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                          int width, int height,
                                          int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    const x86_reg chromWidth = width>>1;
    for (int y = 0; y < height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
            PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
            "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
            "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)

            MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"

            "add $8, %%"FF_REG_a" \n\t"
            "cmp %4, %%"FF_REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"FF_REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void yv12touyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                     int width, int height,
                                     int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void yuv422ptouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                        int width, int height,
                                        int lumStride, int chromStride, int dstStride)
{
    yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Width should be a multiple of 16.
 */
static inline void yuv422ptoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                        int width, int height,
                                        int lumStride, int chromStride, int dstStride)
{
    yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void yuy2toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                     int width, int height,
                                     int lumStride, int chromStride, int srcStride)
{
    const x86_reg chromWidth = width>>1;
    for (int y = 0; y < height; y += 2) {
        __asm__ volatile(
            "xor %%"FF_REG_a", %%"FF_REG_a"\n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t"
            "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"FF_REG_a", 2) \n\t"

            "movq 16(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"FF_REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"FF_REG_a") \n\t"

            "add $8, %%"FF_REG_a" \n\t"
            "cmp %4, %%"FF_REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"FF_REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor %%"FF_REG_a", %%"FF_REG_a"\n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t"
            "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"FF_REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"FF_REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t"

            "add $8, %%"FF_REG_a"\n\t"
            "cmp %4, %%"FF_REG_a"\n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"FF_REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
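
/*
 * Note on yuy2toyv12 above (explanatory comment, not part of the
 * original source): chroma is taken from the even source lines only;
 * the second line of each pair contributes luma alone, so no vertical
 * chroma filtering is performed.
 */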

static inline void planar2x_mmxext(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    dst[0] = src[0];

    // first line
    for (int x = 0; x < srcWidth - 1; x++) {
        dst[2*x+1] = (3*src[x] +   src[x+1]) >> 2;
        dst[2*x+2] = (  src[x] + 3*src[x+1]) >> 2;
    }
    dst[2*srcWidth-1] = src[srcWidth-1];

    dst += dstStride;

    for (int y = 1; y < srcHeight; y++) {
        x86_reg mmxSize = srcWidth & ~15;

        if (mmxSize) {
            __asm__ volatile(
                "mov %4, %%"FF_REG_a" \n\t"
                "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
                "movq (%0, %%"FF_REG_a"), %%mm4 \n\t"
                "movq %%mm4, %%mm2 \n\t"
                "psllq $8, %%mm4 \n\t"
                "pand %%mm0, %%mm2 \n\t"
                "por %%mm2, %%mm4 \n\t"
                "movq (%1, %%"FF_REG_a"), %%mm5 \n\t"
                "movq %%mm5, %%mm3 \n\t"
                "psllq $8, %%mm5 \n\t"
                "pand %%mm0, %%mm3 \n\t"
                "por %%mm3, %%mm5 \n\t"
                "1: \n\t"
                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
                "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
                "movq 1(%0, %%"FF_REG_a"), %%mm2 \n\t"
                "movq 1(%1, %%"FF_REG_a"), %%mm3 \n\t"
                PAVGB" %%mm0, %%mm5 \n\t"
                PAVGB" %%mm0, %%mm3 \n\t"
                PAVGB" %%mm0, %%mm5 \n\t"
                PAVGB" %%mm0, %%mm3 \n\t"
                PAVGB" %%mm1, %%mm4 \n\t"
                PAVGB" %%mm1, %%mm2 \n\t"
                PAVGB" %%mm1, %%mm4 \n\t"
                PAVGB" %%mm1, %%mm2 \n\t"
                "movq %%mm5, %%mm7 \n\t"
                "movq %%mm4, %%mm6 \n\t"
                "punpcklbw %%mm3, %%mm5 \n\t"
                "punpckhbw %%mm3, %%mm7 \n\t"
                "punpcklbw %%mm2, %%mm4 \n\t"
                "punpckhbw %%mm2, %%mm6 \n\t"
                MOVNTQ" %%mm5, (%2, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm7, 8(%2, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm4, (%3, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm6, 8(%3, %%"FF_REG_a", 2) \n\t"
                "add $8, %%"FF_REG_a" \n\t"
                "movq -1(%0, %%"FF_REG_a"), %%mm4 \n\t"
                "movq -1(%1, %%"FF_REG_a"), %%mm5 \n\t"
                " js 1b \n\t"
                :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
                   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
                   "g" (-mmxSize)
                   NAMED_CONSTRAINTS_ADD(mmx_ff)
                : "%"FF_REG_a
            );
        } else {
            mmxSize = 1;
            dst[0]         = (src[0] * 3 + src[srcStride]) >> 2;
            dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
        }

        for (int x = mmxSize - 1; x < srcWidth - 1; x++) {
            dst[2*x          +1] = (3*src[x+0] +   src[x+srcStride+1]) >> 2;
            dst[2*x+dstStride+2] = (  src[x+0] + 3*src[x+srcStride+1]) >> 2;
            dst[2*x+dstStride+1] = (  src[x+1] + 3*src[x+srcStride  ]) >> 2;
            dst[2*x          +2] = (3*src[x+1] +   src[x+srcStride  ]) >> 2;
        }
        dst[srcWidth*2 -1            ] = (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride]) >> 2;
        dst[srcWidth*2 -1 + dstStride] = (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride]) >> 2;

        dst += dstStride*2;
        src += srcStride;
    }

    // last line
    dst[0] = src[0];

    for (int x = 0; x < srcWidth - 1; x++) {
        dst[2*x+1] = (3*src[x] +   src[x+1]) >> 2;
        dst[2*x+2] = (  src[x] + 3*src[x+1]) >> 2;
    }
    dst[2*srcWidth-1] = src[srcWidth-1];

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
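
/*
 * Note on planar2x above (explanatory comment, not part of the
 * original source): every output sample is a bilinear blend of the two
 * nearest input samples with 3:1 weights, (3*a + b) >> 2; the MMX path
 * obtains this by applying PAVGB twice, since avg(b, avg(b, a)) equals
 * (3*b + a) / 4 up to rounding.
 */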

/**
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 */
#if ARCH_X86_32 && HAVE_7REGS
static inline void rgb24toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride,
                                      const int32_t *rgb2yuv)
{
#define BGR2Y_IDX "16*4+16*32"
#define BGR2U_IDX "16*4+16*33"
#define BGR2V_IDX "16*4+16*34"
    int y;
    const x86_reg chromWidth = width>>1;

    if (height > 2) {
        ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
        src    += 2*srcStride;
        ydst   += 2*lumStride;
        udst   += chromStride;
        vdst   += chromStride;
        height -= 2;
    }

    for (y = 0; y < height - 2; y += 2) {
        for (int i = 0; i < 2; i++) {
            __asm__ volatile(
                "mov %2, %%"FF_REG_a"\n\t"
                "movq "BGR2Y_IDX"(%3), %%mm6 \n\t"
                "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
                ".p2align 4 \n\t"
                "1: \n\t"
                PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
                "movd (%0, %%"FF_REG_d"), %%mm0 \n\t"
                "movd 3(%0, %%"FF_REG_d"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
                "movd 9(%0, %%"FF_REG_d"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm0 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
                "psrad $8, %%mm0 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
                "packssdw %%mm1, %%mm0 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm0 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "packssdw %%mm2, %%mm0 \n\t"
                "psraw $7, %%mm0 \n\t"

                "movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
                "movd 15(%0, %%"FF_REG_d"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
                "movd 21(%0, %%"FF_REG_d"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm4 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
                "psrad $8, %%mm4 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
                "packssdw %%mm1, %%mm4 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm4 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "add $24, %%"FF_REG_d"\n\t"
                "packssdw %%mm2, %%mm4 \n\t"
                "psraw $7, %%mm4 \n\t"

                "packuswb %%mm4, %%mm0 \n\t"
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"

                MOVNTQ" %%mm0, (%1, %%"FF_REG_a") \n\t"
                "add $8, %%"FF_REG_a" \n\t"
                " js 1b \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
                    NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset)
                : "%"FF_REG_a, "%"FF_REG_d
            );
            ydst += lumStride;
            src  += srcStride;
        }
        src -= srcStride*2;
        __asm__ volatile(
            "mov %4, %%"FF_REG_a"\n\t"
            "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
            "movq "BGR2U_IDX"(%5), %%mm6 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
            "add %%"FF_REG_d", %%"FF_REG_d"\n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
            PREFETCH" 64(%1, %%"FF_REG_d") \n\t"
            "movq (%0, %%"FF_REG_d"), %%mm0 \n\t"
            "movq (%1, %%"FF_REG_d"), %%mm1 \n\t"
            "movq 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
            "movq 6(%1, %%"FF_REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm0 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm3 \n\t"

            "pmaddwd %%mm0, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm2, %%mm0 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm0 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0 \n\t"

            "movq 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
            "movq 12(%1, %%"FF_REG_d"), %%mm1 \n\t"
            "movq 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
            "movq 18(%1, %%"FF_REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm4, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm4 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm3 \n\t"

            "pmaddwd %%mm4, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm2, %%mm4 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "add $24, %%"FF_REG_d"\n\t"
            "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4 \n\t"

            "movq %%mm0, %%mm1 \n\t"
            "punpckldq %%mm4, %%mm0 \n\t"
            "punpckhdq %%mm4, %%mm1 \n\t"
            "packsswb %%mm1, %%mm0 \n\t"
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%"FF_REG_a") \n\t"
            "punpckhdq %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%3, %%"FF_REG_a") \n\t"
            "add $4, %%"FF_REG_a" \n\t"
            " js 1b \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
                NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset)
            : "%"FF_REG_a, "%"FF_REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");

    ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
}
#endif /* ARCH_X86_32 && HAVE_7REGS */
1679 
1680 static inline void vu9_to_vu12_mmxext(const uint8_t *src1, const uint8_t *src2,
1681  uint8_t *dst1, uint8_t *dst2,
1682  int width, int height,
1683  int srcStride1, int srcStride2,
1684  int dstStride1, int dstStride2)
1685 {
1686  int w,h;
1687  w=width/2; h=height/2;
1688  __asm__ volatile(
1689  PREFETCH" %0 \n\t"
1690  PREFETCH" %1 \n\t"
1691  ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
1692  for (x86_reg y = 0; y < h; y++) {
1693  const uint8_t* s1=src1+srcStride1*(y>>1);
1694  uint8_t* d=dst1+dstStride1*y;
1695  x86_reg x = 0;
1696  for (;x<w-31;x+=32) {
1697  __asm__ volatile(
1698  PREFETCH" 32(%1,%2) \n\t"
1699  "movq (%1,%2), %%mm0 \n\t"
1700  "movq 8(%1,%2), %%mm2 \n\t"
1701  "movq 16(%1,%2), %%mm4 \n\t"
1702  "movq 24(%1,%2), %%mm6 \n\t"
1703  "movq %%mm0, %%mm1 \n\t"
1704  "movq %%mm2, %%mm3 \n\t"
1705  "movq %%mm4, %%mm5 \n\t"
1706  "movq %%mm6, %%mm7 \n\t"
1707  "punpcklbw %%mm0, %%mm0 \n\t"
1708  "punpckhbw %%mm1, %%mm1 \n\t"
1709  "punpcklbw %%mm2, %%mm2 \n\t"
1710  "punpckhbw %%mm3, %%mm3 \n\t"
1711  "punpcklbw %%mm4, %%mm4 \n\t"
1712  "punpckhbw %%mm5, %%mm5 \n\t"
1713  "punpcklbw %%mm6, %%mm6 \n\t"
1714  "punpckhbw %%mm7, %%mm7 \n\t"
1715  MOVNTQ" %%mm0, (%0,%2,2) \n\t"
1716  MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
1717  MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
1718  MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
1719  MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
1720  MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
1721  MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
1722  MOVNTQ" %%mm7, 56(%0,%2,2)"
1723  :: "r"(d), "r"(s1), "r"(x)
1724  :"memory");
1725  }
1726  for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
1727  }
1728  for (x86_reg y = 0; y < h; y++) {
1729  const uint8_t* s2=src2+srcStride2*(y>>1);
1730  uint8_t* d=dst2+dstStride2*y;
1731  x86_reg x = 0;
1732  for (;x<w-31;x+=32) {
1733  __asm__ volatile(
1734  PREFETCH" 32(%1,%2) \n\t"
1735  "movq (%1,%2), %%mm0 \n\t"
1736  "movq 8(%1,%2), %%mm2 \n\t"
1737  "movq 16(%1,%2), %%mm4 \n\t"
1738  "movq 24(%1,%2), %%mm6 \n\t"
1739  "movq %%mm0, %%mm1 \n\t"
1740  "movq %%mm2, %%mm3 \n\t"
1741  "movq %%mm4, %%mm5 \n\t"
1742  "movq %%mm6, %%mm7 \n\t"
1743  "punpcklbw %%mm0, %%mm0 \n\t"
1744  "punpckhbw %%mm1, %%mm1 \n\t"
1745  "punpcklbw %%mm2, %%mm2 \n\t"
1746  "punpckhbw %%mm3, %%mm3 \n\t"
1747  "punpcklbw %%mm4, %%mm4 \n\t"
1748  "punpckhbw %%mm5, %%mm5 \n\t"
1749  "punpcklbw %%mm6, %%mm6 \n\t"
1750  "punpckhbw %%mm7, %%mm7 \n\t"
1751  MOVNTQ" %%mm0, (%0,%2,2) \n\t"
1752  MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
1753  MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
1754  MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
1755  MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
1756  MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
1757  MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
1758  MOVNTQ" %%mm7, 56(%0,%2,2)"
1759  :: "r"(d), "r"(s2), "r"(x)
1760  :"memory");
1761  }
1762  for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
1763  }
1764  __asm__(
1765  EMMS" \n\t"
1766  SFENCE" \n\t"
1767  ::: "memory"
1768  );
1769 }
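/*
 * In scalar terms the function above is a 2x chroma upsampler that works by
 * plain replication: each source byte is doubled horizontally (the
 * punpcklbw/punpckhbw of a register with itself) and each source line is
 * used for two output lines (the y>>1 indexing).  An illustrative
 * single-plane equivalent (the _ref name is made up):
 */
#if 0
static void upsample2x_plane_ref(const uint8_t *src, uint8_t *dst,
                                 int w, int h, int srcStride, int dstStride)
{
    for (int y = 0; y < h; y++) {
        const uint8_t *s = src + srcStride * (y >> 1); /* reuse line twice */
        uint8_t       *d = dst + dstStride * y;
        for (int x = 0; x < w; x++)
            d[2*x] = d[2*x+1] = s[x];                  /* double each byte */
    }
}
#endif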
1770 
1771 static inline void yvu9_to_yuy2_mmxext(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
1772  uint8_t *dst,
1773  int width, int height,
1774  int srcStride1, int srcStride2,
1775  int srcStride3, int dstStride)
1776 {
1777  int w,h;
1778  w=width/2; h=height;
1779  for (int y = 0; y < h; y++) {
1780  const uint8_t* yp=src1+srcStride1*y;
1781  const uint8_t* up=src2+srcStride2*(y>>2);
1782  const uint8_t* vp=src3+srcStride3*(y>>2);
1783  uint8_t* d=dst+dstStride*y;
1784  x86_reg x = 0;
1785  for (;x<w-7;x+=8) {
1786  __asm__ volatile(
1787  PREFETCH" 32(%1, %0) \n\t"
1788  PREFETCH" 32(%2, %0) \n\t"
1789  PREFETCH" 32(%3, %0) \n\t"
1790  "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
1791  "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
1792  "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
1793  "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
1794  "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
1795  "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
1796  "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
1797  "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
1798  "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
1799  "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
1800 
1801  "movq %%mm1, %%mm6 \n\t"
1802  "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
1803  "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
1804  "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
1805  MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
1806  MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
1807 
1808  "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
1809  "movq 8(%1, %0, 4), %%mm0 \n\t"
1810  "movq %%mm0, %%mm3 \n\t"
1811  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
1812  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
1813  MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
1814  MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
1815 
1816  "movq %%mm4, %%mm6 \n\t"
1817  "movq 16(%1, %0, 4), %%mm0 \n\t"
1818  "movq %%mm0, %%mm3 \n\t"
1819  "punpcklbw %%mm5, %%mm4 \n\t"
1820  "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
1821  "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
1822  MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
1823  MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
1824 
1825  "punpckhbw %%mm5, %%mm6 \n\t"
1826  "movq 24(%1, %0, 4), %%mm0 \n\t"
1827  "movq %%mm0, %%mm3 \n\t"
1828  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
1829  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
1830  MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
1831  MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
1832 
1833  : "+r" (x)
1834  : "r"(yp), "r" (up), "r"(vp), "r"(d)
1835  :"memory");
1836  }
1837  for (; x<w; x++) {
1838  const int x2 = x<<2;
1839  d[8*x+0] = yp[x2];
1840  d[8*x+1] = up[x];
1841  d[8*x+2] = yp[x2+1];
1842  d[8*x+3] = vp[x];
1843  d[8*x+4] = yp[x2+2];
1844  d[8*x+5] = up[x];
1845  d[8*x+6] = yp[x2+3];
1846  d[8*x+7] = vp[x];
1847  }
1848  }
1849  __asm__(
1850  EMMS" \n\t"
1851  SFENCE" \n\t"
1852  ::: "memory"
1853  );
1854 }
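/*
 * Layout note: YUY2 packs pixels as Y0 U0 Y1 V0 Y2 U1 Y3 V1 ..., while the
 * YVU9 input carries one chroma sample per 4x4 luma block.  Hence the asm
 * above reads up[]/vp[] from row y>>2 and reuses each chroma byte for four
 * horizontally adjacent luma samples, exactly as the scalar tail loop
 * (d[8*x+1] = d[8*x+5] = up[x], and likewise for vp[]) spells out.
 */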
1855 
1856 static void extract_even_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
1857 {
1858  dst += count;
1859  src += 2*count;
1860  count= - count;
1861 
1862  if(count <= -16) {
1863  count += 15;
1864  __asm__ volatile(
1865  "pcmpeqw %%mm7, %%mm7 \n\t"
1866  "psrlw $8, %%mm7 \n\t"
1867  "1: \n\t"
1868  "movq -30(%1, %0, 2), %%mm0 \n\t"
1869  "movq -22(%1, %0, 2), %%mm1 \n\t"
1870  "movq -14(%1, %0, 2), %%mm2 \n\t"
1871  "movq -6(%1, %0, 2), %%mm3 \n\t"
1872  "pand %%mm7, %%mm0 \n\t"
1873  "pand %%mm7, %%mm1 \n\t"
1874  "pand %%mm7, %%mm2 \n\t"
1875  "pand %%mm7, %%mm3 \n\t"
1876  "packuswb %%mm1, %%mm0 \n\t"
1877  "packuswb %%mm3, %%mm2 \n\t"
1878  MOVNTQ" %%mm0,-15(%2, %0) \n\t"
1879  MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
1880  "add $16, %0 \n\t"
1881  " js 1b \n\t"
1882  : "+r"(count)
1883  : "r"(src), "r"(dst)
1884  );
1885  count -= 15;
1886  }
1887  while(count<0) {
1888  dst[count]= src[2*count];
1889  count++;
1890  }
1891 }
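/*
 * All of the extract_*() helpers here share one indexing idiom: the
 * pointers are advanced past the end of the data, count is negated, and the
 * loop counts up towards zero, so a single "add / js 1b" pair serves as
 * both induction step and loop condition.  In plain C, extract_even()
 * boils down to (illustrative sketch, name made up):
 */
#if 0
static void extract_even_ref(const uint8_t *src, uint8_t *dst, long count)
{
    for (long i = 0; i < count; i++)
        dst[i] = src[2 * i];   /* keep every byte at an even offset */
}
#endif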
1892 
1893 static void extract_odd_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
1894 {
1895  src ++;
1896  dst += count;
1897  src += 2*count;
1898  count= - count;
1899 
1900  if(count < -16) {
1901  count += 16;
1902  __asm__ volatile(
1903  "pcmpeqw %%mm7, %%mm7 \n\t"
1904  "psrlw $8, %%mm7 \n\t"
1905  "1: \n\t"
1906  "movq -32(%1, %0, 2), %%mm0 \n\t"
1907  "movq -24(%1, %0, 2), %%mm1 \n\t"
1908  "movq -16(%1, %0, 2), %%mm2 \n\t"
1909  "movq -8(%1, %0, 2), %%mm3 \n\t"
1910  "pand %%mm7, %%mm0 \n\t"
1911  "pand %%mm7, %%mm1 \n\t"
1912  "pand %%mm7, %%mm2 \n\t"
1913  "pand %%mm7, %%mm3 \n\t"
1914  "packuswb %%mm1, %%mm0 \n\t"
1915  "packuswb %%mm3, %%mm2 \n\t"
1916  MOVNTQ" %%mm0,-16(%2, %0) \n\t"
1917  MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
1918  "add $16, %0 \n\t"
1919  " js 1b \n\t"
1920  : "+r"(count)
1921  : "r"(src), "r"(dst)
1922  );
1923  count -= 16;
1924  }
1925  while(count<0) {
1926  dst[count]= src[2*count];
1927  count++;
1928  }
1929 }
1930 
1931 #if ARCH_X86_32
1932 static void extract_even2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
1933 {
1934  dst0+= count;
1935  dst1+= count;
1936  src += 4*count;
1937  count= - count;
1938  if(count <= -8) {
1939  count += 7;
1940  __asm__ volatile(
1941  "pcmpeqw %%mm7, %%mm7 \n\t"
1942  "psrlw $8, %%mm7 \n\t"
1943  "1: \n\t"
1944  "movq -28(%1, %0, 4), %%mm0 \n\t"
1945  "movq -20(%1, %0, 4), %%mm1 \n\t"
1946  "movq -12(%1, %0, 4), %%mm2 \n\t"
1947  "movq -4(%1, %0, 4), %%mm3 \n\t"
1948  "pand %%mm7, %%mm0 \n\t"
1949  "pand %%mm7, %%mm1 \n\t"
1950  "pand %%mm7, %%mm2 \n\t"
1951  "pand %%mm7, %%mm3 \n\t"
1952  "packuswb %%mm1, %%mm0 \n\t"
1953  "packuswb %%mm3, %%mm2 \n\t"
1954  "movq %%mm0, %%mm1 \n\t"
1955  "movq %%mm2, %%mm3 \n\t"
1956  "psrlw $8, %%mm0 \n\t"
1957  "psrlw $8, %%mm2 \n\t"
1958  "pand %%mm7, %%mm1 \n\t"
1959  "pand %%mm7, %%mm3 \n\t"
1960  "packuswb %%mm2, %%mm0 \n\t"
1961  "packuswb %%mm3, %%mm1 \n\t"
1962  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
1963  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
1964  "add $8, %0 \n\t"
1965  " js 1b \n\t"
1966  : "+r"(count)
1967  : "r"(src), "r"(dst0), "r"(dst1)
1968  );
1969  count -= 7;
1970  }
1971  while(count<0) {
1972  dst0[count]= src[4*count+0];
1973  dst1[count]= src[4*count+2];
1974  count++;
1975  }
1976 }
1977 #endif /* ARCH_X86_32 */
1978 
1979 static void extract_even2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
1980 {
1981  dst0 += count;
1982  dst1 += count;
1983  src0 += 4*count;
1984  src1 += 4*count;
1985  count= - count;
1986 #ifdef PAVGB
1987  if(count <= -8) {
1988  count += 7;
1989  __asm__ volatile(
1990  "pcmpeqw %%mm7, %%mm7 \n\t"
1991  "psrlw $8, %%mm7 \n\t"
1992  "1: \n\t"
1993  "movq -28(%1, %0, 4), %%mm0 \n\t"
1994  "movq -20(%1, %0, 4), %%mm1 \n\t"
1995  "movq -12(%1, %0, 4), %%mm2 \n\t"
1996  "movq -4(%1, %0, 4), %%mm3 \n\t"
1997  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
1998  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
1999  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2000  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2001  "pand %%mm7, %%mm0 \n\t"
2002  "pand %%mm7, %%mm1 \n\t"
2003  "pand %%mm7, %%mm2 \n\t"
2004  "pand %%mm7, %%mm3 \n\t"
2005  "packuswb %%mm1, %%mm0 \n\t"
2006  "packuswb %%mm3, %%mm2 \n\t"
2007  "movq %%mm0, %%mm1 \n\t"
2008  "movq %%mm2, %%mm3 \n\t"
2009  "psrlw $8, %%mm0 \n\t"
2010  "psrlw $8, %%mm2 \n\t"
2011  "pand %%mm7, %%mm1 \n\t"
2012  "pand %%mm7, %%mm3 \n\t"
2013  "packuswb %%mm2, %%mm0 \n\t"
2014  "packuswb %%mm3, %%mm1 \n\t"
2015  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2016  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2017  "add $8, %0 \n\t"
2018  " js 1b \n\t"
2019  : "+r"(count)
2020  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2021  );
2022  count -= 7;
2023  }
2024 #endif
2025  while(count<0) {
2026  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2027  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2028  count++;
2029  }
2030 }
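/*
 * Rounding note for the *avg helpers: PAVGB computes (a + b + 1) >> 1
 * (round half up), while the scalar tail above uses (a + b) >> 1
 * (truncating), so the SIMD and C paths may differ by one LSB on odd sums.
 */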
2031 
2032 static void extract_odd2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2033 {
2034  dst0+= count;
2035  dst1+= count;
2036  src += 4*count;
2037  count= - count;
2038  if(count <= -8) {
2039  count += 7;
2040  __asm__ volatile(
2041  "pcmpeqw %%mm7, %%mm7 \n\t"
2042  "psrlw $8, %%mm7 \n\t"
2043  "1: \n\t"
2044  "movq -28(%1, %0, 4), %%mm0 \n\t"
2045  "movq -20(%1, %0, 4), %%mm1 \n\t"
2046  "movq -12(%1, %0, 4), %%mm2 \n\t"
2047  "movq -4(%1, %0, 4), %%mm3 \n\t"
2048  "psrlw $8, %%mm0 \n\t"
2049  "psrlw $8, %%mm1 \n\t"
2050  "psrlw $8, %%mm2 \n\t"
2051  "psrlw $8, %%mm3 \n\t"
2052  "packuswb %%mm1, %%mm0 \n\t"
2053  "packuswb %%mm3, %%mm2 \n\t"
2054  "movq %%mm0, %%mm1 \n\t"
2055  "movq %%mm2, %%mm3 \n\t"
2056  "psrlw $8, %%mm0 \n\t"
2057  "psrlw $8, %%mm2 \n\t"
2058  "pand %%mm7, %%mm1 \n\t"
2059  "pand %%mm7, %%mm3 \n\t"
2060  "packuswb %%mm2, %%mm0 \n\t"
2061  "packuswb %%mm3, %%mm1 \n\t"
2062  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2063  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2064  "add $8, %0 \n\t"
2065  " js 1b \n\t"
2066  : "+r"(count)
2067  : "r"(src), "r"(dst0), "r"(dst1)
2068  );
2069  count -= 7;
2070  }
2071  src++;
2072  while(count<0) {
2073  dst0[count]= src[4*count+0];
2074  dst1[count]= src[4*count+2];
2075  count++;
2076  }
2077 }
2078 
2079 static void extract_odd2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2080 {
2081  dst0 += count;
2082  dst1 += count;
2083  src0 += 4*count;
2084  src1 += 4*count;
2085  count= - count;
2086 #ifdef PAVGB
2087  if(count <= -8) {
2088  count += 7;
2089  __asm__ volatile(
2090  "pcmpeqw %%mm7, %%mm7 \n\t"
2091  "psrlw $8, %%mm7 \n\t"
2092  "1: \n\t"
2093  "movq -28(%1, %0, 4), %%mm0 \n\t"
2094  "movq -20(%1, %0, 4), %%mm1 \n\t"
2095  "movq -12(%1, %0, 4), %%mm2 \n\t"
2096  "movq -4(%1, %0, 4), %%mm3 \n\t"
2097  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2098  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2099  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2100  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2101  "psrlw $8, %%mm0 \n\t"
2102  "psrlw $8, %%mm1 \n\t"
2103  "psrlw $8, %%mm2 \n\t"
2104  "psrlw $8, %%mm3 \n\t"
2105  "packuswb %%mm1, %%mm0 \n\t"
2106  "packuswb %%mm3, %%mm2 \n\t"
2107  "movq %%mm0, %%mm1 \n\t"
2108  "movq %%mm2, %%mm3 \n\t"
2109  "psrlw $8, %%mm0 \n\t"
2110  "psrlw $8, %%mm2 \n\t"
2111  "pand %%mm7, %%mm1 \n\t"
2112  "pand %%mm7, %%mm3 \n\t"
2113  "packuswb %%mm2, %%mm0 \n\t"
2114  "packuswb %%mm3, %%mm1 \n\t"
2115  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2116  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2117  "add $8, %0 \n\t"
2118  " js 1b \n\t"
2119  : "+r"(count)
2120  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2121  );
2122  count -= 7;
2123  }
2124 #endif
2125  src0++;
2126  src1++;
2127  while(count<0) {
2128  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2129  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2130  count++;
2131  }
2132 }
2133 
2134 static void yuyvtoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2135  int width, int height,
2136  int lumStride, int chromStride, int srcStride)
2137 {
2138  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2139 
2140  for (int y = 0; y < height; y++) {
2141  extract_even_mmxext(src, ydst, width);
2142  if(y&1) {
2143  extract_odd2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
2144  udst+= chromStride;
2145  vdst+= chromStride;
2146  }
2147 
2148  src += srcStride;
2149  ydst+= lumStride;
2150  }
2151  __asm__(
2152  EMMS" \n\t"
2153  SFENCE" \n\t"
2154  ::: "memory"
2155  );
2156 }
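/*
 * 4:2:0 recap: luma comes from every line (the even bytes of YUYV), but
 * chroma is emitted only on odd lines, vertically averaged with the line
 * above via extract_odd2avg().  A plain-C equivalent of the function above
 * (illustrative only; the _ref name is made up):
 */
#if 0
static void yuyvtoyuv420_ref(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                             const uint8_t *src, int width, int height,
                             int lumStride, int chromStride, int srcStride)
{
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            ydst[x] = src[2 * x];                      /* Y at even bytes */
        if (y & 1) {                                   /* chroma on odd rows */
            const uint8_t *prev = src - srcStride;
            for (int x = 0; x < chromWidth; x++) {
                udst[x] = (prev[4 * x + 1] + src[4 * x + 1]) >> 1;
                vdst[x] = (prev[4 * x + 3] + src[4 * x + 3]) >> 1;
            }
            udst += chromStride;
            vdst += chromStride;
        }
        src  += srcStride;
        ydst += lumStride;
    }
}
#endif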
2157 
2158 static void yuyvtoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2159  int width, int height,
2160  int lumStride, int chromStride, int srcStride)
2161 {
2162  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2163 
2164  for (int y = 0; y < height; y++) {
2165  extract_even_mmxext(src, ydst, width);
2166  extract_odd2_mmxext(src, udst, vdst, chromWidth);
2167 
2168  src += srcStride;
2169  ydst+= lumStride;
2170  udst+= chromStride;
2171  vdst+= chromStride;
2172  }
2173  __asm__(
2174  EMMS" \n\t"
2175  SFENCE" \n\t"
2176  ::: "memory"
2177  );
2178 }
2179 
2180 static void uyvytoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2181  int width, int height,
2182  int lumStride, int chromStride, int srcStride)
2183 {
2184  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2185 
2186  for (int y = 0; y < height; y++) {
2187  extract_odd_mmxext(src, ydst, width);
2188  if(y&1) {
2189  extract_even2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
2190  udst+= chromStride;
2191  vdst+= chromStride;
2192  }
2193 
2194  src += srcStride;
2195  ydst+= lumStride;
2196  }
2197  __asm__(
2198  EMMS" \n\t"
2199  SFENCE" \n\t"
2200  ::: "memory"
2201  );
2202 }
2203 
2204 #if ARCH_X86_32
2205 static void uyvytoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2206  int width, int height,
2207  int lumStride, int chromStride, int srcStride)
2208 {
2209  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2210 
2211  for (int y = 0; y < height; y++) {
2212  extract_odd_mmxext(src, ydst, width);
2213  extract_even2_mmxext(src, udst, vdst, chromWidth);
2214 
2215  src += srcStride;
2216  ydst+= lumStride;
2217  udst+= chromStride;
2218  vdst+= chromStride;
2219  }
2220  __asm__(
2221  EMMS" \n\t"
2222  SFENCE" \n\t"
2223  ::: "memory"
2224  );
2225 }
2226 #endif /* ARCH_X86_32 */
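/*
 * The inline-asm uyvytoyuv422 is built only for x86-32; on x86-64 the
 * external ff_uyvytoyuv422_sse2/avx/avx2 versions installed by
 * rgb2rgb_init_x86() below cover that case.
 */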
2227 
2228 static av_cold void rgb2rgb_init_mmxext(void)
2229 {
2230  rgb15to16 = rgb15to16_mmxext;
2231  rgb15tobgr24 = rgb15tobgr24_mmxext;
2232  rgb15to32 = rgb15to32_mmxext;
2233  rgb16tobgr24 = rgb16tobgr24_mmxext;
2234  rgb16to32 = rgb16to32_mmxext;
2235  rgb16to15 = rgb16to15_mmxext;
2236  rgb24tobgr16 = rgb24tobgr16_mmxext;
2237  rgb24tobgr15 = rgb24tobgr15_mmxext;
2238  rgb24tobgr32 = rgb24tobgr32_mmxext;
2239  rgb32to16 = rgb32to16_mmxext;
2240  rgb32to15 = rgb32to15_mmxext;
2241  rgb32tobgr24 = rgb32tobgr24_mmxext;
2242  rgb24to15 = rgb24to15_mmxext;
2243  rgb24to16 = rgb24to16_mmxext;
2244  rgb24tobgr24 = rgb24tobgr24_mmxext;
2245  rgb32tobgr16 = rgb32tobgr16_mmxext;
2246  rgb32tobgr15 = rgb32tobgr15_mmxext;
2247  yv12toyuy2 = yv12toyuy2_mmxext;
2248  yv12touyvy = yv12touyvy_mmxext;
2249  yuv422ptoyuy2 = yuv422ptoyuy2_mmxext;
2250  yuv422ptouyvy = yuv422ptouyvy_mmxext;
2251  yuy2toyv12 = yuy2toyv12_mmxext;
2252  vu9_to_vu12 = vu9_to_vu12_mmxext;
2253  yvu9_to_yuy2 = yvu9_to_yuy2_mmxext;
2254 #if ARCH_X86_32
2255  uyvytoyuv422 = uyvytoyuv422_mmxext;
2256 #endif
2257  yuyvtoyuv422 = yuyvtoyuv422_mmxext;
2258 
2259  planar2x = planar2x_mmxext;
2260 #if ARCH_X86_32 && HAVE_7REGS
2261  ff_rgb24toyv12 = rgb24toyv12_mmxext;
2262 #endif /* ARCH_X86_32 && HAVE_7REGS */
2263 
2264  yuyvtoyuv420 = yuyvtoyuv420_mmxext;
2265  uyvytoyuv420 = uyvytoyuv420_mmxext;
2266 }
2267 
2268 //SSE2 versions
2269 static void interleave_bytes_sse2(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
2270  int width, int height, int src1Stride,
2271  int src2Stride, int dstStride)
2272 {
2273  for (int h = 0; h < height; h++) {
2274  if (width >= 16) {
2275  if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
2276  __asm__(
2277  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
2278  "1: \n\t"
2279  PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
2280  PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
2281  "movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t"
2282  "movdqa (%1, %%"FF_REG_a"), %%xmm1 \n\t"
2283  "movdqa (%2, %%"FF_REG_a"), %%xmm2 \n\t"
2284  "punpcklbw %%xmm2, %%xmm0 \n\t"
2285  "punpckhbw %%xmm2, %%xmm1 \n\t"
2286  "movntdq %%xmm0, (%0, %%"FF_REG_a", 2) \n\t"
2287  "movntdq %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t"
2288  "add $16, %%"FF_REG_a" \n\t"
2289  "cmp %3, %%"FF_REG_a" \n\t"
2290  " jb 1b \n\t"
2291  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2292  : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a
2293  );
2294  } else
2295  __asm__(
2296  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
2297  "1: \n\t"
2298  PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
2299  PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
2300  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
2301  "movq 8(%1, %%"FF_REG_a"), %%mm2 \n\t"
2302  "movq %%mm0, %%mm1 \n\t"
2303  "movq %%mm2, %%mm3 \n\t"
2304  "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
2305  "movq 8(%2, %%"FF_REG_a"), %%mm5 \n\t"
2306  "punpcklbw %%mm4, %%mm0 \n\t"
2307  "punpckhbw %%mm4, %%mm1 \n\t"
2308  "punpcklbw %%mm5, %%mm2 \n\t"
2309  "punpckhbw %%mm5, %%mm3 \n\t"
2310  MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 2) \n\t"
2311  MOVNTQ" %%mm1, 8(%0, %%"FF_REG_a", 2) \n\t"
2312  MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t"
2313  MOVNTQ" %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t"
2314  "add $16, %%"FF_REG_a" \n\t"
2315  "cmp %3, %%"FF_REG_a" \n\t"
2316  " jb 1b \n\t"
2317  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2318  : "memory", "%"FF_REG_a
2319  );
2320 
2321  }
2322  for (int w = (width & (~15)); w < width; w++) {
2323  dest[2*w+0] = src1[w];
2324  dest[2*w+1] = src2[w];
2325  }
2326  dest += dstStride;
2327  src1 += src1Stride;
2328  src2 += src2Stride;
2329  }
2330  __asm__(
2331  EMMS" \n\t"
2332  SFENCE" \n\t"
2333  ::: "memory"
2334  );
2335 }
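/*
 * Dispatch note: the movdqa/movntdq path above is taken only when src1,
 * src2 and dest are all 16-byte aligned (both instructions fault on
 * unaligned addresses); anything else falls back to the 8-byte MMX loop.
 * Per row, both paths compute (illustrative sketch, name made up):
 */
#if 0
static void interleave_row_ref(const uint8_t *a, const uint8_t *b,
                               uint8_t *dst, int width)
{
    for (int w = 0; w < width; w++) {
        dst[2 * w + 0] = a[w];  /* byte from the first source */
        dst[2 * w + 1] = b[w];  /* interleaved with the second */
    }
}
#endif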
2336 
2337 /*
2338  RGB15->RGB16 original by Strepto/Astral
2339  ported to gcc & bugfixed : A'rpi
2340  MMXEXT, 3DNOW optimization by Nick Kurshev
2341  32-bit C version, and the and+add trick by Michael Niedermayer
2342 */
2343 
2344 #endif /* HAVE_INLINE_ASM */
2345 
2346 void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2347 void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2348 void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2349 void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2350 void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2351 void ff_shuffle_bytes_3102_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2352 void ff_shuffle_bytes_2013_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2353 void ff_shuffle_bytes_2130_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2354 void ff_shuffle_bytes_1203_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2355 
2356 #if ARCH_X86_64
2357 void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2358 void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2359 void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2360 void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2361 void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2362 void ff_shuffle_bytes_3102_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2363 void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2364 void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2365 void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2366 
2367 void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2368  const uint8_t *src, int width, int height,
2369  int lumStride, int chromStride, int srcStride);
2370 void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2371  const uint8_t *src, int width, int height,
2372  int lumStride, int chromStride, int srcStride);
2373 void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2374  const uint8_t *src, int width, int height,
2375  int lumStride, int chromStride, int srcStride);
2376 #endif
2377 
2378 #define DEINTERLEAVE_BYTES(cpuext) \
2379 void ff_nv12ToUV_ ## cpuext(uint8_t *dstU, uint8_t *dstV, \
2380  const uint8_t *unused, \
2381  const uint8_t *src1, \
2382  const uint8_t *src2, \
2383  int w, \
2384  uint32_t *unused2, \
2385  void *opq); \
2386 static void deinterleave_bytes_ ## cpuext(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, \
2387  int width, int height, int srcStride, \
2388  int dst1Stride, int dst2Stride) \
2389 { \
2390  for (int h = 0; h < height; h++) { \
2391  if (width >= 16) \
2392  ff_nv12ToUV_ ## cpuext(dst1, dst2, NULL, src, NULL, width - 15, NULL, NULL); \
2393  for (int w = (width & (~15)); w < width; w++) { \
2394  dst1[w] = src[2*w+0]; \
2395  dst2[w] = src[2*w+1]; \
2396  } \
2397  src += srcStride; \
2398  dst1 += dst1Stride; \
2399  dst2 += dst2Stride; \
2400  } \
2401 }
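/*
 * The macro above reuses the ff_nv12ToUV_*() assembly (written to split
 * NV12's interleaved UV plane) as a general byte de-interleaver.  It passes
 * width - 15 so the 16-bytes-at-a-time vector loop stays within bounds, and
 * the scalar loop then finishes the tail from the last aligned offset
 * (width & ~15).
 */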
2402 
2403 #if HAVE_SSE2_EXTERNAL
2404 DEINTERLEAVE_BYTES(sse2)
2405 #endif
2406 #if HAVE_AVX_EXTERNAL
2407 DEINTERLEAVE_BYTES(avx)
2408 #endif
2409 
2410 av_cold void rgb2rgb_init_x86(void)
2411 {
2412  int cpu_flags = av_get_cpu_flags();
2413 
2414 #if HAVE_INLINE_ASM
2415  if (INLINE_MMXEXT(cpu_flags))
2416  rgb2rgb_init_mmxext();
2417  if (INLINE_SSE2(cpu_flags))
2418  interleaveBytes = interleave_bytes_sse2;
2419 #endif /* HAVE_INLINE_ASM */
2420 
2421 #if HAVE_SSE2_EXTERNAL
2422  if (EXTERNAL_SSE2(cpu_flags)) {
2423 #if ARCH_X86_64
2424  uyvytoyuv422 = ff_uyvytoyuv422_sse2;
2425 #endif
2426  deinterleaveBytes = deinterleave_bytes_sse2;
2427  }
2428 #endif
2429  if (EXTERNAL_SSSE3(cpu_flags)) {
2430  shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3;
2431  shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3;
2432  shuffle_bytes_1230 = ff_shuffle_bytes_1230_ssse3;
2433  shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
2434  shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
2435  shuffle_bytes_3102 = ff_shuffle_bytes_3102_ssse3;
2436  shuffle_bytes_2013 = ff_shuffle_bytes_2013_ssse3;
2437  shuffle_bytes_2130 = ff_shuffle_bytes_2130_ssse3;
2438  shuffle_bytes_1203 = ff_shuffle_bytes_1203_ssse3;
2439  }
2440 #if HAVE_AVX_EXTERNAL
2441  if (EXTERNAL_AVX(cpu_flags)) {
2442  deinterleaveBytes = deinterleave_bytes_avx;
2443 #if ARCH_X86_64
2444  uyvytoyuv422 = ff_uyvytoyuv422_avx;
2445  }
2446  if (EXTERNAL_AVX2_FAST(cpu_flags)) {
2447  shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2;
2448  shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2;
2449  shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2;
2450  shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
2451  shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
2452  shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx2;
2453  shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx2;
2454  shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
2455  shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
2456  }
2457  if (EXTERNAL_AVX2_FAST(cpu_flags)) {
2458  uyvytoyuv422 = ff_uyvytoyuv422_avx2;
2459 #endif
2460  }
2461 #endif
2462 }