FFmpeg
postprocess_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 /**
22  * @file
23  * mmx/mmx2/3dnow postprocess code.
24  */
25 
26 #include "libavutil/x86/asm.h"
27 
28 /* A single TEMPLATE_PP_* should be defined (to 1) when this template is
29  * included. The following macros will define its dependencies to 1 as well
30  * (like MMX2 depending on MMX), and will define to 0 all the others. Every
31  * TEMPLATE_PP_* needs to be undefined at the end. */
32 
/* Variant selection.
 * For each TEMPLATE_PP_* macro: if the includer defined it, RENAME() is set up
 * to append the matching suffix to every function name in this template;
 * otherwise the macro is defined to 0 so the code below can test it with a
 * plain #if. RENAME() is only defined once as long as the includer defines a
 * single TEMPLATE_PP_* macro. */
33 #ifdef TEMPLATE_PP_C
34 # define RENAME(a) a ## _C
35 #else
36 # define TEMPLATE_PP_C 0
37 #endif
38 
39 #ifdef TEMPLATE_PP_ALTIVEC
40 # define RENAME(a) a ## _altivec
41 #else
42 # define TEMPLATE_PP_ALTIVEC 0
43 #endif
44 
45 #ifdef TEMPLATE_PP_MMX
46 # define RENAME(a) a ## _MMX
47 #else
48 # define TEMPLATE_PP_MMX 0
49 #endif
50 
/* MMXEXT depends on MMX: force TEMPLATE_PP_MMX to 1 (it was set to 0 just
 * above when the includer did not define it). */
51 #ifdef TEMPLATE_PP_MMXEXT
52 # undef TEMPLATE_PP_MMX
53 # define TEMPLATE_PP_MMX 1
54 # define RENAME(a) a ## _MMX2
55 #else
56 # define TEMPLATE_PP_MMXEXT 0
57 #endif
58 
/* 3DNow! depends on MMX as well. */
59 #ifdef TEMPLATE_PP_3DNOW
60 # undef TEMPLATE_PP_MMX
61 # define TEMPLATE_PP_MMX 1
62 # define RENAME(a) a ## _3DNow
63 #else
64 # define TEMPLATE_PP_3DNOW 0
65 #endif
66 
/* SSE2 turns on both the MMX and the MMXEXT code paths. */
67 #ifdef TEMPLATE_PP_SSE2
68 # undef TEMPLATE_PP_MMX
69 # define TEMPLATE_PP_MMX 1
70 # undef TEMPLATE_PP_MMXEXT
71 # define TEMPLATE_PP_MMXEXT 1
72 # define RENAME(a) a ## _SSE2
73 #else
74 # define TEMPLATE_PP_SSE2 0
75 #endif
76 
/* Remove any earlier definitions before redefining the helpers below
 * (presumably left over from a previous inclusion of this template with a
 * different TEMPLATE_PP_* setting). */
77 #undef REAL_PAVGB
78 #undef PAVGB
79 #undef PMINUB
80 #undef PMAXUB
81 
/* PAVGB(a,b): emits the packed unsigned byte average instruction as an asm
 * string, "pavgb" for MMXEXT or "pavgusb" for 3DNow!. The extra REAL_PAVGB
 * level lets macro arguments be expanded before they are stringified with #.
 * When neither MMXEXT nor 3DNOW is enabled REAL_PAVGB stays undefined, so
 * PAVGB must not be used in that configuration. */
82 #if TEMPLATE_PP_MMXEXT
83 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
84 #elif TEMPLATE_PP_3DNOW
85 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
86 #endif
87 #define PAVGB(a,b) REAL_PAVGB(a,b)
88 
/* PMINUB(x,y,t): y = unsigned byte-wise min(x, y).
 * MMXEXT uses the native instruction and ignores the scratch register t.
 * The plain MMX fallback names its parameters (b,a,t), i.e. the destination
 * is still the second argument; it computes t = a, t = sat(t - b), a -= t
 * and clobbers t. */
89 #if TEMPLATE_PP_MMXEXT
90 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
91 #elif TEMPLATE_PP_MMX
92 #define PMINUB(b,a,t) \
93  "movq " #a ", " #t " \n\t"\
94  "psubusb " #b ", " #t " \n\t"\
95  "psubb " #t ", " #a " \n\t"
96 #endif
97 
/* PMAXUB(a,b): b = unsigned byte-wise max(a, b).
 * The plain MMX fallback computes b = sat(b - a) + a. */
98 #if TEMPLATE_PP_MMXEXT
99 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
100 #elif TEMPLATE_PP_MMX
101 #define PMAXUB(a,b) \
102  "psubusb " #a ", " #b " \n\t"\
103  "paddb " #a ", " #b " \n\t"
104 #endif
105 
106 //FIXME? |255-0| = 1 (should not be a problem ...)
107 #if TEMPLATE_PP_MMX
108 /**
109  * Check if the middle 8x8 Block in the given 8x16 block is flat
 *
 * Works on the 8 lines starting at src + 4*stride, 8 bytes per line.
 * For each of the 7 pairs of neighbouring lines, every byte column where
 * (upper - lower + c->mmxDcOffset[c->nonBQP]), computed with wrapping byte
 * arithmetic, is greater than c->mmxDcThreshold[c->nonBQP] (signed byte
 * compare) is counted; the total (at most 7*8 = 56) ends up in numEq.
 * In parallel the per-column maximum and minimum over the 8 lines are
 * tracked; dcOk is non-zero if, in any column, max - min is still non-zero
 * after a saturating subtraction of 2*c->pQPb.
 *
 * @param src    top of the 8x16 block (only read)
 * @param stride distance in bytes between two lines
 * @param c      context supplying mmxDcOffset, mmxDcThreshold, nonBQP, pQPb
 *               and ppMode.flatnessThreshold
 * @return 2 if numEq <= c->ppMode.flatnessThreshold; otherwise 0 if dcOk is
 *         non-zero and 1 if dcOk is zero
110  */
111 static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContext *c){
112  int numEq= 0, dcOk;
113  src+= stride*4; // src points to begin of the 8x8 Block
 // NOTE: mm7/mm6 loaded by this statement are consumed by the next asm
 // statement; the dependency is not expressed through operands or clobbers.
114  __asm__ volatile(
115  "movq %0, %%mm7 \n\t" // mm7 = c->mmxDcOffset[c->nonBQP]
116  "movq %1, %%mm6 \n\t" // mm6 = c->mmxDcThreshold[c->nonBQP]
117  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
118  );
119 
 // Register use below: mm0 = accumulated compare results (-1 per hit),
 // mm3 = running per-column minimum, mm4 = running per-column maximum,
 // mm5 = scratch for the MMX PMINUB fallback, mm1/mm2 = alternating lines.
120  __asm__ volatile(
121  "lea (%2, %3), %%"FF_REG_a" \n\t" // REG_a = src + stride
122 // 0 1 2 3 4 5 6 7 8 9
123 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
124 
125  "movq (%2), %%mm0 \n\t"
126  "movq (%%"FF_REG_a"), %%mm1 \n\t"
127  "movq %%mm0, %%mm3 \n\t"
128  "movq %%mm0, %%mm4 \n\t"
129  PMAXUB(%%mm1, %%mm4)
130  PMINUB(%%mm1, %%mm3, %%mm5)
131  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
132  "paddb %%mm7, %%mm0 \n\t"
133  "pcmpgtb %%mm6, %%mm0 \n\t" // 0xFF where difference + offset > threshold
134 
135  "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
136  PMAXUB(%%mm2, %%mm4)
137  PMINUB(%%mm2, %%mm3, %%mm5)
138  "psubb %%mm2, %%mm1 \n\t"
139  "paddb %%mm7, %%mm1 \n\t"
140  "pcmpgtb %%mm6, %%mm1 \n\t"
141  "paddb %%mm1, %%mm0 \n\t"
142 
143  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
144  PMAXUB(%%mm1, %%mm4)
145  PMINUB(%%mm1, %%mm3, %%mm5)
146  "psubb %%mm1, %%mm2 \n\t"
147  "paddb %%mm7, %%mm2 \n\t"
148  "pcmpgtb %%mm6, %%mm2 \n\t"
149  "paddb %%mm2, %%mm0 \n\t"
150 
151  "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t" // REG_a = src + 5*stride
152 
153  "movq (%2, %3, 4), %%mm2 \n\t"
154  PMAXUB(%%mm2, %%mm4)
155  PMINUB(%%mm2, %%mm3, %%mm5)
156  "psubb %%mm2, %%mm1 \n\t"
157  "paddb %%mm7, %%mm1 \n\t"
158  "pcmpgtb %%mm6, %%mm1 \n\t"
159  "paddb %%mm1, %%mm0 \n\t"
160 
161  "movq (%%"FF_REG_a"), %%mm1 \n\t"
162  PMAXUB(%%mm1, %%mm4)
163  PMINUB(%%mm1, %%mm3, %%mm5)
164  "psubb %%mm1, %%mm2 \n\t"
165  "paddb %%mm7, %%mm2 \n\t"
166  "pcmpgtb %%mm6, %%mm2 \n\t"
167  "paddb %%mm2, %%mm0 \n\t"
168 
169  "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
170  PMAXUB(%%mm2, %%mm4)
171  PMINUB(%%mm2, %%mm3, %%mm5)
172  "psubb %%mm2, %%mm1 \n\t"
173  "paddb %%mm7, %%mm1 \n\t"
174  "pcmpgtb %%mm6, %%mm1 \n\t"
175  "paddb %%mm1, %%mm0 \n\t"
176 
177  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
178  PMAXUB(%%mm1, %%mm4)
179  PMINUB(%%mm1, %%mm3, %%mm5)
180  "psubb %%mm1, %%mm2 \n\t"
181  "paddb %%mm7, %%mm2 \n\t"
182  "pcmpgtb %%mm6, %%mm2 \n\t"
183  "paddb %%mm2, %%mm0 \n\t"
184  "psubusb %%mm3, %%mm4 \n\t" // mm4 = max - min per column
185 
186  " \n\t"
 // Horizontal sum of the 8 byte counters in mm0 (result in the low byte).
187 #if TEMPLATE_PP_MMXEXT
188  "pxor %%mm7, %%mm7 \n\t"
189  "psadbw %%mm7, %%mm0 \n\t"
190 #else
191  "movq %%mm0, %%mm1 \n\t"
192  "psrlw $8, %%mm0 \n\t"
193  "paddb %%mm1, %%mm0 \n\t"
194  "movq %%mm0, %%mm1 \n\t"
195  "psrlq $16, %%mm0 \n\t"
196  "paddb %%mm1, %%mm0 \n\t"
197  "movq %%mm0, %%mm1 \n\t"
198  "psrlq $32, %%mm0 \n\t"
199  "paddb %%mm1, %%mm0 \n\t"
200 #endif
201  "movq %4, %%mm7 \n\t" // QP,..., QP
202  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
203  "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
204  "packssdw %%mm4, %%mm4 \n\t" // squeeze all 8 bytes into the low 32 bits
205  "movd %%mm0, %0 \n\t"
206  "movd %%mm4, %1 \n\t"
207 
208  : "=r" (numEq), "=r" (dcOk)
209  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
210  : "%"FF_REG_a
211  );
212 
 // Each hit contributed -1 to a byte counter, so negate and keep only the
 // low byte to obtain the number of hits.
213  numEq= (-numEq) &0xFF;
214  if(numEq > c->ppMode.flatnessThreshold){
215  if(dcOk) return 0;
216  else return 1;
217  }else{
218  return 2;
219  }
220 }
221 #endif //TEMPLATE_PP_MMX
222 
223 /**
224  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
225  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
 *
 * After src is advanced by 3*stride, lines 0..9 are read and lines 1..8 are
 * overwritten. Above the block the column is padded with "first" (line 0 if
 * it differs from line 1 by less than QP, else line 1) and below with "last"
 * (line 9 if it differs from line 8 by less than QP, else line 8).
 * The MMXEXT/3DNow path approximates the filter with chains of PAVGB and
 * uses c->pQPb with a "<= QP" test; the C path computes it exactly from
 * running sums and uses c->QP with a "< QP" test.
 *
 * @param src    top of the 8x16 block, modified in place
 * @param stride distance in bytes between two lines
 * @param c      context supplying QP (C path) or pQPb (asm path)
226  */
227 #if !TEMPLATE_PP_ALTIVEC
228 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
229 {
230 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
231  src+= stride*3;
232  __asm__ volatile( //"movv %0 %1 %2\n\t"
233  "movq %2, %%mm0 \n\t" // QP,..., QP
234  "pxor %%mm4, %%mm4 \n\t"
235 
236  "movq (%0), %%mm6 \n\t"
237  "movq (%0, %1), %%mm5 \n\t"
238  "movq %%mm5, %%mm1 \n\t"
239  "movq %%mm6, %%mm2 \n\t"
240  "psubusb %%mm6, %%mm5 \n\t"
241  "psubusb %%mm1, %%mm2 \n\t"
242  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
243  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
244  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
245 
246  "pand %%mm2, %%mm6 \n\t"
247  "pandn %%mm1, %%mm2 \n\t"
248  "por %%mm2, %%mm6 \n\t"// First Line to Filter
249 
250  "movq (%0, %1, 8), %%mm5 \n\t"
251  "lea (%0, %1, 4), %%"FF_REG_a" \n\t"
252  "lea (%0, %1, 8), %%"FF_REG_c" \n\t"
253  "sub %1, %%"FF_REG_c" \n\t"
 // NOTE: %0 is an input operand that is modified here; it is restored by
 // the "sub %1, %0" at the end of this asm statement.
254  "add %1, %0 \n\t" // %0 points to line 1 not 0
255  "movq (%0, %1, 8), %%mm7 \n\t"
256  "movq %%mm5, %%mm1 \n\t"
257  "movq %%mm7, %%mm2 \n\t"
258  "psubusb %%mm7, %%mm5 \n\t"
259  "psubusb %%mm1, %%mm2 \n\t"
260  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
261  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
262  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
263 
264  "pand %%mm2, %%mm7 \n\t"
265  "pandn %%mm1, %%mm2 \n\t"
266  "por %%mm2, %%mm7 \n\t" // Last Line to Filter
267 
268 
269  // 1 2 3 4 5 6 7 8
270  // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
271  // 6 4 2 2 1 1
272  // 6 4 4 2
273  // 6 8 2
274 
275  "movq (%0, %1), %%mm0 \n\t" // 1
276  "movq %%mm0, %%mm1 \n\t" // 1
277  PAVGB(%%mm6, %%mm0) //1 1 /2
278  PAVGB(%%mm6, %%mm0) //3 1 /4
279 
280  "movq (%0, %1, 4), %%mm2 \n\t" // 1
281  "movq %%mm2, %%mm5 \n\t" // 1
282  PAVGB((%%FF_REGa), %%mm2) // 11 /2
283  PAVGB((%0, %1, 2), %%mm2) // 211 /4
284  "movq %%mm2, %%mm3 \n\t" // 211 /4
285  "movq (%0), %%mm4 \n\t" // 1
286  PAVGB(%%mm4, %%mm3) // 4 211 /8
287  PAVGB(%%mm0, %%mm3) //642211 /16
288  "movq %%mm3, (%0) \n\t" // X
289  // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
290  "movq %%mm1, %%mm0 \n\t" // 1
291  PAVGB(%%mm6, %%mm0) //1 1 /2
292  "movq %%mm4, %%mm3 \n\t" // 1
293  PAVGB((%0,%1,2), %%mm3) // 1 1 /2
294  PAVGB((%%FF_REGa,%1,2), %%mm5) // 11 /2
295  PAVGB((%%FF_REGa), %%mm5) // 211 /4
296  PAVGB(%%mm5, %%mm3) // 2 2211 /8
297  PAVGB(%%mm0, %%mm3) //4242211 /16
298  "movq %%mm3, (%0,%1) \n\t" // X
299  // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
300  PAVGB(%%mm4, %%mm6) //11 /2
301  "movq (%%"FF_REG_c"), %%mm0 \n\t" // 1
302  PAVGB((%%FF_REGa, %1, 2), %%mm0) // 11/2
303  "movq %%mm0, %%mm3 \n\t" // 11/2
304  PAVGB(%%mm1, %%mm0) // 2 11/4
305  PAVGB(%%mm6, %%mm0) //222 11/8
306  PAVGB(%%mm2, %%mm0) //22242211/16
307  "movq (%0, %1, 2), %%mm2 \n\t" // 1
308  "movq %%mm0, (%0, %1, 2) \n\t" // X
309  // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
310  "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
311  PAVGB((%%FF_REGc), %%mm0) // 11 /2
312  PAVGB(%%mm0, %%mm6) //11 11 /4
313  PAVGB(%%mm1, %%mm4) // 11 /2
314  PAVGB(%%mm2, %%mm1) // 11 /2
315  PAVGB(%%mm1, %%mm6) //1122 11 /8
316  PAVGB(%%mm5, %%mm6) //112242211 /16
317  "movq (%%"FF_REG_a"), %%mm5 \n\t" // 1
318  "movq %%mm6, (%%"FF_REG_a") \n\t" // X
319  // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
320  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t" // 1
321  PAVGB(%%mm7, %%mm6) // 11 /2
322  PAVGB(%%mm4, %%mm6) // 11 11 /4
323  PAVGB(%%mm3, %%mm6) // 11 2211 /8
324  PAVGB(%%mm5, %%mm2) // 11 /2
325  "movq (%0, %1, 4), %%mm4 \n\t" // 1
326  PAVGB(%%mm4, %%mm2) // 112 /4
327  PAVGB(%%mm2, %%mm6) // 112242211 /16
328  "movq %%mm6, (%0, %1, 4) \n\t" // X
329  // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
330  PAVGB(%%mm7, %%mm1) // 11 2 /4
331  PAVGB(%%mm4, %%mm5) // 11 /2
332  PAVGB(%%mm5, %%mm0) // 11 11 /4
333  "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" // 1
334  PAVGB(%%mm6, %%mm1) // 11 4 2 /8
335  PAVGB(%%mm0, %%mm1) // 11224222 /16
336  "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t" // X
337  // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
338  PAVGB((%%FF_REGc), %%mm2) // 112 4 /8
339  "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
340  PAVGB(%%mm0, %%mm6) // 1 1 /2
341  PAVGB(%%mm7, %%mm6) // 1 12 /4
342  PAVGB(%%mm2, %%mm6) // 1122424 /4
343  "movq %%mm6, (%%"FF_REG_c") \n\t" // X
344  // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
345  PAVGB(%%mm7, %%mm5) // 11 2 /4
346  PAVGB(%%mm7, %%mm5) // 11 6 /8
347 
348  PAVGB(%%mm3, %%mm0) // 112 /4
349  PAVGB(%%mm0, %%mm5) // 112246 /16
350  "movq %%mm5, (%%"FF_REG_a", %1, 4) \n\t" // X
351  "sub %1, %0 \n\t" // undo the earlier "add %1, %0"
352 
353  :
354  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
355  : "%"FF_REG_a, "%"FF_REG_c
356  );
357 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
 // lN = byte offset of line N relative to src (after src += stride*3)
358  const int l1= stride;
359  const int l2= stride + l1;
360  const int l3= stride + l2;
361  const int l4= stride + l3;
362  const int l5= stride + l4;
363  const int l6= stride + l5;
364  const int l7= stride + l6;
365  const int l8= stride + l7;
366  const int l9= stride + l8;
367  int x;
368  src+= stride*3;
 // one column per iteration
369  for(x=0; x<BLOCK_SIZE; x++){
370  const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
371  const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
372 
 // sums[k] = 4 + sum of 7 consecutive samples of the column, where the
 // column is padded with "first" above line 1 and "last" below line 8;
 // each step drops the oldest sample and adds the next one.
373  int sums[10];
374  sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
375  sums[1] = sums[0] - first + src[l4];
376  sums[2] = sums[1] - first + src[l5];
377  sums[3] = sums[2] - first + src[l6];
378  sums[4] = sums[3] - first + src[l7];
379  sums[5] = sums[4] - src[l1] + src[l8];
380  sums[6] = sums[5] - src[l2] + last;
381  sums[7] = sums[6] - src[l3] + last;
382  sums[8] = sums[7] - src[l4] + last;
383  sums[9] = sums[8] - src[l5] + last;
384 
 // Two overlapping 7-sample sums plus twice the centre sample give the
 // (1,1,2,2,4,2,2,1,1) weights; the two +4 terms provide the rounding
 // for the >>4. All sums were taken before any line is overwritten.
385  src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
386  src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
387  src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
388  src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
389  src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
390  src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
391  src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
392  src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
393 
394  src++;
395  }
396 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
397 }
398 #endif //TEMPLATE_PP_ALTIVEC
399 
400 /**
401  * Experimental Filter 1
402  * will not damage linear gradients
403  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
404  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
405  * MMX2 version does correct clipping C version does not
 *
 * After src is advanced by 3*stride, lines 3..6 are read and lines 2..7 are
 * modified in place. With b = l4 - l5 and
 * d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2), columns whose d is below the
 * 2*QP limit get l4/l5 moved towards each other by 3d/8, l3/l6 by d/4 and
 * l2/l7 by d/8. The C path tests d < 2*co->QP; the asm path tests
 * d <= 2*co->pQPb and subtracts the b01 constant from d before scaling.
 *
 * @param src    top of the 8x16 block, modified in place
 * @param stride distance in bytes between two lines
 * @param co     context supplying QP (C path) or pQPb (asm path)
406  */
407 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
408 {
409 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
410  src+= stride*3;
411 
412  __asm__ volatile(
413  "pxor %%mm7, %%mm7 \n\t" // 0
414  "lea (%0, %1), %%"FF_REG_a" \n\t"
415  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
416 // 0 1 2 3 4 5 6 7 8 9
417 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
418  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
419  "movq (%0, %1, 4), %%mm1 \n\t" // line 4
420  "movq %%mm1, %%mm2 \n\t" // line 4
421  "psubusb %%mm0, %%mm1 \n\t"
422  "psubusb %%mm2, %%mm0 \n\t"
423  "por %%mm1, %%mm0 \n\t" // |l3 - l4|
424  "movq (%%"FF_REG_c"), %%mm3 \n\t" // line 5
425  "movq (%%"FF_REG_c", %1), %%mm4 \n\t" // line 6
426  "movq %%mm3, %%mm5 \n\t" // line 5
427  "psubusb %%mm4, %%mm3 \n\t"
428  "psubusb %%mm5, %%mm4 \n\t"
429  "por %%mm4, %%mm3 \n\t" // |l5 - l6|
430  PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
431  "movq %%mm2, %%mm1 \n\t" // line 4
432  "psubusb %%mm5, %%mm2 \n\t"
433  "movq %%mm2, %%mm4 \n\t"
434  "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
435  "psubusb %%mm1, %%mm5 \n\t"
436  "por %%mm5, %%mm4 \n\t" // |l4 - l5|
437  "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
438  "movq %%mm4, %%mm3 \n\t" // d
439  "movq %2, %%mm0 \n\t" // QP,..., QP
440  "paddusb %%mm0, %%mm0 \n\t" // 2QP ... 2QP
441  "psubusb %%mm0, %%mm4 \n\t"
442  "pcmpeqb %%mm7, %%mm4 \n\t" // d <= 2QP ? -1 : 0
443  "psubusb "MANGLE(b01)", %%mm3 \n\t"
444  "pand %%mm4, %%mm3 \n\t" // d <= 2QP ? MAX(d - b01, 0) : 0
445 
446  PAVGB(%%mm7, %%mm3) // d/2
447  "movq %%mm3, %%mm1 \n\t" // d/2
448  PAVGB(%%mm7, %%mm3) // d/4
449  PAVGB(%%mm1, %%mm3) // 3*d/8
450 
 // mm2 is all-ones where l4 <= l5: XOR-ing with it before and after the
 // saturating add/sub flips the direction of the correction per column.
451  "movq (%0, %1, 4), %%mm0 \n\t" // line 4
452  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
453  "psubusb %%mm3, %%mm0 \n\t"
454  "pxor %%mm2, %%mm0 \n\t"
455  "movq %%mm0, (%0, %1, 4) \n\t" // line 4
456 
457  "movq (%%"FF_REG_c"), %%mm0 \n\t" // line 5
458  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
459  "paddusb %%mm3, %%mm0 \n\t"
460  "pxor %%mm2, %%mm0 \n\t"
461  "movq %%mm0, (%%"FF_REG_c") \n\t" // line 5
462 
463  PAVGB(%%mm7, %%mm1) // d/4
464 
465  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
466  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
467  "psubusb %%mm1, %%mm0 \n\t"
468  "pxor %%mm2, %%mm0 \n\t"
469  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t" // line 3
470 
471  "movq (%%"FF_REG_c", %1), %%mm0 \n\t" // line 6
472  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
473  "paddusb %%mm1, %%mm0 \n\t"
474  "pxor %%mm2, %%mm0 \n\t"
475  "movq %%mm0, (%%"FF_REG_c", %1) \n\t" // line 6
476 
477  PAVGB(%%mm7, %%mm1) // d/8
478 
479  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // line 2
480  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
481  "psubusb %%mm1, %%mm0 \n\t"
482  "pxor %%mm2, %%mm0 \n\t"
483  "movq %%mm0, (%%"FF_REG_a", %1) \n\t" // line 2
484 
485  "movq (%%"FF_REG_c", %1, 2), %%mm0 \n\t" // line 7
486  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
487  "paddusb %%mm1, %%mm0 \n\t"
488  "pxor %%mm2, %%mm0 \n\t"
489  "movq %%mm0, (%%"FF_REG_c", %1, 2) \n\t" // line 7
490 
491  :
492  : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
 // b01 is referenced through MANGLE() in the asm body above, so it has to
 // be listed as a named constraint (same pattern as doVertDefFilter);
 // otherwise builds where MANGLE() yields a named-operand reference
 // cannot resolve it.
493  NAMED_CONSTRAINTS_ADD(b01)
494  : "%"FF_REG_a, "%"FF_REG_c
495  );
496 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
497 
 // lN = byte offset of line N relative to src (after src += stride*3)
498  const int l1= stride;
499  const int l2= stride + l1;
500  const int l3= stride + l2;
501  const int l4= stride + l3;
502  const int l5= stride + l4;
503  const int l6= stride + l5;
504  const int l7= stride + l6;
505 // const int l8= stride + l7;
506 // const int l9= stride + l8;
507  int x;
508 
509  src+= stride*3;
 // one column per iteration
510  for(x=0; x<BLOCK_SIZE; x++){
511  int a= src[l3] - src[l4];
512  int b= src[l4] - src[l5];
513  int c= src[l5] - src[l6];
514 
 // size of the step across the l4/l5 boundary in excess of the average
 // step on either side of it
515  int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
516  d= FFMAX(d, 0);
517 
518  if(d < co->QP*2){
 // v carries the sign that pulls l4 and l5 towards each other
519  int v = d * FFSIGN(-b);
520 
 // NOTE: results are stored back without clipping (see header).
521  src[l2] +=v>>3;
522  src[l3] +=v>>2;
523  src[l4] +=(3*v)>>3;
524  src[l5] -=(3*v)>>3;
525  src[l6] -=v>>2;
526  src[l7] -=v>>3;
527  }
528  src++;
529  }
530 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
531 }
532 
533 #if !TEMPLATE_PP_ALTIVEC
534 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
535 {
536 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
537 /*
538  uint8_t tmp[16];
539  const int l1= stride;
540  const int l2= stride + l1;
541  const int l3= stride + l2;
542  const int l4= (int)tmp - (int)src - stride*3;
543  const int l5= (int)tmp - (int)src - stride*3 + 8;
544  const int l6= stride*3 + l3;
545  const int l7= stride + l6;
546  const int l8= stride + l7;
547 
548  memcpy(tmp, src+stride*7, 8);
549  memcpy(tmp+8, src+stride*8, 8);
550 */
551  src+= stride*4;
552  __asm__ volatile(
553 
554 #if 0 //slightly more accurate and slightly slower
555  "pxor %%mm7, %%mm7 \n\t" // 0
556  "lea (%0, %1), %%"FF_REG_a" \n\t"
557  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
558 // 0 1 2 3 4 5 6 7
559 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
560 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
561 
562 
563  "movq (%0, %1, 2), %%mm0 \n\t" // l2
564  "movq (%0), %%mm1 \n\t" // l0
565  "movq %%mm0, %%mm2 \n\t" // l2
566  PAVGB(%%mm7, %%mm0) // ~l2/2
567  PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
568  PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
569 
570  "movq (%%"FF_REG_a"), %%mm1 \n\t" // l1
571  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t" // l3
572  "movq %%mm1, %%mm4 \n\t" // l1
573  PAVGB(%%mm7, %%mm1) // ~l1/2
574  PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
575  PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
576 
577  "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
578  "psubusb %%mm1, %%mm0 \n\t"
579  "psubusb %%mm4, %%mm1 \n\t"
580  "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
581 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
582 
583  "movq (%0, %1, 4), %%mm0 \n\t" // l4
584  "movq %%mm0, %%mm4 \n\t" // l4
585  PAVGB(%%mm7, %%mm0) // ~l4/2
586  PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
587  PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
588 
589  "movq (%%"FF_REG_c"), %%mm2 \n\t" // l5
590  "movq %%mm3, %%mm5 \n\t" // l3
591  PAVGB(%%mm7, %%mm3) // ~l3/2
592  PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
593  PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
594 
595  "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
596  "psubusb %%mm3, %%mm0 \n\t"
597  "psubusb %%mm6, %%mm3 \n\t"
598  "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
599  "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
600 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
601 
602  "movq (%%"FF_REG_c", %1), %%mm6 \n\t" // l6
603  "movq %%mm6, %%mm5 \n\t" // l6
604  PAVGB(%%mm7, %%mm6) // ~l6/2
605  PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
606  PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
607 
608  "movq (%%"FF_REG_c", %1, 2), %%mm5 \n\t" // l7
609  "movq %%mm2, %%mm4 \n\t" // l5
610  PAVGB(%%mm7, %%mm2) // ~l5/2
611  PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
612  PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
613 
614  "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
615  "psubusb %%mm2, %%mm6 \n\t"
616  "psubusb %%mm4, %%mm2 \n\t"
617  "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
618 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
619 
620 
621  PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
622  "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
623  "paddusb "MANGLE(b01)", %%mm4 \n\t"
624  "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
625  "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
626  "pand %%mm4, %%mm3 \n\t"
627 
628  "movq %%mm3, %%mm1 \n\t"
629 // "psubusb "MANGLE(b01)", %%mm3 \n\t"
630  PAVGB(%%mm7, %%mm3)
631  PAVGB(%%mm7, %%mm3)
632  "paddusb %%mm1, %%mm3 \n\t"
633 // "paddusb "MANGLE(b01)", %%mm3 \n\t"
634 
635  "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" //l3
636  "movq (%0, %1, 4), %%mm5 \n\t" //l4
637  "movq (%0, %1, 4), %%mm4 \n\t" //l4
638  "psubusb %%mm6, %%mm5 \n\t"
639  "psubusb %%mm4, %%mm6 \n\t"
640  "por %%mm6, %%mm5 \n\t" // |l3-l4|
641  "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
642  "pxor %%mm6, %%mm0 \n\t"
643  "pand %%mm0, %%mm3 \n\t"
644  PMINUB(%%mm5, %%mm3, %%mm0)
645 
646  "psubusb "MANGLE(b01)", %%mm3 \n\t"
647  PAVGB(%%mm7, %%mm3)
648 
649  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
650  "movq (%0, %1, 4), %%mm2 \n\t"
651  "pxor %%mm6, %%mm0 \n\t"
652  "pxor %%mm6, %%mm2 \n\t"
653  "psubb %%mm3, %%mm0 \n\t"
654  "paddb %%mm3, %%mm2 \n\t"
655  "pxor %%mm6, %%mm0 \n\t"
656  "pxor %%mm6, %%mm2 \n\t"
657  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
658  "movq %%mm2, (%0, %1, 4) \n\t"
659 #endif //0
660 
661  "lea (%0, %1), %%"FF_REG_a" \n\t"
662  "pcmpeqb %%mm6, %%mm6 \n\t" // -1
663 // 0 1 2 3 4 5 6 7
664 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
665 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
666 
667 
668  "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t" // l3
669  "movq (%0, %1, 4), %%mm0 \n\t" // l4
670  "pxor %%mm6, %%mm1 \n\t" // -l3-1
671  PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
672 // mm1=-l3-1, mm0=128-q
673 
674  "movq (%%"FF_REG_a", %1, 4), %%mm2 \n\t" // l5
675  "movq (%%"FF_REG_a", %1), %%mm3 \n\t" // l2
676  "pxor %%mm6, %%mm2 \n\t" // -l5-1
677  "movq %%mm2, %%mm5 \n\t" // -l5-1
678  "movq "MANGLE(b80)", %%mm4 \n\t" // 128
679  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
680  PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
681  PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
682  PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
683  PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
684 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
685 
686  "movq (%%"FF_REG_a"), %%mm2 \n\t" // l1
687  "pxor %%mm6, %%mm2 \n\t" // -l1-1
688  PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
689  PAVGB((%0), %%mm1) // (l0-l3+256)/2
690  "movq "MANGLE(b80)", %%mm3 \n\t" // 128
691  PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
692  PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
693  PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
694 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
695 
696  PAVGB((%%FF_REGc, %1), %%mm5) // (l6-l5+256)/2
697  "movq (%%"FF_REG_c", %1, 2), %%mm1 \n\t" // l7
698  "pxor %%mm6, %%mm1 \n\t" // -l7-1
699  PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
700  "movq "MANGLE(b80)", %%mm2 \n\t" // 128
701  PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
702  PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
703  PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
704 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
705 
706  "movq "MANGLE(b00)", %%mm1 \n\t" // 0
707  "movq "MANGLE(b00)", %%mm5 \n\t" // 0
708  "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
709  "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
710  PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
711  PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
712  PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
713 
714 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
715 
716  "movq "MANGLE(b00)", %%mm7 \n\t" // 0
717  "movq %2, %%mm2 \n\t" // QP
718  PAVGB(%%mm6, %%mm2) // 128 + QP/2
719  "psubb %%mm6, %%mm2 \n\t"
720 
721  "movq %%mm4, %%mm1 \n\t"
722  "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
723  "pxor %%mm1, %%mm4 \n\t"
724  "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
725  "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
726  "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
727 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
728 
729  "movq %%mm4, %%mm3 \n\t" // d
730  "psubusb "MANGLE(b01)", %%mm4 \n\t"
731  PAVGB(%%mm7, %%mm4) // d/32
732  PAVGB(%%mm7, %%mm4) // (d + 32)/64
733  "paddb %%mm3, %%mm4 \n\t" // 5d/64
734  "pand %%mm2, %%mm4 \n\t"
735 
736  "movq "MANGLE(b80)", %%mm5 \n\t" // 128
737  "psubb %%mm0, %%mm5 \n\t" // q
738  "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
739  "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
740  "pxor %%mm7, %%mm5 \n\t"
741 
742  PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
743  "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
744 
745  "pand %%mm7, %%mm4 \n\t"
746  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
747  "movq (%0, %1, 4), %%mm2 \n\t"
748  "pxor %%mm1, %%mm0 \n\t"
749  "pxor %%mm1, %%mm2 \n\t"
750  "paddb %%mm4, %%mm0 \n\t"
751  "psubb %%mm4, %%mm2 \n\t"
752  "pxor %%mm1, %%mm0 \n\t"
753  "pxor %%mm1, %%mm2 \n\t"
754  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
755  "movq %%mm2, (%0, %1, 4) \n\t"
756 
757  :
758  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
759  NAMED_CONSTRAINTS_ADD(b80,b00,b01)
760  : "%"FF_REG_a, "%"FF_REG_c
761  );
762 
763 /*
764  {
765  int x;
766  src-= stride;
767  for(x=0; x<BLOCK_SIZE; x++){
768  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
769  if(FFABS(middleEnergy)< 8*QP){
770  const int q=(src[l4] - src[l5])/2;
771  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
772  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
773 
774  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
775  d= FFMAX(d, 0);
776 
777  d= (5*d + 32) >> 6;
778  d*= FFSIGN(-middleEnergy);
779 
780  if(q>0){
781  d= d<0 ? 0 : d;
782  d= d>q ? q : d;
783  }else{
784  d= d>0 ? 0 : d;
785  d= d<q ? q : d;
786  }
787 
788  src[l4]-= d;
789  src[l5]+= d;
790  }
791  src++;
792  }
793  src-=8;
794  for(x=0; x<8; x++){
795  int y;
796  for(y=4; y<6; y++){
797  int d= src[x+y*stride] - tmp[x+(y-4)*8];
798  int ad= FFABS(d);
799  static int max=0;
800  static int sum=0;
801  static int num=0;
802  static int bias=0;
803 
804  if(max<ad) max=ad;
805  sum+= ad>3 ? 1 : 0;
806  if(ad>3){
807  src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
808  }
809  if(y==4) bias+=d;
810  num++;
811  if(num%1000000 == 0){
812  av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
813  }
814  }
815  }
816 }
817 */
818 #elif TEMPLATE_PP_MMX
819  DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
820  src+= stride*4;
821  __asm__ volatile(
822  "pxor %%mm7, %%mm7 \n\t"
823 // 0 1 2 3 4 5 6 7
824 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
825 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
826 
827  "movq (%0), %%mm0 \n\t"
828  "movq %%mm0, %%mm1 \n\t"
829  "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
830  "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
831 
832  "movq (%0, %1), %%mm2 \n\t"
833  "lea (%0, %1, 2), %%"FF_REG_a" \n\t"
834  "movq %%mm2, %%mm3 \n\t"
835  "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
836  "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
837 
838  "movq (%%"FF_REG_a"), %%mm4 \n\t"
839  "movq %%mm4, %%mm5 \n\t"
840  "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
841  "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
842 
843  "paddw %%mm0, %%mm0 \n\t" // 2L0
844  "paddw %%mm1, %%mm1 \n\t" // 2H0
845  "psubw %%mm4, %%mm2 \n\t" // L1 - L2
846  "psubw %%mm5, %%mm3 \n\t" // H1 - H2
847  "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
848  "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
849 
850  "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
851  "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
852  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
853  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
854 
855  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
856  "movq %%mm2, %%mm3 \n\t"
857  "punpcklbw %%mm7, %%mm2 \n\t" // L3
858  "punpckhbw %%mm7, %%mm3 \n\t" // H3
859 
860  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
861  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
862  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
863  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
864  "movq %%mm0, (%3) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
865  "movq %%mm1, 8(%3) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
866 
867  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
868  "movq %%mm0, %%mm1 \n\t"
869  "punpcklbw %%mm7, %%mm0 \n\t" // L4
870  "punpckhbw %%mm7, %%mm1 \n\t" // H4
871 
872  "psubw %%mm0, %%mm2 \n\t" // L3 - L4
873  "psubw %%mm1, %%mm3 \n\t" // H3 - H4
874  "movq %%mm2, 16(%3) \n\t" // L3 - L4
875  "movq %%mm3, 24(%3) \n\t" // H3 - H4
876  "paddw %%mm4, %%mm4 \n\t" // 2L2
877  "paddw %%mm5, %%mm5 \n\t" // 2H2
878  "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
879  "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
880 
881  "lea (%%"FF_REG_a", %1), %0 \n\t"
882  "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
883  "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
884  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
885  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
886 //50 opcodes so far
887  "movq (%0, %1, 2), %%mm2 \n\t"
888  "movq %%mm2, %%mm3 \n\t"
889  "punpcklbw %%mm7, %%mm2 \n\t" // L5
890  "punpckhbw %%mm7, %%mm3 \n\t" // H5
891  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
892  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
893  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
894  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
895 
896  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
897  "punpcklbw %%mm7, %%mm6 \n\t" // L6
898  "psubw %%mm6, %%mm2 \n\t" // L5 - L6
899  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
900  "punpckhbw %%mm7, %%mm6 \n\t" // H6
901  "psubw %%mm6, %%mm3 \n\t" // H5 - H6
902 
903  "paddw %%mm0, %%mm0 \n\t" // 2L4
904  "paddw %%mm1, %%mm1 \n\t" // 2H4
905  "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
906  "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
907 
908  "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
909  "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
910  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
911  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
912 
913  "movq (%0, %1, 4), %%mm2 \n\t"
914  "movq %%mm2, %%mm3 \n\t"
915  "punpcklbw %%mm7, %%mm2 \n\t" // L7
916  "punpckhbw %%mm7, %%mm3 \n\t" // H7
917 
918  "paddw %%mm2, %%mm2 \n\t" // 2L7
919  "paddw %%mm3, %%mm3 \n\t" // 2H7
920  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
921  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
922 
923  "movq (%3), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
924  "movq 8(%3), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
925 
926 #if TEMPLATE_PP_MMXEXT
927  "movq %%mm7, %%mm6 \n\t" // 0
928  "psubw %%mm0, %%mm6 \n\t"
929  "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
930  "movq %%mm7, %%mm6 \n\t" // 0
931  "psubw %%mm1, %%mm6 \n\t"
932  "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
933  "movq %%mm7, %%mm6 \n\t" // 0
934  "psubw %%mm2, %%mm6 \n\t"
935  "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
936  "movq %%mm7, %%mm6 \n\t" // 0
937  "psubw %%mm3, %%mm6 \n\t"
938  "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
939 #else
940  "movq %%mm7, %%mm6 \n\t" // 0
941  "pcmpgtw %%mm0, %%mm6 \n\t"
942  "pxor %%mm6, %%mm0 \n\t"
943  "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
944  "movq %%mm7, %%mm6 \n\t" // 0
945  "pcmpgtw %%mm1, %%mm6 \n\t"
946  "pxor %%mm6, %%mm1 \n\t"
947  "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
948  "movq %%mm7, %%mm6 \n\t" // 0
949  "pcmpgtw %%mm2, %%mm6 \n\t"
950  "pxor %%mm6, %%mm2 \n\t"
951  "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
952  "movq %%mm7, %%mm6 \n\t" // 0
953  "pcmpgtw %%mm3, %%mm6 \n\t"
954  "pxor %%mm6, %%mm3 \n\t"
955  "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
956 #endif
957 
958 #if TEMPLATE_PP_MMXEXT
959  "pminsw %%mm2, %%mm0 \n\t"
960  "pminsw %%mm3, %%mm1 \n\t"
961 #else
962  "movq %%mm0, %%mm6 \n\t"
963  "psubusw %%mm2, %%mm6 \n\t"
964  "psubw %%mm6, %%mm0 \n\t"
965  "movq %%mm1, %%mm6 \n\t"
966  "psubusw %%mm3, %%mm6 \n\t"
967  "psubw %%mm6, %%mm1 \n\t"
968 #endif
969 
970  "movd %2, %%mm2 \n\t" // QP
971  "punpcklbw %%mm7, %%mm2 \n\t"
972 
973  "movq %%mm7, %%mm6 \n\t" // 0
974  "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
975  "pxor %%mm6, %%mm4 \n\t"
976  "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
977  "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
978  "pxor %%mm7, %%mm5 \n\t"
979  "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
980 // 100 opcodes
981  "psllw $3, %%mm2 \n\t" // 8QP
982  "movq %%mm2, %%mm3 \n\t" // 8QP
983  "pcmpgtw %%mm4, %%mm2 \n\t"
984  "pcmpgtw %%mm5, %%mm3 \n\t"
985  "pand %%mm2, %%mm4 \n\t"
986  "pand %%mm3, %%mm5 \n\t"
987 
988 
989  "psubusw %%mm0, %%mm4 \n\t" // hd
990  "psubusw %%mm1, %%mm5 \n\t" // ld
991 
992 
993  "movq "MANGLE(w05)", %%mm2 \n\t" // 5
994  "pmullw %%mm2, %%mm4 \n\t"
995  "pmullw %%mm2, %%mm5 \n\t"
996  "movq "MANGLE(w20)", %%mm2 \n\t" // 32
997  "paddw %%mm2, %%mm4 \n\t"
998  "paddw %%mm2, %%mm5 \n\t"
999  "psrlw $6, %%mm4 \n\t"
1000  "psrlw $6, %%mm5 \n\t"
1001 
1002  "movq 16(%3), %%mm0 \n\t" // L3 - L4
1003  "movq 24(%3), %%mm1 \n\t" // H3 - H4
1004 
1005  "pxor %%mm2, %%mm2 \n\t"
1006  "pxor %%mm3, %%mm3 \n\t"
1007 
1008  "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1009  "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1010  "pxor %%mm2, %%mm0 \n\t"
1011  "pxor %%mm3, %%mm1 \n\t"
1012  "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1013  "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1014  "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1015  "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1016 
1017  "pxor %%mm6, %%mm2 \n\t"
1018  "pxor %%mm7, %%mm3 \n\t"
1019  "pand %%mm2, %%mm4 \n\t"
1020  "pand %%mm3, %%mm5 \n\t"
1021 
1022 #if TEMPLATE_PP_MMXEXT
1023  "pminsw %%mm0, %%mm4 \n\t"
1024  "pminsw %%mm1, %%mm5 \n\t"
1025 #else
1026  "movq %%mm4, %%mm2 \n\t"
1027  "psubusw %%mm0, %%mm2 \n\t"
1028  "psubw %%mm2, %%mm4 \n\t"
1029  "movq %%mm5, %%mm2 \n\t"
1030  "psubusw %%mm1, %%mm2 \n\t"
1031  "psubw %%mm2, %%mm5 \n\t"
1032 #endif
1033  "pxor %%mm6, %%mm4 \n\t"
1034  "pxor %%mm7, %%mm5 \n\t"
1035  "psubw %%mm6, %%mm4 \n\t"
1036  "psubw %%mm7, %%mm5 \n\t"
1037  "packsswb %%mm5, %%mm4 \n\t"
1038  "movq (%0), %%mm0 \n\t"
1039  "paddb %%mm4, %%mm0 \n\t"
1040  "movq %%mm0, (%0) \n\t"
1041  "movq (%0, %1), %%mm0 \n\t"
1042  "psubb %%mm4, %%mm0 \n\t"
1043  "movq %%mm0, (%0, %1) \n\t"
1044 
1045  : "+r" (src)
1046  : "r" ((x86_reg)stride), "m" (c->pQPb), "r"(tmp)
1047  NAMED_CONSTRAINTS_ADD(w05,w20)
1048  : "%"FF_REG_a
1049  );
1050 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1051  const int l1= stride;
1052  const int l2= stride + l1;
1053  const int l3= stride + l2;
1054  const int l4= stride + l3;
1055  const int l5= stride + l4;
1056  const int l6= stride + l5;
1057  const int l7= stride + l6;
1058  const int l8= stride + l7;
1059 // const int l9= stride + l8;
1060  int x;
1061  src+= stride*3;
1062  for(x=0; x<BLOCK_SIZE; x++){
1063  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1064  if(FFABS(middleEnergy) < 8*c->QP){
1065  const int q=(src[l4] - src[l5])/2;
1066  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1067  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1068 
1069  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
1070  d= FFMAX(d, 0);
1071 
1072  d= (5*d + 32) >> 6;
1073  d*= FFSIGN(-middleEnergy);
1074 
1075  if(q>0){
1076  d = FFMAX(d, 0);
1077  d = FFMIN(d, q);
1078  }else{
1079  d = FFMIN(d, 0);
1080  d = FFMAX(d, q);
1081  }
1082 
1083  src[l4]-= d;
1084  src[l5]+= d;
1085  }
1086  src++;
1087  }
1088 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1089 }
1090 #endif //TEMPLATE_PP_ALTIVEC
1091 
1092 #if !TEMPLATE_PP_ALTIVEC
1093 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1094 {
1095 #if HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
1096  DECLARE_ALIGNED(8, uint64_t, tmp)[3];
1097  __asm__ volatile(
1098  "pxor %%mm6, %%mm6 \n\t"
1099  "pcmpeqb %%mm7, %%mm7 \n\t"
1100  "movq %2, %%mm0 \n\t"
1101  "punpcklbw %%mm6, %%mm0 \n\t"
1102  "psrlw $1, %%mm0 \n\t"
1103  "psubw %%mm7, %%mm0 \n\t"
1104  "packuswb %%mm0, %%mm0 \n\t"
1105  "movq %%mm0, %3 \n\t"
1106 
1107  "lea (%0, %1), %%"FF_REG_a" \n\t"
1108  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1109 
1110 // 0 1 2 3 4 5 6 7 8 9
1111 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1112 
1113 #undef REAL_FIND_MIN_MAX
1114 #undef FIND_MIN_MAX
1115 #if TEMPLATE_PP_MMXEXT
1116 #define REAL_FIND_MIN_MAX(addr)\
1117  "movq " #addr ", %%mm0 \n\t"\
1118  "pminub %%mm0, %%mm7 \n\t"\
1119  "pmaxub %%mm0, %%mm6 \n\t"
1120 #else
1121 #define REAL_FIND_MIN_MAX(addr)\
1122  "movq " #addr ", %%mm0 \n\t"\
1123  "movq %%mm7, %%mm1 \n\t"\
1124  "psubusb %%mm0, %%mm6 \n\t"\
1125  "paddb %%mm0, %%mm6 \n\t"\
1126  "psubusb %%mm0, %%mm1 \n\t"\
1127  "psubb %%mm1, %%mm7 \n\t"
1128 #endif
1129 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
1130 
1131 FIND_MIN_MAX((%%FF_REGa))
1132 FIND_MIN_MAX((%%FF_REGa, %1))
1133 FIND_MIN_MAX((%%FF_REGa, %1, 2))
1134 FIND_MIN_MAX((%0, %1, 4))
1135 FIND_MIN_MAX((%%FF_REGd))
1136 FIND_MIN_MAX((%%FF_REGd, %1))
1137 FIND_MIN_MAX((%%FF_REGd, %1, 2))
1138 FIND_MIN_MAX((%0, %1, 8))
1139 
1140  "movq %%mm7, %%mm4 \n\t"
1141  "psrlq $8, %%mm7 \n\t"
1142 #if TEMPLATE_PP_MMXEXT
1143  "pminub %%mm4, %%mm7 \n\t" // min of pixels
1144  "pshufw $0xF9, %%mm7, %%mm4 \n\t"
1145  "pminub %%mm4, %%mm7 \n\t" // min of pixels
1146  "pshufw $0xFE, %%mm7, %%mm4 \n\t"
1147  "pminub %%mm4, %%mm7 \n\t"
1148 #else
1149  "movq %%mm7, %%mm1 \n\t"
1150  "psubusb %%mm4, %%mm1 \n\t"
1151  "psubb %%mm1, %%mm7 \n\t"
1152  "movq %%mm7, %%mm4 \n\t"
1153  "psrlq $16, %%mm7 \n\t"
1154  "movq %%mm7, %%mm1 \n\t"
1155  "psubusb %%mm4, %%mm1 \n\t"
1156  "psubb %%mm1, %%mm7 \n\t"
1157  "movq %%mm7, %%mm4 \n\t"
1158  "psrlq $32, %%mm7 \n\t"
1159  "movq %%mm7, %%mm1 \n\t"
1160  "psubusb %%mm4, %%mm1 \n\t"
1161  "psubb %%mm1, %%mm7 \n\t"
1162 #endif
1163 
1164 
1165  "movq %%mm6, %%mm4 \n\t"
1166  "psrlq $8, %%mm6 \n\t"
1167 #if TEMPLATE_PP_MMXEXT
1168  "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
1169  "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1170  "pmaxub %%mm4, %%mm6 \n\t"
1171  "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1172  "pmaxub %%mm4, %%mm6 \n\t"
1173 #else
1174  "psubusb %%mm4, %%mm6 \n\t"
1175  "paddb %%mm4, %%mm6 \n\t"
1176  "movq %%mm6, %%mm4 \n\t"
1177  "psrlq $16, %%mm6 \n\t"
1178  "psubusb %%mm4, %%mm6 \n\t"
1179  "paddb %%mm4, %%mm6 \n\t"
1180  "movq %%mm6, %%mm4 \n\t"
1181  "psrlq $32, %%mm6 \n\t"
1182  "psubusb %%mm4, %%mm6 \n\t"
1183  "paddb %%mm4, %%mm6 \n\t"
1184 #endif
1185  "movq %%mm6, %%mm0 \n\t" // max
1186  "psubb %%mm7, %%mm6 \n\t" // max - min
1187  "push %%"FF_REG_a" \n\t"
1188  "movd %%mm6, %%eax \n\t"
1189  "cmpb "MANGLE(deringThreshold)", %%al \n\t"
1190  "pop %%"FF_REG_a" \n\t"
1191  " jb 1f \n\t"
1192  PAVGB(%%mm0, %%mm7) // a=(max + min)/2
1193  "punpcklbw %%mm7, %%mm7 \n\t"
1194  "punpcklbw %%mm7, %%mm7 \n\t"
1195  "punpcklbw %%mm7, %%mm7 \n\t"
1196  "movq %%mm7, (%4) \n\t"
1197 
1198  "movq (%0), %%mm0 \n\t" // L10
1199  "movq %%mm0, %%mm1 \n\t" // L10
1200  "movq %%mm0, %%mm2 \n\t" // L10
1201  "psllq $8, %%mm1 \n\t"
1202  "psrlq $8, %%mm2 \n\t"
1203  "movd -4(%0), %%mm3 \n\t"
1204  "movd 8(%0), %%mm4 \n\t"
1205  "psrlq $24, %%mm3 \n\t"
1206  "psllq $56, %%mm4 \n\t"
1207  "por %%mm3, %%mm1 \n\t" // L00
1208  "por %%mm4, %%mm2 \n\t" // L20
1209  "movq %%mm1, %%mm3 \n\t" // L00
1210  PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1211  PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1212  "psubusb %%mm7, %%mm0 \n\t"
1213  "psubusb %%mm7, %%mm2 \n\t"
1214  "psubusb %%mm7, %%mm3 \n\t"
1215  "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
1216  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
1217  "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
1218  "paddb %%mm2, %%mm0 \n\t"
1219  "paddb %%mm3, %%mm0 \n\t"
1220 
1221  "movq (%%"FF_REG_a"), %%mm2 \n\t" // L11
1222  "movq %%mm2, %%mm3 \n\t" // L11
1223  "movq %%mm2, %%mm4 \n\t" // L11
1224  "psllq $8, %%mm3 \n\t"
1225  "psrlq $8, %%mm4 \n\t"
1226  "movd -4(%%"FF_REG_a"), %%mm5 \n\t"
1227  "movd 8(%%"FF_REG_a"), %%mm6 \n\t"
1228  "psrlq $24, %%mm5 \n\t"
1229  "psllq $56, %%mm6 \n\t"
1230  "por %%mm5, %%mm3 \n\t" // L01
1231  "por %%mm6, %%mm4 \n\t" // L21
1232  "movq %%mm3, %%mm5 \n\t" // L01
1233  PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1234  PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1235  "psubusb %%mm7, %%mm2 \n\t"
1236  "psubusb %%mm7, %%mm4 \n\t"
1237  "psubusb %%mm7, %%mm5 \n\t"
1238  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
1239  "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
1240  "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
1241  "paddb %%mm4, %%mm2 \n\t"
1242  "paddb %%mm5, %%mm2 \n\t"
1243 // 0, 2, 3, 1
1244 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1245  "movq " #src ", " #sx " \n\t" /* src[0] */\
1246  "movq " #sx ", " #lx " \n\t" /* src[0] */\
1247  "movq " #sx ", " #t0 " \n\t" /* src[0] */\
1248  "psllq $8, " #lx " \n\t"\
1249  "psrlq $8, " #t0 " \n\t"\
1250  "movd -4" #src ", " #t1 " \n\t"\
1251  "psrlq $24, " #t1 " \n\t"\
1252  "por " #t1 ", " #lx " \n\t" /* src[-1] */\
1253  "movd 8" #src ", " #t1 " \n\t"\
1254  "psllq $56, " #t1 " \n\t"\
1255  "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1256  "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1257  PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1258  PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
1259  PAVGB(lx, pplx) \
1260  "movq " #lx ", 8(%4) \n\t"\
1261  "movq (%4), " #lx " \n\t"\
1262  "psubusb " #lx ", " #t1 " \n\t"\
1263  "psubusb " #lx ", " #t0 " \n\t"\
1264  "psubusb " #lx ", " #sx " \n\t"\
1265  "movq "MANGLE(b00)", " #lx " \n\t"\
1266  "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
1267  "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
1268  "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
1269  "paddb " #t1 ", " #t0 " \n\t"\
1270  "paddb " #t0 ", " #sx " \n\t"\
1271 \
1272  PAVGB(plx, pplx) /* filtered */\
1273  "movq " #dst ", " #t0 " \n\t" /* dst */\
1274  "movq " #t0 ", " #t1 " \n\t" /* dst */\
1275  "psubusb %3, " #t0 " \n\t"\
1276  "paddusb %3, " #t1 " \n\t"\
1277  PMAXUB(t0, pplx)\
1278  PMINUB(t1, pplx, t0)\
1279  "paddb " #sx ", " #ppsx " \n\t"\
1280  "paddb " #psx ", " #ppsx " \n\t"\
1281  "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
1282  "pand "MANGLE(b08)", " #ppsx " \n\t"\
1283  "pcmpeqb " #lx ", " #ppsx " \n\t"\
1284  "pand " #ppsx ", " #pplx " \n\t"\
1285  "pandn " #dst ", " #ppsx " \n\t"\
1286  "por " #pplx ", " #ppsx " \n\t"\
1287  "movq " #ppsx ", " #dst " \n\t"\
1288  "movq 8(%4), " #lx " \n\t"
1289 
1290 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1291  REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
1292 /*
1293 0000000
1294 1111111
1295 
1296 1111110
1297 1111101
1298 1111100
1299 1111011
1300 1111010
1301 1111001
1302 
1303 1111000
1304 1110111
1305 
1306 */
1307 //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1308 DERING_CORE((%%FF_REGa) ,(%%FF_REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1309 DERING_CORE((%%FF_REGa, %1) ,(%%FF_REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1310 DERING_CORE((%%FF_REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1311 DERING_CORE((%0, %1, 4) ,(%%FF_REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1312 DERING_CORE((%%FF_REGd) ,(%%FF_REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1313 DERING_CORE((%%FF_REGd, %1) ,(%%FF_REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1314 DERING_CORE((%%FF_REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1315 DERING_CORE((%0, %1, 8) ,(%%FF_REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1316 
1317  "1: \n\t"
1318  : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp)
1319  NAMED_CONSTRAINTS_ADD(deringThreshold,b00,b02,b08)
1320  : "%"FF_REG_a, "%"FF_REG_d
1321  );
1322 #else // HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
1323  int y;
1324  int min=255;
1325  int max=0;
1326  int avg;
1327  uint8_t *p;
1328  int s[10];
1329  const int QP2= c->QP/2 + 1;
1330 
1331  src --;
1332  for(y=1; y<9; y++){
1333  int x;
1334  p= src + stride*y;
1335  for(x=1; x<9; x++){
1336  p++;
1337  if(*p > max) max= *p;
1338  if(*p < min) min= *p;
1339  }
1340  }
1341  avg= (min + max + 1)>>1;
1342 
1343  if(max - min <deringThreshold) return;
1344 
1345  for(y=0; y<10; y++){
1346  int t = 0;
1347 
1348  if(src[stride*y + 0] > avg) t+= 1;
1349  if(src[stride*y + 1] > avg) t+= 2;
1350  if(src[stride*y + 2] > avg) t+= 4;
1351  if(src[stride*y + 3] > avg) t+= 8;
1352  if(src[stride*y + 4] > avg) t+= 16;
1353  if(src[stride*y + 5] > avg) t+= 32;
1354  if(src[stride*y + 6] > avg) t+= 64;
1355  if(src[stride*y + 7] > avg) t+= 128;
1356  if(src[stride*y + 8] > avg) t+= 256;
1357  if(src[stride*y + 9] > avg) t+= 512;
1358 
1359  t |= (~t)<<16;
1360  t &= (t<<1) & (t>>1);
1361  s[y] = t;
1362  }
1363 
1364  for(y=1; y<9; y++){
1365  int t = s[y-1] & s[y] & s[y+1];
1366  t|= t>>16;
1367  s[y-1]= t;
1368  }
1369 
1370  for(y=1; y<9; y++){
1371  int x;
1372  int t = s[y-1];
1373 
1374  p= src + stride*y;
1375  for(x=1; x<9; x++){
1376  p++;
1377  if(t & (1<<x)){
1378  int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1379  +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1380  +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1381  f= (f + 8)>>4;
1382 
1383 #ifdef DEBUG_DERING_THRESHOLD
1384  __asm__ volatile("emms\n\t":);
1385  {
1386  static uint64_t numPixels=0;
1387  if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1388 // if((max-min)<20 || (max-min)*QP<200)
1389 // if((max-min)*QP < 500)
1390 // if(max-min<QP/2)
1391  if(max-min < 20){
1392  static int numSkipped=0;
1393  static int errorSum=0;
1394  static int worstQP=0;
1395  static int worstRange=0;
1396  static int worstDiff=0;
1397  int diff= (f - *p);
1398  int absDiff= FFABS(diff);
1399  int error= diff*diff;
1400 
1401  if(x==1 || x==8 || y==1 || y==8) continue;
1402 
1403  numSkipped++;
1404  if(absDiff > worstDiff){
1405  worstDiff= absDiff;
1406  worstQP= QP;
1407  worstRange= max-min;
1408  }
1409  errorSum+= error;
1410 
1411  if(1024LL*1024LL*1024LL % numSkipped == 0){
1412  av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1413  "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1414  (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
1415  worstDiff, (float)numSkipped/numPixels);
1416  }
1417  }
1418  }
1419 #endif
1420  if (*p + QP2 < f) *p= *p + QP2;
1421  else if(*p - QP2 > f) *p= *p - QP2;
1422  else *p=f;
1423  }
1424  }
1425  }
1426 #ifdef DEBUG_DERING_THRESHOLD
1427  if(max-min < 20){
1428  for(y=1; y<9; y++){
1429  int x;
1430  int t = 0;
1431  p= src + stride*y;
1432  for(x=1; x<9; x++){
1433  p++;
1434  *p = FFMIN(*p + 20, 255);
1435  }
1436  }
1437 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1438  }
1439 #endif
1440 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1441 }
1442 #endif //TEMPLATE_PP_ALTIVEC
1443 
1444 /**
1445  * Deinterlace the given block by linearly interpolating every second line.
1446  * will be called for every 8x8 block and can read & write from line 4-15
1447  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1448  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1449  */
1450 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1451 {
1452 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1453  src+= 4*stride;
1454  __asm__ volatile(
1455  "lea (%0, %1), %%"FF_REG_a" \n\t"
1456  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
1457 // 0 1 2 3 4 5 6 7 8 9
1458 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
1459 
1460  "movq (%0), %%mm0 \n\t"
1461  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1462  PAVGB(%%mm1, %%mm0)
1463  "movq %%mm0, (%%"FF_REG_a") \n\t"
1464  "movq (%0, %1, 4), %%mm0 \n\t"
1465  PAVGB(%%mm0, %%mm1)
1466  "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t"
1467  "movq (%%"FF_REG_c", %1), %%mm1 \n\t"
1468  PAVGB(%%mm1, %%mm0)
1469  "movq %%mm0, (%%"FF_REG_c") \n\t"
1470  "movq (%0, %1, 8), %%mm0 \n\t"
1471  PAVGB(%%mm0, %%mm1)
1472  "movq %%mm1, (%%"FF_REG_c", %1, 2) \n\t"
1473 
1474  : : "r" (src), "r" ((x86_reg)stride)
1475  : "%"FF_REG_a, "%"FF_REG_c
1476  );
1477 #else
1478  int a, b, x;
1479  src+= 4*stride;
1480 
1481  for(x=0; x<2; x++){
1482  a= *(uint32_t*)&src[stride*0];
1483  b= *(uint32_t*)&src[stride*2];
1484  *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1485  a= *(uint32_t*)&src[stride*4];
1486  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1487  b= *(uint32_t*)&src[stride*6];
1488  *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1489  a= *(uint32_t*)&src[stride*8];
1490  *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1491  src += 4;
1492  }
1493 #endif
1494 }
1495 
1496 /**
1497  * Deinterlace the given block by cubic interpolating every second line.
1498  * will be called for every 8x8 block and can read & write from line 4-15
1499  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1500  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1501  * this filter will read lines 3-15 and write 7-13
1502  */
1503 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1504 {
1505 #if TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1506  src+= stride*3;
1507  __asm__ volatile(
1508  "lea (%0, %1), %%"FF_REG_a" \n\t"
1509  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1510  "lea (%%"FF_REG_d", %1, 4), %%"FF_REG_c"\n\t"
1511  "add %1, %%"FF_REG_c" \n\t"
1512 #if TEMPLATE_PP_SSE2
1513  "pxor %%xmm7, %%xmm7 \n\t"
1514 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1515  "movq " #a ", %%xmm0 \n\t"\
1516  "movq " #b ", %%xmm1 \n\t"\
1517  "movq " #d ", %%xmm2 \n\t"\
1518  "movq " #e ", %%xmm3 \n\t"\
1519  "pavgb %%xmm2, %%xmm1 \n\t"\
1520  "pavgb %%xmm3, %%xmm0 \n\t"\
1521  "punpcklbw %%xmm7, %%xmm0 \n\t"\
1522  "punpcklbw %%xmm7, %%xmm1 \n\t"\
1523  "psubw %%xmm1, %%xmm0 \n\t"\
1524  "psraw $3, %%xmm0 \n\t"\
1525  "psubw %%xmm0, %%xmm1 \n\t"\
1526  "packuswb %%xmm1, %%xmm1 \n\t"\
1527  "movlps %%xmm1, " #c " \n\t"
1528 #else //TEMPLATE_PP_SSE2
1529  "pxor %%mm7, %%mm7 \n\t"
1530 // 0 1 2 3 4 5 6 7 8 9 10
1531 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1532 
1533 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1534  "movq " #a ", %%mm0 \n\t"\
1535  "movq " #b ", %%mm1 \n\t"\
1536  "movq " #d ", %%mm2 \n\t"\
1537  "movq " #e ", %%mm3 \n\t"\
1538  PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1539  PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
1540  "movq %%mm0, %%mm2 \n\t"\
1541  "punpcklbw %%mm7, %%mm0 \n\t"\
1542  "punpckhbw %%mm7, %%mm2 \n\t"\
1543  "movq %%mm1, %%mm3 \n\t"\
1544  "punpcklbw %%mm7, %%mm1 \n\t"\
1545  "punpckhbw %%mm7, %%mm3 \n\t"\
1546  "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1547  "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
1548  "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
1549  "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
1550  "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1551  "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1552  "packuswb %%mm3, %%mm1 \n\t"\
1553  "movq %%mm1, " #c " \n\t"
1554 #endif //TEMPLATE_PP_SSE2
1555 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
1556 
1557 DEINT_CUBIC((%0) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd, %1))
1558 DEINT_CUBIC((%%FF_REGa, %1), (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%0, %1, 8))
1559 DEINT_CUBIC((%0, %1, 4) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGc))
1560 DEINT_CUBIC((%%FF_REGd, %1), (%0, %1, 8) , (%%FF_REGd, %1, 4), (%%FF_REGc) , (%%FF_REGc, %1, 2))
1561 
1562  : : "r" (src), "r" ((x86_reg)stride)
1563  :
1564 #if TEMPLATE_PP_SSE2
1565  XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm7",)
1566 #endif
1567  "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c
1568  );
1569 #undef REAL_DEINT_CUBIC
1570 #else //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1571  int x;
1572  src+= stride*3;
1573  for(x=0; x<8; x++){
1574  src[stride*3] = av_clip_uint8((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1575  src[stride*5] = av_clip_uint8((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1576  src[stride*7] = av_clip_uint8((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1577  src[stride*9] = av_clip_uint8((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1578  src++;
1579  }
1580 #endif //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1581 }
1582 
1583 /**
1584  * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1585  * will be called for every 8x8 block and can read & write from line 4-15
1586  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1587  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1588  * this filter will read lines 4-13 and write 5-11
1589  */
1590 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1591 {
1592 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1593  src+= stride*4;
1594  __asm__ volatile(
1595  "lea (%0, %1), %%"FF_REG_a" \n\t"
1596  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1597  "pxor %%mm7, %%mm7 \n\t"
1598  "movq (%2), %%mm0 \n\t"
1599 // 0 1 2 3 4 5 6 7 8 9 10
1600 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1601 
1602 #define REAL_DEINT_FF(a,b,c,d)\
1603  "movq " #a ", %%mm1 \n\t"\
1604  "movq " #b ", %%mm2 \n\t"\
1605  "movq " #c ", %%mm3 \n\t"\
1606  "movq " #d ", %%mm4 \n\t"\
1607  PAVGB(%%mm3, %%mm1) \
1608  PAVGB(%%mm4, %%mm0) \
1609  "movq %%mm0, %%mm3 \n\t"\
1610  "punpcklbw %%mm7, %%mm0 \n\t"\
1611  "punpckhbw %%mm7, %%mm3 \n\t"\
1612  "movq %%mm1, %%mm4 \n\t"\
1613  "punpcklbw %%mm7, %%mm1 \n\t"\
1614  "punpckhbw %%mm7, %%mm4 \n\t"\
1615  "psllw $2, %%mm1 \n\t"\
1616  "psllw $2, %%mm4 \n\t"\
1617  "psubw %%mm0, %%mm1 \n\t"\
1618  "psubw %%mm3, %%mm4 \n\t"\
1619  "movq %%mm2, %%mm5 \n\t"\
1620  "movq %%mm2, %%mm0 \n\t"\
1621  "punpcklbw %%mm7, %%mm2 \n\t"\
1622  "punpckhbw %%mm7, %%mm5 \n\t"\
1623  "paddw %%mm2, %%mm1 \n\t"\
1624  "paddw %%mm5, %%mm4 \n\t"\
1625  "psraw $2, %%mm1 \n\t"\
1626  "psraw $2, %%mm4 \n\t"\
1627  "packuswb %%mm4, %%mm1 \n\t"\
1628  "movq %%mm1, " #b " \n\t"\
1629 
1630 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)
1631 
1632 DEINT_FF((%0) , (%%FF_REGa) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2))
1633 DEINT_FF((%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
1634 DEINT_FF((%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2))
1635 DEINT_FF((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))
1636 
1637  "movq %%mm0, (%2) \n\t"
1638  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
1639  : "%"FF_REG_a, "%"FF_REG_d
1640  );
1641 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1642  int x;
1643  src+= stride*4;
1644  for(x=0; x<8; x++){
1645  int t1= tmp[x];
1646  int t2= src[stride*1];
1647 
1648  src[stride*1]= av_clip_uint8((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1649  t1= src[stride*4];
1650  src[stride*3]= av_clip_uint8((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1651  t2= src[stride*6];
1652  src[stride*5]= av_clip_uint8((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1653  t1= src[stride*8];
1654  src[stride*7]= av_clip_uint8((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1655  tmp[x]= t1;
1656 
1657  src++;
1658  }
1659 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1660 }
1661 
1662 /**
1663  * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter.
1664  * will be called for every 8x8 block and can read & write from line 4-15
1665  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1666  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1667  * this filter will read lines 4-13 and write 4-11
1668  */
1669 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1670 {
1671 #if (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
1672  src+= stride*4;
1673  __asm__ volatile(
1674  "lea (%0, %1), %%"FF_REG_a" \n\t"
1675  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1676  "pxor %%mm7, %%mm7 \n\t"
1677  "movq (%2), %%mm0 \n\t"
1678  "movq (%3), %%mm1 \n\t"
1679 // 0 1 2 3 4 5 6 7 8 9 10
1680 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1681 
1682 #define REAL_DEINT_L5(t1,t2,a,b,c)\
1683  "movq " #a ", %%mm2 \n\t"\
1684  "movq " #b ", %%mm3 \n\t"\
1685  "movq " #c ", %%mm4 \n\t"\
1686  PAVGB(t2, %%mm3) \
1687  PAVGB(t1, %%mm4) \
1688  "movq %%mm2, %%mm5 \n\t"\
1689  "movq %%mm2, " #t1 " \n\t"\
1690  "punpcklbw %%mm7, %%mm2 \n\t"\
1691  "punpckhbw %%mm7, %%mm5 \n\t"\
1692  "movq %%mm2, %%mm6 \n\t"\
1693  "paddw %%mm2, %%mm2 \n\t"\
1694  "paddw %%mm6, %%mm2 \n\t"\
1695  "movq %%mm5, %%mm6 \n\t"\
1696  "paddw %%mm5, %%mm5 \n\t"\
1697  "paddw %%mm6, %%mm5 \n\t"\
1698  "movq %%mm3, %%mm6 \n\t"\
1699  "punpcklbw %%mm7, %%mm3 \n\t"\
1700  "punpckhbw %%mm7, %%mm6 \n\t"\
1701  "paddw %%mm3, %%mm3 \n\t"\
1702  "paddw %%mm6, %%mm6 \n\t"\
1703  "paddw %%mm3, %%mm2 \n\t"\
1704  "paddw %%mm6, %%mm5 \n\t"\
1705  "movq %%mm4, %%mm6 \n\t"\
1706  "punpcklbw %%mm7, %%mm4 \n\t"\
1707  "punpckhbw %%mm7, %%mm6 \n\t"\
1708  "psubw %%mm4, %%mm2 \n\t"\
1709  "psubw %%mm6, %%mm5 \n\t"\
1710  "psraw $2, %%mm2 \n\t"\
1711  "psraw $2, %%mm5 \n\t"\
1712  "packuswb %%mm5, %%mm2 \n\t"\
1713  "movq %%mm2, " #a " \n\t"\
1714 
1715 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)
1716 
1717 DEINT_L5(%%mm0, %%mm1, (%0) , (%%FF_REGa) , (%%FF_REGa, %1) )
1718 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa) , (%%FF_REGa, %1) , (%%FF_REGa, %1, 2))
1719 DEINT_L5(%%mm0, %%mm1, (%%FF_REGa, %1) , (%%FF_REGa, %1, 2), (%0, %1, 4) )
1720 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
1721 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1) )
1722 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd) , (%%FF_REGd, %1) , (%%FF_REGd, %1, 2))
1723 DEINT_L5(%%mm0, %%mm1, (%%FF_REGd, %1) , (%%FF_REGd, %1, 2), (%0, %1, 8) )
1724 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))
1725 
1726  "movq %%mm0, (%2) \n\t"
1727  "movq %%mm1, (%3) \n\t"
1728  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
1729  : "%"FF_REG_a, "%"FF_REG_d
1730  );
1731 #else //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
1732  int x;
1733  src+= stride*4;
1734  for(x=0; x<8; x++){
1735  int t1= tmp[x];
1736  int t2= tmp2[x];
1737  int t3= src[0];
1738 
1739  src[stride*0]= av_clip_uint8((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1740  t1= src[stride*1];
1741  src[stride*1]= av_clip_uint8((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1742  t2= src[stride*2];
1743  src[stride*2]= av_clip_uint8((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1744  t3= src[stride*3];
1745  src[stride*3]= av_clip_uint8((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1746  t1= src[stride*4];
1747  src[stride*4]= av_clip_uint8((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1748  t2= src[stride*5];
1749  src[stride*5]= av_clip_uint8((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1750  t3= src[stride*6];
1751  src[stride*6]= av_clip_uint8((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1752  t1= src[stride*7];
1753  src[stride*7]= av_clip_uint8((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1754 
1755  tmp[x]= t3;
1756  tmp2[x]= t1;
1757 
1758  src++;
1759  }
1760 #endif //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
1761 }
1762 
1763 /**
1764  * Deinterlace the given block by filtering all lines with a (1 2 1) filter.
1765  * will be called for every 8x8 block and can read & write from line 4-15
1766  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1767  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1768  * this filter will read lines 4-13 and write 4-11
1769  */
1770 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1771 {
1772 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1773  src+= 4*stride;
1774  __asm__ volatile(
1775  "lea (%0, %1), %%"FF_REG_a" \n\t"
1776  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1777 // 0 1 2 3 4 5 6 7 8 9
1778 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1779 
1780  "movq (%2), %%mm0 \n\t" // L0
1781  "movq (%%"FF_REG_a"), %%mm1 \n\t" // L2
1782  PAVGB(%%mm1, %%mm0) // L0+L2
1783  "movq (%0), %%mm2 \n\t" // L1
1784  PAVGB(%%mm2, %%mm0)
1785  "movq %%mm0, (%0) \n\t"
1786  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // L3
1787  PAVGB(%%mm0, %%mm2) // L1+L3
1788  PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1789  "movq %%mm2, (%%"FF_REG_a") \n\t"
1790  "movq (%%"FF_REG_a", %1, 2), %%mm2 \n\t" // L4
1791  PAVGB(%%mm2, %%mm1) // L2+L4
1792  PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1793  "movq %%mm1, (%%"FF_REG_a", %1) \n\t"
1794  "movq (%0, %1, 4), %%mm1 \n\t" // L5
1795  PAVGB(%%mm1, %%mm0) // L3+L5
1796  PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
1797  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
1798  "movq (%%"FF_REG_d"), %%mm0 \n\t" // L6
1799  PAVGB(%%mm0, %%mm2) // L4+L6
1800  PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
1801  "movq %%mm2, (%0, %1, 4) \n\t"
1802  "movq (%%"FF_REG_d", %1), %%mm2 \n\t" // L7
1803  PAVGB(%%mm2, %%mm1) // L5+L7
1804  PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
1805  "movq %%mm1, (%%"FF_REG_d") \n\t"
1806  "movq (%%"FF_REG_d", %1, 2), %%mm1 \n\t" // L8
1807  PAVGB(%%mm1, %%mm0) // L6+L8
1808  PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
1809  "movq %%mm0, (%%"FF_REG_d", %1) \n\t"
1810  "movq (%0, %1, 8), %%mm0 \n\t" // L9
1811  PAVGB(%%mm0, %%mm2) // L7+L9
1812  PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
1813  "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
1814  "movq %%mm1, (%2) \n\t"
1815 
1816  : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
1817  : "%"FF_REG_a, "%"FF_REG_d
1818  );
1819 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1820  int a, b, c, x;
1821  src+= 4*stride;
1822 
1823  for(x=0; x<2; x++){
1824  a= *(uint32_t*)&tmp[stride*0];
1825  b= *(uint32_t*)&src[stride*0];
1826  c= *(uint32_t*)&src[stride*1];
1827  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1828  *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1829 
1830  a= *(uint32_t*)&src[stride*2];
1831  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1832  *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1833 
1834  b= *(uint32_t*)&src[stride*3];
1835  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1836  *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1837 
1838  c= *(uint32_t*)&src[stride*4];
1839  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1840  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1841 
1842  a= *(uint32_t*)&src[stride*5];
1843  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1844  *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1845 
1846  b= *(uint32_t*)&src[stride*6];
1847  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1848  *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1849 
1850  c= *(uint32_t*)&src[stride*7];
1851  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1852  *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1853 
1854  a= *(uint32_t*)&src[stride*8];
1855  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1856  *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1857 
1858  *(uint32_t*)&tmp[stride*0]= c;
1859  src += 4;
1860  tmp += 4;
1861  }
1862 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1863 }
1864 
1865 /**
1866  * Deinterlace the given block by applying a median filter to every second line.
1867  * will be called for every 8x8 block and can read & write from line 4-15,
1868  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1869  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1870  */
1871 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1872 {
1873 #if TEMPLATE_PP_MMX
1874  src+= 4*stride;
1875 #if TEMPLATE_PP_MMXEXT
1876  __asm__ volatile(
1877  "lea (%0, %1), %%"FF_REG_a" \n\t"
1878  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1879 // 0 1 2 3 4 5 6 7 8 9
1880 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1881 
1882  "movq (%0), %%mm0 \n\t"
1883  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
1884  "movq (%%"FF_REG_a"), %%mm1 \n\t"
1885  "movq %%mm0, %%mm3 \n\t"
1886  "pmaxub %%mm1, %%mm0 \n\t"
1887  "pminub %%mm3, %%mm1 \n\t"
1888  "pmaxub %%mm2, %%mm1 \n\t"
1889  "pminub %%mm1, %%mm0 \n\t"
1890  "movq %%mm0, (%%"FF_REG_a") \n\t"
1891 
1892  "movq (%0, %1, 4), %%mm0 \n\t"
1893  "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t"
1894  "movq %%mm2, %%mm3 \n\t"
1895  "pmaxub %%mm1, %%mm2 \n\t"
1896  "pminub %%mm3, %%mm1 \n\t"
1897  "pmaxub %%mm0, %%mm1 \n\t"
1898  "pminub %%mm1, %%mm2 \n\t"
1899  "movq %%mm2, (%%"FF_REG_a", %1, 2) \n\t"
1900 
1901  "movq (%%"FF_REG_d"), %%mm2 \n\t"
1902  "movq (%%"FF_REG_d", %1), %%mm1 \n\t"
1903  "movq %%mm2, %%mm3 \n\t"
1904  "pmaxub %%mm0, %%mm2 \n\t"
1905  "pminub %%mm3, %%mm0 \n\t"
1906  "pmaxub %%mm1, %%mm0 \n\t"
1907  "pminub %%mm0, %%mm2 \n\t"
1908  "movq %%mm2, (%%"FF_REG_d") \n\t"
1909 
1910  "movq (%%"FF_REG_d", %1, 2), %%mm2 \n\t"
1911  "movq (%0, %1, 8), %%mm0 \n\t"
1912  "movq %%mm2, %%mm3 \n\t"
1913  "pmaxub %%mm0, %%mm2 \n\t"
1914  "pminub %%mm3, %%mm0 \n\t"
1915  "pmaxub %%mm1, %%mm0 \n\t"
1916  "pminub %%mm0, %%mm2 \n\t"
1917  "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
1918 
1919 
1920  : : "r" (src), "r" ((x86_reg)stride)
1921  : "%"FF_REG_a, "%"FF_REG_d
1922  );
1923 
1924 #else // MMX without MMX2
1925  __asm__ volatile(
1926  "lea (%0, %1), %%"FF_REG_a" \n\t"
1927  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1928 // 0 1 2 3 4 5 6 7 8 9
1929 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1930  "pxor %%mm7, %%mm7 \n\t"
1931 
1932 #define REAL_MEDIAN(a,b,c)\
1933  "movq " #a ", %%mm0 \n\t"\
1934  "movq " #b ", %%mm2 \n\t"\
1935  "movq " #c ", %%mm1 \n\t"\
1936  "movq %%mm0, %%mm3 \n\t"\
1937  "movq %%mm1, %%mm4 \n\t"\
1938  "movq %%mm2, %%mm5 \n\t"\
1939  "psubusb %%mm1, %%mm3 \n\t"\
1940  "psubusb %%mm2, %%mm4 \n\t"\
1941  "psubusb %%mm0, %%mm5 \n\t"\
1942  "pcmpeqb %%mm7, %%mm3 \n\t"\
1943  "pcmpeqb %%mm7, %%mm4 \n\t"\
1944  "pcmpeqb %%mm7, %%mm5 \n\t"\
1945  "movq %%mm3, %%mm6 \n\t"\
1946  "pxor %%mm4, %%mm3 \n\t"\
1947  "pxor %%mm5, %%mm4 \n\t"\
1948  "pxor %%mm6, %%mm5 \n\t"\
1949  "por %%mm3, %%mm1 \n\t"\
1950  "por %%mm4, %%mm2 \n\t"\
1951  "por %%mm5, %%mm0 \n\t"\
1952  "pand %%mm2, %%mm0 \n\t"\
1953  "pand %%mm1, %%mm0 \n\t"\
1954  "movq %%mm0, " #b " \n\t"
1955 #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c)
1956 
1957 MEDIAN((%0) , (%%FF_REGa) , (%%FF_REGa, %1))
1958 MEDIAN((%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4))
1959 MEDIAN((%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1))
1960 MEDIAN((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8))
1961 
1962  : : "r" (src), "r" ((x86_reg)stride)
1963  : "%"FF_REG_a, "%"FF_REG_d
1964  );
1965 #endif //TEMPLATE_PP_MMXEXT
1966 #else //TEMPLATE_PP_MMX
1967  int x, y;
1968  src+= 4*stride;
1969  // FIXME - there should be a way to do a few columns in parallel like w/mmx
1970  for(x=0; x<8; x++){
1971  uint8_t *colsrc = src;
1972  for (y=0; y<4; y++){
1973  int a, b, c, d, e, f;
1974  a = colsrc[0 ];
1975  b = colsrc[stride ];
1976  c = colsrc[stride*2];
1977  d = (a-b)>>31;
1978  e = (b-c)>>31;
1979  f = (c-a)>>31;
1980  colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
1981  colsrc += stride*2;
1982  }
1983  src++;
1984  }
1985 #endif //TEMPLATE_PP_MMX
1986 }
1987 
1988 #if TEMPLATE_PP_MMX
1989 /**
1990  * Transpose and shift the given 8x8 Block into dst1 and dst2.
1991  */
1992 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, const uint8_t *src, int srcStride)
1993 {
1994  __asm__(
1995  "lea (%0, %1), %%"FF_REG_a" \n\t"
1996 // 0 1 2 3 4 5 6 7 8 9
1997 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1998  "movq (%0), %%mm0 \n\t" // 12345678
1999  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
2000  "movq %%mm0, %%mm2 \n\t" // 12345678
2001  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2002  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2003 
2004  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
2005  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
2006  "movq %%mm1, %%mm4 \n\t"
2007  "punpcklbw %%mm3, %%mm1 \n\t"
2008  "punpckhbw %%mm3, %%mm4 \n\t"
2009 
2010  "movq %%mm0, %%mm3 \n\t"
2011  "punpcklwd %%mm1, %%mm0 \n\t"
2012  "punpckhwd %%mm1, %%mm3 \n\t"
2013  "movq %%mm2, %%mm1 \n\t"
2014  "punpcklwd %%mm4, %%mm2 \n\t"
2015  "punpckhwd %%mm4, %%mm1 \n\t"
2016 
2017  "movd %%mm0, 128(%2) \n\t"
2018  "psrlq $32, %%mm0 \n\t"
2019  "movd %%mm0, 144(%2) \n\t"
2020  "movd %%mm3, 160(%2) \n\t"
2021  "psrlq $32, %%mm3 \n\t"
2022  "movd %%mm3, 176(%2) \n\t"
2023  "movd %%mm3, 48(%3) \n\t"
2024  "movd %%mm2, 192(%2) \n\t"
2025  "movd %%mm2, 64(%3) \n\t"
2026  "psrlq $32, %%mm2 \n\t"
2027  "movd %%mm2, 80(%3) \n\t"
2028  "movd %%mm1, 96(%3) \n\t"
2029  "psrlq $32, %%mm1 \n\t"
2030  "movd %%mm1, 112(%3) \n\t"
2031 
2032  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_a"\n\t"
2033 
2034  "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2035  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
2036  "movq %%mm0, %%mm2 \n\t" // 12345678
2037  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2038  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2039 
2040  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
2041  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
2042  "movq %%mm1, %%mm4 \n\t"
2043  "punpcklbw %%mm3, %%mm1 \n\t"
2044  "punpckhbw %%mm3, %%mm4 \n\t"
2045 
2046  "movq %%mm0, %%mm3 \n\t"
2047  "punpcklwd %%mm1, %%mm0 \n\t"
2048  "punpckhwd %%mm1, %%mm3 \n\t"
2049  "movq %%mm2, %%mm1 \n\t"
2050  "punpcklwd %%mm4, %%mm2 \n\t"
2051  "punpckhwd %%mm4, %%mm1 \n\t"
2052 
2053  "movd %%mm0, 132(%2) \n\t"
2054  "psrlq $32, %%mm0 \n\t"
2055  "movd %%mm0, 148(%2) \n\t"
2056  "movd %%mm3, 164(%2) \n\t"
2057  "psrlq $32, %%mm3 \n\t"
2058  "movd %%mm3, 180(%2) \n\t"
2059  "movd %%mm3, 52(%3) \n\t"
2060  "movd %%mm2, 196(%2) \n\t"
2061  "movd %%mm2, 68(%3) \n\t"
2062  "psrlq $32, %%mm2 \n\t"
2063  "movd %%mm2, 84(%3) \n\t"
2064  "movd %%mm1, 100(%3) \n\t"
2065  "psrlq $32, %%mm1 \n\t"
2066  "movd %%mm1, 116(%3) \n\t"
2067 
2068 
2069  :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
2070  : "%"FF_REG_a
2071  );
2072 }
2073 
2074 /**
2075  * Transpose the given 8x8 block.
2076  */
2077 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, const uint8_t *src)
2078 {
2079  __asm__(
2080  "lea (%0, %1), %%"FF_REG_a" \n\t"
2081  "lea (%%"FF_REG_a",%1,4), %%"FF_REG_d" \n\t"
2082 // 0 1 2 3 4 5 6 7 8 9
2083 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2084  "movq (%2), %%mm0 \n\t" // 12345678
2085  "movq 16(%2), %%mm1 \n\t" // abcdefgh
2086  "movq %%mm0, %%mm2 \n\t" // 12345678
2087  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2088  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2089 
2090  "movq 32(%2), %%mm1 \n\t"
2091  "movq 48(%2), %%mm3 \n\t"
2092  "movq %%mm1, %%mm4 \n\t"
2093  "punpcklbw %%mm3, %%mm1 \n\t"
2094  "punpckhbw %%mm3, %%mm4 \n\t"
2095 
2096  "movq %%mm0, %%mm3 \n\t"
2097  "punpcklwd %%mm1, %%mm0 \n\t"
2098  "punpckhwd %%mm1, %%mm3 \n\t"
2099  "movq %%mm2, %%mm1 \n\t"
2100  "punpcklwd %%mm4, %%mm2 \n\t"
2101  "punpckhwd %%mm4, %%mm1 \n\t"
2102 
2103  "movd %%mm0, (%0) \n\t"
2104  "psrlq $32, %%mm0 \n\t"
2105  "movd %%mm0, (%%"FF_REG_a") \n\t"
2106  "movd %%mm3, (%%"FF_REG_a", %1) \n\t"
2107  "psrlq $32, %%mm3 \n\t"
2108  "movd %%mm3, (%%"FF_REG_a", %1, 2) \n\t"
2109  "movd %%mm2, (%0, %1, 4) \n\t"
2110  "psrlq $32, %%mm2 \n\t"
2111  "movd %%mm2, (%%"FF_REG_d") \n\t"
2112  "movd %%mm1, (%%"FF_REG_d", %1) \n\t"
2113  "psrlq $32, %%mm1 \n\t"
2114  "movd %%mm1, (%%"FF_REG_d", %1, 2) \n\t"
2115 
2116 
2117  "movq 64(%2), %%mm0 \n\t" // 12345678
2118  "movq 80(%2), %%mm1 \n\t" // abcdefgh
2119  "movq %%mm0, %%mm2 \n\t" // 12345678
2120  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2121  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2122 
2123  "movq 96(%2), %%mm1 \n\t"
2124  "movq 112(%2), %%mm3 \n\t"
2125  "movq %%mm1, %%mm4 \n\t"
2126  "punpcklbw %%mm3, %%mm1 \n\t"
2127  "punpckhbw %%mm3, %%mm4 \n\t"
2128 
2129  "movq %%mm0, %%mm3 \n\t"
2130  "punpcklwd %%mm1, %%mm0 \n\t"
2131  "punpckhwd %%mm1, %%mm3 \n\t"
2132  "movq %%mm2, %%mm1 \n\t"
2133  "punpcklwd %%mm4, %%mm2 \n\t"
2134  "punpckhwd %%mm4, %%mm1 \n\t"
2135 
2136  "movd %%mm0, 4(%0) \n\t"
2137  "psrlq $32, %%mm0 \n\t"
2138  "movd %%mm0, 4(%%"FF_REG_a") \n\t"
2139  "movd %%mm3, 4(%%"FF_REG_a", %1) \n\t"
2140  "psrlq $32, %%mm3 \n\t"
2141  "movd %%mm3, 4(%%"FF_REG_a", %1, 2) \n\t"
2142  "movd %%mm2, 4(%0, %1, 4) \n\t"
2143  "psrlq $32, %%mm2 \n\t"
2144  "movd %%mm2, 4(%%"FF_REG_d") \n\t"
2145  "movd %%mm1, 4(%%"FF_REG_d", %1) \n\t"
2146  "psrlq $32, %%mm1 \n\t"
2147  "movd %%mm1, 4(%%"FF_REG_d", %1, 2) \n\t"
2148 
2149  :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
2150  : "%"FF_REG_a, "%"FF_REG_d
2151  );
2152 }
2153 #endif //TEMPLATE_PP_MMX
2154 //static long test=0;
2155 
2156 #if !TEMPLATE_PP_ALTIVEC
/**
 * Temporal noise reducer for one 8x8 block.
 *
 * Measures how much the current block (src) differs from the reference block
 * (tempBlurred), smooths that measure with four neighbouring entries of
 * tempBlurredPast and then, depending on the thresholds in maxNoise, either
 * copies src into tempBlurred or blends the two and writes the blend to both.
 *
 * Going by the C fallback, with d being the smoothed difference:
 *   d >  maxNoise[1] and d >= maxNoise[2]: tempBlurred = src (src is untouched)
 *   d >  maxNoise[1] and d <  maxNoise[2]: both = (ref   + cur + 1) >> 1
 *   d <= maxNoise[1] and d >= maxNoise[0]: both = (ref*3 + cur + 2) >> 2
 *   d <= maxNoise[1] and d <  maxNoise[0]: both = (ref*7 + cur + 4) >> 3
 *
 * @param src             current block
 * @param stride          distance in bytes between two lines, used for both
 *                        src and tempBlurred
 * @param tempBlurred     reference block
 * @param tempBlurredPast entry of this block in a table of past differences;
 *                        the entries at -256, -1, +1 and +256 are read, the
 *                        entry at 0 is written and the entries at +127..+129
 *                        are overwritten with the thresholds
 * @param maxNoise        the three thresholds maxNoise[0..2]
 */
2157 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2158  uint8_t *tempBlurred, uint32_t *tempBlurredPast, const int *maxNoise)
2159 {
2160  // to save a register (FIXME do this outside of the loops)
// The asm below reads these three values back at byte offsets 508, 512 and
// 516 from tempBlurredPast instead of taking maxNoise as an operand.
2161  tempBlurredPast[127]= maxNoise[0];
2162  tempBlurredPast[128]= maxNoise[1];
2163  tempBlurredPast[129]= maxNoise[2];
2164 
2165 #define FAST_L2_DIFF
2166 //#define L1_DIFF //u should change the thresholds too if u try that one
2167 #if (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
2168  __asm__ volatile(
2169  "lea (%2, %2, 2), %%"FF_REG_a" \n\t" // 3*stride
2170  "lea (%2, %2, 4), %%"FF_REG_d" \n\t" // 5*stride
2171  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2172 // 0 1 2 3 4 5 6 7 8 9
2173 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
2174 //FIXME reorder?
// Step 1: accumulate the block difference between src (%0) and tempBlurred
// (%1) over all eight lines into mm0. Which of the three variants below is
// assembled depends on L1_DIFF / FAST_L2_DIFF.
2175 #ifdef L1_DIFF //needs mmx2
2176  "movq (%0), %%mm0 \n\t" // L0
2177  "psadbw (%1), %%mm0 \n\t" // |L0-R0|
2178  "movq (%0, %2), %%mm1 \n\t" // L1
2179  "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
2180  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2181  "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
2182  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2183  "psadbw (%1, %%"FF_REG_a"), %%mm3 \n\t" // |L3-R3|
2184 
2185  "movq (%0, %2, 4), %%mm4 \n\t" // L4
2186  "paddw %%mm1, %%mm0 \n\t"
2187  "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
2188  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
2189  "paddw %%mm2, %%mm0 \n\t"
2190  "psadbw (%1, %%"FF_REG_d"), %%mm5 \n\t" // |L5-R5|
2191  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
2192  "paddw %%mm3, %%mm0 \n\t"
2193  "psadbw (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // |L6-R6|
2194  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
2195  "paddw %%mm4, %%mm0 \n\t"
2196  "psadbw (%1, %%"FF_REG_c"), %%mm7 \n\t" // |L7-R7|
2197  "paddw %%mm5, %%mm6 \n\t"
2198  "paddw %%mm7, %%mm6 \n\t"
2199  "paddw %%mm6, %%mm0 \n\t"
2200 #else //L1_DIFF
2201 #if defined (FAST_L2_DIFF)
2202  "pcmpeqb %%mm7, %%mm7 \n\t"
2203  "movq "MANGLE(b80)", %%mm6 \n\t"
2204  "pxor %%mm0, %%mm0 \n\t"
// Presumably an approximation of the squared difference that stays in byte
// precision (PAVGB and b80 are defined outside this block) -- TODO confirm.
2205 #define REAL_L2_DIFF_CORE(a, b)\
2206  "movq " #a ", %%mm5 \n\t"\
2207  "movq " #b ", %%mm2 \n\t"\
2208  "pxor %%mm7, %%mm2 \n\t"\
2209  PAVGB(%%mm2, %%mm5)\
2210  "paddb %%mm6, %%mm5 \n\t"\
2211  "movq %%mm5, %%mm2 \n\t"\
2212  "psllw $8, %%mm5 \n\t"\
2213  "pmaddwd %%mm5, %%mm5 \n\t"\
2214  "pmaddwd %%mm2, %%mm2 \n\t"\
2215  "paddd %%mm2, %%mm5 \n\t"\
2216  "psrld $14, %%mm5 \n\t"\
2217  "paddd %%mm5, %%mm0 \n\t"
2218 
2219 #else //defined (FAST_L2_DIFF)
2220  "pxor %%mm7, %%mm7 \n\t"
2221  "pxor %%mm0, %%mm0 \n\t"
// Exact variant: widen both lines to 16 bit, subtract, square and sum.
2222 #define REAL_L2_DIFF_CORE(a, b)\
2223  "movq " #a ", %%mm5 \n\t"\
2224  "movq " #b ", %%mm2 \n\t"\
2225  "movq %%mm5, %%mm1 \n\t"\
2226  "movq %%mm2, %%mm3 \n\t"\
2227  "punpcklbw %%mm7, %%mm5 \n\t"\
2228  "punpckhbw %%mm7, %%mm1 \n\t"\
2229  "punpcklbw %%mm7, %%mm2 \n\t"\
2230  "punpckhbw %%mm7, %%mm3 \n\t"\
2231  "psubw %%mm2, %%mm5 \n\t"\
2232  "psubw %%mm3, %%mm1 \n\t"\
2233  "pmaddwd %%mm5, %%mm5 \n\t"\
2234  "pmaddwd %%mm1, %%mm1 \n\t"\
2235  "paddd %%mm1, %%mm5 \n\t"\
2236  "paddd %%mm5, %%mm0 \n\t"
2237 
2238 #endif //defined (FAST_L2_DIFF)
2239 
2240 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
2241 
2242 L2_DIFF_CORE((%0) , (%1))
2243 L2_DIFF_CORE((%0, %2) , (%1, %2))
2244 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
2245 L2_DIFF_CORE((%0, %%FF_REGa) , (%1, %%FF_REGa))
2246 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
2247 L2_DIFF_CORE((%0, %%FF_REGd) , (%1, %%FF_REGd))
2248 L2_DIFF_CORE((%0, %%FF_REGa,2), (%1, %%FF_REGa,2))
2249 L2_DIFF_CORE((%0, %%FF_REGc) , (%1, %%FF_REGc))
2250 
2251 #endif //L1_DIFF
2252 
// Step 2: fold the two 32-bit halves of mm0 into ecx and smooth the value:
// ecx = (4*d + past[-1] + past[+1] + past[-256] + past[+256] + 4) >> 3,
// where the byte offsets -4/4/-1024/1024 address uint32_t entries.
2253  "movq %%mm0, %%mm4 \n\t"
2254  "psrlq $32, %%mm0 \n\t"
2255  "paddd %%mm0, %%mm4 \n\t"
2256  "movd %%mm4, %%ecx \n\t"
2257  "shll $2, %%ecx \n\t"
2258  "mov %3, %%"FF_REG_d" \n\t"
2259  "addl -4(%%"FF_REG_d"), %%ecx \n\t"
2260  "addl 4(%%"FF_REG_d"), %%ecx \n\t"
2261  "addl -1024(%%"FF_REG_d"), %%ecx \n\t"
2262  "addl $4, %%ecx \n\t"
2263  "addl 1024(%%"FF_REG_d"), %%ecx \n\t"
2264  "shrl $3, %%ecx \n\t"
// NOTE(review): the smoothed value is stored here, whereas the C fallback
// below stores the unsmoothed sum -- confirm which one is intended.
2265  "movl %%ecx, (%%"FF_REG_d") \n\t"
2266 
2267 // "mov %3, %%"FF_REG_c" \n\t"
2268 // "mov %%"FF_REG_c", test \n\t"
2269 // "jmp 4f \n\t"
// Step 3: choose the update rule. 512/516/508 are the byte offsets of
// tempBlurredPast[128], [129] and [127], i.e. maxNoise[1], [2] and [0].
2270  "cmpl 512(%%"FF_REG_d"), %%ecx \n\t"
2271  " jb 2f \n\t"
2272  "cmpl 516(%%"FF_REG_d"), %%ecx \n\t"
2273  " jb 1f \n\t"
2274 
// Large difference: copy the eight lines of src (%0) into tempBlurred (%1).
// edx and ecx were used as scratch above, so 5*stride / 7*stride are rebuilt.
2275  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2276  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2277  "movq (%0), %%mm0 \n\t" // L0
2278  "movq (%0, %2), %%mm1 \n\t" // L1
2279  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2280  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2281  "movq (%0, %2, 4), %%mm4 \n\t" // L4
2282  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
2283  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
2284  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
2285  "movq %%mm0, (%1) \n\t" // L0
2286  "movq %%mm1, (%1, %2) \n\t" // L1
2287  "movq %%mm2, (%1, %2, 2) \n\t" // L2
2288  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // L3
2289  "movq %%mm4, (%1, %2, 4) \n\t" // L4
2290  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // L5
2291  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // L6
2292  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // L7
2293  "jmp 4f \n\t"
2294 
// Label 1: one PAVGB of tempBlurred into src per line, stored to both.
2295  "1: \n\t"
2296  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2297  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2298  "movq (%0), %%mm0 \n\t" // L0
2299  PAVGB((%1), %%mm0) // L0
2300  "movq (%0, %2), %%mm1 \n\t" // L1
2301  PAVGB((%1, %2), %%mm1) // L1
2302  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2303  PAVGB((%1, %2, 2), %%mm2) // L2
2304  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2305  PAVGB((%1, %%FF_REGa), %%mm3) // L3
2306  "movq (%0, %2, 4), %%mm4 \n\t" // L4
2307  PAVGB((%1, %2, 4), %%mm4) // L4
2308  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
2309  PAVGB((%1, %%FF_REGd), %%mm5) // L5
2310  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
2311  PAVGB((%1, %%FF_REGa, 2), %%mm6) // L6
2312  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
2313  PAVGB((%1, %%FF_REGc), %%mm7) // L7
2314  "movq %%mm0, (%1) \n\t" // R0
2315  "movq %%mm1, (%1, %2) \n\t" // R1
2316  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2317  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2318  "movq %%mm4, (%1, %2, 4) \n\t" // R4
2319  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // R5
2320  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // R6
2321  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // R7
2322  "movq %%mm0, (%0) \n\t" // L0
2323  "movq %%mm1, (%0, %2) \n\t" // L1
2324  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2325  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2326  "movq %%mm4, (%0, %2, 4) \n\t" // L4
2327  "movq %%mm5, (%0, %%"FF_REG_d") \n\t" // L5
2328  "movq %%mm6, (%0, %%"FF_REG_a", 2) \n\t" // L6
2329  "movq %%mm7, (%0, %%"FF_REG_c") \n\t" // L7
2330  "jmp 4f \n\t"
2331 
// Label 2: below the middle threshold. Unless the value is also below the
// threshold at offset 508 (then label 3), PAVGB with the tempBlurred line is
// applied twice per line, four lines at a time; the result goes to both.
2332  "2: \n\t"
2333  "cmpl 508(%%"FF_REG_d"), %%ecx \n\t"
2334  " jb 3f \n\t"
2335 
2336  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2337  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2338  "movq (%0), %%mm0 \n\t" // L0
2339  "movq (%0, %2), %%mm1 \n\t" // L1
2340  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2341  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2342  "movq (%1), %%mm4 \n\t" // R0
2343  "movq (%1, %2), %%mm5 \n\t" // R1
2344  "movq (%1, %2, 2), %%mm6 \n\t" // R2
2345  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
2346  PAVGB(%%mm4, %%mm0)
2347  PAVGB(%%mm5, %%mm1)
2348  PAVGB(%%mm6, %%mm2)
2349  PAVGB(%%mm7, %%mm3)
2350  PAVGB(%%mm4, %%mm0)
2351  PAVGB(%%mm5, %%mm1)
2352  PAVGB(%%mm6, %%mm2)
2353  PAVGB(%%mm7, %%mm3)
2354  "movq %%mm0, (%1) \n\t" // R0
2355  "movq %%mm1, (%1, %2) \n\t" // R1
2356  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2357  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2358  "movq %%mm0, (%0) \n\t" // L0
2359  "movq %%mm1, (%0, %2) \n\t" // L1
2360  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2361  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2362 
2363  "movq (%0, %2, 4), %%mm0 \n\t" // L4
2364  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
2365  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
2366  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
2367  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2368  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
2369  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
2370  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2371  PAVGB(%%mm4, %%mm0)
2372  PAVGB(%%mm5, %%mm1)
2373  PAVGB(%%mm6, %%mm2)
2374  PAVGB(%%mm7, %%mm3)
2375  PAVGB(%%mm4, %%mm0)
2376  PAVGB(%%mm5, %%mm1)
2377  PAVGB(%%mm6, %%mm2)
2378  PAVGB(%%mm7, %%mm3)
2379  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2380  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2381  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2382  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2383  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2384  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2385  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2386  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2387  "jmp 4f \n\t"
2388 
// Label 3: smallest difference. Same structure as above, but PAVGB with the
// tempBlurred line is applied three times per line.
2389  "3: \n\t"
2390  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2391  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2392  "movq (%0), %%mm0 \n\t" // L0
2393  "movq (%0, %2), %%mm1 \n\t" // L1
2394  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2395  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2396  "movq (%1), %%mm4 \n\t" // R0
2397  "movq (%1, %2), %%mm5 \n\t" // R1
2398  "movq (%1, %2, 2), %%mm6 \n\t" // R2
2399  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
2400  PAVGB(%%mm4, %%mm0)
2401  PAVGB(%%mm5, %%mm1)
2402  PAVGB(%%mm6, %%mm2)
2403  PAVGB(%%mm7, %%mm3)
2404  PAVGB(%%mm4, %%mm0)
2405  PAVGB(%%mm5, %%mm1)
2406  PAVGB(%%mm6, %%mm2)
2407  PAVGB(%%mm7, %%mm3)
2408  PAVGB(%%mm4, %%mm0)
2409  PAVGB(%%mm5, %%mm1)
2410  PAVGB(%%mm6, %%mm2)
2411  PAVGB(%%mm7, %%mm3)
2412  "movq %%mm0, (%1) \n\t" // R0
2413  "movq %%mm1, (%1, %2) \n\t" // R1
2414  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2415  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2416  "movq %%mm0, (%0) \n\t" // L0
2417  "movq %%mm1, (%0, %2) \n\t" // L1
2418  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2419  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2420 
2421  "movq (%0, %2, 4), %%mm0 \n\t" // L4
2422  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
2423  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
2424  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
2425  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2426  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
2427  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
2428  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2429  PAVGB(%%mm4, %%mm0)
2430  PAVGB(%%mm5, %%mm1)
2431  PAVGB(%%mm6, %%mm2)
2432  PAVGB(%%mm7, %%mm3)
2433  PAVGB(%%mm4, %%mm0)
2434  PAVGB(%%mm5, %%mm1)
2435  PAVGB(%%mm6, %%mm2)
2436  PAVGB(%%mm7, %%mm3)
2437  PAVGB(%%mm4, %%mm0)
2438  PAVGB(%%mm5, %%mm1)
2439  PAVGB(%%mm6, %%mm2)
2440  PAVGB(%%mm7, %%mm3)
2441  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2442  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2443  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2444  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2445  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2446  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2447  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2448  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2449 
2450  "4: \n\t"
2451 
2452  :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
// NOTE(review): the listing's own numbering jumps from 2452 to 2454 here;
// check the upstream file for an operand line that may be missing.
2454  : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c, "memory"
2455  );
2456 #else //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
2457 {
2458  int y;
2459  int d=0;
2460 // int sysd=0;
2461  int i;
2462 
// Sum of squared differences between the reference and the current block.
2463  for(y=0; y<8; y++){
2464  int x;
2465  for(x=0; x<8; x++){
2466  int ref= tempBlurred[ x + y*stride ];
2467  int cur= src[ x + y*stride ];
2468  int d1=ref - cur;
2469 // if(x==0 || x==7) d1+= d1>>1;
2470 // if(y==0 || y==7) d1+= d1>>1;
2471 // d+= FFABS(d1);
2472  d+= d1*d1;
2473 // sysd+= d1;
2474  }
2475  }
// Keep the raw sum in i, then smooth d with the four neighbouring entries
// of tempBlurredPast (weights 4 : 1 : 1 : 1 : 1, rounded, divided by 8).
2476  i=d;
2477  d= (
2478  4*d
2479  +(*(tempBlurredPast-256))
2480  +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
2481  +(*(tempBlurredPast+256))
2482  +4)>>3;
// NOTE(review): the raw sum i is stored here, whereas the asm path stores
// the smoothed value -- confirm which one is intended.
2483  *tempBlurredPast=i;
2484 // ((*tempBlurredPast)*3 + d + 2)>>2;
2485 
2486 /*
2487 Switch between
2488  1 0 0 0 0 0 0 (0)
2489 64 32 16 8 4 2 1 (1)
2490 64 48 36 27 20 15 11 (33) (approx)
2491 64 56 49 43 37 33 29 (200) (approx)
2492 */
2493  if(d > maxNoise[1]){
2494  if(d < maxNoise[2]){
// Medium-large difference: 1:1 average, written to both blocks.
2495  for(y=0; y<8; y++){
2496  int x;
2497  for(x=0; x<8; x++){
2498  int ref= tempBlurred[ x + y*stride ];
2499  int cur= src[ x + y*stride ];
2500  tempBlurred[ x + y*stride ]=
2501  src[ x + y*stride ]=
2502  (ref + cur + 1)>>1;
2503  }
2504  }
2505  }else{
// Largest difference: take over the current block as new reference.
2506  for(y=0; y<8; y++){
2507  int x;
2508  for(x=0; x<8; x++){
2509  tempBlurred[ x + y*stride ]= src[ x + y*stride ];
2510  }
2511  }
2512  }
2513  }else{
2514  if(d < maxNoise[0]){
// Smallest difference: 7:1 blend in favour of the reference.
2515  for(y=0; y<8; y++){
2516  int x;
2517  for(x=0; x<8; x++){
2518  int ref= tempBlurred[ x + y*stride ];
2519  int cur= src[ x + y*stride ];
2520  tempBlurred[ x + y*stride ]=
2521  src[ x + y*stride ]=
2522  (ref*7 + cur + 4)>>3;
2523  }
2524  }
2525  }else{
// Medium-small difference: 3:1 blend in favour of the reference.
2526  for(y=0; y<8; y++){
2527  int x;
2528  for(x=0; x<8; x++){
2529  int ref= tempBlurred[ x + y*stride ];
2530  int cur= src[ x + y*stride ];
2531  tempBlurred[ x + y*stride ]=
2532  src[ x + y*stride ]=
2533  (ref*3 + cur + 2)>>2;
2534  }
2535  }
2536  }
2537  }
2538 }
2539 #endif //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
2540 }
2541 #endif //TEMPLATE_PP_ALTIVEC
2542 
2543 #if TEMPLATE_PP_MMX
2544 /**
2545  * accurate deblock filter
2546  */
2547 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, const PPContext *c, int mode){
2548  int64_t dc_mask, eq_mask, both_masks;
2549  int64_t sums[10*8*2];
2550  src+= step*3; // src points to begin of the 8x8 Block
2551 
2552  __asm__ volatile(
2553  "movq %0, %%mm7 \n\t"
2554  "movq %1, %%mm6 \n\t"
2555  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
2556  );
2557 
2558  __asm__ volatile(
2559  "lea (%2, %3), %%"FF_REG_a" \n\t"
2560 // 0 1 2 3 4 5 6 7 8 9
2561 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
2562 
2563  "movq (%2), %%mm0 \n\t"
2564  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2565  "movq %%mm1, %%mm3 \n\t"
2566  "movq %%mm1, %%mm4 \n\t"
2567  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
2568  "paddb %%mm7, %%mm0 \n\t"
2569  "pcmpgtb %%mm6, %%mm0 \n\t"
2570 
2571  "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
2572  PMAXUB(%%mm2, %%mm4)
2573  PMINUB(%%mm2, %%mm3, %%mm5)
2574  "psubb %%mm2, %%mm1 \n\t"
2575  "paddb %%mm7, %%mm1 \n\t"
2576  "pcmpgtb %%mm6, %%mm1 \n\t"
2577  "paddb %%mm1, %%mm0 \n\t"
2578 
2579  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2580  PMAXUB(%%mm1, %%mm4)
2581  PMINUB(%%mm1, %%mm3, %%mm5)
2582  "psubb %%mm1, %%mm2 \n\t"
2583  "paddb %%mm7, %%mm2 \n\t"
2584  "pcmpgtb %%mm6, %%mm2 \n\t"
2585  "paddb %%mm2, %%mm0 \n\t"
2586 
2587  "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"
2588 
2589  "movq (%2, %3, 4), %%mm2 \n\t"
2590  PMAXUB(%%mm2, %%mm4)
2591  PMINUB(%%mm2, %%mm3, %%mm5)
2592  "psubb %%mm2, %%mm1 \n\t"
2593  "paddb %%mm7, %%mm1 \n\t"
2594  "pcmpgtb %%mm6, %%mm1 \n\t"
2595  "paddb %%mm1, %%mm0 \n\t"
2596 
2597  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2598  PMAXUB(%%mm1, %%mm4)
2599  PMINUB(%%mm1, %%mm3, %%mm5)
2600  "psubb %%mm1, %%mm2 \n\t"
2601  "paddb %%mm7, %%mm2 \n\t"
2602  "pcmpgtb %%mm6, %%mm2 \n\t"
2603  "paddb %%mm2, %%mm0 \n\t"
2604 
2605  "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
2606  PMAXUB(%%mm2, %%mm4)
2607  PMINUB(%%mm2, %%mm3, %%mm5)
2608  "psubb %%mm2, %%mm1 \n\t"
2609  "paddb %%mm7, %%mm1 \n\t"
2610  "pcmpgtb %%mm6, %%mm1 \n\t"
2611  "paddb %%mm1, %%mm0 \n\t"
2612 
2613  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2614  PMAXUB(%%mm1, %%mm4)
2615  PMINUB(%%mm1, %%mm3, %%mm5)
2616  "psubb %%mm1, %%mm2 \n\t"
2617  "paddb %%mm7, %%mm2 \n\t"
2618  "pcmpgtb %%mm6, %%mm2 \n\t"
2619  "paddb %%mm2, %%mm0 \n\t"
2620 
2621  "movq (%2, %3, 8), %%mm2 \n\t"
2622  PMAXUB(%%mm2, %%mm4)
2623  PMINUB(%%mm2, %%mm3, %%mm5)
2624  "psubb %%mm2, %%mm1 \n\t"
2625  "paddb %%mm7, %%mm1 \n\t"
2626  "pcmpgtb %%mm6, %%mm1 \n\t"
2627  "paddb %%mm1, %%mm0 \n\t"
2628 
2629  "movq (%%"FF_REG_a", %3, 4), %%mm1 \n\t"
2630  "psubb %%mm1, %%mm2 \n\t"
2631  "paddb %%mm7, %%mm2 \n\t"
2632  "pcmpgtb %%mm6, %%mm2 \n\t"
2633  "paddb %%mm2, %%mm0 \n\t"
2634  "psubusb %%mm3, %%mm4 \n\t"
2635 
2636  "pxor %%mm6, %%mm6 \n\t"
2637  "movq %4, %%mm7 \n\t" // QP,..., QP
2638  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
2639  "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
2640  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2641  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2642  "movq %%mm7, %1 \n\t"
2643 
2644  "movq %5, %%mm7 \n\t"
2645  "punpcklbw %%mm7, %%mm7 \n\t"
2646  "punpcklbw %%mm7, %%mm7 \n\t"
2647  "punpcklbw %%mm7, %%mm7 \n\t"
2648  "psubb %%mm0, %%mm6 \n\t"
2649  "pcmpgtb %%mm7, %%mm6 \n\t"
2650  "movq %%mm6, %0 \n\t"
2651 
2652  : "=m" (eq_mask), "=m" (dc_mask)
2653  : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2654  : "%"FF_REG_a
2655  );
2656 
2657  both_masks = dc_mask & eq_mask;
2658 
2659  if(both_masks){
2660  x86_reg offset= -8*step;
2661  int64_t *temp_sums= sums;
2662 
2663  __asm__ volatile(
2664  "movq %2, %%mm0 \n\t" // QP,..., QP
2665  "pxor %%mm4, %%mm4 \n\t"
2666 
2667  "movq (%0), %%mm6 \n\t"
2668  "movq (%0, %1), %%mm5 \n\t"
2669  "movq %%mm5, %%mm1 \n\t"
2670  "movq %%mm6, %%mm2 \n\t"
2671  "psubusb %%mm6, %%mm5 \n\t"
2672  "psubusb %%mm1, %%mm2 \n\t"
2673  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2674  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2675  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2676 
2677  "pxor %%mm6, %%mm1 \n\t"
2678  "pand %%mm0, %%mm1 \n\t"
2679  "pxor %%mm1, %%mm6 \n\t"
2680  // 0:QP 6:First
2681 
2682  "movq (%0, %1, 8), %%mm5 \n\t"
2683  "add %1, %0 \n\t" // %0 points to line 1 not 0
2684  "movq (%0, %1, 8), %%mm7 \n\t"
2685  "movq %%mm5, %%mm1 \n\t"
2686  "movq %%mm7, %%mm2 \n\t"
2687  "psubusb %%mm7, %%mm5 \n\t"
2688  "psubusb %%mm1, %%mm2 \n\t"
2689  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2690  "movq %2, %%mm0 \n\t" // QP,..., QP
2691  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2692  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2693 
2694  "pxor %%mm7, %%mm1 \n\t"
2695  "pand %%mm0, %%mm1 \n\t"
2696  "pxor %%mm1, %%mm7 \n\t"
2697 
2698  "movq %%mm6, %%mm5 \n\t"
2699  "punpckhbw %%mm4, %%mm6 \n\t"
2700  "punpcklbw %%mm4, %%mm5 \n\t"
2701  // 4:0 5/6:First 7:Last
2702 
2703  "movq %%mm5, %%mm0 \n\t"
2704  "movq %%mm6, %%mm1 \n\t"
2705  "psllw $2, %%mm0 \n\t"
2706  "psllw $2, %%mm1 \n\t"
2707  "paddw "MANGLE(w04)", %%mm0 \n\t"
2708  "paddw "MANGLE(w04)", %%mm1 \n\t"
2709 
2710 #define NEXT\
2711  "movq (%0), %%mm2 \n\t"\
2712  "movq (%0), %%mm3 \n\t"\
2713  "add %1, %0 \n\t"\
2714  "punpcklbw %%mm4, %%mm2 \n\t"\
2715  "punpckhbw %%mm4, %%mm3 \n\t"\
2716  "paddw %%mm2, %%mm0 \n\t"\
2717  "paddw %%mm3, %%mm1 \n\t"
2718 
2719 #define PREV\
2720  "movq (%0), %%mm2 \n\t"\
2721  "movq (%0), %%mm3 \n\t"\
2722  "add %1, %0 \n\t"\
2723  "punpcklbw %%mm4, %%mm2 \n\t"\
2724  "punpckhbw %%mm4, %%mm3 \n\t"\
2725  "psubw %%mm2, %%mm0 \n\t"\
2726  "psubw %%mm3, %%mm1 \n\t"
2727 
2728 
2729  NEXT //0
2730  NEXT //1
2731  NEXT //2
2732  "movq %%mm0, (%3) \n\t"
2733  "movq %%mm1, 8(%3) \n\t"
2734 
2735  NEXT //3
2736  "psubw %%mm5, %%mm0 \n\t"
2737  "psubw %%mm6, %%mm1 \n\t"
2738  "movq %%mm0, 16(%3) \n\t"
2739  "movq %%mm1, 24(%3) \n\t"
2740 
2741  NEXT //4
2742  "psubw %%mm5, %%mm0 \n\t"
2743  "psubw %%mm6, %%mm1 \n\t"
2744  "movq %%mm0, 32(%3) \n\t"
2745  "movq %%mm1, 40(%3) \n\t"
2746 
2747  NEXT //5
2748  "psubw %%mm5, %%mm0 \n\t"
2749  "psubw %%mm6, %%mm1 \n\t"
2750  "movq %%mm0, 48(%3) \n\t"
2751  "movq %%mm1, 56(%3) \n\t"
2752 
2753  NEXT //6
2754  "psubw %%mm5, %%mm0 \n\t"
2755  "psubw %%mm6, %%mm1 \n\t"
2756  "movq %%mm0, 64(%3) \n\t"
2757  "movq %%mm1, 72(%3) \n\t"
2758 
2759  "movq %%mm7, %%mm6 \n\t"
2760  "punpckhbw %%mm4, %%mm7 \n\t"
2761  "punpcklbw %%mm4, %%mm6 \n\t"
2762 
2763  NEXT //7
2764  "mov %4, %0 \n\t"
2765  "add %1, %0 \n\t"
2766  PREV //0
2767  "movq %%mm0, 80(%3) \n\t"
2768  "movq %%mm1, 88(%3) \n\t"
2769 
2770  PREV //1
2771  "paddw %%mm6, %%mm0 \n\t"
2772  "paddw %%mm7, %%mm1 \n\t"
2773  "movq %%mm0, 96(%3) \n\t"
2774  "movq %%mm1, 104(%3) \n\t"
2775 
2776  PREV //2
2777  "paddw %%mm6, %%mm0 \n\t"
2778  "paddw %%mm7, %%mm1 \n\t"
2779  "movq %%mm0, 112(%3) \n\t"
2780  "movq %%mm1, 120(%3) \n\t"
2781 
2782  PREV //3
2783  "paddw %%mm6, %%mm0 \n\t"
2784  "paddw %%mm7, %%mm1 \n\t"
2785  "movq %%mm0, 128(%3) \n\t"
2786  "movq %%mm1, 136(%3) \n\t"
2787 
2788  PREV //4
2789  "paddw %%mm6, %%mm0 \n\t"
2790  "paddw %%mm7, %%mm1 \n\t"
2791  "movq %%mm0, 144(%3) \n\t"
2792  "movq %%mm1, 152(%3) \n\t"
2793 
2794  "mov %4, %0 \n\t" //FIXME
2795 
2796  : "+&r"(src)
2797  : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src)
2799  );
2800 
2801  src+= step; // src points to begin of the 8x8 Block
2802 
2803  __asm__ volatile(
2804  "movq %4, %%mm6 \n\t"
2805  "pcmpeqb %%mm5, %%mm5 \n\t"
2806  "pxor %%mm6, %%mm5 \n\t"
2807  "pxor %%mm7, %%mm7 \n\t"
2808 
2809  "1: \n\t"
2810  "movq (%1), %%mm0 \n\t"
2811  "movq 8(%1), %%mm1 \n\t"
2812  "paddw 32(%1), %%mm0 \n\t"
2813  "paddw 40(%1), %%mm1 \n\t"
2814  "movq (%0, %3), %%mm2 \n\t"
2815  "movq %%mm2, %%mm3 \n\t"
2816  "movq %%mm2, %%mm4 \n\t"
2817  "punpcklbw %%mm7, %%mm2 \n\t"
2818  "punpckhbw %%mm7, %%mm3 \n\t"
2819  "paddw %%mm2, %%mm0 \n\t"
2820  "paddw %%mm3, %%mm1 \n\t"
2821  "paddw %%mm2, %%mm0 \n\t"
2822  "paddw %%mm3, %%mm1 \n\t"
2823  "psrlw $4, %%mm0 \n\t"
2824  "psrlw $4, %%mm1 \n\t"
2825  "packuswb %%mm1, %%mm0 \n\t"
2826  "pand %%mm6, %%mm0 \n\t"
2827  "pand %%mm5, %%mm4 \n\t"
2828  "por %%mm4, %%mm0 \n\t"
2829  "movq %%mm0, (%0, %3) \n\t"
2830  "add $16, %1 \n\t"
2831  "add %2, %0 \n\t"
2832  " js 1b \n\t"
2833 
2834  : "+r"(offset), "+r"(temp_sums)
2835  : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
2836  );
2837  }else
2838  src+= step; // src points to begin of the 8x8 Block
2839 
2840  if(eq_mask != -1LL){
2841  uint8_t *temp_src= src;
2842  DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
2843  __asm__ volatile(
2844  "pxor %%mm7, %%mm7 \n\t"
2845 // 0 1 2 3 4 5 6 7 8 9
2846 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1
2847 
2848  "movq (%0), %%mm0 \n\t"
2849  "movq %%mm0, %%mm1 \n\t"
2850  "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
2851  "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
2852 
2853  "movq (%0, %1), %%mm2 \n\t"
2854  "lea (%0, %1, 2), %%"FF_REG_a" \n\t"
2855  "movq %%mm2, %%mm3 \n\t"
2856  "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
2857  "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
2858 
2859  "movq (%%"FF_REG_a"), %%mm4 \n\t"
2860  "movq %%mm4, %%mm5 \n\t"
2861  "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
2862  "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
2863 
2864  "paddw %%mm0, %%mm0 \n\t" // 2L0
2865  "paddw %%mm1, %%mm1 \n\t" // 2H0
2866  "psubw %%mm4, %%mm2 \n\t" // L1 - L2
2867  "psubw %%mm5, %%mm3 \n\t" // H1 - H2
2868  "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
2869  "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
2870 
2871  "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
2872  "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
2873  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
2874  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
2875 
2876  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
2877  "movq %%mm2, %%mm3 \n\t"
2878  "punpcklbw %%mm7, %%mm2 \n\t" // L3
2879  "punpckhbw %%mm7, %%mm3 \n\t" // H3
2880 
2881  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
2882  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
2883  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2884  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2885  "movq %%mm0, (%4) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2886  "movq %%mm1, 8(%4) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2887 
2888  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
2889  "movq %%mm0, %%mm1 \n\t"
2890  "punpcklbw %%mm7, %%mm0 \n\t" // L4
2891  "punpckhbw %%mm7, %%mm1 \n\t" // H4
2892 
2893  "psubw %%mm0, %%mm2 \n\t" // L3 - L4
2894  "psubw %%mm1, %%mm3 \n\t" // H3 - H4
2895  "movq %%mm2, 16(%4) \n\t" // L3 - L4
2896  "movq %%mm3, 24(%4) \n\t" // H3 - H4
2897  "paddw %%mm4, %%mm4 \n\t" // 2L2
2898  "paddw %%mm5, %%mm5 \n\t" // 2H2
2899  "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
2900  "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
2901 
2902  "lea (%%"FF_REG_a", %1), %0 \n\t"
2903  "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
2904  "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
2905  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
2906  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
2907 //50 opcodes so far
2908  "movq (%0, %1, 2), %%mm2 \n\t"
2909  "movq %%mm2, %%mm3 \n\t"
2910  "punpcklbw %%mm7, %%mm2 \n\t" // L5
2911  "punpckhbw %%mm7, %%mm3 \n\t" // H5
2912  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
2913  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
2914  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2915  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
2916 
2917  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2918  "punpcklbw %%mm7, %%mm6 \n\t" // L6
2919  "psubw %%mm6, %%mm2 \n\t" // L5 - L6
2920  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2921  "punpckhbw %%mm7, %%mm6 \n\t" // H6
2922  "psubw %%mm6, %%mm3 \n\t" // H5 - H6
2923 
2924  "paddw %%mm0, %%mm0 \n\t" // 2L4
2925  "paddw %%mm1, %%mm1 \n\t" // 2H4
2926  "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
2927  "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
2928 
2929  "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
2930  "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
2931  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
2932  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
2933 
2934  "movq (%0, %1, 4), %%mm2 \n\t"
2935  "movq %%mm2, %%mm3 \n\t"
2936  "punpcklbw %%mm7, %%mm2 \n\t" // L7
2937  "punpckhbw %%mm7, %%mm3 \n\t" // H7
2938 
2939  "paddw %%mm2, %%mm2 \n\t" // 2L7
2940  "paddw %%mm3, %%mm3 \n\t" // 2H7
2941  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
2942  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
2943 
2944  "movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2945  "movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2946 
2947 #if TEMPLATE_PP_MMXEXT
2948  "movq %%mm7, %%mm6 \n\t" // 0
2949  "psubw %%mm0, %%mm6 \n\t"
2950  "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2951  "movq %%mm7, %%mm6 \n\t" // 0
2952  "psubw %%mm1, %%mm6 \n\t"
2953  "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2954  "movq %%mm7, %%mm6 \n\t" // 0
2955  "psubw %%mm2, %%mm6 \n\t"
2956  "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2957  "movq %%mm7, %%mm6 \n\t" // 0
2958  "psubw %%mm3, %%mm6 \n\t"
2959  "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2960 #else
2961  "movq %%mm7, %%mm6 \n\t" // 0
2962  "pcmpgtw %%mm0, %%mm6 \n\t"
2963  "pxor %%mm6, %%mm0 \n\t"
2964  "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2965  "movq %%mm7, %%mm6 \n\t" // 0
2966  "pcmpgtw %%mm1, %%mm6 \n\t"
2967  "pxor %%mm6, %%mm1 \n\t"
2968  "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2969  "movq %%mm7, %%mm6 \n\t" // 0
2970  "pcmpgtw %%mm2, %%mm6 \n\t"
2971  "pxor %%mm6, %%mm2 \n\t"
2972  "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2973  "movq %%mm7, %%mm6 \n\t" // 0
2974  "pcmpgtw %%mm3, %%mm6 \n\t"
2975  "pxor %%mm6, %%mm3 \n\t"
2976  "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2977 #endif
2978 
2979 #if TEMPLATE_PP_MMXEXT
2980  "pminsw %%mm2, %%mm0 \n\t"
2981  "pminsw %%mm3, %%mm1 \n\t"
2982 #else
2983  "movq %%mm0, %%mm6 \n\t"
2984  "psubusw %%mm2, %%mm6 \n\t"
2985  "psubw %%mm6, %%mm0 \n\t"
2986  "movq %%mm1, %%mm6 \n\t"
2987  "psubusw %%mm3, %%mm6 \n\t"
2988  "psubw %%mm6, %%mm1 \n\t"
2989 #endif
2990 
2991  "movd %2, %%mm2 \n\t" // QP
2992  "punpcklbw %%mm7, %%mm2 \n\t"
2993 
2994  "movq %%mm7, %%mm6 \n\t" // 0
2995  "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
2996  "pxor %%mm6, %%mm4 \n\t"
2997  "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
2998  "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
2999  "pxor %%mm7, %%mm5 \n\t"
3000  "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
3001 // 100 opcodes
3002  "psllw $3, %%mm2 \n\t" // 8QP
3003  "movq %%mm2, %%mm3 \n\t" // 8QP
3004  "pcmpgtw %%mm4, %%mm2 \n\t"
3005  "pcmpgtw %%mm5, %%mm3 \n\t"
3006  "pand %%mm2, %%mm4 \n\t"
3007  "pand %%mm3, %%mm5 \n\t"
3008 
3009 
3010  "psubusw %%mm0, %%mm4 \n\t" // hd
3011  "psubusw %%mm1, %%mm5 \n\t" // ld
3012 
3013 
3014  "movq "MANGLE(w05)", %%mm2 \n\t" // 5
3015  "pmullw %%mm2, %%mm4 \n\t"
3016  "pmullw %%mm2, %%mm5 \n\t"
3017  "movq "MANGLE(w20)", %%mm2 \n\t" // 32
3018  "paddw %%mm2, %%mm4 \n\t"
3019  "paddw %%mm2, %%mm5 \n\t"
3020  "psrlw $6, %%mm4 \n\t"
3021  "psrlw $6, %%mm5 \n\t"
3022 
3023  "movq 16(%4), %%mm0 \n\t" // L3 - L4
3024  "movq 24(%4), %%mm1 \n\t" // H3 - H4
3025 
3026  "pxor %%mm2, %%mm2 \n\t"
3027  "pxor %%mm3, %%mm3 \n\t"
3028 
3029  "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
3030  "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
3031  "pxor %%mm2, %%mm0 \n\t"
3032  "pxor %%mm3, %%mm1 \n\t"
3033  "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
3034  "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
3035  "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
3036  "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
3037 
3038  "pxor %%mm6, %%mm2 \n\t"
3039  "pxor %%mm7, %%mm3 \n\t"
3040  "pand %%mm2, %%mm4 \n\t"
3041  "pand %%mm3, %%mm5 \n\t"
3042 
3043 #if TEMPLATE_PP_MMXEXT
3044  "pminsw %%mm0, %%mm4 \n\t"
3045  "pminsw %%mm1, %%mm5 \n\t"
3046 #else
3047  "movq %%mm4, %%mm2 \n\t"
3048  "psubusw %%mm0, %%mm2 \n\t"
3049  "psubw %%mm2, %%mm4 \n\t"
3050  "movq %%mm5, %%mm2 \n\t"
3051  "psubusw %%mm1, %%mm2 \n\t"
3052  "psubw %%mm2, %%mm5 \n\t"
3053 #endif
3054  "pxor %%mm6, %%mm4 \n\t"
3055  "pxor %%mm7, %%mm5 \n\t"
3056  "psubw %%mm6, %%mm4 \n\t"
3057  "psubw %%mm7, %%mm5 \n\t"
3058  "packsswb %%mm5, %%mm4 \n\t"
3059  "movq %3, %%mm1 \n\t"
3060  "pandn %%mm4, %%mm1 \n\t"
3061  "movq (%0), %%mm0 \n\t"
3062  "paddb %%mm1, %%mm0 \n\t"
3063  "movq %%mm0, (%0) \n\t"
3064  "movq (%0, %1), %%mm0 \n\t"
3065  "psubb %%mm1, %%mm0 \n\t"
3066  "movq %%mm0, (%0, %1) \n\t"
3067 
3068  : "+r" (temp_src)
3069  : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp)
3070  NAMED_CONSTRAINTS_ADD(w05,w20)
3071  : "%"FF_REG_a
3072  );
3073  }
3074 }
3075 #endif //TEMPLATE_PP_MMX
3076 
/* Forward declaration of the per-plane filter driver for this template
 * instance; the definition appears further down in this file (there the
 * last parameter is named c2). */
3077 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3078  const int8_t QPs[], int QPStride, int isColor, PPContext *c);
3079 
3080 /**
 * Copy an 8-row block from src to dst and, in the MMX build, optionally
 * fix the black level while copying.
 *
 * levelFix == 0 -> do not touch the brightness & contrast (plain copy).
 * levelFix != 0 -> MMX build with HAVE_6REGS: every pixel is rescaled using
 *                  the packed offset and scale; C fallback: the block is
 *                  still only copied with memcpy, no correction is applied.
 *
 * @param dst                  destination of the top-left pixel of the block
 * @param dstStride            distance in bytes between destination rows
 * @param src                  source of the top-left pixel of the block
 * @param srcStride            distance in bytes between source rows
 * @param levelFix             non-zero to request the level correction
 * @param packedOffsetAndScale two consecutive 64-bit words: [0] is read as
 *                             the packed Y offset, [1] as the packed Y scale
 *                             (only dereferenced by the MMX levelFix path)
 */
/* The macros below are redefined on each inclusion of this template (the
 * MMXEXT and plain-MMX bodies differ), so drop any earlier definition. */
3084 #undef REAL_SCALED_CPY
3085 #undef SCALED_CPY
3086 
3087 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
3088  int levelFix, int64_t *packedOffsetAndScale)
3089 {
/* the loop counter is only needed by the memcpy fallbacks */
3090 #if !TEMPLATE_PP_MMX || !HAVE_6REGS
3091  int i;
3092 #endif
3093  if(levelFix){
3094 #if TEMPLATE_PP_MMX && HAVE_6REGS
3095  __asm__ volatile(
/* Operands: %2 = src, %3 = dst, %4 = srcStride, %5 = dstStride.
 * The a register enters holding packedOffsetAndScale; once offset (mm2)
 * and scale (mm3) are loaded it is reused as src + srcStride, and the
 * d register is set to dst + dstStride. mm4 is zeroed for unpacking. */
3096  "movq (%%"FF_REG_a"), %%mm2 \n\t" // packedYOffset
3097  "movq 8(%%"FF_REG_a"), %%mm3 \n\t" // packedYScale
3098  "lea (%2,%4), %%"FF_REG_a" \n\t"
3099  "lea (%3,%5), %%"FF_REG_d" \n\t"
3100  "pxor %%mm4, %%mm4 \n\t"
/* REAL_SCALED_CPY processes two rows of 8 pixels.
 * MMXEXT: bytes are widened by unpacking each register with itself,
 *         multiplied (unsigned, high half) by the scale, then the offset
 *         is subtracted.
 * MMX:    bytes are widened with zero, the offset is subtracted, the
 *         result is shifted left by 6 and multiplied (signed, high half)
 *         by the scale.
 * Both variants pack back to bytes with unsigned saturation. */
3101 #if TEMPLATE_PP_MMXEXT
3102 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3103  "movq " #src1 ", %%mm0 \n\t"\
3104  "movq " #src1 ", %%mm5 \n\t"\
3105  "movq " #src2 ", %%mm1 \n\t"\
3106  "movq " #src2 ", %%mm6 \n\t"\
3107  "punpcklbw %%mm0, %%mm0 \n\t"\
3108  "punpckhbw %%mm5, %%mm5 \n\t"\
3109  "punpcklbw %%mm1, %%mm1 \n\t"\
3110  "punpckhbw %%mm6, %%mm6 \n\t"\
3111  "pmulhuw %%mm3, %%mm0 \n\t"\
3112  "pmulhuw %%mm3, %%mm5 \n\t"\
3113  "pmulhuw %%mm3, %%mm1 \n\t"\
3114  "pmulhuw %%mm3, %%mm6 \n\t"\
3115  "psubw %%mm2, %%mm0 \n\t"\
3116  "psubw %%mm2, %%mm5 \n\t"\
3117  "psubw %%mm2, %%mm1 \n\t"\
3118  "psubw %%mm2, %%mm6 \n\t"\
3119  "packuswb %%mm5, %%mm0 \n\t"\
3120  "packuswb %%mm6, %%mm1 \n\t"\
3121  "movq %%mm0, " #dst1 " \n\t"\
3122  "movq %%mm1, " #dst2 " \n\t"\
3123 
3124 #else //TEMPLATE_PP_MMXEXT
3125 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3126  "movq " #src1 ", %%mm0 \n\t"\
3127  "movq " #src1 ", %%mm5 \n\t"\
3128  "punpcklbw %%mm4, %%mm0 \n\t"\
3129  "punpckhbw %%mm4, %%mm5 \n\t"\
3130  "psubw %%mm2, %%mm0 \n\t"\
3131  "psubw %%mm2, %%mm5 \n\t"\
3132  "movq " #src2 ", %%mm1 \n\t"\
3133  "psllw $6, %%mm0 \n\t"\
3134  "psllw $6, %%mm5 \n\t"\
3135  "pmulhw %%mm3, %%mm0 \n\t"\
3136  "movq " #src2 ", %%mm6 \n\t"\
3137  "pmulhw %%mm3, %%mm5 \n\t"\
3138  "punpcklbw %%mm4, %%mm1 \n\t"\
3139  "punpckhbw %%mm4, %%mm6 \n\t"\
3140  "psubw %%mm2, %%mm1 \n\t"\
3141  "psubw %%mm2, %%mm6 \n\t"\
3142  "psllw $6, %%mm1 \n\t"\
3143  "psllw $6, %%mm6 \n\t"\
3144  "pmulhw %%mm3, %%mm1 \n\t"\
3145  "pmulhw %%mm3, %%mm6 \n\t"\
3146  "packuswb %%mm5, %%mm0 \n\t"\
3147  "packuswb %%mm6, %%mm1 \n\t"\
3148  "movq %%mm0, " #dst1 " \n\t"\
3149  "movq %%mm1, " #dst2 " \n\t"\
3150 
3151 #endif //TEMPLATE_PP_MMXEXT
/* Extra macro level so that the FF_REGa / FF_REGd arguments are expanded
 * before REAL_SCALED_CPY stringifies them. */
3152 #define SCALED_CPY(src1, src2, dst1, dst2)\
3153  REAL_SCALED_CPY(src1, src2, dst1, dst2)
3154 
/* Rows 0+1, 2+3 and 4+5 are addressed from src/dst and the a/d registers
 * (which point at row 1); a/d are then advanced by four rows so that the
 * last call covers rows 6+7. */
3155 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
3156 SCALED_CPY((%2, %4, 2), (%%FF_REGa, %4, 2), (%3, %5, 2), (%%FF_REGd, %5, 2))
3157 SCALED_CPY((%2, %4, 4), (%%FF_REGa, %4, 4), (%3, %5, 4), (%%FF_REGd, %5, 4))
3158  "lea (%%"FF_REG_a",%4,4), %%"FF_REG_a" \n\t"
3159  "lea (%%"FF_REG_d",%5,4), %%"FF_REG_d" \n\t"
3160 SCALED_CPY((%%FF_REGa, %4), (%%FF_REGa, %4, 2), (%%FF_REGd, %5), (%%FF_REGd, %5, 2))
3161 
3162 
3163  : "=&a" (packedOffsetAndScale)
3164  : "0" (packedOffsetAndScale),
3165  "r"(src),
3166  "r"(dst),
3167  "r" ((x86_reg)srcStride),
3168  "r" ((x86_reg)dstStride)
3169  : "%"FF_REG_d
3170  );
3171 #else //TEMPLATE_PP_MMX && HAVE_6REGS
/* Fallback: copies 8 rows of BLOCK_SIZE bytes unchanged; levelFix has no
 * effect on the pixel values in this build. */
3172  for(i=0; i<8; i++)
3173  memcpy( &(dst[dstStride*i]),
3174  &(src[srcStride*i]), BLOCK_SIZE);
3175 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
3176  }else{
3177 #if TEMPLATE_PP_MMX && HAVE_6REGS
3178  __asm__ volatile(
/* Plain copy. Operands: %0 = src, %1 = dst, %2 = srcStride,
 * %3 = dstStride; the a/d registers hold the row-1 addresses and follow
 * the same row addressing scheme as the scaled copy. */
3179  "lea (%0,%2), %%"FF_REG_a" \n\t"
3180  "lea (%1,%3), %%"FF_REG_d" \n\t"
3181 
3182 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
3183  "movq " #src1 ", %%mm0 \n\t"\
3184  "movq " #src2 ", %%mm1 \n\t"\
3185  "movq %%mm0, " #dst1 " \n\t"\
3186  "movq %%mm1, " #dst2 " \n\t"\
3187 
3188 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
3189  REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
3190 
3191 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3192 SIMPLE_CPY((%0, %2, 2), (%%FF_REGa, %2, 2), (%1, %3, 2), (%%FF_REGd, %3, 2))
3193 SIMPLE_CPY((%0, %2, 4), (%%FF_REGa, %2, 4), (%1, %3, 4), (%%FF_REGd, %3, 4))
3194  "lea (%%"FF_REG_a",%2,4), %%"FF_REG_a" \n\t"
3195  "lea (%%"FF_REG_d",%3,4), %%"FF_REG_d" \n\t"
3196 SIMPLE_CPY((%%FF_REGa, %2), (%%FF_REGa, %2, 2), (%%FF_REGd, %3), (%%FF_REGd, %3, 2))
3197 
3198  : : "r" (src),
3199  "r" (dst),
3200  "r" ((x86_reg)srcStride),
3201  "r" ((x86_reg)dstStride)
3202  : "%"FF_REG_a, "%"FF_REG_d
3203  );
3204 #else //TEMPLATE_PP_MMX && HAVE_6REGS
/* Fallback: same 8-row memcpy as in the levelFix branch above. */
3205  for(i=0; i<8; i++)
3206  memcpy( &(dst[dstStride*i]),
3207  &(src[srcStride*i]), BLOCK_SIZE);
3208 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
3209  }
3210 }
3211 
3212 /**
3213  * Duplicate the given 8 src pixels ? times upward
3214  */
3215 static inline void RENAME(duplicate)(uint8_t src[], int stride)
3216 {
3217 #if TEMPLATE_PP_MMX
3218  __asm__ volatile(
3219  "movq (%0), %%mm0 \n\t"
3220  "movq %%mm0, (%0, %1, 4) \n\t"
3221  "add %1, %0 \n\t"
3222  "movq %%mm0, (%0) \n\t"
3223  "movq %%mm0, (%0, %1) \n\t"
3224  "movq %%mm0, (%0, %1, 2) \n\t"
3225  "movq %%mm0, (%0, %1, 4) \n\t"
3226  : "+r" (src)
3227  : "r" ((x86_reg)-stride)
3228  );
3229 #else
3230  int i;
3231  uint8_t *p=src;
3232  for(i=0; i<5; i++){
3233  p-= stride;
3234  memcpy(p, src, 8);
3235  }
3236 #endif
3237 }
3238 
3239 #if ARCH_X86 && TEMPLATE_PP_MMXEXT
3240 static inline void RENAME(prefetchnta)(const void *p)
3241 {
3242  __asm__ volatile( "prefetchnta (%0)\n\t"
3243  : : "r" (p)
3244  );
3245 }
3246 
3247 static inline void RENAME(prefetcht0)(const void *p)
3248 {
3249  __asm__ volatile( "prefetcht0 (%0)\n\t"
3250  : : "r" (p)
3251  );
3252 }
3253 
3254 static inline void RENAME(prefetcht1)(const void *p)
3255 {
3256  __asm__ volatile( "prefetcht1 (%0)\n\t"
3257  : : "r" (p)
3258  );
3259 }
3260 
3261 static inline void RENAME(prefetcht2)(const void *p)
3262 {
3263  __asm__ volatile( "prefetcht2 (%0)\n\t"
3264  : : "r" (p)
3265  );
3266 }
3267 #elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2)
3268 static inline void RENAME(prefetchnta)(const void *p)
3269 {
3270  __builtin_prefetch(p,0,0);
3271 }
3272 static inline void RENAME(prefetcht0)(const void *p)
3273 {
3274  __builtin_prefetch(p,0,1);
3275 }
3276 static inline void RENAME(prefetcht1)(const void *p)
3277 {
3278  __builtin_prefetch(p,0,2);
3279 }
3280 static inline void RENAME(prefetcht2)(const void *p)
3281 {
3282  __builtin_prefetch(p,0,3);
3283 }
3284 #else
3285 static inline void RENAME(prefetchnta)(const void *p)
3286 {
3287  return;
3288 }
3289 static inline void RENAME(prefetcht0)(const void *p)
3290 {
3291  return;
3292 }
3293 static inline void RENAME(prefetcht1)(const void *p)
3294 {
3295  return;
3296 }
3297 static inline void RENAME(prefetcht2)(const void *p)
3298 {
3299  return;
3300 }
3301 #endif
3302 /**
3303  * Filter array of bytes (Y or U or V values)
3304  */
3305 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3306  const int8_t QPs[], int QPStride, int isColor, PPContext *c2)
3307 {
3308  DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
3309  int x,y;
3310 #ifdef TEMPLATE_PP_TIME_MODE
3311  const int mode= TEMPLATE_PP_TIME_MODE;
3312 #else
3313  const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
3314 #endif
3315  int black=0, white=255; // blackest black and whitest white in the picture
3316  int QPCorrecture= 256*256;
3317 
3318  int copyAhead;
3319 #if TEMPLATE_PP_MMX
3320  int i;
3321 #endif
3322 
3323  const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
3324  const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
3325 
3326  //FIXME remove
3327  uint64_t * const yHistogram= c.yHistogram;
3328  uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
3329  uint8_t * const tempDst= (dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride) + 32;
3330  //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
3331 
3332  if (mode & VISUALIZE){
3333  if(!(mode & (V_A_DEBLOCK | H_A_DEBLOCK)) || TEMPLATE_PP_MMX) {
3334  av_log(c2, AV_LOG_WARNING, "Visualization is currently only supported with the accurate deblock filter without SIMD\n");
3335  }
3336  }
3337 
3338 #if TEMPLATE_PP_MMX
3339  for(i=0; i<57; i++){
3340  int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
3341  int threshold= offset*2 + 1;
3342  c.mmxDcOffset[i]= 0x7F - offset;
3343  c.mmxDcThreshold[i]= 0x7F - threshold;
3344  c.mmxDcOffset[i]*= 0x0101010101010101LL;
3345  c.mmxDcThreshold[i]*= 0x0101010101010101LL;
3346  }
3347 #endif
3348 
3349  if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3350  else if( (mode & LINEAR_BLEND_DEINT_FILTER)
3351  || (mode & FFMPEG_DEINT_FILTER)
3352  || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
3353  else if( (mode & V_DEBLOCK)
3355  || (mode & MEDIAN_DEINT_FILTER)
3356  || (mode & V_A_DEBLOCK)) copyAhead=13;
3357  else if(mode & V_X1_FILTER) copyAhead=11;
3358 // else if(mode & V_RK1_FILTER) copyAhead=10;
3359  else if(mode & DERING) copyAhead=9;
3360  else copyAhead=8;
3361 
3362  copyAhead-= 8;
3363 
3364  if(!isColor){
3365  uint64_t sum= 0;
3366  int i;
3367  uint64_t maxClipped;
3368  uint64_t clipped;
3369  AVRational scale;
3370 
3371  c.frameNum++;
3372  // first frame is fscked so we ignore it
3373  if(c.frameNum == 1) yHistogram[0]= width*(uint64_t)height/64*15/256;
3374 
3375  for(i=0; i<256; i++){
3376  sum+= yHistogram[i];
3377  }
3378 
3379  /* We always get a completely black picture first. */
3380  maxClipped= av_rescale(sum, c.ppMode.maxClippedThreshold.num, c.ppMode.maxClippedThreshold.den);
3381 
3382  clipped= sum;
3383  for(black=255; black>0; black--){
3384  if(clipped < maxClipped) break;
3385  clipped-= yHistogram[black];
3386  }
3387 
3388  clipped= sum;
3389  for(white=0; white<256; white++){
3390  if(clipped < maxClipped) break;
3391  clipped-= yHistogram[white];
3392  }
3393 
3394  scale = (AVRational){c.ppMode.maxAllowedY - c.ppMode.minAllowedY, white - black};
3395 
3396 #if TEMPLATE_PP_MMXEXT
3397  c.packedYScale = (uint16_t)av_rescale(scale.num, 256, scale.den);
3398  c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3399 #else
3400  c.packedYScale = (uint16_t)av_rescale(scale.num, 1024, scale.den);
3401  c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3402 #endif
3403 
3404  c.packedYOffset|= c.packedYOffset<<32;
3405  c.packedYOffset|= c.packedYOffset<<16;
3406 
3407  c.packedYScale|= c.packedYScale<<32;
3408  c.packedYScale|= c.packedYScale<<16;
3409 
3410  if(mode & LEVEL_FIX) QPCorrecture= (int)av_rescale(scale.num, 256*256, scale.den);
3411  else QPCorrecture= 256*256;
3412  }else{
3413  c.packedYScale= 0x0100010001000100LL;
3414  c.packedYOffset= 0;
3415  QPCorrecture= 256*256;
3416  }
3417 
3418  /* copy & deinterlace first row of blocks */
3419  y=-BLOCK_SIZE;
3420  {
3421  const uint8_t *srcBlock= &(src[y*srcStride]);
3422  uint8_t *dstBlock= tempDst + dstStride;
3423 
3424  // From this point on it is guaranteed that we can read and write 16 lines downward
3425  // finish 1 block before the next otherwise we might have a problem
3426  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3427  for(x=0; x<width; x+=BLOCK_SIZE){
3428  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
3429  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
3430  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
3431  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
3432 
3433  RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
3434  srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3435 
3436  RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
3437 
3439  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3440  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3441  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3442  else if(mode & MEDIAN_DEINT_FILTER)
3443  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3444  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3445  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3446  else if(mode & FFMPEG_DEINT_FILTER)
3447  RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3448  else if(mode & LOWPASS5_DEINT_FILTER)
3449  RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3450 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3451  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3452 */
3453  dstBlock+=8;
3454  srcBlock+=8;
3455  }
3456  if(width==FFABS(dstStride))
3457  linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3458  else{
3459  int i;
3460  for(i=0; i<copyAhead; i++){
3461  memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3462  }
3463  }
3464  }
3465 
3466  for(y=0; y<height; y+=BLOCK_SIZE){
3467  //1% speedup if these are here instead of the inner loop
3468  const uint8_t *srcBlock= &(src[y*srcStride]);
3469  uint8_t *dstBlock= &(dst[y*dstStride]);
3470 #if TEMPLATE_PP_MMX
3471  uint8_t *tempBlock1= c.tempBlocks;
3472  uint8_t *tempBlock2= c.tempBlocks + 8;
3473 #endif
3474  const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3475  int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3476  int QP=0, nonBQP=0;
3477  /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3478  if not than use a temporary buffer */
3479  if(y+15 >= height){
3480  int i;
3481  /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3482  blockcopy to dst later */
3483  linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3484  FFMAX(height-y-copyAhead, 0), srcStride);
3485 
3486  /* duplicate last line of src to fill the void up to line (copyAhead+7) */
3487  for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3488  memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3489 
3490  /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3491  linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3492 
3493  /* duplicate last line of dst to fill the void up to line (copyAhead) */
3494  for(i=height-y+1; i<=copyAhead; i++)
3495  memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3496 
3497  dstBlock= tempDst + dstStride;
3498  srcBlock= tempSrc;
3499  }
3500 
3501  // From this point on it is guaranteed that we can read and write 16 lines downward
3502  // finish 1 block before the next otherwise we might have a problem
3503  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3504  for(x=0; x<width; ){
3505  int startx = x;
3506  int endx = FFMIN(width, x+32);
3507  uint8_t *dstBlockStart = dstBlock;
3508  const uint8_t *srcBlockStart = srcBlock;
3509  int qp_index = 0;
3510  for(qp_index=0; qp_index < (endx-startx)/BLOCK_SIZE; qp_index++){
3511  QP = QPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3512  nonBQP = nonBQPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3513  if(!isColor){
3514  QP= (QP* QPCorrecture + 256*128)>>16;
3515  nonBQP= (nonBQP* QPCorrecture + 256*128)>>16;
3516  yHistogram[(srcBlock+qp_index*8)[srcStride*12 + 4]]++;
3517  }
3518  c.QP_block[qp_index] = QP;
3519  c.nonBQP_block[qp_index] = nonBQP;
3520 #if TEMPLATE_PP_MMX
3521  __asm__ volatile(
3522  "movd %1, %%mm7 \n\t"
3523  "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3524  "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3525  "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3526  "movq %%mm7, %0 \n\t"
3527  : "=m" (c.pQPb_block[qp_index])
3528  : "r" (QP)
3529  );
3530 #endif
3531  }
3532  for(; x < endx; x+=BLOCK_SIZE){
3533  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
3534  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
3535  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
3536  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
3537 
3538  RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3539  srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3540 
3542  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3543  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3544  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3545  else if(mode & MEDIAN_DEINT_FILTER)
3546  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3547  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3548  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3549  else if(mode & FFMPEG_DEINT_FILTER)
3550  RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3551  else if(mode & LOWPASS5_DEINT_FILTER)
3552  RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3553 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3554  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3555 */
3556  dstBlock+=8;
3557  srcBlock+=8;
3558  }
3559 
3560  dstBlock = dstBlockStart;
3561  srcBlock = srcBlockStart;
3562 
3563  for(x = startx, qp_index = 0; x < endx; x+=BLOCK_SIZE, qp_index++){
3564  const int stride= dstStride;
3565  //temporary while changing QP stuff to make things continue to work
3566  //eventually QP,nonBQP,etc will be arrays and this will be unnecessary
3567  c.QP = c.QP_block[qp_index];
3568  c.nonBQP = c.nonBQP_block[qp_index];
3569  c.pQPb = c.pQPb_block[qp_index];
3570  c.pQPb2 = c.pQPb2_block[qp_index];
3571 
3572  /* only deblock if we have 2 blocks */
3573  if(y + 8 < height){
3574  if(mode & V_X1_FILTER)
3575  RENAME(vertX1Filter)(dstBlock, stride, &c);
3576  else if(mode & V_DEBLOCK){
3577  const int t= RENAME(vertClassify)(dstBlock, stride, &c);
3578 
3579  if(t==1)
3580  RENAME(doVertLowPass)(dstBlock, stride, &c);
3581  else if(t==2)
3582  RENAME(doVertDefFilter)(dstBlock, stride, &c);
3583  }else if(mode & V_A_DEBLOCK){
3584  RENAME(do_a_deblock)(dstBlock, stride, 1, &c, mode);
3585  }
3586  }
3587 
3588  dstBlock+=8;
3589  srcBlock+=8;
3590  }
3591 
3592  dstBlock = dstBlockStart;
3593  srcBlock = srcBlockStart;
3594 
3595  for(x = startx, qp_index=0; x < endx; x+=BLOCK_SIZE, qp_index++){
3596  const int stride= dstStride;
3597  av_unused uint8_t *tmpXchg;
3598  c.QP = c.QP_block[qp_index];
3599  c.nonBQP = c.nonBQP_block[qp_index];
3600  c.pQPb = c.pQPb_block[qp_index];
3601  c.pQPb2 = c.pQPb2_block[qp_index];
3602 #if TEMPLATE_PP_MMX
3603  RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3604 #endif
3605  /* check if we have a previous block to deblock it with dstBlock */
3606  if(x - 8 >= 0){
3607 #if TEMPLATE_PP_MMX
3608  if(mode & H_X1_FILTER)
3609  RENAME(vertX1Filter)(tempBlock1, 16, &c);
3610  else if(mode & H_DEBLOCK){
3611  const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
3612  if(t==1)
3613  RENAME(doVertLowPass)(tempBlock1, 16, &c);
3614  else if(t==2)
3615  RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3616  }else if(mode & H_A_DEBLOCK){
3617  RENAME(do_a_deblock)(tempBlock1, 16, 1, &c, mode);
3618  }
3619 
3620  RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3621 
3622 #else
3623  if(mode & H_X1_FILTER)
3624  horizX1Filter(dstBlock-4, stride, c.QP);
3625  else if(mode & H_DEBLOCK){
3626 #if TEMPLATE_PP_ALTIVEC
3627  DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
3628  int t;
3629  transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3630 
3631  t = vertClassify_altivec(tempBlock-48, 16, &c);
3632  if(t==1) {
3633  doVertLowPass_altivec(tempBlock-48, 16, &c);
3634  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3635  }
3636  else if(t==2) {
3637  doVertDefFilter_altivec(tempBlock-48, 16, &c);
3638  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3639  }
3640 #else
3641  const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
3642 
3643  if(t==1)
3644  RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
3645  else if(t==2)
3646  RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
3647 #endif
3648  }else if(mode & H_A_DEBLOCK){
3649  RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c, mode);
3650  }
3651 #endif //TEMPLATE_PP_MMX
3652  if(mode & DERING){
3653  //FIXME filter first line
3654  if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
3655  }
3656 
3657  if(mode & TEMP_NOISE_FILTER)
3658  {
3659  RENAME(tempNoiseReducer)(dstBlock-8, stride,
3660  c.tempBlurred[isColor] + y*dstStride + x,
3661  c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3662  c.ppMode.maxTmpNoise);
3663  }
3664  }
3665 
3666  dstBlock+=8;
3667  srcBlock+=8;
3668 
3669 #if TEMPLATE_PP_MMX
3670  tmpXchg= tempBlock1;
3671  tempBlock1= tempBlock2;
3672  tempBlock2 = tmpXchg;
3673 #endif
3674  }
3675  }
3676 
3677  if(mode & DERING){
3678  if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3679  }
3680 
3681  if((mode & TEMP_NOISE_FILTER)){
3682  RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3683  c.tempBlurred[isColor] + y*dstStride + x,
3684  c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3685  c.ppMode.maxTmpNoise);
3686  }
3687 
3688  /* did we use a tmp buffer for the last lines*/
3689  if(y+15 >= height){
3690  uint8_t *dstBlock= &(dst[y*dstStride]);
3691  if(width==FFABS(dstStride))
3692  linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3693  else{
3694  int i;
3695  for(i=0; i<height-y; i++){
3696  memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3697  }
3698  }
3699  }
3700  }
3701 #if TEMPLATE_PP_3DNOW
3702  __asm__ volatile("femms");
3703 #elif TEMPLATE_PP_MMX
3704  __asm__ volatile("emms");
3705 #endif
3706 
3707 #ifdef DEBUG_BRIGHTNESS
3708  if(!isColor){
3709  int max=1;
3710  int i;
3711  for(i=0; i<256; i++)
3712  if(yHistogram[i] > max) max=yHistogram[i];
3713 
3714  for(i=1; i<256; i++){
3715  int x;
3716  int start=yHistogram[i-1]/(max/256+1);
3717  int end=yHistogram[i]/(max/256+1);
3718  int inc= end > start ? 1 : -1;
3719  for(x=start; x!=end+inc; x+=inc)
3720  dst[ i*dstStride + x]+=128;
3721  }
3722 
3723  for(i=0; i<100; i+=2){
3724  dst[ (white)*dstStride + i]+=128;
3725  dst[ (black)*dstStride + i]+=128;
3726  }
3727  }
3728 #endif
3729 
3730  *c2= c; //copy local context back
3731 
3732 }
3733 
/* End of the template: undefine RENAME and every TEMPLATE_PP_* selector so
 * that the file can be included again with a different selector defined. */
3734 #undef RENAME
3735 #undef TEMPLATE_PP_C
3736 #undef TEMPLATE_PP_ALTIVEC
3737 #undef TEMPLATE_PP_MMX
3738 #undef TEMPLATE_PP_MMXEXT
3739 #undef TEMPLATE_PP_3DNOW
3740 #undef TEMPLATE_PP_SSE2
error
static void error(const char *err)
Definition: target_bsf_fuzzer.c:29
FFMPEG_DEINT_FILTER
#define FFMPEG_DEINT_FILTER
Definition: postprocess_internal.h:66
stride
int stride
Definition: mace.c:144
AV_LOG_WARNING
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:182
PPContext
postprocess context.
Definition: postprocess_internal.h:115
av_unused
#define av_unused
Definition: attributes.h:131
end
static av_cold int end(AVCodecContext *avctx)
Definition: avrndec.c:92
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:26
step
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
Definition: rate_distortion.txt:58
LOWPASS5_DEINT_FILTER
#define LOWPASS5_DEINT_FILTER
Definition: postprocess_internal.h:67
b
#define b
Definition: input.c:41
NAMED_CONSTRAINTS_ADD
#define NAMED_CONSTRAINTS_ADD(...)
Definition: asm.h:145
horizX1Filter
static void horizX1Filter(uint8_t *src, int stride, int QP)
Experimental Filter 1 (Horizontal) will not damage linear gradients Flat blocks should look like they...
Definition: postprocess.c:346
doVertLowPass_altivec
static void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
Definition: postprocess_altivec_template.c:213
MEDIAN
@ MEDIAN
Definition: huffyuv.h:52
t1
#define t1
Definition: regdef.h:29
max
#define max(a, b)
Definition: cuda_runtime.h:33
H_A_DEBLOCK
#define H_A_DEBLOCK
Definition: postprocess_internal.h:55
FFSIGN
#define FFSIGN(a)
Definition: common.h:73
QP
#define QP(qP, depth)
Definition: h264data.c:190
AVRational::num
int num
Numerator.
Definition: rational.h:59
first
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But first
Definition: rate_distortion.txt:12
postProcess
static void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, const int8_t QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
Definition: postprocess.c:558
width
#define width
s
#define s(width, name)
Definition: cbs_vp9.c:257
V_A_DEBLOCK
#define V_A_DEBLOCK
Definition: postprocess_internal.h:51
V_DEBLOCK
#define V_DEBLOCK
Definition: postprocess_internal.h:35
TEMP_NOISE_FILTER
#define TEMP_NOISE_FILTER
Definition: postprocess_internal.h:69
XMM_CLOBBERS
#define XMM_CLOBBERS(...)
Definition: asm.h:98
f
#define f(width, name)
Definition: cbs_vp9.c:255
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:72
TEMPLATE_PP_SSE2
#define TEMPLATE_PP_SSE2
Definition: postprocess_template.c:74
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
MEDIAN_DEINT_FILTER
#define MEDIAN_DEINT_FILTER
Definition: postprocess_internal.h:65
src
#define src
Definition: vp8dsp.c:254
linecpy
static void linecpy(void *dest, const void *src, int lines, int stride)
Definition: postprocess_internal.h:176
V_X1_FILTER
#define V_X1_FILTER
Definition: postprocess_internal.h:50
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
transpose_16x8_char_toPackedAlign_altivec
static void transpose_16x8_char_toPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1015
PAVGB
#define PAVGB(a, b)
Definition: postprocess_template.c:87
FFMAX
#define FFMAX(a, b)
Definition: common.h:94
asm.h
transpose_8x16_char_fromPackedAlign_altivec
static void transpose_8x16_char_fromPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1120
avg
#define avg(a, b, c, d)
Definition: colorspacedsp_template.c:28
PREV
@ PREV
Definition: vf_fftdnoiz.c:31
height
#define height
FFMIN
#define FFMIN(a, b)
Definition: common.h:96
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
H_DEBLOCK
#define H_DEBLOCK
Definition: postprocess_internal.h:36
AV_LOG_INFO
#define AV_LOG_INFO
Standard information.
Definition: log.h:187
DERING
#define DERING
Definition: postprocess_internal.h:37
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:112
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
VISUALIZE
#define VISUALIZE
Definition: postprocess_internal.h:72
t3
#define t3
Definition: regdef.h:31
RENAME
#define RENAME(name)
Definition: ffv1.h:196
av_always_inline
#define av_always_inline
Definition: attributes.h:49
uint8_t
uint8_t
Definition: audio_convert.c:194
av_rescale
int64_t av_rescale(int64_t a, int64_t b, int64_t c)
Rescale a 64-bit integer with rounding to nearest.
Definition: mathematics.c:129
NEXT
@ NEXT
Definition: vf_fftdnoiz.c:32
CUBIC_IPOL_DEINT_FILTER
#define CUBIC_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:64
vertClassify_altivec
static int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:58
TEMPLATE_PP_MMX
#define TEMPLATE_PP_MMX
Definition: postprocess_template.c:48
c2
static const uint64_t c2
Definition: murmur3.c:50
t2
#define t2
Definition: regdef.h:30
doVertDefFilter_altivec
static void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:411
AVRational::den
int den
Denominator.
Definition: rational.h:60
mode
mode
Definition: ebur128.h:83
ref
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:107
LINEAR_BLEND_DEINT_FILTER
#define LINEAR_BLEND_DEINT_FILTER
Definition: postprocess_internal.h:62
LINEAR_IPOL_DEINT_FILTER
#define LINEAR_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:61
MANGLE
#define MANGLE(a)
Definition: asm.h:127
diff
static av_always_inline int diff(const uint32_t a, const uint32_t b)
Definition: vf_palettegen.c:136
H_X1_FILTER
#define H_X1_FILTER
Definition: postprocess_internal.h:54
x86_reg
int x86_reg
Definition: asm.h:72
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:28
BLOCK_SIZE
#define BLOCK_SIZE
Definition: adx.h:53
int
int
Definition: ffmpeg_filter.c:192
LEVEL_FIX
#define LEVEL_FIX
Brightness & Contrast.
Definition: postprocess_internal.h:38
min
float min
Definition: vorbis_enc_data.h:456