FFmpeg
postprocess_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 /**
22  * @file
23  * mmx/mmx2/sse2 postprocess code.
24  */
25 #include "config.h"
26 
27 #include "libavutil/mem_internal.h"
28 #if ARCH_X86
29 #include "libavutil/x86/asm.h"
30 #endif
31 
32 /* A single TEMPLATE_PP_* should be defined (to 1) when this template is
33  * included. The following macros will define its dependencies to 1 as well
34  * (like MMX2 depending on MMX), and will define to 0 all the others. Every
35  * TEMPLATE_PP_* needs to be undefined again at the end. */
36 
/* Each section below either picks the function-name suffix applied by RENAME()
 * (when its TEMPLATE_PP_* flag was defined by the includer) or defines that
 * flag to 0, so that all flags can be tested with plain #if afterwards.
 * The order of the sections matters: later ones override earlier flags. */
37 #ifdef TEMPLATE_PP_C
38 # define RENAME(a) a ## _C
39 #else
40 # define TEMPLATE_PP_C 0
41 #endif
42 
43 #ifdef TEMPLATE_PP_ALTIVEC
44 # define RENAME(a) a ## _altivec
45 #else
46 # define TEMPLATE_PP_ALTIVEC 0
47 #endif
48 
49 #ifdef TEMPLATE_PP_MMX
50 # define RENAME(a) a ## _MMX
51 #else
52 # define TEMPLATE_PP_MMX 0
53 #endif
54 
/* MMXEXT implies MMX: force TEMPLATE_PP_MMX to 1. */
55 #ifdef TEMPLATE_PP_MMXEXT
56 # undef TEMPLATE_PP_MMX
57 # define TEMPLATE_PP_MMX 1
58 # define RENAME(a) a ## _MMX2
59 #else
60 # define TEMPLATE_PP_MMXEXT 0
61 #endif
62 
/* SSE2 implies both MMX and MMXEXT: force both flags to 1. */
63 #ifdef TEMPLATE_PP_SSE2
64 # undef TEMPLATE_PP_MMX
65 # define TEMPLATE_PP_MMX 1
66 # undef TEMPLATE_PP_MMXEXT
67 # define TEMPLATE_PP_MMXEXT 1
68 # define RENAME(a) a ## _SSE2
69 #else
70 # define TEMPLATE_PP_SSE2 0
71 #endif
72 
/* Drop any instruction helpers left over from a previous inclusion. */
73 #undef REAL_PAVGB
74 #undef PAVGB
75 #undef PMINUB
76 #undef PMAXUB
77 
/* Instruction-string helpers for the inline asm below. REAL_PAVGB, PMINUB and
 * PMAXUB only exist when TEMPLATE_PP_MMXEXT is nonzero. */
78 #if TEMPLATE_PP_MMXEXT
79 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
80 #endif
/* The extra level of indirection makes the preprocessor expand the arguments
 * before REAL_PAVGB stringifies them (callers pass things like %%FF_REGa). */
81 #define PAVGB(a,b) REAL_PAVGB(a,b)
82 
/* The third parameter (t) is accepted but not used by this definition. */
83 #if TEMPLATE_PP_MMXEXT
84 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
85 #endif
86 
87 #if TEMPLATE_PP_MMXEXT
88 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
89 #endif
90 
91 //FIXME? |255-0| = 1 (should not be a problem ...)
92 #if TEMPLATE_PP_MMXEXT
93 /**
94  * Check if the middle 8x8 Block in the given 8x16 block is flat
 *
 * Only compiled when TEMPLATE_PP_MMXEXT is nonzero.
 *
 * Two quantities are computed over the 8 rows starting at src + 4*stride
 * (8 bytes, i.e. 8 columns, are loaded per row):
 *  - numEq: for each of the 7 vertically adjacent row pairs and each column,
 *    the byte difference plus c->mmxDcOffset[c->nonBQP] is compared (signed,
 *    greater-than) against c->mmxDcThreshold[c->nonBQP]; numEq counts the
 *    positions where the comparison is true.
 *  - dcOk: per column (max - min) over the 8 rows, minus 2*c->pQPb with
 *    unsigned saturation. Despite its name it is NONZERO when the range of
 *    some column exceeds 2*pQPb.
 *
 * @param src    top of the 8x16 block (only read)
 * @param stride offset between vertically adjacent pixels, in elements of src
 * @param c      context; reads mmxDcOffset, mmxDcThreshold, nonBQP, pQPb and
 *               ppMode.flatnessThreshold
 * @return 2 if numEq <= c->ppMode.flatnessThreshold; otherwise 0 if dcOk is
 *         nonzero and 1 if dcOk is zero
95  */
96 static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContext *c){
97  int numEq= 0, dcOk;
98  src+= stride*4; // src points to begin of the 8x8 Block
// NOTE(review): mm7 (offset) and mm6 (threshold) loaded here are consumed by
// the following asm statement; neither statement declares MMX clobbers.
99  __asm__ volatile(
100  "movq %0, %%mm7 \n\t"
101  "movq %1, %%mm6 \n\t"
102  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
103  );
104 
// Per row: mm4 tracks the running per-column maximum, mm3 the running minimum,
// and mm0 accumulates the (0 or -1) results of the threshold comparison.
105  __asm__ volatile(
106  "lea (%2, %3), %%"FF_REG_a" \n\t"
// NOTE(review): the address map below does not match this statement's operand
// numbering (src is %2, stride is %3, and ecx is not used here).
107 // 0 1 2 3 4 5 6 7 8 9
108 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
109 
110  "movq (%2), %%mm0 \n\t"
111  "movq (%%"FF_REG_a"), %%mm1 \n\t"
112  "movq %%mm0, %%mm3 \n\t"
113  "movq %%mm0, %%mm4 \n\t"
114  PMAXUB(%%mm1, %%mm4)
115  PMINUB(%%mm1, %%mm3, %%mm5)
116  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
117  "paddb %%mm7, %%mm0 \n\t"
118  "pcmpgtb %%mm6, %%mm0 \n\t"
119 
120  "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
121  PMAXUB(%%mm2, %%mm4)
122  PMINUB(%%mm2, %%mm3, %%mm5)
123  "psubb %%mm2, %%mm1 \n\t"
124  "paddb %%mm7, %%mm1 \n\t"
125  "pcmpgtb %%mm6, %%mm1 \n\t"
126  "paddb %%mm1, %%mm0 \n\t"
127 
128  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
129  PMAXUB(%%mm1, %%mm4)
130  PMINUB(%%mm1, %%mm3, %%mm5)
131  "psubb %%mm1, %%mm2 \n\t"
132  "paddb %%mm7, %%mm2 \n\t"
133  "pcmpgtb %%mm6, %%mm2 \n\t"
134  "paddb %%mm2, %%mm0 \n\t"
135 
136  "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"
137 
138  "movq (%2, %3, 4), %%mm2 \n\t"
139  PMAXUB(%%mm2, %%mm4)
140  PMINUB(%%mm2, %%mm3, %%mm5)
141  "psubb %%mm2, %%mm1 \n\t"
142  "paddb %%mm7, %%mm1 \n\t"
143  "pcmpgtb %%mm6, %%mm1 \n\t"
144  "paddb %%mm1, %%mm0 \n\t"
145 
146  "movq (%%"FF_REG_a"), %%mm1 \n\t"
147  PMAXUB(%%mm1, %%mm4)
148  PMINUB(%%mm1, %%mm3, %%mm5)
149  "psubb %%mm1, %%mm2 \n\t"
150  "paddb %%mm7, %%mm2 \n\t"
151  "pcmpgtb %%mm6, %%mm2 \n\t"
152  "paddb %%mm2, %%mm0 \n\t"
153 
154  "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
155  PMAXUB(%%mm2, %%mm4)
156  PMINUB(%%mm2, %%mm3, %%mm5)
157  "psubb %%mm2, %%mm1 \n\t"
158  "paddb %%mm7, %%mm1 \n\t"
159  "pcmpgtb %%mm6, %%mm1 \n\t"
160  "paddb %%mm1, %%mm0 \n\t"
161 
162  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
163  PMAXUB(%%mm1, %%mm4)
164  PMINUB(%%mm1, %%mm3, %%mm5)
165  "psubb %%mm1, %%mm2 \n\t"
166  "paddb %%mm7, %%mm2 \n\t"
167  "pcmpgtb %%mm6, %%mm2 \n\t"
168  "paddb %%mm2, %%mm0 \n\t"
169  "psubusb %%mm3, %%mm4 \n\t" // mm4 = max - min per column
170 
171  " \n\t"
172  "pxor %%mm7, %%mm7 \n\t"
173  "psadbw %%mm7, %%mm0 \n\t" // sum the 8 byte counters of mm0
174  "movq %4, %%mm7 \n\t" // QP,..., QP
175  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
176  "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
177  "packssdw %%mm4, %%mm4 \n\t" // fold all 8 columns into the low 32 bits read by movd
178  "movd %%mm0, %0 \n\t"
179  "movd %%mm4, %1 \n\t"
180 
181  : "=r" (numEq), "=r" (dcOk)
182  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
183  : "%"FF_REG_a
184  );
185 
// Every true comparison added -1 to a byte counter, so negate the summed
// value and keep the low 8 bits to obtain the number of true comparisons.
186  numEq= (-numEq) &0xFF;
187  if(numEq > c->ppMode.flatnessThreshold){
188  if(dcOk) return 0; // nonzero dcOk: some column's range exceeded 2*QP
189  else return 1;
190  }else{
191  return 2;
192  }
193 }
194 #endif //TEMPLATE_PP_MMXEXT
195 
196 /**
197  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
198  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
 *
 * Not compiled for the AltiVec template (guarded by !TEMPLATE_PP_ALTIVEC).
 * Both code paths first advance src by 3*stride. Counting rows from there,
 * rows 1..8 are overwritten; rows 0 and 9 are only read, to choose the values
 * used beyond the top and bottom edge:
 *   C path:      first = |row0-row1| <  c->QP ? row0 : row1
 *                last  = |row8-row9| <  c->QP ? row9 : row8
 *   MMXEXT path: the same selection, but with <= against the bytes of c->pQPb.
 *
 * @param src    top of the 8x16 block; filtered in place
 * @param stride offset between vertically adjacent pixels, in elements of src
 * @param c      context; reads pQPb (MMXEXT path) or QP (C path)
199  */
200 #if !TEMPLATE_PP_ALTIVEC
201 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
202 {
203 #if TEMPLATE_PP_MMXEXT
204  src+= stride*3;
205  __asm__ volatile( //"movv %0 %1 %2\n\t"
206  "movq %2, %%mm0 \n\t" // QP,..., QP
207  "pxor %%mm4, %%mm4 \n\t"
208 
209  "movq (%0), %%mm6 \n\t"
210  "movq (%0, %1), %%mm5 \n\t"
211  "movq %%mm5, %%mm1 \n\t"
212  "movq %%mm6, %%mm2 \n\t"
213  "psubusb %%mm6, %%mm5 \n\t"
214  "psubusb %%mm1, %%mm2 \n\t"
215  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
216  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
217  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
218 
219  "pand %%mm2, %%mm6 \n\t"
220  "pandn %%mm1, %%mm2 \n\t"
221  "por %%mm2, %%mm6 \n\t"// First Line to Filter
222 
223  "movq (%0, %1, 8), %%mm5 \n\t"
224  "lea (%0, %1, 4), %%"FF_REG_a" \n\t"
225  "lea (%0, %1, 8), %%"FF_REG_c" \n\t"
226  "sub %1, %%"FF_REG_c" \n\t"
// NOTE(review): operand %0 is declared as an input but is modified here; it is
// restored by the matching "sub %1, %0" at the end of the statement.
227  "add %1, %0 \n\t" // %0 points to line 1 not 0
228  "movq (%0, %1, 8), %%mm7 \n\t"
229  "movq %%mm5, %%mm1 \n\t"
230  "movq %%mm7, %%mm2 \n\t"
231  "psubusb %%mm7, %%mm5 \n\t"
232  "psubusb %%mm1, %%mm2 \n\t"
233  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
234  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
235  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
236 
237  "pand %%mm2, %%mm7 \n\t"
238  "pandn %%mm1, %%mm2 \n\t"
239  "por %%mm2, %%mm7 \n\t" // Last Line to Filter
240 
241 
242  // 1 2 3 4 5 6 7 8
243  // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
244  // 6 4 2 2 1 1
245  // 6 4 4 2
246  // 6 8 2
247 
// From here on each output row is built from repeated PAVGB averages; the
// trailing comments track the tap weights accumulated so far, and every
// "// X" line stores one finished row.
248  "movq (%0, %1), %%mm0 \n\t" // 1
249  "movq %%mm0, %%mm1 \n\t" // 1
250  PAVGB(%%mm6, %%mm0) //1 1 /2
251  PAVGB(%%mm6, %%mm0) //3 1 /4
252 
253  "movq (%0, %1, 4), %%mm2 \n\t" // 1
254  "movq %%mm2, %%mm5 \n\t" // 1
255  PAVGB((%%FF_REGa), %%mm2) // 11 /2
256  PAVGB((%0, %1, 2), %%mm2) // 211 /4
257  "movq %%mm2, %%mm3 \n\t" // 211 /4
258  "movq (%0), %%mm4 \n\t" // 1
259  PAVGB(%%mm4, %%mm3) // 4 211 /8
260  PAVGB(%%mm0, %%mm3) //642211 /16
261  "movq %%mm3, (%0) \n\t" // X
262  // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
263  "movq %%mm1, %%mm0 \n\t" // 1
264  PAVGB(%%mm6, %%mm0) //1 1 /2
265  "movq %%mm4, %%mm3 \n\t" // 1
266  PAVGB((%0,%1,2), %%mm3) // 1 1 /2
267  PAVGB((%%FF_REGa,%1,2), %%mm5) // 11 /2
268  PAVGB((%%FF_REGa), %%mm5) // 211 /4
269  PAVGB(%%mm5, %%mm3) // 2 2211 /8
270  PAVGB(%%mm0, %%mm3) //4242211 /16
271  "movq %%mm3, (%0,%1) \n\t" // X
272  // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
273  PAVGB(%%mm4, %%mm6) //11 /2
274  "movq (%%"FF_REG_c"), %%mm0 \n\t" // 1
275  PAVGB((%%FF_REGa, %1, 2), %%mm0) // 11/2
276  "movq %%mm0, %%mm3 \n\t" // 11/2
277  PAVGB(%%mm1, %%mm0) // 2 11/4
278  PAVGB(%%mm6, %%mm0) //222 11/8
279  PAVGB(%%mm2, %%mm0) //22242211/16
280  "movq (%0, %1, 2), %%mm2 \n\t" // 1
281  "movq %%mm0, (%0, %1, 2) \n\t" // X
282  // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
283  "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
284  PAVGB((%%FF_REGc), %%mm0) // 11 /2
285  PAVGB(%%mm0, %%mm6) //11 11 /4
286  PAVGB(%%mm1, %%mm4) // 11 /2
287  PAVGB(%%mm2, %%mm1) // 11 /2
288  PAVGB(%%mm1, %%mm6) //1122 11 /8
289  PAVGB(%%mm5, %%mm6) //112242211 /16
290  "movq (%%"FF_REG_a"), %%mm5 \n\t" // 1
291  "movq %%mm6, (%%"FF_REG_a") \n\t" // X
292  // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
293  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t" // 1
294  PAVGB(%%mm7, %%mm6) // 11 /2
295  PAVGB(%%mm4, %%mm6) // 11 11 /4
296  PAVGB(%%mm3, %%mm6) // 11 2211 /8
297  PAVGB(%%mm5, %%mm2) // 11 /2
298  "movq (%0, %1, 4), %%mm4 \n\t" // 1
299  PAVGB(%%mm4, %%mm2) // 112 /4
300  PAVGB(%%mm2, %%mm6) // 112242211 /16
301  "movq %%mm6, (%0, %1, 4) \n\t" // X
302  // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
303  PAVGB(%%mm7, %%mm1) // 11 2 /4
304  PAVGB(%%mm4, %%mm5) // 11 /2
305  PAVGB(%%mm5, %%mm0) // 11 11 /4
306  "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" // 1
307  PAVGB(%%mm6, %%mm1) // 11 4 2 /8
308  PAVGB(%%mm0, %%mm1) // 11224222 /16
309  "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t" // X
310  // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
311  PAVGB((%%FF_REGc), %%mm2) // 112 4 /8
312  "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
313  PAVGB(%%mm0, %%mm6) // 1 1 /2
314  PAVGB(%%mm7, %%mm6) // 1 12 /4
315  PAVGB(%%mm2, %%mm6) // 1122424 /4
316  "movq %%mm6, (%%"FF_REG_c") \n\t" // X
317  // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
318  PAVGB(%%mm7, %%mm5) // 11 2 /4
319  PAVGB(%%mm7, %%mm5) // 11 6 /8
320 
321  PAVGB(%%mm3, %%mm0) // 112 /4
322  PAVGB(%%mm0, %%mm5) // 112246 /16
323  "movq %%mm5, (%%"FF_REG_a", %1, 4) \n\t" // X
324  "sub %1, %0 \n\t" // undo the earlier "add %1, %0"
325 
326  :
327  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
328  : "%"FF_REG_a, "%"FF_REG_c
329  );
330 #else //TEMPLATE_PP_MMXEXT
// lN is the offset of row N relative to src (after the src += stride*3 below).
331  const int l1= stride;
332  const int l2= stride + l1;
333  const int l3= stride + l2;
334  const int l4= stride + l3;
335  const int l5= stride + l4;
336  const int l6= stride + l5;
337  const int l7= stride + l6;
338  const int l8= stride + l7;
339  const int l9= stride + l8;
340  int x;
341  src+= stride*3;
// One column per iteration.
342  for(x=0; x<BLOCK_SIZE; x++){
343  const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
344  const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
345 
// Sliding window sums: each entry is derived from the previous one by dropping
// one sample and adding the next; 'first'/'last' stand in for samples beyond
// the edges. The +4 in sums[0] is carried into every entry, so the two sums
// added per output row contribute the +8 rounding term for the >>4.
346  int sums[10];
347  sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
348  sums[1] = sums[0] - first + src[l4];
349  sums[2] = sums[1] - first + src[l5];
350  sums[3] = sums[2] - first + src[l6];
351  sums[4] = sums[3] - first + src[l7];
352  sums[5] = sums[4] - src[l1] + src[l8];
353  sums[6] = sums[5] - src[l2] + last;
354  sums[7] = sums[6] - src[l3] + last;
355  sums[8] = sums[7] - src[l4] + last;
356  sums[9] = sums[8] - src[l5] + last;
357 
// All sums were computed from the unfiltered column above, so the in-place
// stores below do not feed back into later rows.
358  src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
359  src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
360  src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
361  src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
362  src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
363  src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
364  src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
365  src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
366 
367  src++;
368  }
369 #endif //TEMPLATE_PP_MMXEXT
370 }
371 #endif //TEMPLATE_PP_ALTIVEC
372 
373 /**
374  * Experimental Filter 1
375  * will not damage linear gradients
376  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
377  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
378  * MMX2 version does correct clipping C version does not
 *
 * Both code paths first advance src by 3*stride. Counting rows from there,
 * rows 2..7 are modified in place, per column:
 *   a = l3 - l4, b = l4 - l5, c = l5 - l6
 *   d = MAX(|b| - (|a| + |c|)/2, 0)
 * and, when d is below the 2*QP limit, rows 2/3/4 are moved by d/8, d/4, 3d/8
 * towards row 5 and rows 7/6/5 by the same amounts towards row 4.
 * The C path tests d < 2*co->QP; the MMXEXT path tests d <= 2*co->pQPb and
 * uses saturating byte arithmetic.
 *
 * @param src    top of the 8x16 block; filtered in place
 * @param stride offset between vertically adjacent pixels, in elements of src
 * @param co     context; reads pQPb (MMXEXT path) or QP (C path)
379  */
380 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
381 {
382 #if TEMPLATE_PP_MMXEXT
383  src+= stride*3;
384 
385  __asm__ volatile(
386  "pxor %%mm7, %%mm7 \n\t" // 0
387  "lea (%0, %1), %%"FF_REG_a" \n\t"
388  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
389 // 0 1 2 3 4 5 6 7 8 9
390 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
391  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
392  "movq (%0, %1, 4), %%mm1 \n\t" // line 4
393  "movq %%mm1, %%mm2 \n\t" // line 4
394  "psubusb %%mm0, %%mm1 \n\t"
395  "psubusb %%mm2, %%mm0 \n\t"
396  "por %%mm1, %%mm0 \n\t" // |l3 - l4|
397  "movq (%%"FF_REG_c"), %%mm3 \n\t" // line 5
398  "movq (%%"FF_REG_c", %1), %%mm4 \n\t" // line 6
399  "movq %%mm3, %%mm5 \n\t" // line 5
400  "psubusb %%mm4, %%mm3 \n\t"
401  "psubusb %%mm5, %%mm4 \n\t"
402  "por %%mm4, %%mm3 \n\t" // |l5 - l6|
403  PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
404  "movq %%mm2, %%mm1 \n\t" // line 4
405  "psubusb %%mm5, %%mm2 \n\t"
406  "movq %%mm2, %%mm4 \n\t"
407  "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
408  "psubusb %%mm1, %%mm5 \n\t"
409  "por %%mm5, %%mm4 \n\t" // |l4 - l5|
410  "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
411  "movq %%mm4, %%mm3 \n\t" // d
412  "movq %2, %%mm0 \n\t" // QP,..., QP
413  "paddusb %%mm0, %%mm0 \n\t" // 2QP ... 2QP
414  "psubusb %%mm0, %%mm4 \n\t"
415  "pcmpeqb %%mm7, %%mm4 \n\t" // d <= 2QP ? -1 : 0
416  "psubusb "MANGLE(b01)", %%mm3 \n\t" // d - b01 (saturating)
417  "pand %%mm4, %%mm3 \n\t" // d <= 2QP ? d : 0
418 
419  PAVGB(%%mm7, %%mm3) // d/2
420  "movq %%mm3, %%mm1 \n\t" // d/2
421  PAVGB(%%mm7, %%mm3) // d/4
422  PAVGB(%%mm1, %%mm3) // 3*d/8
423 
// mm2 is all-ones where (l4 - l5) <= 0. XOR-ing a row with it before and
// after the saturating add/sub flips the direction of the correction there
// while keeping the result clipped to the byte range.
424  "movq (%0, %1, 4), %%mm0 \n\t" // line 4
425  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
426  "psubusb %%mm3, %%mm0 \n\t"
427  "pxor %%mm2, %%mm0 \n\t"
428  "movq %%mm0, (%0, %1, 4) \n\t" // line 4
429 
430  "movq (%%"FF_REG_c"), %%mm0 \n\t" // line 5
431  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
432  "paddusb %%mm3, %%mm0 \n\t"
433  "pxor %%mm2, %%mm0 \n\t"
434  "movq %%mm0, (%%"FF_REG_c") \n\t" // line 5
435 
436  PAVGB(%%mm7, %%mm1) // d/4
437 
438  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
439  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
440  "psubusb %%mm1, %%mm0 \n\t"
441  "pxor %%mm2, %%mm0 \n\t"
442  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t" // line 3
443 
444  "movq (%%"FF_REG_c", %1), %%mm0 \n\t" // line 6
445  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
446  "paddusb %%mm1, %%mm0 \n\t"
447  "pxor %%mm2, %%mm0 \n\t"
448  "movq %%mm0, (%%"FF_REG_c", %1) \n\t" // line 6
449 
450  PAVGB(%%mm7, %%mm1) // d/8
451 
452  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // line 2
453  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
454  "psubusb %%mm1, %%mm0 \n\t"
455  "pxor %%mm2, %%mm0 \n\t"
456  "movq %%mm0, (%%"FF_REG_a", %1) \n\t" // line 2
457 
458  "movq (%%"FF_REG_c", %1, 2), %%mm0 \n\t" // line 7
459  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
460  "paddusb %%mm1, %%mm0 \n\t"
461  "pxor %%mm2, %%mm0 \n\t"
462  "movq %%mm0, (%%"FF_REG_c", %1, 2) \n\t" // line 7
463 
464  :
465  : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
// b01 is referenced through MANGLE() above, so it has to be listed as a named
// operand, the same way doVertDefFilter lists its constants.
466  NAMED_CONSTRAINTS_ADD(b01)
467  : "%"FF_REG_a, "%"FF_REG_c
468  );
469 #else //TEMPLATE_PP_MMXEXT
470 
// lN is the offset of row N relative to src (after the src += stride*3 below).
471  const int l1= stride;
472  const int l2= stride + l1;
473  const int l3= stride + l2;
474  const int l4= stride + l3;
475  const int l5= stride + l4;
476  const int l6= stride + l5;
477  const int l7= stride + l6;
478 // const int l8= stride + l7;
479 // const int l9= stride + l8;
480  int x;
481 
482  src+= stride*3;
// One column per iteration.
483  for(x=0; x<BLOCK_SIZE; x++){
484  int a= src[l3] - src[l4];
485  int b= src[l4] - src[l5];
486  int c= src[l5] - src[l6];
487 
// Size of the step across the l4/l5 boundary beyond the average of the
// neighbouring steps.
488  int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
489  d= FFMAX(d, 0);
490 
491  if(d < co->QP*2){
492  int v = d * FFSIGN(-b);
493 
// The stores below are not clipped to the 0..255 range.
494  src[l2] +=v>>3;
495  src[l3] +=v>>2;
496  src[l4] +=(3*v)>>3;
497  src[l5] -=(3*v)>>3;
498  src[l6] -=v>>2;
499  src[l7] -=v>>3;
500  }
501  src++;
502  }
503 #endif //TEMPLATE_PP_MMXEXT
504 }
505 
/**
 * Default vertical deblocking filter on the 8x16 block; only the two rows on
 * either side of the block boundary are modified.
 *
 * Not compiled for the AltiVec template (guarded by !TEMPLATE_PP_ALTIVEC).
 *
 * C path (after src += 3*stride, lN = row N), per column:
 *   middleEnergy = 5*(l5 - l4) + 2*(l3 - l6)
 *   if |middleEnergy| < 8*c->QP:
 *     d = MAX(|middleEnergy| - MIN(|leftEnergy|, |rightEnergy|), 0)
 *     d = ((5*d + 32) >> 6) with the sign of -middleEnergy
 *     d is clamped to the range between 0 and q = (l4 - l5)/2
 *     l4 -= d, l5 += d
 *
 * The MMXEXT path advances src by 4*stride instead and numbers its rows from
 * there, so the rows its comments call l3/l4 are the rows the C path calls
 * l4/l5. It works on averaged approximations of the energies (see the inline
 * comments). The "#if 0" section is a disabled alternative implementation.
 *
 * @param src    top of the 8x16 block; filtered in place
 * @param stride offset between vertically adjacent pixels, in elements of src
 * @param c      context; reads pQPb (MMXEXT path) or QP (C path)
 */
506 #if !TEMPLATE_PP_ALTIVEC
507 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
508 {
509 #if TEMPLATE_PP_MMXEXT
510 /*
511  uint8_t tmp[16];
512  const int l1= stride;
513  const int l2= stride + l1;
514  const int l3= stride + l2;
515  const int l4= (int)tmp - (int)src - stride*3;
516  const int l5= (int)tmp - (int)src - stride*3 + 8;
517  const int l6= stride*3 + l3;
518  const int l7= stride + l6;
519  const int l8= stride + l7;
520 
521  memcpy(tmp, src+stride*7, 8);
522  memcpy(tmp+8, src+stride*8, 8);
523 */
524  src+= stride*4;
525  __asm__ volatile(
526 
527 #if 0 //slightly more accurate and slightly slower
528  "pxor %%mm7, %%mm7 \n\t" // 0
529  "lea (%0, %1), %%"FF_REG_a" \n\t"
530  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
531 // 0 1 2 3 4 5 6 7
532 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
533 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
534 
535 
536  "movq (%0, %1, 2), %%mm0 \n\t" // l2
537  "movq (%0), %%mm1 \n\t" // l0
538  "movq %%mm0, %%mm2 \n\t" // l2
539  PAVGB(%%mm7, %%mm0) // ~l2/2
540  PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
541  PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
542 
543  "movq (%%"FF_REG_a"), %%mm1 \n\t" // l1
544  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t" // l3
545  "movq %%mm1, %%mm4 \n\t" // l1
546  PAVGB(%%mm7, %%mm1) // ~l1/2
547  PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
548  PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
549 
550  "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
551  "psubusb %%mm1, %%mm0 \n\t"
552  "psubusb %%mm4, %%mm1 \n\t"
553  "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
554 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
555 
556  "movq (%0, %1, 4), %%mm0 \n\t" // l4
557  "movq %%mm0, %%mm4 \n\t" // l4
558  PAVGB(%%mm7, %%mm0) // ~l4/2
559  PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
560  PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
561 
562  "movq (%%"FF_REG_c"), %%mm2 \n\t" // l5
563  "movq %%mm3, %%mm5 \n\t" // l3
564  PAVGB(%%mm7, %%mm3) // ~l3/2
565  PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
566  PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
567 
568  "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
569  "psubusb %%mm3, %%mm0 \n\t"
570  "psubusb %%mm6, %%mm3 \n\t"
571  "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
572  "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
573 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
574 
575  "movq (%%"FF_REG_c", %1), %%mm6 \n\t" // l6
576  "movq %%mm6, %%mm5 \n\t" // l6
577  PAVGB(%%mm7, %%mm6) // ~l6/2
578  PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
579  PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
580 
581  "movq (%%"FF_REG_c", %1, 2), %%mm5 \n\t" // l7
582  "movq %%mm2, %%mm4 \n\t" // l5
583  PAVGB(%%mm7, %%mm2) // ~l5/2
584  PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
585  PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
586 
587  "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
588  "psubusb %%mm2, %%mm6 \n\t"
589  "psubusb %%mm4, %%mm2 \n\t"
590  "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
591 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
592 
593 
594  PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
595  "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
596  "paddusb "MANGLE(b01)", %%mm4 \n\t"
597  "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
598  "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
599  "pand %%mm4, %%mm3 \n\t"
600 
601  "movq %%mm3, %%mm1 \n\t"
602 // "psubusb "MANGLE(b01)", %%mm3 \n\t"
603  PAVGB(%%mm7, %%mm3)
604  PAVGB(%%mm7, %%mm3)
605  "paddusb %%mm1, %%mm3 \n\t"
606 // "paddusb "MANGLE(b01)", %%mm3 \n\t"
607 
608  "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" //l3
609  "movq (%0, %1, 4), %%mm5 \n\t" //l4
610  "movq (%0, %1, 4), %%mm4 \n\t" //l4
611  "psubusb %%mm6, %%mm5 \n\t"
612  "psubusb %%mm4, %%mm6 \n\t"
613  "por %%mm6, %%mm5 \n\t" // |l3-l4|
614  "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
615  "pxor %%mm6, %%mm0 \n\t"
616  "pand %%mm0, %%mm3 \n\t"
617  PMINUB(%%mm5, %%mm3, %%mm0)
618 
619  "psubusb "MANGLE(b01)", %%mm3 \n\t"
620  PAVGB(%%mm7, %%mm3)
621 
622  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
623  "movq (%0, %1, 4), %%mm2 \n\t"
624  "pxor %%mm6, %%mm0 \n\t"
625  "pxor %%mm6, %%mm2 \n\t"
626  "psubb %%mm3, %%mm0 \n\t"
627  "paddb %%mm3, %%mm2 \n\t"
628  "pxor %%mm6, %%mm0 \n\t"
629  "pxor %%mm6, %%mm2 \n\t"
630  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
631  "movq %%mm2, (%0, %1, 4) \n\t"
632 #endif //0
633 
// Active implementation: differences are formed as averages biased by 128, so
// that signed quantities fit into unsigned bytes.
634  "lea (%0, %1), %%"FF_REG_a" \n\t"
635  "pcmpeqb %%mm6, %%mm6 \n\t" // -1
636 // 0 1 2 3 4 5 6 7
637 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
638 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
639 
640 
641  "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t" // l3
642  "movq (%0, %1, 4), %%mm0 \n\t" // l4
643  "pxor %%mm6, %%mm1 \n\t" // -l3-1
644  PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
645 // mm1=-l3-1, mm0=128-q
646 
647  "movq (%%"FF_REG_a", %1, 4), %%mm2 \n\t" // l5
648  "movq (%%"FF_REG_a", %1), %%mm3 \n\t" // l2
649  "pxor %%mm6, %%mm2 \n\t" // -l5-1
650  "movq %%mm2, %%mm5 \n\t" // -l5-1
651  "movq "MANGLE(b80)", %%mm4 \n\t" // 128
652  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
653  PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
654  PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
655  PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
656  PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
657 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
658 
659  "movq (%%"FF_REG_a"), %%mm2 \n\t" // l1
660  "pxor %%mm6, %%mm2 \n\t" // -l1-1
661  PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
662  PAVGB((%0), %%mm1) // (l0-l3+256)/2
663  "movq "MANGLE(b80)", %%mm3 \n\t" // 128
664  PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
665  PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
666  PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
667 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
668 
669  PAVGB((%%FF_REGc, %1), %%mm5) // (l6-l5+256)/2
670  "movq (%%"FF_REG_c", %1, 2), %%mm1 \n\t" // l7
671  "pxor %%mm6, %%mm1 \n\t" // -l7-1
672  PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
673  "movq "MANGLE(b80)", %%mm2 \n\t" // 128
674  PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
675  PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
676  PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
677 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
678 
679  "movq "MANGLE(b00)", %%mm1 \n\t" // 0
680  "movq "MANGLE(b00)", %%mm5 \n\t" // 0
681  "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
682  "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
683  PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
684  PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
685  PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
686 
687 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
688 
689  "movq "MANGLE(b00)", %%mm7 \n\t" // 0
690  "movq %2, %%mm2 \n\t" // QP
691  PAVGB(%%mm6, %%mm2) // 128 + QP/2
692  "psubb %%mm6, %%mm2 \n\t"
693 
694  "movq %%mm4, %%mm1 \n\t"
695  "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
696  "pxor %%mm1, %%mm4 \n\t"
697  "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
698  "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
699  "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
700 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
701 
702  "movq %%mm4, %%mm3 \n\t" // d
703  "psubusb "MANGLE(b01)", %%mm4 \n\t"
704  PAVGB(%%mm7, %%mm4) // d/32
705  PAVGB(%%mm7, %%mm4) // (d + 32)/64
706  "paddb %%mm3, %%mm4 \n\t" // 5d/64
707  "pand %%mm2, %%mm4 \n\t"
708 
709  "movq "MANGLE(b80)", %%mm5 \n\t" // 128
710  "psubb %%mm0, %%mm5 \n\t" // q
711  "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
712  "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
713  "pxor %%mm7, %%mm5 \n\t"
714 
715  PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
716  "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
717 
// Apply the correction to the two rows next to the boundary and store them.
718  "pand %%mm7, %%mm4 \n\t"
719  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
720  "movq (%0, %1, 4), %%mm2 \n\t"
721  "pxor %%mm1, %%mm0 \n\t"
722  "pxor %%mm1, %%mm2 \n\t"
723  "paddb %%mm4, %%mm0 \n\t"
724  "psubb %%mm4, %%mm2 \n\t"
725  "pxor %%mm1, %%mm0 \n\t"
726  "pxor %%mm1, %%mm2 \n\t"
727  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
728  "movq %%mm2, (%0, %1, 4) \n\t"
729 
730  :
731  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
732  NAMED_CONSTRAINTS_ADD(b80,b00,b01)
733  : "%"FF_REG_a, "%"FF_REG_c
734  );
735 
736 /*
737  {
738  int x;
739  src-= stride;
740  for(x=0; x<BLOCK_SIZE; x++){
741  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
742  if(FFABS(middleEnergy)< 8*QP){
743  const int q=(src[l4] - src[l5])/2;
744  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
745  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
746 
747  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
748  d= FFMAX(d, 0);
749 
750  d= (5*d + 32) >> 6;
751  d*= FFSIGN(-middleEnergy);
752 
753  if(q>0){
754  d= d<0 ? 0 : d;
755  d= d>q ? q : d;
756  }else{
757  d= d>0 ? 0 : d;
758  d= d<q ? q : d;
759  }
760 
761  src[l4]-= d;
762  src[l5]+= d;
763  }
764  src++;
765  }
766  src-=8;
767  for(x=0; x<8; x++){
768  int y;
769  for(y=4; y<6; y++){
770  int d= src[x+y*stride] - tmp[x+(y-4)*8];
771  int ad= FFABS(d);
772  static int max=0;
773  static int sum=0;
774  static int num=0;
775  static int bias=0;
776 
777  if(max<ad) max=ad;
778  sum+= ad>3 ? 1 : 0;
779  if(ad>3){
780  src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
781  }
782  if(y==4) bias+=d;
783  num++;
784  if(num%1000000 == 0){
785  av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
786  }
787  }
788  }
789 }
790 */
791 #else //TEMPLATE_PP_MMXEXT
// lN is the offset of row N relative to src (after the src += stride*3 below).
792  const int l1= stride;
793  const int l2= stride + l1;
794  const int l3= stride + l2;
795  const int l4= stride + l3;
796  const int l5= stride + l4;
797  const int l6= stride + l5;
798  const int l7= stride + l6;
799  const int l8= stride + l7;
800 // const int l9= stride + l8;
801  int x;
802  src+= stride*3;
// One column per iteration.
803  for(x=0; x<BLOCK_SIZE; x++){
804  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
805  if(FFABS(middleEnergy) < 8*c->QP){
806  const int q=(src[l4] - src[l5])/2;
807  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
808  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
809 
// Only the part of the middle energy that exceeds the smaller neighbouring
// energy is corrected.
810  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
811  d= FFMAX(d, 0);
812 
813  d= (5*d + 32) >> 6;
814  d*= FFSIGN(-middleEnergy);
815 
// Keep d between 0 and q (q is half of the l4-l5 step).
816  if(q>0){
817  d = FFMAX(d, 0);
818  d = FFMIN(d, q);
819  }else{
820  d = FFMIN(d, 0);
821  d = FFMAX(d, q);
822  }
823 
824  src[l4]-= d;
825  src[l5]+= d;
826  }
827  src++;
828  }
829 #endif //TEMPLATE_PP_MMXEXT
830 }
831 #endif //TEMPLATE_PP_ALTIVEC
832 
833 #if !TEMPLATE_PP_ALTIVEC
834 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
835 {
836 #if TEMPLATE_PP_MMXEXT && HAVE_7REGS
837  DECLARE_ALIGNED(8, uint64_t, tmp)[3];
838  __asm__ volatile(
839  "pxor %%mm6, %%mm6 \n\t"
840  "pcmpeqb %%mm7, %%mm7 \n\t"
841  "movq %2, %%mm0 \n\t"
842  "punpcklbw %%mm6, %%mm0 \n\t"
843  "psrlw $1, %%mm0 \n\t"
844  "psubw %%mm7, %%mm0 \n\t"
845  "packuswb %%mm0, %%mm0 \n\t"
846  "movq %%mm0, %3 \n\t"
847 
848  "lea (%0, %1), %%"FF_REG_a" \n\t"
849  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
850 
851 // 0 1 2 3 4 5 6 7 8 9
852 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
853 
854 #undef REAL_FIND_MIN_MAX
855 #undef FIND_MIN_MAX
856 #define REAL_FIND_MIN_MAX(addr)\
857  "movq " #addr ", %%mm0 \n\t"\
858  "pminub %%mm0, %%mm7 \n\t"\
859  "pmaxub %%mm0, %%mm6 \n\t"
860 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
861 
862 FIND_MIN_MAX((%%FF_REGa))
863 FIND_MIN_MAX((%%FF_REGa, %1))
864 FIND_MIN_MAX((%%FF_REGa, %1, 2))
865 FIND_MIN_MAX((%0, %1, 4))
866 FIND_MIN_MAX((%%FF_REGd))
867 FIND_MIN_MAX((%%FF_REGd, %1))
868 FIND_MIN_MAX((%%FF_REGd, %1, 2))
869 FIND_MIN_MAX((%0, %1, 8))
870 
871  "movq %%mm7, %%mm4 \n\t"
872  "psrlq $8, %%mm7 \n\t"
873  "pminub %%mm4, %%mm7 \n\t" // min of pixels
874  "pshufw $0xF9, %%mm7, %%mm4 \n\t"
875  "pminub %%mm4, %%mm7 \n\t" // min of pixels
876  "pshufw $0xFE, %%mm7, %%mm4 \n\t"
877  "pminub %%mm4, %%mm7 \n\t"
878 
879 
880  "movq %%mm6, %%mm4 \n\t"
881  "psrlq $8, %%mm6 \n\t"
882  "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
883  "pshufw $0xF9, %%mm6, %%mm4 \n\t"
884  "pmaxub %%mm4, %%mm6 \n\t"
885  "pshufw $0xFE, %%mm6, %%mm4 \n\t"
886  "pmaxub %%mm4, %%mm6 \n\t"
887  "movq %%mm6, %%mm0 \n\t" // max
888  "psubb %%mm7, %%mm6 \n\t" // max - min
889  "push %%"FF_REG_a" \n\t"
890  "movd %%mm6, %%eax \n\t"
891  "cmpb "MANGLE(deringThreshold)", %%al \n\t"
892  "pop %%"FF_REG_a" \n\t"
893  " jb 1f \n\t"
894  PAVGB(%%mm0, %%mm7) // a=(max + min)/2
895  "punpcklbw %%mm7, %%mm7 \n\t"
896  "punpcklbw %%mm7, %%mm7 \n\t"
897  "punpcklbw %%mm7, %%mm7 \n\t"
898  "movq %%mm7, (%4) \n\t"
899 
900  "movq (%0), %%mm0 \n\t" // L10
901  "movq %%mm0, %%mm1 \n\t" // L10
902  "movq %%mm0, %%mm2 \n\t" // L10
903  "psllq $8, %%mm1 \n\t"
904  "psrlq $8, %%mm2 \n\t"
905  "movd -4(%0), %%mm3 \n\t"
906  "movd 8(%0), %%mm4 \n\t"
907  "psrlq $24, %%mm3 \n\t"
908  "psllq $56, %%mm4 \n\t"
909  "por %%mm3, %%mm1 \n\t" // L00
910  "por %%mm4, %%mm2 \n\t" // L20
911  "movq %%mm1, %%mm3 \n\t" // L00
912  PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
913  PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
914  "psubusb %%mm7, %%mm0 \n\t"
915  "psubusb %%mm7, %%mm2 \n\t"
916  "psubusb %%mm7, %%mm3 \n\t"
917  "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
918  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
919  "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
920  "paddb %%mm2, %%mm0 \n\t"
921  "paddb %%mm3, %%mm0 \n\t"
922 
923  "movq (%%"FF_REG_a"), %%mm2 \n\t" // L11
924  "movq %%mm2, %%mm3 \n\t" // L11
925  "movq %%mm2, %%mm4 \n\t" // L11
926  "psllq $8, %%mm3 \n\t"
927  "psrlq $8, %%mm4 \n\t"
928  "movd -4(%%"FF_REG_a"), %%mm5 \n\t"
929  "movd 8(%%"FF_REG_a"), %%mm6 \n\t"
930  "psrlq $24, %%mm5 \n\t"
931  "psllq $56, %%mm6 \n\t"
932  "por %%mm5, %%mm3 \n\t" // L01
933  "por %%mm6, %%mm4 \n\t" // L21
934  "movq %%mm3, %%mm5 \n\t" // L01
935  PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
936  PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
937  "psubusb %%mm7, %%mm2 \n\t"
938  "psubusb %%mm7, %%mm4 \n\t"
939  "psubusb %%mm7, %%mm5 \n\t"
940  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
941  "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
942  "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
943  "paddb %%mm4, %%mm2 \n\t"
944  "paddb %%mm5, %%mm2 \n\t"
945 // 0, 2, 3, 1
946 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
947  "movq " #src ", " #sx " \n\t" /* src[0] */\
948  "movq " #sx ", " #lx " \n\t" /* src[0] */\
949  "movq " #sx ", " #t0 " \n\t" /* src[0] */\
950  "psllq $8, " #lx " \n\t"\
951  "psrlq $8, " #t0 " \n\t"\
952  "movd -4" #src ", " #t1 " \n\t"\
953  "psrlq $24, " #t1 " \n\t"\
954  "por " #t1 ", " #lx " \n\t" /* src[-1] */\
955  "movd 8" #src ", " #t1 " \n\t"\
956  "psllq $56, " #t1 " \n\t"\
957  "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
958  "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
959  PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
960  PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
961  PAVGB(lx, pplx) \
962  "movq " #lx ", 8(%4) \n\t"\
963  "movq (%4), " #lx " \n\t"\
964  "psubusb " #lx ", " #t1 " \n\t"\
965  "psubusb " #lx ", " #t0 " \n\t"\
966  "psubusb " #lx ", " #sx " \n\t"\
967  "movq "MANGLE(b00)", " #lx " \n\t"\
968  "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
969  "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
970  "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
971  "paddb " #t1 ", " #t0 " \n\t"\
972  "paddb " #t0 ", " #sx " \n\t"\
973 \
974  PAVGB(plx, pplx) /* filtered */\
975  "movq " #dst ", " #t0 " \n\t" /* dst */\
976  "movq " #t0 ", " #t1 " \n\t" /* dst */\
977  "psubusb %3, " #t0 " \n\t"\
978  "paddusb %3, " #t1 " \n\t"\
979  PMAXUB(t0, pplx)\
980  PMINUB(t1, pplx, t0)\
981  "paddb " #sx ", " #ppsx " \n\t"\
982  "paddb " #psx ", " #ppsx " \n\t"\
983  "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
984  "pand "MANGLE(b08)", " #ppsx " \n\t"\
985  "pcmpeqb " #lx ", " #ppsx " \n\t"\
986  "pand " #ppsx ", " #pplx " \n\t"\
987  "pandn " #dst ", " #ppsx " \n\t"\
988  "por " #pplx ", " #ppsx " \n\t"\
989  "movq " #ppsx ", " #dst " \n\t"\
990  "movq 8(%4), " #lx " \n\t"
991 
992 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
993  REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
994 /*
995 0000000
996 1111111
997 
998 1111110
999 1111101
1000 1111100
1001 1111011
1002 1111010
1003 1111001
1004 
1005 1111000
1006 1110111
1007 
1008 */
1009 //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1010 DERING_CORE((%%FF_REGa) ,(%%FF_REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1011 DERING_CORE((%%FF_REGa, %1) ,(%%FF_REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1012 DERING_CORE((%%FF_REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1013 DERING_CORE((%0, %1, 4) ,(%%FF_REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1014 DERING_CORE((%%FF_REGd) ,(%%FF_REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1015 DERING_CORE((%%FF_REGd, %1) ,(%%FF_REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1016 DERING_CORE((%%FF_REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1017 DERING_CORE((%0, %1, 8) ,(%%FF_REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1018 
1019  "1: \n\t"
1020  : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp)
1021  NAMED_CONSTRAINTS_ADD(deringThreshold,b00,b02,b08)
1022  : "%"FF_REG_a, "%"FF_REG_d
1023  );
1024 #else // HAVE_7REGS && TEMPLATE_PP_MMXEXT
1025  int y;
1026  int min=255;
1027  int max=0;
1028  int avg;
1029  uint8_t *p;
1030  int s[10];
1031  const int QP2= c->QP/2 + 1;
1032 
1033  src --;
1034  for(y=1; y<9; y++){
1035  int x;
1036  p= src + stride*y;
1037  for(x=1; x<9; x++){
1038  p++;
1039  if(*p > max) max= *p;
1040  if(*p < min) min= *p;
1041  }
1042  }
1043  avg= (min + max + 1)>>1;
1044 
1045  if(max - min <deringThreshold) return;
1046 
1047  for(y=0; y<10; y++){
1048  int t = 0;
1049 
1050  if(src[stride*y + 0] > avg) t+= 1;
1051  if(src[stride*y + 1] > avg) t+= 2;
1052  if(src[stride*y + 2] > avg) t+= 4;
1053  if(src[stride*y + 3] > avg) t+= 8;
1054  if(src[stride*y + 4] > avg) t+= 16;
1055  if(src[stride*y + 5] > avg) t+= 32;
1056  if(src[stride*y + 6] > avg) t+= 64;
1057  if(src[stride*y + 7] > avg) t+= 128;
1058  if(src[stride*y + 8] > avg) t+= 256;
1059  if(src[stride*y + 9] > avg) t+= 512;
1060 
1061  t |= (~t)<<16;
1062  t &= (t<<1) & (t>>1);
1063  s[y] = t;
1064  }
1065 
1066  for(y=1; y<9; y++){
1067  int t = s[y-1] & s[y] & s[y+1];
1068  t|= t>>16;
1069  s[y-1]= t;
1070  }
1071 
1072  for(y=1; y<9; y++){
1073  int x;
1074  int t = s[y-1];
1075 
1076  p= src + stride*y;
1077  for(x=1; x<9; x++){
1078  p++;
1079  if(t & (1<<x)){
1080  int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1081  +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1082  +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1083  f= (f + 8)>>4;
1084 
1085 #ifdef DEBUG_DERING_THRESHOLD
1086  __asm__ volatile("emms\n\t":);
1087  {
1088  static uint64_t numPixels=0;
1089  if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1090 // if((max-min)<20 || (max-min)*QP<200)
1091 // if((max-min)*QP < 500)
1092 // if(max-min<QP/2)
1093  if(max-min < 20){
1094  static int numSkipped=0;
1095  static int errorSum=0;
1096  static int worstQP=0;
1097  static int worstRange=0;
1098  static int worstDiff=0;
1099  int diff= (f - *p);
1100  int absDiff= FFABS(diff);
1101  int error= diff*diff;
1102 
1103  if(x==1 || x==8 || y==1 || y==8) continue;
1104 
1105  numSkipped++;
1106  if(absDiff > worstDiff){
1107  worstDiff= absDiff;
1108  worstQP= QP;
1109  worstRange= max-min;
1110  }
1111  errorSum+= error;
1112 
1113  if(1024LL*1024LL*1024LL % numSkipped == 0){
1114  av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1115  "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1116  (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
1117  worstDiff, (float)numSkipped/numPixels);
1118  }
1119  }
1120  }
1121 #endif
1122  if (*p + QP2 < f) *p= *p + QP2;
1123  else if(*p - QP2 > f) *p= *p - QP2;
1124  else *p=f;
1125  }
1126  }
1127  }
1128 #ifdef DEBUG_DERING_THRESHOLD
1129  if(max-min < 20){
1130  for(y=1; y<9; y++){
1131  int x;
1132  int t = 0;
1133  p= src + stride*y;
1134  for(x=1; x<9; x++){
1135  p++;
1136  *p = FFMIN(*p + 20, 255);
1137  }
1138  }
1139 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1140  }
1141 #endif
1142 #endif //TEMPLATE_PP_MMXEXT
1143 }
1144 #endif //TEMPLATE_PP_ALTIVEC
1145 
/**
 * Deinterlace the given block by linearly interpolating every second line.
 *
 * The first 4 lines of src are skipped; then lines 5, 7, 9 and 11 are each
 * replaced by the rounded-up average of the line directly above and the line
 * directly below. Lines 4, 6, 8, 10 and 12 are only read. 8 pixels per line
 * are processed.
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 *
 * @param src    first pixel of line 0 of the block
 * @param stride offset in bytes from one line to the next
 */
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
{
#if TEMPLATE_PP_MMXEXT
    src+= 4*stride;
    __asm__ volatile(
        "lea (%0, %1), %%"FF_REG_a" \n\t"
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1

        "movq (%0), %%mm0 \n\t"
        "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
        PAVGB(%%mm1, %%mm0)
        "movq %%mm0, (%%"FF_REG_a") \n\t"
        "movq (%0, %1, 4), %%mm0 \n\t"
        PAVGB(%%mm0, %%mm1)
        "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t"
        "movq (%%"FF_REG_c", %1), %%mm1 \n\t"
        PAVGB(%%mm1, %%mm0)
        "movq %%mm0, (%%"FF_REG_c") \n\t"
        "movq (%0, %1, 8), %%mm0 \n\t"
        PAVGB(%%mm0, %%mm1)
        "movq %%mm1, (%%"FF_REG_c", %1, 2) \n\t"

        : : "r" (src), "r" ((x86_reg)stride)
        : "%"FF_REG_a, "%"FF_REG_c
    );
#else
    int x;
    src+= 4*stride;

    /* One column at a time: (a + b + 1) >> 1 is the bytewise form of the
     * packed "(a|b) - (((a^b)&0xFEFEFEFE)>>1)" average. Plain uint8_t
     * accesses avoid type-punned and possibly misaligned uint32_t
     * loads/stores on the pixel buffer. */
    for(x=0; x<8; x++){
        const int l0= src[stride*0];
        const int l2= src[stride*2];
        const int l4= src[stride*4];
        const int l6= src[stride*6];
        const int l8= src[stride*8];

        src[stride*1]= (l0 + l2 + 1)>>1;
        src[stride*3]= (l2 + l4 + 1)>>1;
        src[stride*5]= (l4 + l6 + 1)>>1;
        src[stride*7]= (l6 + l8 + 1)>>1;
        src++;
    }
#endif
}
1197 
1198 /**
1199  * Deinterlace the given block by cubic interpolating every second line.
1200  * will be called for every 8x8 block and can read & write from line 4-15
1201  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1202  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1203  * this filter will read lines 3-15 and write 7-13
1204  */
1205 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1206 {
1207 #if TEMPLATE_PP_SSE2
1208  src+= stride*3;
1209  __asm__ volatile(
1210  "lea (%0, %1), %%"FF_REG_a" \n\t"
1211  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1212  "lea (%%"FF_REG_d", %1, 4), %%"FF_REG_c"\n\t"
1213  "add %1, %%"FF_REG_c" \n\t"
1214  "pxor %%xmm7, %%xmm7 \n\t"
1215 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1216  "movq " #a ", %%xmm0 \n\t"\
1217  "movq " #b ", %%xmm1 \n\t"\
1218  "movq " #d ", %%xmm2 \n\t"\
1219  "movq " #e ", %%xmm3 \n\t"\
1220  "pavgb %%xmm2, %%xmm1 \n\t"\
1221  "pavgb %%xmm3, %%xmm0 \n\t"\
1222  "punpcklbw %%xmm7, %%xmm0 \n\t"\
1223  "punpcklbw %%xmm7, %%xmm1 \n\t"\
1224  "psubw %%xmm1, %%xmm0 \n\t"\
1225  "psraw $3, %%xmm0 \n\t"\
1226  "psubw %%xmm0, %%xmm1 \n\t"\
1227  "packuswb %%xmm1, %%xmm1 \n\t"\
1228  "movlps %%xmm1, " #c " \n\t"
1229 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
1230 
1231 DEINT_CUBIC((%0) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd, %1))
1232 DEINT_CUBIC((%%FF_REGa, %1), (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%0, %1, 8))
1233 DEINT_CUBIC((%0, %1, 4) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGc))
1234 DEINT_CUBIC((%%FF_REGd, %1), (%0, %1, 8) , (%%FF_REGd, %1, 4), (%%FF_REGc) , (%%FF_REGc, %1, 2))
1235 
1236  : : "r" (src), "r" ((x86_reg)stride)
1237  :
1238  XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm7",)
1239  "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c
1240  );
1241 #undef REAL_DEINT_CUBIC
1242 #else //TEMPLATE_PP_SSE2
1243  int x;
1244  src+= stride*3;
1245  for(x=0; x<8; x++){
1246  src[stride*3] = av_clip_uint8((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1247  src[stride*5] = av_clip_uint8((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1248  src[stride*7] = av_clip_uint8((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1249  src[stride*9] = av_clip_uint8((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1250  src++;
1251  }
1252 #endif //TEMPLATE_PP_SSE2
1253 }
1254 
/**
 * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter.
 *
 * For every filtered line the taps are applied to the original (unfiltered)
 * values of the lines at offsets -2, -1, 0, +1, +2.
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter will read lines 4-13 and write 5-11
 *
 * @param src    first pixel of line 0 of the block
 * @param stride offset in bytes from one line to the next
 * @param tmp    8 bytes of state: on entry the value used as the line two
 *               above line 5 (the -1 tap); on return the original content
 *               of line 11
 */
static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
{
#if TEMPLATE_PP_MMXEXT
    src+= stride*4;
    __asm__ volatile(
        "lea (%0, %1), %%"FF_REG_a" \n\t"
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "movq (%2), %%mm0 \n\t"
//      0       1       2       3       4       5       6       7       8       9       10
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx

/* b is the filtered line; mm0 carries the original line two above b in and
 * the original b out */
#define REAL_DEINT_FF(a,b,c,d)\
        "movq " #a ", %%mm1 \n\t"\
        "movq " #b ", %%mm2 \n\t"\
        "movq " #c ", %%mm3 \n\t"\
        "movq " #d ", %%mm4 \n\t"\
        PAVGB(%%mm3, %%mm1) \
        PAVGB(%%mm4, %%mm0) \
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "movq %%mm1, %%mm4 \n\t"\
        "punpcklbw %%mm7, %%mm1 \n\t"\
        "punpckhbw %%mm7, %%mm4 \n\t"\
        "psllw $2, %%mm1 \n\t"\
        "psllw $2, %%mm4 \n\t"\
        "psubw %%mm0, %%mm1 \n\t"\
        "psubw %%mm3, %%mm4 \n\t"\
        "movq %%mm2, %%mm5 \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm5 \n\t"\
        "paddw %%mm2, %%mm1 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "psraw $2, %%mm1 \n\t"\
        "psraw $2, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm1 \n\t"\
        "movq %%mm1, " #b " \n\t"

#define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)

DEINT_FF((%0) , (%%FF_REGa) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2))
DEINT_FF((%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
DEINT_FF((%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2))
DEINT_FF((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))

        "movq %%mm0, (%2) \n\t"
        : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
        : "%"FF_REG_a, "%"FF_REG_d
    );
#else //TEMPLATE_PP_MMXEXT
    int x;
    src+= stride*4;
    for(x=0; x<8; x++){
        /* t1/t2 alternately hold the original value of the filtered line and
         * of the previously filtered line; each is captured before its line
         * is overwritten. */
        int t1= tmp[x];
        int t2= src[stride*1];

        src[stride*1]= av_clip_uint8((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
        /* the centre tap is the line being filtered (3, 5, 7), as in the
         * MMXEXT path, not the line below it */
        t1= src[stride*3];
        src[stride*3]= av_clip_uint8((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
        t2= src[stride*5];
        src[stride*5]= av_clip_uint8((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
        t1= src[stride*7];
        src[stride*7]= av_clip_uint8((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
        /* hand the original last filtered line to the block below */
        tmp[x]= t1;

        src++;
    }
#endif //TEMPLATE_PP_MMXEXT
}
1333 
1334 /**
1335  * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter.
1336  * will be called for every 8x8 block and can read & write from line 4-15
1337  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1338  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1339  * this filter will read lines 4-13 and write 4-11
1340  */
1341 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1342 {
1343 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
1344  src+= stride*4;
1345  __asm__ volatile(
1346  "lea (%0, %1), %%"FF_REG_a" \n\t"
1347  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1348  "pxor %%mm7, %%mm7 \n\t"
1349  "movq (%2), %%mm0 \n\t"
1350  "movq (%3), %%mm1 \n\t"
1351 // 0 1 2 3 4 5 6 7 8 9 10
1352 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1353 
1354 #define REAL_DEINT_L5(t1,t2,a,b,c)\
1355  "movq " #a ", %%mm2 \n\t"\
1356  "movq " #b ", %%mm3 \n\t"\
1357  "movq " #c ", %%mm4 \n\t"\
1358  PAVGB(t2, %%mm3) \
1359  PAVGB(t1, %%mm4) \
1360  "movq %%mm2, %%mm5 \n\t"\
1361  "movq %%mm2, " #t1 " \n\t"\
1362  "punpcklbw %%mm7, %%mm2 \n\t"\
1363  "punpckhbw %%mm7, %%mm5 \n\t"\
1364  "movq %%mm2, %%mm6 \n\t"\
1365  "paddw %%mm2, %%mm2 \n\t"\
1366  "paddw %%mm6, %%mm2 \n\t"\
1367  "movq %%mm5, %%mm6 \n\t"\
1368  "paddw %%mm5, %%mm5 \n\t"\
1369  "paddw %%mm6, %%mm5 \n\t"\
1370  "movq %%mm3, %%mm6 \n\t"\
1371  "punpcklbw %%mm7, %%mm3 \n\t"\
1372  "punpckhbw %%mm7, %%mm6 \n\t"\
1373  "paddw %%mm3, %%mm3 \n\t"\
1374  "paddw %%mm6, %%mm6 \n\t"\
1375  "paddw %%mm3, %%mm2 \n\t"\
1376  "paddw %%mm6, %%mm5 \n\t"\
1377  "movq %%mm4, %%mm6 \n\t"\
1378  "punpcklbw %%mm7, %%mm4 \n\t"\
1379  "punpckhbw %%mm7, %%mm6 \n\t"\
1380  "psubw %%mm4, %%mm2 \n\t"\
1381  "psubw %%mm6, %%mm5 \n\t"\
1382  "psraw $2, %%mm2 \n\t"\
1383  "psraw $2, %%mm5 \n\t"\
1384  "packuswb %%mm5, %%mm2 \n\t"\
1385  "movq %%mm2, " #a " \n\t"\
1386 
1387 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)
1388 
1389 DEINT_L5(%%mm0, %%mm1, (%0) , (%%FF_REGa) , (%%FF_REGa, %1) )
1390 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa) , (%%FF_REGa, %1) , (%%FF_REGa, %1, 2))
1391 DEINT_L5(%%mm0, %%mm1, (%%FF_REGa, %1) , (%%FF_REGa, %1, 2), (%0, %1, 4) )
1392 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
1393 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1) )
1394 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd) , (%%FF_REGd, %1) , (%%FF_REGd, %1, 2))
1395 DEINT_L5(%%mm0, %%mm1, (%%FF_REGd, %1) , (%%FF_REGd, %1, 2), (%0, %1, 8) )
1396 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))
1397 
1398  "movq %%mm0, (%2) \n\t"
1399  "movq %%mm1, (%3) \n\t"
1400  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
1401  : "%"FF_REG_a, "%"FF_REG_d
1402  );
1403 #else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
1404  int x;
1405  src+= stride*4;
1406  for(x=0; x<8; x++){
1407  int t1= tmp[x];
1408  int t2= tmp2[x];
1409  int t3= src[0];
1410 
1411  src[stride*0]= av_clip_uint8((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1412  t1= src[stride*1];
1413  src[stride*1]= av_clip_uint8((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1414  t2= src[stride*2];
1415  src[stride*2]= av_clip_uint8((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1416  t3= src[stride*3];
1417  src[stride*3]= av_clip_uint8((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1418  t1= src[stride*4];
1419  src[stride*4]= av_clip_uint8((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1420  t2= src[stride*5];
1421  src[stride*5]= av_clip_uint8((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1422  t3= src[stride*6];
1423  src[stride*6]= av_clip_uint8((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1424  t1= src[stride*7];
1425  src[stride*7]= av_clip_uint8((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1426 
1427  tmp[x]= t3;
1428  tmp2[x]= t1;
1429 
1430  src++;
1431  }
1432 #endif // TEMPLATE_PP_MMXEXT && HAVE_6REGS
1433 }
1434 
/**
 * Deinterlace the given block by filtering all lines with a (1 2 1) filter.
 *
 * Each line is blended with the original (unfiltered) lines directly above
 * and below it; the line above line 4 is taken from tmp.
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter will read lines 4-12 and write 4-11
 *
 * @param src    first pixel of line 0 of the block
 * @param stride offset in bytes from one line to the next
 * @param tmp    8 bytes of state: in, the line above line 4;
 *               out, the original content of line 11
 */
static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
{
#if TEMPLATE_PP_MMXEXT
    src+= 4*stride;
    __asm__ volatile(
        "lea (%0, %1), %%"FF_REG_a" \n\t"
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1

        "movq (%2), %%mm0 \n\t" // L0
        "movq (%%"FF_REG_a"), %%mm1 \n\t" // L2
        PAVGB(%%mm1, %%mm0) // L0+L2
        "movq (%0), %%mm2 \n\t" // L1
        PAVGB(%%mm2, %%mm0)
        "movq %%mm0, (%0) \n\t"
        "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // L3
        PAVGB(%%mm0, %%mm2) // L1+L3
        PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
        "movq %%mm2, (%%"FF_REG_a") \n\t"
        "movq (%%"FF_REG_a", %1, 2), %%mm2 \n\t" // L4
        PAVGB(%%mm2, %%mm1) // L2+L4
        PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
        "movq %%mm1, (%%"FF_REG_a", %1) \n\t"
        "movq (%0, %1, 4), %%mm1 \n\t" // L5
        PAVGB(%%mm1, %%mm0) // L3+L5
        PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
        "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
        "movq (%%"FF_REG_d"), %%mm0 \n\t" // L6
        PAVGB(%%mm0, %%mm2) // L4+L6
        PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
        "movq %%mm2, (%0, %1, 4) \n\t"
        "movq (%%"FF_REG_d", %1), %%mm2 \n\t" // L7
        PAVGB(%%mm2, %%mm1) // L5+L7
        PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
        "movq %%mm1, (%%"FF_REG_d") \n\t"
        "movq (%%"FF_REG_d", %1, 2), %%mm1 \n\t" // L8
        PAVGB(%%mm1, %%mm0) // L6+L8
        PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
        "movq %%mm0, (%%"FF_REG_d", %1) \n\t"
        "movq (%0, %1, 8), %%mm0 \n\t" // L9
        PAVGB(%%mm0, %%mm2) // L7+L9
        PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
        "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
        "movq %%mm1, (%2) \n\t"

        : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
        : "%"FF_REG_a, "%"FF_REG_d
    );
#else //TEMPLATE_PP_MMXEXT
    int x;
    src+= 4*stride;

    /* One column at a time, with the same rounding as the former packed
     * 32-bit arithmetic: the two neighbours are averaged rounding down, that
     * result is averaged with the centre pixel rounding up. Plain uint8_t
     * accesses avoid type-punned and possibly misaligned uint32_t
     * loads/stores on src and tmp. */
    for(x=0; x<8; x++){
        int above= tmp[x];
        int cur  = src[0];
        int y;

        for(y=0; y<8; y++){
            const int below= src[stride*(y+1)];

            src[stride*y]= (cur + ((above + below)>>1) + 1)>>1;
            above= cur;
            cur  = below;
        }

        tmp[x]= above; // original last written line, used by the block below
        src++;
    }
#endif //TEMPLATE_PP_MMXEXT
}
1536 
/**
 * Deinterlace the given block by applying a median filter to every second line.
 *
 * Lines 5, 7, 9 and 11 are each replaced by the per-pixel median of the line
 * itself and the lines directly above and below; lines 4, 6, 8, 10 and 12
 * are only read.
 * will be called for every 8x8 block and can read & write from line 4-15,
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 *
 * @param src    first pixel of line 0 of the block
 * @param stride offset in bytes from one line to the next
 */
static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
{
#if TEMPLATE_PP_MMXEXT
    src+= 4*stride;
    __asm__ volatile(
        "lea (%0, %1), %%"FF_REG_a" \n\t"
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1

        "movq (%0), %%mm0 \n\t"
        "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
        "movq (%%"FF_REG_a"), %%mm1 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "pmaxub %%mm1, %%mm0 \n\t"
        "pminub %%mm3, %%mm1 \n\t"
        "pmaxub %%mm2, %%mm1 \n\t"
        "pminub %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%%"FF_REG_a") \n\t"

        "movq (%0, %1, 4), %%mm0 \n\t"
        "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "pmaxub %%mm1, %%mm2 \n\t"
        "pminub %%mm3, %%mm1 \n\t"
        "pmaxub %%mm0, %%mm1 \n\t"
        "pminub %%mm1, %%mm2 \n\t"
        "movq %%mm2, (%%"FF_REG_a", %1, 2) \n\t"

        "movq (%%"FF_REG_d"), %%mm2 \n\t"
        "movq (%%"FF_REG_d", %1), %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "pmaxub %%mm0, %%mm2 \n\t"
        "pminub %%mm3, %%mm0 \n\t"
        "pmaxub %%mm1, %%mm0 \n\t"
        "pminub %%mm0, %%mm2 \n\t"
        "movq %%mm2, (%%"FF_REG_d") \n\t"

        "movq (%%"FF_REG_d", %1, 2), %%mm2 \n\t"
        "movq (%0, %1, 8), %%mm0 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "pmaxub %%mm0, %%mm2 \n\t"
        "pminub %%mm3, %%mm0 \n\t"
        "pmaxub %%mm1, %%mm0 \n\t"
        "pminub %%mm0, %%mm2 \n\t"
        "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"


        : : "r" (src), "r" ((x86_reg)stride)
        : "%"FF_REG_a, "%"FF_REG_d
    );

#else //TEMPLATE_PP_MMXEXT
    int x, y;
    src+= 4*stride;
    // FIXME - there should be a way to do a few columns in parallel like w/mmx
    for(x=0; x<8; x++){
        uint8_t *colsrc = src;
        for (y=0; y<4; y++){
            const int a = colsrc[0       ];
            const int b = colsrc[stride  ];
            const int c = colsrc[stride*2];
            /* median of three by clamping c into [min(a,b), max(a,b)];
             * does not depend on the result of right-shifting a negative
             * int or on the width of int */
            const int lo = a < b ? a : b;
            const int hi = a < b ? b : a;

            colsrc[stride] = c < lo ? lo : (c > hi ? hi : c);
            colsrc += stride*2;
        }
        src++;
    }
#endif //TEMPLATE_PP_MMXEXT
}
1616 
1617 #if TEMPLATE_PP_MMX
1618 /**
1619  * Transpose and shift the given 8x8 Block into dst1 and dst2.
 *
 * Reads 8 lines of 8 bytes from src and writes each source column as a line
 * of 8 bytes, using a fixed 16-byte line pitch in the destinations:
 *  - source columns 0-4 are stored at dst1 + 128 + 16*column
 *  - source columns 3-7 are stored at dst2 + 48 + 16*(column - 3)
 * Source columns 3 and 4 are therefore written to both buffers.
 * The block is handled as two halves of 4 source lines; the second half is
 * stored 4 bytes further into every destination line.
 *
 * @param dst1      first destination buffer (written at offsets 128-199)
 * @param dst2      second destination buffer (written at offsets 48-119)
 * @param src       top-left pixel of the 8x8 source block
 * @param srcStride offset in bytes from one source line to the next
1620  */
1621 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, const uint8_t *src, int srcStride)
1622 {
1623  __asm__(
1624  "lea (%0, %1), %%"FF_REG_a" \n\t"
1625 // 0 1 2 3 4 5 6 7 8 9
1626 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
// (addressing table shared with the other filters; only eax is set up here)
// source lines 0-3: interleave bytes, then words, to get one column per dword
1627  "movq (%0), %%mm0 \n\t" // 12345678
1628  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
1629  "movq %%mm0, %%mm2 \n\t" // 12345678
1630  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1631  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1632 
1633  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1634  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
1635  "movq %%mm1, %%mm4 \n\t"
1636  "punpcklbw %%mm3, %%mm1 \n\t"
1637  "punpckhbw %%mm3, %%mm4 \n\t"
1638 
// after the word interleave: mm0 = columns 0,1  mm3 = columns 2,3
//                            mm2 = columns 4,5  mm1 = columns 6,7
1639  "movq %%mm0, %%mm3 \n\t"
1640  "punpcklwd %%mm1, %%mm0 \n\t"
1641  "punpckhwd %%mm1, %%mm3 \n\t"
1642  "movq %%mm2, %%mm1 \n\t"
1643  "punpcklwd %%mm4, %%mm2 \n\t"
1644  "punpckhwd %%mm4, %%mm1 \n\t"
1645 
// store 4 bytes per column; columns 3 and 4 go to both dst1 (%2) and dst2 (%3)
1646  "movd %%mm0, 128(%2) \n\t"
1647  "psrlq $32, %%mm0 \n\t"
1648  "movd %%mm0, 144(%2) \n\t"
1649  "movd %%mm3, 160(%2) \n\t"
1650  "psrlq $32, %%mm3 \n\t"
1651  "movd %%mm3, 176(%2) \n\t"
1652  "movd %%mm3, 48(%3) \n\t"
1653  "movd %%mm2, 192(%2) \n\t"
1654  "movd %%mm2, 64(%3) \n\t"
1655  "psrlq $32, %%mm2 \n\t"
1656  "movd %%mm2, 80(%3) \n\t"
1657  "movd %%mm1, 96(%3) \n\t"
1658  "psrlq $32, %%mm1 \n\t"
1659  "movd %%mm1, 112(%3) \n\t"
1660 
// advance eax by 4 lines and repeat for source lines 4-7
1661  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_a"\n\t"
1662 
1663  "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
1664  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
1665  "movq %%mm0, %%mm2 \n\t" // 12345678
1666  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1667  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1668 
1669  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1670  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
1671  "movq %%mm1, %%mm4 \n\t"
1672  "punpcklbw %%mm3, %%mm1 \n\t"
1673  "punpckhbw %%mm3, %%mm4 \n\t"
1674 
1675  "movq %%mm0, %%mm3 \n\t"
1676  "punpcklwd %%mm1, %%mm0 \n\t"
1677  "punpckhwd %%mm1, %%mm3 \n\t"
1678  "movq %%mm2, %%mm1 \n\t"
1679  "punpcklwd %%mm4, %%mm2 \n\t"
1680  "punpckhwd %%mm4, %%mm1 \n\t"
1681 
// same destinations as above, 4 bytes further into each destination line
1682  "movd %%mm0, 132(%2) \n\t"
1683  "psrlq $32, %%mm0 \n\t"
1684  "movd %%mm0, 148(%2) \n\t"
1685  "movd %%mm3, 164(%2) \n\t"
1686  "psrlq $32, %%mm3 \n\t"
1687  "movd %%mm3, 180(%2) \n\t"
1688  "movd %%mm3, 52(%3) \n\t"
1689  "movd %%mm2, 196(%2) \n\t"
1690  "movd %%mm2, 68(%3) \n\t"
1691  "psrlq $32, %%mm2 \n\t"
1692  "movd %%mm2, 84(%3) \n\t"
1693  "movd %%mm1, 100(%3) \n\t"
1694  "psrlq $32, %%mm1 \n\t"
1695  "movd %%mm1, 116(%3) \n\t"
1696 
1697 
1698  :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
1699  : "%"FF_REG_a
1700  );
1701 }
1702 
1703 /**
1704  * Transpose the given 8x8 block.
 *
 * Reads 8 lines of 8 bytes from src, which uses a fixed 16-byte line pitch
 * (offsets 0, 16, ..., 112), and writes them transposed to dst:
 * byte j of source line i ends up as byte i of destination line j.
 * The block is handled as two halves of 4 source lines; the second half
 * fills bytes 4-7 of every destination line.
 *
 * @param dst       top-left pixel of the 8x8 destination block
 * @param dstStride offset in bytes from one destination line to the next
 * @param src       source buffer with 16 bytes between consecutive lines
1705  */
1706 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, const uint8_t *src)
1707 {
1708  __asm__(
1709  "lea (%0, %1), %%"FF_REG_a" \n\t"
1710  "lea (%%"FF_REG_a",%1,4), %%"FF_REG_d" \n\t"
1711 // 0 1 2 3 4 5 6 7 8 9
1712 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
// source lines 0-3: interleave bytes, then words, to get one column per dword
1713  "movq (%2), %%mm0 \n\t" // 12345678
1714  "movq 16(%2), %%mm1 \n\t" // abcdefgh
1715  "movq %%mm0, %%mm2 \n\t" // 12345678
1716  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1717  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1718 
1719  "movq 32(%2), %%mm1 \n\t"
1720  "movq 48(%2), %%mm3 \n\t"
1721  "movq %%mm1, %%mm4 \n\t"
1722  "punpcklbw %%mm3, %%mm1 \n\t"
1723  "punpckhbw %%mm3, %%mm4 \n\t"
1724 
// after the word interleave: mm0 = columns 0,1  mm3 = columns 2,3
//                            mm2 = columns 4,5  mm1 = columns 6,7
1725  "movq %%mm0, %%mm3 \n\t"
1726  "punpcklwd %%mm1, %%mm0 \n\t"
1727  "punpckhwd %%mm1, %%mm3 \n\t"
1728  "movq %%mm2, %%mm1 \n\t"
1729  "punpcklwd %%mm4, %%mm2 \n\t"
1730  "punpckhwd %%mm4, %%mm1 \n\t"
1731 
// source column k becomes bytes 0-3 of destination line k
1732  "movd %%mm0, (%0) \n\t"
1733  "psrlq $32, %%mm0 \n\t"
1734  "movd %%mm0, (%%"FF_REG_a") \n\t"
1735  "movd %%mm3, (%%"FF_REG_a", %1) \n\t"
1736  "psrlq $32, %%mm3 \n\t"
1737  "movd %%mm3, (%%"FF_REG_a", %1, 2) \n\t"
1738  "movd %%mm2, (%0, %1, 4) \n\t"
1739  "psrlq $32, %%mm2 \n\t"
1740  "movd %%mm2, (%%"FF_REG_d") \n\t"
1741  "movd %%mm1, (%%"FF_REG_d", %1) \n\t"
1742  "psrlq $32, %%mm1 \n\t"
1743  "movd %%mm1, (%%"FF_REG_d", %1, 2) \n\t"
1744 
1745 
// source lines 4-7 (offsets 64-112): same shuffle, stored to bytes 4-7
1746  "movq 64(%2), %%mm0 \n\t" // 12345678
1747  "movq 80(%2), %%mm1 \n\t" // abcdefgh
1748  "movq %%mm0, %%mm2 \n\t" // 12345678
1749  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1750  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1751 
1752  "movq 96(%2), %%mm1 \n\t"
1753  "movq 112(%2), %%mm3 \n\t"
1754  "movq %%mm1, %%mm4 \n\t"
1755  "punpcklbw %%mm3, %%mm1 \n\t"
1756  "punpckhbw %%mm3, %%mm4 \n\t"
1757 
1758  "movq %%mm0, %%mm3 \n\t"
1759  "punpcklwd %%mm1, %%mm0 \n\t"
1760  "punpckhwd %%mm1, %%mm3 \n\t"
1761  "movq %%mm2, %%mm1 \n\t"
1762  "punpcklwd %%mm4, %%mm2 \n\t"
1763  "punpckhwd %%mm4, %%mm1 \n\t"
1764 
1765  "movd %%mm0, 4(%0) \n\t"
1766  "psrlq $32, %%mm0 \n\t"
1767  "movd %%mm0, 4(%%"FF_REG_a") \n\t"
1768  "movd %%mm3, 4(%%"FF_REG_a", %1) \n\t"
1769  "psrlq $32, %%mm3 \n\t"
1770  "movd %%mm3, 4(%%"FF_REG_a", %1, 2) \n\t"
1771  "movd %%mm2, 4(%0, %1, 4) \n\t"
1772  "psrlq $32, %%mm2 \n\t"
1773  "movd %%mm2, 4(%%"FF_REG_d") \n\t"
1774  "movd %%mm1, 4(%%"FF_REG_d", %1) \n\t"
1775  "psrlq $32, %%mm1 \n\t"
1776  "movd %%mm1, 4(%%"FF_REG_d", %1, 2) \n\t"
1777 
1778  :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
1779  : "%"FF_REG_a, "%"FF_REG_d
1780  );
1781 }
1782 #endif //TEMPLATE_PP_MMX
1783 //static long test=0;
1784 
1785 #if !TEMPLATE_PP_ALTIVEC
1786 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
1787  uint8_t *tempBlurred, uint32_t *tempBlurredPast, const int *maxNoise)
1788 {
1789  // to save a register (FIXME do this outside of the loops)
1790  tempBlurredPast[127]= maxNoise[0];
1791  tempBlurredPast[128]= maxNoise[1];
1792  tempBlurredPast[129]= maxNoise[2];
1793 
1794 #define FAST_L2_DIFF
1795 //#define L1_DIFF //u should change the thresholds too if u try that one
1796 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
1797  __asm__ volatile(
1798  "lea (%2, %2, 2), %%"FF_REG_a" \n\t" // 3*stride
1799  "lea (%2, %2, 4), %%"FF_REG_d" \n\t" // 5*stride
1800  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1801 // 0 1 2 3 4 5 6 7 8 9
1802 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
1803 //FIXME reorder?
1804 #ifdef L1_DIFF //needs mmx2
1805  "movq (%0), %%mm0 \n\t" // L0
1806  "psadbw (%1), %%mm0 \n\t" // |L0-R0|
1807  "movq (%0, %2), %%mm1 \n\t" // L1
1808  "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
1809  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1810  "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
1811  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1812  "psadbw (%1, %%"FF_REG_a"), %%mm3 \n\t" // |L3-R3|
1813 
1814  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1815  "paddw %%mm1, %%mm0 \n\t"
1816  "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
1817  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1818  "paddw %%mm2, %%mm0 \n\t"
1819  "psadbw (%1, %%"FF_REG_d"), %%mm5 \n\t" // |L5-R5|
1820  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1821  "paddw %%mm3, %%mm0 \n\t"
1822  "psadbw (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // |L6-R6|
1823  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1824  "paddw %%mm4, %%mm0 \n\t"
1825  "psadbw (%1, %%"FF_REG_c"), %%mm7 \n\t" // |L7-R7|
1826  "paddw %%mm5, %%mm6 \n\t"
1827  "paddw %%mm7, %%mm6 \n\t"
1828  "paddw %%mm6, %%mm0 \n\t"
1829 #else //L1_DIFF
1830 #if defined (FAST_L2_DIFF)
1831  "pcmpeqb %%mm7, %%mm7 \n\t"
1832  "movq "MANGLE(b80)", %%mm6 \n\t"
1833  "pxor %%mm0, %%mm0 \n\t"
1834 #define REAL_L2_DIFF_CORE(a, b)\
1835  "movq " #a ", %%mm5 \n\t"\
1836  "movq " #b ", %%mm2 \n\t"\
1837  "pxor %%mm7, %%mm2 \n\t"\
1838  PAVGB(%%mm2, %%mm5)\
1839  "paddb %%mm6, %%mm5 \n\t"\
1840  "movq %%mm5, %%mm2 \n\t"\
1841  "psllw $8, %%mm5 \n\t"\
1842  "pmaddwd %%mm5, %%mm5 \n\t"\
1843  "pmaddwd %%mm2, %%mm2 \n\t"\
1844  "paddd %%mm2, %%mm5 \n\t"\
1845  "psrld $14, %%mm5 \n\t"\
1846  "paddd %%mm5, %%mm0 \n\t"
1847 
1848 #else //defined (FAST_L2_DIFF)
1849  "pxor %%mm7, %%mm7 \n\t"
1850  "pxor %%mm0, %%mm0 \n\t"
1851 #define REAL_L2_DIFF_CORE(a, b)\
1852  "movq " #a ", %%mm5 \n\t"\
1853  "movq " #b ", %%mm2 \n\t"\
1854  "movq %%mm5, %%mm1 \n\t"\
1855  "movq %%mm2, %%mm3 \n\t"\
1856  "punpcklbw %%mm7, %%mm5 \n\t"\
1857  "punpckhbw %%mm7, %%mm1 \n\t"\
1858  "punpcklbw %%mm7, %%mm2 \n\t"\
1859  "punpckhbw %%mm7, %%mm3 \n\t"\
1860  "psubw %%mm2, %%mm5 \n\t"\
1861  "psubw %%mm3, %%mm1 \n\t"\
1862  "pmaddwd %%mm5, %%mm5 \n\t"\
1863  "pmaddwd %%mm1, %%mm1 \n\t"\
1864  "paddd %%mm1, %%mm5 \n\t"\
1865  "paddd %%mm5, %%mm0 \n\t"
1866 
1867 #endif //defined (FAST_L2_DIFF)
1868 
1869 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
1870 
1871 L2_DIFF_CORE((%0) , (%1))
1872 L2_DIFF_CORE((%0, %2) , (%1, %2))
1873 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
1874 L2_DIFF_CORE((%0, %%FF_REGa) , (%1, %%FF_REGa))
1875 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
1876 L2_DIFF_CORE((%0, %%FF_REGd) , (%1, %%FF_REGd))
1877 L2_DIFF_CORE((%0, %%FF_REGa,2), (%1, %%FF_REGa,2))
1878 L2_DIFF_CORE((%0, %%FF_REGc) , (%1, %%FF_REGc))
1879 
1880 #endif //L1_DIFF
1881 
1882  "movq %%mm0, %%mm4 \n\t"
1883  "psrlq $32, %%mm0 \n\t"
1884  "paddd %%mm0, %%mm4 \n\t"
1885  "movd %%mm4, %%ecx \n\t"
1886  "shll $2, %%ecx \n\t"
1887  "mov %3, %%"FF_REG_d" \n\t"
1888  "addl -4(%%"FF_REG_d"), %%ecx \n\t"
1889  "addl 4(%%"FF_REG_d"), %%ecx \n\t"
1890  "addl -1024(%%"FF_REG_d"), %%ecx \n\t"
1891  "addl $4, %%ecx \n\t"
1892  "addl 1024(%%"FF_REG_d"), %%ecx \n\t"
1893  "shrl $3, %%ecx \n\t"
1894  "movl %%ecx, (%%"FF_REG_d") \n\t"
1895 
1896 // "mov %3, %%"FF_REG_c" \n\t"
1897 // "mov %%"FF_REG_c", test \n\t"
1898 // "jmp 4f \n\t"
1899  "cmpl 512(%%"FF_REG_d"), %%ecx \n\t"
1900  " jb 2f \n\t"
1901  "cmpl 516(%%"FF_REG_d"), %%ecx \n\t"
1902  " jb 1f \n\t"
1903 
1904  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1905  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1906  "movq (%0), %%mm0 \n\t" // L0
1907  "movq (%0, %2), %%mm1 \n\t" // L1
1908  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1909  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1910  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1911  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1912  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1913  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1914  "movq %%mm0, (%1) \n\t" // L0
1915  "movq %%mm1, (%1, %2) \n\t" // L1
1916  "movq %%mm2, (%1, %2, 2) \n\t" // L2
1917  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // L3
1918  "movq %%mm4, (%1, %2, 4) \n\t" // L4
1919  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // L5
1920  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // L6
1921  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // L7
1922  "jmp 4f \n\t"
1923 
1924  "1: \n\t"
1925  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1926  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1927  "movq (%0), %%mm0 \n\t" // L0
1928  PAVGB((%1), %%mm0) // L0
1929  "movq (%0, %2), %%mm1 \n\t" // L1
1930  PAVGB((%1, %2), %%mm1) // L1
1931  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1932  PAVGB((%1, %2, 2), %%mm2) // L2
1933  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1934  PAVGB((%1, %%FF_REGa), %%mm3) // L3
1935  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1936  PAVGB((%1, %2, 4), %%mm4) // L4
1937  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1938  PAVGB((%1, %%FF_REGd), %%mm5) // L5
1939  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1940  PAVGB((%1, %%FF_REGa, 2), %%mm6) // L6
1941  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1942  PAVGB((%1, %%FF_REGc), %%mm7) // L7
1943  "movq %%mm0, (%1) \n\t" // R0
1944  "movq %%mm1, (%1, %2) \n\t" // R1
1945  "movq %%mm2, (%1, %2, 2) \n\t" // R2
1946  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
1947  "movq %%mm4, (%1, %2, 4) \n\t" // R4
1948  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // R5
1949  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // R6
1950  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // R7
1951  "movq %%mm0, (%0) \n\t" // L0
1952  "movq %%mm1, (%0, %2) \n\t" // L1
1953  "movq %%mm2, (%0, %2, 2) \n\t" // L2
1954  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
1955  "movq %%mm4, (%0, %2, 4) \n\t" // L4
1956  "movq %%mm5, (%0, %%"FF_REG_d") \n\t" // L5
1957  "movq %%mm6, (%0, %%"FF_REG_a", 2) \n\t" // L6
1958  "movq %%mm7, (%0, %%"FF_REG_c") \n\t" // L7
1959  "jmp 4f \n\t"
1960 
1961  "2: \n\t"
1962  "cmpl 508(%%"FF_REG_d"), %%ecx \n\t"
1963  " jb 3f \n\t"
1964 
1965  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1966  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1967  "movq (%0), %%mm0 \n\t" // L0
1968  "movq (%0, %2), %%mm1 \n\t" // L1
1969  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1970  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1971  "movq (%1), %%mm4 \n\t" // R0
1972  "movq (%1, %2), %%mm5 \n\t" // R1
1973  "movq (%1, %2, 2), %%mm6 \n\t" // R2
1974  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
1975  PAVGB(%%mm4, %%mm0)
1976  PAVGB(%%mm5, %%mm1)
1977  PAVGB(%%mm6, %%mm2)
1978  PAVGB(%%mm7, %%mm3)
1979  PAVGB(%%mm4, %%mm0)
1980  PAVGB(%%mm5, %%mm1)
1981  PAVGB(%%mm6, %%mm2)
1982  PAVGB(%%mm7, %%mm3)
1983  "movq %%mm0, (%1) \n\t" // R0
1984  "movq %%mm1, (%1, %2) \n\t" // R1
1985  "movq %%mm2, (%1, %2, 2) \n\t" // R2
1986  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
1987  "movq %%mm0, (%0) \n\t" // L0
1988  "movq %%mm1, (%0, %2) \n\t" // L1
1989  "movq %%mm2, (%0, %2, 2) \n\t" // L2
1990  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
1991 
1992  "movq (%0, %2, 4), %%mm0 \n\t" // L4
1993  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
1994  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
1995  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
1996  "movq (%1, %2, 4), %%mm4 \n\t" // R4
1997  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
1998  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
1999  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2000  PAVGB(%%mm4, %%mm0)
2001  PAVGB(%%mm5, %%mm1)
2002  PAVGB(%%mm6, %%mm2)
2003  PAVGB(%%mm7, %%mm3)
2004  PAVGB(%%mm4, %%mm0)
2005  PAVGB(%%mm5, %%mm1)
2006  PAVGB(%%mm6, %%mm2)
2007  PAVGB(%%mm7, %%mm3)
2008  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2009  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2010  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2011  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2012  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2013  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2014  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2015  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2016  "jmp 4f \n\t"
2017 
2018  "3: \n\t"
2019  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2020  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2021  "movq (%0), %%mm0 \n\t" // L0
2022  "movq (%0, %2), %%mm1 \n\t" // L1
2023  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2024  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2025  "movq (%1), %%mm4 \n\t" // R0
2026  "movq (%1, %2), %%mm5 \n\t" // R1
2027  "movq (%1, %2, 2), %%mm6 \n\t" // R2
2028  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
2029  PAVGB(%%mm4, %%mm0)
2030  PAVGB(%%mm5, %%mm1)
2031  PAVGB(%%mm6, %%mm2)
2032  PAVGB(%%mm7, %%mm3)
2033  PAVGB(%%mm4, %%mm0)
2034  PAVGB(%%mm5, %%mm1)
2035  PAVGB(%%mm6, %%mm2)
2036  PAVGB(%%mm7, %%mm3)
2037  PAVGB(%%mm4, %%mm0)
2038  PAVGB(%%mm5, %%mm1)
2039  PAVGB(%%mm6, %%mm2)
2040  PAVGB(%%mm7, %%mm3)
2041  "movq %%mm0, (%1) \n\t" // R0
2042  "movq %%mm1, (%1, %2) \n\t" // R1
2043  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2044  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2045  "movq %%mm0, (%0) \n\t" // L0
2046  "movq %%mm1, (%0, %2) \n\t" // L1
2047  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2048  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2049 
2050  "movq (%0, %2, 4), %%mm0 \n\t" // L4
2051  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
2052  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
2053  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
2054  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2055  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
2056  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
2057  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2058  PAVGB(%%mm4, %%mm0)
2059  PAVGB(%%mm5, %%mm1)
2060  PAVGB(%%mm6, %%mm2)
2061  PAVGB(%%mm7, %%mm3)
2062  PAVGB(%%mm4, %%mm0)
2063  PAVGB(%%mm5, %%mm1)
2064  PAVGB(%%mm6, %%mm2)
2065  PAVGB(%%mm7, %%mm3)
2066  PAVGB(%%mm4, %%mm0)
2067  PAVGB(%%mm5, %%mm1)
2068  PAVGB(%%mm6, %%mm2)
2069  PAVGB(%%mm7, %%mm3)
2070  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2071  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2072  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2073  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2074  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2075  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2076  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2077  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2078 
2079  "4: \n\t"
2080 
2081  :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
2083  : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c, "memory"
2084  );
2085 #else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2086 {
2087  int y;
2088  int d=0;
2089 // int sysd=0;
2090  int i;
2091 
2092  for(y=0; y<8; y++){
2093  int x;
2094  for(x=0; x<8; x++){
2095  int ref= tempBlurred[ x + y*stride ];
2096  int cur= src[ x + y*stride ];
2097  int d1=ref - cur;
2098 // if(x==0 || x==7) d1+= d1>>1;
2099 // if(y==0 || y==7) d1+= d1>>1;
2100 // d+= FFABS(d1);
2101  d+= d1*d1;
2102 // sysd+= d1;
2103  }
2104  }
2105  i=d;
2106  d= (
2107  4*d
2108  +(*(tempBlurredPast-256))
2109  +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
2110  +(*(tempBlurredPast+256))
2111  +4)>>3;
2112  *tempBlurredPast=i;
2113 // ((*tempBlurredPast)*3 + d + 2)>>2;
2114 
2115 /*
2116 Switch between
2117  1 0 0 0 0 0 0 (0)
2118 64 32 16 8 4 2 1 (1)
2119 64 48 36 27 20 15 11 (33) (approx)
2120 64 56 49 43 37 33 29 (200) (approx)
2121 */
2122  if(d > maxNoise[1]){
2123  if(d < maxNoise[2]){
2124  for(y=0; y<8; y++){
2125  int x;
2126  for(x=0; x<8; x++){
2127  int ref= tempBlurred[ x + y*stride ];
2128  int cur= src[ x + y*stride ];
2129  tempBlurred[ x + y*stride ]=
2130  src[ x + y*stride ]=
2131  (ref + cur + 1)>>1;
2132  }
2133  }
2134  }else{
2135  for(y=0; y<8; y++){
2136  int x;
2137  for(x=0; x<8; x++){
2138  tempBlurred[ x + y*stride ]= src[ x + y*stride ];
2139  }
2140  }
2141  }
2142  }else{
2143  if(d < maxNoise[0]){
2144  for(y=0; y<8; y++){
2145  int x;
2146  for(x=0; x<8; x++){
2147  int ref= tempBlurred[ x + y*stride ];
2148  int cur= src[ x + y*stride ];
2149  tempBlurred[ x + y*stride ]=
2150  src[ x + y*stride ]=
2151  (ref*7 + cur + 4)>>3;
2152  }
2153  }
2154  }else{
2155  for(y=0; y<8; y++){
2156  int x;
2157  for(x=0; x<8; x++){
2158  int ref= tempBlurred[ x + y*stride ];
2159  int cur= src[ x + y*stride ];
2160  tempBlurred[ x + y*stride ]=
2161  src[ x + y*stride ]=
2162  (ref*3 + cur + 2)>>2;
2163  }
2164  }
2165  }
2166  }
2167 }
2168 #endif //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2169 }
2170 #endif //TEMPLATE_PP_ALTIVEC
2171 
#if TEMPLATE_PP_MMXEXT
/**
 * Accurate deblock filter (MMXEXT inline-assembly version).
 *
 * Works on 8 bytes at a time across 10 consecutive lines that are `step`
 * bytes apart, starting at src + 3*step.
 *
 * Three stages:
 *  1. Classification: per byte column, eq_mask is 0xFF where the number of
 *     neighbouring line pairs passing the DC offset/threshold test exceeds
 *     c->ppMode.flatnessThreshold, and dc_mask is 0xFF where max-min of
 *     lines 1..8 is below 2*QP (QP taken from c->pQPb).
 *  2. Where both masks are set, lines are replaced by a low-pass result
 *     built from running sums kept in a local scratch array.
 *  3. Unless every column is flagged in eq_mask, the default filter corrects
 *     the two lines next to the block edge for the columns where eq_mask
 *     is clear.
 *
 * @param src    pixel data; the lines examined start at src + 3*step
 * @param step   distance in bytes between two consecutive lines
 * @param stride not referenced by this implementation
 * @param c      context supplying mmxDcOffset, mmxDcThreshold, nonBQP, pQPb
 *               and ppMode.flatnessThreshold
 * @param mode   not referenced by this implementation
 */
static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, const PPContext *c, int mode){
    int64_t dc_mask, eq_mask, both_masks;
    int64_t sums[10*8*2];
    src+= step*3; // first of the 10 lines examined below

    // Preload the DC offset (mm7) and DC threshold (mm6) used by the next
    // asm statement; the registers are expected to survive between the two.
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
    );

    // Stage 1: build eq_mask (%0) and dc_mask (%1).
    __asm__ volatile(
        "lea (%2, %3), %%"FF_REG_a" \n\t"
// Line addressing (%2 = src, %3 = step, REG_a = src+step, later src+5*step):
// line: 0    1     2        3         4         5     6        7         8         9
//       (%2) (a)   (a,%3)   (a,%3,2)  (%2,%3,4) (a')  (a',%3)  (a',%3,2) (%2,%3,8) (a',%3,4)

        "movq (%2), %%mm0 \n\t"
        "movq (%%"FF_REG_a"), %%mm1 \n\t"
        "movq %%mm1, %%mm3 \n\t" // running minimum
        "movq %%mm1, %%mm4 \n\t" // running maximum
        "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
        "paddb %%mm7, %%mm0 \n\t"
        "pcmpgtb %%mm6, %%mm0 \n\t"

        "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1 \n\t"
        "paddb %%mm7, %%mm1 \n\t"
        "pcmpgtb %%mm6, %%mm1 \n\t"
        "paddb %%mm1, %%mm0 \n\t"

        "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2 \n\t"
        "paddb %%mm7, %%mm2 \n\t"
        "pcmpgtb %%mm6, %%mm2 \n\t"
        "paddb %%mm2, %%mm0 \n\t"

        "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"

        "movq (%2, %3, 4), %%mm2 \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1 \n\t"
        "paddb %%mm7, %%mm1 \n\t"
        "pcmpgtb %%mm6, %%mm1 \n\t"
        "paddb %%mm1, %%mm0 \n\t"

        "movq (%%"FF_REG_a"), %%mm1 \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2 \n\t"
        "paddb %%mm7, %%mm2 \n\t"
        "pcmpgtb %%mm6, %%mm2 \n\t"
        "paddb %%mm2, %%mm0 \n\t"

        "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1 \n\t"
        "paddb %%mm7, %%mm1 \n\t"
        "pcmpgtb %%mm6, %%mm1 \n\t"
        "paddb %%mm1, %%mm0 \n\t"

        "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2 \n\t"
        "paddb %%mm7, %%mm2 \n\t"
        "pcmpgtb %%mm6, %%mm2 \n\t"
        "paddb %%mm2, %%mm0 \n\t"

        "movq (%2, %3, 8), %%mm2 \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1 \n\t"
        "paddb %%mm7, %%mm1 \n\t"
        "pcmpgtb %%mm6, %%mm1 \n\t"
        "paddb %%mm1, %%mm0 \n\t"

        // line 9 only takes part in the difference count, not in min/max
        "movq (%%"FF_REG_a", %3, 4), %%mm1 \n\t"
        "psubb %%mm1, %%mm2 \n\t"
        "paddb %%mm7, %%mm2 \n\t"
        "pcmpgtb %%mm6, %%mm2 \n\t"
        "paddb %%mm2, %%mm0 \n\t"
        "psubusb %%mm3, %%mm4 \n\t" // mm4 = max - min

        "pxor %%mm6, %%mm6 \n\t"
        "movq %4, %%mm7 \n\t" // QP,..., QP
        "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
        "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
        "pcmpeqb %%mm6, %%mm7 \n\t" // Diff >=2QP -> FF
        "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> FF
        "movq %%mm7, %1 \n\t"

        "movq %5, %%mm7 \n\t"
        "punpcklbw %%mm7, %%mm7 \n\t" // broadcast the low byte of the
        "punpcklbw %%mm7, %%mm7 \n\t" // flatness threshold to all 8 bytes
        "punpcklbw %%mm7, %%mm7 \n\t"
        "psubb %%mm0, %%mm6 \n\t" // negate the accumulated 0/-1 compare results
        "pcmpgtb %%mm7, %%mm6 \n\t"
        "movq %%mm6, %0 \n\t"

        : "=m" (eq_mask), "=m" (dc_mask)
        : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
        : "%"FF_REG_a, "memory" // pixel rows are read through the src pointer
    );

    both_masks = dc_mask & eq_mask;

    if(both_masks){
        x86_reg offset= -8*step;
        int64_t *temp_sums= sums;

        // Stage 2a: fill sums[] with running sums (two 4x16-bit halves per entry).
        __asm__ volatile(
            "movq %2, %%mm0 \n\t" // QP,..., QP
            "pxor %%mm4, %%mm4 \n\t"

            "movq (%0), %%mm6 \n\t"
            "movq (%0, %1), %%mm5 \n\t"
            "movq %%mm5, %%mm1 \n\t"
            "movq %%mm6, %%mm2 \n\t"
            "psubusb %%mm6, %%mm5 \n\t"
            "psubusb %%mm1, %%mm2 \n\t"
            "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
            "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
            "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF

            "pxor %%mm6, %%mm1 \n\t"
            "pand %%mm0, %%mm1 \n\t"
            "pxor %%mm1, %%mm6 \n\t"
            // 0:QP 6:First

            "movq (%0, %1, 8), %%mm5 \n\t"
            "add %1, %0 \n\t" // %0 points to line 1 not 0
            "movq (%0, %1, 8), %%mm7 \n\t"
            "movq %%mm5, %%mm1 \n\t"
            "movq %%mm7, %%mm2 \n\t"
            "psubusb %%mm7, %%mm5 \n\t"
            "psubusb %%mm1, %%mm2 \n\t"
            "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
            "movq %2, %%mm0 \n\t" // QP,..., QP
            "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
            "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF

            "pxor %%mm7, %%mm1 \n\t"
            "pand %%mm0, %%mm1 \n\t"
            "pxor %%mm1, %%mm7 \n\t"

            "movq %%mm6, %%mm5 \n\t"
            "punpckhbw %%mm4, %%mm6 \n\t"
            "punpcklbw %%mm4, %%mm5 \n\t"
            // 4:0 5/6:First 7:Last

            "movq %%mm5, %%mm0 \n\t"
            "movq %%mm6, %%mm1 \n\t"
            "psllw $2, %%mm0 \n\t"
            "psllw $2, %%mm1 \n\t"
            "paddw "MANGLE(w04)", %%mm0 \n\t"
            "paddw "MANGLE(w04)", %%mm1 \n\t"

// NEXT: add the current line to the running sum and advance one line
#define NEXT\
            "movq (%0), %%mm2 \n\t"\
            "movq (%0), %%mm3 \n\t"\
            "add %1, %0 \n\t"\
            "punpcklbw %%mm4, %%mm2 \n\t"\
            "punpckhbw %%mm4, %%mm3 \n\t"\
            "paddw %%mm2, %%mm0 \n\t"\
            "paddw %%mm3, %%mm1 \n\t"

// PREV: subtract the current line from the running sum and advance one line
#define PREV\
            "movq (%0), %%mm2 \n\t"\
            "movq (%0), %%mm3 \n\t"\
            "add %1, %0 \n\t"\
            "punpcklbw %%mm4, %%mm2 \n\t"\
            "punpckhbw %%mm4, %%mm3 \n\t"\
            "psubw %%mm2, %%mm0 \n\t"\
            "psubw %%mm3, %%mm1 \n\t"


            NEXT //0
            NEXT //1
            NEXT //2
            "movq %%mm0, (%3) \n\t"
            "movq %%mm1, 8(%3) \n\t"

            NEXT //3
            "psubw %%mm5, %%mm0 \n\t"
            "psubw %%mm6, %%mm1 \n\t"
            "movq %%mm0, 16(%3) \n\t"
            "movq %%mm1, 24(%3) \n\t"

            NEXT //4
            "psubw %%mm5, %%mm0 \n\t"
            "psubw %%mm6, %%mm1 \n\t"
            "movq %%mm0, 32(%3) \n\t"
            "movq %%mm1, 40(%3) \n\t"

            NEXT //5
            "psubw %%mm5, %%mm0 \n\t"
            "psubw %%mm6, %%mm1 \n\t"
            "movq %%mm0, 48(%3) \n\t"
            "movq %%mm1, 56(%3) \n\t"

            NEXT //6
            "psubw %%mm5, %%mm0 \n\t"
            "psubw %%mm6, %%mm1 \n\t"
            "movq %%mm0, 64(%3) \n\t"
            "movq %%mm1, 72(%3) \n\t"

            "movq %%mm7, %%mm6 \n\t"
            "punpckhbw %%mm4, %%mm7 \n\t"
            "punpcklbw %%mm4, %%mm6 \n\t"

            NEXT //7
            "mov %4, %0 \n\t"
            "add %1, %0 \n\t"
            PREV //0
            "movq %%mm0, 80(%3) \n\t"
            "movq %%mm1, 88(%3) \n\t"

            PREV //1
            "paddw %%mm6, %%mm0 \n\t"
            "paddw %%mm7, %%mm1 \n\t"
            "movq %%mm0, 96(%3) \n\t"
            "movq %%mm1, 104(%3) \n\t"

            PREV //2
            "paddw %%mm6, %%mm0 \n\t"
            "paddw %%mm7, %%mm1 \n\t"
            "movq %%mm0, 112(%3) \n\t"
            "movq %%mm1, 120(%3) \n\t"

            PREV //3
            "paddw %%mm6, %%mm0 \n\t"
            "paddw %%mm7, %%mm1 \n\t"
            "movq %%mm0, 128(%3) \n\t"
            "movq %%mm1, 136(%3) \n\t"

            PREV //4
            "paddw %%mm6, %%mm0 \n\t"
            "paddw %%mm7, %%mm1 \n\t"
            "movq %%mm0, 144(%3) \n\t"
            "movq %%mm1, 152(%3) \n\t"

            "mov %4, %0 \n\t" //FIXME

            : "+&r"(src)
            : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src)
              NAMED_CONSTRAINTS_ADD(w04) // required by the MANGLE(w04) references above
            : "memory" // sums[] is written through the %3 pointer
        );

        src+= step; // src points to begin of the 8x8 Block

        // Stage 2b: per line, (sums[i] + sums[i+2] + 2*pixel) >> 4, written
        // only to the byte columns selected by both_masks.
        __asm__ volatile(
            "movq %4, %%mm6 \n\t"
            "pcmpeqb %%mm5, %%mm5 \n\t"
            "pxor %%mm6, %%mm5 \n\t" // mm5 = ~both_masks
            "pxor %%mm7, %%mm7 \n\t"

            "1: \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "paddw 32(%1), %%mm0 \n\t"
            "paddw 40(%1), %%mm1 \n\t"
            "movq (%0, %3), %%mm2 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "movq %%mm2, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpckhbw %%mm7, %%mm3 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            "paddw %%mm3, %%mm1 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            "paddw %%mm3, %%mm1 \n\t"
            "psrlw $4, %%mm0 \n\t"
            "psrlw $4, %%mm1 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "pand %%mm6, %%mm0 \n\t"
            "pand %%mm5, %%mm4 \n\t"
            "por %%mm4, %%mm0 \n\t"
            "movq %%mm0, (%0, %3) \n\t"
            "add $16, %1 \n\t"
            "add %2, %0 \n\t" // offset runs from -8*step up to 0
            " js 1b \n\t"

            : "+r"(offset), "+r"(temp_sums)
            : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
            : "memory" // reads sums[], writes pixel rows through pointers
        );
    }else{
        src+= step; // src points to begin of the 8x8 Block
    }

    if(eq_mask != -1LL){
        uint8_t *temp_src= src;
        DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
        // Stage 3: default filter for the columns where eq_mask is clear.
        __asm__ volatile(
            "pxor %%mm7, %%mm7 \n\t"
// Line addressing (%0 = temp_src, %1 = step, REG_a = %0 + 2*step;
// %0 is later moved to line 3):
// line: 0    1        2    3        4          5           6          7
//       (%0) (%0,%1)  (a)  (a,%1)   (a,%1,2)   (%0',%1,2)  (a,%1,4)   (%0',%1,4)

            "movq (%0), %%mm0 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
            "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0

            "movq (%0, %1), %%mm2 \n\t"
            "lea (%0, %1, 2), %%"FF_REG_a" \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
            "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1

            "movq (%%"FF_REG_a"), %%mm4 \n\t"
            "movq %%mm4, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
            "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2

            "paddw %%mm0, %%mm0 \n\t" // 2L0
            "paddw %%mm1, %%mm1 \n\t" // 2H0
            "psubw %%mm4, %%mm2 \n\t" // L1 - L2
            "psubw %%mm5, %%mm3 \n\t" // H1 - H2
            "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
            "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2

            "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
            "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
            "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
            "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2

            "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t" // L3
            "punpckhbw %%mm7, %%mm3 \n\t" // H3

            "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
            "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
            "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
            "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
            "movq %%mm0, (%4) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
            "movq %%mm1, 8(%4) \n\t" // 2H0 - 5H1 + 5H2 - 2H3

            "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t" // L4
            "punpckhbw %%mm7, %%mm1 \n\t" // H4

            "psubw %%mm0, %%mm2 \n\t" // L3 - L4
            "psubw %%mm1, %%mm3 \n\t" // H3 - H4
            "movq %%mm2, 16(%4) \n\t" // L3 - L4
            "movq %%mm3, 24(%4) \n\t" // H3 - H4
            "paddw %%mm4, %%mm4 \n\t" // 2L2
            "paddw %%mm5, %%mm5 \n\t" // 2H2
            "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
            "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4

            "lea (%%"FF_REG_a", %1), %0 \n\t" // %0 now points to line 3
            "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
            "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
            "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
            "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
//50 opcodes so far
            "movq (%0, %1, 2), %%mm2 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t" // L5
            "punpckhbw %%mm7, %%mm3 \n\t" // H5
            "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
            "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
            "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
            "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5

            "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
            "punpcklbw %%mm7, %%mm6 \n\t" // L6
            "psubw %%mm6, %%mm2 \n\t" // L5 - L6
            "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
            "punpckhbw %%mm7, %%mm6 \n\t" // H6
            "psubw %%mm6, %%mm3 \n\t" // H5 - H6

            "paddw %%mm0, %%mm0 \n\t" // 2L4
            "paddw %%mm1, %%mm1 \n\t" // 2H4
            "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
            "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6

            "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
            "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
            "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
            "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6

            "movq (%0, %1, 4), %%mm2 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t" // L7
            "punpckhbw %%mm7, %%mm3 \n\t" // H7

            "paddw %%mm2, %%mm2 \n\t" // 2L7
            "paddw %%mm3, %%mm3 \n\t" // 2H7
            "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
            "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7

            "movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
            "movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3

            "movq %%mm7, %%mm6 \n\t" // 0
            "psubw %%mm0, %%mm6 \n\t"
            "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
            "movq %%mm7, %%mm6 \n\t" // 0
            "psubw %%mm1, %%mm6 \n\t"
            "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
            "movq %%mm7, %%mm6 \n\t" // 0
            "psubw %%mm2, %%mm6 \n\t"
            "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
            "movq %%mm7, %%mm6 \n\t" // 0
            "psubw %%mm3, %%mm6 \n\t"
            "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|

            "pminsw %%mm2, %%mm0 \n\t"
            "pminsw %%mm3, %%mm1 \n\t"

            "movd %2, %%mm2 \n\t" // QP
            "punpcklbw %%mm7, %%mm2 \n\t"

            "movq %%mm7, %%mm6 \n\t" // 0
            "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
            "pxor %%mm6, %%mm4 \n\t"
            "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
            "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
            "pxor %%mm7, %%mm5 \n\t"
            "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
// 100 opcodes
            "psllw $3, %%mm2 \n\t" // 8QP
            "movq %%mm2, %%mm3 \n\t" // 8QP
            "pcmpgtw %%mm4, %%mm2 \n\t"
            "pcmpgtw %%mm5, %%mm3 \n\t"
            "pand %%mm2, %%mm4 \n\t"
            "pand %%mm3, %%mm5 \n\t"


            "psubusw %%mm0, %%mm4 \n\t" // hd
            "psubusw %%mm1, %%mm5 \n\t" // ld


            "movq "MANGLE(w05)", %%mm2 \n\t" // 5
            "pmullw %%mm2, %%mm4 \n\t"
            "pmullw %%mm2, %%mm5 \n\t"
            "movq "MANGLE(w20)", %%mm2 \n\t" // 32
            "paddw %%mm2, %%mm4 \n\t"
            "paddw %%mm2, %%mm5 \n\t"
            "psrlw $6, %%mm4 \n\t"
            "psrlw $6, %%mm5 \n\t"

            "movq 16(%4), %%mm0 \n\t" // L3 - L4
            "movq 24(%4), %%mm1 \n\t" // H3 - H4

            "pxor %%mm2, %%mm2 \n\t"
            "pxor %%mm3, %%mm3 \n\t"

            "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
            "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
            "pxor %%mm2, %%mm0 \n\t"
            "pxor %%mm3, %%mm1 \n\t"
            "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
            "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
            "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
            "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2

            "pxor %%mm6, %%mm2 \n\t"
            "pxor %%mm7, %%mm3 \n\t"
            "pand %%mm2, %%mm4 \n\t"
            "pand %%mm3, %%mm5 \n\t"

            "pminsw %%mm0, %%mm4 \n\t"
            "pminsw %%mm1, %%mm5 \n\t"
            "pxor %%mm6, %%mm4 \n\t"
            "pxor %%mm7, %%mm5 \n\t"
            "psubw %%mm6, %%mm4 \n\t"
            "psubw %%mm7, %%mm5 \n\t"
            "packsswb %%mm5, %%mm4 \n\t"
            "movq %3, %%mm1 \n\t"
            "pandn %%mm4, %%mm1 \n\t" // keep the correction only where eq_mask is clear
            "movq (%0), %%mm0 \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            "movq %%mm0, (%0) \n\t"
            "movq (%0, %1), %%mm0 \n\t"
            "psubb %%mm1, %%mm0 \n\t"
            "movq %%mm0, (%0, %1) \n\t"

            : "+r" (temp_src)
            : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp)
              NAMED_CONSTRAINTS_ADD(w05,w20)
            : "%"FF_REG_a, "memory" // tmp[] and pixel rows are accessed through pointers
        );
    }
}
#endif //TEMPLATE_PP_MMXEXT
2668 
2669 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2670  const int8_t QPs[], int QPStride, int isColor, PPContext *c);
2671 
2672 /**
2673  * Copy a block from src to dst and fixes the blacklevel.
2674  * levelFix == 0 -> do not touch the brightness & contrast
2675  */
2676 #undef REAL_SCALED_CPY
2677 #undef SCALED_CPY
2678 
2679 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
2680  int levelFix, int64_t *packedOffsetAndScale)
2681 {
2682  if(levelFix){
2683 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
2684  __asm__ volatile(
2685  "movq (%%"FF_REG_a"), %%mm2 \n\t" // packedYOffset
2686  "movq 8(%%"FF_REG_a"), %%mm3 \n\t" // packedYScale
2687  "lea (%2,%4), %%"FF_REG_a" \n\t"
2688  "lea (%3,%5), %%"FF_REG_d" \n\t"
2689  "pxor %%mm4, %%mm4 \n\t"
2690 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
2691  "movq " #src1 ", %%mm0 \n\t"\
2692  "movq " #src1 ", %%mm5 \n\t"\
2693  "movq " #src2 ", %%mm1 \n\t"\
2694  "movq " #src2 ", %%mm6 \n\t"\
2695  "punpcklbw %%mm0, %%mm0 \n\t"\
2696  "punpckhbw %%mm5, %%mm5 \n\t"\
2697  "punpcklbw %%mm1, %%mm1 \n\t"\
2698  "punpckhbw %%mm6, %%mm6 \n\t"\
2699  "pmulhuw %%mm3, %%mm0 \n\t"\
2700  "pmulhuw %%mm3, %%mm5 \n\t"\
2701  "pmulhuw %%mm3, %%mm1 \n\t"\
2702  "pmulhuw %%mm3, %%mm6 \n\t"\
2703  "psubw %%mm2, %%mm0 \n\t"\
2704  "psubw %%mm2, %%mm5 \n\t"\
2705  "psubw %%mm2, %%mm1 \n\t"\
2706  "psubw %%mm2, %%mm6 \n\t"\
2707  "packuswb %%mm5, %%mm0 \n\t"\
2708  "packuswb %%mm6, %%mm1 \n\t"\
2709  "movq %%mm0, " #dst1 " \n\t"\
2710  "movq %%mm1, " #dst2 " \n\t"\
2711 
2712 #define SCALED_CPY(src1, src2, dst1, dst2)\
2713  REAL_SCALED_CPY(src1, src2, dst1, dst2)
2714 
2715 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
2716 SCALED_CPY((%2, %4, 2), (%%FF_REGa, %4, 2), (%3, %5, 2), (%%FF_REGd, %5, 2))
2717 SCALED_CPY((%2, %4, 4), (%%FF_REGa, %4, 4), (%3, %5, 4), (%%FF_REGd, %5, 4))
2718  "lea (%%"FF_REG_a",%4,4), %%"FF_REG_a" \n\t"
2719  "lea (%%"FF_REG_d",%5,4), %%"FF_REG_d" \n\t"
2720 SCALED_CPY((%%FF_REGa, %4), (%%FF_REGa, %4, 2), (%%FF_REGd, %5), (%%FF_REGd, %5, 2))
2721 
2722 
2723  : "=&a" (packedOffsetAndScale)
2724  : "0" (packedOffsetAndScale),
2725  "r"(src),
2726  "r"(dst),
2727  "r" ((x86_reg)srcStride),
2728  "r" ((x86_reg)dstStride)
2729  : "%"FF_REG_d
2730  );
2731 #else //TEMPLATE_PP_MMX && HAVE_6REGS
2732  for (int i = 0; i < 8; i++)
2733  memcpy( &(dst[dstStride*i]),
2734  &(src[srcStride*i]), BLOCK_SIZE);
2735 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
2736  }else{
2737 #if TEMPLATE_PP_MMX && HAVE_6REGS
2738  __asm__ volatile(
2739  "lea (%0,%2), %%"FF_REG_a" \n\t"
2740  "lea (%1,%3), %%"FF_REG_d" \n\t"
2741 
2742 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
2743  "movq " #src1 ", %%mm0 \n\t"\
2744  "movq " #src2 ", %%mm1 \n\t"\
2745  "movq %%mm0, " #dst1 " \n\t"\
2746  "movq %%mm1, " #dst2 " \n\t"\
2747 
2748 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
2749  REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
2750 
2751 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
2752 SIMPLE_CPY((%0, %2, 2), (%%FF_REGa, %2, 2), (%1, %3, 2), (%%FF_REGd, %3, 2))
2753 SIMPLE_CPY((%0, %2, 4), (%%FF_REGa, %2, 4), (%1, %3, 4), (%%FF_REGd, %3, 4))
2754  "lea (%%"FF_REG_a",%2,4), %%"FF_REG_a" \n\t"
2755  "lea (%%"FF_REG_d",%3,4), %%"FF_REG_d" \n\t"
2756 SIMPLE_CPY((%%FF_REGa, %2), (%%FF_REGa, %2, 2), (%%FF_REGd, %3), (%%FF_REGd, %3, 2))
2757 
2758  : : "r" (src),
2759  "r" (dst),
2760  "r" ((x86_reg)srcStride),
2761  "r" ((x86_reg)dstStride)
2762  : "%"FF_REG_a, "%"FF_REG_d
2763  );
2764 #else //TEMPLATE_PP_MMX && HAVE_6REGS
2765  for (int i = 0; i < 8; i++)
2766  memcpy( &(dst[dstStride*i]),
2767  &(src[srcStride*i]), BLOCK_SIZE);
2768 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
2769  }
2770 }
2771 
2772 /**
2773  * Duplicate the given 8 src pixels ? times upward
2774  */
2775 static inline void RENAME(duplicate)(uint8_t src[], int stride)
2776 {
2777 #if TEMPLATE_PP_MMX
2778  __asm__ volatile(
2779  "movq (%0), %%mm0 \n\t"
2780  "movq %%mm0, (%0, %1, 4) \n\t"
2781  "add %1, %0 \n\t"
2782  "movq %%mm0, (%0) \n\t"
2783  "movq %%mm0, (%0, %1) \n\t"
2784  "movq %%mm0, (%0, %1, 2) \n\t"
2785  "movq %%mm0, (%0, %1, 4) \n\t"
2786  : "+r" (src)
2787  : "r" ((x86_reg)-stride)
2788  );
2789 #else
2790  int i;
2791  uint8_t *p=src;
2792  for(i=0; i<5; i++){
2793  p-= stride;
2794  memcpy(p, src, 8);
2795  }
2796 #endif
2797 }
2798 
2799 #if ARCH_X86 && TEMPLATE_PP_MMXEXT
2800 static inline void RENAME(prefetchnta)(const void *p)
2801 {
2802  __asm__ volatile( "prefetchnta (%0)\n\t"
2803  : : "r" (p)
2804  );
2805 }
2806 
2807 static inline void RENAME(prefetcht0)(const void *p)
2808 {
2809  __asm__ volatile( "prefetcht0 (%0)\n\t"
2810  : : "r" (p)
2811  );
2812 }
2813 
2814 static inline void RENAME(prefetcht1)(const void *p)
2815 {
2816  __asm__ volatile( "prefetcht1 (%0)\n\t"
2817  : : "r" (p)
2818  );
2819 }
2820 
2821 static inline void RENAME(prefetcht2)(const void *p)
2822 {
2823  __asm__ volatile( "prefetcht2 (%0)\n\t"
2824  : : "r" (p)
2825  );
2826 }
2827 #elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2)
2828 static inline void RENAME(prefetchnta)(const void *p)
2829 {
2830  __builtin_prefetch(p,0,0);
2831 }
2832 static inline void RENAME(prefetcht0)(const void *p)
2833 {
2834  __builtin_prefetch(p,0,1);
2835 }
2836 static inline void RENAME(prefetcht1)(const void *p)
2837 {
2838  __builtin_prefetch(p,0,2);
2839 }
2840 static inline void RENAME(prefetcht2)(const void *p)
2841 {
2842  __builtin_prefetch(p,0,3);
2843 }
2844 #else
2845 static inline void RENAME(prefetchnta)(const void *p)
2846 {
2847  return;
2848 }
2849 static inline void RENAME(prefetcht0)(const void *p)
2850 {
2851  return;
2852 }
2853 static inline void RENAME(prefetcht1)(const void *p)
2854 {
2855  return;
2856 }
2857 static inline void RENAME(prefetcht2)(const void *p)
2858 {
2859  return;
2860 }
2861 #endif
2862 /**
2863  * Filter array of bytes (Y or U or V values)
2864  */
2865 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2866  const int8_t QPs[], int QPStride, int isColor, PPContext *c)
2867 {
2868  int x,y;
2869 #ifdef TEMPLATE_PP_TIME_MODE
2870  const int mode= TEMPLATE_PP_TIME_MODE;
2871 #else
2872  const int mode = isColor ? c->ppMode.chromMode : c->ppMode.lumMode;
2873 #endif
2874  int black=0, white=255; // blackest black and whitest white in the picture
2875  int QPCorrecture= 256*256;
2876 
2877  int copyAhead;
2878 #if TEMPLATE_PP_MMX
2879  int i;
2880 #endif
2881 
2882  const int qpHShift = isColor ? 4 - c->hChromaSubSample : 4;
2883  const int qpVShift = isColor ? 4 - c->vChromaSubSample : 4;
2884 
2885  //FIXME remove
2886  uint64_t * const yHistogram= c->yHistogram;
2887  uint8_t * const tempSrc = srcStride > 0 ? c->tempSrc : c->tempSrc - 23*srcStride;
2888  uint8_t * const tempDst = (dstStride > 0 ? c->tempDst : c->tempDst - 23*dstStride) + 32;
2889  //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
2890 
2891  if (mode & VISUALIZE){
2892  if(!(mode & (V_A_DEBLOCK | H_A_DEBLOCK)) || TEMPLATE_PP_MMX) {
2893  av_log(c, AV_LOG_WARNING, "Visualization is currently only supported with the accurate deblock filter without SIMD\n");
2894  }
2895  }
2896 
2897 #if TEMPLATE_PP_MMX
2898  for(i=0; i<57; i++){
2899  int offset = ((i * c->ppMode.baseDcDiff) >> 8) + 1;
2900  int threshold= offset*2 + 1;
2901  c->mmxDcOffset[i] = 0x7F - offset;
2902  c->mmxDcThreshold[i] = 0x7F - threshold;
2903  c->mmxDcOffset[i] *= 0x0101010101010101LL;
2904  c->mmxDcThreshold[i] *= 0x0101010101010101LL;
2905  }
2906 #endif
2907 
2908  if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
2909  else if( (mode & LINEAR_BLEND_DEINT_FILTER)
2910  || (mode & FFMPEG_DEINT_FILTER)
2911  || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
2912  else if( (mode & V_DEBLOCK)
2914  || (mode & MEDIAN_DEINT_FILTER)
2915  || (mode & V_A_DEBLOCK)) copyAhead=13;
2916  else if(mode & V_X1_FILTER) copyAhead=11;
2917 // else if(mode & V_RK1_FILTER) copyAhead=10;
2918  else if(mode & DERING) copyAhead=9;
2919  else copyAhead=8;
2920 
2921  copyAhead-= 8;
2922 
2923  if(!isColor){
2924  uint64_t sum= 0;
2925  int i;
2926  uint64_t maxClipped;
2927  uint64_t clipped;
2928  AVRational scale;
2929 
2930  c->frameNum++;
2931  // first frame is fscked so we ignore it
2932  if (c->frameNum == 1)
2933  yHistogram[0] = width * (uint64_t)height/64*15/256;
2934 
2935  for(i=0; i<256; i++){
2936  sum+= yHistogram[i];
2937  }
2938 
2939  /* We always get a completely black picture first. */
2940  maxClipped = av_rescale(sum, c->ppMode.maxClippedThreshold.num,
2941  c->ppMode.maxClippedThreshold.den);
2942 
2943  clipped= sum;
2944  for(black=255; black>0; black--){
2945  if(clipped < maxClipped) break;
2946  clipped-= yHistogram[black];
2947  }
2948 
2949  clipped= sum;
2950  for(white=0; white<256; white++){
2951  if(clipped < maxClipped) break;
2952  clipped-= yHistogram[white];
2953  }
2954 
2955  scale = (AVRational){c->ppMode.maxAllowedY - c->ppMode.minAllowedY, white - black};
2956 
2957 #if TEMPLATE_PP_MMXEXT
2958  c->packedYScale = (uint16_t)av_rescale(scale.num, 256, scale.den);
2959  c->packedYOffset = (((black*c->packedYScale)>>8) - c->ppMode.minAllowedY) & 0xFFFF;
2960 #else
2961  c->packedYScale = (uint16_t)av_rescale(scale.num, 1024, scale.den);
2962  c->packedYOffset = (black - c->ppMode.minAllowedY) & 0xFFFF;
2963 #endif
2964 
2965  c->packedYOffset |= c->packedYOffset<<32;
2966  c->packedYOffset |= c->packedYOffset<<16;
2967 
2968  c->packedYScale |= c->packedYScale<<32;
2969  c->packedYScale |= c->packedYScale<<16;
2970 
2971  if(mode & LEVEL_FIX) QPCorrecture= (int)av_rescale(scale.num, 256*256, scale.den);
2972  else QPCorrecture= 256*256;
2973  }else{
2974  c->packedYScale = 0x0100010001000100LL;
2975  c->packedYOffset = 0;
2976  QPCorrecture= 256*256;
2977  }
2978 
2979  /* copy & deinterlace first row of blocks */
2980  y=-BLOCK_SIZE;
2981  {
2982  const uint8_t *srcBlock= &(src[y*srcStride]);
2983  uint8_t *dstBlock= tempDst + dstStride;
2984 
2985  // From this point on it is guaranteed that we can read and write 16 lines downward
2986  // finish 1 block before the next otherwise we might have a problem
2987  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2988  for(x=0; x<width; x+=BLOCK_SIZE){
2989  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
2990  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
2991  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
2992  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
2993 
2994  RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
2995  srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c->packedYOffset);
2996 
2997  RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
2998 
3000  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3001  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3002  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c->deintTemp + x);
3003  else if(mode & MEDIAN_DEINT_FILTER)
3004  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3005  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3006  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3007  else if(mode & FFMPEG_DEINT_FILTER)
3008  RENAME(deInterlaceFF)(dstBlock, dstStride, c->deintTemp + x);
3009  else if(mode & LOWPASS5_DEINT_FILTER)
3010  RENAME(deInterlaceL5)(dstBlock, dstStride, c->deintTemp + x, c->deintTemp + width + x);
3011 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3012  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3013 */
3014  dstBlock+=8;
3015  srcBlock+=8;
3016  }
3017  if(width==FFABS(dstStride))
3018  linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3019  else{
3020  int i;
3021  for(i=0; i<copyAhead; i++){
3022  memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3023  }
3024  }
3025  }
3026 
3027  for(y=0; y<height; y+=BLOCK_SIZE){
3028  //1% speedup if these are here instead of the inner loop
3029  const uint8_t *srcBlock= &(src[y*srcStride]);
3030  uint8_t *dstBlock= &(dst[y*dstStride]);
3031 #if TEMPLATE_PP_MMX
3032  uint8_t *tempBlock1 = c->tempBlocks;
3033  uint8_t *tempBlock2 = c->tempBlocks + 8;
3034 #endif
3035  const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3036  int8_t *nonBQPptr = &c->nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3037  int QP=0, nonBQP=0;
3038  /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3039  if not than use a temporary buffer */
3040  if(y+15 >= height){
3041  int i;
3042  /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3043  blockcopy to dst later */
3044  linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3045  FFMAX(height-y-copyAhead, 0), srcStride);
3046 
3047  /* duplicate last line of src to fill the void up to line (copyAhead+7) */
3048  for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3049  memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3050 
3051  /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3052  linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3053 
3054  /* duplicate last line of dst to fill the void up to line (copyAhead) */
3055  for(i=height-y+1; i<=copyAhead; i++)
3056  memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3057 
3058  dstBlock= tempDst + dstStride;
3059  srcBlock= tempSrc;
3060  }
3061 
3062  // From this point on it is guaranteed that we can read and write 16 lines downward
3063  // finish 1 block before the next otherwise we might have a problem
3064  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3065  for(x=0; x<width; ){
3066  int startx = x;
3067  int endx = FFMIN(width, x+32);
3068  uint8_t *dstBlockStart = dstBlock;
3069  const uint8_t *srcBlockStart = srcBlock;
3070  int qp_index = 0;
3071  for(qp_index=0; qp_index < (endx-startx)/BLOCK_SIZE; qp_index++){
3072  QP = QPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3073  nonBQP = nonBQPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3074  if(!isColor){
3075  QP= (QP* QPCorrecture + 256*128)>>16;
3076  nonBQP= (nonBQP* QPCorrecture + 256*128)>>16;
3077  yHistogram[(srcBlock+qp_index*8)[srcStride*12 + 4]]++;
3078  }
3079  c->QP_block[qp_index] = QP;
3080  c->nonBQP_block[qp_index] = nonBQP;
3081 #if TEMPLATE_PP_MMX
3082  __asm__ volatile(
3083  "movd %1, %%mm7 \n\t"
3084  "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3085  "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3086  "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3087  "movq %%mm7, %0 \n\t"
3088  : "=m" (c->pQPb_block[qp_index])
3089  : "r" (QP)
3090  );
3091 #endif
3092  }
3093  for(; x < endx; x+=BLOCK_SIZE){
3094  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
3095  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
3096  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
3097  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
3098 
3099  RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3100  srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c->packedYOffset);
3101 
3103  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3104  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3105  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c->deintTemp + x);
3106  else if(mode & MEDIAN_DEINT_FILTER)
3107  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3108  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3109  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3110  else if(mode & FFMPEG_DEINT_FILTER)
3111  RENAME(deInterlaceFF)(dstBlock, dstStride, c->deintTemp + x);
3112  else if(mode & LOWPASS5_DEINT_FILTER)
3113  RENAME(deInterlaceL5)(dstBlock, dstStride, c->deintTemp + x, c->deintTemp + width + x);
3114 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3115  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3116 */
3117  dstBlock+=8;
3118  srcBlock+=8;
3119  }
3120 
3121  dstBlock = dstBlockStart;
3122  srcBlock = srcBlockStart;
3123 
3124  for(x = startx, qp_index = 0; x < endx; x+=BLOCK_SIZE, qp_index++){
3125  const int stride= dstStride;
3126  //temporary while changing QP stuff to make things continue to work
3127  //eventually QP,nonBQP,etc will be arrays and this will be unnecessary
3128  c->QP = c->QP_block[qp_index];
3129  c->nonBQP = c->nonBQP_block[qp_index];
3130  c->pQPb = c->pQPb_block[qp_index];
3131  c->pQPb2 = c->pQPb2_block[qp_index];
3132 
3133  /* only deblock if we have 2 blocks */
3134  if(y + 8 < height){
3135  if(mode & V_X1_FILTER)
3136  RENAME(vertX1Filter)(dstBlock, stride, c);
3137  else if(mode & V_DEBLOCK){
3138  const int t = RENAME(vertClassify)(dstBlock, stride, c);
3139 
3140  if(t==1)
3141  RENAME(doVertLowPass)(dstBlock, stride, c);
3142  else if(t==2)
3143  RENAME(doVertDefFilter)(dstBlock, stride, c);
3144  }else if(mode & V_A_DEBLOCK){
3145  RENAME(do_a_deblock)(dstBlock, stride, 1, c, mode);
3146  }
3147  }
3148 
3149  dstBlock+=8;
3150  srcBlock+=8;
3151  }
3152 
3153  dstBlock = dstBlockStart;
3154  srcBlock = srcBlockStart;
3155 
3156  for(x = startx, qp_index=0; x < endx; x+=BLOCK_SIZE, qp_index++){
3157  const int stride= dstStride;
3158  c->QP = c->QP_block[qp_index];
3159  c->nonBQP = c->nonBQP_block[qp_index];
3160  c->pQPb = c->pQPb_block[qp_index];
3161  c->pQPb2 = c->pQPb2_block[qp_index];
3162 #if TEMPLATE_PP_MMX
3163  RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3164 #endif
3165  /* check if we have a previous block to deblock it with dstBlock */
3166  if(x - 8 >= 0){
3167 #if TEMPLATE_PP_MMX
3168  if(mode & H_X1_FILTER)
3169  RENAME(vertX1Filter)(tempBlock1, 16, c);
3170  else if(mode & H_DEBLOCK){
3171  const int t= RENAME(vertClassify)(tempBlock1, 16, c);
3172  if(t==1)
3173  RENAME(doVertLowPass)(tempBlock1, 16, c);
3174  else if(t==2)
3175  RENAME(doVertDefFilter)(tempBlock1, 16, c);
3176  }else if(mode & H_A_DEBLOCK){
3177  RENAME(do_a_deblock)(tempBlock1, 16, 1, c, mode);
3178  }
3179 
3180  RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3181 
3182 #else
3183  if(mode & H_X1_FILTER)
3184  horizX1Filter(dstBlock-4, stride, c->QP);
3185  else if(mode & H_DEBLOCK){
3186 #if TEMPLATE_PP_ALTIVEC
3187  DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
3188  int t;
3189  transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3190 
3191  t = vertClassify_altivec(tempBlock-48, 16, c);
3192  if(t==1) {
3193  doVertLowPass_altivec(tempBlock-48, 16, c);
3194  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3195  }
3196  else if(t==2) {
3197  doVertDefFilter_altivec(tempBlock-48, 16, c);
3198  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3199  }
3200 #else
3201  const int t= RENAME(horizClassify)(dstBlock-4, stride, c);
3202 
3203  if(t==1)
3204  RENAME(doHorizLowPass)(dstBlock-4, stride, c);
3205  else if(t==2)
3206  RENAME(doHorizDefFilter)(dstBlock-4, stride, c);
3207 #endif
3208  }else if(mode & H_A_DEBLOCK){
3209  RENAME(do_a_deblock)(dstBlock-8, 1, stride, c, mode);
3210  }
3211 #endif //TEMPLATE_PP_MMX
3212  if(mode & DERING){
3213  //FIXME filter first line
3214  if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, c);
3215  }
3216 
3217  if(mode & TEMP_NOISE_FILTER)
3218  {
3219  RENAME(tempNoiseReducer)(dstBlock-8, stride,
3220  c->tempBlurred[isColor] + y*dstStride + x,
3221  c->tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3222  c->ppMode.maxTmpNoise);
3223  }
3224  }
3225 
3226  dstBlock+=8;
3227  srcBlock+=8;
3228 
3229 #if TEMPLATE_PP_MMX
3230  FFSWAP(uint8_t *, tempBlock1, tempBlock2);
3231 #endif
3232  }
3233  }
3234 
3235  if(mode & DERING){
3236  if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, c);
3237  }
3238 
3239  if((mode & TEMP_NOISE_FILTER)){
3240  RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3241  c->tempBlurred[isColor] + y*dstStride + x,
3242  c->tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3243  c->ppMode.maxTmpNoise);
3244  }
3245 
3246  /* did we use a tmp buffer for the last lines*/
3247  if(y+15 >= height){
3248  uint8_t *dstBlock= &(dst[y*dstStride]);
3249  if(width==FFABS(dstStride))
3250  linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3251  else{
3252  int i;
3253  for(i=0; i<height-y; i++){
3254  memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3255  }
3256  }
3257  }
3258  }
3259 #if TEMPLATE_PP_MMX
3260  __asm__ volatile("emms");
3261 #endif
3262 
3263 #ifdef DEBUG_BRIGHTNESS
3264  if(!isColor){
3265  int max=1;
3266  int i;
3267  for(i=0; i<256; i++)
3268  if(yHistogram[i] > max) max=yHistogram[i];
3269 
3270  for(i=1; i<256; i++){
3271  int x;
3272  int start=yHistogram[i-1]/(max/256+1);
3273  int end=yHistogram[i]/(max/256+1);
3274  int inc= end > start ? 1 : -1;
3275  for(x=start; x!=end+inc; x+=inc)
3276  dst[ i*dstStride + x]+=128;
3277  }
3278 
3279  for(i=0; i<100; i+=2){
3280  dst[ (white)*dstStride + i]+=128;
3281  dst[ (black)*dstStride + i]+=128;
3282  }
3283  }
3284 #endif
3285 }
3286 
/* Drop the per-instantiation macros so the template can be included again
 * with a different TEMPLATE_PP_* selection. */
3287 #undef RENAME
3288 #undef TEMPLATE_PP_C
3289 #undef TEMPLATE_PP_ALTIVEC
3290 #undef TEMPLATE_PP_MMX
3291 #undef TEMPLATE_PP_MMXEXT
3292 #undef TEMPLATE_PP_SSE2
error
static void error(const char *err)
Definition: target_bsf_fuzzer.c:32
FFMPEG_DEINT_FILTER
#define FFMPEG_DEINT_FILTER
Definition: postprocess_internal.h:67
AV_LOG_WARNING
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:186
mem_internal.h
PPContext
postprocess context.
Definition: postprocess_internal.h:116
x86_reg
int x86_reg
Definition: asm.h:72
int64_t
long long int64_t
Definition: coverity.c:34
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
step
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
Definition: rate_distortion.txt:58
LOWPASS5_DEINT_FILTER
#define LOWPASS5_DEINT_FILTER
Definition: postprocess_internal.h:68
b
#define b
Definition: input.c:41
horizX1Filter
static void horizX1Filter(uint8_t *src, int stride, int QP)
Experimental Filter 1 (Horizontal) will not damage linear gradients Flat blocks should look like they...
Definition: postprocess.c:324
doVertLowPass_altivec
static void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
Definition: postprocess_altivec_template.c:214
max
#define max(a, b)
Definition: cuda_runtime.h:33
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
H_A_DEBLOCK
#define H_A_DEBLOCK
Definition: postprocess_internal.h:56
FFSIGN
#define FFSIGN(a)
Definition: common.h:75
QP
#define QP(qP, depth)
Definition: h264data.c:190
MANGLE
#define MANGLE(a)
Definition: asm.h:127
first
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But first
Definition: rate_distortion.txt:12
postProcess
static void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, const int8_t QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
Definition: postprocess.c:523
NAMED_CONSTRAINTS_ADD
#define NAMED_CONSTRAINTS_ADD(...)
Definition: asm.h:145
width
#define width
s
#define s(width, name)
Definition: cbs_vp9.c:198
V_A_DEBLOCK
#define V_A_DEBLOCK
Definition: postprocess_internal.h:52
V_DEBLOCK
#define V_DEBLOCK
Definition: postprocess_internal.h:36
TEMP_NOISE_FILTER
#define TEMP_NOISE_FILTER
Definition: postprocess_internal.h:70
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:74
asm.h
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
MEDIAN_DEINT_FILTER
#define MEDIAN_DEINT_FILTER
Definition: postprocess_internal.h:66
linecpy
static void linecpy(void *dest, const void *src, int lines, int stride)
Definition: postprocess_internal.h:177
V_X1_FILTER
#define V_X1_FILTER
Definition: postprocess_internal.h:51
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
inc
static int inc(int num, int period)
Definition: perlin.c:34
transpose_16x8_char_toPackedAlign_altivec
static void transpose_16x8_char_toPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1016
PAVGB
#define PAVGB(a, b)
Definition: postprocess_template.c:81
f
f
Definition: af_crystalizer.c:122
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem_internal.h:109
transpose_8x16_char_fromPackedAlign_altivec
static void transpose_8x16_char_fromPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1121
avg
#define avg(a, b, c, d)
Definition: colorspacedsp_template.c:28
PREV
@ PREV
Definition: vf_fftdnoiz.c:37
diff
static av_always_inline int diff(const struct color_info *a, const struct color_info *b, const int trans_thresh)
Definition: vf_paletteuse.c:164
height
#define height
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
H_DEBLOCK
#define H_DEBLOCK
Definition: postprocess_internal.h:37
AV_LOG_INFO
#define AV_LOG_INFO
Standard information.
Definition: log.h:191
DERING
#define DERING
Definition: postprocess_internal.h:38
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
VISUALIZE
#define VISUALIZE
Definition: postprocess_internal.h:73
av_always_inline
#define av_always_inline
Definition: attributes.h:49
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
av_rescale
int64_t av_rescale(int64_t a, int64_t b, int64_t c)
Rescale a 64-bit integer with rounding to nearest.
Definition: mathematics.c:129
stride
#define stride
Definition: h264pred_template.c:537
NEXT
@ NEXT
Definition: vf_fftdnoiz.c:38
CUBIC_IPOL_DEINT_FILTER
#define CUBIC_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:65
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
vertClassify_altivec
static int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:59
XMM_CLOBBERS
#define XMM_CLOBBERS(...)
Definition: asm.h:98
TEMPLATE_PP_MMX
#define TEMPLATE_PP_MMX
Definition: postprocess_template.c:52
RENAME
#define RENAME(element)
Definition: ac3enc_template.c:44
doVertDefFilter_altivec
static void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:412
mode
mode
Definition: ebur128.h:83
ref
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:112
LINEAR_BLEND_DEINT_FILTER
#define LINEAR_BLEND_DEINT_FILTER
Definition: postprocess_internal.h:63
av_clip_uint8
#define av_clip_uint8
Definition: common.h:106
LINEAR_IPOL_DEINT_FILTER
#define LINEAR_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:62
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:291
H_X1_FILTER
#define H_X1_FILTER
Definition: postprocess_internal.h:55
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
BLOCK_SIZE
#define BLOCK_SIZE
Definition: adx.h:51
LEVEL_FIX
#define LEVEL_FIX
Brightness & Contrast.
Definition: postprocess_internal.h:39
min
float min
Definition: vorbis_enc_data.h:429