FFmpeg
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Modules Pages
postprocess_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 /**
22  * @file
23  * mmx/mmx2/sse2 postprocess code.
24  */
25 #include "config.h"
26 
27 #include "libavutil/mem_internal.h"
28 #if ARCH_X86
29 #include "libavutil/x86/asm.h"
30 #endif
31 
32 /* A single TEMPLATE_PP_* should be defined (to 1) when this template is
33  * included. The following macros will define its dependencies to 1 as well
34  * (like MMX2 depending on MMX), and will define to 0 all the others. Every
35  * TEMPLATE_PP_* need to be undef at the end. */
36 
/* Variant selection: the file that includes this template defines exactly one
 * TEMPLATE_PP_* macro (to 1).  Each #ifdef below either picks the matching
 * RENAME() suffix for that variant or defines the macro to 0, so that plain
 * "#if TEMPLATE_PP_xxx" tests work further down.  Note the dependency chain:
 * MMXEXT implies MMX, and SSE2 implies MMXEXT (and therefore MMX). */
#ifdef TEMPLATE_PP_C
#   define RENAME(a) a ## _C
#else
#   define TEMPLATE_PP_C 0
#endif

#ifdef TEMPLATE_PP_ALTIVEC
#   define RENAME(a) a ## _altivec
#else
#   define TEMPLATE_PP_ALTIVEC 0
#endif

#ifdef TEMPLATE_PP_MMX
#   define RENAME(a) a ## _MMX
#else
#   define TEMPLATE_PP_MMX 0
#endif

#ifdef TEMPLATE_PP_MMXEXT
/* MMXEXT requires the base MMX code paths as well. */
#   undef  TEMPLATE_PP_MMX
#   define TEMPLATE_PP_MMX 1
#   define RENAME(a) a ## _MMX2
#else
#   define TEMPLATE_PP_MMXEXT 0
#endif

#ifdef TEMPLATE_PP_SSE2
/* SSE2 requires both the MMX and MMXEXT code paths. */
#   undef  TEMPLATE_PP_MMX
#   define TEMPLATE_PP_MMX 1
#   undef  TEMPLATE_PP_MMXEXT
#   define TEMPLATE_PP_MMXEXT 1
#   define RENAME(a) a ## _SSE2
#else
#   define TEMPLATE_PP_SSE2 0
#endif

/* Helper macros emitting single asm instructions as strings; only defined for
 * variants that actually have the instructions (pavgb/pminub/pmaxub are
 * MMXEXT+).  The REAL_/plain pair forces macro-argument expansion before
 * stringification. */
#undef REAL_PAVGB
#undef PAVGB
#undef PMINUB
#undef PMAXUB

#if TEMPLATE_PP_MMXEXT
#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#endif
#define PAVGB(a,b)  REAL_PAVGB(a,b)

#if TEMPLATE_PP_MMXEXT
/* t is an unused scratch-register parameter kept for signature compatibility
 * with other (older) implementations of this macro. */
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#endif

#if TEMPLATE_PP_MMXEXT
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#endif
90 
91 //FIXME? |255-0| = 1 (should not be a problem ...)
92 #if TEMPLATE_PP_MMXEXT
93 /**
94  * Check if the middle 8x8 Block in the given 8x16 block is flat
95  */
/**
 * Check if the middle 8x8 block in the given 8x16 block is flat.
 *
 * Counts, per line pair, bytes whose difference (biased by mmxDcOffset) is
 * below mmxDcThreshold, and in parallel tracks the per-column min/max over
 * the 8 lines to decide whether the total dynamic range stays within 2*QP.
 *
 * @return 0 if the block is flat and its dynamic range is <= 2*QP (strong/DC
 *         filtering ok), 1 if flat but range too large, 2 if not flat.
 */
static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContext *c){
    int numEq= 0, dcOk;
    src+= stride*4; // src points to begin of the 8x8 Block
    __asm__ volatile(
        "movq %0, %%mm7                         \n\t" // mm7 = dc offset (bias added before threshold compare)
        "movq %1, %%mm6                         \n\t" // mm6 = dc threshold
        : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
        );

    __asm__ volatile(
        "lea (%2, %3), %%"FF_REG_a"             \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %1      eax     eax+%2  eax+2%2 %1+4%2  ecx     ecx+%2  ecx+2%2 %1+8%2  ecx+4%2

        // mm3/mm4 accumulate per-column min/max; mm0 accumulates the
        // byte-wise count of "equal enough" neighbours (as negative bytes).
        "movq (%2), %%mm0                       \n\t"
        "movq (%%"FF_REG_a"), %%mm1             \n\t"
        "movq %%mm0, %%mm3                      \n\t"
        "movq %%mm0, %%mm4                      \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm0                     \n\t" // mm0 = difference
        "paddb %%mm7, %%mm0                     \n\t"
        "pcmpgtb %%mm6, %%mm0                   \n\t"

        "movq (%%"FF_REG_a",%3), %%mm2          \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1                     \n\t"
        "paddb %%mm7, %%mm1                     \n\t"
        "pcmpgtb %%mm6, %%mm1                   \n\t"
        "paddb %%mm1, %%mm0                     \n\t"

        "movq (%%"FF_REG_a", %3, 2), %%mm1      \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2                     \n\t"
        "paddb %%mm7, %%mm2                     \n\t"
        "pcmpgtb %%mm6, %%mm2                   \n\t"
        "paddb %%mm2, %%mm0                     \n\t"

        "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"

        "movq (%2, %3, 4), %%mm2                \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1                     \n\t"
        "paddb %%mm7, %%mm1                     \n\t"
        "pcmpgtb %%mm6, %%mm1                   \n\t"
        "paddb %%mm1, %%mm0                     \n\t"

        "movq (%%"FF_REG_a"), %%mm1             \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2                     \n\t"
        "paddb %%mm7, %%mm2                     \n\t"
        "pcmpgtb %%mm6, %%mm2                   \n\t"
        "paddb %%mm2, %%mm0                     \n\t"

        "movq (%%"FF_REG_a", %3), %%mm2         \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1                     \n\t"
        "paddb %%mm7, %%mm1                     \n\t"
        "pcmpgtb %%mm6, %%mm1                   \n\t"
        "paddb %%mm1, %%mm0                     \n\t"

        "movq (%%"FF_REG_a", %3, 2), %%mm1      \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2                     \n\t"
        "paddb %%mm7, %%mm2                     \n\t"
        "pcmpgtb %%mm6, %%mm2                   \n\t"
        "paddb %%mm2, %%mm0                     \n\t"
        "psubusb %%mm3, %%mm4                   \n\t" // mm4 = per-column (max - min)

        "                                       \n\t"
        "pxor %%mm7, %%mm7                      \n\t"
        "psadbw %%mm7, %%mm0                    \n\t" // horizontal sum of the (negative) count bytes
        "movq %4, %%mm7                         \n\t" // QP,..., QP
        "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
        "psubusb %%mm7, %%mm4                   \n\t" // Diff <= 2QP -> 0
        "packssdw %%mm4, %%mm4                  \n\t"
        "movd %%mm0, %0                         \n\t"
        "movd %%mm4, %1                         \n\t" // nonzero iff some column range exceeded 2QP

        : "=r" (numEq), "=r" (dcOk)
        : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
        : "%"FF_REG_a
        );

    // pcmpgtb produced -1 per matching byte, psadbw summed |bytes|=255 each;
    // the low 8 bits of the negated sum recover the match count.
    numEq= (-numEq) &0xFF;
    if(numEq > c->ppMode.flatnessThreshold){
        if(dcOk) return 0;
        else     return 1;
    }else{
        return 2;
    }
}
194 #endif //TEMPLATE_PP_MMXEXT
195 
196 /**
197  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
198  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
199  */
200 #if !TEMPLATE_PP_ALTIVEC
/**
 * Vertical low-pass filter over the 8x16 input block, writing only the middle
 * 8x8 block, with the 9-tap kernel (1,1,2,2,4,2,2,1,1)/16.
 *
 * The first/last sample of each column is replaced by its inner neighbour
 * when the edge step exceeds QP, so strong block-boundary steps are not
 * smeared into the filtered output.
 *
 * @param src    pointer to the top of the 8x16 block (8 wide, 16 high)
 * @param stride line size in bytes
 * @param c      postprocessing context (QP is read from it)
 */
static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
{
#if TEMPLATE_PP_MMXEXT
    src+= stride*3;
    __asm__ volatile(        //"movv %0 %1 %2\n\t"
        "movq %2, %%mm0                         \n\t"  // QP,..., QP
        "pxor %%mm4, %%mm4                      \n\t"

        // Clip the top edge: if |line0 - line1| > QP, use line1 instead of
        // line0 as the first filter tap (conditional select via mask mm2).
        "movq (%0), %%mm6                       \n\t"
        "movq (%0, %1), %%mm5                   \n\t"
        "movq %%mm5, %%mm1                      \n\t"
        "movq %%mm6, %%mm2                      \n\t"
        "psubusb %%mm6, %%mm5                   \n\t"
        "psubusb %%mm1, %%mm2                   \n\t"
        "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
        "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
        "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF

        "pand %%mm2, %%mm6                      \n\t"
        "pandn %%mm1, %%mm2                     \n\t"
        "por %%mm2, %%mm6                       \n\t" // First Line to Filter

        // Same edge clip for the bottom edge (lines 8/9).
        "movq (%0, %1, 8), %%mm5                \n\t"
        "lea (%0, %1, 4), %%"FF_REG_a"          \n\t"
        "lea (%0, %1, 8), %%"FF_REG_c"          \n\t"
        "sub %1, %%"FF_REG_c"                   \n\t"
        "add %1, %0                             \n\t" // %0 points to line 1 not 0
        "movq (%0, %1, 8), %%mm7                \n\t"
        "movq %%mm5, %%mm1                      \n\t"
        "movq %%mm7, %%mm2                      \n\t"
        "psubusb %%mm7, %%mm5                   \n\t"
        "psubusb %%mm1, %%mm2                   \n\t"
        "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
        "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
        "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF

        "pand %%mm2, %%mm7                      \n\t"
        "pandn %%mm1, %%mm2                     \n\t"
        "por %%mm2, %%mm7                       \n\t" // Last Line to Filter

        // The 9-tap filter is built from chains of pavgb (averages); the
        // digit comments track the accumulated tap weights / divisor.
        //      1       2       3       4       5       6       7       8
        //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ecx     eax+4%1
        // 6 4 2 2 1 1
        // 6 4 4 2
        // 6 8 2

        "movq (%0, %1), %%mm0                   \n\t" //  1
        "movq %%mm0, %%mm1                      \n\t" //  1
        PAVGB(%%mm6, %%mm0)                           //1 1        /2
        PAVGB(%%mm6, %%mm0)                           //3 1        /4

        "movq (%0, %1, 4), %%mm2                \n\t" //     1
        "movq %%mm2, %%mm5                      \n\t" //     1
        PAVGB((%%FF_REGa), %%mm2)                     //    11     /2
        PAVGB((%0, %1, 2), %%mm2)                     //   211     /4
        "movq %%mm2, %%mm3                      \n\t" //   211     /4
        "movq (%0), %%mm4                       \n\t" // 1
        PAVGB(%%mm4, %%mm3)                           // 4 211     /8
        PAVGB(%%mm0, %%mm3)                           //642211     /16
        "movq %%mm3, (%0)                       \n\t" // X
        // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
        "movq %%mm1, %%mm0                      \n\t" //  1
        PAVGB(%%mm6, %%mm0)                           //1 1        /2
        "movq %%mm4, %%mm3                      \n\t" // 1
        PAVGB((%0,%1,2), %%mm3)                       // 1 1       /2
        PAVGB((%%FF_REGa,%1,2), %%mm5)                //     11    /2
        PAVGB((%%FF_REGa), %%mm5)                     //    211    /4
        PAVGB(%%mm5, %%mm3)                           // 2 2211    /8
        PAVGB(%%mm0, %%mm3)                           //4242211    /16
        "movq %%mm3, (%0,%1)                    \n\t" //  X
        // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
        PAVGB(%%mm4, %%mm6)                           //11         /2
        "movq (%%"FF_REG_c"), %%mm0             \n\t" //       1
        PAVGB((%%FF_REGa, %1, 2), %%mm0)              //      11/2
        "movq %%mm0, %%mm3                      \n\t" //      11/2
        PAVGB(%%mm1, %%mm0)                           //  2   11/4
        PAVGB(%%mm6, %%mm0)                           //222   11/8
        PAVGB(%%mm2, %%mm0)                           //22242211/16
        "movq (%0, %1, 2), %%mm2                \n\t" //   1
        "movq %%mm0, (%0, %1, 2)                \n\t" //   X
        // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
        "movq (%%"FF_REG_a", %1, 4), %%mm0      \n\t" //        1
        PAVGB((%%FF_REGc), %%mm0)                     //       11  /2
        PAVGB(%%mm0, %%mm6)                           //11     11  /4
        PAVGB(%%mm1, %%mm4)                           // 11        /2
        PAVGB(%%mm2, %%mm1)                           //  11       /2
        PAVGB(%%mm1, %%mm6)                           //1122   11  /8
        PAVGB(%%mm5, %%mm6)                           //112242211  /16
        "movq (%%"FF_REG_a"), %%mm5             \n\t" //    1
        "movq %%mm6, (%%"FF_REG_a")             \n\t" //    X
        // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
        "movq (%%"FF_REG_a", %1, 4), %%mm6      \n\t" //        1
        PAVGB(%%mm7, %%mm6)                           //        11 /2
        PAVGB(%%mm4, %%mm6)                           // 11     11 /4
        PAVGB(%%mm3, %%mm6)                           // 11   2211 /8
        PAVGB(%%mm5, %%mm2)                           //   11      /2
        "movq (%0, %1, 4), %%mm4                \n\t" //     1
        PAVGB(%%mm4, %%mm2)                           //   112     /4
        PAVGB(%%mm2, %%mm6)                           // 112242211 /16
        "movq %%mm6, (%0, %1, 4)                \n\t" //     X
        // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
        PAVGB(%%mm7, %%mm1)                           //  11     2 /4
        PAVGB(%%mm4, %%mm5)                           //    11     /2
        PAVGB(%%mm5, %%mm0)                           //    11 11  /4
        "movq (%%"FF_REG_a", %1, 2), %%mm6      \n\t" //      1
        PAVGB(%%mm6, %%mm1)                           //  11  4  2 /8
        PAVGB(%%mm0, %%mm1)                           //  11224222 /16
        "movq %%mm1, (%%"FF_REG_a", %1, 2)      \n\t" //      X
        // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
        PAVGB((%%FF_REGc), %%mm2)                     //   112 4   /8
        "movq (%%"FF_REG_a", %1, 4), %%mm0      \n\t" //        1
        PAVGB(%%mm0, %%mm6)                           //      1 1  /2
        PAVGB(%%mm7, %%mm6)                           //      1 12 /4
        PAVGB(%%mm2, %%mm6)                           //   1122424 /4
        "movq %%mm6, (%%"FF_REG_c")             \n\t" //       X
        // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
        PAVGB(%%mm7, %%mm5)                           //    11   2 /4
        PAVGB(%%mm7, %%mm5)                           //    11   6 /8

        PAVGB(%%mm3, %%mm0)                           //      112  /4
        PAVGB(%%mm0, %%mm5)                           //    112246 /16
        "movq %%mm5, (%%"FF_REG_a", %1, 4)      \n\t" //        X
        "sub %1, %0                             \n\t" // restore %0 (asm modified it above)

        :
        : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
        : "%"FF_REG_a, "%"FF_REG_c
        );
#else //TEMPLATE_PP_MMXEXT
    // C reference: per column, a rolling 9-tap sum is maintained; sums[i] and
    // sums[i+2] overlap so (sums[i] + sums[i+2] + 2*center) >> 4 realizes the
    // (1,1,2,2,4,2,2,1,1)/16 kernel with +4 rounding folded into sums[0].
    const int l1= stride;
    const int l2= stride + l1;
    const int l3= stride + l2;
    const int l4= stride + l3;
    const int l5= stride + l4;
    const int l6= stride + l5;
    const int l7= stride + l6;
    const int l8= stride + l7;
    const int l9= stride + l8;
    int x;
    src+= stride*3;
    for(x=0; x<BLOCK_SIZE; x++){
        // Edge clipping: replicate the inner neighbour when the boundary step
        // exceeds QP (same behavior as the asm path above).
        const int first= FFABS(src[0]  - src[l1]) < c->QP ? src[0]  : src[l1];
        const int last=  FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];

        int sums[10];
        sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
        sums[1] = sums[0] - first   + src[l4];
        sums[2] = sums[1] - first   + src[l5];
        sums[3] = sums[2] - first   + src[l6];
        sums[4] = sums[3] - first   + src[l7];
        sums[5] = sums[4] - src[l1] + src[l8];
        sums[6] = sums[5] - src[l2] + last;
        sums[7] = sums[6] - src[l3] + last;
        sums[8] = sums[7] - src[l4] + last;
        sums[9] = sums[8] - src[l5] + last;

        src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
        src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
        src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
        src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
        src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
        src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
        src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
        src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;

        src++;
    }
#endif //TEMPLATE_PP_MMXEXT
}
371 #endif //TEMPLATE_PP_ALTIVEC
372 
373 /**
374  * Experimental Filter 1
375  * will not damage linear gradients
376  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
377  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
378  * MMX2 version does correct clipping C version does not
379  */
380 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
381 {
382 #if TEMPLATE_PP_MMXEXT
383  src+= stride*3;
384 
385  __asm__ volatile(
386  "pxor %%mm7, %%mm7 \n\t" // 0
387  "lea (%0, %1), %%"FF_REG_a" \n\t"
388  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
389 // 0 1 2 3 4 5 6 7 8 9
390 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
391  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
392  "movq (%0, %1, 4), %%mm1 \n\t" // line 4
393  "movq %%mm1, %%mm2 \n\t" // line 4
394  "psubusb %%mm0, %%mm1 \n\t"
395  "psubusb %%mm2, %%mm0 \n\t"
396  "por %%mm1, %%mm0 \n\t" // |l2 - l3|
397  "movq (%%"FF_REG_c"), %%mm3 \n\t" // line 5
398  "movq (%%"FF_REG_c", %1), %%mm4 \n\t" // line 6
399  "movq %%mm3, %%mm5 \n\t" // line 5
400  "psubusb %%mm4, %%mm3 \n\t"
401  "psubusb %%mm5, %%mm4 \n\t"
402  "por %%mm4, %%mm3 \n\t" // |l5 - l6|
403  PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
404  "movq %%mm2, %%mm1 \n\t" // line 4
405  "psubusb %%mm5, %%mm2 \n\t"
406  "movq %%mm2, %%mm4 \n\t"
407  "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
408  "psubusb %%mm1, %%mm5 \n\t"
409  "por %%mm5, %%mm4 \n\t" // |l4 - l5|
410  "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
411  "movq %%mm4, %%mm3 \n\t" // d
412  "movq %2, %%mm0 \n\t"
413  "paddusb %%mm0, %%mm0 \n\t"
414  "psubusb %%mm0, %%mm4 \n\t"
415  "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
416  "psubusb "MANGLE(b01)", %%mm3 \n\t"
417  "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
418 
419  PAVGB(%%mm7, %%mm3) // d/2
420  "movq %%mm3, %%mm1 \n\t" // d/2
421  PAVGB(%%mm7, %%mm3) // d/4
422  PAVGB(%%mm1, %%mm3) // 3*d/8
423 
424  "movq (%0, %1, 4), %%mm0 \n\t" // line 4
425  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
426  "psubusb %%mm3, %%mm0 \n\t"
427  "pxor %%mm2, %%mm0 \n\t"
428  "movq %%mm0, (%0, %1, 4) \n\t" // line 4
429 
430  "movq (%%"FF_REG_c"), %%mm0 \n\t" // line 5
431  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
432  "paddusb %%mm3, %%mm0 \n\t"
433  "pxor %%mm2, %%mm0 \n\t"
434  "movq %%mm0, (%%"FF_REG_c") \n\t" // line 5
435 
436  PAVGB(%%mm7, %%mm1) // d/4
437 
438  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
439  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
440  "psubusb %%mm1, %%mm0 \n\t"
441  "pxor %%mm2, %%mm0 \n\t"
442  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t" // line 3
443 
444  "movq (%%"FF_REG_c", %1), %%mm0 \n\t" // line 6
445  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
446  "paddusb %%mm1, %%mm0 \n\t"
447  "pxor %%mm2, %%mm0 \n\t"
448  "movq %%mm0, (%%"FF_REG_c", %1) \n\t" // line 6
449 
450  PAVGB(%%mm7, %%mm1) // d/8
451 
452  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // line 2
453  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
454  "psubusb %%mm1, %%mm0 \n\t"
455  "pxor %%mm2, %%mm0 \n\t"
456  "movq %%mm0, (%%"FF_REG_a", %1) \n\t" // line 2
457 
458  "movq (%%"FF_REG_c", %1, 2), %%mm0 \n\t" // line 7
459  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
460  "paddusb %%mm1, %%mm0 \n\t"
461  "pxor %%mm2, %%mm0 \n\t"
462  "movq %%mm0, (%%"FF_REG_c", %1, 2) \n\t" // line 7
463 
464  :
465  : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
467  : "%"FF_REG_a, "%"FF_REG_c
468  );
469 #else //TEMPLATE_PP_MMXEXT
470 
471  const int l1= stride;
472  const int l2= stride + l1;
473  const int l3= stride + l2;
474  const int l4= stride + l3;
475  const int l5= stride + l4;
476  const int l6= stride + l5;
477  const int l7= stride + l6;
478 // const int l8= stride + l7;
479 // const int l9= stride + l8;
480  int x;
481 
482  src+= stride*3;
483  for(x=0; x<BLOCK_SIZE; x++){
484  int a= src[l3] - src[l4];
485  int b= src[l4] - src[l5];
486  int c= src[l5] - src[l6];
487 
488  int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
489  d= FFMAX(d, 0);
490 
491  if(d < co->QP*2){
492  int v = d * FFSIGN(-b);
493 
494  src[l2] +=v>>3;
495  src[l3] +=v>>2;
496  src[l4] +=(3*v)>>3;
497  src[l5] -=(3*v)>>3;
498  src[l6] -=v>>2;
499  src[l7] -=v>>3;
500  }
501  src++;
502  }
503 #endif //TEMPLATE_PP_MMXEXT
504 }
505 
506 #if !TEMPLATE_PP_ALTIVEC
/**
 * Default vertical deblocking filter: corrects the l4/l5 block-boundary pair.
 *
 * Computes a "middle energy" across the boundary (5*(l5-l4) + 2*(l3-l6));
 * when it is small (< 8*QP) a correction d, limited by the neighbouring
 * left/right energies and clipped to the local step q=(l4-l5)/2, is
 * subtracted from l4 and added to l5.  The C reference of the algorithm is
 * the #else branch at the bottom.
 */
static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
{
#if TEMPLATE_PP_MMXEXT
/*
    uint8_t tmp[16];
    const int l1= stride;
    const int l2= stride + l1;
    const int l3= stride + l2;
    const int l4= (int)tmp - (int)src - stride*3;
    const int l5= (int)tmp - (int)src - stride*3 + 8;
    const int l6= stride*3 + l3;
    const int l7= stride + l6;
    const int l8= stride + l7;

    memcpy(tmp, src+stride*7, 8);
    memcpy(tmp+8, src+stride*8, 8);
*/
    src+= stride*4;
    __asm__ volatile(

#if 0 //slightly more accurate and slightly slower
        "pxor %%mm7, %%mm7                      \n\t" // 0
        "lea (%0, %1), %%"FF_REG_a"             \n\t"
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
//      0       1       2       3       4       5       6       7
//      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1


        "movq (%0, %1, 2), %%mm0                \n\t" // l2
        "movq (%0), %%mm1                       \n\t" // l0
        "movq %%mm0, %%mm2                      \n\t" // l2
        PAVGB(%%mm7, %%mm0)                           // ~l2/2
        PAVGB(%%mm1, %%mm0)                           // ~(l2 + 2l0)/4
        PAVGB(%%mm2, %%mm0)                           // ~(5l2 + 2l0)/8

        "movq (%%"FF_REG_a"), %%mm1             \n\t" // l1
        "movq (%%"FF_REG_a", %1, 2), %%mm3      \n\t" // l3
        "movq %%mm1, %%mm4                      \n\t" // l1
        PAVGB(%%mm7, %%mm1)                           // ~l1/2
        PAVGB(%%mm3, %%mm1)                           // ~(l1 + 2l3)/4
        PAVGB(%%mm4, %%mm1)                           // ~(5l1 + 2l3)/8

        "movq %%mm0, %%mm4                      \n\t" // ~(5l2 + 2l0)/8
        "psubusb %%mm1, %%mm0                   \n\t"
        "psubusb %%mm4, %%mm1                   \n\t"
        "por %%mm0, %%mm1                       \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0

        "movq (%0, %1, 4), %%mm0                \n\t" // l4
        "movq %%mm0, %%mm4                      \n\t" // l4
        PAVGB(%%mm7, %%mm0)                           // ~l4/2
        PAVGB(%%mm2, %%mm0)                           // ~(l4 + 2l2)/4
        PAVGB(%%mm4, %%mm0)                           // ~(5l4 + 2l2)/8

        "movq (%%"FF_REG_c"), %%mm2             \n\t" // l5
        "movq %%mm3, %%mm5                      \n\t" // l3
        PAVGB(%%mm7, %%mm3)                           // ~l3/2
        PAVGB(%%mm2, %%mm3)                           // ~(l3 + 2l5)/4
        PAVGB(%%mm5, %%mm3)                           // ~(5l3 + 2l5)/8

        "movq %%mm0, %%mm6                      \n\t" // ~(5l4 + 2l2)/8
        "psubusb %%mm3, %%mm0                   \n\t"
        "psubusb %%mm6, %%mm3                   \n\t"
        "por %%mm0, %%mm3                       \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
        "pcmpeqb %%mm7, %%mm0                   \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0

        "movq (%%"FF_REG_c", %1), %%mm6         \n\t" // l6
        "movq %%mm6, %%mm5                      \n\t" // l6
        PAVGB(%%mm7, %%mm6)                           // ~l6/2
        PAVGB(%%mm4, %%mm6)                           // ~(l6 + 2l4)/4
        PAVGB(%%mm5, %%mm6)                           // ~(5l6 + 2l4)/8

        "movq (%%"FF_REG_c", %1, 2), %%mm5      \n\t" // l7
        "movq %%mm2, %%mm4                      \n\t" // l5
        PAVGB(%%mm7, %%mm2)                           // ~l5/2
        PAVGB(%%mm5, %%mm2)                           // ~(l5 + 2l7)/4
        PAVGB(%%mm4, %%mm2)                           // ~(5l5 + 2l7)/8

        "movq %%mm6, %%mm4                      \n\t" // ~(5l6 + 2l4)/8
        "psubusb %%mm2, %%mm6                   \n\t"
        "psubusb %%mm4, %%mm2                   \n\t"
        "por %%mm6, %%mm2                       \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0


        PMINUB(%%mm2, %%mm1, %%mm4)                   // MIN(|lenergy|,|renergy|)/8
        "movq %2, %%mm4                         \n\t" // QP //FIXME QP+1 ?
        "paddusb "MANGLE(b01)", %%mm4           \n\t"
        "pcmpgtb %%mm3, %%mm4                   \n\t" // |menergy|/8 < QP
        "psubusb %%mm1, %%mm3                   \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
        "pand %%mm4, %%mm3                      \n\t"

        "movq %%mm3, %%mm1                      \n\t"
//        "psubusb "MANGLE(b01)", %%mm3           \n\t"
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm7, %%mm3)
        "paddusb %%mm1, %%mm3                   \n\t"
//        "paddusb "MANGLE(b01)", %%mm3           \n\t"

        "movq (%%"FF_REG_a", %1, 2), %%mm6      \n\t" //l3
        "movq (%0, %1, 4), %%mm5                \n\t" //l4
        "movq (%0, %1, 4), %%mm4                \n\t" //l4
        "psubusb %%mm6, %%mm5                   \n\t"
        "psubusb %%mm4, %%mm6                   \n\t"
        "por %%mm6, %%mm5                       \n\t" // |l3-l4|
        "pcmpeqb %%mm7, %%mm6                   \n\t" // SIGN(l3-l4)
        "pxor %%mm6, %%mm0                      \n\t"
        "pand %%mm0, %%mm3                      \n\t"
        PMINUB(%%mm5, %%mm3, %%mm0)

        "psubusb "MANGLE(b01)", %%mm3           \n\t"
        PAVGB(%%mm7, %%mm3)

        "movq (%%"FF_REG_a", %1, 2), %%mm0      \n\t"
        "movq (%0, %1, 4), %%mm2                \n\t"
        "pxor %%mm6, %%mm0                      \n\t"
        "pxor %%mm6, %%mm2                      \n\t"
        "psubb %%mm3, %%mm0                     \n\t"
        "paddb %%mm3, %%mm2                     \n\t"
        "pxor %%mm6, %%mm0                      \n\t"
        "pxor %%mm6, %%mm2                      \n\t"
        "movq %%mm0, (%%"FF_REG_a", %1, 2)      \n\t"
        "movq %%mm2, (%0, %1, 4)                \n\t"
#endif //0

        // Active variant: energies are computed biased by 128 so that signed
        // quantities fit in the unsigned byte domain of pavgb.
        "lea (%0, %1), %%"FF_REG_a"             \n\t"
        "pcmpeqb %%mm6, %%mm6                   \n\t" // -1
//      0       1       2       3       4       5       6       7
//      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1


        "movq (%%"FF_REG_a", %1, 2), %%mm1      \n\t" // l3
        "movq (%0, %1, 4), %%mm0                \n\t" // l4
        "pxor %%mm6, %%mm1                      \n\t" // -l3-1
        PAVGB(%%mm1, %%mm0)                           // -q+128 = (l4-l3+256)/2
// mm1=-l3-1, mm0=128-q

        "movq (%%"FF_REG_a", %1, 4), %%mm2      \n\t" // l5
        "movq (%%"FF_REG_a", %1), %%mm3         \n\t" // l2
        "pxor %%mm6, %%mm2                      \n\t" // -l5-1
        "movq %%mm2, %%mm5                      \n\t" // -l5-1
        "movq "MANGLE(b80)", %%mm4              \n\t" // 128
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
        PAVGB(%%mm3, %%mm2)                           // (l2-l5+256)/2
        PAVGB(%%mm0, %%mm4)                           // ~(l4-l3)/4 + 128
        PAVGB(%%mm2, %%mm4)                           // ~(l2-l5)/4 +(l4-l3)/8 + 128
        PAVGB(%%mm0, %%mm4)                           // ~(l2-l5)/8 +5(l4-l3)/16 + 128
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1

        "movq (%%"FF_REG_a"), %%mm2             \n\t" // l1
        "pxor %%mm6, %%mm2                      \n\t" // -l1-1
        PAVGB(%%mm3, %%mm2)                           // (l2-l1+256)/2
        PAVGB((%0), %%mm1)                            // (l0-l3+256)/2
        "movq "MANGLE(b80)", %%mm3              \n\t" // 128
        PAVGB(%%mm2, %%mm3)                           // ~(l2-l1)/4 + 128
        PAVGB(%%mm1, %%mm3)                           // ~(l0-l3)/4 +(l2-l1)/8 + 128
        PAVGB(%%mm2, %%mm3)                           // ~(l0-l3)/8 +5(l2-l1)/16 + 128
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1

        PAVGB((%%FF_REGc, %1), %%mm5)                 // (l6-l5+256)/2
        "movq (%%"FF_REG_c", %1, 2), %%mm1      \n\t" // l7
        "pxor %%mm6, %%mm1                      \n\t" // -l7-1
        PAVGB((%0, %1, 4), %%mm1)                     // (l4-l7+256)/2
        "movq "MANGLE(b80)", %%mm2              \n\t" // 128
        PAVGB(%%mm5, %%mm2)                           // ~(l6-l5)/4 + 128
        PAVGB(%%mm1, %%mm2)                           // ~(l4-l7)/4 +(l6-l5)/8 + 128
        PAVGB(%%mm5, %%mm2)                           // ~(l4-l7)/8 +5(l6-l5)/16 + 128
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128

        "movq "MANGLE(b00)", %%mm1              \n\t" // 0
        "movq "MANGLE(b00)", %%mm5              \n\t" // 0
        "psubb %%mm2, %%mm1                     \n\t" // 128 - renergy/16
        "psubb %%mm3, %%mm5                     \n\t" // 128 - lenergy/16
        PMAXUB(%%mm1, %%mm2)                          // 128 + |renergy/16|
        PMAXUB(%%mm5, %%mm3)                          // 128 + |lenergy/16|
        PMINUB(%%mm2, %%mm3, %%mm1)                   // 128 + MIN(|lenergy|,|renergy|)/16

// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128

        "movq "MANGLE(b00)", %%mm7              \n\t" // 0
        "movq %2, %%mm2                         \n\t" // QP
        PAVGB(%%mm6, %%mm2)                           // 128 + QP/2
        "psubb %%mm6, %%mm2                     \n\t"

        "movq %%mm4, %%mm1                      \n\t"
        "pcmpgtb %%mm7, %%mm1                   \n\t" // SIGN(menergy)
        "pxor %%mm1, %%mm4                      \n\t"
        "psubb %%mm1, %%mm4                     \n\t" // 128 + |menergy|/16
        "pcmpgtb %%mm4, %%mm2                   \n\t" // |menergy|/16 < QP/2
        "psubusb %%mm3, %%mm4                   \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16

        "movq %%mm4, %%mm3                      \n\t" // d
        "psubusb "MANGLE(b01)", %%mm4           \n\t"
        PAVGB(%%mm7, %%mm4)                           // d/32
        PAVGB(%%mm7, %%mm4)                           // (d + 32)/64
        "paddb %%mm3, %%mm4                     \n\t" // 5d/64
        "pand %%mm2, %%mm4                      \n\t"

        "movq "MANGLE(b80)", %%mm5              \n\t" // 128
        "psubb %%mm0, %%mm5                     \n\t" // q
        "paddsb %%mm6, %%mm5                    \n\t" // fix bad rounding
        "pcmpgtb %%mm5, %%mm7                   \n\t" // SIGN(q)
        "pxor %%mm7, %%mm5                      \n\t"

        PMINUB(%%mm5, %%mm4, %%mm3)                   // MIN(|q|, 5d/64)
        "pxor %%mm1, %%mm7                      \n\t" // SIGN(d*q)

        "pand %%mm7, %%mm4                      \n\t"
        "movq (%%"FF_REG_a", %1, 2), %%mm0      \n\t"
        "movq (%0, %1, 4), %%mm2                \n\t"
        "pxor %%mm1, %%mm0                      \n\t"
        "pxor %%mm1, %%mm2                      \n\t"
        "paddb %%mm4, %%mm0                     \n\t"
        "psubb %%mm4, %%mm2                     \n\t"
        "pxor %%mm1, %%mm0                      \n\t"
        "pxor %%mm1, %%mm2                      \n\t"
        "movq %%mm0, (%%"FF_REG_a", %1, 2)      \n\t"
        "movq %%mm2, (%0, %1, 4)                \n\t"

        :
        : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
          NAMED_CONSTRAINTS_ADD(b80,b00,b01)
        : "%"FF_REG_a, "%"FF_REG_c
    );

/*
    {
    int x;
    src-= stride;
    for(x=0; x<BLOCK_SIZE; x++){
        const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
        if(FFABS(middleEnergy)< 8*QP){
            const int q=(src[l4] - src[l5])/2;
            const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
            const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);

            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
            d= FFMAX(d, 0);

            d= (5*d + 32) >> 6;
            d*= FFSIGN(-middleEnergy);

            if(q>0){
                d= d<0 ? 0 : d;
                d= d>q ? q : d;
            }else{
                d= d>0 ? 0 : d;
                d= d<q ? q : d;
            }

            src[l4]-= d;
            src[l5]+= d;
        }
        src++;
    }
    src-=8;
    for(x=0; x<8; x++){
        int y;
        for(y=4; y<6; y++){
            int d= src[x+y*stride] - tmp[x+(y-4)*8];
            int ad= FFABS(d);
            static int max=0;
            static int sum=0;
            static int num=0;
            static int bias=0;

            if(max<ad) max=ad;
            sum+= ad>3 ? 1 : 0;
            if(ad>3){
                src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
            }
            if(y==4) bias+=d;
            num++;
            if(num%1000000 == 0){
                av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
            }
        }
    }
}
*/
#else //TEMPLATE_PP_MMXEXT
    // C reference implementation of the default filter (see function doc).
    const int l1= stride;
    const int l2= stride + l1;
    const int l3= stride + l2;
    const int l4= stride + l3;
    const int l5= stride + l4;
    const int l6= stride + l5;
    const int l7= stride + l6;
    const int l8= stride + l7;
//    const int l9= stride + l8;
    int x;
    src+= stride*3;
    for(x=0; x<BLOCK_SIZE; x++){
        const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
        if(FFABS(middleEnergy) < 8*c->QP){
            const int q=(src[l4] - src[l5])/2;
            const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
            const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);

            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
            d= FFMAX(d, 0);

            d= (5*d + 32) >> 6;
            d*= FFSIGN(-middleEnergy);

            // Clip the correction so it never overshoots the local step q.
            if(q>0){
                d = FFMAX(d, 0);
                d = FFMIN(d, q);
            }else{
                d = FFMIN(d, 0);
                d = FFMAX(d, q);
            }

            src[l4]-= d;
            src[l5]+= d;
        }
        src++;
    }
#endif //TEMPLATE_PP_MMXEXT
}
831 #endif //TEMPLATE_PP_ALTIVEC
832 
833 #if !TEMPLATE_PP_ALTIVEC
834 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
835 {
836 #if TEMPLATE_PP_MMXEXT && HAVE_7REGS
837  DECLARE_ALIGNED(8, uint64_t, tmp)[3];
838  __asm__ volatile(
839  "pxor %%mm6, %%mm6 \n\t"
840  "pcmpeqb %%mm7, %%mm7 \n\t"
841  "movq %2, %%mm0 \n\t"
842  "punpcklbw %%mm6, %%mm0 \n\t"
843  "psrlw $1, %%mm0 \n\t"
844  "psubw %%mm7, %%mm0 \n\t"
845  "packuswb %%mm0, %%mm0 \n\t"
846  "movq %%mm0, %3 \n\t"
847 
848  "lea (%0, %1), %%"FF_REG_a" \n\t"
849  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
850 
851 // 0 1 2 3 4 5 6 7 8 9
852 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
853 
854 #undef REAL_FIND_MIN_MAX
855 #undef FIND_MIN_MAX
856 #define REAL_FIND_MIN_MAX(addr)\
857  "movq " #addr ", %%mm0 \n\t"\
858  "pminub %%mm0, %%mm7 \n\t"\
859  "pmaxub %%mm0, %%mm6 \n\t"
860 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
861 
862 FIND_MIN_MAX((%%FF_REGa))
863 FIND_MIN_MAX((%%FF_REGa, %1))
864 FIND_MIN_MAX((%%FF_REGa, %1, 2))
865 FIND_MIN_MAX((%0, %1, 4))
866 FIND_MIN_MAX((%%FF_REGd))
867 FIND_MIN_MAX((%%FF_REGd, %1))
868 FIND_MIN_MAX((%%FF_REGd, %1, 2))
869 FIND_MIN_MAX((%0, %1, 8))
870 
871  "movq %%mm7, %%mm4 \n\t"
872  "psrlq $8, %%mm7 \n\t"
873  "pminub %%mm4, %%mm7 \n\t" // min of pixels
874  "pshufw $0xF9, %%mm7, %%mm4 \n\t"
875  "pminub %%mm4, %%mm7 \n\t" // min of pixels
876  "pshufw $0xFE, %%mm7, %%mm4 \n\t"
877  "pminub %%mm4, %%mm7 \n\t"
878 
879 
880  "movq %%mm6, %%mm4 \n\t"
881  "psrlq $8, %%mm6 \n\t"
882  "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
883  "pshufw $0xF9, %%mm6, %%mm4 \n\t"
884  "pmaxub %%mm4, %%mm6 \n\t"
885  "pshufw $0xFE, %%mm6, %%mm4 \n\t"
886  "pmaxub %%mm4, %%mm6 \n\t"
887  "movq %%mm6, %%mm0 \n\t" // max
888  "psubb %%mm7, %%mm6 \n\t" // max - min
889  "push %%"FF_REG_a" \n\t"
890  "movd %%mm6, %%eax \n\t"
891  "cmpb $"AV_STRINGIFY(DERING_THRESHOLD)", %%al \n\t"
892  "pop %%"FF_REG_a" \n\t"
893  " jb 1f \n\t"
894  PAVGB(%%mm0, %%mm7) // a=(max + min)/2
895  "punpcklbw %%mm7, %%mm7 \n\t"
896  "punpcklbw %%mm7, %%mm7 \n\t"
897  "punpcklbw %%mm7, %%mm7 \n\t"
898  "movq %%mm7, (%4) \n\t"
899 
900  "movq (%0), %%mm0 \n\t" // L10
901  "movq %%mm0, %%mm1 \n\t" // L10
902  "movq %%mm0, %%mm2 \n\t" // L10
903  "psllq $8, %%mm1 \n\t"
904  "psrlq $8, %%mm2 \n\t"
905  "movd -4(%0), %%mm3 \n\t"
906  "movd 8(%0), %%mm4 \n\t"
907  "psrlq $24, %%mm3 \n\t"
908  "psllq $56, %%mm4 \n\t"
909  "por %%mm3, %%mm1 \n\t" // L00
910  "por %%mm4, %%mm2 \n\t" // L20
911  "movq %%mm1, %%mm3 \n\t" // L00
912  PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
913  PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
914  "psubusb %%mm7, %%mm0 \n\t"
915  "psubusb %%mm7, %%mm2 \n\t"
916  "psubusb %%mm7, %%mm3 \n\t"
917  "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
918  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
919  "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
920  "paddb %%mm2, %%mm0 \n\t"
921  "paddb %%mm3, %%mm0 \n\t"
922 
923  "movq (%%"FF_REG_a"), %%mm2 \n\t" // L11
924  "movq %%mm2, %%mm3 \n\t" // L11
925  "movq %%mm2, %%mm4 \n\t" // L11
926  "psllq $8, %%mm3 \n\t"
927  "psrlq $8, %%mm4 \n\t"
928  "movd -4(%%"FF_REG_a"), %%mm5 \n\t"
929  "movd 8(%%"FF_REG_a"), %%mm6 \n\t"
930  "psrlq $24, %%mm5 \n\t"
931  "psllq $56, %%mm6 \n\t"
932  "por %%mm5, %%mm3 \n\t" // L01
933  "por %%mm6, %%mm4 \n\t" // L21
934  "movq %%mm3, %%mm5 \n\t" // L01
935  PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
936  PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
937  "psubusb %%mm7, %%mm2 \n\t"
938  "psubusb %%mm7, %%mm4 \n\t"
939  "psubusb %%mm7, %%mm5 \n\t"
940  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
941  "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
942  "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
943  "paddb %%mm4, %%mm2 \n\t"
944  "paddb %%mm5, %%mm2 \n\t"
945 // 0, 2, 3, 1
946 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
947  "movq " #src ", " #sx " \n\t" /* src[0] */\
948  "movq " #sx ", " #lx " \n\t" /* src[0] */\
949  "movq " #sx ", " #t0 " \n\t" /* src[0] */\
950  "psllq $8, " #lx " \n\t"\
951  "psrlq $8, " #t0 " \n\t"\
952  "movd -4" #src ", " #t1 " \n\t"\
953  "psrlq $24, " #t1 " \n\t"\
954  "por " #t1 ", " #lx " \n\t" /* src[-1] */\
955  "movd 8" #src ", " #t1 " \n\t"\
956  "psllq $56, " #t1 " \n\t"\
957  "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
958  "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
959  PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
960  PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
961  PAVGB(lx, pplx) \
962  "movq " #lx ", 8(%4) \n\t"\
963  "movq (%4), " #lx " \n\t"\
964  "psubusb " #lx ", " #t1 " \n\t"\
965  "psubusb " #lx ", " #t0 " \n\t"\
966  "psubusb " #lx ", " #sx " \n\t"\
967  "movq "MANGLE(b00)", " #lx " \n\t"\
968  "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
969  "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
970  "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
971  "paddb " #t1 ", " #t0 " \n\t"\
972  "paddb " #t0 ", " #sx " \n\t"\
973 \
974  PAVGB(plx, pplx) /* filtered */\
975  "movq " #dst ", " #t0 " \n\t" /* dst */\
976  "movq " #t0 ", " #t1 " \n\t" /* dst */\
977  "psubusb %3, " #t0 " \n\t"\
978  "paddusb %3, " #t1 " \n\t"\
979  PMAXUB(t0, pplx)\
980  PMINUB(t1, pplx, t0)\
981  "paddb " #sx ", " #ppsx " \n\t"\
982  "paddb " #psx ", " #ppsx " \n\t"\
983  "pand "MANGLE(b08)", " #ppsx " \n\t"\
984  "pcmpeqb " #lx ", " #ppsx " \n\t"\
985  "pand " #ppsx ", " #pplx " \n\t"\
986  "pandn " #dst ", " #ppsx " \n\t"\
987  "por " #pplx ", " #ppsx " \n\t"\
988  "movq " #ppsx ", " #dst " \n\t"\
989  "movq 8(%4), " #lx " \n\t"
990 
991 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
992  REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
993 /*
994 0000000
995 1111111
996 
997 1111110
998 1111101
999 1111100
1000 1111011
1001 1111010
1002 1111001
1003 
1004 1111000
1005 1110111
1006 
1007 */
1008 //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1009 DERING_CORE((%%FF_REGa) ,(%%FF_REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1010 DERING_CORE((%%FF_REGa, %1) ,(%%FF_REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1011 DERING_CORE((%%FF_REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1012 DERING_CORE((%0, %1, 4) ,(%%FF_REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1013 DERING_CORE((%%FF_REGd) ,(%%FF_REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1014 DERING_CORE((%%FF_REGd, %1) ,(%%FF_REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1015 DERING_CORE((%%FF_REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1016 DERING_CORE((%0, %1, 8) ,(%%FF_REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1017 
1018  "1: \n\t"
1019  : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp)
1020  NAMED_CONSTRAINTS_ADD(b00,b08)
1021  : "%"FF_REG_a, "%"FF_REG_d
1022  );
1023 #else // HAVE_7REGS && TEMPLATE_PP_MMXEXT
1024  int y;
1025  int min=255;
1026  int max=0;
1027  int avg;
1028  uint8_t *p;
1029  int s[10];
1030  const int QP2= c->QP/2 + 1;
1031 
1032  src --;
1033  for(y=1; y<9; y++){
1034  int x;
1035  p= src + stride*y;
1036  for(x=1; x<9; x++){
1037  p++;
1038  if(*p > max) max= *p;
1039  if(*p < min) min= *p;
1040  }
1041  }
1042  avg= (min + max + 1)>>1;
1043 
1044  if (max - min < DERING_THRESHOLD) return;
1045 
1046  for(y=0; y<10; y++){
1047  int t = 0;
1048 
1049  if(src[stride*y + 0] > avg) t+= 1;
1050  if(src[stride*y + 1] > avg) t+= 2;
1051  if(src[stride*y + 2] > avg) t+= 4;
1052  if(src[stride*y + 3] > avg) t+= 8;
1053  if(src[stride*y + 4] > avg) t+= 16;
1054  if(src[stride*y + 5] > avg) t+= 32;
1055  if(src[stride*y + 6] > avg) t+= 64;
1056  if(src[stride*y + 7] > avg) t+= 128;
1057  if(src[stride*y + 8] > avg) t+= 256;
1058  if(src[stride*y + 9] > avg) t+= 512;
1059 
1060  t |= (~t)<<16;
1061  t &= (t<<1) & (t>>1);
1062  s[y] = t;
1063  }
1064 
1065  for(y=1; y<9; y++){
1066  int t = s[y-1] & s[y] & s[y+1];
1067  t|= t>>16;
1068  s[y-1]= t;
1069  }
1070 
1071  for(y=1; y<9; y++){
1072  int x;
1073  int t = s[y-1];
1074 
1075  p= src + stride*y;
1076  for(x=1; x<9; x++){
1077  p++;
1078  if(t & (1<<x)){
1079  int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1080  +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1081  +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1082  f= (f + 8)>>4;
1083 
1084 #ifdef DEBUG_DERING_THRESHOLD
1085  __asm__ volatile("emms\n\t":);
1086  {
1087  static uint64_t numPixels=0;
1088  if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1089 // if((max-min)<20 || (max-min)*QP<200)
1090 // if((max-min)*QP < 500)
1091 // if(max-min<QP/2)
1092  if(max-min < 20){
1093  static int numSkipped=0;
1094  static int errorSum=0;
1095  static int worstQP=0;
1096  static int worstRange=0;
1097  static int worstDiff=0;
1098  int diff= (f - *p);
1099  int absDiff= FFABS(diff);
1100  int error= diff*diff;
1101 
1102  if(x==1 || x==8 || y==1 || y==8) continue;
1103 
1104  numSkipped++;
1105  if(absDiff > worstDiff){
1106  worstDiff= absDiff;
1107  worstQP= QP;
1108  worstRange= max-min;
1109  }
1110  errorSum+= error;
1111 
1112  if(1024LL*1024LL*1024LL % numSkipped == 0){
1113  av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1114  "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1115  (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
1116  worstDiff, (float)numSkipped/numPixels);
1117  }
1118  }
1119  }
1120 #endif
1121  if (*p + QP2 < f) *p= *p + QP2;
1122  else if(*p - QP2 > f) *p= *p - QP2;
1123  else *p=f;
1124  }
1125  }
1126  }
1127 #ifdef DEBUG_DERING_THRESHOLD
1128  if(max-min < 20){
1129  for(y=1; y<9; y++){
1130  int x;
1131  int t = 0;
1132  p= src + stride*y;
1133  for(x=1; x<9; x++){
1134  p++;
1135  *p = FFMIN(*p + 20, 255);
1136  }
1137  }
1138 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1139  }
1140 #endif
1141 #endif //TEMPLATE_PP_MMXEXT
1142 }
1143 #endif //TEMPLATE_PP_ALTIVEC
1144 
1145 /**
1146  * Deinterlace the given block by linearly interpolating every second line.
1147  * will be called for every 8x8 block and can read & write from line 4-15
1148  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1149  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1150  */
1151 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1152 {
1153 #if TEMPLATE_PP_MMXEXT
1154  src+= 4*stride;
1155  __asm__ volatile(
1156  "lea (%0, %1), %%"FF_REG_a" \n\t"
1157  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
1158 // 0 1 2 3 4 5 6 7 8 9
1159 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
1160 
1161  "movq (%0), %%mm0 \n\t"
1162  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1163  PAVGB(%%mm1, %%mm0)
1164  "movq %%mm0, (%%"FF_REG_a") \n\t"
1165  "movq (%0, %1, 4), %%mm0 \n\t"
1166  PAVGB(%%mm0, %%mm1)
1167  "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t"
1168  "movq (%%"FF_REG_c", %1), %%mm1 \n\t"
1169  PAVGB(%%mm1, %%mm0)
1170  "movq %%mm0, (%%"FF_REG_c") \n\t"
1171  "movq (%0, %1, 8), %%mm0 \n\t"
1172  PAVGB(%%mm0, %%mm1)
1173  "movq %%mm1, (%%"FF_REG_c", %1, 2) \n\t"
1174 
1175  : : "r" (src), "r" ((x86_reg)stride)
1176  : "%"FF_REG_a, "%"FF_REG_c
1177  );
1178 #else
1179  int a, b, x;
1180  src+= 4*stride;
1181 
1182  for(x=0; x<2; x++){
1183  a= *(uint32_t*)&src[stride*0];
1184  b= *(uint32_t*)&src[stride*2];
1185  *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1186  a= *(uint32_t*)&src[stride*4];
1187  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1188  b= *(uint32_t*)&src[stride*6];
1189  *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1190  a= *(uint32_t*)&src[stride*8];
1191  *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1192  src += 4;
1193  }
1194 #endif
1195 }
1196 
1197 /**
1198  * Deinterlace the given block by cubic interpolating every second line.
1199  * will be called for every 8x8 block and can read & write from line 4-15
1200  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1201  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1202  * this filter will read lines 3-15 and write 7-13
1203  */
1204 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1205 {
1206 #if TEMPLATE_PP_SSE2
1207  src+= stride*3;
1208  __asm__ volatile(
1209  "lea (%0, %1), %%"FF_REG_a" \n\t"
1210  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1211  "lea (%%"FF_REG_d", %1, 4), %%"FF_REG_c"\n\t"
1212  "add %1, %%"FF_REG_c" \n\t"
1213  "pxor %%xmm7, %%xmm7 \n\t"
1214 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1215  "movq " #a ", %%xmm0 \n\t"\
1216  "movq " #b ", %%xmm1 \n\t"\
1217  "movq " #d ", %%xmm2 \n\t"\
1218  "movq " #e ", %%xmm3 \n\t"\
1219  "pavgb %%xmm2, %%xmm1 \n\t"\
1220  "pavgb %%xmm3, %%xmm0 \n\t"\
1221  "punpcklbw %%xmm7, %%xmm0 \n\t"\
1222  "punpcklbw %%xmm7, %%xmm1 \n\t"\
1223  "psubw %%xmm1, %%xmm0 \n\t"\
1224  "psraw $3, %%xmm0 \n\t"\
1225  "psubw %%xmm0, %%xmm1 \n\t"\
1226  "packuswb %%xmm1, %%xmm1 \n\t"\
1227  "movlps %%xmm1, " #c " \n\t"
1228 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
1229 
1230 DEINT_CUBIC((%0) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd, %1))
1231 DEINT_CUBIC((%%FF_REGa, %1), (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%0, %1, 8))
1232 DEINT_CUBIC((%0, %1, 4) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGc))
1233 DEINT_CUBIC((%%FF_REGd, %1), (%0, %1, 8) , (%%FF_REGd, %1, 4), (%%FF_REGc) , (%%FF_REGc, %1, 2))
1234 
1235  : : "r" (src), "r" ((x86_reg)stride)
1236  :
1237  XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm7",)
1238  "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c
1239  );
1240 #undef REAL_DEINT_CUBIC
1241 #else //TEMPLATE_PP_SSE2
1242  int x;
1243  src+= stride*3;
1244  for(x=0; x<8; x++){
1245  src[stride*3] = av_clip_uint8((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1246  src[stride*5] = av_clip_uint8((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1247  src[stride*7] = av_clip_uint8((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1248  src[stride*9] = av_clip_uint8((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1249  src++;
1250  }
1251 #endif //TEMPLATE_PP_SSE2
1252 }
1253 
1254 /**
1255  * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1256  * will be called for every 8x8 block and can read & write from line 4-15
1257  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1258  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1259  * this filter will read lines 4-13 and write 5-11
1260  */
1261 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1262 {
1263 #if TEMPLATE_PP_MMXEXT
1264  src+= stride*4;
1265  __asm__ volatile(
1266  "lea (%0, %1), %%"FF_REG_a" \n\t"
1267  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1268  "pxor %%mm7, %%mm7 \n\t"
1269  "movq (%2), %%mm0 \n\t"
1270 // 0 1 2 3 4 5 6 7 8 9 10
1271 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1272 
1273 #define REAL_DEINT_FF(a,b,c,d)\
1274  "movq " #a ", %%mm1 \n\t"\
1275  "movq " #b ", %%mm2 \n\t"\
1276  "movq " #c ", %%mm3 \n\t"\
1277  "movq " #d ", %%mm4 \n\t"\
1278  PAVGB(%%mm3, %%mm1) \
1279  PAVGB(%%mm4, %%mm0) \
1280  "movq %%mm0, %%mm3 \n\t"\
1281  "punpcklbw %%mm7, %%mm0 \n\t"\
1282  "punpckhbw %%mm7, %%mm3 \n\t"\
1283  "movq %%mm1, %%mm4 \n\t"\
1284  "punpcklbw %%mm7, %%mm1 \n\t"\
1285  "punpckhbw %%mm7, %%mm4 \n\t"\
1286  "psllw $2, %%mm1 \n\t"\
1287  "psllw $2, %%mm4 \n\t"\
1288  "psubw %%mm0, %%mm1 \n\t"\
1289  "psubw %%mm3, %%mm4 \n\t"\
1290  "movq %%mm2, %%mm5 \n\t"\
1291  "movq %%mm2, %%mm0 \n\t"\
1292  "punpcklbw %%mm7, %%mm2 \n\t"\
1293  "punpckhbw %%mm7, %%mm5 \n\t"\
1294  "paddw %%mm2, %%mm1 \n\t"\
1295  "paddw %%mm5, %%mm4 \n\t"\
1296  "psraw $2, %%mm1 \n\t"\
1297  "psraw $2, %%mm4 \n\t"\
1298  "packuswb %%mm4, %%mm1 \n\t"\
1299  "movq %%mm1, " #b " \n\t"\
1300 
1301 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)
1302 
1303 DEINT_FF((%0) , (%%FF_REGa) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2))
1304 DEINT_FF((%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
1305 DEINT_FF((%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2))
1306 DEINT_FF((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))
1307 
1308  "movq %%mm0, (%2) \n\t"
1309  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
1310  : "%"FF_REG_a, "%"FF_REG_d
1311  );
1312 #else //TEMPLATE_PP_MMXEXT
1313  int x;
1314  src+= stride*4;
1315  for(x=0; x<8; x++){
1316  int t1= tmp[x];
1317  int t2= src[stride*1];
1318 
1319  src[stride*1]= av_clip_uint8((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1320  t1= src[stride*4];
1321  src[stride*3]= av_clip_uint8((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1322  t2= src[stride*6];
1323  src[stride*5]= av_clip_uint8((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1324  t1= src[stride*8];
1325  src[stride*7]= av_clip_uint8((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1326  tmp[x]= t1;
1327 
1328  src++;
1329  }
1330 #endif //TEMPLATE_PP_MMXEXT
1331 }
1332 
1333 /**
1334  * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter.
1335  * will be called for every 8x8 block and can read & write from line 4-15
1336  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1337  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1338  * this filter will read lines 4-13 and write 4-11
1339  */
1340 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1341 {
1342 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
1343  src+= stride*4;
1344  __asm__ volatile(
1345  "lea (%0, %1), %%"FF_REG_a" \n\t"
1346  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1347  "pxor %%mm7, %%mm7 \n\t"
1348  "movq (%2), %%mm0 \n\t"
1349  "movq (%3), %%mm1 \n\t"
1350 // 0 1 2 3 4 5 6 7 8 9 10
1351 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1352 
1353 #define REAL_DEINT_L5(t1,t2,a,b,c)\
1354  "movq " #a ", %%mm2 \n\t"\
1355  "movq " #b ", %%mm3 \n\t"\
1356  "movq " #c ", %%mm4 \n\t"\
1357  PAVGB(t2, %%mm3) \
1358  PAVGB(t1, %%mm4) \
1359  "movq %%mm2, %%mm5 \n\t"\
1360  "movq %%mm2, " #t1 " \n\t"\
1361  "punpcklbw %%mm7, %%mm2 \n\t"\
1362  "punpckhbw %%mm7, %%mm5 \n\t"\
1363  "movq %%mm2, %%mm6 \n\t"\
1364  "paddw %%mm2, %%mm2 \n\t"\
1365  "paddw %%mm6, %%mm2 \n\t"\
1366  "movq %%mm5, %%mm6 \n\t"\
1367  "paddw %%mm5, %%mm5 \n\t"\
1368  "paddw %%mm6, %%mm5 \n\t"\
1369  "movq %%mm3, %%mm6 \n\t"\
1370  "punpcklbw %%mm7, %%mm3 \n\t"\
1371  "punpckhbw %%mm7, %%mm6 \n\t"\
1372  "paddw %%mm3, %%mm3 \n\t"\
1373  "paddw %%mm6, %%mm6 \n\t"\
1374  "paddw %%mm3, %%mm2 \n\t"\
1375  "paddw %%mm6, %%mm5 \n\t"\
1376  "movq %%mm4, %%mm6 \n\t"\
1377  "punpcklbw %%mm7, %%mm4 \n\t"\
1378  "punpckhbw %%mm7, %%mm6 \n\t"\
1379  "psubw %%mm4, %%mm2 \n\t"\
1380  "psubw %%mm6, %%mm5 \n\t"\
1381  "psraw $2, %%mm2 \n\t"\
1382  "psraw $2, %%mm5 \n\t"\
1383  "packuswb %%mm5, %%mm2 \n\t"\
1384  "movq %%mm2, " #a " \n\t"\
1385 
1386 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)
1387 
1388 DEINT_L5(%%mm0, %%mm1, (%0) , (%%FF_REGa) , (%%FF_REGa, %1) )
1389 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa) , (%%FF_REGa, %1) , (%%FF_REGa, %1, 2))
1390 DEINT_L5(%%mm0, %%mm1, (%%FF_REGa, %1) , (%%FF_REGa, %1, 2), (%0, %1, 4) )
1391 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
1392 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1) )
1393 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd) , (%%FF_REGd, %1) , (%%FF_REGd, %1, 2))
1394 DEINT_L5(%%mm0, %%mm1, (%%FF_REGd, %1) , (%%FF_REGd, %1, 2), (%0, %1, 8) )
1395 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))
1396 
1397  "movq %%mm0, (%2) \n\t"
1398  "movq %%mm1, (%3) \n\t"
1399  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
1400  : "%"FF_REG_a, "%"FF_REG_d
1401  );
1402 #else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
1403  int x;
1404  src+= stride*4;
1405  for(x=0; x<8; x++){
1406  int t1= tmp[x];
1407  int t2= tmp2[x];
1408  int t3= src[0];
1409 
1410  src[stride*0]= av_clip_uint8((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1411  t1= src[stride*1];
1412  src[stride*1]= av_clip_uint8((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1413  t2= src[stride*2];
1414  src[stride*2]= av_clip_uint8((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1415  t3= src[stride*3];
1416  src[stride*3]= av_clip_uint8((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1417  t1= src[stride*4];
1418  src[stride*4]= av_clip_uint8((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1419  t2= src[stride*5];
1420  src[stride*5]= av_clip_uint8((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1421  t3= src[stride*6];
1422  src[stride*6]= av_clip_uint8((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1423  t1= src[stride*7];
1424  src[stride*7]= av_clip_uint8((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1425 
1426  tmp[x]= t3;
1427  tmp2[x]= t1;
1428 
1429  src++;
1430  }
1431 #endif // TEMPLATE_PP_MMXEXT && HAVE_6REGS
1432 }
1433 
1434 /**
1435  * Deinterlace the given block by filtering all lines with a (1 2 1) filter.
1436  * will be called for every 8x8 block and can read & write from line 4-15
1437  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1438  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1439  * this filter will read lines 4-13 and write 4-11
1440  */
1441 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1442 {
1443 #if TEMPLATE_PP_MMXEXT
1444  src+= 4*stride;
1445  __asm__ volatile(
1446  "lea (%0, %1), %%"FF_REG_a" \n\t"
1447  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1448 // 0 1 2 3 4 5 6 7 8 9
1449 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1450 
1451  "movq (%2), %%mm0 \n\t" // L0
1452  "movq (%%"FF_REG_a"), %%mm1 \n\t" // L2
1453  PAVGB(%%mm1, %%mm0) // L0+L2
1454  "movq (%0), %%mm2 \n\t" // L1
1455  PAVGB(%%mm2, %%mm0)
1456  "movq %%mm0, (%0) \n\t"
1457  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // L3
1458  PAVGB(%%mm0, %%mm2) // L1+L3
1459  PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1460  "movq %%mm2, (%%"FF_REG_a") \n\t"
1461  "movq (%%"FF_REG_a", %1, 2), %%mm2 \n\t" // L4
1462  PAVGB(%%mm2, %%mm1) // L2+L4
1463  PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1464  "movq %%mm1, (%%"FF_REG_a", %1) \n\t"
1465  "movq (%0, %1, 4), %%mm1 \n\t" // L5
1466  PAVGB(%%mm1, %%mm0) // L3+L5
1467  PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
1468  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
1469  "movq (%%"FF_REG_d"), %%mm0 \n\t" // L6
1470  PAVGB(%%mm0, %%mm2) // L4+L6
1471  PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
1472  "movq %%mm2, (%0, %1, 4) \n\t"
1473  "movq (%%"FF_REG_d", %1), %%mm2 \n\t" // L7
1474  PAVGB(%%mm2, %%mm1) // L5+L7
1475  PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
1476  "movq %%mm1, (%%"FF_REG_d") \n\t"
1477  "movq (%%"FF_REG_d", %1, 2), %%mm1 \n\t" // L8
1478  PAVGB(%%mm1, %%mm0) // L6+L8
1479  PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
1480  "movq %%mm0, (%%"FF_REG_d", %1) \n\t"
1481  "movq (%0, %1, 8), %%mm0 \n\t" // L9
1482  PAVGB(%%mm0, %%mm2) // L7+L9
1483  PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
1484  "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
1485  "movq %%mm1, (%2) \n\t"
1486 
1487  : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
1488  : "%"FF_REG_a, "%"FF_REG_d
1489  );
1490 #else //TEMPLATE_PP_MMXEXT
1491  int a, b, c, x;
1492  src+= 4*stride;
1493 
1494  for(x=0; x<2; x++){
1495  a= *(uint32_t*)&tmp[stride*0];
1496  b= *(uint32_t*)&src[stride*0];
1497  c= *(uint32_t*)&src[stride*1];
1498  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1499  *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1500 
1501  a= *(uint32_t*)&src[stride*2];
1502  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1503  *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1504 
1505  b= *(uint32_t*)&src[stride*3];
1506  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1507  *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1508 
1509  c= *(uint32_t*)&src[stride*4];
1510  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1511  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1512 
1513  a= *(uint32_t*)&src[stride*5];
1514  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1515  *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1516 
1517  b= *(uint32_t*)&src[stride*6];
1518  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1519  *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1520 
1521  c= *(uint32_t*)&src[stride*7];
1522  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1523  *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1524 
1525  a= *(uint32_t*)&src[stride*8];
1526  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1527  *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1528 
1529  *(uint32_t*)&tmp[stride*0]= c;
1530  src += 4;
1531  tmp += 4;
1532  }
1533 #endif //TEMPLATE_PP_MMXEXT
1534 }
1535 
1536 /**
1537  * Deinterlace the given block by applying a median filter to every second line.
1538  * will be called for every 8x8 block and can read & write from line 4-15,
1539  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1540  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1541  */
1542 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1543 {
1544 #if TEMPLATE_PP_MMXEXT
1545  src+= 4*stride;
1546  __asm__ volatile(
1547  "lea (%0, %1), %%"FF_REG_a" \n\t"
1548  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1549 // 0 1 2 3 4 5 6 7 8 9
1550 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1551 
1552  "movq (%0), %%mm0 \n\t"
1553  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
1554  "movq (%%"FF_REG_a"), %%mm1 \n\t"
1555  "movq %%mm0, %%mm3 \n\t"
1556  "pmaxub %%mm1, %%mm0 \n\t"
1557  "pminub %%mm3, %%mm1 \n\t"
1558  "pmaxub %%mm2, %%mm1 \n\t"
1559  "pminub %%mm1, %%mm0 \n\t"
1560  "movq %%mm0, (%%"FF_REG_a") \n\t"
1561 
1562  "movq (%0, %1, 4), %%mm0 \n\t"
1563  "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t"
1564  "movq %%mm2, %%mm3 \n\t"
1565  "pmaxub %%mm1, %%mm2 \n\t"
1566  "pminub %%mm3, %%mm1 \n\t"
1567  "pmaxub %%mm0, %%mm1 \n\t"
1568  "pminub %%mm1, %%mm2 \n\t"
1569  "movq %%mm2, (%%"FF_REG_a", %1, 2) \n\t"
1570 
1571  "movq (%%"FF_REG_d"), %%mm2 \n\t"
1572  "movq (%%"FF_REG_d", %1), %%mm1 \n\t"
1573  "movq %%mm2, %%mm3 \n\t"
1574  "pmaxub %%mm0, %%mm2 \n\t"
1575  "pminub %%mm3, %%mm0 \n\t"
1576  "pmaxub %%mm1, %%mm0 \n\t"
1577  "pminub %%mm0, %%mm2 \n\t"
1578  "movq %%mm2, (%%"FF_REG_d") \n\t"
1579 
1580  "movq (%%"FF_REG_d", %1, 2), %%mm2 \n\t"
1581  "movq (%0, %1, 8), %%mm0 \n\t"
1582  "movq %%mm2, %%mm3 \n\t"
1583  "pmaxub %%mm0, %%mm2 \n\t"
1584  "pminub %%mm3, %%mm0 \n\t"
1585  "pmaxub %%mm1, %%mm0 \n\t"
1586  "pminub %%mm0, %%mm2 \n\t"
1587  "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
1588 
1589 
1590  : : "r" (src), "r" ((x86_reg)stride)
1591  : "%"FF_REG_a, "%"FF_REG_d
1592  );
1593 
1594 #else //TEMPLATE_PP_MMX
1595  int x, y;
1596  src+= 4*stride;
1597  // FIXME - there should be a way to do a few columns in parallel like w/mmx
1598  for(x=0; x<8; x++){
1599  uint8_t *colsrc = src;
1600  for (y=0; y<4; y++){
1601  int a, b, c, d, e, f;
1602  a = colsrc[0 ];
1603  b = colsrc[stride ];
1604  c = colsrc[stride*2];
1605  d = (a-b)>>31;
1606  e = (b-c)>>31;
1607  f = (c-a)>>31;
1608  colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
1609  colsrc += stride*2;
1610  }
1611  src++;
1612  }
1613 #endif //TEMPLATE_PP_MMX
1614 }
1615 
1616 #if TEMPLATE_PP_MMX
1617 /**
1618  * Transpose and shift the given 8x8 Block into dst1 and dst2.
1619  */
1620 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, const uint8_t *src, int srcStride)
1621 {
1622  __asm__(
1623  "lea (%0, %1), %%"FF_REG_a" \n\t"
1624 // 0 1 2 3 4 5 6 7 8 9
1625 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1626  "movq (%0), %%mm0 \n\t" // 12345678
1627  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
1628  "movq %%mm0, %%mm2 \n\t" // 12345678
1629  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1630  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1631 
1632  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1633  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
1634  "movq %%mm1, %%mm4 \n\t"
1635  "punpcklbw %%mm3, %%mm1 \n\t"
1636  "punpckhbw %%mm3, %%mm4 \n\t"
1637 
1638  "movq %%mm0, %%mm3 \n\t"
1639  "punpcklwd %%mm1, %%mm0 \n\t"
1640  "punpckhwd %%mm1, %%mm3 \n\t"
1641  "movq %%mm2, %%mm1 \n\t"
1642  "punpcklwd %%mm4, %%mm2 \n\t"
1643  "punpckhwd %%mm4, %%mm1 \n\t"
1644 
1645  "movd %%mm0, 128(%2) \n\t"
1646  "psrlq $32, %%mm0 \n\t"
1647  "movd %%mm0, 144(%2) \n\t"
1648  "movd %%mm3, 160(%2) \n\t"
1649  "psrlq $32, %%mm3 \n\t"
1650  "movd %%mm3, 176(%2) \n\t"
1651  "movd %%mm3, 48(%3) \n\t"
1652  "movd %%mm2, 192(%2) \n\t"
1653  "movd %%mm2, 64(%3) \n\t"
1654  "psrlq $32, %%mm2 \n\t"
1655  "movd %%mm2, 80(%3) \n\t"
1656  "movd %%mm1, 96(%3) \n\t"
1657  "psrlq $32, %%mm1 \n\t"
1658  "movd %%mm1, 112(%3) \n\t"
1659 
1660  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_a"\n\t"
1661 
1662  "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
1663  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
1664  "movq %%mm0, %%mm2 \n\t" // 12345678
1665  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1666  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1667 
1668  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1669  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
1670  "movq %%mm1, %%mm4 \n\t"
1671  "punpcklbw %%mm3, %%mm1 \n\t"
1672  "punpckhbw %%mm3, %%mm4 \n\t"
1673 
1674  "movq %%mm0, %%mm3 \n\t"
1675  "punpcklwd %%mm1, %%mm0 \n\t"
1676  "punpckhwd %%mm1, %%mm3 \n\t"
1677  "movq %%mm2, %%mm1 \n\t"
1678  "punpcklwd %%mm4, %%mm2 \n\t"
1679  "punpckhwd %%mm4, %%mm1 \n\t"
1680 
1681  "movd %%mm0, 132(%2) \n\t"
1682  "psrlq $32, %%mm0 \n\t"
1683  "movd %%mm0, 148(%2) \n\t"
1684  "movd %%mm3, 164(%2) \n\t"
1685  "psrlq $32, %%mm3 \n\t"
1686  "movd %%mm3, 180(%2) \n\t"
1687  "movd %%mm3, 52(%3) \n\t"
1688  "movd %%mm2, 196(%2) \n\t"
1689  "movd %%mm2, 68(%3) \n\t"
1690  "psrlq $32, %%mm2 \n\t"
1691  "movd %%mm2, 84(%3) \n\t"
1692  "movd %%mm1, 100(%3) \n\t"
1693  "psrlq $32, %%mm1 \n\t"
1694  "movd %%mm1, 116(%3) \n\t"
1695 
1696 
1697  :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
1698  : "%"FF_REG_a
1699  );
1700 }
1701 
1702 /**
1703  * Transpose the given 8x8 block.
1704  */
1705 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, const uint8_t *src)
1706 {
1707  __asm__(
1708  "lea (%0, %1), %%"FF_REG_a" \n\t"
1709  "lea (%%"FF_REG_a",%1,4), %%"FF_REG_d" \n\t"
1710 // 0 1 2 3 4 5 6 7 8 9
1711 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1712  "movq (%2), %%mm0 \n\t" // 12345678
1713  "movq 16(%2), %%mm1 \n\t" // abcdefgh
1714  "movq %%mm0, %%mm2 \n\t" // 12345678
1715  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1716  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1717 
1718  "movq 32(%2), %%mm1 \n\t"
1719  "movq 48(%2), %%mm3 \n\t"
1720  "movq %%mm1, %%mm4 \n\t"
1721  "punpcklbw %%mm3, %%mm1 \n\t"
1722  "punpckhbw %%mm3, %%mm4 \n\t"
1723 
1724  "movq %%mm0, %%mm3 \n\t"
1725  "punpcklwd %%mm1, %%mm0 \n\t"
1726  "punpckhwd %%mm1, %%mm3 \n\t"
1727  "movq %%mm2, %%mm1 \n\t"
1728  "punpcklwd %%mm4, %%mm2 \n\t"
1729  "punpckhwd %%mm4, %%mm1 \n\t"
1730 
1731  "movd %%mm0, (%0) \n\t"
1732  "psrlq $32, %%mm0 \n\t"
1733  "movd %%mm0, (%%"FF_REG_a") \n\t"
1734  "movd %%mm3, (%%"FF_REG_a", %1) \n\t"
1735  "psrlq $32, %%mm3 \n\t"
1736  "movd %%mm3, (%%"FF_REG_a", %1, 2) \n\t"
1737  "movd %%mm2, (%0, %1, 4) \n\t"
1738  "psrlq $32, %%mm2 \n\t"
1739  "movd %%mm2, (%%"FF_REG_d") \n\t"
1740  "movd %%mm1, (%%"FF_REG_d", %1) \n\t"
1741  "psrlq $32, %%mm1 \n\t"
1742  "movd %%mm1, (%%"FF_REG_d", %1, 2) \n\t"
1743 
1744 
1745  "movq 64(%2), %%mm0 \n\t" // 12345678
1746  "movq 80(%2), %%mm1 \n\t" // abcdefgh
1747  "movq %%mm0, %%mm2 \n\t" // 12345678
1748  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1749  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1750 
1751  "movq 96(%2), %%mm1 \n\t"
1752  "movq 112(%2), %%mm3 \n\t"
1753  "movq %%mm1, %%mm4 \n\t"
1754  "punpcklbw %%mm3, %%mm1 \n\t"
1755  "punpckhbw %%mm3, %%mm4 \n\t"
1756 
1757  "movq %%mm0, %%mm3 \n\t"
1758  "punpcklwd %%mm1, %%mm0 \n\t"
1759  "punpckhwd %%mm1, %%mm3 \n\t"
1760  "movq %%mm2, %%mm1 \n\t"
1761  "punpcklwd %%mm4, %%mm2 \n\t"
1762  "punpckhwd %%mm4, %%mm1 \n\t"
1763 
1764  "movd %%mm0, 4(%0) \n\t"
1765  "psrlq $32, %%mm0 \n\t"
1766  "movd %%mm0, 4(%%"FF_REG_a") \n\t"
1767  "movd %%mm3, 4(%%"FF_REG_a", %1) \n\t"
1768  "psrlq $32, %%mm3 \n\t"
1769  "movd %%mm3, 4(%%"FF_REG_a", %1, 2) \n\t"
1770  "movd %%mm2, 4(%0, %1, 4) \n\t"
1771  "psrlq $32, %%mm2 \n\t"
1772  "movd %%mm2, 4(%%"FF_REG_d") \n\t"
1773  "movd %%mm1, 4(%%"FF_REG_d", %1) \n\t"
1774  "psrlq $32, %%mm1 \n\t"
1775  "movd %%mm1, 4(%%"FF_REG_d", %1, 2) \n\t"
1776 
1777  :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
1778  : "%"FF_REG_a, "%"FF_REG_d
1779  );
1780 }
1781 #endif //TEMPLATE_PP_MMX
1782 //static long test=0;
1783 
1784 #if !TEMPLATE_PP_ALTIVEC
1785 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
1786  uint8_t *tempBlurred, uint32_t *tempBlurredPast, const int *maxNoise)
1787 {
1788  // to save a register (FIXME do this outside of the loops)
1789  tempBlurredPast[127]= maxNoise[0];
1790  tempBlurredPast[128]= maxNoise[1];
1791  tempBlurredPast[129]= maxNoise[2];
1792 
1793 #define FAST_L2_DIFF
1794 //#define L1_DIFF //u should change the thresholds too if u try that one
1795 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
1796  __asm__ volatile(
1797  "lea (%2, %2, 2), %%"FF_REG_a" \n\t" // 3*stride
1798  "lea (%2, %2, 4), %%"FF_REG_d" \n\t" // 5*stride
1799  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1800 // 0 1 2 3 4 5 6 7 8 9
1801 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
1802 //FIXME reorder?
1803 #ifdef L1_DIFF //needs mmx2
1804  "movq (%0), %%mm0 \n\t" // L0
1805  "psadbw (%1), %%mm0 \n\t" // |L0-R0|
1806  "movq (%0, %2), %%mm1 \n\t" // L1
1807  "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
1808  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1809  "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
1810  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1811  "psadbw (%1, %%"FF_REG_a"), %%mm3 \n\t" // |L3-R3|
1812 
1813  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1814  "paddw %%mm1, %%mm0 \n\t"
1815  "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
1816  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1817  "paddw %%mm2, %%mm0 \n\t"
1818  "psadbw (%1, %%"FF_REG_d"), %%mm5 \n\t" // |L5-R5|
1819  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1820  "paddw %%mm3, %%mm0 \n\t"
1821  "psadbw (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // |L6-R6|
1822  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1823  "paddw %%mm4, %%mm0 \n\t"
1824  "psadbw (%1, %%"FF_REG_c"), %%mm7 \n\t" // |L7-R7|
1825  "paddw %%mm5, %%mm6 \n\t"
1826  "paddw %%mm7, %%mm6 \n\t"
1827  "paddw %%mm6, %%mm0 \n\t"
1828 #else //L1_DIFF
1829 #if defined (FAST_L2_DIFF)
1830  "pcmpeqb %%mm7, %%mm7 \n\t"
1831  "movq "MANGLE(b80)", %%mm6 \n\t"
1832  "pxor %%mm0, %%mm0 \n\t"
1833 #define REAL_L2_DIFF_CORE(a, b)\
1834  "movq " #a ", %%mm5 \n\t"\
1835  "movq " #b ", %%mm2 \n\t"\
1836  "pxor %%mm7, %%mm2 \n\t"\
1837  PAVGB(%%mm2, %%mm5)\
1838  "paddb %%mm6, %%mm5 \n\t"\
1839  "movq %%mm5, %%mm2 \n\t"\
1840  "psllw $8, %%mm5 \n\t"\
1841  "pmaddwd %%mm5, %%mm5 \n\t"\
1842  "pmaddwd %%mm2, %%mm2 \n\t"\
1843  "paddd %%mm2, %%mm5 \n\t"\
1844  "psrld $14, %%mm5 \n\t"\
1845  "paddd %%mm5, %%mm0 \n\t"
1846 
1847 #else //defined (FAST_L2_DIFF)
1848  "pxor %%mm7, %%mm7 \n\t"
1849  "pxor %%mm0, %%mm0 \n\t"
1850 #define REAL_L2_DIFF_CORE(a, b)\
1851  "movq " #a ", %%mm5 \n\t"\
1852  "movq " #b ", %%mm2 \n\t"\
1853  "movq %%mm5, %%mm1 \n\t"\
1854  "movq %%mm2, %%mm3 \n\t"\
1855  "punpcklbw %%mm7, %%mm5 \n\t"\
1856  "punpckhbw %%mm7, %%mm1 \n\t"\
1857  "punpcklbw %%mm7, %%mm2 \n\t"\
1858  "punpckhbw %%mm7, %%mm3 \n\t"\
1859  "psubw %%mm2, %%mm5 \n\t"\
1860  "psubw %%mm3, %%mm1 \n\t"\
1861  "pmaddwd %%mm5, %%mm5 \n\t"\
1862  "pmaddwd %%mm1, %%mm1 \n\t"\
1863  "paddd %%mm1, %%mm5 \n\t"\
1864  "paddd %%mm5, %%mm0 \n\t"
1865 
1866 #endif //defined (FAST_L2_DIFF)
1867 
1868 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
1869 
1870 L2_DIFF_CORE((%0) , (%1))
1871 L2_DIFF_CORE((%0, %2) , (%1, %2))
1872 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
1873 L2_DIFF_CORE((%0, %%FF_REGa) , (%1, %%FF_REGa))
1874 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
1875 L2_DIFF_CORE((%0, %%FF_REGd) , (%1, %%FF_REGd))
1876 L2_DIFF_CORE((%0, %%FF_REGa,2), (%1, %%FF_REGa,2))
1877 L2_DIFF_CORE((%0, %%FF_REGc) , (%1, %%FF_REGc))
1878 
1879 #endif //L1_DIFF
1880 
1881  "movq %%mm0, %%mm4 \n\t"
1882  "psrlq $32, %%mm0 \n\t"
1883  "paddd %%mm0, %%mm4 \n\t"
1884  "movd %%mm4, %%ecx \n\t"
1885  "shll $2, %%ecx \n\t"
1886  "mov %3, %%"FF_REG_d" \n\t"
1887  "addl -4(%%"FF_REG_d"), %%ecx \n\t"
1888  "addl 4(%%"FF_REG_d"), %%ecx \n\t"
1889  "addl -1024(%%"FF_REG_d"), %%ecx \n\t"
1890  "addl $4, %%ecx \n\t"
1891  "addl 1024(%%"FF_REG_d"), %%ecx \n\t"
1892  "shrl $3, %%ecx \n\t"
1893  "movl %%ecx, (%%"FF_REG_d") \n\t"
1894 
1895 // "mov %3, %%"FF_REG_c" \n\t"
1896 // "mov %%"FF_REG_c", test \n\t"
1897 // "jmp 4f \n\t"
1898  "cmpl 512(%%"FF_REG_d"), %%ecx \n\t"
1899  " jb 2f \n\t"
1900  "cmpl 516(%%"FF_REG_d"), %%ecx \n\t"
1901  " jb 1f \n\t"
1902 
1903  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1904  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1905  "movq (%0), %%mm0 \n\t" // L0
1906  "movq (%0, %2), %%mm1 \n\t" // L1
1907  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1908  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1909  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1910  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1911  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1912  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1913  "movq %%mm0, (%1) \n\t" // L0
1914  "movq %%mm1, (%1, %2) \n\t" // L1
1915  "movq %%mm2, (%1, %2, 2) \n\t" // L2
1916  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // L3
1917  "movq %%mm4, (%1, %2, 4) \n\t" // L4
1918  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // L5
1919  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // L6
1920  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // L7
1921  "jmp 4f \n\t"
1922 
1923  "1: \n\t"
1924  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1925  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1926  "movq (%0), %%mm0 \n\t" // L0
1927  PAVGB((%1), %%mm0) // L0
1928  "movq (%0, %2), %%mm1 \n\t" // L1
1929  PAVGB((%1, %2), %%mm1) // L1
1930  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1931  PAVGB((%1, %2, 2), %%mm2) // L2
1932  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1933  PAVGB((%1, %%FF_REGa), %%mm3) // L3
1934  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1935  PAVGB((%1, %2, 4), %%mm4) // L4
1936  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1937  PAVGB((%1, %%FF_REGd), %%mm5) // L5
1938  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1939  PAVGB((%1, %%FF_REGa, 2), %%mm6) // L6
1940  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1941  PAVGB((%1, %%FF_REGc), %%mm7) // L7
1942  "movq %%mm0, (%1) \n\t" // R0
1943  "movq %%mm1, (%1, %2) \n\t" // R1
1944  "movq %%mm2, (%1, %2, 2) \n\t" // R2
1945  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
1946  "movq %%mm4, (%1, %2, 4) \n\t" // R4
1947  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // R5
1948  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // R6
1949  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // R7
1950  "movq %%mm0, (%0) \n\t" // L0
1951  "movq %%mm1, (%0, %2) \n\t" // L1
1952  "movq %%mm2, (%0, %2, 2) \n\t" // L2
1953  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
1954  "movq %%mm4, (%0, %2, 4) \n\t" // L4
1955  "movq %%mm5, (%0, %%"FF_REG_d") \n\t" // L5
1956  "movq %%mm6, (%0, %%"FF_REG_a", 2) \n\t" // L6
1957  "movq %%mm7, (%0, %%"FF_REG_c") \n\t" // L7
1958  "jmp 4f \n\t"
1959 
1960  "2: \n\t"
1961  "cmpl 508(%%"FF_REG_d"), %%ecx \n\t"
1962  " jb 3f \n\t"
1963 
1964  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1965  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1966  "movq (%0), %%mm0 \n\t" // L0
1967  "movq (%0, %2), %%mm1 \n\t" // L1
1968  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1969  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1970  "movq (%1), %%mm4 \n\t" // R0
1971  "movq (%1, %2), %%mm5 \n\t" // R1
1972  "movq (%1, %2, 2), %%mm6 \n\t" // R2
1973  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
1974  PAVGB(%%mm4, %%mm0)
1975  PAVGB(%%mm5, %%mm1)
1976  PAVGB(%%mm6, %%mm2)
1977  PAVGB(%%mm7, %%mm3)
1978  PAVGB(%%mm4, %%mm0)
1979  PAVGB(%%mm5, %%mm1)
1980  PAVGB(%%mm6, %%mm2)
1981  PAVGB(%%mm7, %%mm3)
1982  "movq %%mm0, (%1) \n\t" // R0
1983  "movq %%mm1, (%1, %2) \n\t" // R1
1984  "movq %%mm2, (%1, %2, 2) \n\t" // R2
1985  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
1986  "movq %%mm0, (%0) \n\t" // L0
1987  "movq %%mm1, (%0, %2) \n\t" // L1
1988  "movq %%mm2, (%0, %2, 2) \n\t" // L2
1989  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
1990 
1991  "movq (%0, %2, 4), %%mm0 \n\t" // L4
1992  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
1993  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
1994  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
1995  "movq (%1, %2, 4), %%mm4 \n\t" // R4
1996  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
1997  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
1998  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
1999  PAVGB(%%mm4, %%mm0)
2000  PAVGB(%%mm5, %%mm1)
2001  PAVGB(%%mm6, %%mm2)
2002  PAVGB(%%mm7, %%mm3)
2003  PAVGB(%%mm4, %%mm0)
2004  PAVGB(%%mm5, %%mm1)
2005  PAVGB(%%mm6, %%mm2)
2006  PAVGB(%%mm7, %%mm3)
2007  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2008  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2009  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2010  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2011  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2012  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2013  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2014  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2015  "jmp 4f \n\t"
2016 
2017  "3: \n\t"
2018  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2019  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2020  "movq (%0), %%mm0 \n\t" // L0
2021  "movq (%0, %2), %%mm1 \n\t" // L1
2022  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2023  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2024  "movq (%1), %%mm4 \n\t" // R0
2025  "movq (%1, %2), %%mm5 \n\t" // R1
2026  "movq (%1, %2, 2), %%mm6 \n\t" // R2
2027  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
2028  PAVGB(%%mm4, %%mm0)
2029  PAVGB(%%mm5, %%mm1)
2030  PAVGB(%%mm6, %%mm2)
2031  PAVGB(%%mm7, %%mm3)
2032  PAVGB(%%mm4, %%mm0)
2033  PAVGB(%%mm5, %%mm1)
2034  PAVGB(%%mm6, %%mm2)
2035  PAVGB(%%mm7, %%mm3)
2036  PAVGB(%%mm4, %%mm0)
2037  PAVGB(%%mm5, %%mm1)
2038  PAVGB(%%mm6, %%mm2)
2039  PAVGB(%%mm7, %%mm3)
2040  "movq %%mm0, (%1) \n\t" // R0
2041  "movq %%mm1, (%1, %2) \n\t" // R1
2042  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2043  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2044  "movq %%mm0, (%0) \n\t" // L0
2045  "movq %%mm1, (%0, %2) \n\t" // L1
2046  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2047  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2048 
2049  "movq (%0, %2, 4), %%mm0 \n\t" // L4
2050  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
2051  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
2052  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
2053  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2054  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
2055  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
2056  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2057  PAVGB(%%mm4, %%mm0)
2058  PAVGB(%%mm5, %%mm1)
2059  PAVGB(%%mm6, %%mm2)
2060  PAVGB(%%mm7, %%mm3)
2061  PAVGB(%%mm4, %%mm0)
2062  PAVGB(%%mm5, %%mm1)
2063  PAVGB(%%mm6, %%mm2)
2064  PAVGB(%%mm7, %%mm3)
2065  PAVGB(%%mm4, %%mm0)
2066  PAVGB(%%mm5, %%mm1)
2067  PAVGB(%%mm6, %%mm2)
2068  PAVGB(%%mm7, %%mm3)
2069  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2070  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2071  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2072  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2073  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2074  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2075  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2076  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2077 
2078  "4: \n\t"
2079 
2080  :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
2082  : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c, "memory"
2083  );
2084 #else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2085 {
2086  int y;
2087  int d=0;
2088 // int sysd=0;
2089  int i;
2090 
2091  for(y=0; y<8; y++){
2092  int x;
2093  for(x=0; x<8; x++){
2094  int ref= tempBlurred[ x + y*stride ];
2095  int cur= src[ x + y*stride ];
2096  int d1=ref - cur;
2097 // if(x==0 || x==7) d1+= d1>>1;
2098 // if(y==0 || y==7) d1+= d1>>1;
2099 // d+= FFABS(d1);
2100  d+= d1*d1;
2101 // sysd+= d1;
2102  }
2103  }
2104  i=d;
2105  d= (
2106  4*d
2107  +(*(tempBlurredPast-256))
2108  +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
2109  +(*(tempBlurredPast+256))
2110  +4)>>3;
2111  *tempBlurredPast=i;
2112 // ((*tempBlurredPast)*3 + d + 2)>>2;
2113 
2114 /*
2115 Switch between
2116  1 0 0 0 0 0 0 (0)
2117 64 32 16 8 4 2 1 (1)
2118 64 48 36 27 20 15 11 (33) (approx)
2119 64 56 49 43 37 33 29 (200) (approx)
2120 */
2121  if(d > maxNoise[1]){
2122  if(d < maxNoise[2]){
2123  for(y=0; y<8; y++){
2124  int x;
2125  for(x=0; x<8; x++){
2126  int ref= tempBlurred[ x + y*stride ];
2127  int cur= src[ x + y*stride ];
2128  tempBlurred[ x + y*stride ]=
2129  src[ x + y*stride ]=
2130  (ref + cur + 1)>>1;
2131  }
2132  }
2133  }else{
2134  for(y=0; y<8; y++){
2135  int x;
2136  for(x=0; x<8; x++){
2137  tempBlurred[ x + y*stride ]= src[ x + y*stride ];
2138  }
2139  }
2140  }
2141  }else{
2142  if(d < maxNoise[0]){
2143  for(y=0; y<8; y++){
2144  int x;
2145  for(x=0; x<8; x++){
2146  int ref= tempBlurred[ x + y*stride ];
2147  int cur= src[ x + y*stride ];
2148  tempBlurred[ x + y*stride ]=
2149  src[ x + y*stride ]=
2150  (ref*7 + cur + 4)>>3;
2151  }
2152  }
2153  }else{
2154  for(y=0; y<8; y++){
2155  int x;
2156  for(x=0; x<8; x++){
2157  int ref= tempBlurred[ x + y*stride ];
2158  int cur= src[ x + y*stride ];
2159  tempBlurred[ x + y*stride ]=
2160  src[ x + y*stride ]=
2161  (ref*3 + cur + 2)>>2;
2162  }
2163  }
2164  }
2165  }
2166 }
2167 #endif //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2168 }
2169 #endif //TEMPLATE_PP_ALTIVEC
2170 
2171 #if TEMPLATE_PP_MMXEXT
2172 /**
2173  * accurate deblock filter
 *
 * Operates on the 8x8 block starting 3*`step` past `src` (src is advanced
 * below). A per-byte flatness/DC test (eq_mask) and a max-min range test
 * against 2*QP (dc_mask) select, per column, between a strong 10-tap
 * low-pass across the edge and a gradient-limited correction of the two
 * center lines. NOTE(review): `step`/`stride` orientation (horizontal vs.
 * vertical filtering) is determined by the caller — confirm against callers.
2174  */
2175 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, const PPContext *c, int mode){
2176  int64_t dc_mask, eq_mask, both_masks;
2177  int64_t sums[10*8*2];
2178  src+= step*3; // src points to begin of the 8x8 Block
2179 
 /* Load the per-QP DC offset/threshold constants into mm7/mm6 for the
    flatness test performed by the next asm statement. */
2180  __asm__ volatile(
2181  "movq %0, %%mm7 \n\t"
2182  "movq %1, %%mm6 \n\t"
2183  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
2184  );
2185 
 /* Walk the 10 lines around the edge: mm0 accumulates per-byte counts of
    neighbour pairs within the DC threshold (-> eq_mask via the flatness
    threshold below), while mm4/mm3 track max/min so the max-min range can
    be compared against 2*QP (-> dc_mask). */
2186  __asm__ volatile(
2187  "lea (%2, %3), %%"FF_REG_a" \n\t"
2188 // 0 1 2 3 4 5 6 7 8 9
2189 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
2190 
2191  "movq (%2), %%mm0 \n\t"
2192  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2193  "movq %%mm1, %%mm3 \n\t"
2194  "movq %%mm1, %%mm4 \n\t"
2195  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
2196  "paddb %%mm7, %%mm0 \n\t"
2197  "pcmpgtb %%mm6, %%mm0 \n\t"
2198 
2199  "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
2200  PMAXUB(%%mm2, %%mm4)
2201  PMINUB(%%mm2, %%mm3, %%mm5)
2202  "psubb %%mm2, %%mm1 \n\t"
2203  "paddb %%mm7, %%mm1 \n\t"
2204  "pcmpgtb %%mm6, %%mm1 \n\t"
2205  "paddb %%mm1, %%mm0 \n\t"
2206 
2207  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2208  PMAXUB(%%mm1, %%mm4)
2209  PMINUB(%%mm1, %%mm3, %%mm5)
2210  "psubb %%mm1, %%mm2 \n\t"
2211  "paddb %%mm7, %%mm2 \n\t"
2212  "pcmpgtb %%mm6, %%mm2 \n\t"
2213  "paddb %%mm2, %%mm0 \n\t"
2214 
2215  "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"
2216 
2217  "movq (%2, %3, 4), %%mm2 \n\t"
2218  PMAXUB(%%mm2, %%mm4)
2219  PMINUB(%%mm2, %%mm3, %%mm5)
2220  "psubb %%mm2, %%mm1 \n\t"
2221  "paddb %%mm7, %%mm1 \n\t"
2222  "pcmpgtb %%mm6, %%mm1 \n\t"
2223  "paddb %%mm1, %%mm0 \n\t"
2224 
2225  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2226  PMAXUB(%%mm1, %%mm4)
2227  PMINUB(%%mm1, %%mm3, %%mm5)
2228  "psubb %%mm1, %%mm2 \n\t"
2229  "paddb %%mm7, %%mm2 \n\t"
2230  "pcmpgtb %%mm6, %%mm2 \n\t"
2231  "paddb %%mm2, %%mm0 \n\t"
2232 
2233  "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
2234  PMAXUB(%%mm2, %%mm4)
2235  PMINUB(%%mm2, %%mm3, %%mm5)
2236  "psubb %%mm2, %%mm1 \n\t"
2237  "paddb %%mm7, %%mm1 \n\t"
2238  "pcmpgtb %%mm6, %%mm1 \n\t"
2239  "paddb %%mm1, %%mm0 \n\t"
2240 
2241  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2242  PMAXUB(%%mm1, %%mm4)
2243  PMINUB(%%mm1, %%mm3, %%mm5)
2244  "psubb %%mm1, %%mm2 \n\t"
2245  "paddb %%mm7, %%mm2 \n\t"
2246  "pcmpgtb %%mm6, %%mm2 \n\t"
2247  "paddb %%mm2, %%mm0 \n\t"
2248 
2249  "movq (%2, %3, 8), %%mm2 \n\t"
2250  PMAXUB(%%mm2, %%mm4)
2251  PMINUB(%%mm2, %%mm3, %%mm5)
2252  "psubb %%mm2, %%mm1 \n\t"
2253  "paddb %%mm7, %%mm1 \n\t"
2254  "pcmpgtb %%mm6, %%mm1 \n\t"
2255  "paddb %%mm1, %%mm0 \n\t"
2256 
2257  "movq (%%"FF_REG_a", %3, 4), %%mm1 \n\t"
2258  "psubb %%mm1, %%mm2 \n\t"
2259  "paddb %%mm7, %%mm2 \n\t"
2260  "pcmpgtb %%mm6, %%mm2 \n\t"
2261  "paddb %%mm2, %%mm0 \n\t"
2262  "psubusb %%mm3, %%mm4 \n\t"
2263 
 /* dc_mask: per byte, 0xFF where (max - min) < 2*QP. */
2264  "pxor %%mm6, %%mm6 \n\t"
2265  "movq %4, %%mm7 \n\t" // QP,..., QP
2266  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
2267  "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
2268  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2269  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2270  "movq %%mm7, %1 \n\t"
2271 
 /* eq_mask: per byte, 0xFF where the accumulated flatness count exceeds
    the (byte-replicated) flatness threshold. */
2272  "movq %5, %%mm7 \n\t"
2273  "punpcklbw %%mm7, %%mm7 \n\t"
2274  "punpcklbw %%mm7, %%mm7 \n\t"
2275  "punpcklbw %%mm7, %%mm7 \n\t"
2276  "psubb %%mm0, %%mm6 \n\t"
2277  "pcmpgtb %%mm7, %%mm6 \n\t"
2278  "movq %%mm6, %0 \n\t"
2279 
2280  : "=m" (eq_mask), "=m" (dc_mask)
2281  : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2282  : "%"FF_REG_a
2283  );
2284 
2285  both_masks = dc_mask & eq_mask;
2286 
 /* Columns that are both flat and low-range get the strong low-pass. */
2287  if(both_masks){
2288  x86_reg offset= -8*step;
2289  int64_t *temp_sums= sums;
2290 
 /* Build the sliding vertical sums used by the low-pass into sums[]
    (low/high halves interleaved, 16 bytes per line). */
2291  __asm__ volatile(
2292  "movq %2, %%mm0 \n\t" // QP,..., QP
2293  "pxor %%mm4, %%mm4 \n\t"
2294 
2295  "movq (%0), %%mm6 \n\t"
2296  "movq (%0, %1), %%mm5 \n\t"
2297  "movq %%mm5, %%mm1 \n\t"
2298  "movq %%mm6, %%mm2 \n\t"
2299  "psubusb %%mm6, %%mm5 \n\t"
2300  "psubusb %%mm1, %%mm2 \n\t"
2301  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2302  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2303  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2304 
2305  "pxor %%mm6, %%mm1 \n\t"
2306  "pand %%mm0, %%mm1 \n\t"
2307  "pxor %%mm1, %%mm6 \n\t"
2308  // 0:QP 6:First
2309 
2310  "movq (%0, %1, 8), %%mm5 \n\t"
2311  "add %1, %0 \n\t" // %0 points to line 1 not 0
2312  "movq (%0, %1, 8), %%mm7 \n\t"
2313  "movq %%mm5, %%mm1 \n\t"
2314  "movq %%mm7, %%mm2 \n\t"
2315  "psubusb %%mm7, %%mm5 \n\t"
2316  "psubusb %%mm1, %%mm2 \n\t"
2317  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2318  "movq %2, %%mm0 \n\t" // QP,..., QP
2319  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2320  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2321 
2322  "pxor %%mm7, %%mm1 \n\t"
2323  "pand %%mm0, %%mm1 \n\t"
2324  "pxor %%mm1, %%mm7 \n\t"
2325 
2326  "movq %%mm6, %%mm5 \n\t"
2327  "punpckhbw %%mm4, %%mm6 \n\t"
2328  "punpcklbw %%mm4, %%mm5 \n\t"
2329  // 4:0 5/6:First 7:Last
2330 
2331  "movq %%mm5, %%mm0 \n\t"
2332  "movq %%mm6, %%mm1 \n\t"
2333  "psllw $2, %%mm0 \n\t"
2334  "psllw $2, %%mm1 \n\t"
2335  "paddw "MANGLE(w04)", %%mm0 \n\t"
2336  "paddw "MANGLE(w04)", %%mm1 \n\t"
2337 
 /* NEXT/PREV advance %0 one line and add/subtract that line (split into
    low/high unsigned-16 halves) into the running sums in mm0/mm1. */
2338 #define NEXT\
2339  "movq (%0), %%mm2 \n\t"\
2340  "movq (%0), %%mm3 \n\t"\
2341  "add %1, %0 \n\t"\
2342  "punpcklbw %%mm4, %%mm2 \n\t"\
2343  "punpckhbw %%mm4, %%mm3 \n\t"\
2344  "paddw %%mm2, %%mm0 \n\t"\
2345  "paddw %%mm3, %%mm1 \n\t"
2346 
2347 #define PREV\
2348  "movq (%0), %%mm2 \n\t"\
2349  "movq (%0), %%mm3 \n\t"\
2350  "add %1, %0 \n\t"\
2351  "punpcklbw %%mm4, %%mm2 \n\t"\
2352  "punpckhbw %%mm4, %%mm3 \n\t"\
2353  "psubw %%mm2, %%mm0 \n\t"\
2354  "psubw %%mm3, %%mm1 \n\t"
2355 
2356 
2357  NEXT //0
2358  NEXT //1
2359  NEXT //2
2360  "movq %%mm0, (%3) \n\t"
2361  "movq %%mm1, 8(%3) \n\t"
2362 
2363  NEXT //3
2364  "psubw %%mm5, %%mm0 \n\t"
2365  "psubw %%mm6, %%mm1 \n\t"
2366  "movq %%mm0, 16(%3) \n\t"
2367  "movq %%mm1, 24(%3) \n\t"
2368 
2369  NEXT //4
2370  "psubw %%mm5, %%mm0 \n\t"
2371  "psubw %%mm6, %%mm1 \n\t"
2372  "movq %%mm0, 32(%3) \n\t"
2373  "movq %%mm1, 40(%3) \n\t"
2374 
2375  NEXT //5
2376  "psubw %%mm5, %%mm0 \n\t"
2377  "psubw %%mm6, %%mm1 \n\t"
2378  "movq %%mm0, 48(%3) \n\t"
2379  "movq %%mm1, 56(%3) \n\t"
2380 
2381  NEXT //6
2382  "psubw %%mm5, %%mm0 \n\t"
2383  "psubw %%mm6, %%mm1 \n\t"
2384  "movq %%mm0, 64(%3) \n\t"
2385  "movq %%mm1, 72(%3) \n\t"
2386 
2387  "movq %%mm7, %%mm6 \n\t"
2388  "punpckhbw %%mm4, %%mm7 \n\t"
2389  "punpcklbw %%mm4, %%mm6 \n\t"
2390 
2391  NEXT //7
2392  "mov %4, %0 \n\t"
2393  "add %1, %0 \n\t"
2394  PREV //0
2395  "movq %%mm0, 80(%3) \n\t"
2396  "movq %%mm1, 88(%3) \n\t"
2397 
2398  PREV //1
2399  "paddw %%mm6, %%mm0 \n\t"
2400  "paddw %%mm7, %%mm1 \n\t"
2401  "movq %%mm0, 96(%3) \n\t"
2402  "movq %%mm1, 104(%3) \n\t"
2403 
2404  PREV //2
2405  "paddw %%mm6, %%mm0 \n\t"
2406  "paddw %%mm7, %%mm1 \n\t"
2407  "movq %%mm0, 112(%3) \n\t"
2408  "movq %%mm1, 120(%3) \n\t"
2409 
2410  PREV //3
2411  "paddw %%mm6, %%mm0 \n\t"
2412  "paddw %%mm7, %%mm1 \n\t"
2413  "movq %%mm0, 128(%3) \n\t"
2414  "movq %%mm1, 136(%3) \n\t"
2415 
2416  PREV //4
2417  "paddw %%mm6, %%mm0 \n\t"
2418  "paddw %%mm7, %%mm1 \n\t"
2419  "movq %%mm0, 144(%3) \n\t"
2420  "movq %%mm1, 152(%3) \n\t"
2421 
2422  "mov %4, %0 \n\t" //FIXME
2423 
2424  : "+&r"(src)
2425  : "r" ((x86_reg)step), "m" (c->pQPb), "m"(sums), "r"(sums), "g"(src)
2427  );
2428 
2429  src+= step; // src points to begin of the 8x8 Block
2430 
 /* Apply the low-pass: per line, out = (sums_lo/hi + 2*line) >> 4, then
    blend with the original bytes under both_masks (mm6=mask, mm5=~mask). */
2431  __asm__ volatile(
2432  "movq %4, %%mm6 \n\t"
2433  "pcmpeqb %%mm5, %%mm5 \n\t"
2434  "pxor %%mm6, %%mm5 \n\t"
2435  "pxor %%mm7, %%mm7 \n\t"
2436 
2437  "1: \n\t"
2438  "movq (%1), %%mm0 \n\t"
2439  "movq 8(%1), %%mm1 \n\t"
2440  "paddw 32(%1), %%mm0 \n\t"
2441  "paddw 40(%1), %%mm1 \n\t"
2442  "movq (%0, %3), %%mm2 \n\t"
2443  "movq %%mm2, %%mm3 \n\t"
2444  "movq %%mm2, %%mm4 \n\t"
2445  "punpcklbw %%mm7, %%mm2 \n\t"
2446  "punpckhbw %%mm7, %%mm3 \n\t"
2447  "paddw %%mm2, %%mm0 \n\t"
2448  "paddw %%mm3, %%mm1 \n\t"
2449  "paddw %%mm2, %%mm0 \n\t"
2450  "paddw %%mm3, %%mm1 \n\t"
2451  "psrlw $4, %%mm0 \n\t"
2452  "psrlw $4, %%mm1 \n\t"
2453  "packuswb %%mm1, %%mm0 \n\t"
2454  "pand %%mm6, %%mm0 \n\t"
2455  "pand %%mm5, %%mm4 \n\t"
2456  "por %%mm4, %%mm0 \n\t"
2457  "movq %%mm0, (%0, %3) \n\t"
2458  "add $16, %1 \n\t"
2459  "add %2, %0 \n\t"
2460  " js 1b \n\t"
2461 
2462  : "+r"(offset), "+r"(temp_sums)
2463  : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
2464  );
2465  }else
2466  src+= step; // src points to begin of the 8x8 Block
2467 
 /* Where not every column is flat, run the gradient-limited correction
    on the two center lines (classic MPEG "accurate" deblock math). */
2468  if(eq_mask != -1LL){
2469  uint8_t *temp_src= src;
2470  DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
2471  __asm__ volatile(
2472  "pxor %%mm7, %%mm7 \n\t"
2473 // 0 1 2 3 4 5 6 7 8 9
2474 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1
2475 
2476  "movq (%0), %%mm0 \n\t"
2477  "movq %%mm0, %%mm1 \n\t"
2478  "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
2479  "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
2480 
2481  "movq (%0, %1), %%mm2 \n\t"
2482  "lea (%0, %1, 2), %%"FF_REG_a" \n\t"
2483  "movq %%mm2, %%mm3 \n\t"
2484  "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
2485  "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
2486 
2487  "movq (%%"FF_REG_a"), %%mm4 \n\t"
2488  "movq %%mm4, %%mm5 \n\t"
2489  "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
2490  "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
2491 
2492  "paddw %%mm0, %%mm0 \n\t" // 2L0
2493  "paddw %%mm1, %%mm1 \n\t" // 2H0
2494  "psubw %%mm4, %%mm2 \n\t" // L1 - L2
2495  "psubw %%mm5, %%mm3 \n\t" // H1 - H2
2496  "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
2497  "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
2498 
2499  "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
2500  "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
2501  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
2502  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
2503 
2504  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
2505  "movq %%mm2, %%mm3 \n\t"
2506  "punpcklbw %%mm7, %%mm2 \n\t" // L3
2507  "punpckhbw %%mm7, %%mm3 \n\t" // H3
2508 
2509  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
2510  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
2511  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2512  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2513  "movq %%mm0, (%4) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2514  "movq %%mm1, 8(%4) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2515 
2516  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
2517  "movq %%mm0, %%mm1 \n\t"
2518  "punpcklbw %%mm7, %%mm0 \n\t" // L4
2519  "punpckhbw %%mm7, %%mm1 \n\t" // H4
2520 
2521  "psubw %%mm0, %%mm2 \n\t" // L3 - L4
2522  "psubw %%mm1, %%mm3 \n\t" // H3 - H4
2523  "movq %%mm2, 16(%4) \n\t" // L3 - L4
2524  "movq %%mm3, 24(%4) \n\t" // H3 - H4
2525  "paddw %%mm4, %%mm4 \n\t" // 2L2
2526  "paddw %%mm5, %%mm5 \n\t" // 2H2
2527  "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
2528  "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
2529 
2530  "lea (%%"FF_REG_a", %1), %0 \n\t"
2531  "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
2532  "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
2533  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
2534  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
2535 //50 opcodes so far
2536  "movq (%0, %1, 2), %%mm2 \n\t"
2537  "movq %%mm2, %%mm3 \n\t"
2538  "punpcklbw %%mm7, %%mm2 \n\t" // L5
2539  "punpckhbw %%mm7, %%mm3 \n\t" // H5
2540  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
2541  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
2542  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2543  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
2544 
2545  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2546  "punpcklbw %%mm7, %%mm6 \n\t" // L6
2547  "psubw %%mm6, %%mm2 \n\t" // L5 - L6
2548  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2549  "punpckhbw %%mm7, %%mm6 \n\t" // H6
2550  "psubw %%mm6, %%mm3 \n\t" // H5 - H6
2551 
2552  "paddw %%mm0, %%mm0 \n\t" // 2L4
2553  "paddw %%mm1, %%mm1 \n\t" // 2H4
2554  "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
2555  "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
2556 
2557  "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
2558  "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
2559  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
2560  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
2561 
2562  "movq (%0, %1, 4), %%mm2 \n\t"
2563  "movq %%mm2, %%mm3 \n\t"
2564  "punpcklbw %%mm7, %%mm2 \n\t" // L7
2565  "punpckhbw %%mm7, %%mm3 \n\t" // H7
2566 
2567  "paddw %%mm2, %%mm2 \n\t" // 2L7
2568  "paddw %%mm3, %%mm3 \n\t" // 2H7
2569  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
2570  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
2571 
2572  "movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2573  "movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2574 
 /* Absolute values of the three second-derivative terms; keep the
    smaller of the outer two as the limiting gradient. */
2575  "movq %%mm7, %%mm6 \n\t" // 0
2576  "psubw %%mm0, %%mm6 \n\t"
2577  "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2578  "movq %%mm7, %%mm6 \n\t" // 0
2579  "psubw %%mm1, %%mm6 \n\t"
2580  "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2581  "movq %%mm7, %%mm6 \n\t" // 0
2582  "psubw %%mm2, %%mm6 \n\t"
2583  "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2584  "movq %%mm7, %%mm6 \n\t" // 0
2585  "psubw %%mm3, %%mm6 \n\t"
2586  "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2587 
2588  "pminsw %%mm2, %%mm0 \n\t"
2589  "pminsw %%mm3, %%mm1 \n\t"
2590 
2591  "movd %2, %%mm2 \n\t" // QP
2592  "punpcklbw %%mm7, %%mm2 \n\t"
2593 
2594  "movq %%mm7, %%mm6 \n\t" // 0
2595  "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
2596  "pxor %%mm6, %%mm4 \n\t"
2597  "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
2598  "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
2599  "pxor %%mm7, %%mm5 \n\t"
2600  "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
2601 // 100 opcodes
2602  "psllw $3, %%mm2 \n\t" // 8QP
2603  "movq %%mm2, %%mm3 \n\t" // 8QP
2604  "pcmpgtw %%mm4, %%mm2 \n\t"
2605  "pcmpgtw %%mm5, %%mm3 \n\t"
2606  "pand %%mm2, %%mm4 \n\t"
2607  "pand %%mm3, %%mm5 \n\t"
2608 
2609 
2610  "psubusw %%mm0, %%mm4 \n\t" // hd
2611  "psubusw %%mm1, %%mm5 \n\t" // ld
2612 
2613 
 /* Correction = (5*d + 32) >> 6, clamped below to |L3-L4|/2 and applied
    with the sign of (L3-L4); masked out where eq_mask is set. */
2614  "movq "MANGLE(w05)", %%mm2 \n\t" // 5
2615  "pmullw %%mm2, %%mm4 \n\t"
2616  "pmullw %%mm2, %%mm5 \n\t"
2617  "movq "MANGLE(w20)", %%mm2 \n\t" // 32
2618  "paddw %%mm2, %%mm4 \n\t"
2619  "paddw %%mm2, %%mm5 \n\t"
2620  "psrlw $6, %%mm4 \n\t"
2621  "psrlw $6, %%mm5 \n\t"
2622 
2623  "movq 16(%4), %%mm0 \n\t" // L3 - L4
2624  "movq 24(%4), %%mm1 \n\t" // H3 - H4
2625 
2626  "pxor %%mm2, %%mm2 \n\t"
2627  "pxor %%mm3, %%mm3 \n\t"
2628 
2629  "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
2630  "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
2631  "pxor %%mm2, %%mm0 \n\t"
2632  "pxor %%mm3, %%mm1 \n\t"
2633  "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
2634  "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
2635  "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
2636  "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
2637 
2638  "pxor %%mm6, %%mm2 \n\t"
2639  "pxor %%mm7, %%mm3 \n\t"
2640  "pand %%mm2, %%mm4 \n\t"
2641  "pand %%mm3, %%mm5 \n\t"
2642 
2643  "pminsw %%mm0, %%mm4 \n\t"
2644  "pminsw %%mm1, %%mm5 \n\t"
2645  "pxor %%mm6, %%mm4 \n\t"
2646  "pxor %%mm7, %%mm5 \n\t"
2647  "psubw %%mm6, %%mm4 \n\t"
2648  "psubw %%mm7, %%mm5 \n\t"
2649  "packsswb %%mm5, %%mm4 \n\t"
2650  "movq %3, %%mm1 \n\t"
2651  "pandn %%mm4, %%mm1 \n\t"
2652  "movq (%0), %%mm0 \n\t"
2653  "paddb %%mm1, %%mm0 \n\t"
2654  "movq %%mm0, (%0) \n\t"
2655  "movq (%0, %1), %%mm0 \n\t"
2656  "psubb %%mm1, %%mm0 \n\t"
2657  "movq %%mm0, (%0, %1) \n\t"
2658 
2659  : "+r" (temp_src)
2660  : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp)
2661  NAMED_CONSTRAINTS_ADD(w05,w20)
2662  : "%"FF_REG_a
2663  );
2664  }
2665 }
2666 #endif //TEMPLATE_PP_MMXEXT
2667 
2668 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2669  const int8_t QPs[], int QPStride, int isColor, PPContext *c);
2670 
2671 /**
2672  * Copy a block from src to dst and fix the black level.
2673  * levelFix == 0 -> do not touch the brightness & contrast
2674  */
2675 #undef REAL_SCALED_CPY
2676 #undef SCALED_CPY
2677 
2678 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
2679  int levelFix, int64_t *packedOffsetAndScale)
2680 {
 /* levelFix != 0: scale each byte by packedYScale and subtract
    packedYOffset (both read from packedOffsetAndScale[0..1]) while
    copying 8 lines of 8 bytes; levelFix == 0: plain 8x8 copy. */
2681  if(levelFix){
2682 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
 /* On entry %"FF_REG_a" holds packedOffsetAndScale (constraint "=&a"/"0");
    it is re-used as a line pointer after the two constant loads. */
2683  __asm__ volatile(
2684  "movq (%%"FF_REG_a"), %%mm2 \n\t" // packedYOffset
2685  "movq 8(%%"FF_REG_a"), %%mm3 \n\t" // packedYScale
2686  "lea (%2,%4), %%"FF_REG_a" \n\t"
2687  "lea (%3,%5), %%"FF_REG_d" \n\t"
2688  "pxor %%mm4, %%mm4 \n\t"
2689 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
2690  "movq " #src1 ", %%mm0 \n\t"\
2691  "movq " #src1 ", %%mm5 \n\t"\
2692  "movq " #src2 ", %%mm1 \n\t"\
2693  "movq " #src2 ", %%mm6 \n\t"\
2694  "punpcklbw %%mm0, %%mm0 \n\t"\
2695  "punpckhbw %%mm5, %%mm5 \n\t"\
2696  "punpcklbw %%mm1, %%mm1 \n\t"\
2697  "punpckhbw %%mm6, %%mm6 \n\t"\
2698  "pmulhuw %%mm3, %%mm0 \n\t"\
2699  "pmulhuw %%mm3, %%mm5 \n\t"\
2700  "pmulhuw %%mm3, %%mm1 \n\t"\
2701  "pmulhuw %%mm3, %%mm6 \n\t"\
2702  "psubw %%mm2, %%mm0 \n\t"\
2703  "psubw %%mm2, %%mm5 \n\t"\
2704  "psubw %%mm2, %%mm1 \n\t"\
2705  "psubw %%mm2, %%mm6 \n\t"\
2706  "packuswb %%mm5, %%mm0 \n\t"\
2707  "packuswb %%mm6, %%mm1 \n\t"\
2708  "movq %%mm0, " #dst1 " \n\t"\
2709  "movq %%mm1, " #dst2 " \n\t"\
2710 
2711 #define SCALED_CPY(src1, src2, dst1, dst2)\
2712  REAL_SCALED_CPY(src1, src2, dst1, dst2)
2713 
2714 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
2715 SCALED_CPY((%2, %4, 2), (%%FF_REGa, %4, 2), (%3, %5, 2), (%%FF_REGd, %5, 2))
2716 SCALED_CPY((%2, %4, 4), (%%FF_REGa, %4, 4), (%3, %5, 4), (%%FF_REGd, %5, 4))
2717  "lea (%%"FF_REG_a",%4,4), %%"FF_REG_a" \n\t"
2718  "lea (%%"FF_REG_d",%5,4), %%"FF_REG_d" \n\t"
2719 SCALED_CPY((%%FF_REGa, %4), (%%FF_REGa, %4, 2), (%%FF_REGd, %5), (%%FF_REGd, %5, 2))
2720 
2721 
2722  : "=&a" (packedOffsetAndScale)
2723  : "0" (packedOffsetAndScale),
2724  "r"(src),
2725  "r"(dst),
2726  "r" ((x86_reg)srcStride),
2727  "r" ((x86_reg)dstStride)
2728  : "%"FF_REG_d
2729  );
2730 #else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2731  for (int i = 0; i < 8; i++)
2732  memcpy( &(dst[dstStride*i]),
2733  &(src[srcStride*i]), BLOCK_SIZE);
2734 #endif //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2735  }else{
2736 #if TEMPLATE_PP_MMX && HAVE_6REGS
2737  __asm__ volatile(
2738  "lea (%0,%2), %%"FF_REG_a" \n\t"
2739  "lea (%1,%3), %%"FF_REG_d" \n\t"
2740 
2741 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
2742  "movq " #src1 ", %%mm0 \n\t"\
2743  "movq " #src2 ", %%mm1 \n\t"\
2744  "movq %%mm0, " #dst1 " \n\t"\
2745  "movq %%mm1, " #dst2 " \n\t"\
2746 
2747 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
2748  REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
2749 
2750 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
2751 SIMPLE_CPY((%0, %2, 2), (%%FF_REGa, %2, 2), (%1, %3, 2), (%%FF_REGd, %3, 2))
2752 SIMPLE_CPY((%0, %2, 4), (%%FF_REGa, %2, 4), (%1, %3, 4), (%%FF_REGd, %3, 4))
2753  "lea (%%"FF_REG_a",%2,4), %%"FF_REG_a" \n\t"
2754  "lea (%%"FF_REG_d",%3,4), %%"FF_REG_d" \n\t"
2755 SIMPLE_CPY((%%FF_REGa, %2), (%%FF_REGa, %2, 2), (%%FF_REGd, %3), (%%FF_REGd, %3, 2))
2756 
2757  : : "r" (src),
2758  "r" (dst),
2759  "r" ((x86_reg)srcStride),
2760  "r" ((x86_reg)dstStride)
2761  : "%"FF_REG_a, "%"FF_REG_d
2762  );
2763 #else //TEMPLATE_PP_MMX && HAVE_6REGS
2764  for (int i = 0; i < 8; i++)
2765  memcpy( &(dst[dstStride*i]),
2766  &(src[srcStride*i]), BLOCK_SIZE);
2767 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
2768  }
2769 }
2770 
2771 /**
2772  * Duplicate the given 8 src pixels 5 times upward
2773  */
2774 static inline void RENAME(duplicate)(uint8_t src[], int stride)
2775 {
2776 #if TEMPLATE_PP_MMX
 /* %1 = -stride: the 8 bytes at src are stored into the 5 lines above. */
2777  __asm__ volatile(
2778  "movq (%0), %%mm0 \n\t"
2779  "movq %%mm0, (%0, %1, 4) \n\t"
2780  "add %1, %0 \n\t"
2781  "movq %%mm0, (%0) \n\t"
2782  "movq %%mm0, (%0, %1) \n\t"
2783  "movq %%mm0, (%0, %1, 2) \n\t"
2784  "movq %%mm0, (%0, %1, 4) \n\t"
2785  : "+r" (src)
2786  : "r" ((x86_reg)-stride)
2787  );
2788 #else
 /* Portable fallback: copy the source row into each of the 5 rows above. */
2789  int i;
2790  uint8_t *p=src;
2791  for(i=0; i<5; i++){
2792  p-= stride;
2793  memcpy(p, src, 8);
2794  }
2795 #endif
2796 }
2797 
 /* Prefetch helpers in three flavours: x86 SSE prefetch instructions,
    GCC's __builtin_prefetch on non-x86, and no-op stubs elsewhere.
    nta = non-temporal; t0/t1/t2 = decreasing cache-level locality. */
2798 #if ARCH_X86 && TEMPLATE_PP_MMXEXT
2799 static inline void RENAME(prefetchnta)(const void *p)
2800 {
2801  __asm__ volatile( "prefetchnta (%0)\n\t"
2802  : : "r" (p)
2803  );
2804 }
2805 
2806 static inline void RENAME(prefetcht0)(const void *p)
2807 {
2808  __asm__ volatile( "prefetcht0 (%0)\n\t"
2809  : : "r" (p)
2810  );
2811 }
2812 
2813 static inline void RENAME(prefetcht1)(const void *p)
2814 {
2815  __asm__ volatile( "prefetcht1 (%0)\n\t"
2816  : : "r" (p)
2817  );
2818 }
2819 
2820 static inline void RENAME(prefetcht2)(const void *p)
2821 {
2822  __asm__ volatile( "prefetcht2 (%0)\n\t"
2823  : : "r" (p)
2824  );
2825 }
2826 #elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2)
 /* __builtin_prefetch(addr, rw=0 (read), locality 0..3). */
2827 static inline void RENAME(prefetchnta)(const void *p)
2828 {
2829  __builtin_prefetch(p,0,0);
2830 }
2831 static inline void RENAME(prefetcht0)(const void *p)
2832 {
2833  __builtin_prefetch(p,0,1);
2834 }
2835 static inline void RENAME(prefetcht1)(const void *p)
2836 {
2837  __builtin_prefetch(p,0,2);
2838 }
2839 static inline void RENAME(prefetcht2)(const void *p)
2840 {
2841  __builtin_prefetch(p,0,3);
2842 }
2843 #else
 /* No prefetch support: stubs so callers compile unchanged. */
2844 static inline void RENAME(prefetchnta)(const void *p)
2845 {
2846  return;
2847 }
2848 static inline void RENAME(prefetcht0)(const void *p)
2849 {
2850  return;
2851 }
2852 static inline void RENAME(prefetcht1)(const void *p)
2853 {
2854  return;
2855 }
2856 static inline void RENAME(prefetcht2)(const void *p)
2857 {
2858  return;
2859 }
2860 #endif
2861 /**
2862  * Filter array of bytes (Y or U or V values)
2863  */
2864 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2865  const int8_t QPs[], int QPStride, int isColor, PPContext *c)
2866 {
2867  int x,y;
2868 #ifdef TEMPLATE_PP_TIME_MODE
2869  const int mode= TEMPLATE_PP_TIME_MODE;
2870 #else
2871  const int mode = isColor ? c->ppMode.chromMode : c->ppMode.lumMode;
2872 #endif
2873  int black=0, white=255; // blackest black and whitest white in the picture
2874  int QPCorrecture= 256*256;
2875 
2876  int copyAhead;
2877 #if TEMPLATE_PP_MMX
2878  int i;
2879 #endif
2880 
2881  const int qpHShift = isColor ? 4 - c->hChromaSubSample : 4;
2882  const int qpVShift = isColor ? 4 - c->vChromaSubSample : 4;
2883 
2884  //FIXME remove
2885  uint64_t * const yHistogram= c->yHistogram;
2886  uint8_t * const tempSrc = srcStride > 0 ? c->tempSrc : c->tempSrc - 23*srcStride;
2887  uint8_t * const tempDst = (dstStride > 0 ? c->tempDst : c->tempDst - 23*dstStride) + 32;
2888  //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
2889 
2890  if (mode & VISUALIZE){
2891  if(!(mode & (V_A_DEBLOCK | H_A_DEBLOCK)) || TEMPLATE_PP_MMX) {
2892  av_log(c, AV_LOG_WARNING, "Visualization is currently only supported with the accurate deblock filter without SIMD\n");
2893  }
2894  }
2895 
2896 #if TEMPLATE_PP_MMX
2897  for(i=0; i<57; i++){
2898  int offset = ((i * c->ppMode.baseDcDiff) >> 8) + 1;
2899  int threshold= offset*2 + 1;
2900  c->mmxDcOffset[i] = 0x7F - offset;
2901  c->mmxDcThreshold[i] = 0x7F - threshold;
2902  c->mmxDcOffset[i] *= 0x0101010101010101LL;
2903  c->mmxDcThreshold[i] *= 0x0101010101010101LL;
2904  }
2905 #endif
2906 
2907  if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
2908  else if( (mode & LINEAR_BLEND_DEINT_FILTER)
2909  || (mode & FFMPEG_DEINT_FILTER)
2910  || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
2911  else if( (mode & V_DEBLOCK)
2913  || (mode & MEDIAN_DEINT_FILTER)
2914  || (mode & V_A_DEBLOCK)) copyAhead=13;
2915  else if(mode & V_X1_FILTER) copyAhead=11;
2916 // else if(mode & V_RK1_FILTER) copyAhead=10;
2917  else if(mode & DERING) copyAhead=9;
2918  else copyAhead=8;
2919 
2920  copyAhead-= 8;
2921 
2922  if(!isColor){
2923  uint64_t sum= 0;
2924  int i;
2925  uint64_t maxClipped;
2926  uint64_t clipped;
2927  AVRational scale;
2928 
2929  c->frameNum++;
2930  // first frame is fscked so we ignore it
2931  if (c->frameNum == 1)
2932  yHistogram[0] = width * (uint64_t)height/64*15/256;
2933 
2934  for(i=0; i<256; i++){
2935  sum+= yHistogram[i];
2936  }
2937 
2938  /* We always get a completely black picture first. */
2939  maxClipped = av_rescale(sum, c->ppMode.maxClippedThreshold.num,
2940  c->ppMode.maxClippedThreshold.den);
2941 
2942  clipped= sum;
2943  for(black=255; black>0; black--){
2944  if(clipped < maxClipped) break;
2945  clipped-= yHistogram[black];
2946  }
2947 
2948  clipped= sum;
2949  for(white=0; white<256; white++){
2950  if(clipped < maxClipped) break;
2951  clipped-= yHistogram[white];
2952  }
2953 
2954  scale = (AVRational){c->ppMode.maxAllowedY - c->ppMode.minAllowedY, white - black};
2955 
2956 #if TEMPLATE_PP_MMXEXT
2957  c->packedYScale = (uint16_t)av_rescale(scale.num, 256, scale.den);
2958  c->packedYOffset = (((black*c->packedYScale)>>8) - c->ppMode.minAllowedY) & 0xFFFF;
2959 #else
2960  c->packedYScale = (uint16_t)av_rescale(scale.num, 1024, scale.den);
2961  c->packedYOffset = (black - c->ppMode.minAllowedY) & 0xFFFF;
2962 #endif
2963 
2964  c->packedYOffset |= c->packedYOffset<<32;
2965  c->packedYOffset |= c->packedYOffset<<16;
2966 
2967  c->packedYScale |= c->packedYScale<<32;
2968  c->packedYScale |= c->packedYScale<<16;
2969 
2970  if(mode & LEVEL_FIX) QPCorrecture= (int)av_rescale(scale.num, 256*256, scale.den);
2971  else QPCorrecture= 256*256;
2972  }else{
2973  c->packedYScale = 0x0100010001000100LL;
2974  c->packedYOffset = 0;
2975  QPCorrecture= 256*256;
2976  }
2977 
2978  /* copy & deinterlace first row of blocks */
2979  y=-BLOCK_SIZE;
2980  {
2981  const uint8_t *srcBlock= &(src[y*srcStride]);
2982  uint8_t *dstBlock= tempDst + dstStride;
2983 
2984  // From this point on it is guaranteed that we can read and write 16 lines downward
2985  // finish 1 block before the next otherwise we might have a problem
2986  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2987  for(x=0; x<width; x+=BLOCK_SIZE){
2988  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
2989  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
2990  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
2991  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
2992 
2993  RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
2994  srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c->packedYOffset);
2995 
2996  RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
2997 
2999  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3000  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3001  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c->deintTemp + x);
3002  else if(mode & MEDIAN_DEINT_FILTER)
3003  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3004  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3005  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3006  else if(mode & FFMPEG_DEINT_FILTER)
3007  RENAME(deInterlaceFF)(dstBlock, dstStride, c->deintTemp + x);
3008  else if(mode & LOWPASS5_DEINT_FILTER)
3009  RENAME(deInterlaceL5)(dstBlock, dstStride, c->deintTemp + x, c->deintTemp + width + x);
3010 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3011  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3012 */
3013  dstBlock+=8;
3014  srcBlock+=8;
3015  }
3016  if(width==FFABS(dstStride))
3017  linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3018  else{
3019  int i;
3020  for(i=0; i<copyAhead; i++){
3021  memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3022  }
3023  }
3024  }
3025 
3026  for(y=0; y<height; y+=BLOCK_SIZE){
3027  //1% speedup if these are here instead of the inner loop
3028  const uint8_t *srcBlock= &(src[y*srcStride]);
3029  uint8_t *dstBlock= &(dst[y*dstStride]);
3030 #if TEMPLATE_PP_MMX
3031  uint8_t *tempBlock1 = c->tempBlocks;
3032  uint8_t *tempBlock2 = c->tempBlocks + 8;
3033 #endif
3034  const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3035  int8_t *nonBQPptr = &c->nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3036  int QP=0, nonBQP=0;
3037  /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3038  if not than use a temporary buffer */
3039  if(y+15 >= height){
3040  int i;
3041  /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3042  blockcopy to dst later */
3043  linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3044  FFMAX(height-y-copyAhead, 0), srcStride);
3045 
3046  /* duplicate last line of src to fill the void up to line (copyAhead+7) */
3047  for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3048  memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3049 
3050  /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3051  linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3052 
3053  /* duplicate last line of dst to fill the void up to line (copyAhead) */
3054  for(i=height-y+1; i<=copyAhead; i++)
3055  memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3056 
3057  dstBlock= tempDst + dstStride;
3058  srcBlock= tempSrc;
3059  }
3060 
3061  // From this point on it is guaranteed that we can read and write 16 lines downward
3062  // finish 1 block before the next otherwise we might have a problem
3063  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3064  for(x=0; x<width; ){
3065  int startx = x;
3066  int endx = FFMIN(width, x+32);
3067  uint8_t *dstBlockStart = dstBlock;
3068  const uint8_t *srcBlockStart = srcBlock;
3069  int qp_index = 0;
3070  for(qp_index=0; qp_index < (endx-startx)/BLOCK_SIZE; qp_index++){
3071  QP = QPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3072  nonBQP = nonBQPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3073  if(!isColor){
3074  QP= (QP* QPCorrecture + 256*128)>>16;
3075  nonBQP= (nonBQP* QPCorrecture + 256*128)>>16;
3076  yHistogram[(srcBlock+qp_index*8)[srcStride*12 + 4]]++;
3077  }
3078  c->QP_block[qp_index] = QP;
3079  c->nonBQP_block[qp_index] = nonBQP;
3080 #if TEMPLATE_PP_MMX
3081  __asm__ volatile(
3082  "movd %1, %%mm7 \n\t"
3083  "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3084  "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3085  "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3086  "movq %%mm7, %0 \n\t"
3087  : "=m" (c->pQPb_block[qp_index])
3088  : "r" (QP)
3089  );
3090 #endif
3091  }
3092  for(; x < endx; x+=BLOCK_SIZE){
3093  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
3094  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
3095  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
3096  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
3097 
3098  RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3099  srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c->packedYOffset);
3100 
3102  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3103  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3104  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c->deintTemp + x);
3105  else if(mode & MEDIAN_DEINT_FILTER)
3106  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3107  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3108  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3109  else if(mode & FFMPEG_DEINT_FILTER)
3110  RENAME(deInterlaceFF)(dstBlock, dstStride, c->deintTemp + x);
3111  else if(mode & LOWPASS5_DEINT_FILTER)
3112  RENAME(deInterlaceL5)(dstBlock, dstStride, c->deintTemp + x, c->deintTemp + width + x);
3113 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3114  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3115 */
3116  dstBlock+=8;
3117  srcBlock+=8;
3118  }
3119 
3120  dstBlock = dstBlockStart;
3121  srcBlock = srcBlockStart;
3122 
3123  for(x = startx, qp_index = 0; x < endx; x+=BLOCK_SIZE, qp_index++){
3124  const int stride= dstStride;
3125  //temporary while changing QP stuff to make things continue to work
3126  //eventually QP,nonBQP,etc will be arrays and this will be unnecessary
3127  c->QP = c->QP_block[qp_index];
3128  c->nonBQP = c->nonBQP_block[qp_index];
3129  c->pQPb = c->pQPb_block[qp_index];
3130  c->pQPb2 = c->pQPb2_block[qp_index];
3131 
3132  /* only deblock if we have 2 blocks */
3133  if(y + 8 < height){
3134  if(mode & V_X1_FILTER)
3135  RENAME(vertX1Filter)(dstBlock, stride, c);
3136  else if(mode & V_DEBLOCK){
3137  const int t = RENAME(vertClassify)(dstBlock, stride, c);
3138 
3139  if(t==1)
3140  RENAME(doVertLowPass)(dstBlock, stride, c);
3141  else if(t==2)
3142  RENAME(doVertDefFilter)(dstBlock, stride, c);
3143  }else if(mode & V_A_DEBLOCK){
3144  RENAME(do_a_deblock)(dstBlock, stride, 1, c, mode);
3145  }
3146  }
3147 
3148  dstBlock+=8;
3149  srcBlock+=8;
3150  }
3151 
3152  dstBlock = dstBlockStart;
3153  srcBlock = srcBlockStart;
3154 
3155  for(x = startx, qp_index=0; x < endx; x+=BLOCK_SIZE, qp_index++){
3156  const int stride= dstStride;
3157  c->QP = c->QP_block[qp_index];
3158  c->nonBQP = c->nonBQP_block[qp_index];
3159  c->pQPb = c->pQPb_block[qp_index];
3160  c->pQPb2 = c->pQPb2_block[qp_index];
3161 #if TEMPLATE_PP_MMX
3162  RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3163 #endif
3164  /* check if we have a previous block to deblock it with dstBlock */
3165  if(x - 8 >= 0){
3166 #if TEMPLATE_PP_MMX
3167  if(mode & H_X1_FILTER)
3168  RENAME(vertX1Filter)(tempBlock1, 16, c);
3169  else if(mode & H_DEBLOCK){
3170  const int t= RENAME(vertClassify)(tempBlock1, 16, c);
3171  if(t==1)
3172  RENAME(doVertLowPass)(tempBlock1, 16, c);
3173  else if(t==2)
3174  RENAME(doVertDefFilter)(tempBlock1, 16, c);
3175  }else if(mode & H_A_DEBLOCK){
3176  RENAME(do_a_deblock)(tempBlock1, 16, 1, c, mode);
3177  }
3178 
3179  RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3180 
3181 #else
3182  if(mode & H_X1_FILTER)
3183  horizX1Filter(dstBlock-4, stride, c->QP);
3184  else if(mode & H_DEBLOCK){
3185 #if TEMPLATE_PP_ALTIVEC
3186  DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
3187  int t;
3188  transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3189 
3190  t = vertClassify_altivec(tempBlock-48, 16, c);
3191  if(t==1) {
3192  doVertLowPass_altivec(tempBlock-48, 16, c);
3193  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3194  }
3195  else if(t==2) {
3196  doVertDefFilter_altivec(tempBlock-48, 16, c);
3197  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3198  }
3199 #else
3200  const int t= RENAME(horizClassify)(dstBlock-4, stride, c);
3201 
3202  if(t==1)
3203  RENAME(doHorizLowPass)(dstBlock-4, stride, c);
3204  else if(t==2)
3205  RENAME(doHorizDefFilter)(dstBlock-4, stride, c);
3206 #endif
3207  }else if(mode & H_A_DEBLOCK){
3208  RENAME(do_a_deblock)(dstBlock-8, 1, stride, c, mode);
3209  }
3210 #endif //TEMPLATE_PP_MMX
3211  if(mode & DERING){
3212  //FIXME filter first line
3213  if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, c);
3214  }
3215 
3216  if(mode & TEMP_NOISE_FILTER)
3217  {
3218  RENAME(tempNoiseReducer)(dstBlock-8, stride,
3219  c->tempBlurred[isColor] + y*dstStride + x,
3220  c->tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3221  c->ppMode.maxTmpNoise);
3222  }
3223  }
3224 
3225  dstBlock+=8;
3226  srcBlock+=8;
3227 
3228 #if TEMPLATE_PP_MMX
3229  FFSWAP(uint8_t *, tempBlock1, tempBlock2);
3230 #endif
3231  }
3232  }
3233 
3234  if(mode & DERING){
3235  if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, c);
3236  }
3237 
3238  if((mode & TEMP_NOISE_FILTER)){
3239  RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3240  c->tempBlurred[isColor] + y*dstStride + x,
3241  c->tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3242  c->ppMode.maxTmpNoise);
3243  }
3244 
3245  /* did we use a tmp buffer for the last lines*/
3246  if(y+15 >= height){
3247  uint8_t *dstBlock= &(dst[y*dstStride]);
3248  if(width==FFABS(dstStride))
3249  linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3250  else{
3251  int i;
3252  for(i=0; i<height-y; i++){
3253  memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3254  }
3255  }
3256  }
3257  }
3258 #if TEMPLATE_PP_MMX
3259  __asm__ volatile("emms");
3260 #endif
3261 
3262 #ifdef DEBUG_BRIGHTNESS
3263  if(!isColor){
3264  int max=1;
3265  int i;
3266  for(i=0; i<256; i++)
3267  if(yHistogram[i] > max) max=yHistogram[i];
3268 
3269  for(i=1; i<256; i++){
3270  int x;
3271  int start=yHistogram[i-1]/(max/256+1);
3272  int end=yHistogram[i]/(max/256+1);
3273  int inc= end > start ? 1 : -1;
3274  for(x=start; x!=end+inc; x+=inc)
3275  dst[ i*dstStride + x]+=128;
3276  }
3277 
3278  for(i=0; i<100; i+=2){
3279  dst[ (white)*dstStride + i]+=128;
3280  dst[ (black)*dstStride + i]+=128;
3281  }
3282  }
3283 #endif
3284 }
3285 
3286 #undef RENAME
3287 #undef TEMPLATE_PP_C
3288 #undef TEMPLATE_PP_ALTIVEC
3289 #undef TEMPLATE_PP_MMX
3290 #undef TEMPLATE_PP_MMXEXT
3291 #undef TEMPLATE_PP_SSE2
error
static void error(const char *err)
Definition: target_bsf_fuzzer.c:32
FFMPEG_DEINT_FILTER
#define FFMPEG_DEINT_FILTER
Definition: postprocess_internal.h:67
AV_LOG_WARNING
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:215
mem_internal.h
PPContext
postprocess context.
Definition: postprocess_internal.h:116
x86_reg
int x86_reg
Definition: asm.h:72
int64_t
long long int64_t
Definition: coverity.c:34
mode
Definition: swscale.c:56
step
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
Definition: rate_distortion.txt:58
LOWPASS5_DEINT_FILTER
#define LOWPASS5_DEINT_FILTER
Definition: postprocess_internal.h:68
b
#define b
Definition: input.c:42
horizX1Filter
static void horizX1Filter(uint8_t *src, int stride, int QP)
Experimental Filter 1 (Horizontal) will not damage linear gradients Flat blocks should look like they...
Definition: postprocess.c:322
doVertLowPass_altivec
static void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
Definition: postprocess_altivec_template.c:214
max
#define max(a, b)
Definition: cuda_runtime.h:33
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
H_A_DEBLOCK
#define H_A_DEBLOCK
Definition: postprocess_internal.h:56
FFSIGN
#define FFSIGN(a)
Definition: common.h:75
QP
#define QP(qP, depth)
Definition: h264data.c:190
MANGLE
#define MANGLE(a)
Definition: asm.h:127
first
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But first
Definition: rate_distortion.txt:12
postProcess
static void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, const int8_t QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
Definition: postprocess.c:521
NAMED_CONSTRAINTS_ADD
#define NAMED_CONSTRAINTS_ADD(...)
Definition: asm.h:145
s
#define s(width, name)
Definition: cbs_vp9.c:198
V_A_DEBLOCK
#define V_A_DEBLOCK
Definition: postprocess_internal.h:52
DERING_THRESHOLD
#define DERING_THRESHOLD
Definition: postprocess.c:98
V_DEBLOCK
#define V_DEBLOCK
Definition: postprocess_internal.h:36
TEMP_NOISE_FILTER
#define TEMP_NOISE_FILTER
Definition: postprocess_internal.h:70
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:74
asm.h
tmp
static uint8_t tmp[20]
Definition: aes_ctr.c:47
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
MEDIAN_DEINT_FILTER
#define MEDIAN_DEINT_FILTER
Definition: postprocess_internal.h:66
linecpy
static void linecpy(void *dest, const void *src, int lines, int stride)
Definition: postprocess_internal.h:177
V_X1_FILTER
#define V_X1_FILTER
Definition: postprocess_internal.h:51
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
inc
static int inc(int num, int period)
Definition: perlin.c:34
transpose_16x8_char_toPackedAlign_altivec
static void transpose_16x8_char_toPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1016
PAVGB
#define PAVGB(a, b)
Definition: postprocess_template.c:81
f
f
Definition: af_crystalizer.c:122
height
#define height
Definition: dsp.h:85
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem_internal.h:104
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:83
transpose_8x16_char_fromPackedAlign_altivec
static void transpose_8x16_char_fromPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1121
avg
#define avg(a, b, c, d)
Definition: colorspacedsp_template.c:28
PREV
@ PREV
Definition: vf_fftdnoiz.c:37
diff
static av_always_inline int diff(const struct color_info *a, const struct color_info *b, const int trans_thresh)
Definition: vf_paletteuse.c:166
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
H_DEBLOCK
#define H_DEBLOCK
Definition: postprocess_internal.h:37
AV_LOG_INFO
#define AV_LOG_INFO
Standard information.
Definition: log.h:220
DERING
#define DERING
Definition: postprocess_internal.h:38
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
VISUALIZE
#define VISUALIZE
Definition: postprocess_internal.h:73
av_always_inline
#define av_always_inline
Definition: attributes.h:49
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
AV_STRINGIFY
#define AV_STRINGIFY(s)
Definition: macros.h:66
av_rescale
int64_t av_rescale(int64_t a, int64_t b, int64_t c)
Rescale a 64-bit integer with rounding to nearest.
Definition: mathematics.c:129
stride
#define stride
Definition: h264pred_template.c:536
NEXT
@ NEXT
Definition: vf_fftdnoiz.c:38
CUBIC_IPOL_DEINT_FILTER
#define CUBIC_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:65
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
vertClassify_altivec
static int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:59
XMM_CLOBBERS
#define XMM_CLOBBERS(...)
Definition: asm.h:98
TEMPLATE_PP_MMX
#define TEMPLATE_PP_MMX
Definition: postprocess_template.c:52
RENAME
#define RENAME(element)
Definition: ac3enc_template.c:44
doVertDefFilter_altivec
static void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:412
mode
mode
Definition: ebur128.h:83
ref
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:117
LINEAR_BLEND_DEINT_FILTER
#define LINEAR_BLEND_DEINT_FILTER
Definition: postprocess_internal.h:63
av_clip_uint8
#define av_clip_uint8
Definition: common.h:106
LINEAR_IPOL_DEINT_FILTER
#define LINEAR_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:62
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:291
H_X1_FILTER
#define H_X1_FILTER
Definition: postprocess_internal.h:55
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
BLOCK_SIZE
#define BLOCK_SIZE
Definition: adx.h:51
width
#define width
Definition: dsp.h:85
LEVEL_FIX
#define LEVEL_FIX
Brightness & Contrast.
Definition: postprocess_internal.h:39
src
#define src
Definition: vp8dsp.c:248
min
float min
Definition: vorbis_enc_data.h:429