FFmpeg: libpostproc/postprocess.c Source File

00001 /*
00002  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
00003  *
00004  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
00005  *
00006  * This file is part of FFmpeg.
00007  *
00008  * FFmpeg is free software; you can redistribute it and/or modify
00009  * it under the terms of the GNU General Public License as published by
00010  * the Free Software Foundation; either version 2 of the License, or
00011  * (at your option) any later version.
00012  *
00013  * FFmpeg is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016  * GNU General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU General Public License
00019  * along with FFmpeg; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  */
00022 
00028 /*
00029                         C       MMX     MMX2    3DNow   AltiVec
00030 isVertDC                Ec      Ec                      Ec
00031 isVertMinMaxOk          Ec      Ec                      Ec
00032 doVertLowPass           E               e       e       Ec
00033 doVertDefFilter         Ec      Ec      e       e       Ec
00034 isHorizDC               Ec      Ec                      Ec
00035 isHorizMinMaxOk         a       E                       Ec
00036 doHorizLowPass          E               e       e       Ec
00037 doHorizDefFilter        Ec      Ec      e       e       Ec
00038 do_a_deblock            Ec      E       Ec      E
00039 deRing                  E               e       e*      Ecp
00040 Vertical RKAlgo1        E               a       a
00041 Horizontal RKAlgo1                      a       a
00042 Vertical X1#            a               E       E
00043 Horizontal X1#          a               E       E
00044 LinIpolDeinterlace      e               E       E*
00045 CubicIpolDeinterlace    a               e       e*
00046 LinBlendDeinterlace     e               E       E*
00047 MedianDeinterlace#      E       Ec      Ec
00048 TempDeNoiser#           E               e       e       Ec
00049 
00050 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
00051 # more or less selfinvented filters so the exactness is not too meaningful
00052 E = Exact implementation
00053 e = almost exact implementation (slightly different rounding,...)
00054 a = alternative / approximate impl
00055 c = checked against the other implementations (-vo md5)
00056 p = partially optimized, still some work to do
00057 */
00058 
00059 /*
00060 TODO:
00061 reduce the time wasted on the mem transfer
00062 unroll stuff if instructions depend too much on the prior one
00063 move YScale thing to the end instead of fixing QP
00064 write a faster and higher quality deblocking filter :)
make the main loop more flexible (variable number of blocks at once);
        the per-block if/else handling is slowing things down
00067 compare the quality & speed of all filters
00068 split this huge file
00069 optimize c versions
00070 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
00071 ...
00072 */
00073 
00074 //Changelog: use git log
00075 
00076 #include "config.h"
00077 #include "libavutil/avutil.h"
00078 #include <inttypes.h>
00079 #include <stdio.h>
00080 #include <stdlib.h>
00081 #include <string.h>
00082 //#undef HAVE_MMX2
00083 //#define HAVE_AMD3DNOW
00084 //#undef HAVE_MMX
00085 //#undef ARCH_X86
00086 //#define DEBUG_BRIGHTNESS
00087 #include "postprocess.h"
00088 #include "postprocess_internal.h"
00089 #include "libavutil/avstring.h"
00090 
00091 unsigned postproc_version(void)
00092 {
00093     return LIBPOSTPROC_VERSION_INT;
00094 }
00095 
00096 const char *postproc_configuration(void)
00097 {
00098     return FFMPEG_CONFIGURATION;
00099 }
00100 
00101 const char *postproc_license(void)
00102 {
00103 #define LICENSE_PREFIX "libpostproc license: "
00104     return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
00105 }
00106 
00107 #if HAVE_ALTIVEC_H
00108 #include <altivec.h>
00109 #endif
00110 
/* Fixed sizes used by the code in this file. */
#define GET_MODE_BUFFER_SIZE 500 /* size of the scratch copy of the mode string */
#define OPTIONS_ARRAY_SIZE   10  /* slots in the per-filter options[] array */
#define BLOCK_SIZE           8   /* loop bound of the block filters (8x8 blocks) */
#define TEMP_STRIDE          8   /* not referenced in the visible part of this file */
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
00116 
00117 #if ARCH_X86
00118 DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
00119 DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
00120 DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
00121 DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
00122 DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
00123 DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
00124 DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
00125 DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
00126 #endif
00127 
00128 DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
00129 
00130 
00131 static struct PPFilter filters[]=
00132 {
00133     {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
00134     {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
00135 /*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
00136     {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
00137     {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
00138     {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
00139     {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
00140     {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
00141     {"dr", "dering",                1, 5, 6, DERING},
00142     {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
00143     {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
00144     {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
00145     {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
00146     {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
00147     {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
00148     {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
00149     {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
00150     {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
00151     {NULL, NULL,0,0,0,0} //End Marker
00152 };
00153 
/*
 * Filter-name aliases, stored as a flat list of (alias, expansion) pairs.
 * The parser walks the even indices until it hits the single NULL that
 * terminates the list and splices the expansion into the mode string.
 */
static const char *replaceTable[]=
{
    /* alias */    /* expansion */
    "default",     "hb:a,vb:a,dr:a",
    "de",          "hb:a,vb:a,dr:a",
    "fast",        "h1:a,v1:a,dr:a",
    "fa",          "h1:a,v1:a,dr:a",
    "ac",          "ha:a:128:7,va:a,dr:a",
    NULL /* end marker */
};
00163 
00164 
#if ARCH_X86
/*
 * Thin wrappers that each issue one x86 prefetch instruction
 * (prefetchnta / prefetcht0 / prefetcht1 / prefetcht2) on the address p.
 * The pointer is passed to the instruction in a general-purpose register.
 */
static inline void prefetchnta(void *p)
{
    __asm__ volatile("prefetchnta (%0)\n\t" : : "r" (p));
}

static inline void prefetcht0(void *p)
{
    __asm__ volatile("prefetcht0 (%0)\n\t" : : "r" (p));
}

static inline void prefetcht1(void *p)
{
    __asm__ volatile("prefetcht1 (%0)\n\t" : : "r" (p));
}

static inline void prefetcht2(void *p)
{
    __asm__ volatile("prefetcht2 (%0)\n\t" : : "r" (p));
}
#endif
00194 
00195 /* The horizontal functions exist only in C because the MMX
00196  * code is faster with vertical filters and transposing. */
00197 
00201 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
00202 {
00203     int numEq= 0;
00204     int y;
00205     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
00206     const int dcThreshold= dcOffset*2 + 1;
00207 
00208     for(y=0; y<BLOCK_SIZE; y++){
00209         if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
00210         if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
00211         if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
00212         if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
00213         if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
00214         if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
00215         if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
00216         src+= stride;
00217     }
00218     return numEq > c->ppMode.flatnessThreshold;
00219 }
00220 
00224 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c)
00225 {
00226     int numEq= 0;
00227     int y;
00228     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
00229     const int dcThreshold= dcOffset*2 + 1;
00230 
00231     src+= stride*4; // src points to begin of the 8x8 Block
00232     for(y=0; y<BLOCK_SIZE-1; y++){
00233         if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
00234         if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
00235         if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
00236         if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
00237         if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
00238         if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
00239         if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
00240         if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
00241         src+= stride;
00242     }
00243     return numEq > c->ppMode.flatnessThreshold;
00244 }
00245 
/**
 * Cheap range check over eight rows.
 *
 * Each row contributes one sampled pixel pair; the column pairs cycle
 * through (0,5), (2,7), (4,1), (6,3) from row to row. The check fails as soon
 * as one sampled difference falls outside [-2*QP, 2*QP].
 *
 * @param src    first pixel of the first row
 * @param stride distance between two rows
 * @param QP     quantizer the tolerance is derived from
 * @return 1 if all eight sampled differences are within range, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    static const uint8_t colPair[4][2] = { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    const uint8_t *row = src;
    int y;

    for (y = 0; y < 8; y++) {
        const int a = row[colPair[y & 3][0]];
        const int b = row[colPair[y & 3][1]];

        if ((unsigned)(a - b + 2*QP) > 4*QP)
            return 0;
        row += stride;
    }
    return 1;
}
00261 
00262 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
00263 {
00264     int x;
00265     src+= stride*4;
00266     for(x=0; x<BLOCK_SIZE; x+=4){
00267         if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
00268         if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
00269         if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
00270         if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
00271     }
00272     return 1;
00273 }
00274 
00275 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c)
00276 {
00277     if( isHorizDC_C(src, stride, c) ){
00278         if( isHorizMinMaxOk_C(src, stride, c->QP) )
00279             return 1;
00280         else
00281             return 0;
00282     }else{
00283         return 2;
00284     }
00285 }
00286 
00287 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c)
00288 {
00289     if( isVertDC_C(src, stride, c) ){
00290         if( isVertMinMaxOk_C(src, stride, c->QP) )
00291             return 1;
00292         else
00293             return 0;
00294     }else{
00295         return 2;
00296     }
00297 }
00298 
00299 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
00300 {
00301     int y;
00302     for(y=0; y<BLOCK_SIZE; y++){
00303         const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
00304 
00305         if(FFABS(middleEnergy) < 8*c->QP){
00306             const int q=(dst[3] - dst[4])/2;
00307             const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
00308             const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
00309 
00310             int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
00311             d= FFMAX(d, 0);
00312 
00313             d= (5*d + 32) >> 6;
00314             d*= FFSIGN(-middleEnergy);
00315 
00316             if(q>0)
00317             {
00318                 d= d<0 ? 0 : d;
00319                 d= d>q ? q : d;
00320             }
00321             else
00322             {
00323                 d= d>0 ? 0 : d;
00324                 d= d<q ? q : d;
00325             }
00326 
00327             dst[3]-= d;
00328             dst[4]+= d;
00329         }
00330         dst+= stride;
00331     }
00332 }
00333 
00338 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
00339 {
00340     int y;
00341     for(y=0; y<BLOCK_SIZE; y++){
00342         const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
00343         const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
00344 
00345         int sums[10];
00346         sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
00347         sums[1] = sums[0] - first  + dst[3];
00348         sums[2] = sums[1] - first  + dst[4];
00349         sums[3] = sums[2] - first  + dst[5];
00350         sums[4] = sums[3] - first  + dst[6];
00351         sums[5] = sums[4] - dst[0] + dst[7];
00352         sums[6] = sums[5] - dst[1] + last;
00353         sums[7] = sums[6] - dst[2] + last;
00354         sums[8] = sums[7] - dst[3] + last;
00355         sums[9] = sums[8] - dst[4] + last;
00356 
00357         dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
00358         dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
00359         dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
00360         dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
00361         dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
00362         dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
00363         dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
00364         dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
00365 
00366         dst+= stride;
00367     }
00368 }
00369 
00378 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
00379 {
00380     int y;
00381     static uint64_t *lut= NULL;
00382     if(lut==NULL)
00383     {
00384         int i;
00385         lut = av_malloc(256*8);
00386         for(i=0; i<256; i++)
00387         {
00388             int v= i < 128 ? 2*i : 2*(i-256);
00389 /*
00390 //Simulate 112242211 9-Tap filter
00391             uint64_t a= (v/16)  & 0xFF;
00392             uint64_t b= (v/8)   & 0xFF;
00393             uint64_t c= (v/4)   & 0xFF;
00394             uint64_t d= (3*v/8) & 0xFF;
00395 */
00396 //Simulate piecewise linear interpolation
00397             uint64_t a= (v/16)   & 0xFF;
00398             uint64_t b= (v*3/16) & 0xFF;
00399             uint64_t c= (v*5/16) & 0xFF;
00400             uint64_t d= (7*v/16) & 0xFF;
00401             uint64_t A= (0x100 - a)&0xFF;
00402             uint64_t B= (0x100 - b)&0xFF;
00403             uint64_t C= (0x100 - c)&0xFF;
00404             uint64_t D= (0x100 - c)&0xFF;
00405 
00406             lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
00407                        (D<<24) | (C<<16) | (B<<8)  | (A);
00408             //lut[i] = (v<<32) | (v<<24);
00409         }
00410     }
00411 
00412     for(y=0; y<BLOCK_SIZE; y++){
00413         int a= src[1] - src[2];
00414         int b= src[3] - src[4];
00415         int c= src[5] - src[6];
00416 
00417         int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
00418 
00419         if(d < QP){
00420             int v = d * FFSIGN(-b);
00421 
00422             src[1] +=v/8;
00423             src[2] +=v/4;
00424             src[3] +=3*v/8;
00425             src[4] -=3*v/8;
00426             src[5] -=v/4;
00427             src[6] -=v/8;
00428         }
00429         src+=stride;
00430     }
00431 }
00432 
00436 static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
00437     int y;
00438     const int QP= c->QP;
00439     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
00440     const int dcThreshold= dcOffset*2 + 1;
00441 //START_TIMER
00442     src+= step*4; // src points to begin of the 8x8 Block
00443     for(y=0; y<8; y++){
00444         int numEq= 0;
00445 
00446         if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
00447         if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
00448         if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
00449         if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
00450         if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
00451         if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
00452         if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
00453         if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
00454         if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
00455         if(numEq > c->ppMode.flatnessThreshold){
00456             int min, max, x;
00457 
00458             if(src[0] > src[step]){
00459                 max= src[0];
00460                 min= src[step];
00461             }else{
00462                 max= src[step];
00463                 min= src[0];
00464             }
00465             for(x=2; x<8; x+=2){
00466                 if(src[x*step] > src[(x+1)*step]){
00467                         if(src[x    *step] > max) max= src[ x   *step];
00468                         if(src[(x+1)*step] < min) min= src[(x+1)*step];
00469                 }else{
00470                         if(src[(x+1)*step] > max) max= src[(x+1)*step];
00471                         if(src[ x   *step] < min) min= src[ x   *step];
00472                 }
00473             }
00474             if(max-min < 2*QP){
00475                 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
00476                 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
00477 
00478                 int sums[10];
00479                 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
00480                 sums[1] = sums[0] - first       + src[3*step];
00481                 sums[2] = sums[1] - first       + src[4*step];
00482                 sums[3] = sums[2] - first       + src[5*step];
00483                 sums[4] = sums[3] - first       + src[6*step];
00484                 sums[5] = sums[4] - src[0*step] + src[7*step];
00485                 sums[6] = sums[5] - src[1*step] + last;
00486                 sums[7] = sums[6] - src[2*step] + last;
00487                 sums[8] = sums[7] - src[3*step] + last;
00488                 sums[9] = sums[8] - src[4*step] + last;
00489 
00490                 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
00491                 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
00492                 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
00493                 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
00494                 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
00495                 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
00496                 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
00497                 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
00498             }
00499         }else{
00500             const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
00501 
00502             if(FFABS(middleEnergy) < 8*QP){
00503                 const int q=(src[3*step] - src[4*step])/2;
00504                 const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
00505                 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
00506 
00507                 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
00508                 d= FFMAX(d, 0);
00509 
00510                 d= (5*d + 32) >> 6;
00511                 d*= FFSIGN(-middleEnergy);
00512 
00513                 if(q>0){
00514                     d= d<0 ? 0 : d;
00515                     d= d>q ? q : d;
00516                 }else{
00517                     d= d>0 ? 0 : d;
00518                     d= d<q ? q : d;
00519                 }
00520 
00521                 src[3*step]-= d;
00522                 src[4*step]+= d;
00523             }
00524         }
00525 
00526         src += stride;
00527     }
00528 /*if(step==16){
00529     STOP_TIMER("step16")
00530 }else{
00531     STOP_TIMER("stepX")
00532 }*/
00533 }
00534 
// Note: there are C, MMX, MMX2 and 3DNow! versions; there is no combined 3DNow!+MMX2 version.
// Plain C versions
00537 #if !(HAVE_MMX || HAVE_ALTIVEC) || CONFIG_RUNTIME_CPUDETECT
00538 #define COMPILE_C
00539 #endif
00540 
00541 #if HAVE_ALTIVEC
00542 #define COMPILE_ALTIVEC
00543 #endif //HAVE_ALTIVEC
00544 
00545 #if ARCH_X86
00546 
00547 #if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
00548 #define COMPILE_MMX
00549 #endif
00550 
00551 #if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT
00552 #define COMPILE_MMX2
00553 #endif
00554 
00555 #if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
00556 #define COMPILE_3DNOW
00557 #endif
00558 #endif /* ARCH_X86 */
00559 
00560 #undef HAVE_MMX
00561 #define HAVE_MMX 0
00562 #undef HAVE_MMX2
00563 #define HAVE_MMX2 0
00564 #undef HAVE_AMD3DNOW
00565 #define HAVE_AMD3DNOW 0
00566 #undef HAVE_ALTIVEC
00567 #define HAVE_ALTIVEC 0
00568 
00569 #ifdef COMPILE_C
00570 #define RENAME(a) a ## _C
00571 #include "postprocess_template.c"
00572 #endif
00573 
00574 #ifdef COMPILE_ALTIVEC
00575 #undef RENAME
00576 #undef HAVE_ALTIVEC
00577 #define HAVE_ALTIVEC 1
00578 #define RENAME(a) a ## _altivec
00579 #include "postprocess_altivec_template.c"
00580 #include "postprocess_template.c"
00581 #endif
00582 
00583 //MMX versions
00584 #ifdef COMPILE_MMX
00585 #undef RENAME
00586 #undef HAVE_MMX
00587 #define HAVE_MMX 1
00588 #define RENAME(a) a ## _MMX
00589 #include "postprocess_template.c"
00590 #endif
00591 
00592 //MMX2 versions
00593 #ifdef COMPILE_MMX2
00594 #undef RENAME
00595 #undef HAVE_MMX
00596 #undef HAVE_MMX2
00597 #define HAVE_MMX 1
00598 #define HAVE_MMX2 1
00599 #define RENAME(a) a ## _MMX2
00600 #include "postprocess_template.c"
00601 #endif
00602 
00603 //3DNOW versions
00604 #ifdef COMPILE_3DNOW
00605 #undef RENAME
00606 #undef HAVE_MMX
00607 #undef HAVE_MMX2
00608 #undef HAVE_AMD3DNOW
00609 #define HAVE_MMX 1
00610 #define HAVE_MMX2 0
00611 #define HAVE_AMD3DNOW 1
00612 #define RENAME(a) a ## _3DNow
00613 #include "postprocess_template.c"
00614 #endif
00615 
// Note: from here on the HAVE_xyz macros hold whatever the last template
// instantiation above set them to, not the values from config.h; treat them with care.
00617 
00618 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
00619         const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
00620 {
00621     PPContext *c= (PPContext *)vc;
00622     PPMode *ppMode= (PPMode *)vm;
00623     c->ppMode= *ppMode; //FIXME
00624 
00625     // Using ifs here as they are faster than function pointers although the
00626     // difference would not be measurable here but it is much better because
00627     // someone might exchange the CPU whithout restarting MPlayer ;)
00628 #if CONFIG_RUNTIME_CPUDETECT
00629 #if ARCH_X86
00630     // ordered per speed fastest first
00631     if(c->cpuCaps & PP_CPU_CAPS_MMX2)
00632         postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00633     else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
00634         postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00635     else if(c->cpuCaps & PP_CPU_CAPS_MMX)
00636         postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00637     else
00638         postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00639 #else
00640 #if HAVE_ALTIVEC
00641     if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
00642             postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00643     else
00644 #endif
00645             postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00646 #endif
00647 #else /* CONFIG_RUNTIME_CPUDETECT */
00648 #if   HAVE_MMX2
00649             postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00650 #elif HAVE_AMD3DNOW
00651             postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00652 #elif HAVE_MMX
00653             postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00654 #elif HAVE_ALTIVEC
00655             postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00656 #else
00657             postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00658 #endif
00659 #endif /* !CONFIG_RUNTIME_CPUDETECT */
00660 }
00661 
00662 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
00663 //        QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
00664 
/*
 * -pp command line help.
 *
 * Declared as a pointer for library versions below 52 and as an array from
 * version 52 on.
 *
 * The long filter names listed here must be spelled exactly as in the
 * filters[] table, because the mode parser matches names with strcmp().
 * The entries for ha, va and fq previously advertised "hadeblock",
 * "vadeblock" and "forceQuant", none of which the parser accepts; they now
 * show the accepted spellings.
 */
#if LIBPOSTPROC_VERSION_INT < (52<<16)
const char *const pp_help=
#else
const char pp_help[] =
#endif
"Available postprocessing filters:\n"
"Filters                        Options\n"
"short  long name       short   long option     Description\n"
"*      *               a       autoq           CPU power dependent enabler\n"
"                       c       chrom           chrominance filtering enabled\n"
"                       y       nochrom         chrominance filtering disabled\n"
"                       n       noluma          luma filtering disabled\n"
"hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
"       1. difference factor: default=32, higher -> more deblocking\n"
"       2. flatness threshold: default=39, lower -> more deblocking\n"
"                       the h & v deblocking filters share these\n"
"                       so you can't set different thresholds for h / v\n"
"vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
"ha     ahdeblock       (2 threshold)           horizontal deblocking filter\n"
"va     avdeblock       (2 threshold)           vertical deblocking filter\n"
"h1     x1hdeblock                              experimental h deblock filter 1\n"
"v1     x1vdeblock                              experimental v deblock filter 1\n"
"dr     dering                                  deringing filter\n"
"al     autolevels                              automatic brightness / contrast\n"
"                       f        fullyrange     stretch luminance to (0..255)\n"
"lb     linblenddeint                           linear blend deinterlacer\n"
"li     linipoldeint                            linear interpolating deinterlace\n"
"ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
"md     mediandeint                             median deinterlacer\n"
"fd     ffmpegdeint                             ffmpeg deinterlacer\n"
"l5     lowpass5                                FIR lowpass deinterlacer\n"
"de     default                                 hb:a,vb:a,dr:a\n"
"fa     fast                                    h1:a,v1:a,dr:a\n"
"ac                                             ha:a:128:7,va:a,dr:a\n"
"tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
"                     1. <= 2. <= 3.            larger -> stronger filtering\n"
"fq     forcequant      <quantizer>             force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb                                   de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
00714 
00715 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
00716 {
00717     char temp[GET_MODE_BUFFER_SIZE];
00718     char *p= temp;
00719     static const char filterDelimiters[] = ",/";
00720     static const char optionDelimiters[] = ":";
00721     struct PPMode *ppMode;
00722     char *filterToken;
00723 
00724     ppMode= av_malloc(sizeof(PPMode));
00725 
00726     ppMode->lumMode= 0;
00727     ppMode->chromMode= 0;
00728     ppMode->maxTmpNoise[0]= 700;
00729     ppMode->maxTmpNoise[1]= 1500;
00730     ppMode->maxTmpNoise[2]= 3000;
00731     ppMode->maxAllowedY= 234;
00732     ppMode->minAllowedY= 16;
00733     ppMode->baseDcDiff= 256/8;
00734     ppMode->flatnessThreshold= 56-16-1;
00735     ppMode->maxClippedThreshold= 0.01;
00736     ppMode->error=0;
00737 
00738     memset(temp, 0, GET_MODE_BUFFER_SIZE);
00739     av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
00740 
00741     av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
00742 
00743     for(;;){
00744         char *filterName;
00745         int q= 1000000; //PP_QUALITY_MAX;
00746         int chrom=-1;
00747         int luma=-1;
00748         char *option;
00749         char *options[OPTIONS_ARRAY_SIZE];
00750         int i;
00751         int filterNameOk=0;
00752         int numOfUnknownOptions=0;
00753         int enable=1; //does the user want us to enabled or disabled the filter
00754 
00755         filterToken= strtok(p, filterDelimiters);
00756         if(filterToken == NULL) break;
00757         p+= strlen(filterToken) + 1; // p points to next filterToken
00758         filterName= strtok(filterToken, optionDelimiters);
00759         av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
00760 
00761         if(*filterName == '-'){
00762             enable=0;
00763             filterName++;
00764         }
00765 
00766         for(;;){ //for all options
00767             option= strtok(NULL, optionDelimiters);
00768             if(option == NULL) break;
00769 
00770             av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
00771             if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
00772             else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
00773             else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
00774             else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
00775             else{
00776                 options[numOfUnknownOptions] = option;
00777                 numOfUnknownOptions++;
00778             }
00779             if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
00780         }
00781         options[numOfUnknownOptions] = NULL;
00782 
00783         /* replace stuff from the replace Table */
00784         for(i=0; replaceTable[2*i]!=NULL; i++){
00785             if(!strcmp(replaceTable[2*i], filterName)){
00786                 int newlen= strlen(replaceTable[2*i + 1]);
00787                 int plen;
00788                 int spaceLeft;
00789 
00790                 p--, *p=',';
00791 
00792                 plen= strlen(p);
00793                 spaceLeft= p - temp + plen;
00794                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE - 1){
00795                     ppMode->error++;
00796                     break;
00797                 }
00798                 memmove(p + newlen, p, plen+1);
00799                 memcpy(p, replaceTable[2*i + 1], newlen);
00800                 filterNameOk=1;
00801             }
00802         }
00803 
00804         for(i=0; filters[i].shortName!=NULL; i++){
00805             if(   !strcmp(filters[i].longName, filterName)
00806                || !strcmp(filters[i].shortName, filterName)){
00807                 ppMode->lumMode &= ~filters[i].mask;
00808                 ppMode->chromMode &= ~filters[i].mask;
00809 
00810                 filterNameOk=1;
00811                 if(!enable) break; // user wants to disable it
00812 
00813                 if(q >= filters[i].minLumQuality && luma)
00814                     ppMode->lumMode|= filters[i].mask;
00815                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
00816                     if(q >= filters[i].minChromQuality)
00817                             ppMode->chromMode|= filters[i].mask;
00818 
00819                 if(filters[i].mask == LEVEL_FIX){
00820                     int o;
00821                     ppMode->minAllowedY= 16;
00822                     ppMode->maxAllowedY= 234;
00823                     for(o=0; options[o]!=NULL; o++){
00824                         if(  !strcmp(options[o],"fullyrange")
00825                            ||!strcmp(options[o],"f")){
00826                             ppMode->minAllowedY= 0;
00827                             ppMode->maxAllowedY= 255;
00828                             numOfUnknownOptions--;
00829                         }
00830                     }
00831                 }
00832                 else if(filters[i].mask == TEMP_NOISE_FILTER)
00833                 {
00834                     int o;
00835                     int numOfNoises=0;
00836 
00837                     for(o=0; options[o]!=NULL; o++){
00838                         char *tail;
00839                         ppMode->maxTmpNoise[numOfNoises]=
00840                             strtol(options[o], &tail, 0);
00841                         if(tail!=options[o]){
00842                             numOfNoises++;
00843                             numOfUnknownOptions--;
00844                             if(numOfNoises >= 3) break;
00845                         }
00846                     }
00847                 }
00848                 else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
00849                      || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
00850                     int o;
00851 
00852                     for(o=0; options[o]!=NULL && o<2; o++){
00853                         char *tail;
00854                         int val= strtol(options[o], &tail, 0);
00855                         if(tail==options[o]) break;
00856 
00857                         numOfUnknownOptions--;
00858                         if(o==0) ppMode->baseDcDiff= val;
00859                         else ppMode->flatnessThreshold= val;
00860                     }
00861                 }
00862                 else if(filters[i].mask == FORCE_QUANT){
00863                     int o;
00864                     ppMode->forcedQuant= 15;
00865 
00866                     for(o=0; options[o]!=NULL && o<1; o++){
00867                         char *tail;
00868                         int val= strtol(options[o], &tail, 0);
00869                         if(tail==options[o]) break;
00870 
00871                         numOfUnknownOptions--;
00872                         ppMode->forcedQuant= val;
00873                     }
00874                 }
00875             }
00876         }
00877         if(!filterNameOk) ppMode->error++;
00878         ppMode->error += numOfUnknownOptions;
00879     }
00880 
00881     av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
00882     if(ppMode->error){
00883         av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
00884         av_free(ppMode);
00885         return NULL;
00886     }
00887     return ppMode;
00888 }
00889 
00890 void pp_free_mode(pp_mode *mode){
00891     av_free(mode);
00892 }
00893 
/**
 * Replace the buffer behind *p with a fresh zero-filled one.
 *
 * The old buffer is released first, so its contents are not carried over.
 *
 * @param p         address of the buffer pointer to replace
 * @param alignment not used by this implementation
 * @param size      number of bytes requested for the new buffer
 */
static void reallocAlign(void **p, int alignment, int size)
{
    void *fresh;

    av_free(*p);
    fresh = av_mallocz(size);
    *p = fresh;
}
00898 
00899 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
00900     int mbWidth = (width+15)>>4;
00901     int mbHeight= (height+15)>>4;
00902     int i;
00903 
00904     c->stride= stride;
00905     c->qpStride= qpStride;
00906 
00907     reallocAlign((void **)&c->tempDst, 8, stride*24);
00908     reallocAlign((void **)&c->tempSrc, 8, stride*24);
00909     reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
00910     reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
00911     for(i=0; i<256; i++)
00912             c->yHistogram[i]= width*height/64*15/256;
00913 
00914     for(i=0; i<3; i++){
00915         //Note: The +17*1024 is just there so i do not have to worry about r/w over the end.
00916         reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024);
00917         reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
00918     }
00919 
00920     reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
00921     reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
00922     reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
00923     reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
00924 }
00925 
/**
 * Name callback for the postprocessing log class.
 *
 * @param ptr ignored
 * @return the constant string "postproc"
 */
static const char * context_to_name(void * ptr)
{
    static const char name[] = "postproc";

    (void)ptr;
    return name;
}
00929 
/* Class descriptor stored in PPContext.av_class by pp_get_context(); it pairs
 * the class name "Postproc" with context_to_name() as the name callback.
 * Presumably this is what lets av_log() calls that pass the context label
 * their output -- confirm against the AVClass definition. */
static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
00931 
00932 pp_context *pp_get_context(int width, int height, int cpuCaps){
00933     PPContext *c= av_malloc(sizeof(PPContext));
00934     int stride= FFALIGN(width, 16);  //assumed / will realloc if needed
00935     int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
00936 
00937     memset(c, 0, sizeof(PPContext));
00938     c->av_class = &av_codec_context_class;
00939     c->cpuCaps= cpuCaps;
00940     if(cpuCaps&PP_FORMAT){
00941         c->hChromaSubSample= cpuCaps&0x3;
00942         c->vChromaSubSample= (cpuCaps>>4)&0x3;
00943     }else{
00944         c->hChromaSubSample= 1;
00945         c->vChromaSubSample= 1;
00946     }
00947 
00948     reallocBuffers(c, width, height, stride, qpStride);
00949 
00950     c->frameNum=-1;
00951 
00952     return c;
00953 }
00954 
00955 void pp_free_context(void *vc){
00956     PPContext *c = (PPContext*)vc;
00957     int i;
00958 
00959     for(i=0; i<3; i++) av_free(c->tempBlurred[i]);
00960     for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]);
00961 
00962     av_free(c->tempBlocks);
00963     av_free(c->yHistogram);
00964     av_free(c->tempDst);
00965     av_free(c->tempSrc);
00966     av_free(c->deintTemp);
00967     av_free(c->stdQPTable);
00968     av_free(c->nonBQPTable);
00969     av_free(c->forcedQPTable);
00970 
00971     memset(c, 0, sizeof(PPContext));
00972 
00973     av_free(c);
00974 }
00975 
00976 void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
00977                      uint8_t * dst[3], const int dstStride[3],
00978                      int width, int height,
00979                      const QP_STORE_T *QP_store,  int QPStride,
00980                      pp_mode *vm,  void *vc, int pict_type)
00981 {
00982     int mbWidth = (width+15)>>4;
00983     int mbHeight= (height+15)>>4;
00984     PPMode *mode = (PPMode*)vm;
00985     PPContext *c = (PPContext*)vc;
00986     int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
00987     int absQPStride = FFABS(QPStride);
00988 
00989     // c->stride and c->QPStride are always positive
00990     if(c->stride < minStride || c->qpStride < absQPStride)
00991         reallocBuffers(c, width, height,
00992                        FFMAX(minStride, c->stride),
00993                        FFMAX(c->qpStride, absQPStride));
00994 
00995     if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){
00996         int i;
00997         QP_store= c->forcedQPTable;
00998         absQPStride = QPStride = 0;
00999         if(mode->lumMode & FORCE_QUANT)
01000             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
01001         else
01002             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
01003     }
01004 
01005     if(pict_type & PP_PICT_TYPE_QP2){
01006         int i;
01007         const int count= mbHeight * absQPStride;
01008         for(i=0; i<(count>>2); i++){
01009             ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
01010         }
01011         for(i<<=2; i<count; i++){
01012             c->stdQPTable[i] = QP_store[i]>>1;
01013         }
01014         QP_store= c->stdQPTable;
01015         QPStride= absQPStride;
01016     }
01017 
01018     if(0){
01019         int x,y;
01020         for(y=0; y<mbHeight; y++){
01021             for(x=0; x<mbWidth; x++){
01022                 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
01023             }
01024             av_log(c, AV_LOG_INFO, "\n");
01025         }
01026         av_log(c, AV_LOG_INFO, "\n");
01027     }
01028 
01029     if((pict_type&7)!=3){
01030         if (QPStride >= 0){
01031             int i;
01032             const int count= mbHeight * QPStride;
01033             for(i=0; i<(count>>2); i++){
01034                 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
01035             }
01036             for(i<<=2; i<count; i++){
01037                 c->nonBQPTable[i] = QP_store[i] & 0x3F;
01038             }
01039         } else {
01040             int i,j;
01041             for(i=0; i<mbHeight; i++) {
01042                 for(j=0; j<absQPStride; j++) {
01043                     c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
01044                 }
01045             }
01046         }
01047     }
01048 
01049     av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
01050            mode->lumMode, mode->chromMode);
01051 
01052     postProcess(src[0], srcStride[0], dst[0], dstStride[0],
01053                 width, height, QP_store, QPStride, 0, mode, c);
01054 
01055     width  = (width )>>c->hChromaSubSample;
01056     height = (height)>>c->vChromaSubSample;
01057 
01058     if(mode->chromMode){
01059         postProcess(src[1], srcStride[1], dst[1], dstStride[1],
01060                     width, height, QP_store, QPStride, 1, mode, c);
01061         postProcess(src[2], srcStride[2], dst[2], dstStride[2],
01062                     width, height, QP_store, QPStride, 2, mode, c);
01063     }
01064     else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
01065         linecpy(dst[1], src[1], height, srcStride[1]);
01066         linecpy(dst[2], src[2], height, srcStride[2]);
01067     }else{
01068         int y;
01069         for(y=0; y<height; y++){
01070             memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
01071             memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
01072         }
01073     }
01074 }