[Ffmpeg-cvslog] CVS: ffmpeg/libavcodec dsputil.c, 1.133, 1.134 dsputil.h, 1.127, 1.128 h264.c, 1.186, 1.187 h264idct.c, 1.4, 1.5

Loren Merritt CVS lorenm
Fri Feb 10 07:55:28 CET 2006


Update of /cvsroot/ffmpeg/ffmpeg/libavcodec
In directory mail:/var2/tmp/cvs-serv3337

Modified Files:
	dsputil.c dsputil.h h264.c h264idct.c 
Log Message:
h264: special-case the DC-only IDCT. ~1% faster overall.



Index: dsputil.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.c,v
retrieving revision 1.133
retrieving revision 1.134
diff -u -d -r1.133 -r1.134
--- dsputil.c	5 Feb 2006 13:35:16 -0000	1.133
+++ dsputil.c	10 Feb 2006 06:55:24 -0000	1.134
@@ -3851,6 +3851,8 @@
 
     c->h264_idct_add= ff_h264_idct_add_c;
     c->h264_idct8_add= ff_h264_idct8_add_c;
+    c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
+    c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
 
     c->get_pixels = get_pixels_c;
     c->diff_pixels = diff_pixels_c;

Index: dsputil.h
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.h,v
retrieving revision 1.127
retrieving revision 1.128
diff -u -d -r1.127 -r1.128
--- dsputil.h	5 Feb 2006 13:35:16 -0000	1.127
+++ dsputil.h	10 Feb 2006 06:55:24 -0000	1.128
@@ -52,6 +52,8 @@
 
 void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
 void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
 
@@ -330,6 +332,8 @@
 
     void (*h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride);
     void (*h264_idct8_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*h264_idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*h264_idct8_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
 } DSPContext;
 
 void dsputil_static_init(void);

Index: h264.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/h264.c,v
retrieving revision 1.186
retrieving revision 1.187
diff -u -d -r1.186 -r1.187
--- h264.c	8 Feb 2006 07:04:32 -0000	1.186
+++ h264.c	10 Feb 2006 06:55:25 -0000	1.187
@@ -3314,6 +3314,7 @@
     const unsigned int bottom = mb_y & 1;
     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
 
     if(!s->decode)
         return;
@@ -3337,9 +3338,16 @@
 //        dct_offset = s->linesize * 16;
     }
 
-    idct_add = transform_bypass
-             ? IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4
-             : IS_8x8DCT(mb_type) ? s->dsp.h264_idct8_add : s->dsp.h264_idct_add;
+    if(transform_bypass){
+        idct_dc_add =
+        idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
+    }else if(IS_8x8DCT(mb_type)){
+        idct_dc_add = s->dsp.h264_idct8_dc_add;
+        idct_add = s->dsp.h264_idct8_add;
+    }else{
+        idct_dc_add = s->dsp.h264_idct_dc_add;
+        idct_add = s->dsp.h264_idct_add;
+    }
 
     if (IS_INTRA_PCM(mb_type)) {
         unsigned int x, y;
@@ -3389,17 +3397,22 @@
                         for(i=0; i<16; i+=4){
                             uint8_t * const ptr= dest_y + block_offset[i];
                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
+                            const int nnz = h->non_zero_count_cache[ scan8[i] ];
                             h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                    (h->topright_samples_available<<(i+1))&0x8000, linesize);
-                            if(h->non_zero_count_cache[ scan8[i] ])
-                                idct_add(ptr, h->mb + i*16, linesize);
+                            if(nnz){
+                                if(nnz == 1 && h->mb[i*16])
+                                    idct_dc_add(ptr, h->mb + i*16, linesize);
+                                else
+                                    idct_add(ptr, h->mb + i*16, linesize);
+                            }
                         }
                     }else
                     for(i=0; i<16; i++){
                         uint8_t * const ptr= dest_y + block_offset[i];
                         uint8_t *topright;
                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
-                        int tr;
+                        int nnz, tr;
 
                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
@@ -3413,10 +3426,14 @@
                             topright= NULL;
 
                         h->pred4x4[ dir ](ptr, topright, linesize);
-                        if(h->non_zero_count_cache[ scan8[i] ]){
-                            if(s->codec_id == CODEC_ID_H264)
-                                idct_add(ptr, h->mb + i*16, linesize);
-                            else
+                        nnz = h->non_zero_count_cache[ scan8[i] ];
+                        if(nnz){
+                            if(s->codec_id == CODEC_ID_H264){
+                                if(nnz == 1 && h->mb[i*16])
+                                    idct_dc_add(ptr, h->mb + i*16, linesize);
+                                else
+                                    idct_add(ptr, h->mb + i*16, linesize);
+                            }else
                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                         }
                     }
@@ -3453,11 +3470,23 @@
 
         if(!IS_INTRA4x4(mb_type)){
             if(s->codec_id == CODEC_ID_H264){
-                const int di = IS_8x8DCT(mb_type) ? 4 : 1;
-                for(i=0; i<16; i+=di){
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
-                        uint8_t * const ptr= dest_y + block_offset[i];
-                        idct_add(ptr, h->mb + i*16, linesize);
+                if(IS_INTRA16x16(mb_type)){
+                    for(i=0; i<16; i++){
+                        if(h->non_zero_count_cache[ scan8[i] ])
+                            idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+                        else if(h->mb[i*16])
+                            idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+                    }
+                }else{
+                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
+                    for(i=0; i<16; i+=di){
+                        int nnz = h->non_zero_count_cache[ scan8[i] ];
+                        if(nnz){
+                            if(nnz==1 && h->mb[i*16])
+                                idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+                            else
+                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+                        }
                     }
                 }
             }else{
@@ -3471,34 +3500,26 @@
         }
 
         if(!(s->flags&CODEC_FLAG_GRAY)){
-            idct_add = transform_bypass ? s->dsp.add_pixels4 : s->dsp.h264_idct_add;
-            if(!transform_bypass){
+            uint8_t *dest[2] = {dest_cb, dest_cr};
+            if(transform_bypass){
+                idct_add = idct_dc_add = s->dsp.add_pixels4;
+            }else{
+                idct_add = s->dsp.h264_idct_add;
+                idct_dc_add = s->dsp.h264_idct_dc_add;
                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
             }
             if(s->codec_id == CODEC_ID_H264){
-                for(i=16; i<16+4; i++){
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                        uint8_t * const ptr= dest_cb + block_offset[i];
-                        idct_add(ptr, h->mb + i*16, uvlinesize);
-                    }
-                }
-                for(i=20; i<20+4; i++){
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                        uint8_t * const ptr= dest_cr + block_offset[i];
-                        idct_add(ptr, h->mb + i*16, uvlinesize);
-                    }
+                for(i=16; i<16+8; i++){
+                    if(h->non_zero_count_cache[ scan8[i] ])
+                        idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
+                    else if(h->mb[i*16])
+                        idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                 }
             }else{
-                for(i=16; i<16+4; i++){
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                        uint8_t * const ptr= dest_cb + block_offset[i];
-                        svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
-                    }
-                }
-                for(i=20; i<20+4; i++){
+                for(i=16; i<16+8; i++){
                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                        uint8_t * const ptr= dest_cr + block_offset[i];
+                        uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                     }
                 }
@@ -5131,7 +5152,7 @@
                                 return -1;
                         }
                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
-                        nnz[0] |= nnz[1] | nnz[8] | nnz[9];
+                        nnz[0] += nnz[1] + nnz[8] + nnz[9];
                     }else{
                         for(i4x4=0; i4x4<4; i4x4++){
                             const int index= i4x4 + 4*i8x8;
@@ -5690,7 +5711,7 @@
         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
     else {
         assert( cat == 5 );
-        fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 1, 1);
+        fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
     }
 
     for( i = coeff_count - 1; i >= 0; i-- ) {

Index: h264idct.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/h264idct.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- h264idct.c	12 Jan 2006 22:43:15 -0000	1.4
+++ h264idct.c	10 Feb 2006 06:55:25 -0000	1.5
@@ -139,3 +139,28 @@
         dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ];
     }
 }
+
+// assumes all AC coefs are 0
+void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
+    int i, j;
+    uint8_t *cm = cropTbl + MAX_NEG_CROP;
+    int dc = (block[0] + 32) >> 6;
+    for( j = 0; j < 4; j++ )
+    {
+        for( i = 0; i < 4; i++ )
+            dst[i] = cm[ dst[i] + dc ];
+        dst += stride;
+    }
+}
+
+void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
+    int i, j;
+    uint8_t *cm = cropTbl + MAX_NEG_CROP;
+    int dc = (block[0] + 32) >> 6;
+    for( j = 0; j < 8; j++ )
+    {
+        for( i = 0; i < 8; i++ )
+            dst[i] = cm[ dst[i] + dc ];
+        dst += stride;
+    }
+}





More information about the ffmpeg-cvslog mailing list