[FFmpeg-devel] [PATCH] A rather simple H.264 speed optimization

Michael Niedermayer michaelni
Mon Jul 28 20:33:56 CEST 2008


On Mon, Jul 28, 2008 at 09:44:06AM -0600, Jason Garrett-Glaser wrote:
> On Mon, Jul 28, 2008 at 9:19 AM, Michael Niedermayer <michaelni at gmx.at> wrote:
> > On Sun, Jul 27, 2008 at 10:50:29PM -0600, Jason Garrett-Glaser wrote:
> >> $subject, gains 6 clock cycles or so per decode_cabac_residual call on
> >> an ordinary source.
> >
> > ok
> 
> applied
> 
> round 2:
> 
> I used the extra casts in the p[0], p[8], etc. cases for clarity, to
> avoid changing the array indices.
> 
> Index: libavcodec/h264.c
> ===================================================================
> --- libavcodec/h264.c	(revision 14459)
> +++ libavcodec/h264.c	(working copy)
> @@ -354,19 +354,15 @@
>              if(USES_LIST(top_type, list)){
>                  const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
>                  const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
> -                *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
> -                *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
> -                *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
> -                *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
> +                *(uint64_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0];
> +                *(uint64_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2];

The copy of this patch that reached the list looks like it has been salted
with extra \n (your mailer wrapped the long lines, it seems) ...
I doubt patch would accept that as input ;)
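
On the substance, merging two 32-bit stores into one 64-bit store is fine
as long as the rows are 8-byte aligned, which the cache declarations should
guarantee. A minimal sketch of the idea (hypothetical names, not the actual
h264.c code):

    #include <stdint.h>

    /* Each motion vector is int16_t[2] (4 bytes); two adjacent ones can
     * be copied with a single 64-bit load/store instead of two 32-bit
     * ones, assuming both arrays are 8-byte aligned. */
    static inline void copy_mv_pair(int16_t dst[2][2], const int16_t src[2][2])
    {
        *(uint64_t*)dst = *(const uint64_t*)src;
    }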



>                  h->ref_cache[list][scan8[0] + 0 - 1*8]=
>                  h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
>                  h->ref_cache[list][scan8[0] + 2 - 1*8]=
>                  h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
>              }else{
> -                *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
> -                *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
> -                *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
> -                *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
> +                *(uint64_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
> +                *(uint64_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]= 0;
>                  *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
>              }
> 
> @@ -428,15 +424,11 @@
>                  /* XXX beurk, Load mvd */
>                  if(USES_LIST(top_type, list)){
>                      const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
> -                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
> -                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
> -                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
> -                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
> +                    *(uint64_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint64_t*)h->mvd_table[list][b_xy + 0];
> +                    *(uint64_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint64_t*)h->mvd_table[list][b_xy + 2];

How does this affect benchmark scores? [And on what CPU, in 32- or 64-bit mode?]
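
The usual way to answer that is to wrap the touched code path in the timer
macros from libavutil and compare the printed cycle counts before and after
the patch. A sketch, assuming an in-tree build (fill_caches() is the
function these hunks live in):

    /* Sketch: time the code path touched by the hunk with libavutil's
     * START_TIMER/STOP_TIMER, which print cycle statistics over many runs. */
    #include "libavutil/common.h"   /* provides START_TIMER / STOP_TIMER */

    static void bench_fill_caches(H264Context *h, int mb_type)
    {
        START_TIMER
        fill_caches(h, mb_type, 0); /* the function containing these hunks */
        STOP_TIMER("fill_caches")
    }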



[...]
> @@ -5698,6 +5690,7 @@
>                          int mpx, mpy;
>                          int mx, my;
>                          const int index= 4*i + block_width*j;
> +                        uint32_t mv, mvd;
>                          int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
>                          int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
>                          pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
> @@ -5706,40 +5699,26 @@
>                          my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
>                          tprintf(s->avctx, "final mv:%d %d\n", mx, my);
> 
> +                        mv  = pack16to32( mx, my );
> +                        mvd = pack16to32( mx - mpx, my - mpy );
>                          if(IS_SUB_8X8(sub_mb_type)){
> -                            mv_cache[ 1 ][0]=
> -                            mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
> -                            mv_cache[ 1 ][1]=
> -                            mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
> -
> -                            mvd_cache[ 1 ][0]=
> -                            mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
> -                            mvd_cache[ 1 ][1]=
> -                            mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
> +                            *(uint32_t*) mv_cache[1]=*(uint32_t*) mv_cache[8]=*(uint32_t*) mv_cache[9]= mv;
> +                            *(uint32_t*)mvd_cache[1]=*(uint32_t*)mvd_cache[8]=*(uint32_t*)mvd_cache[9]= mvd;
>                          }else if(IS_SUB_8X4(sub_mb_type)){
> -                            mv_cache[ 1 ][0]= mx;
> -                            mv_cache[ 1 ][1]= my;
> -
> -                            mvd_cache[ 1 ][0]= mx - mpx;
> -                            mvd_cache[ 1 ][1]= my - mpy;
> +                            *(uint32_t*) mv_cache[1]= mv;
> +                            *(uint32_t*)mvd_cache[1]= mvd;
>                          }else if(IS_SUB_4X8(sub_mb_type)){
> -                            mv_cache[ 8 ][0]= mx;
> -                            mv_cache[ 8 ][1]= my;
> -
> -                            mvd_cache[ 8 ][0]= mx - mpx;
> -                            mvd_cache[ 8 ][1]= my - mpy;
> +                            *(uint32_t*) mv_cache[8]= mv;
> +                            *(uint32_t*)mvd_cache[8]= mvd;
>                          }
> -                        mv_cache[ 0 ][0]= mx;
> -                        mv_cache[ 0 ][1]= my;
> -
> -                        mvd_cache[ 0 ][0]= mx - mpx;
> -                        mvd_cache[ 0 ][1]= my - mpy;
> +                        *(uint32_t*) mv_cache[0]= mv;
> +                        *(uint32_t*)mvd_cache[0]= mvd;
>                      }

This looks independent of the 32->64 change and should be in a separate
patch. (It's impossible to benchmark either correctly if they are combined.)
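
For reference, the mv packing half relies on pack16to32() from h264.h; it
orders the two 16-bit components so the resulting uint32_t has the same
in-memory layout as an int16_t[2], roughly:

    #include <stdint.h>

    /* Rough equivalent of pack16to32() from h264.h: combine two 16-bit mv
     * components into one uint32_t whose byte layout matches int16_t[2],
     * so a single 32-bit store writes a whole (x,y) pair at once. */
    static inline uint32_t pack16to32(int a, int b)
    {
    #ifdef WORDS_BIGENDIAN
        return (b & 0xffff) + (a << 16);
    #else
        return (a & 0xffff) + (b << 16);
    #endif
    }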


>                  }else{
> -                    uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
> -                    uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
> +                    uint32_t *p = (uint32_t*)&h->mv_cache[list][ scan8[4*i] ][0];
> +                    uint32_t *pd= (uint32_t*)&h->mvd_cache[list][ scan8[4*i] ][0];

Did anything besides whitespace change in these?



[...]
> @@ -5908,7 +5887,7 @@
>                      }
>                  } else {
>                      uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
> -                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
> +                    *(uint16_t*)&nnz[0] = *(uint16_t*)&nnz[8] = 0;
>                  }
>              }
>          }
> @@ -5933,14 +5912,14 @@
>              }
>          } else {
>              uint8_t * const nnz= &h->non_zero_count_cache[0];
> -            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
> -            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
> +            *(uint16_t*)&nnz[ scan8[16]+0 ] = *(uint16_t*)&nnz[ scan8[16]+8 ] =
> +            *(uint16_t*)&nnz[ scan8[20]+0 ] = *(uint16_t*)&nnz[ scan8[20]+8 ] = 0;
>          }
>      } else {
>          uint8_t * const nnz= &h->non_zero_count_cache[0];
>          fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
> -        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
> -        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
> +        *(uint16_t*)&nnz[ scan8[16]+0 ] = *(uint16_t*)&nnz[ scan8[16]+8 ] =
> +        *(uint16_t*)&nnz[ scan8[20]+0 ] = *(uint16_t*)&nnz[ scan8[20]+8 ] = 0;
>          h->last_qscale_diff = 0;
>      }

ok
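
(Same idea as the mv case: the scan8 layout keeps those nnz entries in
adjacent bytes, so each pair collapses into one 16-bit store. A sketch,
assuming the cache rows are at least 2-byte aligned:)

    #include <stdint.h>

    /* Sketch: the chroma nnz entries at +0/+1 and +8/+9 are adjacent
     * bytes, so each pair can be cleared with one 16-bit store instead
     * of two byte stores. */
    static inline void clear_nnz_pairs(uint8_t *nnz)
    {
        *(uint16_t*)&nnz[0] = 0;
        *(uint16_t*)&nnz[8] = 0;
    }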



-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Republics decline into democracies and democracies degenerate into
despotisms. -- Aristotle