[Ffmpeg-devel] [PATCH] put_mpeg4_qpel16_h_lowpass altivec, take 2

Luca Barbato lu_zero
Sun Nov 26 17:23:35 CET 2006


Brian Foley wrote:
> On Mon, Nov 20, 2006 at 02:43:17AM +0100, Michael Niedermayer wrote:
>> Hi
>>
>> On Sun, Nov 19, 2006 at 11:20:14PM +0000, Brian Foley wrote:
>>> +static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
>> code duplication, move copy_block17 to a common header like dsputil.h or
>> dsputil_internal.h or whatever, dont copy and paste it
> 
> OK, done. I've moved all the copy_block* stuff into dsputil.h. sh4 was
> using a copy of this too as it turns out.

make it a separate patch alone.

> 
> I'm not really sure the best way to handle this. I could have an aligned
> load in an 'if (((int) src & 0xf) == 0)', but I suspect the branch would
> hurt us quite badly. The other approach is to have 3 other functions
> where we assert src1 or src2 or both (and their strides) are aligned.

requires benchmark =/

> 
> I guess I should look at it with simg4 at some point, but there are much
> more CPU intensive functions that could be sped up first.

try using valgrind callgrind and oprofile if you could ^^;

> 
> Interesting. I implemented your suggestion, and it actually does work out
> a little faster, and it saves on setting up a couple of constant vectors
> too, which is no bad thing. The other tweaks suggested here save reading
> another constant vector from memory too.

good =)

> I've tidied up the use of types so it compiles cleanly with GCC 3.4 on
> Linux/PPC. Hopefully it should be about ready to commit now.

Comment inlined as usual

> 
> +
> +static void put_pixels16_l2_altivec(uint8_t *dst, const uint8_t *src1,
> +        const uint8_t *src2, int dst_stride, int src_stride1,
> +        int src_stride2, int h)
> +{
> +    register vector unsigned char src1v, src2v, dstv;
> +    register vector unsigned char tmp1, tmp2, mask, edges, align;
> +    int i;
> +
> +    for(i=0; i<h; i++) {
> +        /* Unaligned load */
> +        src1v = vec_perm(
> +            vec_ld(0, src1), vec_ld(15, src1), vec_lvsl(0, src1));
> +        src2v = vec_perm(
> +            vec_ld(0, src2), vec_ld(15, src2), vec_lvsl(0, src2));

if the stride is a multiple of 16 you could put vec_lvsl out the loop

> +
> +        /*Altivec's vec_avg is exactly the (a+b+1)>>1 that we want */
> +        dstv = vec_avg(src1v, src2v);
> +
> +        if ((int)dst & 0xf) {
> +            /* Unaligned store */
> +            tmp2 = vec_ld(15, dst);
> +            tmp1 = vec_ld(0, dst);
> +        
> +            mask = vec_lvsl(0, dst);
> +            edges = vec_perm(tmp2, tmp1, mask);
> +            align = vec_lvsr(0, dst);

same for mask and align

> +static void put_mpeg4_qpel16_h_lowpass_altivec(uint8_t *dst, uint8_t *src,
> +        int dstStride, int srcStride, int height)
> +{
> +    POWERPC_PERF_DECLARE(put_mpeg4_qpel16_h_lowpass_altivec, 1);

are you really using those stuff? if not you could avoid them (less
lines is always better)

> Index: ppc/dsputil_ppc.c
> ===================================================================
> --- ppc/dsputil_ppc.c	(revision 7166)
> +++ ppc/dsputil_ppc.c	(working copy)
> @@ -35,6 +35,7 @@
>  
>  void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx);
>  
> +void dsputil_mpeg4_init_altivec(DSPContext* c, AVCodecContext *avctx);
>  void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx);
>  void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx);
>  void snow_init_altivec(DSPContext* c, AVCodecContext *avctx);
> @@ -274,7 +275,12 @@
>      }
>  
>  #ifdef HAVE_ALTIVEC
> -    if(ENABLE_H264_DECODER) dsputil_h264_init_ppc(c, avctx);
> +#ifdef ENABLE_H264_DECODER
> +    dsputil_h264_init_ppc(c, avctx);
> +#endif

cosmetic....


> +#ifdef ENABLE_MPEG4_DECODER
> +    dsputil_mpeg4_init_altivec(c, avctx);
> +#endif
>  
>      if (has_altivec()) {
>          mm_flags |= MM_ALTIVEC;
> Index: dsputil.c
> ===================================================================
> --- dsputil.c	(revision 7166)
> +++ dsputil.c	(working copy)
> @@ -1513,83 +1513,7 @@
>      }
>  }
>  
> -static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> -{
> -    int i;
> -    for(i=0; i<h; i++)
> -    {
> -        ST16(dst   , LD16(src   ));
> -        dst+=dstStride;
> -        src+=srcStride;
> -    }
> -}
>  
> -static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> -{
> -    int i;
> -    for(i=0; i<h; i++)
> -    {
> -        ST32(dst   , LD32(src   ));
> -        dst+=dstStride;
> -        src+=srcStride;
> -    }
> -}
> -
> -static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> -{
> -    int i;
> -    for(i=0; i<h; i++)
> -    {
> -        ST32(dst   , LD32(src   ));
> -        ST32(dst+4 , LD32(src+4 ));
> -        dst+=dstStride;
> -        src+=srcStride;
> -    }
> -}
> -
> -static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> -{
> -    int i;
> -    for(i=0; i<h; i++)
> -    {
> -        ST32(dst   , LD32(src   ));
> -        ST32(dst+4 , LD32(src+4 ));
> -        ST32(dst+8 , LD32(src+8 ));
> -        ST32(dst+12, LD32(src+12));
> -        dst+=dstStride;
> -        src+=srcStride;
> -    }
> -}
> -
> -static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> -{
> -    int i;
> -    for(i=0; i<h; i++)
> -    {
> -        ST32(dst   , LD32(src   ));
> -        ST32(dst+4 , LD32(src+4 ));
> -        ST32(dst+8 , LD32(src+8 ));
> -        ST32(dst+12, LD32(src+12));
> -        dst[16]= src[16];
> -        dst+=dstStride;
> -        src+=srcStride;
> -    }
> -}
> -
> -static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> -{
> -    int i;
> -    for(i=0; i<h; i++)
> -    {
> -        ST32(dst   , LD32(src   ));
> -        ST32(dst+4 , LD32(src+4 ));
> -        dst[8]= src[8];
> -        dst+=dstStride;
> -        src+=srcStride;
> -    }
> -}
> -
> -
>  #define QPEL_MC(r, OPNAME, RND, OP) \
>  static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
>      uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\


Make the following a separate patch please

> Index: dsputil.h
> ===================================================================
> --- dsputil.h	(revision 7166)
> +++ dsputil.h	(working copy)


I'm unsure about it, those function aren't mpeg4-only?

> Index: Makefile
> ===================================================================
> --- Makefile	(revision 7166)
> +++ Makefile	(working copy)
> @@ -383,6 +383,7 @@
>                                            sh4/dsputil_align.o \
>  
>  OBJS-$(TARGET_ALTIVEC)                 += ppc/dsputil_altivec.o      \
> +                                          ppc/mpeg4_altivec.o        \
>                                            ppc/mpegvideo_altivec.o    \
>                                            ppc/idct_altivec.o         \
>                                            ppc/fft_altivec.o          \


keep it in the other patch

> Index: sh4/qpel.c
> ===================================================================
> --- sh4/qpel.c	(revision 7166)
> +++ sh4/qpel.c	(working copy)


-- 

Luca Barbato

Gentoo/linux Gentoo/PPC
http://dev.gentoo.org/~lu_zero





More information about the ffmpeg-devel mailing list