[FFmpeg-devel] [PATCH] Altivec version of h264_idct_add

Luca Barbato lu_zero
Sat Jun 2 13:22:27 CEST 2007


David Conrad wrote:
> Hi,
> 

Overall it is about 1/100 to 1/70 faster at decoding on G4, but it has
no effect at all on CELL...

> 
> Index: libavcodec/ppc/h264_altivec.c
> ===================================================================
> --- libavcodec/ppc/h264_altivec.c	(revision 9167)
> +++ libavcodec/ppc/h264_altivec.c	(working copy)
> @@ -404,6 +404,119 @@
>   * IDCT transform:
>   ****************************************************************************/
>  
> +#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3)              \
> +   /* 1st stage */                                               \
> +   vz0 = vec_add(vb0,vb2);       /* temp[0] = Y[0] + Y[2] */     \
> +   vz1 = vec_sub(vb0,vb2);       /* temp[1] = Y[0] - Y[2] */     \
> +   vz2 = vec_sra(vb1,vec_splat_u16(1));                          \
> +   vz2 = vec_sub(vz2,vb3);       /* temp[2] = Y[1].1/2 - Y[3] */ \
> +   vz3 = vec_sra(vb3,vec_splat_u16(1));                          \
> +   vz3 = vec_add(vb1,vz3);       /* temp[3] = Y[1] + Y[3].1/2 */ \
> +   /* 2nd stage: output */                                       \
> +   va0 = vec_add(vz0,vz3);       /* x[0] = temp[0] + temp[3] */  \
> +   va1 = vec_add(vz1,vz2);       /* x[1] = temp[1] + temp[2] */  \
> +   va2 = vec_sub(vz1,vz2);       /* x[2] = temp[1] - temp[2] */  \
> +   va3 = vec_sub(vz0,vz3)        /* x[3] = temp[0] - temp[3] */
> +
> +#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
> +    b0 = vec_mergeh( a0, a0 ); \
> +    b1 = vec_mergeh( a1, a0 ); \
> +    b2 = vec_mergeh( a2, a0 ); \
> +    b3 = vec_mergeh( a3, a0 ); \
> +    a0 = vec_mergeh( b0, b2 ); \
> +    a1 = vec_mergel( b0, b2 ); \
> +    a2 = vec_mergeh( b1, b3 ); \
> +    a3 = vec_mergel( b1, b3 ); \
> +    b0 = vec_mergeh( a0, a2 ); \
> +    b1 = vec_mergel( a0, a2 ); \
> +    b2 = vec_mergeh( a1, a3 ); \
> +    b3 = vec_mergel( a1, a3 )
> +

hmmm... (more below)

> +#define VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(p,mask,va,perm)\
> +    vdst_orig = vec_ld(0,p);                                  \
> +    vdst = vec_perm(vdst_orig, zero_u8v, mask);               \
> +    vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst);         \
> +    va = vec_add(va,vdst_ss);                                 \
> +    va_u8 = vec_packsu(va, zero_s16v);                        \

                              ^^^^^^^^^
> +    vfdst = vec_perm(vdst_orig, va_u8, perm);                 \
> +    vec_st(vfdst, 0, dst);
> +
> +#define VEC_LOAD_U8_ADD_S16_STORE_U8(p,va,perm)               \
> +    vdst = vec_ld(0, p);                                      \
> +    vdst_ss = (vec_s16_t)vec_mergeh(zero_u8v, vdst);          \
> +    va = vec_add(va,vdst_ss);                                 \
> +    va_u8 = vec_packsu(va, zero_u8v);                         \

			      ^^^^^^^^ should be zero_s16v
> +    vfdst = vec_perm(vdst, va_u8, perm);                      \
> +    vec_st(vfdst, 0, dst);
> +
> +static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
> +{
> +    vec_s16_t va0, va1, va2, va3;
> +    vec_s16_t vz0, vz1, vz2, vz3;
> +    vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3;
> +    vec_u8_t va_u8;
> +    vec_s16_t vdst_ss;
> +    const vec_u16_t v6us = vec_splat_u16(6);
> +    vec_u8_t dstperm;
> +    vec_u8_t vdst, vdst_orig, vfdst;
> +    LOAD_ZERO;
> +
> +    block[0] += 32;  /* add 32 as a DC-level for rounding */
> +
> +    vtmp0 = vec_ld(0,block);
> +    vtmp1 = vec_sld(vtmp0, vtmp0, 8);
> +    vtmp2 = vec_ld(16,block);
> +    vtmp3 = vec_sld(vtmp2, vtmp2, 8);
> +
> +    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
> +    VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
> +    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
> +
> +    va0 = vec_sra(va0,v6us);
> +    va1 = vec_sra(va1,v6us);
> +    va2 = vec_sra(va2,v6us);
> +    va3 = vec_sra(va3,v6us);
> +

That is probably the CELL killer... You should be able to use vec_ste
instead of this switch.

> +    if ((unsigned long)dst & 0xF){
> +        vec_u8_t vdst_mask;
> +        switch ((unsigned long)dst & 0xF){
> +        case 4:
> +            dstperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
> +                                    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
> +            break;
> +        case 8:
> +            dstperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> +                                    0x10, 0x11, 0x12, 0x13, 0x0C, 0x0D, 0x0E, 0x0F);
> +            break;
> +        default:    // case 12
> +            dstperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> +                                    0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13);
> +            break;
> +        }
> +
> +        vdst_mask = vec_lvsl(0, dst);
> +
> +        VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,vdst_mask,va0,dstperm);
> +        dst += stride;
> +        VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,vdst_mask,va1,dstperm);
> +        dst += stride;
> +        VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,vdst_mask,va2,dstperm);
> +        dst += stride;
> +        VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,vdst_mask,va3,dstperm);
> +    }
> +    else{
> +        dstperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
> +                                0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
> +        VEC_LOAD_U8_ADD_S16_STORE_U8(dst,va0,dstperm);
> +        dst += stride;
> +        VEC_LOAD_U8_ADD_S16_STORE_U8(dst,va1,dstperm);
> +        dst += stride;
> +        VEC_LOAD_U8_ADD_S16_STORE_U8(dst,va2,dstperm);
> +        dst += stride;
> +        VEC_LOAD_U8_ADD_S16_STORE_U8(dst,va3,dstperm);
> +    }
> +}
> +

Sorry if you receive duplicates; it seems that the network isn't exactly
sane today...

lu

-- 

Luca Barbato

Gentoo/linux Gentoo/PPC
http://dev.gentoo.org/~lu_zero




More information about the ffmpeg-devel mailing list