[FFmpeg-devel] [RFC][PATCH] DSPUtilize some functions from APE decoder

Michael Niedermayer michaelni
Thu Jul 3 14:09:52 CEST 2008


On Wed, Jul 02, 2008 at 04:26:25PM +0300, Kostya wrote:
> I'm not satisfied with the decoding speed of APE decoder,
> so I've decided to finally dsputilize functions marked as such.
> 
> Altivec version is in development.
> Index: libavcodec/i386/dsputil_mmx.c
> ===================================================================
> --- libavcodec/i386/dsputil_mmx.c	(revision 14044)
> +++ libavcodec/i386/dsputil_mmx.c	(working copy)
> @@ -2061,6 +2061,66 @@
>  extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
>                            int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
>  
> +
> +static void vector_int16_add_sse(int16_t * v1, int16_t * v2, int order)
> +{
> +    int i;
> +    for(i = 0; i < order; i += 8){
> +        asm volatile(
> +	"movdqa  (%0),   %%xmm0 \n\t"
> +	"movdqu  (%1),   %%xmm1 \n\t"
> +	"paddw   %%xmm1, %%xmm0 \n\t"
> +	"movdqa  %%xmm0, (%0)   \n\t"
> +	: "+r"(v1), "+r"(v2)
> +	);
> +	v1 += 8;
> +	v2 += 8;
> +    }
> +}

The patch contains tabs, which are not allowed; also, the loop should be in asm, not C.


[...]
> +static int32_t vector_int16_scalarproduct_sse(int16_t * v1, int16_t * v2, int order)
> +{
> +    int i;
> +    int res = 0, *resp=&res;
> +
> +    asm volatile("pxor %xmm7, %xmm7 \n\t");
> +
> +    for(i = 0; i < order; i += 8){
> +        asm volatile(
> +	"movdqu   (%0),   %%xmm0 \n\t"
> +	"movdqa   (%1),   %%xmm1 \n\t"
> +	"pmaddwd  %%xmm1, %%xmm0 \n\t"
> +	"movhlps  %%xmm0, %%xmm2 \n\t"
> +
> +	"paddd    %%xmm2, %%xmm0 \n\t"
> +	"pshufd  $0x01, %%xmm0,%%xmm2 \n\t"
> +	"paddd    %%xmm2, %%xmm0 \n\t"
> +	"paddd   %%xmm0, %%xmm7 \n\t"
> +	: "+r"(v1), "+r"(v2)
> +	);
> +	v1 += 8;
> +	v2 += 8;

":1
"movdqu     (%0, %2), %%xmm0 \n\t"
"movdqa     (%1, %2), %%xmm1 \n\t"
"pmaddwd    %%xmm1  , %%xmm0 \n\t"
"paddd      %%xmm0  , %%xmm2 \n\t"
"add        $16     , %2     \n\t"
"jnc 1b                      \n\t"

"movhlps  %%xmm2, %%xmm0 \n\t"
"paddd    %%xmm2, %%xmm0 \n\t"
"pshufd  $0x01, %%xmm0,%%xmm2 \n\t"
"paddd    %%xmm2, %%xmm0 \n\t"
(or faster horizontal combining code)


[...]
> +static int32_t vector_int16_scalarproduct_c(int16_t * v1, int16_t * v2, int order)
> +{
> +    int res = 0;
> +
> +    while (order--)
> +        res += *v1++ * *v2++;
> +
> +    return res;
> +}

duplicate of dot_product() from acelp_math.c


[...]
>  
> +    /* ape functions */
> +    /* Add second vector values to the first one. v1 is aligned, v2 is not. */
> +    void (*vector_int16_add)(int16_t *v1, int16_t *v2, int len);
> +    /* Add second vector values to the first one. v1 is aligned, v2 is not. */
> +    void (*vector_int16_sub)(int16_t *v1, int16_t *v2, int len);

Names should be in line with the existing add_bytes / diff_bytes (I am, of course,
fine with changing the existing names if it helps clarity).


> +    /* Calculate scalar product of two vectors. v1 is unaligned, v2 is aligned. */
> +    int32_t (*vector_int16_scalarproduct)(int16_t *v1, int16_t *v2, int len);

These comments should be doxygen comments.
The exact alignment requirements and len%8 requirements should be
documented.


-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Complexity theory is the science of finding the exact solution to an
approximation. Benchmarking OTOH is finding an approximation of the exact
solution.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080703/def7969a/attachment.pgp>



More information about the ffmpeg-devel mailing list