# [FFmpeg-devel] [PATCH] mmx implementation of vc-1 inverse transformations

Michael Niedermayer michaelni
Wed Jul 9 11:53:45 CEST 2008

```
On Mon, Jul 07, 2008 at 09:02:28PM +0200, Victor Pollex wrote:
> Michael Niedermayer schrieb:
>> On Thu, Jul 03, 2008 at 02:51:18PM +0200, Victor Pollex wrote:
[...]
>>> +/*
>>> +    precondition:
>>> +        for all values v in r0, r1, r2, r3: -3971 <= v <= 3971
>>> +
>>> +    postcondition:
>>> +        r3 = ((17 * (r0 + r2) + (22 * r1 + 10 * r3) + c) >> 3)
>>> +        r4 = ((17 * (r0 - r2) - (10 * r1 - 22 * r3) + c) >> 3)
>>> +        r1 = ((17 * (r0 - r2) + (10 * r1 - 22 * r3) + c) >> 3)
>>> +        r2 = ((17 * (r0 + r2) - (22 * r1 + 10 * r3) + c) >> 3)
>>> +        r0 undefined
>>> +        r5 undefined
>>> +        r6 undefined
>>> +        r7 undefined
>>> +*/
>>> +#define TRANSFORM_4X4_ROW(r0,r1,r2,r3,r4,r5,r6,r7,c)\
>>> +    TRANSPOSE4(r0,r1,r2,r3,r4)\
>>> +    TRANSFORM_4X4_COMMON(r0,r3,r4,r2,r1,r5,r6,r7,c)\
>>> +    "paddw "#r4", "#r4"\n\t" /* 2 * (r0 + r2) */\
>>> +    SUMSUB_BA(r3,r4)\
>>> +    "paddw "#r1", "#r3"\n\t"\
>>> +    "paddw "#r7", "#r4"\n\t"\
>>> +    "paddw "#r0", "#r0"\n\t" /* 2 * (r0 - r2) */\
>>> +    SUMSUB_BA(r2,r0)\
>>> +    "paddw "#r5", "#r0"\n\t"\
>>> +    "paddw "#r6", "#r2"\n\t"\
>>> +    TRANSPOSE4(r3,r0,r2,r4,r1)
>>>
>>
>> It should be possible to merge one transpose into the scantble (the
>> mpeg1/2/4
>> decoder does that too)
>>
>>
> I'm not sure if this should be done as I found the following lines in
> decode_sequence_header in vc1.c
>    if (!v->res_fasttx)
>    {
>        v->s.dsp.vc1_inv_trans_8x8 = ff_simple_idct;
>        v->s.dsp.vc1_inv_trans_8x4 = ff_simple_idct84_add;
>        v->s.dsp.vc1_inv_trans_4x8 = ff_simple_idct48_add;
>        v->s.dsp.vc1_inv_trans_4x4 = ff_simple_idct44_add;
>    }

The permutation used should, of course, depend on which IDCT is in use.

>
>> [...]
>>
>>> +/*
>>> +    postcondition:
>>> +        r0 = [15:0](2 * r0);
>>> +        r1 = [15:0](3 * r0);
>>> +*/
>>> +#define G3X(r0,r1)\
>>> +    "movq  "#r0", "#r1"\n\t" /* r0 */\
>>> +    "paddw "#r0", "#r0"\n\t" /* 2 * r0 */\
>>> +    "paddw "#r0", "#r1"\n\t" /* 3 * r0 */
>>>
>>
>> 4 uses, saving 8 lines, macro with docs is 9 lines
>>
> removed docs

I would prefer that you remove the macro; doing so would make the code easier
to understand and more flexible.

[...]
> Index: libavcodec/i386/vc1dsp_mmx.c
> ===================================================================
> --- libavcodec/i386/vc1dsp_mmx.c	(Revision 14101)
> +++ libavcodec/i386/vc1dsp_mmx.c	(Arbeitskopie)
> @@ -1,6 +1,7 @@
>  /*
>   * VC-1 and WMV3 - DSP functions MMX-optimized
>   * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
> + * Copyright (c) 2008 Victor Pollex
>   *
>   * Permission is hereby granted, free of charge, to any person
>   * obtaining a copy of this software and associated documentation
> @@ -467,7 +468,466 @@
>  DECLARE_FUNCTION(3, 2)
>  DECLARE_FUNCTION(3, 3)
>
> +#define OPC_SS_AB(opc, src, dst0, dst1)\
> +    #opc" "#src","#dst0"\n\t"\
> +    #opc" "#src","#dst1"\n\t"
> +
> +#define OPC_SSSS_ABCD(opc, src, dst0, dst1, dst2, dst3)\
> +    OPC_SS_AB(opc, src, dst0, dst1)\
> +    OPC_SS_AB(opc, src, dst2, dst3)
> +

> +#define ADD1SUB1(src, dst0, dst1)\
> +    "paddw "#src", "#dst0"\n\t"\
> +    "psubw "#src", "#dst1"\n\t"

using

"paddw src, dst0\n\t    psubw src, dst1\n\t"

instead of yet another macro
will be 4 lines shorter and more readable

> +
> +/*
> +    precondition:
> +        for all values v in r0, r1, r2, r3: -2^15 <= 5 * v + c / 2 <= 2^15 - 1
> +
> +    postcondition:
> +        r0 =     r0 - r2
> +        r1 = 2 * r1 + r3
> +        r2 =     r0 + r2
> +        r3 = 2 * r3 - r1
> +        r4 = (((r0 + r2 + c) >> 1) + (3 * r1 + r3)) >> 2
> +        r5 = (((r0 - r2 + c) >> 1) - (3 * r3 - r1)) >> 2
> +        r6 = (((r0 - r2 + c) >> 1) + (3 * r3 - r1)) >> 2
> +        r7 = (((r0 + r2 + c) >> 1) - (3 * r1 + r3)) >> 2
> +*/
> +#define TRANSFORM_4X4_COMMON(r0,r1,r2,r3,r4,r5,r6,r7,c)\

> +    SUMSUB_BA(r2,r0)\
> +    "movq     "#r0", "#r5"\n\t" /* r0 - r2 */\
> +    "movq     "#r2", "#r7"\n\t" /* r0 + r2 */\
> +    "movq      "#c", "#r4"\n\t" /* c */\
> +    "paddw    "#r4", "#r5"\n\t" /* r0 - r2 + c */\
> +    "paddw    "#r4", "#r7"\n\t" /* r0 + r2 + c */\

r5= c       c
r0-=r2      r0-r2
r2+=r2      2r2
r6=r2       2r2
r2+=r0      r0+r2
r5+=r0      r0-r2+c
r6+=r5      r0+r2+c

one instruction less

> +    OPC_SS_AB(psraw,\$1,r5,r7)\

> +    "movq     "#r1", "#r4"\n\t" /* r1 */\
> +    "movq     "#r3", "#r6"\n\t" /* r3 */\
> +    "paddw    "#r1", "#r1"\n\t" /* 2 * r1 */\
> +    "paddw    "#r6", "#r1"\n\t" /* 2 * r1 + r3 */\
> +    "paddw    "#r3", "#r3"\n\t" /* 2 * r3 */\
> +    "psubw    "#r4", "#r3"\n\t" /* 2 * r3 - r1 */\
> +    "paddw    "#r1", "#r4"\n\t" /* 3 * r1 + r3 */\
> +    "paddw    "#r3", "#r6"\n\t" /* 3 * r3 - r1 */\

r4=r3       r3
r3+=r1      r3+r1
r1+=r1      2r1
r3+=r1      r3+3r1
r4-=r1      r3-2r1
r4+=r4      2r3-4r1
r4+=r3      3r3-r1

one instruction less

[...]
> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
> +{
> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
> +    asm volatile(
> +    TRANSFORM_8X4_ROW(0x00(%0),0x00%1)
> +    TRANSFORM_8X4_ROW(0x40(%0),0x40%1)
> +
> +    LOAD4(0x10,0x00%1,%%mm0,%%mm1,%%mm2,%%mm3)
> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
> +    STORE4(0x10,0x00%1,%%mm0,%%mm3,%%mm4,%%mm2)
> +    LOAD4(0x10,0x08%1,%%mm0,%%mm1,%%mm2,%%mm3)
> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
> +    STORE4(0x10,0x08%1,%%mm0,%%mm3,%%mm4,%%mm2)
> +
> +    LOAD4(0x10,0x40%1,%%mm0,%%mm1,%%mm2,%%mm3)
> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
> +    STORE4(0x10,0x40%1,%%mm0,%%mm3,%%mm4,%%mm2)
> +    LOAD4(0x10,0x48%1,%%mm0,%%mm1,%%mm2,%%mm3)
> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
> +    STORE4(0x10,0x48%1,%%mm0,%%mm3,%%mm4,%%mm2)
> +
> +    TRANSFORM_4X8_COL(0x00%1,0x00(%0),%2)
> +    TRANSFORM_4X8_COL(0x08%1,0x08(%0),%2)
> +    :
> +    : "r"(block), "m"(temp[0]), "m"(constants[4])
> +    : "memory"
> +    );
> +}
> +
> +static void vc1_inv_trans_8x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
> +{
> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
> +    asm volatile(
> +    TRANSFORM_8X4_ROW(0x00(%1),0x00%2)
> +
> +    LOAD4(0x10,0x00%2,%%mm0,%%mm1,%%mm2,%%mm3)
> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
> +    TRANSFORM_4X4_COL(%%mm0,%%mm3,%%mm4,%%mm2,%%mm1,%%mm5,%%mm6,%%mm7,%5)
> +    "pxor %%mm7, %%mm7\n\t"
> +    LOAD_ADD_CLAMP_STORE_4X4(%4,%0,%%mm1,%%mm3,%%mm0,%%mm2,%%mm4)
> +
> +    LOAD4(0x10,0x08%2,%%mm0,%%mm1,%%mm2,%%mm3)
> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
> +    TRANSFORM_4X4_COL(%%mm0,%%mm3,%%mm4,%%mm2,%%mm1,%%mm5,%%mm6,%%mm7,%5)
> +    "pxor %%mm7, %%mm7\n\t"
> +    LOAD_ADD_CLAMP_STORE_4X4(%4,%3,%%mm1,%%mm3,%%mm0,%%mm2,%%mm4)
> +    : "+r"(dest)
> +    : "r"(block), "m"(temp[0]), "r"(dest+4), "r"(linesize), "m"(constants[4])
> +    : "memory"
> +    );
> +}
> +
> +static void vc1_inv_trans_4x8_mmx(uint8_t *dest, int linesize, DCTELEM *block)
> +{
> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
> +    asm volatile(
> +    LOAD4(0x10,0x00(%1),%%mm0,%%mm1,%%mm2,%%mm3)
> +    TRANSFORM_4X4_ROW(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4,%%mm5,%%mm6,%%mm7,%4)
> +    STORE4(0x10,0x00%2,%%mm3,%%mm4,%%mm1,%%mm2)
> +    LOAD4(0x10,0x40(%1),%%mm0,%%mm1,%%mm2,%%mm3)
> +    TRANSFORM_4X4_ROW(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4,%%mm5,%%mm6,%%mm7,%4)
> +    STORE4(0x10,0x40%2,%%mm3,%%mm4,%%mm1,%%mm2)
> +
> +    TRANSFORM_4X8_COL(0x00%2,0x00(%1),0x08+%4)
> +
> +    "pxor %%mm7, %%mm7\n\t"
> +    LOAD4(0x10,0x00(%1),%%mm0,%%mm1,%%mm2,%%mm3)
> +    LOAD_ADD_CLAMP_STORE_4X4(%3,%0,%%mm4,%%mm0,%%mm1,%%mm2,%%mm3)
> +    "add     %3,    %0\n\t"
> +    LOAD4(0x10,0x40(%1),%%mm0,%%mm1,%%mm2,%%mm3)
> +    LOAD_ADD_CLAMP_STORE_4X4(%3,%0,%%mm4,%%mm0,%%mm1,%%mm2,%%mm3)
> +    : "+r"(dest)
> +    : "r"(block), "m"(temp[0]), "r"(linesize), "m"(constants[0])
> +    : "memory"
> +    );
> +}

Some of the loads and stores are avoidable.

[...]
--
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Good people do not need laws to tell them to act responsibly, while bad
people will find a way around the laws. -- Plato
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080709/2f38554e/attachment.pgp>

```

More information about the ffmpeg-devel mailing list