[FFmpeg-devel] [PATCH] mmx implementation of vc-1 inverse transformations

Victor Pollex victor.pollex
Mon Jul 28 21:27:24 CEST 2008


Michael Niedermayer schrieb:
> On Thu, Jul 17, 2008 at 11:29:59PM +0200, Victor Pollex wrote:
>   
>> Michael Niedermayer schrieb:
>>     
>>> On Mon, Jul 07, 2008 at 09:02:28PM +0200, Victor Pollex wrote:
>>>   
>>>       
>>>> Michael Niedermayer schrieb:
>>>>     
>>>>         
>>>>> On Thu, Jul 03, 2008 at 02:51:18PM +0200, Victor Pollex wrote:
>>>>>       
>>>>>           
>>> [...]
>>>   
>>>       
>>>>>> +/*
>>>>>> +    precodition:
>>>>>> +        for all values v in r0, r1, r2, r3: -3971 <= v <= 3971
>>>>>> +
>>>>>> +    postcondition:
>>>>>> +        r3 = ((17 * (r0 + r2) + (22 * r1 + 10 * r3) + c) >> 3)
>>>>>> +        r4 = ((17 * (r0 - r2) - (10 * r1 - 22 * r3) + c) >> 3)
>>>>>> +        r1 = ((17 * (r0 - r2) + (10 * r1 - 22 * r3) + c) >> 3)
>>>>>> +        r2 = ((17 * (r0 + r2) - (22 * r1 + 10 * r3) + c) >> 3)
>>>>>> +        r0 undefined
>>>>>> +        r5 undefined
>>>>>> +        r6 undefined
>>>>>> +        r7 undefined
>>>>>> +*/
>>>>>> +#define TRANSFORM_4X4_ROW(r0,r1,r2,r3,r4,r5,r6,r7,c)\
>>>>>> +    TRANSPOSE4(r0,r1,r2,r3,r4)\
>>>>>> +    TRANSFORM_4X4_COMMON(r0,r3,r4,r2,r1,r5,r6,r7,c)\
>>>>>> +    "paddw "#r4", "#r4"\n\t" /* 2 * (r0 + r2) */\
>>>>>> +    SUMSUB_BA(r3,r4)\
>>>>>> +    "paddw "#r1", "#r3"\n\t"\
>>>>>> +    "paddw "#r7", "#r4"\n\t"\
>>>>>> +    "paddw "#r0", "#r0"\n\t" /* 2 * (r0 - r2) */\
>>>>>> +    SUMSUB_BA(r2,r0)\
>>>>>> +    "paddw "#r5", "#r0"\n\t"\
>>>>>> +    "paddw "#r6", "#r2"\n\t"\
>>>>>> +    TRANSPOSE4(r3,r0,r2,r4,r1)
>>>>>>             
>>>>>>             
>>>>> It should be possible to merge one transpose into the scantble (the 
>>>>> mpeg1/2/4
>>>>> decoder does that too)
>>>>>
>>>>>         
>>>>>           
>>>> I'm not sure if this should be done as I found the following lines in 
>>>> decode_sequence_header in vc1.c
>>>>    if (!v->res_fasttx)
>>>>    {
>>>>        v->s.dsp.vc1_inv_trans_8x8 = ff_simple_idct;
>>>>        v->s.dsp.vc1_inv_trans_8x4 = ff_simple_idct84_add;
>>>>        v->s.dsp.vc1_inv_trans_4x8 = ff_simple_idct48_add;
>>>>        v->s.dsp.vc1_inv_trans_4x4 = ff_simple_idct44_add;
>>>>    }
>>>>     
>>>>         
>>> The used permutation should of course depend on the used idct
>>>   
>>>       
>> ok, changed it. I could also try to do the same with the scantables for the 
>> 8x4 and 4x8 transformations if desired.
>>     
>
> yes, please do, review & approval of the changes to the vc1 non asm code is
> left for kostya, he is maintainer of that ...
>
> [...]
>   

changed the other scantables aswell, though i haven't tested the 8x4 and 
4x8 for the advanced profile, as I haven't found a progressive video 
encoded with the advanced profile.

>   
>>> [...]
>>>   
>>>       
>>>> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
>>>> +{
>>>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>> +    asm volatile(
>>>> +    TRANSFORM_8X4_ROW(0x00(%0),0x00%1)
>>>> +    TRANSFORM_8X4_ROW(0x40(%0),0x40%1)
>>>> +
>>>> +    LOAD4(0x10,0x00%1,%%mm0,%%mm1,%%mm2,%%mm3)
>>>> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>>>> +    STORE4(0x10,0x00%1,%%mm0,%%mm3,%%mm4,%%mm2)
>>>> +    LOAD4(0x10,0x08%1,%%mm0,%%mm1,%%mm2,%%mm3)
>>>> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>>>> +    STORE4(0x10,0x08%1,%%mm0,%%mm3,%%mm4,%%mm2)
>>>> +
>>>> +    LOAD4(0x10,0x40%1,%%mm0,%%mm1,%%mm2,%%mm3)
>>>> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>>>> +    STORE4(0x10,0x40%1,%%mm0,%%mm3,%%mm4,%%mm2)
>>>> +    LOAD4(0x10,0x48%1,%%mm0,%%mm1,%%mm2,%%mm3)
>>>> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>>>> +    STORE4(0x10,0x48%1,%%mm0,%%mm3,%%mm4,%%mm2)
>>>> +
>>>> +    TRANSFORM_4X8_COL(0x00%1,0x00(%0),%2)
>>>> +    TRANSFORM_4X8_COL(0x08%1,0x08(%0),%2)
>>>> +    :
>>>> +    : "r"(block), "m"(temp[0]), "m"(constants[4])
>>>> +    : "memory"
>>>> +    );
>>>> +}
>>>> +
>>>> +static void vc1_inv_trans_8x4_mmx(uint8_t *dest, int linesize, DCTELEM 
>>>> *block)
>>>> +{
>>>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>> +    asm volatile(
>>>> +    TRANSFORM_8X4_ROW(0x00(%1),0x00%2)
>>>> +
>>>> +    LOAD4(0x10,0x00%2,%%mm0,%%mm1,%%mm2,%%mm3)
>>>> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>>>> +    
>>>> TRANSFORM_4X4_COL(%%mm0,%%mm3,%%mm4,%%mm2,%%mm1,%%mm5,%%mm6,%%mm7,%5)
>>>> +    "pxor %%mm7, %%mm7\n\t"
>>>> +    LOAD_ADD_CLAMP_STORE_4X4(%4,%0,%%mm1,%%mm3,%%mm0,%%mm2,%%mm4)
>>>> +
>>>> +    LOAD4(0x10,0x08%2,%%mm0,%%mm1,%%mm2,%%mm3)
>>>> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>>>> +    
>>>> TRANSFORM_4X4_COL(%%mm0,%%mm3,%%mm4,%%mm2,%%mm1,%%mm5,%%mm6,%%mm7,%5)
>>>> +    "pxor %%mm7, %%mm7\n\t"
>>>> +    LOAD_ADD_CLAMP_STORE_4X4(%4,%3,%%mm1,%%mm3,%%mm0,%%mm2,%%mm4)
>>>> +    : "+r"(dest)
>>>> +    : "r"(block), "m"(temp[0]), "r"(dest+4), "r"(linesize), 
>>>> "m"(constants[4])
>>>> +    : "memory"
>>>> +    );
>>>> +}
>>>> +
>>>> +static void vc1_inv_trans_4x8_mmx(uint8_t *dest, int linesize, DCTELEM 
>>>> *block)
>>>> +{
>>>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>> +    asm volatile(
>>>> +    LOAD4(0x10,0x00(%1),%%mm0,%%mm1,%%mm2,%%mm3)
>>>> +    
>>>> TRANSFORM_4X4_ROW(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4,%%mm5,%%mm6,%%mm7,%4)
>>>> +    STORE4(0x10,0x00%2,%%mm3,%%mm4,%%mm1,%%mm2)
>>>> +    LOAD4(0x10,0x40(%1),%%mm0,%%mm1,%%mm2,%%mm3)
>>>> +    
>>>> TRANSFORM_4X4_ROW(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4,%%mm5,%%mm6,%%mm7,%4)
>>>> +    STORE4(0x10,0x40%2,%%mm3,%%mm4,%%mm1,%%mm2)
>>>> +
>>>> +    TRANSFORM_4X8_COL(0x00%2,0x00(%1),0x08+%4)
>>>> +
>>>> +    "pxor %%mm7, %%mm7\n\t"
>>>> +    LOAD4(0x10,0x00(%1),%%mm0,%%mm1,%%mm2,%%mm3)
>>>> +    LOAD_ADD_CLAMP_STORE_4X4(%3,%0,%%mm4,%%mm0,%%mm1,%%mm2,%%mm3)
>>>> +    "add     %3,    %0\n\t"
>>>> +    LOAD4(0x10,0x40(%1),%%mm0,%%mm1,%%mm2,%%mm3)
>>>> +    LOAD_ADD_CLAMP_STORE_4X4(%3,%0,%%mm4,%%mm0,%%mm1,%%mm2,%%mm3)
>>>> +    : "+r"(dest)
>>>> +    : "r"(block), "m"(temp[0]), "r"(linesize), "m"(constants[0])
>>>> +    : "memory"
>>>> +    );
>>>> +}
>>>>     
>>>>         
>>> some of the load&store are avoidable
>>>
>>>   
>>>       
>> The only thing I can think of is in the 4x8 transformation, where the 
>> column transformation is done in two halfs. Instead of writing the lines to 
>> "block" after each half, I could add them to "dest". Is this what you had 
>> in mind?
>>     
>
> not only ...
>
> you load a 4x4 block, transform it and store it
> you load the second 4x4 block, transform it and STORE it
> then you LOAD the 4x8 block transform it and STORE it
> then you LOAD a 4x4 block and add it into dest.
> then you LOAD the second  4x4 block and add it into dest.
>
> upper case ones are at least partially avoidable
>   

Hopefully I haven't overlooked any possibilities to save some loads 
and/or stores

>
> [...]
>   
>> +#define TRANSFORM_8X4_ROW_H1(in,out)\
>>     
>
> macro is just used once thus not needed
>
>
> [...]
>
>   
>> +#define TRANSFORM_8X4_ROW_H2(in,out)\
>>     
>
> same
>
>
> [...]
>   
>> +#define TRANSFORM_4X8_COL_H1(in,out,c)\
>>     
>
> and this as well
>
>
>   
>> +#define TRANSFORM_4X8_COL_H2(in,out,c) \
>>     
>
> too
>
>
> [...]
>   
these macros are now used more often, removed the other macros in which 
these were used


00_constants.patch:
changes the ff_pw_4 and ff_pw_64 constants to be able to use them with sse2

01_dsputil_mmx.patch:
adds an parameter to the LOAD4 / STORE4 macros to specify a mov mnemonic 
suffix

02_vc1dsp.patch:
the new "transposed" scantables and altered code to use them

03_vc1dsp_mmx.patch:
adds a new header file with macros for the transformations and adds an 
mmx version for the transformations

04_vc1dsp_sse2.path:
adds an sse2/mmx version for the transformations and uses a bit of sse 
instructions. I hope this is acceptable.
someone should benchmark the 4x8 and 8x4 versions, as for me the former 
was slower and the latter a bit faster compared to the mmx versions

-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: 00_constants.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080728/08c40ccc/attachment.asc>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: 01_dsputil_mmx.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080728/08c40ccc/attachment.txt>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: 02_vc1dsp.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080728/08c40ccc/attachment-0001.asc>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: 03_vc1dsp_mmx.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080728/08c40ccc/attachment-0001.txt>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: 04_vc1dsp_sse2.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080728/08c40ccc/attachment-0002.asc>



More information about the ffmpeg-devel mailing list