[FFmpeg-devel] [PATCH] VC-1 MMX DSP functions

Michael Niedermayer michaelni
Sun Nov 18 01:16:35 CET 2007


On Sat, Nov 17, 2007 at 12:33:31PM +0100, Christophe GISQUET wrote:
> Michael Niedermayer wrote:
> > you can apply the bias after both filters, just before the >>7, and a
> > filter-independent bias should be possible
> > after the >>7 you unbias it
> > the first can be merged with the rounder so it's just 1 sub more
> 
> Done.
> 
> > all that assumes that you remove all incorrect saturation arithmetic
> 
> Some harmed the result, some didn't. But saturation isn't of any use
> here, so I changed all of them.
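
a scalar sketch of what the bias trick above amounts to (the function, its
parameters and the bias value are illustrative only, not taken from the
patch):

    /* one filter-independent bias keeps the accumulator non-negative for
     * the unsigned MMX pipeline; it is added together with the rounder
     * (no extra instruction) and removed after the shift (1 sub more) */
    static inline int shift_unbias(int acc, int rnd, int shift, int bias)
    {
        acc += bias + rnd;             /* bias merged with the rounder */
        acc >>= shift;                 /* e.g. the >>7 above */
        return acc - (bias >> shift);  /* unbias after the shift */
    }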
[...]
> +#define SHIFT2_16B_END_LINE(R)                  \
> +    "psraw     %5, %%mm"#R"           \n\t"     \
> +    "movq      %%mm"#R", (%2)         \n\t"     \
> +    "add       %3, %1                 \n\t"     \
> +    "add       $24, %2                \n\t"

the $24 add can be avoided by using an offset for the movq above
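
something like this (illustrative only, the OFF parameter is not in the
patch): the line offset becomes an immediate in the store, so one larger
add on %2 (or none at all inside an unrolled body) replaces the per-line
"add $24, %2":

    /* caller passes OFF = 0, 24, 48, ... per line */
    #define SHIFT2_16B_END_LINE(R, OFF)             \
        "psraw     %5, %%mm"#R"           \n\t"     \
        "movq      %%mm"#R", "#OFF"(%2)   \n\t"     \
        "add       %3, %1                 \n\t"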


[...]
> +/**
> + * Purely vertical or horizontal 1/2 shift interpolation.
> + * Sacrifices mm5 and mm6.
> + */
> +static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src,
> +                               long int stride, int rnd, long int offset)
> +{
> +    int h = 8;
> +
> +    rnd = 8-rnd;
> +    asm volatile(
> +        LOAD_ROUNDER_MMX("%6")
> +        "1:                                \n\t"
> +        "movd      0(%1   ), %%mm3         \n\t"
> +        "movd      4(%1   ), %%mm4         \n\t"
> +        "movd      0(%1,%3), %%mm1         \n\t"
> +        "movd      4(%1,%3), %%mm2         \n\t"
> +        "add       %3, %1                  \n\t"
> +        "punpcklbw %%mm0, %%mm3            \n\t"
> +        "punpcklbw %%mm0, %%mm4            \n\t"
> +        "punpcklbw %%mm0, %%mm1            \n\t"
> +        "punpcklbw %%mm0, %%mm2            \n\t"
> +        "movd      0(%1,%4  ), %%mm5       \n\t"
> +        "movd      4(%1,%4  ), %%mm6       \n\t"
> +        "paddw     %%mm1, %%mm3            \n\t" /* 0,1,1,0*/
> +        "paddw     %%mm2, %%mm4            \n\t" /* 0,1,1,0*/
> +        "punpcklbw %%mm0, %%mm5            \n\t"
> +        "punpcklbw %%mm0, %%mm6            \n\t"



> +        "movq      %%mm3, %%mm1            \n\t" /* 0,1,1,0*/
> +        "movq      %%mm4, %%mm2            \n\t" /* 0,1,1,0*/
> +        "psubw     %%mm5, %%mm3            \n\t" /*-1,1,1,0*/
> +        "psubw     %%mm6, %%mm4            \n\t" /*-1,1,1,0*/
> +        "psllw     $3, %%mm1               \n\t" /* 0,8,8,0*/
> +        "psllw     $3, %%mm2               \n\t" /* 0,8,8,0*/
> +        "movd      0(%1,%3), %%mm5         \n\t"
> +        "movd      4(%1,%3), %%mm6         \n\t"
> +        "paddw     %%mm1, %%mm3            \n\t" /*-1,9,9,0*/
> +        "paddw     %%mm2, %%mm4            \n\t" /*-1,9,9,0*/
> +        "punpcklbw %%mm0, %%mm5            \n\t"
> +        "punpcklbw %%mm0, %%mm6            \n\t"
> +        "psubw     %%mm5, %%mm3            \n\t" /*-1,9,9,-1*/
> +        "psubw     %%mm6, %%mm4            \n\t" /*-1,9,9,-1*/
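
the 2 movq register copies can be dropped by doing the first subtraction
the other way round (same -1,9,9,-1 result, see the comments):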


        "psubw     %%mm3, %%mm5            \n\t" /* 1,-1,-1, 0*/
        "psubw     %%mm4, %%mm6            \n\t" /* 1,-1,-1, 0*/
        "psllw     $3, %%mm3               \n\t" /* 0,8,8,0*/
        "psllw     $3, %%mm4               \n\t" /* 0,8,8,0*/
        "movd      0(%1,%3), %%mm1         \n\t"
        "movd      4(%1,%3), %%mm2         \n\t"
        "psubw     %%mm5, %%mm3            \n\t" /*-1,9,9,0*/
        "psubw     %%mm6, %%mm4            \n\t" /*-1,9,9,0*/
        "punpcklbw %%mm0, %%mm1            \n\t"
        "punpcklbw %%mm0, %%mm2            \n\t"
        "psubw     %%mm1, %%mm3            \n\t" /*-1,9,9,-1*/
        "psubw     %%mm2, %%mm4            \n\t" /*-1,9,9,-1*/



[...]
> +     "movq      %%mm1, %%mm3    \n\t"                      \
> +     "movq      %%mm2, %%mm4    \n\t"                      \
> +     "paddw     %%mm1, %%mm1    \n\t"                      \
> +     "paddw     %%mm2, %%mm2    \n\t"                      \
> +     "paddw     %%mm3, %%mm1    \n\t" /* 3* */             \
> +     "paddw     %%mm4, %%mm2    \n\t" /* 3* */             \

have you checked that pmullw with 3 is not faster?
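
for reference, an illustrative sketch of that variant (not benchmarked; the
constant name and the named asm operand are only for illustration, not from
the patch):

    static const uint64_t __attribute__((aligned(8))) three_w =
        0x0003000300030003ULL;
    ...
        "pmullw    %[three], %%mm1    \n\t" /* mm1 *= 3 */
        "pmullw    %[three], %%mm2    \n\t" /* mm2 *= 3 */
    ...
        :: [three] "m" (three_w)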


[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Observe your enemies, for they first find out your faults. -- Antisthenes