[FFmpeg-devel] [PATCH 1/4] avcodec/x86: add put_pixels16_x2_sse2

Ronald S. Bultje rsbultje at gmail.com
Sun Feb 3 20:30:56 CET 2013


Hi,

On Sun, Feb 3, 2013 at 7:31 AM, Michael Niedermayer <michaelni at gmx.at> wrote:
> about 1% faster P frame motion compensation for matrixbench on i7
>
> Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
> ---
>  libavcodec/x86/dsputil_mmx.c |    4 ++++
>  libavcodec/x86/hpeldsp.asm   |   31 ++++++++++++++++++++++++++++++-
>  2 files changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
> index 2e8300a..29d87a1 100644
> --- a/libavcodec/x86/dsputil_mmx.c
> +++ b/libavcodec/x86/dsputil_mmx.c
> @@ -1523,6 +1523,8 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
>
>  void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
>                            int line_size, int h);
> +void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
> +                              int line_size, int h);
>  void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
>                            int line_size, int h);
>
> @@ -2034,6 +2036,8 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
>          // these functions are slower than mmx on AMD, but faster on Intel
>          if (!high_bit_depth) {
>              c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
> +            c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_sse2;
> +
>              c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
>              c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
>          }
> diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
> index 7f0c285..81b6901 100644
> --- a/libavcodec/x86/hpeldsp.asm
> +++ b/libavcodec/x86/hpeldsp.asm
> @@ -2,7 +2,7 @@
>  ;*
>  ;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice at bellard.org>
>  ;* Copyright (c)      Nick Kurshev <nickols_k at mail.ru>
> -;* Copyright (c) 2002 Michael Niedermayer <michaelni at gmx.at>
> +;* Copyright (c) 2002-2013 Michael Niedermayer <michaelni at gmx.at>
>  ;* Copyright (c) 2002 Zdenek Kabelac <kabi at informatics.muni.cz>
>  ;* Copyright (c) 2013 Daniel Kang
>  ;*
> @@ -513,3 +513,32 @@ cglobal avg_pixels16, 4,5,4
>      lea          r0, [r0+r2*4]
>      jnz       .loop
>      REP_RET
> +
> +; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
> +cglobal put_pixels16_x2, 4, 5, 4
> +    movsxdifnidn r2, r2d
> +    lea          r4, [r2*2]
> +.loop:
> +    movu         m0, [r1]
> +    movu         m1, [r1+r2]
> +    movu         m2, [r1+1]
> +    movu         m3, [r1+r2+1]
> +    pavgb        m0, m2
> +    pavgb        m1, m3
> +    mova       [r0], m0
> +    mova    [r0+r2], m1
> +    add          r1, r4
> +    add          r0, r4
> +    movu         m0, [r1]
> +    movu         m1, [r1+r2]
> +    movu         m2, [r1+1]
> +    movu         m3, [r1+r2+1]
> +    pavgb        m0, m2
> +    pavgb        m1, m3
> +    add          r1, r4
> +    mova       [r0], m0
> +    mova    [r0+r2], m1
> +    add          r0, r4
> +    sub         r3d, 4
> +    jne .loop
> +    REP_RET

I bet that this code is identical to the 8-pixel mmx/mmx2 version. How
about you extend that version to use %+ mmsize in the cglobal line,
and then do INIT_MMX mmx / callmacro followed by INIT_XMM sse2 /
callmacro, so that the same macro is used for both versions? That
gives smaller and more maintainable source code.

Ronald


More information about the ffmpeg-devel mailing list