[FFmpeg-devel] [PATCH] vp9: add fullpel (avg) SIMD for 10/12bpp.

Wed Sep 16 03:38:38 CEST 2015

On 9/15/2015 9:24 PM, Ronald S. Bultje wrote:
> ---
>  libavcodec/x86/vp9dsp_init_16bpp.c | 42 ++++++++++++++++++++++++++++++--------

Why not just add all this to vp9dsp_init.c and selectively initialize
everything by checking the existing bpp argument in ff_vp9dsp_init_x86()?
If you think you won't be able to reuse the existing macros in vp9dsp_init.c,
then I guess a new file would indeed be better to avoid bloat.

Alternatively, macros that can be shared could be moved to a header and
then used by both init files (like fpel_func and init_fpel).

>  libavcodec/x86/vp9mc.asm           | 24 ++++++++++++++++++++++
>  2 files changed, 58 insertions(+), 8 deletions(-)
> 
> diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
> index 3319012..25a7b2a 100644
> --- a/libavcodec/x86/vp9dsp_init_16bpp.c
> +++ b/libavcodec/x86/vp9dsp_init_16bpp.c
> @@ -36,14 +36,22 @@ void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \

The naming scheme for other dsp functions in libavcodec is more like
ff_codec_function_size_{8,10,12}_cpuflag(). Not too important for these as
most are shared between all bpps, but for other functions it may be a good
idea to rename the 8-bit functions to follow the above naming scheme in
preparation for the new 10/12-bit ones.

>                                const uint8_t *src, ptrdiff_t src_stride, \
>                                int h, int mx, int my)
>  
> -fpel_func(put,   8, mmx);
> -fpel_func(put,  16, sse);
> -fpel_func(put,  32, sse);
> -fpel_func(put,  64, sse);
> -fpel_func(put, 128, sse);
> -fpel_func(put,  32, avx);
> -fpel_func(put,  64, avx);
> -fpel_func(put, 128, avx);
> +fpel_func(put,     8, mmx);
> +fpel_func(avg16,   8, mmxext);
> +fpel_func(put,    16, sse);
> +fpel_func(put,    32, sse);
> +fpel_func(put,    64, sse);
> +fpel_func(put,   128, sse);
> +fpel_func(avg16,  16, sse2);
> +fpel_func(avg16,  32, sse2);
> +fpel_func(avg16,  64, sse2);
> +fpel_func(avg16, 128, sse2);
> +fpel_func(put,    32, avx);
> +fpel_func(put,    64, avx);
> +fpel_func(put,   128, avx);
> +fpel_func(avg16,  32, avx2);
> +fpel_func(avg16,  64, avx2);
> +fpel_func(avg16, 128, avx2);
>  #undef fpel_func
>  
>  #endif /* HAVE_YASM */
> @@ -67,18 +75,36 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp)
>          init_fpel(4, 0,   8, put, mmx);
>      }
>  
> +    if (EXTERNAL_MMX(cpu_flags)) {
> +        init_fpel(4, 1,   8, avg16, mmxext);
> +    }
> +
>      if (EXTERNAL_SSE(cpu_flags)) {
>          init_fpel(3, 0,  16, put, sse);
>          init_fpel(2, 0,  32, put, sse);
>          init_fpel(1, 0,  64, put, sse);
>          init_fpel(0, 0, 128, put, sse);
>      }
> +
> +    if (EXTERNAL_SSE2(cpu_flags)) {
> +        init_fpel(3, 1,  16, avg16, sse2);
> +        init_fpel(2, 1,  32, avg16, sse2);
> +        init_fpel(1, 1,  64, avg16, sse2);
> +        init_fpel(0, 1, 128, avg16, sse2);
> +    }
> +
>      if (EXTERNAL_AVX_FAST(cpu_flags)) {
>          init_fpel(2, 0,  32, put, avx);
>          init_fpel(1, 0,  64, put, avx);
>          init_fpel(0, 0, 128, put, avx);
>      }
>  
> +    if (EXTERNAL_AVX2(cpu_flags)) {
> +        init_fpel(2, 1,  32, avg16, avx2);
> +        init_fpel(1, 1,  64, avg16, avx2);
> +        init_fpel(0, 1, 128, avg16, avx2);
> +    }
> +
>  #undef init_fpel
>  
>  #endif /* HAVE_YASM */
> diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
> index fb5b1e9..ebfb200 100644
> --- a/libavcodec/x86/vp9mc.asm
> +++ b/libavcodec/x86/vp9mc.asm
> @@ -586,6 +586,17 @@ cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h
>      pavgb       m1, [dstq+d%3]
>      pavgb       m2, [dstq+d%4]
>      pavgb       m3, [dstq+d%5]
> +%elifidn %1, avg16

Nit: Maybe add a separate parameter that's either b or w, to make this
pavg%# (where # is that parameter's index) instead?

> +    pavgw       m0, [dstq]
> +    pavgw       m1, [dstq+d%3]
> +    pavgw       m2, [dstq+d%4]
> +    pavgw       m3, [dstq+d%5]
> +%if %2/mmsize == 8
> +    pavgw       m4, [dstq+mmsize*4]
> +    pavgw       m5, [dstq+mmsize*5]
> +    pavgw       m6, [dstq+mmsize*6]
> +    pavgw       m7, [dstq+mmsize*7]
> +%endif
>  %endif
>      %%dstfn [dstq], m0
>      %%dstfn [dstq+d%3], m1
> @@ -631,6 +642,19 @@ INIT_YMM avx2
>  fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
>  fpel_fn avg, 64, mmsize,  strideq,   strideq+mmsize, 2
>  %endif
> +INIT_MMX mmxext
> +fpel_fn avg16,  8,  strideq, strideq*2, stride3q, 4
> +INIT_XMM sse2
> +fpel_fn avg16,  16, strideq, strideq*2, stride3q, 4
> +fpel_fn avg16,  32, mmsize,  strideq,   strideq+mmsize, 2
> +fpel_fn avg16,  64, mmsize,  mmsize*2,  mmsize*3, 1
> +fpel_fn avg16, 128, mmsize,  mmsize*2,  mmsize*3, 1, 8
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +fpel_fn avg16,  32, strideq, strideq*2, stride3q, 4
> +fpel_fn avg16,  64, mmsize,  strideq,   strideq+mmsize, 2
> +fpel_fn avg16, 128, mmsize,  mmsize*2,  mmsize*3, 1
> +%endif
>  %undef s16
>  %undef d16
>  %undef s32
>