[FFmpeg-devel] [PATCH] vp9: add fullpel (avg) SIMD for 10/12bpp.
James Almer
jamrial at gmail.com
Wed Sep 16 03:38:38 CEST 2015
On 9/15/2015 9:24 PM, Ronald S. Bultje wrote:
> ---
> libavcodec/x86/vp9dsp_init_16bpp.c | 42 ++++++++++++++++++++++++++++++--------
Why not just add all this to vp9dsp_init.c and selectively initialize
everything by checking the existing bpp argument in ff_vp9dsp_init_x86()?
If you think you wont be able to reuse the existing macros in vp9dsp_init.c
then i guess a new file would indeed be better to avoid bloat.
Alternatively, macros that can be shared could be moved to a header and
then used by both init files (like fpel_func and init_fpel).
> libavcodec/x86/vp9mc.asm | 24 ++++++++++++++++++++++
> 2 files changed, 58 insertions(+), 8 deletions(-)
>
> diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
> index 3319012..25a7b2a 100644
> --- a/libavcodec/x86/vp9dsp_init_16bpp.c
> +++ b/libavcodec/x86/vp9dsp_init_16bpp.c
> @@ -36,14 +36,22 @@ void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
The naming scheme for other dsp functions in libavcodec is more like
ff_codec_function_size_{8,10,12}_cpuflag(). Not too important for these as
most are shared between all bpps, but for other functions it may be a good
idea to rename the 8bit functions to follow the above naming scheme in
preparation for the new 10/12bits ones.
> const uint8_t *src, ptrdiff_t src_stride, \
> int h, int mx, int my)
>
> -fpel_func(put, 8, mmx);
> -fpel_func(put, 16, sse);
> -fpel_func(put, 32, sse);
> -fpel_func(put, 64, sse);
> -fpel_func(put, 128, sse);
> -fpel_func(put, 32, avx);
> -fpel_func(put, 64, avx);
> -fpel_func(put, 128, avx);
> +fpel_func(put, 8, mmx);
> +fpel_func(avg16, 8, mmxext);
> +fpel_func(put, 16, sse);
> +fpel_func(put, 32, sse);
> +fpel_func(put, 64, sse);
> +fpel_func(put, 128, sse);
> +fpel_func(avg16, 16, sse2);
> +fpel_func(avg16, 32, sse2);
> +fpel_func(avg16, 64, sse2);
> +fpel_func(avg16, 128, sse2);
> +fpel_func(put, 32, avx);
> +fpel_func(put, 64, avx);
> +fpel_func(put, 128, avx);
> +fpel_func(avg16, 32, avx2);
> +fpel_func(avg16, 64, avx2);
> +fpel_func(avg16, 128, avx2);
> #undef fpel_func
>
> #endif /* HAVE_YASM */
> @@ -67,18 +75,36 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp)
> init_fpel(4, 0, 8, put, mmx);
> }
>
> + if (EXTERNAL_MMX(cpu_flags)) {
> + init_fpel(4, 1, 8, avg16, mmxext);
> + }
> +
> if (EXTERNAL_SSE(cpu_flags)) {
> init_fpel(3, 0, 16, put, sse);
> init_fpel(2, 0, 32, put, sse);
> init_fpel(1, 0, 64, put, sse);
> init_fpel(0, 0, 128, put, sse);
> }
> +
> + if (EXTERNAL_SSE2(cpu_flags)) {
> + init_fpel(3, 1, 16, avg16, sse2);
> + init_fpel(2, 1, 32, avg16, sse2);
> + init_fpel(1, 1, 64, avg16, sse2);
> + init_fpel(0, 1, 128, avg16, sse2);
> + }
> +
> if (EXTERNAL_AVX_FAST(cpu_flags)) {
> init_fpel(2, 0, 32, put, avx);
> init_fpel(1, 0, 64, put, avx);
> init_fpel(0, 0, 128, put, avx);
> }
>
> + if (EXTERNAL_AVX2(cpu_flags)) {
> + init_fpel(2, 1, 32, avg16, avx2);
> + init_fpel(1, 1, 64, avg16, avx2);
> + init_fpel(0, 1, 128, avg16, avx2);
> + }
> +
> #undef init_fpel
>
> #endif /* HAVE_YASM */
> diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
> index fb5b1e9..ebfb200 100644
> --- a/libavcodec/x86/vp9mc.asm
> +++ b/libavcodec/x86/vp9mc.asm
> @@ -586,6 +586,17 @@ cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h
> pavgb m1, [dstq+d%3]
> pavgb m2, [dstq+d%4]
> pavgb m3, [dstq+d%5]
> +%elifidn %1, avg16
Nit: Maybe a separate parameter that's either b or w to make this pavg%#
instead?
> + pavgw m0, [dstq]
> + pavgw m1, [dstq+d%3]
> + pavgw m2, [dstq+d%4]
> + pavgw m3, [dstq+d%5]
> +%if %2/mmsize == 8
> + pavgw m4, [dstq+mmsize*4]
> + pavgw m5, [dstq+mmsize*5]
> + pavgw m6, [dstq+mmsize*6]
> + pavgw m7, [dstq+mmsize*7]
> +%endif
> %endif
> %%dstfn [dstq], m0
> %%dstfn [dstq+d%3], m1
> @@ -631,6 +642,19 @@ INIT_YMM avx2
> fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
> fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2
> %endif
> +INIT_MMX mmxext
> +fpel_fn avg16, 8, strideq, strideq*2, stride3q, 4
> +INIT_XMM sse2
> +fpel_fn avg16, 16, strideq, strideq*2, stride3q, 4
> +fpel_fn avg16, 32, mmsize, strideq, strideq+mmsize, 2
> +fpel_fn avg16, 64, mmsize, mmsize*2, mmsize*3, 1
> +fpel_fn avg16, 128, mmsize, mmsize*2, mmsize*3, 1, 8
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +fpel_fn avg16, 32, strideq, strideq*2, stride3q, 4
> +fpel_fn avg16, 64, mmsize, strideq, strideq+mmsize, 2
> +fpel_fn avg16, 128, mmsize, mmsize*2, mmsize*3, 1
> +%endif
> %undef s16
> %undef d16
> %undef s32
>
More information about the ffmpeg-devel
mailing list