[FFmpeg-devel] [PATCH] vp9: add fullpel (avg) SIMD for 10/12bpp.

Ronald S. Bultje rsbultje at gmail.com
Wed Sep 16 02:24:26 CEST 2015


---
 libavcodec/x86/vp9dsp_init_16bpp.c | 42 ++++++++++++++++++++++++++++++--------
 libavcodec/x86/vp9mc.asm           | 24 ++++++++++++++++++++++
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index 3319012..25a7b2a 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -36,14 +36,22 @@ void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                               const uint8_t *src, ptrdiff_t src_stride, \
                               int h, int mx, int my)
 
-fpel_func(put,   8, mmx);
-fpel_func(put,  16, sse);
-fpel_func(put,  32, sse);
-fpel_func(put,  64, sse);
-fpel_func(put, 128, sse);
-fpel_func(put,  32, avx);
-fpel_func(put,  64, avx);
-fpel_func(put, 128, avx);
+fpel_func(put,     8, mmx);
+fpel_func(avg16,   8, mmxext);
+fpel_func(put,    16, sse);
+fpel_func(put,    32, sse);
+fpel_func(put,    64, sse);
+fpel_func(put,   128, sse);
+fpel_func(avg16,  16, sse2);
+fpel_func(avg16,  32, sse2);
+fpel_func(avg16,  64, sse2);
+fpel_func(avg16, 128, sse2);
+fpel_func(put,    32, avx);
+fpel_func(put,    64, avx);
+fpel_func(put,   128, avx);
+fpel_func(avg16,  32, avx2);
+fpel_func(avg16,  64, avx2);
+fpel_func(avg16, 128, avx2);
 #undef fpel_func
 
 #endif /* HAVE_YASM */
@@ -67,18 +75,36 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp)
         init_fpel(4, 0,   8, put, mmx);
     }
 
+    if (EXTERNAL_MMXEXT(cpu_flags)) { /* avg16 uses pavgw: MMXEXT, not MMX */
+        init_fpel(4, 1,   8, avg16, mmxext);
+    }
+
     if (EXTERNAL_SSE(cpu_flags)) {
         init_fpel(3, 0,  16, put, sse);
         init_fpel(2, 0,  32, put, sse);
         init_fpel(1, 0,  64, put, sse);
         init_fpel(0, 0, 128, put, sse);
     }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        init_fpel(3, 1,  16, avg16, sse2);
+        init_fpel(2, 1,  32, avg16, sse2);
+        init_fpel(1, 1,  64, avg16, sse2);
+        init_fpel(0, 1, 128, avg16, sse2);
+    }
+
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
         init_fpel(2, 0,  32, put, avx);
         init_fpel(1, 0,  64, put, avx);
         init_fpel(0, 0, 128, put, avx);
     }
 
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        init_fpel(2, 1,  32, avg16, avx2);
+        init_fpel(1, 1,  64, avg16, avx2);
+        init_fpel(0, 1, 128, avg16, avx2);
+    }
+
 #undef init_fpel
 
 #endif /* HAVE_YASM */
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index fb5b1e9..ebfb200 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -586,6 +586,17 @@ cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h
     pavgb       m1, [dstq+d%3]
     pavgb       m2, [dstq+d%4]
     pavgb       m3, [dstq+d%5]
+%elifidn %1, avg16
+    pavgw       m0, [dstq]
+    pavgw       m1, [dstq+d%3]
+    pavgw       m2, [dstq+d%4]
+    pavgw       m3, [dstq+d%5]
+%if %2/mmsize == 8
+    pavgw       m4, [dstq+mmsize*4]
+    pavgw       m5, [dstq+mmsize*5]
+    pavgw       m6, [dstq+mmsize*6]
+    pavgw       m7, [dstq+mmsize*7]
+%endif
 %endif
     %%dstfn [dstq], m0
     %%dstfn [dstq+d%3], m1
@@ -631,6 +642,19 @@ INIT_YMM avx2
 fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
 fpel_fn avg, 64, mmsize,  strideq,   strideq+mmsize, 2
 %endif
+INIT_MMX mmxext
+fpel_fn avg16,  8,  strideq, strideq*2, stride3q, 4
+INIT_XMM sse2
+fpel_fn avg16,  16, strideq, strideq*2, stride3q, 4
+fpel_fn avg16,  32, mmsize,  strideq,   strideq+mmsize, 2
+fpel_fn avg16,  64, mmsize,  mmsize*2,  mmsize*3, 1
+fpel_fn avg16, 128, mmsize,  mmsize*2,  mmsize*3, 1, 8
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg16,  32, strideq, strideq*2, stride3q, 4
+fpel_fn avg16,  64, mmsize,  strideq,   strideq+mmsize, 2
+fpel_fn avg16, 128, mmsize,  mmsize*2,  mmsize*3, 1
+%endif
 %undef s16
 %undef d16
 %undef s32
-- 
2.1.2



More information about the ffmpeg-devel mailing list