[FFmpeg-devel] [PATCH] vp9: add fullpel (avg) SIMD for 10/12bpp.
Ronald S. Bultje
rsbultje at gmail.com
Wed Sep 16 02:24:26 CEST 2015
---
libavcodec/x86/vp9dsp_init_16bpp.c | 42 ++++++++++++++++++++++++++++++--------
libavcodec/x86/vp9mc.asm | 24 ++++++++++++++++++++++
2 files changed, 58 insertions(+), 8 deletions(-)
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index 3319012..25a7b2a 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -36,14 +36,22 @@ void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my)
-fpel_func(put, 8, mmx);
-fpel_func(put, 16, sse);
-fpel_func(put, 32, sse);
-fpel_func(put, 64, sse);
-fpel_func(put, 128, sse);
-fpel_func(put, 32, avx);
-fpel_func(put, 64, avx);
-fpel_func(put, 128, avx);
+fpel_func(put, 8, mmx);
+fpel_func(avg16, 8, mmxext);
+fpel_func(put, 16, sse);
+fpel_func(put, 32, sse);
+fpel_func(put, 64, sse);
+fpel_func(put, 128, sse);
+fpel_func(avg16, 16, sse2);
+fpel_func(avg16, 32, sse2);
+fpel_func(avg16, 64, sse2);
+fpel_func(avg16, 128, sse2);
+fpel_func(put, 32, avx);
+fpel_func(put, 64, avx);
+fpel_func(put, 128, avx);
+fpel_func(avg16, 32, avx2);
+fpel_func(avg16, 64, avx2);
+fpel_func(avg16, 128, avx2);
#undef fpel_func
#endif /* HAVE_YASM */
@@ -67,18 +75,36 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp)
init_fpel(4, 0, 8, put, mmx);
}
+ if (EXTERNAL_MMXEXT(cpu_flags)) { /* avg16 is built as mmxext (pavgw on MMX regs); plain MMX is not sufficient */
+ init_fpel(4, 1, 8, avg16, mmxext);
+ }
+
if (EXTERNAL_SSE(cpu_flags)) {
init_fpel(3, 0, 16, put, sse);
init_fpel(2, 0, 32, put, sse);
init_fpel(1, 0, 64, put, sse);
init_fpel(0, 0, 128, put, sse);
}
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ init_fpel(3, 1, 16, avg16, sse2);
+ init_fpel(2, 1, 32, avg16, sse2);
+ init_fpel(1, 1, 64, avg16, sse2);
+ init_fpel(0, 1, 128, avg16, sse2);
+ }
+
if (EXTERNAL_AVX_FAST(cpu_flags)) {
init_fpel(2, 0, 32, put, avx);
init_fpel(1, 0, 64, put, avx);
init_fpel(0, 0, 128, put, avx);
}
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ init_fpel(2, 1, 32, avg16, avx2);
+ init_fpel(1, 1, 64, avg16, avx2);
+ init_fpel(0, 1, 128, avg16, avx2);
+ }
+
#undef init_fpel
#endif /* HAVE_YASM */
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index fb5b1e9..ebfb200 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -586,6 +586,17 @@ cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h
pavgb m1, [dstq+d%3]
pavgb m2, [dstq+d%4]
pavgb m3, [dstq+d%5]
+%elifidn %1, avg16
+ pavgw m0, [dstq]
+ pavgw m1, [dstq+d%3]
+ pavgw m2, [dstq+d%4]
+ pavgw m3, [dstq+d%5]
+%if %2/mmsize == 8
+ pavgw m4, [dstq+mmsize*4]
+ pavgw m5, [dstq+mmsize*5]
+ pavgw m6, [dstq+mmsize*6]
+ pavgw m7, [dstq+mmsize*7]
+%endif
%endif
%%dstfn [dstq], m0
%%dstfn [dstq+d%3], m1
@@ -631,6 +642,19 @@ INIT_YMM avx2
fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2
%endif
+INIT_MMX mmxext
+fpel_fn avg16, 8, strideq, strideq*2, stride3q, 4
+INIT_XMM sse2
+fpel_fn avg16, 16, strideq, strideq*2, stride3q, 4
+fpel_fn avg16, 32, mmsize, strideq, strideq+mmsize, 2
+fpel_fn avg16, 64, mmsize, mmsize*2, mmsize*3, 1
+fpel_fn avg16, 128, mmsize, mmsize*2, mmsize*3, 1, 8
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg16, 32, strideq, strideq*2, stride3q, 4
+fpel_fn avg16, 64, mmsize, strideq, strideq+mmsize, 2
+fpel_fn avg16, 128, mmsize, mmsize*2, mmsize*3, 1
+%endif
%undef s16
%undef d16
%undef s32
--
2.1.2
More information about the ffmpeg-devel
mailing list