[FFmpeg-cvslog] vp9: add fullpel (put) MC SIMD for 10/12bpp.

Thu Sep 17 03:12:00 CEST 2015

ffmpeg | branch: master | Ronald S. Bultje <rsbultje at gmail.com> | Wed Sep 16 09:08:04 2015 -0400| [6354ff03833b5f64d930c195ae3801cc4061505f] | committer: Ronald S. Bultje

vp9: add fullpel (put) MC SIMD for 10/12bpp.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=6354ff03833b5f64d930c195ae3801cc4061505f
---

 libavcodec/x86/Makefile            |    3 +-
 libavcodec/x86/vp9dsp_init.c       |   74 ++++++++++++++++--------------------
 libavcodec/x86/vp9dsp_init.h       |   39 +++++++++++++++++++
 libavcodec/x86/vp9dsp_init_16bpp.c |   65 +++++++++++++++++++++++++++++++
 libavcodec/x86/vp9mc.asm           |   18 ++++++++-
 5 files changed, 155 insertions(+), 44 deletions(-)

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 3c3cc1c..616f830 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -62,7 +62,8 @@ OBJS-$(CONFIG_V210_ENCODER)            += x86/v210enc_init.o
 OBJS-$(CONFIG_VC1_DECODER)             += x86/vc1dsp_init.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
 OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
-OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o
+OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o            \
+                                          x86/vp9dsp_init_16bpp.o
 OBJS-$(CONFIG_WEBP_DECODER)            += x86/vp8dsp_init.o
 
 
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index f24cb67..ebfd963 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -23,31 +23,26 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
 
 #if HAVE_YASM
 
-#define fpel_func(avg, sz, opt) \
-void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
-                              const uint8_t *src, ptrdiff_t src_stride, \
-                              int h, int mx, int my)
-fpel_func(put,  4, mmx);
-fpel_func(put,  8, mmx);
-fpel_func(put, 16, sse);
-fpel_func(put, 32, sse);
-fpel_func(put, 64, sse);
-fpel_func(avg,  4, mmxext);
-fpel_func(avg,  8, mmxext);
-fpel_func(avg, 16, sse2);
-fpel_func(avg, 32, sse2);
-fpel_func(avg, 64, sse2);
-fpel_func(put, 32, avx);
-fpel_func(put, 64, avx);
-fpel_func(avg, 32, avx2);
-fpel_func(avg, 64, avx2);
-#undef fpel_func
+decl_fpel_func(put,  4, mmx);
+decl_fpel_func(put,  8, mmx);
+decl_fpel_func(put, 16, sse);
+decl_fpel_func(put, 32, sse);
+decl_fpel_func(put, 64, sse);
+decl_fpel_func(avg,  4, mmxext);
+decl_fpel_func(avg,  8, mmxext);
+decl_fpel_func(avg, 16, sse2);
+decl_fpel_func(avg, 32, sse2);
+decl_fpel_func(avg, 64, sse2);
+decl_fpel_func(put, 32, avx);
+decl_fpel_func(put, 64, avx);
+decl_fpel_func(avg, 32, avx2);
+decl_fpel_func(avg, 64, avx2);
 
 #define mc_func(avg, sz, dir, opt, type, f_sz) \
 void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
@@ -311,16 +306,13 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 {
 #if HAVE_YASM
     int cpu_flags;
-    if (bpp != 8) return;
+    if (bpp != 8) {
+        ff_vp9dsp_init_16bpp_x86(dsp, bpp);
+        return;
+    }
 
     cpu_flags = av_get_cpu_flags();
 
-#define init_fpel(idx1, idx2, sz, type, opt) \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
-    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
-    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##_##opt
-
 #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \
     dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \
@@ -386,8 +378,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 } while (0)
 
     if (EXTERNAL_MMX(cpu_flags)) {
-        init_fpel(4, 0,  4, put, mmx);
-        init_fpel(3, 0,  8, put, mmx);
+        init_fpel_func(4, 0,  4, put, mmx);
+        init_fpel_func(3, 0,  8, put, mmx);
         if (!bitexact) {
             dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
             dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
@@ -400,8 +392,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         init_subpel2(4, 0, 4, put, mmxext);
         init_subpel2(4, 1, 4, avg, mmxext);
-        init_fpel(4, 1,  4, avg, mmxext);
-        init_fpel(3, 1,  8, avg, mmxext);
+        init_fpel_func(4, 1,  4, avg, mmxext);
+        init_fpel_func(3, 1,  8, avg, mmxext);
         dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
         init_dc_ipred(4, mmxext);
         init_dc_ipred(8, mmxext);
@@ -409,9 +401,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
     }
 
     if (EXTERNAL_SSE(cpu_flags)) {
-        init_fpel(2, 0, 16, put, sse);
-        init_fpel(1, 0, 32, put, sse);
-        init_fpel(0, 0, 64, put, sse);
+        init_fpel_func(2, 0, 16, put, sse);
+        init_fpel_func(1, 0, 32, put, sse);
+        init_fpel_func(0, 0, 64, put, sse);
         init_ipred(16, sse, v, VERT);
         init_ipred(32, sse, v, VERT);
     }
@@ -419,9 +411,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
     if (EXTERNAL_SSE2(cpu_flags)) {
         init_subpel3_8to64(0, put, sse2);
         init_subpel3_8to64(1, avg, sse2);
-        init_fpel(2, 1, 16, avg, sse2);
-        init_fpel(1, 1, 32, avg, sse2);
-        init_fpel(0, 1, 64, avg, sse2);
+        init_fpel_func(2, 1, 16, avg, sse2);
+        init_fpel_func(1, 1, 32, avg, sse2);
+        init_fpel_func(0, 1, 64, avg, sse2);
         init_lpf(sse2);
         dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_sse2;
         dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_sse2;
@@ -491,14 +483,14 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
         init_dir_tm_h_ipred(32, avx);
     }
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
-        init_fpel(1, 0, 32, put, avx);
-        init_fpel(0, 0, 64, put, avx);
+        init_fpel_func(1, 0, 32, put, avx);
+        init_fpel_func(0, 0, 64, put, avx);
         init_ipred(32, avx, v, VERT);
     }
 
     if (EXTERNAL_AVX2(cpu_flags)) {
-        init_fpel(1, 1, 32, avg, avx2);
-        init_fpel(0, 1, 64, avg, avx2);
+        init_fpel_func(1, 1, 32, avg, avx2);
+        init_fpel_func(0, 1, 64, avg, avx2);
         if (ARCH_X86_64) {
 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
             init_subpel3_32_64(0, put, avx2);
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
new file mode 100644
index 0000000..8c99c0d
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -0,0 +1,39 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP9DSP_INIT_H
+#define AVCODEC_X86_VP9DSP_INIT_H
+
+#define decl_fpel_func(avg, sz, opt) \
+void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                              const uint8_t *src, ptrdiff_t src_stride, \
+                              int h, int mx, int my)
+
+#define init_fpel_func(idx1, idx2, sz, type, opt) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##_##opt
+
+void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp);
+
+#endif /* AVCODEC_X86_VP9DSP_INIT_H */
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
new file mode 100644
index 0000000..6f2c50d
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -0,0 +1,65 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_YASM
+
+decl_fpel_func(put,   8, mmx);
+decl_fpel_func(put,  16, sse);
+decl_fpel_func(put,  32, sse);
+decl_fpel_func(put,  64, sse);
+decl_fpel_func(put, 128, sse);
+decl_fpel_func(put,  32, avx);
+decl_fpel_func(put,  64, avx);
+decl_fpel_func(put, 128, avx);
+
+#endif /* HAVE_YASM */
+
+av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        init_fpel_func(4, 0,   8, put, mmx);
+    }
+
+    if (EXTERNAL_SSE(cpu_flags)) {
+        init_fpel_func(3, 0,  16, put, sse);
+        init_fpel_func(2, 0,  32, put, sse);
+        init_fpel_func(1, 0,  64, put, sse);
+        init_fpel_func(0, 0, 128, put, sse);
+    }
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        init_fpel_func(2, 0,  32, put, avx);
+        init_fpel_func(1, 0,  64, put, avx);
+        init_fpel_func(0, 0, 128, put, avx);
+    }
+
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index 5393957..fb5b1e9 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -553,7 +553,7 @@ filter_vx2_fn avg
 
 %endif ; ARCH_X86_64
 
-%macro fpel_fn 6
+%macro fpel_fn 6-7 4
 %if %2 == 4
 %define %%srcfn movh
 %define %%dstfn movh
@@ -567,13 +567,19 @@ cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
     lea  sstride3q, [sstrideq*3]
     lea  dstride3q, [dstrideq*3]
 %else
-cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
+cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h
 %endif
 .loop:
     %%srcfn     m0, [srcq]
     %%srcfn     m1, [srcq+s%3]
     %%srcfn     m2, [srcq+s%4]
     %%srcfn     m3, [srcq+s%5]
+%if %2/mmsize == 8
+    %%srcfn     m4, [srcq+mmsize*4]
+    %%srcfn     m5, [srcq+mmsize*5]
+    %%srcfn     m6, [srcq+mmsize*6]
+    %%srcfn     m7, [srcq+mmsize*7]
+%endif
     lea       srcq, [srcq+sstrideq*%6]
 %ifidn %1, avg
     pavgb       m0, [dstq]
@@ -585,6 +591,12 @@ cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
     %%dstfn [dstq+d%3], m1
     %%dstfn [dstq+d%4], m2
     %%dstfn [dstq+d%5], m3
+%if %2/mmsize == 8
+    %%dstfn [dstq+mmsize*4], m4
+    %%dstfn [dstq+mmsize*5], m5
+    %%dstfn [dstq+mmsize*6], m6
+    %%dstfn [dstq+mmsize*7], m7
+%endif
     lea       dstq, [dstq+dstrideq*%6]
     sub         hd, %6
     jnz .loop
@@ -605,6 +617,7 @@ INIT_XMM sse
 fpel_fn put, 16, strideq, strideq*2, stride3q, 4
 fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
 fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
+fpel_fn put, 128, mmsize, mmsize*2,  mmsize*3, 1, 8
 INIT_XMM sse2
 fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
 fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2
@@ -612,6 +625,7 @@ fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1
 INIT_YMM avx
 fpel_fn put, 32, strideq, strideq*2, stride3q, 4
 fpel_fn put, 64, mmsize,  strideq,   strideq+mmsize, 2
+fpel_fn put, 128, mmsize, mmsize*2,     mmsize*3, 1
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 fpel_fn avg, 32, strideq, strideq*2, stride3q, 4