[FFmpeg-cvslog] Full-pixel MC functions.

Thu Oct 3 13:59:30 CEST 2013

ffmpeg | branch: master | Ronald S. Bultje <rsbultje at gmail.com> | Sat Sep 21 22:03:00 2013 -0400| [f1548c008fc5d50488bb7bbc2aa4cf49d89a0bda] | committer: Ronald S. Bultje

Full-pixel MC functions.

Decoding time of ped1080p.webm goes from 11.3sec to 11.1sec.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f1548c008fc5d50488bb7bbc2aa4cf49d89a0bda
---

 libavcodec/x86/vp9dsp.asm    |   57 ++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/vp9dsp_init.c |   43 +++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)

diff --git a/libavcodec/x86/vp9dsp.asm b/libavcodec/x86/vp9dsp.asm
index 30db0ca..740d67c 100644
--- a/libavcodec/x86/vp9dsp.asm
+++ b/libavcodec/x86/vp9dsp.asm
@@ -219,3 +219,60 @@ filter_v_fn avg
 INIT_XMM ssse3
 filter_v_fn put
 filter_v_fn avg
+; fpel_fn args: op (put or avg), block size, three offsets for vectors 1-3, stride steps consumed per loop pass
+%macro fpel_fn 6
+%if %2 == 4
+%define %%srcfn movh
+%define %%dstfn movh
+%else
+%define %%srcfn movu
+%define %%dstfn mova
+%endif
+; sizes up to 16 name two extra registers, dstride3 and sstride3; larger sizes use only the five arguments
+%if %2 <= 16
+cglobal %1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
+    lea  sstride3q, [sstrideq*3] ; sstride3 = 3 * sstride
+    lea  dstride3q, [dstrideq*3] ; dstride3 = 3 * dstride
+%else
+cglobal %1%2, 5, 5, 4, dst, dstride, src, sstride, h
+%endif
+.loop:
+    %%srcfn     m0, [srcq] ; load vector 0; the load op is movh for size 4 and movu otherwise
+    %%srcfn     m1, [srcq+s%3] ; an "s" is pasted in front of each offset argument, so strideq
+    %%srcfn     m2, [srcq+s%4] ; turns into sstrideq and stride3q into sstride3q
+    %%srcfn     m3, [srcq+s%5]
+    lea       srcq, [srcq+sstrideq*%6] ; step src forward by %6 strides
+%ifidn %1, avg
+    pavgb       m0, [dstq] ; avg only: byte-wise average of each loaded vector with the
+    pavgb       m1, [dstq+d%3] ; data already at the matching dst offset ("d" prefix this time)
+    pavgb       m2, [dstq+d%4] ; NOTE(review): dst is a plain memory operand here even for size 4,
+    pavgb       m3, [dstq+d%5] ; where loads/stores use movh - confirm the wider read is acceptable
+%endif
+    %%dstfn [dstq], m0 ; store vector 0; the store op is movh for size 4 and mova otherwise
+    %%dstfn [dstq+d%3], m1
+    %%dstfn [dstq+d%4], m2
+    %%dstfn [dstq+d%5], m3
+    lea       dstq, [dstq+dstrideq*%6] ; step dst forward by %6 strides
+    sub         hd, %6 ; the loop exits only when h lands exactly on zero,
+    jnz .loop ; so h has to be a positive multiple of %6
+    RET
+%endmacro
+; put/avg instances for sizes 4 to 64; d16 and s16 are defined just for these expansions and removed again below
+%define d16 16
+%define s16 16
+INIT_MMX mmx
+fpel_fn put, 4,  strideq, strideq*2, stride3q, 4 ; vectors at 0, stride, 2*stride, 3*stride; h drops by 4 per pass
+fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
+INIT_MMX sse ; avg on MMX registers is tagged sse, presumably because of pavgb - confirm
+fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4
+fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4
+INIT_XMM sse
+fpel_fn put, 16, strideq, strideq*2, stride3q, 4
+fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2 ; vectors at 0, mmsize, stride, stride+mmsize; h drops by 2 per pass
+fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1 ; vectors at 0, mmsize, 2*mmsize, 3*mmsize; h drops by 1 per pass
+INIT_XMM sse2
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
+fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2 ; presumably mmsize is already 16 when the s/d prefix is pasted,
+fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1 ; giving s16/d16, which the defines above map back to 16 - confirm
+%undef s16
+%undef d16
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index d135cf4..cf7a1a4 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -27,6 +27,22 @@
 
 #if HAVE_YASM
 
+#define fpel_func(avg, sz, opt) /* prototype of the ff_ routine named by the three arguments */ \
+void ff_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                          const uint8_t *src, ptrdiff_t src_stride, \
+                          int h, int mx, int my)
+fpel_func(put,  4, mmx); /* one declaration per routine; presumably these are the symbols */
+fpel_func(put,  8, mmx); /* emitted by fpel_fn in vp9dsp.asm - confirm the name mangling */
+fpel_func(put, 16, sse);
+fpel_func(put, 32, sse);
+fpel_func(put, 64, sse);
+fpel_func(avg,  4, sse); /* the avg variants carry sse / sse2 suffixes, never mmx */
+fpel_func(avg,  8, sse);
+fpel_func(avg, 16, sse2);
+fpel_func(avg, 32, sse2);
+fpel_func(avg, 64, sse2);
+#undef fpel_func /* helper is only needed for the declarations above */
+
 #define mc_func(avg, sz, dir, opt) \
 void ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                                              const uint8_t *src, ptrdiff_t src_stride, \
@@ -141,6 +157,13 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
 #if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
 
+#define init_fpel(idx1, idx2, sz, type, opt) /* install one ff_ routine for all four filter types */ \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = /* last two indices are the idxh/idxv slots of init_subpel1, both 0 here */ \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_##type##sz##_##opt /* no ';' here: the call site supplies it */
+
+
 #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \
     dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \
@@ -158,11 +181,31 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
     init_subpel2(idx, 0, 1, v, type, opt); \
     init_subpel2(idx, 1, 0, h, type, opt)
 
+    if (cpu_flags & AV_CPU_FLAG_MMX) { /* put, sizes 4 and 8 */
+        init_fpel(4, 0,  4, put, mmx); /* idx1 counts down 4..0 as the size goes 4, 8, 16, 32, 64 */
+        init_fpel(3, 0,  8, put, mmx); /* idx2 is 0 for put and 1 for avg */
+    }
+
+    if (cpu_flags & AV_CPU_FLAG_SSE) { /* put for sizes 16/32/64, avg for sizes 4/8 */
+        init_fpel(2, 0, 16, put, sse);
+        init_fpel(1, 0, 32, put, sse);
+        init_fpel(0, 0, 64, put, sse);
+        init_fpel(4, 1,  4, avg, sse);
+        init_fpel(3, 1,  8, avg, sse);
+    }
+
+    if (cpu_flags & AV_CPU_FLAG_SSE2) { /* avg for sizes 16/32/64 */
+        init_fpel(2, 1, 16, avg, sse2);
+        init_fpel(1, 1, 32, avg, sse2);
+        init_fpel(0, 1, 64, avg, sse2);
+    }
+
     if (cpu_flags & AV_CPU_FLAG_SSSE3) {
         init_subpel3(0, put, ssse3);
         init_subpel3(1, avg, ssse3);
     }
 
+#undef init_fpel
 #undef init_subpel1
 #undef init_subpel2
 #undef init_subpel3