[FFmpeg-cvslog] vp9/x86: add AVX for itxfm and lpf.

Clément Bœsch git at videolan.org
Wed Jan 15 13:53:15 CET 2014


ffmpeg | branch: master | Clément Bœsch <u at pkh.me> | Tue Jan 14 08:09:48 2014 +0100| [8b4190da9382434758e390370b1752583bf4ce3a] | committer: Clément Bœsch

vp9/x86: add AVX for itxfm and lpf.

Benchmarks, SSSE3 vs. AVX version of each function (decicycles per call, lower is better):

4412 decicycles in ff_vp9_loop_filter_h_16_16_ssse3, 4193462 runs, 842 skips
3600 decicycles in ff_vp9_loop_filter_h_16_16_avx, 4193621 runs, 683 skips

3010 decicycles in ff_vp9_loop_filter_v_16_16_ssse3, 4193528 runs, 776 skips
2678 decicycles in ff_vp9_loop_filter_v_16_16_avx, 4193742 runs, 562 skips

23025 decicycles in ff_vp9_idct_idct_32x32_add_ssse3, 2096871 runs, 281 skips
19943 decicycles in ff_vp9_idct_idct_32x32_add_avx, 2096815 runs, 337 skips

4675 decicycles in ff_vp9_idct_idct_16x16_add_ssse3, 4194018 runs, 286 skips
3980 decicycles in ff_vp9_idct_idct_16x16_add_avx, 4194022 runs, 282 skips

967 decicycles in ff_vp9_idct_idct_8x8_add_ssse3, 16776972 runs, 244 skips
887 decicycles in ff_vp9_idct_idct_8x8_add_avx, 16777002 runs, 214 skips

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=8b4190da9382434758e390370b1752583bf4ce3a
---

 libavcodec/x86/vp9dsp_init.c |   18 ++++++++++++++++++
 libavcodec/x86/vp9itxfm.asm  |   21 ++++++++++++++++++---
 libavcodec/x86/vp9lpf.asm    |    7 ++++++-
 3 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index c3ef73d..3651641 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -159,11 +159,16 @@ filters_8tap_1d_fn3(avg)
 
 void ff_vp9_idct_idct_4x4_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
 void ff_vp9_idct_idct_8x8_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+void ff_vp9_idct_idct_8x8_add_avx  (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
 void ff_vp9_idct_idct_16x16_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+void ff_vp9_idct_idct_16x16_add_avx  (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
 void ff_vp9_idct_idct_32x32_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+void ff_vp9_idct_idct_32x32_add_avx  (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
 
 void ff_vp9_loop_filter_v_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
+void ff_vp9_loop_filter_v_16_16_avx  (uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
 void ff_vp9_loop_filter_h_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
+void ff_vp9_loop_filter_h_16_16_avx  (uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
 
 #endif /* HAVE_YASM */
 
@@ -231,6 +236,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
         }
     }
 
+    if (EXTERNAL_AVX(cpu_flags)) {
+        if (ARCH_X86_64) {
+            dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
+            dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
+            dsp->itxfm_add[TX_32X32][ADST_ADST] =
+            dsp->itxfm_add[TX_32X32][ADST_DCT] =
+            dsp->itxfm_add[TX_32X32][DCT_ADST] =
+            dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
+            dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_avx;
+            dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_avx;
+        }
+    }
+
 #undef init_fpel
 #undef init_subpel1
 #undef init_subpel2
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index fe04b81..33c0bc7 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -289,7 +289,8 @@ cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob
     VP9_STORE_2X        10, 11,  6,  7,  4
 %endmacro
 
-INIT_XMM ssse3
+%macro VP9_IDCT_IDCT_8x8_ADD_XMM 1
+INIT_XMM %1
 cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
 
     mova               m12, [pw_11585x2]    ; often used
@@ -376,6 +377,10 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
     mova      [blockq+112], m4
     VP9_IDCT8_WRITEOUT
     RET
+%endmacro
+
+VP9_IDCT_IDCT_8x8_ADD_XMM ssse3
+VP9_IDCT_IDCT_8x8_ADD_XMM avx
 
 ;---------------------------------------------------------------------------------------------
 ; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
@@ -655,7 +660,8 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
     mova         [dstq+%7], m%4
 %endmacro
 
-INIT_XMM ssse3
+%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
+INIT_XMM %1
 cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
     ; 2x2=eob=3, 4x4=eob=10
     cmp eobd, 38
@@ -724,6 +730,10 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
     ; use that to zero out block coefficients
     ZERO_BLOCK      blockq, 32, 16, m0
     RET
+%endmacro
+
+VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
+VP9_IDCT_IDCT_16x16_ADD_XMM avx
 
 ;---------------------------------------------------------------------------------------------
 ; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
@@ -1102,7 +1112,8 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
 %endif
 %endmacro
 
-INIT_XMM ssse3
+%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
+INIT_XMM %1
 cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
     cmp eobd, 135
     jg .idctfull
@@ -1213,5 +1224,9 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
     ; use that to zero out block coefficients
     ZERO_BLOCK      blockq, 64, 32, m7
     RET
+%endmacro
+
+VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
+VP9_IDCT_IDCT_32x32_ADD_XMM avx
 
 %endif ; x86-64
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index e2dc8d9..c5e5df9 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -655,12 +655,17 @@ SECTION .text
 %endif
 %endmacro
 
-INIT_XMM ssse3
+%macro LPF_16_16_VH 1
+INIT_XMM %1
 cglobal vp9_loop_filter_v_16_16, 5,8,16,      dst, stride, E, I, H, mstride, dst1, dst2
     LPF_16_16 v
     RET
 cglobal vp9_loop_filter_h_16_16, 5,8,16, 256, dst, stride, E, I, H, mstride, dst1, dst2
     LPF_16_16 h
     RET
+%endmacro
+
+LPF_16_16_VH ssse3
+LPF_16_16_VH avx
 
 %endif ; x86-64



More information about the ffmpeg-cvslog mailing list