[FFmpeg-cvslog] x86/hevc_res_add: add ff_hevc_transform_add{8,16,32}_8_avx

James Almer git at videolan.org
Wed Aug 20 21:56:06 CEST 2014


ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Tue Aug 19 23:52:05 2014 -0300| [76a99d467fff3faf826c46ccde66bd1303876a0d] | committer: James Almer

x86/hevc_res_add: add ff_hevc_transform_add{8,16,32}_8_avx

~15% faster than sse2

Reviewed-by: Mickaël Raulet <mraulet at gmail.com>
Reviewed-by: Christophe Gisquet <christophe.gisquet at gmail.com>
Signed-off-by: James Almer <jamrial at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=76a99d467fff3faf826c46ccde66bd1303876a0d
---

 libavcodec/x86/hevc_res_add.asm |   15 +++++++++++----
 libavcodec/x86/hevcdsp.h        |    4 ++++
 libavcodec/x86/hevcdsp_init.c   |    4 ++++
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
index 47022d3..feea50c 100644
--- a/libavcodec/x86/hevc_res_add.asm
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -156,8 +156,8 @@ cglobal hevc_transform_add4_8, 3, 4, 6
 %endmacro
 
 
-INIT_XMM sse2
-; void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+%macro TRANSFORM_ADD_8 0
+; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 cglobal hevc_transform_add8_8, 3, 4, 8
     lea               r3, [r2*3]
     TR_ADD_SSE_8_8
@@ -167,7 +167,7 @@ cglobal hevc_transform_add8_8, 3, 4, 8
     RET
 
 %if ARCH_X86_64
-; void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 cglobal hevc_transform_add16_8, 3, 4, 12
     lea               r3, [r2*3]
     TR_ADD_SSE_16_8
@@ -178,7 +178,7 @@ cglobal hevc_transform_add16_8, 3, 4, 12
 %endrep
     RET
 
-; void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 cglobal hevc_transform_add32_8, 3, 4, 12
 
     TR_ADD_SSE_32_8
@@ -190,6 +190,13 @@ cglobal hevc_transform_add32_8, 3, 4, 12
     RET
 
 %endif ;ARCH_X86_64
+%endmacro
+
+INIT_XMM sse2
+TRANSFORM_ADD_8
+INIT_XMM avx
+TRANSFORM_ADD_8
+
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
 ;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 7ced22c..74b5173 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -139,6 +139,10 @@ void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stri
 void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 
+void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
 void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 0f9fe7d..f6f0a4b 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -509,7 +509,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             if (ARCH_X86_64) {
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
+
+                c->transform_add[2]    = ff_hevc_transform_add16_8_avx;
+                c->transform_add[3]    = ff_hevc_transform_add32_8_avx;
             }
+            c->transform_add[1]    = ff_hevc_transform_add8_8_avx;
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
             c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;



More information about the ffmpeg-cvslog mailing list