[FFmpeg-devel] [PATCH] x86/hevc_res_add: add ff_hevc_transform_add{8, 16, 32}_8_avx

Mickaël Raulet mraulet at gmail.com
Wed Aug 20 09:25:22 CEST 2014


Patch ok

Mickael

Le mercredi 20 août 2014, James Almer <jamrial at gmail.com> a écrit :

> ~15% faster than sse2
>
> Signed-off-by: James Almer <jamrial at gmail.com>
> ---
>  libavcodec/x86/hevc_res_add.asm | 15 +++++++++++----
>  libavcodec/x86/hevcdsp.h        |  4 ++++
>  libavcodec/x86/hevcdsp_init.c   |  4 ++++
>  3 files changed, 19 insertions(+), 4 deletions(-)
>
> diff --git a/libavcodec/x86/hevc_res_add.asm
> b/libavcodec/x86/hevc_res_add.asm
> index 47022d3..feea50c 100644
> --- a/libavcodec/x86/hevc_res_add.asm
> +++ b/libavcodec/x86/hevc_res_add.asm
> @@ -156,8 +156,8 @@ cglobal hevc_transform_add4_8, 3, 4, 6
>  %endmacro
>
>
> -INIT_XMM sse2
> -; void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride)
> +%macro TRANSFORM_ADD_8 0
> +; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride)
>  cglobal hevc_transform_add8_8, 3, 4, 8
>      lea               r3, [r2*3]
>      TR_ADD_SSE_8_8
> @@ -167,7 +167,7 @@ cglobal hevc_transform_add8_8, 3, 4, 8
>      RET
>
>  %if ARCH_X86_64
> -; void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride)
> +; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride)
>  cglobal hevc_transform_add16_8, 3, 4, 12
>      lea               r3, [r2*3]
>      TR_ADD_SSE_16_8
> @@ -178,7 +178,7 @@ cglobal hevc_transform_add16_8, 3, 4, 12
>  %endrep
>      RET
>
> -; void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride)
> +; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride)
>  cglobal hevc_transform_add32_8, 3, 4, 12
>
>      TR_ADD_SSE_32_8
> @@ -190,6 +190,13 @@ cglobal hevc_transform_add32_8, 3, 4, 12
>      RET
>
>  %endif ;ARCH_X86_64
> +%endmacro
> +
> +INIT_XMM sse2
> +TRANSFORM_ADD_8
> +INIT_XMM avx
> +TRANSFORM_ADD_8
> +
>
>  ;-----------------------------------------------------------------------------
>  ; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
>
>  ;-----------------------------------------------------------------------------
> diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
> index 7ced22c..74b5173 100644
> --- a/libavcodec/x86/hevcdsp.h
> +++ b/libavcodec/x86/hevcdsp.h
> @@ -139,6 +139,10 @@ void ff_hevc_transform_add8_8_sse2(uint8_t *dst,
> int16_t *coeffs, ptrdiff_t stri
>  void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride);
>  void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride);
>
> +void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride);
> +void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride);
> +void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride);
> +
>  void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride);
>  void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride);
>  void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride);
> diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
> index 0f9fe7d..f6f0a4b 100644
> --- a/libavcodec/x86/hevcdsp_init.c
> +++ b/libavcodec/x86/hevcdsp_init.c
> @@ -509,7 +509,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const
> int bit_depth)
>              if (ARCH_X86_64) {
>                  c->hevc_v_loop_filter_luma =
> ff_hevc_v_loop_filter_luma_8_avx;
>                  c->hevc_h_loop_filter_luma =
> ff_hevc_h_loop_filter_luma_8_avx;
> +
> +                c->transform_add[2]    = ff_hevc_transform_add16_8_avx;
> +                c->transform_add[3]    = ff_hevc_transform_add32_8_avx;
>              }
> +            c->transform_add[1]    = ff_hevc_transform_add8_8_avx;
>          }
>          if (EXTERNAL_AVX2(cpu_flags)) {
>              c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
> --
> 1.8.5.5
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>


More information about the ffmpeg-devel mailing list