[FFmpeg-devel] [PATCH 4/5] avcodec/h264: add avx 8-bit h264_idct_add

James Almer jamrial at gmail.com
Wed Apr 5 06:44:15 EEST 2017


On 4/4/2017 10:53 PM, James Darnley wrote:
> Haswell:
>  - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext
> 
> Skylake-U:
>  - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext

Again, you should add an SSE2 version first, and then add an AVX version
only if it is measurably faster than the SSE2 version.

> ---
>  libavcodec/x86/h264_idct.asm  | 33 ++++++++++++++++++++++++++++++++-
>  libavcodec/x86/h264dsp_init.c |  3 +++
>  2 files changed, 35 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
> index bc4dce4..24fb4d2 100644
> --- a/libavcodec/x86/h264_idct.asm
> +++ b/libavcodec/x86/h264_idct.asm
> @@ -65,7 +65,15 @@ SECTION .text
>  
>      IDCT4_1D      w, 0, 1, 2, 3, 4, 5
>      mova         m6, [pw_32]
> -    TRANSPOSE4x4W 0, 1, 2, 3, 4
> +    %if mmsize == 8
> +        TRANSPOSE4x4W 0, 1, 2, 3, 4
> +    %else
> +        punpcklwd m0, m1
> +        punpcklwd m2, m3
> +        SBUTTERFLY dq, 0, 2, 4
> +        MOVHL m1, m0
> +        MOVHL m3, m2
> +    %endif
>      paddw        m0, m6
>      IDCT4_1D      w, 0, 1, 2, 3, 4, 5
>      pxor         m7, m7
> @@ -1131,3 +1139,26 @@ INIT_MMX mmx
>  IDCT_DC_DEQUANT 0
>  INIT_MMX sse2
>  IDCT_DC_DEQUANT 7
> +
> +INIT_XMM avx
> +
> +; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
> +%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
> +    movd       %3, [%7]
> +    movd       %4, [%7+%8]
> +    psraw      %1, %6
> +    psraw      %2, %6
> +    punpcklbw  %3, %5
> +    punpcklbw  %4, %5
> +    paddw      %3, %1
> +    paddw      %4, %2
> +    packuswb   %3, %5
> +    packuswb   %4, %5
> +    movd     [%7], %3
> +    movd  [%7+%8], %4
> +%endmacro
> +
> +cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
> +    movsxdifnidn stride_q, stride_d
> +    IDCT4_ADD    dst_q, block_q, stride_q
> +RET
> diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
> index 0643b37..8ba085f 100644
> --- a/libavcodec/x86/h264dsp_init.c
> +++ b/libavcodec/x86/h264dsp_init.c
> @@ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,    \
>                                                         int stride);
>  
>  IDCT_ADD_FUNC(, 8, mmx)
> +IDCT_ADD_FUNC(, 8, avx)
>  IDCT_ADD_FUNC(, 10, sse2)
>  IDCT_ADD_FUNC(_dc, 8, mmxext)
>  IDCT_ADD_FUNC(_dc, 10, mmxext)
> @@ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
>                  c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_avx;
>                  c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
>              }
> +
> +            c->h264_idct_add        = ff_h264_idct_add_8_avx;
>          }
>      } else if (bit_depth == 10) {
>          if (EXTERNAL_MMXEXT(cpu_flags)) {
> 



More information about the ffmpeg-devel mailing list