[FFmpeg-devel] [PATCH 06/10] avcodec/vc1: Arm 32-bit NEON deblocking filter fast paths

Fri Mar 25 21:27:22 EET 2022

25 Mar 2022, 19:52 by bavison at riscosopen.org:

> checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows. Note that the C
> version can still outperform the NEON version in specific cases. The balance
> between different code paths is stream-dependent, but in practice the best
> case happens about 5% of the time, the worst case happens about 40% of the
> time, and the complexity of the remaining cases fall somewhere in between.
> Therefore, taking the average of the best and worst case timings is
> probably a conservative estimate of the degree by which the NEON code
> improves performance.
>
> vc1dsp.vc1_h_loop_filter4_bestcase_c: 19.0
> vc1dsp.vc1_h_loop_filter4_bestcase_neon: 48.5
> vc1dsp.vc1_h_loop_filter4_worstcase_c: 144.7
> vc1dsp.vc1_h_loop_filter4_worstcase_neon: 76.2
> vc1dsp.vc1_h_loop_filter8_bestcase_c: 41.0
> vc1dsp.vc1_h_loop_filter8_bestcase_neon: 75.0
> vc1dsp.vc1_h_loop_filter8_worstcase_c: 294.0
> vc1dsp.vc1_h_loop_filter8_worstcase_neon: 102.7
> vc1dsp.vc1_h_loop_filter16_bestcase_c: 54.7
> vc1dsp.vc1_h_loop_filter16_bestcase_neon: 130.0
> vc1dsp.vc1_h_loop_filter16_worstcase_c: 569.7
> vc1dsp.vc1_h_loop_filter16_worstcase_neon: 186.7
> vc1dsp.vc1_v_loop_filter4_bestcase_c: 20.2
> vc1dsp.vc1_v_loop_filter4_bestcase_neon: 47.2
> vc1dsp.vc1_v_loop_filter4_worstcase_c: 164.2
> vc1dsp.vc1_v_loop_filter4_worstcase_neon: 68.5
> vc1dsp.vc1_v_loop_filter8_bestcase_c: 43.5
> vc1dsp.vc1_v_loop_filter8_bestcase_neon: 55.2
> vc1dsp.vc1_v_loop_filter8_worstcase_c: 316.2
> vc1dsp.vc1_v_loop_filter8_worstcase_neon: 72.7
> vc1dsp.vc1_v_loop_filter16_bestcase_c: 62.2
> vc1dsp.vc1_v_loop_filter16_bestcase_neon: 103.7
> vc1dsp.vc1_v_loop_filter16_worstcase_c: 646.5
> vc1dsp.vc1_v_loop_filter16_worstcase_neon: 110.7
>
> Signed-off-by: Ben Avison <bavison at riscosopen.org>
> ---
>  libavcodec/arm/vc1dsp_init_neon.c |  14 +
>  libavcodec/arm/vc1dsp_neon.S      | 643 ++++++++++++++++++++++++++++++
>  2 files changed, 657 insertions(+)
>
> diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
> index 2cca784f5a..f5f5c702d7 100644
> --- a/libavcodec/arm/vc1dsp_init_neon.c
> +++ b/libavcodec/arm/vc1dsp_init_neon.c
> @@ -32,6 +32,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc
>  void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
>  void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
>  
> +void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq);
> +void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq);
> +void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq);
> +void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq);
> +void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq);
> +void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq);
> +
>  void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
>  ptrdiff_t line_size, int rnd);
>  
> @@ -92,6 +99,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
>  dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
>  dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
>  
> +    dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;
> +    dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
> +    dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
> +    dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
> +    dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
> +    dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
> +
>  dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
>  FN_ASSIGN(1, 0);
>  FN_ASSIGN(2, 0);
> diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
> index 93f043bf08..a639e81171 100644
> --- a/libavcodec/arm/vc1dsp_neon.S
> +++ b/libavcodec/arm/vc1dsp_neon.S
> @@ -1161,3 +1161,646 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1
>  vst1.32         {d1[1]},  [r0,:32]
>  bx              lr
>  endfunc
> +
> +@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
> +@ On entry:
> +@   r0 -> top-left pel of lower block
> +@   r1 = row stride, bytes
> +@   r2 = PQUANT bitstream parameter
> +function ff_vc1_v_loop_filter4_neon, export=1
> +        sub             r3, r0, r1, lsl #2
> +        vldr            d0, .Lcoeffs
> +        vld1.32         {d1[0]}, [r0], r1       @ P5
> +        vld1.32         {d2[0]}, [r3], r1       @ P1
> +        vld1.32         {d3[0]}, [r3], r1       @ P2
> +        vld1.32         {d4[0]}, [r0], r1       @ P6
> +        vld1.32         {d5[0]}, [r3], r1       @ P3
> +        vld1.32         {d6[0]}, [r0], r1       @ P7
> +        vld1.32         {d7[0]}, [r3]           @ P4
> +        vld1.32         {d16[0]}, [r0]          @ P8
>

Nice patches, but 2 notes so far:
What's with the weird comment syntax used only in this commit?
Different indentation style used. We try to indent our Arm assembly to:
<8 spaces><instruction><spaces until and column 24><instruction arguments>.
Take a look at e.g. libavcodec/aarch64/vp9itxfm_neon.S. It's just something that
stuck around.