[FFmpeg-devel] [PATCH 06/10] avcodec/vc1: Arm 32-bit NEON deblocking filter fast paths
Lynne
dev at lynne.ee
Fri Mar 25 21:27:22 EET 2022
25 Mar 2022, 19:52 by bavison at riscosopen.org:
> checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows. Note that the C
> version can still outperform the NEON version in specific cases. The balance
> between different code paths is stream-dependent, but in practice the best
> case happens about 5% of the time, the worst case happens about 40% of the
> time, and the complexity of the remaining cases falls somewhere in between.
> Therefore, taking the average of the best and worst case timings is
> probably a conservative estimate of the degree by which the NEON code
> improves performance.
>
> vc1dsp.vc1_h_loop_filter4_bestcase_c: 19.0
> vc1dsp.vc1_h_loop_filter4_bestcase_neon: 48.5
> vc1dsp.vc1_h_loop_filter4_worstcase_c: 144.7
> vc1dsp.vc1_h_loop_filter4_worstcase_neon: 76.2
> vc1dsp.vc1_h_loop_filter8_bestcase_c: 41.0
> vc1dsp.vc1_h_loop_filter8_bestcase_neon: 75.0
> vc1dsp.vc1_h_loop_filter8_worstcase_c: 294.0
> vc1dsp.vc1_h_loop_filter8_worstcase_neon: 102.7
> vc1dsp.vc1_h_loop_filter16_bestcase_c: 54.7
> vc1dsp.vc1_h_loop_filter16_bestcase_neon: 130.0
> vc1dsp.vc1_h_loop_filter16_worstcase_c: 569.7
> vc1dsp.vc1_h_loop_filter16_worstcase_neon: 186.7
> vc1dsp.vc1_v_loop_filter4_bestcase_c: 20.2
> vc1dsp.vc1_v_loop_filter4_bestcase_neon: 47.2
> vc1dsp.vc1_v_loop_filter4_worstcase_c: 164.2
> vc1dsp.vc1_v_loop_filter4_worstcase_neon: 68.5
> vc1dsp.vc1_v_loop_filter8_bestcase_c: 43.5
> vc1dsp.vc1_v_loop_filter8_bestcase_neon: 55.2
> vc1dsp.vc1_v_loop_filter8_worstcase_c: 316.2
> vc1dsp.vc1_v_loop_filter8_worstcase_neon: 72.7
> vc1dsp.vc1_v_loop_filter16_bestcase_c: 62.2
> vc1dsp.vc1_v_loop_filter16_bestcase_neon: 103.7
> vc1dsp.vc1_v_loop_filter16_worstcase_c: 646.5
> vc1dsp.vc1_v_loop_filter16_worstcase_neon: 110.7
>
> Signed-off-by: Ben Avison <bavison at riscosopen.org>
> ---
> libavcodec/arm/vc1dsp_init_neon.c | 14 +
> libavcodec/arm/vc1dsp_neon.S | 643 ++++++++++++++++++++++++++++++
> 2 files changed, 657 insertions(+)
>
> diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
> index 2cca784f5a..f5f5c702d7 100644
> --- a/libavcodec/arm/vc1dsp_init_neon.c
> +++ b/libavcodec/arm/vc1dsp_init_neon.c
> @@ -32,6 +32,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc
> void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
> void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
>
> +void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq);
> +void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq);
> +void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq);
> +void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq);
> +void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq);
> +void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq);
> +
> void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
> ptrdiff_t line_size, int rnd);
>
> @@ -92,6 +99,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
> dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
> dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
>
> + dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon;
> + dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon;
> + dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon;
> + dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon;
> + dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
> + dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
> +
> dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
> FN_ASSIGN(1, 0);
> FN_ASSIGN(2, 0);
> diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
> index 93f043bf08..a639e81171 100644
> --- a/libavcodec/arm/vc1dsp_neon.S
> +++ b/libavcodec/arm/vc1dsp_neon.S
> @@ -1161,3 +1161,646 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1
> vst1.32 {d1[1]}, [r0,:32]
> bx lr
> endfunc
> +
> +@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
> +@ On entry:
> +@ r0 -> top-left pel of lower block
> +@ r1 = row stride, bytes
> +@ r2 = PQUANT bitstream parameter
> +function ff_vc1_v_loop_filter4_neon, export=1
> + sub r3, r0, r1, lsl #2
> + vldr d0, .Lcoeffs
> + vld1.32 {d1[0]}, [r0], r1 @ P5
> + vld1.32 {d2[0]}, [r3], r1 @ P1
> + vld1.32 {d3[0]}, [r3], r1 @ P2
> + vld1.32 {d4[0]}, [r0], r1 @ P6
> + vld1.32 {d5[0]}, [r3], r1 @ P3
> + vld1.32 {d6[0]}, [r0], r1 @ P7
> + vld1.32 {d7[0]}, [r3] @ P4
> + vld1.32 {d16[0]}, [r0] @ P8
>
Nice patches, but 2 notes so far:
What's with the weird comment syntax used only in this commit?
Different indentation style used. We try to indent our Arm assembly to:
<8 spaces><instruction><spaces until column 24><instruction arguments>.
Take a look at e.g. libavcodec/aarch64/vp9itxfm_neon.S. It's just something that
stuck around.
More information about the ffmpeg-devel
mailing list