[FFmpeg-devel] [PATCH v2 2/3] aarch64/vvc: Add dmvr_hv

Thu Sep 26 14:36:10 EEST 2024

On Mon, 23 Sep 2024, Zhao Zhili wrote:

> From: Zhao Zhili <zhilizhao at tencent.com>
>
> dmvr_hv_8_12x20_c:                                       8.0 ( 1.00x)
> dmvr_hv_8_12x20_neon:                                    1.2 ( 6.62x)
> dmvr_hv_8_20x12_c:                                       8.0 ( 1.00x)
> dmvr_hv_8_20x12_neon:                                    0.9 ( 8.37x)
> dmvr_hv_8_20x20_c:                                      12.9 ( 1.00x)
> dmvr_hv_8_20x20_neon:                                    1.7 ( 7.62x)
> dmvr_hv_10_12x20_c:                                      7.0 ( 1.00x)
> dmvr_hv_10_12x20_neon:                                   1.7 ( 4.09x)
> dmvr_hv_10_20x12_c:                                      7.0 ( 1.00x)
> dmvr_hv_10_20x12_neon:                                   1.7 ( 4.09x)
> dmvr_hv_10_20x20_c:                                     11.2 ( 1.00x)
> dmvr_hv_10_20x20_neon:                                   2.7 ( 4.15x)
> dmvr_hv_12_12x20_c:                                      6.5 ( 1.00x)
> dmvr_hv_12_12x20_neon:                                   1.7 ( 3.79x)
> dmvr_hv_12_20x12_c:                                      6.5 ( 1.00x)
> dmvr_hv_12_20x12_neon:                                   1.7 ( 3.79x)
> dmvr_hv_12_20x20_c:                                     10.2 ( 1.00x)
> dmvr_hv_12_20x20_neon:                                   2.2 ( 4.64x)
> ---
> libavcodec/aarch64/vvc/dsp_init.c |  12 ++
> libavcodec/aarch64/vvc/inter.S    | 307 ++++++++++++++++++++++++++++++
> 2 files changed, 319 insertions(+)
>
> diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
> index b39ebb83fc..995e26d163 100644
> --- a/libavcodec/aarch64/vvc/dsp_init.c
> +++ b/libavcodec/aarch64/vvc/dsp_init.c
> @@ -83,6 +83,15 @@ W_AVG_FUN(8)
> W_AVG_FUN(10)
> W_AVG_FUN(12)
>
> +#define DMVR_FUN(fn, bd) \
> +    void ff_vvc_dmvr_ ## fn ## bd ## _neon(int16_t *dst, \
> +        const uint8_t *_src, const ptrdiff_t _src_stride, const int height, \
> +        const intptr_t mx, const intptr_t my, const int width);

Unnecessary const on scalar parameters

> +
> +DMVR_FUN(hv_, 8)
> +DMVR_FUN(hv_, 10)
> +DMVR_FUN(hv_, 12)
> +
> void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
> {
>     int cpu_flags = av_get_cpu_flags();
> @@ -155,6 +164,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
>
>         c->inter.avg = ff_vvc_avg_8_neon;
>         c->inter.w_avg = vvc_w_avg_8;
> +        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
>
>         for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
>             c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
> @@ -196,12 +206,14 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
>     } else if (bd == 10) {
>         c->inter.avg = ff_vvc_avg_10_neon;
>         c->inter.w_avg = vvc_w_avg_10;
> +        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
>
>         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
>         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
>     } else if (bd == 12) {
>         c->inter.avg = ff_vvc_avg_12_neon;
>         c->inter.w_avg = vvc_w_avg_12;
> +        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
>
>         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
>         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
> diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
> index c4c6ab1a72..a0bb356f07 100644
> --- a/libavcodec/aarch64/vvc/inter.S
> +++ b/libavcodec/aarch64/vvc/inter.S
> @@ -226,3 +226,310 @@ vvc_avg avg, 12
> vvc_avg w_avg, 8
> vvc_avg w_avg, 10
> vvc_avg w_avg, 12
> +
> +/* x0: int16_t *dst
> + * x1: const uint8_t *_src
> + * x2: const ptrdiff_t _src_stride
> + * w3: const int height
> + * x4: const intptr_t mx
> + * x5: const intptr_t my
> + * w6: const int width

Unnecessary const

> + */
> +function ff_vvc_dmvr_hv_8_neon, export=1
> +        dst             .req x0
> +        src             .req x1
> +        src_stride      .req x2
> +        height          .req w3
> +        mx              .req x4
> +        my              .req x5
> +        width           .req w6
> +        tmp0            .req x7
> +        tmp1            .req x8
> +
> +        sub             sp, sp, #(VVC_MAX_PB_SIZE * 4)
> +
> +        movrel          x9, X(ff_vvc_inter_luma_dmvr_filters)
> +        add             x12, x9, mx, lsl #1
> +        ldrb            w10, [x12]
> +        ldrb            w11, [x12, #1]
> +        mov             tmp0, sp
> +        add             tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
> +        // We know the value are positive
> +        dup             v0.8h, w10                  // filter_x[0]
> +        dup             v1.8h, w11                  // filter_x[1]

If we don't need these values in GPRs, we could also just do ld1r, 
although that requires incrementing the pointer (which probably can be 
done with a post-increment, [x12], #1) between the loads. Then again, I 
see you load 8 bits but you want them in 16 bit elements, so that would 
require a separate uxtl. So then I guess this use of GPRs for loading is 
reasonable.

All in all, the patch seems fine, except for the unnecessary consts.

// Martin