[FFmpeg-devel] [PATCH v2 2/3] aarch64/vvc: Add dmvr_hv
Martin Storsjö
martin at martin.st
Thu Sep 26 14:36:10 EEST 2024
On Mon, 23 Sep 2024, Zhao Zhili wrote:
> From: Zhao Zhili <zhilizhao at tencent.com>
>
> dmvr_hv_8_12x20_c: 8.0 ( 1.00x)
> dmvr_hv_8_12x20_neon: 1.2 ( 6.62x)
> dmvr_hv_8_20x12_c: 8.0 ( 1.00x)
> dmvr_hv_8_20x12_neon: 0.9 ( 8.37x)
> dmvr_hv_8_20x20_c: 12.9 ( 1.00x)
> dmvr_hv_8_20x20_neon: 1.7 ( 7.62x)
> dmvr_hv_10_12x20_c: 7.0 ( 1.00x)
> dmvr_hv_10_12x20_neon: 1.7 ( 4.09x)
> dmvr_hv_10_20x12_c: 7.0 ( 1.00x)
> dmvr_hv_10_20x12_neon: 1.7 ( 4.09x)
> dmvr_hv_10_20x20_c: 11.2 ( 1.00x)
> dmvr_hv_10_20x20_neon: 2.7 ( 4.15x)
> dmvr_hv_12_12x20_c: 6.5 ( 1.00x)
> dmvr_hv_12_12x20_neon: 1.7 ( 3.79x)
> dmvr_hv_12_20x12_c: 6.5 ( 1.00x)
> dmvr_hv_12_20x12_neon: 1.7 ( 3.79x)
> dmvr_hv_12_20x20_c: 10.2 ( 1.00x)
> dmvr_hv_12_20x20_neon: 2.2 ( 4.64x)
> ---
> libavcodec/aarch64/vvc/dsp_init.c | 12 ++
> libavcodec/aarch64/vvc/inter.S | 307 ++++++++++++++++++++++++++++++
> 2 files changed, 319 insertions(+)
>
> diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
> index b39ebb83fc..995e26d163 100644
> --- a/libavcodec/aarch64/vvc/dsp_init.c
> +++ b/libavcodec/aarch64/vvc/dsp_init.c
> @@ -83,6 +83,15 @@ W_AVG_FUN(8)
> W_AVG_FUN(10)
> W_AVG_FUN(12)
>
> +#define DMVR_FUN(fn, bd) \
> + void ff_vvc_dmvr_ ## fn ## bd ## _neon(int16_t *dst, \
> + const uint8_t *_src, const ptrdiff_t _src_stride, const int height, \
> + const intptr_t mx, const intptr_t my, const int width);
Unnecessary const on scalar parameters
> +
> +DMVR_FUN(hv_, 8)
> +DMVR_FUN(hv_, 10)
> +DMVR_FUN(hv_, 12)
> +
> void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
> {
> int cpu_flags = av_get_cpu_flags();
> @@ -155,6 +164,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
>
> c->inter.avg = ff_vvc_avg_8_neon;
> c->inter.w_avg = vvc_w_avg_8;
> + c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
>
> for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
> c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
> @@ -196,12 +206,14 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
> } else if (bd == 10) {
> c->inter.avg = ff_vvc_avg_10_neon;
> c->inter.w_avg = vvc_w_avg_10;
> + c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
>
> c->alf.filter[LUMA] = alf_filter_luma_10_neon;
> c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
> } else if (bd == 12) {
> c->inter.avg = ff_vvc_avg_12_neon;
> c->inter.w_avg = vvc_w_avg_12;
> + c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
>
> c->alf.filter[LUMA] = alf_filter_luma_12_neon;
> c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
> diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
> index c4c6ab1a72..a0bb356f07 100644
> --- a/libavcodec/aarch64/vvc/inter.S
> +++ b/libavcodec/aarch64/vvc/inter.S
> @@ -226,3 +226,310 @@ vvc_avg avg, 12
> vvc_avg w_avg, 8
> vvc_avg w_avg, 10
> vvc_avg w_avg, 12
> +
> +/* x0: int16_t *dst
> + * x1: const uint8_t *_src
> + * x2: const ptrdiff_t _src_stride
> + * w3: const int height
> + * x4: const intptr_t mx
> + * x5: const intptr_t my
> + * w6: const int width
Unnecessary const
> + */
> +function ff_vvc_dmvr_hv_8_neon, export=1
> + dst .req x0
> + src .req x1
> + src_stride .req x2
> + height .req w3
> + mx .req x4
> + my .req x5
> + width .req w6
> + tmp0 .req x7
> + tmp1 .req x8
> +
> + sub sp, sp, #(VVC_MAX_PB_SIZE * 4)
> +
> + movrel x9, X(ff_vvc_inter_luma_dmvr_filters)
> + add x12, x9, mx, lsl #1
> + ldrb w10, [x12]
> + ldrb w11, [x12, #1]
> + mov tmp0, sp
> + add tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
> + // We know the value are positive
> + dup v0.8h, w10 // filter_x[0]
> + dup v1.8h, w11 // filter_x[1]
If we don't need these values in GPRs, we could also just do ld1r,
although that requires incrementing the pointer (which probably can be
done with a post-increment, [x12], #1) between the loads. Then again, I
see you load 8 bits but you want them in 16 bit elements, so that would
require a separate uxtl. So then I guess this use of GPRs for loading is
reasonable.
All in all, the patch seems fine, except for the unnecessary consts.
// Martin
More information about the ffmpeg-devel
mailing list