[FFmpeg-devel] [PATCH] avcodec/riscv: add h264 dc idct rvv
Rémi Denis-Courmont
remi at remlab.net
Mon Jun 10 18:03:11 EEST 2024
Le maanantaina 10. kesäkuuta 2024, 9.22.14 EEST J. Dekker a écrit :
> checkasm: bench runs 131072 (1 << 17)
> h264_idct4_add_dc_8bpp_c: 1.5
> h264_idct4_add_dc_8bpp_rvv_i64: 0.7
RDCYCLE would exhibit larger and more meaningful numbers here.
> h264_idct4_add_dc_9bpp_c: 1.5
> h264_idct4_add_dc_9bpp_rvv_i64: 0.7
> h264_idct4_add_dc_10bpp_c: 1.5
> h264_idct4_add_dc_10bpp_rvv_i64: 0.7
> h264_idct4_add_dc_12bpp_c: 1.2
> h264_idct4_add_dc_12bpp_rvv_i64: 0.7
> h264_idct4_add_dc_14bpp_c: 1.2
> h264_idct4_add_dc_14bpp_rvv_i64: 0.7
> h264_idct8_add_dc_8bpp_c: 5.2
> h264_idct8_add_dc_8bpp_rvv_i64: 1.5
> h264_idct8_add_dc_9bpp_c: 5.5
> h264_idct8_add_dc_9bpp_rvv_i64: 1.2
> h264_idct8_add_dc_10bpp_c: 5.5
> h264_idct8_add_dc_10bpp_rvv_i64: 1.2
> h264_idct8_add_dc_12bpp_c: 4.2
> h264_idct8_add_dc_12bpp_rvv_i64: 1.2
> h264_idct8_add_dc_14bpp_c: 4.2
> h264_idct8_add_dc_14bpp_rvv_i64: 1.2
>
> Signed-off-by: J. Dekker <jdek at itanimul.li>
> ---
> libavcodec/riscv/Makefile | 1 +
> libavcodec/riscv/h264dsp_init.c | 40 +++++++++-
> libavcodec/riscv/h264dsp_rvv.S | 129 ++++++++++++++++++++++++++++++++
> 3 files changed, 167 insertions(+), 3 deletions(-)
> create mode 100644 libavcodec/riscv/h264dsp_rvv.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 590655f829..cb63631de4 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -31,6 +31,7 @@ RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o
> OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
> RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
> OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
> +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_rvv.o
> OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
> RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
> OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
> diff --git a/libavcodec/riscv/h264dsp_init.c
> b/libavcodec/riscv/h264dsp_init.c index dbbf3db400..fe3caf686d 100644
> --- a/libavcodec/riscv/h264dsp_init.c
> +++ b/libavcodec/riscv/h264dsp_init.c
> @@ -1,4 +1,5 @@
> /*
> + * Copyright (c) 2024 J. Dekker <jdek at itanimul.li>
> * Copyright © 2024 Rémi Denis-Courmont.
> *
> * This file is part of FFmpeg.
> @@ -24,22 +25,55 @@
>
> #include "libavutil/attributes.h"
> #include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> #include "libavcodec/h264dsp.h"
>
> extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
> extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
> +void ff_h264_idct4_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_10_rvv(uint8_t *dst, int16_t *block, int
> stride); +void ff_h264_idct4_dc_add_12_rvv(uint8_t *dst, int16_t *block,
> int stride); +void ff_h264_idct8_dc_add_12_rvv(uint8_t *dst, int16_t
> *block, int stride); +void ff_h264_idct4_dc_add_14_rvv(uint8_t *dst,
> int16_t *block, int stride); +void ff_h264_idct8_dc_add_14_rvv(uint8_t
> *dst, int16_t *block, int stride);
>
> -av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int
> bit_depth, +av_cold void ff_h264dsp_init_riscv(H264DSPContext *c, const int
> bit_depth, const int chroma_format_idc)
> {
> #if HAVE_RV
> int flags = av_get_cpu_flags();
>
> if (flags & AV_CPU_FLAG_RVB_BASIC)
> - dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
> + c->startcode_find_candidate = ff_startcode_find_candidate_rvb;
> # if HAVE_RVV
> if (flags & AV_CPU_FLAG_RVV_I32)
> - dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv;
> + c->startcode_find_candidate = ff_startcode_find_candidate_rvv;
> # endif
> + if ((flags & AV_CPU_FLAG_RVV_I64) && ff_get_rv_vlenb() >= 16) {
ff_rv_vlen_least()
> + if (bit_depth == 8) {
switch()
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv;
> + }
> + if (bit_depth == 9) {
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_9_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_9_rvv;
> + }
> + if (bit_depth == 10) {
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_10_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_rvv;
> + }
> + if (bit_depth == 12) {
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_12_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_12_rvv;
> + }
> + if (bit_depth == 14) {
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_14_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_14_rvv;
> + }
> + }
> #endif
> }
> diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
> new file mode 100644
> index 0000000000..c392403add
> --- /dev/null
> +++ b/libavcodec/riscv/h264dsp_rvv.S
> @@ -0,0 +1,129 @@
> +/*
> + * SPDX-License-Identifier: BSD-2-Clause
> + *
> + * Copyright (c) 2024 J. Dekker <jdek at itanimul.li>
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
> IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
> TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
> PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY
> THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT
> (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE
> OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro clip min, max, regs:vararg
> +.irp x, \regs
> + vmax.vx \x, \x, \min
> +.endr
> +.irp x, \regs
> + vmin.vx \x, \x, \max
> +.endr
> +.endm
In my experience, single-use macros just make the code harder to follow and
hide possible optimisations (indeed, this is the case here).
> +
> +.macro idct_dc_add depth
> +func ff_h264_idct_dc_add_\depth\()_rvv, zve64x, zbb
In this particular case, it seems to me that sharing the code between 8-bit
and 16-bit is counter-productive.
> +.if \depth == 8
> + lh a3, 0(a1)
> +.else
> + lw a3, 0(a1)
> +.endif
> + addi a3, a3, 32
> + mv t1, a4 // lines = cols
> + srai a3, a3, 6
> +.if \depth == 8
> + sh zero, 0(a1)
> +.else
> + sw zero, 0(a1)
> +.endif
> +1: add t4, a0, a2 // &src[line * 1]
> + sh1add t5, a2, a0 // &src[line * 2]
> + sh1add t6, a2, t4 // &src[line * 3]
> +.if \depth == 8
> + vsetvli zero, a4, e8, mf2, ta, ma
> + vle8.v v0, (a0)
> + vle8.v v1, (t4)
> + vle8.v v2, (t5)
> + vle8.v v3, (t6)
I doubt that you'll gain anything from unrolling 4x here. Maybe 2x will help
amortise consecutive latencies, but that's pretty much it.
> + vwcvtu.x.x.v v8, v0
> + vwcvtu.x.x.v v9, v1
> + vwcvtu.x.x.v v10, v2
> + vwcvtu.x.x.v v11, v3
As a general note, when up-converting integers, you have to choose between
VWCVT{,U} and V{S,Z}EXT. Here, VZEXT fits better because you could stick with
an element size of 16 bits.
> + vsetvli zero, a4, e16, m1, ta, ma
> +.else
> + vsetvli zero, a4, e16, m1, ta, ma
This resets the same type and length every time. Just set them once in the
calling code then leave VL and VTYPE alone.
> + vle16.v v8, (a0)
> + vle16.v v9, (t4)
> + vle16.v v10, (t5)
> + vle16.v v11, (t6)
> +.endif
> + vadd.vx v8, v8, a3
> + vadd.vx v9, v9, a3
> + vadd.vx v10, v10, a3
> + vadd.vx v11, v11, a3
In the 8-bit case, you could add -128 to a3. Effectively this converts the
samples to signed 8-bit.
> + clip zero, a5, v8, v9, v10, v11
> + addi t1, t1, -4
> +.if \depth == 8
> + vsetvli zero, a4, e8, mf2, ta, ma
It is idiomatic to use zero as source here.
> + vncvt.x.x.w v0, v8
> + vncvt.x.x.w v1, v9
> + vncvt.x.x.w v2, v10
> + vncvt.x.x.w v3, v11
If you added -128 above, then you can use VNCLIP.WI to clip *and* narrow,
eliminating VMAX, VMIN and VNCVT. However you need to add a VXOR.VX to flip the
sign bit back.
Alternatively, replace VMIN and VNCVT with VNCLIPU.WI.
Either way, there won't be much code left to share between 8-bit and HDR...
> + vse8.v v0, (a0)
> + vse8.v v1, (t4)
> + vse8.v v2, (t5)
> + vse8.v v3, (t6)
> +.else
> + vse16.v v8, (a0)
> + vse16.v v9, (t4)
> + vse16.v v10, (t5)
> + vse16.v v11, (t6)
> +.endif
> + sh2add a0, a2, a0 // src += stride * 4
> + bnez t1, 1b
> + ret
> +endfunc
> +.endm
> +
> +idct_dc_add 8
> +idct_dc_add 16
> +
> +func ff_h264_idct4_dc_add_8_rvv, zve64x
> + li a4, 4
> + li a5, 255
Using VNCLIP{,U} should make this redundant.
> + j ff_h264_idct_dc_add_8_rvv
> +endfunc
> +func ff_h264_idct8_dc_add_8_rvv, zve64x
> + li a4, 8
> + li a5, 255
> + j ff_h264_idct_dc_add_8_rvv
> +endfunc
> +
> +.irp depth,9,10,12,14
> +func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
VSETIVLI here. And while at it, pick a 1/2 group multiplier, which should be
enough for 4 elements per vector.
That trick won't work for 8-bit, and that's another reason not to share the
code between bit-widths.
> + li a4, 4
> + li a5, (1 << \depth) - 1
> + j ff_h264_idct_dc_add_16_rvv
> +endfunc
> +
> +func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
> + li a4, 8
> + li a5, (1 << \depth) - 1
> + j ff_h264_idct_dc_add_16_rvv
> +endfunc
> +.endr
Last but not least, in the case of 32-bit or 64-bit rows, and if they are
suitably aligned, it is probably faster to pack all values contiguously in
vector groups using strided loads and stores. Thus all looping and unrolling
would be eliminated.
That trick does not work for 128-bit rows unfortunately, and it also will not
work with more complex functions involving calculations between rows.
--
Rémi Denis-Courmont
http://www.remlab.net/
More information about the ffmpeg-devel
mailing list