[FFmpeg-devel] [PATCH] avcodec/riscv: add h264 dc idct rvv
Rémi Denis-Courmont
remi at remlab.net
Mon Jun 10 18:03:11 EEST 2024
Le maanantaina 10. kesäkuuta 2024, 9.22.14 EEST J. Dekker a écrit :
> checkasm: bench runs 131072 (1 << 17)
> h264_idct4_add_dc_8bpp_c: 1.5
> h264_idct4_add_dc_8bpp_rvv_i64: 0.7
RDCYCLE would exhibit larger and more meaningful numbers here.
> h264_idct4_add_dc_9bpp_c: 1.5
> h264_idct4_add_dc_9bpp_rvv_i64: 0.7
> h264_idct4_add_dc_10bpp_c: 1.5
> h264_idct4_add_dc_10bpp_rvv_i64: 0.7
> h264_idct4_add_dc_12bpp_c: 1.2
> h264_idct4_add_dc_12bpp_rvv_i64: 0.7
> h264_idct4_add_dc_14bpp_c: 1.2
> h264_idct4_add_dc_14bpp_rvv_i64: 0.7
> h264_idct8_add_dc_8bpp_c: 5.2
> h264_idct8_add_dc_8bpp_rvv_i64: 1.5
> h264_idct8_add_dc_9bpp_c: 5.5
> h264_idct8_add_dc_9bpp_rvv_i64: 1.2
> h264_idct8_add_dc_10bpp_c: 5.5
> h264_idct8_add_dc_10bpp_rvv_i64: 1.2
> h264_idct8_add_dc_12bpp_c: 4.2
> h264_idct8_add_dc_12bpp_rvv_i64: 1.2
> h264_idct8_add_dc_14bpp_c: 4.2
> h264_idct8_add_dc_14bpp_rvv_i64: 1.2
>
> Signed-off-by: J. Dekker <jdek at itanimul.li>
> ---
> libavcodec/riscv/Makefile | 1 +
> libavcodec/riscv/h264dsp_init.c | 40 +++++++++-
> libavcodec/riscv/h264dsp_rvv.S | 129 ++++++++++++++++++++++++++++++++
> 3 files changed, 167 insertions(+), 3 deletions(-)
> create mode 100644 libavcodec/riscv/h264dsp_rvv.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 590655f829..cb63631de4 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -31,6 +31,7 @@ RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o
> OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
> RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
> OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
> +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_rvv.o
> OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
> RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
> OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
> diff --git a/libavcodec/riscv/h264dsp_init.c
> b/libavcodec/riscv/h264dsp_init.c index dbbf3db400..fe3caf686d 100644
> --- a/libavcodec/riscv/h264dsp_init.c
> +++ b/libavcodec/riscv/h264dsp_init.c
> @@ -1,4 +1,5 @@
> /*
> + * Copyright (c) 2024 J. Dekker <jdek at itanimul.li>
> * Copyright © 2024 Rémi Denis-Courmont.
> *
> * This file is part of FFmpeg.
> @@ -24,22 +25,55 @@
>
> #include "libavutil/attributes.h"
> #include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> #include "libavcodec/h264dsp.h"
>
> extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
> extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
> +void ff_h264_idct4_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_10_rvv(uint8_t *dst, int16_t *block, int
> stride); +void ff_h264_idct4_dc_add_12_rvv(uint8_t *dst, int16_t *block,
> int stride); +void ff_h264_idct8_dc_add_12_rvv(uint8_t *dst, int16_t
> *block, int stride); +void ff_h264_idct4_dc_add_14_rvv(uint8_t *dst,
> int16_t *block, int stride); +void ff_h264_idct8_dc_add_14_rvv(uint8_t
> *dst, int16_t *block, int stride);
>
> -av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int
> bit_depth, +av_cold void ff_h264dsp_init_riscv(H264DSPContext *c, const int
> bit_depth, const int chroma_format_idc)
> {
> #if HAVE_RV
> int flags = av_get_cpu_flags();
>
> if (flags & AV_CPU_FLAG_RVB_BASIC)
> - dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
> + c->startcode_find_candidate = ff_startcode_find_candidate_rvb;
> # if HAVE_RVV
> if (flags & AV_CPU_FLAG_RVV_I32)
> - dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv;
> + c->startcode_find_candidate = ff_startcode_find_candidate_rvv;
> # endif
> + if ((flags & AV_CPU_FLAG_RVV_I64) && ff_get_rv_vlenb() >= 16) {
ff_rv_vlen_least()
> + if (bit_depth == 8) {
switch()
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv;
> + }
> + if (bit_depth == 9) {
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_9_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_9_rvv;
> + }
> + if (bit_depth == 10) {
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_10_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_rvv;
> + }
> + if (bit_depth == 12) {
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_12_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_12_rvv;
> + }
> + if (bit_depth == 14) {
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_14_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_14_rvv;
> + }
> + }
> #endif
> }
> diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
> new file mode 100644
> index 0000000000..c392403add
> --- /dev/null
> +++ b/libavcodec/riscv/h264dsp_rvv.S
> @@ -0,0 +1,129 @@
> +/*
> + * SPDX-License-Identifier: BSD-2-Clause
> + *
> + * Copyright (c) 2024 J. Dekker <jdek at itanimul.li>
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
> IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
> TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
> PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY
> THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT
> (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE
> OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro clip min, max, regs:vararg
> +.irp x, \regs
> + vmax.vx \x, \x, \min
> +.endr
> +.irp x, \regs
> + vmin.vx \x, \x, \max
> +.endr
> +.endm
In my experience, single-use macros just make the code harder to follow and
hide possible optimisations (indeed, this is the case here).
> +
> +.macro idct_dc_add depth
> +func ff_h264_idct_dc_add_\depth\()_rvv, zve64x, zbb
In this particular case, it seems to me that sharing the code between 8-bit
and 16-bit is counter-productive.
> +.if \depth == 8
> + lh a3, 0(a1)
> +.else
> + lw a3, 0(a1)
> +.endif
> + addi a3, a3, 32
> + mv t1, a4 // lines = cols
> + srai a3, a3, 6
> +.if \depth == 8
> + sh zero, 0(a1)
> +.else
> + sw zero, 0(a1)
> +.endif
> +1: add t4, a0, a2 // &src[line * 1]
> + sh1add t5, a2, a0 // &src[line * 2]
> + sh1add t6, a2, t4 // &src[line * 3]
> +.if \depth == 8
> + vsetvli zero, a4, e8, mf2, ta, ma
> + vle8.v v0, (a0)
> + vle8.v v1, (t4)
> + vle8.v v2, (t5)
> + vle8.v v3, (t6)
I doubt that you'll gain anything from unrolling 4x here. Maybe 2x will help
amortise consecutive latencies, but that's pretty much it.
> + vwcvtu.x.x.v v8, v0
> + vwcvtu.x.x.v v9, v1
> + vwcvtu.x.x.v v10, v2
> + vwcvtu.x.x.v v11, v3
As a general note, when up-converting integers, you have to choose between
VWCVT{,U} and V{S,Z}EXT. Here, VZEXT fits better because you could stick with
an element size of 16 bits.
> + vsetvli zero, a4, e16, m1, ta, ma
> +.else
> + vsetvli zero, a4, e16, m1, ta, ma
This resets the same type and length every time. Just set them once in the
calling code then leave VL and VTYPE alone.
> + vle16.v v8, (a0)
> + vle16.v v9, (t4)
> + vle16.v v10, (t5)
> + vle16.v v11, (t6)
> +.endif
> + vadd.vx v8, v8, a3
> + vadd.vx v9, v9, a3
> + vadd.vx v10, v10, a3
> + vadd.vx v11, v11, a3
In the 8-bit case, you could add -128 to a3. Effectively this converts the
samples to signed 8-bit.
> + clip zero, a5, v8, v9, v10, v11
> + addi t1, t1, -4
> +.if \depth == 8
> + vsetvli zero, a4, e8, mf2, ta, ma
It is idiomatic to use zero as source here.
> + vncvt.x.x.w v0, v8
> + vncvt.x.x.w v1, v9
> + vncvt.x.x.w v2, v10
> + vncvt.x.x.w v3, v11
If you added -128 above, then you can use VNCLIP.WI to clip *and* narrow,
eliminating VMAX, VMIN and VNCVT. However you need to add a VXOR.VX to flip the
sign bit back.
Alternatively, replace VMIN and VNCVT with VNCLIPU.WI.
Either way, there won't be much code left to share between 8-bit and HDR...
> + vse8.v v0, (a0)
> + vse8.v v1, (t4)
> + vse8.v v2, (t5)
> + vse8.v v3, (t6)
> +.else
> + vse16.v v8, (a0)
> + vse16.v v9, (t4)
> + vse16.v v10, (t5)
> + vse16.v v11, (t6)
> +.endif
> + sh2add a0, a2, a0 // src += stride * 4
> + bnez t1, 1b
> + ret
> +endfunc
> +.endm
> +
> +idct_dc_add 8
> +idct_dc_add 16
> +
> +func ff_h264_idct4_dc_add_8_rvv, zve64x
> + li a4, 4
> + li a5, 255
Using VNCLIP{,U} should make this redundant.
> + j ff_h264_idct_dc_add_8_rvv
> +endfunc
> +func ff_h264_idct8_dc_add_8_rvv, zve64x
> + li a4, 8
> + li a5, 255
> + j ff_h264_idct_dc_add_8_rvv
> +endfunc
> +
> +.irp depth,9,10,12,14
> +func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
VSETIVLI here. And while at it, pick a 1/2 group multiplier, which should be
enough for 4 elements per vector.
That trick won't work for 8-bit, and that's another reason not to share the
code between bit-widths.
> + li a4, 4
> + li a5, (1 << \depth) - 1
> + j ff_h264_idct_dc_add_16_rvv
> +endfunc
> +
> +func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
> + li a4, 8
> + li a5, (1 << \depth) - 1
> + j ff_h264_idct_dc_add_16_rvv
> +endfunc
> +.endr
Last but not least, in the case of 32-bit or 64-bit rows, and if they are
suitably aligned, it is probably faster to pack all values contiguously in
vector groups using strided loads and stores. Thus all looping and unrolling
would be eliminated.
That trick does not work for 128-bit rows unfortunately, and it also will not
work with more complex functions involving calculations between rows.
--
Rémi Denis-Courmont
http://www.remlab.net/
More information about the ffmpeg-devel
mailing list