[FFmpeg-devel] [PATCH 2/2] lavc/vvc_mc: R-V V dmvr

flow gg hlefthleft at gmail.com
Fri Oct 11 13:49:43 EEST 2024


Ping. ("[PATCH 1/5] lavc/vvc_mc: R-V V put_pixels" comes after this patch.)

On Sun, Sep 29, 2024 at 00:47, <uk7b at foxmail.com> wrote:

> From: sunyuechi <sunyuechi at iscas.ac.cn>
>
>                                      k230               banana_f3
> dmvr_8_12x20_c:                       619.3 ( 1.00x)    624.1 ( 1.00x)
> dmvr_8_12x20_rvv_i32:                 128.6 ( 4.82x)    103.4 ( 6.04x)
> dmvr_8_20x12_c:                       610.0 ( 1.00x)    665.6 ( 1.00x)
> dmvr_8_20x12_rvv_i32:                 137.6 ( 4.44x)    92.9 ( 7.17x)
> dmvr_8_20x20_c:                      1008.0 ( 1.00x)    1082.7 ( 1.00x)
> dmvr_8_20x20_rvv_i32:                 221.1 ( 4.56x)    155.4 ( 6.97x)
> dmvr_h_8_12x20_c:                    2008.0 ( 1.00x)    2009.7 ( 1.00x)
> dmvr_h_8_12x20_rvv_i32:               239.6 ( 8.38x)    186.7 (10.77x)
> dmvr_h_8_20x12_c:                    1989.5 ( 1.00x)    2009.4 ( 1.00x)
> dmvr_h_8_20x12_rvv_i32:               230.3 ( 8.64x)    155.4 (12.93x)
> dmvr_h_8_20x20_c:                    3304.1 ( 1.00x)    3342.9 ( 1.00x)
> dmvr_h_8_20x20_rvv_i32:               378.3 ( 8.73x)    248.9 (13.43x)
> dmvr_hv_8_12x20_c:                   3609.8 ( 1.00x)    3603.4 ( 1.00x)
> dmvr_hv_8_12x20_rvv_i32:              369.1 ( 9.78x)    322.1 (11.19x)
> dmvr_hv_8_20x12_c:                   3628.3 ( 1.00x)    3624.2 ( 1.00x)
> dmvr_hv_8_20x12_rvv_i32:              322.8 (11.24x)    238.7 (15.19x)
> dmvr_hv_8_20x20_c:                   5933.8 ( 1.00x)    5936.6 ( 1.00x)
> dmvr_hv_8_20x20_rvv_i32:              526.5 (11.27x)    374.1 (15.87x)
> dmvr_v_8_12x20_c:                    2156.3 ( 1.00x)    2155.4 ( 1.00x)
> dmvr_v_8_12x20_rvv_i32:               239.6 ( 9.00x)    176.2 (12.24x)
> dmvr_v_8_20x12_c:                    2137.6 ( 1.00x)    2165.9 ( 1.00x)
> dmvr_v_8_20x12_rvv_i32:               230.3 ( 9.28x)    155.2 (13.96x)
> dmvr_v_8_20x20_c:                    4183.8 ( 1.00x)    3592.9 ( 1.00x)
> dmvr_v_8_20x20_rvv_i32:               369.3 (11.33x)    249.2 (14.42x)
> ---
>  libavcodec/riscv/vvc/vvc_mc_rvv.S  | 120 +++++++++++++++++++++++++++++
>  libavcodec/riscv/vvc/vvcdsp_init.c |  22 ++++++
>  2 files changed, 142 insertions(+)
>
> diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> index 18532616d9..2c634af48f 100644
> --- a/libavcodec/riscv/vvc/vvc_mc_rvv.S
> +++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> @@ -285,3 +285,123 @@ endfunc
>  func_w_avg 128
>  func_w_avg 256
>  #endif
> +
> +func dmvr zve32x, zbb, zba
> +        lpad    0
> +        li                t0, 4
> +1:
> +        add               t1, a1, a2
> +        addi              t4, a0, 128*2
> +        vle8.v            v0, (a1)
> +        vle8.v            v4, (t1)
> +        addi              a3, a3, -2
> +        vwmulu.vx         v16, v0, t0
> +        vwmulu.vx         v20, v4, t0
> +        vse16.v           v16, (a0)
> +        vse16.v           v20, (t4)
> +        sh1add            a1, a2, a1
> +        add               a0, a0, 128*2*2
> +        bnez              a3, 1b
> +        ret
> +endfunc
> +
> +.macro dmvr_h_v mn, type, w, vlen
> +dmvr_\type\vlen\w:
> +        lla               t4, ff_vvc_inter_luma_dmvr_filters
> +        sh1add            t4, \mn, t4
> +        lbu               t5, (t4)
> +        lbu               t6, 1(t4)
> +1:
> +        vsetvlstatic8     \w, \vlen
> +.ifc \type,h
> +        addi              t0, a1, 1
> +        addi              t1, a1, 2
> +.else
> +        add               t0, a1, a2
> +        add               t1, t0, a2
> +.endif
> +        vle8.v            v0, (a1)
> +        vle8.v            v4, (t0)
> +        vle8.v            v8, (t1)
> +        addi              a3, a3, -2
> +        addi              t2, a0, 128*2
> +        vwmulu.vx         v12, v0, t5
> +        vwmulu.vx         v24, v4, t5
> +        vwmaccu.vx        v12, t6, v4
> +        vwmaccu.vx        v24, t6, v8
> +        vsetvlstatic16    \w, \vlen
> +        vssrl.vi          v12, v12, 2
> +        vssrl.vi          v24, v24, 2
> +        vse16.v           v12, (a0)
> +        vse16.v           v24, (t2)
> +        add               a0, a0, 128*4
> +        sh1add            a1, a2, a1
> +        bnez              a3, 1b
> +        ret
> +.endm
> +
> +.macro dmvr_load_h dst, filter0, filter1, w, vlen
> +        vsetvlstatic8     \w, \vlen
> +        addi              a6, a1, 1
> +        vle8.v            \dst, (a1)
> +        vle8.v            v2, (a6)
> +        vwmulu.vx         v4, \dst, \filter0
> +        vwmaccu.vx        v4, \filter1, v2
> +        vsetvlstatic16    \w, \vlen
> +        vssrl.vi          \dst, v4, 2
> +.endm
> +
> +.macro dmvr_hv w, vlen
> +dmvr_hv\vlen\w:
> +        lla               t0, ff_vvc_inter_luma_dmvr_filters
> +        sh1add            t1, a4, t0
> +        sh1add            t2, a5, t0
> +        lbu               t3, (t1)          // filter[mx][0]
> +        lbu               t4, 1(t1)         // filter[mx][1]
> +        lbu               t5, (t2)          // filter[my][0]
> +        lbu               t6, 1(t2)         // filter[my][1]
> +        dmvr_load_h       v12, t3, t4, \w, \vlen
> +        add               a1, a1, a2
> +1:
> +        vmul.vx           v28, v12, t5
> +        addi              a3, a3, -1
> +        dmvr_load_h       v12, t3, t4, \w, \vlen
> +        vmacc.vx          v28, t6, v12
> +        vssrl.vi          v28, v28, 4
> +        vse16.v           v28, (a0)
> +        add               a1, a1, a2
> +        addi              a0, a0, 128*2
> +        bnez              a3, 1b
> +        ret
> +.endm
> +
> +.macro func_dmvr vlen, name
> +func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x, zbb, zba
> +        lpad    0
> +        li                t0, 20
> +        beq               a6, t0, DMVR\name\vlen\()20
> +        .irp w,12,20
> +DMVR\name\vlen\w:
> +        .ifc \name, dmvr
> +        vsetvlstatic8     \w, \vlen
> +        j                 \name
> +        .else
> +        csrwi             vxrm, 0
> +        j                 \name\()\vlen\w
> +        .endif
> +        .endr
> +endfunc
> +.endm
> +
> +
> +.irp vlen,256,128
> +.irp w,12,20
> +dmvr_h_v a4, h, \w, \vlen
> +dmvr_h_v a5, v, \w, \vlen
> +dmvr_hv \w, \vlen
> +.endr
> +func_dmvr \vlen, dmvr
> +func_dmvr \vlen, dmvr_h
> +func_dmvr \vlen, dmvr_v
> +func_dmvr \vlen, dmvr_hv
> +.endr
> diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
> index ac1e7dda7d..7df3ce58db 100644
> --- a/libavcodec/riscv/vvc/vvcdsp_init.c
> +++ b/libavcodec/riscv/vvc/vvcdsp_init.c
> @@ -37,6 +37,26 @@ void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,
>  AVG_PROTOTYPES(8, rvv_128)
>  AVG_PROTOTYPES(8, rvv_256)
>
> +#define DMVR_PROTOTYPES(bd, opt)                                                                    \
> +void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,               \
> +     int height, intptr_t mx, intptr_t my, int width);                                              \
> +void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,             \
> +     int height, intptr_t mx, intptr_t my, int width);                                              \
> +void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,             \
> +     int height, intptr_t mx, intptr_t my, int width);                                              \
> +void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,            \
> +     int height, intptr_t mx, intptr_t my, int width);                                              \
> +
> +DMVR_PROTOTYPES(8, rvv_128)
> +DMVR_PROTOTYPES(8, rvv_256)
> +
> +#define DMVR_INIT(bd, opt) do {                                    \
> +    c->inter.dmvr[0][0]   = ff_vvc_dmvr_##bd##_##opt;              \
> +    c->inter.dmvr[0][1]   = ff_vvc_dmvr_h_##bd##_##opt;            \
> +    c->inter.dmvr[1][0]   = ff_vvc_dmvr_v_##bd##_##opt;            \
> +    c->inter.dmvr[1][1]   = ff_vvc_dmvr_hv_##bd##_##opt;           \
> +} while (0)
> +
>  void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
>  {
>  #if HAVE_RVV
> @@ -51,6 +71,7 @@ void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
>  # if (__riscv_xlen == 64)
>                  c->inter.w_avg    = ff_vvc_w_avg_8_rvv_256;
>  # endif
> +                DMVR_INIT(8, rvv_256);
>                  break;
>              default:
>                  break;
> @@ -63,6 +84,7 @@ void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
>  # if (__riscv_xlen == 64)
>                  c->inter.w_avg    = ff_vvc_w_avg_8_rvv_128;
>  # endif
> +                DMVR_INIT(8, rvv_128);
>                  break;
>              default:
>                  break;
> --
> 2.46.2
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
>


More information about the ffmpeg-devel mailing list