[FFmpeg-devel] [PATCH v2] avcodec/riscv: add h264 qpel
J. Dekker
jdek at itanimul.li
Wed Sep 4 16:32:21 EEST 2024
From: Niklas Haas <git at haasn.dev>
checkasm: bench runs 131072 (1 << 17)
avg_h264_qpel_4_mc00_8_c: 38.2 ( 1.00x)
avg_h264_qpel_4_mc00_8_rvv_i32: 27.7 ( 1.38x)
avg_h264_qpel_4_mc01_8_c: 225.7 ( 1.00x)
avg_h264_qpel_4_mc01_8_rvv_i32: 100.5 ( 2.25x)
avg_h264_qpel_4_mc02_8_c: 215.2 ( 1.00x)
avg_h264_qpel_4_mc02_8_rvv_i32: 79.7 ( 2.70x)
avg_h264_qpel_4_mc03_8_c: 225.4 ( 1.00x)
avg_h264_qpel_4_mc03_8_rvv_i32: 90.2 ( 2.50x)
avg_h264_qpel_4_mc10_8_c: 173.4 ( 1.00x)
avg_h264_qpel_4_mc10_8_rvv_i32: 131.7 ( 1.32x)
avg_h264_qpel_4_mc11_8_c: 339.9 ( 1.00x)
avg_h264_qpel_4_mc11_8_rvv_i32: 194.2 ( 1.75x)
avg_h264_qpel_4_mc12_8_c: 548.5 ( 1.00x)
avg_h264_qpel_4_mc12_8_rvv_i32: 350.4 ( 1.56x)
avg_h264_qpel_4_mc13_8_c: 350.7 ( 1.00x)
avg_h264_qpel_4_mc13_8_rvv_i32: 194.2 ( 1.81x)
avg_h264_qpel_4_mc20_8_c: 142.2 ( 1.00x)
avg_h264_qpel_4_mc20_8_rvv_i32: 121.5 ( 1.17x)
avg_h264_qpel_4_mc21_8_c: 485.9 ( 1.00x)
avg_h264_qpel_4_mc21_8_rvv_i32: 392.2 ( 1.24x)
avg_h264_qpel_4_mc22_8_c: 350.4 ( 1.00x)
avg_h264_qpel_4_mc22_8_rvv_i32: 277.7 ( 1.26x)
avg_h264_qpel_4_mc23_8_c: 485.9 ( 1.00x)
avg_h264_qpel_4_mc23_8_rvv_i32: 392.2 ( 1.24x)
avg_h264_qpel_4_mc30_8_c: 173.4 ( 1.00x)
avg_h264_qpel_4_mc30_8_rvv_i32: 121.5 ( 1.43x)
avg_h264_qpel_4_mc31_8_c: 340.2 ( 1.00x)
avg_h264_qpel_4_mc31_8_rvv_i32: 194.2 ( 1.75x)
avg_h264_qpel_4_mc32_8_c: 548.5 ( 1.00x)
avg_h264_qpel_4_mc32_8_rvv_i32: 350.4 ( 1.56x)
avg_h264_qpel_4_mc33_8_c: 350.4 ( 1.00x)
avg_h264_qpel_4_mc33_8_rvv_i32: 194.4 ( 1.80x)
avg_h264_qpel_8_mc00_8_c: 111.0 ( 1.00x)
avg_h264_qpel_8_mc00_8_rvv_i32: 48.5 ( 2.29x)
avg_h264_qpel_8_mc01_8_c: 777.7 ( 1.00x)
avg_h264_qpel_8_mc01_8_rvv_i32: 162.9 ( 4.77x)
avg_h264_qpel_8_mc02_8_c: 777.7 ( 1.00x)
avg_h264_qpel_8_mc02_8_rvv_i32: 142.2 ( 5.47x)
avg_h264_qpel_8_mc03_8_c: 767.2 ( 1.00x)
avg_h264_qpel_8_mc03_8_rvv_i32: 162.9 ( 4.71x)
avg_h264_qpel_8_mc10_8_c: 621.5 ( 1.00x)
avg_h264_qpel_8_mc10_8_rvv_i32: 246.4 ( 2.52x)
avg_h264_qpel_8_mc11_8_c: 1204.7 ( 1.00x)
avg_h264_qpel_8_mc11_8_rvv_i32: 360.9 ( 3.34x)
avg_h264_qpel_8_mc12_8_c: 1892.2 ( 1.00x)
avg_h264_qpel_8_mc12_8_rvv_i32: 569.2 ( 3.32x)
avg_h264_qpel_8_mc13_8_c: 1204.7 ( 1.00x)
avg_h264_qpel_8_mc13_8_rvv_i32: 360.9 ( 3.34x)
avg_h264_qpel_8_mc20_8_c: 527.7 ( 1.00x)
avg_h264_qpel_8_mc20_8_rvv_i32: 225.4 ( 2.34x)
avg_h264_qpel_8_mc21_8_c: 1694.2 ( 1.00x)
avg_h264_qpel_8_mc21_8_rvv_i32: 652.7 ( 2.60x)
avg_h264_qpel_8_mc22_8_c: 1267.2 ( 1.00x)
avg_h264_qpel_8_mc22_8_rvv_i32: 433.9 ( 2.92x)
avg_h264_qpel_8_mc23_8_c: 1704.7 ( 1.00x)
avg_h264_qpel_8_mc23_8_rvv_i32: 642.2 ( 2.65x)
avg_h264_qpel_8_mc30_8_c: 611.0 ( 1.00x)
avg_h264_qpel_8_mc30_8_rvv_i32: 235.9 ( 2.59x)
avg_h264_qpel_8_mc31_8_c: 1204.7 ( 1.00x)
avg_h264_qpel_8_mc31_8_rvv_i32: 360.9 ( 3.34x)
avg_h264_qpel_8_mc32_8_c: 1902.7 ( 1.00x)
avg_h264_qpel_8_mc32_8_rvv_i32: 579.7 ( 3.28x)
avg_h264_qpel_8_mc33_8_c: 1215.0 ( 1.00x)
avg_h264_qpel_8_mc33_8_rvv_i32: 371.4 ( 3.27x)
avg_h264_qpel_16_mc00_8_c: 433.7 ( 1.00x)
avg_h264_qpel_16_mc00_8_rvv_i32: 79.7 ( 5.44x)
avg_h264_qpel_16_mc01_8_c: 2840.2 ( 1.00x)
avg_h264_qpel_16_mc01_8_rvv_i32: 308.9 ( 9.19x)
avg_h264_qpel_16_mc02_8_c: 2944.4 ( 1.00x)
avg_h264_qpel_16_mc02_8_rvv_i32: 277.7 (10.60x)
avg_h264_qpel_16_mc03_8_c: 2850.4 ( 1.00x)
avg_h264_qpel_16_mc03_8_rvv_i32: 308.9 ( 9.23x)
avg_h264_qpel_16_mc10_8_c: 2402.7 ( 1.00x)
avg_h264_qpel_16_mc10_8_rvv_i32: 475.4 ( 5.05x)
avg_h264_qpel_16_mc11_8_c: 4673.4 ( 1.00x)
avg_h264_qpel_16_mc11_8_rvv_i32: 725.5 ( 6.44x)
avg_h264_qpel_16_mc12_8_c: 7434.2 ( 1.00x)
avg_h264_qpel_16_mc12_8_rvv_i32: 1069.2 ( 6.95x)
avg_h264_qpel_16_mc13_8_c: 4642.4 ( 1.00x)
avg_h264_qpel_16_mc13_8_rvv_i32: 725.5 ( 6.40x)
avg_h264_qpel_16_mc20_8_c: 2058.9 ( 1.00x)
avg_h264_qpel_16_mc20_8_rvv_i32: 433.7 ( 4.75x)
avg_h264_qpel_16_mc21_8_c: 6736.2 ( 1.00x)
avg_h264_qpel_16_mc21_8_rvv_i32: 1225.5 ( 5.50x)
avg_h264_qpel_16_mc22_8_c: 5048.4 ( 1.00x)
avg_h264_qpel_16_mc22_8_rvv_i32: 788.0 ( 6.41x)
avg_h264_qpel_16_mc23_8_c: 6756.9 ( 1.00x)
avg_h264_qpel_16_mc23_8_rvv_i32: 1225.7 ( 5.51x)
avg_h264_qpel_16_mc30_8_c: 2402.7 ( 1.00x)
avg_h264_qpel_16_mc30_8_rvv_i32: 475.4 ( 5.05x)
avg_h264_qpel_16_mc31_8_c: 4642.4 ( 1.00x)
avg_h264_qpel_16_mc31_8_rvv_i32: 725.5 ( 6.40x)
avg_h264_qpel_16_mc32_8_c: 7433.9 ( 1.00x)
avg_h264_qpel_16_mc32_8_rvv_i32: 1079.7 ( 6.89x)
avg_h264_qpel_16_mc33_8_c: 4663.2 ( 1.00x)
avg_h264_qpel_16_mc33_8_rvv_i32: 725.5 ( 6.43x)
put_h264_qpel_4_mc00_8_c: 38.2 ( 1.00x)
put_h264_qpel_4_mc00_8_rvv_i32: 27.4 ( 1.39x)
put_h264_qpel_4_mc01_8_c: 204.7 ( 1.00x)
put_h264_qpel_4_mc01_8_rvv_i32: 90.0 ( 2.28x)
put_h264_qpel_4_mc02_8_c: 194.2 ( 1.00x)
put_h264_qpel_4_mc02_8_rvv_i32: 79.7 ( 2.44x)
put_h264_qpel_4_mc03_8_c: 204.7 ( 1.00x)
put_h264_qpel_4_mc03_8_rvv_i32: 90.2 ( 2.27x)
put_h264_qpel_4_mc10_8_c: 162.9 ( 1.00x)
put_h264_qpel_4_mc10_8_rvv_i32: 121.5 ( 1.34x)
put_h264_qpel_4_mc11_8_c: 340.2 ( 1.00x)
put_h264_qpel_4_mc11_8_rvv_i32: 183.9 ( 1.85x)
put_h264_qpel_4_mc12_8_c: 527.7 ( 1.00x)
put_h264_qpel_4_mc12_8_rvv_i32: 350.7 ( 1.50x)
put_h264_qpel_4_mc13_8_c: 340.2 ( 1.00x)
put_h264_qpel_4_mc13_8_rvv_i32: 183.9 ( 1.85x)
put_h264_qpel_4_mc20_8_c: 121.2 ( 1.00x)
put_h264_qpel_4_mc20_8_rvv_i32: 121.5 ( 1.00x)
put_h264_qpel_4_mc21_8_c: 475.7 ( 1.00x)
put_h264_qpel_4_mc21_8_rvv_i32: 381.7 ( 1.25x)
put_h264_qpel_4_mc22_8_c: 340.2 ( 1.00x)
put_h264_qpel_4_mc22_8_rvv_i32: 267.2 ( 1.27x)
put_h264_qpel_4_mc23_8_c: 475.4 ( 1.00x)
put_h264_qpel_4_mc23_8_rvv_i32: 381.7 ( 1.25x)
put_h264_qpel_4_mc30_8_c: 173.4 ( 1.00x)
put_h264_qpel_4_mc30_8_rvv_i32: 121.2 ( 1.43x)
put_h264_qpel_4_mc31_8_c: 350.4 ( 1.00x)
put_h264_qpel_4_mc31_8_rvv_i32: 183.9 ( 1.91x)
put_h264_qpel_4_mc32_8_c: 527.5 ( 1.00x)
put_h264_qpel_4_mc32_8_rvv_i32: 350.4 ( 1.51x)
put_h264_qpel_4_mc33_8_c: 339.9 ( 1.00x)
put_h264_qpel_4_mc33_8_rvv_i32: 183.9 ( 1.85x)
put_h264_qpel_8_mc00_8_c: 100.5 ( 1.00x)
put_h264_qpel_8_mc00_8_rvv_i32: 38.2 ( 2.63x)
put_h264_qpel_8_mc01_8_c: 736.2 ( 1.00x)
put_h264_qpel_8_mc01_8_rvv_i32: 152.7 ( 4.82x)
put_h264_qpel_8_mc02_8_c: 673.5 ( 1.00x)
put_h264_qpel_8_mc02_8_rvv_i32: 131.9 ( 5.10x)
put_h264_qpel_8_mc03_8_c: 736.0 ( 1.00x)
put_h264_qpel_8_mc03_8_rvv_i32: 152.7 ( 4.82x)
put_h264_qpel_8_mc10_8_c: 600.5 ( 1.00x)
put_h264_qpel_8_mc10_8_rvv_i32: 225.4 ( 2.66x)
put_h264_qpel_8_mc11_8_c: 1173.5 ( 1.00x)
put_h264_qpel_8_mc11_8_rvv_i32: 350.4 ( 3.35x)
put_h264_qpel_8_mc12_8_c: 1861.0 ( 1.00x)
put_h264_qpel_8_mc12_8_rvv_i32: 559.0 ( 3.33x)
put_h264_qpel_8_mc13_8_c: 1163.0 ( 1.00x)
put_h264_qpel_8_mc13_8_rvv_i32: 340.2 ( 3.42x)
put_h264_qpel_8_mc20_8_c: 454.7 ( 1.00x)
put_h264_qpel_8_mc20_8_rvv_i32: 214.9 ( 2.12x)
put_h264_qpel_8_mc21_8_c: 1673.5 ( 1.00x)
put_h264_qpel_8_mc21_8_rvv_i32: 632.0 ( 2.65x)
put_h264_qpel_8_mc22_8_c: 1163.0 ( 1.00x)
put_h264_qpel_8_mc22_8_rvv_i32: 423.4 ( 2.75x)
put_h264_qpel_8_mc23_8_c: 1663.2 ( 1.00x)
put_h264_qpel_8_mc23_8_rvv_i32: 632.0 ( 2.63x)
put_h264_qpel_8_mc30_8_c: 600.7 ( 1.00x)
put_h264_qpel_8_mc30_8_rvv_i32: 225.4 ( 2.66x)
put_h264_qpel_8_mc31_8_c: 1173.5 ( 1.00x)
put_h264_qpel_8_mc31_8_rvv_i32: 350.7 ( 3.35x)
put_h264_qpel_8_mc32_8_c: 1850.7 ( 1.00x)
put_h264_qpel_8_mc32_8_rvv_i32: 569.2 ( 3.25x)
put_h264_qpel_8_mc33_8_c: 1173.5 ( 1.00x)
put_h264_qpel_8_mc33_8_rvv_i32: 350.7 ( 3.35x)
put_h264_qpel_16_mc00_8_c: 308.9 ( 1.00x)
put_h264_qpel_16_mc00_8_rvv_i32: 48.5 ( 6.38x)
put_h264_qpel_16_mc01_8_c: 2756.7 ( 1.00x)
put_h264_qpel_16_mc01_8_rvv_i32: 287.9 ( 9.57x)
put_h264_qpel_16_mc02_8_c: 2558.9 ( 1.00x)
put_h264_qpel_16_mc02_8_rvv_i32: 267.2 ( 9.58x)
put_h264_qpel_16_mc03_8_c: 2735.9 ( 1.00x)
put_h264_qpel_16_mc03_8_rvv_i32: 287.9 ( 9.50x)
put_h264_qpel_16_mc10_8_c: 2277.7 ( 1.00x)
put_h264_qpel_16_mc10_8_rvv_i32: 444.2 ( 5.13x)
put_h264_qpel_16_mc11_8_c: 4538.2 ( 1.00x)
put_h264_qpel_16_mc11_8_rvv_i32: 694.2 ( 6.54x)
put_h264_qpel_16_mc12_8_c: 7350.9 ( 1.00x)
put_h264_qpel_16_mc12_8_rvv_i32: 1048.5 ( 7.01x)
put_h264_qpel_16_mc13_8_c: 4548.7 ( 1.00x)
put_h264_qpel_16_mc13_8_rvv_i32: 694.2 ( 6.55x)
put_h264_qpel_16_mc20_8_c: 1809.0 ( 1.00x)
put_h264_qpel_16_mc20_8_rvv_i32: 423.4 ( 4.27x)
put_h264_qpel_16_mc21_8_c: 6631.9 ( 1.00x)
put_h264_qpel_16_mc21_8_rvv_i32: 1204.7 ( 5.51x)
put_h264_qpel_16_mc22_8_c: 4590.2 ( 1.00x)
put_h264_qpel_16_mc22_8_rvv_i32: 767.2 ( 5.98x)
put_h264_qpel_16_mc23_8_c: 6642.4 ( 1.00x)
put_h264_qpel_16_mc23_8_rvv_i32: 1204.7 ( 5.51x)
put_h264_qpel_16_mc30_8_c: 2288.2 ( 1.00x)
put_h264_qpel_16_mc30_8_rvv_i32: 444.2 ( 5.15x)
put_h264_qpel_16_mc31_8_c: 4548.4 ( 1.00x)
put_h264_qpel_16_mc31_8_rvv_i32: 715.2 ( 6.36x)
put_h264_qpel_16_mc32_8_c: 7340.4 ( 1.00x)
put_h264_qpel_16_mc32_8_rvv_i32: 1059.0 ( 6.93x)
put_h264_qpel_16_mc33_8_c: 4559.2 ( 1.00x)
put_h264_qpel_16_mc33_8_rvv_i32: 704.7 ( 6.47x)
Signed-off-by: Niklas Haas <git at haasn.dev>
Signed-off-by: J. Dekker <jdek at itanimul.li>
---
libavcodec/h264qpel.c | 2 +
libavcodec/h264qpel.h | 1 +
libavcodec/riscv/Makefile | 2 +
libavcodec/riscv/h264qpel_init.c | 113 ++++++++
libavcodec/riscv/h264qpel_rvv.S | 445 +++++++++++++++++++++++++++++++
5 files changed, 563 insertions(+)
create mode 100644 libavcodec/riscv/h264qpel_init.c
create mode 100644 libavcodec/riscv/h264qpel_rvv.S
Version with push/pop macros removed and the other review comments implemented.
diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c
index 65fef03304..faca1e8953 100644
--- a/libavcodec/h264qpel.c
+++ b/libavcodec/h264qpel.c
@@ -102,6 +102,8 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
ff_h264qpel_init_arm(c, bit_depth);
#elif ARCH_PPC
ff_h264qpel_init_ppc(c, bit_depth);
+#elif ARCH_RISCV
+ ff_h264qpel_init_riscv(c, bit_depth);
#elif ARCH_X86
ff_h264qpel_init_x86(c, bit_depth);
#elif ARCH_MIPS
diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h
index 0259e8de23..24baf826f9 100644
--- a/libavcodec/h264qpel.h
+++ b/libavcodec/h264qpel.h
@@ -34,6 +34,7 @@ void ff_h264qpel_init(H264QpelContext *c, int bit_depth);
void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth);
void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth);
void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth);
+void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth);
void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth);
void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth);
void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth);
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 27befce929..1f1fa03329 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -33,6 +33,8 @@ RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
riscv/h264idct_rvv.o
+OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
+RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
diff --git a/libavcodec/riscv/h264qpel_init.c b/libavcodec/riscv/h264qpel_init.c
new file mode 100644
index 0000000000..69a1345447
--- /dev/null
+++ b/libavcodec/riscv/h264qpel_init.c
@@ -0,0 +1,113 @@
+/*
+ * RISC-V optimised DSP functions
+ * Copyright (c) 2024 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/h264qpel.h"
+
+#define DECL_QPEL_OPS(OP, SIZE, EXT) /* prototypes of the 16 mcXY functions for one OP/SIZE/EXT */ \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc00_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc10_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc20_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc30_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc01_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc11_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc21_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc31_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc02_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc12_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc22_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc32_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc03_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc13_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc23_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc33_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+DECL_QPEL_OPS(put, 16, rvv256) /* rvv256 set: installed by ff_h264qpel_init_riscv when vlen >= 256 */
+DECL_QPEL_OPS(put, 8, rvv256)
+DECL_QPEL_OPS(put, 4, rvv256)
+
+DECL_QPEL_OPS(avg, 16, rvv256)
+DECL_QPEL_OPS(avg, 8, rvv256)
+DECL_QPEL_OPS(avg, 4, rvv256)
+
+DECL_QPEL_OPS(put, 16, rvv) /* rvv set: installed when 128 <= vlen < 256 */
+DECL_QPEL_OPS(put, 8, rvv)
+DECL_QPEL_OPS(put, 4, rvv)
+
+DECL_QPEL_OPS(avg, 16, rvv)
+DECL_QPEL_OPS(avg, 8, rvv)
+DECL_QPEL_OPS(avg, 4, rvv)
+
+#define SET_QPEL_FNS(OP, IDX, SIZE, EXT) /* fill row IDX of c->OP_h264_qpel_pixels_tab; needs c in scope */ \
+do { /* the mcXY function is stored in slot X + 4 * Y */ \
+ c->OP ## _h264_qpel_pixels_tab[IDX][ 0] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc00_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][ 1] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc10_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][ 2] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc20_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][ 3] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc30_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][ 4] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc01_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][ 5] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc11_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][ 6] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc21_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][ 7] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc31_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][ 8] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc02_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][ 9] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc12_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][10] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc22_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][11] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc32_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][12] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc03_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][13] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc13_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][14] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc23_ ## EXT; \
+ c->OP ## _h264_qpel_pixels_tab[IDX][15] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc33_ ## EXT; \
+} while (0)
+
+av_cold void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth)
+{ /* installs the RVV put/avg qpel functions into c when the CPU qualifies */
+#if HAVE_RVV /* without HAVE_RVV the function body is empty */
+ int flags = av_get_cpu_flags();
+ if (flags & AV_CPU_FLAG_RVV_I32) { /* tables are left untouched if the flag is absent */
+ const int vlen = 8 * ff_get_rv_vlenb(); /* presumably the vector length in bits (vlenb in bytes) - TODO confirm */
+
+ switch (bit_depth) {
+ case 8: /* only bit_depth 8 is handled; other depths fall through the switch unchanged */
+ if (vlen >= 256) {
+ SET_QPEL_FNS(put, 0, 16, rvv256); /* table rows: 0 = 16x16, 1 = 8x8, 2 = 4x4 */
+ SET_QPEL_FNS(put, 1, 8, rvv256);
+ SET_QPEL_FNS(put, 2, 4, rvv256);
+
+ SET_QPEL_FNS(avg, 0, 16, rvv256);
+ SET_QPEL_FNS(avg, 1, 8, rvv256);
+ SET_QPEL_FNS(avg, 2, 4, rvv256);
+ } else if (vlen >= 128) { /* below 128 nothing is installed */
+ SET_QPEL_FNS(put, 0, 16, rvv);
+ SET_QPEL_FNS(put, 1, 8, rvv);
+ SET_QPEL_FNS(put, 2, 4, rvv);
+
+ SET_QPEL_FNS(avg, 0, 16, rvv);
+ SET_QPEL_FNS(avg, 1, 8, rvv);
+ SET_QPEL_FNS(avg, 2, 4, rvv);
+ }
+ break;
+ }
+ }
+#endif
+}
diff --git a/libavcodec/riscv/h264qpel_rvv.S b/libavcodec/riscv/h264qpel_rvv.S
new file mode 100644
index 0000000000..fe751f9cae
--- /dev/null
+++ b/libavcodec/riscv/h264qpel_rvv.S
@@ -0,0 +1,445 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Niklas Haas
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "libavutil/riscv/asm.S"
+
+ /* output is unclipped; clobbers v26-v31 plus t4 and t5 */
+.macro lowpass_h vdst, src
+ addi t4, \src, 3 // t4 = src + 3
+ lbu t5, 2(\src)
+ vle8.v v31, (t4) // v31[i] = src[i + 3]
+ lbu t4, 1(\src)
+ vslide1up.vx v30, v31, t5 // v30[i] = src[i + 2]
+ lbu t5, 0(\src)
+ vslide1up.vx v29, v30, t4 // v29[i] = src[i + 1]
+ lbu t4, -1(\src)
+ vslide1up.vx v28, v29, t5 // v28[i] = src[i]
+ lbu t5, -2(\src)
+ vslide1up.vx v27, v28, t4 // v27[i] = src[i - 1]
+ vslide1up.vx v26, v27, t5 // v26[i] = src[i - 2]
+ vwaddu.vv \vdst, v26, v31 // widening sum of the two outermost taps
+ vwmaccu.vx \vdst, t6, v28 // += t6 * the two centre taps (entry points load t6 = 20)
+ vwmaccu.vx \vdst, t6, v29
+ vwmaccsu.vx \vdst, a7, v27 // += a7 * the two inner taps (entry points load a7 = -5)
+ vwmaccsu.vx \vdst, a7, v30
+.endm
+
+ /* output is unclipped */
+.macro lowpass_v vdst, vsrc0, vsrc1, vsrc2, vsrc3, vsrc4, vsrc5, signed=0
+ .if \signed // signed form: the hv pass feeds it the 16-bit lowpass_h sums
+ vwadd.vv \vdst, \vsrc0, \vsrc5 // outermost rows, weight 1
+ vwmacc.vx \vdst, t6, \vsrc2 // centre rows, weight t6
+ vwmacc.vx \vdst, t6, \vsrc3
+ vwmacc.vx \vdst, a7, \vsrc1 // inner rows, weight a7
+ vwmacc.vx \vdst, a7, \vsrc4
+ .else // unsigned sources; a7 is still treated as a signed scalar
+ vwaddu.vv \vdst, \vsrc0, \vsrc5
+ vwmaccu.vx \vdst, t6, \vsrc2
+ vwmaccu.vx \vdst, t6, \vsrc3
+ vwmaccsu.vx \vdst, a7, \vsrc1
+ vwmaccsu.vx \vdst, a7, \vsrc4
+ .endif
+.endm
+
+.macro qpel_mc00 op, dst, src, stride, size // the body hardcodes a0/a1/a2/a4; dst, src, stride and size are never referenced
+func ff_\op\()_h264_qpel_pixels, zve32x // stores (put) or averages into dst (avg), four rows per iteration
+1:
+ add t1, a2, a1 // t1..t3 = source rows 1..3 (a2 is the stride)
+ add t2, a2, t1
+ add t3, a2, t2
+ vle8.v v0, (a1)
+ vle8.v v1, (t1)
+ vle8.v v2, (t2)
+ vle8.v v3, (t3)
+ addi a4, a4, -4 // a4 = rows left
+ add a1, a2, t3 // source pointer moves on by four rows
+ add t1, a2, a0 // t1..t3 = destination rows 1..3
+ add t2, a2, t1
+ add t3, a2, t2
+ .ifc \op, avg
+ vle8.v v4, (a0) // avg only: read the current destination rows
+ vle8.v v5, (t1)
+ vle8.v v6, (t2)
+ vle8.v v7, (t3)
+ vaaddu.vv v0, v0, v4 // averaging add, rounded according to vxrm
+ vaaddu.vv v1, v1, v5
+ vaaddu.vv v2, v2, v6
+ vaaddu.vv v3, v3, v7
+ .endif
+ vse8.v v0, (a0)
+ vse8.v v1, (t1)
+ vse8.v v2, (t2)
+ vse8.v v3, (t3)
+ add a0, a2, t3 // destination pointer moves on by four rows
+ bnez a4, 1b
+ jr t0 // the entry points pass the return address in t0
+endfunc
+.endm
+
+ qpel_mc00 put, a0, a1, a2, a4
+ qpel_mc00 avg, a0, a1, a2, a4
+
+.macro qpel_lowpass op, ext, lmul, lmul2
+func ff_\op\()_h264_qpel_h_lowpass_\lmul\ext, zve32x // a0 dst, a1 src, a2 dst stride, a3 src stride, a4 rows, t0 return; _l2 also reads a5/a6
+1:
+ add t1, a3, a1 // t1..t3 = source rows 1..3
+ add t2, a3, t1
+ add t3, a3, t2
+ lowpass_h v0, a1 // unclipped 16-bit filter sums, one call per row
+ lowpass_h v2, t1
+ lowpass_h v4, t2
+ lowpass_h v6, t3
+ add a1, a3, t3
+ addi a4, a4, -4 // four rows handled per iteration
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vmax.vx v0, v0, zero // clamp negative sums to zero before the unsigned narrowing
+ vmax.vx v2, v2, zero
+ vmax.vx v4, v4, zero
+ vmax.vx v6, v6, zero
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vnclipu.wi v0, v0, 5 // shift right by 5 (rounded per vxrm) and saturate to 8 bits
+ vnclipu.wi v2, v2, 5
+ vnclipu.wi v4, v4, 5
+ vnclipu.wi v6, v6, 5
+ .ifc \ext, _l2
+ add t1, a6, a5 // second source: base a5, row stride a6
+ add t2, a6, t1
+ add t3, a6, t2
+ vle8.v v8, (a5)
+ vle8.v v10, (t1)
+ vle8.v v12, (t2)
+ vle8.v v14, (t3)
+ add a5, a6, t3 // advance with a6, the stride these rows were read with (as the v and hv variants do)
+ vaaddu.vv v0, v0, v8 // average the filtered rows with the second source
+ vaaddu.vv v2, v2, v10
+ vaaddu.vv v4, v4, v12
+ vaaddu.vv v6, v6, v14
+ .endif
+ add t1, a2, a0 // t1..t3 = destination rows 1..3
+ add t2, a2, t1
+ add t3, a2, t2
+ .ifc \op, avg
+ vle8.v v1, (a0) // avg only: average with the current destination rows
+ vle8.v v3, (t1)
+ vle8.v v5, (t2)
+ vle8.v v7, (t3)
+ vaaddu.vv v0, v0, v1
+ vaaddu.vv v2, v2, v3
+ vaaddu.vv v4, v4, v5
+ vaaddu.vv v6, v6, v7
+ .endif
+ vse8.v v0, (a0)
+ vse8.v v2, (t1)
+ vse8.v v4, (t2)
+ vse8.v v6, (t3)
+ add a0, a2, t3
+ bnez a4, 1b
+ jr t0 // return through t0, not ra
+endfunc
+
+func ff_\op\()_h264_qpel_v_lowpass_\lmul\ext, zve32x // a0 dst, a1 src, a2 dst stride, a3 src stride, a4 rows, t0 return; _l2 also reads a5/a6
+ sub t1, a1, a3 // t1 = row -1, t2 = row -2
+ sub t2, t1, a3
+ vle8.v v2, (a1) // prime the window: v0..v4 = rows -2..+2
+ vle8.v v1, (t1)
+ vle8.v v0, (t2)
+ add t1, a1, a3
+ add t2, t1, a3
+ add a1, t2, a3 // a1 = row +3, the first row loaded inside the loop
+ vle8.v v3, (t1)
+ vle8.v v4, (t2)
+1:
+ add t1, a3, a1
+ add t2, a3, t1
+ add t3, a3, t2
+ vle8.v v5, (a1) // v5..v8 = the next four source rows
+ vle8.v v6, (t1)
+ vle8.v v7, (t2)
+ vle8.v v8, (t3)
+ add a1, a3, t3
+ lowpass_v v24, v0, v1, v2, v3, v4, v5 // four output rows, each from six consecutive source rows
+ lowpass_v v26, v1, v2, v3, v4, v5, v6
+ lowpass_v v28, v2, v3, v4, v5, v6, v7
+ lowpass_v v30, v3, v4, v5, v6, v7, v8
+ addi a4, a4, -4
+ vsetvli zero, zero, e16, \lmul2, ta, ma
+ vmax.vx v24, v24, zero // clamp negative sums to zero before the unsigned narrowing
+ vmax.vx v26, v26, zero
+ vmax.vx v28, v28, zero
+ vmax.vx v30, v30, zero
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vnclipu.wi v24, v24, 5 // shift right by 5 (rounded per vxrm) and saturate to 8 bits
+ vnclipu.wi v26, v26, 5
+ vnclipu.wi v28, v28, 5
+ vnclipu.wi v30, v30, 5
+ .ifc \ext, _l2
+ add t1, a6, a5 // second source: base a5, row stride a6
+ add t2, a6, t1
+ add t3, a6, t2
+ vle8.v v9, (a5)
+ vle8.v v10, (t1)
+ vle8.v v11, (t2)
+ vle8.v v12, (t3)
+ add a5, a6, t3
+ vaaddu.vv v24, v24, v9 // average the filtered rows with the second source
+ vaaddu.vv v26, v26, v10
+ vaaddu.vv v28, v28, v11
+ vaaddu.vv v30, v30, v12
+ .endif
+ add t1, a2, a0 // t1..t3 = destination rows 1..3
+ add t2, a2, t1
+ add t3, a2, t2
+ .ifc \op, avg
+ vle8.v v9, (a0) // avg only: average with the current destination rows
+ vle8.v v10, (t1)
+ vle8.v v11, (t2)
+ vle8.v v12, (t3)
+ vaaddu.vv v24, v24, v9
+ vaaddu.vv v26, v26, v10
+ vaaddu.vv v28, v28, v11
+ vaaddu.vv v30, v30, v12
+ .endif
+ vse8.v v24, (a0)
+ vse8.v v26, (t1)
+ vse8.v v28, (t2)
+ vse8.v v30, (t3)
+ add a0, a2, t3
+ vmv.v.v v0, v4 // slide the window down by four rows for the next iteration
+ vmv.v.v v1, v5
+ vmv.v.v v2, v6
+ vmv.v.v v3, v7
+ vmv.v.v v4, v8
+ bnez a4, 1b
+ jr t0 // return through t0, not ra
+endfunc
+
+func ff_\op\()_h264_qpel_hv_lowpass_\lmul\ext, zve32x // a0 dst, a1 src, a2 dst stride, a3 src stride, a4 rows, t0 return; _l2 also reads a5/a6
+ sub t1, a1, a3 // t1 = row -1, t2 = row -2
+ sub t2, t1, a3
+ lowpass_h v4, a1 // prime the window with horizontally filtered rows: v0,v2,v4,v6,v8 = rows -2..+2
+ lowpass_h v2, t1
+ lowpass_h v0, t2
+ add t1, a1, a3
+ add t2, t1, a3
+ add a1, t2, a3 // a1 = row +3, the first row filtered inside the loop
+ lowpass_h v6, t1
+ lowpass_h v8, t2
+1:
+ add t1, a3, a1
+ add t2, a3, t1
+ add t3, a3, t2
+ lowpass_h v10, a1 // v10..v16 = horizontal sums of the next four rows
+ lowpass_h v12, t1
+ lowpass_h v14, t2
+ lowpass_h v16, t3
+ vsetvli zero, zero, e16, \lmul2, ta, ma // vertical pass works on the 16-bit sums
+ addi a4, a4, -4
+ lowpass_v v20, v0, v2, v4, v6, v8, v10, signed=1
+ lowpass_v v24, v2, v4, v6, v8, v10, v12, signed=1
+ lowpass_v v28, v4, v6, v8, v10, v12, v14, signed=1
+ vnclip.wi v0, v20, 10 // narrow back to 16 bits: shift right by 10, rounded per vxrm, signed saturation
+ lowpass_v v20, v6, v8, v10, v12, v14, v16, signed=1 // v20 is reused once its first result has been narrowed
+ vnclip.wi v2, v24, 10
+ vnclip.wi v4, v28, 10
+ vnclip.wi v6, v20, 10
+ vmax.vx v18, v0, zero // clamp negative results to zero before the unsigned narrowing
+ vmax.vx v20, v2, zero
+ vmax.vx v22, v4, zero
+ vmax.vx v24, v6, zero
+ vmv.v.v v0, v8 // slide the window of filtered rows down by four
+ vmv.v.v v2, v10
+ vmv.v.v v4, v12
+ vmv.v.v v6, v14
+ vmv.v.v v8, v16
+ add a1, a3, t3
+ vsetvli zero, zero, e8, \lmul, ta, ma
+ vnclipu.wi v18, v18, 0 // no shift: only saturate 16 bits down to 8
+ vnclipu.wi v20, v20, 0
+ vnclipu.wi v22, v22, 0
+ vnclipu.wi v24, v24, 0
+ .ifc \ext, _l2
+ add t1, a6, a5 // second source: base a5, row stride a6
+ add t2, a6, t1
+ add t3, a6, t2
+ vle8.v v26, (a5)
+ vle8.v v27, (t1)
+ vle8.v v28, (t2)
+ vle8.v v29, (t3)
+ add a5, a6, t3
+ vaaddu.vv v18, v18, v26 // average the filtered rows with the second source
+ vaaddu.vv v20, v20, v27
+ vaaddu.vv v22, v22, v28
+ vaaddu.vv v24, v24, v29
+ .endif
+ add t1, a2, a0 // t1..t3 = destination rows 1..3
+ add t2, a2, t1
+ add t3, a2, t2
+ .ifc \op, avg
+ vle8.v v26, (a0) // avg only: average with the current destination rows
+ vle8.v v27, (t1)
+ vle8.v v28, (t2)
+ vle8.v v29, (t3)
+ vaaddu.vv v18, v18, v26
+ vaaddu.vv v20, v20, v27
+ vaaddu.vv v22, v22, v28
+ vaaddu.vv v24, v24, v29
+ .endif
+ vse8.v v18, (a0)
+ vse8.v v20, (t1)
+ vse8.v v22, (t2)
+ vse8.v v24, (t3)
+ add a0, a2, t3
+ bnez a4, 1b
+ jr t0 // return through t0, not ra
+endfunc
+.endm
+
+/* Note: We could possibly specialize for the width 8 / width 4 cases by
+ loading 32 bit integers, but this makes the convolutions more complicated
+ to implement, so it's not necessarily any faster. */
+
+.macro h264_qpel lmul, lmul2 // instantiate the h/v/hv lowpass functions: put and avg, each plain and _l2
+ qpel_lowpass put, , \lmul, \lmul2
+ qpel_lowpass put, _l2, \lmul, \lmul2
+ qpel_lowpass avg, , \lmul, \lmul2
+ qpel_lowpass avg, _l2, \lmul, \lmul2
+.endm
+
+ h264_qpel m1, m2 // first argument: LMUL for 8-bit data; second: twice that, used for the widened 16-bit data
+ h264_qpel mf2, m1
+ h264_qpel mf4, mf2
+ h264_qpel mf8, mf4
+
+.macro h264_qpel_1pass op, case, lmul, size, ext=rvv, dir, offset // entry point that sets up registers and tail-jumps into one lowpass function
+func ff_\op\()_h264_qpel\size\()_\case\()_\ext, zve32x
+ vsetivli zero, \size, e8, \lmul, ta, ma // vl = block width, 8-bit elements
+ csrwi vxrm, 0 // fixed-point rounding mode 0 (round-to-nearest-up)
+ li a4, \size // a4 = number of rows for the lowpass loop
+ li t6, 20 // filter weights read by lowpass_h and lowpass_v
+ li a7, -5
+ mv a3, a2 // source stride = the stride argument
+ mv t0, ra // the lowpass functions return with jr t0
+.ifnb \offset
+ .ifc \dir, v
+ add a5, a1, \offset // offset is a register here (zero or a2): a5 = second source for the _l2 average
+ .else
+ addi a5, a1, \offset // offset is an immediate here (0 or 1)
+ .endif
+ mv a6, a3 // the second source shares the source stride
+ j ff_\op\()_h264_qpel_\dir\()_lowpass_\lmul\()_l2
+.else
+ j ff_\op\()_h264_qpel_\dir\()_lowpass_\lmul\()
+.endif
+endfunc
+.endm
+
+.macro h264_qpel_2pass op, case, lmul, size, ext=rvv, dir1, dir2, off1=0, off2 // entry point: put-style pass dir1, then pass dir2 averaged with the dir1 result
+func ff_\op\()_h264_qpel\size\()_\case\()_\ext, zve32x
+ vsetivli zero, \size, e8, \lmul, ta, ma // vl = block width, 8-bit elements
+ csrwi vxrm, 0 // fixed-point rounding mode 0 (round-to-nearest-up)
+ addi sp, sp, -16 // 16 bytes of stack to keep dst and src across the first pass
+ li a4, \size // a4 = number of rows
+ li t6, 20 // filter weights read by lowpass_h and lowpass_v
+ li a7, -5
+ sx a0, 0(sp) // sx/lx are defined outside this file; presumably XLEN-wide store/load - TODO confirm
+ sx a1, 8(sp)
+ .ifc \off1, a2
+ add a1, a1, \off1 // first pass starts one row further down
+ .elseif \off1
+ addi a1, a1, \off1 // first pass starts at an immediate byte offset
+ .endif
+ mv a3, a2 // source stride = the stride argument
+ .ifc \op, avg
+ // Use temporary array on stack for the first pass
+ addi a0, sp, -(\size * \size) // NOTE(review): this buffer lies below sp - confirm nothing can overwrite it (e.g. a signal handler)
+ li a2, \size // rows of the temporary are packed, stride = size
+ .endif
+ jal t0, ff_put_h264_qpel_\dir1\()_lowpass_\lmul // first pass always uses the put variant; it returns with jr t0
+ lx a0, 0(sp) // restore the original dst and src
+ lx a1, 8(sp)
+ .ifc \op, put
+ // Directly reuse the first pass output buffer
+ mv a5, a0 // second source = dst, with the dst stride
+ mv a6, a2
+ .else
+ addi a5, sp, -(\size * \size) // second source = the temporary written by the first pass
+ li a6, \size
+ mv a2, a3 // a2 was overwritten with size above; restore the dst stride
+ .endif
+ .ifnb \off2
+ addi a1, a1, \off2 // optional byte offset of the second pass source
+ .endif
+ li a4, \size // reload the row counter consumed by the first pass
+ mv t0, ra // the second pass returns straight to the caller
+ addi sp, sp, 16 // release the save area before the tail jump
+ j ff_\op\()_h264_qpel_\dir2\()_lowpass_\lmul\()_l2
+endfunc
+.endm
+
+.macro ff_h264_qpel_fns op, lmul, size, ext=rvv // emit all 16 mcXY entry points for one op/size/ext
+func ff_\op\()_h264_qpel\size\()_mc00_\ext, zve32x
+ vsetivli zero, \size, e8, \lmul, ta, ma // vl = block width, 8-bit elements
+ csrwi vxrm, 0 // rounding mode used by the avg variant
+ li a4, \size // a4 = number of rows
+ mv t0, ra // the pixels function returns with jr t0
+ j ff_\op\()_h264_qpel_pixels
+endfunc
+
+ h264_qpel_1pass \op, mc20, \lmul, \size, \ext, h // no offset argument: plain filter
+ h264_qpel_1pass \op, mc02, \lmul, \size, \ext, v
+ h264_qpel_1pass \op, mc10, \lmul, \size, \ext, h, 0 // offset given: filter result is averaged with src + offset
+ h264_qpel_1pass \op, mc30, \lmul, \size, \ext, h, 1
+ h264_qpel_1pass \op, mc01, \lmul, \size, \ext, v, zero
+ h264_qpel_1pass \op, mc03, \lmul, \size, \ext, v, a2
+ h264_qpel_1pass \op, mc22, \lmul, \size, \ext, hv
+
+ h264_qpel_2pass \op, mc11, \lmul, \size, \ext, h, v // arguments after ext: dir1, dir2, off1, off2
+ h264_qpel_2pass \op, mc21, \lmul, \size, \ext, h, hv
+ h264_qpel_2pass \op, mc12, \lmul, \size, \ext, v, hv
+ h264_qpel_2pass \op, mc31, \lmul, \size, \ext, h, v, off2=1
+ h264_qpel_2pass \op, mc13, \lmul, \size, \ext, h, v, a2
+ h264_qpel_2pass \op, mc33, \lmul, \size, \ext, h, v, a2, 1
+ h264_qpel_2pass \op, mc23, \lmul, \size, \ext, h, hv, a2
+ h264_qpel_2pass \op, mc32, \lmul, \size, \ext, v, hv, 1
+.endm
+
+ ff_h264_qpel_fns put, mf2, 16, rvv256 // rvv256 set: half the LMUL of the rvv set for the same block size
+ ff_h264_qpel_fns put, mf4, 8, rvv256
+ ff_h264_qpel_fns put, mf8, 4, rvv256
+
+ ff_h264_qpel_fns avg, mf2, 16, rvv256
+ ff_h264_qpel_fns avg, mf4, 8, rvv256
+ ff_h264_qpel_fns avg, mf8, 4, rvv256
+
+ ff_h264_qpel_fns put, m1, 16, rvv // rvv set: installed by the init code for 128 <= vlen < 256
+ ff_h264_qpel_fns put, mf2, 8, rvv
+ ff_h264_qpel_fns put, mf4, 4, rvv
+
+ ff_h264_qpel_fns avg, m1, 16, rvv
+ ff_h264_qpel_fns avg, mf2, 8, rvv
+ ff_h264_qpel_fns avg, mf4, 4, rvv
--
2.44.1
More information about the ffmpeg-devel
mailing list