[FFmpeg-devel] [PATCH v2 2/7] avcodec/hevc: Add add_residual_4/8/16/32 asm opt
jinbo
jinbo at loongson.cn
Wed Dec 27 06:50:14 EET 2023
After this patch, the performance of decoding H265 4K 30FPS 30Mbps
on 3A6000 with 8 threads improves 2fps (45fps-->47fps).
---
libavcodec/loongarch/Makefile | 3 +-
libavcodec/loongarch/hevc_add_res.S | 162 ++++++++++++++++++
libavcodec/loongarch/hevcdsp_init_loongarch.c | 5 +
libavcodec/loongarch/hevcdsp_lsx.h | 5 +
4 files changed, 174 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/loongarch/hevc_add_res.S
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 06cfab5c20..07ea97f803 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -27,7 +27,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
loongarch/hevc_lpf_sao_lsx.o \
loongarch/hevc_mc_bi_lsx.o \
loongarch/hevc_mc_uni_lsx.o \
- loongarch/hevc_mc_uniw_lsx.o
+ loongarch/hevc_mc_uniw_lsx.o \
+ loongarch/hevc_add_res.o
LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
loongarch/h264idct_loongarch.o \
loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/hevc_add_res.S b/libavcodec/loongarch/hevc_add_res.S
new file mode 100644
index 0000000000..dd2d820af8
--- /dev/null
+++ b/libavcodec/loongarch/hevc_add_res.S
@@ -0,0 +1,162 @@
+/*
+ * Loongson LSX optimized add_residual functions for HEVC decoding
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/*
+ * void ff_hevc_add_residual4x4_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+.macro ADD_RES_LSX_4x4_8 /* Adds residuals to two 4-pixel rows. In: a0 = dst, a1 = res, a2 = stride. Clobbers t0, vr0-vr2. */
+ vldrepl.w vr0, a0, 0 /* vr0 = 4 dst bytes of the first row, replicated to every word lane */
+ add.d t0, a0, a2 /* t0 = address of the second row */
+ vldrepl.w vr1, t0, 0 /* vr1 = 4 dst bytes of the second row, replicated */
+ vld vr2, a1, 0 /* vr2 = 8 int16 residuals (two rows of 4) */
+
+ vilvl.w vr1, vr1, vr0 /* low 8 bytes of vr1 = first row followed by second row */
+ vsllwil.hu.bu vr1, vr1, 0 /* zero-extend those 8 bytes to 16-bit lanes (shift amount 0) */
+ vadd.h vr1, vr1, vr2 /* pixel + residual in 16-bit lanes */
+ vssrani.bu.h vr1, vr1, 0 /* narrow to bytes with unsigned saturation (shift amount 0) */
+
+ vstelm.w vr1, a0, 0, 0 /* word 0 -> first row */
+ vstelm.w vr1, t0, 0, 1 /* word 1 -> second row */
+.endm
+
+function ff_hevc_add_residual4x4_8_lsx /* 4x4 block of 8-bit pixels: dst = saturate_u8(dst + res) */
+ ADD_RES_LSX_4x4_8 /* rows 0-1 */
+ alsl.d a0, a2, a0, 1 /* a0 += 2 * stride */
+ addi.d a1, a1, 16 /* skip the 8 int16 residuals just consumed */
+ ADD_RES_LSX_4x4_8 /* rows 2-3 */
+endfunc
+
+/*
+ * void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+.macro ADD_RES_LSX_8x8_8 /* Adds residuals to four 8-pixel rows. In: a0 = dst, a1 = res, a2 = stride. Clobbers t0-t4, vr0-vr7. */
+ vldrepl.d vr0, a0, 0 /* vr0 = 8 dst bytes of row 0, replicated to both 64-bit halves */
+ add.d t0, a0, a2 /* t0 = address of row 1 */
+ vldrepl.d vr1, t0, 0 /* vr1 = row 1 pixels */
+ add.d t1, t0, a2 /* t1 = address of row 2 */
+ vldrepl.d vr2, t1, 0 /* vr2 = row 2 pixels */
+ add.d t2, t1, a2 /* t2 = address of row 3 */
+ vldrepl.d vr3, t2, 0 /* vr3 = row 3 pixels */
+
+ vld vr4, a1, 0 /* row 0 residuals (8 x int16) */
+ addi.d t3, zero, 16 /* t3 = 16, register index for vldx */
+ vldx vr5, a1, t3 /* row 1 residuals at res + 16 bytes */
+ addi.d t4, a1, 32 /* t4 = res + 32 bytes */
+ vld vr6, t4, 0 /* row 2 residuals */
+ vldx vr7, t4, t3 /* row 3 residuals at res + 48 bytes */
+
+ vsllwil.hu.bu vr0, vr0, 0 /* zero-extend the low 8 bytes of each row to 16-bit lanes */
+ vsllwil.hu.bu vr1, vr1, 0
+ vsllwil.hu.bu vr2, vr2, 0
+ vsllwil.hu.bu vr3, vr3, 0
+ vadd.h vr0, vr0, vr4 /* pixel + residual, one row per register */
+ vadd.h vr1, vr1, vr5
+ vadd.h vr2, vr2, vr6
+ vadd.h vr3, vr3, vr7
+ vssrani.bu.h vr1, vr0, 0 /* saturate to bytes: row 0 in the low half of vr1, row 1 in the high half */
+ vssrani.bu.h vr3, vr2, 0 /* row 2 in the low half of vr3, row 3 in the high half */
+
+ vstelm.d vr1, a0, 0, 0 /* store row 0 */
+ vstelm.d vr1, t0, 0, 1 /* store row 1 */
+ vstelm.d vr3, t1, 0, 0 /* store row 2 */
+ vstelm.d vr3, t2, 0, 1 /* store row 3 */
+.endm
+
+function ff_hevc_add_residual8x8_8_lsx /* 8x8 block of 8-bit pixels: dst = saturate_u8(dst + res) */
+ ADD_RES_LSX_8x8_8 /* rows 0-3 */
+ alsl.d a0, a2, a0, 2 /* a0 += 4 * stride */
+ addi.d a1, a1, 64 /* skip the 32 int16 residuals just consumed */
+ ADD_RES_LSX_8x8_8 /* rows 4-7 */
+endfunc
+
+/*
+ * void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+function ff_hevc_add_residual16x16_8_lsx /* 16x16 block of 8-bit pixels: dst = saturate_u8(dst + res). In: a0 = dst, a1 = res, a2 = stride. */
+ addi.d t0, zero, 16 /* t0 = 16, register index for vldx; loop-invariant, so set once outside .rept */
+.rept 8 /* 8 unrolled iterations, two rows per iteration */
+ vld vr0, a0, 0 /* vr0 = 16 dst bytes of the first row */
+ vldx vr2, a0, a2 /* vr2 = 16 dst bytes of the second row */
+
+ vld vr4, a1, 0 /* first row residuals 0-7 */
+ vldx vr5, a1, t0 /* first row residuals 8-15 (res + 16 bytes) */
+ addi.d t1, a1, 32 /* t1 = res + 32 bytes */
+ vld vr6, t1, 0 /* second row residuals 0-7 */
+ vldx vr7, t1, t0 /* second row residuals 8-15 (res + 48 bytes) */
+
+ vexth.hu.bu vr1, vr0 /* vr1 = high 8 pixels of the first row, zero-extended to 16 bits */
+ vsllwil.hu.bu vr0, vr0, 0 /* vr0 = low 8 pixels of the first row, zero-extended */
+ vexth.hu.bu vr3, vr2 /* vr3 = high 8 pixels of the second row, zero-extended */
+ vsllwil.hu.bu vr2, vr2, 0 /* vr2 = low 8 pixels of the second row, zero-extended */
+ vadd.h vr0, vr0, vr4 /* pixel + residual in 16-bit lanes */
+ vadd.h vr1, vr1, vr5
+ vadd.h vr2, vr2, vr6
+ vadd.h vr3, vr3, vr7
+
+ vssrani.bu.h vr1, vr0, 0 /* saturate to bytes: vr1 = full first row (low half from vr0, high half from vr1) */
+ vssrani.bu.h vr3, vr2, 0 /* vr3 = full second row */
+
+ vst vr1, a0, 0 /* store the first row */
+ vstx vr3, a0, a2 /* store the second row */
+
+ alsl.d a0, a2, a0, 1 /* a0 += 2 * stride */
+ addi.d a1, a1, 64 /* advance past the 32 int16 residuals just consumed */
+.endr
+endfunc
+
+/*
+ * void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+function ff_hevc_add_residual32x32_8_lsx /* 32x32 block of 8-bit pixels: dst = saturate_u8(dst + res). In: a0 = dst, a1 = res, a2 = stride. */
+ addi.d t0, zero, 16 /* t0 = 16, register index for vldx/vstx; loop-invariant, so set once outside .rept */
+.rept 32 /* 32 unrolled iterations, one row per iteration */
+ vld vr0, a0, 0 /* vr0 = dst bytes 0-15 of the row */
+ vldx vr2, a0, t0 /* vr2 = dst bytes 16-31 of the row */
+
+ vld vr4, a1, 0 /* residuals 0-7 */
+ vldx vr5, a1, t0 /* residuals 8-15 (res + 16 bytes) */
+ addi.d t1, a1, 32 /* t1 = res + 32 bytes */
+ vld vr6, t1, 0 /* residuals 16-23 */
+ vldx vr7, t1, t0 /* residuals 24-31 (res + 48 bytes) */
+
+ vexth.hu.bu vr1, vr0 /* vr1 = pixels 8-15, zero-extended to 16 bits */
+ vsllwil.hu.bu vr0, vr0, 0 /* vr0 = pixels 0-7, zero-extended */
+ vexth.hu.bu vr3, vr2 /* vr3 = pixels 24-31, zero-extended */
+ vsllwil.hu.bu vr2, vr2, 0 /* vr2 = pixels 16-23, zero-extended */
+ vadd.h vr0, vr0, vr4 /* pixel + residual in 16-bit lanes */
+ vadd.h vr1, vr1, vr5
+ vadd.h vr2, vr2, vr6
+ vadd.h vr3, vr3, vr7
+
+ vssrani.bu.h vr1, vr0, 0 /* saturate to bytes: vr1 = pixels 0-15 */
+ vssrani.bu.h vr3, vr2, 0 /* vr3 = pixels 16-31 */
+
+ vst vr1, a0, 0 /* store bytes 0-15 of the row */
+ vstx vr3, a0, t0 /* store bytes 16-31 of the row */
+
+ add.d a0, a0, a2 /* a0 += stride (next row) */
+ addi.d a1, a1, 64 /* advance past the 32 int16 residuals just consumed */
+.endr
+endfunc
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index 5a96f3a4c9..a8f753dc86 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -189,6 +189,11 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->idct[1] = ff_hevc_idct_8x8_lsx;
c->idct[2] = ff_hevc_idct_16x16_lsx;
c->idct[3] = ff_hevc_idct_32x32_lsx;
+
+ c->add_residual[0] = ff_hevc_add_residual4x4_8_lsx;
+ c->add_residual[1] = ff_hevc_add_residual8x8_8_lsx;
+ c->add_residual[2] = ff_hevc_add_residual16x16_8_lsx;
+ c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx;
}
}
}
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 0d54196caf..ac509984fd 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -227,4 +227,9 @@ void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit);
void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_add_residual4x4_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
--
2.20.1
More information about the ffmpeg-devel
mailing list