[FFmpeg-devel] [PATCH v2 7/7] avcodec/hevc: Add ff_hevc_idct_32x32_lasx asm opt
jinbo
jinbo at loongson.cn
Wed Dec 27 06:50:19 EET 2023
From: yuanhecai <yuanhecai at loongson.cn>
tests/checkasm/checkasm:
C LSX LASX
hevc_idct_32x32_8_c: 1243.0 211.7 101.7
Decoding H.265 4K 30 FPS 30 Mbps on a 3A6000 with 8 threads
speeds up by 1 fps (56 fps --> 57 fps).
---
libavcodec/loongarch/Makefile | 3 +-
libavcodec/loongarch/hevc_idct.S | 863 ++++++++++++++++++
libavcodec/loongarch/hevc_idct_lsx.c | 10 +-
libavcodec/loongarch/hevcdsp_init_loongarch.c | 2 +
libavcodec/loongarch/hevcdsp_lasx.h | 2 +
5 files changed, 874 insertions(+), 6 deletions(-)
create mode 100644 libavcodec/loongarch/hevc_idct.S
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index ad98cd4054..07da2964e4 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -29,7 +29,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
loongarch/hevc_mc_uni_lsx.o \
loongarch/hevc_mc_uniw_lsx.o \
loongarch/hevc_add_res.o \
- loongarch/hevc_mc.o
+ loongarch/hevc_mc.o \
+ loongarch/hevc_idct.o
LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
loongarch/h264idct_loongarch.o \
loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/hevc_idct.S b/libavcodec/loongarch/hevc_idct.S
new file mode 100644
index 0000000000..5593e5fd73
--- /dev/null
+++ b/libavcodec/loongarch/hevc_idct.S
@@ -0,0 +1,863 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/*
+ * Push f24-f31 on the stack: lowers sp by 64 bytes and stores each
+ * register as a 64-bit value at sp+0 .. sp+56.
+ * Code in this file overwrites xr24-xr31 / vr24-vr31 (f24-f31 are the
+ * low 64 bits of those vector registers), so they are saved here and
+ * reloaded by fr_recover.
+ */
+.macro fr_store
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+.endm
+
+/*
+ * Counterpart of fr_store: reloads f24-f31 from sp+0 .. sp+56 and then
+ * raises sp by 64 bytes. sp must have the value it had right after the
+ * matching fr_store.
+ */
+.macro fr_recover
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+.endm
+
+/*
+ * Reserve "number" bytes of stack scratch and then save f24-f31 below it.
+ * Resulting layout: [sp, sp+64) holds the saved registers and
+ * [sp+64, sp+64+number) is the scratch area.
+ * Overwrites t0.
+ */
+.macro malloc_space number
+ li.w t0, \number
+ sub.d sp, sp, t0
+ fr_store
+.endm
+
+/*
+ * Undo malloc_space: reload f24-f31, then release "number" bytes of
+ * scratch. "number" must match the value given to malloc_space.
+ * Overwrites t0.
+ */
+.macro free_space number
+ fr_recover
+ li.w t0, \number
+ add.d sp, sp, t0
+.endm
+
+.extern gt32x32_cnst1
+
+.extern gt32x32_cnst2
+
+.extern gt8x8_cnst
+
+.extern gt32x32_cnst0
+
+/*
+ * Accumulate one set of four coefficient words against the interleaved
+ * rows held in xr8-xr15 and fold the result into the scratch buffer.
+ *   t1        : points at four 32-bit words (each a pair of int16
+ *               constants), broadcast into xr20-xr23
+ *   xr8-xr15  : interleaved int16 row pairs; even-numbered registers feed
+ *               xr16, odd-numbered registers feed xr17
+ *   t2        : 64-byte slot that is read and then overwritten with
+ *               (old + acc)
+ *   t3        : 64-byte slot that receives (old - acc), where old is the
+ *               value read from t2
+ * Overwrites xr0, xr1 and xr16-xr23.
+ */
+.macro idct_16x32_step1_lasx
+ xvldrepl.w xr20, t1, 0
+ xvldrepl.w xr21, t1, 4
+ xvldrepl.w xr22, t1, 8
+ xvldrepl.w xr23, t1, 12
+
+ // even halfwords start the 32-bit products, odd halfwords are added in
+ xvmulwev.w.h xr16, xr8, xr20
+ xvmaddwod.w.h xr16, xr8, xr20
+ xvmulwev.w.h xr17, xr9, xr20
+ xvmaddwod.w.h xr17, xr9, xr20
+
+ xvmaddwev.w.h xr16, xr10, xr21
+ xvmaddwod.w.h xr16, xr10, xr21
+ xvmaddwev.w.h xr17, xr11, xr21
+ xvmaddwod.w.h xr17, xr11, xr21
+
+ xvmaddwev.w.h xr16, xr12, xr22
+ xvmaddwod.w.h xr16, xr12, xr22
+ xvmaddwev.w.h xr17, xr13, xr22
+ xvmaddwod.w.h xr17, xr13, xr22
+
+ xvmaddwev.w.h xr16, xr14, xr23
+ xvmaddwod.w.h xr16, xr14, xr23
+ xvmaddwev.w.h xr17, xr15, xr23
+ xvmaddwod.w.h xr17, xr15, xr23
+
+ // previously stored 32-bit partial sums for this slot
+ xvld xr0, t2, 0
+ xvld xr1, t2, 32
+
+ // butterfly: sum goes back to t2, difference goes to t3
+ xvadd.w xr18, xr0, xr16
+ xvadd.w xr19, xr1, xr17
+ xvsub.w xr0, xr0, xr16
+ xvsub.w xr1, xr1, xr17
+
+ xvst xr18, t2, 0
+ xvst xr19, t2, 32
+ xvst xr0, t3, 0
+ xvst xr1, t3, 32
+.endm
+
+/*
+ * Multiply-accumulate eight interleaved input registers against the four
+ * 32-bit coefficient words at t1 (each a pair of int16 constants).
+ *   in0, in2, in4, in6 : accumulated into out0
+ *   in1, in3, in5, in7 : accumulated into out1
+ * The first multiply of each output overwrites it, so out0/out1 need no
+ * initialisation. t1 is not advanced here.
+ * Overwrites xr20-xr23 in addition to out0/out1.
+ */
+.macro idct_16x32_step2_lasx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1
+
+ xvldrepl.w xr20, t1, 0
+ xvldrepl.w xr21, t1, 4
+ xvldrepl.w xr22, t1, 8
+ xvldrepl.w xr23, t1, 12
+
+ xvmulwev.w.h \out0, \in0, xr20
+ xvmaddwod.w.h \out0, \in0, xr20
+ xvmulwev.w.h \out1, \in1, xr20
+ xvmaddwod.w.h \out1, \in1, xr20
+ xvmaddwev.w.h \out0, \in2, xr21
+ xvmaddwod.w.h \out0, \in2, xr21
+ xvmaddwev.w.h \out1, \in3, xr21
+ xvmaddwod.w.h \out1, \in3, xr21
+ xvmaddwev.w.h \out0, \in4, xr22
+ xvmaddwod.w.h \out0, \in4, xr22
+ xvmaddwev.w.h \out1, \in5, xr22
+ xvmaddwod.w.h \out1, \in5, xr22
+ xvmaddwev.w.h \out0, \in6, xr23
+ xvmaddwod.w.h \out0, \in6, xr23
+ xvmaddwev.w.h \out1, \in7, xr23 // sum0_r
+ xvmaddwod.w.h \out1, \in7, xr23 // sum0_l
+.endm
+
+ /* loop for all columns of filter constants */
+/*
+ * Final butterfly for one pair of output rows.
+ *   xr16/xr17 and xr30/xr31 : the two partial sums produced by
+ *                             idct_16x32_step2_lasx; they are added
+ *   t2 : 64-byte scratch slot holding the matching 32-bit partial sums
+ *   t4 : destination row for (scratch + sum)
+ *   t5 : destination row for (scratch - sum)
+ *   round : shift count for the rounding, saturating narrow to int16
+ * Overwrites xr0, xr1, xr16, xr17, xr30 and xr31.
+ */
+.macro idct_16x32_step3_lasx round
+ xvadd.w xr16, xr16, xr30
+ xvadd.w xr17, xr17, xr31
+
+ xvld xr0, t2, 0
+ xvld xr1, t2, 32
+
+ xvadd.w xr30, xr0, xr16
+ xvadd.w xr31, xr1, xr17
+ xvsub.w xr16, xr0, xr16
+ xvsub.w xr17, xr1, xr17
+ // narrow each 32-bit register pair to one register of int16 values
+ xvssrarni.h.w xr31, xr30, \round
+ xvssrarni.h.w xr17, xr16, \round
+ xvst xr31, t4, 0
+ xvst xr17, t5, 0
+.endm
+
+/*
+ * One 1-D 32-row inverse-transform pass over 16 columns, done in place.
+ *   a0        : base of the int16 coefficient block (a0 is not modified)
+ *   buf_pitch : row pitch in int16 elements; row r starts at
+ *               a0 + r*buf_pitch*2 bytes
+ *   round     : shift count used when narrowing the results to int16
+ * Every xvld of input data fetches 32 bytes, i.e. 16 int16 values of a row.
+ * Expects the stack frame created by malloc_space: the scratch area starts
+ * at sp+64 and 16 slots of 64 bytes (1024 bytes) of it are used.
+ * Uses t0-t5 and xr0-xr31.
+ */
+.macro idct_16x32_lasx buf_pitch, round
+ addi.d t2, sp, 64 // scratch base, just above the saved f24-f31
+
+ addi.d t0, a0, \buf_pitch*4*2 // row 4
+
+ // 4 12 20 28
+ xvld xr0, t0, 0
+ xvld xr1, t0, \buf_pitch*8*2
+ xvld xr2, t0, \buf_pitch*16*2
+ xvld xr3, t0, \buf_pitch*24*2
+
+ xvilvl.h xr10, xr1, xr0
+ xvilvh.h xr11, xr1, xr0
+ xvilvl.h xr12, xr3, xr2
+ xvilvh.h xr13, xr3, xr2
+
+ la.local t1, gt32x32_cnst2
+
+ // four products of rows 4/12/20/28 with successive coefficient words:
+ // xr14/15, xr16/17, xr18/19 and xr22/23
+ xvldrepl.w xr20, t1, 0
+ xvldrepl.w xr21, t1, 4
+ xvmulwev.w.h xr14, xr10, xr20
+ xvmaddwod.w.h xr14, xr10, xr20
+ xvmulwev.w.h xr15, xr11, xr20
+ xvmaddwod.w.h xr15, xr11, xr20
+ xvmaddwev.w.h xr14, xr12, xr21
+ xvmaddwod.w.h xr14, xr12, xr21
+ xvmaddwev.w.h xr15, xr13, xr21
+ xvmaddwod.w.h xr15, xr13, xr21
+
+ xvldrepl.w xr20, t1, 8
+ xvldrepl.w xr21, t1, 12
+ xvmulwev.w.h xr16, xr10, xr20
+ xvmaddwod.w.h xr16, xr10, xr20
+ xvmulwev.w.h xr17, xr11, xr20
+ xvmaddwod.w.h xr17, xr11, xr20
+ xvmaddwev.w.h xr16, xr12, xr21
+ xvmaddwod.w.h xr16, xr12, xr21
+ xvmaddwev.w.h xr17, xr13, xr21
+ xvmaddwod.w.h xr17, xr13, xr21
+
+ xvldrepl.w xr20, t1, 16
+ xvldrepl.w xr21, t1, 20
+ xvmulwev.w.h xr18, xr10, xr20
+ xvmaddwod.w.h xr18, xr10, xr20
+ xvmulwev.w.h xr19, xr11, xr20
+ xvmaddwod.w.h xr19, xr11, xr20
+ xvmaddwev.w.h xr18, xr12, xr21
+ xvmaddwod.w.h xr18, xr12, xr21
+ xvmaddwev.w.h xr19, xr13, xr21
+ xvmaddwod.w.h xr19, xr13, xr21
+
+ xvldrepl.w xr20, t1, 24
+ xvldrepl.w xr21, t1, 28
+ xvmulwev.w.h xr22, xr10, xr20
+ xvmaddwod.w.h xr22, xr10, xr20
+ xvmulwev.w.h xr23, xr11, xr20
+ xvmaddwod.w.h xr23, xr11, xr20
+ xvmaddwev.w.h xr22, xr12, xr21
+ xvmaddwod.w.h xr22, xr12, xr21
+ xvmaddwev.w.h xr23, xr13, xr21
+ xvmaddwod.w.h xr23, xr13, xr21
+
+ /* process coeff 0, 8, 16, 24 */
+ la.local t1, gt8x8_cnst
+
+ xvld xr0, a0, 0
+ xvld xr1, a0, \buf_pitch*8*2
+ xvld xr2, a0, \buf_pitch*16*2
+ xvld xr3, a0, \buf_pitch*24*2
+
+ xvldrepl.w xr20, t1, 0
+ xvldrepl.w xr21, t1, 4
+
+ // rows 0/16 are paired in xr10/xr11, rows 8/24 in xr12/xr13
+ xvilvl.h xr10, xr2, xr0
+ xvilvh.h xr11, xr2, xr0
+ xvilvl.h xr12, xr3, xr1
+ xvilvh.h xr13, xr3, xr1
+
+ xvmulwev.w.h xr4, xr10, xr20
+ xvmaddwod.w.h xr4, xr10, xr20 // sum0_r
+ xvmulwev.w.h xr5, xr11, xr20
+ xvmaddwod.w.h xr5, xr11, xr20 // sum0_l
+ xvmulwev.w.h xr6, xr12, xr21
+ xvmaddwod.w.h xr6, xr12, xr21 // tmp1_r
+ xvmulwev.w.h xr7, xr13, xr21
+ xvmaddwod.w.h xr7, xr13, xr21 // tmp1_l
+
+ xvsub.w xr0, xr4, xr6 // sum1_r
+ xvadd.w xr1, xr4, xr6 // sum0_r
+ xvsub.w xr2, xr5, xr7 // sum1_l
+ xvadd.w xr3, xr5, xr7 // sum0_l
+
+ // HEVC_EVEN16_CALC
+ // results go to 64-byte scratch slots; the trailing number is the slot
+ xvsub.w xr24, xr1, xr14 // 7
+ xvsub.w xr25, xr3, xr15
+ xvadd.w xr14, xr1, xr14 // 0
+ xvadd.w xr15, xr3, xr15
+ xvst xr24, t2, 7*16*4 // 448=16*28=7*16*4
+ xvst xr25, t2, 7*16*4+32 // 480
+ xvst xr14, t2, 0
+ xvst xr15, t2, 32
+
+ xvsub.w xr26, xr0, xr22 // 4
+ xvsub.w xr27, xr2, xr23
+ xvadd.w xr22, xr0, xr22 // 3
+ xvadd.w xr23, xr2, xr23
+ xvst xr26, t2, 4*16*4 // 256=4*16*4
+ xvst xr27, t2, 4*16*4+32 // 288
+ xvst xr22, t2, 3*16*4 // 192=3*16*4
+ xvst xr23, t2, 3*16*4+32 // 224
+
+ xvldrepl.w xr20, t1, 16
+ xvldrepl.w xr21, t1, 20
+
+ xvmulwev.w.h xr4, xr10, xr20
+ xvmaddwod.w.h xr4, xr10, xr20
+ xvmulwev.w.h xr5, xr11, xr20
+ xvmaddwod.w.h xr5, xr11, xr20
+ xvmulwev.w.h xr6, xr12, xr21
+ xvmaddwod.w.h xr6, xr12, xr21
+ xvmulwev.w.h xr7, xr13, xr21
+ xvmaddwod.w.h xr7, xr13, xr21
+
+ xvsub.w xr0, xr4, xr6 // sum1_r
+ xvadd.w xr1, xr4, xr6 // sum0_r
+ xvsub.w xr2, xr5, xr7 // sum1_l
+ xvadd.w xr3, xr5, xr7 // sum0_l
+
+ // HEVC_EVEN16_CALC
+ xvsub.w xr24, xr1, xr16 // 6
+ xvsub.w xr25, xr3, xr17
+ xvadd.w xr16, xr1, xr16 // 1
+ xvadd.w xr17, xr3, xr17
+ xvst xr24, t2, 6*16*4 // 384=6*16*4
+ xvst xr25, t2, 6*16*4+32 // 416
+ xvst xr16, t2, 1*16*4 // 64=1*16*4
+ xvst xr17, t2, 1*16*4+32 // 96
+
+ xvsub.w xr26, xr0, xr18 // 5
+ xvsub.w xr27, xr2, xr19
+ xvadd.w xr18, xr0, xr18 // 2
+ xvadd.w xr19, xr2, xr19
+ xvst xr26, t2, 5*16*4 // 320=5*16*4
+ xvst xr27, t2, 5*16*4+32 // 352
+ xvst xr18, t2, 2*16*4 // 128=2*16*4
+ xvst xr19, t2, 2*16*4+32 // 160
+
+ /* process coeff 2 6 10 14 18 22 26 30 */
+ addi.d t0, a0, \buf_pitch*2*2
+
+ xvld xr0, t0, 0
+ xvld xr1, t0, \buf_pitch*4*2
+ xvld xr2, t0, \buf_pitch*8*2
+ xvld xr3, t0, \buf_pitch*12*2
+
+ xvld xr4, t0, \buf_pitch*16*2
+ xvld xr5, t0, \buf_pitch*20*2
+ xvld xr6, t0, \buf_pitch*24*2
+ xvld xr7, t0, \buf_pitch*28*2
+
+ xvilvl.h xr8, xr1, xr0
+ xvilvh.h xr9, xr1, xr0
+ xvilvl.h xr10, xr3, xr2
+ xvilvh.h xr11, xr3, xr2
+ xvilvl.h xr12, xr5, xr4
+ xvilvh.h xr13, xr5, xr4
+ xvilvl.h xr14, xr7, xr6
+ xvilvh.h xr15, xr7, xr6
+
+ la.local t1, gt32x32_cnst1
+
+ // t2 walks up from slot 0, t3 walks down from slot 15
+ addi.d t2, sp, 64
+ addi.d t3, sp, 64+960 // 30*32
+
+ idct_16x32_step1_lasx
+
+ // seven more slot pairs: next 16 bytes of constants each time
+.rept 7
+ addi.d t1, t1, 16
+ addi.d t2, t2, 64
+ addi.d t3, t3, -64
+ idct_16x32_step1_lasx
+.endr
+
+ // odd rows 1, 3, ..., 15 are interleaved into xr8-xr15
+ addi.d t0, a0, \buf_pitch*2
+
+ xvld xr0, t0, 0
+ xvld xr1, t0, \buf_pitch*2*2
+ xvld xr2, t0, \buf_pitch*4*2
+ xvld xr3, t0, \buf_pitch*6*2
+ xvld xr4, t0, \buf_pitch*8*2
+ xvld xr5, t0, \buf_pitch*10*2
+ xvld xr6, t0, \buf_pitch*12*2
+ xvld xr7, t0, \buf_pitch*14*2
+
+ xvilvl.h xr8, xr1, xr0
+ xvilvh.h xr9, xr1, xr0
+ xvilvl.h xr10, xr3, xr2
+ xvilvh.h xr11, xr3, xr2
+ xvilvl.h xr12, xr5, xr4
+ xvilvh.h xr13, xr5, xr4
+ xvilvl.h xr14, xr7, xr6
+ xvilvh.h xr15, xr7, xr6
+
+ la.local t1, gt32x32_cnst0
+
+ idct_16x32_step2_lasx xr8, xr9, xr10, xr11, xr12, xr13, \
+ xr14, xr15, xr16, xr17
+
+ // odd rows 17, 19, ..., 31 are interleaved into xr18, xr19, xr24-xr29
+ addi.d t0, a0, \buf_pitch*16*2+\buf_pitch*2
+
+ xvld xr0, t0, 0
+ xvld xr1, t0, \buf_pitch*2*2
+ xvld xr2, t0, \buf_pitch*4*2
+ xvld xr3, t0, \buf_pitch*6*2
+ xvld xr4, t0, \buf_pitch*8*2
+ xvld xr5, t0, \buf_pitch*10*2
+ xvld xr6, t0, \buf_pitch*12*2
+ xvld xr7, t0, \buf_pitch*14*2
+
+ xvilvl.h xr18, xr1, xr0
+ xvilvh.h xr19, xr1, xr0
+ xvilvl.h xr24, xr3, xr2
+ xvilvh.h xr25, xr3, xr2
+ xvilvl.h xr26, xr5, xr4
+ xvilvh.h xr27, xr5, xr4
+ xvilvl.h xr28, xr7, xr6
+ xvilvh.h xr29, xr7, xr6
+
+ addi.d t1, t1, 16
+ idct_16x32_step2_lasx xr18, xr19, xr24, xr25, xr26, xr27, \
+ xr28, xr29, xr30, xr31
+
+ // outputs overwrite the input block: t4 starts at row 0 and moves
+ // down one row per step, t5 starts at row 31 and moves up
+ addi.d t4, a0, 0
+ addi.d t5, a0, \buf_pitch*31*2
+ addi.d t2, sp, 64
+
+ idct_16x32_step3_lasx \round
+
+ // remaining 15 output row pairs; each uses the next 32 bytes of
+ // gt32x32_cnst0 and the next scratch slot
+.rept 15
+
+ addi.d t1, t1, 16
+ idct_16x32_step2_lasx xr8, xr9, xr10, xr11, xr12, xr13, \
+ xr14, xr15, xr16, xr17
+
+ addi.d t1, t1, 16
+ idct_16x32_step2_lasx xr18, xr19, xr24, xr25, xr26, xr27, \
+ xr28, xr29, xr30, xr31
+
+ addi.d t2, t2, 64
+ addi.d t4, t4, \buf_pitch*2
+ addi.d t5, t5, -\buf_pitch*2
+
+ idct_16x32_step3_lasx \round
+.endr
+
+.endm
+
+/*
+ * First-pass helper: runs idct_16x32_lasx with a row pitch of 32 int16
+ * and a rounding shift of 7 on the 16 columns starting at a0, in place.
+ * Reserves 1536 bytes of stack scratch plus the f24-f31 save area for the
+ * duration of the call. Overwrites t0-t5.
+ * NOTE(review): function/endfunc come from loongson_asm.S, which is not
+ * shown here; endfunc presumably emits the return - confirm.
+ */
+function hevc_idct_16x32_column_step1_lasx
+ malloc_space 512+512+512
+
+ idct_16x32_lasx 32, 7
+
+ free_space 512+512+512
+endfunc
+
+/*
+ * Second-pass helper: runs idct_16x32_lasx with a row pitch of 16 int16
+ * and a rounding shift of 12 on the buffer at a0, in place.
+ * Reserves 1536 bytes of stack scratch plus the f24-f31 save area for the
+ * duration of the call. Overwrites t0-t5.
+ * NOTE(review): function/endfunc come from loongson_asm.S, which is not
+ * shown here; endfunc presumably emits the return - confirm.
+ */
+function hevc_idct_16x32_column_step2_lasx
+ malloc_space 512+512+512
+
+ idct_16x32_lasx 16, 12
+
+ free_space 512+512+512
+endfunc
+
+/*
+ * Transpose int16 data using 8x8 tiles.
+ *   a0 : source, 16 rows read with a 64-byte row stride (32 int16 per row)
+ *   a1 : destination, 32 rows written with a 32-byte row stride
+ *        (16 int16 per row)
+ * Both a0 and a1 are modified. f24-f31 are saved and restored because
+ * vr24-vr31 are used as work registers.
+ * NOTE(review): LSX_TRANSPOSE8x8_H is defined in loongson_asm.S (not shown
+ * here); its last eight arguments are presumably temporaries - confirm.
+ */
+function hevc_idct_transpose_32x16_to_16x32_lasx
+ fr_store
+
+ // source rows 0-7, columns 0-15
+ xvld xr0, a0, 0
+ xvld xr1, a0, 64
+ xvld xr2, a0, 128
+ xvld xr3, a0, 192
+ xvld xr4, a0, 256
+ xvld xr5, a0, 320
+ xvld xr6, a0, 384
+ xvld xr7, a0, 448
+
+ // make the upper 128 bits (columns 8-15) reachable as vr8-vr15
+ xvpermi.q xr8, xr0, 0x01
+ xvpermi.q xr9, xr1, 0x01
+ xvpermi.q xr10, xr2, 0x01
+ xvpermi.q xr11, xr3, 0x01
+ xvpermi.q xr12, xr4, 0x01
+ xvpermi.q xr13, xr5, 0x01
+ xvpermi.q xr14, xr6, 0x01
+ xvpermi.q xr15, xr7, 0x01
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ addi.d a0, a0, 512 // 8 source rows further down
+
+ // source rows 8-15, columns 0-7
+ vld vr24, a0, 0
+ vld vr25, a0, 64
+ vld vr26, a0, 128
+ vld vr27, a0, 192
+ vld vr28, a0, 256
+ vld vr29, a0, 320
+ vld vr30, a0, 384
+ vld vr31, a0, 448
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ // put the second tile into the upper 128 bits next to the first tile
+ xvpermi.q xr0, xr24, 0x02
+ xvpermi.q xr1, xr25, 0x02
+ xvpermi.q xr2, xr26, 0x02
+ xvpermi.q xr3, xr27, 0x02
+ xvpermi.q xr4, xr28, 0x02
+ xvpermi.q xr5, xr29, 0x02
+ xvpermi.q xr6, xr30, 0x02
+ xvpermi.q xr7, xr31, 0x02
+
+ // destination rows 0-7
+ xvst xr0, a1, 0
+ xvst xr1, a1, 32
+ xvst xr2, a1, 64
+ xvst xr3, a1, 96
+ xvst xr4, a1, 128
+ xvst xr5, a1, 160
+ xvst xr6, a1, 192
+ xvst xr7, a1, 224
+
+ addi.d a1, a1, 256 // destination rows 8-15
+ addi.d a0, a0, 16 // source columns 8-15 of rows 8-15
+
+ vld vr24, a0, 0
+ vld vr25, a0, 64
+ vld vr26, a0, 128
+ vld vr27, a0, 192
+ vld vr28, a0, 256
+ vld vr29, a0, 320
+ vld vr30, a0, 384
+ vld vr31, a0, 448
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ xvpermi.q xr8, xr24, 0x02
+ xvpermi.q xr9, xr25, 0x02
+ xvpermi.q xr10, xr26, 0x02
+ xvpermi.q xr11, xr27, 0x02
+ xvpermi.q xr12, xr28, 0x02
+ xvpermi.q xr13, xr29, 0x02
+ xvpermi.q xr14, xr30, 0x02
+ xvpermi.q xr15, xr31, 0x02
+
+ xvst xr8, a1, 0
+ xvst xr9, a1, 32
+ xvst xr10, a1, 64
+ xvst xr11, a1, 96
+ xvst xr12, a1, 128
+ xvst xr13, a1, 160
+ xvst xr14, a1, 192
+ xvst xr15, a1, 224
+
+ // second
+ // back to source row 0, now at byte offset 32 (columns 16-31)
+ addi.d a0, a0, 32-512-16
+
+ xvld xr0, a0, 0
+ xvld xr1, a0, 64
+ xvld xr2, a0, 128
+ xvld xr3, a0, 192
+ xvld xr4, a0, 256
+ xvld xr5, a0, 320
+ xvld xr6, a0, 384
+ xvld xr7, a0, 448
+
+ xvpermi.q xr8, xr0, 0x01
+ xvpermi.q xr9, xr1, 0x01
+ xvpermi.q xr10, xr2, 0x01
+ xvpermi.q xr11, xr3, 0x01
+ xvpermi.q xr12, xr4, 0x01
+ xvpermi.q xr13, xr5, 0x01
+ xvpermi.q xr14, xr6, 0x01
+ xvpermi.q xr15, xr7, 0x01
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ addi.d a0, a0, 512
+
+ vld vr24, a0, 0
+ vld vr25, a0, 64
+ vld vr26, a0, 128
+ vld vr27, a0, 192
+ vld vr28, a0, 256
+ vld vr29, a0, 320
+ vld vr30, a0, 384
+ vld vr31, a0, 448
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ xvpermi.q xr0, xr24, 0x02
+ xvpermi.q xr1, xr25, 0x02
+ xvpermi.q xr2, xr26, 0x02
+ xvpermi.q xr3, xr27, 0x02
+ xvpermi.q xr4, xr28, 0x02
+ xvpermi.q xr5, xr29, 0x02
+ xvpermi.q xr6, xr30, 0x02
+ xvpermi.q xr7, xr31, 0x02
+
+ addi.d a1, a1, 256 // destination rows 16-23
+ xvst xr0, a1, 0
+ xvst xr1, a1, 32
+ xvst xr2, a1, 64
+ xvst xr3, a1, 96
+ xvst xr4, a1, 128
+ xvst xr5, a1, 160
+ xvst xr6, a1, 192
+ xvst xr7, a1, 224
+
+ addi.d a1, a1, 256 // destination rows 24-31
+ addi.d a0, a0, 16
+
+ vld vr24, a0, 0
+ vld vr25, a0, 64
+ vld vr26, a0, 128
+ vld vr27, a0, 192
+ vld vr28, a0, 256
+ vld vr29, a0, 320
+ vld vr30, a0, 384
+ vld vr31, a0, 448
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ xvpermi.q xr8, xr24, 0x02
+ xvpermi.q xr9, xr25, 0x02
+ xvpermi.q xr10, xr26, 0x02
+ xvpermi.q xr11, xr27, 0x02
+ xvpermi.q xr12, xr28, 0x02
+ xvpermi.q xr13, xr29, 0x02
+ xvpermi.q xr14, xr30, 0x02
+ xvpermi.q xr15, xr31, 0x02
+
+ xvst xr8, a1, 0
+ xvst xr9, a1, 32
+ xvst xr10, a1, 64
+ xvst xr11, a1, 96
+ xvst xr12, a1, 128
+ xvst xr13, a1, 160
+ xvst xr14, a1, 192
+ xvst xr15, a1, 224
+
+ fr_recover
+endfunc
+
+/*
+ * Transpose int16 data using 8x8 tiles; the reverse layout change of
+ * hevc_idct_transpose_32x16_to_16x32_lasx.
+ *   a0 : source, 32 rows read with a 32-byte row stride (16 int16 per row)
+ *   a1 : destination, 16 rows written with a 64-byte row stride
+ *        (32 int16 per row)
+ * Both a0 and a1 are modified. f24-f31 are saved and restored because
+ * vr24-vr31 are used as work registers.
+ * NOTE(review): LSX_TRANSPOSE8x8_H is defined in loongson_asm.S (not shown
+ * here); its last eight arguments are presumably temporaries - confirm.
+ */
+function hevc_idct_transpose_16x32_to_32x16_lasx
+ fr_store
+
+ // source rows 0-7, columns 0-15
+ xvld xr0, a0, 0
+ xvld xr1, a0, 32
+ xvld xr2, a0, 64
+ xvld xr3, a0, 96
+ xvld xr4, a0, 128
+ xvld xr5, a0, 160
+ xvld xr6, a0, 192
+ xvld xr7, a0, 224
+
+ // make the upper 128 bits (columns 8-15) reachable as vr8-vr15
+ xvpermi.q xr8, xr0, 0x01
+ xvpermi.q xr9, xr1, 0x01
+ xvpermi.q xr10, xr2, 0x01
+ xvpermi.q xr11, xr3, 0x01
+ xvpermi.q xr12, xr4, 0x01
+ xvpermi.q xr13, xr5, 0x01
+ xvpermi.q xr14, xr6, 0x01
+ xvpermi.q xr15, xr7, 0x01
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ addi.d a0, a0, 256 // 8 source rows further down
+
+ // source rows 8-15, columns 0-7
+ vld vr24, a0, 0
+ vld vr25, a0, 32
+ vld vr26, a0, 64
+ vld vr27, a0, 96
+ vld vr28, a0, 128
+ vld vr29, a0, 160
+ vld vr30, a0, 192
+ vld vr31, a0, 224
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ // put the second tile into the upper 128 bits next to the first tile
+ xvpermi.q xr0, xr24, 0x02
+ xvpermi.q xr1, xr25, 0x02
+ xvpermi.q xr2, xr26, 0x02
+ xvpermi.q xr3, xr27, 0x02
+ xvpermi.q xr4, xr28, 0x02
+ xvpermi.q xr5, xr29, 0x02
+ xvpermi.q xr6, xr30, 0x02
+ xvpermi.q xr7, xr31, 0x02
+
+ // destination rows 0-7, first 32 bytes of each row
+ xvst xr0, a1, 0
+ xvst xr1, a1, 64
+ xvst xr2, a1, 128
+ xvst xr3, a1, 192
+ xvst xr4, a1, 256
+ xvst xr5, a1, 320
+ xvst xr6, a1, 384
+ xvst xr7, a1, 448
+
+ addi.d a1, a1, 512 // destination rows 8-15
+ addi.d a0, a0, 16 // source columns 8-15 of rows 8-15
+
+ vld vr24, a0, 0
+ vld vr25, a0, 32
+ vld vr26, a0, 64
+ vld vr27, a0, 96
+ vld vr28, a0, 128
+ vld vr29, a0, 160
+ vld vr30, a0, 192
+ vld vr31, a0, 224
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ xvpermi.q xr8, xr24, 0x02
+ xvpermi.q xr9, xr25, 0x02
+ xvpermi.q xr10, xr26, 0x02
+ xvpermi.q xr11, xr27, 0x02
+ xvpermi.q xr12, xr28, 0x02
+ xvpermi.q xr13, xr29, 0x02
+ xvpermi.q xr14, xr30, 0x02
+ xvpermi.q xr15, xr31, 0x02
+
+ xvst xr8, a1, 0
+ xvst xr9, a1, 64
+ xvst xr10, a1, 128
+ xvst xr11, a1, 192
+ xvst xr12, a1, 256
+ xvst xr13, a1, 320
+ xvst xr14, a1, 384
+ xvst xr15, a1, 448
+
+ // second
+ // advance to source row 16, column 0
+ addi.d a0, a0, 256-16
+
+ xvld xr0, a0, 0
+ xvld xr1, a0, 32
+ xvld xr2, a0, 64
+ xvld xr3, a0, 96
+ xvld xr4, a0, 128
+ xvld xr5, a0, 160
+ xvld xr6, a0, 192
+ xvld xr7, a0, 224
+
+ xvpermi.q xr8, xr0, 0x01
+ xvpermi.q xr9, xr1, 0x01
+ xvpermi.q xr10, xr2, 0x01
+ xvpermi.q xr11, xr3, 0x01
+ xvpermi.q xr12, xr4, 0x01
+ xvpermi.q xr13, xr5, 0x01
+ xvpermi.q xr14, xr6, 0x01
+ xvpermi.q xr15, xr7, 0x01
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ addi.d a0, a0, 256
+
+ vld vr24, a0, 0
+ vld vr25, a0, 32
+ vld vr26, a0, 64
+ vld vr27, a0, 96
+ vld vr28, a0, 128
+ vld vr29, a0, 160
+ vld vr30, a0, 192
+ vld vr31, a0, 224
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ xvpermi.q xr0, xr24, 0x02
+ xvpermi.q xr1, xr25, 0x02
+ xvpermi.q xr2, xr26, 0x02
+ xvpermi.q xr3, xr27, 0x02
+ xvpermi.q xr4, xr28, 0x02
+ xvpermi.q xr5, xr29, 0x02
+ xvpermi.q xr6, xr30, 0x02
+ xvpermi.q xr7, xr31, 0x02
+
+ // back to destination row 0, second 32 bytes of each row
+ addi.d a1, a1, -512+32
+
+ xvst xr0, a1, 0
+ xvst xr1, a1, 64
+ xvst xr2, a1, 128
+ xvst xr3, a1, 192
+ xvst xr4, a1, 256
+ xvst xr5, a1, 320
+ xvst xr6, a1, 384
+ xvst xr7, a1, 448
+
+ addi.d a1, a1, 512 // destination rows 8-15, second 32 bytes
+ addi.d a0, a0, 16
+
+ vld vr24, a0, 0
+ vld vr25, a0, 32
+ vld vr26, a0, 64
+ vld vr27, a0, 96
+ vld vr28, a0, 128
+ vld vr29, a0, 160
+ vld vr30, a0, 192
+ vld vr31, a0, 224
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+ xvpermi.q xr8, xr24, 0x02
+ xvpermi.q xr9, xr25, 0x02
+ xvpermi.q xr10, xr26, 0x02
+ xvpermi.q xr11, xr27, 0x02
+ xvpermi.q xr12, xr28, 0x02
+ xvpermi.q xr13, xr29, 0x02
+ xvpermi.q xr14, xr30, 0x02
+ xvpermi.q xr15, xr31, 0x02
+
+ xvst xr8, a1, 0
+ xvst xr9, a1, 64
+ xvst xr10, a1, 128
+ xvst xr11, a1, 192
+ xvst xr12, a1, 256
+ xvst xr13, a1, 320
+ xvst xr14, a1, 384
+ xvst xr15, a1, 448
+
+ fr_recover
+endfunc
+
+/*
+ * void ff_hevc_idct_32x32_lasx(int16_t *coeffs, int col_limit)
+ *
+ * In-place 32x32 inverse transform of the int16 block at a0 (row stride
+ * 64 bytes).
+ *   a0 : coeffs
+ *   a1 : col_limit; copied to t6 but not read by any code in this file
+ *
+ * Pass 1 runs the column transform (shift 7) on columns 0-15 and 16-31.
+ * Pass 2 handles rows 0-15 and then rows 16-31: each half is transposed
+ * into a 1024-byte stack buffer, transformed with shift 12 and transposed
+ * back into place.
+ *
+ * t7 (coeffs) and t8 (temp buffer) stay live across the bl calls; the
+ * helpers in this file only use t0-t5, a0 and a1.
+ *
+ * Every stack adjustment is a multiple of 16 bytes so that sp keeps the
+ * 16-byte alignment required by the LoongArch psABI while the helpers run.
+ */
+function ff_hevc_idct_32x32_lasx
+
+ addi.d t7, a0, 0
+ addi.d t6, a1, 0
+
+ // 16-byte slot for ra: only 8 bytes are used, the rest keeps sp aligned
+ addi.d sp, sp, -16
+ st.d ra, sp, 0
+
+ // pass 1, columns 0-15
+ bl hevc_idct_16x32_column_step1_lasx
+
+ // pass 1, columns 16-31 (32 bytes into each row)
+ addi.d a0, a0, 32
+
+ bl hevc_idct_16x32_column_step1_lasx
+
+ // 1088 bytes: 64 bytes of padding plus the 16x32 int16 temp buffer
+ malloc_space (16*32+32)*2
+
+ // skip the 64-byte f24-f31 save area and the 64-byte padding
+ addi.d t8, sp, 64+32*2 // tmp_buf_ptr
+
+ // pass 2, rows 0-15
+ addi.d a0, t7, 0
+ addi.d a1, t8, 0
+ bl hevc_idct_transpose_32x16_to_16x32_lasx
+
+ addi.d a0, t8, 0
+ bl hevc_idct_16x32_column_step2_lasx
+
+ addi.d a0, t8, 0
+ addi.d a1, t7, 0
+ bl hevc_idct_transpose_16x32_to_32x16_lasx
+
+ // second
+ // pass 2, rows 16-31: 32*8*2*2 = 1024 bytes = 16 rows of 64 bytes
+ addi.d a0, t7, 32*8*2*2
+ addi.d a1, t8, 0
+ bl hevc_idct_transpose_32x16_to_16x32_lasx
+
+ addi.d a0, t8, 0
+ bl hevc_idct_16x32_column_step2_lasx
+
+ addi.d a0, t8, 0
+ addi.d a1, t7, 32*8*2*2
+ bl hevc_idct_transpose_16x32_to_32x16_lasx
+
+ free_space (16*32+32)*2
+
+ ld.d ra, sp, 0
+ addi.d sp, sp, 16
+
+endfunc
diff --git a/libavcodec/loongarch/hevc_idct_lsx.c b/libavcodec/loongarch/hevc_idct_lsx.c
index 2193b27546..527279d85d 100644
--- a/libavcodec/loongarch/hevc_idct_lsx.c
+++ b/libavcodec/loongarch/hevc_idct_lsx.c
@@ -23,18 +23,18 @@
#include "libavutil/loongarch/loongson_intrinsics.h"
#include "hevcdsp_lsx.h"
-static const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = {
+const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = {
64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
};
-static const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = {
+const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = {
64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
};
-static const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
+const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
@@ -53,14 +53,14 @@ static const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
};
-static const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = {
+const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = {
90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
};
-static const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = {
+const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = {
89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
};
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index 2756755733..1585bda276 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -360,6 +360,8 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_lasx;
c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_lasx;
c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_lasx;
+
+ c->idct[3] = ff_hevc_idct_32x32_lasx;
}
}
}
diff --git a/libavcodec/loongarch/hevcdsp_lasx.h b/libavcodec/loongarch/hevcdsp_lasx.h
index 5db35eed47..714cbf5880 100644
--- a/libavcodec/loongarch/hevcdsp_lasx.h
+++ b/libavcodec/loongarch/hevcdsp_lasx.h
@@ -131,4 +131,6 @@ BI_MC(epel, h, 64);
#undef BI_MC
+void ff_hevc_idct_32x32_lasx(int16_t *coeffs, int col_limit);
+
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
--
2.20.1
More information about the ffmpeg-devel
mailing list