[FFmpeg-devel] [PATCH v1 5/6] avcodec/hevc: Add epel_uni_w_hv4/6/8/12/16/24/32/48/64 asm opt
jinbo
jinbo at loongson.cn
Fri Dec 22 12:52:13 EET 2023
tests/checkasm/checkasm:            C      LSX    LASX
put_hevc_epel_uni_w_hv4_8_c:        9.5    2.2
put_hevc_epel_uni_w_hv6_8_c:       18.5    5.0     3.7
put_hevc_epel_uni_w_hv8_8_c:       30.7    6.0     4.5
put_hevc_epel_uni_w_hv12_8_c:      63.7   14.0    10.7
put_hevc_epel_uni_w_hv16_8_c:     107.5   22.7    17.0
put_hevc_epel_uni_w_hv24_8_c:     236.7   50.2    31.7
put_hevc_epel_uni_w_hv32_8_c:     414.5   88.0    53.0
put_hevc_epel_uni_w_hv48_8_c:     917.5  197.7   118.5
put_hevc_epel_uni_w_hv64_8_c:    1617.0  349.5   203.0
After this patch, the performance of decoding H.265 4K 30FPS 30Mbps
on a 3A6000 with 8 threads improves by 3 fps (52 fps --> 55 fps).
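
For reference, the hv path first applies the horizontal 4-tap EPEL filter over
height + EPEL_EXTRA (3) rows into a 16-bit intermediate buffer with MAX_PB_SIZE
stride, then applies the vertical 4-tap filter on that buffer and the
uni-directional weighting, matching the inline comments in the assembly below.
A minimal scalar sketch of that computation for the 8-bit case (the helper
names are illustrative, not FFmpeg's; filter pointers are passed directly
instead of the ff_hevc_epel_filters[mx - 1] / [my - 1] lookups done by the real
code in libavcodec/hevcdsp_template.c):

#include <stddef.h>
#include <stdint.h>

#define MAX_PB_SIZE 64

static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Sketch of put_hevc_epel_uni_w_hv for bit depth 8; src must provide the
 * usual EPEL padding (1 row/column before, 2 after). */
static void epel_uni_w_hv_sketch(uint8_t *dst, ptrdiff_t dststride,
                                 const uint8_t *src, ptrdiff_t srcstride,
                                 int height, int denom, int wx, int ox,
                                 const int8_t *fh, const int8_t *fv, int width)
{
    int16_t tmp_buf[(MAX_PB_SIZE + 3) * MAX_PB_SIZE];
    int16_t *tmp = tmp_buf;
    int shift  = denom + 14 - 8;
    int offset = 1 << (shift - 1);

    src -= srcstride;                         /* EPEL_EXTRA_BEFORE = 1 */
    for (int y = 0; y < height + 3; y++) {    /* horizontal pass */
        for (int x = 0; x < width; x++)
            tmp[x] = fh[0] * src[x - 1] + fh[1] * src[x] +
                     fh[2] * src[x + 1] + fh[3] * src[x + 2];
        src += srcstride;
        tmp += MAX_PB_SIZE;
    }

    tmp = tmp_buf + MAX_PB_SIZE;              /* skip the extra top row */
    for (int y = 0; y < height; y++) {        /* vertical pass + weighting */
        for (int x = 0; x < width; x++) {
            int v = (fv[0] * tmp[x - MAX_PB_SIZE] + fv[1] * tmp[x] +
                     fv[2] * tmp[x + MAX_PB_SIZE] +
                     fv[3] * tmp[x + 2 * MAX_PB_SIZE]) >> 6;
            dst[x] = clip_u8(((v * wx + offset) >> shift) + ox);
        }
        tmp += MAX_PB_SIZE;
        dst += dststride;
    }
}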
Change-Id: If067e394cec4685c62193e7adb829ac93ba4804d
---
libavcodec/loongarch/hevc_mc.S | 821 ++++++++++++++++++
libavcodec/loongarch/hevcdsp_init_loongarch.c | 19 +
libavcodec/loongarch/hevcdsp_lasx.h | 9 +
libavcodec/loongarch/hevcdsp_lsx.h | 10 +
4 files changed, 859 insertions(+)
diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S
index 2ee338fb8e..0b0647546b 100644
--- a/libavcodec/loongarch/hevc_mc.S
+++ b/libavcodec/loongarch/hevc_mc.S
@@ -22,6 +22,7 @@
#include "loongson_asm.S"
.extern ff_hevc_qpel_filters
+.extern ff_hevc_epel_filters
.macro LOAD_VAR bit
addi.w t1, a5, 6 //shift
@@ -206,6 +207,12 @@
.endif
.endm
+/*
+ * void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride,
+ * const uint8_t *_src, ptrdiff_t _srcstride,
+ * int height, int denom, int wx, int ox,
+ * intptr_t mx, intptr_t my, int width)
+ */
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx
LOAD_VAR 128
srli.w t0, a4, 1
@@ -482,6 +489,12 @@ endfunc
xvhaddw.d.w \in0, \in0, \in0
.endm
+/*
+ * void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
+ * const uint8_t *_src, ptrdiff_t _srcstride,
+ * int height, int denom, int wx, int ox,
+ * intptr_t mx, intptr_t my, int width)
+ */
function ff_hevc_put_hevc_qpel_uni_w_v4_8_lsx
LOAD_VAR 128
ld.d t0, sp, 8 //my
@@ -1253,6 +1266,12 @@ endfunc
xvssrani.bu.h \out0, xr11, 0
.endm
+/*
+ * void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
+ * const uint8_t *_src, ptrdiff_t _srcstride,
+ * int height, int denom, int wx, int ox,
+ * intptr_t mx, intptr_t my, int width)
+ */
function ff_hevc_put_hevc_qpel_uni_w_h4_8_lsx
LOAD_VAR 128
ld.d t0, sp, 0 //mx
@@ -1763,3 +1782,805 @@ function ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx
addi.d a4, a4, -1
bnez a4, .LOOP_H64_LASX
endfunc
+
+const shufb
+ .byte 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6
+ .byte 4,5,6,7, 5,6,7,8, 6,7,8,9, 7,8,9,10
+endconst
+
+.macro PUT_HEVC_EPEL_UNI_W_HV4_LSX w
+ fld.d f7, a2, 0 // start to load src
+ fldx.d f8, a2, a3
+ alsl.d a2, a3, a2, 1
+ fld.d f9, a2, 0
+ vshuf.b vr7, vr7, vr7, vr0 // 0123 1234 2345 3456
+ vshuf.b vr8, vr8, vr8, vr0
+ vshuf.b vr9, vr9, vr9, vr0
+ vdp2.h.bu.b vr10, vr7, vr5 // EPEL_FILTER(src, 1)
+ vdp2.h.bu.b vr11, vr8, vr5
+ vdp2.h.bu.b vr12, vr9, vr5
+ vhaddw.w.h vr10, vr10, vr10 // tmp[0/1/2/3]
+ vhaddw.w.h vr11, vr11, vr11 // vr10-vr12 hold the EPEL_EXTRA rows
+ vhaddw.w.h vr12, vr12, vr12
+.LOOP_HV4_\w:
+ add.d a2, a2, a3
+ fld.d f14, a2, 0 // height loop begin
+ vshuf.b vr14, vr14, vr14, vr0
+ vdp2.h.bu.b vr13, vr14, vr5
+ vhaddw.w.h vr13, vr13, vr13
+ vmul.w vr14, vr10, vr16 // EPEL_FILTER(tmp, MAX_PB_SIZE)
+ vmadd.w vr14, vr11, vr17
+ vmadd.w vr14, vr12, vr18
+ vmadd.w vr14, vr13, vr19
+ vaddi.wu vr10, vr11, 0 //back up previous value
+ vaddi.wu vr11, vr12, 0
+ vaddi.wu vr12, vr13, 0
+ vsrai.w vr14, vr14, 6 // >> 6
+ vmul.w vr14, vr14, vr1 // * wx
+ vadd.w vr14, vr14, vr2 // + offset
+ vsra.w vr14, vr14, vr3 // >> shift
+ vadd.w vr14, vr14, vr4 // + ox
+ vssrani.h.w vr14, vr14, 0
+ vssrani.bu.h vr14, vr14, 0 // clip
+ fst.s f14, a0, 0
+ add.d a0, a0, a1
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_HV4_\w
+.endm
+
+/*
+ * void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
+ * const uint8_t *_src, ptrdiff_t _srcstride,
+ * int height, int denom, int wx, int ox,
+ * intptr_t mx, intptr_t my, int width)
+ */
+function ff_hevc_put_hevc_epel_uni_w_hv4_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV4_LSX 4
+endfunc
+
+.macro PUT_HEVC_EPEL_UNI_W_HV8_LSX w
+ vld vr7, a2, 0 // start to load src
+ vldx vr8, a2, a3
+ alsl.d a2, a3, a2, 1
+ vld vr9, a2, 0
+ vshuf.b vr10, vr7, vr7, vr0 // 0123 1234 2345 3456
+ vshuf.b vr11, vr8, vr8, vr0
+ vshuf.b vr12, vr9, vr9, vr0
+ vshuf.b vr7, vr7, vr7, vr22 // 4567 5678 6789 7,8,9,10
+ vshuf.b vr8, vr8, vr8, vr22
+ vshuf.b vr9, vr9, vr9, vr22
+ vdp2.h.bu.b vr13, vr10, vr5 // EPEL_FILTER(src, 1)
+ vdp2.h.bu.b vr14, vr11, vr5
+ vdp2.h.bu.b vr15, vr12, vr5
+ vdp2.h.bu.b vr23, vr7, vr5
+ vdp2.h.bu.b vr20, vr8, vr5
+ vdp2.h.bu.b vr21, vr9, vr5
+ vhaddw.w.h vr7, vr13, vr13
+ vhaddw.w.h vr8, vr14, vr14
+ vhaddw.w.h vr9, vr15, vr15
+ vhaddw.w.h vr10, vr23, vr23
+ vhaddw.w.h vr11, vr20, vr20
+ vhaddw.w.h vr12, vr21, vr21
+.LOOP_HV8_HORI_\w:
+ add.d a2, a2, a3
+ vld vr15, a2, 0
+ vshuf.b vr23, vr15, vr15, vr0
+ vshuf.b vr15, vr15, vr15, vr22
+ vdp2.h.bu.b vr13, vr23, vr5
+ vdp2.h.bu.b vr14, vr15, vr5
+ vhaddw.w.h vr13, vr13, vr13 // new row, low half (4th tap after vr7/vr8/vr9)
+ vhaddw.w.h vr14, vr14, vr14 // new row, high half (4th tap after vr10/vr11/vr12)
+ vmul.w vr15, vr7, vr16 //EPEL_FILTER(tmp, MAX_PB_SIZE)
+ vmadd.w vr15, vr8, vr17
+ vmadd.w vr15, vr9, vr18
+ vmadd.w vr15, vr13, vr19
+ vmul.w vr20, vr10, vr16
+ vmadd.w vr20, vr11, vr17
+ vmadd.w vr20, vr12, vr18
+ vmadd.w vr20, vr14, vr19
+ vaddi.wu vr7, vr8, 0 //back up previous value
+ vaddi.wu vr8, vr9, 0
+ vaddi.wu vr9, vr13, 0
+ vaddi.wu vr10, vr11, 0
+ vaddi.wu vr11, vr12, 0
+ vaddi.wu vr12, vr14, 0
+ vsrai.w vr15, vr15, 6 // >> 6
+ vsrai.w vr20, vr20, 6
+ vmul.w vr15, vr15, vr1 // * wx
+ vmul.w vr20, vr20, vr1
+ vadd.w vr15, vr15, vr2 // + offset
+ vadd.w vr20, vr20, vr2
+ vsra.w vr15, vr15, vr3 // >> shift
+ vsra.w vr20, vr20, vr3
+ vadd.w vr15, vr15, vr4 // + ox
+ vadd.w vr20, vr20, vr4
+ vssrani.h.w vr20, vr15, 0
+ vssrani.bu.h vr20, vr20, 0
+.if \w > 6
+ fst.d f20, a0, 0
+.else
+ fst.s f20, a0, 0
+ vstelm.h vr20, a0, 4, 2
+.endif
+ add.d a0, a0, a1
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_HV8_HORI_\w
+.endm
+
+.macro PUT_HEVC_EPEL_UNI_W_HV8_LASX w
+ vld vr7, a2, 0 // start to load src
+ vldx vr8, a2, a3
+ alsl.d a2, a3, a2, 1
+ vld vr9, a2, 0
+ xvreplve0.q xr7, xr7
+ xvreplve0.q xr8, xr8
+ xvreplve0.q xr9, xr9
+ xvshuf.b xr10, xr7, xr7, xr0 // 0123 1234 2345 3456
+ xvshuf.b xr11, xr8, xr8, xr0
+ xvshuf.b xr12, xr9, xr9, xr0
+ xvdp2.h.bu.b xr13, xr10, xr5 // EPEL_FILTER(src, 1)
+ xvdp2.h.bu.b xr14, xr11, xr5
+ xvdp2.h.bu.b xr15, xr12, xr5
+ xvhaddw.w.h xr7, xr13, xr13
+ xvhaddw.w.h xr8, xr14, xr14
+ xvhaddw.w.h xr9, xr15, xr15
+.LOOP_HV8_HORI_LASX_\w:
+ add.d a2, a2, a3
+ vld vr15, a2, 0
+ xvreplve0.q xr15, xr15
+ xvshuf.b xr23, xr15, xr15, xr0
+ xvdp2.h.bu.b xr10, xr23, xr5
+ xvhaddw.w.h xr10, xr10, xr10
+ xvmul.w xr15, xr7, xr16 //EPEL_FILTER(tmp, MAX_PB_SIZE)
+ xvmadd.w xr15, xr8, xr17
+ xvmadd.w xr15, xr9, xr18
+ xvmadd.w xr15, xr10, xr19
+ xvaddi.wu xr7, xr8, 0 //back up previous value
+ xvaddi.wu xr8, xr9, 0
+ xvaddi.wu xr9, xr10, 0
+ xvsrai.w xr15, xr15, 6 // >> 6
+ xvmul.w xr15, xr15, xr1 // * wx
+ xvadd.w xr15, xr15, xr2 // + offset
+ xvsra.w xr15, xr15, xr3 // >> shift
+ xvadd.w xr15, xr15, xr4 // + ox
+ xvpermi.q xr20, xr15, 0x01
+ vssrani.h.w vr20, vr15, 0
+ vssrani.bu.h vr20, vr20, 0
+.if \w > 6
+ fst.d f20, a0, 0
+.else
+ fst.s f20, a0, 0
+ vstelm.h vr20, a0, 4, 2
+.endif
+ add.d a0, a0, a1
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_HV8_HORI_LASX_\w
+.endm
+
+.macro PUT_HEVC_EPEL_UNI_W_HV16_LASX w
+ xvld xr7, a2, 0 // start to load src
+ xvldx xr8, a2, a3
+ alsl.d a2, a3, a2, 1
+ xvld xr9, a2, 0
+ xvpermi.d xr10, xr7, 0x09 //8..18
+ xvpermi.d xr11, xr8, 0x09
+ xvpermi.d xr12, xr9, 0x09
+ xvreplve0.q xr7, xr7
+ xvreplve0.q xr8, xr8
+ xvreplve0.q xr9, xr9
+ xvshuf.b xr13, xr7, xr7, xr0 // 0123 1234 2345 3456
+ xvshuf.b xr14, xr8, xr8, xr0
+ xvshuf.b xr15, xr9, xr9, xr0
+ xvdp2.h.bu.b xr20, xr13, xr5 // EPEL_FILTER(src, 1)
+ xvdp2.h.bu.b xr21, xr14, xr5
+ xvdp2.h.bu.b xr22, xr15, xr5
+ xvhaddw.w.h xr7, xr20, xr20
+ xvhaddw.w.h xr8, xr21, xr21
+ xvhaddw.w.h xr9, xr22, xr22
+ xvreplve0.q xr10, xr10
+ xvreplve0.q xr11, xr11
+ xvreplve0.q xr12, xr12
+ xvshuf.b xr13, xr10, xr10, xr0
+ xvshuf.b xr14, xr11, xr11, xr0
+ xvshuf.b xr15, xr12, xr12, xr0
+ xvdp2.h.bu.b xr20, xr13, xr5
+ xvdp2.h.bu.b xr21, xr14, xr5
+ xvdp2.h.bu.b xr22, xr15, xr5
+ xvhaddw.w.h xr10, xr20, xr20
+ xvhaddw.w.h xr11, xr21, xr21
+ xvhaddw.w.h xr12, xr22, xr22
+.LOOP_HV16_HORI_LASX_\w:
+ add.d a2, a2, a3
+ xvld xr15, a2, 0
+ xvpermi.d xr20, xr15, 0x09 //8...18
+ xvreplve0.q xr15, xr15
+ xvreplve0.q xr20, xr20
+ xvshuf.b xr21, xr15, xr15, xr0
+ xvshuf.b xr22, xr20, xr20, xr0
+ xvdp2.h.bu.b xr13, xr21, xr5
+ xvdp2.h.bu.b xr14, xr22, xr5
+ xvhaddw.w.h xr13, xr13, xr13
+ xvhaddw.w.h xr14, xr14, xr14
+ xvmul.w xr15, xr7, xr16 //EPEL_FILTER(tmp, MAX_PB_SIZE)
+ xvmadd.w xr15, xr8, xr17
+ xvmadd.w xr15, xr9, xr18
+ xvmadd.w xr15, xr13, xr19
+ xvmul.w xr20, xr10, xr16
+ xvmadd.w xr20, xr11, xr17
+ xvmadd.w xr20, xr12, xr18
+ xvmadd.w xr20, xr14, xr19
+ xvaddi.wu xr7, xr8, 0 //back up previous value
+ xvaddi.wu xr8, xr9, 0
+ xvaddi.wu xr9, xr13, 0
+ xvaddi.wu xr10, xr11, 0
+ xvaddi.wu xr11, xr12, 0
+ xvaddi.wu xr12, xr14, 0
+ xvsrai.w xr15, xr15, 6 // >> 6
+ xvsrai.w xr20, xr20, 6 // >> 6
+ xvmul.w xr15, xr15, xr1 // * wx
+ xvmul.w xr20, xr20, xr1 // * wx
+ xvadd.w xr15, xr15, xr2 // + offset
+ xvadd.w xr20, xr20, xr2 // + offset
+ xvsra.w xr15, xr15, xr3 // >> shift
+ xvsra.w xr20, xr20, xr3 // >> shift
+ xvadd.w xr15, xr15, xr4 // + ox
+ xvadd.w xr20, xr20, xr4 // + ox
+ xvssrani.h.w xr20, xr15, 0
+ xvpermi.q xr21, xr20, 0x01
+ vssrani.bu.h vr21, vr20, 0
+ vpermi.w vr21, vr21, 0xd8
+.if \w < 16
+ fst.d f21, a0, 0
+ vstelm.w vr21, a0, 8, 2
+.else
+ vst vr21, a0, 0
+.endif
+ add.d a0, a0, a1
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_HV16_HORI_LASX_\w
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_hv6_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 6
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv6_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV8_LASX 6
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv8_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 8
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv8_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV8_LASX 8
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv12_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 12
+ addi.d a0, t2, 8
+ addi.d a2, t3, 8
+ addi.d a4, t4, 0
+ PUT_HEVC_EPEL_UNI_W_HV4_LSX 12
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv12_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV16_LASX 12
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv16_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 2
+.LOOP_HV16:
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 16
+ addi.d a0, t2, 8
+ addi.d a2, t3, 8
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV16
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv16_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV16_LASX 16
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv24_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 3
+.LOOP_HV24:
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 24
+ addi.d a0, t2, 8
+ addi.d t2, t2, 8
+ addi.d a2, t3, 8
+ addi.d t3, t3, 8
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV24
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv24_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ PUT_HEVC_EPEL_UNI_W_HV16_LASX 24
+ addi.d a0, t2, 16
+ addi.d a2, t3, 16
+ addi.d a4, t4, 0
+ PUT_HEVC_EPEL_UNI_W_HV8_LASX 24
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv32_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 4
+.LOOP_HV32:
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 32
+ addi.d a0, t2, 8
+ addi.d t2, t2, 8
+ addi.d a2, t3, 8
+ addi.d t3, t3, 8
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV32
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv32_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 2
+.LOOP_HV32_LASX:
+ PUT_HEVC_EPEL_UNI_W_HV16_LASX 32
+ addi.d a0, t2, 16
+ addi.d t2, t2, 16
+ addi.d a2, t3, 16
+ addi.d t3, t3, 16
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV32_LASX
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv48_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 6
+.LOOP_HV48:
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 48
+ addi.d a0, t2, 8
+ addi.d t2, t2, 8
+ addi.d a2, t3, 8
+ addi.d t3, t3, 8
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV48
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv48_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 3
+.LOOP_HV48_LASX:
+ PUT_HEVC_EPEL_UNI_W_HV16_LASX 48
+ addi.d a0, t2, 16
+ addi.d t2, t2, 16
+ addi.d a2, t3, 16
+ addi.d t3, t3, 16
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV48_LASX
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv64_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 8
+.LOOP_HV64:
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 64
+ addi.d a0, t2, 8
+ addi.d t2, t2, 8
+ addi.d a2, t3, 8
+ addi.d t3, t3, 8
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV64
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 4
+.LOOP_HV64_LASX:
+ PUT_HEVC_EPEL_UNI_W_HV16_LASX 64
+ addi.d a0, t2, 16
+ addi.d t2, t2, 16
+ addi.d a2, t3, 16
+ addi.d t3, t3, 16
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV64_LASX
+endfunc
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index 3cdb3fb2d7..245a833947 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -171,6 +171,16 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+ c->put_hevc_epel_uni_w[1][1][1] = ff_hevc_put_hevc_epel_uni_w_hv4_8_lsx;
+ c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_epel_uni_w_hv6_8_lsx;
+ c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_epel_uni_w_hv8_8_lsx;
+ c->put_hevc_epel_uni_w[4][1][1] = ff_hevc_put_hevc_epel_uni_w_hv12_8_lsx;
+ c->put_hevc_epel_uni_w[5][1][1] = ff_hevc_put_hevc_epel_uni_w_hv16_8_lsx;
+ c->put_hevc_epel_uni_w[6][1][1] = ff_hevc_put_hevc_epel_uni_w_hv24_8_lsx;
+ c->put_hevc_epel_uni_w[7][1][1] = ff_hevc_put_hevc_epel_uni_w_hv32_8_lsx;
+ c->put_hevc_epel_uni_w[8][1][1] = ff_hevc_put_hevc_epel_uni_w_hv48_8_lsx;
+ c->put_hevc_epel_uni_w[9][1][1] = ff_hevc_put_hevc_epel_uni_w_hv64_8_lsx;
+
c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
@@ -258,6 +268,15 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+ c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_epel_uni_w_hv6_8_lasx;
+ c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_epel_uni_w_hv8_8_lasx;
+ c->put_hevc_epel_uni_w[4][1][1] = ff_hevc_put_hevc_epel_uni_w_hv12_8_lasx;
+ c->put_hevc_epel_uni_w[5][1][1] = ff_hevc_put_hevc_epel_uni_w_hv16_8_lasx;
+ c->put_hevc_epel_uni_w[6][1][1] = ff_hevc_put_hevc_epel_uni_w_hv24_8_lasx;
+ c->put_hevc_epel_uni_w[7][1][1] = ff_hevc_put_hevc_epel_uni_w_hv32_8_lasx;
+ c->put_hevc_epel_uni_w[8][1][1] = ff_hevc_put_hevc_epel_uni_w_hv48_8_lasx;
+ c->put_hevc_epel_uni_w[9][1][1] = ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx;
+
c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_qpel_uni_w_v8_8_lasx;
c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_qpel_uni_w_v12_8_lasx;
c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_qpel_uni_w_v16_8_lasx;
diff --git a/libavcodec/loongarch/hevcdsp_lasx.h b/libavcodec/loongarch/hevcdsp_lasx.h
index 8a9266d375..7f09d0943a 100644
--- a/libavcodec/loongarch/hevcdsp_lasx.h
+++ b/libavcodec/loongarch/hevcdsp_lasx.h
@@ -66,6 +66,15 @@ PEL_UNI_W(qpel, h, 32);
PEL_UNI_W(qpel, h, 48);
PEL_UNI_W(qpel, h, 64);
+PEL_UNI_W(epel, hv, 6);
+PEL_UNI_W(epel, hv, 8);
+PEL_UNI_W(epel, hv, 12);
+PEL_UNI_W(epel, hv, 16);
+PEL_UNI_W(epel, hv, 24);
+PEL_UNI_W(epel, hv, 32);
+PEL_UNI_W(epel, hv, 48);
+PEL_UNI_W(epel, hv, 64);
+
#undef PEL_UNI_W
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 3291294ed9..7769cf25ae 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -277,6 +277,16 @@ PEL_UNI_W(qpel, h, 32);
PEL_UNI_W(qpel, h, 48);
PEL_UNI_W(qpel, h, 64);
+PEL_UNI_W(epel, hv, 4);
+PEL_UNI_W(epel, hv, 6);
+PEL_UNI_W(epel, hv, 8);
+PEL_UNI_W(epel, hv, 12);
+PEL_UNI_W(epel, hv, 16);
+PEL_UNI_W(epel, hv, 24);
+PEL_UNI_W(epel, hv, 32);
+PEL_UNI_W(epel, hv, 48);
+PEL_UNI_W(epel, hv, 64);
+
#undef PEL_UNI_W
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
--
2.20.1