[FFmpeg-devel] [PATCH v2 3/7] avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt

Wed Dec 27 06:50:15 EET 2023

tests/checkasm/checkasm:           C       LSX     LASX
put_hevc_pel_uni_w_pixels4_8_c:    2.7     1.0
put_hevc_pel_uni_w_pixels6_8_c:    6.2     2.0     1.5
put_hevc_pel_uni_w_pixels8_8_c:    10.7    2.5     1.7
put_hevc_pel_uni_w_pixels12_8_c:   23.0    5.5     5.0
put_hevc_pel_uni_w_pixels16_8_c:   41.0    8.2     5.0
put_hevc_pel_uni_w_pixels24_8_c:   91.0    19.7    13.2
put_hevc_pel_uni_w_pixels32_8_c:   161.7   32.5    16.2
put_hevc_pel_uni_w_pixels48_8_c:   354.5   73.7    43.0
put_hevc_pel_uni_w_pixels64_8_c:   641.5   130.0   64.2

Decoding H265 4K 30FPS 30Mbps on a 3A6000 with 8 threads
speeds up by 1 fps (47 fps --> 48 fps).
---
 libavcodec/loongarch/Makefile                 |   3 +-
 libavcodec/loongarch/hevc_mc.S                | 471 ++++++++++++++++++
 libavcodec/loongarch/hevcdsp_init_loongarch.c |  43 ++
 libavcodec/loongarch/hevcdsp_lasx.h           |  53 ++
 libavcodec/loongarch/hevcdsp_lsx.h            |  27 +
 5 files changed, 596 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/loongarch/hevc_mc.S
 create mode 100644 libavcodec/loongarch/hevcdsp_lasx.h

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 07ea97f803..ad98cd4054 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -28,7 +28,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
                                          loongarch/hevc_mc_bi_lsx.o \
                                          loongarch/hevc_mc_uni_lsx.o \
                                          loongarch/hevc_mc_uniw_lsx.o \
-                                         loongarch/hevc_add_res.o
+                                         loongarch/hevc_add_res.o \
+                                         loongarch/hevc_mc.o
 LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
                                          loongarch/h264idct_loongarch.o \
                                          loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S
new file mode 100644
index 0000000000..c5d553effe
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc.S
@@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+.macro LOAD_VAR bit  // Derive shift and offset from a5, then broadcast wx, offset, shift, ox into vr1-vr4 (bit == 128) or xr1-xr4 (any other value)
+    addi.w         t1,     a5,      6  //shift = a5 + 6
+    addi.w         t3,     zero,    1  //one
+    sub.w          t4,     t1,      t3 //shift - 1
+    sll.w          t3,     t3,      t4 //offset = 1 << (shift - 1)
+.if \bit == 128
+    vreplgr2vr.w   vr1,    a6          //wx
+    vreplgr2vr.w   vr2,    t3          //offset
+    vreplgr2vr.w   vr3,    t1          //shift
+    vreplgr2vr.w   vr4,    a7          //ox
+.else
+    xvreplgr2vr.w  xr1,    a6          //wx
+    xvreplgr2vr.w  xr2,    t3          //offset
+    xvreplgr2vr.w  xr3,    t1          //shift
+    xvreplgr2vr.w  xr4,    a7          //ox
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8_LSX src0, dst0, w  // Weight 8 pixels of one row at src0; store 6 bytes at dst0 when w == 6, otherwise 8. Uses vr1-vr4 as wx, offset, shift, ox
+    vldrepl.d      vr0,    \src0,   0      // load 8 source bytes
+    vsllwil.hu.bu  vr0,    vr0,     0      // low 8 bytes -> 8 unsigned halfwords
+    vexth.wu.hu    vr5,    vr0             // vr5 = pixels 4..7 as words
+    vsllwil.wu.hu  vr0,    vr0,     0      // vr0 = pixels 0..3 as words
+    vslli.w        vr0,    vr0,     6      // pixel << 6
+    vslli.w        vr5,    vr5,     6
+    vmul.w         vr0,    vr0,     vr1    // * wx
+    vmul.w         vr5,    vr5,     vr1
+    vadd.w         vr0,    vr0,     vr2    // + offset
+    vadd.w         vr5,    vr5,     vr2
+    vsra.w         vr0,    vr0,     vr3    // arithmetic >> shift
+    vsra.w         vr5,    vr5,     vr3
+    vadd.w         vr0,    vr0,     vr4    // + ox
+    vadd.w         vr5,    vr5,     vr4
+    vssrani.h.w    vr5,    vr0,     0      // saturating narrow to halfwords: pixels 0..3 in low half, 4..7 in high half
+    vssrani.bu.h   vr5,    vr5,     0      // saturating narrow to unsigned bytes: pixels 0..7 in bytes 0..7
+.if \w == 6
+    fst.s          f5,     \dst0,   0      // store bytes 0..3
+    vstelm.h       vr5,    \dst0,   4,     2   // store bytes 4..5 (halfword element 2)
+.else
+    fst.d          f5,     \dst0,   0      // store bytes 0..7
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8x2_LASX src0, dst0, w  // Weight 8 pixels from each of two rows (src0 and src0 + a3); store 6 (w == 6) or 8 bytes per row at dst0 and dst0 + a1. Uses xr1-xr4 as wx, offset, shift, ox; clobbers t2, t3
+    vldrepl.d      vr0,    \src0,   0      // row 0: 8 source bytes
+    add.d          t2,     \src0,   a3     // t2 = address of row 1
+    vldrepl.d      vr5,    t2,      0      // row 1: 8 source bytes
+    xvpermi.q      xr0,    xr5,     0x02   // xr0: row 0 in low 128-bit lane, row 1 in high lane
+    xvsllwil.hu.bu xr0,    xr0,     0      // per lane: 8 bytes -> 8 unsigned halfwords
+    xvexth.wu.hu   xr5,    xr0             // per lane: pixels 4..7 as words
+    xvsllwil.wu.hu xr0,    xr0,     0      // per lane: pixels 0..3 as words
+    xvslli.w       xr0,    xr0,     6      // pixel << 6
+    xvslli.w       xr5,    xr5,     6
+    xvmul.w        xr0,    xr0,     xr1    // * wx
+    xvmul.w        xr5,    xr5,     xr1
+    xvadd.w        xr0,    xr0,     xr2    // + offset
+    xvadd.w        xr5,    xr5,     xr2
+    xvsra.w        xr0,    xr0,     xr3    // arithmetic >> shift
+    xvsra.w        xr5,    xr5,     xr3
+    xvadd.w        xr0,    xr0,     xr4    // + ox
+    xvadd.w        xr5,    xr5,     xr4
+    xvssrani.h.w   xr5,    xr0,     0      // per lane: saturating narrow to 8 halfwords (pixels 0..7 of that row)
+    xvpermi.q      xr0,    xr5,     0x01   // low lane of xr0 = high lane of xr5 (row 1)
+    xvssrani.bu.h  xr0,    xr5,     0      // low lane of xr0: row 0 in bytes 0..7, row 1 in bytes 8..15
+    add.d          t3,     \dst0,   a1     // t3 = address of output row 1
+.if \w == 6
+    vstelm.w       vr0,    \dst0,   0,     0   // row 0 bytes 0..3
+    vstelm.h       vr0,    \dst0,   4,     2   // row 0 bytes 4..5
+    vstelm.w       vr0,    t3,      0,     2   // row 1 bytes 0..3
+    vstelm.h       vr0,    t3,      4,     6   // row 1 bytes 4..5
+.else
+    vstelm.d       vr0,    \dst0,   0,     0   // row 0, 8 bytes
+    vstelm.d       vr0,    t3,      0,     1   // row 1, 8 bytes
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS16_LSX src0, dst0  // Weight 16 pixels of one row at src0 and store 16 bytes at dst0. Uses vr1-vr4 as wx, offset, shift, ox
+    vld            vr0,    \src0,   0      // load 16 source bytes
+    vexth.hu.bu    vr7,    vr0             // pixels 8..15 as halfwords
+    vexth.wu.hu    vr8,    vr7             // vr8 = pixels 12..15 as words
+    vsllwil.wu.hu  vr7,    vr7,     0      // vr7 = pixels 8..11 as words
+    vsllwil.hu.bu  vr5,    vr0,     0      // pixels 0..7 as halfwords
+    vexth.wu.hu    vr6,    vr5             // vr6 = pixels 4..7 as words
+    vsllwil.wu.hu  vr5,    vr5,     0      // vr5 = pixels 0..3 as words
+    vslli.w        vr5,    vr5,     6      // pixel << 6
+    vslli.w        vr6,    vr6,     6
+    vslli.w        vr7,    vr7,     6
+    vslli.w        vr8,    vr8,     6
+    vmul.w         vr5,    vr5,     vr1    // * wx
+    vmul.w         vr6,    vr6,     vr1
+    vmul.w         vr7,    vr7,     vr1
+    vmul.w         vr8,    vr8,     vr1
+    vadd.w         vr5,    vr5,     vr2    // + offset
+    vadd.w         vr6,    vr6,     vr2
+    vadd.w         vr7,    vr7,     vr2
+    vadd.w         vr8,    vr8,     vr2
+    vsra.w         vr5,    vr5,     vr3    // arithmetic >> shift
+    vsra.w         vr6,    vr6,     vr3
+    vsra.w         vr7,    vr7,     vr3
+    vsra.w         vr8,    vr8,     vr3
+    vadd.w         vr5,    vr5,     vr4    // + ox
+    vadd.w         vr6,    vr6,     vr4
+    vadd.w         vr7,    vr7,     vr4
+    vadd.w         vr8,    vr8,     vr4
+    vssrani.h.w    vr6,    vr5,     0      // vr6 = pixels 0..7 as saturated halfwords
+    vssrani.h.w    vr8,    vr7,     0      // vr8 = pixels 8..15 as saturated halfwords
+    vssrani.bu.h   vr8,    vr6,     0      // vr8 = pixels 0..15 as saturated unsigned bytes
+    vst            vr8,    \dst0,   0      // store 16 bytes
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS16_LASX src0, dst0  // Weight 16 pixels of one row at src0 and store 16 bytes at dst0, using 256-bit registers. Uses xr1-xr4 as wx, offset, shift, ox
+    vld            vr0,    \src0,   0      // load 16 source bytes into the low 128 bits
+    xvpermi.d      xr0,    xr0,     0xd8   // move bytes 8..15 to the low 64 bits of the high lane
+    xvsllwil.hu.bu xr0,    xr0,     0      // low lane = pixels 0..7, high lane = pixels 8..15, as halfwords
+    xvexth.wu.hu   xr6,    xr0             // xr6 = pixels 4..7 | 12..15 as words
+    xvsllwil.wu.hu xr5,    xr0,     0      // xr5 = pixels 0..3 | 8..11 as words
+    xvslli.w       xr5,    xr5,     6      // pixel << 6
+    xvslli.w       xr6,    xr6,     6
+    xvmul.w        xr5,    xr5,     xr1    // * wx
+    xvmul.w        xr6,    xr6,     xr1
+    xvadd.w        xr5,    xr5,     xr2    // + offset
+    xvadd.w        xr6,    xr6,     xr2
+    xvsra.w        xr5,    xr5,     xr3    // arithmetic >> shift
+    xvsra.w        xr6,    xr6,     xr3
+    xvadd.w        xr5,    xr5,     xr4    // + ox
+    xvadd.w        xr6,    xr6,     xr4
+    xvssrani.h.w   xr6,    xr5,     0      // xr6 = pixels 0..7 | 8..15 as saturated halfwords
+    xvpermi.q      xr7,    xr6,     0x01   // low lane of xr7 = high lane of xr6 (pixels 8..15)
+    xvssrani.bu.h  xr7,    xr6,     0      // low lane of xr7 = pixels 0..15 as saturated unsigned bytes
+    vst            vr7,    \dst0,   0      // store 16 bytes
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS32_LASX src0, dst0, w  // Weight 32 bytes at once: two 16-pixel rows when w == 16 (rows at src0 and src0 + a3), otherwise one row of 24 or 32 pixels. Uses xr1-xr4 as wx, offset, shift, ox; clobbers t2 when w == 16
+.if \w == 16
+    vld            vr0,    \src0,   0      // row 0: 16 source bytes
+    add.d          t2,     \src0,   a3     // t2 = address of row 1
+    vld            vr5,    t2,      0      // row 1: 16 source bytes
+    xvpermi.q      xr0,    xr5,     0x02   // xr0: row 0 in low lane, row 1 in high lane
+.else //w=24/32
+    xvld           xr0,    \src0,   0      // load 32 source bytes of one row
+.endif
+    xvexth.hu.bu   xr7,    xr0             // per lane: bytes 8..15 as halfwords
+    xvexth.wu.hu   xr8,    xr7             // per lane: bytes 12..15 as words
+    xvsllwil.wu.hu xr7,    xr7,     0      // per lane: bytes 8..11 as words
+    xvsllwil.hu.bu xr5,    xr0,     0      // per lane: bytes 0..7 as halfwords
+    xvexth.wu.hu   xr6,    xr5             // per lane: bytes 4..7 as words
+    xvsllwil.wu.hu xr5,    xr5,     0      // per lane: bytes 0..3 as words
+    xvslli.w       xr5,    xr5,     6      // pixel << 6
+    xvslli.w       xr6,    xr6,     6
+    xvslli.w       xr7,    xr7,     6
+    xvslli.w       xr8,    xr8,     6
+    xvmul.w        xr5,    xr5,     xr1    // * wx
+    xvmul.w        xr6,    xr6,     xr1
+    xvmul.w        xr7,    xr7,     xr1
+    xvmul.w        xr8,    xr8,     xr1
+    xvadd.w        xr5,    xr5,     xr2    // + offset
+    xvadd.w        xr6,    xr6,     xr2
+    xvadd.w        xr7,    xr7,     xr2
+    xvadd.w        xr8,    xr8,     xr2
+    xvsra.w        xr5,    xr5,     xr3    // arithmetic >> shift
+    xvsra.w        xr6,    xr6,     xr3
+    xvsra.w        xr7,    xr7,     xr3
+    xvsra.w        xr8,    xr8,     xr3
+    xvadd.w        xr5,    xr5,     xr4    // + ox
+    xvadd.w        xr6,    xr6,     xr4
+    xvadd.w        xr7,    xr7,     xr4
+    xvadd.w        xr8,    xr8,     xr4
+    xvssrani.h.w   xr6,    xr5,     0      // per lane: bytes 0..7 as saturated halfwords
+    xvssrani.h.w   xr8,    xr7,     0      // per lane: bytes 8..15 as saturated halfwords
+    xvssrani.bu.h  xr8,    xr6,     0      // per lane: 16 saturated unsigned result bytes
+.if \w == 16
+    vst            vr8,    \dst0,   0      // store row 0 (low lane)
+    add.d          t2,     \dst0,   a1     // t2 = address of output row 1
+    xvpermi.q      xr8,    xr8,     0x01   // bring the high lane (row 1) into the low lane
+    vst            vr8,    t2,      0      // store row 1
+.elseif \w == 24
+    vst            vr8,    \dst0,   0      // store bytes 0..15
+    xvstelm.d      xr8,    \dst0,   16,    2   // store bytes 16..23 (doubleword element 2)
+.else
+    xvst           xr8,    \dst0,   0      // store all 32 bytes
+.endif
+.endm
+
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx  // 4-pixel-wide weighted copy (LSX), two rows per loop iteration
+    LOAD_VAR 128                           // vr1-vr4 = wx, offset, shift, ox
+    srli.w         t0,     a4,      1      // t0 = a4 (height) / 2; NOTE(review): presumably expects an even, nonzero height - confirm against callers
+.LOOP_PIXELS4:
+    vldrepl.w      vr0,    a2,      0      // row 0: 4 source bytes
+    add.d          t1,     a2,      a3     // t1 = src + src_stride
+    vldrepl.w      vr5,    t1,      0      // row 1: 4 source bytes
+    vsllwil.hu.bu  vr0,    vr0,     0      // row 0: bytes -> halfwords
+    vsllwil.wu.hu  vr0,    vr0,     0      // row 0: pixels 0..3 as words
+    vsllwil.hu.bu  vr5,    vr5,     0      // row 1: bytes -> halfwords
+    vsllwil.wu.hu  vr5,    vr5,     0      // row 1: pixels 0..3 as words
+    vslli.w        vr0,    vr0,     6      // pixel << 6
+    vslli.w        vr5,    vr5,     6
+    vmul.w         vr0,    vr0,     vr1    // * wx
+    vmul.w         vr5,    vr5,     vr1
+    vadd.w         vr0,    vr0,     vr2    // + offset
+    vadd.w         vr5,    vr5,     vr2
+    vsra.w         vr0,    vr0,     vr3    // arithmetic >> shift
+    vsra.w         vr5,    vr5,     vr3
+    vadd.w         vr0,    vr0,     vr4    // + ox
+    vadd.w         vr5,    vr5,     vr4
+    vssrani.h.w    vr5,    vr0,     0      // saturated halfwords: row 0 in low half, row 1 in high half
+    vssrani.bu.h   vr5,    vr5,     0      // saturated bytes: row 0 in bytes 0..3, row 1 in bytes 4..7
+    fst.s          f5,     a0,      0      // store row 0
+    add.d          t2,     a0,      a1     // t2 = dst + dst_stride
+    vstelm.w       vr5,    t2,      0,     1   // store row 1 (word element 1)
+    alsl.d         a2,     a3,      a2,    1   // src += 2 * src_stride
+    alsl.d         a0,     a1,      a0,    1   // dst += 2 * dst_stride
+    addi.w         t0,     t0,      -1
+    bnez           t0,     .LOOP_PIXELS4
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx  // 6-pixel-wide weighted copy (LSX), one row per loop iteration
+    LOAD_VAR 128                                   // vr1-vr4 = wx, offset, shift, ox
+.LOOP_PIXELS6:
+    HEVC_PEL_UNI_W_PIXELS8_LSX      a2,    a0,    6   // loads 8 source bytes, stores only 6 result bytes
+    add.d          a2,     a2,      a3             // src += src_stride
+    add.d          a0,     a0,      a1             // dst += dst_stride
+    addi.w         a4,     a4,      -1             // a4 = remaining rows
+    bnez           a4,     .LOOP_PIXELS6
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx  // 6-pixel-wide weighted copy (LASX), two rows per loop iteration
+    LOAD_VAR 256                                   // xr1-xr4 = wx, offset, shift, ox
+    srli.w         t0,     a4,      1              // t0 = a4 (height) / 2; NOTE(review): presumably expects an even, nonzero height - confirm against callers
+.LOOP_PIXELS6_LASX:
+    HEVC_PEL_UNI_W_PIXELS8x2_LASX   a2,    a0,    6   // two rows: 8 bytes loaded, 6 bytes stored per row
+    alsl.d         a2,     a3,      a2,    1       // src += 2 * src_stride
+    alsl.d         a0,     a1,      a0,    1       // dst += 2 * dst_stride
+    addi.w         t0,     t0,      -1
+    bnez           t0,     .LOOP_PIXELS6_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx  // 8-pixel-wide weighted copy (LSX), one row per loop iteration
+    LOAD_VAR 128                                   // vr1-vr4 = wx, offset, shift, ox
+.LOOP_PIXELS8:
+    HEVC_PEL_UNI_W_PIXELS8_LSX      a2,    a0,    8   // weight and store 8 pixels
+    add.d          a2,     a2,      a3             // src += src_stride
+    add.d          a0,     a0,      a1             // dst += dst_stride
+    addi.w         a4,     a4,      -1             // a4 = remaining rows
+    bnez           a4,     .LOOP_PIXELS8
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx  // 8-pixel-wide weighted copy (LASX), two rows per loop iteration
+    LOAD_VAR 256                                   // xr1-xr4 = wx, offset, shift, ox
+    srli.w         t0,     a4,      1              // t0 = a4 (height) / 2; NOTE(review): presumably expects an even, nonzero height - confirm against callers
+.LOOP_PIXELS8_LASX:
+    HEVC_PEL_UNI_W_PIXELS8x2_LASX   a2,    a0,    8   // two rows of 8 pixels
+    alsl.d         a2,     a3,      a2,    1       // src += 2 * src_stride
+    alsl.d         a0,     a1,      a0,    1       // dst += 2 * dst_stride
+    addi.w         t0,     t0,      -1
+    bnez           t0,     .LOOP_PIXELS8_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx  // 12-pixel-wide weighted copy (LSX), one row per loop iteration
+    LOAD_VAR 128                           // vr1-vr4 = wx, offset, shift, ox
+.LOOP_PIXELS12:
+    vld            vr0,    a2,      0      // NOTE(review): loads 16 source bytes although only 12 are used; presumably relies on readable bytes past the row - confirm
+    vexth.hu.bu    vr7,    vr0             // pixels 8..15 as halfwords
+    vsllwil.wu.hu  vr7,    vr7,     0      // vr7 = pixels 8..11 as words
+    vsllwil.hu.bu  vr5,    vr0,     0      // pixels 0..7 as halfwords
+    vexth.wu.hu    vr6,    vr5             // vr6 = pixels 4..7 as words
+    vsllwil.wu.hu  vr5,    vr5,     0      // vr5 = pixels 0..3 as words
+    vslli.w        vr5,    vr5,     6      // pixel << 6
+    vslli.w        vr6,    vr6,     6
+    vslli.w        vr7,    vr7,     6
+    vmul.w         vr5,    vr5,     vr1    // * wx
+    vmul.w         vr6,    vr6,     vr1
+    vmul.w         vr7,    vr7,     vr1
+    vadd.w         vr5,    vr5,     vr2    // + offset
+    vadd.w         vr6,    vr6,     vr2
+    vadd.w         vr7,    vr7,     vr2
+    vsra.w         vr5,    vr5,     vr3    // arithmetic >> shift
+    vsra.w         vr6,    vr6,     vr3
+    vsra.w         vr7,    vr7,     vr3
+    vadd.w         vr5,    vr5,     vr4    // + ox
+    vadd.w         vr6,    vr6,     vr4
+    vadd.w         vr7,    vr7,     vr4
+    vssrani.h.w    vr6,    vr5,     0      // vr6 = pixels 0..7 as saturated halfwords
+    vssrani.h.w    vr7,    vr7,     0      // vr7 = pixels 8..11 as saturated halfwords (duplicated in both halves)
+    vssrani.bu.h   vr7,    vr6,     0      // vr7 bytes 0..7 = pixels 0..7, bytes 8..11 = pixels 8..11
+    fst.d          f7,     a0,      0      // store pixels 0..7
+    vstelm.w       vr7,    a0,      8,     2   // store pixels 8..11 (word element 2)
+    add.d          a2,     a2,      a3     // src += src_stride
+    add.d          a0,     a0,      a1     // dst += dst_stride
+    addi.w         a4,     a4,      -1     // a4 = remaining rows
+    bnez           a4,     .LOOP_PIXELS12
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx  // 12-pixel-wide weighted copy (LASX), one row per loop iteration
+    LOAD_VAR 256                           // xr1-xr4 = wx, offset, shift, ox
+.LOOP_PIXELS12_LASX:
+    vld            vr0,    a2,      0      // NOTE(review): loads 16 source bytes although only 12 are used; presumably relies on readable bytes past the row - confirm
+    xvpermi.d      xr0,    xr0,     0xd8   // move bytes 8..15 to the low 64 bits of the high lane
+    xvsllwil.hu.bu xr0,    xr0,     0      // low lane = pixels 0..7, high lane = pixels 8..15, as halfwords
+    xvexth.wu.hu   xr6,    xr0             // xr6 = pixels 4..7 | 12..15 as words
+    xvsllwil.wu.hu xr5,    xr0,     0      // xr5 = pixels 0..3 | 8..11 as words
+    xvslli.w       xr5,    xr5,     6      // pixel << 6
+    xvslli.w       xr6,    xr6,     6
+    xvmul.w        xr5,    xr5,     xr1    // * wx
+    xvmul.w        xr6,    xr6,     xr1
+    xvadd.w        xr5,    xr5,     xr2    // + offset
+    xvadd.w        xr6,    xr6,     xr2
+    xvsra.w        xr5,    xr5,     xr3    // arithmetic >> shift
+    xvsra.w        xr6,    xr6,     xr3
+    xvadd.w        xr5,    xr5,     xr4    // + ox
+    xvadd.w        xr6,    xr6,     xr4
+    xvssrani.h.w   xr6,    xr5,     0      // xr6 = pixels 0..7 | 8..15 as saturated halfwords
+    xvpermi.q      xr7,    xr6,     0x01   // low lane of xr7 = high lane of xr6 (pixels 8..15)
+    xvssrani.bu.h  xr7,    xr6,     0      // low lane of xr7 = pixels 0..15 as saturated unsigned bytes
+    fst.d          f7,     a0,      0      // store pixels 0..7
+    vstelm.w       vr7,    a0,      8,     2   // store pixels 8..11 (word element 2)
+    add.d          a2,     a2,      a3     // src += src_stride
+    add.d          a0,     a0,      a1     // dst += dst_stride
+    addi.w         a4,     a4,      -1     // a4 = remaining rows
+    bnez           a4,     .LOOP_PIXELS12_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx  // 16-pixel-wide weighted copy (LSX), one row per loop iteration
+    LOAD_VAR 128                                   // vr1-vr4 = wx, offset, shift, ox
+.LOOP_PIXELS16:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0      // weight and store 16 pixels
+    add.d          a2,     a2,      a3             // src += src_stride
+    add.d          a0,     a0,      a1             // dst += dst_stride
+    addi.w         a4,     a4,      -1             // a4 = remaining rows
+    bnez           a4,     .LOOP_PIXELS16
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx  // 16-pixel-wide weighted copy (LASX), two rows per loop iteration
+    LOAD_VAR 256                                   // xr1-xr4 = wx, offset, shift, ox
+    srli.w         t0,     a4,      1              // t0 = a4 (height) / 2; NOTE(review): presumably expects an even, nonzero height - confirm against callers
+.LOOP_PIXELS16_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,   16   // w == 16 selects the two-row mode of the macro
+    alsl.d         a2,     a3,      a2,    1       // src += 2 * src_stride
+    alsl.d         a0,     a1,      a0,    1       // dst += 2 * dst_stride
+    addi.w         t0,     t0,      -1
+    bnez           t0,     .LOOP_PIXELS16_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx  // 24-pixel-wide weighted copy (LSX), one row per loop iteration, done as 16 + 8 pixels
+    LOAD_VAR 128                                   // vr1-vr4 = wx, offset, shift, ox
+.LOOP_PIXELS24:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0      // pixels 0..15
+    addi.d         t0,     a2,      16             // t0 = src + 16
+    addi.d         t1,     a0,      16             // t1 = dst + 16
+    HEVC_PEL_UNI_W_PIXELS8_LSX      t0,    t1,   8 // pixels 16..23
+    add.d          a2,     a2,      a3             // src += src_stride
+    add.d          a0,     a0,      a1             // dst += dst_stride
+    addi.w         a4,     a4,      -1             // a4 = remaining rows
+    bnez           a4,     .LOOP_PIXELS24
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx  // 24-pixel-wide weighted copy (LASX), one row per loop iteration
+    LOAD_VAR 256                                   // xr1-xr4 = wx, offset, shift, ox
+.LOOP_PIXELS24_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,   24   // NOTE(review): the macro loads 32 source bytes but stores 24; presumably relies on readable bytes past the row - confirm
+    add.d          a2,     a2,      a3             // src += src_stride
+    add.d          a0,     a0,      a1             // dst += dst_stride
+    addi.w         a4,     a4,      -1             // a4 = remaining rows
+    bnez           a4,     .LOOP_PIXELS24_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx  // 32-pixel-wide weighted copy (LSX), one row per loop iteration, done as 2 x 16 pixels
+    LOAD_VAR 128                                   // vr1-vr4 = wx, offset, shift, ox
+.LOOP_PIXELS32:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0      // pixels 0..15
+    addi.d         t0,     a2,      16             // t0 = src + 16
+    addi.d         t1,     a0,      16             // t1 = dst + 16
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1      // pixels 16..31
+    add.d          a2,     a2,      a3             // src += src_stride
+    add.d          a0,     a0,      a1             // dst += dst_stride
+    addi.w         a4,     a4,      -1             // a4 = remaining rows
+    bnez           a4,     .LOOP_PIXELS32
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx  // 32-pixel-wide weighted copy (LASX), one row per loop iteration
+    LOAD_VAR 256                                   // xr1-xr4 = wx, offset, shift, ox
+.LOOP_PIXELS32_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,    32   // weight and store 32 pixels
+    add.d          a2,     a2,      a3             // src += src_stride
+    add.d          a0,     a0,      a1             // dst += dst_stride
+    addi.w         a4,     a4,      -1             // a4 = remaining rows
+    bnez           a4,     .LOOP_PIXELS32_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx  // 48-pixel-wide weighted copy (LSX), one row per loop iteration, done as 3 x 16 pixels
+    LOAD_VAR 128                                   // vr1-vr4 = wx, offset, shift, ox
+.LOOP_PIXELS48:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0      // pixels 0..15
+    addi.d         t0,     a2,      16             // t0 = src + 16
+    addi.d         t1,     a0,      16             // t1 = dst + 16
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1      // pixels 16..31
+    addi.d         t0,     a2,      32             // t0 = src + 32
+    addi.d         t1,     a0,      32             // t1 = dst + 32
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1      // pixels 32..47
+    add.d          a2,     a2,      a3             // src += src_stride
+    add.d          a0,     a0,      a1             // dst += dst_stride
+    addi.w         a4,     a4,      -1             // a4 = remaining rows
+    bnez           a4,     .LOOP_PIXELS48
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx  // 48-pixel-wide weighted copy (LASX), one row per loop iteration, done as 32 + 16 pixels
+    LOAD_VAR 256                                   // xr1-xr4 = wx, offset, shift, ox
+.LOOP_PIXELS48_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,    32   // pixels 0..31
+    addi.d         t0,     a2,      32             // t0 = src + 32
+    addi.d         t1,     a0,      32             // t1 = dst + 32
+    HEVC_PEL_UNI_W_PIXELS16_LASX    t0,    t1      // pixels 32..47
+    add.d          a2,     a2,      a3             // src += src_stride
+    add.d          a0,     a0,      a1             // dst += dst_stride
+    addi.w         a4,     a4,      -1             // a4 = remaining rows
+    bnez           a4,     .LOOP_PIXELS48_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx  // 64-pixel-wide weighted copy (LSX), one row per loop iteration, done as 4 x 16 pixels
+    LOAD_VAR 128                                   // vr1-vr4 = wx, offset, shift, ox
+.LOOP_PIXELS64:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0      // pixels 0..15
+    addi.d         t0,     a2,      16             // t0 = src + 16
+    addi.d         t1,     a0,      16             // t1 = dst + 16
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1      // pixels 16..31
+    addi.d         t0,     a2,      32             // t0 = src + 32
+    addi.d         t1,     a0,      32             // t1 = dst + 32
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1      // pixels 32..47
+    addi.d         t0,     a2,      48             // t0 = src + 48
+    addi.d         t1,     a0,      48             // t1 = dst + 48
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1      // pixels 48..63
+    add.d          a2,     a2,      a3             // src += src_stride
+    add.d          a0,     a0,      a1             // dst += dst_stride
+    addi.w         a4,     a4,      -1             // a4 = remaining rows
+    bnez           a4,     .LOOP_PIXELS64
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx  // 64-pixel-wide weighted copy (LASX), one row per loop iteration, done as 2 x 32 pixels
+    LOAD_VAR 256                                   // xr1-xr4 = wx, offset, shift, ox
+.LOOP_PIXELS64_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,    32   // pixels 0..31
+    addi.d         t0,     a2,      32             // t0 = src + 32
+    addi.d         t1,     a0,      32             // t1 = dst + 32
+    HEVC_PEL_UNI_W_PIXELS32_LASX    t0,    t1,    32   // pixels 32..63
+    add.d          a2,     a2,      a3             // src += src_stride
+    add.d          a0,     a0,      a1             // dst += dst_stride
+    addi.w         a4,     a4,      -1             // a4 = remaining rows
+    bnez           a4,     .LOOP_PIXELS64_LASX
+endfunc
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index a8f753dc86..d0ee99d6b5 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -22,6 +22,7 @@
 
 #include "libavutil/loongarch/cpu.h"
 #include "hevcdsp_lsx.h"
+#include "hevcdsp_lasx.h"
 
 void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
 {
@@ -160,6 +161,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_lsx;
             c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_lsx;
 
+            c->put_hevc_qpel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
+            c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
             c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
             c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
             c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
@@ -196,4 +217,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx;
         }
     }
+
+    if (have_lasx(cpu_flags)) {
+        if (bit_depth == 8) {
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+        }
+    }
 }
diff --git a/libavcodec/loongarch/hevcdsp_lasx.h b/libavcodec/loongarch/hevcdsp_lasx.h
new file mode 100644
index 0000000000..819c3c3ecf
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_lasx.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+#define AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+
+#include "libavcodec/hevcdsp.h"
+
+#define PEL_UNI_W(PEL, DIR, WIDTH)                                       \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lasx(uint8_t *dst,  \
+                                                          ptrdiff_t      \
+                                                          dst_stride,    \
+                                                          const uint8_t *src,  \
+                                                          ptrdiff_t      \
+                                                          src_stride,    \
+                                                          int height,    \
+                                                          int denom,     \
+                                                          int wx,        \
+                                                          int ox,        \
+                                                          intptr_t mx,   \
+                                                          intptr_t my,   \
+                                                          int width)
+
+PEL_UNI_W(pel, pixels, 6);   // declares ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx()
+PEL_UNI_W(pel, pixels, 8);   // declares ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx()
+PEL_UNI_W(pel, pixels, 12);  // declares ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx()
+PEL_UNI_W(pel, pixels, 16);  // declares ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx()
+PEL_UNI_W(pel, pixels, 24);  // declares ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx()
+PEL_UNI_W(pel, pixels, 32);  // declares ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx()
+PEL_UNI_W(pel, pixels, 48);  // declares ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx()
+PEL_UNI_W(pel, pixels, 64);  // declares ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx()
+
+#undef PEL_UNI_W  // the prototype generator is only used for the declarations above
+
+#endif  // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index ac509984fd..0d724a90ef 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -232,4 +232,31 @@ void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t s
 void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 
+#define PEL_UNI_W(PEL, DIR, WIDTH)                                      \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lsx(uint8_t *dst,  \
+                                                         ptrdiff_t      \
+                                                         dst_stride,    \
+                                                         const uint8_t *src,  \
+                                                         ptrdiff_t      \
+                                                         src_stride,    \
+                                                         int height,    \
+                                                         int denom,     \
+                                                         int wx,        \
+                                                         int ox,        \
+                                                         intptr_t mx,   \
+                                                         intptr_t my,   \
+                                                         int width)
+
+PEL_UNI_W(pel, pixels, 4);   // declares ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx()
+PEL_UNI_W(pel, pixels, 6);   // declares ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx()
+PEL_UNI_W(pel, pixels, 8);   // declares ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx()
+PEL_UNI_W(pel, pixels, 12);  // declares ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx()
+PEL_UNI_W(pel, pixels, 16);  // declares ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx()
+PEL_UNI_W(pel, pixels, 24);  // declares ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx()
+PEL_UNI_W(pel, pixels, 32);  // declares ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx()
+PEL_UNI_W(pel, pixels, 48);  // declares ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx()
+PEL_UNI_W(pel, pixels, 64);  // declares ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx()
+
+#undef PEL_UNI_W  // the prototype generator is only used for the declarations above
+
 #endif  // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
-- 
2.20.1