h264qpel_lasx.c
1 /*
2  * Loongson LASX optimized h264qpel
3  *
4  * Copyright (c) 2020 Loongson Technology Corporation Limited
5  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "h264qpel_lasx.h"
25 #include "libavutil/loongarch/loongson_intrinsics.h"
26 #include "libavutil/attributes.h"
27 
28 static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = {
29  /* 8 width cases */
30  0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
31  0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
32  1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
33  1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
34  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
35  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
36 };
37 
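/*
 * Each pair of mask rows above feeds one __lasx_xvshuf_b in
 * AVC_HORZ_FILTER_SH below: for each of the eight output positions j the
 * masks gather the sample pairs (j, j+5), (j+1, j+4) and (j+2, j+3), i.e.
 * the (A,F), (B,E) and (C,D) taps of the 6-tap luma filter.
 */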
38 #define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \
39 ( { \
40  __m256i out0_m; \
41  __m256i tmp0_m; \
42  \
43  tmp0_m = __lasx_xvshuf_b(in1, in0, mask0); \
44  out0_m = __lasx_xvhaddw_h_b(tmp0_m, tmp0_m); \
45  tmp0_m = __lasx_xvshuf_b(in1, in0, mask1); \
46  out0_m = __lasx_xvdp2add_h_b(out0_m, minus5b, tmp0_m); \
47  tmp0_m = __lasx_xvshuf_b(in1, in0, mask2); \
48  out0_m = __lasx_xvdp2add_h_b(out0_m, plus20b, tmp0_m); \
49  \
50  out0_m; \
51 } )
52 
53 #define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
54 ( { \
55  __m256i out0_m; \
56  \
57  out0_m = __lasx_xvdp2_h_b(in0, coeff0); \
58  DUP2_ARG3(__lasx_xvdp2add_h_b, out0_m, in1, coeff1, out0_m,\
59  in2, coeff2, out0_m, out0_m); \
60  \
61  out0_m; \
62 } )
63 
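/*
 * For reference, the two macros above vectorize H.264's 6-tap half-sample
 * luma filter, whose scalar form per output sample is
 * A - 5*B + 20*C + 20*D - 5*E + F.  The sketch below is illustrative only
 * (h264_luma_6tap_ref is not an FFmpeg helper) and omits the rounding and
 * shifting that the callers apply afterwards.
 */
static av_unused int h264_luma_6tap_ref(const uint8_t *p)
{
    /* p points at sample A of six consecutive samples A..F */
    return p[0] - 5 * p[1] + 20 * p[2] + 20 * p[3] - 5 * p[4] + p[5];
}
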
64 static av_always_inline void
65 avc_luma_hv_qrt_and_aver_dst_16x16_lasx(uint8_t *src_x,
66  uint8_t *src_y,
67  uint8_t *dst, ptrdiff_t stride)
68 {
69  const int16_t filt_const0 = 0xfb01;
70  const int16_t filt_const1 = 0x1414;
71  const int16_t filt_const2 = 0x1fb;
72  uint32_t loop_cnt;
73  ptrdiff_t stride_2x = stride << 1;
74  ptrdiff_t stride_3x = stride_2x + stride;
75  ptrdiff_t stride_4x = stride << 2;
76  __m256i tmp0, tmp1;
77  __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
78  __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
79  __m256i src_vt7, src_vt8;
80  __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
81  __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
82  __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
83  __m256i vt_out3, out0, out1, out2, out3;
84  __m256i minus5b = __lasx_xvldi(0xFB);
85  __m256i plus20b = __lasx_xvldi(20);
86 
87  filt0 = __lasx_xvreplgr2vr_h(filt_const0);
88  filt1 = __lasx_xvreplgr2vr_h(filt_const1);
89  filt2 = __lasx_xvreplgr2vr_h(filt_const2);
90 
91  mask0 = __lasx_xvld(luma_mask_arr, 0);
92  DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2);
93  src_vt0 = __lasx_xvld(src_y, 0);
94  DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
95  src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);
96  src_y += stride_4x;
97 
98  src_vt0 = __lasx_xvxori_b(src_vt0, 128);
99  DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
100  src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);
101 
102  for (loop_cnt = 4; loop_cnt--;) {
103  src_hz0 = __lasx_xvld(src_x, 0);
104  DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x,
105  src_hz1, src_hz2);
106  src_hz3 = __lasx_xvldx(src_x, stride_3x);
107  src_x += stride_4x;
108  src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
109  src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
110  src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
111  src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
112  DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
113  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);
114 
115  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
116  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
117  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
118  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
119  hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
120  hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);
121 
122  DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x,
123  src_y, stride_3x, src_y, stride_4x,
124  src_vt5, src_vt6, src_vt7, src_vt8);
125  src_y += stride_4x;
126 
127  DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
128  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);
129 
130  DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
131  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
132  src_vt0, src_vt1, src_vt2, src_vt3);
133  src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
134  DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
135  src_vt3, src_vt2, src_vt87_h, src_vt3,
136  src_hz0, src_hz1, src_hz2, src_hz3);
137  DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
138  src_vt3, src_vt2, src_vt87_h, src_vt3,
139  src_vt0, src_vt1, src_vt2, src_vt3);
140  DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1, src_hz1,
141  0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3, 0x02,
142  src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
143  DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1, src_hz1,
144  0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3, 0x13,
145  src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
146  vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h, filt0,
147  filt1, filt2);
148  vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h, filt0,
149  filt1, filt2);
150  vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h, filt0,
151  filt1, filt2);
152  vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h, filt0,
153  filt1, filt2);
154  vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
155  vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);
156 
157  DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
158  out0, out2);
159  DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
160  out1, out3);
161  tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
162  tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);
163 
164  DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
165  out0 = __lasx_xvld(dst, 0);
166  DUP2_ARG2(__lasx_xvldx, dst, stride, dst, stride_2x, out1, out2);
167  out3 = __lasx_xvldx(dst, stride_3x);
168  out0 = __lasx_xvpermi_q(out0, out2, 0x02);
169  out1 = __lasx_xvpermi_q(out1, out3, 0x02);
170  out2 = __lasx_xvilvl_d(out1, out0);
171  out3 = __lasx_xvilvh_d(out1, out0);
172  out0 = __lasx_xvpermi_q(out2, out3, 0x02);
173  out1 = __lasx_xvpermi_q(out2, out3, 0x13);
174  tmp0 = __lasx_xvavgr_bu(out0, tmp0);
175  tmp1 = __lasx_xvavgr_bu(out1, tmp1);
176 
177  __lasx_xvstelm_d(tmp0, dst, 0, 0);
178  __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
179  __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
180  __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);
181 
182  __lasx_xvstelm_d(tmp0, dst, 8, 2);
183  __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
184  __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
185  __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);
186 
187  dst += stride_4x;
188  src_vt0 = src_vt4;
189  src_vt1 = src_vt5;
190  src_vt2 = src_vt6;
191  src_vt3 = src_vt7;
192  src_vt4 = src_vt8;
193  }
194 }
195 
196 static av_always_inline void
197 avc_luma_hv_qrt_16x16_lasx(uint8_t *src_x, uint8_t *src_y,
198  uint8_t *dst, ptrdiff_t stride)
199 {
200  const int16_t filt_const0 = 0xfb01;
201  const int16_t filt_const1 = 0x1414;
202  const int16_t filt_const2 = 0x1fb;
203  uint32_t loop_cnt;
204  ptrdiff_t stride_2x = stride << 1;
205  ptrdiff_t stride_3x = stride_2x + stride;
206  ptrdiff_t stride_4x = stride << 2;
207  __m256i tmp0, tmp1;
208  __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
209  __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
210  __m256i src_vt7, src_vt8;
211  __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
212  __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
213  __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
214  __m256i vt_out3, out0, out1, out2, out3;
215  __m256i minus5b = __lasx_xvldi(0xFB);
216  __m256i plus20b = __lasx_xvldi(20);
217 
218  filt0 = __lasx_xvreplgr2vr_h(filt_const0);
219  filt1 = __lasx_xvreplgr2vr_h(filt_const1);
220  filt2 = __lasx_xvreplgr2vr_h(filt_const2);
221 
222  mask0 = __lasx_xvld(luma_mask_arr, 0);
223  DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2);
224  src_vt0 = __lasx_xvld(src_y, 0);
225  DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
226  src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);
227  src_y += stride_4x;
228 
229  src_vt0 = __lasx_xvxori_b(src_vt0, 128);
230  DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
231  src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);
232 
233  for (loop_cnt = 4; loop_cnt--;) {
234  src_hz0 = __lasx_xvld(src_x, 0);
235  DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x,
236  src_hz1, src_hz2);
237  src_hz3 = __lasx_xvldx(src_x, stride_3x);
238  src_x += stride_4x;
239  src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
240  src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
241  src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
242  src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
243  DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
244  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);
245 
246  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
247  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
248  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
249  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
250  hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
251  hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);
252 
253  DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x,
254  src_y, stride_3x, src_y, stride_4x,
255  src_vt5, src_vt6, src_vt7, src_vt8);
256  src_y += stride_4x;
257 
258  DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
259  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);
260  DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
261  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
262  src_vt0, src_vt1, src_vt2, src_vt3);
263  src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
264  DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
265  src_vt3, src_vt2, src_vt87_h, src_vt3,
266  src_hz0, src_hz1, src_hz2, src_hz3);
267  DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
268  src_vt3, src_vt2, src_vt87_h, src_vt3,
269  src_vt0, src_vt1, src_vt2, src_vt3);
270  DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1,
271  src_hz1, 0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3,
272  0x02, src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
273  DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1,
274  src_hz1, 0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3,
275  0x13, src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
276 
277  vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h,
278  filt0, filt1, filt2);
279  vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h,
280  filt0, filt1, filt2);
281  vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h,
282  filt0, filt1, filt2);
283  vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h,
284  filt0, filt1, filt2);
285  vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
286  vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);
287 
288  DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
289  out0, out2);
290  DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
291  out1, out3);
292  tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
293  tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);
294 
295  DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
296  __lasx_xvstelm_d(tmp0, dst, 0, 0);
297  __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
298  __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
299  __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);
300 
301  __lasx_xvstelm_d(tmp0, dst, 8, 2);
302  __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
303  __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
304  __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);
305 
306  dst += stride_4x;
307  src_vt0 = src_vt4;
308  src_vt1 = src_vt5;
309  src_vt2 = src_vt6;
310  src_vt3 = src_vt7;
311  src_vt4 = src_vt8;
312  }
313 }
314 
315 /* put_pixels8_8_inline_asm: dst = src */
316 static av_always_inline void
317 put_pixels8_8_inline_asm(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
318 {
319  uint64_t tmp[8];
320  ptrdiff_t stride_2, stride_3, stride_4;
321  __asm__ volatile (
322  "slli.d %[stride_2], %[stride], 1 \n\t"
323  "add.d %[stride_3], %[stride_2], %[stride] \n\t"
324  "slli.d %[stride_4], %[stride_2], 1 \n\t"
325  "ld.d %[tmp0], %[src], 0x0 \n\t"
326  "ldx.d %[tmp1], %[src], %[stride] \n\t"
327  "ldx.d %[tmp2], %[src], %[stride_2] \n\t"
328  "ldx.d %[tmp3], %[src], %[stride_3] \n\t"
329  "add.d %[src], %[src], %[stride_4] \n\t"
330  "ld.d %[tmp4], %[src], 0x0 \n\t"
331  "ldx.d %[tmp5], %[src], %[stride] \n\t"
332  "ldx.d %[tmp6], %[src], %[stride_2] \n\t"
333  "ldx.d %[tmp7], %[src], %[stride_3] \n\t"
334 
335  "st.d %[tmp0], %[dst], 0x0 \n\t"
336  "stx.d %[tmp1], %[dst], %[stride] \n\t"
337  "stx.d %[tmp2], %[dst], %[stride_2] \n\t"
338  "stx.d %[tmp3], %[dst], %[stride_3] \n\t"
339  "add.d %[dst], %[dst], %[stride_4] \n\t"
340  "st.d %[tmp4], %[dst], 0x0 \n\t"
341  "stx.d %[tmp5], %[dst], %[stride] \n\t"
342  "stx.d %[tmp6], %[dst], %[stride_2] \n\t"
343  "stx.d %[tmp7], %[dst], %[stride_3] \n\t"
344  : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
345  [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
346  [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
347  [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]),
348  [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
349  [stride_4]"=&r"(stride_4),
350  [dst]"+&r"(dst), [src]"+&r"(src)
351  : [stride]"r"(stride)
352  : "memory"
353  );
354 }
355 
356 /* avg_pixels8_8_lsx : dst = avg(src, dst)
357  * put_pixels8_l2_8_lsx: dst = avg(src, half), where the half buffer has a stride of 8.
358  * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst), same half buffer layout. */
359 static av_always_inline void
360 avg_pixels8_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
361 {
362  uint8_t *tmp = dst;
363  ptrdiff_t stride_2, stride_3, stride_4;
364  __asm__ volatile (
365  /* h0~h7 */
366  "slli.d %[stride_2], %[stride], 1 \n\t"
367  "add.d %[stride_3], %[stride_2], %[stride] \n\t"
368  "slli.d %[stride_4], %[stride_2], 1 \n\t"
369  "vld $vr0, %[src], 0 \n\t"
370  "vldx $vr1, %[src], %[stride] \n\t"
371  "vldx $vr2, %[src], %[stride_2] \n\t"
372  "vldx $vr3, %[src], %[stride_3] \n\t"
373  "add.d %[src], %[src], %[stride_4] \n\t"
374  "vld $vr4, %[src], 0 \n\t"
375  "vldx $vr5, %[src], %[stride] \n\t"
376  "vldx $vr6, %[src], %[stride_2] \n\t"
377  "vldx $vr7, %[src], %[stride_3] \n\t"
378 
379  "vld $vr8, %[tmp], 0 \n\t"
380  "vldx $vr9, %[tmp], %[stride] \n\t"
381  "vldx $vr10, %[tmp], %[stride_2] \n\t"
382  "vldx $vr11, %[tmp], %[stride_3] \n\t"
383  "add.d %[tmp], %[tmp], %[stride_4] \n\t"
384  "vld $vr12, %[tmp], 0 \n\t"
385  "vldx $vr13, %[tmp], %[stride] \n\t"
386  "vldx $vr14, %[tmp], %[stride_2] \n\t"
387  "vldx $vr15, %[tmp], %[stride_3] \n\t"
388 
389  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
390  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
391  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
392  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
393  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
394  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
395  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
396  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
397 
398  "vstelm.d $vr0, %[dst], 0, 0 \n\t"
399  "add.d %[dst], %[dst], %[stride] \n\t"
400  "vstelm.d $vr1, %[dst], 0, 0 \n\t"
401  "add.d %[dst], %[dst], %[stride] \n\t"
402  "vstelm.d $vr2, %[dst], 0, 0 \n\t"
403  "add.d %[dst], %[dst], %[stride] \n\t"
404  "vstelm.d $vr3, %[dst], 0, 0 \n\t"
405  "add.d %[dst], %[dst], %[stride] \n\t"
406  "vstelm.d $vr4, %[dst], 0, 0 \n\t"
407  "add.d %[dst], %[dst], %[stride] \n\t"
408  "vstelm.d $vr5, %[dst], 0, 0 \n\t"
409  "add.d %[dst], %[dst], %[stride] \n\t"
410  "vstelm.d $vr6, %[dst], 0, 0 \n\t"
411  "add.d %[dst], %[dst], %[stride] \n\t"
412  "vstelm.d $vr7, %[dst], 0, 0 \n\t"
413  : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src),
414  [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
415  [stride_4]"=&r"(stride_4)
416  : [stride]"r"(stride)
417  : "memory"
418  );
419 }
420 
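/*
 * Plain-C sketch of the copy/average family implemented with LSX inline
 * assembly in this file: "put" copies, "avg" takes the rounding average
 * with dst (what vavgr.bu does per byte), and the _l2 variants additionally
 * average with a temporary "half" buffer.  The names avg_ref and
 * put_pixels8_ref are assumptions made for this illustration only.
 */
static av_unused uint8_t avg_ref(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1); /* rounded unsigned average */
}

static av_unused void put_pixels8_ref(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride)
{
    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++)
            dst[j] = src[j];
        dst += stride;
        src += stride;
    }
}
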
421 /* put_pixels8_l2_8_lsx: dst = avg(src, half),
422  * where the half buffer is 8x8 with a stride of 8.
423  */
424 static av_always_inline void
425 put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
426  ptrdiff_t dstStride, ptrdiff_t srcStride)
427 {
428  ptrdiff_t stride_2, stride_3, stride_4;
429  __asm__ volatile (
430  /* h0~h7 */
431  "slli.d %[stride_2], %[srcStride], 1 \n\t"
432  "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
433  "slli.d %[stride_4], %[stride_2], 1 \n\t"
434  "vld $vr0, %[src], 0 \n\t"
435  "vldx $vr1, %[src], %[srcStride] \n\t"
436  "vldx $vr2, %[src], %[stride_2] \n\t"
437  "vldx $vr3, %[src], %[stride_3] \n\t"
438  "add.d %[src], %[src], %[stride_4] \n\t"
439  "vld $vr4, %[src], 0 \n\t"
440  "vldx $vr5, %[src], %[srcStride] \n\t"
441  "vldx $vr6, %[src], %[stride_2] \n\t"
442  "vldx $vr7, %[src], %[stride_3] \n\t"
443 
444  "vld $vr8, %[half], 0x00 \n\t"
445  "vld $vr9, %[half], 0x08 \n\t"
446  "vld $vr10, %[half], 0x10 \n\t"
447  "vld $vr11, %[half], 0x18 \n\t"
448  "vld $vr12, %[half], 0x20 \n\t"
449  "vld $vr13, %[half], 0x28 \n\t"
450  "vld $vr14, %[half], 0x30 \n\t"
451  "vld $vr15, %[half], 0x38 \n\t"
452 
453  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
454  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
455  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
456  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
457  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
458  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
459  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
460  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
461 
462  "vstelm.d $vr0, %[dst], 0, 0 \n\t"
463  "add.d %[dst], %[dst], %[dstStride] \n\t"
464  "vstelm.d $vr1, %[dst], 0, 0 \n\t"
465  "add.d %[dst], %[dst], %[dstStride] \n\t"
466  "vstelm.d $vr2, %[dst], 0, 0 \n\t"
467  "add.d %[dst], %[dst], %[dstStride] \n\t"
468  "vstelm.d $vr3, %[dst], 0, 0 \n\t"
469  "add.d %[dst], %[dst], %[dstStride] \n\t"
470  "vstelm.d $vr4, %[dst], 0, 0 \n\t"
471  "add.d %[dst], %[dst], %[dstStride] \n\t"
472  "vstelm.d $vr5, %[dst], 0, 0 \n\t"
473  "add.d %[dst], %[dst], %[dstStride] \n\t"
474  "vstelm.d $vr6, %[dst], 0, 0 \n\t"
475  "add.d %[dst], %[dst], %[dstStride] \n\t"
476  "vstelm.d $vr7, %[dst], 0, 0 \n\t"
477  : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
478  [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
479  [stride_4]"=&r"(stride_4)
480  : [srcStride]"r"(srcStride), [dstStride]"r"(dstStride)
481  : "memory"
482  );
483 }
484 
485 /* avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst),
486  * where the half buffer is 8x8 with a stride of 8.
487  */
488 static av_always_inline void
489 avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
490  ptrdiff_t dstStride, ptrdiff_t srcStride)
491 {
492  uint8_t *tmp = dst;
493  ptrdiff_t stride_2, stride_3, stride_4;
494  __asm__ volatile (
495  /* h0~h7 */
496  "slli.d %[stride_2], %[srcStride], 1 \n\t"
497  "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
498  "slli.d %[stride_4], %[stride_2], 1 \n\t"
499  "vld $vr0, %[src], 0 \n\t"
500  "vldx $vr1, %[src], %[srcStride] \n\t"
501  "vldx $vr2, %[src], %[stride_2] \n\t"
502  "vldx $vr3, %[src], %[stride_3] \n\t"
503  "add.d %[src], %[src], %[stride_4] \n\t"
504  "vld $vr4, %[src], 0 \n\t"
505  "vldx $vr5, %[src], %[srcStride] \n\t"
506  "vldx $vr6, %[src], %[stride_2] \n\t"
507  "vldx $vr7, %[src], %[stride_3] \n\t"
508 
509  "vld $vr8, %[half], 0x00 \n\t"
510  "vld $vr9, %[half], 0x08 \n\t"
511  "vld $vr10, %[half], 0x10 \n\t"
512  "vld $vr11, %[half], 0x18 \n\t"
513  "vld $vr12, %[half], 0x20 \n\t"
514  "vld $vr13, %[half], 0x28 \n\t"
515  "vld $vr14, %[half], 0x30 \n\t"
516  "vld $vr15, %[half], 0x38 \n\t"
517 
518  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
519  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
520  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
521  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
522  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
523  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
524  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
525  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
526 
527  "slli.d %[stride_2], %[dstStride], 1 \n\t"
528  "add.d %[stride_3], %[stride_2], %[dstStride] \n\t"
529  "slli.d %[stride_4], %[stride_2], 1 \n\t"
530  "vld $vr8, %[tmp], 0 \n\t"
531  "vldx $vr9, %[tmp], %[dstStride] \n\t"
532  "vldx $vr10, %[tmp], %[stride_2] \n\t"
533  "vldx $vr11, %[tmp], %[stride_3] \n\t"
534  "add.d %[tmp], %[tmp], %[stride_4] \n\t"
535  "vld $vr12, %[tmp], 0 \n\t"
536  "vldx $vr13, %[tmp], %[dstStride] \n\t"
537  "vldx $vr14, %[tmp], %[stride_2] \n\t"
538  "vldx $vr15, %[tmp], %[stride_3] \n\t"
539 
540  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
541  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
542  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
543  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
544  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
545  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
546  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
547  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
548 
549  "vstelm.d $vr0, %[dst], 0, 0 \n\t"
550  "add.d %[dst], %[dst], %[dstStride] \n\t"
551  "vstelm.d $vr1, %[dst], 0, 0 \n\t"
552  "add.d %[dst], %[dst], %[dstStride] \n\t"
553  "vstelm.d $vr2, %[dst], 0, 0 \n\t"
554  "add.d %[dst], %[dst], %[dstStride] \n\t"
555  "vstelm.d $vr3, %[dst], 0, 0 \n\t"
556  "add.d %[dst], %[dst], %[dstStride] \n\t"
557  "vstelm.d $vr4, %[dst], 0, 0 \n\t"
558  "add.d %[dst], %[dst], %[dstStride] \n\t"
559  "vstelm.d $vr5, %[dst], 0, 0 \n\t"
560  "add.d %[dst], %[dst], %[dstStride] \n\t"
561  "vstelm.d $vr6, %[dst], 0, 0 \n\t"
562  "add.d %[dst], %[dst], %[dstStride] \n\t"
563  "vstelm.d $vr7, %[dst], 0, 0 \n\t"
564  : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half),
565  [src]"+&r"(src), [stride_2]"=&r"(stride_2),
566  [stride_3]"=&r"(stride_3), [stride_4]"=&r"(stride_4)
567  : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
568  : "memory"
569  );
570 }
571 
572 /* put_pixels16_8_lsx: dst = src */
573 static av_always_inline void
574 put_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
575 {
576  ptrdiff_t stride_2, stride_3, stride_4;
577  __asm__ volatile (
578  "slli.d %[stride_2], %[stride], 1 \n\t"
579  "add.d %[stride_3], %[stride_2], %[stride] \n\t"
580  "slli.d %[stride_4], %[stride_2], 1 \n\t"
581  "vld $vr0, %[src], 0 \n\t"
582  "vldx $vr1, %[src], %[stride] \n\t"
583  "vldx $vr2, %[src], %[stride_2] \n\t"
584  "vldx $vr3, %[src], %[stride_3] \n\t"
585  "add.d %[src], %[src], %[stride_4] \n\t"
586  "vld $vr4, %[src], 0 \n\t"
587  "vldx $vr5, %[src], %[stride] \n\t"
588  "vldx $vr6, %[src], %[stride_2] \n\t"
589  "vldx $vr7, %[src], %[stride_3] \n\t"
590  "add.d %[src], %[src], %[stride_4] \n\t"
591 
592  "vst $vr0, %[dst], 0 \n\t"
593  "vstx $vr1, %[dst], %[stride] \n\t"
594  "vstx $vr2, %[dst], %[stride_2] \n\t"
595  "vstx $vr3, %[dst], %[stride_3] \n\t"
596  "add.d %[dst], %[dst], %[stride_4] \n\t"
597  "vst $vr4, %[dst], 0 \n\t"
598  "vstx $vr5, %[dst], %[stride] \n\t"
599  "vstx $vr6, %[dst], %[stride_2] \n\t"
600  "vstx $vr7, %[dst], %[stride_3] \n\t"
601  "add.d %[dst], %[dst], %[stride_4] \n\t"
602 
603  "vld $vr0, %[src], 0 \n\t"
604  "vldx $vr1, %[src], %[stride] \n\t"
605  "vldx $vr2, %[src], %[stride_2] \n\t"
606  "vldx $vr3, %[src], %[stride_3] \n\t"
607  "add.d %[src], %[src], %[stride_4] \n\t"
608  "vld $vr4, %[src], 0 \n\t"
609  "vldx $vr5, %[src], %[stride] \n\t"
610  "vldx $vr6, %[src], %[stride_2] \n\t"
611  "vldx $vr7, %[src], %[stride_3] \n\t"
612 
613  "vst $vr0, %[dst], 0 \n\t"
614  "vstx $vr1, %[dst], %[stride] \n\t"
615  "vstx $vr2, %[dst], %[stride_2] \n\t"
616  "vstx $vr3, %[dst], %[stride_3] \n\t"
617  "add.d %[dst], %[dst], %[stride_4] \n\t"
618  "vst $vr4, %[dst], 0 \n\t"
619  "vstx $vr5, %[dst], %[stride] \n\t"
620  "vstx $vr6, %[dst], %[stride_2] \n\t"
621  "vstx $vr7, %[dst], %[stride_3] \n\t"
622  : [dst]"+&r"(dst), [src]"+&r"(src),
623  [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
624  [stride_4]"=&r"(stride_4)
625  : [stride]"r"(stride)
626  : "memory"
627  );
628 }
629 
630 /* avg_pixels16_8_lsx : dst = avg(src, dst)
631  * put_pixels16_l2_8_lsx: dst = avg(src, half), where the half buffer has a stride of 16.
632  * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst), same half buffer layout. */
633 static av_always_inline void
634 avg_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
635 {
636  uint8_t *tmp = dst;
637  ptrdiff_t stride_2, stride_3, stride_4;
638  __asm__ volatile (
639  /* h0~h7 */
640  "slli.d %[stride_2], %[stride], 1 \n\t"
641  "add.d %[stride_3], %[stride_2], %[stride] \n\t"
642  "slli.d %[stride_4], %[stride_2], 1 \n\t"
643  "vld $vr0, %[src], 0 \n\t"
644  "vldx $vr1, %[src], %[stride] \n\t"
645  "vldx $vr2, %[src], %[stride_2] \n\t"
646  "vldx $vr3, %[src], %[stride_3] \n\t"
647  "add.d %[src], %[src], %[stride_4] \n\t"
648  "vld $vr4, %[src], 0 \n\t"
649  "vldx $vr5, %[src], %[stride] \n\t"
650  "vldx $vr6, %[src], %[stride_2] \n\t"
651  "vldx $vr7, %[src], %[stride_3] \n\t"
652  "add.d %[src], %[src], %[stride_4] \n\t"
653 
654  "vld $vr8, %[tmp], 0 \n\t"
655  "vldx $vr9, %[tmp], %[stride] \n\t"
656  "vldx $vr10, %[tmp], %[stride_2] \n\t"
657  "vldx $vr11, %[tmp], %[stride_3] \n\t"
658  "add.d %[tmp], %[tmp], %[stride_4] \n\t"
659  "vld $vr12, %[tmp], 0 \n\t"
660  "vldx $vr13, %[tmp], %[stride] \n\t"
661  "vldx $vr14, %[tmp], %[stride_2] \n\t"
662  "vldx $vr15, %[tmp], %[stride_3] \n\t"
663  "add.d %[tmp], %[tmp], %[stride_4] \n\t"
664 
665  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
666  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
667  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
668  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
669  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
670  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
671  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
672  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
673 
674  "vst $vr0, %[dst], 0 \n\t"
675  "vstx $vr1, %[dst], %[stride] \n\t"
676  "vstx $vr2, %[dst], %[stride_2] \n\t"
677  "vstx $vr3, %[dst], %[stride_3] \n\t"
678  "add.d %[dst], %[dst], %[stride_4] \n\t"
679  "vst $vr4, %[dst], 0 \n\t"
680  "vstx $vr5, %[dst], %[stride] \n\t"
681  "vstx $vr6, %[dst], %[stride_2] \n\t"
682  "vstx $vr7, %[dst], %[stride_3] \n\t"
683  "add.d %[dst], %[dst], %[stride_4] \n\t"
684 
685  /* h8~h15 */
686  "vld $vr0, %[src], 0 \n\t"
687  "vldx $vr1, %[src], %[stride] \n\t"
688  "vldx $vr2, %[src], %[stride_2] \n\t"
689  "vldx $vr3, %[src], %[stride_3] \n\t"
690  "add.d %[src], %[src], %[stride_4] \n\t"
691  "vld $vr4, %[src], 0 \n\t"
692  "vldx $vr5, %[src], %[stride] \n\t"
693  "vldx $vr6, %[src], %[stride_2] \n\t"
694  "vldx $vr7, %[src], %[stride_3] \n\t"
695 
696  "vld $vr8, %[tmp], 0 \n\t"
697  "vldx $vr9, %[tmp], %[stride] \n\t"
698  "vldx $vr10, %[tmp], %[stride_2] \n\t"
699  "vldx $vr11, %[tmp], %[stride_3] \n\t"
700  "add.d %[tmp], %[tmp], %[stride_4] \n\t"
701  "vld $vr12, %[tmp], 0 \n\t"
702  "vldx $vr13, %[tmp], %[stride] \n\t"
703  "vldx $vr14, %[tmp], %[stride_2] \n\t"
704  "vldx $vr15, %[tmp], %[stride_3] \n\t"
705 
706  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
707  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
708  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
709  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
710  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
711  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
712  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
713  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
714 
715  "vst $vr0, %[dst], 0 \n\t"
716  "vstx $vr1, %[dst], %[stride] \n\t"
717  "vstx $vr2, %[dst], %[stride_2] \n\t"
718  "vstx $vr3, %[dst], %[stride_3] \n\t"
719  "add.d %[dst], %[dst], %[stride_4] \n\t"
720  "vst $vr4, %[dst], 0 \n\t"
721  "vstx $vr5, %[dst], %[stride] \n\t"
722  "vstx $vr6, %[dst], %[stride_2] \n\t"
723  "vstx $vr7, %[dst], %[stride_3] \n\t"
724  : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src),
725  [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
726  [stride_4]"=&r"(stride_4)
727  : [stride]"r"(stride)
728  : "memory"
729  );
730 }
731 
732 /* put_pixels16_l2_8_lsx: dst = avg(src, half),
733  * where the half buffer is 16x16 with a stride of 16.
734  */
735 static av_always_inline void
736 put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
737  ptrdiff_t dstStride, ptrdiff_t srcStride)
738 {
739  ptrdiff_t stride_2, stride_3, stride_4;
740  ptrdiff_t dstride_2, dstride_3, dstride_4;
741  __asm__ volatile (
742  "slli.d %[stride_2], %[srcStride], 1 \n\t"
743  "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
744  "slli.d %[stride_4], %[stride_2], 1 \n\t"
745  "slli.d %[dstride_2], %[dstStride], 1 \n\t"
746  "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t"
747  "slli.d %[dstride_4], %[dstride_2], 1 \n\t"
748  /* h0~h7 */
749  "vld $vr0, %[src], 0 \n\t"
750  "vldx $vr1, %[src], %[srcStride] \n\t"
751  "vldx $vr2, %[src], %[stride_2] \n\t"
752  "vldx $vr3, %[src], %[stride_3] \n\t"
753  "add.d %[src], %[src], %[stride_4] \n\t"
754  "vld $vr4, %[src], 0 \n\t"
755  "vldx $vr5, %[src], %[srcStride] \n\t"
756  "vldx $vr6, %[src], %[stride_2] \n\t"
757  "vldx $vr7, %[src], %[stride_3] \n\t"
758  "add.d %[src], %[src], %[stride_4] \n\t"
759 
760  "vld $vr8, %[half], 0x00 \n\t"
761  "vld $vr9, %[half], 0x10 \n\t"
762  "vld $vr10, %[half], 0x20 \n\t"
763  "vld $vr11, %[half], 0x30 \n\t"
764  "vld $vr12, %[half], 0x40 \n\t"
765  "vld $vr13, %[half], 0x50 \n\t"
766  "vld $vr14, %[half], 0x60 \n\t"
767  "vld $vr15, %[half], 0x70 \n\t"
768 
769  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
770  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
771  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
772  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
773  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
774  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
775  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
776  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
777 
778  "vst $vr0, %[dst], 0 \n\t"
779  "vstx $vr1, %[dst], %[dstStride] \n\t"
780  "vstx $vr2, %[dst], %[dstride_2] \n\t"
781  "vstx $vr3, %[dst], %[dstride_3] \n\t"
782  "add.d %[dst], %[dst], %[dstride_4] \n\t"
783  "vst $vr4, %[dst], 0 \n\t"
784  "vstx $vr5, %[dst], %[dstStride] \n\t"
785  "vstx $vr6, %[dst], %[dstride_2] \n\t"
786  "vstx $vr7, %[dst], %[dstride_3] \n\t"
787  "add.d %[dst], %[dst], %[dstride_4] \n\t"
788 
789  /* h8~h15 */
790  "vld $vr0, %[src], 0 \n\t"
791  "vldx $vr1, %[src], %[srcStride] \n\t"
792  "vldx $vr2, %[src], %[stride_2] \n\t"
793  "vldx $vr3, %[src], %[stride_3] \n\t"
794  "add.d %[src], %[src], %[stride_4] \n\t"
795  "vld $vr4, %[src], 0 \n\t"
796  "vldx $vr5, %[src], %[srcStride] \n\t"
797  "vldx $vr6, %[src], %[stride_2] \n\t"
798  "vldx $vr7, %[src], %[stride_3] \n\t"
799 
800  "vld $vr8, %[half], 0x80 \n\t"
801  "vld $vr9, %[half], 0x90 \n\t"
802  "vld $vr10, %[half], 0xa0 \n\t"
803  "vld $vr11, %[half], 0xb0 \n\t"
804  "vld $vr12, %[half], 0xc0 \n\t"
805  "vld $vr13, %[half], 0xd0 \n\t"
806  "vld $vr14, %[half], 0xe0 \n\t"
807  "vld $vr15, %[half], 0xf0 \n\t"
808 
809  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
810  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
811  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
812  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
813  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
814  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
815  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
816  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
817 
818  "vst $vr0, %[dst], 0 \n\t"
819  "vstx $vr1, %[dst], %[dstStride] \n\t"
820  "vstx $vr2, %[dst], %[dstride_2] \n\t"
821  "vstx $vr3, %[dst], %[dstride_3] \n\t"
822  "add.d %[dst], %[dst], %[dstride_4] \n\t"
823  "vst $vr4, %[dst], 0 \n\t"
824  "vstx $vr5, %[dst], %[dstStride] \n\t"
825  "vstx $vr6, %[dst], %[dstride_2] \n\t"
826  "vstx $vr7, %[dst], %[dstride_3] \n\t"
827  : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
828  [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
829  [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2),
830  [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
831  : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
832  : "memory"
833  );
834 }
835 
836 /* avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst),
837  * where the half buffer is 16x16 with a stride of 16.
838  */
839 static av_always_inline void
840 avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
841  ptrdiff_t dstStride, ptrdiff_t srcStride)
842 {
843  uint8_t *tmp = dst;
844  ptrdiff_t stride_2, stride_3, stride_4;
845  ptrdiff_t dstride_2, dstride_3, dstride_4;
846  __asm__ volatile (
847  "slli.d %[stride_2], %[srcStride], 1 \n\t"
848  "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
849  "slli.d %[stride_4], %[stride_2], 1 \n\t"
850  "slli.d %[dstride_2], %[dstStride], 1 \n\t"
851  "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t"
852  "slli.d %[dstride_4], %[dstride_2], 1 \n\t"
853  /* h0~h7 */
854  "vld $vr0, %[src], 0 \n\t"
855  "vldx $vr1, %[src], %[srcStride] \n\t"
856  "vldx $vr2, %[src], %[stride_2] \n\t"
857  "vldx $vr3, %[src], %[stride_3] \n\t"
858  "add.d %[src], %[src], %[stride_4] \n\t"
859  "vld $vr4, %[src], 0 \n\t"
860  "vldx $vr5, %[src], %[srcStride] \n\t"
861  "vldx $vr6, %[src], %[stride_2] \n\t"
862  "vldx $vr7, %[src], %[stride_3] \n\t"
863  "add.d %[src], %[src], %[stride_4] \n\t"
864 
865  "vld $vr8, %[half], 0x00 \n\t"
866  "vld $vr9, %[half], 0x10 \n\t"
867  "vld $vr10, %[half], 0x20 \n\t"
868  "vld $vr11, %[half], 0x30 \n\t"
869  "vld $vr12, %[half], 0x40 \n\t"
870  "vld $vr13, %[half], 0x50 \n\t"
871  "vld $vr14, %[half], 0x60 \n\t"
872  "vld $vr15, %[half], 0x70 \n\t"
873 
874  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
875  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
876  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
877  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
878  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
879  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
880  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
881  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
882 
883  "vld $vr8, %[tmp], 0 \n\t"
884  "vldx $vr9, %[tmp], %[dstStride] \n\t"
885  "vldx $vr10, %[tmp], %[dstride_2] \n\t"
886  "vldx $vr11, %[tmp], %[dstride_3] \n\t"
887  "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
888  "vld $vr12, %[tmp], 0 \n\t"
889  "vldx $vr13, %[tmp], %[dstStride] \n\t"
890  "vldx $vr14, %[tmp], %[dstride_2] \n\t"
891  "vldx $vr15, %[tmp], %[dstride_3] \n\t"
892  "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
893 
894  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
895  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
896  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
897  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
898  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
899  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
900  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
901  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
902 
903  "vst $vr0, %[dst], 0 \n\t"
904  "vstx $vr1, %[dst], %[dstStride] \n\t"
905  "vstx $vr2, %[dst], %[dstride_2] \n\t"
906  "vstx $vr3, %[dst], %[dstride_3] \n\t"
907  "add.d %[dst], %[dst], %[dstride_4] \n\t"
908  "vst $vr4, %[dst], 0 \n\t"
909  "vstx $vr5, %[dst], %[dstStride] \n\t"
910  "vstx $vr6, %[dst], %[dstride_2] \n\t"
911  "vstx $vr7, %[dst], %[dstride_3] \n\t"
912  "add.d %[dst], %[dst], %[dstride_4] \n\t"
913 
914  /* h8~h15 */
915  "vld $vr0, %[src], 0 \n\t"
916  "vldx $vr1, %[src], %[srcStride] \n\t"
917  "vldx $vr2, %[src], %[stride_2] \n\t"
918  "vldx $vr3, %[src], %[stride_3] \n\t"
919  "add.d %[src], %[src], %[stride_4] \n\t"
920  "vld $vr4, %[src], 0 \n\t"
921  "vldx $vr5, %[src], %[srcStride] \n\t"
922  "vldx $vr6, %[src], %[stride_2] \n\t"
923  "vldx $vr7, %[src], %[stride_3] \n\t"
924 
925  "vld $vr8, %[half], 0x80 \n\t"
926  "vld $vr9, %[half], 0x90 \n\t"
927  "vld $vr10, %[half], 0xa0 \n\t"
928  "vld $vr11, %[half], 0xb0 \n\t"
929  "vld $vr12, %[half], 0xc0 \n\t"
930  "vld $vr13, %[half], 0xd0 \n\t"
931  "vld $vr14, %[half], 0xe0 \n\t"
932  "vld $vr15, %[half], 0xf0 \n\t"
933 
934  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
935  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
936  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
937  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
938  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
939  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
940  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
941  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
942 
943  "vld $vr8, %[tmp], 0 \n\t"
944  "vldx $vr9, %[tmp], %[dstStride] \n\t"
945  "vldx $vr10, %[tmp], %[dstride_2] \n\t"
946  "vldx $vr11, %[tmp], %[dstride_3] \n\t"
947  "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
948  "vld $vr12, %[tmp], 0 \n\t"
949  "vldx $vr13, %[tmp], %[dstStride] \n\t"
950  "vldx $vr14, %[tmp], %[dstride_2] \n\t"
951  "vldx $vr15, %[tmp], %[dstride_3] \n\t"
952 
953  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
954  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
955  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
956  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
957  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
958  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
959  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
960  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
961 
962  "vst $vr0, %[dst], 0 \n\t"
963  "vstx $vr1, %[dst], %[dstStride] \n\t"
964  "vstx $vr2, %[dst], %[dstride_2] \n\t"
965  "vstx $vr3, %[dst], %[dstride_3] \n\t"
966  "add.d %[dst], %[dst], %[dstride_4] \n\t"
967  "vst $vr4, %[dst], 0 \n\t"
968  "vstx $vr5, %[dst], %[dstStride] \n\t"
969  "vstx $vr6, %[dst], %[dstride_2] \n\t"
970  "vstx $vr7, %[dst], %[dstride_3] \n\t"
971  : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), [src]"+&r"(src),
972  [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
973  [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2),
974  [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
975  : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
976  : "memory"
977  );
978 }
979 
980 #define QPEL8_H_LOWPASS(out_v) \
981  src00 = __lasx_xvld(src, - 2); \
982  src += srcStride; \
983  src10 = __lasx_xvld(src, - 2); \
984  src += srcStride; \
985  src00 = __lasx_xvpermi_q(src00, src10, 0x02); \
986  src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1); \
987  src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2); \
988  src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3); \
989  src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4); \
990  src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5); \
991  DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\
992  src00 = __lasx_xvaddwl_h_bu(src00, src05); \
993  src02 = __lasx_xvmul_h(src02, h_20); \
994  src01 = __lasx_xvmul_h(src01, h_5); \
995  src02 = __lasx_xvssub_h(src02, src01); \
996  src02 = __lasx_xvsadd_h(src02, src00); \
997  src02 = __lasx_xvsadd_h(src02, h_16); \
998  out_v = __lasx_xvssrani_bu_h(src02, src02, 5); \
999 
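/*
 * Scalar sketch of what QPEL8_H_LOWPASS produces per output pixel: the
 * 6-tap filter applied horizontally, plus 16 for rounding, shifted right
 * by 5 and clipped to 0..255 (the saturating __lasx_xvssrani_bu_h step).
 * put_h264_qpel8_h_lowpass_ref is an assumed name used for illustration.
 */
static av_unused void put_h264_qpel8_h_lowpass_ref(uint8_t *dst,
                                                   const uint8_t *src,
                                                   int dstStride,
                                                   int srcStride)
{
    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++) {
            int v = src[j - 2] + src[j + 3]
                  - 5 * (src[j - 1] + src[j + 2])
                  + 20 * (src[j] + src[j + 1]) + 16;
            v >>= 5;
            dst[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
        dst += dstStride;
        src += srcStride;
    }
}
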
1000 static av_always_inline void
1001 put_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride,
1002  int srcStride)
1003 {
1004  int dstStride_2x = dstStride << 1;
1005  __m256i src00, src01, src02, src03, src04, src05, src10;
1006  __m256i out0, out1, out2, out3;
1007  __m256i h_20 = __lasx_xvldi(0x414);
1008  __m256i h_5 = __lasx_xvldi(0x405);
1009  __m256i h_16 = __lasx_xvldi(0x410);
1010  __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1011  __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1012  __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1013  __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1014  __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1015 
1016  QPEL8_H_LOWPASS(out0)
1017  QPEL8_H_LOWPASS(out1)
1018  QPEL8_H_LOWPASS(out2)
1019  QPEL8_H_LOWPASS(out3)
1020  __lasx_xvstelm_d(out0, dst, 0, 0);
1021  __lasx_xvstelm_d(out0, dst + dstStride, 0, 2);
1022  dst += dstStride_2x;
1023  __lasx_xvstelm_d(out1, dst, 0, 0);
1024  __lasx_xvstelm_d(out1, dst + dstStride, 0, 2);
1025  dst += dstStride_2x;
1026  __lasx_xvstelm_d(out2, dst, 0, 0);
1027  __lasx_xvstelm_d(out2, dst + dstStride, 0, 2);
1028  dst += dstStride_2x;
1029  __lasx_xvstelm_d(out3, dst, 0, 0);
1030  __lasx_xvstelm_d(out3, dst + dstStride, 0, 2);
1031 }
1032 
1033 #define QPEL8_V_LOWPASS(src0, src1, src2, src3, src4, src5, src6, \
1034  tmp0, tmp1, tmp2, tmp3, tmp4, tmp5) \
1035 { \
1036  tmp0 = __lasx_xvpermi_q(src0, src1, 0x02); \
1037  tmp1 = __lasx_xvpermi_q(src1, src2, 0x02); \
1038  tmp2 = __lasx_xvpermi_q(src2, src3, 0x02); \
1039  tmp3 = __lasx_xvpermi_q(src3, src4, 0x02); \
1040  tmp4 = __lasx_xvpermi_q(src4, src5, 0x02); \
1041  tmp5 = __lasx_xvpermi_q(src5, src6, 0x02); \
1042  DUP2_ARG2(__lasx_xvaddwl_h_bu, tmp2, tmp3, tmp1, tmp4, tmp2, tmp1); \
1043  tmp0 = __lasx_xvaddwl_h_bu(tmp0, tmp5); \
1044  tmp2 = __lasx_xvmul_h(tmp2, h_20); \
1045  tmp1 = __lasx_xvmul_h(tmp1, h_5); \
1046  tmp2 = __lasx_xvssub_h(tmp2, tmp1); \
1047  tmp2 = __lasx_xvsadd_h(tmp2, tmp0); \
1048  tmp2 = __lasx_xvsadd_h(tmp2, h_16); \
1049  tmp2 = __lasx_xvssrani_bu_h(tmp2, tmp2, 5); \
1050 }
1051 
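/*
 * QPEL8_V_LOWPASS applies the same (A - 5B + 20C + 20D - 5E + F + 16) >> 5
 * filter along columns; __lasx_xvpermi_q pairs two consecutive source rows
 * into one 256-bit register, so each invocation yields two output rows
 * (stored as double-word elements 0 and 2 of tmp2).
 */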
1052 static av_always_inline void
1053 put_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride,
1054  int srcStride)
1055 {
1056  int srcStride_2x = srcStride << 1;
1057  int dstStride_2x = dstStride << 1;
1058  int srcStride_4x = srcStride << 2;
1059  int srcStride_3x = srcStride_2x + srcStride;
1060  __m256i src00, src01, src02, src03, src04, src05, src06;
1061  __m256i src07, src08, src09, src10, src11, src12;
1062  __m256i tmp00, tmp01, tmp02, tmp03, tmp04, tmp05;
1063  __m256i h_20 = __lasx_xvldi(0x414);
1064  __m256i h_5 = __lasx_xvldi(0x405);
1065  __m256i h_16 = __lasx_xvldi(0x410);
1066 
1067  DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0,
1068  src00, src01);
1069  src02 = __lasx_xvld(src, 0);
1070  DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1071  srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
1072  src += srcStride_4x;
1073  DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1074  srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
1075  src += srcStride_4x;
1076  DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);
1077 
1078  QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06,
1079  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1080  __lasx_xvstelm_d(tmp02, dst, 0, 0);
1081  __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1082  dst += dstStride_2x;
1083  QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08,
1084  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1085  __lasx_xvstelm_d(tmp02, dst, 0, 0);
1086  __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1087  dst += dstStride_2x;
1088  QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10,
1089  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1090  __lasx_xvstelm_d(tmp02, dst, 0, 0);
1091  __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1092  dst += dstStride_2x;
1093  QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12,
1094  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1095  __lasx_xvstelm_d(tmp02, dst, 0, 0);
1096  __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
1097 }
1098 
1099 static av_always_inline void
1100 avg_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride,
1101  int srcStride)
1102 {
1103  int srcStride_2x = srcStride << 1;
1104  int srcStride_4x = srcStride << 2;
1105  int dstStride_2x = dstStride << 1;
1106  int dstStride_4x = dstStride << 2;
1107  int srcStride_3x = srcStride_2x + srcStride;
1108  int dstStride_3x = dstStride_2x + dstStride;
1109  __m256i src00, src01, src02, src03, src04, src05, src06;
1110  __m256i src07, src08, src09, src10, src11, src12, tmp00;
1111  __m256i tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp08, tmp09;
1112  __m256i h_20 = __lasx_xvldi(0x414);
1113  __m256i h_5 = __lasx_xvldi(0x405);
1114  __m256i h_16 = __lasx_xvldi(0x410);
1115 
1116 
1117  DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0,
1118  src00, src01);
1119  src02 = __lasx_xvld(src, 0);
1120  DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1121  srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
1122  src += srcStride_4x;
1123  DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
1124  srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
1125  src += srcStride_4x;
1126  DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);
1127 
1128  tmp06 = __lasx_xvld(dst, 0);
1129  DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
1130  dst, dstStride_3x, dst, dstStride_4x,
1131  tmp07, tmp02, tmp03, tmp04);
1132  dst += dstStride_4x;
1133  DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
1134  tmp05, tmp00);
1135  tmp01 = __lasx_xvldx(dst, dstStride_3x);
1136  dst -= dstStride_4x;
1137 
1138  tmp06 = __lasx_xvpermi_q(tmp06, tmp07, 0x02);
1139  tmp07 = __lasx_xvpermi_q(tmp02, tmp03, 0x02);
1140  tmp08 = __lasx_xvpermi_q(tmp04, tmp05, 0x02);
1141  tmp09 = __lasx_xvpermi_q(tmp00, tmp01, 0x02);
1142 
1143  QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06,
1144  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1145  tmp06 = __lasx_xvavgr_bu(tmp06, tmp02);
1146  __lasx_xvstelm_d(tmp06, dst, 0, 0);
1147  __lasx_xvstelm_d(tmp06, dst + dstStride, 0, 2);
1148  dst += dstStride_2x;
1149  QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08,
1150  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1151  tmp07 = __lasx_xvavgr_bu(tmp07, tmp02);
1152  __lasx_xvstelm_d(tmp07, dst, 0, 0);
1153  __lasx_xvstelm_d(tmp07, dst + dstStride, 0, 2);
1154  dst += dstStride_2x;
1155  QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10,
1156  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1157  tmp08 = __lasx_xvavgr_bu(tmp08, tmp02);
1158  __lasx_xvstelm_d(tmp08, dst, 0, 0);
1159  __lasx_xvstelm_d(tmp08, dst + dstStride, 0, 2);
1160  dst += dstStride_2x;
1161  QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12,
1162  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
1163  tmp09 = __lasx_xvavgr_bu(tmp09, tmp02);
1164  __lasx_xvstelm_d(tmp09, dst, 0, 0);
1165  __lasx_xvstelm_d(tmp09, dst + dstStride, 0, 2);
1166 }
1167 
1168 #define QPEL8_HV_LOWPASS_H(tmp) \
1169 { \
1170  src00 = __lasx_xvld(src, -2); \
1171  src += srcStride; \
1172  src10 = __lasx_xvld(src, -2); \
1173  src += srcStride; \
1174  src00 = __lasx_xvpermi_q(src00, src10, 0x02); \
1175  src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1); \
1176  src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2); \
1177  src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3); \
1178  src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4); \
1179  src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5); \
1180  DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\
1181  src00 = __lasx_xvaddwl_h_bu(src00, src05); \
1182  src02 = __lasx_xvmul_h(src02, h_20); \
1183  src01 = __lasx_xvmul_h(src01, h_5); \
1184  src02 = __lasx_xvssub_h(src02, src01); \
1185  tmp = __lasx_xvsadd_h(src02, src00); \
1186 }
1187 
1188 #define QPEL8_HV_LOWPASS_V(src0, src1, src2, src3, \
1189  src4, src5, temp0, temp1, \
1190  temp2, temp3, temp4, temp5, \
1191  out) \
1192 { \
1193  DUP2_ARG2(__lasx_xvaddwl_w_h, src2, src3, src1, src4, temp0, temp2); \
1194  DUP2_ARG2(__lasx_xvaddwh_w_h, src2, src3, src1, src4, temp1, temp3); \
1195  temp4 = __lasx_xvaddwl_w_h(src0, src5); \
1196  temp5 = __lasx_xvaddwh_w_h(src0, src5); \
1197  temp0 = __lasx_xvmul_w(temp0, w_20); \
1198  temp1 = __lasx_xvmul_w(temp1, w_20); \
1199  temp2 = __lasx_xvmul_w(temp2, w_5); \
1200  temp3 = __lasx_xvmul_w(temp3, w_5); \
1201  temp0 = __lasx_xvssub_w(temp0, temp2); \
1202  temp1 = __lasx_xvssub_w(temp1, temp3); \
1203  temp0 = __lasx_xvsadd_w(temp0, temp4); \
1204  temp1 = __lasx_xvsadd_w(temp1, temp5); \
1205  temp0 = __lasx_xvsadd_w(temp0, w_512); \
1206  temp1 = __lasx_xvsadd_w(temp1, w_512); \
1207  temp0 = __lasx_xvssrani_hu_w(temp0, temp0, 10); \
1208  temp1 = __lasx_xvssrani_hu_w(temp1, temp1, 10); \
1209  temp0 = __lasx_xvpackev_d(temp1, temp0); \
1210  out = __lasx_xvssrani_bu_h(temp0, temp0, 0); \
1211 }
1212 
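/*
 * The HV path filters horizontally first (QPEL8_HV_LOWPASS_H keeps the
 * unrounded 16-bit intermediates) and then runs the same 6-tap filter
 * vertically over those intermediates, rounding with +512 and shifting by
 * 10 bits (QPEL8_HV_LOWPASS_V).  Scalar sketch for one output sample;
 * qpel_hv_ref_sample and its column/stride layout are assumptions made
 * only for illustration.
 */
static av_unused uint8_t qpel_hv_ref_sample(const int16_t *col, ptrdiff_t stride)
{
    /* col points at the first of six vertically adjacent 16-bit
     * horizontal-filter results, spaced 'stride' elements apart */
    int v = col[0] + col[5 * stride]
          - 5 * (col[1 * stride] + col[4 * stride])
          + 20 * (col[2 * stride] + col[3 * stride]) + 512;
    v >>= 10;
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
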
1213 static av_always_inline void
1214 put_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1215  ptrdiff_t dstStride, ptrdiff_t srcStride)
1216 {
1217  __m256i src00, src01, src02, src03, src04, src05, src10;
1218  __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1219  __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
1220  __m256i h_20 = __lasx_xvldi(0x414);
1221  __m256i h_5 = __lasx_xvldi(0x405);
1222  __m256i w_20 = __lasx_xvldi(0x814);
1223  __m256i w_5 = __lasx_xvldi(0x805);
1224  __m256i w_512 = {512};
1225  __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1226  __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1227  __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1228  __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1229  __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1230 
1231  w_512 = __lasx_xvreplve0_w(w_512);
1232 
1233  src -= srcStride << 1;
1234  QPEL8_HV_LOWPASS_H(tmp0)
1235  QPEL8_HV_LOWPASS_H(tmp2)
1236  QPEL8_HV_LOWPASS_H(tmp4)
1237  QPEL8_HV_LOWPASS_H(tmp6)
1238  QPEL8_HV_LOWPASS_H(tmp8)
1239  QPEL8_HV_LOWPASS_H(tmp10)
1240  QPEL8_HV_LOWPASS_H(tmp12)
1241  tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
1242  tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21);
1243  tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21);
1244  tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21);
1245  tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21);
1246  tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21);
1247 
1248  QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01,
1249  src02, src03, src04, src05, tmp0)
1250  QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01,
1251  src02, src03, src04, src05, tmp2)
1252  QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01,
1253  src02, src03, src04, src05, tmp4)
1254  QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01,
1255  src02, src03, src04, src05, tmp6)
1256  __lasx_xvstelm_d(tmp0, dst, 0, 0);
1257  dst += dstStride;
1258  __lasx_xvstelm_d(tmp0, dst, 0, 2);
1259  dst += dstStride;
1260  __lasx_xvstelm_d(tmp2, dst, 0, 0);
1261  dst += dstStride;
1262  __lasx_xvstelm_d(tmp2, dst, 0, 2);
1263  dst += dstStride;
1264  __lasx_xvstelm_d(tmp4, dst, 0, 0);
1265  dst += dstStride;
1266  __lasx_xvstelm_d(tmp4, dst, 0, 2);
1267  dst += dstStride;
1268  __lasx_xvstelm_d(tmp6, dst, 0, 0);
1269  dst += dstStride;
1270  __lasx_xvstelm_d(tmp6, dst, 0, 2);
1271 }
1272 
1273 static av_always_inline void
1274 avg_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride,
1275  int srcStride)
1276 {
1277  int dstStride_2x = dstStride << 1;
1278  int dstStride_4x = dstStride << 2;
1279  int dstStride_3x = dstStride_2x + dstStride;
1280  __m256i src00, src01, src02, src03, src04, src05, src10;
1281  __m256i dst00, dst01, dst0, dst1, dst2, dst3;
1282  __m256i out0, out1, out2, out3;
1283  __m256i h_20 = __lasx_xvldi(0x414);
1284  __m256i h_5 = __lasx_xvldi(0x405);
1285  __m256i h_16 = __lasx_xvldi(0x410);
1286  __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1287  __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1288  __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1289  __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1290  __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1291 
1292  QPEL8_H_LOWPASS(out0)
1293  QPEL8_H_LOWPASS(out1)
1294  QPEL8_H_LOWPASS(out2)
1295  QPEL8_H_LOWPASS(out3)
1296  src00 = __lasx_xvld(dst, 0);
1297  DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
1298  dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
1299  dst += dstStride_4x;
1300  DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, dst00);
1301  dst01 = __lasx_xvldx(dst, dstStride_3x);
1302  dst -= dstStride_4x;
1303  dst0 = __lasx_xvpermi_q(src00, src01, 0x02);
1304  dst1 = __lasx_xvpermi_q(src02, src03, 0x02);
1305  dst2 = __lasx_xvpermi_q(src04, src05, 0x02);
1306  dst3 = __lasx_xvpermi_q(dst00, dst01, 0x02);
1307  dst0 = __lasx_xvavgr_bu(dst0, out0);
1308  dst1 = __lasx_xvavgr_bu(dst1, out1);
1309  dst2 = __lasx_xvavgr_bu(dst2, out2);
1310  dst3 = __lasx_xvavgr_bu(dst3, out3);
1311  __lasx_xvstelm_d(dst0, dst, 0, 0);
1312  __lasx_xvstelm_d(dst0, dst + dstStride, 0, 2);
1313  __lasx_xvstelm_d(dst1, dst + dstStride_2x, 0, 0);
1314  __lasx_xvstelm_d(dst1, dst + dstStride_3x, 0, 2);
1315  dst += dstStride_4x;
1316  __lasx_xvstelm_d(dst2, dst, 0, 0);
1317  __lasx_xvstelm_d(dst2, dst + dstStride, 0, 2);
1318  __lasx_xvstelm_d(dst3, dst + dstStride_2x, 0, 0);
1319  __lasx_xvstelm_d(dst3, dst + dstStride_3x, 0, 2);
1320 }
1321 
1322 static av_always_inline void
1323 avg_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1324  ptrdiff_t dstStride, ptrdiff_t srcStride)
1325 {
1326  __m256i src00, src01, src02, src03, src04, src05, src10;
1327  __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1328  __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
1329  __m256i h_20 = __lasx_xvldi(0x414);
1330  __m256i h_5 = __lasx_xvldi(0x405);
1331  __m256i w_20 = __lasx_xvldi(0x814);
1332  __m256i w_5 = __lasx_xvldi(0x805);
1333  __m256i w_512 = {512};
1334  __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
1335  __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
1336  __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
1337  __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
1338  __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
1339  ptrdiff_t dstStride_2x = dstStride << 1;
1340  ptrdiff_t dstStride_4x = dstStride << 2;
1341  ptrdiff_t dstStride_3x = dstStride_2x + dstStride;
1342 
1343  w_512 = __lasx_xvreplve0_w(w_512);
1344 
1345  src -= srcStride << 1;
1346  QPEL8_HV_LOWPASS_H(tmp0)
1347  QPEL8_HV_LOWPASS_H(tmp2)
1348  QPEL8_HV_LOWPASS_H(tmp4)
1349  QPEL8_HV_LOWPASS_H(tmp6)
1350  QPEL8_HV_LOWPASS_H(tmp8)
1351  QPEL8_HV_LOWPASS_H(tmp10)
1352  QPEL8_HV_LOWPASS_H(tmp12)
1353  tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
1354  tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21);
1355  tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21);
1356  tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21);
1357  tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21);
1358  tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21);
1359 
1360  QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01,
1361  src02, src03, src04, src05, tmp0)
1362  QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01,
1363  src02, src03, src04, src05, tmp2)
1364  QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01,
1365  src02, src03, src04, src05, tmp4)
1366  QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01,
1367  src02, src03, src04, src05, tmp6)
1368 
1369  src00 = __lasx_xvld(dst, 0);
1370  DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
1371  dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
1372  dst += dstStride_4x;
1373  DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, tmp8);
1374  tmp9 = __lasx_xvldx(dst, dstStride_3x);
1375  dst -= dstStride_4x;
1376  tmp1 = __lasx_xvpermi_q(src00, src01, 0x02);
1377  tmp3 = __lasx_xvpermi_q(src02, src03, 0x02);
1378  tmp5 = __lasx_xvpermi_q(src04, src05, 0x02);
1379  tmp7 = __lasx_xvpermi_q(tmp8, tmp9, 0x02);
1380  tmp0 = __lasx_xvavgr_bu(tmp0, tmp1);
1381  tmp2 = __lasx_xvavgr_bu(tmp2, tmp3);
1382  tmp4 = __lasx_xvavgr_bu(tmp4, tmp5);
1383  tmp6 = __lasx_xvavgr_bu(tmp6, tmp7);
1384  __lasx_xvstelm_d(tmp0, dst, 0, 0);
1385  dst += dstStride;
1386  __lasx_xvstelm_d(tmp0, dst, 0, 2);
1387  dst += dstStride;
1388  __lasx_xvstelm_d(tmp2, dst, 0, 0);
1389  dst += dstStride;
1390  __lasx_xvstelm_d(tmp2, dst, 0, 2);
1391  dst += dstStride;
1392  __lasx_xvstelm_d(tmp4, dst, 0, 0);
1393  dst += dstStride;
1394  __lasx_xvstelm_d(tmp4, dst, 0, 2);
1395  dst += dstStride;
1396  __lasx_xvstelm_d(tmp6, dst, 0, 0);
1397  dst += dstStride;
1398  __lasx_xvstelm_d(tmp6, dst, 0, 2);
1399 }
1400 
1401 static av_always_inline void
1402 put_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1403  int dstStride, int srcStride)
1404 {
1405  put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1406  put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1407  src += srcStride << 3;
1408  dst += dstStride << 3;
1409  put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1410  put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1411 }
1412 
1413 static av_always_inline void
1414 avg_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1415  int dstStride, int srcStride)
1416 {
1417  avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1418  avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1419  src += srcStride << 3;
1420  dst += dstStride << 3;
1421  avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1422  avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1423 }
1424 
1425 static void put_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1426  int dstStride, int srcStride)
1427 {
1428  put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
1429  put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
1430  src += 8*srcStride;
1431  dst += 8*dstStride;
1432  put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
1433  put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
1434 }
1435 
1436 static void avg_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1437  int dstStride, int srcStride)
1438 {
1439  avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
1440  avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
1441  src += 8*srcStride;
1442  dst += 8*dstStride;
1443  avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
1444  avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
1445 }
1446 
1447 static void put_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1448  ptrdiff_t dstStride, ptrdiff_t srcStride)
1449 {
1450  put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
1451  put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
1452  src += srcStride << 3;
1453  dst += dstStride << 3;
1454  put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
1455  put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
1456 }
1457 
1458 static void avg_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1459  ptrdiff_t dstStride, ptrdiff_t srcStride)
1460 {
1461  avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
1462  avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
1463  src += srcStride << 3;
1464  dst += dstStride << 3;
1465  avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
1466  avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
1467 }
1468 
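/*
 * The exported ff_{put,avg}_h264_qpel{8,16}_mcXY_lasx entry points below
 * cover the 16 luma quarter-pel positions: X is the horizontal and Y the
 * vertical quarter-pel offset, i.e. mcXY serves table index
 * luma_xy = (my & 3) * 4 + (mx & 3).  put_ stores the prediction, avg_
 * averages it with the block already in dst.  As a plain-C reference for
 * what the SIMD half-pel kernels compute (an illustrative sketch with
 * hypothetical helper names, not code from this file), the horizontal
 * half-pel sample is the 6-tap filter {1, -5, 20, 20, -5, 1} with
 * (val + 16) >> 5 rounding and clipping to 8 bits:
 */
static uint8_t clip_u8_ref(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Scalar reference for an 8x8 horizontal half-pel lowpass (sketch only). */
static void ref_h264_qpel8_h_lowpass(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++) {
            int v = src[x - 2] - 5 * src[x - 1] + 20 * src[x] +
                    20 * src[x + 1] - 5 * src[x + 2] + src[x + 3];
            dst[x] = clip_u8_ref((v + 16) >> 5);
        }
        dst += dstStride;
        src += srcStride;
    }
}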
1469 void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
1470  ptrdiff_t stride)
1471 {
1472  /* The mmi optimization used the function ff_put_pixels8_8_mmi,
1473   * which is implemented in hpeldsp_mmi.c */
1474  put_pixels8_8_inline_asm(dst, src, stride);
1475 }
1476 
1477 void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
1478  ptrdiff_t stride)
1479 {
1480  uint8_t half[64];
1481 
1482  put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
1483  /* in qpel8, the stride of half and the height of the block are both 8 */
1484  put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
1485 }
1486 
1487 void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
1488  ptrdiff_t stride)
1489 {
1490  put_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride);
1491 }
1492 
1493 void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
1494  ptrdiff_t stride)
1495 {
1496  uint8_t half[64];
1497 
1498  put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
1499  put_pixels8_l2_8_lsx(dst, src + 1, half, stride, stride);
1500 }
1501 
1502 void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
1503  ptrdiff_t stride)
1504 {
1505  uint8_t half[64];
1506 
1507  put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride);
1508  put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
1509 }
1510 
1511 void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
1512  ptrdiff_t stride)
1513 {
1514  uint8_t halfH[64];
1515  uint8_t halfV[64];
1516 
1517  put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1518  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
1519  put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1520 }
1521 
1522 void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
1523  ptrdiff_t stride)
1524 {
1525  uint8_t temp[128];
1526  uint8_t *const halfH = temp;
1527  uint8_t *const halfHV = temp + 64;
1528 
1529  put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1530  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1531  put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1532 }
1533 
1534 void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
1535  ptrdiff_t stride)
1536 {
1537  uint8_t halfH[64];
1538  uint8_t halfV[64];
1539 
1540  put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1541  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
1542  put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1543 }
1544 
1545 void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
1546  ptrdiff_t stride)
1547 {
1548  put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride);
1549 }
1550 
1551 void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
1552  ptrdiff_t stride)
1553 {
1554  uint8_t temp[128];
1555  uint8_t *const halfHV = temp;
1556  uint8_t *const halfH = temp + 64;
1557 
1558  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1559  put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride);
1560  put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1561 }
1562 
1563 void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
1564  ptrdiff_t stride)
1565 {
1566  put_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride);
1567 }
1568 
1569 void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
1570  ptrdiff_t stride)
1571 {
1572  uint8_t temp[128];
1573  uint8_t *const halfHV = temp;
1574  uint8_t *const halfH = temp + 64;
1575 
1576  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1577  put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride);
1578  put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1579 }
1580 
1581 void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
1582  ptrdiff_t stride)
1583 {
1584  uint8_t half[64];
1585 
1586  put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride);
1587  put_pixels8_l2_8_lsx(dst, src + stride, half, stride, stride);
1588 }
1589 
1590 void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
1591  ptrdiff_t stride)
1592 {
1593  uint8_t halfH[64];
1594  uint8_t halfV[64];
1595 
1596  put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1597  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
1598  put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1599 }
1600 
1601 void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
1602  ptrdiff_t stride)
1603 {
1604  uint8_t temp[128];
1605  uint8_t *const halfH = temp;
1606  uint8_t *const halfHV = temp + 64;
1607 
1608  put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1609  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1610  put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1611 }
1612 
1613 void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
1614  ptrdiff_t stride)
1615 {
1616  uint8_t halfH[64];
1617  uint8_t halfV[64];
1618 
1619  put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1620  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
1621  put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1622 }
1623 
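/*
 * The ff_avg_* variants below mirror the ff_put_* functions above; the only
 * difference is the final step, which does a rounded average with the
 * prediction already stored in dst instead of overwriting it.
 */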
1624 void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
1625  ptrdiff_t stride)
1626 {
1627  /* The mmi optimization used the function ff_avg_pixels8_8_mmi,
1628   * which is implemented in hpeldsp_mmi.c */
1629  avg_pixels8_8_lsx(dst, src, stride);
1630 }
1631 
1632 void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
1633  ptrdiff_t stride)
1634 {
1635  uint8_t half[64];
1636 
1637  put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
1638  avg_pixels8_l2_8_lsx(dst, src, half, stride, stride);
1639 }
1640 
1641 void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
1642  ptrdiff_t stride)
1643 {
1644  avg_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride);
1645 }
1646 
1647 void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
1648  ptrdiff_t stride)
1649 {
1650  uint8_t half[64];
1651 
1652  put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
1653  avg_pixels8_l2_8_lsx(dst, src + 1, half, stride, stride);
1654 }
1655 
1656 void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
1657  ptrdiff_t stride)
1658 {
1659  uint8_t halfH[64];
1660  uint8_t halfV[64];
1661 
1662  put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1663  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
1664  avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1665 }
1666 
1667 void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
1668  ptrdiff_t stride)
1669 {
1670  uint8_t temp[128];
1671  uint8_t *const halfH = temp;
1672  uint8_t *const halfHV = temp + 64;
1673 
1674  put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1675  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1676  avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1677 }
1678 
1679 void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
1680  ptrdiff_t stride)
1681 {
1682  uint8_t halfH[64];
1683  uint8_t halfV[64];
1684 
1685  put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1686  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
1687  avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1688 }
1689 
1690 void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
1691  ptrdiff_t stride)
1692 {
1693  avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride);
1694 }
1695 
1696 void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
1697  ptrdiff_t stride)
1698 {
1699  uint8_t temp[128];
1700  uint8_t *const halfHV = temp;
1701  uint8_t *const halfH = temp + 64;
1702 
1703  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1704  put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride);
1705  avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1706 }
1707 
1708 void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
1709  ptrdiff_t stride)
1710 {
1711  avg_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride);
1712 }
1713 
1714 void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
1715  ptrdiff_t stride)
1716 {
1717  uint8_t temp[128];
1718  uint8_t *const halfHV = temp;
1719  uint8_t *const halfH = temp + 64;
1720 
1721  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1722  put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride);
1723  avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1724 }
1725 
1726 void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
1727  ptrdiff_t stride)
1728 {
1729  uint8_t halfH[64];
1730  uint8_t halfV[64];
1731 
1732  put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1733  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
1734  avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1735 }
1736 
1737 void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
1738  ptrdiff_t stride)
1739 {
1740  uint8_t temp[128];
1741  uint8_t *const halfH = temp;
1742  uint8_t *const halfHV = temp + 64;
1743 
1744  put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1745  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1746  avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1747 }
1748 
1749 void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
1750  ptrdiff_t stride)
1751 {
1752  uint8_t halfH[64];
1753  uint8_t halfV[64];
1754 
1755  put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1756  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
1757  avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1758 }
1759 
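/*
 * 16x16 variants.  The purely horizontal/vertical cases reuse the 16-wide
 * lowpass wrappers above, while the diagonal quarter-pel cases (mc11, mc31,
 * mc13, mc33) go through the fused avc_luma_hv_qrt_16x16_lasx helper, which
 * averages the horizontal and vertical half-pel interpolations in one pass.
 */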
1760 void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
1761  ptrdiff_t stride)
1762 {
1763  /* The mmi optimization used the function ff_put_pixels16_8_mmi,
1764   * which is implemented in hpeldsp_mmi.c */
1765  put_pixels16_8_lsx(dst, src, stride);
1766 }
1767 
1768 void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
1769  ptrdiff_t stride)
1770 {
1771  uint8_t half[256];
1772 
1773  put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
1774  put_pixels16_l2_8_lsx(dst, src, half, stride, stride);
1775 }
1776 
1777 void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
1778  ptrdiff_t stride)
1779 {
1780  put_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride);
1781 }
1782 
1783 void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
1784  ptrdiff_t stride)
1785 {
1786  uint8_t half[256];
1787 
1788  put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
1789  put_pixels16_l2_8_lsx(dst, src + 1, half, stride, stride);
1790 }
1791 
1792 void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
1793  ptrdiff_t stride)
1794 {
1795  uint8_t half[256];
1796 
1797  put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
1798  put_pixels16_l2_8_lsx(dst, src, half, stride, stride);
1799 }
1800 
1801 void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
1802  ptrdiff_t stride)
1803 {
1804  avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2),
1805  dst, stride);
1806 }
1807 
1808 void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
1809  ptrdiff_t stride)
1810 {
1811  uint8_t temp[512];
1812  uint8_t *const halfH = temp;
1813  uint8_t *const halfHV = temp + 256;
1814 
1815  put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride);
1816  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1817  put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1818 }
1819 
1820 void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
1821  ptrdiff_t stride)
1822 {
1823  avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2) + 1,
1824  dst, stride);
1825 }
1826 
1827 void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
1828  ptrdiff_t stride)
1829 {
1830  put_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride);
1831 }
1832 
1833 void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
1834  ptrdiff_t stride)
1835 {
1836  uint8_t temp[512];
1837  uint8_t *const halfHV = temp;
1838  uint8_t *const halfH = temp + 256;
1839 
1840  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1841  put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride);
1842  put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1843 }
1844 
1845 void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
1846  ptrdiff_t stride)
1847 {
1848  put_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride);
1849 }
1850 
1851 void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
1852  ptrdiff_t stride)
1853 {
1854  uint8_t temp[512];
1855  uint8_t *const halfHV = temp;
1856  uint8_t *const halfH = temp + 256;
1857 
1858  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1859  put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride);
1860  put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1861 }
1862 
1863 void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
1864  ptrdiff_t stride)
1865 {
1866  uint8_t half[256];
1867 
1868  put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
1869  put_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride);
1870 }
1871 
1872 void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
1873  ptrdiff_t stride)
1874 {
1875  avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2, (uint8_t*)src - (stride * 2),
1876  dst, stride);
1877 }
1878 
1879 void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
1880  ptrdiff_t stride)
1881 {
1882  uint8_t temp[512];
1883  uint8_t *const halfH = temp;
1884  uint8_t *const halfHV = temp + 256;
1885 
1886  put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride);
1887  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1888  put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1889 }
1890 
1891 void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
1892  ptrdiff_t stride)
1893 {
1894  avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2,
1895  (uint8_t*)src - (stride * 2) + 1, dst, stride);
1896 }
1897 
1898 void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
1899  ptrdiff_t stride)
1900 {
1901  /* The mmi optimization used the function ff_avg_pixels16_8_mmi,
1902   * which is implemented in hpeldsp_mmi.c */
1903  avg_pixels16_8_lsx(dst, src, stride);
1904 }
1905 
1906 void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
1907  ptrdiff_t stride)
1908 {
1909  uint8_t half[256];
1910 
1911  put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
1912  avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
1913 }
1914 
1915 void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
1916  ptrdiff_t stride)
1917 {
1918  avg_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride);
1919 }
1920 
1921 void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
1922  ptrdiff_t stride)
1923 {
1924  uint8_t half[256];
1925 
1926  put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
1927  avg_pixels16_l2_8_lsx(dst, src + 1, half, stride, stride);
1928 }
1929 
1930 void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
1931  ptrdiff_t stride)
1932 {
1933  uint8_t half[256];
1934 
1935  put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
1936  avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
1937 }
1938 
1939 void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
1940  ptrdiff_t stride)
1941 {
1942  avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2,
1943  (uint8_t*)src - (stride * 2),
1944  dst, stride);
1945 }
1946 
1947 void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
1948  ptrdiff_t stride)
1949 {
1950  uint8_t temp[512];
1951  uint8_t *const halfH = temp;
1952  uint8_t *const halfHV = temp + 256;
1953 
1954  put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride);
1955  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1956  avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1957 }
1958 
1959 void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
1960  ptrdiff_t stride)
1961 {
1962  avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2,
1963  (uint8_t*)src - (stride * 2) + 1,
1964  dst, stride);
1965 }
1966 
1967 void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
1968  ptrdiff_t stride)
1969 {
1970  avg_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride);
1971 }
1972 
1973 void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
1974  ptrdiff_t stride)
1975 {
1976  uint8_t temp[512];
1977  uint8_t *const halfHV = temp;
1978  uint8_t *const halfH = temp + 256;
1979 
1980  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1981  put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride);
1982  avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1983 }
1984 
1985 void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
1986  ptrdiff_t stride)
1987 {
1988  avg_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride);
1989 }
1990 
1991 void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
1992  ptrdiff_t stride)
1993 {
1994  uint8_t temp[512];
1995  uint8_t *const halfHV = temp;
1996  uint8_t *const halfH = temp + 256;
1997 
1998  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1999  put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride);
2000  avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
2001 }
2002 
2003 void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
2004  ptrdiff_t stride)
2005 {
2006  uint8_t half[256];
2007 
2008  put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
2009  avg_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride);
2010 }
2011 
2012 void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
2013  ptrdiff_t stride)
2014 {
2015  avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2,
2016  (uint8_t*)src - (stride * 2),
2017  dst, stride);
2018 }
2019 
2020 void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
2021  ptrdiff_t stride)
2022 {
2023  uint8_t temp[512];
2024  uint8_t *const halfH = temp;
2025  uint8_t *const halfHV = temp + 256;
2026 
2027  put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride);
2028  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
2029  avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
2030 }
2031 
2032 void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
2033  ptrdiff_t stride)
2034 {
2035  avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2,
2036  (uint8_t*)src - (stride * 2) + 1,
2037  dst, stride);
2038 }
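/*
 * These entry points are not called directly by the decoder; they are
 * installed into the H264QpelContext function tables by the LoongArch init
 * code.  A minimal sketch of that wiring follows (guarded out because the
 * real definition lives in the init file; the init-function name, the
 * have_lasx() check and the table slots shown are assumptions here):
 */
#if 0
#include "libavcodec/h264qpel.h"
#include "libavutil/cpu.h"
#include "libavutil/loongarch/cpu.h"

av_cold void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_lasx(cpu_flags) && bit_depth == 8) {
        /* first index: 0 = 16x16, 1 = 8x8; second index: luma_xy */
        c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_lasx;
        c->put_h264_qpel_pixels_tab[0][2] = ff_put_h264_qpel16_mc20_lasx;
        c->avg_h264_qpel_pixels_tab[1][5] = ff_avg_h264_qpel8_mc11_lasx;
        /* the remaining quarter-pel positions follow the same pattern */
    }
}
#endif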