FFmpeg
vp8_mc_lsx.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021 Loongson Technology Corporation Limited
3  * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 #include "libavcodec/vp8dsp.h"
22 #include "vp8dsp_loongarch.h"
23 
/* Shuffle-control bytes for __lsx_vshuf_b: each byte pair selects two
 * horizontally adjacent source pixels, producing the interleaved operand
 * layout that the __lsx_vdp2*_h_b pairwise dot-product intrinsics expect.
 * Rows 2 and 3 index into a second source register (bytes >= 16). */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
32 
/* VP8 six-tap subpel filter coefficients for the seven fractional positions
 * (position 0 is full-pel and needs no filtering).  Rows are zero-padded to
 * 8 bytes so taps can be fetched as 16-bit pairs with __lsx_vldrepl_h at
 * offsets 0, 2 and 4.  Tap sums are 128, matching the rounding shift of 7
 * used by the filter macros below. */
static const int8_t subpel_filters_lsx[7][8] = {
    {-6, 123, 12, -1, 0, 0, 0, 0},
    {2, -11, 108, 36, -8, 1, 0, 0},  /* New 1/4 pel 6 tap filter */
    {-9, 93, 50, -6, 0, 0, 0, 0},
    {3, -16, 77, 77, -16, 3, 0, 0},  /* New 1/2 pel 6 tap filter */
    {-6, 50, 93, -9, 0, 0, 0, 0},
    {1, -8, 36, 108, -11, 2, 0, 0},  /* New 1/4 pel 6 tap filter */
    {-1, 12, 123, -6, 0, 0, 0, 0},
};
42 
/* Three-stage pairwise multiply-accumulate:
 *   in0*coeff0 + in1*coeff1 + in2*coeff2
 * where each inN holds interleaved signed byte pairs and each coeffN a
 * replicated 16-bit pair of filter taps.  Evaluates (GNU statement
 * expression) to a vector of signed 16-bit sums — the core of the
 * 6-tap filters. */
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)  \
( {                                                          \
    __m128i out0_m;                                          \
                                                             \
    out0_m = __lsx_vdp2_h_b(in0, coeff0);                    \
    out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1);         \
    out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2);         \
                                                             \
    out0_m;                                                  \
} )
53 
/* Three byte shuffles: outN = __lsx_vshuf_b(in(2N+1), in(2N), maskN).
 * Gathers the three sliding two-pixel windows needed by one 6-tap
 * horizontal filter pass. */
#define VSHF_B3_SB(in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                   out0, out1, out2)                                   \
{                                                                      \
    DUP2_ARG3(__lsx_vshuf_b, in1, in0, mask0, in3, in2, mask1,         \
              out0, out1);                                             \
    out2 = __lsx_vshuf_b(in5, in4, mask2);                             \
}
61 
/* Horizontal 6-tap filter of one row (sign-biased input in src0/src1):
 * shuffle out the three tap windows, accumulate the three pairwise dot
 * products, round with a right shift of 7 (vsrari) and saturate to the
 * signed 8-bit range (vsat_h, 7) so the result can later be packed back
 * to bytes.  Evaluates to the filtered 16-bit vector. */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                 \
                        filt_h0, filt_h1, filt_h2)                       \
( {                                                                      \
    __m128i vec0_m, vec1_m, vec2_m;                                      \
    __m128i hz_out_m;                                                    \
                                                                         \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,  \
               vec0_m, vec1_m, vec2_m);                                  \
    hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m,                      \
                            filt_h0, filt_h1, filt_h2);                  \
                                                                         \
    hz_out_m = __lsx_vsrari_h(hz_out_m, 7);                              \
    hz_out_m = __lsx_vsat_h(hz_out_m, 7);                                \
                                                                         \
    hz_out_m;                                                            \
} )
78 
/* Horizontal 6-tap filter of four 8-pixel rows at once.  For each row the
 * three tap windows are shuffled out of the row itself (src self-shuffle),
 * then accumulated with vdp2/vdp2add into 16-bit sums out0..out3.  Unlike
 * HORIZ_6TAP_FILT, no rounding or saturation is applied here — the caller
 * narrows with __lsx_vssrarni_b_h. */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                     \
                                   mask0, mask1, mask2,                        \
                                   filt0, filt1, filt2,                        \
                                   out0, out1, out2, out3)                     \
{                                                                              \
    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
                                                                               \
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2, \
              mask0, src3, src3, mask0, vec0_m, vec1_m, vec2_m, vec3_m);       \
    DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0,     \
              vec3_m, filt0, out0, out1, out2, out3);                          \
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2, \
              mask1, src3, src3, mask1, vec0_m, vec1_m, vec2_m, vec3_m);       \
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2, \
              mask2, src3, src3, mask2, vec4_m, vec5_m, vec6_m, vec7_m);       \
    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1,     \
              out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, out3); \
    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2,     \
              out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, out3); \
}
99 
/* Two-stage pairwise multiply-accumulate for the 4-tap filters:
 *   vec0*filt0 + vec1*filt1
 * with interleaved signed byte pairs and replicated 16-bit tap pairs.
 * Evaluates to a vector of signed 16-bit sums. */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)  \
( {                                                    \
    __m128i tmp0;                                      \
                                                       \
    tmp0 = __lsx_vdp2_h_b(vec0, filt0);                \
    tmp0 = __lsx_vdp2add_h_b(tmp0, vec1, filt1);       \
                                                       \
    tmp0;                                              \
} )
109 
/* Horizontal 4-tap filter of one row: shuffle out the two tap windows,
 * accumulate the two pairwise dot products, round (shift right 7 with
 * rounding) and saturate to the signed 8-bit range, mirroring
 * HORIZ_6TAP_FILT.  Evaluates to the filtered 16-bit vector. */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)  \
( {                                                                  \
    __m128i vec0_m, vec1_m;                                          \
    __m128i hz_out_m;                                                \
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1,   \
              vec0_m, vec1_m);                                       \
    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \
                                                                     \
    hz_out_m = __lsx_vsrari_h(hz_out_m, 7);                          \
    hz_out_m = __lsx_vsat_h(hz_out_m, 7);                            \
                                                                     \
    hz_out_m;                                                        \
} )
123 
124 void ff_put_vp8_epel8_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
125  const uint8_t *src, ptrdiff_t src_stride,
126  int height, int mx, int my)
127 {
128  uint32_t loop_cnt;
129  const int8_t *filter = subpel_filters_lsx[mx - 1];
130  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
131  __m128i mask0, mask1, mask2;
132  __m128i out0, out1, out2, out3;
133 
134  ptrdiff_t src_stride2 = src_stride << 1;
135  ptrdiff_t src_stride3 = src_stride2 + src_stride;
136  ptrdiff_t src_stride4 = src_stride2 << 1;
137 
138  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
139  src -= 2;
140 
141  /* rearranging filter */
142  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
143  filt2 = __lsx_vldrepl_h(filter, 4);
144 
145  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
146 
147  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
148  src + src_stride3, 0, src0, src1, src2, src3);
149  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
150  src0, src1, src2, src3);
151  src += src_stride4;
152  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
153  filt0, filt1, filt2, out0, out1, out2, out3);
154 
155  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
156  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
157  __lsx_vstelm_d(out0, dst, 0, 0);
158  dst += dst_stride;
159  __lsx_vstelm_d(out0, dst, 0, 1);
160  dst += dst_stride;
161  __lsx_vstelm_d(out1, dst, 0, 0);
162  dst += dst_stride;
163  __lsx_vstelm_d(out1, dst, 0, 1);
164  dst += dst_stride;
165 
166  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
167  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
168  src + src_stride3, 0, src0, src1, src2, src3);
169  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
170  src0, src1, src2, src3);
171  src += src_stride4;
172  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
173  filt0, filt1, filt2, out0, out1, out2, out3);
174 
175  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
176  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
177 
178  __lsx_vstelm_d(out0, dst, 0, 0);
179  dst += dst_stride;
180  __lsx_vstelm_d(out0, dst, 0, 1);
181  dst += dst_stride;
182  __lsx_vstelm_d(out1, dst, 0, 0);
183  dst += dst_stride;
184  __lsx_vstelm_d(out1, dst, 0, 1);
185  dst += dst_stride;
186  }
187 }
188 
/* 16-wide horizontal 6-tap VP8 subpel interpolation (put).
 * mx (1..7) selects the horizontal filter; my is unused.
 * Each iteration filters four rows; every row is processed as two
 * 8-pixel halves loaded at byte offsets 0 and 8 (even-numbered srcN hold
 * the left halves, odd-numbered the right halves).  height is assumed to
 * be a multiple of 4. */
void ff_put_vp8_epel16_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                              const uint8_t *src, ptrdiff_t src_stride,
                              int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[mx - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1;
    __m128i filt2, mask0, mask1, mask2;
    __m128i out0, out1, out2, out3, out4, out5, out6, out7;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 2; /* center the 6-tap window on the output pixel */
    /* rearranging filter */
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    filt2 = __lsx_vldrepl_h(filter, 4);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* left halves of four rows ... */
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
                  0, src + src_stride3, 0, src0, src2, src4, src6);
        /* ... and the matching right halves at offset 8 */
        DUP4_ARG2(__lsx_vld, src, 8, src + src_stride, 8, src + src_stride2,
                  8, src + src_stride3, 8, src1, src3, src5, src7);

        /* bias unsigned pixels to signed for the signed dot products */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128,
                  src4, src5, src6, src7);
        src += src_stride4;

        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   filt0, filt1, filt2, out0, out1, out2, out3);
        HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   filt0, filt1, filt2, out4, out5, out6, out7);
        /* rounding narrow by 7 with saturation, then undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        dst += dst_stride;
        __lsx_vst(out1, dst, 0);
        dst += dst_stride;

        DUP2_ARG3(__lsx_vssrarni_b_h, out5, out4, 7, out7, out6, 7, out4, out5);
        DUP2_ARG2(__lsx_vxori_b, out4, 128, out5, 128, out4, out5);
        __lsx_vst(out4, dst, 0);
        dst += dst_stride;
        __lsx_vst(out5, dst, 0);
        dst += dst_stride;
    }
}
242 
/* 8-wide vertical 6-tap VP8 subpel interpolation (put).
 * my (1..7) selects the vertical filter; mx is unused.  Five rows are
 * preloaded (src0..src4, starting two rows above dst's first row), then
 * each iteration consumes four new rows.  The names src7..src10 continue
 * the row numbering of the reference code: src7 is the row immediately
 * after src4, so src76_l really interleaves rows (4,5), etc.
 * height is assumed to be a multiple of 4. */
void ff_put_vp8_epel8_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                             const uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10;
    __m128i src10_l, src32_l, src76_l, src98_l, src21_l, src43_l, src87_l;
    __m128i src109_l, filt0, filt1, filt2;
    __m128i out0_l, out1_l, out2_l, out3_l;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    src -= src_stride2; /* start two rows above for the 6-tap window */
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    filt2 = __lsx_vldrepl_h(filter, 4);

    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
              src + src_stride3, 0, src0, src1, src2, src3);
    src += src_stride4;
    src4 = __lsx_vld(src, 0);
    src += src_stride;

    /* bias unsigned pixels to signed for the signed dot products */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    src4 = __lsx_vxori_b(src4, 128);

    /* interleave vertically adjacent rows into byte pairs for vdp2 */
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4,
              src3, src10_l, src32_l, src21_l, src43_l);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
                  0, src + src_stride3, 0, src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
                  128, src7, src8, src9, src10);
        src += src_stride4;

        DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10,
                  src9, src76_l, src87_l, src98_l, src109_l);

        /* each output row accumulates three interleaved row pairs */
        out0_l = DPADD_SH3_SH(src10_l, src32_l, src76_l, filt0, filt1, filt2);
        out1_l = DPADD_SH3_SH(src21_l, src43_l, src87_l, filt0, filt1, filt2);
        out2_l = DPADD_SH3_SH(src32_l, src76_l, src98_l, filt0, filt1, filt2);
        out3_l = DPADD_SH3_SH(src43_l, src87_l, src109_l, filt0, filt1, filt2);

        /* rounding narrow by 7 with saturation, then undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1_l, out0_l, 7, out3_l, out2_l, 7,
                  out0_l, out1_l);
        DUP2_ARG2(__lsx_vxori_b, out0_l, 128, out1_l, 128, out0_l, out1_l);

        __lsx_vstelm_d(out0_l, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0_l, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1_l, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1_l, dst, 0, 1);
        dst += dst_stride;

        /* slide the 6-row window down by four rows */
        src10_l = src76_l;
        src32_l = src98_l;
        src21_l = src87_l;
        src43_l = src109_l;
        src4 = src10;
    }
}
309 
/* 16-wide vertical 6-tap VP8 subpel interpolation (put).
 * my (1..7) selects the vertical filter; mx is unused.  Like the 8-wide
 * version, but each 16-byte row is interleaved into low (_l) and high (_h)
 * halves so the 16-bit accumulation covers all 16 pixels.
 * height is assumed to be a multiple of 4. */
void ff_put_vp8_epel16_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                              const uint8_t *src, ptrdiff_t src_stride,
                              int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i src10_l, src32_l, src54_l, src76_l, src21_l, src43_l, src65_l, src87_l;
    __m128i src10_h, src32_h, src54_h, src76_h, src21_h, src43_h, src65_h, src87_h;
    __m128i filt0, filt1, filt2;
    __m128i tmp0, tmp1, tmp2, tmp3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    filt2 = __lsx_vldrepl_h(filter, 4);

    /* preload five rows, starting two rows above the first output row */
    DUP4_ARG2(__lsx_vld, src - src_stride2, 0, src - src_stride, 0, src, 0,
              src + src_stride, 0, src0, src1, src2, src3);
    src4 = __lsx_vld(src + src_stride2, 0);
    src += src_stride3;

    /* bias unsigned pixels to signed for the signed dot products */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
              src1, src2, src3);
    src4 = __lsx_vxori_b(src4, 128);

    /* interleave adjacent rows: low and high byte halves separately */
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1,
              src10_l, src32_l, src43_l, src21_l);
    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1,
              src10_h, src32_h, src43_h, src21_h);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src5, src6, src7, src8);
        src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
                  src5, src6, src7, src8);

        DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
                  src54_l, src65_l, src76_l, src87_l);
        DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7,
                  src54_h, src65_h, src76_h, src87_h);

        /* rows 0 and 1 of this group (low/high halves) */
        tmp0 = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        tmp1 = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        tmp2 = DPADD_SH3_SH(src10_h, src32_h, src54_h, filt0, filt1, filt2);
        tmp3 = DPADD_SH3_SH(src21_h, src43_h, src65_h, filt0, filt1, filt2);

        /* rounding narrow by 7 with saturation, then undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* rows 2 and 3 of this group */
        tmp0 = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        tmp1 = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        tmp2 = DPADD_SH3_SH(src32_h, src54_h, src76_h, filt0, filt1, filt2);
        tmp3 = DPADD_SH3_SH(src43_h, src65_h, src87_h, filt0, filt1, filt2);

        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* slide the 6-row window down by four rows */
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src10_h = src54_h;
        src32_h = src76_h;
        src21_h = src65_h;
        src43_h = src87_h;
        src4 = src8;
    }
}
390 
/* 8-wide 2-D (horizontal 6-tap then vertical 6-tap) VP8 subpel
 * interpolation (put).  mx selects the horizontal filter, my the vertical
 * one (both 1..7).  Five rows are first horizontally filtered into the
 * saturated 16-bit intermediates hz_out0..hz_out4; the vertical pass then
 * packs adjacent intermediates back to bytes (vpackev_b) and runs the
 * 6-tap accumulation on them.  height is assumed to be a multiple of 4. */
void ff_put_vp8_epel8_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
    const int8_t *filter_vert = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i filt_hz0, filt_hz1, filt_hz2;
    __m128i mask0, mask1, mask2, filt_vt0, filt_vt1, filt_vt2;
    __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
    __m128i tmp0, tmp1, tmp2, tmp3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    /* back up 2 columns and 2 rows to center both 6-tap windows */
    src -= (2 + src_stride2);

    /* rearranging filter */
    DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1);
    filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
              src + src_stride3, 0, src0, src1, src2, src3);
    src += src_stride4;
    src4 = __lsx_vld(src, 0);
    src += src_stride;

    /* bias unsigned pixels to signed for the signed dot products */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    src4 = __lsx_vxori_b(src4, 128);

    /* horizontal pass over the five preloaded rows */
    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);

    DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1);
    filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);

    /* pack vertically adjacent intermediates into byte pairs */
    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
    DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src5, src6, src7, src8);
        src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
                  src5, src6, src7, src8);

        /* filter each new row horizontally, then feed the vertical 6-tap */
        hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out2 = __lsx_vpackev_b(hz_out5, hz_out4);
        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

        hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out5 = __lsx_vpackev_b(hz_out6, hz_out5);
        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

        hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);

        out7 = __lsx_vpackev_b(hz_out7, hz_out6);
        tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);

        hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out6 = __lsx_vpackev_b(hz_out8, hz_out7);
        tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);

        /* rounding narrow by 7 with saturation, then undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vstelm_d(tmp0, dst, 0, 0);

        dst += dst_stride;
        __lsx_vstelm_d(tmp0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 1);
        dst += dst_stride;

        /* rotate the packed-pair window for the next four rows */
        hz_out4 = hz_out8;
        out0 = out2;
        out1 = out7;
        out3 = out5;
        out4 = out6;
    }
}
492 
/* 16-wide 2-D (h6, v6) subpel interpolation: handled as two independent
 * 8-wide columns via the 8-wide kernel. */
void ff_put_vp8_epel16_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int32_t half;

    for (half = 0; half < 2; half++) {
        ff_put_vp8_epel8_h6v6_lsx(dst + 8 * half, dst_stride,
                                  src + 8 * half, src_stride,
                                  height, mx, my);
    }
}
505 
/* 8-wide vertical 4-tap VP8 subpel interpolation (put).
 * my selects the vertical filter (only its first four taps are used);
 * mx is unused.  Three rows are preloaded (starting one row above),
 * then each iteration consumes four new rows.  The names src7..src10
 * continue the reference code's row numbering: src7 immediately follows
 * src2, so src72_l interleaves rows (2,3).  height is assumed to be a
 * multiple of 4. */
void ff_put_vp8_epel8_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                             const uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src7, src8, src9, src10;
    __m128i src10_l, src72_l, src98_l, src21_l, src87_l, src109_l, filt0, filt1;
    __m128i out0, out1, out2, out3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    src -= src_stride; /* start one row above for the 4-tap window */

    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
    src2 = __lsx_vld(src + src_stride2, 0);
    src += src_stride3;

    /* bias unsigned pixels to signed for the signed dot products */
    DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
    src2 = __lsx_vxori_b(src2, 128);
    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src7, src8, src9, src10);
        src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9,
                  src72_l, src87_l, src98_l, src109_l);

        /* each output row accumulates two interleaved row pairs */
        out0 = FILT_4TAP_DPADD_S_H(src10_l, src72_l, filt0, filt1);
        out1 = FILT_4TAP_DPADD_S_H(src21_l, src87_l, filt0, filt1);
        out2 = FILT_4TAP_DPADD_S_H(src72_l, src98_l, filt0, filt1);
        out3 = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1);
        /* rounding narrow by 7 with saturation, then undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);

        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* slide the 4-row window down by four rows */
        src10_l = src98_l;
        src21_l = src109_l;
        src2 = src10;
    }
}
562 
/* 16-wide vertical 4-tap VP8 subpel interpolation (put).
 * my selects the vertical filter; mx is unused.  Like the 8-wide version,
 * but each 16-byte row is interleaved into low (_l) and high (_h) halves
 * so the 16-bit accumulation covers all 16 pixels.  height is assumed to
 * be a multiple of 4. */
void ff_put_vp8_epel16_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                              const uint8_t *src, ptrdiff_t src_stride,
                              int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6;
    __m128i src10_l, src32_l, src54_l, src21_l, src43_l, src65_l, src10_h;
    __m128i src32_h, src54_h, src21_h, src43_h, src65_h, filt0, filt1;
    __m128i tmp0, tmp1, tmp2, tmp3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    src -= src_stride; /* start one row above for the 4-tap window */
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
    src2 = __lsx_vld(src + src_stride2, 0);
    src += src_stride3;

    /* bias unsigned pixels to signed for the signed dot products */
    DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
    src2 = __lsx_vxori_b(src2, 128);
    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_l, src21_l);
    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_h, src21_h);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
                  0, src + src_stride3, 0, src3, src4, src5, src6);
        src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128,
                  src3, src4, src5, src6);
        DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6,
                  src5, src32_l, src43_l, src54_l, src65_l);
        DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6,
                  src5, src32_h, src43_h, src54_h, src65_h);

        /* rows 0 and 1 of this group (low/high halves) */
        tmp0 = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
        tmp1 = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
        tmp2 = FILT_4TAP_DPADD_S_H(src10_h, src32_h, filt0, filt1);
        tmp3 = FILT_4TAP_DPADD_S_H(src21_h, src43_h, filt0, filt1);
        /* rounding narrow by 7 with saturation, then undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);

        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* rows 2 and 3 of this group */
        tmp0 = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
        tmp1 = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
        tmp2 = FILT_4TAP_DPADD_S_H(src32_h, src54_h, filt0, filt1);
        tmp3 = FILT_4TAP_DPADD_S_H(src43_h, src65_h, filt0, filt1);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);

        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* slide the 4-row window down by four rows */
        src10_l = src54_l;
        src21_l = src65_l;
        src10_h = src54_h;
        src21_h = src65_h;
        src2 = src6;
    }
}
632 
/* 8-wide 2-D (horizontal 6-tap, vertical 4-tap) VP8 subpel interpolation
 * (put).  mx selects the horizontal filter, my the vertical one.  Three
 * rows are horizontally filtered into saturated 16-bit intermediates
 * hz_out0..hz_out2; the vertical 4-tap pass packs adjacent intermediates
 * into byte pairs (vpackev_b) and accumulates two pairs per output row.
 * The vecN registers rotate across iterations so only the four newest
 * pairs are kept.  height is assumed to be a multiple of 4. */
void ff_put_vp8_epel8_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
    const int8_t *filter_vert = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6;
    __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
    __m128i filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
    __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    /* back up 2 columns (h6 window) and 1 row (v4 window) */
    src -= (2 + src_stride);

    /* rearranging filter */
    DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1);
    filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
    src2 = __lsx_vld(src + src_stride2, 0);
    src += src_stride3;

    /* bias unsigned pixels to signed for the signed dot products */
    DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
    src2 = __lsx_vxori_b(src2, 128);
    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);

    DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src3, src4, src5, src6);
        src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128,
                  src3, src4, src5, src6);

        /* filter each new row horizontally, then feed the vertical 4-tap;
         * note hz_out0..hz_out2 are reused for the new rows so the rotation
         * below stays register-cheap */
        hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);

        hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);

        hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2);
        tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

        /* rounding narrow by 7 with saturation, then undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);

        __lsx_vstelm_d(tmp0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 1);
        dst += dst_stride;
    }
}
715 
/* 16-wide 2-D (h6, v4) subpel interpolation: handled as two independent
 * 8-wide columns via the 8-wide kernel. */
void ff_put_vp8_epel16_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int32_t half;

    for (half = 0; half < 2; half++) {
        ff_put_vp8_epel8_h6v4_lsx(dst + 8 * half, dst_stride,
                                  src + 8 * half, src_stride,
                                  height, mx, my);
    }
}
729 
/* 8-wide 2-D (horizontal 4-tap, vertical 6-tap) VP8 subpel interpolation
 * (put).  mx selects the horizontal filter, my the vertical one.  Five
 * rows are horizontally 4-tap filtered into saturated 16-bit
 * intermediates tmp0..tmp4; the vertical 6-tap pass packs adjacent
 * intermediates into byte pairs (vpackev_b) and accumulates three pairs
 * per output row, rotating the outN pair registers between iterations.
 * height is assumed to be a multiple of 4. */
void ff_put_vp8_epel8_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
    const int8_t *filter_vert = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i filt_hz0, filt_hz1, mask0, mask1;
    __m128i filt_vt0, filt_vt1, filt_vt2;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
    __m128i out0, out1, out2, out3, out4, out5, out6, out7;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    /* back up 1 column (h4 window) and 2 rows (v6 window) */
    src -= (1 + src_stride2);

    /* rearranging filter */
    DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1);
    mask1 = __lsx_vaddi_bu(mask0, 2);

    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
              src + src_stride3, 0, src0, src1, src2, src3);
    src += src_stride4;
    src4 = __lsx_vld(src, 0);
    src += src_stride;

    /* bias unsigned pixels to signed for the signed dot products */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    src4 = __lsx_vxori_b(src4, 128);

    /* horizontal pass over the five preloaded rows */
    tmp0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
    tmp1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
    tmp2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
    tmp3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
    tmp4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);

    /* pack vertically adjacent intermediates into byte pairs */
    DUP4_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp2, tmp1,
              tmp4, tmp3, out0, out1, out3, out4);

    DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1);
    filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src5, src6, src7, src8);
        src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
                  src5, src6, src7, src8);

        /* filter each new row horizontally, then feed the vertical 6-tap */
        tmp5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
        out2 = __lsx_vpackev_b(tmp5, tmp4);
        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

        tmp6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
        out5 = __lsx_vpackev_b(tmp6, tmp5);
        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

        tmp7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
        out6 = __lsx_vpackev_b(tmp7, tmp6);
        tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);

        tmp8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
        out7 = __lsx_vpackev_b(tmp8, tmp7);
        tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);

        /* rounding narrow by 7 with saturation, then undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);

        __lsx_vstelm_d(tmp0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 1);
        dst += dst_stride;

        /* rotate the packed-pair window for the next four rows */
        tmp4 = tmp8;
        out0 = out2;
        out1 = out6;
        out3 = out5;
        out4 = out7;
    }
}
819 
/* 16-wide 2-D (h4, v6) subpel interpolation: handled as two independent
 * 8-wide columns via the 8-wide kernel. */
void ff_put_vp8_epel16_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int32_t half;

    for (half = 0; half < 2; half++) {
        ff_put_vp8_epel8_h4v6_lsx(dst + 8 * half, dst_stride,
                                  src + 8 * half, src_stride,
                                  height, mx, my);
    }
}
833 
834 void ff_put_vp8_pixels8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
835  const uint8_t *src, ptrdiff_t src_stride,
836  int height, int mx, int my)
837 {
838  int32_t cnt;
839  __m128i src0, src1, src2, src3;
840 
841  ptrdiff_t src_stride2 = src_stride << 1;
842  ptrdiff_t src_stride3 = src_stride2 + src_stride;
843  ptrdiff_t src_stride4 = src_stride2 << 1;
844 
845  if (0 == height % 8) {
846  for (cnt = height >> 3; cnt--;) {
847  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
848  src + src_stride3, 0, src0, src1, src2, src3);
849  src += src_stride4;
850 
851  __lsx_vstelm_d(src0, dst, 0, 0);
852  dst += dst_stride;
853  __lsx_vstelm_d(src1, dst, 0, 0);
854  dst += dst_stride;
855  __lsx_vstelm_d(src2, dst, 0, 0);
856  dst += dst_stride;
857  __lsx_vstelm_d(src3, dst, 0, 0);
858  dst += dst_stride;
859 
860  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
861  src + src_stride3, 0, src0, src1, src2, src3);
862  src += src_stride4;
863 
864  __lsx_vstelm_d(src0, dst, 0, 0);
865  dst += dst_stride;
866  __lsx_vstelm_d(src1, dst, 0, 0);
867  dst += dst_stride;
868  __lsx_vstelm_d(src2, dst, 0, 0);
869  dst += dst_stride;
870  __lsx_vstelm_d(src3, dst, 0, 0);
871  dst += dst_stride;
872  }
873  } else if( 0 == height % 4) {
874  for (cnt = (height >> 2); cnt--;) {
875  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
876  src + src_stride3, 0, src0, src1, src2, src3);
877  src += src_stride4;
878 
879  __lsx_vstelm_d(src0, dst, 0, 0);
880  dst += dst_stride;
881  __lsx_vstelm_d(src1, dst, 0, 0);
882  dst += dst_stride;
883  __lsx_vstelm_d(src2, dst, 0, 0);
884  dst += dst_stride;
885  __lsx_vstelm_d(src3, dst, 0, 0);
886  dst += dst_stride;
887  }
888  }
889 }
890 
891 void ff_put_vp8_pixels16_lsx(uint8_t *dst, ptrdiff_t dst_stride,
892  const uint8_t *src, ptrdiff_t src_stride,
893  int height, int mx, int my)
894 {
895  int32_t width = 16;
896  int32_t cnt, loop_cnt;
897  const uint8_t *src_tmp;
898  uint8_t *dst_tmp;
899  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
900 
901  ptrdiff_t src_stride2 = src_stride << 1;
902  ptrdiff_t src_stride3 = src_stride2 + src_stride;
903  ptrdiff_t src_stride4 = src_stride2 << 1;
904 
905  ptrdiff_t dst_stride2 = dst_stride << 1;
906  ptrdiff_t dst_stride3 = dst_stride2 + dst_stride;
907  ptrdiff_t dst_stride4 = dst_stride2 << 1;
908 
909  if (0 == height % 8) {
910  for (cnt = (width >> 4); cnt--;) {
911  src_tmp = src;
912  dst_tmp = dst;
913  for (loop_cnt = (height >> 3); loop_cnt--;) {
914  DUP4_ARG2(__lsx_vld, src_tmp, 0, src_tmp + src_stride, 0,
915  src_tmp + src_stride2, 0, src_tmp + src_stride3, 0,
916  src4, src5, src6, src7);
917  src_tmp += src_stride4;
918 
919  __lsx_vst(src4, dst_tmp, 0);
920  __lsx_vst(src5, dst_tmp + dst_stride, 0);
921  __lsx_vst(src6, dst_tmp + dst_stride2, 0);
922  __lsx_vst(src7, dst_tmp + dst_stride3, 0);
923  dst_tmp += dst_stride4;
924 
925  DUP4_ARG2(__lsx_vld, src_tmp, 0, src_tmp + src_stride, 0,
926  src_tmp + src_stride2, 0, src_tmp + src_stride3, 0,
927  src4, src5, src6, src7);
928  src_tmp += src_stride4;
929 
930  __lsx_vst(src4, dst_tmp, 0);
931  __lsx_vst(src5, dst_tmp + dst_stride, 0);
932  __lsx_vst(src6, dst_tmp + dst_stride2, 0);
933  __lsx_vst(src7, dst_tmp + dst_stride3, 0);
934  dst_tmp += dst_stride4;
935  }
936  src += 16;
937  dst += 16;
938  }
939  } else if (0 == height % 4) {
940  for (cnt = (height >> 2); cnt--;) {
941  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
942  src + src_stride3, 0, src0, src1, src2, src3);
943  src += 4 * src_stride4;
944 
945  __lsx_vst(src0, dst, 0);
946  __lsx_vst(src1, dst + dst_stride, 0);
947  __lsx_vst(src2, dst + dst_stride2, 0);
948  __lsx_vst(src3, dst + dst_stride3, 0);
949  dst += dst_stride4;
950  }
951  }
952 }
ff_put_vp8_epel16_h6v6_lsx
void ff_put_vp8_epel16_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:493
vp8dsp_loongarch.h
ff_put_vp8_epel8_h6_lsx
void ff_put_vp8_epel8_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:124
src1
const pixel * src1
Definition: h264pred_template.c:421
HORIZ_6TAP_FILT
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, filt_h2)
Definition: vp8_mc_lsx.c:62
ff_put_vp8_epel16_h6_lsx
void ff_put_vp8_epel16_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:189
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
subpel_filters_lsx
static const int8_t subpel_filters_lsx[7][8]
Definition: vp8_mc_lsx.c:33
DUP2_ARG2
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:58
HORIZ_4TAP_FILT
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)
Definition: vp8_mc_lsx.c:110
vp8dsp.h
ff_put_vp8_epel16_h6v4_lsx
void ff_put_vp8_epel16_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:716
DUP4_ARG2
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:76
ff_put_vp8_epel16_v6_lsx
void ff_put_vp8_epel16_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:310
width
#define width
FILT_4TAP_DPADD_S_H
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)
Definition: vp8_mc_lsx.c:100
mc_filt_mask_arr
static const uint8_t mc_filt_mask_arr[16 *3]
Definition: vp8_mc_lsx.c:24
ff_put_vp8_pixels16_lsx
void ff_put_vp8_pixels16_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:891
ff_put_vp8_epel16_h4v6_lsx
void ff_put_vp8_epel16_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:820
ff_put_vp8_epel8_v6_lsx
void ff_put_vp8_epel8_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:243
HORIZ_6TAP_8WID_4VECS_FILT
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, filt1, filt2, out0, out1, out2, out3)
Definition: vp8_mc_lsx.c:79
ff_put_vp8_epel16_v4_lsx
void ff_put_vp8_epel16_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:563
DUP2_ARG3
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:64
height
#define height
ff_put_vp8_epel8_h6v4_lsx
void ff_put_vp8_epel8_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:633
src2
const pixel * src2
Definition: h264pred_template.c:422
ff_put_vp8_epel8_h6v6_lsx
void ff_put_vp8_epel8_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:391
DPADD_SH3_SH
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)
Definition: vp8_mc_lsx.c:43
src0
const pixel *const src0
Definition: h264pred_template.c:420
loongson_intrinsics.h
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
int32_t
int32_t
Definition: audioconvert.c:56
ff_put_vp8_epel8_h4v6_lsx
void ff_put_vp8_epel8_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:730
ff_put_vp8_epel8_v4_lsx
void ff_put_vp8_epel8_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:506
ff_put_vp8_pixels8_lsx
void ff_put_vp8_pixels8_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:834