/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "hevcdsp_mips.h"
#include "hevc_macros_msa.h"

static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

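/*
 * HORIZ_8TAP_4WID_4VECS_FILT applies the horizontal 8-tap filter to four
 * 4-pixel-wide rows at once: each VSHF_B2_SB gathers the source bytes for
 * one pair of taps, and DOTP_SB2_SH/DPADD_SB2_SH accumulate the signed
 * products into 16-bit sums (out0 covers rows 0-1, out1 rows 2-3).
 *
 * Illustrative scalar equivalent of one filtered row (a sketch for reference
 * only, not part of the build; the rounding and clipping shown here are
 * performed later by SRARI/SAT/PCKEV in the callers, and the vector path
 * saturates intermediate sums to 16 bits):
 *
 *     for (x = 0; x < width; x++) {
 *         int k, sum = 0;
 *         for (k = 0; k < 8; k++)
 *             sum += src[x + k - 3] * filter[k];
 *         dst[x] = av_clip_uint8((sum + 32) >> 6);
 *     }
 */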
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                \
                                   mask0, mask1, mask2, mask3,            \
                                   filt0, filt1, filt2, filt3,            \
                                   out0, out1)                            \
{                                                                         \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
                                                                          \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);     \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);     \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);     \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);     \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);               \
}

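/*
 * 8-pixel-wide variant: each source vector is shuffled against itself, so
 * every row gets its own 8x16-bit accumulator (out0..out3). The taps are
 * applied in 0, 2, 1, 3 order over two alternating sets of temporaries,
 * presumably to shorten the dependency chains between the DPADD steps.
 */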
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2, mask3,               \
                                   filt0, filt1, filt2, filt3,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
                 out0, out1, out2, out3);                                    \
}

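/*
 * 4-tap counterparts of the two macros above: only two tap pairs
 * (mask0/filt0 and mask1/filt1) are gathered and accumulated.
 */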
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,            \
                                   mask0, mask1, filt0, filt1,        \
                                   out0, out1)                        \
{                                                                     \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                             \
                                                                      \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);            \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);           \
}

#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, filt0, filt1,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
}

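/*
 * Unfiltered full-pel copies. Each width (8/12/16/24/32/48/64) moves whole
 * rows with the widest loads/stores available; the odd heights handled here
 * (2, 6, 12) are peeled off explicitly so the main loops can always run four
 * or eight rows per iteration.
 */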
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if (2 == height) {
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (6 == height) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            LD4(src, src_stride, out4, out5, out6, out7);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width12_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    /* height is not consulted: exactly two 8-row batches (16 rows) are
     * copied; the 12-wide callers appear to only use height 16 */
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}

static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (12 == height) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        dst += (8 * dst_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
                   dst_stride);
            dst += (8 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width24_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    for (cnt = 4; cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD4(src + 16, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        LD4(src + 16, src_stride, out4, out5, out6, out7);
        src += (4 * src_stride);

        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
        dst += (4 * dst_stride);
        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width48_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
        src += (4 * src_stride);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_UB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        LD_UB4(src, 16, src8, src9, src10, src11);
        src += src_stride;
        LD_UB4(src, 16, src12, src13, src14, src15);
        src += src_stride;

        ST_UB4(src0, src1, src2, src3, dst, 16);
        dst += dst_stride;
        ST_UB4(src4, src5, src6, src7, dst, 16);
        dst += dst_stride;
        ST_UB4(src8, src9, src10, src11, dst, 16);
        dst += dst_stride;
        ST_UB4(src12, src13, src14, src15, dst, 16);
        dst += dst_stride;
    }
}

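/*
 * Horizontal 8-tap luma filtering with unidirectional 8-bit output. All
 * variants follow the same pattern: load, XOR with 128 to move the unsigned
 * pixels into signed-byte range, shuffle + dot-product through the macros
 * above, round with SRARI (shift 6), saturate, then PCKEV_XORI128_UB to pack
 * back to unsigned bytes.
 */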
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_hz_8t_4x16_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
    dst += (8 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}

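/*
 * The wider horizontal variants inline the shuffle/dot-product sequence (8w)
 * or split each row into 8+4 (12w), 16+8 (24w), 2x16 (32w), 3x16 (48w) and
 * 4x16 (64w) column groups, using mask offsets 8..14 (mask4..mask7) to
 * gather pixels that straddle two adjacent 16-byte loads.
 */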
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_8t_12w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
    v16u8 tmp0, tmp1, tmp2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask00 = LD_UB(&ff_hevc_mask_arr[0]);
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);

    src = src - 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* 8 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        /* 4 width */
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                    out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
                     out1, out2, out3);

        /* 4 width */
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        tmp2 = PCKEV_XORI128_UB(out4, out5);

        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        LD_SB2(src, src_stride, src4, src6);
        LD_SB2(src + 8, src_stride, src5, src7);
        src += (2 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_24w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 out0, out1, out2, out3, out8, out9, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                    out8, out2, out9);
        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
        SRARI_H4_SH(out0, out8, out2, out9, 6);
        SRARI_H2_SH(out1, out3, 6);
        SAT_SH4_SH(out0, out8, out2, out9, 7);
        SAT_SH2_SH(out1, out3, 7);
        out = PCKEV_XORI128_UB(out8, out9);
        ST_D2(out, 0, 1, dst + 16, dst_stride);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src4, src5, src6, src7);

        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}

static void common_hz_8t_48w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
    v16i8 src4;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 32);
        src4 = LD_SB(src + 40);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out3 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);

        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out2 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out2, 7);
        out = PCKEV_XORI128_UB(out3, out0);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out1, out2);
        ST_UB(out, dst + 32);
        dst += dst_stride;
    }
}

static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 res0, res1, res2, res3, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 16);

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}

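/*
 * Vertical 8-tap filtering. Instead of shuffles, consecutive rows are
 * interleaved with ILVR/ILVL so that each byte pair (row n, row n+1) lines
 * up for the same dot-product macros; between iterations the interleaved
 * registers are rotated down so only the newest rows need to be loaded.
 */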
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t res = (height & 0x07) >> 1;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32, out54, out76;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LD_SB4(src, src_stride, src11, src12, src13, src14);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
                   src12111110, src14131312);
        XORI_B2_128_SB(src8776, src10998);
        XORI_B2_128_SB(src12111110, src14131312);

        DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
        DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
        DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
        DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
        DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
        DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
        DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
        DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
        SRARI_H2_SH(out10, out32, 6);
        SRARI_H2_SH(out54, out76, 6);
        SAT_SH2_SH(out10, out32, 7);
        SAT_SH2_SH(out54, out76, 7);
        out0 = PCKEV_XORI128_UB(out10, out32);
        out1 = PCKEV_XORI128_UB(out54, out76);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
    for (; res--; ) {
        LD_SB2(src, src_stride, src7, src8);
        src += 2 * src_stride;
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_r, (v2i64) src76_r);
        src8776 = (v16i8) __msa_xori_b(src8776, 128);
        out10 = (v8i16) __msa_dotp_s_h((v16i8) src2110, (v16i8) filt0);
        out10 = (v8i16) __msa_dpadd_s_h((v8i16) out10, src4332, filt1);
        out10 = (v8i16) __msa_dpadd_s_h((v8i16) out10, src6554, filt2);
        out10 = (v8i16) __msa_dpadd_s_h((v8i16) out10, src8776, filt3);
        out10 = (v8i16) __msa_srari_h((v8i16) out10, 6);
        out10 = (v8i16) __msa_sat_s_h((v8i16) out10, 7);
        out0 = (v16u8) __msa_pckev_b((v16i8) out10, (v16i8) out10);
        out0 = (v16u8) __msa_xori_b((v16u8) out0, 128);
        ST_W2(out0, 0, 1, dst, dst_stride);
        dst += 2 * dst_stride;
        src2110 = src4332;
        src4332 = src6554;
        src6554 = src8776;
        src6 = src8;
    }
}

static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, out0_r, out1_r, out2_r, out3_r);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

static void common_vt_8t_12w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);

        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp1, 0);
        out2 = __msa_copy_u_w((v4i32) tmp0, 2);
        out3 = __msa_copy_u_w((v4i32) tmp1, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;
        out0 = __msa_copy_u_d((v2i64) tmp2, 0);
        out1 = __msa_copy_u_d((v2i64) tmp3, 0);
        out2 = __msa_copy_u_w((v4i32) tmp2, 2);
        out3 = __msa_copy_u_w((v4i32) tmp3, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

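/*
 * common_vt_8t_16w_mult_msa runs the 16-wide vertical filter over the image
 * in 16-column strips; the 24/32/48/64-wide entry points below are thin
 * wrappers around it (24-wide adds an 8-wide pass for the rightmost columns).
 */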
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    const uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
                                       filt0, filt1, filt2, filt3);
            out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
                                       filt0, filt1, filt2, filt3);
            out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
                                       filt0, filt1, filt2, filt3);
            out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                       filt0, filt1, filt2, filt3);
            out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
                                       filt0, filt1, filt2, filt3);
            out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
                                       filt0, filt1, filt2, filt3);
            out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
                                       filt0, filt1, filt2, filt3);
            out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                       filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void common_vt_8t_24w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              16);

    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}

static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}

static void common_vt_8t_48w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              48);
}

static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}

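/*
 * Combined horizontal + vertical (2-D) 8-tap filtering for 4-wide columns.
 * The horizontal pass keeps full 16-bit intermediates; after the vertical
 * pass (32-bit accumulation) the result is shifted down by 6 and then
 * round-shifted by another 6, normalising the combined 64 * 64 filter gain,
 * before saturating and packing. Rows are processed eight at a time, with a
 * tail block for heights that are not multiples of eight.
 */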
1357 static void hevc_hv_uni_8t_4w_msa(const uint8_t *src,
1358  int32_t src_stride,
1359  uint8_t *dst,
1360  int32_t dst_stride,
1361  const int8_t *filter_x,
1362  const int8_t *filter_y,
1363  int32_t height)
1364 {
1365  uint32_t loop_cnt;
1366  uint32_t res = height & 0x07;
1367  v16u8 out0, out1;
1368  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1369  v16i8 src9, src10, src11, src12, src13, src14;
1370  v8i16 filt0, filt1, filt2, filt3;
1371  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1372  v16i8 mask1, mask2, mask3;
1373  v8i16 filter_vec;
1374  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1375  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1376  v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1377  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
1378  v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
1379  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1380  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1381 
1382  src -= ((3 * src_stride) + 3);
1383  filter_vec = LD_SH(filter_x);
1384  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1385 
1386  filter_vec = LD_SH(filter_y);
1387  UNPCK_R_SB_SH(filter_vec, filter_vec);
1388 
1389  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1390 
1391  mask1 = mask0 + 2;
1392  mask2 = mask0 + 4;
1393  mask3 = mask0 + 6;
1394 
1395  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1396  src += (7 * src_stride);
1397  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1398 
1399  VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1400  VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1401  VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1402  vec8, vec9, vec10, vec11);
1403  VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1404  vec12, vec13, vec14, vec15);
1405 
1406  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1407  filt3);
1408  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1409  filt3);
1410  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1411  filt3);
1412  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1413  filt3);
1414 
1415  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1416  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1417  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1418 
1419  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1420 
1421  for (loop_cnt = height >> 3; loop_cnt--;) {
1422  LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1423  src14);
1424  src += (8 * src_stride);
1425  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1426 
1427  VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
1428  vec0, vec1, vec2, vec3);
1429  VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
1430  vec4, vec5, vec6, vec7);
1431  VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
1432  vec8, vec9, vec10, vec11);
1433  VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
1434  vec12, vec13, vec14, vec15);
1435 
1436  dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1437  filt3);
1438  dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1439  filt3);
1440  dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1441  filt2, filt3);
1442  dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1443  filt2, filt3);
1444 
1445  dst76_r = __msa_ilvr_h(dst117, dst66);
1446  ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1447  ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1448  ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1449  dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1450  dst1110_r = __msa_ilvr_h(dst117, dst1410);
1451 
1452  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1453  filt_h1, filt_h2, filt_h3);
1454  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1455  filt_h1, filt_h2, filt_h3);
1456  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1457  filt_h1, filt_h2, filt_h3);
1458  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1459  filt_h1, filt_h2, filt_h3);
1460  dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1461  filt_h1, filt_h2, filt_h3);
1462  dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1463  filt_h1, filt_h2, filt_h3);
1464  dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1465  filt_h1, filt_h2, filt_h3);
1466  dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1467  filt_h0, filt_h1, filt_h2, filt_h3);
1468 
1469  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1470  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1471  SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1472  SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1473  SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1474  SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1475  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1476  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1477  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1478  out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1479  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1480  dst += (8 * dst_stride);
1481 
1482  dst10_r = dst98_r;
1483  dst32_r = dst1110_r;
1484  dst54_r = dst1312_r;
1485  dst21_r = dst109_r;
1486  dst43_r = dst1211_r;
1487  dst65_r = dst1413_r;
1488  dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1489  }
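     /* Leftover rows when height is not a multiple of 8 (res is 2, 4 or 6):
      * repeat the loop body once and store only the first res rows. */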
1490  if (res) {
1491  LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1492  src14);
1493  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1494 
1495  VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
1496  vec0, vec1, vec2, vec3);
1497  VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
1498  vec4, vec5, vec6, vec7);
1499  VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
1500  vec8, vec9, vec10, vec11);
1501  VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
1502  vec12, vec13, vec14, vec15);
1503 
1504  dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1505  filt3);
1506  dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1507  filt3);
1508  dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1509  filt2, filt3);
1510  dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1511  filt2, filt3);
1512 
1513  dst76_r = __msa_ilvr_h(dst117, dst66);
1514  ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1515  ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1516  ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1517  dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1518  dst1110_r = __msa_ilvr_h(dst117, dst1410);
1519 
1520  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1521  filt_h1, filt_h2, filt_h3);
1522  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1523  filt_h1, filt_h2, filt_h3);
1524  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1525  filt_h1, filt_h2, filt_h3);
1526  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1527  filt_h1, filt_h2, filt_h3);
1528  dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1529  filt_h1, filt_h2, filt_h3);
1530  dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1531  filt_h1, filt_h2, filt_h3);
1532  dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1533  filt_h1, filt_h2, filt_h3);
1534  dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1535  filt_h0, filt_h1, filt_h2, filt_h3);
1536 
1537  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1538  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1539  SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1540  SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1541  SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1542  SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1543  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1544  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1545  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1546  out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1547  if (res == 2) {
1548  ST_W2(out0, 0, 1, dst, dst_stride);
1549  } else if (res == 4) {
1550  ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
1551  } else {
1552  ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
1553  ST_W2(out1, 0, 1, dst + 4 * dst_stride, dst_stride);
1554  }
1555  }
1556 }
1557 
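 /* Generic uni (non-weighted) HV interpolation: an 8-tap horizontal pass
  * feeds 16-bit intermediates into an 8-tap vertical pass. The block is
  * walked in 8-column stripes; each stripe primes seven rows, then every
  * loop iteration filters two new rows and slides the vertical window.
  * Each vertical sum is scaled down (>> 6), round-shifted by a further 6
  * bits, saturated and packed back to unsigned bytes. */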
1558 static void hevc_hv_uni_8t_8multx2mult_msa(const uint8_t *src,
1559  int32_t src_stride,
1560  uint8_t *dst,
1561  int32_t dst_stride,
1562  const int8_t *filter_x,
1563  const int8_t *filter_y,
1564  int32_t height, int32_t width)
1565 {
1566  uint32_t loop_cnt, cnt;
1567  const uint8_t *src_tmp;
1568  uint8_t *dst_tmp;
1569  v16u8 out;
1570  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1571  v8i16 filt0, filt1, filt2, filt3;
1572  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1573  v16i8 mask1, mask2, mask3;
1574  v8i16 filter_vec;
1575  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1576  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1577  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1578  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1579  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1580  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1581  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1582  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1583  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1584 
1585  src -= ((3 * src_stride) + 3);
1586 
1587  filter_vec = LD_SH(filter_x);
1588  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1589 
1590  filter_vec = LD_SH(filter_y);
1591  UNPCK_R_SB_SH(filter_vec, filter_vec);
1592 
1593  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1594 
1595  mask1 = mask0 + 2;
1596  mask2 = mask0 + 4;
1597  mask3 = mask0 + 6;
1598 
1599  for (cnt = width >> 3; cnt--;) {
1600  src_tmp = src;
1601  dst_tmp = dst;
1602 
1603  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1604  src_tmp += (7 * src_stride);
1605  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1606 
1607  /* row 0 row 1 row 2 row 3 */
1608  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1609  vec0, vec1, vec2, vec3);
1610  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1611  vec4, vec5, vec6, vec7);
1612  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1613  vec8, vec9, vec10, vec11);
1614  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1615  vec12, vec13, vec14, vec15);
1616  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1617  filt3);
1618  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1619  filt3);
1620  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1621  filt3);
1622  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1623  filt2, filt3);
1624 
1625  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1626  vec0, vec1, vec2, vec3);
1627  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1628  vec4, vec5, vec6, vec7);
1629  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1630  vec8, vec9, vec10, vec11);
1631  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1632  filt3);
1633  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1634  filt3);
1635  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1636  filt3);
1637 
1638  for (loop_cnt = height >> 1; loop_cnt--;) {
1639  LD_SB2(src_tmp, src_stride, src7, src8);
1640  XORI_B2_128_SB(src7, src8);
1641  src_tmp += 2 * src_stride;
1642 
1643  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1644  dst10_r, dst32_r, dst54_r, dst21_r);
1645  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1646  dst10_l, dst32_l, dst54_l, dst21_l);
1647  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1648  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1649 
1650  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1651  vec0, vec1, vec2, vec3);
1652  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1653  filt2, filt3);
1654 
1655  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1656  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1657  filt_h0, filt_h1, filt_h2, filt_h3);
1658  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1659  filt_h0, filt_h1, filt_h2, filt_h3);
1660  dst0_r >>= 6;
1661  dst0_l >>= 6;
1662 
1663  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1664  vec0, vec1, vec2, vec3);
1665  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1666  filt2, filt3);
1667 
1668  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1669  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1670  filt_h0, filt_h1, filt_h2, filt_h3);
1671  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1672  filt_h0, filt_h1, filt_h2, filt_h3);
1673  dst1_r >>= 6;
1674  dst1_l >>= 6;
1675  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1676  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1677 
1678  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1679  out = PCKEV_XORI128_UB(dst0, dst1);
1680  ST_D2(out, 0, 1, dst_tmp, dst_stride);
1681  dst_tmp += (2 * dst_stride);
1682 
1683  dst0 = dst2;
1684  dst1 = dst3;
1685  dst2 = dst4;
1686  dst3 = dst5;
1687  dst4 = dst6;
1688  dst5 = dst7;
1689  dst6 = dst8;
1690  }
1691 
1692  src += 8;
1693  dst += 8;
1694  }
1695 }
1696 
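 /* Fixed-width entry points: forward to the generic 8-column stripe kernel
  * with the width baked in. */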
1697 static void hevc_hv_uni_8t_8w_msa(const uint8_t *src,
1698  int32_t src_stride,
1699  uint8_t *dst,
1700  int32_t dst_stride,
1701  const int8_t *filter_x,
1702  const int8_t *filter_y,
1703  int32_t height)
1704 {
1705  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1706  filter_x, filter_y, height, 8);
1707 }
1708 
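 /* 12-wide HV: an 8-wide stripe (same scheme as the generic kernel, with
  * the loop count hard-coded for 16 rows) followed by a 4-wide stripe that
  * uses the paired-row masks at ff_hevc_mask_arr + 16 to filter two rows
  * per register. */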
1709 static void hevc_hv_uni_8t_12w_msa(const uint8_t *src,
1710  int32_t src_stride,
1711  uint8_t *dst,
1712  int32_t dst_stride,
1713  const int8_t *filter_x,
1714  const int8_t *filter_y,
1715  int32_t height)
1716 {
1717  uint32_t loop_cnt;
1718  const uint8_t *src_tmp;
1719  uint8_t *dst_tmp;
1720  v16u8 out0, out1;
1721  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1722  v16i8 src11, src12, src13, src14;
1723  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1724  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1725  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1726  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1727  v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1728  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1729  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1730  v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1731  v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1732  v8i16 dst1413_r, dst87_l, filter_vec;
1733  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1734  v4i32 dst0_l, dst1_l;
1735 
1736  src -= ((3 * src_stride) + 3);
1737 
1738  filter_vec = LD_SH(filter_x);
1739  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1740 
1741  filter_vec = LD_SH(filter_y);
1742  UNPCK_R_SB_SH(filter_vec, filter_vec);
1743 
1744  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1745 
1746  mask0 = LD_SB(ff_hevc_mask_arr);
1747  mask1 = mask0 + 2;
1748  mask2 = mask0 + 4;
1749  mask3 = mask0 + 6;
1750 
1751  src_tmp = src;
1752  dst_tmp = dst;
1753 
1754  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1755  src_tmp += (7 * src_stride);
1756  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1757 
1758  /* row 0 row 1 row 2 row 3 */
1759  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1760  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1761  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1762  vec11);
1763  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1764  vec15);
1765  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1766  filt3);
1767  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1768  filt3);
1769  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1770  filt3);
1771  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1772  filt2, filt3);
1773 
1774  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1775  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1776  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1777  vec11);
1778  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1779  filt3);
1780  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1781  filt3);
1782  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1783  filt3);
1784 
1785  for (loop_cnt = 8; loop_cnt--;) {
1786  LD_SB2(src_tmp, src_stride, src7, src8);
1787  XORI_B2_128_SB(src7, src8);
1788  src_tmp += 2 * src_stride;
1789 
1790  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1791  dst32_r, dst54_r, dst21_r);
1792  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1793  dst32_l, dst54_l, dst21_l);
1794  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1795  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1796 
1797  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1798  vec3);
1799  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1800  filt3);
1801 
1802  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1803  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1804  filt_h0, filt_h1, filt_h2, filt_h3);
1805  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1806  filt_h0, filt_h1, filt_h2, filt_h3);
1807  dst0_r >>= 6;
1808  dst0_l >>= 6;
1809 
1810  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1811  vec3);
1812  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1813  filt3);
1814 
1815  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1816  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1817  filt_h0, filt_h1, filt_h2, filt_h3);
1818  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1819  filt_h0, filt_h1, filt_h2, filt_h3);
1820  dst1_r >>= 6;
1821  dst1_l >>= 6;
1822  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1823  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1824 
1825  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1826  out0 = PCKEV_XORI128_UB(dst0, dst1);
1827  ST_D2(out0, 0, 1, dst_tmp, dst_stride);
1828  dst_tmp += (2 * dst_stride);
1829 
1830  dst0 = dst2;
1831  dst1 = dst3;
1832  dst2 = dst4;
1833  dst3 = dst5;
1834  dst4 = dst6;
1835  dst5 = dst7;
1836  dst6 = dst8;
1837  }
1838 
1839  src += 8;
1840  dst += 8;
1841 
1842  mask4 = LD_SB(ff_hevc_mask_arr + 16);
1843  mask5 = mask4 + 2;
1844  mask6 = mask4 + 4;
1845  mask7 = mask4 + 6;
1846 
1847  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1848  src += (7 * src_stride);
1849  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1850 
1851  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1852  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1853  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1854  vec11);
1855  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1856  vec15);
1857 
1858  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1859  filt3);
1860  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1861  filt3);
1862  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1863  filt3);
1864  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1865  filt3);
1866 
1867  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1868  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1869  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1870 
1871  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1872 
1873  for (loop_cnt = 2; loop_cnt--;) {
1874  LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1875  src14);
1876  src += (8 * src_stride);
1877  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1878 
1879  VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1880  vec3);
1881  VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1882  vec7);
1883  VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1884  vec11);
1885  VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1886  vec14, vec15);
1887 
1888  dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1889  filt3);
1890  dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1891  filt3);
1892  dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1893  filt2, filt3);
1894  dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1895  filt2, filt3);
1896 
1897  dst76_r = __msa_ilvr_h(dst117, dst66);
1898  ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1899  ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1900  ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1901  dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1902  dst1110_r = __msa_ilvr_h(dst117, dst1410);
1903 
1904  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1905  filt_h1, filt_h2, filt_h3);
1906  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1907  filt_h1, filt_h2, filt_h3);
1908  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1909  filt_h1, filt_h2, filt_h3);
1910  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1911  filt_h1, filt_h2, filt_h3);
1912  dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1913  filt_h1, filt_h2, filt_h3);
1914  dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1915  filt_h1, filt_h2, filt_h3);
1916  dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1917  filt_h1, filt_h2, filt_h3);
1918  dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1919  filt_h0, filt_h1, filt_h2, filt_h3);
1920 
1921  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1922  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1923  SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1924  SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1925  SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1926  SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1927  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1928  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1929  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1930  out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1931  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1932  dst += (8 * dst_stride);
1933 
1934  dst10_r = dst98_r;
1935  dst32_r = dst1110_r;
1936  dst54_r = dst1312_r;
1937  dst21_r = dst109_r;
1938  dst43_r = dst1211_r;
1939  dst65_r = dst1413_r;
1940  dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1941  }
1942 }
1943 
1944 static void hevc_hv_uni_8t_16w_msa(const uint8_t *src,
1945  int32_t src_stride,
1946  uint8_t *dst,
1947  int32_t dst_stride,
1948  const int8_t *filter_x,
1949  const int8_t *filter_y,
1950  int32_t height)
1951 {
1952  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1953  filter_x, filter_y, height, 16);
1954 }
1955 
1956 static void hevc_hv_uni_8t_24w_msa(const uint8_t *src,
1957  int32_t src_stride,
1958  uint8_t *dst,
1959  int32_t dst_stride,
1960  const int8_t *filter_x,
1961  const int8_t *filter_y,
1962  int32_t height)
1963 {
1964  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1965  filter_x, filter_y, height, 24);
1966 }
1967 
1968 static void hevc_hv_uni_8t_32w_msa(const uint8_t *src,
1969  int32_t src_stride,
1970  uint8_t *dst,
1971  int32_t dst_stride,
1972  const int8_t *filter_x,
1973  const int8_t *filter_y,
1974  int32_t height)
1975 {
1976  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1977  filter_x, filter_y, height, 32);
1978 }
1979 
1980 static void hevc_hv_uni_8t_48w_msa(const uint8_t *src,
1981  int32_t src_stride,
1982  uint8_t *dst,
1983  int32_t dst_stride,
1984  const int8_t *filter_x,
1985  const int8_t *filter_y,
1986  int32_t height)
1987 {
1988  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1989  filter_x, filter_y, height, 48);
1990 }
1991 
1992 static void hevc_hv_uni_8t_64w_msa(const uint8_t *src,
1993  int32_t src_stride,
1994  uint8_t *dst,
1995  int32_t dst_stride,
1996  const int8_t *filter_x,
1997  const int8_t *filter_y,
1998  int32_t height)
1999 {
2000  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2001  filter_x, filter_y, height, 64);
2002 }
2003 
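 /* The common_hz_4t_* family below is the horizontal 4-tap (chroma) uni
  * path. A scalar sketch of what each output pixel computes (f[] holds the
  * four signed taps, which sum to 64, hence the round-shift by 6):
  *
  *     sum    = f[0] * s[x - 1] + f[1] * s[x] +
  *              f[2] * s[x + 1] + f[3] * s[x + 2];
  *     dst[x] = av_clip_uint8((sum + 32) >> 6);
  *
  * The vector code flips the sign bit of each source byte (XORI ... 128)
  * so that signed dot products can be used; PCKEV_XORI128_UB flips it back
  * while packing the 16-bit results to bytes. */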
2004 static void common_hz_4t_4x2_msa(const uint8_t *src, int32_t src_stride,
2005  uint8_t *dst, int32_t dst_stride,
2006  const int8_t *filter)
2007 {
2008  v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
2009  v16u8 out;
2010  v8i16 filt, res0;
2011 
2012  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2013  src -= 1;
2014 
2015  /* rearranging filter */
2016  filt = LD_SH(filter);
2017  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2018 
2019  mask1 = mask0 + 2;
2020 
2021  LD_SB2(src, src_stride, src0, src1);
2022  XORI_B2_128_SB(src0, src1);
2023  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2024  res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2025  res0 = __msa_srari_h(res0, 6);
2026  res0 = __msa_sat_s_h(res0, 7);
2027  out = PCKEV_XORI128_UB(res0, res0);
2028  ST_W2(out, 0, 1, dst, dst_stride);
2029 }
2030 
2031 static void common_hz_4t_4x4_msa(const uint8_t *src, int32_t src_stride,
2032  uint8_t *dst, int32_t dst_stride,
2033  const int8_t *filter)
2034 {
2035  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2036  v8i16 filt, out0, out1;
2037  v16u8 out;
2038 
2039  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2040  src -= 1;
2041 
2042  /* rearranging filter */
2043  filt = LD_SH(filter);
2044  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2045 
2046  mask1 = mask0 + 2;
2047 
2048  LD_SB4(src, src_stride, src0, src1, src2, src3);
2049  XORI_B4_128_SB(src0, src1, src2, src3);
2050  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2051  filt0, filt1, out0, out1);
2052  SRARI_H2_SH(out0, out1, 6);
2053  SAT_SH2_SH(out0, out1, 7);
2054  out = PCKEV_XORI128_UB(out0, out1);
2055  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2056 }
2057 
2058 static void common_hz_4t_4x8_msa(const uint8_t *src, int32_t src_stride,
2059  uint8_t *dst, int32_t dst_stride,
2060  const int8_t *filter)
2061 {
2062  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2063  v16u8 out;
2064  v8i16 filt, out0, out1, out2, out3;
2065 
2066  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2067  src -= 1;
2068 
2069  /* rearranging filter */
2070  filt = LD_SH(filter);
2071  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2072 
2073  mask1 = mask0 + 2;
2074 
2075  LD_SB4(src, src_stride, src0, src1, src2, src3);
2076  src += (4 * src_stride);
2077 
2078  XORI_B4_128_SB(src0, src1, src2, src3);
2079  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2080  filt0, filt1, out0, out1);
2081  LD_SB4(src, src_stride, src0, src1, src2, src3);
2082  XORI_B4_128_SB(src0, src1, src2, src3);
2083  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2084  filt0, filt1, out2, out3);
2085  SRARI_H4_SH(out0, out1, out2, out3, 6);
2086  SAT_SH4_SH(out0, out1, out2, out3, 7);
2087  out = PCKEV_XORI128_UB(out0, out1);
2088  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2089  out = PCKEV_XORI128_UB(out2, out3);
2090  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2091 }
2092 
2093 static void common_hz_4t_4x16_msa(const uint8_t *src, int32_t src_stride,
2094  uint8_t *dst, int32_t dst_stride,
2095  const int8_t *filter)
2096 {
2097  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2098  v16i8 filt0, filt1, mask0, mask1;
2099  v16u8 out;
2100  v8i16 filt, out0, out1, out2, out3;
2101 
2102  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2103  src -= 1;
2104 
2105  /* rearranging filter */
2106  filt = LD_SH(filter);
2107  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2108 
2109  mask1 = mask0 + 2;
2110 
2111  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2112  src += (8 * src_stride);
2113  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2114  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2115  filt0, filt1, out0, out1);
2116  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2117  filt0, filt1, out2, out3);
2118  SRARI_H4_SH(out0, out1, out2, out3, 6);
2119  SAT_SH4_SH(out0, out1, out2, out3, 7);
2120  out = PCKEV_XORI128_UB(out0, out1);
2121  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2122  out = PCKEV_XORI128_UB(out2, out3);
2123  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2124  dst += (8 * dst_stride);
2125 
2126  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2127  src += (8 * src_stride);
2128  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2129  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2130  filt0, filt1, out0, out1);
2131  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2132  filt0, filt1, out2, out3);
2133  SRARI_H4_SH(out0, out1, out2, out3, 6);
2134  SAT_SH4_SH(out0, out1, out2, out3, 7);
2135  out = PCKEV_XORI128_UB(out0, out1);
2136  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2137  out = PCKEV_XORI128_UB(out2, out3);
2138  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2139 }
2140 
2141 static void common_hz_4t_4w_msa(const uint8_t *src, int32_t src_stride,
2142  uint8_t *dst, int32_t dst_stride,
2143  const int8_t *filter, int32_t height)
2144 {
2145  if (2 == height) {
2146  common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2147  } else if (4 == height) {
2148  common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2149  } else if (8 == height) {
2150  common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2151  } else if (16 == height) {
2152  common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
2153  }
2154 }
2155 
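 /* 6-wide rows: each output row is stored as a 4-byte word plus a 2-byte
  * halfword at dst + 4, both picked out of the packed 8-wide result. */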
2156 static void common_hz_4t_6w_msa(const uint8_t *src, int32_t src_stride,
2157  uint8_t *dst, int32_t dst_stride,
2158  const int8_t *filter, int32_t height)
2159 {
2160  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2161  v16u8 out4, out5;
2162  v8i16 filt, out0, out1, out2, out3;
2163 
2164  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2165  src -= 1;
2166 
2167  /* rearranging filter */
2168  filt = LD_SH(filter);
2169  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2170 
2171  mask1 = mask0 + 2;
2172 
2173  LD_SB4(src, src_stride, src0, src1, src2, src3);
2174  src += (4 * src_stride);
2175 
2176  XORI_B4_128_SB(src0, src1, src2, src3);
2177  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2178  filt1, out0, out1, out2, out3);
2179  SRARI_H4_SH(out0, out1, out2, out3, 6);
2180  SAT_SH4_SH(out0, out1, out2, out3, 7);
2181  out4 = PCKEV_XORI128_UB(out0, out1);
2182  out5 = PCKEV_XORI128_UB(out2, out3);
2183  ST_W2(out4, 0, 2, dst, dst_stride);
2184  ST_H2(out4, 2, 6, dst + 4, dst_stride);
2185  ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2186  ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2187  dst += (4 * dst_stride);
2188 
2189  LD_SB4(src, src_stride, src0, src1, src2, src3);
2190  src += (4 * src_stride);
2191 
2192  XORI_B4_128_SB(src0, src1, src2, src3);
2193  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2194  filt1, out0, out1, out2, out3);
2195  SRARI_H4_SH(out0, out1, out2, out3, 6);
2196  SAT_SH4_SH(out0, out1, out2, out3, 7);
2197  out4 = PCKEV_XORI128_UB(out0, out1);
2198  out5 = PCKEV_XORI128_UB(out2, out3);
2199  ST_W2(out4, 0, 2, dst, dst_stride);
2200  ST_H2(out4, 2, 6, dst + 4, dst_stride);
2201  ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2202  ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2203 }
2204 
2205 static void common_hz_4t_8x2mult_msa(const uint8_t *src, int32_t src_stride,
2206  uint8_t *dst, int32_t dst_stride,
2207  const int8_t *filter, int32_t height)
2208 {
2209  uint32_t loop_cnt;
2210  v16i8 src0, src1, filt0, filt1, mask0, mask1;
2211  v16u8 out;
2212  v8i16 filt, vec0, vec1, vec2, vec3;
2213 
2214  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2215  src -= 1;
2216 
2217  filt = LD_SH(filter);
2218  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2219 
2220  mask1 = mask0 + 2;
2221 
2222  for (loop_cnt = (height >> 1); loop_cnt--;) {
2223  LD_SB2(src, src_stride, src0, src1);
2224  src += (2 * src_stride);
2225 
2226  XORI_B2_128_SB(src0, src1);
2227  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2228  DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2229  VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2230  DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
2231  SRARI_H2_SH(vec0, vec1, 6);
2232  SAT_SH2_SH(vec0, vec1, 7);
2233  out = PCKEV_XORI128_UB(vec0, vec1);
2234  ST_D2(out, 0, 1, dst, dst_stride);
2235  dst += (2 * dst_stride);
2236  }
2237 }
2238 
2239 static void common_hz_4t_8x4mult_msa(const uint8_t *src, int32_t src_stride,
2240  uint8_t *dst, int32_t dst_stride,
2241  const int8_t *filter, int32_t height)
2242 {
2243  uint32_t loop_cnt;
2244  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2245  v16u8 tmp0, tmp1;
2246  v8i16 filt, out0, out1, out2, out3;
2247 
2248  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2249  src -= 1;
2250 
2251  /* rearranging filter */
2252  filt = LD_SH(filter);
2253  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2254 
2255  mask1 = mask0 + 2;
2256 
2257  for (loop_cnt = (height >> 2); loop_cnt--;) {
2258  LD_SB4(src, src_stride, src0, src1, src2, src3);
2259  src += (4 * src_stride);
2260 
2261  XORI_B4_128_SB(src0, src1, src2, src3);
2262  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2263  filt1, out0, out1, out2, out3);
2264  SRARI_H4_SH(out0, out1, out2, out3, 6);
2265  SAT_SH4_SH(out0, out1, out2, out3, 7);
2266  tmp0 = PCKEV_XORI128_UB(out0, out1);
2267  tmp1 = PCKEV_XORI128_UB(out2, out3);
2268  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2269  dst += (4 * dst_stride);
2270  }
2271 }
2272 
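 /* Heights 2 and 6 are the only heights that are not multiples of 4, so
  * they take the 2-rows-per-iteration kernel; all others take the 4-row
  * kernel. */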
2273 static void common_hz_4t_8w_msa(const uint8_t *src, int32_t src_stride,
2274  uint8_t *dst, int32_t dst_stride,
2275  const int8_t *filter, int32_t height)
2276 {
2277  if ((2 == height) || (6 == height)) {
2278  common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
2279  height);
2280  } else {
2281  common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
2282  height);
2283  }
2284 }
2285 
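 /* 12-wide: the left 8 columns use mask0/mask1 as usual; the right 4
  * columns of two consecutive rows are filtered together through
  * mask2/mask3 (third entry of ff_hevc_mask_arr), which pulls bytes 8..12
  * from each of the two source registers. The loop is hard-coded for 16
  * rows. */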
2286 static void common_hz_4t_12w_msa(const uint8_t *src, int32_t src_stride,
2287  uint8_t *dst, int32_t dst_stride,
2288  const int8_t *filter, int32_t height)
2289 {
2290  uint32_t loop_cnt;
2291  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2292  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2293  v16i8 vec10, vec11;
2294  v16u8 tmp0, tmp1;
2295  v8i16 filt, out0, out1, out2, out3, out4, out5;
2296 
2297  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2298  mask2 = LD_SB(&ff_hevc_mask_arr[32]);
2299 
2300  src -= 1;
2301 
2302  /* rearranging filter */
2303  filt = LD_SH(filter);
2304  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2305 
2306  mask1 = mask0 + 2;
2307  mask3 = mask2 + 2;
2308 
2309  for (loop_cnt = 4; loop_cnt--;) {
2310  LD_SB4(src, src_stride, src0, src1, src2, src3);
2311  src += (4 * src_stride);
2312 
2313  XORI_B4_128_SB(src0, src1, src2, src3);
2314  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2315  DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2316  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2317  DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2318  SRARI_H2_SH(out0, out1, 6);
2319  SAT_SH2_SH(out0, out1, 7);
2320  tmp0 = PCKEV_XORI128_UB(out0, out1);
2321  ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2322 
2323  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2324  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2325  DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2326  out2, out3, out4, out5);
2327  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2328  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2329  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2330  out2, out3, out4, out5);
2331  SRARI_H4_SH(out2, out3, out4, out5, 6);
2332  SAT_SH4_SH(out2, out3, out4, out5, 7);
2333  tmp0 = PCKEV_XORI128_UB(out2, out3);
2334  tmp1 = PCKEV_XORI128_UB(out4, out5);
2335  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2336  dst += (4 * dst_stride);
2337  }
2338 }
2339 
2340 static void common_hz_4t_16w_msa(const uint8_t *src, int32_t src_stride,
2341  uint8_t *dst, int32_t dst_stride,
2342  const int8_t *filter, int32_t height)
2343 {
2344  uint32_t loop_cnt;
2345  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2346  v16i8 filt0, filt1, mask0, mask1;
2347  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2348  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2349  v16u8 out;
2350 
2351  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2352  src -= 1;
2353 
2354  /* rearranging filter */
2355  filt = LD_SH(filter);
2356  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2357 
2358  mask1 = mask0 + 2;
2359 
2360  for (loop_cnt = (height >> 2); loop_cnt--;) {
2361  LD_SB4(src, src_stride, src0, src2, src4, src6);
2362  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2363  src += (4 * src_stride);
2364 
2365  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2366 
2367  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2368  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2369  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2370  out0, out1, out2, out3);
2371  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2372  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2373  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2374  out0, out1, out2, out3);
2375  SRARI_H4_SH(out0, out1, out2, out3, 6);
2376  SAT_SH4_SH(out0, out1, out2, out3, 7);
2377  out = PCKEV_XORI128_UB(out0, out1);
2378  ST_UB(out, dst);
2379  dst += dst_stride;
2380  out = PCKEV_XORI128_UB(out2, out3);
2381  ST_UB(out, dst);
2382  dst += dst_stride;
2383 
2384  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2385  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2386  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2387  out4, out5, out6, out7);
2388  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2389  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2390  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2391  out4, out5, out6, out7);
2392  SRARI_H4_SH(out4, out5, out6, out7, 6);
2393  SAT_SH4_SH(out4, out5, out6, out7, 7);
2394  out = PCKEV_XORI128_UB(out4, out5);
2395  ST_UB(out, dst);
2396  dst += dst_stride;
2397  out = PCKEV_XORI128_UB(out6, out7);
2398  ST_UB(out, dst);
2399  dst += dst_stride;
2400  }
2401 }
2402 
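 /* 24-wide: the left 16 columns are filtered with mask0/mask1, where
  * mask00/mask11 (mask0 + 8 / mask0 + 10) bridge the boundary between the
  * two 16-byte source registers; the remaining 8 columns are written
  * through dst1 = dst + 16. The loop is hard-coded for 32 rows. */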
2403 static void common_hz_4t_24w_msa(const uint8_t *src, int32_t src_stride,
2404  uint8_t *dst, int32_t dst_stride,
2405  const int8_t *filter, int32_t height)
2406 {
2407  uint8_t *dst1 = dst + 16;
2408  uint32_t loop_cnt;
2409  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2410  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2411  v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2412  v8i16 filt, out0, out1, out2, out3;
2413  v16u8 tmp0, tmp1;
2414 
2415  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2416  src -= 1;
2417 
2418  /* rearranging filter */
2419  filt = LD_SH(filter);
2420  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2421 
2422  mask1 = mask0 + 2;
2423  mask00 = mask0 + 8;
2424  mask11 = mask0 + 10;
2425 
2426  for (loop_cnt = 8; loop_cnt--;) {
2427  LD_SB4(src, src_stride, src0, src2, src4, src6);
2428  LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2429  src += (4 * src_stride);
2430 
2431  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2432  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2433  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2434  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2435  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2436  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2437  out0, out1, out2, out3);
2438  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2439  out0, out1, out2, out3);
2440  SRARI_H4_SH(out0, out1, out2, out3, 6);
2441  SAT_SH4_SH(out0, out1, out2, out3, 7);
2442  tmp0 = PCKEV_XORI128_UB(out0, out1);
2443  ST_UB(tmp0, dst);
2444  dst += dst_stride;
2445  tmp0 = PCKEV_XORI128_UB(out2, out3);
2446  ST_UB(tmp0, dst);
2447  dst += dst_stride;
2448 
2449  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2450  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2451  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2452  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2453  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2454  out0, out1, out2, out3);
2455  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2456  out0, out1, out2, out3);
2457  SRARI_H4_SH(out0, out1, out2, out3, 6);
2458  SAT_SH4_SH(out0, out1, out2, out3, 7);
2459  tmp0 = PCKEV_XORI128_UB(out0, out1);
2460  ST_UB(tmp0, dst);
2461  dst += dst_stride;
2462  tmp0 = PCKEV_XORI128_UB(out2, out3);
2463  ST_UB(tmp0, dst);
2464  dst += dst_stride;
2465 
2466  /* 8 width */
2467  VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2468  VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2469  VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2470  VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2471 
2472  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2473  out0, out1, out2, out3);
2474  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2475  out0, out1, out2, out3);
2476 
2477  SRARI_H4_SH(out0, out1, out2, out3, 6);
2478  SAT_SH4_SH(out0, out1, out2, out3, 7);
2479  tmp0 = PCKEV_XORI128_UB(out0, out1);
2480  tmp1 = PCKEV_XORI128_UB(out2, out3);
2481  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
2482  dst1 += (4 * dst_stride);
2483  }
2484 }
2485 
2486 static void common_hz_4t_32w_msa(const uint8_t *src, int32_t src_stride,
2487  uint8_t *dst, int32_t dst_stride,
2488  const int8_t *filter, int32_t height)
2489 {
2490  uint32_t loop_cnt;
2491  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2492  v16i8 filt0, filt1, mask0, mask1;
2493  v16u8 out;
2494  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2495  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2496 
2497  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2498  src -= 1;
2499 
2500  /* rearranging filter */
2501  filt = LD_SH(filter);
2502  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2503 
2504  mask1 = mask0 + 2;
2505 
2506  for (loop_cnt = (height >> 1); loop_cnt--;) {
2507  src0 = LD_SB(src);
2508  src1 = LD_SB(src + 8);
2509  src2 = LD_SB(src + 16);
2510  src3 = LD_SB(src + 24);
2511  src += src_stride;
2512  src4 = LD_SB(src);
2513  src5 = LD_SB(src + 8);
2514  src6 = LD_SB(src + 16);
2515  src7 = LD_SB(src + 24);
2516  src += src_stride;
2517 
2518  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2519 
2520  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2521  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2522  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2523  out0, out1, out2, out3);
2524  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2525  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2526  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2527  out0, out1, out2, out3);
2528 
2529  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2530  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2531  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2532  out4, out5, out6, out7);
2533  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2534  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2535  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2536  out4, out5, out6, out7);
2537  SRARI_H4_SH(out0, out1, out2, out3, 6);
2538  SRARI_H4_SH(out4, out5, out6, out7, 6);
2539  SAT_SH4_SH(out0, out1, out2, out3, 7);
2540  SAT_SH4_SH(out4, out5, out6, out7, 7);
2541  out = PCKEV_XORI128_UB(out0, out1);
2542  ST_UB(out, dst);
2543  out = PCKEV_XORI128_UB(out2, out3);
2544  ST_UB(out, dst + 16);
2545  dst += dst_stride;
2546  out = PCKEV_XORI128_UB(out4, out5);
2547  ST_UB(out, dst);
2548  out = PCKEV_XORI128_UB(out6, out7);
2549  ST_UB(out, dst + 16);
2550  dst += dst_stride;
2551  }
2552 }
2553 
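 /* The common_vt_4t_* family is the vertical 4-tap (chroma) uni path. A
  * scalar sketch per output pixel (same taps and rounding as the
  * horizontal case):
  *
  *     sum    = f[0] * s[-src_stride] + f[1] * s[0] +
  *              f[2] * s[src_stride]  + f[3] * s[2 * src_stride];
  *     dst[0] = av_clip_uint8((sum + 32) >> 6);
  *
  * ILVR_B/ILVL_B interleave two adjacent rows byte by byte, so each signed
  * dot product accumulates one pair of taps per output pixel. */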
2554 static void common_vt_4t_4x2_msa(const uint8_t *src, int32_t src_stride,
2555  uint8_t *dst, int32_t dst_stride,
2556  const int8_t *filter)
2557 {
2558  v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2559  v16i8 src2110, src4332, filt0, filt1;
2560  v16u8 out;
2561  v8i16 filt, out10;
2562 
2563  src -= src_stride;
2564 
2565  filt = LD_SH(filter);
2566  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2567 
2568  LD_SB3(src, src_stride, src0, src1, src2);
2569  src += (3 * src_stride);
2570 
2571  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2572  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2573  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2574  LD_SB2(src, src_stride, src3, src4);
2575  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2576  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2577  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2578  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2579  out10 = __msa_srari_h(out10, 6);
2580  out10 = __msa_sat_s_h(out10, 7);
2581  out = PCKEV_XORI128_UB(out10, out10);
2582  ST_W2(out, 0, 1, dst, dst_stride);
2583 }
2584 
2585 static void common_vt_4t_4x4multiple_msa(const uint8_t *src, int32_t src_stride,
2586  uint8_t *dst, int32_t dst_stride,
2587  const int8_t *filter, int32_t height)
2588 {
2589  uint32_t loop_cnt;
2590  v16i8 src0, src1, src2, src3, src4, src5;
2591  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2592  v16i8 src2110, src4332, filt0, filt1;
2593  v8i16 filt, out10, out32;
2594  v16u8 out;
2595 
2596  src -= src_stride;
2597 
2598  filt = LD_SH(filter);
2599  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2600 
2601  LD_SB3(src, src_stride, src0, src1, src2);
2602  src += (3 * src_stride);
2603 
2604  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2605 
2606  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2607  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2608 
2609  for (loop_cnt = (height >> 2); loop_cnt--;) {
2610  LD_SB3(src, src_stride, src3, src4, src5);
2611  src += (3 * src_stride);
2612  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2613  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2614  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2615  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2616 
2617  src2 = LD_SB(src);
2618  src += (src_stride);
2619  ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2620  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2621  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2622  out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
2623  SRARI_H2_SH(out10, out32, 6);
2624  SAT_SH2_SH(out10, out32, 7);
2625  out = PCKEV_XORI128_UB(out10, out32);
2626  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2627  dst += (4 * dst_stride);
2628  }
2629 }
2630 
2631 static void common_vt_4t_4w_msa(const uint8_t *src, int32_t src_stride,
2632  uint8_t *dst, int32_t dst_stride,
2633  const int8_t *filter, int32_t height)
2634 {
2635  if (2 == height) {
2636  common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2637  } else {
2638  common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
2639  height);
2640  }
2641 }
2642 
2643 static void common_vt_4t_6w_msa(const uint8_t *src, int32_t src_stride,
2644  uint8_t *dst, int32_t dst_stride,
2645  const int8_t *filter, int32_t height)
2646 {
2647  v16u8 out0, out1;
2648  v16i8 src0, src1, src2, src3, src4, src5, src6;
2649  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2650  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2651 
2652  src -= src_stride;
2653 
2654  filter_vec = LD_SH(filter);
2655  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2656 
2657  LD_SB3(src, src_stride, src0, src1, src2);
2658  src += (3 * src_stride);
2659  XORI_B3_128_SB(src0, src1, src2);
2660  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2661 
2662  LD_SB2(src, src_stride, src3, src4);
2663  src += (2 * src_stride);
2664  XORI_B2_128_SB(src3, src4);
2665  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2666 
2667  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2668  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2669 
2670  LD_SB2(src, src_stride, src5, src6);
2671  src += (2 * src_stride);
2672  XORI_B2_128_SB(src5, src6);
2673  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2674 
2675  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2676  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2677 
2678  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2679  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2680  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2681  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2682  ST_W2(out0, 0, 2, dst, dst_stride);
2683  ST_H2(out0, 2, 6, dst + 4, dst_stride);
2684  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2685  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2686  dst += (4 * dst_stride);
2687 
2688  LD_SB2(src, src_stride, src3, src4);
2689  src += (2 * src_stride);
2690  XORI_B2_128_SB(src3, src4);
2691  ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2692 
2693  dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
2694  dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);
2695 
2696  LD_SB2(src, src_stride, src5, src6);
2697  src += (2 * src_stride);
2698  XORI_B2_128_SB(src5, src6);
2699  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2700 
2701  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2702  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2703 
2704  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2705  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2706  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2707  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2708  ST_W2(out0, 0, 2, dst, dst_stride);
2709  ST_H2(out0, 2, 6, dst + 4, dst_stride);
2710  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2711  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2712 }
2713 
2714 static void common_vt_4t_8x2_msa(const uint8_t *src, int32_t src_stride,
2715  uint8_t *dst, int32_t dst_stride,
2716  const int8_t *filter)
2717 {
2718  v16i8 src0, src1, src2, src3, src4;
2719  v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2720  v16u8 out;
2721 
2722  src -= src_stride;
2723 
2724  /* rearranging filter_y */
2725  filt = LD_SH(filter);
2726  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2727 
2728  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2729  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2730  ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2731  tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
2732  ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2733  tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
2734  SRARI_H2_SH(tmp0, tmp1, 6);
2735  SAT_SH2_SH(tmp0, tmp1, 7);
2736  out = PCKEV_XORI128_UB(tmp0, tmp1);
2737  ST_D2(out, 0, 1, dst, dst_stride);
2738 }
2739 
2740 static void common_vt_4t_8x6_msa(const uint8_t *src, int32_t src_stride,
2741  uint8_t *dst, int32_t dst_stride,
2742  const int8_t *filter)
2743 {
2744  uint32_t loop_cnt;
2745  uint64_t out0, out1, out2;
2746  v16i8 src0, src1, src2, src3, src4, src5;
2747  v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2748  v8i16 filt, filt0, filt1;
2749 
2750  src -= src_stride;
2751 
2752  /* rearranging filter_y */
2753  filt = LD_SH(filter);
2754  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2755 
2756  LD_SB3(src, src_stride, src0, src1, src2);
2757  src += (3 * src_stride);
2758 
2759  XORI_B3_128_SB(src0, src1, src2);
2760  ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2761 
2762  for (loop_cnt = 2; loop_cnt--;) {
2763  LD_SB3(src, src_stride, src3, src4, src5);
2764  src += (3 * src_stride);
2765 
2766  XORI_B3_128_SB(src3, src4, src5);
2767  ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2768  tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2769  tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2770  tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
2771  SRARI_H2_SH(tmp0, tmp1, 6);
2772  tmp2 = __msa_srari_h(tmp2, 6);
2773  SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2774  PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2775  XORI_B2_128_SH(tmp0, tmp2);
2776 
2777  out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2778  out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2779  out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2780  SD(out0, dst);
2781  dst += dst_stride;
2782  SD(out1, dst);
2783  dst += dst_stride;
2784  SD(out2, dst);
2785  dst += dst_stride;
2786 
2787  src2 = src5;
2788  vec0 = vec3;
2789  vec2 = vec4;
2790  }
2791 }
2792 
2793 static void common_vt_4t_8x4mult_msa(const uint8_t *src, int32_t src_stride,
2794  uint8_t *dst, int32_t dst_stride,
2795  const int8_t *filter, int32_t height)
2796 {
2797  uint32_t loop_cnt;
2798  v16i8 src0, src1, src2, src7, src8, src9, src10;
2799  v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2800  v16u8 tmp0, tmp1;
2801  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2802 
2803  src -= src_stride;
2804 
2805  filt = LD_SH(filter);
2806  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2807 
2808  LD_SB3(src, src_stride, src0, src1, src2);
2809  src += (3 * src_stride);
2810 
2811  XORI_B3_128_SB(src0, src1, src2);
2812  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2813 
2814  for (loop_cnt = (height >> 2); loop_cnt--;) {
2815  LD_SB4(src, src_stride, src7, src8, src9, src10);
2816  src += (4 * src_stride);
2817 
2818  XORI_B4_128_SB(src7, src8, src9, src10);
2819  ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2820  src72_r, src87_r, src98_r, src109_r);
2821  out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
2822  out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
2823  out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
2824  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2825  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2826  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2827  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2828  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2829  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2830  dst += (4 * dst_stride);
2831 
2832  src10_r = src98_r;
2833  src21_r = src109_r;
2834  src2 = src10;
2835  }
2836 }
2837 
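 /* Heights 2 and 6 get dedicated kernels; all other heights are multiples
  * of 4 and share the sliding-window kernel above. */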
2838 static void common_vt_4t_8w_msa(const uint8_t *src, int32_t src_stride,
2839  uint8_t *dst, int32_t dst_stride,
2840  const int8_t *filter, int32_t height)
2841 {
2842  if (2 == height) {
2843  common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2844  } else if (6 == height) {
2845  common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2846  } else {
2847  common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
2848  filter, height);
2849  }
2850 }
2851 
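 /* 12-wide vertical: the left 8 columns come from the right-interleaved
  * row pairs; the right 4 columns are built by packing the left
  * interleaves of two row pairs into one register (src2110, src4332,
  * src6554) and are stored at dst + 8. */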
2852 static void common_vt_4t_12w_msa(const uint8_t *src, int32_t src_stride,
2853  uint8_t *dst, int32_t dst_stride,
2854  const int8_t *filter, int32_t height)
2855 {
2856  uint32_t loop_cnt;
2857  v16i8 src0, src1, src2, src3, src4, src5, src6;
2858  v16u8 out0, out1;
2859  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2860  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2861  v16i8 src2110, src4332, src6554;
2862  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
2863  v8i16 filter_vec;
2864 
2865  src -= (1 * src_stride);
2866 
2867  filter_vec = LD_SH(filter);
2868  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2869 
2870  LD_SB3(src, src_stride, src0, src1, src2);
2871  src += (3 * src_stride);
2872 
2873  XORI_B3_128_SB(src0, src1, src2);
2874  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2875  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2876  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2877 
2878  for (loop_cnt = 4; loop_cnt--;) {
2879  LD_SB4(src, src_stride, src3, src4, src5, src6);
2880  src += (4 * src_stride);
2881 
2882  XORI_B4_128_SB(src3, src4, src5, src6);
2883  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2884  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2885  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2886  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2887  ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2888  src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2889 
2890  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2891  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2892  dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2893  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2894  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2895  dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
2896 
2897  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2898  SRARI_H2_SH(dst0_l, dst1_l, 6);
2899  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2900  SAT_SH2_SH(dst0_l, dst1_l, 7);
2901  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2902  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2903  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2904  out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
2905  ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
2906  dst += (4 * dst_stride);
2907 
2908  src2 = src6;
2909  src10_r = src54_r;
2910  src21_r = src65_r;
2911  src2110 = src6554;
2912  }
2913 }
2914 
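 /* 16-wide vertical: right (ILVR) and left (ILVL) interleaves of each row
  * pair are filtered separately and recombined with PCKEV, four rows per
  * iteration. */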
2915 static void common_vt_4t_16w_msa(const uint8_t *src, int32_t src_stride,
2916  uint8_t *dst, int32_t dst_stride,
2917  const int8_t *filter, int32_t height)
2918 {
2919  uint32_t loop_cnt;
2920  v16i8 src0, src1, src2, src3, src4, src5, src6;
2921  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2922  v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2923  v16u8 tmp0, tmp1, tmp2, tmp3;
2924  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2925 
2926  src -= src_stride;
2927 
2928  filt = LD_SH(filter);
2929  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2930 
2931  LD_SB3(src, src_stride, src0, src1, src2);
2932  src += (3 * src_stride);
2933 
2934  XORI_B3_128_SB(src0, src1, src2);
2935  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2936  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2937 
2938  for (loop_cnt = (height >> 2); loop_cnt--;) {
2939  LD_SB4(src, src_stride, src3, src4, src5, src6);
2940  src += (4 * src_stride);
2941 
2942  XORI_B4_128_SB(src3, src4, src5, src6);
2943  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2944  src32_r, src43_r, src54_r, src65_r);
2945  ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2946  src32_l, src43_l, src54_l, src65_l);
2947  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2948  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2949  out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2950  out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2951  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2952  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2953  out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
2954  out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
2955  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2956  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
2957  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2958  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2959  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2960  out3_r, tmp0, tmp1, tmp2, tmp3);
2961  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2962  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2963  dst += (4 * dst_stride);
2964 
2965  src10_r = src54_r;
2966  src21_r = src65_r;
2967  src10_l = src54_l;
2968  src21_l = src65_l;
2969  src2 = src6;
2970  }
2971 }
2972 
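 /* 24-wide vertical: a 16-wide stream and an 8-wide stream at src/dst + 16
  * advance together, four rows per iteration; the loop is hard-coded for
  * 32 rows. */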
2973 static void common_vt_4t_24w_msa(const uint8_t *src, int32_t src_stride,
2974  uint8_t *dst, int32_t dst_stride,
2975  const int8_t *filter, int32_t height)
2976 {
2977  uint32_t loop_cnt;
2978  uint64_t out0, out1;
2979  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2980  v16i8 src11, filt0, filt1;
2981  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2982  v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2983  v16u8 out;
2984  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2985 
2986  src -= src_stride;
2987 
2988  filt = LD_SH(filter);
2989  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2990 
2991  /* 16 width */
2992  LD_SB3(src, src_stride, src0, src1, src2);
2993  XORI_B3_128_SB(src0, src1, src2);
2994  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2995  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2996 
2997  /* 8 width */
2998  LD_SB3(src + 16, src_stride, src6, src7, src8);
2999  src += (3 * src_stride);
3000  XORI_B3_128_SB(src6, src7, src8);
3001  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3002 
3003  for (loop_cnt = 8; loop_cnt--;) {
3004  /* 16 width */
3005  LD_SB2(src, src_stride, src3, src4);
3006  XORI_B2_128_SB(src3, src4);
3007  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3008  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3009 
3010  /* 8 width */
3011  LD_SB2(src + 16, src_stride, src9, src10);
3012  src += (2 * src_stride);
3013  XORI_B2_128_SB(src9, src10);
3014  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3015 
3016  /* 16 width */
3017  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3018  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3019  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3020  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3021 
3022  /* 8 width */
3023  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3024  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3025 
3026  /* 16 + 8 width */
3027  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3028  SRARI_H2_SH(out0_l, out1_l, 6);
3029  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3030  SAT_SH2_SH(out0_l, out1_l, 7);
3031  out = PCKEV_XORI128_UB(out0_r, out0_l);
3032  ST_UB(out, dst);
3033  PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
3034  XORI_B2_128_SH(out2_r, out3_r);
3035  out0 = __msa_copy_u_d((v2i64) out2_r, 0);
3036  out1 = __msa_copy_u_d((v2i64) out3_r, 0);
3037  SD(out0, dst + 16);
3038  dst += dst_stride;
3039  out = PCKEV_XORI128_UB(out1_r, out1_l);
3040  ST_UB(out, dst);
3041  SD(out1, dst + 16);
3042  dst += dst_stride;
3043 
3044  /* 16 width */
3045  LD_SB2(src, src_stride, src5, src2);
3046  XORI_B2_128_SB(src5, src2);
3047  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3048  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3049 
3050  /* 8 width */
3051  LD_SB2(src + 16, src_stride, src11, src8);
3052  src += (2 * src_stride);
3053  XORI_B2_128_SB(src11, src8);
3054  ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3055 
3056  /* 16 width */
3057  out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
3058  out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
3059  out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
3060  out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
3061 
3062  /* 8 width */
3063  out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
3064  out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
3065 
3066  /* 16 + 8 width */
3067  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3068  SRARI_H2_SH(out0_l, out1_l, 6);
3069  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3070  SAT_SH2_SH(out0_l, out1_l, 7);
3071  out = PCKEV_XORI128_UB(out0_r, out0_l);
3072  ST_UB(out, dst);
3073  out = PCKEV_XORI128_UB(out2_r, out2_r);
3074  ST_D1(out, 0, dst + 16);
3075  dst += dst_stride;
3076  out = PCKEV_XORI128_UB(out1_r, out1_l);
3077  ST_UB(out, dst);
3078  out = PCKEV_XORI128_UB(out3_r, out3_r);
3079  ST_D1(out, 0, dst + 16);
3080  dst += dst_stride;
3081  }
3082 }
3083 
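/*
 * 32-wide vertical 4-tap: two independent 16-wide columns (dst and
 * dst + 16), each with its own sliding window of interleaved rows,
 * two output rows per loop iteration.
 */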
3084 static void common_vt_4t_32w_msa(const uint8_t *src, int32_t src_stride,
3085  uint8_t *dst, int32_t dst_stride,
3086  const int8_t *filter, int32_t height)
3087 {
3088  uint32_t loop_cnt;
3089  v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3090  v16i8 src10_r, src32_r, src76_r, src98_r;
3091  v16i8 src21_r, src43_r, src87_r, src109_r;
3092  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3093  v16i8 src10_l, src32_l, src76_l, src98_l;
3094  v16i8 src21_l, src43_l, src87_l, src109_l;
3095  v8i16 filt;
3096  v16i8 filt0, filt1;
3097  v16u8 out;
3098 
3099  src -= src_stride;
3100 
3101  filt = LD_SH(filter);
3102  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
3103 
3104  /* 16 width */
3105  LD_SB3(src, src_stride, src0, src1, src2);
3106  XORI_B3_128_SB(src0, src1, src2);
3107 
3108  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3109  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3110 
3111  /* next 16 width */
3112  LD_SB3(src + 16, src_stride, src6, src7, src8);
3113  src += (3 * src_stride);
3114 
3115  XORI_B3_128_SB(src6, src7, src8);
3116  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3117  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3118 
3119  for (loop_cnt = (height >> 1); loop_cnt--;) {
3120  /* 16 width */
3121  LD_SB2(src, src_stride, src3, src4);
3122  XORI_B2_128_SB(src3, src4);
3123  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3124  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3125 
3126  /* 16 width */
3127  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3128  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3129  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3130  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3131 
3132  /* 16 width */
3133  SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
3134  SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
3135  out = PCKEV_XORI128_UB(out0_r, out0_l);
3136  ST_UB(out, dst);
3137  out = PCKEV_XORI128_UB(out1_r, out1_l);
3138  ST_UB(out, dst + dst_stride);
3139 
3140  src10_r = src32_r;
3141  src21_r = src43_r;
3142  src10_l = src32_l;
3143  src21_l = src43_l;
3144  src2 = src4;
3145 
3146  /* next 16 width */
3147  LD_SB2(src + 16, src_stride, src9, src10);
3148  src += (2 * src_stride);
3149  XORI_B2_128_SB(src9, src10);
3150  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3151  ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3152 
3153  /* next 16 width */
3154  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3155  out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
3156  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3157  out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
3158 
3159  /* next 16 width */
3160  SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
3161  SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
3162  out = PCKEV_XORI128_UB(out2_r, out2_l);
3163  ST_UB(out, dst + 16);
3164  out = PCKEV_XORI128_UB(out3_r, out3_l);
3165  ST_UB(out, dst + 16 + dst_stride);
3166 
3167  dst += 2 * dst_stride;
3168 
3169  src76_r = src98_r;
3170  src87_r = src109_r;
3171  src76_l = src98_l;
3172  src87_l = src109_l;
3173  src8 = src10;
3174  }
3175 }
3176 
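/*
 * hevc_hv_uni_4t_*: separable 2D (hv) EPEL paths.  A horizontal 4-tap
 * pass over 8-bit samples yields 16-bit intermediates; a vertical
 * 4-tap pass with 16-bit coefficients yields 32-bit sums.  src is
 * rewound by (src_stride + 1) so the taps cover one row above and one
 * column left of the block.  Conceptually, per output pixel (each pass
 * has taps summing to 64):
 *
 *     h[y][x] = f_x[0]*src[y][x-1] + ... + f_x[3]*src[y][x+2]
 *     v       = (f_y[0]*h[y-1][x] + ... + f_y[3]*h[y+2][x]) >> 6
 *     out     = clip_u8((v + 32) >> 6)
 *
 * i.e. a plain >> 6 after the vertical sum, then a rounding shift by 6
 * (SRARI); the 128 bias is handled by the XORI/PCKEV_XORI128_UB pairs
 * exactly as in the 1D paths.
 */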
3177 static void hevc_hv_uni_4t_4x2_msa(const uint8_t *src,
3178  int32_t src_stride,
3179  uint8_t *dst,
3180  int32_t dst_stride,
3181  const int8_t *filter_x,
3182  const int8_t *filter_y)
3183 {
3184  v16u8 out;
3185  v16i8 src0, src1, src2, src3, src4;
3186  v8i16 filt0, filt1;
3187  v8i16 filt_h0, filt_h1;
3188  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3189  v16i8 mask1;
3190  v8i16 filter_vec, tmp;
3191  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3192  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
3193  v4i32 dst0, dst1;
3194 
3195  src -= (src_stride + 1);
3196 
3197  filter_vec = LD_SH(filter_x);
3198  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3199 
3200  filter_vec = LD_SH(filter_y);
3201  UNPCK_R_SB_SH(filter_vec, filter_vec);
3202 
3203  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3204 
3205  mask1 = mask0 + 2;
3206 
3207  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3208  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3209 
3210  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3211  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3212  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3213 
3214  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3215  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3216  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3217 
3218  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3219  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3220 
3221  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3222  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3223  dst0 >>= 6;
3224  dst1 >>= 6;
3225  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3226  tmp = __msa_srari_h(tmp, 6);
3227  tmp = __msa_sat_s_h(tmp, 7);
3228  out = PCKEV_XORI128_UB(tmp, tmp);
3229  ST_W2(out, 0, 1, dst, dst_stride);
3230 }
3231 
3232 static void hevc_hv_uni_4t_4x4_msa(const uint8_t *src,
3233  int32_t src_stride,
3234  uint8_t *dst,
3235  int32_t dst_stride,
3236  const int8_t *filter_x,
3237  const int8_t *filter_y)
3238 {
3239  v16u8 out;
3240  v16i8 src0, src1, src2, src3, src4, src5, src6;
3241  v8i16 filt0, filt1;
3242  v8i16 filt_h0, filt_h1;
3243  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3244  v16i8 mask1;
3245  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3246  v8i16 filter_vec, tmp0, tmp1;
3247  v8i16 dst30, dst41, dst52, dst63;
3248  v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3249  v4i32 dst0, dst1, dst2, dst3;
3250 
3251  src -= (src_stride + 1);
3252 
3253  filter_vec = LD_SH(filter_x);
3254  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3255 
3256  filter_vec = LD_SH(filter_y);
3257  UNPCK_R_SB_SH(filter_vec, filter_vec);
3258 
3259  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3260 
3261  mask1 = mask0 + 2;
3262 
3263  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3264  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3265 
3266  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3267  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3268  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3269  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3270 
3271  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3272  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3273  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3274  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3275 
3276  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3277  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3278  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3279  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3280  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3281  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3282  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
3283  SRA_4V(dst0, dst1, dst2, dst3, 6);
3284  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3285  SRARI_H2_SH(tmp0, tmp1, 6);
3286  SAT_SH2_SH(tmp0, tmp1, 7);
3287  out = PCKEV_XORI128_UB(tmp0, tmp1);
3288  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3289 }
3290 
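/*
 * 4-wide hv path for heights that are multiples of 8.  Rows are paired
 * inside the horizontal shuffles (e.g. rows 3 and 7 share one vector,
 * hence dst73), so eight rows of 4-wide intermediates fit in four
 * vectors and the vertical pass emits eight rows per iteration.
 */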
3291 static void hevc_hv_uni_4t_4multx8mult_msa(const uint8_t *src,
3292  int32_t src_stride,
3293  uint8_t *dst,
3294  int32_t dst_stride,
3295  const int8_t *filter_x,
3296  const int8_t *filter_y,
3297  int32_t height)
3298 {
3299  uint32_t loop_cnt;
3300  v16u8 out0, out1;
3301  v16i8 src0, src1, src2, src3, src4, src5;
3302  v16i8 src6, src7, src8, src9, src10;
3303  v8i16 filt0, filt1;
3304  v8i16 filt_h0, filt_h1;
3305  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3306  v16i8 mask1;
3307  v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
3308  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3309  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3310  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3311  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3312  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3313  v8i16 dst98_r, dst109_r;
3314 
3315  src -= (src_stride + 1);
3316 
3317  filter_vec = LD_SH(filter_x);
3318  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3319 
3320  filter_vec = LD_SH(filter_y);
3321  UNPCK_R_SB_SH(filter_vec, filter_vec);
3322 
3323  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3324 
3325  mask1 = mask0 + 2;
3326 
3327  LD_SB3(src, src_stride, src0, src1, src2);
3328  src += (3 * src_stride);
3329 
3330  XORI_B3_128_SB(src0, src1, src2);
3331 
3332  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3333  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3334  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3335  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3336  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3337  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3338 
3339  for (loop_cnt = height >> 3; loop_cnt--;) {
3340  LD_SB8(src, src_stride,
3341  src3, src4, src5, src6, src7, src8, src9, src10);
3342  src += (8 * src_stride);
3343 
3344  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3345 
3346  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3347  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3348  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3349  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3350 
3351  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3352  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3353  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3354  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3355 
3356  dst32_r = __msa_ilvr_h(dst73, dst22);
3357  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3358  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3359  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3360  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3361  dst76_r = __msa_ilvr_h(dst22, dst106);
3362 
3363  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3364  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3365  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3366  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3367  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3368  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3369  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3370  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3371  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3372  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3373  PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3374  dst5_r, dst4_r, dst7_r, dst6_r,
3375  tmp0, tmp1, tmp2, tmp3);
3376  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3377  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3378  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3379  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3380  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3381  dst += (8 * dst_stride);
3382 
3383  dst10_r = dst98_r;
3384  dst21_r = dst109_r;
3385  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3386  }
3387 }
3388 
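/*
 * Dispatch on block height; heights other than 2, 4 or a multiple of 8
 * are not handled and fall through without writing anything.
 */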
3389 static void hevc_hv_uni_4t_4w_msa(const uint8_t *src,
3390  int32_t src_stride,
3391  uint8_t *dst,
3392  int32_t dst_stride,
3393  const int8_t *filter_x,
3394  const int8_t *filter_y,
3395  int32_t height)
3396 {
3397  if (2 == height) {
3398  hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3399  filter_x, filter_y);
3400  } else if (4 == height) {
3401  hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3402  filter_x, filter_y);
3403  } else if (0 == (height % 8)) {
3404  hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3405  filter_x, filter_y, height);
3406  }
3407 }
3408 
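/*
 * 6-wide hv path: a single eight-row pass computes 8-wide results but
 * stores only six columns -- the left four bytes with ST_W8 and the
 * remaining two (columns 4..5) with ST_H8 at dst + 4.
 */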
3409 static void hevc_hv_uni_4t_6w_msa(const uint8_t *src,
3410  int32_t src_stride,
3411  uint8_t *dst,
3412  int32_t dst_stride,
3413  const int8_t *filter_x,
3414  const int8_t *filter_y,
3415  int32_t height)
3416 {
3417  v16u8 out0, out1, out2;
3418  v16i8 src0, src1, src2, src3, src4, src5, src6;
3419  v16i8 src7, src8, src9, src10;
3420  v8i16 filt0, filt1;
3421  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3422  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3423  v16i8 mask1;
3424  v8i16 filt_h0, filt_h1, filter_vec;
3425  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3426  v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3427  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3428  v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
3429  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3430  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3431  v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
3432  v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
3433  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3434 
3435  src -= (src_stride + 1);
3436 
3437  filter_vec = LD_SH(filter_x);
3438  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3439 
3440  filter_vec = LD_SH(filter_y);
3441  UNPCK_R_SB_SH(filter_vec, filter_vec);
3442 
3443  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3444 
3445  mask1 = mask0 + 2;
3446 
3447  LD_SB3(src, src_stride, src0, src1, src2);
3448  src += (3 * src_stride);
3449 
3450  XORI_B3_128_SB(src0, src1, src2);
3451 
3452  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3453  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3454  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3455 
3456  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3457  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3458  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3459 
3460  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3461  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3462 
3463  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3464  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3465 
3466  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3467  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3468  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3469  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3470 
3471  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3472  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3473  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3474  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3475 
3476  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3477  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3478  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3479  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3480 
3481  dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3482  dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3483  dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3484  dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3485 
3486  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3487  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3488  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3489  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3490  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
3491  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
3492  ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
3493  ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
3494 
3495  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3496  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3497  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
3498 
3499  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3500  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3501  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3502  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3503  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3504  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3505  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3506  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3507  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
3508  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
3509  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
3510  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
3511  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3512  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3513  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3514  PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3515  PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3516  PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3517  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3518  SRARI_H2_SH(tmp4, tmp5, 6);
3519  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3520  SAT_SH2_SH(tmp4, tmp5, 7);
3521  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3522  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3523  out2 = PCKEV_XORI128_UB(tmp4, tmp5);
3524  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3525  ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
3526 }
3527 
3528 static void hevc_hv_uni_4t_8x2_msa(const uint8_t *src,
3529  int32_t src_stride,
3530  uint8_t *dst,
3531  int32_t dst_stride,
3532  const int8_t *filter_x,
3533  const int8_t *filter_y)
3534 {
3535  v16u8 out;
3536  v16i8 src0, src1, src2, src3, src4;
3537  v8i16 filt0, filt1;
3538  v8i16 filt_h0, filt_h1, filter_vec;
3539  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3540  v16i8 mask1;
3541  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3542  v8i16 dst0, dst1, dst2, dst3, dst4;
3543  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3544  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3545  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3546  v8i16 out0_r, out1_r;
3547 
3548  src -= (src_stride + 1);
3549 
3550  filter_vec = LD_SH(filter_x);
3551  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3552 
3553  filter_vec = LD_SH(filter_y);
3554  UNPCK_R_SB_SH(filter_vec, filter_vec);
3555 
3556  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3557 
3558  mask1 = mask0 + 2;
3559 
3560  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3561  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3562 
3563  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3564  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3565  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3566  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3567  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3568 
3569  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3570  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3571  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3572  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3573  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3574  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3575  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3576  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3577  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3578  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3579  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3580  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3581  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3582  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3583  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3584  SRARI_H2_SH(out0_r, out1_r, 6);
3585  SAT_SH2_SH(out0_r, out1_r, 7);
3586  out = PCKEV_XORI128_UB(out0_r, out1_r);
3587  ST_D2(out, 0, 1, dst, dst_stride);
3588 }
3589 
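/*
 * hv path for 4-row blocks whose width is a multiple of 8: one
 * four-row pass per 8-wide column, repeated width8mult times with
 * src/dst advancing by 8 bytes per column.
 */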
3590 static void hevc_hv_uni_4t_8multx4_msa(const uint8_t *src,
3591  int32_t src_stride,
3592  uint8_t *dst,
3593  int32_t dst_stride,
3594  const int8_t *filter_x,
3595  const int8_t *filter_y,
3596  int32_t width8mult)
3597 {
3598  uint32_t cnt;
3599  v16u8 out0, out1;
3600  v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
3601  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3602  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
3603  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
3604  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3605  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3606  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3607 
3608  src -= (src_stride + 1);
3609 
3610  filter_vec = LD_SH(filter_x);
3611  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3612 
3613  filter_vec = LD_SH(filter_y);
3614  UNPCK_R_SB_SH(filter_vec, filter_vec);
3615 
3616  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3617 
3618  mask0 = LD_SB(ff_hevc_mask_arr);
3619  mask1 = mask0 + 2;
3620 
3621  for (cnt = width8mult; cnt--;) {
3622  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3623  src += 8;
3624  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3625 
3626  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3627  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3628  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3629 
3630  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3631  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3632  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3633 
3634  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3635  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3636 
3637  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3638  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3639  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3640  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3641 
3642  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3643  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3644  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3645  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3646 
3647  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3648  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3649  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3650  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3651 
3652  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3653  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3654  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3655  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3656  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3657  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3658  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3659  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3660 
3661  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3662  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3663 
3664  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3665  dst3_r, tmp0, tmp1, tmp2, tmp3);
3666  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3667  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3668  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3669  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3670  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3671  dst += 8;
3672  }
3673 }
3674 
3675 static void hevc_hv_uni_4t_8x6_msa(const uint8_t *src,
3676  int32_t src_stride,
3677  uint8_t *dst,
3678  int32_t dst_stride,
3679  const int8_t *filter_x,
3680  const int8_t *filter_y)
3681 {
3682  v16u8 out0, out1, out2;
3683  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3684  v8i16 filt0, filt1;
3685  v8i16 filt_h0, filt_h1, filter_vec;
3686  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3687  v16i8 mask1;
3688  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3689  v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3690  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3691  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3692  v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3693  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3694  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3695  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3696  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3697  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3698 
3699  src -= (src_stride + 1);
3700 
3701  filter_vec = LD_SH(filter_x);
3702  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3703 
3704  filter_vec = LD_SH(filter_y);
3705  UNPCK_R_SB_SH(filter_vec, filter_vec);
3706 
3707  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3708 
3709  mask1 = mask0 + 2;
3710 
3711  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3712  src += (5 * src_stride);
3713  LD_SB4(src, src_stride, src5, src6, src7, src8);
3714 
3715  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3716  XORI_B4_128_SB(src5, src6, src7, src8);
3717 
3718  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3719  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3720  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3721  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3722  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3723  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3724  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3725  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3726  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3727 
3728  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3729  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3730  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3731  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3732  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3733  dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
3734  dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
3735  dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
3736  dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
3737 
3738  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3739  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3740  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3741  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3742  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3743  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3744  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3745  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3746 
3747  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3748  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3749  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3750  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3751  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3752  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3753  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3754  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3755  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3756  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3757  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3758  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3759 
3760  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3761  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3762  SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3763  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3764  dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3765  PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3766  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3767  SRARI_H2_SH(out4_r, out5_r, 6);
3768  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3769  SAT_SH2_SH(out4_r, out5_r, 7);
3770  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3771  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3772  out2 = PCKEV_XORI128_UB(out4_r, out5_r);
3773 
3774  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3775  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
3776 }
3777 
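/*
 * General hv kernel: for each of the width8mult 8-wide columns it keeps
 * a two-row window of horizontal intermediates and emits four output
 * rows per inner iteration, so height must be a multiple of 4.  The
 * wider wrappers further down reuse it with width8mult = width / 8.
 */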
3778 static void hevc_hv_uni_4t_8multx4mult_msa(const uint8_t *src,
3779  int32_t src_stride,
3780  uint8_t *dst,
3781  int32_t dst_stride,
3782  const int8_t *filter_x,
3783  const int8_t *filter_y,
3784  int32_t height,
3785  int32_t width8mult)
3786 {
3787  uint32_t loop_cnt, cnt;
3788  const uint8_t *src_tmp;
3789  uint8_t *dst_tmp;
3790  v16u8 out0, out1;
3791  v16i8 src0, src1, src2, src3, src4, src5, src6;
3792  v8i16 filt0, filt1;
3793  v8i16 filt_h0, filt_h1, filter_vec;
3794  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3795  v16i8 mask1;
3796  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3797  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3798  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3799  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3800  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3801  v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
3802  v8i16 out0_r, out1_r, out2_r, out3_r;
3803 
3804  src -= (src_stride + 1);
3805 
3806  filter_vec = LD_SH(filter_x);
3807  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3808 
3809  filter_vec = LD_SH(filter_y);
3810  UNPCK_R_SB_SH(filter_vec, filter_vec);
3811 
3812  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3813 
3814  mask1 = mask0 + 2;
3815 
3816  for (cnt = width8mult; cnt--;) {
3817  src_tmp = src;
3818  dst_tmp = dst;
3819 
3820  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3821  src_tmp += (3 * src_stride);
3822 
3823  XORI_B3_128_SB(src0, src1, src2);
3824 
3825  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3826  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3827  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3828 
3829  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3830  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3831  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3832 
3833  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3834  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3835 
3836  for (loop_cnt = (height >> 2); loop_cnt--;) {
3837  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3838  src_tmp += (4 * src_stride);
3839 
3840  XORI_B4_128_SB(src3, src4, src5, src6);
3841 
3842  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3843  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3844  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3845  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3846 
3847  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3848  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3849  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3850  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3851 
3852  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3853  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3854  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3855  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3856 
3857  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3858  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3859  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3860  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3861  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3862  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3863  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3864  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3865 
3866  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3867  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3868 
3869  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3870  dst2_l, dst2_r, dst3_l, dst3_r,
3871  out0_r, out1_r, out2_r, out3_r);
3872 
3873  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3874  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3875  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3876  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3877  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3878  dst_tmp += (4 * dst_stride);
3879 
3880  dst10_r = dst54_r;
3881  dst10_l = dst54_l;
3882  dst21_r = dst65_r;
3883  dst21_l = dst65_l;
3884  dst2 = dst6;
3885  }
3886 
3887  src += 8;
3888  dst += 8;
3889  }
3890 }
3891 
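/*
 * Dispatch for the 8-wide hv case: heights 2, 4 and 6 get dedicated
 * kernels, everything else requires height % 4 == 0.
 */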
3892 static void hevc_hv_uni_4t_8w_msa(const uint8_t *src,
3893  int32_t src_stride,
3894  uint8_t *dst,
3895  int32_t dst_stride,
3896  const int8_t *filter_x,
3897  const int8_t *filter_y,
3898  int32_t height)
3899 {
3900  if (2 == height) {
3901  hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
3902  filter_x, filter_y);
3903  } else if (4 == height) {
3904  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
3905  filter_x, filter_y, 1);
3906  } else if (6 == height) {
3907  hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
3908  filter_x, filter_y);
3909  } else if (0 == (height % 4)) {
3910  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
3911  filter_x, filter_y, height, 1);
3912  }
3913 }
3914 
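/*
 * 12-wide hv path, fixed at 16 rows: the left 8 columns follow the
 * 8-wide kernel (four rows per iteration, loop_cnt = 4), the right 4
 * columns use the paired-row trick of the 4-wide path (eight rows per
 * iteration, loop_cnt = 2).
 */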
3915 static void hevc_hv_uni_4t_12w_msa(const uint8_t *src,
3916  int32_t src_stride,
3917  uint8_t *dst,
3918  int32_t dst_stride,
3919  const int8_t *filter_x,
3920  const int8_t *filter_y,
3921  int32_t height)
3922 {
3923  uint32_t loop_cnt;
3924  const uint8_t *src_tmp;
3925  uint8_t *dst_tmp;
3926  v16u8 out0, out1;
3927  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3928  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3929  v16i8 mask0, mask1, mask2, mask3;
3930  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
3931  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
3932  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3933  v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
3934  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3935  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3936  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3937  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3938 
3939  src -= (src_stride + 1);
3940 
3941  filter_vec = LD_SH(filter_x);
3942  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3943 
3944  filter_vec = LD_SH(filter_y);
3945  UNPCK_R_SB_SH(filter_vec, filter_vec);
3946 
3947  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3948 
3949  mask0 = LD_SB(ff_hevc_mask_arr);
3950  mask1 = mask0 + 2;
3951 
3952  src_tmp = src;
3953  dst_tmp = dst;
3954 
3955  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3956  src_tmp += (3 * src_stride);
3957 
3958  XORI_B3_128_SB(src0, src1, src2);
3959 
3960  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3961  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3962  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3963 
3964  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3965  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3966  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3967 
3968  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3969  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3970 
3971  for (loop_cnt = 4; loop_cnt--;) {
3972  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3973  src_tmp += (4 * src_stride);
3974  XORI_B4_128_SB(src3, src4, src5, src6);
3975 
3976  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3977  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3978  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3979  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3980 
3981  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3982  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3983  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3984  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3985 
3986  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3987  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3988  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3989  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3990 
3991  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3992  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3993  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3994  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3995  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3996  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3997  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3998  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3999 
4000  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4001  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4002 
4003  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4004  dst3_r, tmp0, tmp1, tmp2, tmp3);
4005  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
4006  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4007  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4008  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4009  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
4010  dst_tmp += (4 * dst_stride);
4011 
4012  dst10_r = dst54_r;
4013  dst10_l = dst54_l;
4014  dst21_r = dst65_r;
4015  dst21_l = dst65_l;
4016  dsth2 = dsth6;
4017  }
4018 
4019  src += 8;
4020  dst += 8;
4021 
4022  mask2 = LD_SB(ff_hevc_mask_arr + 16);
4023  mask3 = mask2 + 2;
4024 
4025  LD_SB3(src, src_stride, src0, src1, src2);
4026  src += (3 * src_stride);
4027  XORI_B3_128_SB(src0, src1, src2);
4028  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
4029  VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
4030 
4031  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4032  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4033 
4034  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4035  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4036 
4037  for (loop_cnt = 2; loop_cnt--;) {
4038  LD_SB8(src, src_stride,
4039  src3, src4, src5, src6, src7, src8, src9, src10);
4040  src += (8 * src_stride);
4041  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4042  VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
4043  VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
4044  VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
4045  VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
4046 
4047  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4048  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4049  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4050  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4051 
4052  dst32_r = __msa_ilvr_h(dst73, dst22);
4053  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4054  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4055  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4056  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4057  dst76_r = __msa_ilvr_h(dst22, dst106);
4058 
4059  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4060  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4061  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4062  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4063  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4064  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4065  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4066  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4067  SRA_4V(dst0, dst1, dst2, dst3, 6);
4068  SRA_4V(dst4, dst5, dst6, dst7, 6);
4069  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
4070  tmp0, tmp1, tmp2, tmp3);
4071  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
4072  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4073  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4074  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4075  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4076  dst += (8 * dst_stride);
4077 
4078  dst10_r = dst98_r;
4079  dst21_r = dst109_r;
4080  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4081  }
4082 }
4083 
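/* The 16/24/32-wide hv cases just run the 8-wide kernels over 2, 3 or
 * 4 columns. */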
4084 static void hevc_hv_uni_4t_16w_msa(const uint8_t *src,
4085  int32_t src_stride,
4086  uint8_t *dst,
4087  int32_t dst_stride,
4088  const int8_t *filter_x,
4089  const int8_t *filter_y,
4090  int32_t height)
4091 {
4092  if (4 == height) {
4093  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride, filter_x,
4094  filter_y, 2);
4095  } else {
4096  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4097  filter_x, filter_y, height, 2);
4098  }
4099 }
4100 
4101 static void hevc_hv_uni_4t_24w_msa(const uint8_t *src,
4102  int32_t src_stride,
4103  uint8_t *dst,
4104  int32_t dst_stride,
4105  const int8_t *filter_x,
4106  const int8_t *filter_y,
4107  int32_t height)
4108 {
4109  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4110  filter_x, filter_y, height, 3);
4111 }
4112 
4113 static void hevc_hv_uni_4t_32w_msa(const uint8_t *src,
4114  int32_t src_stride,
4115  uint8_t *dst,
4116  int32_t dst_stride,
4117  const int8_t *filter_x,
4118  const int8_t *filter_y,
4119  int32_t height)
4120 {
4121  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4122  filter_x, filter_y, height, 4);
4123 }
4124 
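/*
 * Entry-point generators.  UNI_MC_COPY(W) emits the integer-pel (no
 * filtering) function for width W; e.g. UNI_MC_COPY(8) expands to
 * ff_hevc_put_hevc_uni_pel_pixels8_8_msa(), which forwards to
 * copy_width8_msa() and ignores mx, my and width.
 */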
4125 #define UNI_MC_COPY(WIDTH) \
4126 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4127  ptrdiff_t dst_stride, \
4128  const uint8_t *src, \
4129  ptrdiff_t src_stride, \
4130  int height, \
4131  intptr_t mx, \
4132  intptr_t my, \
4133  int width) \
4134 { \
4135  copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
4136 }
4137 
4138 UNI_MC_COPY(8);
4139 UNI_MC_COPY(12);
4140 UNI_MC_COPY(16);
4141 UNI_MC_COPY(24);
4142 UNI_MC_COPY(32);
4143 UNI_MC_COPY(48);
4144 UNI_MC_COPY(64);
4145 
4146 #undef UNI_MC_COPY
4147 
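/*
 * UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) emits a uni-prediction
 * wrapper: PEL selects the filter table (qpel = 8-tap luma, epel =
 * 4-tap chroma), FILT_DIR selects which fractional offset (mx or my)
 * indexes that table, and DIR1/TAP name the common_* kernel.  E.g.
 * UNI_MC(qpel, h, 4, 8, hz, mx) expands to
 * ff_hevc_put_hevc_uni_qpel_h4_8_msa(), calling common_hz_8t_4w_msa()
 * with ff_hevc_qpel_filters[mx].
 */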
4148 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4149 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4150  ptrdiff_t dst_stride, \
4151  const uint8_t *src, \
4152  ptrdiff_t src_stride, \
4153  int height, \
4154  intptr_t mx, \
4155  intptr_t my, \
4156  int width) \
4157 { \
4158  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR]; \
4159  \
4160  common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4161  filter, height); \
4162 }
4163 
4164 UNI_MC(qpel, h, 4, 8, hz, mx);
4165 UNI_MC(qpel, h, 8, 8, hz, mx);
4166 UNI_MC(qpel, h, 12, 8, hz, mx);
4167 UNI_MC(qpel, h, 16, 8, hz, mx);
4168 UNI_MC(qpel, h, 24, 8, hz, mx);
4169 UNI_MC(qpel, h, 32, 8, hz, mx);
4170 UNI_MC(qpel, h, 48, 8, hz, mx);
4171 UNI_MC(qpel, h, 64, 8, hz, mx);
4172 
4173 UNI_MC(qpel, v, 4, 8, vt, my);
4174 UNI_MC(qpel, v, 8, 8, vt, my);
4175 UNI_MC(qpel, v, 12, 8, vt, my);
4176 UNI_MC(qpel, v, 16, 8, vt, my);
4177 UNI_MC(qpel, v, 24, 8, vt, my);
4178 UNI_MC(qpel, v, 32, 8, vt, my);
4179 UNI_MC(qpel, v, 48, 8, vt, my);
4180 UNI_MC(qpel, v, 64, 8, vt, my);
4181 
4182 UNI_MC(epel, h, 4, 4, hz, mx);
4183 UNI_MC(epel, h, 6, 4, hz, mx);
4184 UNI_MC(epel, h, 8, 4, hz, mx);
4185 UNI_MC(epel, h, 12, 4, hz, mx);
4186 UNI_MC(epel, h, 16, 4, hz, mx);
4187 UNI_MC(epel, h, 24, 4, hz, mx);
4188 UNI_MC(epel, h, 32, 4, hz, mx);
4189 
4190 UNI_MC(epel, v, 4, 4, vt, my);
4191 UNI_MC(epel, v, 6, 4, vt, my);
4192 UNI_MC(epel, v, 8, 4, vt, my);
4193 UNI_MC(epel, v, 12, 4, vt, my);
4194 UNI_MC(epel, v, 16, 4, vt, my);
4195 UNI_MC(epel, v, 24, 4, vt, my);
4196 UNI_MC(epel, v, 32, 4, vt, my);
4197 
4198 #undef UNI_MC
4199 
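/*
 * UNI_MC_HV(PEL, WIDTH, TAP) emits the 2D wrapper: mx and my each index
 * the PEL filter table and the call lands in the matching
 * hevc_hv_uni_*t_*w_msa kernel above.  E.g. UNI_MC_HV(epel, 8, 4)
 * expands to ff_hevc_put_hevc_uni_epel_hv8_8_msa(), calling
 * hevc_hv_uni_4t_8w_msa().
 */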
4200 #define UNI_MC_HV(PEL, WIDTH, TAP) \
4201 void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
4202  ptrdiff_t dst_stride, \
4203  const uint8_t *src, \
4204  ptrdiff_t src_stride, \
4205  int height, \
4206  intptr_t mx, \
4207  intptr_t my, \
4208  int width) \
4209 { \
4210  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx]; \
4211  const int8_t *filter_y = ff_hevc_##PEL##_filters[my]; \
4212  \
4213  hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4214  filter_x, filter_y, height); \
4215 }
4216 
4217 UNI_MC_HV(qpel, 4, 8);
4218 UNI_MC_HV(qpel, 8, 8);
4219 UNI_MC_HV(qpel, 12, 8);
4220 UNI_MC_HV(qpel, 16, 8);
4221 UNI_MC_HV(qpel, 24, 8);
4222 UNI_MC_HV(qpel, 32, 8);
4223 UNI_MC_HV(qpel, 48, 8);
4224 UNI_MC_HV(qpel, 64, 8);
4225 
4226 UNI_MC_HV(epel, 4, 4);
4227 UNI_MC_HV(epel, 6, 4);
4228 UNI_MC_HV(epel, 8, 4);
4229 UNI_MC_HV(epel, 12, 4);
4230 UNI_MC_HV(epel, 16, 4);
4231 UNI_MC_HV(epel, 24, 4);
4232 UNI_MC_HV(epel, 32, 4);
4233 
4234 #undef UNI_MC_HV
common_vt_8t_16w_msa
static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1154
XORI_B2_128_SH
#define XORI_B2_128_SH(...)
Definition: generic_macros_msa.h:1836
VSHF_B2_SB
#define VSHF_B2_SB(...)
Definition: generic_macros_msa.h:662
UNI_MC
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
Definition: hevc_mc_uni_msa.c:4148
LD_SB4
#define LD_SB4(...)
Definition: generic_macros_msa.h:297
common_vt_8t_24w_msa
static void common_vt_8t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1322
LD_UB8
#define LD_UB8(...)
Definition: generic_macros_msa.h:335
ILVR_H2_SH
#define ILVR_H2_SH(...)
Definition: generic_macros_msa.h:1392
hevc_hv_uni_4t_8x2_msa
static void hevc_hv_uni_4t_8x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_msa.c:3528
HORIZ_8TAP_4WID_4VECS_FILT
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1)
Definition: hevc_mc_uni_msa.c:34
common_hz_4t_8x4mult_msa
static void common_hz_4t_8x4mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2239
DPADD_SB2_SH
#define DPADD_SB2_SH(...)
Definition: generic_macros_msa.h:833
hevc_hv_uni_4t_8multx4_msa
static void hevc_hv_uni_4t_8multx4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t width8mult)
Definition: hevc_mc_uni_msa.c:3590
SRARI_H2_SH
#define SRARI_H2_SH(...)
Definition: generic_macros_msa.h:2059
out
FILE * out
Definition: movenc.c:55
common_hz_4t_4x4_msa
static void common_hz_4t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2031
SPLATI_H4_SH
#define SPLATI_H4_SH(...)
Definition: generic_macros_msa.h:1674
common_vt_4t_32w_msa
static void common_vt_4t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:3084
src1
const pixel * src1
Definition: h264pred_template.c:421
SAT_SH4_SH
#define SAT_SH4_SH(...)
Definition: generic_macros_msa.h:1615
common_hz_8t_4x16_msa
static void common_hz_8t_4x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:352
VSHF_B3_SB
#define VSHF_B3_SB(in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, out0, out1, out2)
Definition: vp8_mc_lsx.c:54
PCKEV_H2_SW
#define PCKEV_H2_SW(...)
Definition: generic_macros_msa.h:1760
hevc_hv_uni_4t_4multx8mult_msa
static void hevc_hv_uni_4t_4multx8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3291
common_vt_4t_8x6_msa
static void common_vt_4t_8x6_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2740
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
SRARI_H4_SH
#define SRARI_H4_SH(...)
Definition: generic_macros_msa.h:2067
common_hz_4t_16w_msa
static void common_hz_4t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2340
SAT_SH2_SH
#define SAT_SH2_SH(...)
Definition: generic_macros_msa.h:1601
copy_width32_msa
static void copy_width32_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:219
ST_D1
#define ST_D1(in, idx, pdst)
Definition: generic_macros_msa.h:485
hevc_hv_uni_8t_12w_msa
static void hevc_hv_uni_8t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1709
LD_SH
#define LD_SH(...)
Definition: generic_macros_msa.h:35
common_hz_8t_24w_msa
static void common_hz_8t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:610
hevc_hv_uni_8t_8multx2mult_msa
static void hevc_hv_uni_8t_8multx2mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
Definition: hevc_mc_uni_msa.c:1558
common_vt_4t_4x2_msa
static void common_vt_4t_4x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2554
VSHF_B4_SB
#define VSHF_B4_SB(...)
Definition: generic_macros_msa.h:680
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
PCKEV_B4_UB
#define PCKEV_B4_UB(...)
Definition: generic_macros_msa.h:1739
ST_UB8
#define ST_UB8(...)
Definition: generic_macros_msa.h:391
hevc_hv_uni_8t_64w_msa
static void hevc_hv_uni_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1992
ST_UB4
#define ST_UB4(...)
Definition: generic_macros_msa.h:374
SAT_SH3_SH
#define SAT_SH3_SH(...)
Definition: generic_macros_msa.h:1608
DOTP_SB2_SH
#define DOTP_SB2_SH(...)
Definition: generic_macros_msa.h:768
common_hz_8t_4w_msa
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:408
hevc_hv_uni_8t_4w_msa
static void hevc_hv_uni_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1357
mx
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t mx
Definition: dsp.h:53
XORI_B4_128_SB
#define XORI_B4_128_SB(...)
Definition: generic_macros_msa.h:1851
copy_width16_msa
static void copy_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:160
hevc_hv_uni_8t_48w_msa
static void hevc_hv_uni_8t_48w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1980
generic_macros_msa.h
ST_W4
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: vp8_lpf_lsx.c:234
hevc_hv_uni_8t_32w_msa
static void hevc_hv_uni_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1968
ST12x8_UB
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
Definition: generic_macros_msa.h:527
common_hz_8t_4x8_msa
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:315
LD_SB
#define LD_SB(...)
Definition: generic_macros_msa.h:33
LD_UB
#define LD_UB(...)
Definition: generic_macros_msa.h:32
LD_SB5
#define LD_SB5(...)
Definition: generic_macros_msa.h:308
aligned
static int aligned(int val)
Definition: dashdec.c:171
copy_width24_msa
static void copy_width24_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:196
common_hz_4t_4w_msa
static void common_hz_4t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2141
SW
#define SW(val, pdst)
Definition: generic_macros_msa.h:167
copy_width64_msa
static void copy_width64_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:257
common_vt_4t_8x2_msa
static void common_vt_4t_8x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2714
ILVL_H2_SH
#define ILVL_H2_SH(...)
Definition: generic_macros_msa.h:1292
common_hz_8t_64w_msa
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:827
HEVC_FILT_8TAP_SH
#define HEVC_FILT_8TAP_SH(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
Definition: hevc_macros_msa.h:24
ST_H8
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:429
hevc_hv_uni_4t_24w_msa
static void hevc_hv_uni_4t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:4101
common_hz_8t_8w_msa
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:421
UNPCK_R_SB_SH
#define UNPCK_R_SB_SH(in, out)
Definition: generic_macros_msa.h:2156
SRA_4V
#define SRA_4V(in0, in1, in2, in3, shift)
Definition: generic_macros_msa.h:1939
SAT_SW4_SW
#define SAT_SW4_SW(...)
Definition: generic_macros_msa.h:1639
common_vt_4t_24w_msa
static void common_vt_4t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2973
PCKEV_H4_SH
#define PCKEV_H4_SH(...)
Definition: generic_macros_msa.h:1768
HEVC_FILT_8TAP
#define HEVC_FILT_8TAP(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
Definition: hevc_macros_msa.h:35
common_hz_4t_6w_msa
static void common_hz_4t_6w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2156
ILVR_B3_SH
#define ILVR_B3_SH(...)
Definition: generic_macros_msa.h:1351
common_vt_8t_8w_msa
static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:997
my
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t my
Definition: dsp.h:53
common_hz_4t_4x16_msa
static void common_hz_4t_4x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2093
hevc_macros_msa.h
common_hz_4t_8x2mult_msa
static void common_hz_4t_8x2mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2205
ILVR_B4_SB
#define ILVR_B4_SB(...)
Definition: generic_macros_msa.h:1360
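ILVR_B* interleaves the right (least significant) halves of two byte vectors; in the vertical kernels this pairs samples from adjacent rows so the same two-taps-per-dot-product trick works vertically. A scalar model (illustrative; the operand-to-even/odd mapping follows my reading of ilvr.b and is worth checking against the ISA manual):

#include <stdint.h>

static void ilvr_b_scalar(const int8_t a[16], const int8_t b[16],
                          int8_t out[16])
{
    for (int i = 0; i < 8; i++) {
        out[2 * i]     = b[i];   /* assumed ilvr.b operand order */
        out[2 * i + 1] = a[i];
    }
}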
LD2
#define LD2(psrc, stride, out0, out1)
Definition: generic_macros_msa.h:223
ILVR_D2_SB
#define ILVR_D2_SB(...)
Definition: generic_macros_msa.h:1444
HORIZ_4TAP_4WID_4VECS_FILT
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1)
Definition: hevc_mc_uni_msa.c:76
XORI_B5_128_SB
#define XORI_B5_128_SB(...)
Definition: generic_macros_msa.h:1859
ILVRL_H2_SH
#define ILVRL_H2_SH(...)
Definition: generic_macros_msa.h:1508
common_vt_8t_12w_msa
static void common_vt_8t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1052
ff_hevc_mask_arr
static const uint8_t ff_hevc_mask_arr[16 * 3]
Definition: hevc_mc_uni_msa.c:25
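The first 16 bytes of ff_hevc_mask_arr (0,1, 1,2, 2,3, ...) drive vshf.b so that each output element holds an overlapping pair of neighbouring pixels, letting one signed dot product apply two filter taps at once; the two 4-width rows do the same across a pair of source vectors (indices 16 and up select the second vector). A scalar model of the 8-width shuffle (illustrative, hypothetical names):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint8_t mask[16] = { 0, 1, 1, 2, 2, 3, 3, 4,
                               4, 5, 5, 6, 6, 7, 7, 8 };
    uint8_t src[17], pairs[16];

    for (int i = 0; i < 17; i++)
        src[i] = (uint8_t)i;          /* dummy pixel row */

    for (int i = 0; i < 16; i++)
        pairs[i] = src[mask[i]];      /* vshf.b: select byte by index */

    for (int i = 0; i < 16; i += 2)   /* adjacent pairs -> one dotp lane */
        printf("(%u,%u) ", pairs[i], pairs[i + 1]);
    printf("\n");
    return 0;
}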
hevc_hv_uni_4t_12w_msa
static void hevc_hv_uni_4t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3915
DOTP_SB3_SH
#define DOTP_SB3_SH(...)
Definition: generic_macros_msa.h:776
hevc_hv_uni_8t_16w_msa
static void hevc_hv_uni_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1944
common_hz_8t_32w_msa
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:680
common_hz_4t_8w_msa
static void common_hz_4t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2273
common_hz_4t_4x2_msa
static void common_hz_4t_4x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2004
common_hz_8t_16w_msa
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:552
common_vt_8t_4w_msa
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:906
ST_W2
#define ST_W2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:450
HORIZ_4TAP_8WID_4VECS_FILT
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1, out2, out3)
Definition: hevc_mc_uni_msa.c:88
hevc_hv_uni_8t_24w_msa
static void hevc_hv_uni_8t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1956
common_vt_4t_4w_msa
static void common_vt_4t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2631
ILVR_D3_SB
#define ILVR_D3_SB(...)
Definition: generic_macros_msa.h:1452
hevcdsp_mips.h
VSHF_B2_SH
#define VSHF_B2_SH(...)
Definition: generic_macros_msa.h:664
PCKEV_XORI128_UB
#define PCKEV_XORI128_UB(in0, in1)
Definition: generic_macros_msa.h:2751
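PCKEV_XORI128_UB is the back end of the bias trick noted above: pckev.b keeps the even-indexed (low) byte of each filtered halfword and xori.b 128 restores the unsigned pixel range. A per-element scalar model (hypothetical helper name):

#include <stdint.h>

static void pckev_xori128_scalar(const int16_t *in, uint8_t *out, int n)
{
    for (int i = 0; i < n; i++)
        out[i] = (uint8_t)((in[i] & 0xff) ^ 128);  /* pckev.b + xori.b */
}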
common_hz_4t_24w_msa
static void common_hz_4t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2403
height
#define height
Definition: dsp.h:85
LD_SB7
#define LD_SB7(...)
Definition: generic_macros_msa.h:327
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:83
SD4
#define SD4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:256
LD_UB4
#define LD_UB4(...)
Definition: generic_macros_msa.h:296
XORI_B8_128_SB
#define XORI_B8_128_SB(...)
Definition: generic_macros_msa.h:1880
ILVR_B2_SB
#define ILVR_B2_SB(...)
Definition: generic_macros_msa.h:1338
XORI_B2_128_SB
#define XORI_B2_128_SB(...)
Definition: generic_macros_msa.h:1835
hevc_hv_uni_8t_8w_msa
static void hevc_hv_uni_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1697
hevc_hv_uni_4t_16w_msa
static void hevc_hv_uni_4t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:4084
common_vt_8t_16w_mult_msa
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: hevc_mc_uni_msa.c:1233
hevc_hv_uni_4t_8x6_msa
static void hevc_hv_uni_4t_8x6_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_msa.c:3675
common_vt_4t_8x4mult_msa
static void common_vt_4t_8x4mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2793
hevc_hv_uni_4t_8w_msa
static void hevc_hv_uni_4t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3892
common_vt_4t_8w_msa
static void common_vt_4t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2838
ST_D2
#define ST_D2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:491
SPLATI_H2_SH
#define SPLATI_H2_SH(...)
Definition: generic_macros_msa.h:1656
HORIZ_8TAP_8WID_4VECS_FILT
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3)
Definition: hevc_mc_uni_msa.c:51
copy_width48_msa
static void copy_width48_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:236
SPLATI_W4_SH
#define SPLATI_W4_SH(...)
Definition: generic_macros_msa.h:1700
HEVC_FILT_4TAP_SH
#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)
Definition: hevc_macros_msa.h:46
UNI_MC_COPY
#define UNI_MC_COPY(WIDTH)
Definition: hevc_mc_uni_msa.c:4125
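UNI_MC_COPY(WIDTH) is glue: it stamps out one entry point per block width and forwards to the matching copy_width*_msa kernel. A sketch of the shape such a macro typically takes (an assumption about the expansion, not a verbatim copy; the generated name here is hypothetical, and it relies on <stdint.h> plus the kernel prototypes listed on this page):

#define UNI_MC_COPY_SKETCH(WIDTH)                                        \
static void put_uni_pixels##WIDTH##_sketch(const uint8_t *src,           \
                                           int32_t src_stride,           \
                                           uint8_t *dst,                 \
                                           int32_t dst_stride,           \
                                           int32_t height)               \
{                                                                        \
    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);   \
}

/* UNI_MC_COPY_SKETCH(8) would define put_uni_pixels8_sketch(), and so
 * on for the other widths handled by this file. */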
PCKEV_B2_SH
#define PCKEV_B2_SH(...)
Definition: generic_macros_msa.h:1721
hevc_hv_uni_4t_6w_msa
static void hevc_hv_uni_4t_6w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3409
UNI_MC_HV
#define UNI_MC_HV(PEL, WIDTH, TAP)
Definition: hevc_mc_uni_msa.c:4200
common_vt_4t_6w_msa
static void common_vt_4t_6w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2643
XORI_B4_128_UB
#define XORI_B4_128_UB(...)
Definition: generic_macros_msa.h:1850
src2
const pixel *src2
Definition: h264pred_template.c:422
ST_W8
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:470
LD4
#define LD4(psrc, stride, out0, out1, out2, out3)
Definition: generic_macros_msa.h:228
ILVL_B2_SB
#define ILVL_B2_SB(...)
Definition: generic_macros_msa.h:1263
filt
static const int8_t filt[NUMTAPS * 2]
Definition: af_earwax.c:40
SPLATI_H4_SB
#define SPLATI_H4_SB(...)
Definition: generic_macros_msa.h:1673
DPADD_SB4_SH
#define DPADD_SB4_SH(...)
Definition: generic_macros_msa.h:841
hevc_hv_uni_4t_4w_msa
static void hevc_hv_uni_4t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3389
ST_H2
#define ST_H2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:409
SPLATI_W2_SH
#define SPLATI_W2_SH(...)
Definition: generic_macros_msa.h:1692
common_vt_4t_4x4multiple_msa
static void common_vt_4t_4x4multiple_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2585
LD_SB3
#define LD_SB3(...)
Definition: generic_macros_msa.h:289
ILVL_H4_SH
#define ILVL_H4_SH(...)
Definition: generic_macros_msa.h:1301
copy_width8_msa
static void copy_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:104
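As a reference point for the smallest copy kernel above: copy_width8_msa moves height rows of 8 pixels between strided planes (the MSA version batches several rows per iteration with vector loads and SD stores). A plain-C equivalent (illustrative, hypothetical name):

#include <stdint.h>
#include <string.h>

static void copy_width8_scalar(const uint8_t *src, int32_t src_stride,
                               uint8_t *dst, int32_t dst_stride,
                               int32_t height)
{
    for (int32_t row = 0; row < height; row++) {
        memcpy(dst, src, 8);          /* one 8-pixel row */
        src += src_stride;
        dst += dst_stride;
    }
}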
ST_UB
#define ST_UB(...)
Definition: generic_macros_msa.h:40
hevc_hv_uni_4t_4x4_msa
static void hevc_hv_uni_4t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_msa.c:3232
common_hz_4t_12w_msa
static void common_hz_4t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2286
common_vt_8t_64w_msa
static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1349
copy_width12_msa
static void copy_width12_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:146
common_hz_8t_4x4_msa
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:286
common_hz_4t_32w_msa
static void common_hz_4t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2486
ST_D4
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:499
DOTP_SB4_SH
#define DOTP_SB4_SH(...)
Definition: generic_macros_msa.h:784
ILVL_B4_SB
#define ILVL_B4_SB(...)
Definition: generic_macros_msa.h:1274
LD_SB8
#define LD_SB8(...)
Definition: generic_macros_msa.h:336
SRARI_W4_SW
#define SRARI_W4_SW(...)
Definition: generic_macros_msa.h:2092
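SRARI_W4_SW applies MSA's srari.w to four word vectors: an arithmetic right shift with round-to-nearest. Per element, for shift > 0, it computes the following (scalar sketch, hypothetical name):

#include <stdint.h>

static int32_t srari_w_scalar(int32_t in, int shift)
{
    /* add the rounding bit, then arithmetic shift right */
    return (in + (1 << (shift - 1))) >> shift;
}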
src0
const pixel *const src0
Definition: h264pred_template.c:420
common_vt_8t_48w_msa
static void common_vt_8t_48w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1341
common_hz_8t_12w_msa
static void common_hz_8t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:473
XORI_B7_128_SB
#define XORI_B7_128_SB(...)
Definition: generic_macros_msa.h:1873
HEVC_FILT_4TAP
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
Definition: hevc_macros_msa.h:64
int32_t
int32_t
Definition: audioconvert.c:56
common_vt_4t_12w_msa
static void common_vt_4t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2852
common_hz_8t_48w_msa
static void common_hz_8t_48w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:741
h
h
Definition: vp9dsp_template.c:2070
ILVR_H4_SH
#define ILVR_H4_SH(...)
Definition: generic_macros_msa.h:1408
common_hz_4t_4x8_msa
static void common_hz_4t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2058
hevc_hv_uni_4t_32w_msa
static void hevc_hv_uni_4t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:4113
common_vt_8t_32w_msa
static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1333
width
#define width
Definition: dsp.h:85
ILVR_B2_SH
#define ILVR_B2_SH(...)
Definition: generic_macros_msa.h:1340
PCKEV_D2_SH
#define PCKEV_D2_SH(...)
Definition: generic_macros_msa.h:1789
hevc_hv_uni_4t_4x2_msa
static void hevc_hv_uni_4t_4x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_msa.c:3177
hevc_hv_uni_4t_8multx4mult_msa
static void hevc_hv_uni_4t_8multx4mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width8mult)
Definition: hevc_mc_uni_msa.c:3778
SD
#define SD
Definition: ccaption_dec.c:940
SPLATI_H2_SB
#define SPLATI_H2_SB(...)
Definition: generic_macros_msa.h:1655
PCKEV_H2_SH
#define PCKEV_H2_SH(...)
Definition: generic_macros_msa.h:1759
src
#define src
Definition: vp8dsp.c:248
XORI_B3_128_SB
#define XORI_B3_128_SB(...)
Definition: generic_macros_msa.h:1843
common_vt_4t_16w_msa
static void common_vt_4t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2915
LD_SB2
#define LD_SB2(...)
Definition: generic_macros_msa.h:278