hevc_mc_biw_msa.c
/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "hevcdsp_mips.h"
#include "hevc_macros_msa.h"

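/* Shuffle masks for the VSHF.B byte gathers used by the horizontal
 * filters below: row 0 picks overlapping byte pairs out of a single
 * source vector (8/16-width cases); row 1 picks four pairs from a first
 * vector and four (indices 16..20) from a second one, so two 4-wide rows
 * can be filtered from one register pair (4/12-width cases). */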
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};

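/* The HEVC_BIW_RND_CLIP* macros implement the HEVC weighted
 * bi-prediction combine on vectors: the two 16-bit intermediates are
 * interleaved pairwise, __msa_dpadd_s_w applies weight0/weight1 to each
 * pair on top of the preloaded offset, and the 32-bit sums are rounded
 * with an arithmetic shift by (rnd_val + 1), clipped to [0, 255] and
 * packed back to halfwords.
 *
 * A minimal scalar sketch of the per-sample computation, for reference
 * only (not part of the original file; names are hypothetical):
 */
#if 0
static uint8_t biweight_sample_ref(int16_t s0, int16_t s1,
                                   int32_t w0, int32_t w1,
                                   int32_t o0, int32_t o1, int32_t rnd_val)
{
    /* offset carries the combined output offsets, pre-scaled so a single
     * rounding shift by (rnd_val + 1) finishes the job */
    int32_t offset = (o0 + o1) << rnd_val;
    int32_t sum = s0 * w0 + s1 * w1 + offset;
    int32_t out = (sum + (1 << rnd_val)) >> (rnd_val + 1);

    return out < 0 ? 0 : (out > 255 ? 255 : out);
}
#endif
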
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,  \
                           out0, out1)                              \
{                                                                   \
    v4i32 out0_r, out1_r, out0_l, out1_l;                           \
                                                                    \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);               \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);               \
                                                                    \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);  \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);  \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);  \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);  \
                                                                    \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                \
    CLIP_SW4_0_255(out0_l, out0_r, out1_l, out1_r);                 \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);        \
}

#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,      \
                           wgt, rnd, offset, out0, out1, out2, out3)        \
{                                                                           \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1); \
    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3); \
}

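/* The *_MAX_SATU variants perform the same interleave / weighted dot
 * product / round sequence; they differ from the macros above only in
 * the argument order handed to CLIP_SW4_0_255. */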
#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,  \
                                    offset, out0, out1)              \
{                                                                    \
    v4i32 out0_r, out1_r, out0_l, out1_l;                            \
                                                                     \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);   \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);   \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);   \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);   \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                 \
    CLIP_SW4_0_255(out0_r, out1_r, out0_l, out1_l);                  \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);         \
}

#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,  \
                                    vec3, wgt, rnd, offset, out0, out1,    \
                                    out2, out3)                            \
{                                                                          \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset,    \
                                out0, out1);                               \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset,    \
                                out2, out3);                               \
}

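/* Bi-weighted copy (no interpolation): the 8-bit pels from src0_ptr are
 * zero-extended and shifted left by 6 so they sit on the same 14-bit
 * scale as the 16-bit intermediates from src1_ptr, then both are fed to
 * the weighted round/clip macros above. */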
static void hevc_biwgt_copy_4w_msa(const uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   const int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    int32_t offset, weight;
    v16u8 out0, out1;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3, weight_vec;
    v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        LW2(src0_ptr, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);
        INSERT_D2_SH(tpd0, tpd1, in0);

        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;

        ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
        dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
        dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
        CLIP_SW2_0_255(dst0_r, dst0_l);
        dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
        INSERT_D2_SH(tpd0, tpd1, in0);
        INSERT_D2_SH(tpd2, tpd3, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                                    offset_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in0);
            INSERT_D2_SH(tpd2, tpd3, in1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in2);
            INSERT_D2_SH(tpd2, tpd3, in3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
                                        in3, weight_vec, rnd_vec, offset_vec,
                                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

static void hevc_biwgt_copy_6w_msa(const uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   const int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t res = height & 0x03;
    int32_t offset, weight;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
                                    in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec,
                                    dst0, dst1, dst2, dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
    if (res) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
                                    in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec,
                                    dst0, dst1, dst2, dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    }
}

static void hevc_biwgt_copy_8w_msa(const uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   const int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint64_t tp0, tp1, tp2, tp3;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1);

        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 4) {
        uint32_t loop_cnt;

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += (4 * src_stride);
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
            src1_ptr += (4 * src2_stride);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
                                        in3, weight_vec, rnd_vec, offset_vec,
                                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void hevc_biwgt_copy_12w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);

        dst4 <<= 6;
        dst5 <<= 6;
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_biwgt_copy_16w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
                                    tmp4, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
                                    tmp6, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_biwgt_copy_24w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6,
                                    weight_vec, rnd_vec, offset_vec, dst4, dst5,
                                    dst6, dst7);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10,
                                    in11, weight_vec, rnd_vec, offset_vec,
                                    dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_biwgt_copy_32w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, tmp0, tmp4);
        ILVRL_B2_SH(zero, src1, tmp1, tmp5);
        ILVRL_B2_SH(zero, src2, tmp2, tmp6);
        ILVRL_B2_SH(zero, src3, tmp3, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_biwgt_copy_48w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src0_ptr += src_stride;
        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_biwgt_copy_64w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
    }
}

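/* Horizontal 8-tap weighted filters. The source bytes are XORed with 128
 * (mapped to the signed range) before filtering, which biases each filter
 * output by -128 * 64 since the 8-tap coefficients sum to 64; the
 * "constant = (128 * weight1) << 6" term pre-adds the weighted bias back
 * into the offset so the final result is unchanged. */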
static void hevc_hz_biwgt_8t_4w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src0_ptr -= 3;
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_biwgt_8t_8w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_biwgt_8t_12w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;

    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset = (offset0 + offset1) << rnd_val;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec, out0, out1, out2,
                           out3);
        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                           offset_vec, out0, out1);
        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_biwgt_8t_16w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_hz_biwgt_8t_24w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t dst_val0;
    int32_t offset, weight, constant;
    v16i8 src0, src1;
    v8i16 in0, in1, in2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v4i32 dst2_r, dst2_l;
    v8i16 filter_vec, out0, out1, out2;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    LD_SB2(src0_ptr, 16, src0, src1);
    src0_ptr += src_stride;
    LD_SH2(src1_ptr, 8, in0, in1);
    in2 = LD_SH(src1_ptr + 16);
    src1_ptr += src2_stride;
    XORI_B2_128_SB(src0, src1);

    for (loop_cnt = 31; loop_cnt--;) {
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);
        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
        CLIP_SW2_0_255(dst2_r, dst2_l);
        out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);

        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        XORI_B2_128_SB(src0, src1);
        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
        dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
        ST_SH(out0, dst);
        SD(dst_val0, dst + 16);
        dst += dst_stride;
    }

    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec,
                       out0, out1);
    ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
    dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
    dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
    SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
    CLIP_SW2_0_255(dst2_r, dst2_l);
    out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
    PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
    dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
    ST_SH(out0, dst);
    SD(dst_val0, dst + 16);
    dst += dst_stride;
}

static void hevc_hz_biwgt_8t_32w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;

        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_hz_biwgt_8t_48w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);
        LD_SB2(src0_ptr + 32, 8, src3, src4);
        src0_ptr += src_stride;
        XORI_B2_128_SB(src3, src4);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);

        LD_SH2(src1_ptr + 32, 8, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_SH(out0, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_hz_biwgt_8t_64w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    const uint8_t *src0_ptr_tmp;
    uint8_t *dst_tmp;
    const int16_t *src1_ptr_tmp;
    uint32_t loop_cnt, cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        for (cnt = 2; cnt--;) {
            LD_SB2(src0_ptr_tmp, 16, src0, src1);
            src2 = LD_SB(src0_ptr_tmp + 24);
            src0_ptr_tmp += 32;
            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
            src1_ptr_tmp += 32;
            XORI_B3_128_SB(src0, src1, src2);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec0, vec1, vec2, vec3);
            dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                               in0, in1, in2, in3,
                               weight_vec, rnd_vec, offset_vec,
                               out0, out1, out2, out3);

            PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
            ST_SH2(out0, out1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src0_ptr += src_stride;
        src1_ptr += src2_stride;
        dst += dst_stride;
    }
}

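/* Vertical 8-tap weighted filters. These apply the same 128 bias
 * correction as the horizontal path, but in vector form: offset_vec is
 * pre-biased with const_vec * weight1_vec, where const_vec = 128 << 6. */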
static void hevc_vt_biwgt_8t_4w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t res = height & 0x07;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
                    filt0, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
                     filt1, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
                     filt2, filt2, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
                     filt3, filt3, dst10, dst32, dst54, dst76);

        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
    if (res) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
                    filt0, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
                     filt1, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
                     filt2, filt2, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
                     filt3, filt3, dst10, dst32, dst54, dst76);

        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

static void hevc_vt_biwgt_8t_8w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, tmp0, tmp1, tmp2, tmp3);

        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}
1649 
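/* 12-column vertical variant: the left 8 columns reuse the right-half
 * (_r) interleave chain from the 8w case, while the left-half (_l)
 * interleaves of two consecutive rows are packed together (src2110,
 * src4332, ...) so the remaining 4 columns of both rows are filtered
 * in a single vector. */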
1650 static void hevc_vt_biwgt_8t_12w_msa(const uint8_t *src0_ptr,
1651  int32_t src_stride,
1652  const int16_t *src1_ptr,
1653  int32_t src2_stride,
1654  uint8_t *dst,
1655  int32_t dst_stride,
1656  const int8_t *filter,
1657  int32_t height,
1658  int32_t weight0,
1659  int32_t weight1,
1660  int32_t offset0,
1661  int32_t offset1,
1662  int32_t rnd_val)
1663 {
1664  uint32_t loop_cnt;
1665  int32_t offset, weight;
1666  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1667  v8i16 in0, in1, in2, in3;
1668  v16i8 src10_r, src32_r, src54_r, src76_r;
1669  v16i8 src21_r, src43_r, src65_r, src87_r;
1670  v8i16 tmp0, tmp1, tmp2;
1671  v16i8 src10_l, src32_l, src54_l, src76_l;
1672  v16i8 src21_l, src43_l, src65_l, src87_l;
1673  v16i8 src2110, src4332, src6554, src8776;
1674  v8i16 filt0, filt1, filt2, filt3;
1675  v8i16 out0, out1, out2, filter_vec;
1676  v4i32 dst2_r, dst2_l;
1677  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1678 
1679  src0_ptr -= (3 * src_stride);
1680  offset = (offset0 + offset1) << rnd_val;
1681  weight0 = weight0 & 0x0000FFFF;
1682  weight = weight0 | (weight1 << 16);
1683 
1684  const_vec = __msa_ldi_w(128);
1685  const_vec <<= 6;
1686  offset_vec = __msa_fill_w(offset);
1687  weight_vec = __msa_fill_w(weight);
1688  rnd_vec = __msa_fill_w(rnd_val + 1);
1689  weight1_vec = __msa_fill_w(weight1);
1690  offset_vec += const_vec * weight1_vec;
1691 
1692  filter_vec = LD_SH(filter);
1693  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1694 
1695  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1696  src0_ptr += (7 * src_stride);
1697  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1698 
1699  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1700  src10_r, src32_r, src54_r, src21_r);
1701  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1702  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1703  src10_l, src32_l, src54_l, src21_l);
1704  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1705  ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1706  src2110, src4332, src6554);
1707 
1708  for (loop_cnt = 8; loop_cnt--;) {
1709  LD_SB2(src0_ptr, src_stride, src7, src8);
1710  src0_ptr += (2 * src_stride);
1711  LD_SH2(src1_ptr, src2_stride, in0, in1);
1712  LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
1713  src1_ptr += (2 * src2_stride);
1714  in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
1715  XORI_B2_128_SB(src7, src8);
1716 
1717  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1718  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1719  src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
1720 
1721  DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
1722  tmp0, tmp1, tmp2);
1723  DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
1724  tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
1725  DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
1726  tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
1727  DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
1728  tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);
1729 
1730  HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
1731  weight_vec, rnd_vec, offset_vec,
1732  out0, out1);
1733 
1734  ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
1735  dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1736  (v8i16) weight_vec);
1737  dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1738  (v8i16) weight_vec);
1739  SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1740  CLIP_SW2_0_255(dst2_r, dst2_l);
1741  out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1742  PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1743  ST_D2(out0, 0, 1, dst, dst_stride);
1744  ST_W2(out2, 0, 1, dst + 8, dst_stride);
1745  dst += (2 * dst_stride);
1746 
1747  src10_r = src32_r;
1748  src32_r = src54_r;
1749  src54_r = src76_r;
1750  src21_r = src43_r;
1751  src43_r = src65_r;
1752  src65_r = src87_r;
1753  src2110 = src4332;
1754  src4332 = src6554;
1755  src6554 = src8776;
1756  src6 = src8;
1757  }
1758 }
1759 
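/* Generic vertical stripe routine: the block is processed as
 * (width >> 4) stripes of 16 columns, two rows per iteration, with both
 * the right (_r) and left (_l) byte interleaves kept live across
 * iterations.  The 16/32/48/64-wide entry points below are thin
 * wrappers around it. */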
1760 static void hevc_vt_biwgt_8t_16multx2mult_msa(const uint8_t *src0_ptr,
1761  int32_t src_stride,
1762  const int16_t *src1_ptr,
1763  int32_t src2_stride,
1764  uint8_t *dst,
1765  int32_t dst_stride,
1766  const int8_t *filter,
1767  int32_t height,
1768  int32_t weight0,
1769  int32_t weight1,
1770  int32_t offset0,
1771  int32_t offset1,
1772  int32_t rnd_val,
1773  int32_t width)
1774 {
1775  const uint8_t *src0_ptr_tmp;
1776  const int16_t *src1_ptr_tmp;
1777  uint8_t *dst_tmp;
1778  uint32_t loop_cnt, cnt;
1779  int32_t offset, weight;
1780  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1781  v8i16 in0, in1, in2, in3;
1782  v16i8 src10_r, src32_r, src54_r, src76_r;
1783  v16i8 src21_r, src43_r, src65_r, src87_r;
1784  v16i8 src10_l, src32_l, src54_l, src76_l;
1785  v16i8 src21_l, src43_l, src65_l, src87_l;
1786  v8i16 tmp0, tmp1, tmp2, tmp3;
1787  v8i16 filt0, filt1, filt2, filt3;
1788  v8i16 filter_vec;
1789  v8i16 out0, out1, out2, out3;
1790  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1791 
1792  src0_ptr -= (3 * src_stride);
1793 
1794  offset = (offset0 + offset1) << rnd_val;
1795  weight0 = weight0 & 0x0000FFFF;
1796  weight = weight0 | (weight1 << 16);
1797 
1798  const_vec = __msa_ldi_w(128);
1799  const_vec <<= 6;
1800  offset_vec = __msa_fill_w(offset);
1801  weight_vec = __msa_fill_w(weight);
1802  rnd_vec = __msa_fill_w(rnd_val + 1);
1803  weight1_vec = __msa_fill_w(weight1);
1804  offset_vec += const_vec * weight1_vec;
1805 
1806  filter_vec = LD_SH(filter);
1807  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1808 
1809  for (cnt = (width >> 4); cnt--;) {
1810  src0_ptr_tmp = src0_ptr;
1811  src1_ptr_tmp = src1_ptr;
1812  dst_tmp = dst;
1813 
1814  LD_SB7(src0_ptr_tmp, src_stride,
1815  src0, src1, src2, src3, src4, src5, src6);
1816  src0_ptr_tmp += (7 * src_stride);
1817 
1818  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1819  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1820  src10_r, src32_r, src54_r, src21_r);
1821  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1822  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1823  src10_l, src32_l, src54_l, src21_l);
1824  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1825 
1826  for (loop_cnt = (height >> 1); loop_cnt--;) {
1827  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1828  src0_ptr_tmp += (2 * src_stride);
1829  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1830  LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1831  src1_ptr_tmp += (2 * src2_stride);
1832 
1833  XORI_B2_128_SB(src7, src8);
1834  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1835  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1836 
1837  DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
1838  filt0, filt0, tmp0, tmp1, tmp2, tmp3);
1839  DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
1840  filt1, filt1, tmp0, tmp1, tmp2, tmp3);
1841  DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
1842  filt2, filt2, tmp0, tmp1, tmp2, tmp3);
1843  DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
1844  filt3, filt3, tmp0, tmp1, tmp2, tmp3);
1845 
1846  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1847  in0, in1, in2, in3,
1848  weight_vec, rnd_vec, offset_vec,
1849  out0, out1, out2, out3);
1850 
1851  PCKEV_B2_SH(out2, out0, out3, out1, out0, out1);
1852  ST_SH2(out0, out1, dst_tmp, dst_stride);
1853  dst_tmp += (2 * dst_stride);
1854 
1855  src10_r = src32_r;
1856  src32_r = src54_r;
1857  src54_r = src76_r;
1858  src21_r = src43_r;
1859  src43_r = src65_r;
1860  src65_r = src87_r;
1861  src10_l = src32_l;
1862  src32_l = src54_l;
1863  src54_l = src76_l;
1864  src21_l = src43_l;
1865  src43_l = src65_l;
1866  src65_l = src87_l;
1867  src6 = src8;
1868  }
1869 
1870  src0_ptr += 16;
1871  src1_ptr += 16;
1872  dst += 16;
1873  }
1874 }
1875 
1876 static void hevc_vt_biwgt_8t_16w_msa(const uint8_t *src0_ptr,
1877  int32_t src_stride,
1878  const int16_t *src1_ptr,
1879  int32_t src2_stride,
1880  uint8_t *dst,
1881  int32_t dst_stride,
1882  const int8_t *filter,
1883  int32_t height,
1884  int32_t weight0,
1885  int32_t weight1,
1886  int32_t offset0,
1887  int32_t offset1,
1888  int32_t rnd_val)
1889 {
1890  hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1891  src1_ptr, src2_stride,
1892  dst, dst_stride, filter, height,
1893  weight0, weight1, offset0, offset1,
1894  rnd_val, 16);
1895 }
1896 
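/* 24 columns: one 16-column stripe plus the 8w routine for the
 * remaining 8 columns at horizontal offset 16. */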
1897 static void hevc_vt_biwgt_8t_24w_msa(const uint8_t *src0_ptr,
1898  int32_t src_stride,
1899  const int16_t *src1_ptr,
1900  int32_t src2_stride,
1901  uint8_t *dst,
1902  int32_t dst_stride,
1903  const int8_t *filter,
1904  int32_t height,
1905  int32_t weight0,
1906  int32_t weight1,
1907  int32_t offset0,
1908  int32_t offset1,
1909  int32_t rnd_val)
1910 {
1911  hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1912  src1_ptr, src2_stride,
1913  dst, dst_stride, filter, height,
1914  weight0, weight1, offset0, offset1,
1915  rnd_val, 16);
1916  hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride,
1917  src1_ptr + 16, src2_stride,
1918  dst + 16, dst_stride, filter, height,
1919  weight0, weight1, offset0, offset1, rnd_val);
1920 }
1921 
1922 static void hevc_vt_biwgt_8t_32w_msa(const uint8_t *src0_ptr,
1923  int32_t src_stride,
1924  const int16_t *src1_ptr,
1925  int32_t src2_stride,
1926  uint8_t *dst,
1927  int32_t dst_stride,
1928  const int8_t *filter,
1929  int32_t height,
1930  int32_t weight0,
1931  int32_t weight1,
1932  int32_t offset0,
1933  int32_t offset1,
1934  int32_t rnd_val)
1935 {
1936  hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1937  src1_ptr, src2_stride,
1938  dst, dst_stride, filter, height,
1939  weight0, weight1, offset0, offset1,
1940  rnd_val, 32);
1941 }
1942 
1943 static void hevc_vt_biwgt_8t_48w_msa(const uint8_t *src0_ptr,
1944  int32_t src_stride,
1945  const int16_t *src1_ptr,
1946  int32_t src2_stride,
1947  uint8_t *dst,
1948  int32_t dst_stride,
1949  const int8_t *filter,
1950  int32_t height,
1951  int32_t weight0,
1952  int32_t weight1,
1953  int32_t offset0,
1954  int32_t offset1,
1955  int32_t rnd_val)
1956 {
1957  hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1958  src1_ptr, src2_stride,
1959  dst, dst_stride, filter, height,
1960  weight0, weight1, offset0, offset1,
1961  rnd_val, 48);
1962 }
1963 
1964 static void hevc_vt_biwgt_8t_64w_msa(const uint8_t *src0_ptr,
1965  int32_t src_stride,
1966  const int16_t *src1_ptr,
1967  int32_t src2_stride,
1968  uint8_t *dst,
1969  int32_t dst_stride,
1970  const int8_t *filter,
1971  int32_t height,
1972  int32_t weight0,
1973  int32_t weight1,
1974  int32_t offset0,
1975  int32_t offset1,
1976  int32_t rnd_val)
1977 {
1978  hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1979  src1_ptr, src2_stride,
1980  dst, dst_stride, filter, height,
1981  weight0, weight1, offset0, offset1,
1982  rnd_val, 64);
1983 }
1984 
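/* 2-D (horizontal + vertical) 8-tap bi-weighted filters.  The
 * horizontal pass produces 16-bit intermediates; these are interleaved
 * and fed to the vertical pass, whose 32-bit result is shifted down by
 * 6 before the same weighted combine as above.  In the 4w case the
 * second mask set (ff_hevc_mask_arr + 16) gathers taps from two 4-wide
 * rows per vector. */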
1985 static void hevc_hv_biwgt_8t_4w_msa(const uint8_t *src0_ptr,
1986  int32_t src_stride,
1987  const int16_t *src1_ptr,
1988  int32_t src2_stride,
1989  uint8_t *dst,
1990  int32_t dst_stride,
1991  const int8_t *filter_x,
1992  const int8_t *filter_y,
1993  int32_t height,
1994  int32_t weight0,
1995  int32_t weight1,
1996  int32_t offset0,
1997  int32_t offset1,
1998  int32_t rnd_val)
1999 {
2000  uint32_t loop_cnt;
2001  uint64_t tp0, tp1;
2002  int32_t offset, weight;
2003  v16u8 out;
2004  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2005  v8i16 in0 = { 0 }, in1 = { 0 };
2006  v8i16 filt0, filt1, filt2, filt3;
2007  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2008  v16i8 mask1, mask2, mask3;
2009  v8i16 filter_vec, weight_vec;
2010  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2011  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2012  v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
2013  v8i16 tmp0, tmp1, tmp2, tmp3;
2014  v8i16 dst10, dst32, dst54, dst76;
2015  v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
2016  v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2017  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
2018 
2019  src0_ptr -= ((3 * src_stride) + 3);
2020 
2021  filter_vec = LD_SH(filter_x);
2022  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2023 
2024  filter_vec = LD_SH(filter_y);
2025  UNPCK_R_SB_SH(filter_vec, filter_vec);
2026 
2027  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2028 
2029  mask1 = mask0 + 2;
2030  mask2 = mask0 + 4;
2031  mask3 = mask0 + 6;
2032 
2033  offset = (offset0 + offset1) << rnd_val;
2034  weight0 = weight0 & 0x0000FFFF;
2035  weight = weight0 | (weight1 << 16);
2036 
2037  const_vec = __msa_fill_w((128 * weight1));
2038  const_vec <<= 6;
2039  offset_vec = __msa_fill_w(offset);
2040  rnd_vec = __msa_fill_w(rnd_val + 1);
2041  offset_vec += const_vec;
2042  weight_vec = (v8i16) __msa_fill_w(weight);
2043 
2044  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2045  src0_ptr += (7 * src_stride);
2046 
2047  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2048 
2049  VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2050  VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2051  VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
2052  vec8, vec9, vec10, vec11);
2053  VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
2054  vec12, vec13, vec14, vec15);
2055 
2056  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2057  filt3);
2058  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2059  filt3);
2060  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2061  filt3);
2062  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2063  filt3);
2064 
2065  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2066  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2067  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2068 
2069  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2070 
2071  for (loop_cnt = height >> 2; loop_cnt--;) {
2072  LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2073  src0_ptr += (4 * src_stride);
2074  XORI_B4_128_SB(src7, src8, src9, src10);
2075 
2076  LD2(src1_ptr, src2_stride, tp0, tp1);
2077  INSERT_D2_SH(tp0, tp1, in0);
2078  src1_ptr += (2 * src2_stride);
2079  LD2(src1_ptr, src2_stride, tp0, tp1);
2080  INSERT_D2_SH(tp0, tp1, in1);
2081  src1_ptr += (2 * src2_stride);
2082 
2083  VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
2084  vec0, vec1, vec2, vec3);
2085  VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
2086  vec4, vec5, vec6, vec7);
2087  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2088  filt3);
2089  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2090  filt3);
2091 
2092  dst76 = __msa_ilvr_h(dst97, dst66);
2093  ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2094  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2095  dst98 = __msa_ilvr_h(dst66, dst108);
2096 
2097  dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2098  filt_h2, filt_h3);
2099  dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2100  filt_h2, filt_h3);
2101  dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2102  filt_h2, filt_h3);
2103  dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2104  filt_h2, filt_h3);
2105  SRA_4V(dst0, dst1, dst2, dst3, 6);
2106  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2107  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2108  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2109  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2110  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2111  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2112  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2113  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2114  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2115  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2116  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2117  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2118  dst += (4 * dst_stride);
2119 
2120  dst10 = dst54;
2121  dst32 = dst76;
2122  dst54 = dst98;
2123  dst21 = dst65;
2124  dst43 = dst87;
2125  dst65 = dst109;
2126  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2127  }
2128 }
2129 
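/* HV stripe routine for widths that are multiples of 8: the seven most
 * recent horizontally filtered rows (dst0..dst6) stay live across
 * iterations, so only two new rows are filtered horizontally for every
 * two output rows. */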
2130 static void hevc_hv_biwgt_8t_8multx2mult_msa(const uint8_t *src0_ptr,
2131  int32_t src_stride,
2132  const int16_t *src1_ptr,
2133  int32_t src2_stride,
2134  uint8_t *dst,
2135  int32_t dst_stride,
2136  const int8_t *filter_x,
2137  const int8_t *filter_y,
2138  int32_t height,
2139  int32_t weight0,
2140  int32_t weight1,
2141  int32_t offset0,
2142  int32_t offset1,
2143  int32_t rnd_val,
2144  int32_t width8mult)
2145 {
2146  uint32_t loop_cnt, cnt;
2147  int32_t offset, weight;
2148  const uint8_t *src0_ptr_tmp;
2149  const int16_t *src1_ptr_tmp;
2150  uint8_t *dst_tmp;
2151  v16u8 out;
2152  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2153  v8i16 in0, in1;
2154  v8i16 filt0, filt1, filt2, filt3;
2155  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2156  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2157  v16i8 mask1, mask2, mask3;
2158  v8i16 filter_vec, weight_vec;
2159  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2160  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2161  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2162  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2163  v8i16 tmp0, tmp1, tmp2, tmp3;
2164  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2165  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2166  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2167  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2168  v4i32 offset_vec, rnd_vec, const_vec;
2169 
2170  src0_ptr -= ((3 * src_stride) + 3);
2171 
2172  offset = (offset0 + offset1) << rnd_val;
2173  weight0 = weight0 & 0x0000FFFF;
2174  weight = weight0 | (weight1 << 16);
2175 
2176  const_vec = __msa_fill_w((128 * weight1));
2177  const_vec <<= 6;
2178  offset_vec = __msa_fill_w(offset);
2179  rnd_vec = __msa_fill_w(rnd_val + 1);
2180  offset_vec += const_vec;
2181  weight_vec = (v8i16) __msa_fill_w(weight);
2182 
2183  filter_vec = LD_SH(filter_x);
2184  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2185 
2186  filter_vec = LD_SH(filter_y);
2187  UNPCK_R_SB_SH(filter_vec, filter_vec);
2188 
2189  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2190 
2191  mask1 = mask0 + 2;
2192  mask2 = mask0 + 4;
2193  mask3 = mask0 + 6;
2194 
2195  for (cnt = width8mult; cnt--;) {
2196  src0_ptr_tmp = src0_ptr;
2197  src1_ptr_tmp = src1_ptr;
2198  dst_tmp = dst;
2199 
2200  LD_SB7(src0_ptr_tmp, src_stride,
2201  src0, src1, src2, src3, src4, src5, src6);
2202  src0_ptr_tmp += (7 * src_stride);
2203 
2204  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2205 
2206  /* row 0 row 1 row 2 row 3 */
2207  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
2208  vec0, vec1, vec2, vec3);
2209  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
2210  vec4, vec5, vec6, vec7);
2211  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
2212  vec8, vec9, vec10, vec11);
2213  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2214  vec12, vec13, vec14, vec15);
2215 
2216  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2217  filt3);
2218  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2219  filt3);
2220  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2221  filt3);
2222  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2223  filt2, filt3);
2224 
2225  /* row 4 row 5 row 6 */
2226  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2227  vec0, vec1, vec2, vec3);
2228  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2229  vec4, vec5, vec6, vec7);
2230  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2231  vec8, vec9, vec10, vec11);
2232 
2233  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2234  filt3);
2235  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2236  filt3);
2237  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2238  filt3);
2239 
2240  for (loop_cnt = height >> 1; loop_cnt--;) {
2241  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2242  XORI_B2_128_SB(src7, src8);
2243  src0_ptr_tmp += 2 * src_stride;
2244 
2245  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2246  src1_ptr_tmp += (2 * src2_stride);
2247 
2248  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
2249  dst32_r, dst54_r, dst21_r);
2250  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
2251  dst32_l, dst54_l, dst21_l);
2252  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2253  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2254 
2255  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2256  vec0, vec1, vec2, vec3);
2257  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2258  filt2, filt3);
2259 
2260  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2261  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2262  filt_h0, filt_h1, filt_h2, filt_h3);
2263  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2264  filt_h0, filt_h1, filt_h2, filt_h3);
2265 
2266  dst0_r >>= 6;
2267  dst0_l >>= 6;
2268 
2269  /* row 8 */
2270  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2271  vec0, vec1, vec2, vec3);
2272  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2273  filt2, filt3);
2274 
2275  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2276  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2277  filt_h0, filt_h1, filt_h2, filt_h3);
2278  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2279  filt_h0, filt_h1, filt_h2, filt_h3);
2280 
2281  dst1_r >>= 6;
2282  dst1_l >>= 6;
2283 
2284  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
2285  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2286  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2287  dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2288  dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2289  dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2290  dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2291  SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
2292  CLIP_SW4_0_255(dst0_l, dst0_r, dst1_l, dst1_r);
2293  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
2294  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2295  ST_D2(out, 0, 1, dst_tmp, dst_stride);
2296  dst_tmp += (2 * dst_stride);
2297 
2298  dst0 = dst2;
2299  dst1 = dst3;
2300  dst2 = dst4;
2301  dst3 = dst5;
2302  dst4 = dst6;
2303  dst5 = dst7;
2304  dst6 = dst8;
2305  }
2306 
2307  src0_ptr += 8;
2308  src1_ptr += 8;
2309  dst += 8;
2310  }
2311 }
2312 
2313 static void hevc_hv_biwgt_8t_8w_msa(const uint8_t *src0_ptr,
2314  int32_t src_stride,
2315  const int16_t *src1_ptr,
2316  int32_t src2_stride,
2317  uint8_t *dst,
2318  int32_t dst_stride,
2319  const int8_t *filter_x,
2320  const int8_t *filter_y,
2321  int32_t height,
2322  int32_t weight0,
2323  int32_t weight1,
2324  int32_t offset0,
2325  int32_t offset1,
2326  int32_t rnd_val)
2327 {
2328  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2329  src1_ptr, src2_stride,
2330  dst, dst_stride, filter_x, filter_y,
2331  height, weight0, weight1, offset0,
2332  offset1, rnd_val, 1);
2333 }
2334 
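/* 12-column HV variant: the left 8 columns follow the stripe scheme
 * above; the pointers then advance by 8 and the remaining 4 columns are
 * processed with the paired-row masks (ff_hevc_mask_arr + 16), as in
 * the 4w routine. */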
2335 static void hevc_hv_biwgt_8t_12w_msa(const uint8_t *src0_ptr,
2336  int32_t src_stride,
2337  const int16_t *src1_ptr,
2338  int32_t src2_stride,
2339  uint8_t *dst,
2340  int32_t dst_stride,
2341  const int8_t *filter_x,
2342  const int8_t *filter_y,
2343  int32_t height,
2344  int32_t weight0,
2345  int32_t weight1,
2346  int32_t offset0,
2347  int32_t offset1,
2348  int32_t rnd_val)
2349 {
2350  uint32_t loop_cnt;
2351  const uint8_t *src0_ptr_tmp;
2352  uint8_t *dst_tmp;
2353  const int16_t *src1_ptr_tmp;
2354  int32_t offset, weight;
2355  uint64_t tp0, tp1;
2356  v16u8 out;
2357  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2358  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2359  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2360  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2361  v8i16 in0 = { 0 }, in1 = { 0 };
2362  v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
2363  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2364  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
2365  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
2366  v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
2367  v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
2368  v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
2369  v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2370 
2371  src0_ptr -= ((3 * src_stride) + 3);
2372 
2373  offset = (offset0 + offset1) << rnd_val;
2374  weight0 = weight0 & 0x0000FFFF;
2375  weight = weight0 | (weight1 << 16);
2376 
2377  const_vec = __msa_fill_w((128 * weight1));
2378  const_vec <<= 6;
2379  offset_vec = __msa_fill_w(offset);
2380  rnd_vec = __msa_fill_w(rnd_val + 1);
2381  offset_vec += const_vec;
2382  weight_vec = (v8i16) __msa_fill_w(weight);
2383 
2384  filter_vec = LD_SH(filter_x);
2385  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2386 
2387  filter_vec = LD_SH(filter_y);
2388  UNPCK_R_SB_SH(filter_vec, filter_vec);
2389 
2390  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2391 
2392  mask0 = LD_SB(ff_hevc_mask_arr);
2393  mask1 = mask0 + 2;
2394  mask2 = mask0 + 4;
2395  mask3 = mask0 + 6;
2396 
2397  src0_ptr_tmp = src0_ptr;
2398  src1_ptr_tmp = src1_ptr;
2399  dst_tmp = dst;
2400 
2401  LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2402  src0_ptr_tmp += (7 * src_stride);
2403  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2404 
2405  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2406  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2407  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2408  vec11);
2409  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2410  vec15);
2411  dsth0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2412  filt3);
2413  dsth1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2414  filt3);
2415  dsth2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2416  filt3);
2417  dsth3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2418  filt2, filt3);
2419  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2420  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2421  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2422  vec11);
2423  dsth4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2424  filt3);
2425  dsth5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2426  filt3);
2427  dsth6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2428  filt3);
2429 
2430  for (loop_cnt = 8; loop_cnt--;) {
2431  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2432  src0_ptr_tmp += (2 * src_stride);
2433  XORI_B2_128_SB(src7, src8);
2434 
2435  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2436  src1_ptr_tmp += (2 * src2_stride);
2437 
2438  ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2439  dst10_r, dst32_r, dst54_r, dst21_r);
2440  ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2441  dst10_l, dst32_l, dst54_l, dst21_l);
2442  ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
2443  ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
2444 
2445  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2446  vec3);
2447  dsth7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2448  filt3);
2449 
2450  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
2451  dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2452  filt_h1, filt_h2, filt_h3);
2453  dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
2454  filt_h1, filt_h2, filt_h3);
2455  dst0 >>= 6;
2456  dst1 >>= 6;
2457 
2458  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2459  vec3);
2460  dsth8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2461  filt3);
2462 
2463  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
2464  dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2465  filt_h1, filt_h2, filt_h3);
2466  dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
2467  filt_h1, filt_h2, filt_h3);
2468  dst2 >>= 6;
2469  dst3 >>= 6;
2470 
2471  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2472  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2473  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2474  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2475  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2476  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2477  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2478  SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec);
2479  CLIP_SW4_0_255(dst1, dst0, dst3, dst2);
2480  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2481  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2482  ST_D2(out, 0, 1, dst_tmp, dst_stride);
2483  dst_tmp += (2 * dst_stride);
2484 
2485  dsth0 = dsth2;
2486  dsth1 = dsth3;
2487  dsth2 = dsth4;
2488  dsth3 = dsth5;
2489  dsth4 = dsth6;
2490  dsth5 = dsth7;
2491  dsth6 = dsth8;
2492  }
2493 
2494  src0_ptr += 8;
2495  src1_ptr += 8;
2496  dst += 8;
2497 
2498  mask4 = LD_SB(ff_hevc_mask_arr + 16);
2499  mask5 = mask4 + 2;
2500  mask6 = mask4 + 4;
2501  mask7 = mask4 + 6;
2502 
2503  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2504  src0_ptr += (7 * src_stride);
2505  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2506 
2507  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2508  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2509  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2510  vec11);
2511  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2512  vec15);
2513  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2514  filt3);
2515  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2516  filt3);
2517  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2518  filt3);
2519  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2520  filt3);
2521  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2522  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2523  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2524 
2525  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2526 
2527  for (loop_cnt = 4; loop_cnt--;) {
2528  LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2529  src0_ptr += (4 * src_stride);
2530  XORI_B4_128_SB(src7, src8, src9, src10);
2531 
2532  LD2(src1_ptr, src2_stride, tp0, tp1);
2533  INSERT_D2_SH(tp0, tp1, in0);
2534  src1_ptr += (2 * src2_stride);
2535  LD2(src1_ptr, src2_stride, tp0, tp1);
2536  INSERT_D2_SH(tp0, tp1, in1);
2537  src1_ptr += (2 * src2_stride);
2538 
2539  VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2540  vec3);
2541  VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2542  vec7);
2543  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2544  filt3);
2545  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2546  filt3);
2547 
2548  dst76 = __msa_ilvr_h(dst97, dst66);
2549  ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2550  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2551  dst98 = __msa_ilvr_h(dst66, dst108);
2552 
2553  dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2554  filt_h2, filt_h3);
2555  dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2556  filt_h2, filt_h3);
2557  dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2558  filt_h2, filt_h3);
2559  dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2560  filt_h2, filt_h3);
2561  SRA_4V(dst0, dst1, dst2, dst3, 6);
2562  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2563  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2564  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2565  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2566  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2567  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2568  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2569  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2570  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2571  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2572  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2573  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2574  dst += (4 * dst_stride);
2575 
2576  dst10 = dst54;
2577  dst32 = dst76;
2578  dst54 = dst98;
2579  dst21 = dst65;
2580  dst43 = dst87;
2581  dst65 = dst109;
2582  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2583  }
2584 }
2585 
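/* Widths 16, 24, 32, 48 and 64 map directly onto the stripe routine
 * with width8mult = width / 8. */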
2586 static void hevc_hv_biwgt_8t_16w_msa(const uint8_t *src0_ptr,
2587  int32_t src_stride,
2588  const int16_t *src1_ptr,
2589  int32_t src2_stride,
2590  uint8_t *dst,
2591  int32_t dst_stride,
2592  const int8_t *filter_x,
2593  const int8_t *filter_y,
2594  int32_t height,
2595  int32_t weight0,
2596  int32_t weight1,
2597  int32_t offset0,
2598  int32_t offset1,
2599  int32_t rnd_val)
2600 {
2601  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2602  src1_ptr, src2_stride,
2603  dst, dst_stride, filter_x, filter_y,
2604  height, weight0, weight1, offset0,
2605  offset1, rnd_val, 2);
2606 }
2607 
2608 static void hevc_hv_biwgt_8t_24w_msa(const uint8_t *src0_ptr,
2609  int32_t src_stride,
2610  const int16_t *src1_ptr,
2611  int32_t src2_stride,
2612  uint8_t *dst,
2613  int32_t dst_stride,
2614  const int8_t *filter_x,
2615  const int8_t *filter_y,
2616  int32_t height,
2617  int32_t weight0,
2618  int32_t weight1,
2619  int32_t offset0,
2620  int32_t offset1,
2621  int32_t rnd_val)
2622 {
2623  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2624  src1_ptr, src2_stride,
2625  dst, dst_stride, filter_x, filter_y,
2626  height, weight0, weight1, offset0,
2627  offset1, rnd_val, 3);
2628 }
2629 
2630 static void hevc_hv_biwgt_8t_32w_msa(const uint8_t *src0_ptr,
2631  int32_t src_stride,
2632  const int16_t *src1_ptr,
2633  int32_t src2_stride,
2634  uint8_t *dst,
2635  int32_t dst_stride,
2636  const int8_t *filter_x,
2637  const int8_t *filter_y,
2638  int32_t height,
2639  int32_t weight0,
2640  int32_t weight1,
2641  int32_t offset0,
2642  int32_t offset1,
2643  int32_t rnd_val)
2644 {
2645  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2646  src1_ptr, src2_stride,
2647  dst, dst_stride, filter_x, filter_y,
2648  height, weight0, weight1, offset0,
2649  offset1, rnd_val, 4);
2650 }
2651 
2652 static void hevc_hv_biwgt_8t_48w_msa(const uint8_t *src0_ptr,
2653  int32_t src_stride,
2654  const int16_t *src1_ptr,
2655  int32_t src2_stride,
2656  uint8_t *dst,
2657  int32_t dst_stride,
2658  const int8_t *filter_x,
2659  const int8_t *filter_y,
2660  int32_t height,
2661  int32_t weight0,
2662  int32_t weight1,
2663  int32_t offset0,
2664  int32_t offset1,
2665  int32_t rnd_val)
2666 {
2667  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2668  src1_ptr, src2_stride,
2669  dst, dst_stride, filter_x, filter_y,
2670  height, weight0, weight1, offset0,
2671  offset1, rnd_val, 6);
2672 }
2673 
2674 static void hevc_hv_biwgt_8t_64w_msa(const uint8_t *src0_ptr,
2675  int32_t src_stride,
2676  const int16_t *src1_ptr,
2677  int32_t src2_stride,
2678  uint8_t *dst,
2679  int32_t dst_stride,
2680  const int8_t *filter_x,
2681  const int8_t *filter_y,
2682  int32_t height,
2683  int32_t weight0,
2684  int32_t weight1,
2685  int32_t offset0,
2686  int32_t offset1,
2687  int32_t rnd_val)
2688 {
2689  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2690  src1_ptr, src2_stride,
2691  dst, dst_stride, filter_x, filter_y,
2692  height, weight0, weight1, offset0,
2693  offset1, rnd_val, 8);
2694 }
2695 
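/* Horizontal 4-tap (epel) bi-weighted filters.  The setup mirrors the
 * 8-tap case: weights packed per 32-bit lane, the offset biased by
 * (128 * weight1) << 6 since the 4-tap coefficients also sum to 64, and
 * a final rounded shift by rnd_val + 1.  For 4-wide blocks the mask at
 * ff_hevc_mask_arr[16] gathers taps from two rows per vector. */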
2696 static void hevc_hz_biwgt_4t_4x2_msa(const uint8_t *src0_ptr,
2697  int32_t src_stride,
2698  const int16_t *src1_ptr,
2699  int32_t src2_stride,
2700  uint8_t *dst,
2701  int32_t dst_stride,
2702  const int8_t *filter,
2703  int32_t weight0,
2704  int32_t weight1,
2705  int32_t offset0,
2706  int32_t offset1,
2707  int32_t rnd_val)
2708 {
2709  int32_t offset, weight, constant;
2710  v8i16 filt0, filt1;
2711  v16i8 src0, src1;
2712  v8i16 in0, in1;
2713  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2714  v16i8 mask1, vec0, vec1;
2715  v8i16 dst0;
2716  v4i32 dst0_r, dst0_l;
2717  v8i16 out0, filter_vec;
2718  v4i32 weight_vec, offset_vec, rnd_vec;
2719 
2720  src0_ptr -= 1;
2721 
2722  filter_vec = LD_SH(filter);
2723  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2724 
2725  mask1 = mask0 + 2;
2726 
2727  offset = (offset0 + offset1) << rnd_val;
2728  weight0 = weight0 & 0x0000FFFF;
2729  weight = weight0 | (weight1 << 16);
2730  constant = 128 * weight1;
2731  constant <<= 6;
2732  offset += constant;
2733 
2734  offset_vec = __msa_fill_w(offset);
2735  weight_vec = __msa_fill_w(weight);
2736  rnd_vec = __msa_fill_w(rnd_val + 1);
2737 
2738  LD_SB2(src0_ptr, src_stride, src0, src1);
2739  LD_SH2(src1_ptr, src2_stride, in0, in1);
2740  in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2741  XORI_B2_128_SB(src0, src1);
2742 
2742 
2743  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2744  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2745 
2746  ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
2747  dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
2748  dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
2749  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2750  out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2751  CLIP_SH_0_255(out0);
2752  out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
2753  ST_W2(out0, 0, 1, dst, dst_stride);
2754 }
2755 
2756 static void hevc_hz_biwgt_4t_4x4_msa(const uint8_t *src0_ptr,
2757  int32_t src_stride,
2758  const int16_t *src1_ptr,
2759  int32_t src2_stride,
2760  uint8_t *dst,
2761  int32_t dst_stride,
2762  const int8_t *filter,
2763  int32_t weight0,
2764  int32_t weight1,
2765  int32_t offset0,
2766  int32_t offset1,
2767  int32_t rnd_val)
2768 {
2769  int32_t offset, weight, constant;
2770  v8i16 filt0, filt1;
2771  v16i8 src0, src1, src2, src3;
2772  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2773  v16i8 mask1;
2774  v8i16 dst0, dst1;
2775  v16i8 vec0, vec1;
2776  v8i16 in0, in1, in2, in3;
2777  v8i16 filter_vec;
2778  v4i32 weight_vec, offset_vec, rnd_vec;
2779 
2780  src0_ptr -= 1;
2781 
2782  /* rearranging filter */
2783  filter_vec = LD_SH(filter);
2784  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2785 
2786  mask1 = mask0 + 2;
2787 
2788  offset = (offset0 + offset1) << rnd_val;
2789  weight0 = weight0 & 0x0000FFFF;
2790  weight = weight0 | (weight1 << 16);
2791  constant = 128 * weight1;
2792  constant <<= 6;
2793  offset += constant;
2794 
2795  offset_vec = __msa_fill_w(offset);
2796  weight_vec = __msa_fill_w(weight);
2797  rnd_vec = __msa_fill_w(rnd_val + 1);
2798 
2799  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2800  XORI_B4_128_SB(src0, src1, src2, src3);
2801  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2802  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2803 
2804  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2805  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2806  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2807  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2808  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2809  weight_vec, rnd_vec, offset_vec,
2810  dst0, dst1);
2811 
2812  dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2813  ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
2814 }
2815 
2816 static void hevc_hz_biwgt_4t_4x8multiple_msa(const uint8_t *src0_ptr,
2817  int32_t src_stride,
2818  const int16_t *src1_ptr,
2819  int32_t src2_stride,
2820  uint8_t *dst,
2821  int32_t dst_stride,
2822  const int8_t *filter,
2823  int32_t height,
2824  int32_t weight0,
2825  int32_t weight1,
2826  int32_t offset0,
2827  int32_t offset1,
2828  int32_t rnd_val)
2829 {
2830  uint32_t loop_cnt;
2831  int32_t weight, offset, constant;
2832  v8i16 filt0, filt1;
2833  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2834  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2835  v16i8 mask1;
2836  v16i8 vec0, vec1;
2837  v8i16 dst0, dst1, dst2, dst3;
2838  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2839  v8i16 filter_vec;
2840  v4i32 weight_vec, offset_vec, rnd_vec;
2841 
2842  src0_ptr -= 1;
2843 
2844  filter_vec = LD_SH(filter);
2845  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2846 
2847  offset = (offset0 + offset1) << rnd_val;
2848  weight0 = weight0 & 0x0000FFFF;
2849  weight = weight0 | (weight1 << 16);
2850  constant = 128 * weight1;
2851  constant <<= 6;
2852  offset += constant;
2853 
2854  offset_vec = __msa_fill_w(offset);
2855  weight_vec = __msa_fill_w(weight);
2856  rnd_vec = __msa_fill_w(rnd_val + 1);
2857 
2858  mask1 = mask0 + 2;
2859 
2860  for (loop_cnt = (height >> 3); loop_cnt--;) {
2861  LD_SB8(src0_ptr, src_stride,
2862  src0, src1, src2, src3, src4, src5, src6, src7);
2863  src0_ptr += (8 * src_stride);
2864  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2865  src1_ptr += (4 * src2_stride);
2866  LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2867  src1_ptr += (4 * src2_stride);
2868  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2869  ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2870  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2871 
2872  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2873  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2874  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2875  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2876  VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2877  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2878  VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2879  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2880  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2881  in0, in1, in2, in3,
2882  weight_vec, rnd_vec, offset_vec,
2883  dst0, dst1, dst2, dst3);
2884 
2885  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2886  ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2887  dst += (8 * dst_stride);
2888  }
2889 }
2890 
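/* Dispatch on height: 2 and 4 have dedicated routines; any other
 * height reaching this point is expected to be a multiple of 8. */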
2891 static void hevc_hz_biwgt_4t_4w_msa(const uint8_t *src0_ptr,
2892  int32_t src_stride,
2893  const int16_t *src1_ptr,
2894  int32_t src2_stride,
2895  uint8_t *dst,
2896  int32_t dst_stride,
2897  const int8_t *filter,
2898  int32_t height,
2899  int32_t weight0,
2900  int32_t weight1,
2901  int32_t offset0,
2902  int32_t offset1,
2903  int32_t rnd_val)
2904 {
2905  if (2 == height) {
2906  hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2907  dst, dst_stride, filter,
2908  weight0, weight1, offset0, offset1, rnd_val);
2909  } else if (4 == height) {
2910  hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2911  dst, dst_stride, filter,
2912  weight0, weight1, offset0, offset1, rnd_val);
2913  } else if (0 == (height % 8)) {
2914  hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
2915  src1_ptr, src2_stride,
2916  dst, dst_stride, filter, height,
2917  weight0, weight1, offset0, offset1,
2918  rnd_val);
2919  }
2920 }
2921 
2922 static void hevc_hz_biwgt_4t_6w_msa(const uint8_t *src0_ptr,
2923  int32_t src_stride,
2924  const int16_t *src1_ptr,
2925  int32_t src2_stride,
2926  uint8_t *dst,
2927  int32_t dst_stride,
2928  const int8_t *filter,
2929  int32_t height,
2930  int32_t weight0,
2931  int32_t weight1,
2932  int32_t offset0,
2933  int32_t offset1,
2934  int32_t rnd_val)
2935 {
2936  uint32_t loop_cnt;
2937  int32_t offset, weight, constant;
2938  v8i16 filt0, filt1;
2939  v16i8 src0, src1, src2, src3;
2940  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2941  v16i8 mask1;
2942  v16i8 vec0, vec1;
2943  v8i16 in0, in1, in2, in3;
2944  v8i16 dst0, dst1, dst2, dst3;
2945  v8i16 filter_vec;
2946  v4i32 weight_vec, offset_vec, rnd_vec;
2947 
2948  src0_ptr -= 1;
2949 
2950  filter_vec = LD_SH(filter);
2951  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2952 
2953  offset = (offset0 + offset1) << rnd_val;
2954  weight0 = weight0 & 0x0000FFFF;
2955  weight = weight0 | (weight1 << 16);
2956  constant = 128 * weight1;
2957  constant <<= 6;
2958  offset += constant;
2959 
2960  offset_vec = __msa_fill_w(offset);
2961  weight_vec = __msa_fill_w(weight);
2962  rnd_vec = __msa_fill_w(rnd_val + 1);
2963 
2964  mask1 = mask0 + 2;
2965 
2966  for (loop_cnt = 2; loop_cnt--;) {
2967  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2968  src0_ptr += (4 * src_stride);
2969  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2970  src1_ptr += (4 * src2_stride);
2971  XORI_B4_128_SB(src0, src1, src2, src3);
2972 
2973  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2974  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2975  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2976  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2977  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2978  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2979  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2980  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2981 
2982  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2983  in0, in1, in2, in3,
2984  weight_vec, rnd_vec, offset_vec,
2985  dst0, dst1, dst2, dst3);
2986 
2987  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2988  ST_W2(dst0, 0, 2, dst, dst_stride);
2989  ST_H2(dst0, 2, 6, dst + 4, dst_stride);
2990  ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
2991  ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2992  dst += (4 * dst_stride);
2993  }
2994 }
2995 
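/* 8-column horizontal variants: a single VSHF_B2_SB per row gathers the
 * tap pairs (vec0/vec1) for eight outputs.  Heights 2 and 6 are fully
 * unrolled; other heights loop four rows at a time. */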
2996 static void hevc_hz_biwgt_4t_8x2_msa(const uint8_t *src0_ptr,
2997  int32_t src_stride,
2998  const int16_t *src1_ptr,
2999  int32_t src2_stride,
3000  uint8_t *dst,
3001  int32_t dst_stride,
3002  const int8_t *filter,
3003  int32_t weight0,
3004  int32_t weight1,
3005  int32_t offset0,
3006  int32_t offset1,
3007  int32_t rnd_val)
3008 {
3009  int32_t offset, weight, constant;
3010  v8i16 filt0, filt1;
3011  v16i8 src0, src1;
3012  v8i16 in0, in1;
3013  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3014  v16i8 mask1, vec0, vec1;
3015  v8i16 dst0, dst1;
3016  v8i16 filter_vec;
3017  v4i32 weight_vec, offset_vec, rnd_vec;
3018 
3019  src0_ptr -= 1;
3020 
3021  filter_vec = LD_SH(filter);
3022  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3023 
3024  offset = (offset0 + offset1) << rnd_val;
3025  weight0 = weight0 & 0x0000FFFF;
3026  weight = weight0 | (weight1 << 16);
3027  constant = 128 * weight1;
3028  constant <<= 6;
3029  offset += constant;
3030 
3031  offset_vec = __msa_fill_w(offset);
3032  weight_vec = __msa_fill_w(weight);
3033  rnd_vec = __msa_fill_w(rnd_val + 1);
3034 
3035  mask1 = mask0 + 2;
3036 
3037  LD_SB2(src0_ptr, src_stride, src0, src1);
3038  LD_SH2(src1_ptr, src2_stride, in0, in1);
3039  XORI_B2_128_SB(src0, src1);
3040  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3041  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3042  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3043  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3044  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
3045  weight_vec, rnd_vec, offset_vec,
3046  dst0, dst1);
3047 
3048  dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3049  ST_D2(dst0, 0, 1, dst, dst_stride);
3050 }
3051 
3052 static void hevc_hz_biwgt_4t_8x6_msa(const uint8_t *src0_ptr,
3053  int32_t src_stride,
3054  const int16_t *src1_ptr,
3055  int32_t src2_stride,
3056  uint8_t *dst,
3057  int32_t dst_stride,
3058  const int8_t *filter,
3059  int32_t weight0,
3060  int32_t weight1,
3061  int32_t offset0,
3062  int32_t offset1,
3063  int32_t rnd_val)
3064 {
3065  int32_t weight, offset, constant;
3066  v8i16 filt0, filt1;
3067  v16i8 src0, src1, src2, src3, src4, src5;
3068  v8i16 in0, in1, in2, in3, in4, in5;
3069  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3070  v16i8 mask1;
3071  v16i8 vec0, vec1;
3072  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3073  v8i16 filter_vec;
3074  v4i32 weight_vec, offset_vec, rnd_vec;
3075 
3076  src0_ptr -= 1;
3077 
3078  filter_vec = LD_SH(filter);
3079  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3080 
3081  offset = (offset0 + offset1) << rnd_val;
3082  weight0 = weight0 & 0x0000FFFF;
3083  weight = weight0 | (weight1 << 16);
3084  constant = 128 * weight1;
3085  constant <<= 6;
3086  offset += constant;
3087 
3088  offset_vec = __msa_fill_w(offset);
3089  weight_vec = __msa_fill_w(weight);
3090  rnd_vec = __msa_fill_w(rnd_val + 1);
3091 
3092  mask1 = mask0 + 2;
3093 
3094  LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
3095 
3096  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3097  src1_ptr += (4 * src2_stride);
3098  LD_SH2(src1_ptr, src2_stride, in4, in5);
3099  XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
3100  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3101  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3102  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3103  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3104  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3105  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3106  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3107  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3108  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3109  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3110  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3111  dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3112  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3113  in0, in1, in2, in3,
3114  weight_vec, rnd_vec, offset_vec,
3115  dst0, dst1, dst2, dst3);
3116  HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3117  weight_vec, rnd_vec, offset_vec,
3118  dst4, dst5);
3119 
3120  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3121  dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3122  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3123  ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
3124 }
3125 
3126 static void hevc_hz_biwgt_4t_8x4multiple_msa(const uint8_t *src0_ptr,
3127  int32_t src_stride,
3128  const int16_t *src1_ptr,
3129  int32_t src2_stride,
3130  uint8_t *dst,
3131  int32_t dst_stride,
3132  const int8_t *filter,
3133  int32_t height,
3134  int32_t weight0,
3135  int32_t weight1,
3136  int32_t offset0,
3137  int32_t offset1,
3138  int32_t rnd_val)
3139 {
3140  uint32_t loop_cnt;
3141  int32_t offset, weight, constant;
3142  v8i16 filt0, filt1;
3143  v16i8 src0, src1, src2, src3;
3144  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3145  v16i8 mask1;
3146  v16i8 vec0, vec1;
3147  v8i16 in0, in1, in2, in3;
3148  v8i16 dst0, dst1, dst2, dst3;
3149  v8i16 filter_vec;
3150  v4i32 weight_vec, offset_vec, rnd_vec;
3151 
3152  src0_ptr -= 1;
3153 
3154  filter_vec = LD_SH(filter);
3155  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3156 
3157  offset = (offset0 + offset1) << rnd_val;
3158  weight0 = weight0 & 0x0000FFFF;
3159  weight = weight0 | (weight1 << 16);
3160  constant = 128 * weight1;
3161  constant <<= 6;
3162  offset += constant;
3163 
3164  offset_vec = __msa_fill_w(offset);
3165  weight_vec = __msa_fill_w(weight);
3166  rnd_vec = __msa_fill_w(rnd_val + 1);
3167 
3168  mask1 = mask0 + 2;
3169 
3170  for (loop_cnt = (height >> 2); loop_cnt--;) {
3171  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3172  src0_ptr += (4 * src_stride);
3173  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3174  src1_ptr += (4 * src2_stride);
3175  XORI_B4_128_SB(src0, src1, src2, src3);
3176 
3177  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3178  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3179  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3180  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3181  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3182  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3183  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3184  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3185  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3186  in0, in1, in2, in3,
3187  weight_vec, rnd_vec, offset_vec,
3188  dst0, dst1, dst2, dst3);
3189 
3190  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3191  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3192  dst += (4 * dst_stride);
3193  }
3194 }
3195 
3196 static void hevc_hz_biwgt_4t_8w_msa(const uint8_t *src0_ptr,
3197  int32_t src_stride,
3198  const int16_t *src1_ptr,
3199  int32_t src2_stride,
3200  uint8_t *dst,
3201  int32_t dst_stride,
3202  const int8_t *filter,
3203  int32_t height,
3204  int32_t weight0,
3205  int32_t weight1,
3206  int32_t offset0,
3207  int32_t offset1,
3208  int32_t rnd_val)
3209 {
3210  if (2 == height) {
3211  hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3212  dst, dst_stride, filter,
3213  weight0, weight1, offset0, offset1, rnd_val);
3214  } else if (6 == height) {
3215  hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3216  dst, dst_stride, filter,
3217  weight0, weight1, offset0, offset1, rnd_val);
3218  } else if (0 == (height % 4)) {
3219  hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
3220  src1_ptr, src2_stride,
3221  dst, dst_stride, filter, height,
3222  weight0, weight1, offset0, offset1,
3223  rnd_val);
3224  }
3225 }
3226 
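/* 12-column horizontal variant: the left 8 columns use the standard
 * masks, while mask2/mask3 select bytes 8..12 from two consecutive
 * source rows at once (indices 24..28 address the second operand), so
 * the right 4 columns of both rows fill one vector. */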
3227 static void hevc_hz_biwgt_4t_12w_msa(const uint8_t *src0_ptr,
3228  int32_t src_stride,
3229  const int16_t *src1_ptr,
3230  int32_t src2_stride,
3231  uint8_t *dst,
3232  int32_t dst_stride,
3233  const int8_t *filter,
3234  int32_t height,
3235  int32_t weight0,
3236  int32_t weight1,
3237  int32_t offset0,
3238  int32_t offset1,
3239  int32_t rnd_val)
3240 {
3241  uint32_t loop_cnt;
3242  int32_t offset, weight, constant;
3243  v8i16 filt0, filt1;
3244  v16i8 src0, src1, src2, src3;
3245  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3246  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3247  v16i8 mask2 = {
3248  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
3249  };
3250  v16i8 mask1, mask3;
3251  v16i8 vec0, vec1;
3252  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3253  v8i16 filter_vec;
3254  v4i32 weight_vec, offset_vec, rnd_vec;
3255 
3256  src0_ptr -= 1;
3257 
3258  filter_vec = LD_SH(filter);
3259  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3260 
3261  offset = (offset0 + offset1) << rnd_val;
3262  weight0 = weight0 & 0x0000FFFF;
3263  weight = weight0 | (weight1 << 16);
3264  constant = 128 * weight1;
3265  constant <<= 6;
3266  offset += constant;
3267 
3268  offset_vec = __msa_fill_w(offset);
3269  weight_vec = __msa_fill_w(weight);
3270  rnd_vec = __msa_fill_w(rnd_val + 1);
3271 
3272  mask1 = mask0 + 2;
3273  mask3 = mask2 + 2;
3274 
3275  for (loop_cnt = 4; loop_cnt--;) {
3276  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3277  src0_ptr += (4 * src_stride);
3278  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3279  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
3280  src1_ptr += (4 * src2_stride);
3281  ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3282  XORI_B4_128_SB(src0, src1, src2, src3);
3283 
3284  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3285  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3286  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3287  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3288  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3289  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3290  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3291  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3292  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3293  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3294  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3295  dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3296 
3297  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3298  in0, in1, in2, in3,
3299  weight_vec, rnd_vec, offset_vec,
3300  dst0, dst1, dst2, dst3);
3301  HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3302  weight_vec, rnd_vec, offset_vec,
3303  dst4, dst5);
3304 
3305  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3306  dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3307  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3308  ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
3309  dst += (4 * dst_stride);
3310  }
3311 }
3312 
3313 static void hevc_hz_biwgt_4t_16w_msa(const uint8_t *src0_ptr,
3314  int32_t src_stride,
3315  const int16_t *src1_ptr,
3316  int32_t src2_stride,
3317  uint8_t *dst,
3318  int32_t dst_stride,
3319  const int8_t *filter,
3320  int32_t height,
3321  int32_t weight0,
3322  int32_t weight1,
3323  int32_t offset0,
3324  int32_t offset1,
3325  int32_t rnd_val)
3326 {
3327  uint32_t loop_cnt;
3328  int32_t offset, weight, constant;
3329  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3330  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3331  v8i16 filt0, filt1;
3332  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3333  v16i8 mask1;
3334  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3335  v16i8 vec0, vec1;
3336  v8i16 filter_vec;
3337  v4i32 weight_vec, offset_vec, rnd_vec;
3338 
3339  src0_ptr -= 1;
3340 
3341  filter_vec = LD_SH(filter);
3342  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3343 
3344  offset = (offset0 + offset1) << rnd_val;
3345  weight0 = weight0 & 0x0000FFFF;
3346  weight = weight0 | (weight1 << 16);
3347  constant = 128 * weight1;
3348  constant <<= 6;
3349  offset += constant;
3350 
3351  offset_vec = __msa_fill_w(offset);
3352  weight_vec = __msa_fill_w(weight);
3353  rnd_vec = __msa_fill_w(rnd_val + 1);
3354 
3355  mask1 = mask0 + 2;
3356 
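      /* Each iteration handles 4 rows x 16 columns as eight 8-wide 4-tap
       * filters, bi-weighted in two batches of four.  Loading the right
       * half at an 8-byte offset keeps every shuffle single-source. */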
3357  for (loop_cnt = (height >> 2); loop_cnt--;) {
3358  LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
3359  LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
3360  src0_ptr += (4 * src_stride);
3361  LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
3362  LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
3363  src1_ptr += (4 * src2_stride);
3364  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3365 
3366  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3367  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3368  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3369  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3370  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3371  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3372  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3373  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3374  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3375  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3376  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3377  dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3378  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3379  dst6 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3380  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3381  dst7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3382  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3383  in0, in1, in2, in3,
3384  weight_vec, rnd_vec, offset_vec,
3385  dst0, dst1, dst2, dst3);
3386 
3387  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3388  ST_SH2(dst0, dst1, dst, dst_stride);
3389  dst += (2 * dst_stride);
3390 
3391  HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
3392  in4, in5, in6, in7,
3393  weight_vec, rnd_vec, offset_vec,
3394  dst0, dst1, dst2, dst3);
3395 
3396  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3397  ST_SH2(dst0, dst1, dst, dst_stride);
3398  dst += (2 * dst_stride);
3399  }
3400 }
3401 
3402 static void hevc_hz_biwgt_4t_24w_msa(const uint8_t *src0_ptr,
3403  int32_t src_stride,
3404  const int16_t *src1_ptr,
3405  int32_t src2_stride,
3406  uint8_t *dst,
3407  int32_t dst_stride,
3408  const int8_t *filter,
3409  int32_t height,
3410  int32_t weight0,
3411  int32_t weight1,
3412  int32_t offset0,
3413  int32_t offset1,
3414  int32_t rnd_val)
3415 {
3416  uint32_t loop_cnt;
3417  int32_t offset, weight, constant;
3418  v16i8 src0, src1, src2, src3;
3419  v8i16 filt0, filt1;
3420  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3421  v16i8 mask1, mask2, mask3;
3422  v16i8 vec0, vec1;
3423  v8i16 dst0, dst1, dst2, dst3;
3424  v8i16 in0, in1, in2, in3, in4, in5;
3425  v8i16 filter_vec;
3426  v4i32 weight_vec, offset_vec, rnd_vec;
3427 
3428  src0_ptr -= 1;
3429 
3430  filter_vec = LD_SH(filter);
3431  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3432 
3433  offset = (offset0 + offset1) << rnd_val;
3434  weight0 = weight0 & 0x0000FFFF;
3435  weight = weight0 | (weight1 << 16);
3436  constant = 128 * weight1;
3437  constant <<= 6;
3438  offset += constant;
3439 
3440  offset_vec = __msa_fill_w(offset);
3441  weight_vec = __msa_fill_w(weight);
3442  rnd_vec = __msa_fill_w(rnd_val + 1);
3443 
3444  mask1 = mask0 + 2;
3445  mask2 = mask0 + 8;
3446  mask3 = mask0 + 10;
3447 
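      /* mask2/mask3 (mask0 + 8 / mask0 + 10) start at byte 8 and run past
       * index 15, so the two-operand shuffle stitches output columns
       * 8..15 across the boundary between the two 16-byte loads. */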
3448  for (loop_cnt = 16; loop_cnt--;) {
3449  LD_SB2(src0_ptr, src_stride, src0, src2);
3450  LD_SB2(src0_ptr + 16, src_stride, src1, src3);
3451  src0_ptr += (2 * src_stride);
3452  LD_SH2(src1_ptr, src2_stride, in0, in2);
3453  LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
3454  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
3455  src1_ptr += (2 * src2_stride);
3456  XORI_B4_128_SB(src0, src1, src2, src3);
3457 
3458  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3459  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3460  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3461  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3462  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3463  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3464  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3465  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3466  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3467  in0, in1, in2, in3,
3468  weight_vec, rnd_vec, offset_vec,
3469  dst0, dst1, dst2, dst3);
3470 
3471  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3472  ST_SH2(dst0, dst1, dst, dst_stride);
3473 
3474  /* 8 width */
3475  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3476  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3477  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3478  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3479  HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
3480  weight_vec, rnd_vec, offset_vec,
3481  dst0, dst1);
3482 
3483  dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3484  ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
3485  dst += (2 * dst_stride);
3486  }
3487 }
3488 
3489 static void hevc_hz_biwgt_4t_32w_msa(const uint8_t *src0_ptr,
3490  int32_t src_stride,
3491  const int16_t *src1_ptr,
3492  int32_t src2_stride,
3493  uint8_t *dst,
3494  int32_t dst_stride,
3495  const int8_t *filter,
3496  int32_t height,
3497  int32_t weight0,
3498  int32_t weight1,
3499  int32_t offset0,
3500  int32_t offset1,
3501  int32_t rnd_val)
3502 {
3503  uint32_t loop_cnt;
3504  int32_t offset, weight, constant;
3505  v16i8 src0, src1, src2;
3506  v8i16 filt0, filt1;
3507  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3508  v16i8 mask1, mask2, mask3;
3509  v8i16 dst0, dst1, dst2, dst3;
3510  v16i8 vec0, vec1;
3511  v8i16 in0, in1, in2, in3;
3512  v8i16 filter_vec;
3513  v4i32 weight_vec, offset_vec, rnd_vec;
3514 
3515  src0_ptr -= 1;
3516 
3517  filter_vec = LD_SH(filter);
3518  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3519 
3520  offset = (offset0 + offset1) << rnd_val;
3521  weight0 = weight0 & 0x0000FFFF;
3522  weight = weight0 | (weight1 << 16);
3523  constant = 128 * weight1;
3524  constant <<= 6;
3525  offset += constant;
3526 
3527  offset_vec = __msa_fill_w(offset);
3528  weight_vec = __msa_fill_w(weight);
3529  rnd_vec = __msa_fill_w(rnd_val + 1);
3530 
3531  mask1 = mask0 + 2;
3532  mask2 = mask0 + 8;
3533  mask3 = mask0 + 10;
3534 
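      /* One 32-column row per iteration: dst0 covers columns 0..7, dst1
       * (two-operand shuffle) columns 8..15, dst2 columns 16..23, and
       * dst3 columns 24..31 from the extra unaligned load at +24. */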
3535  for (loop_cnt = height; loop_cnt--;) {
3536  LD_SB2(src0_ptr, 16, src0, src1);
3537  src2 = LD_SB(src0_ptr + 24);
3538  src0_ptr += src_stride;
3539  LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
3540  src1_ptr += src2_stride;
 3541  XORI_B3_128_SB(src0, src1, src2);
3542 
3543  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3544  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3545  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3546  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3547  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3548  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3549  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3550  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3551  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3552  in0, in1, in2, in3,
3553  weight_vec, rnd_vec, offset_vec,
3554  dst0, dst1, dst2, dst3);
3555 
3556  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3557  ST_SH2(dst0, dst1, dst, 16);
3558  dst += dst_stride;
3559  }
3560 }
3561 
3562 static void hevc_vt_biwgt_4t_4x2_msa(const uint8_t *src0_ptr,
3563  int32_t src_stride,
3564  const int16_t *src1_ptr,
3565  int32_t src2_stride,
3566  uint8_t *dst,
3567  int32_t dst_stride,
3568  const int8_t *filter,
3569  int32_t weight0,
3570  int32_t weight1,
3571  int32_t offset0,
3572  int32_t offset1,
3573  int32_t rnd_val)
3574 {
3575  int32_t weight, offset, constant;
3576  v16i8 src0, src1, src2, src3, src4;
3577  v8i16 in0, in1, dst10;
3578  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
3579  v4i32 dst10_r, dst10_l;
3580  v8i16 filt0, filt1;
3581  v8i16 filter_vec, out;
3582  v4i32 weight_vec, offset_vec, rnd_vec;
3583 
3584  src0_ptr -= src_stride;
3585 
3586  offset = (offset0 + offset1) << rnd_val;
3587  weight0 = weight0 & 0x0000FFFF;
3588  weight = weight0 | (weight1 << 16);
3589  constant = 128 * weight1;
3590  constant <<= 6;
3591  offset += constant;
3592 
3593  offset_vec = __msa_fill_w(offset);
3594  weight_vec = __msa_fill_w(weight);
3595  rnd_vec = __msa_fill_w(rnd_val + 1);
3596 
3597  filter_vec = LD_SH(filter);
3598  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3599 
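      /* Vertical filtering reuses the same byte-pair dot product as the
       * horizontal path: ILVR_B interleaves two consecutive rows column
       * by column, and for 4-wide blocks two such row pairs are packed
       * into one vector (src2110, src4332) so a single HEVC_FILT_4TAP_SH
       * produces two output rows. */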
3600  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3601  src0_ptr += (3 * src_stride);
3602  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3603  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3604  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3605  LD_SB2(src0_ptr, src_stride, src3, src4);
3606  src0_ptr += (2 * src_stride);
3607  LD_SH2(src1_ptr, src2_stride, in0, in1);
3608  src1_ptr += (2 * src2_stride);
3609 
3610  in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3611  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3612  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3613  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3614 
3615  dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3616 
3617  ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);
3618  dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
3619  dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
3620  SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
3621  CLIP_SW2_0_255(dst10_r, dst10_l);
3622  out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
3623  out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
3624  ST_W2(out, 0, 1, dst, dst_stride);
3625 }
3626 
3627 static void hevc_vt_biwgt_4t_4x4_msa(const uint8_t *src0_ptr,
3628  int32_t src_stride,
3629  const int16_t *src1_ptr,
3630  int32_t src2_stride,
3631  uint8_t *dst,
3632  int32_t dst_stride,
3633  const int8_t *filter,
3634  int32_t weight0,
3635  int32_t weight1,
3636  int32_t offset0,
3637  int32_t offset1,
3638  int32_t rnd_val)
3639 {
3640  int32_t weight, offset, constant;
3641  v16i8 src0, src1, src2, src3, src4, src5, src6;
3642  v8i16 in0, in1, in2, in3;
3643  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3644  v16i8 src2110, src4332, src6554;
3645  v8i16 dst10, dst32;
3646  v8i16 filt0, filt1;
3647  v8i16 filter_vec;
3648  v4i32 weight_vec, offset_vec, rnd_vec;
3649 
3650  src0_ptr -= src_stride;
3651 
3652  offset = (offset0 + offset1) << rnd_val;
3653  weight0 = weight0 & 0x0000FFFF;
3654  weight = weight0 | (weight1 << 16);
3655  constant = 128 * weight1;
3656  constant <<= 6;
3657  offset += constant;
3658 
3659  offset_vec = __msa_fill_w(offset);
3660  weight_vec = __msa_fill_w(weight);
3661  rnd_vec = __msa_fill_w(rnd_val + 1);
3662 
3663  filter_vec = LD_SH(filter);
3664  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3665 
3666  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3667  src0_ptr += (3 * src_stride);
3668  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3669  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3670  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3671 
3672  LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3673  src0_ptr += (4 * src_stride);
3674  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3675  src1_ptr += (4 * src2_stride);
3676  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3677  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3678  src32_r, src43_r, src54_r, src65_r);
3679  ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3680  XORI_B2_128_SB(src4332, src6554);
3681 
3682  dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3683  dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3684 
3685  HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
3686  weight_vec, rnd_vec, offset_vec,
3687  dst10, dst32);
3688 
3689  dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
3690  ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
3691  dst += (4 * dst_stride);
3692 }
3693 
3694 static void hevc_vt_biwgt_4t_4x8multiple_msa(const uint8_t *src0_ptr,
3695  int32_t src_stride,
3696  const int16_t *src1_ptr,
3697  int32_t src2_stride,
3698  uint8_t *dst,
3699  int32_t dst_stride,
3700  const int8_t *filter,
3701  int32_t height,
3702  int32_t weight0,
3703  int32_t weight1,
3704  int32_t offset0,
3705  int32_t offset1,
3706  int32_t rnd_val)
3707 {
3708  uint32_t loop_cnt;
3709  int32_t weight, offset, constant;
3710  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3711  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3712  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3713  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3714  v16i8 src2110, src4332, src6554, src8776;
3715  v8i16 dst10, dst32, dst54, dst76;
3716  v8i16 filt0, filt1;
3717  v8i16 filter_vec;
3718  v4i32 weight_vec, offset_vec, rnd_vec;
3719 
3720  src0_ptr -= src_stride;
3721 
3722  offset = (offset0 + offset1) << rnd_val;
3723  weight0 = weight0 & 0x0000FFFF;
3724  weight = weight0 | (weight1 << 16);
3725  constant = 128 * weight1;
3726  constant <<= 6;
3727  offset += constant;
3728 
3729  offset_vec = __msa_fill_w(offset);
3730  weight_vec = __msa_fill_w(weight);
3731  rnd_vec = __msa_fill_w(rnd_val + 1);
3732 
3733  filter_vec = LD_SH(filter);
3734  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3735 
3736  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3737  src0_ptr += (3 * src_stride);
3738  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3739  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3740  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3741 
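      /* Software-pipelined: src2110 (interleave of the last two rows) and
       * src2 (the last row itself) carry the vertical filter history from
       * one 8-row block into the next. */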
3742  for (loop_cnt = (height >> 3); loop_cnt--;) {
3743  LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3744  src0_ptr += (6 * src_stride);
3745  LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3746  src1_ptr += (8 * src2_stride);
3747 
3748  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3749  ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3750 
3751  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3752  src32_r, src43_r, src54_r, src65_r);
3753  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3754  ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3755  src4332, src6554, src8776);
3756  XORI_B3_128_SB(src4332, src6554, src8776);
3757 
3758  dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3759  dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3760  dst54 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3761 
3762  LD_SB2(src0_ptr, src_stride, src9, src2);
3763  src0_ptr += (2 * src_stride);
3764  ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3765  src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3766  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3767 
3768  dst76 = HEVC_FILT_4TAP_SH(src8776, src2110, filt0, filt1);
3769  HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
3770  in0, in1, in2, in3,
3771  weight_vec, rnd_vec, offset_vec,
3772  dst10, dst32, dst54, dst76);
3773 
3774  PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
3775  ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3776  dst += (8 * dst_stride);
3777  }
3778 }
3779 
3780 static void hevc_vt_biwgt_4t_4w_msa(const uint8_t *src0_ptr,
3781  int32_t src_stride,
3782  const int16_t *src1_ptr,
3783  int32_t src2_stride,
3784  uint8_t *dst,
3785  int32_t dst_stride,
3786  const int8_t *filter,
3787  int32_t height,
3788  int32_t weight0,
3789  int32_t weight1,
3790  int32_t offset0,
3791  int32_t offset1,
3792  int32_t rnd_val)
3793 {
3794  if (2 == height) {
3795  hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3796  dst, dst_stride, filter,
3797  weight0, weight1, offset0, offset1, rnd_val);
3798  } else if (4 == height) {
3799  hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3800  dst, dst_stride, filter,
3801  weight0, weight1, offset0, offset1, rnd_val);
3802  } else if (0 == (height % 8)) {
3803  hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
3804  src1_ptr, src2_stride,
3805  dst, dst_stride, filter, height,
3806  weight0, weight1, offset0, offset1,
3807  rnd_val);
3808  }
3809 }
3810 
3811 static void hevc_vt_biwgt_4t_6w_msa(const uint8_t *src0_ptr,
3812  int32_t src_stride,
3813  const int16_t *src1_ptr,
3814  int32_t src2_stride,
3815  uint8_t *dst,
3816  int32_t dst_stride,
3817  const int8_t *filter,
3818  int32_t height,
3819  int32_t weight0,
3820  int32_t weight1,
3821  int32_t offset0,
3822  int32_t offset1,
3823  int32_t rnd_val)
3824 {
3825  uint32_t loop_cnt;
3826  int32_t res = height & 0x03;
3827  int32_t offset, weight, constant;
3828  v16i8 src0, src1, src2, src3, src4;
3829  v8i16 in0, in1, in2, in3;
3830  v16i8 src10_r, src32_r, src21_r, src43_r;
3831  v8i16 tmp0, tmp1, tmp2, tmp3;
3832  v8i16 filt0, filt1;
3833  v8i16 filter_vec;
3834  v4i32 weight_vec, offset_vec, rnd_vec;
3835 
3836  src0_ptr -= src_stride;
3837 
3838  offset = (offset0 + offset1) << rnd_val;
3839  weight0 = weight0 & 0x0000FFFF;
3840  weight = weight0 | (weight1 << 16);
3841  constant = 128 * weight1;
3842  constant <<= 6;
3843  offset += constant;
3844 
3845  offset_vec = __msa_fill_w(offset);
3846  weight_vec = __msa_fill_w(weight);
3847  rnd_vec = __msa_fill_w(rnd_val + 1);
3848 
3849  filter_vec = LD_SH(filter);
3850  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3851 
3852  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3853  src0_ptr += (3 * src_stride);
 3854  XORI_B3_128_SB(src0, src1, src2);
3855  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3856 
3857  for (loop_cnt = (height >> 2); loop_cnt--;) {
3858  LD_SB2(src0_ptr, src_stride, src3, src4);
3859  src0_ptr += (2 * src_stride);
3860  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3861  src1_ptr += (4 * src2_stride);
3862  XORI_B2_128_SB(src3, src4);
3863  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3864 
3865  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3866  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3867 
3868  LD_SB2(src0_ptr, src_stride, src1, src2);
3869  src0_ptr += (2 * src_stride);
 3870  XORI_B2_128_SB(src1, src2);
3871  ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3872 
3873  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
3874  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
3875  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3876  in0, in1, in2, in3,
3877  weight_vec, rnd_vec, offset_vec,
3878  tmp0, tmp1, tmp2, tmp3);
3879 
3880  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
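      /* 6-wide rows: the first 4 bytes of each row go out via ST_W2, the
       * remaining 2 bytes via ST_H2. */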
3881  ST_W2(tmp0, 0, 2, dst, dst_stride);
3882  ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
3883  ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
3884  ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3885  dst += (4 * dst_stride);
3886  }
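      /* Rows left over when height is not a multiple of 4; same body as
       * one loop iteration. */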
3887  if (res) {
3888  LD_SB2(src0_ptr, src_stride, src3, src4);
3889  src0_ptr += (2 * src_stride);
3890  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3891  src1_ptr += (4 * src2_stride);
3892  XORI_B2_128_SB(src3, src4);
3893  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3894 
3895  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3896  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3897 
3898  LD_SB2(src0_ptr, src_stride, src1, src2);
3899  src0_ptr += (2 * src_stride);
 3900  XORI_B2_128_SB(src1, src2);
3901  ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3902 
3903  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
3904  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
3905  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3906  in0, in1, in2, in3,
3907  weight_vec, rnd_vec, offset_vec,
3908  tmp0, tmp1, tmp2, tmp3);
3909 
3910  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3911  ST_W2(tmp0, 0, 2, dst, dst_stride);
3912  ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
3913  ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
3914  ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3915  }
3916 }
3917 
3918 static void hevc_vt_biwgt_4t_8x2_msa(const uint8_t *src0_ptr,
3919  int32_t src_stride,
3920  const int16_t *src1_ptr,
3921  int32_t src2_stride,
3922  uint8_t *dst,
3923  int32_t dst_stride,
3924  const int8_t *filter,
3925  int32_t weight0,
3926  int32_t weight1,
3927  int32_t offset0,
3928  int32_t offset1,
3929  int32_t rnd_val)
3930 {
3931  int32_t offset, weight, constant;
3932  v16i8 src0, src1, src2, src3, src4;
3933  v8i16 in0, in1, tmp0, tmp1;
3934  v16i8 src10_r, src32_r, src21_r, src43_r;
3935  v8i16 filt0, filt1;
3936  v8i16 filter_vec;
3937  v4i32 weight_vec, offset_vec, rnd_vec;
3938 
3939  src0_ptr -= src_stride;
3940 
3941  offset = (offset0 + offset1) << rnd_val;
3942  weight0 = weight0 & 0x0000FFFF;
3943  weight = weight0 | (weight1 << 16);
3944  constant = 128 * weight1;
3945  constant <<= 6;
3946  offset += constant;
3947 
3948  offset_vec = __msa_fill_w(offset);
3949  weight_vec = __msa_fill_w(weight);
3950  rnd_vec = __msa_fill_w(rnd_val + 1);
3951 
3952  filter_vec = LD_SH(filter);
3953  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3954 
3955  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3956  src0_ptr += (3 * src_stride);
 3957  XORI_B3_128_SB(src0, src1, src2);
3958  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3959 
3960  LD_SB2(src0_ptr, src_stride, src3, src4);
3961  LD_SH2(src1_ptr, src2_stride, in0, in1);
3962  XORI_B2_128_SB(src3, src4);
3963  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3964 
3965  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3966  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3967  HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
3968  weight_vec, rnd_vec, offset_vec,
3969  tmp0, tmp1);
3970 
3971  tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3972  ST_D2(tmp0, 0, 1, dst, dst_stride);
3973 }
3974 
3975 static void hevc_vt_biwgt_4t_8x6_msa(const uint8_t *src0_ptr,
3976  int32_t src_stride,
3977  const int16_t *src1_ptr,
3978  int32_t src2_stride,
3979  uint8_t *dst,
3980  int32_t dst_stride,
3981  const int8_t *filter,
3982  int32_t weight0,
3983  int32_t weight1,
3984  int32_t offset0,
3985  int32_t offset1,
3986  int32_t rnd_val)
3987 {
3988  int32_t offset, weight, constant;
3989  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3990  v8i16 in0, in1, in2, in3, in4, in5;
3991  v16i8 src10_r, src32_r, src54_r, src76_r;
3992  v16i8 src21_r, src43_r, src65_r, src87_r;
3993  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3994  v8i16 filt0, filt1;
3995  v8i16 filter_vec;
3996  v4i32 weight_vec, offset_vec, rnd_vec;
3997 
3998  src0_ptr -= src_stride;
3999 
4000  offset = (offset0 + offset1) << rnd_val;
4001  weight0 = weight0 & 0x0000FFFF;
4002  weight = weight0 | (weight1 << 16);
4003  constant = 128 * weight1;
4004  constant <<= 6;
4005  offset += constant;
4006 
4007  offset_vec = __msa_fill_w(offset);
4008  weight_vec = __msa_fill_w(weight);
4009  rnd_vec = __msa_fill_w(rnd_val + 1);
4010 
4011  filter_vec = LD_SH(filter);
4012  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4013 
4014  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4015  src0_ptr += (3 * src_stride);
 4016  XORI_B3_128_SB(src0, src1, src2);
4017  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4018 
4019  LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
4020  LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
4021  XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
4022  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
4023  src32_r, src43_r, src54_r, src65_r);
4024  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4025 
4026  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4027  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4028  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
4029  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
4030  tmp4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
4031  tmp5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
4032  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4033  in0, in1, in2, in3,
4034  weight_vec, rnd_vec, offset_vec,
4035  tmp0, tmp1, tmp2, tmp3);
4036  HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
4037  weight_vec, rnd_vec, offset_vec,
4038  tmp4, tmp5);
4039 
4040  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4041  tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4042  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4043  ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);
4044 }
4045 
4046 static void hevc_vt_biwgt_4t_8x4multiple_msa(const uint8_t *src0_ptr,
4047  int32_t src_stride,
4048  const int16_t *src1_ptr,
4049  int32_t src2_stride,
4050  uint8_t *dst,
4051  int32_t dst_stride,
4052  const int8_t *filter,
4053  int32_t height,
4054  int32_t weight0,
4055  int32_t weight1,
4056  int32_t offset0,
4057  int32_t offset1,
4058  int32_t rnd_val)
4059 {
4060  uint32_t loop_cnt;
4061  int32_t offset, weight, constant;
4062  v16i8 src0, src1, src2, src3, src4;
4063  v8i16 in0, in1, in2, in3;
4064  v16i8 src10_r, src32_r, src21_r, src43_r;
4065  v8i16 tmp0, tmp1, tmp2, tmp3;
4066  v8i16 filt0, filt1;
4067  v8i16 filter_vec;
4068  v4i32 weight_vec, offset_vec, rnd_vec;
4069 
4070  src0_ptr -= src_stride;
4071 
4072  offset = (offset0 + offset1) << rnd_val;
4073  weight0 = weight0 & 0x0000FFFF;
4074  weight = weight0 | (weight1 << 16);
4075  constant = 128 * weight1;
4076  constant <<= 6;
4077  offset += constant;
4078 
4079  offset_vec = __msa_fill_w(offset);
4080  weight_vec = __msa_fill_w(weight);
4081  rnd_vec = __msa_fill_w(rnd_val + 1);
4082 
4083  filter_vec = LD_SH(filter);
4084  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4085 
4086  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4087  src0_ptr += (3 * src_stride);
 4088  XORI_B3_128_SB(src0, src1, src2);
4089  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4090 
4091  for (loop_cnt = (height >> 2); loop_cnt--;) {
4092  LD_SB2(src0_ptr, src_stride, src3, src4);
4093  src0_ptr += (2 * src_stride);
4094  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4095  src1_ptr += (4 * src2_stride);
4096  XORI_B2_128_SB(src3, src4);
4097  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4098 
4099  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4100  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4101 
4102  LD_SB2(src0_ptr, src_stride, src1, src2);
4103  src0_ptr += (2 * src_stride);
 4104  XORI_B2_128_SB(src1, src2);
4105  ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
4106 
4107  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4108  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4109  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4110  in0, in1, in2, in3,
4111  weight_vec, rnd_vec, offset_vec,
4112  tmp0, tmp1, tmp2, tmp3);
4113 
4114  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4115  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4116  dst += (4 * dst_stride);
4117  }
4118 }
4119 
4120 static void hevc_vt_biwgt_4t_8w_msa(const uint8_t *src0_ptr,
4121  int32_t src_stride,
4122  const int16_t *src1_ptr,
4123  int32_t src2_stride,
4124  uint8_t *dst,
4125  int32_t dst_stride,
4126  const int8_t *filter,
4127  int32_t height,
4128  int32_t weight0,
4129  int32_t weight1,
4130  int32_t offset0,
4131  int32_t offset1,
4132  int32_t rnd_val)
4133 {
4134  if (2 == height) {
4135  hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4136  dst, dst_stride, filter,
4137  weight0, weight1, offset0, offset1, rnd_val);
4138  } else if (6 == height) {
4139  hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4140  dst, dst_stride, filter,
4141  weight0, weight1, offset0, offset1, rnd_val);
4142  } else {
4143  hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
4144  src1_ptr, src2_stride,
4145  dst, dst_stride, filter, height,
4146  weight0, weight1, offset0, offset1,
4147  rnd_val);
4148  }
4149 }
4150 
4151 static void hevc_vt_biwgt_4t_12w_msa(const uint8_t *src0_ptr,
4152  int32_t src_stride,
4153  const int16_t *src1_ptr,
4154  int32_t src2_stride,
4155  uint8_t *dst,
4156  int32_t dst_stride,
4157  const int8_t *filter,
4158  int32_t height,
4159  int32_t weight0,
4160  int32_t weight1,
4161  int32_t offset0,
4162  int32_t offset1,
4163  int32_t rnd_val)
4164 {
4165  uint32_t loop_cnt;
4166  int32_t offset, weight, constant;
4167  v16i8 src0, src1, src2, src3, src4, src5;
4168  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4169  v16i8 src10_r, src32_r, src21_r, src43_r;
4170  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4171  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4172  v16i8 src2110, src4332;
4173  v8i16 filt0, filt1;
4174  v8i16 filter_vec;
4175  v4i32 weight_vec, offset_vec, rnd_vec;
4176 
4177  src0_ptr -= (1 * src_stride);
4178 
4179  offset = (offset0 + offset1) << rnd_val;
4180  weight0 = weight0 & 0x0000FFFF;
4181  weight = weight0 | (weight1 << 16);
4182  constant = 128 * weight1;
4183  constant <<= 6;
4184  offset += constant;
4185 
4186  offset_vec = __msa_fill_w(offset);
4187  weight_vec = __msa_fill_w(weight);
4188  rnd_vec = __msa_fill_w(rnd_val + 1);
4189 
4190  filter_vec = LD_SH(filter);
4191  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4192 
4193  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4194  src0_ptr += (3 * src_stride);
 4195  XORI_B3_128_SB(src0, src1, src2);
4196  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4197  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4198  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
4199 
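      /* The left 8 columns use the right (low-half) row interleaves; the
       * upper-half interleaves of two 12-wide rows are packed into
       * src2110/src4332 so the extra 4 columns of both rows share one
       * filter call. */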
4200  for (loop_cnt = (height >> 2); loop_cnt--;) {
4201  LD_SB2(src0_ptr, src_stride, src3, src4);
4202  src0_ptr += (2 * src_stride);
4203  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4204  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
4205  src1_ptr += (4 * src2_stride);
4206  ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
4207  XORI_B2_128_SB(src3, src4);
4208 
4209  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4210  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4211  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
4212 
4213  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4214  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4215  tmp4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
4216 
4217  LD_SB2(src0_ptr, src_stride, src5, src2);
4218  src0_ptr += (2 * src_stride);
4219  XORI_B2_128_SB(src5, src2);
4220  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4221  ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
4222  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
4223 
4224  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4225  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4226  tmp5 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
4227  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4228  in0, in1, in2, in3,
4229  weight_vec, rnd_vec, offset_vec,
4230  tmp0, tmp1, tmp2, tmp3);
4231  HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
4232  weight_vec, rnd_vec, offset_vec,
4233  tmp4, tmp5);
4234 
4235  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4236  tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4237  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4238  ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
4239  dst += (4 * dst_stride);
4240  }
4241 }
4242 
4243 static void hevc_vt_biwgt_4t_16w_msa(const uint8_t *src0_ptr,
4244  int32_t src_stride,
4245  const int16_t *src1_ptr,
4246  int32_t src2_stride,
4247  uint8_t *dst,
4248  int32_t dst_stride,
4249  const int8_t *filter,
4250  int32_t height,
4251  int32_t weight0,
4252  int32_t weight1,
4253  int32_t offset0,
4254  int32_t offset1,
4255  int32_t rnd_val)
4256 {
4257  uint32_t loop_cnt;
4258  int32_t offset, weight, constant;
4259  v16i8 src0, src1, src2, src3, src4, src5;
4260  v8i16 in0, in1, in2, in3;
4261  v16i8 src10_r, src32_r, src21_r, src43_r;
4262  v16i8 src10_l, src32_l, src21_l, src43_l;
4263  v8i16 tmp0, tmp1, tmp2, tmp3;
4264  v8i16 filt0, filt1;
4265  v8i16 filter_vec;
4266  v4i32 weight_vec, offset_vec, rnd_vec;
4267 
4268  src0_ptr -= src_stride;
4269 
4270  offset = (offset0 + offset1) << rnd_val;
4271  weight0 = weight0 & 0x0000FFFF;
4272  weight = weight0 | (weight1 << 16);
4273  constant = 128 * weight1;
4274  constant <<= 6;
4275  offset += constant;
4276 
4277  offset_vec = __msa_fill_w(offset);
4278  weight_vec = __msa_fill_w(weight);
4279  rnd_vec = __msa_fill_w(rnd_val + 1);
4280 
4281  filter_vec = LD_SH(filter);
4282  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4283 
4284  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4285  src0_ptr += (3 * src_stride);
 4286  XORI_B3_128_SB(src0, src1, src2);
4287  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4288  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4289 
4290  for (loop_cnt = (height >> 2); loop_cnt--;) {
4291  LD_SB2(src0_ptr, src_stride, src3, src4);
4292  src0_ptr += (2 * src_stride);
4293  LD_SH2(src1_ptr, src2_stride, in0, in1);
4294  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4295  src1_ptr += (2 * src2_stride);
4296  XORI_B2_128_SB(src3, src4);
4297  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4298  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4299 
4300  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4301  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4302  tmp2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4303  tmp3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4304 
4305  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4306  in0, in1, in2, in3,
4307  weight_vec, rnd_vec, offset_vec,
4308  tmp0, tmp1, tmp2, tmp3);
4309  PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4310  ST_SH2(tmp0, tmp1, dst, dst_stride);
4311  dst += (2 * dst_stride);
4312  LD_SB2(src0_ptr, src_stride, src5, src2);
4313  src0_ptr += (2 * src_stride);
4314 
4315  LD_SH2(src1_ptr, src2_stride, in0, in1);
4316  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4317  src1_ptr += (2 * src2_stride);
4318  XORI_B2_128_SB(src5, src2);
4319  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4320  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4321 
4322  tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4323  tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4324  tmp2 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4325  tmp3 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4326  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4327  in0, in1, in2, in3,
4328  weight_vec, rnd_vec, offset_vec,
4329  tmp0, tmp1, tmp2, tmp3);
4330 
4331  PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4332  ST_SH2(tmp0, tmp1, dst, dst_stride);
4333  dst += (2 * dst_stride);
4334  }
4335 }
4336 
4337 static void hevc_vt_biwgt_4t_24w_msa(const uint8_t *src0_ptr,
4338  int32_t src_stride,
4339  const int16_t *src1_ptr,
4340  int32_t src2_stride,
4341  uint8_t *dst,
4342  int32_t dst_stride,
4343  const int8_t *filter,
4344  int32_t height,
4345  int32_t weight0,
4346  int32_t weight1,
4347  int32_t offset0,
4348  int32_t offset1,
4349  int32_t rnd_val)
4350 {
4351  uint32_t loop_cnt;
4352  int32_t offset, weight, constant;
4353  v16i8 src0, src1, src2, src3, src4, src5;
4354  v16i8 src6, src7, src8, src9, src10, src11;
4355  v8i16 in0, in1, in2, in3, in4, in5;
4356  v16i8 src10_r, src32_r, src76_r, src98_r;
4357  v16i8 src10_l, src32_l, src21_l, src43_l;
4358  v16i8 src21_r, src43_r, src87_r, src109_r;
4359  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4360  v8i16 filt0, filt1;
4361  v8i16 filter_vec;
4362  v4i32 weight_vec, offset_vec, rnd_vec;
4363 
4364  src0_ptr -= src_stride;
4365 
4366  offset = (offset0 + offset1) << rnd_val;
4367  weight0 = weight0 & 0x0000FFFF;
4368  weight = weight0 | (weight1 << 16);
4369  constant = 128 * weight1;
4370  constant <<= 6;
4371  offset += constant;
4372 
4373  offset_vec = __msa_fill_w(offset);
4374  weight_vec = __msa_fill_w(weight);
4375  rnd_vec = __msa_fill_w(rnd_val + 1);
4376 
4377  filter_vec = LD_SH(filter);
4378  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4379 
4380  /* 16width */
4381  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
 4382  XORI_B3_128_SB(src0, src1, src2);
4383  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4384  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4385  /* 8width */
4386  LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4387  src0_ptr += (3 * src_stride);
4388  XORI_B3_128_SB(src6, src7, src8);
4389  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4390 
4391  for (loop_cnt = (height >> 2); loop_cnt--;) {
4392  /* 16width */
4393  LD_SB2(src0_ptr, src_stride, src3, src4);
4394  LD_SH2(src1_ptr, src2_stride, in0, in1);
4395  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4396  XORI_B2_128_SB(src3, src4);
4397  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4398  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4399 
4400  /* 8width */
4401  LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4402  src0_ptr += (2 * src_stride);
4403  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4404  src1_ptr += (2 * src2_stride);
4405  XORI_B2_128_SB(src9, src10);
4406  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4407  /* 16width */
4408  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4409  tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4410  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4411  tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4412  /* 8width */
4413  tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4414  tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4415  /* 16width */
4416  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4417  in0, in1, in2, in3,
4418  weight_vec, rnd_vec, offset_vec,
4419  tmp0, tmp1, tmp4, tmp5);
4420  /* 8width */
4421  HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4422  weight_vec, rnd_vec, offset_vec,
4423  tmp2, tmp3);
4424  /* 16width */
4425  PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4426  /* 8width */
4427  tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4428  ST_SH2(tmp0, tmp1, dst, dst_stride);
4429  ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4430  dst += (2 * dst_stride);
4431 
4432  /* 16width */
4433  LD_SB2(src0_ptr, src_stride, src5, src2);
4434  LD_SH2(src1_ptr, src2_stride, in0, in1);
4435  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4436  XORI_B2_128_SB(src5, src2);
4437  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4438  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4439  /* 8width */
4440  LD_SB2(src0_ptr + 16, src_stride, src11, src8);
4441  src0_ptr += (2 * src_stride);
4442  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4443  src1_ptr += (2 * src2_stride);
4444  XORI_B2_128_SB(src11, src8);
4445  ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
4446  /* 16width */
4447  tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4448  tmp4 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4449  tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4450  tmp5 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4451  /* 8width */
4452  tmp2 = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
4453  tmp3 = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
4454  /* 16width */
4455  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4456  in0, in1, in2, in3,
4457  weight_vec, rnd_vec, offset_vec,
4458  tmp0, tmp1, tmp4, tmp5);
4459  /* 8width */
4460  HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4461  weight_vec, rnd_vec, offset_vec,
4462  tmp2, tmp3);
4463  /* 16width */
4464  PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4465 
4466  /* 8width */
4467  tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4468  ST_SH2(tmp0, tmp1, dst, dst_stride);
4469  ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4470  dst += (2 * dst_stride);
4471  }
4472 }
4473 
4474 static void hevc_vt_biwgt_4t_32w_msa(const uint8_t *src0_ptr,
4475  int32_t src_stride,
4476  const int16_t *src1_ptr,
4477  int32_t src2_stride,
4478  uint8_t *dst,
4479  int32_t dst_stride,
4480  const int8_t *filter,
4481  int32_t height,
4482  int32_t weight0,
4483  int32_t weight1,
4484  int32_t offset0,
4485  int32_t offset1,
4486  int32_t rnd_val)
4487 {
4488  uint32_t loop_cnt;
4489  uint8_t *dst_tmp = dst + 16;
4490  int32_t offset, weight, constant;
4491  v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
4492  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4493  v16i8 src10_r, src32_r, src76_r, src98_r;
4494  v16i8 src21_r, src43_r, src87_r, src109_r;
4495  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4496  v16i8 src10_l, src32_l, src76_l, src98_l;
4497  v16i8 src21_l, src43_l, src87_l, src109_l;
4498  v8i16 filt0, filt1;
4499  v8i16 filter_vec;
4500  v4i32 weight_vec, offset_vec, rnd_vec;
4501 
4502  src0_ptr -= src_stride;
4503 
4504  offset = (offset0 + offset1) << rnd_val;
4505  weight0 = weight0 & 0x0000FFFF;
4506  weight = weight0 | (weight1 << 16);
4507  constant = 128 * weight1;
4508  constant <<= 6;
4509  offset += constant;
4510 
4511  offset_vec = __msa_fill_w(offset);
4512  weight_vec = __msa_fill_w(weight);
4513  rnd_vec = __msa_fill_w(rnd_val + 1);
4514 
4515  filter_vec = LD_SH(filter);
4516  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4517 
4518  /* 16width */
4519  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
 4520  XORI_B3_128_SB(src0, src1, src2);
4521  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4522  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4523  /* next 16width */
4524  LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4525  src0_ptr += (3 * src_stride);
4526  XORI_B3_128_SB(src6, src7, src8);
4527  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4528  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
4529 
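      /* The 32-wide block runs as two independent 16-wide pipelines, one
       * writing dst and one dst_tmp; each rotates its row interleaves
       * (src10_r = src32_r, ...) after every two output rows. */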
4530  for (loop_cnt = (height >> 1); loop_cnt--;) {
4531  /* 16width */
4532  LD_SB2(src0_ptr, src_stride, src3, src4);
4533  LD_SH2(src1_ptr, src2_stride, in0, in1);
4534  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4535  XORI_B2_128_SB(src3, src4);
4536  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4537  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4538 
4539  /* 16width */
4540  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4541  tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4542  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4543  tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4544  /* 16width */
4545  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4546  in0, in1, in2, in3,
4547  weight_vec, rnd_vec, offset_vec,
4548  tmp0, tmp1, tmp4, tmp5);
4549  /* 16width */
4550  PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4551  ST_SH2(tmp0, tmp1, dst, dst_stride);
4552  dst += (2 * dst_stride);
4553 
4554  src10_r = src32_r;
4555  src21_r = src43_r;
4556  src10_l = src32_l;
4557  src21_l = src43_l;
4558  src2 = src4;
4559 
4560  /* next 16width */
4561  LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4562  src0_ptr += (2 * src_stride);
4563  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4564  LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
4565  src1_ptr += (2 * src2_stride);
4566  XORI_B2_128_SB(src9, src10);
4567  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4568  ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
4569  /* next 16width */
4570  tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4571  tmp6 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
4572  tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4573  tmp7 = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
4574  /* next 16width */
4575  HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
4576  in4, in5, in6, in7,
4577  weight_vec, rnd_vec, offset_vec,
4578  tmp2, tmp3, tmp6, tmp7);
4579 
4580  /* next 16width */
4581  PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3);
4582  ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
4583  dst_tmp += (2 * dst_stride);
4584 
4585  src76_r = src98_r;
4586  src87_r = src109_r;
4587  src76_l = src98_l;
4588  src87_l = src109_l;
4589  src8 = src10;
4590  }
4591 }
4592 
4593 static void hevc_hv_biwgt_4t_4x2_msa(const uint8_t *src0_ptr,
4594  int32_t src_stride,
4595  const int16_t *src1_ptr,
4596  int32_t src2_stride,
4597  uint8_t *dst,
4598  int32_t dst_stride,
4599  const int8_t *filter_x,
4600  const int8_t *filter_y,
4601  int32_t weight0,
4602  int32_t weight1,
4603  int32_t offset0,
4604  int32_t offset1,
4605  int32_t rnd_val)
4606 {
4607  uint64_t tp0, tp1;
 4608  int32_t offset, weight;
4609  v8i16 in0 = { 0 };
4610  v16u8 out;
4611  v16i8 src0, src1, src2, src3, src4;
4612  v8i16 filt0, filt1;
4613  v8i16 filt_h0, filt_h1;
4614  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4615  v16i8 mask1;
4616  v8i16 filter_vec, tmp, weight_vec;
4617  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4618  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
4619  v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
4620 
4621  src0_ptr -= (src_stride + 1);
4622 
4623  filter_vec = LD_SH(filter_x);
4624  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4625 
4626  filter_vec = LD_SH(filter_y);
4627  UNPCK_R_SB_SH(filter_vec, filter_vec);
4628 
4629  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4630 
4631  mask1 = mask0 + 2;
4632 
4633  offset = (offset0 + offset1) << rnd_val;
4634  weight0 = weight0 & 0x0000FFFF;
4635  weight = weight0 | (weight1 << 16);
4636 
4637  const_vec = __msa_fill_w((128 * weight1));
4638  const_vec <<= 6;
4639  offset_vec = __msa_fill_w(offset);
4640  weight_vec = (v8i16) __msa_fill_w(weight);
4641  rnd_vec = __msa_fill_w(rnd_val + 1);
4642  offset_vec += const_vec;
4643 
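      /* Two-stage (hv) filtering: the horizontal 4-tap runs on bytes and
       * leaves 16-bit intermediates; ILVRL_H2 pairs them vertically and
       * the word-level HEVC_FILT_4TAP applies the vertical taps.  The
       * >> 6 drops the first-stage scaling before the bi-weighted
       * combine. */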
4644  LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4645  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4646 
4647  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
4648  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
4649  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
4650 
4651  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4652  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4653  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4654 
4655  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
4656  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
4657 
4658  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4659  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4660  dst0 >>= 6;
4661  dst1 >>= 6;
4662  dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4663 
4664  LD2(src1_ptr, src2_stride, tp0, tp1);
4665  INSERT_D2_SH(tp0, tp1, in0);
4666 
4667  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4668  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4669  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4670  SRAR_W2_SW(dst0, dst1, rnd_vec);
4671  CLIP_SW2_0_255(dst0, dst1);
4672  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4673  out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
4674  ST_W2(out, 0, 1, dst, dst_stride);
4675 }
4676 
4677 static void hevc_hv_biwgt_4t_4x4_msa(const uint8_t *src0_ptr,
4678  int32_t src_stride,
4679  const int16_t *src1_ptr,
4680  int32_t src2_stride,
4681  uint8_t *dst,
4682  int32_t dst_stride,
4683  const int8_t *filter_x,
4684  const int8_t *filter_y,
4685  int32_t weight0,
4686  int32_t weight1,
4687  int32_t offset0,
4688  int32_t offset1,
4689  int32_t rnd_val)
4690 {
4691  uint64_t tp0, tp1;
 4692  int32_t offset, weight;
4693  v16u8 out;
4694  v8i16 in0 = { 0 }, in1 = { 0 };
4695  v16i8 src0, src1, src2, src3, src4, src5, src6;
4696  v8i16 filt0, filt1;
4697  v8i16 filt_h0, filt_h1;
4698  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4699  v16i8 mask1;
4700  v8i16 filter_vec, weight_vec;
4701  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4702  v8i16 tmp0, tmp1, tmp2, tmp3;
4703  v8i16 dst30, dst41, dst52, dst63;
4704  v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
4705  v4i32 offset_vec, rnd_vec, const_vec;
4706  v4i32 dst0, dst1, dst2, dst3;
4707 
4708  src0_ptr -= (src_stride + 1);
4709 
4710  filter_vec = LD_SH(filter_x);
4711  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4712 
4713  filter_vec = LD_SH(filter_y);
4714  UNPCK_R_SB_SH(filter_vec, filter_vec);
4715 
4716  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4717 
4718  mask1 = mask0 + 2;
4719 
4720  offset = (offset0 + offset1) << rnd_val;
4721  weight0 = weight0 & 0x0000FFFF;
4722  weight = weight0 | (weight1 << 16);
4723 
4724  const_vec = __msa_fill_w((128 * weight1));
4725  const_vec <<= 6;
4726  offset_vec = __msa_fill_w(offset);
4727  weight_vec = (v8i16) __msa_fill_w(weight);
4728  rnd_vec = __msa_fill_w(rnd_val + 1);
4729  offset_vec += const_vec;
4730 
4731  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4732  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4733 
4734  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4735  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4736  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4737  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4738 
4739  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4740  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4741  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4742  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4743 
4744  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
4745  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
4746  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
4747  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4748  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4749  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
4750  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
4751  SRA_4V(dst0, dst1, dst2, dst3, 6);
4752  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
4753 
4754  LD2(src1_ptr, src2_stride, tp0, tp1);
4755  INSERT_D2_SH(tp0, tp1, in0);
4756  src1_ptr += (2 * src2_stride);
4757  LD2(src1_ptr, src2_stride, tp0, tp1);
4758  INSERT_D2_SH(tp0, tp1, in1);
4759 
4760  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
4761  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
4762 
4763  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4764  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4765  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4766  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4767  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4768  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
4769  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4770  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4771  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
4772 }
4773 
4774 static void hevc_hv_biwgt_4t_4multx8mult_msa(const uint8_t *src0_ptr,
4775  int32_t src_stride,
4776  const int16_t *src1_ptr,
4777  int32_t src2_stride,
4778  uint8_t *dst,
4779  int32_t dst_stride,
4780  const int8_t *filter_x,
4781  const int8_t *filter_y,
4782  int32_t height,
4783  int32_t weight0,
4784  int32_t weight1,
4785  int32_t offset0,
4786  int32_t offset1,
4787  int32_t rnd_val)
4788 {
4789  uint32_t loop_cnt;
4790  uint64_t tp0, tp1;
 4791  int32_t offset, weight;
4792  v16u8 out0, out1;
4793  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4794  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4795  v8i16 filt0, filt1;
4796  v8i16 filt_h0, filt_h1;
4797  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4798  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4799  v16i8 mask1;
4800  v8i16 filter_vec, weight_vec;
4801  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4802  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4803  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4804  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4805  v8i16 dst98_r, dst109_r;
4806  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4807  v4i32 offset_vec, rnd_vec, const_vec;
4808 
4809  src0_ptr -= (src_stride + 1);
4810 
4811  filter_vec = LD_SH(filter_x);
4812  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4813 
4814  filter_vec = LD_SH(filter_y);
4815  UNPCK_R_SB_SH(filter_vec, filter_vec);
4816 
4817  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4818 
4819  mask1 = mask0 + 2;
4820 
4821  offset = (offset0 + offset1) << rnd_val;
4822  weight0 = weight0 & 0x0000FFFF;
4823  weight = weight0 | (weight1 << 16);
4824 
4825  const_vec = __msa_fill_w((128 * weight1));
4826  const_vec <<= 6;
4827  offset_vec = __msa_fill_w(offset);
4828  weight_vec = (v8i16) __msa_fill_w(weight);
4829  rnd_vec = __msa_fill_w(rnd_val + 1);
4830  offset_vec += const_vec;
4831 
4832  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4833  src0_ptr += (3 * src_stride);
 4834  XORI_B3_128_SB(src0, src1, src2);
4835 
4836  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
4837  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
4838  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4839  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4840  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4841  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4842 
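      /* dst10_r, dst21_r and dst22 hold the vertical filter history; the
       * loop tail rotates them (dst98_r, dst109_r and the top half of
       * dst106) so the next 8 rows continue the pipeline. */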
4843  for (loop_cnt = height >> 3; loop_cnt--;) {
4844  LD_SB8(src0_ptr, src_stride,
4845  src3, src4, src5, src6, src7, src8, src9, src10);
4846  src0_ptr += (8 * src_stride);
4847  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4848  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4849  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4850  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4851  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4852 
4853  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4854  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4855  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4856  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4857 
4858  dst32_r = __msa_ilvr_h(dst73, dst22);
4859  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4860  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4861  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4862  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4863  dst76_r = __msa_ilvr_h(dst22, dst106);
4864 
4865  LD2(src1_ptr, src2_stride, tp0, tp1);
4866  src1_ptr += 2 * src2_stride;
4867  INSERT_D2_SH(tp0, tp1, in0);
4868  LD2(src1_ptr, src2_stride, tp0, tp1);
4869  src1_ptr += 2 * src2_stride;
4870  INSERT_D2_SH(tp0, tp1, in1);
4871 
4872  LD2(src1_ptr, src2_stride, tp0, tp1);
4873  src1_ptr += 2 * src2_stride;
4874  INSERT_D2_SH(tp0, tp1, in2);
4875  LD2(src1_ptr, src2_stride, tp0, tp1);
4876  src1_ptr += 2 * src2_stride;
4877  INSERT_D2_SH(tp0, tp1, in3);
4878 
4879  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4880  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4881  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4882  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4883  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4884  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4885  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4886  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4887  SRA_4V(dst0, dst1, dst2, dst3, 6);
4888  SRA_4V(dst4, dst5, dst6, dst7, 6);
4889  PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
4890  dst2, dst3);
4891  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4892  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
4893  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
4894  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
4895  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4896  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4897  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4898  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4899  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
4900  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
4901  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
4902  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
4903  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4904  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4905  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
4906  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
4907  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
4908  tmp2, tmp3);
4909  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4910  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4911  dst += (8 * dst_stride);
4912 
4913  dst10_r = dst98_r;
4914  dst21_r = dst109_r;
4915  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4916  }
4917 }
4918 
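/* 4-wide HV bi-weighted 4-tap MC: dispatch on height to the 4x2 / 4x4
 * special cases or to the multiple-of-8-rows loop above. */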
4919 static void hevc_hv_biwgt_4t_4w_msa(const uint8_t *src0_ptr,
4920  int32_t src_stride,
4921  const int16_t *src1_ptr,
4922  int32_t src2_stride,
4923  uint8_t *dst,
4924  int32_t dst_stride,
4925  const int8_t *filter_x,
4926  const int8_t *filter_y,
4927  int32_t height,
4928  int32_t weight0,
4929  int32_t weight1,
4930  int32_t offset0,
4931  int32_t offset1,
4932  int32_t rnd_val)
4933 {
4934  if (2 == height) {
4935  hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4936  dst, dst_stride, filter_x, filter_y,
4937  weight0, weight1, offset0, offset1, rnd_val);
4938  } else if (4 == height) {
4939  hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4940  dst, dst_stride, filter_x, filter_y,
4941  weight0, weight1, offset0, offset1, rnd_val);
4942  } else if (0 == (height % 8)) {
4943  hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
4944  src1_ptr, src2_stride,
4945  dst, dst_stride, filter_x, filter_y,
4946  height, weight0, weight1,
4947  offset0, offset1, rnd_val);
4948  }
4949 }
4950 
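/* 6-wide HV bi-weighted 4-tap MC (8 rows): a full 8-wide block is filtered,
 * then the left 4 columns are stored with ST_W8 and the remaining 2 columns
 * with ST_H8 at dst + 4. */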
4951 static void hevc_hv_biwgt_4t_6w_msa(const uint8_t *src0_ptr,
4952  int32_t src_stride,
4953  const int16_t *src1_ptr,
4954  int32_t src2_stride,
4955  uint8_t *dst,
4956  int32_t dst_stride,
4957  const int8_t *filter_x,
4958  const int8_t *filter_y,
4959  int32_t height,
4960  int32_t weight0,
4961  int32_t weight1,
4962  int32_t offset0,
4963  int32_t offset1,
4964  int32_t rnd_val)
4965 {
4966  uint32_t tpw0, tpw1, tpw2, tpw3;
4967  uint64_t tp0, tp1;
4968  uint32_t offset, weight;
4969  v16u8 out0, out1, out2;
4970  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4971  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4972  v8i16 in4 = { 0 }, in5 = { 0 };
4973  v8i16 filt0, filt1;
4974  v8i16 filt_h0, filt_h1, filter_vec;
4975  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4976  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4977  v16i8 mask1;
4978  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4979  v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
4980  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4981  v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4982  v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4983  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4984  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4985  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4986  v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4987  v4i32 offset_vec, rnd_vec, const_vec;
4988 
4989  src0_ptr -= (src_stride + 1);
4990 
4991  filter_vec = LD_SH(filter_x);
4992  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4993 
4994  filter_vec = LD_SH(filter_y);
4995  UNPCK_R_SB_SH(filter_vec, filter_vec);
4996 
4997  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4998 
4999  mask1 = mask0 + 2;
5000 
5001  offset = (offset0 + offset1) << rnd_val;
5002  weight0 = weight0 & 0x0000FFFF;
5003  weight = weight0 | (weight1 << 16);
5004 
5005  const_vec = __msa_fill_w((128 * weight1));
5006  const_vec <<= 6;
5007  offset_vec = __msa_fill_w(offset);
5008  weight_vec = (v8i16) __msa_fill_w(weight);
5009  rnd_vec = __msa_fill_w(rnd_val + 1);
5010  offset_vec += const_vec;
5011 
5012  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
5013  src0_ptr += (3 * src_stride);
5014  XORI_B3_128_SB(src0, src1, src2);
5015 
5016  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5017  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5018  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5019  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5020  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5021  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5022 
5023  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5024  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5025 
5026  LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
5027  src10);
5028  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
5029 
5030  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5031  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5032  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5033  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5034 
5035  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5036  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5037  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5038  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5039 
5040  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
5041  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
5042  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
5043  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
5044 
5045  dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5046  dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5047  dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5048  dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5049 
5050  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5051  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5052  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5053  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5054  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
5055  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
5056  ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
5057  ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
5058  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
5059  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
5060  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
5061 
5062  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5063  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5064  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5065  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5066  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5067  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5068  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
5069  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
5070  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
5071  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
5072  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
5073  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
5074  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
5075  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
5076  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
5077  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
5078  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
5079 
5080  LD2(src1_ptr, src2_stride, tp0, tp1);
5081  INSERT_D2_SH(tp0, tp1, in0);
5082  LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
5083  INSERT_D2_SH(tp0, tp1, in1);
5084 
5085  LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
5086  INSERT_D2_SH(tp0, tp1, in2);
5087  LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
5088  INSERT_D2_SH(tp0, tp1, in3);
5089 
5090  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5091  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5092  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5093  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5094  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5095  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5096  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5097  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5098  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5099  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5100  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5101  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5102  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5103  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5104  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5105  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
5106  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5107  tmp2, tmp3);
5108  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5109  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5110 
5111  PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
5112 
5113  LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5114  src1_ptr += (4 * src2_stride);
5115  INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
5116  LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5117  INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
5118 
5119  ILVRL_H2_SH(dst4, in4, tmp0, tmp1);
5120  ILVRL_H2_SH(dst5, in5, tmp2, tmp3);
5121 
5122  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5123  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5124  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5125  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5126  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5127  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5128  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5129 
5130  out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5131  ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
5132 }
5133 
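/* 8x2 HV bi-weighted 4-tap MC: 5 source rows yield the 2 output rows in a
 * single straight-line pass, no loop. */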
5134 static void hevc_hv_biwgt_4t_8x2_msa(const uint8_t *src0_ptr,
5135  int32_t src_stride,
5136  const int16_t *src1_ptr,
5137  int32_t src2_stride,
5138  uint8_t *dst,
5139  int32_t dst_stride,
5140  const int8_t *filter_x,
5141  const int8_t *filter_y,
5142  int32_t weight0,
5143  int32_t weight1,
5144  int32_t offset0,
5145  int32_t offset1,
5146  int32_t rnd_val)
5147 {
5148  uint32_t offset, weight;
5149  v16u8 out;
5150  v16i8 src0, src1, src2, src3, src4;
5151  v8i16 filt0, filt1;
5152  v8i16 filt_h0, filt_h1;
5153  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5154  v16i8 mask1;
5155  v8i16 filter_vec, weight_vec;
5156  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5157  v8i16 dst0, dst1, dst2, dst3, dst4;
5158  v8i16 in0, in1;
5159  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
5160  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
5161  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
5162  v8i16 tmp0, tmp1, tmp2, tmp3;
5163  v4i32 offset_vec, rnd_vec, const_vec;
5164 
5165  src0_ptr -= (src_stride + 1);
5166 
5167  filter_vec = LD_SH(filter_x);
5168  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5169 
5170  filter_vec = LD_SH(filter_y);
5171  UNPCK_R_SB_SH(filter_vec, filter_vec);
5172 
5173  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5174 
5175  mask1 = mask0 + 2;
5176 
5177  offset = (offset0 + offset1) << rnd_val;
5178  weight0 = weight0 & 0x0000FFFF;
5179  weight = weight0 | (weight1 << 16);
5180 
5181  const_vec = __msa_fill_w((128 * weight1));
5182  const_vec <<= 6;
5183  offset_vec = __msa_fill_w(offset);
5184  weight_vec = (v8i16) __msa_fill_w(weight);
5185  rnd_vec = __msa_fill_w(rnd_val + 1);
5186  offset_vec += const_vec;
5187 
5188  LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5189  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5190 
5191  LD_SH2(src1_ptr, src2_stride, in0, in1);
5192 
5193  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5194  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5195  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5196  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5197  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5198 
5199  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5200  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5201  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5202  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5203  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5204 
5205  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
5206  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
5207  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
5208  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
5209  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5210  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5211  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5212  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5213  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5214  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
5215 
5216  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
5217  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
5218 
5219  dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5220  dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5221  dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5222  dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5223  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5224  CLIP_SW4_0_255(dst0_r, dst0_l, dst1_r, dst1_l);
5225  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
5226  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
5227  ST_D2(out, 0, 1, dst, dst_stride);
5228 }
5229 
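/* HV bi-weighted 4-tap MC for widths that are multiples of 8, height 4:
 * width8mult gives the number of 8-column blocks processed per call. */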
5230 static void hevc_hv_biwgt_4t_8multx4_msa(const uint8_t *src0_ptr,
5231  int32_t src_stride,
5232  const int16_t *src1_ptr,
5233  int32_t src2_stride,
5234  uint8_t *dst,
5235  int32_t dst_stride,
5236  const int8_t *filter_x,
5237  const int8_t *filter_y,
5238  int32_t weight0,
5239  int32_t weight1,
5240  int32_t offset0,
5241  int32_t offset1,
5242  int32_t rnd_val,
5243  int32_t width8mult)
5244 {
5245  uint32_t offset, weight;
5246  uint32_t cnt;
5247  v16u8 out0, out1;
5248  v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
5249  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5250  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
5251  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5252  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
5253  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5254  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5255  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5256  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5257  v4i32 offset_vec, rnd_vec, const_vec;
5258 
5259  src0_ptr -= (src_stride + 1);
5260 
5261  filter_vec = LD_SH(filter_x);
5262  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5263 
5264  filter_vec = LD_SH(filter_y);
5265  UNPCK_R_SB_SH(filter_vec, filter_vec);
5266 
5267  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5268 
5269  mask0 = LD_SB(ff_hevc_mask_arr);
5270  mask1 = mask0 + 2;
5271 
5272  offset = (offset0 + offset1) << rnd_val;
5273  weight0 = weight0 & 0x0000FFFF;
5274  weight = weight0 | (weight1 << 16);
5275 
5276  const_vec = __msa_fill_w((128 * weight1));
5277  const_vec <<= 6;
5278  offset_vec = __msa_fill_w(offset);
5279  rnd_vec = __msa_fill_w(rnd_val + 1);
5280  offset_vec += const_vec;
5281  weight_vec = (v8i16) __msa_fill_w(weight);
5282 
5283  for (cnt = width8mult; cnt--;) {
5284  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
5285  src0_ptr += 8;
5286  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
5287 
5288  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
5289  src1_ptr += 8;
5290 
5291  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5292  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5293  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5294 
5295  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5296  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5297  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5298 
5299  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5300  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5301 
5302  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5303  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5304  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5305  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5306 
5307  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5308  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5309  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5310  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5311 
5312  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5313  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5314  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5315  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5316 
5317  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5318  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5319  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5320  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5321  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5322  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5323  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5324  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5325 
5326  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5327  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5328  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5329  dst3_r, dst0, dst1, dst2, dst3);
5330 
5331  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5332  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5333  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5334  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5335  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5336  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5337  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5338  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5339  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5340  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5341  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5342  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5343  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5344  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5345  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5346  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
5347  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5348  tmp0, tmp1, tmp2, tmp3);
5349  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5350  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5351  dst += 8;
5352  }
5353 }
5354 
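/* 8x6 HV bi-weighted 4-tap MC: 9 source rows are filtered in one pass;
 * rows 0..3 (out0, out1) and rows 4..5 (out2) are weighted and stored
 * separately. */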
5355 static void hevc_hv_biwgt_4t_8x6_msa(const uint8_t *src0_ptr,
5356  int32_t src_stride,
5357  const int16_t *src1_ptr,
5358  int32_t src2_stride,
5359  uint8_t *dst,
5360  int32_t dst_stride,
5361  const int8_t *filter_x,
5362  const int8_t *filter_y,
5363  int32_t weight0,
5364  int32_t weight1,
5365  int32_t offset0,
5366  int32_t offset1,
5367  int32_t rnd_val)
5368 {
5369  uint32_t offset, weight;
5370  v16u8 out0, out1, out2;
5371  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5372  v8i16 filt0, filt1;
5373  v8i16 filt_h0, filt_h1;
5374  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5375  v16i8 mask1;
5376  v8i16 filter_vec, weight_vec;
5377  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5378  v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
5379  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
5380  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5381  v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
5382  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
5383  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
5384  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
5385  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
5386  v8i16 in0, in1, in2, in3, in4, in5;
5387  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5388  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5389  v4i32 offset_vec, rnd_vec, const_vec;
5390 
5391  src0_ptr -= (src_stride + 1);
5392 
5393  filter_vec = LD_SH(filter_x);
5394  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5395 
5396  filter_vec = LD_SH(filter_y);
5397  UNPCK_R_SB_SH(filter_vec, filter_vec);
5398 
5399  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5400 
5401  mask1 = mask0 + 2;
5402 
5403  offset = (offset0 + offset1) << rnd_val;
5404  weight0 = weight0 & 0x0000FFFF;
5405  weight = weight0 | (weight1 << 16);
5406 
5407  const_vec = __msa_fill_w((128 * weight1));
5408  const_vec <<= 6;
5409  offset_vec = __msa_fill_w(offset);
5410  weight_vec = (v8i16) __msa_fill_w(weight);
5411  rnd_vec = __msa_fill_w(rnd_val + 1);
5412  offset_vec += const_vec;
5413 
5414  LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5415  src0_ptr += (5 * src_stride);
5416  LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
5417 
5418  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5419  XORI_B4_128_SB(src5, src6, src7, src8);
5420 
5421  LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
5422 
5423  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5424  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5425  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5426  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5427  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5428  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
5429  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
5430  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
5431  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
5432 
5433  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5434  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5435  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5436  dsth3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5437  dsth4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5438  dsth5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
5439  dsth6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
5440  dsth7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
5441  dsth8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
5442 
5443  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5444  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5445  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5446  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5447  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5448  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5449  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
5450  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
5451 
5452  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5453  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5454  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5455  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5456  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5457  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5458  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5459  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5460  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5461  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
5462  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5463  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
5464 
5465  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5466  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5467  SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
5468  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
5469  dst0, dst1, dst2, dst3);
5470 
5471  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5472  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5473  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5474  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5475  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5476  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5477  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5478  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5479  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5480  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5481  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5482  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5483  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5484  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5485  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5486  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
5487  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5488  tmp0, tmp1, tmp2, tmp3);
5489  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5490 
5491  PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
5492  ILVRL_H2_SH(dst0, in4, tmp0, tmp1);
5493  ILVRL_H2_SH(dst1, in5, tmp2, tmp3);
5494  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5495  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5496  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5497  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5498  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5499  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5500  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5501  out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5502  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5503  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
5504 }
5505 
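/* Generic HV bi-weighted 4-tap MC: outer loop over 8-column blocks
 * (width >> 3), inner loop over 4 rows at a time, carrying the three most
 * recent vertical taps (dst10, dst21, dsth2) across iterations. */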
5506 static void hevc_hv_biwgt_4t_8multx4mult_msa(const uint8_t *src0_ptr,
5507  int32_t src_stride,
5508  const int16_t *src1_ptr,
5509  int32_t src2_stride,
5510  uint8_t *dst,
5511  int32_t dst_stride,
5512  const int8_t *filter_x,
5513  const int8_t *filter_y,
5514  int32_t height,
5515  int32_t weight0,
5516  int32_t weight1,
5517  int32_t offset0,
5518  int32_t offset1,
5519  int32_t rnd_val,
5520  int32_t width)
5521 {
5522  uint32_t loop_cnt;
5523  uint32_t cnt;
5524  uint32_t offset, weight;
5525  const uint8_t *src0_ptr_tmp;
5526  const int16_t *src1_ptr_tmp;
5527  uint8_t *dst_tmp;
5528  v16u8 out0, out1;
5529  v16i8 src0, src1, src2, src3, src4, src5, src6;
5530  v8i16 in0, in1, in2, in3;
5531  v8i16 filt0, filt1;
5532  v8i16 filt_h0, filt_h1;
5533  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5534  v16i8 mask1;
5535  v8i16 filter_vec;
5536  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5537  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5538  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5539  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5540  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5541  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5542  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
5543  v4i32 offset_vec, rnd_vec, const_vec;
5544 
5545  src0_ptr -= (src_stride + 1);
5546 
5547  filter_vec = LD_SH(filter_x);
5548  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5549 
5550  filter_vec = LD_SH(filter_y);
5551  UNPCK_R_SB_SH(filter_vec, filter_vec);
5552 
5553  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5554 
5555  mask1 = mask0 + 2;
5556 
5557  offset = (offset0 + offset1) << rnd_val;
5558  weight0 = weight0 & 0x0000FFFF;
5559  weight = weight0 | (weight1 << 16);
5560 
5561  const_vec = __msa_fill_w((128 * weight1));
5562  const_vec <<= 6;
5563  offset_vec = __msa_fill_w(offset);
5564  weight_vec = (v8i16) __msa_fill_w(weight);
5565  rnd_vec = __msa_fill_w(rnd_val + 1);
5566  offset_vec += const_vec;
5567 
5568  for (cnt = width >> 3; cnt--;) {
5569  src0_ptr_tmp = src0_ptr;
5570  src1_ptr_tmp = src1_ptr;
5571  dst_tmp = dst;
5572 
5573  LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5574  src0_ptr_tmp += (3 * src_stride);
5575  XORI_B3_128_SB(src0, src1, src2);
5576 
5577  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5578  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5579  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5580  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5581  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5582  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5583 
5584  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5585  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5586 
5587  for (loop_cnt = height >> 2; loop_cnt--;) {
5588  LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5589  src0_ptr_tmp += (4 * src_stride);
5590  LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5591  src1_ptr_tmp += (4 * src2_stride);
5592  XORI_B4_128_SB(src3, src4, src5, src6);
5593 
5594  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5595  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5596  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5597  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5598 
5599  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5600  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5601  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5602  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5603 
5604  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5605  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5606  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5607  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5608 
5609  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5610  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5611  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5612  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5613  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5614  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5615  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5616  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5617 
5618  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5619  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5620  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5621  dst3_r, dst0, dst1, dst2, dst3);
5622  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5623  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5624  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5625  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5626  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5627  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5628  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5629  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5630  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5631  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5632  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5633  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5634  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5635  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5636  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5637  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
5638  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5639  tmp0, tmp1, tmp2, tmp3);
5640  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5641  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5642  dst_tmp += (4 * dst_stride);
5643 
5644  dst10_r = dst54_r;
5645  dst10_l = dst54_l;
5646  dst21_r = dst65_r;
5647  dst21_l = dst65_l;
5648  dsth2 = dsth6;
5649  }
5650 
5651  src0_ptr += 8;
5652  dst += 8;
5653  src1_ptr += 8;
5654  }
5655 }
5656 
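/* 8-wide HV bi-weighted 4-tap MC: dispatch on height to the 8x2 / 8x6
 * special cases, the height-4 block kernel, or the generic
 * multiple-of-4-rows loop. */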
5657 static void hevc_hv_biwgt_4t_8w_msa(const uint8_t *src0_ptr,
5658  int32_t src_stride,
5659  const int16_t *src1_ptr,
5660  int32_t src2_stride,
5661  uint8_t *dst,
5662  int32_t dst_stride,
5663  const int8_t *filter_x,
5664  const int8_t *filter_y,
5665  int32_t height,
5666  int32_t weight0,
5667  int32_t weight1,
5668  int32_t offset0,
5669  int32_t offset1,
5670  int32_t rnd_val)
5671 {
5672  if (2 == height) {
5673  hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5674  dst, dst_stride, filter_x, filter_y,
5675  weight0, weight1, offset0, offset1, rnd_val);
5676  } else if (4 == height) {
5677  hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
5678  src2_stride, dst, dst_stride, filter_x,
5679  filter_y, weight0, weight1, offset0,
5680  offset1, rnd_val, 1);
5681  } else if (6 == height) {
5682  hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5683  dst, dst_stride, filter_x, filter_y,
5684  weight0, weight1, offset0, offset1, rnd_val);
5685  } else if (0 == (height % 4)) {
5686  hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5687  src1_ptr, src2_stride,
5688  dst, dst_stride, filter_x, filter_y,
5689  height, weight0,
5690  weight1, offset0, offset1, rnd_val, 8);
5691  }
5692 }
5693 
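/* 12-wide HV bi-weighted 4-tap MC, unrolled for 16 rows: the left 8
 * columns go through the 4-rows-per-iteration path (loop_cnt = 4), then the
 * right 4 columns are filtered 8 rows per iteration (loop_cnt = 2) using the
 * two-source shuffle masks mask2/mask3. */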
5694 static void hevc_hv_biwgt_4t_12w_msa(const uint8_t *src0_ptr,
5695  int32_t src_stride,
5696  const int16_t *src1_ptr,
5697  int32_t src2_stride,
5698  uint8_t *dst,
5699  int32_t dst_stride,
5700  const int8_t *filter_x,
5701  const int8_t *filter_y,
5702  int32_t height,
5703  int32_t weight0,
5704  int32_t weight1,
5705  int32_t offset0,
5706  int32_t offset1,
5707  int32_t rnd_val)
5708 {
5709  uint32_t loop_cnt;
5710  uint64_t tp0, tp1;
5711  uint32_t offset, weight;
5712  const uint8_t *src0_ptr_tmp;
5713  const int16_t *src1_ptr_tmp;
5714  uint8_t *dst_tmp;
5715  v16u8 out0, out1;
5716  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5717  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5718  v16i8 mask0, mask1, mask2, mask3;
5719  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
5720  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5721  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
5722  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
5723  v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
5724  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
5725  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5726  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5727  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5728  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5729  v4i32 offset_vec, rnd_vec, const_vec;
5730 
5731  src0_ptr -= (src_stride + 1);
5732 
5733  filter_vec = LD_SH(filter_x);
5734  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5735 
5736  filter_vec = LD_SH(filter_y);
5737  UNPCK_R_SB_SH(filter_vec, filter_vec);
5738 
5739  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5740 
5741  mask0 = LD_SB(ff_hevc_mask_arr);
5742  mask1 = mask0 + 2;
5743 
5744  offset = (offset0 + offset1) << rnd_val;
5745  weight0 = weight0 & 0x0000FFFF;
5746  weight = weight0 | (weight1 << 16);
5747 
5748  const_vec = __msa_fill_w((128 * weight1));
5749  const_vec <<= 6;
5750  offset_vec = __msa_fill_w(offset);
5751  rnd_vec = __msa_fill_w(rnd_val + 1);
5752  offset_vec += const_vec;
5753  weight_vec = (v8i16) __msa_fill_w(weight);
5754 
5755  src0_ptr_tmp = src0_ptr;
5756  dst_tmp = dst;
5757  src1_ptr_tmp = src1_ptr;
5758 
5759  LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5760  src0_ptr_tmp += (3 * src_stride);
5761 
5762  XORI_B3_128_SB(src0, src1, src2);
5763 
5764  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5765  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5766  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5767 
5768  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5769  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5770  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5771 
5772  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5773  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5774 
5775  for (loop_cnt = 4; loop_cnt--;) {
5776  LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5777  src0_ptr_tmp += (4 * src_stride);
5778  XORI_B4_128_SB(src3, src4, src5, src6);
5779 
5780  LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5781  src1_ptr_tmp += (4 * src2_stride);
5782 
5783  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5784  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5785  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5786  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5787 
5788  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5789  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5790  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5791  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5792 
5793  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5794  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5795  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5796  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5797 
5798  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5799  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5800  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5801  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5802  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5803  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5804  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5805  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5806 
5807  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5808  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5809  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5810  dst3_r, dst0, dst1, dst2, dst3);
5811  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5812  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5813  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5814  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5815  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5816  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5817  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5818  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5819  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5820  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5821  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5822  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5823  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5824  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5825  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5826  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
5827  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5828  tmp0, tmp1, tmp2, tmp3);
5829  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5830  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5831  dst_tmp += (4 * dst_stride);
5832 
5833  dst10_r = dst54_r;
5834  dst10_l = dst54_l;
5835  dst21_r = dst65_r;
5836  dst21_l = dst65_l;
5837  dsth2 = dsth6;
5838  }
5839 
5840  src0_ptr += 8;
5841  dst += 8;
5842  src1_ptr += 8;
5843 
5844  mask2 = LD_SB(ff_hevc_mask_arr + 16);
5845  mask3 = mask2 + 2;
5846 
5847  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
5848  src0_ptr += (3 * src_stride);
5849  XORI_B3_128_SB(src0, src1, src2);
5850  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
5851  VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
5852 
5853  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5854  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5855 
5856  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
5857  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
5858 
5859  for (loop_cnt = 2; loop_cnt--;) {
5860  LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
5861  src10);
5862  src0_ptr += (8 * src_stride);
5863  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
5864  VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
5865  VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
5866  VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
5867  VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
5868 
5869  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5870  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5871  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5872  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5873 
5874  dst32_r = __msa_ilvr_h(dst73, dst22);
5875  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
5876  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
5877  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
5878  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
5879  dst76_r = __msa_ilvr_h(dst22, dst106);
5880 
5881  LD2(src1_ptr, src2_stride, tp0, tp1);
5882  src1_ptr += 2 * src2_stride;
5883  INSERT_D2_SH(tp0, tp1, in0);
5884  LD2(src1_ptr, src2_stride, tp0, tp1);
5885  src1_ptr += 2 * src2_stride;
5886  INSERT_D2_SH(tp0, tp1, in1);
5887 
5888  LD2(src1_ptr, src2_stride, tp0, tp1);
5889  src1_ptr += 2 * src2_stride;
5890  INSERT_D2_SH(tp0, tp1, in2);
5891  LD2(src1_ptr, src2_stride, tp0, tp1);
5892  src1_ptr += 2 * src2_stride;
5893  INSERT_D2_SH(tp0, tp1, in3);
5894 
5895  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5896  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5897  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5898  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5899  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5900  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5901  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
5902  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
5903 
5904  SRA_4V(dst0, dst1, dst2, dst3, 6);
5905  SRA_4V(dst4, dst5, dst6, dst7, 6);
5906  PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5907  dst0, dst1, dst2, dst3);
5908  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5909  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5910  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5911  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5912  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5913  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5914  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5915  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5916  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5917  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5918  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5919  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5920  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5921  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5922  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5923  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
5924  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5925  tmp0, tmp1, tmp2, tmp3);
5926  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5927  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5928  dst += (8 * dst_stride);
5929 
5930  dst10_r = dst98_r;
5931  dst21_r = dst109_r;
5932  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
5933  }
5934 }
5935 
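/* 16-wide HV bi-weighted 4-tap MC: height 4 uses the block kernel with two
 * 8-column blocks, everything else the generic loop. */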
5936 static void hevc_hv_biwgt_4t_16w_msa(const uint8_t *src0_ptr,
5937  int32_t src_stride,
5938  const int16_t *src1_ptr,
5939  int32_t src2_stride,
5940  uint8_t *dst,
5941  int32_t dst_stride,
5942  const int8_t *filter_x,
5943  const int8_t *filter_y,
5944  int32_t height,
5945  int32_t weight0,
5946  int32_t weight1,
5947  int32_t offset0,
5948  int32_t offset1,
5949  int32_t rnd_val)
5950 {
5951  if (4 == height) {
5952  hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
5953  src2_stride, dst, dst_stride, filter_x,
5954  filter_y, weight0, weight1, offset0,
5955  offset1, rnd_val, 2);
5956  } else {
5957  hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
5958  src2_stride, dst, dst_stride,
5959  filter_x, filter_y, height, weight0,
5960  weight1, offset0, offset1, rnd_val, 16);
5961  }
5962 }
5963 
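/* 24-wide HV bi-weighted 4-tap MC: thin wrapper around the generic
 * 8-column-block loop. */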
5964 static void hevc_hv_biwgt_4t_24w_msa(const uint8_t *src0_ptr,
5965  int32_t src_stride,
5966  const int16_t *src1_ptr,
5967  int32_t src2_stride,
5968  uint8_t *dst,
5969  int32_t dst_stride,
5970  const int8_t *filter_x,
5971  const int8_t *filter_y,
5972  int32_t height,
5973  int32_t weight0,
5974  int32_t weight1,
5975  int32_t offset0,
5976  int32_t offset1,
5977  int32_t rnd_val)
5978 {
5979  hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5980  src1_ptr, src2_stride,
5981  dst, dst_stride,
5982  filter_x, filter_y, height, weight0,
5983  weight1, offset0, offset1, rnd_val, 24);
5984 }
5985 
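/* 32-wide HV bi-weighted 4-tap MC: same wrapper with width = 32. */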
5986 static void hevc_hv_biwgt_4t_32w_msa(const uint8_t *src0_ptr,
5987  int32_t src_stride,
5988  const int16_t *src1_ptr,
5989  int32_t src2_stride,
5990  uint8_t *dst,
5991  int32_t dst_stride,
5992  const int8_t *filter_x,
5993  const int8_t *filter_y,
5994  int32_t height,
5995  int32_t weight0,
5996  int32_t weight1,
5997  int32_t offset0,
5998  int32_t offset1,
5999  int32_t rnd_val)
6000 {
6001  hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
6002  src1_ptr, src2_stride,
6003  dst, dst_stride,
6004  filter_x, filter_y, height, weight0,
6005  weight1, offset0, offset1, rnd_val, 32);
6006 }
6007 
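/* Instantiate the public bi-weighted pel-copy entry points.  For 8-bit
 * content shift = 14 + 1 - 8 = 7, so log2Wd = denom + 6 is passed down as
 * the rounding amount. */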
6008 #define BI_W_MC_COPY(WIDTH) \
6009 void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
6010  ptrdiff_t dst_stride, \
6011  const uint8_t *src, \
6012  ptrdiff_t src_stride, \
6013  const int16_t *src_16bit, \
6014  int height, \
6015  int denom, \
6016  int weight0, \
6017  int weight1, \
6018  int offset0, \
6019  int offset1, \
6020  intptr_t mx, \
6021  intptr_t my, \
6022  int width) \
6023 { \
6024  int shift = 14 + 1 - 8; \
6025  int log2Wd = denom + shift - 1; \
6026  \
6027  hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
6028  dst, dst_stride, height, \
6029  weight0, weight1, offset0, \
6030  offset1, log2Wd); \
6031 }
6032 
6033 BI_W_MC_COPY(4);
6034 BI_W_MC_COPY(6);
6035 BI_W_MC_COPY(8);
6036 BI_W_MC_COPY(12);
6037 BI_W_MC_COPY(16);
6038 BI_W_MC_COPY(24);
6039 BI_W_MC_COPY(32);
6040 BI_W_MC_COPY(48);
6041 BI_W_MC_COPY(64);
6042 
6043 #undef BI_W_MC_COPY
6044 
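/* Instantiate the public one-dimensional (h or v) bi-weighted entry points;
 * FILT_DIR selects the filter phase from mx or my. */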
6045 #define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
6046 void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
6047  ptrdiff_t \
6048  dst_stride, \
6049  const uint8_t *src, \
6050  ptrdiff_t \
6051  src_stride, \
6052  const int16_t *src_16bit, \
6053  int height, \
6054  int denom, \
6055  int weight0, \
6056  int weight1, \
6057  int offset0, \
6058  int offset1, \
6059  intptr_t mx, \
6060  intptr_t my, \
6061  int width) \
6062 { \
6063  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR]; \
6064  int log2Wd = denom + 14 - 8; \
6065  \
6066  hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
6067  MAX_PB_SIZE, dst, dst_stride, \
6068  filter, height, weight0, \
6069  weight1, offset0, offset1, \
6070  log2Wd); \
6071 }
6072 
6073 BI_W_MC(qpel, h, 4, 8, hz, mx);
6074 BI_W_MC(qpel, h, 8, 8, hz, mx);
6075 BI_W_MC(qpel, h, 12, 8, hz, mx);
6076 BI_W_MC(qpel, h, 16, 8, hz, mx);
6077 BI_W_MC(qpel, h, 24, 8, hz, mx);
6078 BI_W_MC(qpel, h, 32, 8, hz, mx);
6079 BI_W_MC(qpel, h, 48, 8, hz, mx);
6080 BI_W_MC(qpel, h, 64, 8, hz, mx);
6081 
6082 BI_W_MC(qpel, v, 4, 8, vt, my);
6083 BI_W_MC(qpel, v, 8, 8, vt, my);
6084 BI_W_MC(qpel, v, 12, 8, vt, my);
6085 BI_W_MC(qpel, v, 16, 8, vt, my);
6086 BI_W_MC(qpel, v, 24, 8, vt, my);
6087 BI_W_MC(qpel, v, 32, 8, vt, my);
6088 BI_W_MC(qpel, v, 48, 8, vt, my);
6089 BI_W_MC(qpel, v, 64, 8, vt, my);
6090 
6091 BI_W_MC(epel, h, 4, 4, hz, mx);
6092 BI_W_MC(epel, h, 8, 4, hz, mx);
6093 BI_W_MC(epel, h, 6, 4, hz, mx);
6094 BI_W_MC(epel, h, 12, 4, hz, mx);
6095 BI_W_MC(epel, h, 16, 4, hz, mx);
6096 BI_W_MC(epel, h, 24, 4, hz, mx);
6097 BI_W_MC(epel, h, 32, 4, hz, mx);
6098 
6099 BI_W_MC(epel, v, 4, 4, vt, my);
6100 BI_W_MC(epel, v, 8, 4, vt, my);
6101 BI_W_MC(epel, v, 6, 4, vt, my);
6102 BI_W_MC(epel, v, 12, 4, vt, my);
6103 BI_W_MC(epel, v, 16, 4, vt, my);
6104 BI_W_MC(epel, v, 24, 4, vt, my);
6105 BI_W_MC(epel, v, 32, 4, vt, my);
6106 
6107 #undef BI_W_MC
6108 
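/* Instantiate the public 2D (hv) bi-weighted entry points with separate
 * horizontal and vertical filter phases. */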
6109 #define BI_W_MC_HV(PEL, WIDTH, TAP) \
6110 void ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
6111  ptrdiff_t dst_stride, \
6112  const uint8_t *src, \
6113  ptrdiff_t src_stride, \
6114  const int16_t *src_16bit, \
6115  int height, \
6116  int denom, \
6117  int weight0, \
6118  int weight1, \
6119  int offset0, \
6120  int offset1, \
6121  intptr_t mx, \
6122  intptr_t my, \
6123  int width) \
6124 { \
6125  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx]; \
6126  const int8_t *filter_y = ff_hevc_##PEL##_filters[my]; \
6127  int log2Wd = denom + 14 - 8; \
6128  \
6129  hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
6130  MAX_PB_SIZE, dst, dst_stride, \
6131  filter_x, filter_y, height, \
6132  weight0, weight1, offset0, \
6133  offset1, log2Wd); \
6134 }
6135 
6136 BI_W_MC_HV(qpel, 4, 8);
6137 BI_W_MC_HV(qpel, 8, 8);
6138 BI_W_MC_HV(qpel, 12, 8);
6139 BI_W_MC_HV(qpel, 16, 8);
6140 BI_W_MC_HV(qpel, 24, 8);
6141 BI_W_MC_HV(qpel, 32, 8);
6142 BI_W_MC_HV(qpel, 48, 8);
6143 BI_W_MC_HV(qpel, 64, 8);
6144 
6145 BI_W_MC_HV(epel, 4, 4);
6146 BI_W_MC_HV(epel, 8, 4);
6147 BI_W_MC_HV(epel, 6, 4);
6148 BI_W_MC_HV(epel, 12, 4);
6149 BI_W_MC_HV(epel, 16, 4);
6150 BI_W_MC_HV(epel, 24, 4);
6151 BI_W_MC_HV(epel, 32, 4);
6152 
6153 #undef BI_W_MC_HV
hevc_hv_biwgt_8t_48w_msa
static void hevc_hv_biwgt_8t_48w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2652
VSHF_B2_SB
#define VSHF_B2_SB(...)
Definition: generic_macros_msa.h:662
hevc_hz_biwgt_4t_4x8multiple_msa
static void hevc_hz_biwgt_4t_4x8multiple_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2816
LD_SB4
#define LD_SB4(...)
Definition: generic_macros_msa.h:297
hevc_hv_biwgt_4t_8multx4_msa
static void hevc_hv_biwgt_4t_8multx4_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width8mult)
Definition: hevc_mc_biw_msa.c:5230
LD_SH2
#define LD_SH2(...)
Definition: generic_macros_msa.h:280
hevc_hv_biwgt_8t_64w_msa
static void hevc_hv_biwgt_8t_64w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2674
hevc_hv_biwgt_4t_16w_msa
static void hevc_hv_biwgt_4t_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5936
hevc_hz_biwgt_4t_8x2_msa
static void hevc_hz_biwgt_4t_8x2_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2996
hevc_vt_biwgt_4t_4x4_msa
static void hevc_vt_biwgt_4t_4x4_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3627
ILVR_H2_SH
#define ILVR_H2_SH(...)
Definition: generic_macros_msa.h:1392
DPADD_SB2_SH
#define DPADD_SB2_SH(...)
Definition: generic_macros_msa.h:833
hevc_hz_biwgt_4t_8x6_msa
static void hevc_hz_biwgt_4t_8x6_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3052
LD_SH4
#define LD_SH4(...)
Definition: generic_macros_msa.h:299
out
FILE * out
Definition: movenc.c:55
SPLATI_H4_SH
#define SPLATI_H4_SH(...)
Definition: generic_macros_msa.h:1674
ILVL_B4_SH
#define ILVL_B4_SH(...)
Definition: generic_macros_msa.h:1276
hevc_hv_biwgt_8t_24w_msa
static void hevc_hv_biwgt_8t_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2608
ST_UB2
#define ST_UB2(...)
Definition: generic_macros_msa.h:363
src1
const pixel * src1
Definition: h264pred_template.c:421
hevc_hz_biwgt_8t_48w_msa
static void hevc_hz_biwgt_8t_48w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1220
PCKEV_H2_SW
#define PCKEV_H2_SW(...)
Definition: generic_macros_msa.h:1760
INSERT_W4_SH
#define INSERT_W4_SH(...)
Definition: generic_macros_msa.h:1155
HEVC_BIW_RND_CLIP4_MAX_SATU
#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, vec3, wgt, rnd, offset, out0, out1, out2, out3)
Definition: hevc_mc_biw_msa.c:72
BI_W_MC_HV
#define BI_W_MC_HV(PEL, WIDTH, TAP)
Definition: hevc_mc_biw_msa.c:6109
hevc_biwgt_copy_6w_msa
static void hevc_biwgt_copy_6w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:171
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
hevc_hz_biwgt_8t_64w_msa
static void hevc_hz_biwgt_8t_64w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1324
ff_hevc_mask_arr
static const uint8_t ff_hevc_mask_arr[16 *2]
Definition: hevc_mc_biw_msa.c:25
LD_SH
#define LD_SH(...)
Definition: generic_macros_msa.h:35
SLLI_2V
#define SLLI_2V(in0, in1, shift)
Definition: generic_macros_msa.h:1916
VSHF_B4_SB
#define VSHF_B4_SB(...)
Definition: generic_macros_msa.h:680
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
hevc_hv_biwgt_4t_4w_msa
static void hevc_hv_biwgt_4t_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4919
hevc_vt_biwgt_4t_8x6_msa
static void hevc_vt_biwgt_4t_8x6_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3975
ST_UB4
#define ST_UB4(...)
Definition: generic_macros_msa.h:374
hevc_vt_biwgt_8t_8w_msa
static void hevc_vt_biwgt_8t_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1562
SRAR_W2_SW
#define SRAR_W2_SW(...)
Definition: generic_macros_msa.h:2034
hevc_biwgt_copy_8w_msa
static void hevc_biwgt_copy_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:247
hevc_biwgt_copy_12w_msa
static void hevc_biwgt_copy_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:335
INSERT_W2_SB
#define INSERT_W2_SB(...)
Definition: generic_macros_msa.h:1144
hevc_hz_biwgt_8t_12w_msa
static void hevc_hz_biwgt_8t_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:833
XORI_B4_128_SB
#define XORI_B4_128_SB(...)
Definition: generic_macros_msa.h:1851
generic_macros_msa.h
ST_W4
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: vp8_lpf_lsx.c:234
PCKEV_H4_SW
#define PCKEV_H4_SW(...)
Definition: generic_macros_msa.h:1769
LD_SB
#define LD_SB(...)
Definition: generic_macros_msa.h:33
hevc_hv_biwgt_8t_4w_msa
static void hevc_hv_biwgt_8t_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1985
LD_SB5
#define LD_SB5(...)
Definition: generic_macros_msa.h:308
ILVL_W2_SB
#define ILVL_W2_SB(...)
Definition: generic_macros_msa.h:1319
hevc_hz_biwgt_8t_4w_msa
static void hevc_hz_biwgt_8t_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:681
hevc_hz_biwgt_8t_8w_msa
static void hevc_hz_biwgt_8t_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:753
CLIP_SW2_0_255
#define CLIP_SW2_0_255(in0, in1)
Definition: generic_macros_msa.h:972
ILVL_H2_SH
#define ILVL_H2_SH(...)
Definition: generic_macros_msa.h:1292
hevc_hv_biwgt_4t_8x2_msa
static void hevc_hv_biwgt_4t_8x2_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5134
HEVC_FILT_8TAP_SH
#define HEVC_FILT_8TAP_SH(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
Definition: hevc_macros_msa.h:24
ST_H8
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:429
hevc_vt_biwgt_4t_12w_msa
static void hevc_vt_biwgt_4t_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4151
UNPCK_R_SB_SH
#define UNPCK_R_SB_SH(in, out)
Definition: generic_macros_msa.h:2156
SRA_4V
#define SRA_4V(in0, in1, in2, in3, shift)
Definition: generic_macros_msa.h:1939
PCKEV_H4_SH
#define PCKEV_H4_SH(...)
Definition: generic_macros_msa.h:1768
HEVC_FILT_8TAP
#define HEVC_FILT_8TAP(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
Definition: hevc_macros_msa.h:35
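Per output lane, HEVC_FILT_8TAP accumulates four pairwise dot-product-add steps; ignoring the sign-bias (XORI_B*_128) adjustment the kernels apply first, each lane is an 8-tap FIR. A hedged scalar model (the function name is illustrative):

#include <stdint.h>

/* One lane of HEVC_FILT_8TAP in scalar form: an 8-tap FIR over adjacent
 * samples (horizontal) or rows (vertical). The result is a widened,
 * unrounded intermediate; weighting, rounding and clipping happen later
 * in the HEVC_BIW_RND_CLIP* macros. */
static int32_t filt_8tap_scalar(const int16_t *src, const int8_t *filt)
{
    int32_t sum = 0;
    for (int k = 0; k < 8; k++)
        sum += src[k] * filt[k];
    return sum;
}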
hevc_hz_biwgt_8t_16w_msa
static void hevc_hz_biwgt_8t_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:928
hevc_hv_biwgt_4t_4x2_msa
static void hevc_hv_biwgt_4t_4x2_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4593
hevc_hv_biwgt_8t_12w_msa
static void hevc_hv_biwgt_8t_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2335
INSERT_D2_SB
#define INSERT_D2_SB(...)
Definition: generic_macros_msa.h:1170
hevc_hv_biwgt_8t_16w_msa
static void hevc_hv_biwgt_8t_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2586
hevc_hv_biwgt_4t_8x6_msa
static void hevc_hv_biwgt_4t_8x6_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5355
hevc_macros_msa.h
ILVR_B4_SB
#define ILVR_B4_SB(...)
Definition: generic_macros_msa.h:1360
LD2
#define LD2(psrc, stride, out0, out1)
Definition: generic_macros_msa.h:223
hevc_vt_biwgt_4t_24w_msa
static void hevc_vt_biwgt_4t_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4337
ILVR_D2_SB
#define ILVR_D2_SB(...)
Definition: generic_macros_msa.h:1444
ST_SH2
#define ST_SH2(...)
Definition: generic_macros_msa.h:366
PCKEV_B2_UB
#define PCKEV_B2_UB(...)
Definition: generic_macros_msa.h:1720
XORI_B5_128_SB
#define XORI_B5_128_SB(...)
Definition: generic_macros_msa.h:1859
hevc_biwgt_copy_48w_msa
static void hevc_biwgt_copy_48w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:574
hevc_vt_biwgt_4t_4x8multiple_msa
static void hevc_vt_biwgt_4t_4x8multiple_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3694
ILVRL_H2_SH
#define ILVRL_H2_SH(...)
Definition: generic_macros_msa.h:1508
hevc_vt_biwgt_4t_4x2_msa
static void hevc_vt_biwgt_4t_4x2_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3562
INSERT_W4_SB
#define INSERT_W4_SB(...)
Definition: generic_macros_msa.h:1154
hevc_hv_biwgt_4t_24w_msa
static void hevc_hv_biwgt_4t_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5964
hevc_hz_biwgt_8t_32w_msa
static void hevc_hz_biwgt_8t_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1134
DOTP_SB3_SH
#define DOTP_SB3_SH(...)
Definition: generic_macros_msa.h:776
hevc_vt_biwgt_8t_32w_msa
static void hevc_vt_biwgt_8t_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1922
hevc_vt_biwgt_8t_16multx2mult_msa
static void hevc_vt_biwgt_8t_16multx2mult_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width)
Definition: hevc_mc_biw_msa.c:1760
hevc_vt_biwgt_8t_4w_msa
static void hevc_vt_biwgt_8t_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1423
ST_W2
#define ST_W2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:450
hevc_hz_biwgt_4t_6w_msa
static void hevc_hz_biwgt_4t_6w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2922
hevc_hz_biwgt_4t_24w_msa
static void hevc_hz_biwgt_4t_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3402
hevc_hv_biwgt_4t_4multx8mult_msa
static void hevc_hv_biwgt_4t_4multx8mult_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4774
ILVR_D3_SB
#define ILVR_D3_SB(...)
Definition: generic_macros_msa.h:1452
ILVR_D4_SB
#define ILVR_D4_SB(...)
Definition: generic_macros_msa.h:1460
CLIP_SW4_0_255
#define CLIP_SW4_0_255(in0, in1, in2, in3)
Definition: generic_macros_msa.h:978
hevcdsp_mips.h
SLLI_4V
#define SLLI_4V(in0, in1, in2, in3, shift)
Definition: generic_macros_msa.h:1921
LD_SB7
#define LD_SB7(...)
Definition: generic_macros_msa.h:327
BI_W_MC_COPY
#define BI_W_MC_COPY(WIDTH)
Definition: hevc_mc_biw_msa.c:6008
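BI_W_MC_COPY(WIDTH) stamps out the no-filter entry points that feed the hevc_biwgt_copy_*w_msa kernels. The underlying per-pixel math is HEVC explicit bi-weighted prediction, with the pixel source promoted to the 14-bit intermediate range by << 6. A simplified 8-bit scalar model, following the HEVC spec formula rather than the file's vectorized constant folding:

#include <stdint.h>

/* Spec-level scalar model of 8-bit bi-weighted prediction: p0 is the
 * pixel the copy path promotes with << 6, p1 the co-located 16-bit
 * sample from src1_ptr; log2Wd folds the weight denominator together
 * with the bit-depth shift. */
static uint8_t bi_weight_pixel(uint8_t p0, int16_t p1,
                               int w0, int w1, int o0, int o1, int log2Wd)
{
    int32_t sum = (p0 << 6) * w0 + p1 * w1 + ((o0 + o1 + 1) << log2Wd);
    int32_t out = sum >> (log2Wd + 1);
    return out < 0 ? 0 : out > 255 ? 255 : (uint8_t) out;
}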
XORI_B8_128_SB
#define XORI_B8_128_SB(...)
Definition: generic_macros_msa.h:1880
ILVR_B2_SB
#define ILVR_B2_SB(...)
Definition: generic_macros_msa.h:1338
XORI_B2_128_SB
#define XORI_B2_128_SB(...)
Definition: generic_macros_msa.h:1835
hevc_hz_biwgt_4t_32w_msa
static void hevc_hz_biwgt_4t_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3489
hevc_hv_biwgt_4t_4x4_msa
static void hevc_hv_biwgt_4t_4x4_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4677
hevc_vt_biwgt_8t_16w_msa
static void hevc_vt_biwgt_8t_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1876
hevc_hz_biwgt_4t_4x4_msa
static void hevc_hz_biwgt_4t_4x4_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2756
LW4
#define LW4(psrc, stride, out0, out1, out2, out3)
Definition: generic_macros_msa.h:202
ST_D2
#define ST_D2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:491
SPLATI_H2_SH
#define SPLATI_H2_SH(...)
Definition: generic_macros_msa.h:1656
hevc_vt_biwgt_8t_12w_msa
static void hevc_vt_biwgt_8t_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1650
LD_SB6
#define LD_SB6(...)
Definition: generic_macros_msa.h:316
hevc_vt_biwgt_8t_64w_msa
static void hevc_vt_biwgt_8t_64w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1964
SPLATI_W4_SH
#define SPLATI_W4_SH(...)
Definition: generic_macros_msa.h:1700
ILVRL_H2_SW
#define ILVRL_H2_SW(...)
Definition: generic_macros_msa.h:1509
HEVC_FILT_4TAP_SH
#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)
Definition: hevc_macros_msa.h:46
XORI_B6_128_SB
#define XORI_B6_128_SB(...)
Definition: generic_macros_msa.h:1866
PCKEV_B2_SH
#define PCKEV_B2_SH(...)
Definition: generic_macros_msa.h:1721
hevc_biwgt_copy_24w_msa
static void hevc_biwgt_copy_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:449
HEVC_BIW_RND_CLIP4
#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3, wgt, rnd, offset, out0, out1, out2, out3)
Definition: hevc_mc_biw_msa.c:49
LD_SH8
#define LD_SH8(...)
Definition: generic_macros_msa.h:338
hevc_vt_biwgt_8t_24w_msa
static void hevc_vt_biwgt_8t_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1897
hevc_hz_biwgt_4t_8w_msa
static void hevc_hz_biwgt_4t_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3196
CLIP_SH_0_255
#define CLIP_SH_0_255(in)
Definition: generic_macros_msa.h:935
ST_W8
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:470
PCKEV_B3_UB
#define PCKEV_B3_UB(...)
Definition: generic_macros_msa.h:1729
LD4
#define LD4(psrc, stride, out0, out1, out2, out3)
Definition: generic_macros_msa.h:228
ILVL_B2_SB
#define ILVL_B2_SB(...)
Definition: generic_macros_msa.h:1263
hevc_hv_biwgt_8t_8w_msa
static void hevc_hv_biwgt_8t_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2313
DPADD_SB4_SH
#define DPADD_SB4_SH(...)
Definition: generic_macros_msa.h:841
ST_H2
#define ST_H2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:409
SPLATI_W2_SH
#define SPLATI_W2_SH(...)
Definition: generic_macros_msa.h:1692
hevc_vt_biwgt_8t_48w_msa
static void hevc_vt_biwgt_8t_48w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1943
hevc_vt_biwgt_4t_8x4multiple_msa
static void hevc_vt_biwgt_4t_8x4multiple_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4046
hevc_vt_biwgt_4t_32w_msa
static void hevc_vt_biwgt_4t_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4474
HEVC_BIW_RND_CLIP2
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1)
Definition: hevc_mc_biw_msa.c:31
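A scalar model of one 32-bit lane of HEVC_BIW_RND_CLIP2: the interleave plus __msa_dpadd_s_w pairs each filtered sample with its second-source sample and applies the two packed weights on top of the preloaded offset; SRAR_W4_SW then rounds and shifts, CLIP_SW4_0_255 clamps to the 8-bit range, and PCKEV_H2_SH packs the lanes back to halfwords. The sketch assumes rnd > 0, as at these call sites:

#include <stdint.h>

/* One lane of HEVC_BIW_RND_CLIP2 in scalar form. */
static int16_t biw_rnd_clip_lane(int16_t in, int16_t vec,
                                 int16_t w_in, int16_t w_vec,
                                 int32_t offset, int rnd)
{
    int32_t acc = offset + in * w_in + vec * w_vec; /* __msa_dpadd_s_w */
    acc = (acc + (1 << (rnd - 1))) >> rnd;          /* SRAR_W4_SW */
    if (acc < 0)   acc = 0;                         /* CLIP_SW4_0_255 */
    if (acc > 255) acc = 255;
    return (int16_t) acc;                           /* PCKEV_H2_SH */
}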
LD_SB3
#define LD_SB3(...)
Definition: generic_macros_msa.h:289
hevc_hz_biwgt_4t_8x4multiple_msa
static void hevc_hz_biwgt_4t_8x4multiple_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3126
ILVL_H4_SH
#define ILVL_H4_SH(...)
Definition: generic_macros_msa.h:1301
HEVC_BIW_RND_CLIP2_MAX_SATU
#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1)
Definition: hevc_mc_biw_msa.c:56
ST_UB
#define ST_UB(...)
Definition: generic_macros_msa.h:40
hevc_vt_biwgt_4t_8w_msa
static void hevc_vt_biwgt_4t_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4120
hevc_biwgt_copy_4w_msa
static void hevc_biwgt_copy_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:82
hevc_hz_biwgt_4t_12w_msa
static void hevc_hz_biwgt_4t_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3227
BI_W_MC
#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
Definition: hevc_mc_biw_msa.c:6045
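BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) generates one exported ff_hevc_put_hevc_bi_w_* wrapper per pel/direction/width combination. A hedged sketch of the wrapper's shape, using the 16-wide horizontal 8-tap case as an example; the function name and the way the taps are obtained are illustrative, not copied from the macro, while the kernel call matches the signature indexed above:

/* Illustrative wrapper shape: "taps" stands in for the FILT_DIR-selected
 * filter-table lookup; everything else is forwarded to the
 * width-specific static kernel. */
static void bi_w_qpel_h16_example(uint8_t *dst, int32_t dst_stride,
                                  const uint8_t *src, int32_t src_stride,
                                  const int16_t *src2, int32_t src2_stride,
                                  const int8_t *taps, int32_t height,
                                  int32_t w0, int32_t w1,
                                  int32_t o0, int32_t o1, int32_t rnd_val)
{
    hevc_hz_biwgt_8t_16w_msa(src, src_stride, src2, src2_stride,
                             dst, dst_stride, taps, height,
                             w0, w1, o0, o1, rnd_val);
}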
ILVR_D2_SH
#define ILVR_D2_SH(...)
Definition: generic_macros_msa.h:1445
hevc_hv_biwgt_4t_8w_msa
static void hevc_hv_biwgt_4t_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5657
hevc_vt_biwgt_4t_4w_msa
static void hevc_vt_biwgt_4t_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3780
ST_D4
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:499
DOTP_SB4_SH
#define DOTP_SB4_SH(...)
Definition: generic_macros_msa.h:784
ILVL_B4_SB
#define ILVL_B4_SB(...)
Definition: generic_macros_msa.h:1274
hevc_hv_biwgt_4t_32w_msa
static void hevc_hv_biwgt_4t_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5986
LD_SB8
#define LD_SB8(...)
Definition: generic_macros_msa.h:336
ILVR_B4_SH
#define ILVR_B4_SH(...)
Definition: generic_macros_msa.h:1362
hevc_biwgt_copy_16w_msa
static void hevc_biwgt_copy_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:394
LD_SH6
#define LD_SH6(...)
Definition: generic_macros_msa.h:318
hevc_hz_biwgt_4t_4x2_msa
static void hevc_hz_biwgt_4t_4x2_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2696
hevc_vt_biwgt_4t_16w_msa
static void hevc_vt_biwgt_4t_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4243
hevc_vt_biwgt_4t_6w_msa
static void hevc_vt_biwgt_4t_6w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3811
hevc_hv_biwgt_4t_6w_msa
static void hevc_hv_biwgt_4t_6w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4951
XORI_B7_128_SB
#define XORI_B7_128_SB(...)
Definition: generic_macros_msa.h:1873
hevc_hz_biwgt_8t_24w_msa
static void hevc_hz_biwgt_8t_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1012
ILVRL_B2_SH
#define ILVRL_B2_SH(...)
Definition: generic_macros_msa.h:1498
ST_SH
#define ST_SH(...)
Definition: generic_macros_msa.h:43
hevc_vt_biwgt_4t_8x2_msa
static void hevc_vt_biwgt_4t_8x2_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3918
HEVC_FILT_4TAP
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
Definition: hevc_macros_msa.h:64
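HEVC_FILT_4TAP is the chroma counterpart of the 8-tap macro: two dot-product-add steps per lane instead of four. A hedged scalar form (the function name is illustrative):

#include <stdint.h>

/* One lane of HEVC_FILT_4TAP in scalar form: the 4-tap chroma FIR.
 * The _SH variant keeps a 16-bit result; this variant widens to 32 bits. */
static int32_t filt_4tap_scalar(const int16_t *src, const int8_t *filt)
{
    return src[0] * filt[0] + src[1] * filt[1]
         + src[2] * filt[2] + src[3] * filt[3];
}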
hevc_hz_biwgt_4t_16w_msa
static void hevc_hz_biwgt_4t_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3313
ILVR_H4_SH
#define ILVR_H4_SH(...)
Definition: generic_macros_msa.h:1408
hevc_hz_biwgt_4t_4w_msa
static void hevc_hz_biwgt_4t_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2891
ILVR_B2_SH
#define ILVR_B2_SH(...)
Definition: generic_macros_msa.h:1340
hevc_hv_biwgt_8t_8multx2mult_msa
static void hevc_hv_biwgt_8t_8multx2mult_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width8mult)
Definition: hevc_mc_biw_msa.c:2130
PCKEV_D2_SH
#define PCKEV_D2_SH(...)
Definition: generic_macros_msa.h:1789
INSERT_D2_SH
#define INSERT_D2_SH(...)
Definition: generic_macros_msa.h:1171
PCKEV_H2_SH
#define PCKEV_H2_SH(...)
Definition: generic_macros_msa.h:1759
XORI_B3_128_SB
#define XORI_B3_128_SB(...)
Definition: generic_macros_msa.h:1843
LD_SB2
#define LD_SB2(...)
Definition: generic_macros_msa.h:278
hevc_hv_biwgt_4t_8multx4mult_msa
static void hevc_hv_biwgt_4t_8multx4mult_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width)
Definition: hevc_mc_biw_msa.c:5506
hevc_hv_biwgt_8t_32w_msa
static void hevc_hv_biwgt_8t_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2630
hevc_hv_biwgt_4t_12w_msa
static void hevc_hv_biwgt_4t_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5694
hevc_biwgt_copy_64w_msa
static void hevc_biwgt_copy_64w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:626
SRAR_W4_SW
#define SRAR_W4_SW(...)
Definition: generic_macros_msa.h:2041
LW2
#define LW2(psrc, stride, out0, out1)
Definition: generic_macros_msa.h:210
hevc_biwgt_copy_32w_msa
static void hevc_biwgt_copy_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:513