FFmpeg
hevc_mc_biw_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
24 
/*
 * Byte-shuffle index table made of two 16-entry rows.
 *
 * Row 0 (offset 0) is the mask loaded by the 8-pixel-wide horizontal filter
 * paths in this file; row 1 (offset 16) is the mask loaded by the
 * 4-pixel-wide paths. The entries >= 16 in row 1 presumably select bytes
 * from the second shuffle source vector -- confirm against the VSHF_B
 * macro semantics.
 */
static const uint8_t ff_hevc_mask_arr[2 * 16] __attribute__((aligned(64))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases (row loaded via &ff_hevc_mask_arr[16]) */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20
};
30 
/*
 * Weighted bi-prediction core for two pairs of v8i16 vectors.
 *
 * For each pair (in0, vec0) and (in1, vec1) the halfwords are interleaved,
 * combined with the packed weights 'wgt' by __msa_dpadd_s_w on top of the
 * 32-bit accumulator 'offset', passed through SRAR_W4_SW with 'rnd',
 * through CLIP_SW4_0_255, and finally packed back to halfwords in
 * out0 / out1.
 *
 * out0 / out1 are only written by the last step, so they may name the same
 * variables as the inputs.
 */
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,          \
                           out0, out1)                                      \
{                                                                           \
    v4i32 wsum0_r, wsum0_l, wsum1_r, wsum1_l;                               \
                                                                            \
    /* right (low) and left (high) halves of each interleaved pair */       \
    ILVR_H2_SW(in0, vec0, in1, vec1, wsum0_r, wsum1_r);                     \
    ILVL_H2_SW(in0, vec0, in1, vec1, wsum0_l, wsum1_l);                     \
                                                                            \
    /* offset + signed dot product of each halfword pair with wgt */        \
    wsum0_r = __msa_dpadd_s_w(offset, (v8i16) wsum0_r, (v8i16) wgt);        \
    wsum0_l = __msa_dpadd_s_w(offset, (v8i16) wsum0_l, (v8i16) wgt);        \
    wsum1_r = __msa_dpadd_s_w(offset, (v8i16) wsum1_r, (v8i16) wgt);        \
    wsum1_l = __msa_dpadd_s_w(offset, (v8i16) wsum1_l, (v8i16) wgt);        \
                                                                            \
    SRAR_W4_SW(wsum0_r, wsum1_r, wsum0_l, wsum1_l, rnd);                    \
    CLIP_SW4_0_255(wsum0_l, wsum0_r, wsum1_l, wsum1_r);                     \
    PCKEV_H2_SH(wsum0_l, wsum0_r, wsum1_l, wsum1_r, out0, out1);            \
}
48 
/*
 * Four-pair variant of HEVC_BIW_RND_CLIP2: applies it to
 * (in0, in1, vec0, vec1) -> (out0, out1) and then to
 * (in2, in3, vec2, vec3) -> (out2, out3), sharing wgt, rnd and offset.
 */
#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3,                              \
                           vec0, vec1, vec2, vec3,                          \
                           wgt, rnd, offset,                                \
                           out0, out1, out2, out3)                          \
{                                                                           \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1,                                \
                       wgt, rnd, offset, out0, out1);                       \
    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3,                                \
                       wgt, rnd, offset, out2, out3);                       \
}
55 
/*
 * Weighted bi-prediction core for two pairs of v8i16 vectors
 * ("MAX_SATU" flavour).
 *
 * Performs the same sequence as HEVC_BIW_RND_CLIP2: interleave in* with
 * vec*, accumulate the weighted pair onto 'offset' with __msa_dpadd_s_w,
 * SRAR_W4_SW by 'rnd', CLIP_SW4_0_255, then pack to halfwords in
 * out0 / out1. The only textual difference is the argument order handed to
 * CLIP_SW4_0_255.
 */
#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,         \
                                    offset, out0, out1)                     \
{                                                                           \
    v4i32 wsat0_r, wsat0_l, wsat1_r, wsat1_l;                               \
                                                                            \
    /* right (low) and left (high) halves of each interleaved pair */       \
    ILVR_H2_SW(in0, vec0, in1, vec1, wsat0_r, wsat1_r);                     \
    ILVL_H2_SW(in0, vec0, in1, vec1, wsat0_l, wsat1_l);                     \
                                                                            \
    /* offset + signed dot product of each halfword pair with wgt */        \
    wsat0_r = __msa_dpadd_s_w(offset, (v8i16) wsat0_r, (v8i16) wgt);        \
    wsat0_l = __msa_dpadd_s_w(offset, (v8i16) wsat0_l, (v8i16) wgt);        \
    wsat1_r = __msa_dpadd_s_w(offset, (v8i16) wsat1_r, (v8i16) wgt);        \
    wsat1_l = __msa_dpadd_s_w(offset, (v8i16) wsat1_l, (v8i16) wgt);        \
                                                                            \
    SRAR_W4_SW(wsat0_r, wsat1_r, wsat0_l, wsat1_l, rnd);                    \
    CLIP_SW4_0_255(wsat0_r, wsat1_r, wsat0_l, wsat1_l);                     \
    PCKEV_H2_SH(wsat0_l, wsat0_r, wsat1_l, wsat1_r, out0, out1);            \
}
71 
/*
 * Four-pair variant of HEVC_BIW_RND_CLIP2_MAX_SATU: applies it to
 * (in0, in1, vec0, vec1) -> (out0, out1) and then to
 * (in2, in3, vec2, vec3) -> (out2, out3), sharing wgt, rnd and offset.
 */
#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3,                     \
                                    vec0, vec1, vec2, vec3,                 \
                                    wgt, rnd, offset,                       \
                                    out0, out1, out2, out3)                 \
{                                                                           \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1,                       \
                                wgt, rnd, offset, out0, out1);              \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3,                       \
                                wgt, rnd, offset, out2, out3);              \
}
81 
82 static void hevc_biwgt_copy_4w_msa(const uint8_t *src0_ptr,
83  int32_t src_stride,
84  const int16_t *src1_ptr,
85  int32_t src2_stride,
86  uint8_t *dst,
87  int32_t dst_stride,
89  int32_t weight0,
90  int32_t weight1,
91  int32_t offset0,
92  int32_t offset1,
93  int32_t rnd_val)
94 {
95  uint32_t loop_cnt, tp0, tp1, tp2, tp3;
96  uint64_t tpd0, tpd1, tpd2, tpd3;
98  v16u8 out0, out1;
99  v16i8 zero = { 0 };
100  v16i8 src0 = { 0 }, src1 = { 0 };
101  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
102  v8i16 dst0, dst1, dst2, dst3, weight_vec;
103  v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;
104 
105  offset = (offset0 + offset1) << rnd_val;
106  weight0 = weight0 & 0x0000FFFF;
107  weight = weight0 | (weight1 << 16);
108 
109  offset_vec = __msa_fill_w(offset);
110  weight_vec = (v8i16) __msa_fill_w(weight);
111  rnd_vec = __msa_fill_w(rnd_val + 1);
112 
113  if (2 == height) {
114  LW2(src0_ptr, src_stride, tp0, tp1);
115  INSERT_W2_SB(tp0, tp1, src0);
116  LD2(src1_ptr, src2_stride, tpd0, tpd1);
117  INSERT_D2_SH(tpd0, tpd1, in0);
118 
119  dst0 = (v8i16) __msa_ilvr_b(zero, src0);
120  dst0 <<= 6;
121 
122  ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
123  dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
124  dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
125  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
126  CLIP_SW2_0_255(dst0_r, dst0_l);
127  dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
128  out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
129  ST_W2(out0, 0, 1, dst, dst_stride);
130  } else if (4 == height) {
131  LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
132  INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
133  LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
134  INSERT_D2_SH(tpd0, tpd1, in0);
135  INSERT_D2_SH(tpd2, tpd3, in1);
136  ILVRL_B2_SH(zero, src0, dst0, dst1);
137  SLLI_2V(dst0, dst1, 6);
138  HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
139  offset_vec, dst0, dst1);
140  out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
141  ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
142  } else if (0 == height % 8) {
143  for (loop_cnt = (height >> 3); loop_cnt--;) {
144  LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
145  src0_ptr += 4 * src_stride;
146  INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
147  LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
148  src0_ptr += 4 * src_stride;
149  INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
150  LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
151  src1_ptr += (4 * src2_stride);
152  INSERT_D2_SH(tpd0, tpd1, in0);
153  INSERT_D2_SH(tpd2, tpd3, in1);
154  LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
155  src1_ptr += (4 * src2_stride);
156  INSERT_D2_SH(tpd0, tpd1, in2);
157  INSERT_D2_SH(tpd2, tpd3, in3);
158  ILVRL_B2_SH(zero, src0, dst0, dst1);
159  ILVRL_B2_SH(zero, src1, dst2, dst3);
160  SLLI_4V(dst0, dst1, dst2, dst3, 6);
161  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
162  in3, weight_vec, rnd_vec, offset_vec,
163  dst0, dst1, dst2, dst3);
164  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
165  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
166  dst += (8 * dst_stride);
167  }
168  }
169 }
170 
171 static void hevc_biwgt_copy_6w_msa(const uint8_t *src0_ptr,
172  int32_t src_stride,
173  const int16_t *src1_ptr,
174  int32_t src2_stride,
175  uint8_t *dst,
176  int32_t dst_stride,
177  int32_t height,
178  int32_t weight0,
179  int32_t weight1,
180  int32_t offset0,
181  int32_t offset1,
182  int32_t rnd_val)
183 {
184  uint32_t loop_cnt;
185  int32_t res = height & 0x03;
187  uint64_t tp0, tp1, tp2, tp3;
188  v16u8 out0, out1;
189  v16i8 zero = { 0 };
190  v16i8 src0 = { 0 }, src1 = { 0 };
191  v8i16 in0, in1, in2, in3;
192  v8i16 dst0, dst1, dst2, dst3;
193  v4i32 offset_vec, weight_vec, rnd_vec;
194 
195  offset = (offset0 + offset1) << rnd_val;
196  weight0 = weight0 & 0x0000FFFF;
197  weight = weight0 | (weight1 << 16);
198 
199  weight_vec = __msa_fill_w(weight);
200  offset_vec = __msa_fill_w(offset);
201  rnd_vec = __msa_fill_w(rnd_val + 1);
202 
203  for (loop_cnt = (height >> 2); loop_cnt--;) {
204  LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
205  src0_ptr += (4 * src_stride);
206  INSERT_D2_SB(tp0, tp1, src0);
207  INSERT_D2_SB(tp2, tp3, src1);
208  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
209  src1_ptr += (4 * src2_stride);
210  ILVRL_B2_SH(zero, src0, dst0, dst1);
211  ILVRL_B2_SH(zero, src1, dst2, dst3);
212  SLLI_4V(dst0, dst1, dst2, dst3, 6);
213  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
214  in0, in1, in2, in3,
215  weight_vec, rnd_vec, offset_vec,
216  dst0, dst1, dst2, dst3);
217  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
218  ST_W2(out0, 0, 2, dst, dst_stride);
219  ST_H2(out0, 2, 6, dst + 4, dst_stride);
220  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
221  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
222  dst += (4 * dst_stride);
223  }
224  if (res) {
225  LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
226  src0_ptr += (4 * src_stride);
227  INSERT_D2_SB(tp0, tp1, src0);
228  INSERT_D2_SB(tp2, tp3, src1);
229  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
230  src1_ptr += (4 * src2_stride);
231  ILVRL_B2_SH(zero, src0, dst0, dst1);
232  ILVRL_B2_SH(zero, src1, dst2, dst3);
233  SLLI_4V(dst0, dst1, dst2, dst3, 6);
234  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
235  in0, in1, in2, in3,
236  weight_vec, rnd_vec, offset_vec,
237  dst0, dst1, dst2, dst3);
238 
239  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
240  ST_W2(out0, 0, 2, dst, dst_stride);
241  ST_H2(out0, 2, 6, dst + 4, dst_stride);
242  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
243  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
244  }
245 }
246 
247 static void hevc_biwgt_copy_8w_msa(const uint8_t *src0_ptr,
248  int32_t src_stride,
249  const int16_t *src1_ptr,
250  int32_t src2_stride,
251  uint8_t *dst,
252  int32_t dst_stride,
253  int32_t height,
254  int32_t weight0,
255  int32_t weight1,
256  int32_t offset0,
257  int32_t offset1,
258  int32_t rnd_val)
259 {
260  uint64_t tp0, tp1, tp2, tp3;
262  v16u8 out0, out1, out2;
263  v16i8 zero = { 0 };
264  v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 };
265  v8i16 in0, in1, in2, in3, in4, in5;
266  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
267  v4i32 offset_vec, weight_vec, rnd_vec;
268 
269  offset = (offset0 + offset1) << rnd_val;
270  weight0 = weight0 & 0x0000FFFF;
271  weight = weight0 | (weight1 << 16);
272 
273  offset_vec = __msa_fill_w(offset);
274  weight_vec = __msa_fill_w(weight);
275  rnd_vec = __msa_fill_w(rnd_val + 1);
276 
277  if (2 == height) {
278  LD2(src0_ptr, src_stride, tp0, tp1);
279  INSERT_D2_SB(tp0, tp1, src0);
280  LD_SH2(src1_ptr, src2_stride, in0, in1);
281  ILVRL_B2_SH(zero, src0, dst0, dst1);
282  SLLI_2V(dst0, dst1, 6);
283 
284  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
285  weight_vec, rnd_vec, offset_vec,
286  dst0, dst1);
287 
288  out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
289  ST_D2(out0, 0, 1, dst, dst_stride);
290  } else if (6 == height) {
291  LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
292  src0_ptr += 4 * src_stride;
293  INSERT_D2_SB(tp0, tp1, src0);
294  INSERT_D2_SB(tp2, tp3, src1);
295  LD2(src0_ptr, src_stride, tp0, tp1);
296  INSERT_D2_SB(tp0, tp1, src2);
297  ILVRL_B2_SH(zero, src0, dst0, dst1);
298  ILVRL_B2_SH(zero, src1, dst2, dst3);
299  ILVRL_B2_SH(zero, src2, dst4, dst5);
300  LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
301  SLLI_4V(dst0, dst1, dst2, dst3, 6);
302  SLLI_2V(dst4, dst5, 6);
303  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
304  weight_vec, rnd_vec, offset_vec, dst0, dst1,
305  dst2, dst3);
306  HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
307  offset_vec, dst4, dst5);
308  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
309  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
310  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
311  } else if (0 == height % 4) {
312  uint32_t loop_cnt;
313 
314  for (loop_cnt = (height >> 2); loop_cnt--;) {
315  LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
316  src0_ptr += (4 * src_stride);
317  INSERT_D2_SB(tp0, tp1, src0);
318  INSERT_D2_SB(tp2, tp3, src1);
319  ILVRL_B2_SH(zero, src0, dst0, dst1);
320  ILVRL_B2_SH(zero, src1, dst2, dst3);
321  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
322  src1_ptr += (4 * src2_stride);
323 
324  SLLI_4V(dst0, dst1, dst2, dst3, 6);
325  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
326  in3, weight_vec, rnd_vec, offset_vec,
327  dst0, dst1, dst2, dst3);
328  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
329  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
330  dst += (4 * dst_stride);
331  }
332  }
333 }
334 
335 static void hevc_biwgt_copy_12w_msa(const uint8_t *src0_ptr,
336  int32_t src_stride,
337  const int16_t *src1_ptr,
338  int32_t src2_stride,
339  uint8_t *dst,
340  int32_t dst_stride,
341  int32_t height,
342  int32_t weight0,
343  int32_t weight1,
344  int32_t offset0,
345  int32_t offset1,
346  int32_t rnd_val)
347 {
348  uint32_t loop_cnt;
350  v16i8 zero = { 0 };
351  v16u8 out0, out1, out2;
352  v16i8 src0, src1, src2, src3;
353  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
354  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
355  v4i32 offset_vec, weight_vec, rnd_vec;
356 
357  offset = (offset0 + offset1) << rnd_val;
358  weight0 = weight0 & 0x0000FFFF;
359  weight = weight0 | (weight1 << 16);
360 
361  offset_vec = __msa_fill_w(offset);
362  weight_vec = __msa_fill_w(weight);
363  rnd_vec = __msa_fill_w(rnd_val + 1);
364 
365  for (loop_cnt = (height >> 2); loop_cnt--;) {
366  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
367  src0_ptr += (4 * src_stride);
368  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
369  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
370  src1_ptr += (4 * src2_stride);
371 
372  ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
373  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
374  dst0, dst1, dst2, dst3);
375 
376  SLLI_4V(dst0, dst1, dst2, dst3, 6);
377  ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
378  ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
379 
380  dst4 <<= 6;
381  dst5 <<= 6;
382  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
383  weight_vec, rnd_vec, offset_vec, dst0, dst1,
384  dst2, dst3);
385  HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
386  offset_vec, dst4, dst5);
387  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
388  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
389  ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
390  dst += (4 * dst_stride);
391  }
392 }
393 
394 static void hevc_biwgt_copy_16w_msa(const uint8_t *src0_ptr,
395  int32_t src_stride,
396  const int16_t *src1_ptr,
397  int32_t src2_stride,
398  uint8_t *dst,
399  int32_t dst_stride,
400  int32_t height,
401  int32_t weight0,
402  int32_t weight1,
403  int32_t offset0,
404  int32_t offset1,
405  int32_t rnd_val)
406 {
407  uint32_t loop_cnt;
409  v16u8 out0, out1, out2, out3;
410  v16i8 zero = { 0 };
411  v16i8 src0, src1, src2, src3;
412  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
413  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
414  v4i32 offset_vec, weight_vec, rnd_vec;
415 
416  offset = (offset0 + offset1) << rnd_val;
417  weight0 = weight0 & 0x0000FFFF;
418  weight = weight0 | (weight1 << 16);
419 
420  offset_vec = __msa_fill_w(offset);
421  weight_vec = __msa_fill_w(weight);
422  rnd_vec = __msa_fill_w(rnd_val + 1);
423 
424  for (loop_cnt = (height >> 2); loop_cnt--;) {
425  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
426  src0_ptr += (4 * src_stride);
427  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
428  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
429  src1_ptr += (4 * src2_stride);
430  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
431  tmp2, tmp3);
432  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
433  tmp6, tmp7);
434  SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
435  SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
436  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5,
437  weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
438  tmp4, tmp5);
439  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7,
440  weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
441  tmp6, tmp7);
442  PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
443  PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
444  ST_UB4(out0, out1, out2, out3, dst, dst_stride);
445  dst += (4 * dst_stride);
446  }
447 }
448 
449 static void hevc_biwgt_copy_24w_msa(const uint8_t *src0_ptr,
450  int32_t src_stride,
451  const int16_t *src1_ptr,
452  int32_t src2_stride,
453  uint8_t *dst,
454  int32_t dst_stride,
455  int32_t height,
456  int32_t weight0,
457  int32_t weight1,
458  int32_t offset0,
459  int32_t offset1,
460  int32_t rnd_val)
461 {
462  uint32_t loop_cnt;
464  v16u8 out0, out1, out2, out3, out4, out5;
465  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
466  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
467  v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
468  v4i32 offset_vec, weight_vec, rnd_vec;
469 
470  offset = (offset0 + offset1) << rnd_val;
471  weight0 = weight0 & 0x0000FFFF;
472  weight = weight0 | (weight1 << 16);
473 
474  offset_vec = __msa_fill_w(offset);
475  weight_vec = __msa_fill_w(weight);
476  rnd_vec = __msa_fill_w(rnd_val + 1);
477 
478  for (loop_cnt = 8; loop_cnt--;) {
479  LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
480  LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
481  src0_ptr += (4 * src_stride);
482  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
483  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
484  LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
485  src1_ptr += (4 * src2_stride);
486 
487  ILVRL_B2_SH(zero, src0, dst0, dst1);
488  ILVRL_B2_SH(zero, src1, dst2, dst3);
489  ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
490  ILVRL_B2_SH(zero, src4, dst6, dst7);
491  ILVRL_B2_SH(zero, src5, dst8, dst9);
492  ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
493  SLLI_4V(dst0, dst1, dst2, dst3, 6);
494  SLLI_4V(dst4, dst5, dst6, dst7, 6);
495  SLLI_4V(dst8, dst9, dst10, dst11, 6);
496  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5,
497  weight_vec, rnd_vec, offset_vec, dst0, dst1,
498  dst2, dst3);
499  HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6,
500  weight_vec, rnd_vec, offset_vec, dst4, dst5,
501  dst6, dst7);
502  HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10,
503  in11, weight_vec, rnd_vec, offset_vec,
504  dst8, dst9, dst10, dst11);
505  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
506  PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
507  ST_UB4(out0, out1, out3, out4, dst, dst_stride);
508  ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
509  dst += (4 * dst_stride);
510  }
511 }
512 
513 static void hevc_biwgt_copy_32w_msa(const uint8_t *src0_ptr,
514  int32_t src_stride,
515  const int16_t *src1_ptr,
516  int32_t src2_stride,
517  uint8_t *dst,
518  int32_t dst_stride,
519  int32_t height,
520  int32_t weight0,
521  int32_t weight1,
522  int32_t offset0,
523  int32_t offset1,
524  int32_t rnd_val)
525 {
526  uint32_t loop_cnt;
528  v16u8 out0, out1, out2, out3;
529  v16i8 zero = { 0 };
530  v16i8 src0, src1, src2, src3;
531  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
532  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
533  v4i32 offset_vec, weight_vec, rnd_vec;
534 
535  offset = (offset0 + offset1) << rnd_val;
536  weight0 = weight0 & 0x0000FFFF;
537  weight = weight0 | (weight1 << 16);
538 
539  offset_vec = __msa_fill_w(offset);
540  weight_vec = __msa_fill_w(weight);
541  rnd_vec = __msa_fill_w(rnd_val + 1);
542 
543  for (loop_cnt = (height >> 1); loop_cnt--;) {
544  LD_SB2(src0_ptr, 16, src0, src1);
545  src0_ptr += src_stride;
546  LD_SB2(src0_ptr, 16, src2, src3);
547  src0_ptr += src_stride;
548  LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
549  src1_ptr += src2_stride;
550  LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
551  src1_ptr += src2_stride;
552 
553  ILVRL_B2_SH(zero, src0, tmp0, tmp4);
554  ILVRL_B2_SH(zero, src1, tmp1, tmp5);
555  ILVRL_B2_SH(zero, src2, tmp2, tmp6);
556  ILVRL_B2_SH(zero, src3, tmp3, tmp7);
557  SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
558  SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
559  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
560  weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
561  tmp1, tmp5);
562  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
563  weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
564  tmp3, tmp7);
565  PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
566  PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
567  ST_UB2(out0, out1, dst, 16);
568  dst += dst_stride;
569  ST_UB2(out2, out3, dst, 16);
570  dst += dst_stride;
571  }
572 }
573 
574 static void hevc_biwgt_copy_48w_msa(const uint8_t *src0_ptr,
575  int32_t src_stride,
576  const int16_t *src1_ptr,
577  int32_t src2_stride,
578  uint8_t *dst,
579  int32_t dst_stride,
580  int32_t height,
581  int32_t weight0,
582  int32_t weight1,
583  int32_t offset0,
584  int32_t offset1,
585  int32_t rnd_val)
586 {
587  uint32_t loop_cnt;
589  v16u8 out0, out1, out2;
590  v16i8 src0, src1, src2;
591  v16i8 zero = { 0 };
592  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
593  v4i32 offset_vec, weight_vec, rnd_vec;
594 
595  offset = (offset0 + offset1) << rnd_val;
596  weight0 = weight0 & 0x0000FFFF;
597  weight = weight0 | (weight1 << 16);
598 
599  offset_vec = __msa_fill_w(offset);
600  weight_vec = __msa_fill_w(weight);
601  rnd_vec = __msa_fill_w(rnd_val + 1);
602 
603  for (loop_cnt = 64; loop_cnt--;) {
604  LD_SB3(src0_ptr, 16, src0, src1, src2);
605  src0_ptr += src_stride;
606  LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
607  src1_ptr += src2_stride;
608 
609  ILVRL_B2_SH(zero, src0, dst0, dst1);
610  ILVRL_B2_SH(zero, src1, dst2, dst3);
611  ILVRL_B2_SH(zero, src2, dst4, dst5);
612  SLLI_4V(dst0, dst1, dst2, dst3, 6);
613  SLLI_2V(dst4, dst5, 6);
614  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
615  weight_vec, rnd_vec, offset_vec, dst0, dst1,
616  dst2, dst3);
617  HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
618  offset_vec, dst4, dst5);
619  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
620  ST_UB2(out0, out1, dst, 16);
621  ST_UB(out2, dst + 32);
622  dst += dst_stride;
623  }
624 }
625 
626 static void hevc_biwgt_copy_64w_msa(const uint8_t *src0_ptr,
627  int32_t src_stride,
628  const int16_t *src1_ptr,
629  int32_t src2_stride,
630  uint8_t *dst,
631  int32_t dst_stride,
632  int32_t height,
633  int32_t weight0,
634  int32_t weight1,
635  int32_t offset0,
636  int32_t offset1,
637  int32_t rnd_val)
638 {
639  uint32_t loop_cnt;
641  v16u8 out0, out1, out2, out3;
642  v16i8 zero = { 0 };
643  v16i8 src0, src1, src2, src3;
644  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
645  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
646  v4i32 offset_vec, weight_vec, rnd_vec;
647 
648  offset = (offset0 + offset1) << rnd_val;
649  weight0 = weight0 & 0x0000FFFF;
650  weight = weight0 | (weight1 << 16);
651 
652  offset_vec = __msa_fill_w(offset);
653  weight_vec = __msa_fill_w(weight);
654  rnd_vec = __msa_fill_w(rnd_val + 1);
655 
656  for (loop_cnt = height; loop_cnt--;) {
657  LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
658  src0_ptr += src_stride;
659  LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
660  src1_ptr += src2_stride;
661 
662  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
663  tmp2, tmp3);
664  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
665  tmp6, tmp7);
666  SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
667  SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
668  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
669  weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
670  tmp1, tmp5);
671  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
672  weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
673  tmp3, tmp7);
674  PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
675  PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
676  ST_UB4(out0, out1, out2, out3, dst, 16);
677  dst += dst_stride;
678  }
679 }
680 
681 static void hevc_hz_biwgt_8t_4w_msa(const uint8_t *src0_ptr,
682  int32_t src_stride,
683  const int16_t *src1_ptr,
684  int32_t src2_stride,
685  uint8_t *dst,
686  int32_t dst_stride,
687  const int8_t *filter,
688  int32_t height,
689  int32_t weight0,
690  int32_t weight1,
691  int32_t offset0,
692  int32_t offset1,
693  int32_t rnd_val)
694 {
695  uint32_t loop_cnt;
696  int32_t offset, weight, constant;
697  v8i16 filt0, filt1, filt2, filt3;
698  v16i8 src0, src1, src2, src3;
699  v16i8 mask1, mask2, mask3;
700  v16i8 vec0, vec1, vec2, vec3;
701  v8i16 dst0, dst1;
702  v8i16 in0, in1, in2, in3;
703  v8i16 filter_vec, out0, out1;
704  v4i32 weight_vec, offset_vec, rnd_vec;
705  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
706 
707  src0_ptr -= 3;
708  filter_vec = LD_SH(filter);
709  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
710 
711  mask1 = mask0 + 2;
712  mask2 = mask0 + 4;
713  mask3 = mask0 + 6;
714 
715  offset = (offset0 + offset1) << rnd_val;
716  weight0 = weight0 & 0x0000FFFF;
717  weight = weight0 | (weight1 << 16);
718  constant = 128 * weight1;
719  constant <<= 6;
720  offset += constant;
721 
722  offset_vec = __msa_fill_w(offset);
723  weight_vec = __msa_fill_w(weight);
724  rnd_vec = __msa_fill_w(rnd_val + 1);
725 
726  for (loop_cnt = (height >> 2); loop_cnt--;) {
727  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
728  src0_ptr += (4 * src_stride);
729  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
730  src1_ptr += (4 * src2_stride);
731  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
732  XORI_B4_128_SB(src0, src1, src2, src3);
733 
734  VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
735  vec0, vec1, vec2, vec3);
736  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
737  filt3);
738  VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
739  vec0, vec1, vec2, vec3);
740  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
741  filt3);
742 
743  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
744  weight_vec, rnd_vec, offset_vec,
745  out0, out1);
746 
747  out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
748  ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
749  dst += (4 * dst_stride);
750  }
751 }
752 
753 static void hevc_hz_biwgt_8t_8w_msa(const uint8_t *src0_ptr,
754  int32_t src_stride,
755  const int16_t *src1_ptr,
756  int32_t src2_stride,
757  uint8_t *dst,
758  int32_t dst_stride,
759  const int8_t *filter,
760  int32_t height,
761  int32_t weight0,
762  int32_t weight1,
763  int32_t offset0,
764  int32_t offset1,
765  int32_t rnd_val)
766 {
767  uint32_t loop_cnt;
768  int32_t offset, weight, constant;
769  v8i16 filt0, filt1, filt2, filt3;
770  v16i8 src0, src1, src2, src3;
771  v16i8 mask1, mask2, mask3;
772  v16i8 vec0, vec1, vec2, vec3;
773  v8i16 dst0, dst1, dst2, dst3;
774  v8i16 in0, in1, in2, in3;
775  v8i16 filter_vec, out0, out1, out2, out3;
776  v4i32 weight_vec, offset_vec, rnd_vec;
777  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
778 
779  src0_ptr -= 3;
780  offset = (offset0 + offset1) << rnd_val;
781  weight0 = weight0 & 0x0000FFFF;
782  weight = weight0 | (weight1 << 16);
783  constant = 128 * weight1;
784  constant <<= 6;
785  offset += constant;
786 
787  offset_vec = __msa_fill_w(offset);
788  weight_vec = __msa_fill_w(weight);
789  rnd_vec = __msa_fill_w(rnd_val + 1);
790 
791  filter_vec = LD_SH(filter);
792  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
793 
794  mask1 = mask0 + 2;
795  mask2 = mask0 + 4;
796  mask3 = mask0 + 6;
797 
798  for (loop_cnt = (height >> 2); loop_cnt--;) {
799  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
800  src0_ptr += (4 * src_stride);
801  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
802  src1_ptr += (4 * src2_stride);
803  XORI_B4_128_SB(src0, src1, src2, src3);
804 
805  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
806  vec0, vec1, vec2, vec3);
807  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
808  filt3);
809  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
810  vec0, vec1, vec2, vec3);
811  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
812  filt3);
813  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
814  vec0, vec1, vec2, vec3);
815  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
816  filt3);
817  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
818  vec0, vec1, vec2, vec3);
819  dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
820  filt3);
821 
822  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
823  in0, in1, in2, in3,
824  weight_vec, rnd_vec, offset_vec,
825  out0, out1, out2, out3);
826 
827  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
828  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
829  dst += (4 * dst_stride);
830  }
831 }
832 
833 static void hevc_hz_biwgt_8t_12w_msa(const uint8_t *src0_ptr,
834  int32_t src_stride,
835  const int16_t *src1_ptr,
836  int32_t src2_stride,
837  uint8_t *dst,
838  int32_t dst_stride,
839  const int8_t *filter,
840  int32_t height,
841  int32_t weight0,
842  int32_t weight1,
843  int32_t offset0,
844  int32_t offset1,
845  int32_t rnd_val)
846 {
847  uint32_t loop_cnt;
848  int32_t offset, weight, constant;
849  v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
850  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
851  v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
852  v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
853  v4i32 weight_vec, offset_vec, rnd_vec;
854 
855  src0_ptr -= 3;
856 
857  weight0 = weight0 & 0x0000FFFF;
858  weight = weight0 | (weight1 << 16);
859  constant = 128 * weight1;
860  constant <<= 6;
861  offset = (offset0 + offset1) << rnd_val;
862  offset += constant;
863 
864  offset_vec = __msa_fill_w(offset);
865  weight_vec = __msa_fill_w(weight);
866  rnd_vec = __msa_fill_w(rnd_val + 1);
867 
868  filter_vec = LD_SH(filter);
869  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
870 
871  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
872  mask1 = mask0 + 2;
873  mask2 = mask0 + 4;
874  mask3 = mask0 + 6;
875  mask4 = LD_SB(&ff_hevc_mask_arr[16]);
876  mask5 = mask4 + 2;
877  mask6 = mask4 + 4;
878  mask7 = mask4 + 6;
879 
880  for (loop_cnt = 4; loop_cnt--;) {
881  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
882  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
883  XORI_B4_128_SB(src0, src1, src2, src3);
884  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
885  vec3);
886  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
887  filt3);
888  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
889  vec3);
890  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
891  filt3);
892  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
893  vec3);
894  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
895  filt3);
896  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
897  vec3);
898  dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
899  filt3);
900  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
901  weight_vec, rnd_vec, offset_vec, out0, out1, out2,
902  out3);
903  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
904  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
905 
906  LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
907  src0_ptr += (4 * src_stride);
908  LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
909  src1_ptr += (4 * src2_stride);
910  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
911  XORI_B4_128_SB(src0, src1, src2, src3);
912  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
913  vec3);
914  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
915  filt3);
916  VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
917  vec3);
918  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
919  filt3);
920  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
921  offset_vec, out0, out1);
922  out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
923  ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
924  dst += (4 * dst_stride);
925  }
926 }
927 
928 static void hevc_hz_biwgt_8t_16w_msa(const uint8_t *src0_ptr,
929  int32_t src_stride,
930  const int16_t *src1_ptr,
931  int32_t src2_stride,
932  uint8_t *dst,
933  int32_t dst_stride,
934  const int8_t *filter,
935  int32_t height,
936  int32_t weight0,
937  int32_t weight1,
938  int32_t offset0,
939  int32_t offset1,
940  int32_t rnd_val)
941 {
942  uint32_t loop_cnt;
943  int32_t offset, weight, constant;
944  v16i8 src0, src1, src2, src3;
945  v8i16 in0, in1, in2, in3;
946  v8i16 filt0, filt1, filt2, filt3;
947  v16i8 mask1, mask2, mask3;
948  v8i16 filter_vec, out0, out1, out2, out3;
949  v16i8 vec0, vec1, vec2, vec3;
950  v8i16 dst0, dst1, dst2, dst3;
951  v4i32 weight_vec, offset_vec, rnd_vec;
952  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
953 
954  src0_ptr -= 3;
955  offset = (offset0 + offset1) << rnd_val;
956  weight0 = weight0 & 0x0000FFFF;
957  weight = weight0 | (weight1 << 16);
958  constant = 128 * weight1;
959  constant <<= 6;
960  offset += constant;
961 
962  offset_vec = __msa_fill_w(offset);
963  weight_vec = __msa_fill_w(weight);
964  rnd_vec = __msa_fill_w(rnd_val + 1);
965 
966  filter_vec = LD_SH(filter);
967  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
968 
969  mask1 = mask0 + 2;
970  mask2 = mask0 + 4;
971  mask3 = mask0 + 6;
972 
973  for (loop_cnt = (height >> 1); loop_cnt--;) {
974  LD_SB2(src0_ptr, 8, src0, src1);
975  src0_ptr += src_stride;
976  LD_SB2(src0_ptr, 8, src2, src3);
977  src0_ptr += src_stride;
978  LD_SH2(src1_ptr, 8, in0, in1);
979  src1_ptr += src2_stride;
980  LD_SH2(src1_ptr, 8, in2, in3);
981  src1_ptr += src2_stride;
982  XORI_B4_128_SB(src0, src1, src2, src3);
983 
984  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
985  vec0, vec1, vec2, vec3);
986  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
987  filt3);
988  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
989  vec0, vec1, vec2, vec3);
990  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
991  filt3);
992  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
993  vec0, vec1, vec2, vec3);
994  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
995  filt3);
996  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
997  vec0, vec1, vec2, vec3);
998  dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
999  filt3);
1000 
1001  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1002  in0, in1, in2, in3,
1003  weight_vec, rnd_vec, offset_vec,
1004  out0, out1, out2, out3);
1005 
1006  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1007  ST_SH2(out0, out1, dst, dst_stride);
1008  dst += (2 * dst_stride);
1009  }
1010 }
1011 
1012 static void hevc_hz_biwgt_8t_24w_msa(const uint8_t *src0_ptr,
1013  int32_t src_stride,
1014  const int16_t *src1_ptr,
1015  int32_t src2_stride,
1016  uint8_t *dst,
1017  int32_t dst_stride,
1018  const int8_t *filter,
1019  int32_t height,
1020  int32_t weight0,
1021  int32_t weight1,
1022  int32_t offset0,
1023  int32_t offset1,
1024  int32_t rnd_val)
1025 {
1026  uint32_t loop_cnt;
1027  uint64_t dst_val0;
1028  int32_t offset, weight, constant;
1029  v16i8 src0, src1;
1030  v8i16 in0, in1, in2;
1031  v8i16 filt0, filt1, filt2, filt3;
1032  v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1033  v16i8 vec0, vec1, vec2, vec3;
1034  v8i16 dst0, dst1, dst2;
1035  v4i32 dst2_r, dst2_l;
1036  v8i16 filter_vec, out0, out1, out2;
1037  v4i32 weight_vec, offset_vec, rnd_vec;
1038  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1039 
1040  src0_ptr = src0_ptr - 3;
1041  offset = (offset0 + offset1) << rnd_val;
1042  weight0 = weight0 & 0x0000FFFF;
1043  weight = weight0 | (weight1 << 16);
1044  constant = 128 * weight1;
1045  constant <<= 6;
1046  offset += constant;
1047 
1048  offset_vec = __msa_fill_w(offset);
1049  weight_vec = __msa_fill_w(weight);
1050  rnd_vec = __msa_fill_w(rnd_val + 1);
1051 
1052  filter_vec = LD_SH(filter);
1053  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1054 
1055  mask1 = mask0 + 2;
1056  mask2 = mask0 + 4;
1057  mask3 = mask0 + 6;
1058  mask4 = mask0 + 8;
1059  mask5 = mask0 + 10;
1060  mask6 = mask0 + 12;
1061  mask7 = mask0 + 14;
1062 
1063  LD_SB2(src0_ptr, 16, src0, src1);
1064  src0_ptr += src_stride;
1065  LD_SH2(src1_ptr, 8, in0, in1);
1066  in2 = LD_SH(src1_ptr + 16);
1067  src1_ptr += src2_stride;
1069 
1070  for (loop_cnt = 31; loop_cnt--;) {
1071  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1072  vec0, vec1, vec2, vec3);
1073  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1074  filt3);
1075  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1076  vec0, vec1, vec2, vec3);
1077  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1078  filt3);
1079  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1080  vec0, vec1, vec2, vec3);
1081  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1082  filt3);
1083 
1084  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
1085  weight_vec, rnd_vec, offset_vec,
1086  out0, out1);
1087 
1088  ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
1089  dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1090  (v8i16) weight_vec);
1091  dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1092  (v8i16) weight_vec);
1093  SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1094  CLIP_SW2_0_255(dst2_r, dst2_l);
1095  out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1096 
1097  LD_SB2(src0_ptr, 16, src0, src1);
1098  src0_ptr += src_stride;
1099  LD_SH2(src1_ptr, 8, in0, in1);
1100  in2 = LD_SH(src1_ptr + 16);
1101  src1_ptr += src2_stride;
1103  PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1104  dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1105  ST_SH(out0, dst);
1106  SD(dst_val0, dst + 16);
1107  dst += dst_stride;
1108  }
1109 
1110  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1111  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1112  filt3);
1113  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1114  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1115  filt3);
1116  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1117  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1118  filt3);
1119  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec,
1120  out0, out1);
1121  ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
1122  dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
1123  dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
1124  SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1125  CLIP_SW2_0_255(dst2_r, dst2_l);
1126  out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1127  PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1128  dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1129  ST_SH(out0, dst);
1130  SD(dst_val0, dst + 16);
1131  dst += dst_stride;
1132 }
1133 
1134 static void hevc_hz_biwgt_8t_32w_msa(const uint8_t *src0_ptr,
1135  int32_t src_stride,
1136  const int16_t *src1_ptr,
1137  int32_t src2_stride,
1138  uint8_t *dst,
1139  int32_t dst_stride,
1140  const int8_t *filter,
1141  int32_t height,
1142  int32_t weight0,
1143  int32_t weight1,
1144  int32_t offset0,
1145  int32_t offset1,
1146  int32_t rnd_val)
1147 {
1148  uint32_t loop_cnt;
1149  int32_t offset, weight, constant;
1150  v16i8 src0, src1, src2;
1151  v8i16 in0, in1, in2, in3;
1152  v8i16 filt0, filt1, filt2, filt3;
1153  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1154  v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1155  v16i8 vec0, vec1, vec2, vec3;
1156  v8i16 dst0, dst1, dst2, dst3;
1157  v8i16 filter_vec, out0, out1, out2, out3;
1158  v4i32 weight_vec, offset_vec, rnd_vec;
1159 
1160  src0_ptr -= 3;
1161  offset = (offset0 + offset1) << rnd_val;
1162  weight0 = weight0 & 0x0000FFFF;
1163  weight = weight0 | (weight1 << 16);
1164  constant = 128 * weight1;
1165  constant <<= 6;
1166  offset += constant;
1167 
1168  offset_vec = __msa_fill_w(offset);
1169  weight_vec = __msa_fill_w(weight);
1170  rnd_vec = __msa_fill_w(rnd_val + 1);
1171 
1172  filter_vec = LD_SH(filter);
1173  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1174 
1175  mask1 = mask0 + 2;
1176  mask2 = mask0 + 4;
1177  mask3 = mask0 + 6;
1178  mask4 = mask0 + 8;
1179  mask5 = mask0 + 10;
1180  mask6 = mask0 + 12;
1181  mask7 = mask0 + 14;
1182 
1183  for (loop_cnt = height; loop_cnt--;) {
1184  LD_SB2(src0_ptr, 16, src0, src1);
1185  src2 = LD_SB(src0_ptr + 24);
1186  src0_ptr += src_stride;
1187  LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1188  src1_ptr += src2_stride;
1189 
1191 
1192  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1193  vec0, vec1, vec2, vec3);
1194  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1195  filt3);
1196  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1197  vec0, vec1, vec2, vec3);
1198  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1199  filt3);
1200  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1201  vec0, vec1, vec2, vec3);
1202  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1203  filt3);
1204  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1205  vec0, vec1, vec2, vec3);
1206  dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1207  filt3);
1208 
1209  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1210  in0, in1, in2, in3,
1211  weight_vec, rnd_vec, offset_vec,
1212  out0, out1, out2, out3);
1213 
1214  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1215  ST_SH2(out0, out1, dst, 16);
1216  dst += dst_stride;
1217  }
1218 }
1219 
1220 static void hevc_hz_biwgt_8t_48w_msa(const uint8_t *src0_ptr,
1221  int32_t src_stride,
1222  const int16_t *src1_ptr,
1223  int32_t src2_stride,
1224  uint8_t *dst,
1225  int32_t dst_stride,
1226  const int8_t *filter,
1227  int32_t height,
1228  int32_t weight0,
1229  int32_t weight1,
1230  int32_t offset0,
1231  int32_t offset1,
1232  int32_t rnd_val)
1233 {
1234  uint32_t loop_cnt;
1235  int32_t offset, weight, constant;
1236  v16i8 src0, src1, src2, src3, src4;
1237  v8i16 in0, in1, in2, in3;
1238  v8i16 filt0, filt1, filt2, filt3;
1239  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1240  v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1241  v16i8 vec0, vec1, vec2, vec3;
1242  v8i16 dst0, dst1, dst2, dst3;
1243  v8i16 filter_vec, out0, out1, out2, out3;
1244  v4i32 weight_vec, offset_vec, rnd_vec;
1245 
1246  src0_ptr -= 3;
1247  offset = (offset0 + offset1) << rnd_val;
1248  weight0 = weight0 & 0x0000FFFF;
1249  weight = weight0 | (weight1 << 16);
1250  constant = 128 * weight1;
1251  constant <<= 6;
1252  offset += constant;
1253 
1254  offset_vec = __msa_fill_w(offset);
1255  weight_vec = __msa_fill_w(weight);
1256  rnd_vec = __msa_fill_w(rnd_val + 1);
1257 
1258  filter_vec = LD_SH(filter);
1259  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1260 
1261  mask1 = mask0 + 2;
1262  mask2 = mask0 + 4;
1263  mask3 = mask0 + 6;
1264  mask4 = mask0 + 8;
1265  mask5 = mask0 + 10;
1266  mask6 = mask0 + 12;
1267  mask7 = mask0 + 14;
1268 
1269  for (loop_cnt = 64; loop_cnt--;) {
1270  LD_SB2(src0_ptr, 16, src0, src1);
1271  src2 = LD_SB(src0_ptr + 24);
1272  LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1274  LD_SB2(src0_ptr + 32, 8, src3, src4);
1275  src0_ptr += src_stride;
1276  XORI_B2_128_SB(src3, src4);
1277 
1278  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1279  vec0, vec1, vec2, vec3);
1280  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1281  filt3);
1282  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1283  vec0, vec1, vec2, vec3);
1284  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1285  filt3);
1286  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1287  vec0, vec1, vec2, vec3);
1288  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1289  filt3);
1290  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1291  vec0, vec1, vec2, vec3);
1292  dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1293  filt3);
1294 
1295  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
1296  weight_vec, rnd_vec, offset_vec,
1297  out0, out1, out2, out3);
1298 
1299  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1300  ST_SH2(out0, out1, dst, 16);
1301 
1302  LD_SH2(src1_ptr + 32, 8, in2, in3);
1303  src1_ptr += src2_stride;
1304 
1305  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1306  vec0, vec1, vec2, vec3);
1307  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1308  filt3);
1309  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1310  vec0, vec1, vec2, vec3);
1311  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1312  filt3);
1313 
1314  HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3,
1315  weight_vec, rnd_vec, offset_vec,
1316  out0, out1);
1317 
1318  out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
1319  ST_SH(out0, dst + 32);
1320  dst += dst_stride;
1321  }
1322 }
1323 
1324 static void hevc_hz_biwgt_8t_64w_msa(const uint8_t *src0_ptr,
1325  int32_t src_stride,
1326  const int16_t *src1_ptr,
1327  int32_t src2_stride,
1328  uint8_t *dst,
1329  int32_t dst_stride,
1330  const int8_t *filter,
1331  int32_t height,
1332  int32_t weight0,
1333  int32_t weight1,
1334  int32_t offset0,
1335  int32_t offset1,
1336  int32_t rnd_val)
1337 {
1338  const uint8_t *src0_ptr_tmp;
1339  uint8_t *dst_tmp;
1340  const int16_t *src1_ptr_tmp;
1341  uint32_t loop_cnt, cnt;
1342  int32_t offset, weight, constant;
1343  v16i8 src0, src1, src2;
1344  v8i16 in0, in1, in2, in3;
1345  v8i16 filt0, filt1, filt2, filt3;
1346  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1347  v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1348  v16i8 vec0, vec1, vec2, vec3;
1349  v8i16 dst0, dst1, dst2, dst3;
1350  v8i16 filter_vec, out0, out1, out2, out3;
1351  v4i32 weight_vec, offset_vec, rnd_vec;
1352 
1353  src0_ptr -= 3;
1354  offset = (offset0 + offset1) << rnd_val;
1355  weight0 = weight0 & 0x0000FFFF;
1356  weight = weight0 | (weight1 << 16);
1357  constant = 128 * weight1;
1358  constant <<= 6;
1359  offset += constant;
1360 
1361  offset_vec = __msa_fill_w(offset);
1362  weight_vec = __msa_fill_w(weight);
1363  rnd_vec = __msa_fill_w(rnd_val + 1);
1364 
1365  filter_vec = LD_SH(filter);
1366  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1367 
1368  mask1 = mask0 + 2;
1369  mask2 = mask0 + 4;
1370  mask3 = mask0 + 6;
1371  mask4 = mask0 + 8;
1372  mask5 = mask0 + 10;
1373  mask6 = mask0 + 12;
1374  mask7 = mask0 + 14;
1375 
1376  for (loop_cnt = height; loop_cnt--;) {
1377  src0_ptr_tmp = src0_ptr;
1378  dst_tmp = dst;
1379  src1_ptr_tmp = src1_ptr;
1380 
1381  for (cnt = 2; cnt--;) {
1382  LD_SB2(src0_ptr_tmp, 16, src0, src1);
1383  src2 = LD_SB(src0_ptr_tmp + 24);
1384  src0_ptr_tmp += 32;
1385  LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
1386  src1_ptr_tmp += 32;
1388 
1389  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1390  vec0, vec1, vec2, vec3);
1391  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1392  filt2, filt3);
1393  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1394  vec0, vec1, vec2, vec3);
1395  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1396  filt2, filt3);
1397  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1398  vec0, vec1, vec2, vec3);
1399  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1400  filt2, filt3);
1401  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1402  vec0, vec1, vec2, vec3);
1403  dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1404  filt2, filt3);
1405 
1406  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1407  in0, in1, in2, in3,
1408  weight_vec, rnd_vec, offset_vec,
1409  out0, out1, out2, out3);
1410 
1411  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1412  ST_SH2(out0, out1, dst_tmp, 16);
1413  dst_tmp += 32;
1414  }
1415 
1416  src0_ptr += src_stride;
1417  src1_ptr += src2_stride;
1418  dst += dst_stride;
1419 
1420  }
1421 }
1422 
1423 static void hevc_vt_biwgt_8t_4w_msa(const uint8_t *src0_ptr,
1424  int32_t src_stride,
1425  const int16_t *src1_ptr,
1426  int32_t src2_stride,
1427  uint8_t *dst,
1428  int32_t dst_stride,
1429  const int8_t *filter,
1430  int32_t height,
1431  int32_t weight0,
1432  int32_t weight1,
1433  int32_t offset0,
1434  int32_t offset1,
1435  int32_t rnd_val)
1436 {
1437  uint32_t loop_cnt;
1438  int32_t res = height & 0x07;
1440  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1441  v16i8 src11, src12, src13, src14;
1442  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1443  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1444  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1445  v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1446  v16i8 src2110, src4332, src6554, src8776, src10998;
1447  v16i8 src12111110, src14131312;
1448  v8i16 dst10, dst32, dst54, dst76;
1449  v8i16 filt0, filt1, filt2, filt3;
1450  v8i16 filter_vec, out0, out1, out2, out3;
1451  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1452 
1453  src0_ptr -= (3 * src_stride);
1454  offset = (offset0 + offset1) << rnd_val;
1455  weight0 = weight0 & 0x0000FFFF;
1456  weight = weight0 | (weight1 << 16);
1457 
1458  const_vec = __msa_ldi_w(128);
1459  const_vec <<= 6;
1460  offset_vec = __msa_fill_w(offset);
1461  weight_vec = __msa_fill_w(weight);
1462  rnd_vec = __msa_fill_w(rnd_val + 1);
1463  weight1_vec = __msa_fill_w(weight1);
1464  offset_vec += const_vec * weight1_vec;
1465 
1466  filter_vec = LD_SH(filter);
1467  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1468 
1469  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1470  src0_ptr += (7 * src_stride);
1471 
1472  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1473  src10_r, src32_r, src54_r, src21_r);
1474  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1475  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1476  src2110, src4332, src6554);
1477  XORI_B3_128_SB(src2110, src4332, src6554);
1478 
1479  for (loop_cnt = (height >> 3); loop_cnt--;) {
1480  LD_SB8(src0_ptr, src_stride,
1481  src7, src8, src9, src10, src11, src12, src13, src14);
1482  src0_ptr += (8 * src_stride);
1483  LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1484  src1_ptr += (8 * src2_stride);
1485 
1486  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1487  ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
1488  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1489  src76_r, src87_r, src98_r, src109_r);
1490  ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1491  src1110_r, src1211_r, src1312_r, src1413_r);
1492  ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1493  src1413_r, src1312_r,
1494  src8776, src10998, src12111110, src14131312);
1495  XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
1496 
1497  DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
1498  filt0, dst10, dst32, dst54, dst76);
1499  DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
1500  filt1, dst10, dst32, dst54, dst76);
1501  DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
1502  filt2, filt2, dst10, dst32, dst54, dst76);
1503  DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
1504  filt3, filt3, dst10, dst32, dst54, dst76);
1505 
1506  HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
1507  in0, in1, in2, in3,
1508  weight_vec, rnd_vec, offset_vec,
1509  out0, out1, out2, out3);
1510 
1511  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1512  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1513  dst += (8 * dst_stride);
1514 
1515  src2110 = src10998;
1516  src4332 = src12111110;
1517  src6554 = src14131312;
1518  src6 = src14;
1519  }
1520  if (res) {
1521  LD_SB8(src0_ptr, src_stride,
1522  src7, src8, src9, src10, src11, src12, src13, src14);
1523  src0_ptr += (8 * src_stride);
1524  LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1525  src1_ptr += (8 * src2_stride);
1526 
1527  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1528  ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
1529  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1530  src76_r, src87_r, src98_r, src109_r);
1531  ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1532  src1110_r, src1211_r, src1312_r, src1413_r);
1533  ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1534  src1413_r, src1312_r,
1535  src8776, src10998, src12111110, src14131312);
1536  XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
1537 
1538  DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
1539  filt0, dst10, dst32, dst54, dst76);
1540  DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
1541  filt1, dst10, dst32, dst54, dst76);
1542  DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
1543  filt2, filt2, dst10, dst32, dst54, dst76);
1544  DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
1545  filt3, filt3, dst10, dst32, dst54, dst76);
1546 
1547  HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
1548  in0, in1, in2, in3,
1549  weight_vec, rnd_vec, offset_vec,
1550  out0, out1, out2, out3);
1551 
1552  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1553  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1554 
1555  src2110 = src10998;
1556  src4332 = src12111110;
1557  src6554 = src14131312;
1558  src6 = src14;
1559  }
1560 }
1561 
1562 static void hevc_vt_biwgt_8t_8w_msa(const uint8_t *src0_ptr,
1563  int32_t src_stride,
1564  const int16_t *src1_ptr,
1565  int32_t src2_stride,
1566  uint8_t *dst,
1567  int32_t dst_stride,
1568  const int8_t *filter,
1569  int32_t height,
1570  int32_t weight0,
1571  int32_t weight1,
1572  int32_t offset0,
1573  int32_t offset1,
1574  int32_t rnd_val)
1575 {
1576  uint32_t loop_cnt;
1578  v16i8 src0, src1, src2, src3, src4, src5;
1579  v16i8 src6, src7, src8, src9, src10;
1580  v8i16 in0, in1, in2, in3;
1581  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1582  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1583  v8i16 tmp0, tmp1, tmp2, tmp3;
1584  v8i16 filt0, filt1, filt2, filt3;
1585  v8i16 filter_vec, out0, out1, out2, out3;
1586  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1587 
1588  src0_ptr -= (3 * src_stride);
1589  offset = (offset0 + offset1) << rnd_val;
1590  weight0 = weight0 & 0x0000FFFF;
1591  weight = weight0 | (weight1 << 16);
1592 
1593  const_vec = __msa_ldi_w(128);
1594  const_vec <<= 6;
1595  offset_vec = __msa_fill_w(offset);
1596  weight_vec = __msa_fill_w(weight);
1597  rnd_vec = __msa_fill_w(rnd_val + 1);
1598  weight1_vec = __msa_fill_w(weight1);
1599  offset_vec += const_vec * weight1_vec;
1600 
1601  filter_vec = LD_SH(filter);
1602  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1603 
1604  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1605  src0_ptr += (7 * src_stride);
1606  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1607 
1608  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1609  src10_r, src32_r, src54_r, src21_r);
1610  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1611 
1612  for (loop_cnt = (height >> 2); loop_cnt--;) {
1613  LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1614  src0_ptr += (4 * src_stride);
1615  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1616  src1_ptr += (4 * src2_stride);
1617 
1618  XORI_B4_128_SB(src7, src8, src9, src10);
1619  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1620  src76_r, src87_r, src98_r, src109_r);
1621 
1622  DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1623  filt0, tmp0, tmp1, tmp2, tmp3);
1624  DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1625  filt1, tmp0, tmp1, tmp2, tmp3);
1626  DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1627  filt2, tmp0, tmp1, tmp2, tmp3);
1628  DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1629  filt3, tmp0, tmp1, tmp2, tmp3);
1630 
1631  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1632  in0, in1, in2, in3,
1633  weight_vec, rnd_vec, offset_vec,
1634  out0, out1, out2, out3);
1635 
1636  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1637  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1638  dst += (4 * dst_stride);
1639 
1640  src10_r = src54_r;
1641  src32_r = src76_r;
1642  src54_r = src98_r;
1643  src21_r = src65_r;
1644  src43_r = src87_r;
1645  src65_r = src109_r;
1646  src6 = src10;
1647  }
1648 }
1649 
1650 static void hevc_vt_biwgt_8t_12w_msa(const uint8_t *src0_ptr,
1651  int32_t src_stride,
1652  const int16_t *src1_ptr,
1653  int32_t src2_stride,
1654  uint8_t *dst,
1655  int32_t dst_stride,
1656  const int8_t *filter,
1657  int32_t height,
1658  int32_t weight0,
1659  int32_t weight1,
1660  int32_t offset0,
1661  int32_t offset1,
1662  int32_t rnd_val)
1663 {
1664  uint32_t loop_cnt;
1666  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1667  v8i16 in0, in1, in2, in3;
1668  v16i8 src10_r, src32_r, src54_r, src76_r;
1669  v16i8 src21_r, src43_r, src65_r, src87_r;
1670  v8i16 tmp0, tmp1, tmp2;
1671  v16i8 src10_l, src32_l, src54_l, src76_l;
1672  v16i8 src21_l, src43_l, src65_l, src87_l;
1673  v16i8 src2110, src4332, src6554, src8776;
1674  v8i16 filt0, filt1, filt2, filt3;
1675  v8i16 out0, out1, out2, filter_vec;
1676  v4i32 dst2_r, dst2_l;
1677  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1678 
1679  src0_ptr -= (3 * src_stride);
1680  offset = (offset0 + offset1) << rnd_val;
1681  weight0 = weight0 & 0x0000FFFF;
1682  weight = weight0 | (weight1 << 16);
1683 
1684  const_vec = __msa_ldi_w(128);
1685  const_vec <<= 6;
1686  offset_vec = __msa_fill_w(offset);
1687  weight_vec = __msa_fill_w(weight);
1688  rnd_vec = __msa_fill_w(rnd_val + 1);
1689  weight1_vec = __msa_fill_w(weight1);
1690  offset_vec += const_vec * weight1_vec;
1691 
1692  filter_vec = LD_SH(filter);
1693  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1694 
1695  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1696  src0_ptr += (7 * src_stride);
1697  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1698 
1699  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1700  src10_r, src32_r, src54_r, src21_r);
1701  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1702  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1703  src10_l, src32_l, src54_l, src21_l);
1704  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1705  ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1706  src2110, src4332, src6554);
1707 
1708  for (loop_cnt = 8; loop_cnt--;) {
1709  LD_SB2(src0_ptr, src_stride, src7, src8);
1710  src0_ptr += (2 * src_stride);
1711  LD_SH2(src1_ptr, src2_stride, in0, in1);
1712  LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
1713  src1_ptr += (2 * src2_stride);
1714  in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
1715  XORI_B2_128_SB(src7, src8);
1716 
1717  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1718  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1719  src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
1720 
1721  DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
1722  tmp0, tmp1, tmp2);
1723  DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
1724  tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
1725  DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
1726  tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
1727  DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
1728  tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);
1729 
1730  HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
1731  weight_vec, rnd_vec, offset_vec,
1732  out0, out1);
1733 
1734  ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
1735  dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1736  (v8i16) weight_vec);
1737  dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1738  (v8i16) weight_vec);
1739  SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1740  CLIP_SW2_0_255(dst2_r, dst2_l);
1741  out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1742  PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1743  ST_D2(out0, 0, 1, dst, dst_stride);
1744  ST_W2(out2, 0, 1, dst + 8, dst_stride);
1745  dst += (2 * dst_stride);
1746 
1747  src10_r = src32_r;
1748  src32_r = src54_r;
1749  src54_r = src76_r;
1750  src21_r = src43_r;
1751  src43_r = src65_r;
1752  src65_r = src87_r;
1753  src2110 = src4332;
1754  src4332 = src6554;
1755  src6554 = src8776;
1756  src6 = src8;
1757  }
1758 }
1759 
1760 static void hevc_vt_biwgt_8t_16multx2mult_msa(const uint8_t *src0_ptr,
1761  int32_t src_stride,
1762  const int16_t *src1_ptr,
1763  int32_t src2_stride,
1764  uint8_t *dst,
1765  int32_t dst_stride,
1766  const int8_t *filter,
1767  int32_t height,
1768  int32_t weight0,
1769  int32_t weight1,
1770  int32_t offset0,
1771  int32_t offset1,
1772  int32_t rnd_val,
1773  int32_t width)
1774 {
1775  const uint8_t *src0_ptr_tmp;
1776  const int16_t *src1_ptr_tmp;
1777  uint8_t *dst_tmp;
1778  uint32_t loop_cnt, cnt;
1780  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1781  v8i16 in0, in1, in2, in3;
1782  v16i8 src10_r, src32_r, src54_r, src76_r;
1783  v16i8 src21_r, src43_r, src65_r, src87_r;
1784  v16i8 src10_l, src32_l, src54_l, src76_l;
1785  v16i8 src21_l, src43_l, src65_l, src87_l;
1786  v8i16 tmp0, tmp1, tmp2, tmp3;
1787  v8i16 filt0, filt1, filt2, filt3;
1788  v8i16 filter_vec;
1789  v8i16 out0, out1, out2, out3;
1790  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1791 
1792  src0_ptr -= (3 * src_stride);
1793 
1794  offset = (offset0 + offset1) << rnd_val;
1795  weight0 = weight0 & 0x0000FFFF;
1796  weight = weight0 | (weight1 << 16);
1797 
1798  const_vec = __msa_ldi_w(128);
1799  const_vec <<= 6;
1800  offset_vec = __msa_fill_w(offset);
1801  weight_vec = __msa_fill_w(weight);
1802  rnd_vec = __msa_fill_w(rnd_val + 1);
1803  weight1_vec = __msa_fill_w(weight1);
1804  offset_vec += const_vec * weight1_vec;
1805 
1806  filter_vec = LD_SH(filter);
1807  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1808 
1809  for (cnt = (width >> 4); cnt--;) {
1810  src0_ptr_tmp = src0_ptr;
1811  src1_ptr_tmp = src1_ptr;
1812  dst_tmp = dst;
1813 
1814  LD_SB7(src0_ptr_tmp, src_stride,
1815  src0, src1, src2, src3, src4, src5, src6);
1816  src0_ptr_tmp += (7 * src_stride);
1817 
1818  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1819  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1820  src10_r, src32_r, src54_r, src21_r);
1821  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1822  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1823  src10_l, src32_l, src54_l, src21_l);
1824  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1825 
1826  for (loop_cnt = (height >> 1); loop_cnt--;) {
1827  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1828  src0_ptr_tmp += (2 * src_stride);
1829  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1830  LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1831  src1_ptr_tmp += (2 * src2_stride);
1832 
1833  XORI_B2_128_SB(src7, src8);
1834  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1835  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1836 
1837  DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
1838  filt0, filt0, tmp0, tmp1, tmp2, tmp3);
1839  DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
1840  filt1, filt1, tmp0, tmp1, tmp2, tmp3);
1841  DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
1842  filt2, filt2, tmp0, tmp1, tmp2, tmp3);
1843  DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
1844  filt3, filt3, tmp0, tmp1, tmp2, tmp3);
1845 
1846  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1847  in0, in1, in2, in3,
1848  weight_vec, rnd_vec, offset_vec,
1849  out0, out1, out2, out3);
1850 
1851  PCKEV_B2_SH(out2, out0, out3, out1, out0, out1);
1852  ST_SH2(out0, out1, dst_tmp, dst_stride);
1853  dst_tmp += (2 * dst_stride);
1854 
1855  src10_r = src32_r;
1856  src32_r = src54_r;
1857  src54_r = src76_r;
1858  src21_r = src43_r;
1859  src43_r = src65_r;
1860  src65_r = src87_r;
1861  src10_l = src32_l;
1862  src32_l = src54_l;
1863  src54_l = src76_l;
1864  src21_l = src43_l;
1865  src43_l = src65_l;
1866  src65_l = src87_l;
1867  src6 = src8;
1868  }
1869 
1870  src0_ptr += 16;
1871  src1_ptr += 16;
1872  dst += 16;
1873  }
1874 }
1875 
/*
 * Vertical 8-tap bi-weighted prediction for a block 16 pixels wide.
 *
 * Pure forwarding wrapper: every argument is handed unchanged to
 * hevc_vt_biwgt_8t_16multx2mult_msa() together with a fixed width of 16.
 */
static void hevc_vt_biwgt_8t_16w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* one 16-column strip */
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                      src2_stride, dst, dst_stride, filter,
                                      height, weight0, weight1, offset0,
                                      offset1, rnd_val, 16);
}
1896 
/*
 * Vertical 8-tap bi-weighted prediction for a block 24 pixels wide.
 *
 * The block is split in two: columns 0..15 go through
 * hevc_vt_biwgt_8t_16multx2mult_msa() with a width of 16, and the remaining
 * columns (starting at column 16 of all three buffers) go through
 * hevc_vt_biwgt_8t_8w_msa().
 */
static void hevc_vt_biwgt_8t_24w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* left part: columns 0..15 */
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                      src2_stride, dst, dst_stride, filter,
                                      height, weight0, weight1, offset0,
                                      offset1, rnd_val, 16);

    /* right part: starts 16 elements into each buffer */
    hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16,
                            src2_stride, dst + 16, dst_stride, filter,
                            height, weight0, weight1, offset0, offset1,
                            rnd_val);
}
1921 
/*
 * Vertical 8-tap bi-weighted prediction for a block 32 pixels wide.
 *
 * Pure forwarding wrapper: every argument is handed unchanged to
 * hevc_vt_biwgt_8t_16multx2mult_msa() together with a fixed width of 32.
 */
static void hevc_vt_biwgt_8t_32w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* two 16-column strips */
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                      src2_stride, dst, dst_stride, filter,
                                      height, weight0, weight1, offset0,
                                      offset1, rnd_val, 32);
}
1942 
/*
 * Vertical 8-tap bi-weighted prediction for a block 48 pixels wide.
 *
 * Pure forwarding wrapper: every argument is handed unchanged to
 * hevc_vt_biwgt_8t_16multx2mult_msa() together with a fixed width of 48.
 */
static void hevc_vt_biwgt_8t_48w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* three 16-column strips */
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                      src2_stride, dst, dst_stride, filter,
                                      height, weight0, weight1, offset0,
                                      offset1, rnd_val, 48);
}
1963 
/*
 * Vertical 8-tap bi-weighted prediction for a block 64 pixels wide.
 *
 * Pure forwarding wrapper: every argument is handed unchanged to
 * hevc_vt_biwgt_8t_16multx2mult_msa() together with a fixed width of 64.
 */
static void hevc_vt_biwgt_8t_64w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* four 16-column strips */
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                      src2_stride, dst, dst_stride, filter,
                                      height, weight0, weight1, offset0,
                                      offset1, rnd_val, 64);
}
1984 
1985 static void hevc_hv_biwgt_8t_4w_msa(const uint8_t *src0_ptr,
1986  int32_t src_stride,
1987  const int16_t *src1_ptr,
1988  int32_t src2_stride,
1989  uint8_t *dst,
1990  int32_t dst_stride,
1991  const int8_t *filter_x,
1992  const int8_t *filter_y,
1993  int32_t height,
1994  int32_t weight0,
1995  int32_t weight1,
1996  int32_t offset0,
1997  int32_t offset1,
1998  int32_t rnd_val)
1999 {
2000  uint32_t loop_cnt;
2001  uint64_t tp0, tp1;
2003  v16u8 out;
2004  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2005  v8i16 in0 = { 0 }, in1 = { 0 };
2006  v8i16 filt0, filt1, filt2, filt3;
2007  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2008  v16i8 mask1, mask2, mask3;
2009  v8i16 filter_vec, weight_vec;
2010  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2011  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2012  v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
2013  v8i16 tmp0, tmp1, tmp2, tmp3;
2014  v8i16 dst10, dst32, dst54, dst76;
2015  v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
2016  v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2017  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
2018 
2019  src0_ptr -= ((3 * src_stride) + 3);
2020 
2021  filter_vec = LD_SH(filter_x);
2022  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2023 
2024  filter_vec = LD_SH(filter_y);
2025  UNPCK_R_SB_SH(filter_vec, filter_vec);
2026 
2027  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2028 
2029  mask1 = mask0 + 2;
2030  mask2 = mask0 + 4;
2031  mask3 = mask0 + 6;
2032 
2033  offset = (offset0 + offset1) << rnd_val;
2034  weight0 = weight0 & 0x0000FFFF;
2035  weight = weight0 | (weight1 << 16);
2036 
2037  const_vec = __msa_fill_w((128 * weight1));
2038  const_vec <<= 6;
2039  offset_vec = __msa_fill_w(offset);
2040  rnd_vec = __msa_fill_w(rnd_val + 1);
2041  offset_vec += const_vec;
2042  weight_vec = (v8i16) __msa_fill_w(weight);
2043 
2044  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2045  src0_ptr += (7 * src_stride);
2046 
2047  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2048 
2049  VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2050  VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2051  VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
2052  vec8, vec9, vec10, vec11);
2053  VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
2054  vec12, vec13, vec14, vec15);
2055 
2056  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2057  filt3);
2058  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2059  filt3);
2060  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2061  filt3);
2062  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2063  filt3);
2064 
2065  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2066  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2067  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2068 
2069  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2070 
2071  for (loop_cnt = height >> 2; loop_cnt--;) {
2072  LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2073  src0_ptr += (4 * src_stride);
2074  XORI_B4_128_SB(src7, src8, src9, src10);
2075 
2076  LD2(src1_ptr, src2_stride, tp0, tp1);
2077  INSERT_D2_SH(tp0, tp1, in0);
2078  src1_ptr += (2 * src2_stride);
2079  LD2(src1_ptr, src2_stride, tp0, tp1);
2080  INSERT_D2_SH(tp0, tp1, in1);
2081  src1_ptr += (2 * src2_stride);
2082 
2083  VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
2084  vec0, vec1, vec2, vec3);
2085  VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
2086  vec4, vec5, vec6, vec7);
2087  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2088  filt3);
2089  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2090  filt3);
2091 
2092  dst76 = __msa_ilvr_h(dst97, dst66);
2093  ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2094  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2095  dst98 = __msa_ilvr_h(dst66, dst108);
2096 
2097  dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2098  filt_h2, filt_h3);
2099  dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2100  filt_h2, filt_h3);
2101  dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2102  filt_h2, filt_h3);
2103  dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2104  filt_h2, filt_h3);
2105  SRA_4V(dst0, dst1, dst2, dst3, 6);
2106  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2107  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2108  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2109  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2110  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2111  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2112  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2113  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2114  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2115  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2116  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2117  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2118  dst += (4 * dst_stride);
2119 
2120  dst10 = dst54;
2121  dst32 = dst76;
2122  dst54 = dst98;
2123  dst21 = dst65;
2124  dst43 = dst87;
2125  dst65 = dst109;
2126  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2127  }
2128 }
2129 
2130 static void hevc_hv_biwgt_8t_8multx2mult_msa(const uint8_t *src0_ptr,
2131  int32_t src_stride,
2132  const int16_t *src1_ptr,
2133  int32_t src2_stride,
2134  uint8_t *dst,
2135  int32_t dst_stride,
2136  const int8_t *filter_x,
2137  const int8_t *filter_y,
2138  int32_t height,
2139  int32_t weight0,
2140  int32_t weight1,
2141  int32_t offset0,
2142  int32_t offset1,
2143  int32_t rnd_val,
2144  int32_t width8mult)
2145 {
2146  uint32_t loop_cnt, cnt;
2148  const uint8_t *src0_ptr_tmp;
2149  const int16_t *src1_ptr_tmp;
2150  uint8_t *dst_tmp;
2151  v16u8 out;
2152  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2153  v8i16 in0, in1;
2154  v8i16 filt0, filt1, filt2, filt3;
2155  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2156  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2157  v16i8 mask1, mask2, mask3;
2158  v8i16 filter_vec, weight_vec;
2159  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2160  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2161  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2162  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2163  v8i16 tmp0, tmp1, tmp2, tmp3;
2164  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2165  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2166  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2167  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2168  v4i32 offset_vec, rnd_vec, const_vec;
2169 
2170  src0_ptr -= ((3 * src_stride) + 3);
2171 
2172  offset = (offset0 + offset1) << rnd_val;
2173  weight0 = weight0 & 0x0000FFFF;
2174  weight = weight0 | (weight1 << 16);
2175 
2176  const_vec = __msa_fill_w((128 * weight1));
2177  const_vec <<= 6;
2178  offset_vec = __msa_fill_w(offset);
2179  rnd_vec = __msa_fill_w(rnd_val + 1);
2180  offset_vec += const_vec;
2181  weight_vec = (v8i16) __msa_fill_w(weight);
2182 
2183  filter_vec = LD_SH(filter_x);
2184  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2185 
2186  filter_vec = LD_SH(filter_y);
2187  UNPCK_R_SB_SH(filter_vec, filter_vec);
2188 
2189  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2190 
2191  mask1 = mask0 + 2;
2192  mask2 = mask0 + 4;
2193  mask3 = mask0 + 6;
2194 
2195  for (cnt = width8mult; cnt--;) {
2196  src0_ptr_tmp = src0_ptr;
2197  src1_ptr_tmp = src1_ptr;
2198  dst_tmp = dst;
2199 
2200  LD_SB7(src0_ptr_tmp, src_stride,
2201  src0, src1, src2, src3, src4, src5, src6);
2202  src0_ptr_tmp += (7 * src_stride);
2203 
2204  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2205 
2206  /* row 0 row 1 row 2 row 3 */
2207  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
2208  vec0, vec1, vec2, vec3);
2209  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
2210  vec4, vec5, vec6, vec7);
2211  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
2212  vec8, vec9, vec10, vec11);
2213  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2214  vec12, vec13, vec14, vec15);
2215 
2216  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2217  filt3);
2218  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2219  filt3);
2220  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2221  filt3);
2222  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2223  filt2, filt3);
2224 
2225  /* row 4 row 5 row 6 */
2226  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2227  vec0, vec1, vec2, vec3);
2228  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2229  vec4, vec5, vec6, vec7);
2230  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2231  vec8, vec9, vec10, vec11);
2232 
2233  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2234  filt3);
2235  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2236  filt3);
2237  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2238  filt3);
2239 
2240  for (loop_cnt = height >> 1; loop_cnt--;) {
2241  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2242  XORI_B2_128_SB(src7, src8);
2243  src0_ptr_tmp += 2 * src_stride;
2244 
2245  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2246  src1_ptr_tmp += (2 * src2_stride);
2247 
2248  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
2249  dst32_r, dst54_r, dst21_r);
2250  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
2251  dst32_l, dst54_l, dst21_l);
2252  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2253  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2254 
2255  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2256  vec0, vec1, vec2, vec3);
2257  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2258  filt2, filt3);
2259 
2260  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2261  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2262  filt_h0, filt_h1, filt_h2, filt_h3);
2263  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2264  filt_h0, filt_h1, filt_h2, filt_h3);
2265 
2266  dst0_r >>= 6;
2267  dst0_l >>= 6;
2268 
2269  /* row 8 */
2270  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2271  vec0, vec1, vec2, vec3);
2272  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2273  filt2, filt3);
2274 
2275  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2276  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2277  filt_h0, filt_h1, filt_h2, filt_h3);
2278  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2279  filt_h0, filt_h1, filt_h2, filt_h3);
2280 
2281  dst1_r >>= 6;
2282  dst1_l >>= 6;
2283 
2284  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
2285  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2286  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2287  dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2288  dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2289  dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2290  dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2291  SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
2292  CLIP_SW4_0_255(dst0_l, dst0_r, dst1_l, dst1_r);
2293  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
2294  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2295  ST_D2(out, 0, 1, dst_tmp, dst_stride);
2296  dst_tmp += (2 * dst_stride);
2297 
2298  dst0 = dst2;
2299  dst1 = dst3;
2300  dst2 = dst4;
2301  dst3 = dst5;
2302  dst4 = dst6;
2303  dst5 = dst7;
2304  dst6 = dst8;
2305  }
2306 
2307  src0_ptr += 8;
2308  src1_ptr += 8;
2309  dst += 8;
2310  }
2311 }
2312 
/*
 * Horizontal + vertical 8-tap bi-weighted prediction, 8 pixels wide.
 *
 * Pure forwarding wrapper around hevc_hv_biwgt_8t_8multx2mult_msa() with a
 * strip count (width8mult) of 1.
 */
static void hevc_hv_biwgt_8t_8w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    /* 8 columns = 1 strip of 8 */
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1, offset0, offset1,
                                     rnd_val, 1);
}
2334 
2335 static void hevc_hv_biwgt_8t_12w_msa(const uint8_t *src0_ptr,
2336  int32_t src_stride,
2337  const int16_t *src1_ptr,
2338  int32_t src2_stride,
2339  uint8_t *dst,
2340  int32_t dst_stride,
2341  const int8_t *filter_x,
2342  const int8_t *filter_y,
2343  int32_t height,
2344  int32_t weight0,
2345  int32_t weight1,
2346  int32_t offset0,
2347  int32_t offset1,
2348  int32_t rnd_val)
2349 {
2350  uint32_t loop_cnt;
2351  const uint8_t *src0_ptr_tmp;
2352  uint8_t *dst_tmp;
2353  const int16_t *src1_ptr_tmp;
2355  uint64_t tp0, tp1;
2356  v16u8 out;
2357  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2358  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2359  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2360  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2361  v8i16 in0 = { 0 }, in1 = { 0 };
2362  v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
2363  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2364  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
2365  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
2366  v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
2367  v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
2368  v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
2369  v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2370 
2371  src0_ptr -= ((3 * src_stride) + 3);
2372 
2373  offset = (offset0 + offset1) << rnd_val;
2374  weight0 = weight0 & 0x0000FFFF;
2375  weight = weight0 | (weight1 << 16);
2376 
2377  const_vec = __msa_fill_w((128 * weight1));
2378  const_vec <<= 6;
2379  offset_vec = __msa_fill_w(offset);
2380  rnd_vec = __msa_fill_w(rnd_val + 1);
2381  offset_vec += const_vec;
2382  weight_vec = (v8i16) __msa_fill_w(weight);
2383 
2384  filter_vec = LD_SH(filter_x);
2385  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2386 
2387  filter_vec = LD_SH(filter_y);
2388  UNPCK_R_SB_SH(filter_vec, filter_vec);
2389 
2390  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2391 
2392  mask0 = LD_SB(ff_hevc_mask_arr);
2393  mask1 = mask0 + 2;
2394  mask2 = mask0 + 4;
2395  mask3 = mask0 + 6;
2396 
2397  src0_ptr_tmp = src0_ptr;
2398  src1_ptr_tmp = src1_ptr;
2399  dst_tmp = dst;
2400 
2401  LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2402  src0_ptr_tmp += (7 * src_stride);
2403  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2404 
2405  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2406  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2407  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2408  vec11);
2409  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2410  vec15);
2411  dsth0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2412  filt3);
2413  dsth1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2414  filt3);
2415  dsth2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2416  filt3);
2417  dsth3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2418  filt2, filt3);
2419  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2420  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2421  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2422  vec11);
2423  dsth4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2424  filt3);
2425  dsth5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2426  filt3);
2427  dsth6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2428  filt3);
2429 
2430  for (loop_cnt = 8; loop_cnt--;) {
2431  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2432  src0_ptr_tmp += (2 * src_stride);
2433  XORI_B2_128_SB(src7, src8);
2434 
2435  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2436  src1_ptr_tmp += (2 * src2_stride);
2437 
2438  ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2439  dst10_r, dst32_r, dst54_r, dst21_r);
2440  ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2441  dst10_l, dst32_l, dst54_l, dst21_l);
2442  ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
2443  ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
2444 
2445  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2446  vec3);
2447  dsth7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2448  filt3);
2449 
2450  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
2451  dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2452  filt_h1, filt_h2, filt_h3);
2453  dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
2454  filt_h1, filt_h2, filt_h3);
2455  dst0 >>= 6;
2456  dst1 >>= 6;
2457 
2458  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2459  vec3);
2460  dsth8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2461  filt3);
2462 
2463  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
2464  dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2465  filt_h1, filt_h2, filt_h3);
2466  dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
2467  filt_h1, filt_h2, filt_h3);
2468  dst2 >>= 6;
2469  dst3 >>= 6;
2470 
2471  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2472  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2473  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2474  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2475  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2476  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2477  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2478  SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec);
2479  CLIP_SW4_0_255(dst1, dst0, dst3, dst2);
2480  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2481  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2482  ST_D2(out, 0, 1, dst_tmp, dst_stride);
2483  dst_tmp += (2 * dst_stride);
2484 
2485  dsth0 = dsth2;
2486  dsth1 = dsth3;
2487  dsth2 = dsth4;
2488  dsth3 = dsth5;
2489  dsth4 = dsth6;
2490  dsth5 = dsth7;
2491  dsth6 = dsth8;
2492  }
2493 
2494  src0_ptr += 8;
2495  src1_ptr += 8;
2496  dst += 8;
2497 
2498  mask4 = LD_SB(ff_hevc_mask_arr + 16);
2499  mask5 = mask4 + 2;
2500  mask6 = mask4 + 4;
2501  mask7 = mask4 + 6;
2502 
2503  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2504  src0_ptr += (7 * src_stride);
2505  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2506 
2507  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2508  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2509  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2510  vec11);
2511  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2512  vec15);
2513  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2514  filt3);
2515  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2516  filt3);
2517  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2518  filt3);
2519  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2520  filt3);
2521  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2522  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2523  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2524 
2525  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2526 
2527  for (loop_cnt = 4; loop_cnt--;) {
2528  LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2529  src0_ptr += (4 * src_stride);
2530  XORI_B4_128_SB(src7, src8, src9, src10);
2531 
2532  LD2(src1_ptr, src2_stride, tp0, tp1);
2533  INSERT_D2_SH(tp0, tp1, in0);
2534  src1_ptr += (2 * src2_stride);
2535  LD2(src1_ptr, src2_stride, tp0, tp1);
2536  INSERT_D2_SH(tp0, tp1, in1);
2537  src1_ptr += (2 * src2_stride);
2538 
2539  VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2540  vec3);
2541  VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2542  vec7);
2543  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2544  filt3);
2545  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2546  filt3);
2547 
2548  dst76 = __msa_ilvr_h(dst97, dst66);
2549  ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2550  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2551  dst98 = __msa_ilvr_h(dst66, dst108);
2552 
2553  dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2554  filt_h2, filt_h3);
2555  dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2556  filt_h2, filt_h3);
2557  dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2558  filt_h2, filt_h3);
2559  dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2560  filt_h2, filt_h3);
2561  SRA_4V(dst0, dst1, dst2, dst3, 6);
2562  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2563  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2564  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2565  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2566  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2567  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2568  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2569  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2570  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2571  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2572  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2573  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2574  dst += (4 * dst_stride);
2575 
2576  dst10 = dst54;
2577  dst32 = dst76;
2578  dst54 = dst98;
2579  dst21 = dst65;
2580  dst43 = dst87;
2581  dst65 = dst109;
2582  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2583  }
2584 }
2585 
/*
 * Horizontal + vertical 8-tap bi-weighted prediction, 16 pixels wide.
 *
 * Pure forwarding wrapper around hevc_hv_biwgt_8t_8multx2mult_msa() with a
 * strip count (width8mult) of 2.
 */
static void hevc_hv_biwgt_8t_16w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* 16 columns = 2 strips of 8 */
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1, offset0, offset1,
                                     rnd_val, 2);
}
2607 
/*
 * Horizontal + vertical 8-tap bi-weighted prediction, 24 pixels wide.
 *
 * Pure forwarding wrapper around hevc_hv_biwgt_8t_8multx2mult_msa() with a
 * strip count (width8mult) of 3.
 */
static void hevc_hv_biwgt_8t_24w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* 24 columns = 3 strips of 8 */
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1, offset0, offset1,
                                     rnd_val, 3);
}
2629 
/*
 * Horizontal + vertical 8-tap bi-weighted prediction, 32 pixels wide.
 *
 * Pure forwarding wrapper around hevc_hv_biwgt_8t_8multx2mult_msa() with a
 * strip count (width8mult) of 4.
 */
static void hevc_hv_biwgt_8t_32w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* 32 columns = 4 strips of 8 */
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1, offset0, offset1,
                                     rnd_val, 4);
}
2651 
/*
 * Horizontal + vertical 8-tap bi-weighted prediction, 48 pixels wide.
 *
 * Pure forwarding wrapper around hevc_hv_biwgt_8t_8multx2mult_msa() with a
 * strip count (width8mult) of 6.
 */
static void hevc_hv_biwgt_8t_48w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* 48 columns = 6 strips of 8 */
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1, offset0, offset1,
                                     rnd_val, 6);
}
2673 
/*
 * Horizontal + vertical 8-tap bi-weighted prediction, 64 pixels wide.
 *
 * Pure forwarding wrapper around hevc_hv_biwgt_8t_8multx2mult_msa() with a
 * strip count (width8mult) of 8.
 */
static void hevc_hv_biwgt_8t_64w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* 64 columns = 8 strips of 8 */
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1, offset0, offset1,
                                     rnd_val, 8);
}
2695 
2696 static void hevc_hz_biwgt_4t_4x2_msa(const uint8_t *src0_ptr,
2697  int32_t src_stride,
2698  const int16_t *src1_ptr,
2699  int32_t src2_stride,
2700  uint8_t *dst,
2701  int32_t dst_stride,
2702  const int8_t *filter,
2703  int32_t weight0,
2704  int32_t weight1,
2705  int32_t offset0,
2706  int32_t offset1,
2707  int32_t rnd_val)
2708 {
2709  int32_t offset, weight, constant;
2710  v8i16 filt0, filt1;
2711  v16i8 src0, src1;
2712  v8i16 in0, in1;
2713  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2714  v16i8 mask1, vec0, vec1;
2715  v8i16 dst0;
2716  v4i32 dst0_r, dst0_l;
2717  v8i16 out0, filter_vec;
2718  v4i32 weight_vec, offset_vec, rnd_vec;
2719 
2720  src0_ptr -= 1;
2721 
2722  filter_vec = LD_SH(filter);
2723  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2724 
2725  mask1 = mask0 + 2;
2726 
2727  offset = (offset0 + offset1) << rnd_val;
2728  weight0 = weight0 & 0x0000FFFF;
2729  weight = weight0 | (weight1 << 16);
2730  constant = 128 * weight1;
2731  constant <<= 6;
2732  offset += constant;
2733 
2734  offset_vec = __msa_fill_w(offset);
2735  weight_vec = __msa_fill_w(weight);
2736  rnd_vec = __msa_fill_w(rnd_val + 1);
2737 
2738  LD_SB2(src0_ptr, src_stride, src0, src1);
2739  LD_SH2(src1_ptr, src2_stride, in0, in1);
2740  in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2742 
2743  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2744  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2745 
2746  ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
2747  dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
2748  dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
2749  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2750  out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2751  CLIP_SH_0_255(out0);
2752  out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
2753  ST_W2(out0, 0, 1, dst, dst_stride);
2754 }
2755 
2756 static void hevc_hz_biwgt_4t_4x4_msa(const uint8_t *src0_ptr,
2757  int32_t src_stride,
2758  const int16_t *src1_ptr,
2759  int32_t src2_stride,
2760  uint8_t *dst,
2761  int32_t dst_stride,
2762  const int8_t *filter,
2763  int32_t weight0,
2764  int32_t weight1,
2765  int32_t offset0,
2766  int32_t offset1,
2767  int32_t rnd_val)
2768 {
2769  int32_t offset, weight, constant;
2770  v8i16 filt0, filt1;
2771  v16i8 src0, src1, src2, src3;
2772  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2773  v16i8 mask1;
2774  v8i16 dst0, dst1;
2775  v16i8 vec0, vec1;
2776  v8i16 in0, in1, in2, in3;
2777  v8i16 filter_vec;
2778  v4i32 weight_vec, offset_vec, rnd_vec;
2779 
2780  src0_ptr -= 1;
2781 
2782  /* rearranging filter */
2783  filter_vec = LD_SH(filter);
2784  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2785 
2786  mask1 = mask0 + 2;
2787 
2788  offset = (offset0 + offset1) << rnd_val;
2789  weight0 = weight0 & 0x0000FFFF;
2790  weight = weight0 | (weight1 << 16);
2791  constant = 128 * weight1;
2792  constant <<= 6;
2793  offset += constant;
2794 
2795  offset_vec = __msa_fill_w(offset);
2796  weight_vec = __msa_fill_w(weight);
2797  rnd_vec = __msa_fill_w(rnd_val + 1);
2798 
2799  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2800  XORI_B4_128_SB(src0, src1, src2, src3);
2801  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2802  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2803 
2804  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2805  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2806  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2807  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2808  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2809  weight_vec, rnd_vec, offset_vec,
2810  dst0, dst1);
2811 
2812  dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2813  ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
2814 }
2815 
2816 static void hevc_hz_biwgt_4t_4x8multiple_msa(const uint8_t *src0_ptr,
2817  int32_t src_stride,
2818  const int16_t *src1_ptr,
2819  int32_t src2_stride,
2820  uint8_t *dst,
2821  int32_t dst_stride,
2822  const int8_t *filter,
2823  int32_t height,
2824  int32_t weight0,
2825  int32_t weight1,
2826  int32_t offset0,
2827  int32_t offset1,
2828  int32_t rnd_val)
2829 {
2830  uint32_t loop_cnt;
2831  int32_t weight, offset, constant;
2832  v8i16 filt0, filt1;
2833  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2834  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2835  v16i8 mask1;
2836  v16i8 vec0, vec1;
2837  v8i16 dst0, dst1, dst2, dst3;
2838  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2839  v8i16 filter_vec;
2840  v4i32 weight_vec, offset_vec, rnd_vec;
2841 
2842  src0_ptr -= 1;
2843 
2844  filter_vec = LD_SH(filter);
2845  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2846 
2847  offset = (offset0 + offset1) << rnd_val;
2848  weight0 = weight0 & 0x0000FFFF;
2849  weight = weight0 | (weight1 << 16);
2850  constant = 128 * weight1;
2851  constant <<= 6;
2852  offset += constant;
2853 
2854  offset_vec = __msa_fill_w(offset);
2855  weight_vec = __msa_fill_w(weight);
2856  rnd_vec = __msa_fill_w(rnd_val + 1);
2857 
2858  mask1 = mask0 + 2;
2859 
2860  for (loop_cnt = (height >> 3); loop_cnt--;) {
2861  LD_SB8(src0_ptr, src_stride,
2862  src0, src1, src2, src3, src4, src5, src6, src7);
2863  src0_ptr += (8 * src_stride);
2864  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2865  src1_ptr += (4 * src2_stride);
2866  LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2867  src1_ptr += (4 * src2_stride);
2868  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2869  ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2870  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2871 
2872  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2873  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2874  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2875  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2876  VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2877  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2878  VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2879  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2880  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2881  in0, in1, in2, in3,
2882  weight_vec, rnd_vec, offset_vec,
2883  dst0, dst1, dst2, dst3);
2884 
2885  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2886  ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2887  dst += (8 * dst_stride);
2888  }
2889 }
2890 
/*
 * Width-4 horizontal 4-tap bi-weighted prediction: picks the kernel that
 * matches height.  Heights 2 and 4 have dedicated kernels, multiples of 8
 * go to the 4x8multiple kernel, and any other height does nothing.
 */
static void hevc_hz_biwgt_4t_4w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1,
                                 rnd_val);
        break;
    case 4:
        hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1,
                                 rnd_val);
        break;
    default:
        if (0 == (height % 8)) {
            hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
                                             src1_ptr, src2_stride,
                                             dst, dst_stride, filter,
                                             height,
                                             weight0, weight1,
                                             offset0, offset1,
                                             rnd_val);
        }
        break;
    }
}
2921 
2922 static void hevc_hz_biwgt_4t_6w_msa(const uint8_t *src0_ptr,
2923  int32_t src_stride,
2924  const int16_t *src1_ptr,
2925  int32_t src2_stride,
2926  uint8_t *dst,
2927  int32_t dst_stride,
2928  const int8_t *filter,
2929  int32_t height,
2930  int32_t weight0,
2931  int32_t weight1,
2932  int32_t offset0,
2933  int32_t offset1,
2934  int32_t rnd_val)
2935 {
2936  uint32_t loop_cnt;
2937  int32_t offset, weight, constant;
2938  v8i16 filt0, filt1;
2939  v16i8 src0, src1, src2, src3;
2940  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2941  v16i8 mask1;
2942  v16i8 vec0, vec1;
2943  v8i16 in0, in1, in2, in3;
2944  v8i16 dst0, dst1, dst2, dst3;
2945  v8i16 filter_vec;
2946  v4i32 weight_vec, offset_vec, rnd_vec;
2947 
2948  src0_ptr -= 1;
2949 
2950  filter_vec = LD_SH(filter);
2951  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2952 
2953  offset = (offset0 + offset1) << rnd_val;
2954  weight0 = weight0 & 0x0000FFFF;
2955  weight = weight0 | (weight1 << 16);
2956  constant = 128 * weight1;
2957  constant <<= 6;
2958  offset += constant;
2959 
2960  offset_vec = __msa_fill_w(offset);
2961  weight_vec = __msa_fill_w(weight);
2962  rnd_vec = __msa_fill_w(rnd_val + 1);
2963 
2964  mask1 = mask0 + 2;
2965 
2966  for (loop_cnt = 2; loop_cnt--;) {
2967  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2968  src0_ptr += (4 * src_stride);
2969  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2970  src1_ptr += (4 * src2_stride);
2971  XORI_B4_128_SB(src0, src1, src2, src3);
2972 
2973  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2974  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2975  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2976  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2977  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2978  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2979  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2980  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2981 
2982  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2983  in0, in1, in2, in3,
2984  weight_vec, rnd_vec, offset_vec,
2985  dst0, dst1, dst2, dst3);
2986 
2987  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2988  ST_W2(dst0, 0, 2, dst, dst_stride);
2989  ST_H2(dst0, 2, 6, dst + 4, dst_stride);
2990  ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
2991  ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2992  dst += (4 * dst_stride);
2993  }
2994 }
2995 
2996 static void hevc_hz_biwgt_4t_8x2_msa(const uint8_t *src0_ptr,
2997  int32_t src_stride,
2998  const int16_t *src1_ptr,
2999  int32_t src2_stride,
3000  uint8_t *dst,
3001  int32_t dst_stride,
3002  const int8_t *filter,
3003  int32_t weight0,
3004  int32_t weight1,
3005  int32_t offset0,
3006  int32_t offset1,
3007  int32_t rnd_val)
3008 {
3009  int32_t offset, weight, constant;
3010  v8i16 filt0, filt1;
3011  v16i8 src0, src1;
3012  v8i16 in0, in1;
3013  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3014  v16i8 mask1, vec0, vec1;
3015  v8i16 dst0, dst1;
3016  v8i16 filter_vec;
3017  v4i32 weight_vec, offset_vec, rnd_vec;
3018 
3019  src0_ptr -= 1;
3020 
3021  filter_vec = LD_SH(filter);
3022  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3023 
3024  offset = (offset0 + offset1) << rnd_val;
3025  weight0 = weight0 & 0x0000FFFF;
3026  weight = weight0 | (weight1 << 16);
3027  constant = 128 * weight1;
3028  constant <<= 6;
3029  offset += constant;
3030 
3031  offset_vec = __msa_fill_w(offset);
3032  weight_vec = __msa_fill_w(weight);
3033  rnd_vec = __msa_fill_w(rnd_val + 1);
3034 
3035  mask1 = mask0 + 2;
3036 
3037  LD_SB2(src0_ptr, src_stride, src0, src1);
3038  LD_SH2(src1_ptr, src2_stride, in0, in1);
3040  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3041  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3042  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3043  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3044  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
3045  weight_vec, rnd_vec, offset_vec,
3046  dst0, dst1);
3047 
3048  dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3049  ST_D2(dst0, 0, 1, dst, dst_stride);
3050 }
3051 
3052 static void hevc_hz_biwgt_4t_8x6_msa(const uint8_t *src0_ptr,
3053  int32_t src_stride,
3054  const int16_t *src1_ptr,
3055  int32_t src2_stride,
3056  uint8_t *dst,
3057  int32_t dst_stride,
3058  const int8_t *filter,
3059  int32_t weight0,
3060  int32_t weight1,
3061  int32_t offset0,
3062  int32_t offset1,
3063  int32_t rnd_val)
3064 {
3065  int32_t weight, offset, constant;
3066  v8i16 filt0, filt1;
3067  v16i8 src0, src1, src2, src3, src4, src5;
3068  v8i16 in0, in1, in2, in3, in4, in5;
3069  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3070  v16i8 mask1;
3071  v16i8 vec0, vec1;
3072  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3073  v8i16 filter_vec;
3074  v4i32 weight_vec, offset_vec, rnd_vec;
3075 
3076  src0_ptr -= 1;
3077 
3078  filter_vec = LD_SH(filter);
3079  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3080 
3081  offset = (offset0 + offset1) << rnd_val;
3082  weight0 = weight0 & 0x0000FFFF;
3083  weight = weight0 | (weight1 << 16);
3084  constant = 128 * weight1;
3085  constant <<= 6;
3086  offset += constant;
3087 
3088  offset_vec = __msa_fill_w(offset);
3089  weight_vec = __msa_fill_w(weight);
3090  rnd_vec = __msa_fill_w(rnd_val + 1);
3091 
3092  mask1 = mask0 + 2;
3093 
3094  LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
3095 
3096  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3097  src1_ptr += (4 * src2_stride);
3098  LD_SH2(src1_ptr, src2_stride, in4, in5);
3099  XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
3100  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3101  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3102  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3103  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3104  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3105  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3106  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3107  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3108  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3109  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3110  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3111  dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3112  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3113  in0, in1, in2, in3,
3114  weight_vec, rnd_vec, offset_vec,
3115  dst0, dst1, dst2, dst3);
3116  HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3117  weight_vec, rnd_vec, offset_vec,
3118  dst4, dst5);
3119 
3120  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3121  dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3122  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3123  ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
3124 }
3125 
3126 static void hevc_hz_biwgt_4t_8x4multiple_msa(const uint8_t *src0_ptr,
3127  int32_t src_stride,
3128  const int16_t *src1_ptr,
3129  int32_t src2_stride,
3130  uint8_t *dst,
3131  int32_t dst_stride,
3132  const int8_t *filter,
3133  int32_t height,
3134  int32_t weight0,
3135  int32_t weight1,
3136  int32_t offset0,
3137  int32_t offset1,
3138  int32_t rnd_val)
3139 {
3140  uint32_t loop_cnt;
3141  int32_t offset, weight, constant;
3142  v8i16 filt0, filt1;
3143  v16i8 src0, src1, src2, src3;
3144  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3145  v16i8 mask1;
3146  v16i8 vec0, vec1;
3147  v8i16 in0, in1, in2, in3;
3148  v8i16 dst0, dst1, dst2, dst3;
3149  v8i16 filter_vec;
3150  v4i32 weight_vec, offset_vec, rnd_vec;
3151 
3152  src0_ptr -= 1;
3153 
3154  filter_vec = LD_SH(filter);
3155  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3156 
3157  offset = (offset0 + offset1) << rnd_val;
3158  weight0 = weight0 & 0x0000FFFF;
3159  weight = weight0 | (weight1 << 16);
3160  constant = 128 * weight1;
3161  constant <<= 6;
3162  offset += constant;
3163 
3164  offset_vec = __msa_fill_w(offset);
3165  weight_vec = __msa_fill_w(weight);
3166  rnd_vec = __msa_fill_w(rnd_val + 1);
3167 
3168  mask1 = mask0 + 2;
3169 
3170  for (loop_cnt = (height >> 2); loop_cnt--;) {
3171  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3172  src0_ptr += (4 * src_stride);
3173  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3174  src1_ptr += (4 * src2_stride);
3175  XORI_B4_128_SB(src0, src1, src2, src3);
3176 
3177  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3178  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3179  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3180  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3181  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3182  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3183  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3184  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3185  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3186  in0, in1, in2, in3,
3187  weight_vec, rnd_vec, offset_vec,
3188  dst0, dst1, dst2, dst3);
3189 
3190  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3191  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3192  dst += (4 * dst_stride);
3193  }
3194 }
3195 
/*
 * Width-8 horizontal 4-tap bi-weighted prediction: picks the kernel that
 * matches height.  Heights 2 and 6 have dedicated kernels, multiples of 4
 * go to the 8x4multiple kernel, and any other height does nothing.
 */
static void hevc_hz_biwgt_4t_8w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    if (height == 2) {
        hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1,
                                 rnd_val);
        return;
    }

    if (height == 6) {
        hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1,
                                 rnd_val);
        return;
    }

    if ((height % 4) != 0)
        return;

    hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter, height,
                                     weight0, weight1, offset0, offset1,
                                     rnd_val);
}
3226 
3227 static void hevc_hz_biwgt_4t_12w_msa(const uint8_t *src0_ptr,
3228  int32_t src_stride,
3229  const int16_t *src1_ptr,
3230  int32_t src2_stride,
3231  uint8_t *dst,
3232  int32_t dst_stride,
3233  const int8_t *filter,
3234  int32_t height,
3235  int32_t weight0,
3236  int32_t weight1,
3237  int32_t offset0,
3238  int32_t offset1,
3239  int32_t rnd_val)
3240 {
3241  uint32_t loop_cnt;
3242  int32_t offset, weight, constant;
3243  v8i16 filt0, filt1;
3244  v16i8 src0, src1, src2, src3;
3245  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3246  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3247  v16i8 mask2 = {
3248  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
3249  };
3250  v16i8 mask1, mask3;
3251  v16i8 vec0, vec1;
3252  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3253  v8i16 filter_vec;
3254  v4i32 weight_vec, offset_vec, rnd_vec;
3255 
3256  src0_ptr -= 1;
3257 
3258  filter_vec = LD_SH(filter);
3259  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3260 
3261  offset = (offset0 + offset1) << rnd_val;
3262  weight0 = weight0 & 0x0000FFFF;
3263  weight = weight0 | (weight1 << 16);
3264  constant = 128 * weight1;
3265  constant <<= 6;
3266  offset += constant;
3267 
3268  offset_vec = __msa_fill_w(offset);
3269  weight_vec = __msa_fill_w(weight);
3270  rnd_vec = __msa_fill_w(rnd_val + 1);
3271 
3272  mask1 = mask0 + 2;
3273  mask3 = mask2 + 2;
3274 
3275  for (loop_cnt = 4; loop_cnt--;) {
3276  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3277  src0_ptr += (4 * src_stride);
3278  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3279  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
3280  src1_ptr += (4 * src2_stride);
3281  ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3282  XORI_B4_128_SB(src0, src1, src2, src3);
3283 
3284  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3285  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3286  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3287  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3288  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3289  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3290  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3291  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3292  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3293  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3294  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3295  dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3296 
3297  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3298  in0, in1, in2, in3,
3299  weight_vec, rnd_vec, offset_vec,
3300  dst0, dst1, dst2, dst3);
3301  HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3302  weight_vec, rnd_vec, offset_vec,
3303  dst4, dst5);
3304 
3305  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3306  dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3307  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3308  ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
3309  dst += (4 * dst_stride);
3310  }
3311 }
3312 
3313 static void hevc_hz_biwgt_4t_16w_msa(const uint8_t *src0_ptr,
3314  int32_t src_stride,
3315  const int16_t *src1_ptr,
3316  int32_t src2_stride,
3317  uint8_t *dst,
3318  int32_t dst_stride,
3319  const int8_t *filter,
3320  int32_t height,
3321  int32_t weight0,
3322  int32_t weight1,
3323  int32_t offset0,
3324  int32_t offset1,
3325  int32_t rnd_val)
3326 {
3327  uint32_t loop_cnt;
3328  int32_t offset, weight, constant;
3329  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3330  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3331  v8i16 filt0, filt1;
3332  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3333  v16i8 mask1;
3334  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3335  v16i8 vec0, vec1;
3336  v8i16 filter_vec;
3337  v4i32 weight_vec, offset_vec, rnd_vec;
3338 
3339  src0_ptr -= 1;
3340 
3341  filter_vec = LD_SH(filter);
3342  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3343 
3344  offset = (offset0 + offset1) << rnd_val;
3345  weight0 = weight0 & 0x0000FFFF;
3346  weight = weight0 | (weight1 << 16);
3347  constant = 128 * weight1;
3348  constant <<= 6;
3349  offset += constant;
3350 
3351  offset_vec = __msa_fill_w(offset);
3352  weight_vec = __msa_fill_w(weight);
3353  rnd_vec = __msa_fill_w(rnd_val + 1);
3354 
3355  mask1 = mask0 + 2;
3356 
3357  for (loop_cnt = (height >> 2); loop_cnt--;) {
3358  LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
3359  LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
3360  src0_ptr += (4 * src_stride);
3361  LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
3362  LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
3363  src1_ptr += (4 * src2_stride);
3364  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3365 
3366  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3367  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3368  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3369  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3370  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3371  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3372  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3373  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3374  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3375  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3376  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3377  dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3378  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3379  dst6 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3380  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3381  dst7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3382  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3383  in0, in1, in2, in3,
3384  weight_vec, rnd_vec, offset_vec,
3385  dst0, dst1, dst2, dst3);
3386 
3387  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3388  ST_SH2(dst0, dst1, dst, dst_stride);
3389  dst += (2 * dst_stride);
3390 
3391  HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
3392  in4, in5, in6, in7,
3393  weight_vec, rnd_vec, offset_vec,
3394  dst0, dst1, dst2, dst3);
3395 
3396  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3397  ST_SH2(dst0, dst1, dst, dst_stride);
3398  dst += (2 * dst_stride);
3399  }
3400 }
3401 
3402 static void hevc_hz_biwgt_4t_24w_msa(const uint8_t *src0_ptr,
3403  int32_t src_stride,
3404  const int16_t *src1_ptr,
3405  int32_t src2_stride,
3406  uint8_t *dst,
3407  int32_t dst_stride,
3408  const int8_t *filter,
3409  int32_t height,
3410  int32_t weight0,
3411  int32_t weight1,
3412  int32_t offset0,
3413  int32_t offset1,
3414  int32_t rnd_val)
3415 {
3416  uint32_t loop_cnt;
3417  int32_t offset, weight, constant;
3418  v16i8 src0, src1, src2, src3;
3419  v8i16 filt0, filt1;
3420  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3421  v16i8 mask1, mask2, mask3;
3422  v16i8 vec0, vec1;
3423  v8i16 dst0, dst1, dst2, dst3;
3424  v8i16 in0, in1, in2, in3, in4, in5;
3425  v8i16 filter_vec;
3426  v4i32 weight_vec, offset_vec, rnd_vec;
3427 
3428  src0_ptr -= 1;
3429 
3430  filter_vec = LD_SH(filter);
3431  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3432 
3433  offset = (offset0 + offset1) << rnd_val;
3434  weight0 = weight0 & 0x0000FFFF;
3435  weight = weight0 | (weight1 << 16);
3436  constant = 128 * weight1;
3437  constant <<= 6;
3438  offset += constant;
3439 
3440  offset_vec = __msa_fill_w(offset);
3441  weight_vec = __msa_fill_w(weight);
3442  rnd_vec = __msa_fill_w(rnd_val + 1);
3443 
3444  mask1 = mask0 + 2;
3445  mask2 = mask0 + 8;
3446  mask3 = mask0 + 10;
3447 
3448  for (loop_cnt = 16; loop_cnt--;) {
3449  LD_SB2(src0_ptr, src_stride, src0, src2);
3450  LD_SB2(src0_ptr + 16, src_stride, src1, src3);
3451  src0_ptr += (2 * src_stride);
3452  LD_SH2(src1_ptr, src2_stride, in0, in2);
3453  LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
3454  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
3455  src1_ptr += (2 * src2_stride);
3456  XORI_B4_128_SB(src0, src1, src2, src3);
3457 
3458  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3459  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3460  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3461  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3462  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3463  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3464  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3465  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3466  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3467  in0, in1, in2, in3,
3468  weight_vec, rnd_vec, offset_vec,
3469  dst0, dst1, dst2, dst3);
3470 
3471  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3472  ST_SH2(dst0, dst1, dst, dst_stride);
3473 
3474  /* 8 width */
3475  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3476  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3477  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3478  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3479  HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
3480  weight_vec, rnd_vec, offset_vec,
3481  dst0, dst1);
3482 
3483  dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3484  ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
3485  dst += (2 * dst_stride);
3486  }
3487 }
3488 
3489 static void hevc_hz_biwgt_4t_32w_msa(const uint8_t *src0_ptr,
3490  int32_t src_stride,
3491  const int16_t *src1_ptr,
3492  int32_t src2_stride,
3493  uint8_t *dst,
3494  int32_t dst_stride,
3495  const int8_t *filter,
3496  int32_t height,
3497  int32_t weight0,
3498  int32_t weight1,
3499  int32_t offset0,
3500  int32_t offset1,
3501  int32_t rnd_val)
3502 {
3503  uint32_t loop_cnt;
3504  int32_t offset, weight, constant;
3505  v16i8 src0, src1, src2;
3506  v8i16 filt0, filt1;
3507  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3508  v16i8 mask1, mask2, mask3;
3509  v8i16 dst0, dst1, dst2, dst3;
3510  v16i8 vec0, vec1;
3511  v8i16 in0, in1, in2, in3;
3512  v8i16 filter_vec;
3513  v4i32 weight_vec, offset_vec, rnd_vec;
3514 
3515  src0_ptr -= 1;
3516 
3517  filter_vec = LD_SH(filter);
3518  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3519 
3520  offset = (offset0 + offset1) << rnd_val;
3521  weight0 = weight0 & 0x0000FFFF;
3522  weight = weight0 | (weight1 << 16);
3523  constant = 128 * weight1;
3524  constant <<= 6;
3525  offset += constant;
3526 
3527  offset_vec = __msa_fill_w(offset);
3528  weight_vec = __msa_fill_w(weight);
3529  rnd_vec = __msa_fill_w(rnd_val + 1);
3530 
3531  mask1 = mask0 + 2;
3532  mask2 = mask0 + 8;
3533  mask3 = mask0 + 10;
3534 
3535  for (loop_cnt = height; loop_cnt--;) {
3536  LD_SB2(src0_ptr, 16, src0, src1);
3537  src2 = LD_SB(src0_ptr + 24);
3538  src0_ptr += src_stride;
3539  LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
3540  src1_ptr += src2_stride;
3542 
3543  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3544  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3545  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3546  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3547  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3548  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3549  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3550  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3551  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3552  in0, in1, in2, in3,
3553  weight_vec, rnd_vec, offset_vec,
3554  dst0, dst1, dst2, dst3);
3555 
3556  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3557  ST_SH2(dst0, dst1, dst, 16);
3558  dst += dst_stride;
3559  }
3560 }
3561 
3562 static void hevc_vt_biwgt_4t_4x2_msa(const uint8_t *src0_ptr,
3563  int32_t src_stride,
3564  const int16_t *src1_ptr,
3565  int32_t src2_stride,
3566  uint8_t *dst,
3567  int32_t dst_stride,
3568  const int8_t *filter,
3569  int32_t weight0,
3570  int32_t weight1,
3571  int32_t offset0,
3572  int32_t offset1,
3573  int32_t rnd_val)
3574 {
3575  int32_t weight, offset, constant;
3576  v16i8 src0, src1, src2, src3, src4;
3577  v8i16 in0, in1, dst10;
3578  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
3579  v4i32 dst10_r, dst10_l;
3580  v8i16 filt0, filt1;
3581  v8i16 filter_vec, out;
3582  v4i32 weight_vec, offset_vec, rnd_vec;
3583 
3584  src0_ptr -= src_stride;
3585 
3586  offset = (offset0 + offset1) << rnd_val;
3587  weight0 = weight0 & 0x0000FFFF;
3588  weight = weight0 | (weight1 << 16);
3589  constant = 128 * weight1;
3590  constant <<= 6;
3591  offset += constant;
3592 
3593  offset_vec = __msa_fill_w(offset);
3594  weight_vec = __msa_fill_w(weight);
3595  rnd_vec = __msa_fill_w(rnd_val + 1);
3596 
3597  filter_vec = LD_SH(filter);
3598  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3599 
3600  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3601  src0_ptr += (3 * src_stride);
3602  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3603  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3604  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3605  LD_SB2(src0_ptr, src_stride, src3, src4);
3606  src0_ptr += (2 * src_stride);
3607  LD_SH2(src1_ptr, src2_stride, in0, in1);
3608  src1_ptr += (2 * src2_stride);
3609 
3610  in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3611  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3612  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3613  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3614 
3615  dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3616 
3617  ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);
3618  dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
3619  dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
3620  SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
3621  CLIP_SW2_0_255(dst10_r, dst10_l);
3622  out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
3623  out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
3624  ST_W2(out, 0, 1, dst, dst_stride);
3625 }
3626 
3627 static void hevc_vt_biwgt_4t_4x4_msa(const uint8_t *src0_ptr,
3628  int32_t src_stride,
3629  const int16_t *src1_ptr,
3630  int32_t src2_stride,
3631  uint8_t *dst,
3632  int32_t dst_stride,
3633  const int8_t *filter,
3634  int32_t weight0,
3635  int32_t weight1,
3636  int32_t offset0,
3637  int32_t offset1,
3638  int32_t rnd_val)
3639 {
3640  int32_t weight, offset, constant;
3641  v16i8 src0, src1, src2, src3, src4, src5, src6;
3642  v8i16 in0, in1, in2, in3;
3643  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3644  v16i8 src2110, src4332, src6554;
3645  v8i16 dst10, dst32;
3646  v8i16 filt0, filt1;
3647  v8i16 filter_vec;
3648  v4i32 weight_vec, offset_vec, rnd_vec;
3649 
3650  src0_ptr -= src_stride;
3651 
3652  offset = (offset0 + offset1) << rnd_val;
3653  weight0 = weight0 & 0x0000FFFF;
3654  weight = weight0 | (weight1 << 16);
3655  constant = 128 * weight1;
3656  constant <<= 6;
3657  offset += constant;
3658 
3659  offset_vec = __msa_fill_w(offset);
3660  weight_vec = __msa_fill_w(weight);
3661  rnd_vec = __msa_fill_w(rnd_val + 1);
3662 
3663  filter_vec = LD_SH(filter);
3664  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3665 
3666  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3667  src0_ptr += (3 * src_stride);
3668  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3669  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3670  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3671 
3672  LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3673  src0_ptr += (4 * src_stride);
3674  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3675  src1_ptr += (4 * src2_stride);
3676  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3677  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3678  src32_r, src43_r, src54_r, src65_r);
3679  ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3680  XORI_B2_128_SB(src4332, src6554);
3681 
3682  dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3683  dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3684 
3685  HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
3686  weight_vec, rnd_vec, offset_vec,
3687  dst10, dst32);
3688 
3689  dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
3690  ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
3691  dst += (4 * dst_stride);
3692 }
3693 
3694 static void hevc_vt_biwgt_4t_4x8multiple_msa(const uint8_t *src0_ptr,
3695  int32_t src_stride,
3696  const int16_t *src1_ptr,
3697  int32_t src2_stride,
3698  uint8_t *dst,
3699  int32_t dst_stride,
3700  const int8_t *filter,
3701  int32_t height,
3702  int32_t weight0,
3703  int32_t weight1,
3704  int32_t offset0,
3705  int32_t offset1,
3706  int32_t rnd_val)
3707 {
3708  uint32_t loop_cnt;
3709  int32_t weight, offset, constant;
3710  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3711  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3712  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3713  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3714  v16i8 src2110, src4332, src6554, src8776;
3715  v8i16 dst10, dst32, dst54, dst76;
3716  v8i16 filt0, filt1;
3717  v8i16 filter_vec;
3718  v4i32 weight_vec, offset_vec, rnd_vec;
3719 
3720  src0_ptr -= src_stride;
3721 
3722  offset = (offset0 + offset1) << rnd_val;
3723  weight0 = weight0 & 0x0000FFFF;
3724  weight = weight0 | (weight1 << 16);
3725  constant = 128 * weight1;
3726  constant <<= 6;
3727  offset += constant;
3728 
3729  offset_vec = __msa_fill_w(offset);
3730  weight_vec = __msa_fill_w(weight);
3731  rnd_vec = __msa_fill_w(rnd_val + 1);
3732 
3733  filter_vec = LD_SH(filter);
3734  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3735 
3736  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3737  src0_ptr += (3 * src_stride);
3738  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3739  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3740  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3741 
3742  for (loop_cnt = (height >> 3); loop_cnt--;) {
3743  LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3744  src0_ptr += (6 * src_stride);
3745  LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3746  src1_ptr += (8 * src2_stride);
3747 
3748  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3749  ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3750 
3751  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3752  src32_r, src43_r, src54_r, src65_r);
3753  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3754  ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3755  src4332, src6554, src8776);
3756  XORI_B3_128_SB(src4332, src6554, src8776);
3757 
3758  dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3759  dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3760  dst54 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3761 
3762  LD_SB2(src0_ptr, src_stride, src9, src2);
3763  src0_ptr += (2 * src_stride);
3764  ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3765  src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3766  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3767 
3768  dst76 = HEVC_FILT_4TAP_SH(src8776, src2110, filt0, filt1);
3769  HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
3770  in0, in1, in2, in3,
3771  weight_vec, rnd_vec, offset_vec,
3772  dst10, dst32, dst54, dst76);
3773 
3774  PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
3775  ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3776  dst += (8 * dst_stride);
3777  }
3778 }
3779 
/*
 * Dispatcher for the 4-pixel-wide vertical 4-tap bi-weighted kernels.
 *
 * Heights 2 and 4 have dedicated kernels and any multiple of 8 goes to the
 * 8-row kernel. Any other height is silently ignored (nothing is written).
 */
static void hevc_vt_biwgt_4t_4w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    if (height == 2) {
        hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        return;
    }

    if (height == 4) {
        hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        return;
    }

    if ((height % 8) == 0) {
        hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter, height,
                                         weight0, weight1, offset0, offset1,
                                         rnd_val);
    }
}
3810 
3811 static void hevc_vt_biwgt_4t_6w_msa(const uint8_t *src0_ptr,
3812  int32_t src_stride,
3813  const int16_t *src1_ptr,
3814  int32_t src2_stride,
3815  uint8_t *dst,
3816  int32_t dst_stride,
3817  const int8_t *filter,
3818  int32_t height,
3819  int32_t weight0,
3820  int32_t weight1,
3821  int32_t offset0,
3822  int32_t offset1,
3823  int32_t rnd_val)
3824 {
3825  uint32_t loop_cnt;
3826  int32_t res = height & 0x03;
3827  int32_t offset, weight, constant;
3828  v16i8 src0, src1, src2, src3, src4;
3829  v8i16 in0, in1, in2, in3;
3830  v16i8 src10_r, src32_r, src21_r, src43_r;
3831  v8i16 tmp0, tmp1, tmp2, tmp3;
3832  v8i16 filt0, filt1;
3833  v8i16 filter_vec;
3834  v4i32 weight_vec, offset_vec, rnd_vec;
3835 
3836  src0_ptr -= src_stride;
3837 
3838  offset = (offset0 + offset1) << rnd_val;
3839  weight0 = weight0 & 0x0000FFFF;
3840  weight = weight0 | (weight1 << 16);
3841  constant = 128 * weight1;
3842  constant <<= 6;
3843  offset += constant;
3844 
3845  offset_vec = __msa_fill_w(offset);
3846  weight_vec = __msa_fill_w(weight);
3847  rnd_vec = __msa_fill_w(rnd_val + 1);
3848 
3849  filter_vec = LD_SH(filter);
3850  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3851 
3852  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3853  src0_ptr += (3 * src_stride);
3855  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3856 
3857  for (loop_cnt = (height >> 2); loop_cnt--;) {
3858  LD_SB2(src0_ptr, src_stride, src3, src4);
3859  src0_ptr += (2 * src_stride);
3860  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3861  src1_ptr += (4 * src2_stride);
3862  XORI_B2_128_SB(src3, src4);
3863  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3864 
3865  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3866  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3867 
3868  LD_SB2(src0_ptr, src_stride, src1, src2);
3869  src0_ptr += (2 * src_stride);
3871  ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3872 
3873  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
3874  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
3875  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3876  in0, in1, in2, in3,
3877  weight_vec, rnd_vec, offset_vec,
3878  tmp0, tmp1, tmp2, tmp3);
3879 
3880  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3881  ST_W2(tmp0, 0, 2, dst, dst_stride);
3882  ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
3883  ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
3884  ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3885  dst += (4 * dst_stride);
3886  }
3887  if (res) {
3888  LD_SB2(src0_ptr, src_stride, src3, src4);
3889  src0_ptr += (2 * src_stride);
3890  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3891  src1_ptr += (4 * src2_stride);
3892  XORI_B2_128_SB(src3, src4);
3893  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3894 
3895  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3896  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3897 
3898  LD_SB2(src0_ptr, src_stride, src1, src2);
3899  src0_ptr += (2 * src_stride);
3901  ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3902 
3903  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
3904  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
3905  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3906  in0, in1, in2, in3,
3907  weight_vec, rnd_vec, offset_vec,
3908  tmp0, tmp1, tmp2, tmp3);
3909 
3910  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3911  ST_W2(tmp0, 0, 2, dst, dst_stride);
3912  ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
3913  ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
3914  ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3915  }
3916 }
3917 
3918 static void hevc_vt_biwgt_4t_8x2_msa(const uint8_t *src0_ptr,
3919  int32_t src_stride,
3920  const int16_t *src1_ptr,
3921  int32_t src2_stride,
3922  uint8_t *dst,
3923  int32_t dst_stride,
3924  const int8_t *filter,
3925  int32_t weight0,
3926  int32_t weight1,
3927  int32_t offset0,
3928  int32_t offset1,
3929  int32_t rnd_val)
3930 {
3931  int32_t offset, weight, constant;
3932  v16i8 src0, src1, src2, src3, src4;
3933  v8i16 in0, in1, tmp0, tmp1;
3934  v16i8 src10_r, src32_r, src21_r, src43_r;
3935  v8i16 filt0, filt1;
3936  v8i16 filter_vec;
3937  v4i32 weight_vec, offset_vec, rnd_vec;
3938 
3939  src0_ptr -= src_stride;
3940 
3941  offset = (offset0 + offset1) << rnd_val;
3942  weight0 = weight0 & 0x0000FFFF;
3943  weight = weight0 | (weight1 << 16);
3944  constant = 128 * weight1;
3945  constant <<= 6;
3946  offset += constant;
3947 
3948  offset_vec = __msa_fill_w(offset);
3949  weight_vec = __msa_fill_w(weight);
3950  rnd_vec = __msa_fill_w(rnd_val + 1);
3951 
3952  filter_vec = LD_SH(filter);
3953  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3954 
3955  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3956  src0_ptr += (3 * src_stride);
3958  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3959 
3960  LD_SB2(src0_ptr, src_stride, src3, src4);
3961  LD_SH2(src1_ptr, src2_stride, in0, in1);
3962  XORI_B2_128_SB(src3, src4);
3963  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3964 
3965  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3966  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3967  HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
3968  weight_vec, rnd_vec, offset_vec,
3969  tmp0, tmp1);
3970 
3971  tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3972  ST_D2(tmp0, 0, 1, dst, dst_stride);
3973 }
3974 
3975 static void hevc_vt_biwgt_4t_8x6_msa(const uint8_t *src0_ptr,
3976  int32_t src_stride,
3977  const int16_t *src1_ptr,
3978  int32_t src2_stride,
3979  uint8_t *dst,
3980  int32_t dst_stride,
3981  const int8_t *filter,
3982  int32_t weight0,
3983  int32_t weight1,
3984  int32_t offset0,
3985  int32_t offset1,
3986  int32_t rnd_val)
3987 {
3988  int32_t offset, weight, constant;
3989  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3990  v8i16 in0, in1, in2, in3, in4, in5;
3991  v16i8 src10_r, src32_r, src54_r, src76_r;
3992  v16i8 src21_r, src43_r, src65_r, src87_r;
3993  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3994  v8i16 filt0, filt1;
3995  v8i16 filter_vec;
3996  v4i32 weight_vec, offset_vec, rnd_vec;
3997 
3998  src0_ptr -= src_stride;
3999 
4000  offset = (offset0 + offset1) << rnd_val;
4001  weight0 = weight0 & 0x0000FFFF;
4002  weight = weight0 | (weight1 << 16);
4003  constant = 128 * weight1;
4004  constant <<= 6;
4005  offset += constant;
4006 
4007  offset_vec = __msa_fill_w(offset);
4008  weight_vec = __msa_fill_w(weight);
4009  rnd_vec = __msa_fill_w(rnd_val + 1);
4010 
4011  filter_vec = LD_SH(filter);
4012  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4013 
4014  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4015  src0_ptr += (3 * src_stride);
4017  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4018 
4019  LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
4020  LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
4021  XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
4022  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
4023  src32_r, src43_r, src54_r, src65_r);
4024  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4025 
4026  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4027  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4028  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
4029  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
4030  tmp4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
4031  tmp5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
4032  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4033  in0, in1, in2, in3,
4034  weight_vec, rnd_vec, offset_vec,
4035  tmp0, tmp1, tmp2, tmp3);
4036  HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
4037  weight_vec, rnd_vec, offset_vec,
4038  tmp4, tmp5);
4039 
4040  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4041  tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4042  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4043  ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);
4044 }
4045 
4046 static void hevc_vt_biwgt_4t_8x4multiple_msa(const uint8_t *src0_ptr,
4047  int32_t src_stride,
4048  const int16_t *src1_ptr,
4049  int32_t src2_stride,
4050  uint8_t *dst,
4051  int32_t dst_stride,
4052  const int8_t *filter,
4053  int32_t height,
4054  int32_t weight0,
4055  int32_t weight1,
4056  int32_t offset0,
4057  int32_t offset1,
4058  int32_t rnd_val)
4059 {
4060  uint32_t loop_cnt;
4061  int32_t offset, weight, constant;
4062  v16i8 src0, src1, src2, src3, src4;
4063  v8i16 in0, in1, in2, in3;
4064  v16i8 src10_r, src32_r, src21_r, src43_r;
4065  v8i16 tmp0, tmp1, tmp2, tmp3;
4066  v8i16 filt0, filt1;
4067  v8i16 filter_vec;
4068  v4i32 weight_vec, offset_vec, rnd_vec;
4069 
4070  src0_ptr -= src_stride;
4071 
4072  offset = (offset0 + offset1) << rnd_val;
4073  weight0 = weight0 & 0x0000FFFF;
4074  weight = weight0 | (weight1 << 16);
4075  constant = 128 * weight1;
4076  constant <<= 6;
4077  offset += constant;
4078 
4079  offset_vec = __msa_fill_w(offset);
4080  weight_vec = __msa_fill_w(weight);
4081  rnd_vec = __msa_fill_w(rnd_val + 1);
4082 
4083  filter_vec = LD_SH(filter);
4084  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4085 
4086  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4087  src0_ptr += (3 * src_stride);
4089  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4090 
4091  for (loop_cnt = (height >> 2); loop_cnt--;) {
4092  LD_SB2(src0_ptr, src_stride, src3, src4);
4093  src0_ptr += (2 * src_stride);
4094  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4095  src1_ptr += (4 * src2_stride);
4096  XORI_B2_128_SB(src3, src4);
4097  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4098 
4099  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4100  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4101 
4102  LD_SB2(src0_ptr, src_stride, src1, src2);
4103  src0_ptr += (2 * src_stride);
4105  ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
4106 
4107  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4108  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4109  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4110  in0, in1, in2, in3,
4111  weight_vec, rnd_vec, offset_vec,
4112  tmp0, tmp1, tmp2, tmp3);
4113 
4114  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4115  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4116  dst += (4 * dst_stride);
4117  }
4118 }
4119 
/*
 * Dispatcher for the 8-pixel-wide vertical 4-tap bi-weighted kernels.
 *
 * Heights 2 and 6 have dedicated kernels; every other height is forwarded
 * to the kernel that processes four rows per iteration.
 */
static void hevc_vt_biwgt_4t_8w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    case 6:
        hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    default:
        hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter, height,
                                         weight0, weight1, offset0, offset1,
                                         rnd_val);
        break;
    }
}
4150 
4151 static void hevc_vt_biwgt_4t_12w_msa(const uint8_t *src0_ptr,
4152  int32_t src_stride,
4153  const int16_t *src1_ptr,
4154  int32_t src2_stride,
4155  uint8_t *dst,
4156  int32_t dst_stride,
4157  const int8_t *filter,
4158  int32_t height,
4159  int32_t weight0,
4160  int32_t weight1,
4161  int32_t offset0,
4162  int32_t offset1,
4163  int32_t rnd_val)
4164 {
4165  uint32_t loop_cnt;
4166  int32_t offset, weight, constant;
4167  v16i8 src0, src1, src2, src3, src4, src5;
4168  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4169  v16i8 src10_r, src32_r, src21_r, src43_r;
4170  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4171  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4172  v16i8 src2110, src4332;
4173  v8i16 filt0, filt1;
4174  v8i16 filter_vec;
4175  v4i32 weight_vec, offset_vec, rnd_vec;
4176 
4177  src0_ptr -= (1 * src_stride);
4178 
4179  offset = (offset0 + offset1) << rnd_val;
4180  weight0 = weight0 & 0x0000FFFF;
4181  weight = weight0 | (weight1 << 16);
4182  constant = 128 * weight1;
4183  constant <<= 6;
4184  offset += constant;
4185 
4186  offset_vec = __msa_fill_w(offset);
4187  weight_vec = __msa_fill_w(weight);
4188  rnd_vec = __msa_fill_w(rnd_val + 1);
4189 
4190  filter_vec = LD_SH(filter);
4191  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4192 
4193  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4194  src0_ptr += (3 * src_stride);
4196  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4197  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4198  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
4199 
4200  for (loop_cnt = (height >> 2); loop_cnt--;) {
4201  LD_SB2(src0_ptr, src_stride, src3, src4);
4202  src0_ptr += (2 * src_stride);
4203  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4204  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
4205  src1_ptr += (4 * src2_stride);
4206  ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
4207  XORI_B2_128_SB(src3, src4);
4208 
4209  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4210  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4211  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
4212 
4213  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4214  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4215  tmp4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
4216 
4217  LD_SB2(src0_ptr, src_stride, src5, src2);
4218  src0_ptr += (2 * src_stride);
4219  XORI_B2_128_SB(src5, src2);
4220  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4221  ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
4222  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
4223 
4224  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4225  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4226  tmp5 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
4227  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4228  in0, in1, in2, in3,
4229  weight_vec, rnd_vec, offset_vec,
4230  tmp0, tmp1, tmp2, tmp3);
4231  HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
4232  weight_vec, rnd_vec, offset_vec,
4233  tmp4, tmp5);
4234 
4235  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4236  tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4237  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4238  ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
4239  dst += (4 * dst_stride);
4240  }
4241 }
4242 
4243 static void hevc_vt_biwgt_4t_16w_msa(const uint8_t *src0_ptr,
4244  int32_t src_stride,
4245  const int16_t *src1_ptr,
4246  int32_t src2_stride,
4247  uint8_t *dst,
4248  int32_t dst_stride,
4249  const int8_t *filter,
4250  int32_t height,
4251  int32_t weight0,
4252  int32_t weight1,
4253  int32_t offset0,
4254  int32_t offset1,
4255  int32_t rnd_val)
4256 {
4257  uint32_t loop_cnt;
4258  int32_t offset, weight, constant;
4259  v16i8 src0, src1, src2, src3, src4, src5;
4260  v8i16 in0, in1, in2, in3;
4261  v16i8 src10_r, src32_r, src21_r, src43_r;
4262  v16i8 src10_l, src32_l, src21_l, src43_l;
4263  v8i16 tmp0, tmp1, tmp2, tmp3;
4264  v8i16 filt0, filt1;
4265  v8i16 filter_vec;
4266  v4i32 weight_vec, offset_vec, rnd_vec;
4267 
4268  src0_ptr -= src_stride;
4269 
4270  offset = (offset0 + offset1) << rnd_val;
4271  weight0 = weight0 & 0x0000FFFF;
4272  weight = weight0 | (weight1 << 16);
4273  constant = 128 * weight1;
4274  constant <<= 6;
4275  offset += constant;
4276 
4277  offset_vec = __msa_fill_w(offset);
4278  weight_vec = __msa_fill_w(weight);
4279  rnd_vec = __msa_fill_w(rnd_val + 1);
4280 
4281  filter_vec = LD_SH(filter);
4282  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4283 
4284  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4285  src0_ptr += (3 * src_stride);
4287  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4288  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4289 
4290  for (loop_cnt = (height >> 2); loop_cnt--;) {
4291  LD_SB2(src0_ptr, src_stride, src3, src4);
4292  src0_ptr += (2 * src_stride);
4293  LD_SH2(src1_ptr, src2_stride, in0, in1);
4294  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4295  src1_ptr += (2 * src2_stride);
4296  XORI_B2_128_SB(src3, src4);
4297  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4298  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4299 
4300  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4301  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4302  tmp2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4303  tmp3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4304 
4305  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4306  in0, in1, in2, in3,
4307  weight_vec, rnd_vec, offset_vec,
4308  tmp0, tmp1, tmp2, tmp3);
4309  PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4310  ST_SH2(tmp0, tmp1, dst, dst_stride);
4311  dst += (2 * dst_stride);
4312  LD_SB2(src0_ptr, src_stride, src5, src2);
4313  src0_ptr += (2 * src_stride);
4314 
4315  LD_SH2(src1_ptr, src2_stride, in0, in1);
4316  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4317  src1_ptr += (2 * src2_stride);
4318  XORI_B2_128_SB(src5, src2);
4319  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4320  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4321 
4322  tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4323  tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4324  tmp2 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4325  tmp3 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4326  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4327  in0, in1, in2, in3,
4328  weight_vec, rnd_vec, offset_vec,
4329  tmp0, tmp1, tmp2, tmp3);
4330 
4331  PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4332  ST_SH2(tmp0, tmp1, dst, dst_stride);
4333  dst += (2 * dst_stride);
4334  }
4335 }
4336 
4337 static void hevc_vt_biwgt_4t_24w_msa(const uint8_t *src0_ptr,
4338  int32_t src_stride,
4339  const int16_t *src1_ptr,
4340  int32_t src2_stride,
4341  uint8_t *dst,
4342  int32_t dst_stride,
4343  const int8_t *filter,
4344  int32_t height,
4345  int32_t weight0,
4346  int32_t weight1,
4347  int32_t offset0,
4348  int32_t offset1,
4349  int32_t rnd_val)
4350 {
4351  uint32_t loop_cnt;
4352  int32_t offset, weight, constant;
4353  v16i8 src0, src1, src2, src3, src4, src5;
4354  v16i8 src6, src7, src8, src9, src10, src11;
4355  v8i16 in0, in1, in2, in3, in4, in5;
4356  v16i8 src10_r, src32_r, src76_r, src98_r;
4357  v16i8 src10_l, src32_l, src21_l, src43_l;
4358  v16i8 src21_r, src43_r, src87_r, src109_r;
4359  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4360  v8i16 filt0, filt1;
4361  v8i16 filter_vec;
4362  v4i32 weight_vec, offset_vec, rnd_vec;
4363 
4364  src0_ptr -= src_stride;
4365 
4366  offset = (offset0 + offset1) << rnd_val;
4367  weight0 = weight0 & 0x0000FFFF;
4368  weight = weight0 | (weight1 << 16);
4369  constant = 128 * weight1;
4370  constant <<= 6;
4371  offset += constant;
4372 
4373  offset_vec = __msa_fill_w(offset);
4374  weight_vec = __msa_fill_w(weight);
4375  rnd_vec = __msa_fill_w(rnd_val + 1);
4376 
4377  filter_vec = LD_SH(filter);
4378  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4379 
4380  /* 16width */
4381  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4383  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4384  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4385  /* 8width */
4386  LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4387  src0_ptr += (3 * src_stride);
4388  XORI_B3_128_SB(src6, src7, src8);
4389  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4390 
4391  for (loop_cnt = (height >> 2); loop_cnt--;) {
4392  /* 16width */
4393  LD_SB2(src0_ptr, src_stride, src3, src4);
4394  LD_SH2(src1_ptr, src2_stride, in0, in1);
4395  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4396  XORI_B2_128_SB(src3, src4);
4397  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4398  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4399 
4400  /* 8width */
4401  LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4402  src0_ptr += (2 * src_stride);
4403  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4404  src1_ptr += (2 * src2_stride);
4405  XORI_B2_128_SB(src9, src10);
4406  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4407  /* 16width */
4408  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4409  tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4410  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4411  tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4412  /* 8width */
4413  tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4414  tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4415  /* 16width */
4416  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4417  in0, in1, in2, in3,
4418  weight_vec, rnd_vec, offset_vec,
4419  tmp0, tmp1, tmp4, tmp5);
4420  /* 8width */
4421  HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4422  weight_vec, rnd_vec, offset_vec,
4423  tmp2, tmp3);
4424  /* 16width */
4425  PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4426  /* 8width */
4427  tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4428  ST_SH2(tmp0, tmp1, dst, dst_stride);
4429  ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4430  dst += (2 * dst_stride);
4431 
4432  /* 16width */
4433  LD_SB2(src0_ptr, src_stride, src5, src2);
4434  LD_SH2(src1_ptr, src2_stride, in0, in1);
4435  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4436  XORI_B2_128_SB(src5, src2);
4437  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4438  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4439  /* 8width */
4440  LD_SB2(src0_ptr + 16, src_stride, src11, src8);
4441  src0_ptr += (2 * src_stride);
4442  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4443  src1_ptr += (2 * src2_stride);
4444  XORI_B2_128_SB(src11, src8);
4445  ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
4446  /* 16width */
4447  tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4448  tmp4 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4449  tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4450  tmp5 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4451  /* 8width */
4452  tmp2 = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
4453  tmp3 = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
4454  /* 16width */
4455  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4456  in0, in1, in2, in3,
4457  weight_vec, rnd_vec, offset_vec,
4458  tmp0, tmp1, tmp4, tmp5);
4459  /* 8width */
4460  HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4461  weight_vec, rnd_vec, offset_vec,
4462  tmp2, tmp3);
4463  /* 16width */
4464  PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4465 
4466  /* 8width */
4467  tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4468  ST_SH2(tmp0, tmp1, dst, dst_stride);
4469  ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4470  dst += (2 * dst_stride);
4471  }
4472 }
4473 
4474 static void hevc_vt_biwgt_4t_32w_msa(const uint8_t *src0_ptr,
4475  int32_t src_stride,
4476  const int16_t *src1_ptr,
4477  int32_t src2_stride,
4478  uint8_t *dst,
4479  int32_t dst_stride,
4480  const int8_t *filter,
4481  int32_t height,
4482  int32_t weight0,
4483  int32_t weight1,
4484  int32_t offset0,
4485  int32_t offset1,
4486  int32_t rnd_val)
4487 {
4488  uint32_t loop_cnt;
4489  uint8_t *dst_tmp = dst + 16;
4490  int32_t offset, weight, constant;
4491  v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
4492  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4493  v16i8 src10_r, src32_r, src76_r, src98_r;
4494  v16i8 src21_r, src43_r, src87_r, src109_r;
4495  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4496  v16i8 src10_l, src32_l, src76_l, src98_l;
4497  v16i8 src21_l, src43_l, src87_l, src109_l;
4498  v8i16 filt0, filt1;
4499  v8i16 filter_vec;
4500  v4i32 weight_vec, offset_vec, rnd_vec;
4501 
4502  src0_ptr -= src_stride;
4503 
4504  offset = (offset0 + offset1) << rnd_val;
4505  weight0 = weight0 & 0x0000FFFF;
4506  weight = weight0 | (weight1 << 16);
4507  constant = 128 * weight1;
4508  constant <<= 6;
4509  offset += constant;
4510 
4511  offset_vec = __msa_fill_w(offset);
4512  weight_vec = __msa_fill_w(weight);
4513  rnd_vec = __msa_fill_w(rnd_val + 1);
4514 
4515  filter_vec = LD_SH(filter);
4516  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4517 
4518  /* 16width */
4519  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4521  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4522  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4523  /* next 16width */
4524  LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4525  src0_ptr += (3 * src_stride);
4526  XORI_B3_128_SB(src6, src7, src8);
4527  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4528  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
4529 
4530  for (loop_cnt = (height >> 1); loop_cnt--;) {
4531  /* 16width */
4532  LD_SB2(src0_ptr, src_stride, src3, src4);
4533  LD_SH2(src1_ptr, src2_stride, in0, in1);
4534  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4535  XORI_B2_128_SB(src3, src4);
4536  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4537  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4538 
4539  /* 16width */
4540  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4541  tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4542  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4543  tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4544  /* 16width */
4545  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4546  in0, in1, in2, in3,
4547  weight_vec, rnd_vec, offset_vec,
4548  tmp0, tmp1, tmp4, tmp5);
4549  /* 16width */
4550  PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4551  ST_SH2(tmp0, tmp1, dst, dst_stride);
4552  dst += (2 * dst_stride);
4553 
4554  src10_r = src32_r;
4555  src21_r = src43_r;
4556  src10_l = src32_l;
4557  src21_l = src43_l;
4558  src2 = src4;
4559 
4560  /* next 16width */
4561  LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4562  src0_ptr += (2 * src_stride);
4563  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4564  LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
4565  src1_ptr += (2 * src2_stride);
4566  XORI_B2_128_SB(src9, src10);
4567  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4568  ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
4569  /* next 16width */
4570  tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4571  tmp6 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
4572  tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4573  tmp7 = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
4574  /* next 16width */
4575  HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
4576  in4, in5, in6, in7,
4577  weight_vec, rnd_vec, offset_vec,
4578  tmp2, tmp3, tmp6, tmp7);
4579 
4580  /* next 16width */
4581  PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3);
4582  ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
4583  dst_tmp += (2 * dst_stride);
4584 
4585  src76_r = src98_r;
4586  src87_r = src109_r;
4587  src76_l = src98_l;
4588  src87_l = src109_l;
4589  src8 = src10;
4590  }
4591 }
4592 
4593 static void hevc_hv_biwgt_4t_4x2_msa(const uint8_t *src0_ptr,
4594  int32_t src_stride,
4595  const int16_t *src1_ptr,
4596  int32_t src2_stride,
4597  uint8_t *dst,
4598  int32_t dst_stride,
4599  const int8_t *filter_x,
4600  const int8_t *filter_y,
4601  int32_t weight0,
4602  int32_t weight1,
4603  int32_t offset0,
4604  int32_t offset1,
4605  int32_t rnd_val)
4606 {
4607  uint64_t tp0, tp1;
4609  v8i16 in0 = { 0 };
4610  v16u8 out;
4611  v16i8 src0, src1, src2, src3, src4;
4612  v8i16 filt0, filt1;
4613  v8i16 filt_h0, filt_h1;
4614  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4615  v16i8 mask1;
4616  v8i16 filter_vec, tmp, weight_vec;
4617  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4618  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
4619  v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
4620 
4621  src0_ptr -= (src_stride + 1);
4622 
4623  filter_vec = LD_SH(filter_x);
4624  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4625 
4626  filter_vec = LD_SH(filter_y);
4627  UNPCK_R_SB_SH(filter_vec, filter_vec);
4628 
4629  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4630 
4631  mask1 = mask0 + 2;
4632 
4633  offset = (offset0 + offset1) << rnd_val;
4634  weight0 = weight0 & 0x0000FFFF;
4635  weight = weight0 | (weight1 << 16);
4636 
4637  const_vec = __msa_fill_w((128 * weight1));
4638  const_vec <<= 6;
4639  offset_vec = __msa_fill_w(offset);
4640  weight_vec = (v8i16) __msa_fill_w(weight);
4641  rnd_vec = __msa_fill_w(rnd_val + 1);
4642  offset_vec += const_vec;
4643 
4644  LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4645  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4646 
4647  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
4648  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
4649  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
4650 
4651  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4652  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4653  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4654 
4655  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
4656  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
4657 
4658  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4659  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4660  dst0 >>= 6;
4661  dst1 >>= 6;
4662  dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4663 
4664  LD2(src1_ptr, src2_stride, tp0, tp1);
4665  INSERT_D2_SH(tp0, tp1, in0);
4666 
4667  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4668  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4669  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4670  SRAR_W2_SW(dst0, dst1, rnd_vec);
4671  CLIP_SW2_0_255(dst0, dst1);
4672  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4673  out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
4674  ST_W2(out, 0, 1, dst, dst_stride);
4675 }
4676 
4677 static void hevc_hv_biwgt_4t_4x4_msa(const uint8_t *src0_ptr,
4678  int32_t src_stride,
4679  const int16_t *src1_ptr,
4680  int32_t src2_stride,
4681  uint8_t *dst,
4682  int32_t dst_stride,
4683  const int8_t *filter_x,
4684  const int8_t *filter_y,
4685  int32_t weight0,
4686  int32_t weight1,
4687  int32_t offset0,
4688  int32_t offset1,
4689  int32_t rnd_val)
4690 {
4691  uint64_t tp0, tp1;
4693  v16u8 out;
4694  v8i16 in0 = { 0 }, in1 = { 0 };
4695  v16i8 src0, src1, src2, src3, src4, src5, src6;
4696  v8i16 filt0, filt1;
4697  v8i16 filt_h0, filt_h1;
4698  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4699  v16i8 mask1;
4700  v8i16 filter_vec, weight_vec;
4701  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4702  v8i16 tmp0, tmp1, tmp2, tmp3;
4703  v8i16 dst30, dst41, dst52, dst63;
4704  v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
4705  v4i32 offset_vec, rnd_vec, const_vec;
4706  v4i32 dst0, dst1, dst2, dst3;
4707 
4708  src0_ptr -= (src_stride + 1);
4709 
4710  filter_vec = LD_SH(filter_x);
4711  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4712 
4713  filter_vec = LD_SH(filter_y);
4714  UNPCK_R_SB_SH(filter_vec, filter_vec);
4715 
4716  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4717 
4718  mask1 = mask0 + 2;
4719 
4720  offset = (offset0 + offset1) << rnd_val;
4721  weight0 = weight0 & 0x0000FFFF;
4722  weight = weight0 | (weight1 << 16);
4723 
4724  const_vec = __msa_fill_w((128 * weight1));
4725  const_vec <<= 6;
4726  offset_vec = __msa_fill_w(offset);
4727  weight_vec = (v8i16) __msa_fill_w(weight);
4728  rnd_vec = __msa_fill_w(rnd_val + 1);
4729  offset_vec += const_vec;
4730 
4731  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4732  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4733 
4734  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4735  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4736  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4737  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4738 
4739  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4740  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4741  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4742  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4743 
4744  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
4745  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
4746  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
4747  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4748  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4749  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
4750  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
4751  SRA_4V(dst0, dst1, dst2, dst3, 6);
4752  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
4753 
4754  LD2(src1_ptr, src2_stride, tp0, tp1);
4755  INSERT_D2_SH(tp0, tp1, in0);
4756  src1_ptr += (2 * src2_stride);
4757  LD2(src1_ptr, src2_stride, tp0, tp1);
4758  INSERT_D2_SH(tp0, tp1, in1);
4759 
4760  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
4761  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
4762 
4763  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4764  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4765  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4766  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4767  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4768  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
4769  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4770  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4771  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
4772 }
4773 
4774 static void hevc_hv_biwgt_4t_4multx8mult_msa(const uint8_t *src0_ptr,
4775  int32_t src_stride,
4776  const int16_t *src1_ptr,
4777  int32_t src2_stride,
4778  uint8_t *dst,
4779  int32_t dst_stride,
4780  const int8_t *filter_x,
4781  const int8_t *filter_y,
4782  int32_t height,
4783  int32_t weight0,
4784  int32_t weight1,
4785  int32_t offset0,
4786  int32_t offset1,
4787  int32_t rnd_val)
4788 {
4789  uint32_t loop_cnt;
4790  uint64_t tp0, tp1;
4792  v16u8 out0, out1;
4793  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4794  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4795  v8i16 filt0, filt1;
4796  v8i16 filt_h0, filt_h1;
4797  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4798  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4799  v16i8 mask1;
4800  v8i16 filter_vec, weight_vec;
4801  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4802  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4803  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4804  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4805  v8i16 dst98_r, dst109_r;
4806  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4807  v4i32 offset_vec, rnd_vec, const_vec;
4808 
4809  src0_ptr -= (src_stride + 1);
4810 
4811  filter_vec = LD_SH(filter_x);
4812  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4813 
4814  filter_vec = LD_SH(filter_y);
4815  UNPCK_R_SB_SH(filter_vec, filter_vec);
4816 
4817  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4818 
4819  mask1 = mask0 + 2;
4820 
4821  offset = (offset0 + offset1) << rnd_val;
4822  weight0 = weight0 & 0x0000FFFF;
4823  weight = weight0 | (weight1 << 16);
4824 
4825  const_vec = __msa_fill_w((128 * weight1));
4826  const_vec <<= 6;
4827  offset_vec = __msa_fill_w(offset);
4828  weight_vec = (v8i16) __msa_fill_w(weight);
4829  rnd_vec = __msa_fill_w(rnd_val + 1);
4830  offset_vec += const_vec;
4831 
4832  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4833  src0_ptr += (3 * src_stride);
4835 
4836  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
4837  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
4838  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4839  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4840  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4841  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4842 
4843  for (loop_cnt = height >> 3; loop_cnt--;) {
4844  LD_SB8(src0_ptr, src_stride,
4845  src3, src4, src5, src6, src7, src8, src9, src10);
4846  src0_ptr += (8 * src_stride);
4847  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4848  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4849  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4850  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4851  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4852 
4853  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4854  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4855  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4856  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4857 
4858  dst32_r = __msa_ilvr_h(dst73, dst22);
4859  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4860  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4861  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4862  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4863  dst76_r = __msa_ilvr_h(dst22, dst106);
4864 
4865  LD2(src1_ptr, src2_stride, tp0, tp1);
4866  src1_ptr += 2 * src2_stride;
4867  INSERT_D2_SH(tp0, tp1, in0);
4868  LD2(src1_ptr, src2_stride, tp0, tp1);
4869  src1_ptr += 2 * src2_stride;
4870  INSERT_D2_SH(tp0, tp1, in1);
4871 
4872  LD2(src1_ptr, src2_stride, tp0, tp1);
4873  src1_ptr += 2 * src2_stride;
4874  INSERT_D2_SH(tp0, tp1, in2);
4875  LD2(src1_ptr, src2_stride, tp0, tp1);
4876  src1_ptr += 2 * src2_stride;
4877  INSERT_D2_SH(tp0, tp1, in3);
4878 
4879  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4880  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4881  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4882  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4883  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4884  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4885  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4886  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4887  SRA_4V(dst0, dst1, dst2, dst3, 6);
4888  SRA_4V(dst4, dst5, dst6, dst7, 6);
4889  PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
4890  dst2, dst3);
4891  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4892  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
4893  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
4894  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
4895  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4896  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4897  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4898  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4899  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
4900  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
4901  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
4902  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
4903  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4904  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4905  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
4906  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
4907  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
4908  tmp2, tmp3);
4909  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4910  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4911  dst += (8 * dst_stride);
4912 
4913  dst10_r = dst98_r;
4914  dst21_r = dst109_r;
4915  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4916  }
4917 }
4918 
/*
 * Height dispatcher for 4-wide bi-weighted HV 4-tap prediction.
 *
 * Heights 2 and 4 go to their dedicated kernels, any height divisible by 8
 * goes to the 8-row looping kernel. Every other height is left untouched:
 * nothing is written to dst.
 */
static void hevc_hv_biwgt_4t_4w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    if (height == 2) {
        hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
        return;
    }

    if (height == 4) {
        hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
        return;
    }

    /* no kernel exists for the remaining heights */
    if ((height % 8) != 0)
        return;

    hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1,
                                     offset0, offset1, rnd_val);
}
4950 
4951 static void hevc_hv_biwgt_4t_6w_msa(const uint8_t *src0_ptr,
4952  int32_t src_stride,
4953  const int16_t *src1_ptr,
4954  int32_t src2_stride,
4955  uint8_t *dst,
4956  int32_t dst_stride,
4957  const int8_t *filter_x,
4958  const int8_t *filter_y,
4959  int32_t height,
4960  int32_t weight0,
4961  int32_t weight1,
4962  int32_t offset0,
4963  int32_t offset1,
4964  int32_t rnd_val)
4965 {
4966  uint32_t tpw0, tpw1, tpw2, tpw3;
4967  uint64_t tp0, tp1;
4969  v16u8 out0, out1, out2;
4970  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4971  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4972  v8i16 in4 = { 0 }, in5 = { 0 };
4973  v8i16 filt0, filt1;
4974  v8i16 filt_h0, filt_h1, filter_vec;
4975  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4976  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4977  v16i8 mask1;
4978  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4979  v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
4980  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4981  v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4982  v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4983  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4984  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4985  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4986  v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4987  v4i32 offset_vec, rnd_vec, const_vec;
4988 
4989  src0_ptr -= (src_stride + 1);
4990 
4991  filter_vec = LD_SH(filter_x);
4992  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4993 
4994  filter_vec = LD_SH(filter_y);
4995  UNPCK_R_SB_SH(filter_vec, filter_vec);
4996 
4997  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4998 
4999  mask1 = mask0 + 2;
5000 
5001  offset = (offset0 + offset1) << rnd_val;
5002  weight0 = weight0 & 0x0000FFFF;
5003  weight = weight0 | (weight1 << 16);
5004 
5005  const_vec = __msa_fill_w((128 * weight1));
5006  const_vec <<= 6;
5007  offset_vec = __msa_fill_w(offset);
5008  weight_vec = (v8i16) __msa_fill_w(weight);
5009  rnd_vec = __msa_fill_w(rnd_val + 1);
5010  offset_vec += const_vec;
5011 
5012  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
5013  src0_ptr += (3 * src_stride);
5015 
5016  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5017  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5018  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5019  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5020  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5021  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5022 
5023  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5024  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5025 
5026  LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
5027  src10);
5028  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
5029 
5030  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5031  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5032  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5033  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5034 
5035  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5036  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5037  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5038  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5039 
5040  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
5041  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
5042  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
5043  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
5044 
5045  dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5046  dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5047  dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5048  dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5049 
5050  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5051  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5052  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5053  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5054  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
5055  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
5056  ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
5057  ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
5058  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
5059  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
5060  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
5061 
5062  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5063  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5064  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5065  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5066  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5067  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5068  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
5069  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
5070  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
5071  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
5072  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
5073  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
5074  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
5075  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
5076  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
5077  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
5078  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
5079 
5080  LD2(src1_ptr, src2_stride, tp0, tp1);
5081  INSERT_D2_SH(tp0, tp1, in0);
5082  LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
5083  INSERT_D2_SH(tp0, tp1, in1);
5084 
5085  LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
5086  INSERT_D2_SH(tp0, tp1, in2);
5087  LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
5088  INSERT_D2_SH(tp0, tp1, in3);
5089 
5090  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5091  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5092  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5093  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5094  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5095  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5096  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5097  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5098  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5099  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5100  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5101  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5102  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5103  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5104  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5105  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
5106  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5107  tmp2, tmp3);
5108  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5109  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5110 
5111  PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
5112 
5113  LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5114  src1_ptr += (4 * src2_stride);
5115  INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
5116  LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5117  INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
5118 
5119  ILVRL_H2_SH(dst4, in4, tmp0, tmp1);
5120  ILVRL_H2_SH(dst5, in5, tmp2, tmp3);
5121 
5122  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5123  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5124  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5125  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5126  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5127  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5128  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5129 
5130  out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5131  ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
5132 }
5133 
5134 static void hevc_hv_biwgt_4t_8x2_msa(const uint8_t *src0_ptr,
5135  int32_t src_stride,
5136  const int16_t *src1_ptr,
5137  int32_t src2_stride,
5138  uint8_t *dst,
5139  int32_t dst_stride,
5140  const int8_t *filter_x,
5141  const int8_t *filter_y,
5142  int32_t weight0,
5143  int32_t weight1,
5144  int32_t offset0,
5145  int32_t offset1,
5146  int32_t rnd_val)
5147 {
5149  v16u8 out;
5150  v16i8 src0, src1, src2, src3, src4;
5151  v8i16 filt0, filt1;
5152  v8i16 filt_h0, filt_h1;
5153  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5154  v16i8 mask1;
5155  v8i16 filter_vec, weight_vec;
5156  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5157  v8i16 dst0, dst1, dst2, dst3, dst4;
5158  v8i16 in0, in1;
5159  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
5160  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
5161  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
5162  v8i16 tmp0, tmp1, tmp2, tmp3;
5163  v4i32 offset_vec, rnd_vec, const_vec;
5164 
5165  src0_ptr -= (src_stride + 1);
5166 
5167  filter_vec = LD_SH(filter_x);
5168  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5169 
5170  filter_vec = LD_SH(filter_y);
5171  UNPCK_R_SB_SH(filter_vec, filter_vec);
5172 
5173  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5174 
5175  mask1 = mask0 + 2;
5176 
5177  offset = (offset0 + offset1) << rnd_val;
5178  weight0 = weight0 & 0x0000FFFF;
5179  weight = weight0 | (weight1 << 16);
5180 
5181  const_vec = __msa_fill_w((128 * weight1));
5182  const_vec <<= 6;
5183  offset_vec = __msa_fill_w(offset);
5184  weight_vec = (v8i16) __msa_fill_w(weight);
5185  rnd_vec = __msa_fill_w(rnd_val + 1);
5186  offset_vec += const_vec;
5187 
5188  LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5189  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5190 
5191  LD_SH2(src1_ptr, src2_stride, in0, in1);
5192 
5193  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5194  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5195  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5196  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5197  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5198 
5199  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5200  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5201  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5202  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5203  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5204 
5205  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
5206  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
5207  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
5208  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
5209  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5210  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5211  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5212  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5213  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5214  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
5215 
5216  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
5217  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
5218 
5219  dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5220  dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5221  dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5222  dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5223  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5224  CLIP_SW4_0_255(dst0_r, dst0_l, dst1_r, dst1_l);
5225  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
5226  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
5227  ST_D2(out, 0, 1, dst, dst_stride);
5228 }
5229 
5230 static void hevc_hv_biwgt_4t_8multx4_msa(const uint8_t *src0_ptr,
5231  int32_t src_stride,
5232  const int16_t *src1_ptr,
5233  int32_t src2_stride,
5234  uint8_t *dst,
5235  int32_t dst_stride,
5236  const int8_t *filter_x,
5237  const int8_t *filter_y,
5238  int32_t weight0,
5239  int32_t weight1,
5240  int32_t offset0,
5241  int32_t offset1,
5242  int32_t rnd_val,
5243  int32_t width8mult)
5244 {
5246  uint32_t cnt;
5247  v16u8 out0, out1;
5248  v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
5249  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5250  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
5251  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5252  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
5253  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5254  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5255  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5256  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5257  v4i32 offset_vec, rnd_vec, const_vec;
5258 
5259  src0_ptr -= (src_stride + 1);
5260 
5261  filter_vec = LD_SH(filter_x);
5262  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5263 
5264  filter_vec = LD_SH(filter_y);
5265  UNPCK_R_SB_SH(filter_vec, filter_vec);
5266 
5267  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5268 
5269  mask0 = LD_SB(ff_hevc_mask_arr);
5270  mask1 = mask0 + 2;
5271 
5272  offset = (offset0 + offset1) << rnd_val;
5273  weight0 = weight0 & 0x0000FFFF;
5274  weight = weight0 | (weight1 << 16);
5275 
5276  const_vec = __msa_fill_w((128 * weight1));
5277  const_vec <<= 6;
5278  offset_vec = __msa_fill_w(offset);
5279  rnd_vec = __msa_fill_w(rnd_val + 1);
5280  offset_vec += const_vec;
5281  weight_vec = (v8i16) __msa_fill_w(weight);
5282 
5283  for (cnt = width8mult; cnt--;) {
5284  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
5285  src0_ptr += 8;
5286  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
5287 
5288  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
5289  src1_ptr += 8;
5290 
5291  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5292  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5293  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5294 
5295  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5296  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5297  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5298 
5299  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5300  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5301 
5302  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5303  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5304  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5305  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5306 
5307  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5308  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5309  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5310  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5311 
5312  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5313  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5314  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5315  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5316 
5317  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5318  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5319  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5320  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5321  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5322  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5323  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5324  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5325 
5326  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5327  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5328  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5329  dst3_r, dst0, dst1, dst2, dst3);
5330 
5331  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5332  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5333  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5334  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5335  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5336  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5337  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5338  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5339  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5340  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5341  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5342  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5343  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5344  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5345  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5346  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
5347  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5348  tmp0, tmp1, tmp2, tmp3);
5349  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5350  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5351  dst += 8;
5352  }
5353 }
5354 
5355 static void hevc_hv_biwgt_4t_8x6_msa(const uint8_t *src0_ptr,
5356  int32_t src_stride,
5357  const int16_t *src1_ptr,
5358  int32_t src2_stride,
5359  uint8_t *dst,
5360  int32_t dst_stride,
5361  const int8_t *filter_x,
5362  const int8_t *filter_y,
5363  int32_t weight0,
5364  int32_t weight1,
5365  int32_t offset0,
5366  int32_t offset1,
5367  int32_t rnd_val)
5368 {
5369  uint32_t offset, weight;
5370  v16u8 out0, out1, out2;
5371  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5372  v8i16 filt0, filt1;
5373  v8i16 filt_h0, filt_h1;
5374  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5375  v16i8 mask1;
5376  v8i16 filter_vec, weight_vec;
5377  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5378  v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
5379  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
5380  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5381  v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
5382  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
5383  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
5384  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
5385  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
5386  v8i16 in0, in1, in2, in3, in4, in5;
5387  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5388  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5389  v4i32 offset_vec, rnd_vec, const_vec;
5390 
5391  src0_ptr -= (src_stride + 1);
5392 
5393  filter_vec = LD_SH(filter_x);
5394  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5395 
5396  filter_vec = LD_SH(filter_y);
5397  UNPCK_R_SB_SH(filter_vec, filter_vec);
5398 
5399  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5400 
5401  mask1 = mask0 + 2;
5402 
5403  offset = (offset0 + offset1) << rnd_val;
5404  weight0 = weight0 & 0x0000FFFF;
5405  weight = weight0 | (weight1 << 16);
5406 
5407  const_vec = __msa_fill_w((128 * weight1));
5408  const_vec <<= 6;
5409  offset_vec = __msa_fill_w(offset);
5410  weight_vec = (v8i16) __msa_fill_w(weight);
5411  rnd_vec = __msa_fill_w(rnd_val + 1);
5412  offset_vec += const_vec;
5413 
5414  LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5415  src0_ptr += (5 * src_stride);
5416  LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
5417 
5418  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5419  XORI_B4_128_SB(src5, src6, src7, src8);
5420 
5421  LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
5422 
5423  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5424  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5425  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5426  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5427  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5428  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
5429  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
5430  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
5431  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
5432 
5433  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5434  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5435  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5436  dsth3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5437  dsth4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5438  dsth5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
5439  dsth6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
5440  dsth7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
5441  dsth8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
5442 
5443  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5444  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5445  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5446  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5447  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5448  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5449  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
5450  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
5451 
5452  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5453  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5454  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5455  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5456  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5457  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5458  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5459  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5460  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5461  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
5462  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5463  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
5464 
5465  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5466  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5467  SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
5468  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
5469  dst0, dst1, dst2, dst3);
5470 
5471  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5472  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5473  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5474  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5475  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5476  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5477  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5478  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5479  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5480  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5481  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5482  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5483  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5484  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5485  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5486  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
5487  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5488  tmp0, tmp1, tmp2, tmp3);
5489  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5490 
5491  PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
5492  ILVRL_H2_SH(dst0, in4, tmp0, tmp1);
5493  ILVRL_H2_SH(dst1, in5, tmp2, tmp3);
5494  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5495  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5496  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5497  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5498  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5499  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5500  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5501  out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5502  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5503  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
5504 }
5505 
5506 static void hevc_hv_biwgt_4t_8multx4mult_msa(const uint8_t *src0_ptr,
5507  int32_t src_stride,
5508  const int16_t *src1_ptr,
5509  int32_t src2_stride,
5510  uint8_t *dst,
5511  int32_t dst_stride,
5512  const int8_t *filter_x,
5513  const int8_t *filter_y,
5514  int32_t height,
5515  int32_t weight0,
5516  int32_t weight1,
5517  int32_t offset0,
5518  int32_t offset1,
5519  int32_t rnd_val,
5520  int32_t width)
5521 {
5522  uint32_t loop_cnt;
5523  uint32_t cnt;
5525  const uint8_t *src0_ptr_tmp;
5526  const int16_t *src1_ptr_tmp;
5527  uint8_t *dst_tmp;
5528  v16u8 out0, out1;
5529  v16i8 src0, src1, src2, src3, src4, src5, src6;
5530  v8i16 in0, in1, in2, in3;
5531  v8i16 filt0, filt1;
5532  v8i16 filt_h0, filt_h1;
5533  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5534  v16i8 mask1;
5535  v8i16 filter_vec;
5536  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5537  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5538  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5539  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5540  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5541  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5542  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
5543  v4i32 offset_vec, rnd_vec, const_vec;
5544 
5545  src0_ptr -= (src_stride + 1);
5546 
5547  filter_vec = LD_SH(filter_x);
5548  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5549 
5550  filter_vec = LD_SH(filter_y);
5551  UNPCK_R_SB_SH(filter_vec, filter_vec);
5552 
5553  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5554 
5555  mask1 = mask0 + 2;
5556 
5557  offset = (offset0 + offset1) << rnd_val;
5558  weight0 = weight0 & 0x0000FFFF;
5559  weight = weight0 | (weight1 << 16);
5560 
5561  const_vec = __msa_fill_w((128 * weight1));
5562  const_vec <<= 6;
5563  offset_vec = __msa_fill_w(offset);
5564  weight_vec = (v8i16) __msa_fill_w(weight);
5565  rnd_vec = __msa_fill_w(rnd_val + 1);
5566  offset_vec += const_vec;
5567 
5568  for (cnt = width >> 3; cnt--;) {
5569  src0_ptr_tmp = src0_ptr;
5570  src1_ptr_tmp = src1_ptr;
5571  dst_tmp = dst;
5572 
5573  LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5574  src0_ptr_tmp += (3 * src_stride);
5576 
5577  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5578  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5579  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5580  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5581  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5582  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5583 
5584  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5585  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5586 
5587  for (loop_cnt = height >> 2; loop_cnt--;) {
5588  LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5589  src0_ptr_tmp += (4 * src_stride);
5590  LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5591  src1_ptr_tmp += (4 * src2_stride);
5592  XORI_B4_128_SB(src3, src4, src5, src6);
5593 
5594  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5595  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5596  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5597  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5598 
5599  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5600  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5601  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5602  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5603 
5604  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5605  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5606  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5607  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5608 
5609  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5610  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5611  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5612  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5613  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5614  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5615  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5616  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5617 
5618  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5619  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5620  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5621  dst3_r, dst0, dst1, dst2, dst3);
5622  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5623  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5624  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5625  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5626  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5627  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5628  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5629  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5630  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5631  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5632  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5633  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5634  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5635  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5636  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5637  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
5638  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5639  tmp0, tmp1, tmp2, tmp3);
5640  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5641  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5642  dst_tmp += (4 * dst_stride);
5643 
5644  dst10_r = dst54_r;
5645  dst10_l = dst54_l;
5646  dst21_r = dst65_r;
5647  dst21_l = dst65_l;
5648  dsth2 = dsth6;
5649  }
5650 
5651  src0_ptr += 8;
5652  dst += 8;
5653  src1_ptr += 8;
5654  }
5655 }
5656 
/*
 * 8-column bi-weighted 4-tap HV prediction: selects a kernel by block height.
 *
 * Heights 2, 4 and 6 have dedicated kernels; any other height that is a
 * multiple of 4 goes to the generic strip/row-group kernel. Heights matching
 * none of these cases produce no output.
 */
static void hevc_hv_biwgt_4t_8w_msa(const uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    const int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    case 4:
        /* Final argument: number of 8-column strips. */
        hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride, filter_x,
                                     filter_y, weight0, weight1, offset0,
                                     offset1, rnd_val, 1);
        break;
    case 6:
        hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    default:
        if (height % 4 == 0) {
            /* Final argument: block width in pixels. */
            hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                             src1_ptr, src2_stride,
                                             dst, dst_stride,
                                             filter_x, filter_y,
                                             height, weight0, weight1,
                                             offset0, offset1, rnd_val, 8);
        }
        break;
    }
}
5693 
5694 static void hevc_hv_biwgt_4t_12w_msa(const uint8_t *src0_ptr,
5695  int32_t src_stride,
5696  const int16_t *src1_ptr,
5697  int32_t src2_stride,
5698  uint8_t *dst,
5699  int32_t dst_stride,
5700  const int8_t *filter_x,
5701  const int8_t *filter_y,
5702  int32_t height,
5703  int32_t weight0,
5704  int32_t weight1,
5705  int32_t offset0,
5706  int32_t offset1,
5707  int32_t rnd_val)
5708 {
5709  uint32_t loop_cnt;
5710  uint64_t tp0, tp1;
5712  const uint8_t *src0_ptr_tmp;
5713  const int16_t *src1_ptr_tmp;
5714  uint8_t *dst_tmp;
5715  v16u8 out0, out1;
5716  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5717  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5718  v16i8 mask0, mask1, mask2, mask3;
5719  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
5720  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5721  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
5722  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
5723  v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
5724  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
5725  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5726  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5727  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5728  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5729  v4i32 offset_vec, rnd_vec, const_vec;
5730 
5731  src0_ptr -= (src_stride + 1);
5732 
5733  filter_vec = LD_SH(filter_x);
5734  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5735 
5736  filter_vec = LD_SH(filter_y);
5737  UNPCK_R_SB_SH(filter_vec, filter_vec);
5738 
5739  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5740 
5741  mask0 = LD_SB(ff_hevc_mask_arr);
5742  mask1 = mask0 + 2;
5743 
5744  offset = (offset0 + offset1) << rnd_val;
5745  weight0 = weight0 & 0x0000FFFF;
5746  weight = weight0 | (weight1 << 16);
5747 
5748  const_vec = __msa_fill_w((128 * weight1));
5749  const_vec <<= 6;
5750  offset_vec = __msa_fill_w(offset);
5751  rnd_vec = __msa_fill_w(rnd_val + 1);
5752  offset_vec += const_vec;
5753  weight_vec = (v8i16) __msa_fill_w(weight);
5754 
5755  src0_ptr_tmp = src0_ptr;
5756  dst_tmp = dst;
5757  src1_ptr_tmp = src1_ptr;
5758 
5759  LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5760  src0_ptr_tmp += (3 * src_stride);
5761 
5763 
5764  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5765  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5766  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5767 
5768  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5769  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5770  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5771 
5772  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5773  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5774 
5775  for (loop_cnt = 4; loop_cnt--;) {
5776  LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5777  src0_ptr_tmp += (4 * src_stride);
5778  XORI_B4_128_SB(src3, src4, src5, src6);
5779 
5780  LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5781  src1_ptr_tmp += (4 * src2_stride);
5782 
5783  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5784  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5785  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5786  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5787 
5788  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5789  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5790  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5791  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5792 
5793  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5794  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5795  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5796  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5797 
5798  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5799  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5800  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5801  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5802  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5803  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5804  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5805  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5806 
5807  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5808  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5809  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5810  dst3_r, dst0, dst1, dst2, dst3);
5811  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5812  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5813  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5814  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5815  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5816  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5817  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5818  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5819  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5820  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5821  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5822  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5823  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5824  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5825  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5826  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
5827  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5828  tmp0, tmp1, tmp2, tmp3);
5829  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5830  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5831  dst_tmp += (4 * dst_stride);
5832 
5833  dst10_r = dst54_r;
5834  dst10_l = dst54_l;
5835  dst21_r = dst65_r;
5836  dst21_l = dst65_l;
5837  dsth2 = dsth6;
5838  }
5839 
5840  src0_ptr += 8;
5841  dst += 8;
5842  src1_ptr += 8;
5843 
5844  mask2 = LD_SB(ff_hevc_mask_arr + 16);
5845  mask3 = mask2 + 2;
5846 
5847  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
5848  src0_ptr += (3 * src_stride);
5850  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
5851  VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
5852 
5853  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5854  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5855 
5856  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
5857  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
5858 
5859  for (loop_cnt = 2; loop_cnt--;) {
5860  LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
5861  src10);
5862  src0_ptr += (8 * src_stride);
5863  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
5864  VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
5865  VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
5866  VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
5867  VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
5868 
5869  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5870  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5871  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5872  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5873 
5874  dst32_r = __msa_ilvr_h(dst73, dst22);
5875  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
5876  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
5877  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
5878  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
5879  dst76_r = __msa_ilvr_h(dst22, dst106);
5880 
5881  LD2(src1_ptr, src2_stride, tp0, tp1);
5882  src1_ptr += 2 * src2_stride;
5883  INSERT_D2_SH(tp0, tp1, in0);
5884  LD2(src1_ptr, src2_stride, tp0, tp1);
5885  src1_ptr += 2 * src2_stride;
5886  INSERT_D2_SH(tp0, tp1, in1);
5887 
5888  LD2(src1_ptr, src2_stride, tp0, tp1);
5889  src1_ptr += 2 * src2_stride;
5890  INSERT_D2_SH(tp0, tp1, in2);
5891  LD2(src1_ptr, src2_stride, tp0, tp1);
5892  src1_ptr += 2 * src2_stride;
5893  INSERT_D2_SH(tp0, tp1, in3);
5894 
5895  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5896  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5897  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5898  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5899  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5900  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5901  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
5902  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
5903 
5904  SRA_4V(dst0, dst1, dst2, dst3, 6);
5905  SRA_4V(dst4, dst5, dst6, dst7, 6);
5906  PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5907  dst0, dst1, dst2, dst3);
5908  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5909  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5910  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5911  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5912  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5913  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5914  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5915  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5916  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5917  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5918  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5919  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5920  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5921  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5922  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5923  CLIP_SW4_0_255(dst4, dst5, dst6, dst7);
5924  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5925  tmp0, tmp1, tmp2, tmp3);
5926  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5927  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5928  dst += (8 * dst_stride);
5929 
5930  dst10_r = dst98_r;
5931  dst21_r = dst109_r;
5932  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
5933  }
5934 }
5935 
/*
 * 16-column bi-weighted 4-tap HV prediction.
 *
 * A height of exactly 4 uses the fixed four-row kernel with two 8-column
 * strips; every other height goes to the generic strip/row-group kernel.
 */
static void hevc_hv_biwgt_4t_16w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    if (height != 4) {
        /* Final argument: block width in pixels. */
        hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride,
                                         filter_x, filter_y,
                                         height, weight0, weight1,
                                         offset0, offset1, rnd_val, 16);
        return;
    }

    /* Final argument: number of 8-column strips. */
    hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride,
                                 filter_x, filter_y,
                                 weight0, weight1,
                                 offset0, offset1, rnd_val, 2);
}
5963 
/*
 * 24-column bi-weighted 4-tap HV prediction: forwards to the generic
 * strip/row-group kernel with a fixed width of 24.
 */
static void hevc_hv_biwgt_4t_24w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t block_width = 24;

    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y,
                                     height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, block_width);
}
5985 
/*
 * 32-column bi-weighted 4-tap HV prediction: forwards to the generic
 * strip/row-group kernel with a fixed width of 32.
 */
static void hevc_hv_biwgt_4t_32w_msa(const uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     const int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t block_width = 32;

    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y,
                                     height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, block_width);
}
6007 
6008 #define BI_W_MC_COPY(WIDTH) \
6009 void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
6010  ptrdiff_t dst_stride, \
6011  const uint8_t *src, \
6012  ptrdiff_t src_stride, \
6013  const int16_t *src_16bit, \
6014  int height, \
6015  int denom, \
6016  int weight0, \
6017  int weight1, \
6018  int offset0, \
6019  int offset1, \
6020  intptr_t mx, \
6021  intptr_t my, \
6022  int width) \
6023 { \
6024  int shift = 14 + 1 - 8; \
6025  int log2Wd = denom + shift - 1; \
6026  \
6027  hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
6028  dst, dst_stride, height, \
6029  weight0, weight1, offset0, \
6030  offset1, log2Wd); \
6031 }
6032 
6033 BI_W_MC_COPY(4);
6034 BI_W_MC_COPY(6);
6035 BI_W_MC_COPY(8);
6036 BI_W_MC_COPY(12);
6037 BI_W_MC_COPY(16);
6038 BI_W_MC_COPY(24);
6039 BI_W_MC_COPY(32);
6040 BI_W_MC_COPY(48);
6041 BI_W_MC_COPY(64);
6042 
6043 #undef BI_W_MC_COPY
6044 
6045 #define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
6046 void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
6047  ptrdiff_t \
6048  dst_stride, \
6049  const uint8_t *src, \
6050  ptrdiff_t \
6051  src_stride, \
6052  const int16_t *src_16bit, \
6053  int height, \
6054  int denom, \
6055  int weight0, \
6056  int weight1, \
6057  int offset0, \
6058  int offset1, \
6059  intptr_t mx, \
6060  intptr_t my, \
6061  int width) \
6062 { \
6063  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
6064  int log2Wd = denom + 14 - 8; \
6065  \
6066  hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
6067  MAX_PB_SIZE, dst, dst_stride, \
6068  filter, height, weight0, \
6069  weight1, offset0, offset1, \
6070  log2Wd); \
6071 }
6072 
6073 BI_W_MC(qpel, h, 4, 8, hz, mx);
6074 BI_W_MC(qpel, h, 8, 8, hz, mx);
6075 BI_W_MC(qpel, h, 12, 8, hz, mx);
6076 BI_W_MC(qpel, h, 16, 8, hz, mx);
6077 BI_W_MC(qpel, h, 24, 8, hz, mx);
6078 BI_W_MC(qpel, h, 32, 8, hz, mx);
6079 BI_W_MC(qpel, h, 48, 8, hz, mx);
6080 BI_W_MC(qpel, h, 64, 8, hz, mx);
6081 
6082 BI_W_MC(qpel, v, 4, 8, vt, my);
6083 BI_W_MC(qpel, v, 8, 8, vt, my);
6084 BI_W_MC(qpel, v, 12, 8, vt, my);
6085 BI_W_MC(qpel, v, 16, 8, vt, my);
6086 BI_W_MC(qpel, v, 24, 8, vt, my);
6087 BI_W_MC(qpel, v, 32, 8, vt, my);
6088 BI_W_MC(qpel, v, 48, 8, vt, my);
6089 BI_W_MC(qpel, v, 64, 8, vt, my);
6090 
6091 BI_W_MC(epel, h, 4, 4, hz, mx);
6092 BI_W_MC(epel, h, 8, 4, hz, mx);
6093 BI_W_MC(epel, h, 6, 4, hz, mx);
6094 BI_W_MC(epel, h, 12, 4, hz, mx);
6095 BI_W_MC(epel, h, 16, 4, hz, mx);
6096 BI_W_MC(epel, h, 24, 4, hz, mx);
6097 BI_W_MC(epel, h, 32, 4, hz, mx);
6098 
6099 BI_W_MC(epel, v, 4, 4, vt, my);
6100 BI_W_MC(epel, v, 8, 4, vt, my);
6101 BI_W_MC(epel, v, 6, 4, vt, my);
6102 BI_W_MC(epel, v, 12, 4, vt, my);
6103 BI_W_MC(epel, v, 16, 4, vt, my);
6104 BI_W_MC(epel, v, 24, 4, vt, my);
6105 BI_W_MC(epel, v, 32, 4, vt, my);
6106 
6107 #undef BI_W_MC
6108 
6109 #define BI_W_MC_HV(PEL, WIDTH, TAP) \
6110 void ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
6111  ptrdiff_t dst_stride, \
6112  const uint8_t *src, \
6113  ptrdiff_t src_stride, \
6114  const int16_t *src_16bit, \
6115  int height, \
6116  int denom, \
6117  int weight0, \
6118  int weight1, \
6119  int offset0, \
6120  int offset1, \
6121  intptr_t mx, \
6122  intptr_t my, \
6123  int width) \
6124 { \
6125  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
6126  const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
6127  int log2Wd = denom + 14 - 8; \
6128  \
6129  hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
6130  MAX_PB_SIZE, dst, dst_stride, \
6131  filter_x, filter_y, height, \
6132  weight0, weight1, offset0, \
6133  offset1, log2Wd); \
6134 }
6135 
6136 BI_W_MC_HV(qpel, 4, 8);
6137 BI_W_MC_HV(qpel, 8, 8);
6138 BI_W_MC_HV(qpel, 12, 8);
6139 BI_W_MC_HV(qpel, 16, 8);
6140 BI_W_MC_HV(qpel, 24, 8);
6141 BI_W_MC_HV(qpel, 32, 8);
6142 BI_W_MC_HV(qpel, 48, 8);
6143 BI_W_MC_HV(qpel, 64, 8);
6144 
6145 BI_W_MC_HV(epel, 4, 4);
6146 BI_W_MC_HV(epel, 8, 4);
6147 BI_W_MC_HV(epel, 6, 4);
6148 BI_W_MC_HV(epel, 12, 4);
6149 BI_W_MC_HV(epel, 16, 4);
6150 BI_W_MC_HV(epel, 24, 4);
6151 BI_W_MC_HV(epel, 32, 4);
6152 
6153 #undef BI_W_MC_HV
hevc_hv_biwgt_8t_48w_msa
static void hevc_hv_biwgt_8t_48w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2652
VSHF_B2_SB
#define VSHF_B2_SB(...)
Definition: generic_macros_msa.h:662
hevc_hz_biwgt_4t_4x8multiple_msa
static void hevc_hz_biwgt_4t_4x8multiple_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2816
LD_SB4
#define LD_SB4(...)
Definition: generic_macros_msa.h:297
hevc_hv_biwgt_4t_8multx4_msa
static void hevc_hv_biwgt_4t_8multx4_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width8mult)
Definition: hevc_mc_biw_msa.c:5230
LD_SH2
#define LD_SH2(...)
Definition: generic_macros_msa.h:280
hevc_hv_biwgt_8t_64w_msa
static void hevc_hv_biwgt_8t_64w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2674
hevc_hv_biwgt_4t_16w_msa
static void hevc_hv_biwgt_4t_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5936
hevc_hz_biwgt_4t_8x2_msa
static void hevc_hz_biwgt_4t_8x2_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2996
hevc_vt_biwgt_4t_4x4_msa
static void hevc_vt_biwgt_4t_4x4_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3627
ILVR_H2_SH
#define ILVR_H2_SH(...)
Definition: generic_macros_msa.h:1392
DPADD_SB2_SH
#define DPADD_SB2_SH(...)
Definition: generic_macros_msa.h:833
hevc_hz_biwgt_4t_8x6_msa
static void hevc_hz_biwgt_4t_8x6_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3052
LD_SH4
#define LD_SH4(...)
Definition: generic_macros_msa.h:299
out
FILE * out
Definition: movenc.c:54
SPLATI_H4_SH
#define SPLATI_H4_SH(...)
Definition: generic_macros_msa.h:1674
ILVL_B4_SH
#define ILVL_B4_SH(...)
Definition: generic_macros_msa.h:1276
hevc_hv_biwgt_8t_24w_msa
static void hevc_hv_biwgt_8t_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2608
ST_UB2
#define ST_UB2(...)
Definition: generic_macros_msa.h:363
src1
const pixel * src1
Definition: h264pred_template.c:421
hevc_hz_biwgt_8t_48w_msa
static void hevc_hz_biwgt_8t_48w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1220
PCKEV_H2_SW
#define PCKEV_H2_SW(...)
Definition: generic_macros_msa.h:1760
INSERT_W4_SH
#define INSERT_W4_SH(...)
Definition: generic_macros_msa.h:1155
HEVC_BIW_RND_CLIP4_MAX_SATU
#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, vec3, wgt, rnd, offset, out0, out1, out2, out3)
Definition: hevc_mc_biw_msa.c:72
BI_W_MC_HV
#define BI_W_MC_HV(PEL, WIDTH, TAP)
Definition: hevc_mc_biw_msa.c:6109
hevc_biwgt_copy_6w_msa
static void hevc_biwgt_copy_6w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:171
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
hevc_hz_biwgt_8t_64w_msa
static void hevc_hz_biwgt_8t_64w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1324
ff_hevc_mask_arr
static const uint8_t ff_hevc_mask_arr[16 *2]
Definition: hevc_mc_biw_msa.c:25
LD_SH
#define LD_SH(...)
Definition: generic_macros_msa.h:35
SLLI_2V
#define SLLI_2V(in0, in1, shift)
Definition: generic_macros_msa.h:1916
VSHF_B4_SB
#define VSHF_B4_SB(...)
Definition: generic_macros_msa.h:680
filter
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
Definition: filter_design.txt:228
hevc_hv_biwgt_4t_4w_msa
static void hevc_hv_biwgt_4t_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4919
hevc_vt_biwgt_4t_8x6_msa
static void hevc_vt_biwgt_4t_8x6_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3975
ST_UB4
#define ST_UB4(...)
Definition: generic_macros_msa.h:374
hevc_vt_biwgt_8t_8w_msa
static void hevc_vt_biwgt_8t_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1562
SRAR_W2_SW
#define SRAR_W2_SW(...)
Definition: generic_macros_msa.h:2034
hevc_biwgt_copy_8w_msa
static void hevc_biwgt_copy_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:247
hevc_biwgt_copy_12w_msa
static void hevc_biwgt_copy_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:335
INSERT_W2_SB
#define INSERT_W2_SB(...)
Definition: generic_macros_msa.h:1144
hevc_hz_biwgt_8t_12w_msa
static void hevc_hz_biwgt_8t_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:833
XORI_B4_128_SB
#define XORI_B4_128_SB(...)
Definition: generic_macros_msa.h:1851
generic_macros_msa.h
ST_W4
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: vp8_lpf_lsx.c:234
PCKEV_H4_SW
#define PCKEV_H4_SW(...)
Definition: generic_macros_msa.h:1769
LD_SB
#define LD_SB(...)
Definition: generic_macros_msa.h:33
hevc_hv_biwgt_8t_4w_msa
static void hevc_hv_biwgt_8t_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1985
LD_SB5
#define LD_SB5(...)
Definition: generic_macros_msa.h:308
ILVL_W2_SB
#define ILVL_W2_SB(...)
Definition: generic_macros_msa.h:1319
hevc_hz_biwgt_8t_4w_msa
static void hevc_hz_biwgt_8t_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:681
aligned
static int aligned(int val)
Definition: dashdec.c:170
hevc_hz_biwgt_8t_8w_msa
static void hevc_hz_biwgt_8t_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:753
CLIP_SW2_0_255
#define CLIP_SW2_0_255(in0, in1)
Definition: generic_macros_msa.h:972
ILVL_H2_SH
#define ILVL_H2_SH(...)
Definition: generic_macros_msa.h:1292
width
#define width
hevc_hv_biwgt_4t_8x2_msa
static void hevc_hv_biwgt_4t_8x2_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5134
HEVC_FILT_8TAP_SH
#define HEVC_FILT_8TAP_SH(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
Definition: hevc_macros_msa.h:24
ST_H8
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:429
hevc_vt_biwgt_4t_12w_msa
static void hevc_vt_biwgt_4t_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4151
UNPCK_R_SB_SH
#define UNPCK_R_SB_SH(in, out)
Definition: generic_macros_msa.h:2156
SRA_4V
#define SRA_4V(in0, in1, in2, in3, shift)
Definition: generic_macros_msa.h:1939
PCKEV_H4_SH
#define PCKEV_H4_SH(...)
Definition: generic_macros_msa.h:1768
HEVC_FILT_8TAP
#define HEVC_FILT_8TAP(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
Definition: hevc_macros_msa.h:35
hevc_hz_biwgt_8t_16w_msa
static void hevc_hz_biwgt_8t_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:928
hevc_hv_biwgt_4t_4x2_msa
static void hevc_hv_biwgt_4t_4x2_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4593
hevc_hv_biwgt_8t_12w_msa
static void hevc_hv_biwgt_8t_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2335
INSERT_D2_SB
#define INSERT_D2_SB(...)
Definition: generic_macros_msa.h:1170
hevc_hv_biwgt_8t_16w_msa
static void hevc_hv_biwgt_8t_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2586
hevc_hv_biwgt_4t_8x6_msa
static void hevc_hv_biwgt_4t_8x6_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5355
hevc_macros_msa.h
ILVR_B4_SB
#define ILVR_B4_SB(...)
Definition: generic_macros_msa.h:1360
LD2
#define LD2(psrc, stride, out0, out1)
Definition: generic_macros_msa.h:223
hevc_vt_biwgt_4t_24w_msa
static void hevc_vt_biwgt_4t_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4337
ILVR_D2_SB
#define ILVR_D2_SB(...)
Definition: generic_macros_msa.h:1444
ST_SH2
#define ST_SH2(...)
Definition: generic_macros_msa.h:366
PCKEV_B2_UB
#define PCKEV_B2_UB(...)
Definition: generic_macros_msa.h:1720
XORI_B5_128_SB
#define XORI_B5_128_SB(...)
Definition: generic_macros_msa.h:1859
hevc_biwgt_copy_48w_msa
static void hevc_biwgt_copy_48w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:574
hevc_vt_biwgt_4t_4x8multiple_msa
static void hevc_vt_biwgt_4t_4x8multiple_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3694
ILVRL_H2_SH
#define ILVRL_H2_SH(...)
Definition: generic_macros_msa.h:1508
hevc_vt_biwgt_4t_4x2_msa
static void hevc_vt_biwgt_4t_4x2_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3562
INSERT_W4_SB
#define INSERT_W4_SB(...)
Definition: generic_macros_msa.h:1154
hevc_hv_biwgt_4t_24w_msa
static void hevc_hv_biwgt_4t_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5964
hevc_hz_biwgt_8t_32w_msa
static void hevc_hz_biwgt_8t_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1134
DOTP_SB3_SH
#define DOTP_SB3_SH(...)
Definition: generic_macros_msa.h:776
hevc_vt_biwgt_8t_32w_msa
static void hevc_vt_biwgt_8t_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1922
hevc_vt_biwgt_8t_16multx2mult_msa
static void hevc_vt_biwgt_8t_16multx2mult_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width)
Definition: hevc_mc_biw_msa.c:1760
hevc_vt_biwgt_8t_4w_msa
static void hevc_vt_biwgt_8t_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1423
ST_W2
#define ST_W2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:450
hevc_hz_biwgt_4t_6w_msa
static void hevc_hz_biwgt_4t_6w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2922
hevc_hz_biwgt_4t_24w_msa
static void hevc_hz_biwgt_4t_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3402
hevc_hv_biwgt_4t_4multx8mult_msa
static void hevc_hv_biwgt_4t_4multx8mult_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4774
weight
static int weight(int i, int blen, int offset)
Definition: diracdec.c:1562
ILVR_D3_SB
#define ILVR_D3_SB(...)
Definition: generic_macros_msa.h:1452
ILVR_D4_SB
#define ILVR_D4_SB(...)
Definition: generic_macros_msa.h:1460
CLIP_SW4_0_255
#define CLIP_SW4_0_255(in0, in1, in2, in3)
Definition: generic_macros_msa.h:978
hevcdsp_mips.h
SLLI_4V
#define SLLI_4V(in0, in1, in2, in3, shift)
Definition: generic_macros_msa.h:1921
LD_SB7
#define LD_SB7(...)
Definition: generic_macros_msa.h:327
BI_W_MC_COPY
#define BI_W_MC_COPY(WIDTH)
Definition: hevc_mc_biw_msa.c:6008
XORI_B8_128_SB
#define XORI_B8_128_SB(...)
Definition: generic_macros_msa.h:1880
ILVR_B2_SB
#define ILVR_B2_SB(...)
Definition: generic_macros_msa.h:1338
XORI_B2_128_SB
#define XORI_B2_128_SB(...)
Definition: generic_macros_msa.h:1835
hevc_hz_biwgt_4t_32w_msa
static void hevc_hz_biwgt_4t_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3489
hevc_hv_biwgt_4t_4x4_msa
static void hevc_hv_biwgt_4t_4x4_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4677
height
#define height
hevc_vt_biwgt_8t_16w_msa
static void hevc_vt_biwgt_8t_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1876
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
hevc_hz_biwgt_4t_4x4_msa
static void hevc_hz_biwgt_4t_4x4_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2756
LW4
#define LW4(psrc, stride, out0, out1, out2, out3)
Definition: generic_macros_msa.h:202
ST_D2
#define ST_D2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:491
SPLATI_H2_SH
#define SPLATI_H2_SH(...)
Definition: generic_macros_msa.h:1656
hevc_vt_biwgt_8t_12w_msa
static void hevc_vt_biwgt_8t_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1650
LD_SB6
#define LD_SB6(...)
Definition: generic_macros_msa.h:316
hevc_vt_biwgt_8t_64w_msa
static void hevc_vt_biwgt_8t_64w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1964
SPLATI_W4_SH
#define SPLATI_W4_SH(...)
Definition: generic_macros_msa.h:1700
ILVRL_H2_SW
#define ILVRL_H2_SW(...)
Definition: generic_macros_msa.h:1509
HEVC_FILT_4TAP_SH
#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)
Definition: hevc_macros_msa.h:46
XORI_B6_128_SB
#define XORI_B6_128_SB(...)
Definition: generic_macros_msa.h:1866
PCKEV_B2_SH
#define PCKEV_B2_SH(...)
Definition: generic_macros_msa.h:1721
hevc_biwgt_copy_24w_msa
static void hevc_biwgt_copy_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:449
HEVC_BIW_RND_CLIP4
#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3, wgt, rnd, offset, out0, out1, out2, out3)
Definition: hevc_mc_biw_msa.c:49
LD_SH8
#define LD_SH8(...)
Definition: generic_macros_msa.h:338
hevc_vt_biwgt_8t_24w_msa
static void hevc_vt_biwgt_8t_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1897
hevc_hz_biwgt_4t_8w_msa
static void hevc_hz_biwgt_4t_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3196
src2
const pixel * src2
Definition: h264pred_template.c:422
CLIP_SH_0_255
#define CLIP_SH_0_255(in)
Definition: generic_macros_msa.h:935
ST_W8
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:470
PCKEV_B3_UB
#define PCKEV_B3_UB(...)
Definition: generic_macros_msa.h:1729
LD4
#define LD4(psrc, stride, out0, out1, out2, out3)
Definition: generic_macros_msa.h:228
ILVL_B2_SB
#define ILVL_B2_SB(...)
Definition: generic_macros_msa.h:1263
hevc_hv_biwgt_8t_8w_msa
static void hevc_hv_biwgt_8t_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2313
DPADD_SB4_SH
#define DPADD_SB4_SH(...)
Definition: generic_macros_msa.h:841
ST_H2
#define ST_H2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:409
SPLATI_W2_SH
#define SPLATI_W2_SH(...)
Definition: generic_macros_msa.h:1692
hevc_vt_biwgt_8t_48w_msa
static void hevc_vt_biwgt_8t_48w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1943
hevc_vt_biwgt_4t_8x4multiple_msa
static void hevc_vt_biwgt_4t_8x4multiple_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4046
hevc_vt_biwgt_4t_32w_msa
static void hevc_vt_biwgt_4t_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4474
HEVC_BIW_RND_CLIP2
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1)
Definition: hevc_mc_biw_msa.c:31
LD_SB3
#define LD_SB3(...)
Definition: generic_macros_msa.h:289
hevc_hz_biwgt_4t_8x4multiple_msa
static void hevc_hz_biwgt_4t_8x4multiple_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3126
ILVL_H4_SH
#define ILVL_H4_SH(...)
Definition: generic_macros_msa.h:1301
HEVC_BIW_RND_CLIP2_MAX_SATU
#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1)
Definition: hevc_mc_biw_msa.c:56
ST_UB
#define ST_UB(...)
Definition: generic_macros_msa.h:40
hevc_vt_biwgt_4t_8w_msa
static void hevc_vt_biwgt_4t_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4120
hevc_biwgt_copy_4w_msa
static void hevc_biwgt_copy_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:82
hevc_hz_biwgt_4t_12w_msa
static void hevc_hz_biwgt_4t_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3227
BI_W_MC
#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
Definition: hevc_mc_biw_msa.c:6045
ILVR_D2_SH
#define ILVR_D2_SH(...)
Definition: generic_macros_msa.h:1445
hevc_hv_biwgt_4t_8w_msa
static void hevc_hv_biwgt_4t_8w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5657
hevc_vt_biwgt_4t_4w_msa
static void hevc_vt_biwgt_4t_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3780
ST_D4
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:499
DOTP_SB4_SH
#define DOTP_SB4_SH(...)
Definition: generic_macros_msa.h:784
ILVL_B4_SB
#define ILVL_B4_SB(...)
Definition: generic_macros_msa.h:1274
hevc_hv_biwgt_4t_32w_msa
static void hevc_hv_biwgt_4t_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5986
LD_SB8
#define LD_SB8(...)
Definition: generic_macros_msa.h:336
ILVR_B4_SH
#define ILVR_B4_SH(...)
Definition: generic_macros_msa.h:1362
src0
const pixel *const src0
Definition: h264pred_template.c:420
hevc_biwgt_copy_16w_msa
static void hevc_biwgt_copy_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:394
LD_SH6
#define LD_SH6(...)
Definition: generic_macros_msa.h:318
hevc_hz_biwgt_4t_4x2_msa
static void hevc_hz_biwgt_4t_4x2_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2696
zero
#define zero
Definition: regdef.h:64
hevc_vt_biwgt_4t_16w_msa
static void hevc_vt_biwgt_4t_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4243
hevc_vt_biwgt_4t_6w_msa
static void hevc_vt_biwgt_4t_6w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3811
hevc_hv_biwgt_4t_6w_msa
static void hevc_hv_biwgt_4t_6w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4951
XORI_B7_128_SB
#define XORI_B7_128_SB(...)
Definition: generic_macros_msa.h:1873
hevc_hz_biwgt_8t_24w_msa
static void hevc_hz_biwgt_8t_24w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1012
ILVRL_B2_SH
#define ILVRL_B2_SH(...)
Definition: generic_macros_msa.h:1498
ST_SH
#define ST_SH(...)
Definition: generic_macros_msa.h:43
hevc_vt_biwgt_4t_8x2_msa
static void hevc_vt_biwgt_4t_8x2_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3918
HEVC_FILT_4TAP
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
Definition: hevc_macros_msa.h:64
hevc_hz_biwgt_4t_16w_msa
static void hevc_hz_biwgt_4t_16w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3313
int32_t
int32_t
Definition: audioconvert.c:56
h
h
Definition: vp9dsp_template.c:2038
ILVR_H4_SH
#define ILVR_H4_SH(...)
Definition: generic_macros_msa.h:1408
hevc_hz_biwgt_4t_4w_msa
static void hevc_hz_biwgt_4t_4w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2891
ILVR_B2_SH
#define ILVR_B2_SH(...)
Definition: generic_macros_msa.h:1340
hevc_hv_biwgt_8t_8multx2mult_msa
static void hevc_hv_biwgt_8t_8multx2mult_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width8mult)
Definition: hevc_mc_biw_msa.c:2130
PCKEV_D2_SH
#define PCKEV_D2_SH(...)
Definition: generic_macros_msa.h:1789
INSERT_D2_SH
#define INSERT_D2_SH(...)
Definition: generic_macros_msa.h:1171
SD
#define SD
Definition: ccaption_dec.c:924
PCKEV_H2_SH
#define PCKEV_H2_SH(...)
Definition: generic_macros_msa.h:1759
XORI_B3_128_SB
#define XORI_B3_128_SB(...)
Definition: generic_macros_msa.h:1843
LD_SB2
#define LD_SB2(...)
Definition: generic_macros_msa.h:278
hevc_hv_biwgt_4t_8multx4mult_msa
static void hevc_hv_biwgt_4t_8multx4mult_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width)
Definition: hevc_mc_biw_msa.c:5506
hevc_hv_biwgt_8t_32w_msa
static void hevc_hv_biwgt_8t_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2630
hevc_hv_biwgt_4t_12w_msa
static void hevc_hv_biwgt_4t_12w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5694
hevc_biwgt_copy_64w_msa
static void hevc_biwgt_copy_64w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:626
SRAR_W4_SW
#define SRAR_W4_SW(...)
Definition: generic_macros_msa.h:2041
LW2
#define LW2(psrc, stride, out0, out1)
Definition: generic_macros_msa.h:210
hevc_biwgt_copy_32w_msa
static void hevc_biwgt_copy_32w_msa(const uint8_t *src0_ptr, int32_t src_stride, const int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:513