#define HEVC_HV_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd,      \
                               out0, out1, out2, out3)                    \
{                                                                         \
    MUL4(in0, wgt, in1, wgt, in2, wgt, in3, wgt, out0, out1, out2, out3); \
    SRAR_W4_SW(out0, out1, out2, out3, rnd);                              \
    ADD4(out0, offset, out1, offset, out2, offset, out3, offset,          \
         out0, out1, out2, out3);                                         \
    out0 = CLIP_SW_0_255(out0);                                           \
    out1 = CLIP_SW_0_255(out1);                                           \
    out2 = CLIP_SW_0_255(out2);                                           \
    out3 = CLIP_SW_0_255(out3);                                           \
}
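/* Like the HV variant above, but for 16-bit inputs: each v8i16 input is
 * interleaved with itself (right and left halves) and dot-multiplied with
 * the 32-bit weight vector (the weight sits in the low halfword of each
 * lane, the high halfword is zero), which widens in * weight to 32 bits
 * before rounding, offsetting and clipping to [0, 255]. */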
#define HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd,             \
                            out0_r, out1_r, out0_l, out1_l)         \
{                                                                   \
    ILVR_H2_SW(in0, in0, in1, in1, out0_r, out1_r);                 \
    ILVL_H2_SW(in0, in0, in1, in1, out0_l, out1_l);                 \
    DOTP_SH4_SW(out0_r, out1_r, out0_l, out1_l, wgt, wgt, wgt, wgt, \
                out0_r, out1_r, out0_l, out1_l);                    \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                \
    ADD4(out0_r, offset, out1_r, offset,                            \
         out0_l, offset, out1_l, offset,                            \
         out0_r, out1_r, out0_l, out1_l);                           \
    out0_r = CLIP_SW_0_255(out0_r);                                 \
    out1_r = CLIP_SW_0_255(out1_r);                                 \
    out0_l = CLIP_SW_0_255(out0_l);                                 \
    out1_l = CLIP_SW_0_255(out1_l);                                 \
}
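/* Four-input wrapper: applies HEVC_UNIW_RND_CLIP2 to (in0, in1) and
 * (in2, in3) with the same weight, offset and rounding vectors. */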
#define HEVC_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd, \
                            out0_r, out1_r, out2_r, out3_r,       \
                            out0_l, out1_l, out2_l, out3_l)       \
{                                                                 \
    HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd,               \
                        out0_r, out1_r, out0_l, out1_l);          \
    HEVC_UNIW_RND_CLIP2(in2, in3, wgt, offset, rnd,               \
                        out2_r, out3_r, out2_l, out3_l);          \
}
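/* Scalar equivalent of what these macros compute per sample (a reference
 * sketch only; the MSA code below performs it on whole vectors):
 *
 *     dst = av_clip_uint8(((src * weight) >> rnd_val) + offset);
 *
 * where src is the 8-bit input sample upshifted by 6 bits. */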
    v4i32 weight_vec, offset_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

        LD_SB2(src, src_stride, src0, src1);
        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);

        DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
        ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
    } else if (4 == height) {
        v4i32 dst0_r, dst1_r;
        v4i32 dst0_l, dst1_l;

        LD_SB4(src, src_stride, src0, src1, src2, src3);
        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
                            dst0_r, dst1_r, dst0_l, dst1_l);
        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 dst0, dst1, dst2, dst3;
        v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
        v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
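        /* Each pass handles eight 4-wide rows: consecutive rows are packed
         * word-wise into shared vectors before widening and weighting. */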
        for (loop_cnt = (height >> 3); loop_cnt--;) {
                    src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
                       src0, src1, src2, src3);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0, dst1, dst2, dst3);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
                                weight_vec, offset_vec, rnd_vec,
                                dst0_r, dst1_r, dst2_r, dst3_r,
                                dst0_l, dst1_l, dst2_l, dst3_l);
                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
            dst += (8 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
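    /* Width 6: rows are processed 8 wide, but ST6x4_UB stores only the
     * leftmost 6 bytes of each row. */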
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   dst4, dst5, dst6, dst7);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    v4i32 weight_vec, offset_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

        v4i32 dst0_r, dst1_r, dst0_l, dst1_l;

        LD_SB2(src, src_stride, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
                            dst0_r, dst1_r, dst0_l, dst1_l);
    } else if (6 == height) {
        v16i8 src0, src1, src2, src3, src4, src5;
        v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
        v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;

        LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);
        ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                            dst4_r, dst5_r, dst4_l, dst5_l);
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        dst4_l, dst4_r, dst5_l, dst5_r,
                        dst0_r, dst1_r, dst2_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == height % 4) {
        v8i16 dst0, dst1, dst2, dst3;
        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0, dst1, dst2, dst3);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
                                weight_vec, offset_vec, rnd_vec,
                                dst0_r, dst1_r, dst2_r, dst3_r,
                                dst0_l, dst1_l, dst2_l, dst3_l);
                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
            dst += (4 * dst_stride);
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
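    /* Width 12: dst0..dst3 cover the left 8 columns of four rows; the
     * remaining 4 columns of those rows are packed into dst4/dst5, and
     * ST12x4_UB writes 12 bytes per row. */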
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                            dst4_r, dst5_r, dst4_l, dst5_l);
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        dst4_l, dst4_r, dst5_l, dst5_r,
                        dst0_r, dst1_r, dst2_r);
        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
        dst += (4 * dst_stride);
    uint32_t loop_cnt, cnt;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
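    /* Widths that are multiples of 16: iterate over 16-column tiles
     * (cnt = width >> 4), four rows at a time, widening the low and high
     * byte halves of each row separately. */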
    for (cnt = width >> 4; cnt--;) {
        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src0, src1, src2, src3);
            src_tmp += (4 * src_stride);
            ILVR_B2_SH(zero, src0, zero, src1, tmp0, tmp1);
            ILVL_B2_SH(zero, src0, zero, src1, tmp2, tmp3);

            SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
                                weight_vec, offset_vec, rnd_vec,
                                dst0_r, dst1_r, dst2_r, dst3_r,
                                dst0_l, dst1_l, dst2_l, dst3_l);
                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);

            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            ILVR_B2_SH(zero, src2, zero, src3, tmp0, tmp1);
            ILVL_B2_SH(zero, src2, zero, src3, tmp2, tmp3);

            SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
                                weight_vec, offset_vec, rnd_vec,
                                dst0_r, dst1_r, dst2_r, dst3_r,
                                dst0_l, dst1_l, dst2_l, dst3_l);
                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);

            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

                                      height, weight, offset, rnd_val, 16);

                                      height, weight, offset, rnd_val, 16);
                                      height, weight, offset, rnd_val);

                                      height, weight, offset, rnd_val, 32);

                                      height, weight, offset, rnd_val, 48);

                                      height, weight, offset, rnd_val, 64);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
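    /* 8-tap horizontal filter, width 4: the upper half of mask0 indexes the
     * second source operand (elements 16..20), so each VSHF/DPADD pass
     * filters two rows at once. */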
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (8 * dst_stride);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);

                              filter, height, weight, offset, rnd_val);

                              filter, height, weight, offset, rnd_val);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
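    /* Width 16, two rows per iteration: src0/src2 hold columns 0..15 and
     * src1/src3 columns 8..23 of the two rows, yielding the left and right
     * 8-wide halves of each output row. */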
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
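    /* Width 24: mask4..mask7 (derived from mask0 in setup code not shown
     * here) select across the src0/src1 pair so the outputs that straddle
     * the two 16-byte loads can be filtered in one pass. */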
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        LD_SB2(src, 16, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                            dst4_r, dst5_r, dst4_l, dst5_l);
                        dst3_l, dst3_r, dst4_l, dst4_r, dst0_r, dst1_r);

        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        ST8x2_UB(dst2_r, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, 16);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = height; loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src3 = LD_SB(src + 40);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                            dst4_r, dst5_r, dst4_l, dst5_l);
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        dst4_l, dst4_r, dst5_l, dst5_r,
                        dst0_r, dst1_r, dst2_r);
        ST_SW2(dst0_r, dst1_r, dst, 16);
        ST_SW(dst2_r, dst + 32);
    uint32_t loop_cnt, cnt;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = height; loop_cnt--;) {
        for (cnt = 2; cnt--;) {
            LD_SB2(src_tmp, 16, src0, src1);
            src2 = LD_SB(src_tmp + 24);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst0, dst0, dst0, dst0);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst1, dst1, dst1, dst1);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst2, dst2, dst2, dst2);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst3, dst3, dst3, dst3);
                                weight_vec, offset_vec, rnd_vec,
                                dst0_r, dst1_r, dst2_r, dst3_r,
                                dst0_l, dst1_l, dst2_l, dst3_l);
                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST_SW2(dst0_r, dst1_r, dst_tmp, 16);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
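    /* Vertical 8-tap, width 4: seven rows are preloaded, consecutive rows
     * are byte-interleaved and two row pairs are packed per vector
     * (ilvr_d), so each dot-product chain produces two output rows. */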
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
               src32_r, src65_r, src54_r, src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
                src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);

        DPADD_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt1,
                     filt2, filt3, dst10, dst10, dst10, dst10);
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (8 * dst_stride);

        src4332 = src12111110;
        src6554 = src14131312;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
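    /* Vertical 8-tap, width 8: byte-interleaved row pairs (src10_r,
     * src21_r, ...) feed four dot-product accumulations per 4-row
     * iteration. */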
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
                     filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
                     filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
                     filt0, filt1, filt2, filt3, tmp4, tmp4, tmp4, tmp4);
                     filt0, filt1, filt2, filt3, tmp5, tmp5, tmp5, tmp5);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                            dst4_r, dst5_r, dst4_l, dst5_l);
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        dst4_l, dst4_r, dst5_l, dst5_r,
                        dst0_r, dst1_r, dst2_r);
        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
        dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
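    /* Vertical 8-tap for widths that are multiples of 16: each 16-column
     * tile is filtered through both the right- and left-interleaved row
     * pairs, two output rows per inner iteration. */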
    for (cnt = (width >> 4); cnt--;) {
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            src_tmp += (2 * src_stride);

            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
                         filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
                         filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
                         filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
                         filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
                                weight_vec, offset_vec, rnd_vec,
                                dst0_r, dst1_r, dst2_r, dst3_r,
                                dst0_l, dst1_l, dst2_l, dst3_l);
                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

                                  filter, height, weight,
                                  offset, rnd_val, 16);

                                  filter, height, weight,
                                  offset, rnd_val, 16);
                                  filter, height, weight, offset, rnd_val);

                                  filter, height, weight,
                                  offset, rnd_val, 32);

                                  filter, height, weight,
                                  offset, rnd_val, 48);

                                  filter, height, weight,
                                  offset, rnd_val, 64);
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
    v4i32 dst0_r, dst1_r, weight_vec, offset_vec, rnd_vec;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
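    /* HV 8-tap, width 4: the sign of the y-filter taps is extended to 16
     * bits (clti_s_b + ilvr_b) so the vertical stage can run as 32-bit
     * dot products; the horizontal stage filters two rows per VSHF pass. */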
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                 dst30, dst30, dst30, dst30);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                 dst41, dst41, dst41, dst41);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                 dst52, dst52, dst52, dst52);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                 dst63, dst63, dst63, dst63);

    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
               dst10_r, dst21_r, dst32_r);

    dst43_r = __msa_ilvl_h(dst41, dst30);
    dst54_r = __msa_ilvl_h(dst52, dst41);
    dst65_r = __msa_ilvl_h(dst63, dst52);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB2(src, src_stride, src7, src8);
        src += (2 * src_stride);

        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst87, dst87, dst87, dst87);
        dst76_r = __msa_ilvr_h(dst87, dst66);
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
                                filt_h0, filt_h1, filt_h2, filt_h3);

        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
        ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
        dst += (2 * dst_stride);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src -= ((3 * src_stride) + 3);
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
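    /* HV 8-tap for widths that are multiples of 8: per 8-column tile,
     * seven prologue rows are filtered horizontally, then each iteration
     * adds two rows and applies the vertical taps to the interleaved
     * 16-bit columns. */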
    for (cnt = width >> 3; cnt--;) {
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);

        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_r, dst32_r, dst54_r, dst21_r);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_l, dst32_l, dst54_l, dst21_l);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            src_tmp += 2 * src_stride;

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst7, dst7, dst7, dst7);
                                 filt_h0, filt_h1, filt_h2, filt_h3);
                                 filt_h0, filt_h1, filt_h2, filt_h3);

            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst8, dst8, dst8, dst8);
                                 filt_h0, filt_h1, filt_h2, filt_h3);
                                 filt_h0, filt_h1, filt_h2, filt_h3);
                                weight_vec, offset_vec, rnd_vec,
                                dst0_r, dst1_r, dst0_l, dst1_l);

            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  filter_x, filter_y, height, weight,
                                  offset, rnd_val, 8);

                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  filter_x, filter_y, height, weight,
                                  offset, rnd_val, 8);
                                  filter_x, filter_y, height, weight, offset,

                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   filter_x, filter_y, height, weight,
                                   offset, rnd_val, 16);

                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   filter_x, filter_y, height, weight,
                                   offset, rnd_val, 24);

                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   filter_x, filter_y, height, weight,
                                   offset, rnd_val, 32);

                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   filter_x, filter_y, height, weight,
                                   offset, rnd_val, 48);

                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   filter_x, filter_y, height, weight,
                                   offset, rnd_val, 64);
    v4i32 dst0_r, dst0_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
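    /* 4-tap horizontal filter, 4x2 block: mask0 spans both source vectors,
     * so one VSHF_B2 pass gathers the taps for both rows before the
     * weighted rounding and clipping. */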
    LD_SB2(src, src_stride, src0, src1);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);

    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
    ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
    dst += (4 * dst_stride);
    v16i8 mask1, vec0, vec1;
    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    LD_SB4(src, src_stride, src0, src1, src2, src3);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
                        dst0_r, dst1_r, dst0_l, dst1_l);
    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (8 * dst_stride);

                             filter, height, weight, offset, rnd_val);
    } else if (4 == height) {
                             filter, height, weight, offset, rnd_val);
    } else if (8 == height || 16 == height) {
                             filter, height, weight,
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);

        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    v8i16 filt0, filt1, dst0, dst1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    LD_SB2(src, src_stride, src0, src1);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
                        dst0_r, dst1_r, dst0_l, dst1_l);
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
                        weight_vec, offset_vec, rnd_vec,
                        dst0_r, dst1_r, dst2_r, dst3_r,
                        dst0_l, dst1_l, dst2_l, dst3_l);
                        dst4_r, dst5_r, dst4_l, dst5_l);
                    dst2_l, dst2_r, dst3_l, dst3_r,
                    dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);

    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
    dst += (4 * dst_stride);
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);

        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
                             filter, height, weight, offset, rnd_val);
    } else if (6 == height) {
                             filter, height, weight, offset, rnd_val);
                             filter, height, weight, offset,
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                            dst4_r, dst5_r, dst4_l, dst5_l);
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        dst4_l, dst4_r, dst5_l, dst5_r,
                        dst0_r, dst1_r, dst2_r);

        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
        dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        src += (2 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
                        dst0_r, dst1_r, dst0_l, dst1_l);
        ST8x2_UB(dst0_r, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, 16);

        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, 16);
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332;
    v4i32 dst0_r, dst0_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
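    /* Vertical 4-tap, width 4: three preloaded rows give src10_r/src21_r;
     * two more rows complete the 4-tap window, with two row pairs packed
     * per vector (ilvr_d) ahead of the dot products. */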
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);

    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
    ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB4(src, src_stride, src3, src4, src5, src6);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
                        dst0_r, dst1_r, dst0_l, dst1_l);
    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
        src += (6 * src_stride);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src4332, src6554, src8776);

        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);

        LD_SB2(src, src_stride, src9, src2);
        src += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);

        HEVC_UNIW_RND_CLIP4(dst10, dst32, dst54, dst76,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);

        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (8 * dst_stride);
    }
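
/* hevc_vt_uniwgt_4t_4w_msa: dispatch on block height to the fixed-size
 * 4-wide kernels above. */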
    if (2 == height) {
        hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
                                  filter, height, weight, offset, rnd_val);
    } else if (4 == height) {
        hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
                                  filter, height, weight, offset, rnd_val);
    } else if (0 == (height % 8)) {
        hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
                                          filter, height, weight, offset,
                                          rnd_val);
    }
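
/* hevc_vt_uniwgt_4t_6w_msa: 6-pixel columns ride in 8-wide vectors;
 * ST6x4_UB stores four 6-byte rows. */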
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);

        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);

        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);

        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);

        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);

        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
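
/* hevc_vt_uniwgt_4t_8x2_msa: single-shot 8x2 case, no loop. */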
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 tmp0, tmp1;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    LD_SB2(src, src_stride, src3, src4);

    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);

    HEVC_UNIW_RND_CLIP2(tmp0, tmp1, weight_vec, offset_vec, rnd_vec,
                        dst0_r, dst1_r, dst0_l, dst1_l);
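
/* hevc_vt_uniwgt_4t_8x6_msa: 8x6 block in one pass; six filtered rows feed
 * one CLIP4 and one CLIP2 weighting step. */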
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);

    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, tmp2, tmp2);
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, tmp3, tmp3);
    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, tmp4, tmp4);
    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, tmp5, tmp5);

    HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                        weight_vec, offset_vec, rnd_vec,
                        dst0_r, dst1_r, dst2_r, dst3_r,
                        dst0_l, dst1_l, dst2_l, dst3_l);
    HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec,
                        dst4_r, dst5_r, dst4_l, dst5_l);

    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
                     dst2_l, dst2_r, dst3_l, dst3_r,
                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
    dst += (4 * dst_stride);
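
/* hevc_vt_uniwgt_4t_8x4multiple_msa: generic 8-wide case, four rows per
 * iteration, re-interleaving the freshest rows to refill the filter taps. */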
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);

        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);

        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);

        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);

        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
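
/* hevc_vt_uniwgt_4t_8w_msa: height dispatch for the 8-wide kernels. */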
    if (2 == height) {
        hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
                                  filter, height, weight, offset, rnd_val);
    } else if (6 == height) {
        hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
                                  filter, height, weight, offset, rnd_val);
    } else {
        hevc_vt_uniwgt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
                                          filter, height, weight, offset,
                                          rnd_val);
    }
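
/* hevc_vt_uniwgt_4t_12w_msa: an 8-wide half from the ILVR interleaves plus a
 * 4-wide remainder packed from the ILVL halves of adjacent rows. */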
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;

    src -= (1 * src_stride);

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, tmp4, tmp4);

        LD_SB2(src, src_stride, src5, src2);
        src += (2 * src_stride);

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
        DPADD_SB2_SH(src4332, src2110, filt0, filt1, tmp5, tmp5);

        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
        HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec,
                            dst4_r, dst5_r, dst4_l, dst5_l);

        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
                         dst2_l, dst2_r, dst3_l, dst3_r,
                         dst4_l, dst4_r, dst5_l, dst5_r,
                         dst0_r, dst1_r, dst2_r);
        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
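
/* hevc_vt_uniwgt_4t_16w_msa: full 16-wide rows; ILVR/ILVL cover the low and
 * high byte halves, four rows per iteration. */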
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp2, tmp2);
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp3, tmp3);

        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);

        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        LD_SB2(src, src_stride, src5, src2);
        src += (2 * src_stride);

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp2, tmp2);
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp3, tmp3);

        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);

        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
    }
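
/* hevc_vt_uniwgt_4t_24w_msa: a 16-wide column plus an 8-wide column at
 * src + 16, filtered in the same pass. */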
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);

    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);

        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);

        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
        HEVC_UNIW_RND_CLIP2(tmp2, tmp3, weight_vec, offset_vec, rnd_vec,
                            dst4_r, dst5_r, dst4_l, dst5_l);

        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);

        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        ST8x2_UB(dst4_r, dst + 16, dst_stride);
        dst += (2 * dst_stride);

        LD_SB2(src, src_stride, src5, src2);

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);

        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp4, tmp4);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp5, tmp5);
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, tmp2, tmp2);
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, tmp3, tmp3);

        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
        HEVC_UNIW_RND_CLIP2(tmp2, tmp3, weight_vec, offset_vec, rnd_vec,
                            dst4_r, dst5_r, dst4_l, dst5_l);

        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);

        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        ST8x2_UB(dst4_r, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
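
/* hevc_vt_uniwgt_4t_32w_msa: two independent 16-wide columns, two rows per
 * iteration; the second column is written through dst_tmp. */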
    uint32_t loop_cnt;
    uint8_t *dst_tmp = dst + 16;
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l;

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);

    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);

        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);

        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);

        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, tmp6, tmp6);
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, tmp7, tmp7);

        HEVC_UNIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
                            weight_vec, offset_vec, rnd_vec,
                            dst4_r, dst5_r, dst6_r, dst7_r,
                            dst4_l, dst5_l, dst6_l, dst7_l);

        HEVC_PCK_SW_SB8(dst4_l, dst4_r, dst6_l, dst6_r,
                        dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r);
        ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);
    }
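
/* Horizontal-plus-vertical (HV) 4-tap kernels follow.  The horizontal pass
 * gathers pixel pairs with VSHF byte shuffles and reduces them with signed
 * dot products; the vertical pass runs HEVC_FILT_4TAP over the interleaved
 * 16-bit column history, and the result is weighted, rounded, offset and
 * clipped as in the vertical kernels.
 *
 * hevc_hv_uniwgt_4t_4x2_msa: 4x2 output block. */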
static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y, int32_t height,
                                      int32_t weight, int32_t offset,
                                      int32_t rnd_val)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v4i32 dst0_r, dst1_r;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    LD_SB2(src, src_stride, src3, src4);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst32_r = __msa_ilvr_h(dst3, dst2);

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst43_r = __msa_ilvr_h(dst4, dst3);

    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
    ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
}
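
/* hevc_hv_uniwgt_4t_4x4_msa: same scheme over four output rows; dst10_r and
 * dst21_r are refilled from rows 5 and 6 for the lower pair. */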
static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y, int32_t height,
                                      int32_t weight, int32_t offset,
                                      int32_t rnd_val)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    LD_SB4(src, src_stride, src3, src4, src5, src6);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst32_r = __msa_ilvr_h(dst3, dst2);

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst43_r = __msa_ilvr_h(dst4, dst3);

    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst10_r = __msa_ilvr_h(dst5, dst4);

    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    dst21_r = __msa_ilvr_h(dst2, dst5);

    HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst2_r, dst3_r,
                           weight_vec, offset_vec, rnd_vec,
                           dst0_r, dst1_r, dst2_r, dst3_r);

    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
}
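
/* hevc_hv_uniwgt_4t_4multx8mult_msa: heights that are multiples of 8; the
 * 16-bit column history carries across loop iterations. */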
static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter_x,
                                              const int8_t *filter_y,
                                              int32_t height, int32_t weight,
                                              int32_t offset, int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst32_r = __msa_ilvr_h(dst3, dst2);

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst43_r = __msa_ilvr_h(dst4, dst3);

        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst54_r = __msa_ilvr_h(dst5, dst4);

        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst65_r = __msa_ilvr_h(dst6, dst5);

        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst76_r = __msa_ilvr_h(dst7, dst6);

        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
        dst87_r = __msa_ilvr_h(dst8, dst7);

        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
        dst10_r = __msa_ilvr_h(dst9, dst8);

        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
        dst21_r = __msa_ilvr_h(dst2, dst9);

        HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst2_r, dst3_r,
                               weight_vec, offset_vec, rnd_vec,
                               dst0_r, dst1_r, dst2_r, dst3_r);

        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        HEVC_HV_UNIW_RND_CLIP4(dst4_r, dst5_r, dst6_r, dst7_r,
                               weight_vec, offset_vec, rnd_vec,
                               dst4_r, dst5_r, dst6_r, dst7_r);

        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
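
/* hevc_hv_uniwgt_4t_4w_msa: height dispatch for the 4-wide HV kernels. */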
static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y, int32_t height,
                                     int32_t weight, int32_t offset,
                                     int32_t rnd_val)
{
    if (2 == height) {
        hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, height, weight,
                                  offset, rnd_val);
    } else if (4 == height) {
        hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, height, weight,
                                  offset, rnd_val);
    } else if (0 == (height % 8)) {
        hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                          filter_x, filter_y, height, weight,
                                          offset, rnd_val);
    }
}
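
/* hevc_hv_uniwgt_4t_6w_msa: 8-wide interleaved arithmetic, but only six
 * pixels per row are stored via ST6x4_UB. */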
static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y, int32_t height,
                                     int32_t weight, int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);

        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);

        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);

        HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
                               weight_vec, offset_vec, rnd_vec,
                               dst0_r, dst1_r, dst0_l, dst1_l);
        HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
                               weight_vec, offset_vec, rnd_vec,
                               dst2_r, dst3_r, dst2_l, dst3_l);

        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
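
/* hevc_hv_uniwgt_4t_8x2_msa: one 8x2 block, no loop. */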
static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y, int32_t height,
                                      int32_t weight, int32_t offset,
                                      int32_t rnd_val)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);

    HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
                           weight_vec, offset_vec, rnd_vec,
                           dst0_r, dst1_r, dst0_l, dst1_l);

    dst += (2 * dst_stride);
}
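
/* hevc_hv_uniwgt_4t_8x6_msa: 8x6 block with six rows of column history live
 * at once; three CLIP4 calls weight the row pairs. */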
static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y, int32_t height,
                                      int32_t weight, int32_t offset,
                                      int32_t rnd_val)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);

    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);

    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);

    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);

    LD_SB2(src, src_stride, src7, src8);
    src += (2 * src_stride);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);

    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);

    HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
                           weight_vec, offset_vec, rnd_vec,
                           dst0_r, dst1_r, dst0_l, dst1_l);
    HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
                           weight_vec, offset_vec, rnd_vec,
                           dst2_r, dst3_r, dst2_l, dst3_l);
    HEVC_HV_UNIW_RND_CLIP4(dst4_r, dst5_r, dst4_l, dst5_l,
                           weight_vec, offset_vec, rnd_vec,
                           dst4_r, dst5_r, dst4_l, dst5_l);

    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
                     dst2_l, dst2_r, dst3_l, dst3_r,
                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
    dst += (4 * dst_stride);
}
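
/* hevc_hv_uniwgt_4t_8multx4mult_msa: generic kernel for widths that are
 * multiples of 8; the outer loop walks 8-pixel columns, the inner loop emits
 * four rows per pass. */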
static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter_x,
                                              const int8_t *filter_y,
                                              int32_t height, int32_t weight,
                                              int32_t offset, int32_t rnd_val,
                                              int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);

            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);

            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);

            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);

            HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
                                   weight_vec, offset_vec, rnd_vec,
                                   dst0_r, dst1_r, dst0_l, dst1_l);
            HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
                                   weight_vec, offset_vec, rnd_vec,
                                   dst2_r, dst3_r, dst2_l, dst3_l);

            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);
        }

        src += 8;
        dst += 8;
    }
}
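
/* Width-specific HV wrappers: 8w dispatches on height, 12w splits into an
 * 8-wide and a 4-wide run, and the wider sizes feed the generic 8-column
 * kernel directly. */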
static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y, int32_t height,
                                     int32_t weight, int32_t offset,
                                     int32_t rnd_val)
{
    if (2 == height) {
        hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, height, weight,
                                  offset, rnd_val);
    } else if (6 == height) {
        hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, height, weight,
                                  offset, rnd_val);
    } else if (0 == (height % 4)) {
        hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                          filter_x, filter_y, height, weight,
                                          offset, rnd_val, 8);
    }
}
static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y, int32_t height,
                                      int32_t weight, int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 8);
    hevc_hv_uniwgt_4t_4multx8mult_msa(src + 8, src_stride, dst + 8,
                                      dst_stride, filter_x, filter_y,
                                      height, weight, offset, rnd_val);
}
static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y, int32_t height,
                                      int32_t weight, int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 16);
}

static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y, int32_t height,
                                      int32_t weight, int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 24);
}

static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y, int32_t height,
                                      int32_t weight, int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 32);
}
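
/* Public entry points.  Each macro expands to one ff_hevc_put_hevc_* function
 * with the HEVCDSP uni-weighted prediction prototype.  The store shift is
 * denom + 14 - 8: HEVC inter prediction keeps 14-bit intermediates and this
 * file handles 8-bit output. */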
#define UNIWGT_MC_COPY(WIDTH)                                              \
void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,        \
                                                      ptrdiff_t dst_stride, \
                                                      uint8_t *src,        \
                                                      ptrdiff_t src_stride, \
                                                      int height, int denom, \
                                                      int weight, int offset, \
                                                      intptr_t mx, intptr_t my, \
                                                      int width)           \
{                                                                          \
    int shift = denom + 14 - 8;                                            \
                                                                           \
    hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride,      \
                                    height, weight, offset, shift);        \
}

#undef UNIWGT_MC_COPY
#define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                     \
void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,     \
                                                         ptrdiff_t dst_stride, \
                                                         uint8_t *src,     \
                                                         ptrdiff_t src_stride, \
                                                         int height, int denom, \
                                                         int weight, int offset, \
                                                         intptr_t mx, intptr_t my, \
                                                         int width)        \
{                                                                          \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];          \
    int shift = denom + 14 - 8;                                            \
                                                                           \
    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,     \
                                                 dst_stride, filter, height, \
                                                 weight, offset, shift);   \
}
#define UNI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                            \
void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,     \
                                                         ptrdiff_t dst_stride, \
                                                         uint8_t *src,     \
                                                         ptrdiff_t src_stride, \
                                                         int height, int denom, \
                                                         int weight, int offset, \
                                                         intptr_t mx, intptr_t my, \
                                                         int width)        \
{                                                                          \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];              \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];              \
    int shift = denom + 14 - 8;                                            \
                                                                           \
    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,     \
                                                 dst_stride, filter_x,     \
                                                 filter_y, height, weight, \
                                                 offset, shift);           \
}
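
/* Illustrative scalar sketch (not part of the original file) of the per-pixel
 * operation every kernel above vectorizes.  It assumes av_clip_uint8() from
 * libavutil/common.h; 'shift' is denom + 14 - 8 as in the macros above, and
 * the << 6 promotes 8-bit samples to the 14-bit HEVC intermediate range, the
 * same promotion the copy kernels perform with SLLI by 6. */
#if 0
static void hevc_uniwgt_copy_scalar(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    int width, int height,
                                    int weight, int offset, int shift)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            /* weight the 14-bit sample, round-shift, add offset, clip */
            int val = (src[x] << 6) * weight;

            val = (val + (1 << (shift - 1))) >> shift;
            dst[x] = av_clip_uint8(val + offset);
        }
        src += src_stride;
        dst += dst_stride;
    }
}
#endif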