27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
29 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
31 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
35 {-6, 123, 12, -1, 0, 0, 0, 0},
36 {2, -11, 108, 36, -8, 1, 0, 0},
37 {-9, 93, 50, -6, 0, 0, 0, 0},
38 {3, -16, 77, 77, -16, 3, 0, 0},
39 {-6, 50, 93, -9, 0, 0, 0, 0},
40 {1, -8, 36, 108, -11, 2, 0, 0},
41 {-1, 12, 123, -6, 0, 0, 0, 0},
54 #define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, \
55 filt_h0, filt_h1, filt_h2) \
57 v16i8 vec0_m, vec1_m, vec2_m; \
60 VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \
61 vec0_m, vec1_m, vec2_m); \
62 hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, \
63 filt_h0, filt_h1, filt_h2); \
65 hz_out_m = __msa_srari_h(hz_out_m, 7); \
66 hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
71 #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
72 mask0, mask1, mask2, \
73 filt0, filt1, filt2, \
76 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \
78 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
79 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
80 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
81 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
82 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
83 DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \
86 #define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
87 mask0, mask1, mask2, \
88 filt0, filt1, filt2, \
89 out0, out1, out2, out3) \
91 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
93 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
94 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
95 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
96 out0, out1, out2, out3); \
97 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \
98 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \
99 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m); \
100 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m); \
101 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
102 out0, out1, out2, out3); \
103 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \
104 out0, out1, out2, out3); \
107 #define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \
111 tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
112 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \
117 #define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \
119 v16i8 vec0_m, vec1_m; \
122 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \
123 hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \
125 hz_out_m = __msa_srari_h(hz_out_m, 7); \
126 hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
131 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
132 mask0, mask1, filt0, filt1, \
135 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
137 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
138 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
139 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
140 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
143 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
144 mask0, mask1, filt0, filt1, \
145 out0, out1, out2, out3) \
147 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
149 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
150 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
151 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
152 out0, out1, out2, out3); \
153 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \
154 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \
155 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
156 out0, out1, out2, out3); \
160 uint8_t *dst,
int32_t dst_stride,
164 v16u8 mask0, mask1, mask2,
out;
165 v8i16
filt, out0, out1;
180 filt0, filt1, filt2, out0, out1);
184 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
188 uint8_t *dst,
int32_t dst_stride,
192 v16u8 mask0, mask1, mask2,
out;
193 v8i16
filt, out0, out1, out2, out3;
207 src += (4 * src_stride);
209 filt0, filt1, filt2, out0, out1);
213 filt0, filt1, filt2, out2, out3);
217 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
219 ST_W4(
out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
223 const uint8_t *
src, ptrdiff_t src_stride,
224 int height,
int mx,
int my)
236 const uint8_t *
src, ptrdiff_t src_stride,
237 int height,
int mx,
int my)
242 v16u8 mask0, mask1, mask2, tmp0, tmp1;
243 v8i16
filt, out0, out1, out2, out3;
258 src += (4 * src_stride);
260 filt0, filt1, filt2, out0, out1, out2, out3);
265 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
266 dst += (4 * dst_stride);
268 for (loop_cnt = (
height >> 2) - 1; loop_cnt--;) {
271 src += (4 * src_stride);
273 filt0, filt1, filt2, out0, out1, out2, out3);
278 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
279 dst += (4 * dst_stride);
284 const uint8_t *
src, ptrdiff_t src_stride,
285 int height,
int mx,
int my)
289 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
290 v16u8 mask0, mask1, mask2,
out;
291 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
303 for (loop_cnt = (
height >> 2); loop_cnt--;) {
307 src += (4 * src_stride);
310 filt0, filt1, filt2, out0, out1, out2, out3);
312 filt0, filt1, filt2, out4, out5, out6, out7);
333 const uint8_t *
src, ptrdiff_t src_stride,
334 int height,
int mx,
int my)
338 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
339 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
340 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
342 v8i16
filt, out10, out32;
344 src -= (2 * src_stride);
350 src += (5 * src_stride);
352 ILVR_B4_SB(
src1,
src0,
src2,
src1, src3,
src2, src4, src3, src10_r, src21_r,
354 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
357 for (loop_cnt = (
height >> 2); loop_cnt--;) {
358 LD_SB4(
src, src_stride, src5, src6, src7, src8);
359 src += (4 * src_stride);
361 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
362 src65_r, src76_r, src87_r);
363 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
365 out10 =
DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
366 out32 =
DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
370 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
371 dst += (4 * dst_stride);
380 const uint8_t *
src, ptrdiff_t src_stride,
381 int height,
int mx,
int my)
385 v16i8
src0,
src1,
src2, src3, src4, src7, src8, src9, src10;
386 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
387 v16i8 src109_r, filt0, filt1, filt2;
389 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
391 src -= (2 * src_stride);
397 src += (5 * src_stride);
401 src10_r, src32_r, src21_r, src43_r);
403 for (loop_cnt = (
height >> 2); loop_cnt--;) {
404 LD_SB4(
src, src_stride, src7, src8, src9, src10);
406 src += (4 * src_stride);
408 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
409 src87_r, src98_r, src109_r);
410 out0_r =
DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
411 out1_r =
DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
412 out2_r =
DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
413 out3_r =
DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
415 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
418 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
419 dst += (4 * dst_stride);
430 const uint8_t *
src, ptrdiff_t src_stride,
431 int height,
int mx,
int my)
435 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
436 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
437 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
438 v16i8 src65_l, src87_l, filt0, filt1, filt2;
439 v16u8 tmp0, tmp1, tmp2, tmp3;
440 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l,
filt;
442 src -= (2 * src_stride);
448 src += (5 * src_stride);
452 src32_r, src43_r, src21_r);
454 src32_l, src43_l, src21_l);
456 for (loop_cnt = (
height >> 2); loop_cnt--;) {
457 LD_SB4(
src, src_stride, src5, src6, src7, src8);
458 src += (4 * src_stride);
461 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
462 src65_r, src76_r, src87_r);
463 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
464 src65_l, src76_l, src87_l);
465 out0_r =
DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1,
467 out1_r =
DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1,
469 out2_r =
DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1,
471 out3_r =
DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1,
473 out0_l =
DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1,
475 out1_l =
DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1,
477 out2_l =
DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1,
479 out3_l =
DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1,
483 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
484 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
485 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
486 out3_r, tmp0, tmp1, tmp2, tmp3);
488 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
489 dst += (4 * dst_stride);
504 const uint8_t *
src, ptrdiff_t src_stride,
505 int height,
int mx,
int my)
510 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
511 v16i8 filt_hz0, filt_hz1, filt_hz2;
512 v16u8 mask0, mask1, mask2,
out;
514 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
515 v8i16 hz_out7,
filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
518 src -= (2 + 2 * src_stride);
531 src += (5 * src_stride);
538 hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
541 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
543 for (loop_cnt = (
height >> 2); loop_cnt--;) {
545 src += (2 * src_stride);
550 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
553 src += (2 * src_stride);
558 hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
560 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
561 tmp0 =
DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
563 out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
564 tmp1 =
DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
569 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
570 dst += (4 * dst_stride);
579 const uint8_t *
src, ptrdiff_t src_stride,
580 int height,
int mx,
int my)
585 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
586 v16i8 filt_hz0, filt_hz1, filt_hz2;
587 v16u8 mask0, mask1, mask2, vec0, vec1;
588 v8i16
filt, filt_vt0, filt_vt1, filt_vt2;
589 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
590 v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
591 v8i16 tmp0, tmp1, tmp2, tmp3;
594 src -= (2 + 2 * src_stride);
604 src += (5 * src_stride);
621 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
622 ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
624 for (loop_cnt = (
height >> 2); loop_cnt--;) {
625 LD_SB4(
src, src_stride, src5, src6, src7, src8);
626 src += (4 * src_stride);
631 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
632 tmp0 =
DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
636 out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
637 tmp1 =
DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
641 out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
642 tmp2 =
DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
646 out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
647 tmp3 =
DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
653 ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
654 dst += (4 * dst_stride);
666 const uint8_t *
src, ptrdiff_t src_stride,
667 int height,
int mx,
int my)
671 for (multiple8_cnt = 2; multiple8_cnt--;) {
681 uint8_t *dst,
int32_t dst_stride,
684 v16i8
src0,
src1,
src2, src3, filt0, filt1, mask0, mask1;
685 v8i16
filt, out0, out1;
700 filt0, filt1, out0, out1);
704 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
708 uint8_t *dst,
int32_t dst_stride,
711 v16i8
src0,
src1,
src2, src3, filt0, filt1, mask0, mask1;
713 v8i16
filt, out0, out1, out2, out3;
725 src += (4 * src_stride);
729 filt0, filt1, out0, out1);
733 filt0, filt1, out2, out3);
737 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
739 ST_W4(
out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
743 uint8_t *dst,
int32_t dst_stride,
747 v16i8 filt0, filt1, mask0, mask1;
749 v8i16
filt, out0, out1, out2, out3;
761 src += (8 * src_stride);
764 filt0, filt1, out0, out1);
766 filt0, filt1, out2, out3);
770 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
771 dst += (4 * dst_stride);
773 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
774 dst += (4 * dst_stride);
777 src += (8 * src_stride);
780 filt0, filt1, out0, out1);
782 filt0, filt1, out2, out3);
786 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
787 dst += (4 * dst_stride);
789 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
793 const uint8_t *
src, ptrdiff_t src_stride,
794 int height,
int mx,
int my)
802 }
else if (16 ==
height) {
808 const uint8_t *
src, ptrdiff_t src_stride,
809 int height,
int mx,
int my)
813 v16i8
src0,
src1,
src2, src3, filt0, filt1, mask0, mask1;
815 v8i16
filt, out0, out1, out2, out3;
826 for (loop_cnt = (
height >> 2); loop_cnt--;) {
828 src += (4 * src_stride);
832 filt1, out0, out1, out2, out3);
837 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
838 dst += (4 * dst_stride);
843 const uint8_t *
src, ptrdiff_t src_stride,
844 int height,
int mx,
int my)
849 v16i8 filt0, filt1, mask0, mask1;
850 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
862 for (loop_cnt = (
height >> 2); loop_cnt--;) {
865 src += (4 * src_stride);
869 filt1, out0, out1, out2, out3);
871 filt1, out4, out5, out6, out7);
892 const uint8_t *
src, ptrdiff_t src_stride,
893 int height,
int mx,
int my)
898 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
899 v16i8 src2110, src4332, filt0, filt1;
900 v8i16
filt, out10, out32;
909 src += (3 * src_stride);
913 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
914 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
916 for (loop_cnt = (
height >> 2); loop_cnt--;) {
917 LD_SB3(
src, src_stride, src3, src4, src5);
918 src += (3 * src_stride);
920 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
921 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
927 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
928 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
933 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
934 dst += (4 * dst_stride);
939 const uint8_t *
src, ptrdiff_t src_stride,
940 int height,
int mx,
int my)
945 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
947 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
955 src += (3 * src_stride);
960 for (loop_cnt = (
height >> 2); loop_cnt--;) {
961 LD_SB4(
src, src_stride, src7, src8, src9, src10);
962 src += (4 * src_stride);
966 src72_r, src87_r, src98_r, src109_r);
972 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
975 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
976 dst += (4 * dst_stride);
985 const uint8_t *
src, ptrdiff_t src_stride,
986 int height,
int mx,
int my)
991 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
992 v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
993 v16u8 tmp0, tmp1, tmp2, tmp3;
994 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1002 src += (3 * src_stride);
1008 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1009 LD_SB4(
src, src_stride, src3, src4, src5, src6);
1010 src += (4 * src_stride);
1014 src32_r, src43_r, src54_r, src65_r);
1016 src32_l, src43_l, src54_l, src65_l);
1027 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1028 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1029 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1030 out3_r, tmp0, tmp1, tmp2, tmp3);
1032 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1033 dst += (4 * dst_stride);
1044 const uint8_t *
src, ptrdiff_t src_stride,
1045 int height,
int mx,
int my)
1050 v16i8
src0,
src1,
src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1051 v16u8 mask0, mask1,
out;
1052 v8i16
filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1053 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1056 src -= (1 + 1 * src_stride);
1065 src += (3 * src_stride);
1070 vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1075 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1076 LD_SB4(
src, src_stride, src3, src4, src5, src6);
1077 src += (4 * src_stride);
1080 hz_out3 =
HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1081 hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1082 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1086 hz_out5 =
HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1087 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1088 vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1094 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
1095 dst += (4 * dst_stride);
1103 const uint8_t *
src, ptrdiff_t src_stride,
1104 int height,
int mx,
int my)
1109 v16i8
src0,
src1,
src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1110 v16u8 mask0, mask1, out0, out1;
1111 v8i16
filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
1112 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1113 v8i16 vec0, vec1, vec2, vec3, vec4;
1116 src -= (1 + 1 * src_stride);
1125 src += (3 * src_stride);
1131 ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1136 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1137 LD_SB4(
src, src_stride, src3, src4, src5, src6);
1138 src += (4 * src_stride);
1141 hz_out3 =
HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1142 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1145 hz_out0 =
HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1146 vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1149 hz_out1 =
HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1150 vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1153 hz_out2 =
HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1154 ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
1161 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1162 dst += (4 * dst_stride);
1170 const uint8_t *
src, ptrdiff_t src_stride,
1171 int height,
int mx,
int my)
1175 for (multiple8_cnt = 2; multiple8_cnt--;) {
1185 const uint8_t *
src, ptrdiff_t src_stride,
1186 int height,
int mx,
int my)
1192 v16i8 filt_hz0, filt_hz1, filt_hz2;
1193 v16u8 res0, res1, mask0, mask1, mask2;
1194 v8i16
filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1195 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1198 src -= (2 + 1 * src_stride);
1208 src += (3 * src_stride);
1212 filt_hz1, filt_hz2);
1214 filt_hz1, filt_hz2);
1215 vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1220 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1221 LD_SB4(
src, src_stride, src3, src4, src5, src6);
1222 src += (4 * src_stride);
1226 filt_hz1, filt_hz2);
1227 hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1228 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1232 filt_hz1, filt_hz2);
1233 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1234 vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1241 ST_W2(res0, 0, 1, dst, dst_stride);
1242 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1243 dst += (4 * dst_stride);
1251 const uint8_t *
src, ptrdiff_t src_stride,
1252 int height,
int mx,
int my)
1258 v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
1259 v8i16
filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
1260 v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
1264 src -= (2 + src_stride);
1274 src += (3 * src_stride);
1278 filt_hz1, filt_hz2);
1280 filt_hz1, filt_hz2);
1282 filt_hz1, filt_hz2);
1283 ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1288 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1289 LD_SB4(
src, src_stride, src3, src4, src5, src6);
1290 src += (4 * src_stride);
1295 filt_hz1, filt_hz2);
1296 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1300 filt_hz1, filt_hz2);
1301 vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1305 filt_hz1, filt_hz2);
1306 vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1310 filt_hz1, filt_hz2);
1311 ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
1318 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1319 dst += (4 * dst_stride);
1324 const uint8_t *
src, ptrdiff_t src_stride,
1325 int height,
int mx,
int my)
1329 for (multiple8_cnt = 2; multiple8_cnt--;) {
1339 const uint8_t *
src, ptrdiff_t src_stride,
1340 int height,
int mx,
int my)
1345 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1346 v16i8 filt_hz0, filt_hz1, mask0, mask1;
1348 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1349 v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
1350 v8i16
filt, filt_vt0, filt_vt1, filt_vt2;
1354 src -= (1 + 2 * src_stride);
1363 src += (5 * src_stride);
1368 hz_out3 =
HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1369 hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1370 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1375 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1376 LD_SB4(
src, src_stride, src5, src6, src7, src8);
1378 src += (4 * src_stride);
1380 hz_out5 =
HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1381 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1382 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1383 tmp0 =
DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1385 hz_out7 =
HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
1386 hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
1387 out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1388 tmp1 =
DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
1393 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
1394 dst += (4 * dst_stride);
1403 const uint8_t *
src, ptrdiff_t src_stride,
1404 int height,
int mx,
int my)
1409 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1410 v16i8 filt_hz0, filt_hz1, mask0, mask1;
1411 v8i16
filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
1412 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1413 v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
1417 src -= (1 + 2 * src_stride);
1426 src += (5 * src_stride);
1432 hz_out3 =
HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1433 hz_out4 =
HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1434 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1435 ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
1440 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1441 LD_SB4(
src, src_stride, src5, src6, src7, src8);
1442 src += (4 * src_stride);
1446 hz_out5 =
HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1447 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1448 tmp0 =
DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1450 hz_out6 =
HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1451 out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
1452 tmp1 =
DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
1454 hz_out7 =
HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
1455 out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1456 tmp2 =
DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
1458 hz_out8 =
HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
1459 out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1460 tmp3 =
DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
1466 ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
1467 dst += (4 * dst_stride);
1478 const uint8_t *
src, ptrdiff_t src_stride,
1479 int height,
int mx,
int my)
1483 for (multiple8_cnt = 2; multiple8_cnt--;) {
1493 uint8_t *dst,
int32_t dst_stride,
1497 v16u8 filt0, vec0, vec1, res0, res1;
1498 v8u16 vec2, vec3,
filt;
1504 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1508 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1511 ST_W2(res0, 0, 1, dst, dst_stride);
1512 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1516 uint8_t *dst,
int32_t dst_stride,
1519 v16u8 vec0, vec1, vec2, vec3, filt0;
1521 v16i8 res0, res1, res2, res3;
1522 v8u16 vec4, vec5, vec6, vec7,
filt;
1528 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1533 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1534 vec4, vec5, vec6, vec7);
1536 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1537 res0, res1, res2, res3);
1538 ST_W2(res0, 0, 1, dst, dst_stride);
1539 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1540 ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
1541 ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
1545 const uint8_t *
src, ptrdiff_t src_stride,
1546 int height,
int mx,
int my)
1552 }
else if (8 ==
height) {
1558 uint8_t *dst,
int32_t dst_stride,
1563 v8u16 vec0, vec1, vec2, vec3,
filt;
1569 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1574 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1575 vec0, vec1, vec2, vec3);
1582 uint8_t *dst,
int32_t dst_stride,
1587 v8u16 vec0, vec1, vec2, vec3,
filt;
1593 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1596 src += (4 * src_stride);
1600 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1601 vec0, vec1, vec2, vec3);
1605 src += (4 * src_stride);
1608 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1612 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1613 vec0, vec1, vec2, vec3);
1616 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1617 dst += (8 * dst_stride);
1621 src += (4 * src_stride);
1625 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1626 vec0, vec1, vec2, vec3);
1629 src += (4 * src_stride);
1632 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1636 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1637 vec0, vec1, vec2, vec3);
1640 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1645 const uint8_t *
src, ptrdiff_t src_stride,
1646 int height,
int mx,
int my)
1659 const uint8_t *
src, ptrdiff_t src_stride,
1660 int height,
int mx,
int my)
1665 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1666 v8u16 out0, out1, out2, out3, out4, out5, out6, out7,
filt;
1670 loop_cnt = (
height >> 2) - 1;
1674 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1678 src += (4 * src_stride);
1684 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1685 out0, out1, out2, out3);
1686 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1687 out4, out5, out6, out7);
1699 for (; loop_cnt--;) {
1702 src += (4 * src_stride);
1708 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1709 out0, out1, out2, out3);
1710 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1711 out4, out5, out6, out7);
1726 uint8_t *dst,
int32_t dst_stride,
1730 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
1736 filt0 = (v16u8) __msa_splati_h(
filt, 0);
1739 src += (5 * src_stride);
1742 src10_r, src21_r, src32_r, src43_r);
1743 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1744 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1747 src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
1748 ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
1752 uint8_t *dst,
int32_t dst_stride,
1755 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1756 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
1757 v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
1758 v8u16 tmp0, tmp1, tmp2, tmp3;
1763 filt0 = (v16u8) __msa_splati_h(
filt, 0);
1766 src += (8 * src_stride);
1771 ILVR_B4_SB(
src1,
src0,
src2,
src1, src3,
src2, src4, src3, src10_r, src21_r,
1773 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1775 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1776 src87_r, src76_r, src2110, src4332, src6554, src8776);
1777 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
1778 tmp0, tmp1, tmp2, tmp3);
1781 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
1782 ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1786 const uint8_t *
src, ptrdiff_t src_stride,
1787 int height,
int mx,
int my)
1793 }
else if (8 ==
height) {
1799 uint8_t *dst,
int32_t dst_stride,
1802 v16u8
src0,
src1,
src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
1804 v8u16 tmp0, tmp1, tmp2, tmp3;
1809 filt0 = (v16u8) __msa_splati_h(
filt, 0);
1814 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1815 tmp0, tmp1, tmp2, tmp3);
1819 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1823 uint8_t *dst,
int32_t dst_stride,
1827 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1828 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1830 v8u16 tmp0, tmp1, tmp2, tmp3;
1835 filt0 = (v16u8) __msa_splati_h(
filt, 0);
1840 for (loop_cnt = (
height >> 3); loop_cnt--;) {
1842 src += (8 * src_stride);
1845 vec0, vec1, vec2, vec3);
1846 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1847 vec4, vec5, vec6, vec7);
1848 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1849 tmp0, tmp1, tmp2, tmp3);
1853 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1855 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1856 tmp0, tmp1, tmp2, tmp3);
1860 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1861 dst += (8 * dst_stride);
1868 const uint8_t *
src, ptrdiff_t src_stride,
1869 int height,
int mx,
int my)
1882 const uint8_t *
src, ptrdiff_t src_stride,
1883 int height,
int mx,
int my)
1888 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1889 v8u16 tmp0, tmp1, tmp2, tmp3;
1894 filt0 = (v16u8) __msa_splati_h(
filt, 0);
1899 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1901 src += (4 * src_stride);
1905 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1913 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1919 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1925 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1936 uint8_t *dst,
int32_t dst_stride,
1937 const int8_t *filter_horiz,
1938 const int8_t *filter_vert)
1941 v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
1942 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4,
filt, tmp0, tmp1;
1948 filt_hz = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1951 filt_vt = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1957 hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1958 hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
1960 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1961 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1965 ST_W2(res0, 0, 1, dst, dst_stride);
1966 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1970 uint8_t *dst,
int32_t dst_stride,
1971 const int8_t *filter_horiz,
1972 const int8_t *filter_vert)
1975 v16i8 res0, res1, res2, res3;
1976 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
1977 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1978 v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7,
filt;
1984 filt_hz = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1987 filt_vt = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1990 src += (8 * src_stride);
1998 SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
2000 hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2002 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2003 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2004 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2005 vec4, vec5, vec6, vec7);
2008 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2009 res0, res1, res2, res3);
2010 ST_W2(res0, 0, 1, dst, dst_stride);
2011 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2012 ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
2013 ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
2017 const uint8_t *
src, ptrdiff_t src_stride,
2018 int height,
int mx,
int my)
2025 filter_horiz, filter_vert);
2026 }
else if (8 ==
height) {
2028 filter_horiz, filter_vert);
2033 uint8_t *dst,
int32_t dst_stride,
2034 const int8_t *filter_horiz,
2035 const int8_t *filter_vert)
2038 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2039 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2046 filt_hz = (v16u8) __msa_splati_h(
filt, 0);
2049 filt_vt = (v16u8) __msa_splati_h(
filt, 0);
2055 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2056 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2059 vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2060 tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2063 vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2064 tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2067 vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2068 tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2073 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2077 uint8_t *dst,
int32_t dst_stride,
2078 const int8_t *filter_horiz,
2079 const int8_t *filter_vert,
2084 v16u8 filt_hz, filt_vt, vec0;
2085 v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2092 filt_hz = (v16u8) __msa_splati_h(
filt, 0);
2095 filt_vt = (v16u8) __msa_splati_h(
filt, 0);
2102 for (loop_cnt = (
height >> 3); loop_cnt--;) {
2104 src += (4 * src_stride);
2107 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2108 tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2111 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2112 tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2118 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2119 tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2123 src += (4 * src_stride);
2124 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2125 tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2130 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2133 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2134 tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2137 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2138 tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2141 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2142 tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2145 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2146 tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2151 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2152 dst += (8 * dst_stride);
2157 const uint8_t *
src, ptrdiff_t src_stride,
2158 int height,
int mx,
int my)
2165 filter_horiz, filter_vert);
2168 filter_horiz, filter_vert,
height);
2173 const uint8_t *
src, ptrdiff_t src_stride,
2174 int height,
int mx,
int my)
2180 v16u8 filt_hz, filt_vt, vec0, vec1;
2181 v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2188 filt_hz = (v16u8) __msa_splati_h(
filt, 0);
2191 filt_vt = (v16u8) __msa_splati_h(
filt, 0);
2200 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2203 src += (4 * src_stride);
2207 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2208 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2216 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2217 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2225 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2226 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2234 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2235 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2244 const uint8_t *
src, ptrdiff_t src_stride,
2245 int height,
int mx,
int my)
2248 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
2252 for (cnt =
height >> 3; cnt--;) {
2255 src += (8 * src_stride);
2257 out0 = __msa_copy_u_d((v2i64)
src0, 0);
2258 out1 = __msa_copy_u_d((v2i64)
src1, 0);
2259 out2 = __msa_copy_u_d((v2i64)
src2, 0);
2260 out3 = __msa_copy_u_d((v2i64) src3, 0);
2261 out4 = __msa_copy_u_d((v2i64) src4, 0);
2262 out5 = __msa_copy_u_d((v2i64) src5, 0);
2263 out6 = __msa_copy_u_d((v2i64) src6, 0);
2264 out7 = __msa_copy_u_d((v2i64) src7, 0);
2266 SD4(out0, out1, out2, out3, dst, dst_stride);
2267 dst += (4 * dst_stride);
2268 SD4(out4, out5, out6, out7, dst, dst_stride);
2269 dst += (4 * dst_stride);
2271 }
else if (0 ==
height % 4) {
2272 for (cnt = (
height / 4); cnt--;) {
2274 src += (4 * src_stride);
2275 out0 = __msa_copy_u_d((v2i64)
src0, 0);
2276 out1 = __msa_copy_u_d((v2i64)
src1, 0);
2277 out2 = __msa_copy_u_d((v2i64)
src2, 0);
2278 out3 = __msa_copy_u_d((v2i64) src3, 0);
2280 SD4(out0, out1, out2, out3, dst, dst_stride);
2281 dst += (4 * dst_stride);
2287 uint8_t *dst,
int32_t dst_stride,
2294 for (cnt = (
width >> 4); cnt--;) {
2295 const uint8_t *src_tmp =
src;
2298 for (loop_cnt = (
height >> 3); loop_cnt--;) {
2299 LD_UB8(src_tmp, src_stride,
2301 src_tmp += (8 * src_stride);
2304 dst_tmp, dst_stride);
2305 dst_tmp += (8 * dst_stride);
2314 const uint8_t *
src, ptrdiff_t src_stride,
2315 int height,
int mx,
int my)
2322 }
else if (0 ==
height % 4) {
2323 for (cnt = (
height >> 2); cnt--;) {
2325 src += (4 * src_stride);
2328 dst += (4 * dst_stride);