#ifndef AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H
#define AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H

#ifndef LOONGSON_INTRINSICS_H
#define LOONGSON_INTRINSICS_H

#define LSOM_VERSION_MAJOR 1
#define LSOM_VERSION_MINOR 1
#define LSOM_VERSION_MICRO 0

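/* Helper macros that apply a single instruction (_INS) to two or four sets
 * of operands and store the results in the corresponding _OUTn arguments;
 * the _ARGn suffix gives the number of inputs consumed per output.  For
 * example (operand names purely illustrative),
 *   DUP2_ARG2(__lsx_vilvl_b, a0, b0, a1, b1, o0, o1)
 * expands to o0 = __lsx_vilvl_b(a0, b0); o1 = __lsx_vilvl_b(a1, b1). */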
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
  { \
    _OUT0 = _INS(_IN0); \
    _OUT1 = _INS(_IN1); \
  }

#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
  { \
    _OUT0 = _INS(_IN0, _IN1); \
    _OUT1 = _INS(_IN2, _IN3); \
  }

#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
  { \
    _OUT0 = _INS(_IN0, _IN1, _IN2); \
    _OUT1 = _INS(_IN3, _IN4, _IN5); \
  }

#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
  { \
    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
  }

#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
                  _OUT1, _OUT2, _OUT3) \
  { \
    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
  }

#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
  { \
    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
  }

#ifdef __loongarch_sx
#include <lsxintrin.h>

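/* Dot-product helpers.  Each __lsx_vdp2*_x_y routine below multiplies the
 * even- and odd-indexed elements of in_h and in_l with widening
 * (vmulwev/vmaddwev for the even lanes, vmaddwod for the odd lanes), so
 * every output element is the sum of one adjacent product pair; the
 * __lsx_vdp2add* variants additionally accumulate onto in_c.  For
 * __lsx_vdp2_h_b, output halfword i is
 *   in_h[2*i] * in_l[2*i] + in_h[2*i+1] * in_l[2*i+1]. */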
static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
                                        __m128i in_l) {
  __m128i out;
  out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
  return out;
}

static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
                                         __m128i in_l) {
  __m128i out;
  out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
  return out;
}

static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
                                           __m128i in_l) {
  __m128i out;
  out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
  return out;
}

static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
                                        __m128i in_l) {
  __m128i out;
  out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
  return out;
}

static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
  __m128i out;
  out = __lsx_vmulwev_h_b(in_h, in_l);
  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
  return out;
}

static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
  __m128i out;
  out = __lsx_vmulwev_h_bu(in_h, in_l);
  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
  return out;
}

static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
  __m128i out;
  out = __lsx_vmulwev_h_bu_b(in_h, in_l);
  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
  return out;
}

static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
  __m128i out;
  out = __lsx_vmulwev_w_h(in_h, in_l);
  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
  return out;
}

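/* Clipping helpers: __lsx_vclip_h clamps every halfword of _in into the
 * range [min, max]; __lsx_vclip255_h / __lsx_vclip255_w clamp halfword /
 * word elements into [0, 255] (maximum with 0, then unsigned saturation to
 * 8 bits). */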
static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
  __m128i out;
  out = __lsx_vmax_h(min, _in);
  out = __lsx_vmin_h(max, out);
  return out;
}

static inline __m128i __lsx_vclip255_h(__m128i _in) {
  __m128i out;
  out = __lsx_vmaxi_h(_in, 0);
  out = __lsx_vsat_hu(out, 7);
  return out;
}

static inline __m128i __lsx_vclip255_w(__m128i _in) {
  __m128i out;
  out = __lsx_vmaxi_w(_in, 0);
  out = __lsx_vsat_wu(out, 7);
  return out;
}

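/* LSX_SWAP exchanges the contents of two vector registers in place using
 * the classic three-XOR trick, avoiding a temporary register. */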
#define LSX_SWAP(_in0, _in1) \
  { \
    _in0 = __lsx_vxor_v(_in0, _in1); \
    _in1 = __lsx_vxor_v(_in0, _in1); \
    _in0 = __lsx_vxor_v(_in0, _in1); \
  }

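/* Block-transpose macros.  LSX_TRANSPOSEaxb_t treats its inputs as the rows
 * of an a x b matrix of elements of type t (B/H/W = byte/halfword/word) and
 * writes the transposed rows to the outputs.  A typical call (variable
 * names illustrative) is
 *   LSX_TRANSPOSE4x4_W(row0, row1, row2, row3, col0, col1, col2, col3);
 * where every argument is an __m128i lvalue. */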
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    __m128i _t0, _t1, _t2, _t3; \
    _t0 = __lsx_vilvl_w(_in1, _in0); \
    _t1 = __lsx_vilvh_w(_in1, _in0); \
    _t2 = __lsx_vilvl_w(_in3, _in2); \
    _t3 = __lsx_vilvh_w(_in3, _in2); \
    _out0 = __lsx_vilvl_d(_t2, _t0); \
    _out1 = __lsx_vilvh_d(_t2, _t0); \
    _out2 = __lsx_vilvl_d(_t3, _t1); \
    _out3 = __lsx_vilvh_d(_t3, _t1); \
  }

#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    __m128i zero = { 0 }; \
    __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    _t0 = __lsx_vilvl_b(_in2, _in0); \
    _t1 = __lsx_vilvl_b(_in3, _in1); \
    _t2 = __lsx_vilvl_b(_in6, _in4); \
    _t3 = __lsx_vilvl_b(_in7, _in5); \
    _t4 = __lsx_vilvl_b(_t1, _t0); \
    _t5 = __lsx_vilvh_b(_t1, _t0); \
    _t6 = __lsx_vilvl_b(_t3, _t2); \
    _t7 = __lsx_vilvh_b(_t3, _t2); \
    _out0 = __lsx_vilvl_w(_t6, _t4); \
    _out2 = __lsx_vilvh_w(_t6, _t4); \
    _out4 = __lsx_vilvl_w(_t7, _t5); \
    _out6 = __lsx_vilvh_w(_t7, _t5); \
    _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \
    _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \
    _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \
    _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \
  }

#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    _s0 = __lsx_vilvl_h(_in6, _in4); \
    _s1 = __lsx_vilvl_h(_in7, _in5); \
    _t0 = __lsx_vilvl_h(_s1, _s0); \
    _t1 = __lsx_vilvh_h(_s1, _s0); \
    _s0 = __lsx_vilvh_h(_in6, _in4); \
    _s1 = __lsx_vilvh_h(_in7, _in5); \
    _t2 = __lsx_vilvl_h(_s1, _s0); \
    _t3 = __lsx_vilvh_h(_s1, _s0); \
    _s0 = __lsx_vilvl_h(_in2, _in0); \
    _s1 = __lsx_vilvl_h(_in3, _in1); \
    _t4 = __lsx_vilvl_h(_s1, _s0); \
    _t5 = __lsx_vilvh_h(_s1, _s0); \
    _s0 = __lsx_vilvh_h(_in2, _in0); \
    _s1 = __lsx_vilvh_h(_in3, _in1); \
    _t6 = __lsx_vilvl_h(_s1, _s0); \
    _t7 = __lsx_vilvh_h(_s1, _s0); \
    _out0 = __lsx_vpickev_d(_t0, _t4); \
    _out2 = __lsx_vpickev_d(_t1, _t5); \
    _out4 = __lsx_vpickev_d(_t2, _t6); \
    _out6 = __lsx_vpickev_d(_t3, _t7); \
    _out1 = __lsx_vpickod_d(_t0, _t4); \
    _out3 = __lsx_vpickod_d(_t1, _t5); \
    _out5 = __lsx_vpickod_d(_t2, _t6); \
    _out7 = __lsx_vpickod_d(_t3, _t7); \
  }

#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3) \
  { \
    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
    _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
    _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
    _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
    _out1 = __lsx_vilvh_d(_out2, _out0); \
    _out3 = __lsx_vilvh_d(_out0, _out2); \
  }

#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                            _out6, _out7) \
  { \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
              _tmp0, _tmp1, _tmp2, _tmp3); \
    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
              _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
  }

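/* Butterfly macros: the first half of the outputs holds the sums of
 * mirrored input pairs and the second half the matching differences, e.g.
 * LSX_BUTTERFLY_4_H computes
 *   out0 = in0 + in3, out1 = in1 + in2, out2 = in1 - in2, out3 = in0 - in3. */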
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    _out0 = __lsx_vadd_b(_in0, _in3); \
    _out1 = __lsx_vadd_b(_in1, _in2); \
    _out2 = __lsx_vsub_b(_in1, _in2); \
    _out3 = __lsx_vsub_b(_in0, _in3); \
  }

#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    _out0 = __lsx_vadd_h(_in0, _in3); \
    _out1 = __lsx_vadd_h(_in1, _in2); \
    _out2 = __lsx_vsub_h(_in1, _in2); \
    _out3 = __lsx_vsub_h(_in0, _in3); \
  }

#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    _out0 = __lsx_vadd_w(_in0, _in3); \
    _out1 = __lsx_vadd_w(_in1, _in2); \
    _out2 = __lsx_vsub_w(_in1, _in2); \
    _out3 = __lsx_vsub_w(_in0, _in3); \
  }

#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    _out0 = __lsx_vadd_d(_in0, _in3); \
    _out1 = __lsx_vadd_d(_in1, _in2); \
    _out2 = __lsx_vsub_d(_in1, _in2); \
    _out3 = __lsx_vsub_d(_in0, _in3); \
  }

#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  { \
    _out0 = __lsx_vadd_b(_in0, _in7); \
    _out1 = __lsx_vadd_b(_in1, _in6); \
    _out2 = __lsx_vadd_b(_in2, _in5); \
    _out3 = __lsx_vadd_b(_in3, _in4); \
    _out4 = __lsx_vsub_b(_in3, _in4); \
    _out5 = __lsx_vsub_b(_in2, _in5); \
    _out6 = __lsx_vsub_b(_in1, _in6); \
    _out7 = __lsx_vsub_b(_in0, _in7); \
  }

#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  { \
    _out0 = __lsx_vadd_h(_in0, _in7); \
    _out1 = __lsx_vadd_h(_in1, _in6); \
    _out2 = __lsx_vadd_h(_in2, _in5); \
    _out3 = __lsx_vadd_h(_in3, _in4); \
    _out4 = __lsx_vsub_h(_in3, _in4); \
    _out5 = __lsx_vsub_h(_in2, _in5); \
    _out6 = __lsx_vsub_h(_in1, _in6); \
    _out7 = __lsx_vsub_h(_in0, _in7); \
  }

#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  { \
    _out0 = __lsx_vadd_w(_in0, _in7); \
    _out1 = __lsx_vadd_w(_in1, _in6); \
    _out2 = __lsx_vadd_w(_in2, _in5); \
    _out3 = __lsx_vadd_w(_in3, _in4); \
    _out4 = __lsx_vsub_w(_in3, _in4); \
    _out5 = __lsx_vsub_w(_in2, _in5); \
    _out6 = __lsx_vsub_w(_in1, _in6); \
    _out7 = __lsx_vsub_w(_in0, _in7); \
  }

#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  { \
    _out0 = __lsx_vadd_d(_in0, _in7); \
    _out1 = __lsx_vadd_d(_in1, _in6); \
    _out2 = __lsx_vadd_d(_in2, _in5); \
    _out3 = __lsx_vadd_d(_in3, _in4); \
    _out4 = __lsx_vsub_d(_in3, _in4); \
    _out5 = __lsx_vsub_d(_in2, _in5); \
    _out6 = __lsx_vsub_d(_in1, _in6); \
    _out7 = __lsx_vsub_d(_in0, _in7); \
  }

#endif /* __loongarch_sx */

#ifdef __loongarch_asx
#include <lasxintrin.h>

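/* 256-bit (LASX) counterparts of the helpers above.  Most LASX interleave
 * and pack instructions operate independently on the two 128-bit halves of
 * an __m256i, which is why the transpose macros further below finish with
 * __lasx_xvpermi_q to recombine those halves. */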
static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
  __m256i out;
  out = __lasx_xvmulwev_h_bu(in_h, in_l);
  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
  return out;
}

static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
  __m256i out;
  out = __lasx_xvmulwev_h_b(in_h, in_l);
  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
  return out;
}

static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
  __m256i out;
  out = __lasx_xvmulwev_w_h(in_h, in_l);
  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
  return out;
}

static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
  __m256i out;
  out = __lasx_xvmulwev_d_w(in_h, in_l);
  out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
  return out;
}

static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
  __m256i out;
  out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
  return out;
}

static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  __m256i out;
  out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
  return out;
}

static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
                                           __m256i in_l) {
  __m256i out;
  out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
  return out;
}

static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
                                             __m256i in_l) {
  __m256i out;
  out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
  out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
  return out;
}

static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  __m256i out;
  out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
  return out;
}

static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
                                           __m256i in_l) {
  __m256i out;
  out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
  out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
  return out;
}

static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
                                             __m256i in_l) {
  __m256i out;
  out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
  return out;
}

static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
                                           __m256i in_l) {
  __m256i out;
  out = __lasx_xvmulwev_h_bu(in_h, in_l);
  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
  out = __lasx_xvsub_h(in_c, out);
  return out;
}

static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  __m256i out;
  out = __lasx_xvmulwev_w_h(in_h, in_l);
  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
  out = __lasx_xvsub_w(in_c, out);
  return out;
}

static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
  __m256i out;
  out = __lasx_xvmulwev_w_h(in_h, in_l);
  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
  out = __lasx_xvhaddw_d_w(out, out);
  return out;
}

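/* Widening-add helpers: the __lasx_xvaddwl_* routines add the low-positioned
 * elements of in_h and in_l with widening, the __lasx_xvaddwh_* routines the
 * high-positioned ones, and __lasx_xvaddw_* adds the widened elements of
 * in_l to the already-widened in_h. */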
static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
  __m256i out;
  out = __lasx_xvilvh_b(in_h, in_l);
  out = __lasx_xvhaddw_h_b(out, out);
  return out;
}

static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
  __m256i out;
  out = __lasx_xvilvh_h(in_h, in_l);
  out = __lasx_xvhaddw_w_h(out, out);
  return out;
}

static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
  __m256i out;
  out = __lasx_xvilvl_b(in_h, in_l);
  out = __lasx_xvhaddw_h_b(out, out);
  return out;
}

static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
  __m256i out;
  out = __lasx_xvilvl_h(in_h, in_l);
  out = __lasx_xvhaddw_w_h(out, out);
  return out;
}

static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
  __m256i out;
  out = __lasx_xvilvl_b(in_h, in_l);
  out = __lasx_xvhaddw_hu_bu(out, out);
  return out;
}

static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
  __m256i out;
  out = __lasx_xvsllwil_hu_bu(in_l, 0);
  out = __lasx_xvadd_h(in_h, out);
  return out;
}

static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
  __m256i out;
  out = __lasx_xvsllwil_w_h(in_l, 0);
  out = __lasx_xvadd_w(in_h, out);
  return out;
}

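/* Low/high widening multiply helpers: the *wl* variants sign-extend the
 * low-positioned halfwords with __lasx_xvsllwil_w_h before multiplying,
 * while the *wh* variants reach the high-positioned halfwords through an
 * interleave-high followed by an even-lane widening multiply; the madd
 * forms add in_c on top of the product. */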
static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  __m256i tmp0, tmp1, out;
  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
  tmp0 = __lasx_xvmul_w(tmp0, tmp1);
  out = __lasx_xvadd_w(tmp0, in_c);
  return out;
}

static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  __m256i tmp0, tmp1, out;
  tmp0 = __lasx_xvilvh_h(in_h, in_h);
  tmp1 = __lasx_xvilvh_h(in_l, in_l);
  tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
  out = __lasx_xvadd_w(tmp0, in_c);
  return out;
}

static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
  __m256i tmp0, tmp1, out;
  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
  out = __lasx_xvmul_w(tmp0, tmp1);
  return out;
}

static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
  __m256i tmp0, tmp1, out;
  tmp0 = __lasx_xvilvh_h(in_h, in_h);
  tmp1 = __lasx_xvilvh_h(in_l, in_l);
  out = __lasx_xvmulwev_w_h(tmp0, tmp1);
  return out;
}

static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
  __m256i tmp1, out;
  __m256i zero = { 0 };
  tmp1 = __lasx_xvilvl_b(zero, in_l);
  out = __lasx_xvsadd_hu(in_h, tmp1);
  return out;
}

static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
  __m256i out;
  out = __lasx_xvmax_h(min, in);
  out = __lasx_xvmin_h(max, out);
  return out;
}

static inline __m256i __lasx_xvclip255_h(__m256i in) {
  __m256i out;
  out = __lasx_xvmaxi_h(in, 0);
  out = __lasx_xvsat_hu(out, 7);
  return out;
}

static inline __m256i __lasx_xvclip255_w(__m256i in) {
  __m256i out;
  out = __lasx_xvmaxi_w(in, 0);
  out = __lasx_xvsat_wu(out, 7);
  return out;
}

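/* __lasx_xvsplati_l_h / __lasx_xvsplati_h_h broadcast the idx-th halfword
 * of the lower / upper 128-bit half of in to every halfword of the result:
 * the selected half is first copied into both lanes with __lasx_xvpermi_q,
 * then replicated with __lasx_xvreplve_h. */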
static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
  __m256i out;
  out = __lasx_xvpermi_q(in, in, 0x02);
  out = __lasx_xvreplve_h(out, idx);
  return out;
}

static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
  __m256i out;
  out = __lasx_xvpermi_q(in, in, 0x13);
  out = __lasx_xvreplve_h(out, idx);
  return out;
}

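/* LASX block-transpose macros, named like their LSX counterparts:
 * LASX_TRANSPOSEaxb_t transposes an a x b matrix of byte/halfword/word/
 * doubleword rows held in __m256i registers. */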
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3) \
  { \
    __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
    _tmp0 = __lasx_xvilvl_d(_in1, _in0); \
    _tmp1 = __lasx_xvilvh_d(_in1, _in0); \
    _tmp2 = __lasx_xvilvl_d(_in3, _in2); \
    _tmp3 = __lasx_xvilvh_d(_in3, _in2); \
    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \
    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \
    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \
    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \
  }

#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7) \
  { \
    __m256i _s0_m, _s1_m; \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    _s0_m = __lasx_xvilvl_w(_in2, _in0); \
    _s1_m = __lasx_xvilvl_w(_in3, _in1); \
    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
    _s0_m = __lasx_xvilvh_w(_in2, _in0); \
    _s1_m = __lasx_xvilvh_w(_in3, _in1); \
    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
    _s0_m = __lasx_xvilvl_w(_in6, _in4); \
    _s1_m = __lasx_xvilvl_w(_in7, _in5); \
    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
    _s0_m = __lasx_xvilvh_w(_in6, _in4); \
    _s1_m = __lasx_xvilvh_w(_in7, _in5); \
    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \
    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \
    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \
    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \
    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \
    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \
    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \
  }

#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7) \
  { \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
    _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
    _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
    _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
    _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \
    _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \
    _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \
    _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \
    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \
    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \
    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \
    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \
    _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \
    _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \
    _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \
    _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \
    _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \
    _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \
    _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \
    _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \
    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \
    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \
    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \
    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \
    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \
    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \
    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \
    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \
  }

#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7) \
  { \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
    _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
    _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
    _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
    _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
    _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
    _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
    _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
    _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
    _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
    _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
    \
    _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
    _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
    _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
    _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
    _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
    _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
    _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
    _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
  }

#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3) \
  { \
    __m256i _s0_m, _s1_m; \
    _s0_m = __lasx_xvilvl_h(_in1, _in0); \
    _s1_m = __lasx_xvilvl_h(_in3, _in2); \
    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
    _out1 = __lasx_xvilvh_d(_out0, _out0); \
    _out3 = __lasx_xvilvh_d(_out2, _out2); \
  }

#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7) \
  { \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
    _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
    _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
    _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
    _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
    _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
    _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
    _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
    _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \
    _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \
    _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \
    _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \
    _out1 = __lasx_xvbsrl_v(_out0, 8); \
    _out3 = __lasx_xvbsrl_v(_out2, 8); \
    _out5 = __lasx_xvbsrl_v(_out4, 8); \
    _out7 = __lasx_xvbsrl_v(_out6, 8); \
  }

#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7) \
  { \
    __m256i _s0_m, _s1_m; \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    _s0_m = __lasx_xvilvl_h(_in6, _in4); \
    _s1_m = __lasx_xvilvl_h(_in7, _in5); \
    _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
    _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
    _s0_m = __lasx_xvilvh_h(_in6, _in4); \
    _s1_m = __lasx_xvilvh_h(_in7, _in5); \
    _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
    _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
    _s0_m = __lasx_xvilvl_h(_in2, _in0); \
    _s1_m = __lasx_xvilvl_h(_in3, _in1); \
    _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
    _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
    _s0_m = __lasx_xvilvh_h(_in2, _in0); \
    _s1_m = __lasx_xvilvh_h(_in3, _in1); \
    _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
    _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
    _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \
    _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \
    _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \
    _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \
    _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \
    _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \
    _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \
    _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \
  }

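/* LASX butterfly macros, identical in structure to the LSX versions above:
 * sums of mirrored input pairs first, then the corresponding differences. */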
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    _out0 = __lasx_xvadd_b(_in0, _in3); \
    _out1 = __lasx_xvadd_b(_in1, _in2); \
    _out2 = __lasx_xvsub_b(_in1, _in2); \
    _out3 = __lasx_xvsub_b(_in0, _in3); \
  }

#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    _out0 = __lasx_xvadd_h(_in0, _in3); \
    _out1 = __lasx_xvadd_h(_in1, _in2); \
    _out2 = __lasx_xvsub_h(_in1, _in2); \
    _out3 = __lasx_xvsub_h(_in0, _in3); \
  }

#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    _out0 = __lasx_xvadd_w(_in0, _in3); \
    _out1 = __lasx_xvadd_w(_in1, _in2); \
    _out2 = __lasx_xvsub_w(_in1, _in2); \
    _out3 = __lasx_xvsub_w(_in0, _in3); \
  }

#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    _out0 = __lasx_xvadd_d(_in0, _in3); \
    _out1 = __lasx_xvadd_d(_in1, _in2); \
    _out2 = __lasx_xvsub_d(_in1, _in2); \
    _out3 = __lasx_xvsub_d(_in0, _in3); \
  }

#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    _out0 = __lasx_xvadd_b(_in0, _in7); \
    _out1 = __lasx_xvadd_b(_in1, _in6); \
    _out2 = __lasx_xvadd_b(_in2, _in5); \
    _out3 = __lasx_xvadd_b(_in3, _in4); \
    _out4 = __lasx_xvsub_b(_in3, _in4); \
    _out5 = __lasx_xvsub_b(_in2, _in5); \
    _out6 = __lasx_xvsub_b(_in1, _in6); \
    _out7 = __lasx_xvsub_b(_in0, _in7); \
  }

#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    _out0 = __lasx_xvadd_h(_in0, _in7); \
    _out1 = __lasx_xvadd_h(_in1, _in6); \
    _out2 = __lasx_xvadd_h(_in2, _in5); \
    _out3 = __lasx_xvadd_h(_in3, _in4); \
    _out4 = __lasx_xvsub_h(_in3, _in4); \
    _out5 = __lasx_xvsub_h(_in2, _in5); \
    _out6 = __lasx_xvsub_h(_in1, _in6); \
    _out7 = __lasx_xvsub_h(_in0, _in7); \
  }

#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    _out0 = __lasx_xvadd_w(_in0, _in7); \
    _out1 = __lasx_xvadd_w(_in1, _in6); \
    _out2 = __lasx_xvadd_w(_in2, _in5); \
    _out3 = __lasx_xvadd_w(_in3, _in4); \
    _out4 = __lasx_xvsub_w(_in3, _in4); \
    _out5 = __lasx_xvsub_w(_in2, _in5); \
    _out6 = __lasx_xvsub_w(_in1, _in6); \
    _out7 = __lasx_xvsub_w(_in0, _in7); \
  }

#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    _out0 = __lasx_xvadd_d(_in0, _in7); \
    _out1 = __lasx_xvadd_d(_in1, _in6); \
    _out2 = __lasx_xvadd_d(_in2, _in5); \
    _out3 = __lasx_xvadd_d(_in3, _in4); \
    _out4 = __lasx_xvsub_d(_in3, _in4); \
    _out5 = __lasx_xvsub_d(_in2, _in5); \
    _out6 = __lasx_xvsub_d(_in1, _in6); \
    _out7 = __lasx_xvsub_d(_in0, _in7); \
  }

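/* Debug helper: VECT_PRINT prints the first element_num elements of vector
 * in0 reinterpreted as RTYPE, preceded by a "VP:" marker on a new line when
 * enter is non-zero. */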
#define VECT_PRINT(RTYPE, element_num, in0, enter) \
  { \
    RTYPE _tmp0 = (RTYPE)in0; \
    int _i = 0; \
    if (enter) printf("\nVP:"); \
    for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \
  }