/*
 * LASX_TRANSPOSE4x16 - regroup the 16-bit elements of four 256-bit vectors.
 *
 * in_0..in_3  : source vectors. Each argument is expanded more than once,
 *               so pass plain variables, not expressions with side effects.
 * out_0..out_3: destination vectors. The users of this macro in this file
 *               pass the same four variables as inputs and outputs; that
 *               works because every input is consumed by the first step
 *               before any output is written.
 *
 * Steps, in the order written below:
 *   1. __lasx_xvpermi_q pairs in_0 with in_2 and in_1 with in_3, once with
 *      selector 0x20 and once with selector 0x31, producing temp_0..temp_3.
 *   2. Halfword interleaves (low, then high) of temp_1:temp_0 and
 *      temp_3:temp_2 produce temp_4..temp_7.
 *   3. Word interleaves (low, then high) of temp_6:temp_4 and
 *      temp_7:temp_5 produce out_0/out_2 and out_1/out_3.
 *
 * The macro declares temp_0..temp_7 itself.
 * NOTE(review): no enclosing braces are visible in this excerpt; confirm the
 * temporaries are scoped, since the macro is expanded by two other macros
 * that may be used in the same function.
 */
25 #define LASX_TRANSPOSE4x16(in_0, in_1, in_2, in_3, out_0, out_1, out_2, out_3) \
27 __m256i temp_0, temp_1, temp_2, temp_3; \
28 __m256i temp_4, temp_5, temp_6, temp_7; \
29 DUP4_ARG3(__lasx_xvpermi_q, in_2, in_0, 0x20, in_2, in_0, 0x31, in_3, in_1,\
30 0x20, in_3, in_1, 0x31, temp_0, temp_1, temp_2, temp_3); \
31 DUP2_ARG2(__lasx_xvilvl_h, temp_1, temp_0, temp_3, temp_2, temp_4, temp_6);\
32 DUP2_ARG2(__lasx_xvilvh_h, temp_1, temp_0, temp_3, temp_2, temp_5, temp_7);\
33 DUP2_ARG2(__lasx_xvilvl_w, temp_6, temp_4, temp_7, temp_5, out_0, out_2); \
34 DUP2_ARG2(__lasx_xvilvh_w, temp_6, temp_4, temp_7, temp_5, out_1, out_3); \
/*
 * LASX_IDCTROWCONDDC - first pass of the IDCT over `block`, with a shortcut
 * for lanes that carry nothing but the a0 term.
 *
 * Names expected in the expanding scope: block, const_val, const_val0,
 * const_val1, w1..w7, in0..in3, a0..a3, b0..b3, temp, temp0..temp3,
 * select_vec.
 *
 * Outline:
 *   - const_val is overwritten with 16383 * ((1 << 19) / 16383) (= 524256)
 *     and broadcast into const_val1, which LASX_IDCTCOLS uses as its
 *     accumulator seed. const_val0 must have been built before this point.
 *   - Four 256-bit vectors are loaded from block at offsets 0/32/64/96 and
 *     regrouped with LASX_TRANSPOSE4x16.
 *   - The eight 16-bit groups are widened to 32 bit into a0..a3, b0..b3;
 *     temp = a0 << 3 is kept as the shortcut result.
 *   - select_vec becomes all-ones in every 32-bit lane where a1..a3 and
 *     b0..b3 are all zero.
 *   - The full multiply/accumulate with w1..w7 is evaluated for every lane,
 *     sums and differences are shifted right by 11, and lanes flagged in
 *     select_vec are replaced by temp.
 *   - Results are packed back to 16 bit into in0..in3.
 *
 * a0 is deliberately NOT part of the zero test: temp is derived from a0, so
 * if a0 were OR-ed in, the shortcut could only fire where a0 (and therefore
 * temp) is zero, and a lane holding only a non-zero a0 would never take it.
 */
37 #define LASX_IDCTROWCONDDC \
38 const_val = 16383 * ((1 << 19) / 16383); \
39 const_val1 = __lasx_xvreplgr2vr_w(const_val); \
40 DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96, \
41 in0, in1, in2, in3); \
42 LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3); \
43 a0 = __lasx_xvpermi_d(in0, 0xD8); \
44 a0 = __lasx_vext2xv_w_h(a0); \
45 temp = __lasx_xvslli_w(a0, 3); /* shortcut value, selected further down */ \
46 a1 = __lasx_xvpermi_d(in0, 0x8D); \
47 a1 = __lasx_vext2xv_w_h(a1); \
48 a2 = __lasx_xvpermi_d(in1, 0xD8); \
49 a2 = __lasx_vext2xv_w_h(a2); \
50 a3 = __lasx_xvpermi_d(in1, 0x8D); \
51 a3 = __lasx_vext2xv_w_h(a3); \
52 b0 = __lasx_xvpermi_d(in2, 0xD8); \
53 b0 = __lasx_vext2xv_w_h(b0); \
54 b1 = __lasx_xvpermi_d(in2, 0x8D); \
55 b1 = __lasx_vext2xv_w_h(b1); \
56 b2 = __lasx_xvpermi_d(in3, 0xD8); \
57 b2 = __lasx_vext2xv_w_h(b2); \
58 b3 = __lasx_xvpermi_d(in3, 0x8D); \
59 b3 = __lasx_vext2xv_w_h(b3); \
60 select_vec = a1 | a2 | a3 | b0 | b1 | b2 | b3; /* a0 excluded on purpose */ \
61 select_vec = __lasx_xvslti_wu(select_vec, 1); /* unsigned < 1, i.e. == 0 */ \
63 DUP4_ARG2(__lasx_xvrepl128vei_h, w1, 2, w1, 3, w1, 4, w1, 5, /* NOTE(review): output operands not visible in this excerpt */ \
65 DUP2_ARG2(__lasx_xvrepl128vei_h, w1, 6, w1, 7, w6, w7); \
66 w1 = __lasx_xvrepl128vei_h(w1, 1); /* last: w1 is the source of w2..w7 */ \
69 temp0 = __lasx_xvmaddwl_w_h(const_val0, in0, w4); \
70 DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2); \
71 a0 = __lasx_xvadd_w(temp0, temp1); \
72 a1 = __lasx_xvadd_w(temp0, temp2); \
73 a2 = __lasx_xvsub_w(temp0, temp2); \
74 a3 = __lasx_xvsub_w(temp0, temp1); \
76 DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1); \
77 b0 = __lasx_xvdp2_w_h(temp0, temp1); \
78 temp1 = __lasx_xvneg_h(w7); \
79 temp2 = __lasx_xvilvl_h(temp1, w3); \
80 b1 = __lasx_xvdp2_w_h(temp0, temp2); \
81 temp1 = __lasx_xvneg_h(w1); \
82 temp2 = __lasx_xvilvl_h(temp1, w5); \
83 b2 = __lasx_xvdp2_w_h(temp0, temp2); \
84 temp1 = __lasx_xvneg_h(w5); \
85 temp2 = __lasx_xvilvl_h(temp1, w7); \
86 b3 = __lasx_xvdp2_w_h(temp0, temp2); \
89 DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1); \
90 a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1); \
91 temp1 = __lasx_xvilvl_h(w2, w4); \
92 a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1); \
93 temp1 = __lasx_xvneg_h(w4); \
94 temp2 = __lasx_xvilvl_h(w2, temp1); \
95 a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2); \
96 temp1 = __lasx_xvneg_h(w6); \
97 temp2 = __lasx_xvilvl_h(temp1, w4); \
98 a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2); \
100 DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1); \
101 b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1); \
102 DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2); \
103 b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1); \
104 b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2); \
105 temp1 = __lasx_xvneg_h(w1); \
106 temp2 = __lasx_xvilvl_h(temp1, w3); \
107 b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2); \
109 DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3, \
110 temp0, temp1, temp2, temp3); \
111 DUP4_ARG2(__lasx_xvsub_w, a0, b0, a1, b1, a2, b2, a3, b3, /* NOTE(review): output operands not visible in this excerpt */ \
113 DUP4_ARG2(__lasx_xvsrai_w, temp0, 11, temp1, 11, temp2, 11, temp3, 11, \
114 temp0, temp1, temp2, temp3); \
115 DUP4_ARG2(__lasx_xvsrai_w, a0, 11, a1, 11, a2, 11, a3, 11, a0, a1, a2, a3);\
116 DUP4_ARG3(__lasx_xvbitsel_v, temp0, temp, select_vec, temp1, temp, \
117 select_vec, temp2, temp, select_vec, temp3, temp, select_vec, \
118 in0, in1, in2, in3); \
119 DUP4_ARG3(__lasx_xvbitsel_v, a0, temp, select_vec, a1, temp, \
120 select_vec, a2, temp, select_vec, a3, temp, select_vec, /* NOTE(review): output operands not visible in this excerpt */ \
122 DUP4_ARG2(__lasx_xvpickev_h, in1, in0, in3, in2, a2, a3, a0, a1, \
123 in0, in1, in2, in3); \
124 DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8, \
125 in0, in1, in2, in3); \
/*
 * LASX_IDCTCOLS - second IDCT pass (the column pass, going by the name).
 *
 * Consumes in0..in3 and the broadcast weights w1..w7 from the expanding
 * scope, and leaves its 16-bit results in in0..in3. It also needs
 * const_val1, a0..a3, b0..b3 and temp0..temp3 to be declared.
 *
 * The multiply/accumulate sequence is the same as in LASX_IDCTROWCONDDC.
 * What differs, as visible below:
 *   - the first accumulator is seeded with const_val1 rather than const_val0;
 *   - there is no zero-lane shortcut (select_vec and temp are not used);
 *   - the differences are formed in the order a3-b3, a2-b2, a1-b1, a0-b0;
 *   - sums and differences are narrowed to 16 bit with a single arithmetic
 *     right shift by 20 (__lasx_xvsrani_h_w).
 */
127 #define LASX_IDCTCOLS \
129 LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3); \
130 temp0 = __lasx_xvmaddwl_w_h(const_val1, in0, w4); \
131 DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2); \
132 a0 = __lasx_xvadd_w(temp0, temp1); \
133 a1 = __lasx_xvadd_w(temp0, temp2); \
134 a2 = __lasx_xvsub_w(temp0, temp2); \
135 a3 = __lasx_xvsub_w(temp0, temp1); \
137 DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1); \
138 b0 = __lasx_xvdp2_w_h(temp0, temp1); \
139 temp1 = __lasx_xvneg_h(w7); \
140 temp2 = __lasx_xvilvl_h(temp1, w3); \
141 b1 = __lasx_xvdp2_w_h(temp0, temp2); \
142 temp1 = __lasx_xvneg_h(w1); \
143 temp2 = __lasx_xvilvl_h(temp1, w5); \
144 b2 = __lasx_xvdp2_w_h(temp0, temp2); \
145 temp1 = __lasx_xvneg_h(w5); \
146 temp2 = __lasx_xvilvl_h(temp1, w7); \
147 b3 = __lasx_xvdp2_w_h(temp0, temp2); \
150 DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1); \
151 a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1); \
152 temp1 = __lasx_xvilvl_h(w2, w4); \
153 a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1); \
154 temp1 = __lasx_xvneg_h(w4); \
155 temp2 = __lasx_xvilvl_h(w2, temp1); \
156 a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2); \
157 temp1 = __lasx_xvneg_h(w6); \
158 temp2 = __lasx_xvilvl_h(temp1, w4); \
159 a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2); \
161 DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1); \
162 b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1); \
163 DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2); \
164 b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1); \
165 b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2); \
166 temp1 = __lasx_xvneg_h(w1); \
167 temp2 = __lasx_xvilvl_h(temp1, w3); \
168 b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2); \
170 DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3, \
171 temp0, temp1, temp2, temp3); \
172 DUP4_ARG2(__lasx_xvsub_w, a3, b3, a2, b2, a1, b1, a0, b0, /* NOTE(review): output operands not visible in this excerpt */ \
174 DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 20, temp3, temp2, 20, a2, a3, \
175 20, a0, a1, 20, in0, in1, in2, in3); \
/*
 * NOTE(review): function-body fragment. The signature, the declarations of
 * const_val, a0..a3 and b0..b3, and the statements between the declarations
 * and the final permute are not visible in this excerpt. The comments below
 * describe only what is shown.
 */
/* Weight table: eight 16-bit values, repeated in both 128-bit halves.
 * Reading each 64-bit literal from its least-significant halfword:
 * 0, 0x58C5, 0x539F, 0x4B42 and 0x3FFF, 0x3249, 0x22A3, 0x11A8. */
180 __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
181 0x4B42539F58C50000, 0x11A822A332493FFF};
182 __m256i in0, in1, in2, in3;
183 __m256i w2, w3, w4, w5, w6, w7;
186 __m256i temp0, temp1, temp2, temp3;
/* Broadcast of const_val as it is at this point (its initial value is not
 * visible here). */
187 __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
188 __m256i const_val1, select_vec,
temp;
/* Same 0xD8 64-bit-lane permute on all four vectors.
 * NOTE(review): the output operands of this call are not visible here. */
192 DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
/* Write the four vectors back to block at offsets 0, 32, 64 and 96. */
194 __lasx_xvst(in0,
block, 0);
195 __lasx_xvst(in1,
block, 32);
196 __lasx_xvst(in2,
block, 64);
197 __lasx_xvst(in3,
block, 96);
/*
 * NOTE(review): function-body fragment. The signature, the declarations of
 * const_val, a0..a3 and b0..b3, and the statements between the declarations
 * and the final permute are not visible in this excerpt. The comments below
 * describe only what is shown.
 */
/* Row offsets for 2, 4 and 3 rows.
 * NOTE(review): these use << on a signed ptrdiff_t; confirm dst_stride can
 * never be negative here, or switch to multiplication. */
204 ptrdiff_t dst_stride_2x = dst_stride << 1;
205 ptrdiff_t dst_stride_4x = dst_stride << 2;
206 ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;
/* Weight table: eight 16-bit values, repeated in both 128-bit halves.
 * Reading each 64-bit literal from its least-significant halfword:
 * 0, 0x58C5, 0x539F, 0x4B42 and 0x3FFF, 0x3249, 0x22A3, 0x11A8. */
207 __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
208 0x4B42539F58C50000, 0x11A822A332493FFF};
209 __m256i in0, in1, in2, in3;
210 __m256i w2, w3, w4, w5, w6, w7;
213 __m256i temp0, temp1, temp2, temp3;
/* Broadcast of const_val as it is at this point (its initial value is not
 * visible here). */
214 __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
215 __m256i const_val1, select_vec,
temp;
/* Same 0xD8 64-bit-lane permute on all four vectors.
 * NOTE(review): the output operands of this call are not visible here. */
219 DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
/* Clip the 16-bit values (to 0..255, going by the intrinsic name), then keep
 * the even bytes of each vector pair, leaving two packed vectors in0 and in1. */
221 DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
222 DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
/* Store one 64-bit element per destination row. The element order is
 * 0, 2, 1, 3 for rows 0..3, so elements 1 and 2 are swapped on the way out. */
223 __lasx_xvstelm_d(in0, dst, 0, 0);
224 __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
225 __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
226 __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
/* Next four rows come from in1, with the same element order. */
227 dst += dst_stride_4x;
228 __lasx_xvstelm_d(in1, dst, 0, 0);
229 __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
230 __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
231 __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
/*
 * NOTE(review): function-body fragment. The signature, the declarations of
 * const_val, dst1, a0..a3 and b0..b3, and several statements are not visible
 * in this excerpt. The comments below describe only what is shown.
 */
/* Row offsets for 2, 4 and 3 rows.
 * NOTE(review): these use << on a signed ptrdiff_t; confirm dst_stride can
 * never be negative here, or switch to multiplication. */
239 ptrdiff_t dst_stride_2x = dst_stride << 1;
240 ptrdiff_t dst_stride_4x = dst_stride << 2;
241 ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;
/* Weight table: eight 16-bit values, repeated in both 128-bit halves.
 * Reading each 64-bit literal from its least-significant halfword:
 * 0, 0x58C5, 0x539F, 0x4B42 and 0x3FFF, 0x3249, 0x22A3, 0x11A8. */
243 __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
244 0x4B42539F58C50000, 0x11A822A332493FFF};
/* Halfword shuffle control for __lasx_xvshuf_h. From the least-significant
 * halfword up, the indices are 0..3, 8..11, 4..7, 12..15. */
245 __m256i sh = {0x0003000200010000, 0x000B000A00090008,
246 0x0007000600050004, 0x000F000E000D000C};
247 __m256i in0, in1, in2, in3;
248 __m256i w2, w3, w4, w5, w6, w7;
251 __m256i temp0, temp1, temp2, temp3;
/* Broadcast of const_val as it is at this point (its initial value is not
 * visible here). */
252 __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
253 __m256i const_val1, select_vec,
temp;
/* Eight loads of 8 bytes each from dst1, each zero-extended from bytes to
 * 16-bit halfwords.
 * NOTE(review): every load below uses dst1 with offset 0, and no advance of
 * dst1 is visible between them in this excerpt. Confirm that the pointer is
 * stepped to the next row between loads; otherwise all eight vectors hold
 * the same row. */
257 a0 = __lasx_xvldrepl_d(dst1, 0);
258 a0 = __lasx_vext2xv_hu_bu(
a0);
260 a1 = __lasx_xvldrepl_d(dst1, 0);
261 a1 = __lasx_vext2xv_hu_bu(
a1);
263 a2 = __lasx_xvldrepl_d(dst1, 0);
264 a2 = __lasx_vext2xv_hu_bu(
a2);
266 a3 = __lasx_xvldrepl_d(dst1, 0);
267 a3 = __lasx_vext2xv_hu_bu(
a3);
269 b0 = __lasx_xvldrepl_d(dst1, 0);
270 b0 = __lasx_vext2xv_hu_bu(
b0);
272 b1 = __lasx_xvldrepl_d(dst1, 0);
273 b1 = __lasx_vext2xv_hu_bu(
b1);
275 b2 = __lasx_xvldrepl_d(dst1, 0);
276 b2 = __lasx_vext2xv_hu_bu(
b2);
278 b3 = __lasx_xvldrepl_d(dst1, 0);
279 b3 = __lasx_vext2xv_hu_bu(
b3);
/* Merge the loaded vectors pairwise (a1:a0, a3:a2, b1:b0, b3:b2) under the
 * sh control into temp0..temp3. */
280 DUP4_ARG3(__lasx_xvshuf_h, sh,
a1,
a0, sh,
a3,
a2, sh,
b1,
b0, sh,
b3,
b2,
281 temp0, temp1, temp2, temp3);
/* Add the merged destination values to in0..in3 as 16-bit lanes.
 * NOTE(review): the output operands of this call are not visible here. */
282 DUP4_ARG2(__lasx_xvadd_h, temp0, in0, temp1, in1, temp2, in2, temp3, in3,
/* Same 0xD8 64-bit-lane permute on all four vectors.
 * NOTE(review): the output operands of this call are not visible here. */
284 DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
/* Clip the 16-bit values (to 0..255, going by the intrinsic name), then keep
 * the even bytes of each vector pair, leaving two packed vectors in0 and in1. */
286 DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
287 DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
/* Store one 64-bit element per destination row. The element order is
 * 0, 2, 1, 3 for rows 0..3, so elements 1 and 2 are swapped on the way out. */
288 __lasx_xvstelm_d(in0, dst, 0, 0);
289 __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
290 __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
291 __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
/* Next four rows come from in1, with the same element order. */
292 dst += dst_stride_4x;
293 __lasx_xvstelm_d(in1, dst, 0, 0);
294 __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
295 __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
296 __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);