/*
 * SCALE_8_16(_sh): 8-tap horizontal filter producing 16 results per expansion.
 *
 * Uses names from the expansion site without declaring them: src, filterPos,
 * filter, dst, vmax, shuf, src0..src15 and filter0..filter7.
 *
 * Steps, as visible in the body:
 *   1. For k = 0..15, load 8 bytes starting at src + filterPos[k] and
 *      replicate them across the vector (xvldrepl_d).
 *   2. Load eight 256-bit vectors from filter at byte offsets 0..224.
 *   3. Pair neighbouring positions (xvilvl_d) and zero-extend the bytes to
 *      16-bit elements (vext2xv_hu_bu).
 *   4. Multiply-accumulate against the coefficients (xvdp2_w_h), then reduce
 *      the partial sums with two rounds of xvhaddw_d_w + xvpickev_w.
 *   5. Arithmetic shift right by _sh and clamp from above with vmax
 *      (xvmin_w); no lower clamp is applied here.
 *   6. Reorder (xvperm_w with shuf), narrow to 16-bit elements (xvpickev_h),
 *      permute 64-bit elements (xvpermi_d 0xd8) and store 32 bytes to dst.
 *
 * _sh is passed to xvsrai_w, so it must be a compile-time constant.
 *
 * NOTE(review): the embedded line numbers skip values (e.g. 27, 86-90), so
 * this listing appears to omit parts of the macro (braces, pointer updates);
 * confirm against the complete file.
 */
26 #define SCALE_8_16(_sh) \
28 src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
29 src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
30 src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
31 src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
32 src4 = __lasx_xvldrepl_d(src + filterPos[4], 0); \
33 src5 = __lasx_xvldrepl_d(src + filterPos[5], 0); \
34 src6 = __lasx_xvldrepl_d(src + filterPos[6], 0); \
35 src7 = __lasx_xvldrepl_d(src + filterPos[7], 0); \
36 src8 = __lasx_xvldrepl_d(src + filterPos[8], 0); \
37 src9 = __lasx_xvldrepl_d(src + filterPos[9], 0); \
38 src10 = __lasx_xvldrepl_d(src + filterPos[10], 0); \
39 src11 = __lasx_xvldrepl_d(src + filterPos[11], 0); \
40 src12 = __lasx_xvldrepl_d(src + filterPos[12], 0); \
41 src13 = __lasx_xvldrepl_d(src + filterPos[13], 0); \
42 src14 = __lasx_xvldrepl_d(src + filterPos[14], 0); \
43 src15 = __lasx_xvldrepl_d(src + filterPos[15], 0); \
44 DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
45 filter, 96, filter0, filter1, filter2, filter3); \
46 DUP4_ARG2(__lasx_xvld, filter, 128, filter, 160, \
47 filter, 192, filter, 224, filter4, \
48 filter5, filter6, filter7); \
49 DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2, \
50 src5, src4, src7, src6, src0, src2, src4, src6); \
51 DUP4_ARG2(__lasx_xvilvl_d, src9, src8, src11, src10, \
52 src13, src12, src15, src14, src8, src10, src12, src14); \
53 DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6, \
54 src0, src2, src4, src6); \
55 DUP4_ARG1(__lasx_vext2xv_hu_bu, src8, src10, src12, \
56 src14, src8, src10, src12, src14); \
57 DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2, \
58 filter2, src4, filter3, src6, src0, src1, src2, src3); \
59 DUP4_ARG2(__lasx_xvdp2_w_h, filter4, src8, filter5, src10, \
60 filter6, src12, filter7, src14, src4, src5, src6, src7);\
61 src0 = __lasx_xvhaddw_d_w(src0, src0); \
62 src1 = __lasx_xvhaddw_d_w(src1, src1); \
63 src2 = __lasx_xvhaddw_d_w(src2, src2); \
64 src3 = __lasx_xvhaddw_d_w(src3, src3); \
65 src4 = __lasx_xvhaddw_d_w(src4, src4); \
66 src5 = __lasx_xvhaddw_d_w(src5, src5); \
67 src6 = __lasx_xvhaddw_d_w(src6, src6); \
68 src7 = __lasx_xvhaddw_d_w(src7, src7); \
69 DUP4_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2, \
70 src5, src4, src7, src6, src0, src1, src2, src3); \
71 src0 = __lasx_xvhaddw_d_w(src0, src0); \
72 src1 = __lasx_xvhaddw_d_w(src1, src1); \
73 src2 = __lasx_xvhaddw_d_w(src2, src2); \
74 src3 = __lasx_xvhaddw_d_w(src3, src3); \
75 src0 = __lasx_xvpickev_w(src1, src0); \
76 src1 = __lasx_xvpickev_w(src3, src2); \
77 src0 = __lasx_xvsrai_w(src0, _sh); \
78 src1 = __lasx_xvsrai_w(src1, _sh); \
79 src0 = __lasx_xvmin_w(src0, vmax); \
80 src1 = __lasx_xvmin_w(src1, vmax); \
81 src0 = __lasx_xvperm_w(src0, shuf); \
82 src1 = __lasx_xvperm_w(src1, shuf); \
83 src0 = __lasx_xvpickev_h(src1, src0); \
84 src0 = __lasx_xvpermi_d(src0, 0xd8); \
85 __lasx_xvst(src0, dst, 0); \
/*
 * SCALE_8_8(_sh): 8-tap horizontal filter for 8 output positions.
 *
 * Uses src, filterPos, filter, vmax, shuf, src0..src7 and filter0..filter3
 * from the expansion site.
 *
 * For k = 0..7 it loads 8 bytes starting at src + filterPos[k], and reads
 * four 256-bit vectors from filter (byte offsets 0..96). Bytes are
 * zero-extended to 16-bit elements, multiplied against the coefficients with
 * xvdp2_w_h, and reduced by two rounds of xvhaddw_d_w + xvpickev_w. The sums
 * are shifted right arithmetically by _sh (a compile-time constant, as
 * required by xvsrai_w), clamped from above with vmax and reordered with
 * shuf.
 *
 * No store is visible in this body: the result is left in src0.
 *
 * NOTE(review): the embedded line numbers skip values (e.g. 92, 103-104,
 * 123-124), so lines of the macro appear to be missing from this listing;
 * confirm against the complete file.
 */
91 #define SCALE_8_8(_sh) \
93 src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
94 src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
95 src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
96 src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
97 src4 = __lasx_xvldrepl_d(src + filterPos[4], 0); \
98 src5 = __lasx_xvldrepl_d(src + filterPos[5], 0); \
99 src6 = __lasx_xvldrepl_d(src + filterPos[6], 0); \
100 src7 = __lasx_xvldrepl_d(src + filterPos[7], 0); \
101 DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
102 filter, 96, filter0, filter1, filter2, filter3); \
105 DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2, \
106 src5, src4, src7, src6, src0, src2, src4, src6); \
107 DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6, \
108 src0, src2, src4, src6); \
109 DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2, \
110 filter2, src4, filter3, src6, src0, src1, src2,src3); \
111 src0 = __lasx_xvhaddw_d_w(src0, src0); \
112 src1 = __lasx_xvhaddw_d_w(src1, src1); \
113 src2 = __lasx_xvhaddw_d_w(src2, src2); \
114 src3 = __lasx_xvhaddw_d_w(src3, src3); \
115 src0 = __lasx_xvpickev_w(src1, src0); \
116 src1 = __lasx_xvpickev_w(src3, src2); \
117 src0 = __lasx_xvhaddw_d_w(src0, src0); \
118 src1 = __lasx_xvhaddw_d_w(src1, src1); \
119 src0 = __lasx_xvpickev_w(src1, src0); \
120 src0 = __lasx_xvsrai_w(src0, _sh); \
121 src0 = __lasx_xvmin_w(src0, vmax); \
122 src0 = __lasx_xvperm_w(src0, shuf); \
/*
 * SCALE_8_4(_sh): 8-tap horizontal filter for 4 output positions.
 *
 * Uses src, filterPos, filter, vmax, shuf, src0..src3 and filter0/filter1
 * from the expansion site.
 *
 * For k = 0..3 it loads 8 bytes starting at src + filterPos[k], and reads two
 * 256-bit vectors from filter (byte offsets 0 and 32). Bytes are
 * zero-extended to 16-bit elements, multiplied against the coefficients with
 * xvdp2_w_h, and reduced by two rounds of xvhaddw_d_w + xvpickev_w. The sums
 * are shifted right arithmetically by _sh (a compile-time constant, as
 * required by xvsrai_w), clamped from above with vmax and reordered with
 * shuf.
 *
 * No store is visible in this body: the result is left in src0.
 *
 * NOTE(review): xvdp2_w_h is called here as (src, filter) while the sibling
 * macros pass (filter, src). Both operands are treated as signed 16-bit and
 * the pixel values were zero-extended from bytes, so the product should be
 * the same; confirm and consider aligning the order for consistency.
 */
125 #define SCALE_8_4(_sh) \
127 src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
128 src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
129 src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
130 src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
131 filter0 = __lasx_xvld(filter, 0); \
132 filter1 = __lasx_xvld(filter, 32); \
135 src0 = __lasx_xvilvl_d(src1, src0); \
136 src2 = __lasx_xvilvl_d(src3, src2); \
137 src0 = __lasx_vext2xv_hu_bu(src0); \
138 src2 = __lasx_vext2xv_hu_bu(src2); \
139 src0 = __lasx_xvdp2_w_h(src0, filter0); \
140 src1 = __lasx_xvdp2_w_h(src2, filter1); \
141 src0 = __lasx_xvhaddw_d_w(src0, src0); \
142 src1 = __lasx_xvhaddw_d_w(src1, src1); \
143 src0 = __lasx_xvpickev_w(src1, src0); \
144 src0 = __lasx_xvhaddw_d_w(src0, src0); \
145 src0 = __lasx_xvpickev_w(src0, src0); \
146 src0 = __lasx_xvsrai_w(src0, _sh); \
147 src0 = __lasx_xvmin_w(src0, vmax); \
148 src0 = __lasx_xvperm_w(src0, shuf); \
/*
 * SCALE_8_2(_sh): 8-tap horizontal filter for 2 output positions.
 *
 * Uses src, filterPos, filter, dst, vmax, src0, src1 and filter0 from the
 * expansion site.
 *
 * Loads 8 bytes starting at src + filterPos[0] and at src + filterPos[1],
 * plus one 256-bit vector from filter. The bytes are zero-extended to 16-bit
 * elements and multiplied against the coefficients (xvdp2_w_h). The partial
 * sums are then folded with xvhaddw_d_w followed by xvhaddw_q_d, shifted
 * right arithmetically by _sh (a compile-time constant, as required by
 * xvsrai_w) and clamped from above with vmax.
 *
 * 32-bit elements 0 and 4 of the result are written to dst[0] and dst[1].
 */
151 #define SCALE_8_2(_sh) \
153 src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
154 src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
155 filter0 = __lasx_xvld(filter, 0); \
156 src0 = __lasx_xvilvl_d(src1, src0); \
157 src0 = __lasx_vext2xv_hu_bu(src0); \
158 src0 = __lasx_xvdp2_w_h(filter0, src0); \
159 src0 = __lasx_xvhaddw_d_w(src0, src0); \
160 src0 = __lasx_xvhaddw_q_d(src0, src0); \
161 src0 = __lasx_xvsrai_w(src0, _sh); \
162 src0 = __lasx_xvmin_w(src0, vmax); \
163 dst[0] = __lasx_xvpickve2gr_w(src0, 0); \
164 dst[1] = __lasx_xvpickve2gr_w(src0, 4); \
/*
 * SCALE_4_16(_sh): 4-tap horizontal filter producing 16 results per expansion.
 *
 * Uses src, filterPos, filter, dst, vmax, shuf, src0..src15 and
 * filter0..filter3 from the expansion site.
 *
 * Steps, as visible in the body:
 *   1. For k = 0..15, load 4 bytes starting at src + filterPos[k] and
 *      replicate them across the vector (xvldrepl_w).
 *   2. Load four 256-bit vectors from filter at byte offsets 0..96.
 *   3. Gather the 4-byte groups with xvilvl_w then xvilvl_d, and zero-extend
 *      the bytes to 16-bit elements (vext2xv_hu_bu).
 *   4. Multiply-accumulate against the coefficients (xvdp2_w_h) and reduce
 *      with xvhaddw_d_w + xvpickev_w.
 *   5. Arithmetic shift right by _sh (a compile-time constant, as required
 *      by xvsrai_w) and clamp from above with vmax.
 *   6. Narrow to 16-bit elements (xvpickev_h), reorder with shuf (xvperm_w)
 *      and store 32 bytes to dst.
 *
 * NOTE(review): the embedded line numbers skip values (e.g. 171, 213-217),
 * so lines of the macro appear to be missing from this listing; confirm
 * against the complete file.
 */
170 #define SCALE_4_16(_sh) \
172 src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
173 src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
174 src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
175 src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
176 src4 = __lasx_xvldrepl_w(src + filterPos[4], 0); \
177 src5 = __lasx_xvldrepl_w(src + filterPos[5], 0); \
178 src6 = __lasx_xvldrepl_w(src + filterPos[6], 0); \
179 src7 = __lasx_xvldrepl_w(src + filterPos[7], 0); \
180 src8 = __lasx_xvldrepl_w(src + filterPos[8], 0); \
181 src9 = __lasx_xvldrepl_w(src + filterPos[9], 0); \
182 src10 = __lasx_xvldrepl_w(src + filterPos[10], 0); \
183 src11 = __lasx_xvldrepl_w(src + filterPos[11], 0); \
184 src12 = __lasx_xvldrepl_w(src + filterPos[12], 0); \
185 src13 = __lasx_xvldrepl_w(src + filterPos[13], 0); \
186 src14 = __lasx_xvldrepl_w(src + filterPos[14], 0); \
187 src15 = __lasx_xvldrepl_w(src + filterPos[15], 0); \
188 DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
189 filter, 96, filter0, filter1, filter2, filter3); \
190 DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5, \
191 src4, src7, src6, src0, src2, src4, src6); \
192 DUP4_ARG2(__lasx_xvilvl_w, src9, src8, src11, src10, src13, \
193 src12, src15, src14, src8, src10, src12, src14); \
194 DUP4_ARG2(__lasx_xvilvl_d, src2, src0, src6, src4, src10, \
195 src8, src14, src12, src0, src1, src2, src3); \
196 DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src1, src2, src3, \
197 src0, src1, src2, src3); \
198 DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src1, \
199 filter2, src2, filter3, src3, src0, src1, src2, src3); \
200 src0 = __lasx_xvhaddw_d_w(src0, src0); \
201 src1 = __lasx_xvhaddw_d_w(src1, src1); \
202 src2 = __lasx_xvhaddw_d_w(src2, src2); \
203 src3 = __lasx_xvhaddw_d_w(src3, src3); \
204 src0 = __lasx_xvpickev_w(src1, src0); \
205 src1 = __lasx_xvpickev_w(src3, src2); \
206 src0 = __lasx_xvsrai_w(src0, _sh); \
207 src1 = __lasx_xvsrai_w(src1, _sh); \
208 src0 = __lasx_xvmin_w(src0, vmax); \
209 src1 = __lasx_xvmin_w(src1, vmax); \
210 src0 = __lasx_xvpickev_h(src1, src0); \
211 src0 = __lasx_xvperm_w(src0, shuf); \
212 __lasx_xvst(src0, dst, 0); \
/*
 * SCALE_4_8(_sh): 4-tap horizontal filter for 8 output positions.
 *
 * Uses src, filterPos, filter, vmax, src0..src7 and filter0/filter1 from the
 * expansion site.
 *
 * For k = 0..7 it loads 4 bytes starting at src + filterPos[k], and reads two
 * 256-bit vectors from filter (byte offsets 0 and 32). The 4-byte groups are
 * gathered with xvilvl_w and xvilvl_d, zero-extended to 16-bit elements,
 * multiplied against the coefficients (xvdp2_w_h) and reduced with
 * xvhaddw_d_w + xvpickev_w. The sums are shifted right arithmetically by _sh
 * (a compile-time constant, as required by xvsrai_w) and clamped from above
 * with vmax.
 *
 * No store and no shuffle are visible in this body: the result is left in
 * src0.
 *
 * NOTE(review): the embedded line numbers skip values (e.g. 230-231, 236,
 * 246-247), so lines of the macro appear to be missing from this listing;
 * confirm against the complete file.
 */
218 #define SCALE_4_8(_sh) \
220 src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
221 src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
222 src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
223 src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
224 src4 = __lasx_xvldrepl_w(src + filterPos[4], 0); \
225 src5 = __lasx_xvldrepl_w(src + filterPos[5], 0); \
226 src6 = __lasx_xvldrepl_w(src + filterPos[6], 0); \
227 src7 = __lasx_xvldrepl_w(src + filterPos[7], 0); \
228 filter0 = __lasx_xvld(filter, 0); \
229 filter1 = __lasx_xvld(filter, 32); \
232 DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5, \
233 src4, src7, src6, src0, src2, src4, src6); \
234 src0 = __lasx_xvilvl_d(src2, src0); \
235 src1 = __lasx_xvilvl_d(src6, src4); \
237 src0 = __lasx_vext2xv_hu_bu(src0); \
238 src1 = __lasx_vext2xv_hu_bu(src1); \
239 src0 = __lasx_xvdp2_w_h(filter0, src0); \
240 src1 = __lasx_xvdp2_w_h(filter1, src1); \
241 src0 = __lasx_xvhaddw_d_w(src0, src0); \
242 src1 = __lasx_xvhaddw_d_w(src1, src1); \
243 src0 = __lasx_xvpickev_w(src1, src0); \
244 src0 = __lasx_xvsrai_w(src0, _sh); \
245 src0 = __lasx_xvmin_w(src0, vmax); \
/*
 * SCALE_4_4(_sh): 4-tap horizontal filter for 4 output positions.
 *
 * Uses src, filterPos, filter, vmax, src0..src3 and filter0 from the
 * expansion site.
 *
 * For k = 0..3 it loads 4 bytes starting at src + filterPos[k], and reads one
 * 256-bit vector from filter. The 4-byte groups are gathered with xvilvl_w
 * and xvilvl_d, zero-extended to 16-bit elements, multiplied against the
 * coefficients (xvdp2_w_h) and summed with xvhaddw_d_w. The sums are shifted
 * right arithmetically by _sh (a compile-time constant, as required by
 * xvsrai_w), clamped from above with vmax, then compacted with xvpickev_w
 * and xvpermi_d (0xd8).
 *
 * No store is visible in this body: the result is left in src0.
 */
248 #define SCALE_4_4(_sh) \
250 src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
251 src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
252 src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
253 src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
254 filter0 = __lasx_xvld(filter, 0); \
257 src0 = __lasx_xvilvl_w(src1, src0); \
258 src1 = __lasx_xvilvl_w(src3, src2); \
260 src0 = __lasx_xvilvl_d(src1, src0); \
261 src0 = __lasx_vext2xv_hu_bu(src0); \
262 src0 = __lasx_xvdp2_w_h(filter0, src0); \
263 src0 = __lasx_xvhaddw_d_w(src0, src0); \
264 src0 = __lasx_xvsrai_w(src0, _sh); \
265 src0 = __lasx_xvmin_w(src0, vmax); \
266 src0 = __lasx_xvpickev_w(src0, src0); \
267 src0 = __lasx_xvpermi_d(src0, 0xd8); \
/*
 * SCALE_4_2(_sh): 4-tap horizontal filter for 2 output positions.
 *
 * Uses src, filterPos, filter, dst, vmax, src0, src1 and filter0 from the
 * expansion site.
 *
 * Loads 4 bytes starting at src + filterPos[0] and at src + filterPos[1],
 * plus one 256-bit vector from filter. The two groups are interleaved
 * (xvilvl_w), zero-extended to 16-bit elements, multiplied against the
 * coefficients (xvdp2_w_h) and summed with xvhaddw_d_w. The sums are shifted
 * right arithmetically by _sh (a compile-time constant, as required by
 * xvsrai_w) and clamped from above with vmax.
 *
 * 32-bit elements 0 and 2 of the result are written to dst[0] and dst[1].
 *
 * NOTE(review): xvld reads 32 bytes from filter, while two 4-tap positions
 * appear to need only 8 coefficients; confirm the filter buffer is large
 * enough for the extra read at the end of a row.
 */
270 #define SCALE_4_2(_sh) \
272 src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
273 src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
274 filter0 = __lasx_xvld(filter, 0); \
275 src0 = __lasx_xvilvl_w(src1, src0); \
276 src0 = __lasx_vext2xv_hu_bu(src0); \
277 src0 = __lasx_xvdp2_w_h(filter0, src0); \
278 src0 = __lasx_xvhaddw_d_w(src0, src0); \
279 src0 = __lasx_xvsrai_w(src0, _sh); \
280 src0 = __lasx_xvmin_w(src0, vmax); \
281 dst[0] = __lasx_xvpickve2gr_w(src0, 0); \
282 dst[1] = __lasx_xvpickve2gr_w(src0, 2); \
291 src0 = __lasx_xvldrepl_d((srcPos1 + j), 0); \
292 src1 = __lasx_xvldrepl_d((srcPos2 + j), 0); \
293 src2 = __lasx_xvldrepl_d((srcPos3 + j), 0); \
294 src3 = __lasx_xvldrepl_d((srcPos4 + j), 0); \
295 DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex, \
296 filterStart3, dex, filterStart4, dex, filter0, \
297 filter1, filter2, filter3); \
298 src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
299 src1 = __lasx_xvpermi_q(src2, src3, 0x02); \
300 filter0 = __lasx_xvpermi_q(filter0, filter1, 0x02); \
301 filter1 = __lasx_xvpermi_q(filter2, filter3, 0x02); \
302 src0 = __lasx_xvilvl_b(zero, src0); \
303 src1 = __lasx_xvilvl_b(zero, src1); \
304 out0 = __lasx_xvdp2_w_h(filter0, src0); \
305 out1 = __lasx_xvdp2_w_h(filter1, src1); \
306 src0 = __lasx_xvhaddw_d_w(out0, out0); \
307 src1 = __lasx_xvhaddw_d_w(out1, out1); \
308 out0 = __lasx_xvpackev_d(src1, src0); \
309 out1 = __lasx_xvpackod_d(src1, src0); \
310 out0 = __lasx_xvadd_w(out0, out1); \
311 out = __lasx_xvadd_w(out, out0); \
315 const uint8_t *
src,
const int16_t *
filter,
316 const int32_t *filterPos,
int filterSize)
319 int max = (1 << 15) - 1;
321 if (filterSize == 8) {
323 __m256i src8, src9, src10, src11, src12, src13, src14, src15;
325 __m256i filter4, filter5, filter6, filter7;
326 __m256i vmax = __lasx_xvreplgr2vr_w(
max);
327 __m256i shuf = {0x0000000400000000, 0x0000000500000001,
328 0x0000000600000002, 0x0000000700000003};
337 __lasx_xvstelm_d(
src0, dst, 0, 0);
338 __lasx_xvstelm_d(
src0, dst, 8, 2);
344 __lasx_xvstelm_d(
src0, dst, 0, 0);
352 src0 = __lasx_xvldrepl_d(
src + filterPos[0], 0);
358 val = __lasx_xvpickve2gr_w(
src0, 0);
361 }
else if (filterSize == 4) {
363 __m256i src8, src9, src10, src11, src12, src13, src14, src15;
365 __m256i vmax = __lasx_xvreplgr2vr_w(
max);
366 __m256i shuf = {0x0000000400000000, 0x0000000500000001,
367 0x0000000600000002, 0x0000000700000003};
377 __lasx_xvstelm_d(
src0, dst, 0, 0);
378 __lasx_xvstelm_d(
src0, dst, 8, 1);
384 __lasx_xvstelm_d(
src0, dst, 0, 0);
392 const uint8_t *srcPos =
src + filterPos[0];
394 for (
int j = 0; j < filterSize; j++) {
399 }
else if (filterSize > 8) {
400 int filterlen = filterSize - 7;
403 __m256i
zero = __lasx_xvldi(0);
409 const uint8_t *srcPos1 =
src + filterPos[0];
410 const uint8_t *srcPos2 =
src + filterPos[1];
411 const uint8_t *srcPos3 =
src + filterPos[2];
412 const uint8_t *srcPos4 =
src + filterPos[3];
413 const int16_t *filterStart1 =
filter;
414 const int16_t *filterStart2 = filterStart1 + filterSize;
415 const int16_t *filterStart3 = filterStart2 + filterSize;
416 const int16_t *filterStart4 = filterStart3 + filterSize;
417 int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
419 for (j = 0; j < filterlen; j += 8) {
422 val1 = __lasx_xvpickve2gr_w(
out, 0);
423 val2 = __lasx_xvpickve2gr_w(
out, 4);
424 val3 = __lasx_xvpickve2gr_w(
out, 2);
425 val4 = __lasx_xvpickve2gr_w(
out, 6);
426 for (; j < filterSize; j++) {
427 val1 += ((
int)srcPos1[j]) * filterStart1[j];
428 val2 += ((
int)srcPos2[j]) * filterStart2[j];
429 val3 += ((
int)srcPos3[j]) * filterStart3[j];
430 val4 += ((
int)srcPos4[j]) * filterStart4[j];
438 filter = filterStart4 + filterSize;
440 for(
i = 0;
i < res;
i++) {
442 const uint8_t *srcPos =
src + filterPos[
i];
445 for (j = 0; j < filterlen; j += 8) {
446 src1 = __lasx_xvldrepl_d((srcPos + j), 0);
450 out0 = __lasx_xvhaddw_d_w(out0, out0);
451 out0 = __lasx_xvhaddw_q_d(out0, out0);
452 val += __lasx_xvpickve2gr_w(out0, 0);
454 for (; j < filterSize; j++) {
461 for (
i = 0;
i < dstW;
i++) {
463 const uint8_t *srcPos =
src + filterPos[
i];
465 for (
int j = 0; j < filterSize; j++) {
475 const uint8_t *
src,
const int16_t *
filter,
476 const int32_t *filterPos,
int filterSize)
479 int max = (1 << 19) - 1;
482 if (filterSize == 8) {
485 __m256i vmax = __lasx_xvreplgr2vr_w(
max);
486 __m256i shuf = {0x0000000400000000, 0x0000000500000001,
487 0x0000000600000002, 0x0000000700000003};
492 __lasx_xvst(
src0, dst, 0);
497 __lasx_xvstelm_d(
src0, dst, 0, 0);
498 __lasx_xvstelm_d(
src0, dst, 8, 1);
508 src0 = __lasx_xvldrepl_d(
src + filterPos[0], 0);
512 out0 = __lasx_xvhaddw_d_w(out0, out0);
513 out0 = __lasx_xvhaddw_q_d(out0, out0);
514 val = __lasx_xvpickve2gr_w(out0, 0);
517 }
else if (filterSize == 4) {
520 __m256i vmax = __lasx_xvreplgr2vr_w(
max);
521 __m256i shuf = {0x0000000100000000, 0x0000000500000004,
522 0x0000000300000002, 0x0000000700000006};
528 __lasx_xvst(
src0, dst, 0);
533 __lasx_xvstelm_d(
src0, dst, 0, 0);
534 __lasx_xvstelm_d(
src0, dst, 8, 1);
542 const uint8_t *srcPos =
src + filterPos[0];
544 for (
int j = 0; j < filterSize; j++) {
549 }
else if (filterSize > 8) {
552 int filterlen = filterSize - 7;
553 __m256i
zero = __lasx_xvldi(0);
559 const uint8_t *srcPos1 =
src + filterPos[0];
560 const uint8_t *srcPos2 =
src + filterPos[1];
561 const uint8_t *srcPos3 =
src + filterPos[2];
562 const uint8_t *srcPos4 =
src + filterPos[3];
563 const int16_t *filterStart1 =
filter;
564 const int16_t *filterStart2 = filterStart1 + filterSize;
565 const int16_t *filterStart3 = filterStart2 + filterSize;
566 const int16_t *filterStart4 = filterStart3 + filterSize;
567 int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
569 for (j = 0; j < filterlen; j += 8) {
572 val1 = __lasx_xvpickve2gr_w(
out, 0);
573 val2 = __lasx_xvpickve2gr_w(
out, 4);
574 val3 = __lasx_xvpickve2gr_w(
out, 2);
575 val4 = __lasx_xvpickve2gr_w(
out, 6);
576 for (; j < filterSize; j++) {
577 val1 += ((
int)srcPos1[j]) * filterStart1[j];
578 val2 += ((
int)srcPos2[j]) * filterStart2[j];
579 val3 += ((
int)srcPos3[j]) * filterStart3[j];
580 val4 += ((
int)srcPos4[j]) * filterStart4[j];
588 filter = filterStart4 + filterSize;
590 for (
i = 0;
i < res;
i++) {
592 const uint8_t *srcPos =
src + filterPos[
i];
595 for (j = 0; j < filterlen; j += 8) {
596 src1 = __lasx_xvldrepl_d((srcPos + j), 0);
600 out0 = __lasx_xvhaddw_d_w(out0, out0);
601 out0 = __lasx_xvhaddw_q_d(out0, out0);
602 val += __lasx_xvpickve2gr_w(out0, 0);
604 for (; j < filterSize; j++) {
611 for (
i = 0;
i < dstW;
i++) {
613 const uint8_t *srcPos =
src + filterPos[
i];
615 for (
int j = 0; j < filterSize; j++) {
628 __m256i src0, src1, src2, src3, filter0, filter1, out0, out1; \
629 DUP4_ARG2(__lasx_xvld, src + filterPos[0], 0, src + filterPos[1], 0, \
630 src + filterPos[2], 0, src + filterPos[3], 0, src0, src1, src2,\
632 filter0 = __lasx_xvld(filter, 0); \
633 filter1 = __lasx_xvld(filter, 32); \
634 src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
635 src2 = __lasx_xvpermi_q(src2, src3, 0x02); \
636 out0 = __lasx_xvdp2_w_hu_h(src0, filter0); \
637 out1 = __lasx_xvdp2_w_hu_h(src2, filter1); \
638 src0 = __lasx_xvhaddw_d_w(out0, out0); \
639 src1 = __lasx_xvhaddw_d_w(out1, out1); \
640 out0 = __lasx_xvpackev_d(src1, src0); \
641 out1 = __lasx_xvpackod_d(src1, src0); \
642 out0 = __lasx_xvadd_w(out0, out1); \
643 out0 = __lasx_xvsra_w(out0, shift); \
644 out0 = __lasx_xvmin_w(out0, v_max); \
645 dst[0] = __lasx_xvpickve2gr_w(out0, 0); \
646 dst[1] = __lasx_xvpickve2gr_w(out0, 4); \
647 dst[2] = __lasx_xvpickve2gr_w(out0, 2); \
648 dst[3] = __lasx_xvpickve2gr_w(out0, 6); \
657 DUP4_ARG2(__lasx_xvldx, srcPos1, dex, srcPos2, dex, srcPos3, dex, \
658 srcPos4, dex, src0, src1, src2, src3); \
659 DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex, \
660 filterStart3, dex, filterStart4, dex, filter0, \
661 filter1, filter2, filter3); \
662 src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
663 src1 = __lasx_xvpermi_q(src2, src3, 0x02); \
664 filter0 = __lasx_xvpermi_q(filter0, filter1, 0x02); \
665 filter1 = __lasx_xvpermi_q(filter2, filter3, 0x02); \
666 out0 = __lasx_xvdp2_w_hu_h(src0, filter0); \
667 out1 = __lasx_xvdp2_w_hu_h(src1, filter1); \
668 src0 = __lasx_xvhaddw_d_w(out0, out0); \
669 src1 = __lasx_xvhaddw_d_w(out1, out1); \
670 out0 = __lasx_xvpackev_d(src1, src0); \
671 out1 = __lasx_xvpackod_d(src1, src0); \
672 out0 = __lasx_xvadd_w(out0, out1); \
673 out = __lasx_xvadd_w(out, out0); \
677 const uint8_t *_src,
const int16_t *
filter,
678 const int32_t *filterPos,
int filterSize)
682 const uint16_t *
src = (
const uint16_t *) _src;
683 int sh =
desc->comp[0].depth - 1;
684 int max = (1 << 15) - 1;
688 __m256i
zero = __lasx_xvldi(0);
692 (
desc->comp[0].depth - 1);
696 shift = __lasx_xvreplgr2vr_w(sh);
698 if (filterSize == 8) {
699 __m256i v_max = __lasx_xvreplgr2vr_w(
max);
700 for (
i = 0;
i <
len;
i++) {
703 for (
i = 0;
i < res;
i++) {
707 src0 = __lasx_xvld(
src + filterPos[
i], 0);
710 out0 = __lasx_xvhaddw_d_w(out0, out0);
711 out0 = __lasx_xvhaddw_q_d(out0, out0);
712 val = __lasx_xvpickve2gr_w(out0, 0);
716 }
else if (filterSize == 4) {
717 __m256i v_max = __lasx_xvreplgr2vr_w(
max);
718 for (
i = 0;
i <
len;
i++) {
721 src1 = __lasx_xvldrepl_d(
src + filterPos[0], 0);
722 src2 = __lasx_xvldrepl_d(
src + filterPos[1], 0);
723 src3 = __lasx_xvldrepl_d(
src + filterPos[2], 0);
724 src4 = __lasx_xvldrepl_d(
src + filterPos[3], 0);
727 src3 = __lasx_xvextrins_d(src3, src4, 0x10);
728 src0 = __lasx_xvpermi_q(
src1, src3, 0x02);
730 out0 = __lasx_xvhaddw_d_w(out0, out0);
731 out0 = __lasx_xvsra_w(out0,
shift);
732 out0 = __lasx_xvmin_w(out0, v_max);
733 dst[0] = __lasx_xvpickve2gr_w(out0, 0);
734 dst[1] = __lasx_xvpickve2gr_w(out0, 2);
735 dst[2] = __lasx_xvpickve2gr_w(out0, 4);
736 dst[3] = __lasx_xvpickve2gr_w(out0, 6);
741 for (
i = 0;
i < res;
i++) {
743 const uint16_t *srcPos =
src + filterPos[
i];
745 for (
int j = 0; j < filterSize; j++) {
751 }
else if (filterSize > 8) {
752 int filterlen = filterSize - 7;
754 for (
i = 0;
i <
len;
i++) {
758 const uint16_t *srcPos1 =
src + filterPos[0];
759 const uint16_t *srcPos2 =
src + filterPos[1];
760 const uint16_t *srcPos3 =
src + filterPos[2];
761 const uint16_t *srcPos4 =
src + filterPos[3];
762 const int16_t *filterStart1 =
filter;
763 const int16_t *filterStart2 = filterStart1 + filterSize;
764 const int16_t *filterStart3 = filterStart2 + filterSize;
765 const int16_t *filterStart4 = filterStart3 + filterSize;
766 int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
768 for (j = 0; j < filterlen; j += 8) {
771 val1 = __lasx_xvpickve2gr_w(
out, 0);
772 val2 = __lasx_xvpickve2gr_w(
out, 4);
773 val3 = __lasx_xvpickve2gr_w(
out, 2);
774 val4 = __lasx_xvpickve2gr_w(
out, 6);
775 for (; j < filterSize; j++) {
776 val1 += ((
int)srcPos1[j]) * filterStart1[j];
777 val2 += ((
int)srcPos2[j]) * filterStart2[j];
778 val3 += ((
int)srcPos3[j]) * filterStart3[j];
779 val4 += ((
int)srcPos4[j]) * filterStart4[j];
787 filter = filterStart4 + filterSize;
789 for (
i = 0;
i < res;
i++) {
791 const uint16_t *srcPos =
src + filterPos[
i];
794 for (j = 0; j < filterlen; j += 8) {
796 src0 = __lasx_xvldx(srcPos, dex);
799 out0 = __lasx_xvhaddw_d_w(out0, out0);
800 out0 = __lasx_xvhaddw_q_d(out0, out0);
801 val += __lasx_xvpickve2gr_w(out0, 0);
803 for (; j < filterSize; j++) {
810 for (
i = 0;
i < dstW;
i++) {
812 const uint16_t *srcPos =
src + filterPos[
i];
814 for (
int j = 0; j < filterSize; j++) {
824 const uint8_t *_src,
const int16_t *
filter,
825 const int32_t *filterPos,
int filterSize)
830 const uint16_t *
src = (
const uint16_t *) _src;
831 int sh =
desc->comp[0].depth - 5;
832 int max = (1 << 19) - 1;
836 __m256i
zero = __lasx_xvldi(0);
839 &&
desc->comp[0].depth<16) {
844 shift = __lasx_xvreplgr2vr_w(sh);
846 if (filterSize == 8) {
847 __m256i v_max = __lasx_xvreplgr2vr_w(
max);
848 for (
i = 0;
i <
len;
i++) {
851 for (
i = 0;
i < res;
i++) {
855 src0 = __lasx_xvld(
src + filterPos[
i], 0);
858 out0 = __lasx_xvhaddw_d_w(out0, out0);
859 out0 = __lasx_xvhaddw_q_d(out0, out0);
860 val = __lasx_xvpickve2gr_w(out0, 0);
864 }
else if (filterSize == 4) {
865 __m256i v_max = __lasx_xvreplgr2vr_w(
max);
866 for (
i = 0;
i <
len;
i++) {
869 src1 = __lasx_xvldrepl_d(
src + filterPos[0], 0);
870 src2 = __lasx_xvldrepl_d(
src + filterPos[1], 0);
871 src3 = __lasx_xvldrepl_d(
src + filterPos[2], 0);
872 src4 = __lasx_xvldrepl_d(
src + filterPos[3], 0);
875 src3 = __lasx_xvextrins_d(src3, src4, 0x10);
876 src0 = __lasx_xvpermi_q(
src1, src3, 0x02);
878 out0 = __lasx_xvhaddw_d_w(out0, out0);
879 out0 = __lasx_xvsra_w(out0,
shift);
880 out0 = __lasx_xvmin_w(out0, v_max);
881 dst[0] = __lasx_xvpickve2gr_w(out0, 0);
882 dst[1] = __lasx_xvpickve2gr_w(out0, 2);
883 dst[2] = __lasx_xvpickve2gr_w(out0, 4);
884 dst[3] = __lasx_xvpickve2gr_w(out0, 6);
889 for (
i = 0;
i < res;
i++) {
891 const uint16_t *srcPos =
src + filterPos[
i];
893 for (
int j = 0; j < filterSize; j++) {
899 }
else if (filterSize > 8) {
900 int filterlen = filterSize - 7;
902 for (
i = 0;
i <
len;
i ++) {
906 const uint16_t *srcPos1 =
src + filterPos[0];
907 const uint16_t *srcPos2 =
src + filterPos[1];
908 const uint16_t *srcPos3 =
src + filterPos[2];
909 const uint16_t *srcPos4 =
src + filterPos[3];
910 const int16_t *filterStart1 =
filter;
911 const int16_t *filterStart2 = filterStart1 + filterSize;
912 const int16_t *filterStart3 = filterStart2 + filterSize;
913 const int16_t *filterStart4 = filterStart3 + filterSize;
914 int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
916 for (j = 0; j < filterlen; j += 8) {
919 val1 = __lasx_xvpickve2gr_w(
out, 0);
920 val2 = __lasx_xvpickve2gr_w(
out, 4);
921 val3 = __lasx_xvpickve2gr_w(
out, 2);
922 val4 = __lasx_xvpickve2gr_w(
out, 6);
923 for (; j < filterSize; j++) {
924 val1 += ((
int)srcPos1[j]) * filterStart1[j];
925 val2 += ((
int)srcPos2[j]) * filterStart2[j];
926 val3 += ((
int)srcPos3[j]) * filterStart3[j];
927 val4 += ((
int)srcPos4[j]) * filterStart4[j];
935 filter = filterStart4 + filterSize;
937 for (
i = 0;
i < res;
i++) {
939 const uint16_t *srcPos =
src + filterPos[
i];
942 for (j = 0; j < filterlen; j += 8) {
944 src0 = __lasx_xvldx(srcPos, dex);
947 out0 = __lasx_xvhaddw_d_w(out0, out0);
948 out0 = __lasx_xvhaddw_q_d(out0, out0);
949 val += __lasx_xvpickve2gr_w(out0, 0);
951 for (; j < filterSize; j++) {
958 for (
i = 0;
i < dstW;
i++) {
960 const uint16_t *srcPos =
src + filterPos[
i];
962 for (
int j = 0; j < filterSize; j++) {