    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};
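
/* AVC_HORZ_FILTER_SH applies the horizontal H.264 6-tap half-pel filter:
 * the three shuffle masks gather the outer, middle and centre tap pairs
 * (see the byte-index table above), which are accumulated with weights
 * 1, -5 and 20 into 16-bit sums via signed-byte dot products. */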
#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)       \
( {                                                             \
    __m256i out0_m, tmp0_m;                                     \
                                                                \
    tmp0_m = __lasx_xvshuf_b(in1, in0, mask0);                  \
    out0_m = __lasx_xvhaddw_h_b(tmp0_m, tmp0_m);                \
    tmp0_m = __lasx_xvshuf_b(in1, in0, mask1);                  \
    out0_m = __lasx_xvdp2add_h_b(out0_m, minus5b, tmp0_m);      \
    tmp0_m = __lasx_xvshuf_b(in1, in0, mask2);                  \
    out0_m = __lasx_xvdp2add_h_b(out0_m, plus20b, tmp0_m);      \
                                                                \
    out0_m;                                                     \
} )
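
/* AVC_DOT_SH3_SH sums three signed-byte dot products,
 * in0*coeff0 + in1*coeff1 + in2*coeff2, into 16-bit lanes; it implements
 * the vertical 6-tap filter on interleaved row pairs. */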
#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)   \
( {                                                             \
    __m256i out0_m = __lasx_xvdp2_h_b(in0, coeff0);             \
    DUP2_ARG3(__lasx_xvdp2add_h_b, out0_m, in1, coeff1, out0_m, \
              in2, coeff2, out0_m, out0_m);                     \
    out0_m;                                                     \
} )
                                      uint8_t *dst, ptrdiff_t stride)
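    /* H.264 6-tap filter taps {1, -5, 20, 20, -5, 1} packed as byte pairs
     * for the dot-product instructions. */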
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;
    ptrdiff_t stride_4x = stride << 2;
    __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    __m256i src_vt7, src_vt8;
    __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
    __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
    __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    __m256i vt_out3, out0, out1, out2, out3;
    __m256i minus5b = __lasx_xvldi(0xFB);
    __m256i plus20b = __lasx_xvldi(20);

    filt0 = __lasx_xvreplgr2vr_h(filt_const0);
    filt1 = __lasx_xvreplgr2vr_h(filt_const1);
    filt2 = __lasx_xvreplgr2vr_h(filt_const2);
    src_vt0 = __lasx_xvld(src_y, 0);
    DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
              src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);
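    /* Re-bias the unsigned pixels to signed range (p - 128) so the signed
     * byte dot-product instructions can be used. */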
    src_vt0 = __lasx_xvxori_b(src_vt0, 128);
    DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
              src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);
    for (loop_cnt = 4; loop_cnt--;) {
        src_hz0 = __lasx_xvld(src_x, 0);
        src_hz3 = __lasx_xvldx(src_x, stride_3x);
        src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
        src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
        src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
        src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
        DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
                  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);
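        /* hz_out0..hz_out3 hold the 16-bit horizontal filter results; round
         * ((x + 16) >> 5), saturate and narrow them back to bytes. */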
        hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
        hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);
                  src_y, stride_3x, src_y, stride_4x,
                  src_vt5, src_vt6, src_vt7, src_vt8);
        DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
                  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
                  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
        DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_hz0, src_hz1, src_hz2, src_hz3);
        DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1, src_hz1,
                  0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3, 0x02,
                  src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1, src_hz1,
                  0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3, 0x13,
                  src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
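        /* Vertical 6-tap filter: each AVC_DOT_SH3_SH accumulates three
         * interleaved row pairs with the (1,-5), (20,20), (-5,1) tap pairs. */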
        vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h, filt0,
                                 filt1, filt2);
        vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h, filt0,
                                 filt1, filt2);
        vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h, filt0,
                                 filt1, filt2);
        vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h, filt0,
                                 filt1, filt2);
        vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
        vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);
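        /* Fuse the two half-pel estimates: widen the horizontal and vertical
         * results to 16 bits, add them and round-narrow with a shift of 1. */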
        DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out0, out2);
        DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out1, out3);
        tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
        tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);

        DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
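        /* Load the existing destination rows and round-average with the
         * interpolated result (this is the averaging variant of the MC
         * function). */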
        out0 = __lasx_xvld(dst, 0);
        out3 = __lasx_xvldx(dst, stride_3x);
        out0 = __lasx_xvpermi_q(out0, out2, 0x02);
        out1 = __lasx_xvpermi_q(out1, out3, 0x02);
        out2 = __lasx_xvilvl_d(out1, out0);
        out3 = __lasx_xvilvh_d(out1, out0);
        out0 = __lasx_xvpermi_q(out2, out3, 0x02);
        out1 = __lasx_xvpermi_q(out2, out3, 0x13);
        tmp0 = __lasx_xvavgr_bu(out0, tmp0);
        tmp1 = __lasx_xvavgr_bu(out1, tmp1);
        __lasx_xvstelm_d(tmp0, dst, 0, 0);
        __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);

        __lasx_xvstelm_d(tmp0, dst, 8, 2);
        __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);
                                      uint8_t *dst, ptrdiff_t stride)
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;
    ptrdiff_t stride_4x = stride << 2;
    __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    __m256i src_vt7, src_vt8;
    __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
    __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
    __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    __m256i vt_out3, out0, out1, out2, out3;
    __m256i minus5b = __lasx_xvldi(0xFB);
    __m256i plus20b = __lasx_xvldi(20);

    filt0 = __lasx_xvreplgr2vr_h(filt_const0);
    filt1 = __lasx_xvreplgr2vr_h(filt_const1);
    filt2 = __lasx_xvreplgr2vr_h(filt_const2);
    src_vt0 = __lasx_xvld(src_y, 0);
    DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
              src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);

    src_vt0 = __lasx_xvxori_b(src_vt0, 128);
    DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
              src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);
    for (loop_cnt = 4; loop_cnt--;) {
        src_hz0 = __lasx_xvld(src_x, 0);
        src_hz3 = __lasx_xvldx(src_x, stride_3x);
        src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
        src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
        src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
        src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
        DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
                  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);
        hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
        hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);
                  src_y, stride_3x, src_y, stride_4x,
                  src_vt5, src_vt6, src_vt7, src_vt8);
        DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
                  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
                  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
        DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_hz0, src_hz1, src_hz2, src_hz3);
        DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1,
                  src_hz1, 0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3,
                  0x02, src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1,
                  src_hz1, 0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3,
                  0x13, src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
        vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h,
                                 filt0, filt1, filt2);
        vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h,
                                 filt0, filt1, filt2);
        vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h,
                                 filt0, filt1, filt2);
        vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h,
                                 filt0, filt1, filt2);
        vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
        vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);
        DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out0, out2);
        DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out1, out3);
        tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
        tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);

        DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lasx_xvstelm_d(tmp0, dst, 0, 0);
        __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);

        __lasx_xvstelm_d(tmp0, dst, 8, 2);
        __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);
    ptrdiff_t stride_2, stride_3, stride_4;
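    /* Copy eight rows of 8 bytes from src to dst using 64-bit scalar
     * loads and stores. */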
        "slli.d %[stride_2], %[stride], 1 \n\t"
        "add.d %[stride_3], %[stride_2], %[stride] \n\t"
        "slli.d %[stride_4], %[stride_2], 1 \n\t"
        "ld.d %[tmp0], %[src], 0x0 \n\t"
        "ldx.d %[tmp1], %[src], %[stride] \n\t"
        "ldx.d %[tmp2], %[src], %[stride_2] \n\t"
        "ldx.d %[tmp3], %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "ld.d %[tmp4], %[src], 0x0 \n\t"
        "ldx.d %[tmp5], %[src], %[stride] \n\t"
        "ldx.d %[tmp6], %[src], %[stride_2] \n\t"
        "ldx.d %[tmp7], %[src], %[stride_3] \n\t"

        "st.d %[tmp0], %[dst], 0x0 \n\t"
        "stx.d %[tmp1], %[dst], %[stride] \n\t"
        "stx.d %[tmp2], %[dst], %[stride_2] \n\t"
        "stx.d %[tmp3], %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"
        "st.d %[tmp4], %[dst], 0x0 \n\t"
        "stx.d %[tmp5], %[dst], %[stride] \n\t"
        "stx.d %[tmp6], %[dst], %[stride_2] \n\t"
        "stx.d %[tmp7], %[dst], %[stride_3] \n\t"
        : [tmp0]"=&r"(tmp[0]),        [tmp1]"=&r"(tmp[1]),
          [tmp2]"=&r"(tmp[2]),        [tmp3]"=&r"(tmp[3]),
          [tmp4]"=&r"(tmp[4]),        [tmp5]"=&r"(tmp[5]),
          [tmp6]"=&r"(tmp[6]),        [tmp7]"=&r"(tmp[7]),
          [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
          [stride_4]"=&r"(stride_4),
          [dst]"+&r"(dst),            [src]"+&r"(src)
    ptrdiff_t stride_2, stride_3, stride_4;
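    /* Load eight rows from src and eight rows from tmp, average them with
     * rounding (vavgr.bu) and store the low 8 bytes of each result to dst. */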
        "slli.d %[stride_2], %[stride], 1 \n\t"
        "add.d %[stride_3], %[stride_2], %[stride] \n\t"
        "slli.d %[stride_4], %[stride_2], 1 \n\t"
        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[stride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[stride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"

        "vld $vr8, %[tmp], 0 \n\t"
        "vldx $vr9, %[tmp], %[stride] \n\t"
        "vldx $vr10, %[tmp], %[stride_2] \n\t"
        "vldx $vr11, %[tmp], %[stride_3] \n\t"
        "add.d %[tmp], %[tmp], %[stride_4] \n\t"
        "vld $vr12, %[tmp], 0 \n\t"
        "vldx $vr13, %[tmp], %[stride] \n\t"
        "vldx $vr14, %[tmp], %[stride_2] \n\t"
        "vldx $vr15, %[tmp], %[stride_3] \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vstelm.d $vr0, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr1, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr2, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr3, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr4, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr5, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr6, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr7, %[dst], 0, 0 \n\t"
          [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
          [stride_4]"=&r"(stride_4)
                                 ptrdiff_t dstStride, ptrdiff_t srcStride)
    ptrdiff_t stride_2, stride_3, stride_4;
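    /* Average eight src rows with the packed half-pel buffer (eight 8-byte
     * rows stored contiguously at half) and store 8 bytes per row to dst. */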
        "slli.d %[stride_2], %[srcStride], 1 \n\t"
        "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
        "slli.d %[stride_4], %[stride_2], 1 \n\t"
        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[srcStride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[srcStride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"

        "vld $vr8, %[half], 0x00 \n\t"
        "vld $vr9, %[half], 0x08 \n\t"
        "vld $vr10, %[half], 0x10 \n\t"
        "vld $vr11, %[half], 0x18 \n\t"
        "vld $vr12, %[half], 0x20 \n\t"
        "vld $vr13, %[half], 0x28 \n\t"
        "vld $vr14, %[half], 0x30 \n\t"
        "vld $vr15, %[half], 0x38 \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vstelm.d $vr0, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr1, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr2, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr3, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr4, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr5, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr6, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr7, %[dst], 0, 0 \n\t"
          [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
          [stride_4]"=&r"(stride_4)
        : [srcStride]"r"(srcStride),  [dstStride]"r"(dstStride)
                                 ptrdiff_t dstStride, ptrdiff_t srcStride)
    ptrdiff_t stride_2, stride_3, stride_4;
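    /* Average eight src rows with the packed half-pel buffer, then average
     * that result with eight rows read through tmp (stepped by dstStride)
     * before storing the low 8 bytes of each row to dst. */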
        "slli.d %[stride_2], %[srcStride], 1 \n\t"
        "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
        "slli.d %[stride_4], %[stride_2], 1 \n\t"
        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[srcStride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[srcStride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"

        "vld $vr8, %[half], 0x00 \n\t"
        "vld $vr9, %[half], 0x08 \n\t"
        "vld $vr10, %[half], 0x10 \n\t"
        "vld $vr11, %[half], 0x18 \n\t"
        "vld $vr12, %[half], 0x20 \n\t"
        "vld $vr13, %[half], 0x28 \n\t"
        "vld $vr14, %[half], 0x30 \n\t"
        "vld $vr15, %[half], 0x38 \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "slli.d %[stride_2], %[dstStride], 1 \n\t"
        "add.d %[stride_3], %[stride_2], %[dstStride] \n\t"
        "slli.d %[stride_4], %[stride_2], 1 \n\t"
        "vld $vr8, %[tmp], 0 \n\t"
        "vldx $vr9, %[tmp], %[dstStride] \n\t"
        "vldx $vr10, %[tmp], %[stride_2] \n\t"
        "vldx $vr11, %[tmp], %[stride_3] \n\t"
        "add.d %[tmp], %[tmp], %[stride_4] \n\t"
        "vld $vr12, %[tmp], 0 \n\t"
        "vldx $vr13, %[tmp], %[dstStride] \n\t"
        "vldx $vr14, %[tmp], %[stride_2] \n\t"
        "vldx $vr15, %[tmp], %[stride_3] \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vstelm.d $vr0, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr1, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr2, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr3, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr4, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr5, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr6, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[dstStride] \n\t"
        "vstelm.d $vr7, %[dst], 0, 0 \n\t"
          [src]"+&r"(src),            [stride_2]"=&r"(stride_2),
          [stride_3]"=&r"(stride_3),  [stride_4]"=&r"(stride_4)
        : [dstStride]"r"(dstStride),  [srcStride]"r"(srcStride)
    ptrdiff_t stride_2, stride_3, stride_4;
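    /* Copy a 16x16 block: sixteen 16-byte rows moved with vld/vldx and
     * vst/vstx. */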
        "slli.d %[stride_2], %[stride], 1 \n\t"
        "add.d %[stride_3], %[stride_2], %[stride] \n\t"
        "slli.d %[stride_4], %[stride_2], 1 \n\t"
        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[stride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[stride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"

        "vst $vr0, %[dst], 0 \n\t"
        "vstx $vr1, %[dst], %[stride] \n\t"
        "vstx $vr2, %[dst], %[stride_2] \n\t"
        "vstx $vr3, %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"
        "vst $vr4, %[dst], 0 \n\t"
        "vstx $vr5, %[dst], %[stride] \n\t"
        "vstx $vr6, %[dst], %[stride_2] \n\t"
        "vstx $vr7, %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"

        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[stride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[stride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"

        "vst $vr0, %[dst], 0 \n\t"
        "vstx $vr1, %[dst], %[stride] \n\t"
        "vstx $vr2, %[dst], %[stride_2] \n\t"
        "vstx $vr3, %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"
        "vst $vr4, %[dst], 0 \n\t"
        "vstx $vr5, %[dst], %[stride] \n\t"
        "vstx $vr6, %[dst], %[stride_2] \n\t"
        "vstx $vr7, %[dst], %[stride_3] \n\t"
        : [dst]"+&r"(dst),            [src]"+&r"(src),
          [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
          [stride_4]"=&r"(stride_4)
    ptrdiff_t stride_2, stride_3, stride_4;
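    /* 16x16 averaging copy: each of the sixteen src rows is averaged
     * (vavgr.bu) with the corresponding row read through tmp and the
     * result is stored to dst. */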
        "slli.d %[stride_2], %[stride], 1 \n\t"
        "add.d %[stride_3], %[stride_2], %[stride] \n\t"
        "slli.d %[stride_4], %[stride_2], 1 \n\t"
        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[stride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[stride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"

        "vld $vr8, %[tmp], 0 \n\t"
        "vldx $vr9, %[tmp], %[stride] \n\t"
        "vldx $vr10, %[tmp], %[stride_2] \n\t"
        "vldx $vr11, %[tmp], %[stride_3] \n\t"
        "add.d %[tmp], %[tmp], %[stride_4] \n\t"
        "vld $vr12, %[tmp], 0 \n\t"
        "vldx $vr13, %[tmp], %[stride] \n\t"
        "vldx $vr14, %[tmp], %[stride_2] \n\t"
        "vldx $vr15, %[tmp], %[stride_3] \n\t"
        "add.d %[tmp], %[tmp], %[stride_4] \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vst $vr0, %[dst], 0 \n\t"
        "vstx $vr1, %[dst], %[stride] \n\t"
        "vstx $vr2, %[dst], %[stride_2] \n\t"
        "vstx $vr3, %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"
        "vst $vr4, %[dst], 0 \n\t"
        "vstx $vr5, %[dst], %[stride] \n\t"
        "vstx $vr6, %[dst], %[stride_2] \n\t"
        "vstx $vr7, %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"

        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[stride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[stride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"

        "vld $vr8, %[tmp], 0 \n\t"
        "vldx $vr9, %[tmp], %[stride] \n\t"
        "vldx $vr10, %[tmp], %[stride_2] \n\t"
        "vldx $vr11, %[tmp], %[stride_3] \n\t"
        "add.d %[tmp], %[tmp], %[stride_4] \n\t"
        "vld $vr12, %[tmp], 0 \n\t"
        "vldx $vr13, %[tmp], %[stride] \n\t"
        "vldx $vr14, %[tmp], %[stride_2] \n\t"
        "vldx $vr15, %[tmp], %[stride_3] \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vst $vr0, %[dst], 0 \n\t"
        "vstx $vr1, %[dst], %[stride] \n\t"
        "vstx $vr2, %[dst], %[stride_2] \n\t"
        "vstx $vr3, %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"
        "vst $vr4, %[dst], 0 \n\t"
        "vstx $vr5, %[dst], %[stride] \n\t"
        "vstx $vr6, %[dst], %[stride_2] \n\t"
        "vstx $vr7, %[dst], %[stride_3] \n\t"
          [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
          [stride_4]"=&r"(stride_4)
                                 ptrdiff_t dstStride, ptrdiff_t srcStride)
    ptrdiff_t stride_2, stride_3, stride_4;
    ptrdiff_t dstride_2, dstride_3, dstride_4;
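    /* Average sixteen 16-byte src rows with the half-pel buffer (sixteen
     * contiguous 16-byte rows at half) and store the full rows to dst,
     * using separate source and destination strides. */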
        "slli.d %[stride_2], %[srcStride], 1 \n\t"
        "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
        "slli.d %[stride_4], %[stride_2], 1 \n\t"
        "slli.d %[dstride_2], %[dstStride], 1 \n\t"
        "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t"
        "slli.d %[dstride_4], %[dstride_2], 1 \n\t"

        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[srcStride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[srcStride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"

        "vld $vr8, %[half], 0x00 \n\t"
        "vld $vr9, %[half], 0x10 \n\t"
        "vld $vr10, %[half], 0x20 \n\t"
        "vld $vr11, %[half], 0x30 \n\t"
        "vld $vr12, %[half], 0x40 \n\t"
        "vld $vr13, %[half], 0x50 \n\t"
        "vld $vr14, %[half], 0x60 \n\t"
        "vld $vr15, %[half], 0x70 \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vst $vr0, %[dst], 0 \n\t"
        "vstx $vr1, %[dst], %[dstStride] \n\t"
        "vstx $vr2, %[dst], %[dstride_2] \n\t"
        "vstx $vr3, %[dst], %[dstride_3] \n\t"
        "add.d %[dst], %[dst], %[dstride_4] \n\t"
        "vst $vr4, %[dst], 0 \n\t"
        "vstx $vr5, %[dst], %[dstStride] \n\t"
        "vstx $vr6, %[dst], %[dstride_2] \n\t"
        "vstx $vr7, %[dst], %[dstride_3] \n\t"
        "add.d %[dst], %[dst], %[dstride_4] \n\t"

        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[srcStride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[srcStride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"

        "vld $vr8, %[half], 0x80 \n\t"
        "vld $vr9, %[half], 0x90 \n\t"
        "vld $vr10, %[half], 0xa0 \n\t"
        "vld $vr11, %[half], 0xb0 \n\t"
        "vld $vr12, %[half], 0xc0 \n\t"
        "vld $vr13, %[half], 0xd0 \n\t"
        "vld $vr14, %[half], 0xe0 \n\t"
        "vld $vr15, %[half], 0xf0 \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vst $vr0, %[dst], 0 \n\t"
        "vstx $vr1, %[dst], %[dstStride] \n\t"
        "vstx $vr2, %[dst], %[dstride_2] \n\t"
        "vstx $vr3, %[dst], %[dstride_3] \n\t"
        "add.d %[dst], %[dst], %[dstride_4] \n\t"
        "vst $vr4, %[dst], 0 \n\t"
        "vstx $vr5, %[dst], %[dstStride] \n\t"
        "vstx $vr6, %[dst], %[dstride_2] \n\t"
        "vstx $vr7, %[dst], %[dstride_3] \n\t"
          [stride_2]"=&r"(stride_2),    [stride_3]"=&r"(stride_3),
          [stride_4]"=&r"(stride_4),    [dstride_2]"=&r"(dstride_2),
          [dstride_3]"=&r"(dstride_3),  [dstride_4]"=&r"(dstride_4)
        : [dstStride]"r"(dstStride),    [srcStride]"r"(srcStride)
                                 ptrdiff_t dstStride, ptrdiff_t srcStride)
    ptrdiff_t stride_2, stride_3, stride_4;
    ptrdiff_t dstride_2, dstride_3, dstride_4;
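    /* Average sixteen src rows with the half-pel buffer, then average the
     * result with sixteen rows read through tmp (stepped by the destination
     * stride) before storing the full 16-byte rows to dst. */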
        "slli.d %[stride_2], %[srcStride], 1 \n\t"
        "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
        "slli.d %[stride_4], %[stride_2], 1 \n\t"
        "slli.d %[dstride_2], %[dstStride], 1 \n\t"
        "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t"
        "slli.d %[dstride_4], %[dstride_2], 1 \n\t"

        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[srcStride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[srcStride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"

        "vld $vr8, %[half], 0x00 \n\t"
        "vld $vr9, %[half], 0x10 \n\t"
        "vld $vr10, %[half], 0x20 \n\t"
        "vld $vr11, %[half], 0x30 \n\t"
        "vld $vr12, %[half], 0x40 \n\t"
        "vld $vr13, %[half], 0x50 \n\t"
        "vld $vr14, %[half], 0x60 \n\t"
        "vld $vr15, %[half], 0x70 \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vld $vr8, %[tmp], 0 \n\t"
        "vldx $vr9, %[tmp], %[dstStride] \n\t"
        "vldx $vr10, %[tmp], %[dstride_2] \n\t"
        "vldx $vr11, %[tmp], %[dstride_3] \n\t"
        "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
        "vld $vr12, %[tmp], 0 \n\t"
        "vldx $vr13, %[tmp], %[dstStride] \n\t"
        "vldx $vr14, %[tmp], %[dstride_2] \n\t"
        "vldx $vr15, %[tmp], %[dstride_3] \n\t"
        "add.d %[tmp], %[tmp], %[dstride_4] \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vst $vr0, %[dst], 0 \n\t"
        "vstx $vr1, %[dst], %[dstStride] \n\t"
        "vstx $vr2, %[dst], %[dstride_2] \n\t"
        "vstx $vr3, %[dst], %[dstride_3] \n\t"
        "add.d %[dst], %[dst], %[dstride_4] \n\t"
        "vst $vr4, %[dst], 0 \n\t"
        "vstx $vr5, %[dst], %[dstStride] \n\t"
        "vstx $vr6, %[dst], %[dstride_2] \n\t"
        "vstx $vr7, %[dst], %[dstride_3] \n\t"
        "add.d %[dst], %[dst], %[dstride_4] \n\t"

        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[srcStride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[srcStride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"

        "vld $vr8, %[half], 0x80 \n\t"
        "vld $vr9, %[half], 0x90 \n\t"
        "vld $vr10, %[half], 0xa0 \n\t"
        "vld $vr11, %[half], 0xb0 \n\t"
        "vld $vr12, %[half], 0xc0 \n\t"
        "vld $vr13, %[half], 0xd0 \n\t"
        "vld $vr14, %[half], 0xe0 \n\t"
        "vld $vr15, %[half], 0xf0 \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vld $vr8, %[tmp], 0 \n\t"
        "vldx $vr9, %[tmp], %[dstStride] \n\t"
        "vldx $vr10, %[tmp], %[dstride_2] \n\t"
        "vldx $vr11, %[tmp], %[dstride_3] \n\t"
        "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
        "vld $vr12, %[tmp], 0 \n\t"
        "vldx $vr13, %[tmp], %[dstStride] \n\t"
        "vldx $vr14, %[tmp], %[dstride_2] \n\t"
        "vldx $vr15, %[tmp], %[dstride_3] \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vst $vr0, %[dst], 0 \n\t"
        "vstx $vr1, %[dst], %[dstStride] \n\t"
        "vstx $vr2, %[dst], %[dstride_2] \n\t"
        "vstx $vr3, %[dst], %[dstride_3] \n\t"
        "add.d %[dst], %[dst], %[dstride_4] \n\t"
        "vst $vr4, %[dst], 0 \n\t"
        "vstx $vr5, %[dst], %[dstStride] \n\t"
        "vstx $vr6, %[dst], %[dstride_2] \n\t"
        "vstx $vr7, %[dst], %[dstride_3] \n\t"
          [stride_2]"=&r"(stride_2),    [stride_3]"=&r"(stride_3),
          [stride_4]"=&r"(stride_4),    [dstride_2]"=&r"(dstride_2),
          [dstride_3]"=&r"(dstride_3),  [dstride_4]"=&r"(dstride_4)
        : [dstStride]"r"(dstStride),    [srcStride]"r"(srcStride)
/* QPEL8_H_LOWPASS: horizontal 6-tap filter over two 8-pixel rows packed
 * into one 256-bit register; the shuffle masks extract the shifted copies
 * of the row, the taps 20/-5/1 are applied in 16-bit precision and the
 * result is rounded ((x + 16) >> 5) and narrowed to unsigned bytes. */
#define QPEL8_H_LOWPASS(out_v)                                                \
    src00 = __lasx_xvld(src, -2);                                             \
    src10 = __lasx_xvld(src, -2);                                             \
    src00 = __lasx_xvpermi_q(src00, src10, 0x02);                             \
    src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1);                    \
    src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2);                    \
    src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3);                    \
    src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4);                    \
    src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5);                    \
    DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01); \
    src00 = __lasx_xvaddwl_h_bu(src00, src05);                                \
    src02 = __lasx_xvmul_h(src02, h_20);                                      \
    src01 = __lasx_xvmul_h(src01, h_5);                                       \
    src02 = __lasx_xvssub_h(src02, src01);                                    \
    src02 = __lasx_xvsadd_h(src02, src00);                                    \
    src02 = __lasx_xvsadd_h(src02, h_16);                                     \
    out_v = __lasx_xvssrani_bu_h(src02, src02, 5);                            \
    int dstStride_2x = dstStride << 1;
    __m256i src00, src01, src02, src03, src04, src05, src10;
    __m256i out0, out1, out2, out3;
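    /* Note: __lasx_xvldi immediates of the form 0x4xx broadcast the low bits
     * to every halfword (0x414 -> 20, 0x405 -> 5, 0x410 -> 16). */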
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5 = __lasx_xvldi(0x405);
    __m256i h_16 = __lasx_xvldi(0x410);
    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
    __lasx_xvstelm_d(out0, dst, 0, 0);
    __lasx_xvstelm_d(out0, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    __lasx_xvstelm_d(out1, dst, 0, 0);
    __lasx_xvstelm_d(out1, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    __lasx_xvstelm_d(out2, dst, 0, 0);
    __lasx_xvstelm_d(out2, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    __lasx_xvstelm_d(out3, dst, 0, 0);
    __lasx_xvstelm_d(out3, dst + dstStride, 0, 2);
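
/* QPEL8_V_LOWPASS: vertical 6-tap filter producing two output rows per
 * invocation; adjacent source rows are paired into the two 128-bit lanes,
 * combined with weights 20/-5/1 in 16-bit precision, rounded
 * ((x + 16) >> 5) and narrowed to unsigned bytes in tmp2. */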
#define QPEL8_V_LOWPASS(src0, src1, src2, src3, src4, src5, src6,       \
                        tmp0, tmp1, tmp2, tmp3, tmp4, tmp5)             \
    tmp0 = __lasx_xvpermi_q(src0, src1, 0x02);                          \
    tmp1 = __lasx_xvpermi_q(src1, src2, 0x02);                          \
    tmp2 = __lasx_xvpermi_q(src2, src3, 0x02);                          \
    tmp3 = __lasx_xvpermi_q(src3, src4, 0x02);                          \
    tmp4 = __lasx_xvpermi_q(src4, src5, 0x02);                          \
    tmp5 = __lasx_xvpermi_q(src5, src6, 0x02);                          \
    DUP2_ARG2(__lasx_xvaddwl_h_bu, tmp2, tmp3, tmp1, tmp4, tmp2, tmp1); \
    tmp0 = __lasx_xvaddwl_h_bu(tmp0, tmp5);                             \
    tmp2 = __lasx_xvmul_h(tmp2, h_20);                                  \
    tmp1 = __lasx_xvmul_h(tmp1, h_5);                                   \
    tmp2 = __lasx_xvssub_h(tmp2, tmp1);                                 \
    tmp2 = __lasx_xvsadd_h(tmp2, tmp0);                                 \
    tmp2 = __lasx_xvsadd_h(tmp2, h_16);                                 \
    tmp2 = __lasx_xvssrani_bu_h(tmp2, tmp2, 5);                         \
    int srcStride_2x = srcStride << 1;
    int dstStride_2x = dstStride << 1;
    int srcStride_4x = srcStride << 2;
    int srcStride_3x = srcStride_2x + srcStride;
    __m256i src00, src01, src02, src03, src04, src05, src06;
    __m256i src07, src08, src09, src10, src11, src12;
    __m256i tmp00, tmp01, tmp02, tmp03, tmp04, tmp05;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5 = __lasx_xvldi(0x405);
    __m256i h_16 = __lasx_xvldi(0x410);

    src02 = __lasx_xvld(src, 0);
    DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
              srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
    src += srcStride_4x;
    DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
              srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
    src += srcStride_4x;
    DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    __lasx_xvstelm_d(tmp02, dst, 0, 0);
    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
    dst += dstStride_2x;
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    __lasx_xvstelm_d(tmp02, dst, 0, 0);
    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
    dst += dstStride_2x;
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    __lasx_xvstelm_d(tmp02, dst, 0, 0);
    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
    dst += dstStride_2x;
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    __lasx_xvstelm_d(tmp02, dst, 0, 0);
    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
    int srcStride_2x = srcStride << 1;
    int srcStride_4x = srcStride << 2;
    int dstStride_2x = dstStride << 1;
    int dstStride_4x = dstStride << 2;
    int srcStride_3x = srcStride_2x + srcStride;
    int dstStride_3x = dstStride_2x + dstStride;
    __m256i src00, src01, src02, src03, src04, src05, src06;
    __m256i src07, src08, src09, src10, src11, src12, tmp00;
    __m256i tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp08, tmp09;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5 = __lasx_xvldi(0x405);
    __m256i h_16 = __lasx_xvldi(0x410);

    src02 = __lasx_xvld(src, 0);
    DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
              srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
    src += srcStride_4x;
    DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
              srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
    src += srcStride_4x;
    DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);
    tmp06 = __lasx_xvld(dst, 0);
    DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
              dst, dstStride_3x, dst, dstStride_4x,
              tmp07, tmp02, tmp03, tmp04);
    dst += dstStride_4x;
    DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
              tmp05, tmp00);
    tmp01 = __lasx_xvldx(dst, dstStride_3x);
    dst -= dstStride_4x;

    tmp06 = __lasx_xvpermi_q(tmp06, tmp07, 0x02);
    tmp07 = __lasx_xvpermi_q(tmp02, tmp03, 0x02);
    tmp08 = __lasx_xvpermi_q(tmp04, tmp05, 0x02);
    tmp09 = __lasx_xvpermi_q(tmp00, tmp01, 0x02);
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    tmp06 = __lasx_xvavgr_bu(tmp06, tmp02);
    __lasx_xvstelm_d(tmp06, dst, 0, 0);
    __lasx_xvstelm_d(tmp06, dst + dstStride, 0, 2);
    dst += dstStride_2x;
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    tmp07 = __lasx_xvavgr_bu(tmp07, tmp02);
    __lasx_xvstelm_d(tmp07, dst, 0, 0);
    __lasx_xvstelm_d(tmp07, dst + dstStride, 0, 2);
    dst += dstStride_2x;
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    tmp08 = __lasx_xvavgr_bu(tmp08, tmp02);
    __lasx_xvstelm_d(tmp08, dst, 0, 0);
    __lasx_xvstelm_d(tmp08, dst + dstStride, 0, 2);
    dst += dstStride_2x;
                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    tmp09 = __lasx_xvavgr_bu(tmp09, tmp02);
    __lasx_xvstelm_d(tmp09, dst, 0, 0);
    __lasx_xvstelm_d(tmp09, dst + dstStride, 0, 2);
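
/* QPEL8_HV_LOWPASS_H: horizontal pass of the centre (half-H/half-V)
 * interpolation. Same tap layout as QPEL8_H_LOWPASS, but the 16-bit
 * intermediate is kept unrounded for the following vertical pass. */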
#define QPEL8_HV_LOWPASS_H(tmp)                                               \
    src00 = __lasx_xvld(src, -2);                                             \
    src10 = __lasx_xvld(src, -2);                                             \
    src00 = __lasx_xvpermi_q(src00, src10, 0x02);                             \
    src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1);                    \
    src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2);                    \
    src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3);                    \
    src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4);                    \
    src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5);                    \
    DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01); \
    src00 = __lasx_xvaddwl_h_bu(src00, src05);                                \
    src02 = __lasx_xvmul_h(src02, h_20);                                      \
    src01 = __lasx_xvmul_h(src01, h_5);                                       \
    src02 = __lasx_xvssub_h(src02, src01);                                    \
    tmp = __lasx_xvsadd_h(src02, src00);                                      \
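
/* QPEL8_HV_LOWPASS_V: vertical pass over the 16-bit horizontal sums,
 * widening to 32 bits, applying the 20/-5/1 taps, rounding with +512 and a
 * 10-bit shift, then narrowing back to unsigned bytes. */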
#define QPEL8_HV_LOWPASS_V(src0, src1, src2, src3,                        \
                           src4, src5, temp0, temp1,                      \
                           temp2, temp3, temp4, temp5,                    \
                           out)                                           \
    DUP2_ARG2(__lasx_xvaddwl_w_h, src2, src3, src1, src4, temp0, temp2);  \
    DUP2_ARG2(__lasx_xvaddwh_w_h, src2, src3, src1, src4, temp1, temp3);  \
    temp4 = __lasx_xvaddwl_w_h(src0, src5);                               \
    temp5 = __lasx_xvaddwh_w_h(src0, src5);                               \
    temp0 = __lasx_xvmul_w(temp0, w_20);                                  \
    temp1 = __lasx_xvmul_w(temp1, w_20);                                  \
    temp2 = __lasx_xvmul_w(temp2, w_5);                                   \
    temp3 = __lasx_xvmul_w(temp3, w_5);                                   \
    temp0 = __lasx_xvssub_w(temp0, temp2);                                \
    temp1 = __lasx_xvssub_w(temp1, temp3);                                \
    temp0 = __lasx_xvsadd_w(temp0, temp4);                                \
    temp1 = __lasx_xvsadd_w(temp1, temp5);                                \
    temp0 = __lasx_xvsadd_w(temp0, w_512);                                \
    temp1 = __lasx_xvsadd_w(temp1, w_512);                                \
    temp0 = __lasx_xvssrani_hu_w(temp0, temp0, 10);                       \
    temp1 = __lasx_xvssrani_hu_w(temp1, temp1, 10);                       \
    temp0 = __lasx_xvpackev_d(temp1, temp0);                              \
    out = __lasx_xvssrani_bu_h(temp0, temp0, 0);                          \
                                 ptrdiff_t dstStride, ptrdiff_t srcStride)
    __m256i src00, src01, src02, src03, src04, src05, src10;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5 = __lasx_xvldi(0x405);
    __m256i w_20 = __lasx_xvldi(0x814);
    __m256i w_5 = __lasx_xvldi(0x805);
    __m256i w_512 = {512};
    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
    w_512 = __lasx_xvreplve0_w(w_512);

    src -= srcStride << 1;
    tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
    tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21);
    tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21);
    tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21);
    tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21);
    tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21);
                       src02, src03, src04, src05, tmp0)
                       src02, src03, src04, src05, tmp2)
                       src02, src03, src04, src05, tmp4)
                       src02, src03, src04, src05, tmp6)
    __lasx_xvstelm_d(tmp0, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp0, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp2, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp2, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp4, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp4, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp6, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp6, dst, 0, 2);
    int dstStride_2x = dstStride << 1;
    int dstStride_4x = dstStride << 2;
    int dstStride_3x = dstStride_2x + dstStride;
    __m256i src00, src01, src02, src03, src04, src05, src10;
    __m256i dst00, dst01, dst0, dst1, dst2, dst3;
    __m256i out0, out1, out2, out3;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5 = __lasx_xvldi(0x405);
    __m256i h_16 = __lasx_xvldi(0x410);
    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
    src00 = __lasx_xvld(dst, 0);
    DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
              dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
    dst += dstStride_4x;
    DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, dst00);
    dst01 = __lasx_xvldx(dst, dstStride_3x);
    dst -= dstStride_4x;
    dst0 = __lasx_xvpermi_q(src00, src01, 0x02);
    dst1 = __lasx_xvpermi_q(src02, src03, 0x02);
    dst2 = __lasx_xvpermi_q(src04, src05, 0x02);
    dst3 = __lasx_xvpermi_q(dst00, dst01, 0x02);
    dst0 = __lasx_xvavgr_bu(dst0, out0);
    dst1 = __lasx_xvavgr_bu(dst1, out1);
    dst2 = __lasx_xvavgr_bu(dst2, out2);
    dst3 = __lasx_xvavgr_bu(dst3, out3);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + dstStride, 0, 2);
    __lasx_xvstelm_d(dst1, dst + dstStride_2x, 0, 0);
    __lasx_xvstelm_d(dst1, dst + dstStride_3x, 0, 2);
    dst += dstStride_4x;
    __lasx_xvstelm_d(dst2, dst, 0, 0);
    __lasx_xvstelm_d(dst2, dst + dstStride, 0, 2);
    __lasx_xvstelm_d(dst3, dst + dstStride_2x, 0, 0);
    __lasx_xvstelm_d(dst3, dst + dstStride_3x, 0, 2);
                                 ptrdiff_t dstStride, ptrdiff_t srcStride)
    __m256i src00, src01, src02, src03, src04, src05, src10;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5 = __lasx_xvldi(0x405);
    __m256i w_20 = __lasx_xvldi(0x814);
    __m256i w_5 = __lasx_xvldi(0x805);
    __m256i w_512 = {512};
    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
    ptrdiff_t dstStride_2x = dstStride << 1;
    ptrdiff_t dstStride_4x = dstStride << 2;
    ptrdiff_t dstStride_3x = dstStride_2x + dstStride;
    w_512 = __lasx_xvreplve0_w(w_512);

    src -= srcStride << 1;
    tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
    tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21);
    tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21);
    tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21);
    tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21);
    tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21);
                       src02, src03, src04, src05, tmp0)
                       src02, src03, src04, src05, tmp2)
                       src02, src03, src04, src05, tmp4)
                       src02, src03, src04, src05, tmp6)
    src00 = __lasx_xvld(dst, 0);
    DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
              dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
    dst += dstStride_4x;
    DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, tmp8);
    tmp9 = __lasx_xvldx(dst, dstStride_3x);
    dst -= dstStride_4x;
    tmp1 = __lasx_xvpermi_q(src00, src01, 0x02);
    tmp3 = __lasx_xvpermi_q(src02, src03, 0x02);
    tmp5 = __lasx_xvpermi_q(src04, src05, 0x02);
    tmp7 = __lasx_xvpermi_q(tmp8, tmp9, 0x02);
    tmp0 = __lasx_xvavgr_bu(tmp0, tmp1);
    tmp2 = __lasx_xvavgr_bu(tmp2, tmp3);
    tmp4 = __lasx_xvavgr_bu(tmp4, tmp5);
    tmp6 = __lasx_xvavgr_bu(tmp6, tmp7);
    __lasx_xvstelm_d(tmp0, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp0, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp2, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp2, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp4, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp4, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp6, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp6, dst, 0, 2);
                                 int dstStride, int srcStride)
    src += srcStride << 3;
    dst += dstStride << 3;

                                 int dstStride, int srcStride)
    src += srcStride << 3;
    dst += dstStride << 3;

                                 int dstStride, int srcStride)

                                 int dstStride, int srcStride)

                                 ptrdiff_t dstStride, ptrdiff_t srcStride)
    src += srcStride << 3;
    dst += dstStride << 3;

                                 ptrdiff_t dstStride, ptrdiff_t srcStride)
    src += srcStride << 3;
    dst += dstStride << 3;
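
/* The qpel MC wrappers below carve their half-pel scratch planes out of a
 * shared temp buffer: halfH/halfHV are offset by 64 bytes for the 8x8
 * variants and by 256 bytes for the 16x16 variants. */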
    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 64;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 64;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 64;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 64;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 64;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 64;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 64;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 64;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 256;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 256;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 256;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 256;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 256;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 256;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 256;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 256;