              unsigned A1, unsigned A2,
              const void *_r, const void *_g, const void *_b, int y,
        uint32_t *dest = (uint32_t *) _dest;
        const uint32_t *r = (const uint32_t *) _r;
        const uint32_t *g = (const uint32_t *) _g;
        const uint32_t *b = (const uint32_t *) _b;
        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
        uint8_t *dest = (uint8_t *) _dest;
        const uint8_t *r = (const uint8_t *) _r;
        const uint8_t *g = (const uint8_t *) _g;
        const uint8_t *b = (const uint8_t *) _b;

#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
        dest[i * 6 + 0] = r_b[Y1];
        dest[i * 6 + 1] = g[Y1];
        dest[i * 6 + 2] = b_r[Y1];
        dest[i * 6 + 3] = r_b[Y2];
        dest[i * 6 + 4] = g[Y2];
        dest[i * 6 + 5] = b_r[Y2];
        uint16_t *dest = (uint16_t *) _dest;
        const uint16_t *r = (const uint16_t *) _r;
        const uint16_t *g = (const uint16_t *) _g;
        const uint16_t *b = (const uint16_t *) _b;
        int dr1, dg1, db1, dr2, dg2, db2;
        dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
        dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
        uint8_t *dest = (uint8_t *) _dest;
        const uint8_t *r = (const uint8_t *) _r;
        const uint8_t *g = (const uint8_t *) _g;
        const uint8_t *b = (const uint8_t *) _b;
        int dr1, dg1, db1, dr2, dg2, db2;
            dr1 = dg1 = d32[(i * 2 + 0) & 7];
            db1 = d64[(i * 2 + 0) & 7];
            dr2 = dg2 = d32[(i * 2 + 1) & 7];
            db2 = d64[(i * 2 + 1) & 7];
            dr1 = db1 = d128[(i * 2 + 0) & 7];
            dg1 = d64[(i * 2 + 0) & 7];
            dr2 = db2 = d128[(i * 2 + 1) & 7];
            dg2 = d64[(i * 2 + 1) & 7];
            dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
                      ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
            dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
            dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
#define WRITE_YUV2RGB_LSX(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
    Y1 = __lsx_vpickve2gr_w(vec_y1, t1);                                \
    Y2 = __lsx_vpickve2gr_w(vec_y2, t2);                                \
    U  = __lsx_vpickve2gr_w(vec_u, t3);                                 \
    V  = __lsx_vpickve2gr_w(vec_v, t4);                                 \
    r  = c->table_rV[V];                                                \
    g  = (c->table_gU[U] + c->table_gV[V]);                             \
    b  = c->table_bU[U];                                                \
    yuv2rgb_write(dest, count, Y1, Y2, 0, 0,                            \
                  r, g, b, y, target, 0);                               \
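/* Vertical-filter (X) path: accumulate the luma and chroma filter taps into
 * 32-bit even/odd lanes with __lsx_vmaddwev_w_h/__lsx_vmaddwod_w_h, shift the
 * sums down by 19, add the table headroom, then write pixels through
 * WRITE_YUV2RGB_LSX. */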
                      const int16_t **lumSrc, int lumFilterSize,
                      const int16_t *chrFilter, const int16_t **chrUSrc,
                      const int16_t **chrVSrc, int chrFilterSize,
                      const int16_t **alpSrc, uint8_t *dest, int dstW,
    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;
    __m128i headroom = __lsx_vreplgr2vr_w(head);

    for (i = 0; i < len; i++) {
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
        __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2, yh_ev1, yh_ev2, yh_od1, yh_od2;
        __m128i u_ev1, u_ev2, u_od1, u_od2, v_ev1, v_ev2, v_od1, v_od2, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);
        for (j = 0; j < lumFilterSize; j++) {
            temp = __lsx_vldrepl_h((lumFilter + j), 0);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 32, lumSrc[j] + count_lum,
            yl_ev1 = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
            yl_od1 = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
            yh_ev1 = __lsx_vmaddwev_w_h(yh_ev1, temp, l_src3);
            yh_od1 = __lsx_vmaddwod_w_h(yh_od1, temp, l_src3);
            yl_ev2 = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
            yl_od2 = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
            yh_ev2 = __lsx_vmaddwev_w_h(yh_ev2, temp, l_src4);
            yh_od2 = __lsx_vmaddwod_w_h(yh_od2, temp, l_src4);
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 16, chrVSrc[j] + count, 16,
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
            u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
            v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
            v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
            u_ev2 = __lsx_vmaddwev_w_h(u_ev2, temp, u_src2);
            u_od2 = __lsx_vmaddwod_w_h(u_od2, temp, u_src2);
            v_ev2 = __lsx_vmaddwev_w_h(v_ev2, temp, v_src2);
            v_od2 = __lsx_vmaddwod_w_h(v_od2, temp, v_src2);
        yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
        yh_ev1 = __lsx_vsrai_w(yh_ev1, 19);
        yl_od1 = __lsx_vsrai_w(yl_od1, 19);
        yh_od1 = __lsx_vsrai_w(yh_od1, 19);
        u_ev1  = __lsx_vsrai_w(u_ev1, 19);
        v_ev1  = __lsx_vsrai_w(v_ev1, 19);
        u_od1  = __lsx_vsrai_w(u_od1, 19);
        v_od1  = __lsx_vsrai_w(v_od1, 19);
        yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
        yh_ev2 = __lsx_vsrai_w(yh_ev2, 19);
        yl_od2 = __lsx_vsrai_w(yl_od2, 19);
        yh_od2 = __lsx_vsrai_w(yh_od2, 19);
        u_ev2  = __lsx_vsrai_w(u_ev2, 19);
        v_ev2  = __lsx_vsrai_w(v_ev2, 19);
        u_od2  = __lsx_vsrai_w(u_od2, 19);
        v_od2  = __lsx_vsrai_w(v_od2, 19);
        u_ev1  = __lsx_vadd_w(u_ev1, headroom);
        v_ev1  = __lsx_vadd_w(v_ev1, headroom);
        u_od1  = __lsx_vadd_w(u_od1, headroom);
        v_od1  = __lsx_vadd_w(v_od1, headroom);
        u_ev2  = __lsx_vadd_w(u_ev2, headroom);
        v_ev2  = __lsx_vadd_w(v_ev2, headroom);
        u_od2  = __lsx_vadd_w(u_od2, headroom);
        v_od2  = __lsx_vadd_w(v_od2, headroom);
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, l_src2, u_src1, v_src1;
        __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2;
        __m128i u_ev1, u_od1, v_ev1, v_od1, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);
        for (j = 0; j < lumFilterSize; j++) {
            temp = __lsx_vldrepl_h((lumFilter + j), 0);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
            yl_ev1 = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
            yl_od1 = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
            yl_ev2 = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
            yl_od2 = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
            u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
            v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
            v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
        yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
        yl_od1 = __lsx_vsrai_w(yl_od1, 19);
        u_ev1  = __lsx_vsrai_w(u_ev1, 19);
        v_ev1  = __lsx_vsrai_w(v_ev1, 19);
        u_od1  = __lsx_vsrai_w(u_od1, 19);
        v_od1  = __lsx_vsrai_w(v_od1, 19);
        yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
        yl_od2 = __lsx_vsrai_w(yl_od2, 19);
        u_ev1  = __lsx_vadd_w(u_ev1, headroom);
        v_ev1  = __lsx_vadd_w(v_ev1, headroom);
        u_od1  = __lsx_vadd_w(u_od1, headroom);
        v_od1  = __lsx_vadd_w(v_od1, headroom);
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);
        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);
        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);
        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);
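    /* Scalar tail: handle the remaining pixel pairs that do not fill an LSX
     * vector, using the same filter accumulation and table lookups. */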
    for (; count < len_count; count++) {
        for (j = 0; j < lumFilterSize; j++) {
            Y1 += lumSrc[j][count * 2] * lumFilter[j];
            Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
        for (j = 0; j < chrFilterSize; j++) {
            U += chrUSrc[j][count] * chrFilter[j];
            V += chrVSrc[j][count] * chrFilter[j];
                      r, g, b, y, target, 0);
                      const int16_t *ubuf[2], const int16_t *vbuf[2],
                      const int16_t *abuf[2], uint8_t *dest, int dstW,
                      int yalpha, int uvalpha, int y,
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
    int yalpha1   = 4096 - yalpha;
    int uvalpha1  = 4096 - uvalpha;
    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;
    __m128i v_yalpha1  = __lsx_vreplgr2vr_w(yalpha1);
    __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
    __m128i v_yalpha   = __lsx_vreplgr2vr_w(yalpha);
    __m128i v_uvalpha  = __lsx_vreplgr2vr_w(uvalpha);
    __m128i headroom   = __lsx_vreplgr2vr_w(head);
    __m128i zero       = __lsx_vldi(0);
    for (i = 0; i < len; i += 8) {
        int c_dex = count << 1;
        __m128i y0_h, y0_l, y0, u0, v0;
        __m128i y1_h, y1_l, y1, u1, v1;
        __m128i y_l, y_h, u, v;

        DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
                  buf1, i_dex, y0, u0, v0, y1);
        DUP2_ARG2(__lsx_vldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
        DUP2_ARG2(__lsx_vsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
        DUP2_ARG1(__lsx_vexth_w_h, y0, y1, y0_h, y1_h);
        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
        u0   = __lsx_vmul_w(u0, v_uvalpha1);
        v0   = __lsx_vmul_w(v0, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        y_h  = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
        u    = __lsx_vmadd_w(u0, v_uvalpha, u1);
        v    = __lsx_vmadd_w(v0, v_uvalpha, v1);
        y_l  = __lsx_vsrai_w(y_l, 19);
        y_h  = __lsx_vsrai_w(y_h, 19);
        u    = __lsx_vsrai_w(u, 19);
        v    = __lsx_vsrai_w(v, 19);
        __m128i y0_l, y0, u0, v0;
        __m128i y1_l, y1, u1, v1;

        y0 = __lsx_vldx(buf0, i_dex);
        u0 = __lsx_vldrepl_d((ubuf0 + count), 0);
        v0 = __lsx_vldrepl_d((vbuf0 + count), 0);
        y1 = __lsx_vldx(buf1, i_dex);
        u1 = __lsx_vldrepl_d((ubuf1 + count), 0);
        v1 = __lsx_vldrepl_d((vbuf1 + count), 0);
        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        u0   = __lsx_vmul_w(u0, v_uvalpha1);
        v0   = __lsx_vmul_w(v0, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        u    = __lsx_vmadd_w(u0, v_uvalpha, u1);
        v    = __lsx_vmadd_w(v0, v_uvalpha, v1);
        y_l  = __lsx_vsrai_w(y_l, 19);
        u    = __lsx_vsrai_w(u, 19);
        v    = __lsx_vsrai_w(v, 19);
    for (; count < len_count; count++) {
        int Y1 = (buf0[count * 2]     * yalpha1 +
                  buf1[count * 2]     * yalpha) >> 19;
        int Y2 = (buf0[count * 2 + 1] * yalpha1 +
                  buf1[count * 2 + 1] * yalpha) >> 19;
        int U  = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
        int V  = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;
                      r, g, b, y, target, 0);
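/* Unscaled (1) path: a single input row.  When uvalpha < 2048 the chroma of
 * the current row is used directly; otherwise it is averaged with the second
 * chroma row before the table lookups. */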
                      const int16_t *ubuf[2], const int16_t *vbuf[2],
                      const int16_t *abuf0, uint8_t *dest, int dstW,
    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
    int len = (dstW - 7);
    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;

    if (uvalpha < 2048) {
        __m128i headroom = __lsx_vreplgr2vr_h(head);
        for (i = 0; i < len; i += 8) {
            int c_dex = count << 1;
            __m128i src_y, src_u, src_v;
            __m128i u, v, uv, y_l, y_h;

            src_y = __lsx_vldx(buf0, i_dex);
            DUP2_ARG2(__lsx_vldx, ubuf0, c_dex, vbuf0, c_dex, src_u, src_v);
            src_y = __lsx_vsrari_h(src_y, 7);
            src_u = __lsx_vsrari_h(src_u, 7);
            src_v = __lsx_vsrari_h(src_v, 7);
            y_l   = __lsx_vsllwil_w_h(src_y, 0);
            y_h   = __lsx_vexth_w_h(src_y);
            uv    = __lsx_vilvl_h(src_v, src_u);
            v     = __lsx_vaddwod_w_h(uv, headroom);
            __m128i src_y, src_u, src_v;
            __m128i y_l, u, v, uv;

            src_y = __lsx_vldx(buf0, i_dex);
            src_u = __lsx_vldrepl_d((ubuf0 + count), 0);
            src_v = __lsx_vldrepl_d((vbuf0 + count), 0);
            y_l   = __lsx_vsrari_h(src_y, 7);
            y_l   = __lsx_vsllwil_w_h(y_l, 0);
            uv    = __lsx_vilvl_h(src_v, src_u);
            uv    = __lsx_vsrari_h(uv, 7);
            v     = __lsx_vaddwod_w_h(uv, headroom);
        for (; count < len_count; count++) {
            int Y1 = (buf0[count * 2    ] + 64) >> 7;
            int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
            int U  = (ubuf0[count] + 64) >> 7;
            int V  = (vbuf0[count] + 64) >> 7;
                          r, g, b, y, target, 0);
        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
        __m128i headroom = __lsx_vreplgr2vr_w(HEADROOM);

        for (i = 0; i < len; i += 8) {
            int c_dex = count << 1;
            __m128i src_y, src_u0, src_v0, src_u1, src_v1;
            __m128i y_l, y_h, u1, u2, v1, v2;

            DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
                      ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
            src_v1 = __lsx_vldx(vbuf1, c_dex);
            src_y  = __lsx_vsrari_h(src_y, 7);
            u1  = __lsx_vaddwev_w_h(src_u0, src_u1);
            v1  = __lsx_vaddwod_w_h(src_u0, src_u1);
            u2  = __lsx_vaddwev_w_h(src_v0, src_v1);
            v2  = __lsx_vaddwod_w_h(src_v0, src_v1);
            y_l = __lsx_vsllwil_w_h(src_y, 0);
            y_h = __lsx_vexth_w_h(src_y);
            u1  = __lsx_vsrari_w(u1, 8);
            v1  = __lsx_vsrari_w(v1, 8);
            u2  = __lsx_vsrari_w(u2, 8);
            v2  = __lsx_vsrari_w(v2, 8);
            __m128i src_y, src_u0, src_v0, src_u1, src_v1;

            src_y  = __lsx_vldx(buf0, i_dex);
            src_u0 = __lsx_vldrepl_d((ubuf0 + count), 0);
            src_v0 = __lsx_vldrepl_d((vbuf0 + count), 0);
            src_u1 = __lsx_vldrepl_d((ubuf1 + count), 0);
            src_v1 = __lsx_vldrepl_d((vbuf1 + count), 0);
            src_u0 = __lsx_vilvl_h(src_u1, src_u0);
            src_v0 = __lsx_vilvl_h(src_v1, src_v0);
            src_y  = __lsx_vsrari_h(src_y, 7);
            src_y  = __lsx_vsllwil_w_h(src_y, 0);
            uv     = __lsx_vilvl_h(src_v0, src_u0);
            uv     = __lsx_vhaddw_w_h(uv, uv);
            uv     = __lsx_vsrari_w(uv, 8);
        for (; count < len_count; count++) {
            int Y1 = (buf0[count * 2    ] + 64) >> 7;
            int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
            int U  = (ubuf0[count] + ubuf1[count] + 128) >> 8;
            int V  = (vbuf0[count] + vbuf1[count] + 128) >> 8;
                          r, g, b, y, target, 0);
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                               \
static void name ## ext ## _X_lsx(SwsInternal *c, const int16_t *lumFilter,          \
                                  const int16_t **lumSrc, int lumFilterSize,         \
                                  const int16_t *chrFilter, const int16_t **chrUSrc, \
                                  const int16_t **chrVSrc, int chrFilterSize,        \
                                  const int16_t **alpSrc, uint8_t *dest, int dstW,   \
    name ## base ## _X_template_lsx(c, lumFilter, lumSrc, lumFilterSize,             \
                                    chrFilter, chrUSrc, chrVSrc, chrFilterSize,      \
                                    alpSrc, dest, dstW, y, fmt, hasAlpha);           \
#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                              \
YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                                       \
static void name ## ext ## _2_lsx(SwsInternal *c, const int16_t *buf[2],             \
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],    \
                                  const int16_t *abuf[2], uint8_t *dest, int dstW,   \
                                  int yalpha, int uvalpha, int y)                    \
    name ## base ## _2_template_lsx(c, buf, ubuf, vbuf, abuf, dest,                  \
                                    dstW, yalpha, uvalpha, y, fmt, hasAlpha);        \
#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)                                \
YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                                      \
static void name ## ext ## _1_lsx(SwsInternal *c, const int16_t *buf0,               \
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],    \
                                  const int16_t *abuf0, uint8_t *dest, int dstW,     \
                                  int uvalpha, int y)                                \
    name ## base ## _1_template_lsx(c, buf0, ubuf, vbuf, abuf0, dest,                \
                                    dstW, uvalpha, y, fmt, hasAlpha);                \
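/* Full-chroma writer: emits one pixel at a time, clipping R/G/B and, for the
 * low-depth packed formats, applying error-diffusion or ordered dithering
 * before packing the components. */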
#if CONFIG_SWSCALE_ALPHA
                   uint8_t *dest, int i, int R, int A, int G, int B,
    if ((R | G | B) & 0xC0000000) {
        dest[0] = hasAlpha ? A : 255;
        dest[3] = hasAlpha ? A : 255;
        dest[0] = hasAlpha ? A : 255;
        dest[3] = hasAlpha ? A : 255;
        R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2]) >> 4;
        G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2]) >> 4;
        B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2]) >> 4;
        c->dither_error[0][i] = err[0];
        c->dither_error[1][i] = err[1];
        c->dither_error[2][i] = err[2];
        r = R >> (isrgb8 ? 5 : 7);
        g = G >> (isrgb8 ? 5 : 6);
        b = B >> (isrgb8 ? 6 : 7);
        err[0] = R - r * (isrgb8 ? 36 : 255);
        err[1] = G - g * (isrgb8 ? 36 : 85);
        err[2] = B - b * (isrgb8 ? 85 : 255);
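/* Ordered-dither patterns used by the 8-bit and 4-bit packed outputs:
 * A_DITHER is an additive pattern, X_DITHER an xor-based one, both indexed by
 * the pixel coordinates. */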
#define A_DITHER(u,v)   (((((u)+((v)*236))*119)&0xff))
#define X_DITHER(u,v)   (((((u)^((v)*237))*181)&0x1ff)/2)
            dest[0] = r + 2*g + 8*b;
            dest[0] = b + 2*g + 8*r;
            dest[0] = r + 8*g + 64*b;
            dest[0] = b + 4*g + 32*r;
#define YUVTORGB_SETUP_LSX                              \
    int y_offset = c->yuv2rgb_y_offset;                 \
    int y_coeff  = c->yuv2rgb_y_coeff;                  \
    int v2r_coe  = c->yuv2rgb_v2r_coeff;                \
    int v2g_coe  = c->yuv2rgb_v2g_coeff;                \
    int u2g_coe  = c->yuv2rgb_u2g_coeff;                \
    int u2b_coe  = c->yuv2rgb_u2b_coeff;                \
    __m128i offset = __lsx_vreplgr2vr_w(y_offset);      \
    __m128i coeff  = __lsx_vreplgr2vr_w(y_coeff);       \
    __m128i v2r    = __lsx_vreplgr2vr_w(v2r_coe);       \
    __m128i v2g    = __lsx_vreplgr2vr_w(v2g_coe);       \
    __m128i u2g    = __lsx_vreplgr2vr_w(u2g_coe);       \
    __m128i u2b    = __lsx_vreplgr2vr_w(u2b_coe);       \
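/* Vectorised YUV -> RGB conversion: subtract the luma offset, scale by the
 * luma coefficient, then form R, G and B as the scaled luma plus the chroma
 * contributions (v2r, u2g + v2g, u2b). */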
#define YUVTORGB_LSX(y, u, v, R, G, B, offset, coeff,                         \
                     y_temp, v2r, v2g, u2g, u2b)                              \
    y = __lsx_vsub_w(y, offset);                                              \
    y = __lsx_vmul_w(y, coeff);                                               \
    y = __lsx_vadd_w(y, y_temp);                                              \
    R = __lsx_vmadd_w(y, v, v2r);                                             \
    v = __lsx_vmadd_w(y, v, v2g);                                             \
    G = __lsx_vmadd_w(v, u, u2g);                                             \
    B = __lsx_vmadd_w(y, u, u2b);                                             \
#define WRITE_FULL_A_LSX(r, g, b, a, t1, s)                                   \
    R = __lsx_vpickve2gr_w(r, t1);                                            \
    G = __lsx_vpickve2gr_w(g, t1);                                            \
    B = __lsx_vpickve2gr_w(b, t1);                                            \
    A = __lsx_vpickve2gr_w(a, t1);                                            \
    A = av_clip_uint8(A);                                                     \
    yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err); \
#define WRITE_FULL_LSX(r, g, b, t1, s)                                        \
    R = __lsx_vpickve2gr_w(r, t1);                                            \
    G = __lsx_vpickve2gr_w(g, t1);                                            \
    B = __lsx_vpickve2gr_w(b, t1);                                            \
    yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
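/* Full-chroma vertical-filter (X) path: accumulate the filter taps at full
 * chroma resolution, shift the sums down by 10, convert with YUVTORGB_LSX and
 * store each pixel through WRITE_FULL_LSX / WRITE_FULL_A_LSX. */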
                      const int16_t **lumSrc, int lumFilterSize,
                      const int16_t *chrFilter, const int16_t **chrUSrc,
                      const int16_t **chrVSrc, int chrFilterSize,
                      const int16_t **alpSrc, uint8_t *dest,
    int i, j, B, G, R, A;
    int a_temp = 1 << 18;
    int tempc  = templ - (128 << 19);
    __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
    for (i = 0; i < len; i += 8) {
        __m128i l_src, u_src, v_src;
        __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
        __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;

        y_ev = y_od = __lsx_vreplgr2vr_w(templ);
        u_ev = u_od = v_ev = v_od = __lsx_vreplgr2vr_w(tempc);
        for (j = 0; j < lumFilterSize; j++) {
            temp  = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src = __lsx_vldx(lumSrc[j], n);
            y_ev  = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
            y_od  = __lsx_vmaddwod_w_h(y_od, l_src, temp);
        for (j = 0; j < chrFilterSize; j++) {
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n,
                      v_src, temp, u_ev, v_ev);
                      v_src, temp, u_od, v_od);
        y_ev = __lsx_vsrai_w(y_ev, 10);
        y_od = __lsx_vsrai_w(y_od, 10);
        u_ev = __lsx_vsrai_w(u_ev, 10);
        u_od = __lsx_vsrai_w(u_od, 10);
        v_ev = __lsx_vsrai_w(v_ev, 10);
        v_od = __lsx_vsrai_w(v_od, 10);
                     y_temp, v2r, v2g, u2g, u2b);
                     y_temp, v2r, v2g, u2g, u2b);
            __m128i a_src, a_ev, a_od;

            a_ev = a_od = __lsx_vreplgr2vr_w(a_temp);
            for (j = 0; j < lumFilterSize; j++) {
                temp  = __lsx_vldrepl_h(lumFilter + j, 0);
                a_src = __lsx_vldx(alpSrc[j], n);
                a_ev  = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
                a_od  = __lsx_vmaddwod_w_h(a_od, a_src, temp);
            a_ev = __lsx_vsrai_w(a_ev, 19);
            a_od = __lsx_vsrai_w(a_od, 19);
    if (dstW - i >= 4) {
        __m128i l_src, u_src, v_src;
        __m128i y_ev, u_ev, v_ev, uv, temp;
        __m128i R_ev, G_ev, B_ev;

        y_ev = __lsx_vreplgr2vr_w(templ);
        u_ev = v_ev = __lsx_vreplgr2vr_w(tempc);
        for (j = 0; j < lumFilterSize; j++) {
            temp  = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src = __lsx_vldx(lumSrc[j], n);
            l_src = __lsx_vilvl_h(l_src, l_src);
            y_ev  = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
        for (j = 0; j < chrFilterSize; j++) {
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
            uv   = __lsx_vilvl_h(v_src, u_src);
            u_ev = __lsx_vmaddwev_w_h(u_ev, uv, temp);
            v_ev = __lsx_vmaddwod_w_h(v_ev, uv, temp);
        y_ev = __lsx_vsrai_w(y_ev, 10);
        u_ev = __lsx_vsrai_w(u_ev, 10);
        v_ev = __lsx_vsrai_w(v_ev, 10);
                     y_temp, v2r, v2g, u2g, u2b);
            __m128i a_src, a_ev;

            a_ev = __lsx_vreplgr2vr_w(a_temp);
            for (j = 0; j < lumFilterSize; j++) {
                temp  = __lsx_vldrepl_h(lumFilter + j, 0);
                a_src = __lsx_vldx(alpSrc[j], n);
                a_src = __lsx_vilvl_h(a_src, a_src);
                a_ev  = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
            a_ev = __lsx_vsrai_w(a_ev, 19);
    for (; i < dstW; i++) {
        int V, U = V = tempc;

        for (j = 0; j < lumFilterSize; j++) {
            Y += lumSrc[j][i] * lumFilter[j];
        for (j = 0; j < chrFilterSize; j++) {
            U += chrUSrc[j][i] * chrFilter[j];
            V += chrVSrc[j][i] * chrFilter[j];
            for (j = 0; j < lumFilterSize; j++) {
                A += alpSrc[j][i] * lumFilter[j];
        R = (unsigned)Y + V * v2r_coe;
        G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
        B = (unsigned)Y + U * u2b_coe;
        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
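/* Full-chroma two-row (2) path: blend two input rows with yalpha/uvalpha in
 * 32-bit precision, shift down by 10, then convert and store as above. */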
                      const int16_t *ubuf[2], const int16_t *vbuf[2],
                      const int16_t *abuf[2], uint8_t *dest, int dstW,
                      int yalpha, int uvalpha, int y,
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
                  *abuf0 = hasAlpha ? abuf[0] : NULL,
                  *abuf1 = hasAlpha ? abuf[1] : NULL;
    int yalpha1  = 4096 - yalpha;
    int uvalpha1 = 4096 - uvalpha;
    int uvtemp   = 128 << 19;
    int atemp    = 1 << 18;
    int ytemp    = 1 << 21;
    __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
    __m128i v_yalpha1  = __lsx_vreplgr2vr_w(yalpha1);
    __m128i v_uvalpha  = __lsx_vreplgr2vr_w(uvalpha);
    __m128i v_yalpha   = __lsx_vreplgr2vr_w(yalpha);
    __m128i uv         = __lsx_vreplgr2vr_w(uvtemp);
    __m128i a_bias     = __lsx_vreplgr2vr_w(atemp);
    __m128i y_temp     = __lsx_vreplgr2vr_w(ytemp);
    for (i = 0; i < len; i += 8) {
        __m128i b0, b1, ub0, ub1, vb0, vb1;
        __m128i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
        __m128i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
        __m128i y_l, y_h, v_l, v_h, u_l, u_h;
        __m128i R_l, R_h, G_l, G_h, B_l, B_h;

        DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0,
                  n, ubuf1, n, b0, b1, ub0, ub1);
        DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0, vb1);
        DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
                  u0_l, u1_l, v0_l, v1_l);
        DUP4_ARG1(__lsx_vexth_w_h, ub0, ub1, vb0, vb1,
                  u0_h, u1_h, v0_h, v1_h);
        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
        u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
        u0_h = __lsx_vmul_w(u0_h, v_uvalpha1);
        v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
        v0_h = __lsx_vmul_w(v0_h, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        y_h  = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
        u_l  = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
        u_h  = __lsx_vmadd_w(u0_h, v_uvalpha, u1_h);
        v_l  = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
        v_h  = __lsx_vmadd_w(v0_h, v_uvalpha, v1_h);
        u_l  = __lsx_vsub_w(u_l, uv);
        u_h  = __lsx_vsub_w(u_h, uv);
        v_l  = __lsx_vsub_w(v_l, uv);
        v_h  = __lsx_vsub_w(v_h, uv);
        y_l  = __lsx_vsrai_w(y_l, 10);
        y_h  = __lsx_vsrai_w(y_h, 10);
        u_l  = __lsx_vsrai_w(u_l, 10);
        u_h  = __lsx_vsrai_w(u_h, 10);
        v_l  = __lsx_vsrai_w(v_l, 10);
        v_h  = __lsx_vsrai_w(v_h, 10);
                     y_temp, v2r, v2g, u2g, u2b);
                     y_temp, v2r, v2g, u2g, u2b);
            __m128i a0, a1, a0_l, a0_h;
            __m128i a_l, a_h, a1_l, a1_h;

            a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
            a_h = __lsx_vmadd_w(a_bias, a0_h, v_yalpha1);
            a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
            a_h = __lsx_vmadd_w(a_h, v_yalpha, a1_h);
            a_l = __lsx_vsrai_w(a_l, 19);
            a_h = __lsx_vsrai_w(a_h, 19);
    if (dstW - i >= 4) {
        __m128i b0, b1, ub0, ub1, vb0, vb1;
        __m128i y0_l, y1_l, u0_l;
        __m128i v0_l, u1_l, v1_l;
        __m128i y_l, u_l, v_l;
        __m128i R_l, G_l, B_l;

        DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0, n,
                  ubuf1, n, b0, b1, ub0, ub1);
        DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0, vb1);
        DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
                  u0_l, u1_l, v0_l, v1_l);
        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
        v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        u_l  = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
        v_l  = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
        u_l  = __lsx_vsub_w(u_l, uv);
        v_l  = __lsx_vsub_w(v_l, uv);
        y_l  = __lsx_vsrai_w(y_l, 10);
        u_l  = __lsx_vsrai_w(u_l, 10);
        v_l  = __lsx_vsrai_w(v_l, 10);
                     y_temp, v2r, v2g, u2g, u2b);
            __m128i a0, a1, a0_l;

            a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
            a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
            a_l = __lsx_vsrai_w(a_l, 19);
    for (; i < dstW; i++) {
        int Y = ( buf0[i] * yalpha1  +  buf1[i] * yalpha           ) >> 10;
        int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha - uvtemp) >> 10;
        int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha - uvtemp) >> 10;
            A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;
        R = (unsigned)Y + V * v2r_coe;
        G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
        B = (unsigned)Y + U * u2b_coe;
        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
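/* Full-chroma single-row (1) path: scale the 7-bit input up to the 10-bit
 * domain (<< 2), optionally averaging the two chroma rows, then convert and
 * store as above. */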
                      const int16_t *ubuf[2], const int16_t *vbuf[2],
                      const int16_t *abuf0, uint8_t *dest, int dstW,
    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
    int ytemp = 1 << 21;
    __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);

    if (uvalpha < 2048) {
        int uvtemp = 128 << 7;
        __m128i uv   = __lsx_vreplgr2vr_w(uvtemp);
        __m128i bias = __lsx_vreplgr2vr_w(bias_int);
        for (i = 0; i < len; i += 8) {
            __m128i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
            __m128i y_l, y_h, u_l, u_h, v_l, v_h;
            __m128i R_l, R_h, G_l, G_h, B_l, B_h;

            vb  = __lsx_vldx(vbuf0, n);
            y_l = __lsx_vsllwil_w_h(b, 2);
            y_h = __lsx_vexth_w_h(b);
            DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
            y_h = __lsx_vslli_w(y_h, 2);
            u_l = __lsx_vsub_w(ub_l, uv);
            u_h = __lsx_vsub_w(ub_h, uv);
            v_l = __lsx_vsub_w(vb_l, uv);
            v_h = __lsx_vsub_w(vb_h, uv);
            u_l = __lsx_vslli_w(u_l, 2);
            u_h = __lsx_vslli_w(u_h, 2);
            v_l = __lsx_vslli_w(v_l, 2);
            v_h = __lsx_vslli_w(v_h, 2);
                         y_temp, v2r, v2g, u2g, u2b);
                         y_temp, v2r, v2g, u2g, u2b);
                a_src = __lsx_vld(abuf0 + i, 0);
                a_l   = __lsx_vsllwil_w_h(a_src, 0);
                a_h   = __lsx_vexth_w_h(a_src);
                a_l   = __lsx_vadd_w(a_l, bias);
                a_h   = __lsx_vadd_w(a_h, bias);
                a_l   = __lsx_vsrai_w(a_l, 7);
                a_h   = __lsx_vsrai_w(a_h, 7);
        if (dstW - i >= 4) {
            __m128i b, ub, vb, ub_l, vb_l;
            __m128i y_l, u_l, v_l;
            __m128i R_l, G_l, B_l;

            vb  = __lsx_vldx(vbuf0, n);
            y_l = __lsx_vsllwil_w_h(b, 0);
            DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
            y_l = __lsx_vslli_w(y_l, 2);
            u_l = __lsx_vsub_w(ub_l, uv);
            v_l = __lsx_vsub_w(vb_l, uv);
            u_l = __lsx_vslli_w(u_l, 2);
            v_l = __lsx_vslli_w(v_l, 2);
                         y_temp, v2r, v2g, u2g, u2b);
                a_src = __lsx_vldx(abuf0, n);
                a_src = __lsx_vsllwil_w_h(a_src, 0);
                a_l   = __lsx_vadd_w(bias, a_src);
                a_l   = __lsx_vsrai_w(a_l, 7);
        for (; i < dstW; i++) {
            int Y = buf0[i] << 2;
            int U = (ubuf0[i] - uvtemp) << 2;
            int V = (vbuf0[i] - uvtemp) << 2;
                A = (abuf0[i] + 64) >> 7;
            R = (unsigned)Y + V * v2r_coe;
            G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
            B = (unsigned)Y + U * u2b_coe;
            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
        int uvtemp = 128 << 8;
        __m128i uv   = __lsx_vreplgr2vr_w(uvtemp);
        __m128i zero = __lsx_vldi(0);
        __m128i bias = __lsx_vreplgr2vr_h(bias_int);
        for (i = 0; i < len; i += 8) {
            __m128i b, ub0, ub1, vb0, vb1;
            __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od;
            __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;

            DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
                      ubuf1, n, b, ub0, vb0, ub1);
            vb1  = __lsx_vldx(vbuf1, n);
            y_ev = __lsx_vaddwev_w_h(b, zero);
            y_od = __lsx_vaddwod_w_h(b, zero);
            DUP2_ARG2(__lsx_vaddwev_w_h, ub0, vb0, ub1, vb1, u_ev, v_ev);
            DUP2_ARG2(__lsx_vaddwod_w_h, ub0, vb0, ub1, vb1, u_od, v_od);
            DUP2_ARG2(__lsx_vslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
            DUP4_ARG2(__lsx_vsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
                      u_ev, u_od, v_ev, v_od);
            DUP4_ARG2(__lsx_vslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
                      u_ev, u_od, v_ev, v_od);
                         y_temp, v2r, v2g, u2g, u2b);
                         y_temp, v2r, v2g, u2g, u2b);
                a_src = __lsx_vld(abuf0 + i, 0);
                a_ev  = __lsx_vaddwev_w_h(bias, a_src);
                a_od  = __lsx_vaddwod_w_h(bias, a_src);
                a_ev  = __lsx_vsrai_w(a_ev, 7);
                a_od  = __lsx_vsrai_w(a_od, 7);
        if (dstW - i >= 4) {
            __m128i b, ub0, ub1, vb0, vb1;
            __m128i y_l, u_l, v_l;
            __m128i R_l, G_l, B_l;

            DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
                      ubuf1, n, b, ub0, vb0, ub1);
            vb1 = __lsx_vldx(vbuf1, n);
            y_l = __lsx_vsllwil_w_h(b, 0);
            y_l = __lsx_vslli_w(y_l, 2);
            DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, vb0, 0, ub1, 0, vb1, 0,
                      ub0, vb0, ub1, vb1);
            DUP2_ARG2(__lsx_vadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
            u_l = __lsx_vsub_w(u_l, uv);
            v_l = __lsx_vsub_w(v_l, uv);
            u_l = __lsx_vslli_w(u_l, 1);
            v_l = __lsx_vslli_w(v_l, 1);
                         y_temp, v2r, v2g, u2g, u2b);
                a_src = __lsx_vld(abuf0 + i, 0);
                a_src = __lsx_vilvl_h(a_src, a_src);
                a_l   = __lsx_vaddwev_w_h(bias, a_src);
                a_l   = __lsx_vsrai_w(a_l, 7);
        for (; i < dstW; i++) {
            int Y = buf0[i] << 2;
            int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
            int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;
                A = (abuf0[i] + 64) >> 7;
            R = (unsigned)Y + V * v2r_coe;
            G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
            B = (unsigned)Y + U * u2b_coe;
            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
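/* Instantiate the per-format wrappers; the alpha-carrying variants only write
 * alpha when the build has CONFIG_SWSCALE_ALPHA and the context requests it. */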
               CONFIG_SWSCALE_ALPHA && c->needAlpha)
               CONFIG_SWSCALE_ALPHA && c->needAlpha)
               CONFIG_SWSCALE_ALPHA && c->needAlpha)
               CONFIG_SWSCALE_ALPHA && c->needAlpha)
#if CONFIG_SWSCALE_ALPHA
    } else if (is16BPS(dstFormat)) {
    } else if (isNBPS(dstFormat)) {
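        /* Full-chroma path: pick the packed RGB writers that match the
         * destination pixel format. */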
        switch (c->dstFormat) {
            c->yuv2packedX = yuv2rgba32_full_X_lsx;
            c->yuv2packed2 = yuv2rgba32_full_2_lsx;
            c->yuv2packed1 = yuv2rgba32_full_1_lsx;
#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2rgba32_full_X_lsx;
                c->yuv2packed2 = yuv2rgba32_full_2_lsx;
                c->yuv2packed1 = yuv2rgba32_full_1_lsx;
                c->yuv2packedX = yuv2rgbx32_full_X_lsx;
                c->yuv2packed2 = yuv2rgbx32_full_2_lsx;
                c->yuv2packed1 = yuv2rgbx32_full_1_lsx;
            c->yuv2packedX = yuv2argb32_full_X_lsx;
            c->yuv2packed2 = yuv2argb32_full_2_lsx;
            c->yuv2packed1 = yuv2argb32_full_1_lsx;
#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2argb32_full_X_lsx;
                c->yuv2packed2 = yuv2argb32_full_2_lsx;
                c->yuv2packed1 = yuv2argb32_full_1_lsx;
                c->yuv2packedX = yuv2xrgb32_full_X_lsx;
                c->yuv2packed2 = yuv2xrgb32_full_2_lsx;
                c->yuv2packed1 = yuv2xrgb32_full_1_lsx;
            c->yuv2packedX = yuv2bgra32_full_X_lsx;
            c->yuv2packed2 = yuv2bgra32_full_2_lsx;
            c->yuv2packed1 = yuv2bgra32_full_1_lsx;
#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2bgra32_full_X_lsx;
                c->yuv2packed2 = yuv2bgra32_full_2_lsx;
                c->yuv2packed1 = yuv2bgra32_full_1_lsx;
                c->yuv2packedX = yuv2bgrx32_full_X_lsx;
                c->yuv2packed2 = yuv2bgrx32_full_2_lsx;
                c->yuv2packed1 = yuv2bgrx32_full_1_lsx;
            c->yuv2packedX = yuv2abgr32_full_X_lsx;
            c->yuv2packed2 = yuv2abgr32_full_2_lsx;
            c->yuv2packed1 = yuv2abgr32_full_1_lsx;
#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2abgr32_full_X_lsx;
                c->yuv2packed2 = yuv2abgr32_full_2_lsx;
                c->yuv2packed1 = yuv2abgr32_full_1_lsx;
                c->yuv2packedX = yuv2xbgr32_full_X_lsx;
                c->yuv2packed2 = yuv2xbgr32_full_2_lsx;
                c->yuv2packed1 = yuv2xbgr32_full_1_lsx;
            c->yuv2packedX = yuv2rgb24_full_X_lsx;
            c->yuv2packed2 = yuv2rgb24_full_2_lsx;
            c->yuv2packed1 = yuv2rgb24_full_1_lsx;
            c->yuv2packedX = yuv2bgr24_full_X_lsx;
            c->yuv2packed2 = yuv2bgr24_full_2_lsx;
            c->yuv2packed1 = yuv2bgr24_full_1_lsx;
            c->yuv2packedX = yuv2bgr4_byte_full_X_lsx;
            c->yuv2packed2 = yuv2bgr4_byte_full_2_lsx;
            c->yuv2packed1 = yuv2bgr4_byte_full_1_lsx;
            c->yuv2packedX = yuv2rgb4_byte_full_X_lsx;
            c->yuv2packed2 = yuv2rgb4_byte_full_2_lsx;
            c->yuv2packed1 = yuv2rgb4_byte_full_1_lsx;
            c->yuv2packedX = yuv2bgr8_full_X_lsx;
            c->yuv2packed2 = yuv2bgr8_full_2_lsx;
            c->yuv2packed1 = yuv2bgr8_full_1_lsx;
            c->yuv2packedX = yuv2rgb8_full_X_lsx;
            c->yuv2packed2 = yuv2rgb8_full_2_lsx;
            c->yuv2packed1 = yuv2rgb8_full_1_lsx;
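        /* Table-driven (non-full) packed RGB writers. */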
        switch (c->dstFormat) {
#if CONFIG_SWSCALE_ALPHA
                c->yuv2packed1 = yuv2rgbx32_1_lsx;
                c->yuv2packed2 = yuv2rgbx32_2_lsx;
                c->yuv2packedX = yuv2rgbx32_X_lsx;
#if CONFIG_SWSCALE_ALPHA
                c->yuv2packed1 = yuv2rgbx32_1_1_lsx;
                c->yuv2packed2 = yuv2rgbx32_1_2_lsx;
                c->yuv2packedX = yuv2rgbx32_1_X_lsx;
            c->yuv2packed1 = yuv2rgb24_1_lsx;
            c->yuv2packed2 = yuv2rgb24_2_lsx;
            c->yuv2packedX = yuv2rgb24_X_lsx;
            c->yuv2packed1 = yuv2bgr24_1_lsx;
            c->yuv2packed2 = yuv2bgr24_2_lsx;
            c->yuv2packedX = yuv2bgr24_X_lsx;
            c->yuv2packed1 = yuv2rgb16_1_lsx;
            c->yuv2packed2 = yuv2rgb16_2_lsx;
            c->yuv2packedX = yuv2rgb16_X_lsx;
            c->yuv2packed1 = yuv2rgb15_1_lsx;
            c->yuv2packed2 = yuv2rgb15_2_lsx;
            c->yuv2packedX = yuv2rgb15_X_lsx;
            c->yuv2packed1 = yuv2rgb12_1_lsx;
            c->yuv2packed2 = yuv2rgb12_2_lsx;
            c->yuv2packedX = yuv2rgb12_X_lsx;
            c->yuv2packed1 = yuv2rgb8_1_lsx;
            c->yuv2packed2 = yuv2rgb8_2_lsx;
            c->yuv2packedX = yuv2rgb8_X_lsx;
            c->yuv2packed1 = yuv2rgb4_1_lsx;
            c->yuv2packed2 = yuv2rgb4_2_lsx;
            c->yuv2packedX = yuv2rgb4_X_lsx;
            c->yuv2packed1 = yuv2rgb4b_1_lsx;
            c->yuv2packed2 = yuv2rgb4b_2_lsx;
            c->yuv2packedX = yuv2rgb4b_X_lsx;