                                   const uint8_t *p_is_pcm,
                                   const uint8_t *q_is_pcm)
{
    ptrdiff_t stride_2x = (stride << 1);
    ptrdiff_t stride_4x = (stride << 2);
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t *p3 = src - stride_4x;
    uint8_t *p2 = src - stride_3x;
    uint8_t *p1 = src - stride_2x;
    uint8_t *q2 = src + stride_2x;
    uint8_t *q3 = src + stride_3x;
    int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
    __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
    __m128i temp2, tc_pos, tc_neg;
    __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
    __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
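    /* Second-derivative gradients across the edge (columns 0, 3, 4 and 7 of
     * the 8-pixel segment); their sums against beta drive the "filter on/off"
     * decision of the HEVC deblocking filter. */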
    dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
    dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
    dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
    dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
    dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
    dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
    dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
    dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
    p_is_pcm0 = p_is_pcm[0];
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm0 = q_is_pcm[0];
    q_is_pcm4 = q_is_pcm[1];

    DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
    p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
    p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
    d0030 = (d00 + d30) >= beta;
    d0434 = (d04 + d34) >= beta;
    DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1);
    cmp3 = __lsx_vpackev_w(cmp1, cmp0);
    cmp3 = __lsx_vseqi_w(cmp3, 0);
    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
        (!d0030 || !d0434)) {
        DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0,
                  p3_src, p2_src, p1_src, p0_src);
        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
        tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
        tc254 = (((tc4 << 2) + tc4 + 1) >> 1);

        DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1);
                  p0_src, p3_src, p2_src, p1_src, p0_src);
                  q0_src, q1_src, q2_src, q3_src);
        flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
                abs(p0[0] - q0[0]) < tc250;
        flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
                abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
                (d30 << 1) < beta20);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
                  zero, q3_src, q0_src, q1_src, q2_src, q3_src);

        flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
                abs(p0[4] - q0[4]) < tc254;
        flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
                abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
                (d34 << 1) < beta20);
        DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1);
        cmp2 = __lsx_vpackev_w(cmp1, cmp0);
        cmp2 = __lsx_vseqi_w(cmp2, 0);
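        /* flag0/flag1 select strong vs. weak filtering for the first and the
         * second 4-pixel half of the edge. */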
        if (flag0 && flag1) {
            tc_pos = __lsx_vslli_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);

            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src,
                      temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);

            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
                      p1_src, p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
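            /* Same strong-filter taps for the q side, producing q0', q1' and
             * q2' in dst3/dst4/dst5. */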
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);

            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);

            temp0 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src,
                      temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);

            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
                      q1_src, q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);

            DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = __lsx_vpickev_b(dst5, dst4);

            DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
                      dst3, dst4);
            dst5 = __lsx_vpickev_b(q2_src, q1_src);

            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
                      dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
            __lsx_vstelm_d(dst0, p2, 0, 0);
            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
            __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
            __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
        } else if (flag0 == flag1) {
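            /* Both halves take the weak filter:
             * delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4,
             * applied to p0/q0 only where |delta0| < 10 * tc. */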
            tc_neg = __lsx_vneg_h(tc_pos);
            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);
            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);

            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            temp0 = __lsx_vbitsel_v(temp2, p0_src,
                                    __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec));
            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec,
                                    q_is_pcm_vec));
            DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
                      q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
            tmp = (beta + (beta >> 1)) >> 3;
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            cmp0 = __lsx_vseqi_d(cmp0, 0);
            p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0);
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            cmp0 = __lsx_vseqi_d(cmp0, 0);
            q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2,
                      tc_neg, tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
                      p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
                      q1_src, abs_delta0, dst1, dst2, dst3, dst4);

            DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1);
            DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src,
                      dst2, dst3);
            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3,
                      dst0, dst1);

            __lsx_vstelm_d(dst0, p2, 0, 0);
            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
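            /* Remaining case: one 4-pixel half is strongly filtered and the
             * other weakly; both results are computed and blended per half
             * through the cmp2 mask. */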
        } else {
            tc_pos = __lsx_vslli_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);

            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);

            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
                      p1_src, p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);

            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);

            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
                      q1_src, q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);

            DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = __lsx_vpickev_b(dst5, dst4);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);
            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);

            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);

            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);

            tmp = (beta + (beta >> 1)) >> 3;
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));

            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
                      tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
                      q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
                      q0_src, abs_delta0, delta1, delta2, temp0, temp2);
            DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0,
                      dst3, dst4);
            dst5 = __lsx_vpickev_b(q2_src, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2,
                      dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2);

            DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
                      dst3, dst4);
            dst5 = __lsx_vpickev_b(q2_src, q1_src);

            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
                      dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);

            __lsx_vstelm_d(dst0, p2, 0, 0);
            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
            __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
            __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
        }
    }
}
                                   const uint8_t *p_is_pcm,
                                   const uint8_t *q_is_pcm)
{
    ptrdiff_t stride_2x = (stride << 1);
    ptrdiff_t stride_4x = (stride << 2);
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t *p2 = src + stride_3x;
    uint8_t *p1 = src + stride_4x;
    uint8_t *p0 = src + stride_4x + stride_3x;
    uint8_t flag0, flag1;
    int32_t dp00, dq00, dp30, dq30, d00, d30;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
    __m128i temp0, temp1;
    __m128i tc_pos, tc_neg;
    __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
    __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
    dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
    dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
    dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
    dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);

    p_is_pcm0 = p_is_pcm[0];
    q_is_pcm0 = q_is_pcm[0];

    dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
    dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
    dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
    dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);

    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm4 = q_is_pcm[1];

    DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
    p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
    p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);

    d0030 = (d00 + d30) >= beta;
    d0434 = (d04 + d34) >= beta;

    DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1);
    cmp3 = __lsx_vpackev_d(cmp1, cmp0);
    cmp3 = __lsx_vseqi_d(cmp3, 0);
    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
        (!d0030 || !d0434)) {
                  src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src);
                  src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src);

        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);

        tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
        tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
                           q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
                           q0_src, q1_src, q2_src, q3_src);
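        /* The 8x8 block around the vertical edge has been transposed above, so
         * the same per-row filter math as in the horizontal case applies. */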
        flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
                abs(p3[-1] - p3[0]) < tc250;
        flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
                abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
                (d30 << 1) < beta20);
        cmp0 = __lsx_vreplgr2vr_d(flag0);
                  p0_src, p3_src, p2_src, p1_src, p0_src);

        flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
                abs(p1[-1] - p1[0]) < tc254;
        flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
                abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
                (d34 << 1) < beta20);
                  q3_src, q0_src, q1_src, q2_src, q3_src);

        cmp1 = __lsx_vreplgr2vr_d(flag1);
        cmp2 = __lsx_vpackev_d(cmp1, cmp0);
        cmp2 = __lsx_vseqi_d(cmp2, 0);
        if (flag0 && flag1) {
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);

            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);

            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
                      p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);

            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);

            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
                      q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
        } else if (flag0 == flag1) {
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);
            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);

            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);

            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);

            tmp = ((beta + (beta >> 1)) >> 3);
            DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
                      !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
            p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);

            DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
                      (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
            q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
                      tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);

            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
                      p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
                      q1_src, abs_delta0, dst0, dst1, dst2, dst3);

            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src,
                      cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3,
                      dst0, dst1, dst2, dst3);
            DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1);

            dst4 = __lsx_vilvl_b(dst1, dst0);
            dst5 = __lsx_vilvh_b(dst1, dst0);
            dst0 = __lsx_vilvl_h(dst5, dst4);
            dst1 = __lsx_vilvh_h(dst5, dst4);
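            /* Re-interleave the four weakly filtered pixels per row (p1', p0',
             * q0', q1') back into memory order for the 4-byte stores below. */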
            __lsx_vstelm_w(dst0, src, 0, 0);
            __lsx_vstelm_w(dst0, src + stride, 0, 1);
            __lsx_vstelm_w(dst0, src + stride_2x, 0, 2);
            __lsx_vstelm_w(dst0, src + stride_3x, 0, 3);

            __lsx_vstelm_w(dst1, src, 0, 0);
            __lsx_vstelm_w(dst1, src + stride, 0, 1);
            __lsx_vstelm_w(dst1, src + stride_2x, 0, 2);
            __lsx_vstelm_w(dst1, src + stride_3x, 0, 3);
        } else {
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);

            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);

            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
                      p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);

            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);

            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
                      q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);

            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);

            tmp = (beta + (beta >> 1)) >> 3;
            DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
                      !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
            p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);

            DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
                      (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
            q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
                      tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);

            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
                      q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
                      q0_src, abs_delta0, delta1, delta2, temp0, temp2);
            DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1,
                      cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2,
                      dst0, dst1, dst2, dst3);
            DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2,
                      dst4, dst5);

            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3,
                      dst2, p0_src, cmp3, dst3, q0_src, cmp3,
                      dst0, dst1, dst2, dst3);
            DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3,
                      dst4, dst5);

            DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5,
                      dst5, dst0, dst1, dst2, dst3);

            DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6);
            DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7);
            DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2);
            DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3);
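            /* Each row writes 6 pixels across the vertical edge: a 4-byte word
             * (p2'..q0') plus a 2-byte halfword (q1', q2') at offset 4, with
             * src stepping down one row between the store pairs. */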
            __lsx_vstelm_w(dst0, src, 0, 0);
            __lsx_vstelm_h(dst2, src, 4, 0);

            __lsx_vstelm_w(dst0, src, 0, 1);
            __lsx_vstelm_h(dst2, src, 4, 2);

            __lsx_vstelm_w(dst0, src, 0, 2);
            __lsx_vstelm_h(dst2, src, 4, 4);

            __lsx_vstelm_w(dst0, src, 0, 3);
            __lsx_vstelm_h(dst2, src, 4, 6);

            __lsx_vstelm_w(dst1, src, 0, 0);
            __lsx_vstelm_h(dst3, src, 4, 0);

            __lsx_vstelm_w(dst1, src, 0, 1);
            __lsx_vstelm_h(dst3, src, 4, 2);

            __lsx_vstelm_w(dst1, src, 0, 2);
            __lsx_vstelm_h(dst3, src, 4, 4);

            __lsx_vstelm_w(dst1, src, 0, 3);
            __lsx_vstelm_h(dst3, src, 4, 6);
                                     const int32_t *tc, const uint8_t *p_is_pcm,
                                     const uint8_t *q_is_pcm)
{
    uint8_t *q0_ptr = src;
    __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    __m128i p1, p0, q0, q1;
    __m128i tc_pos, tc_neg;
    __m128i temp0, temp1, delta;

    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        tc_neg = __lsx_vneg_h(tc_pos);
        DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
        p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);

        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
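        /* Weak chroma filter: delta = clip3(-tc, tc,
         * (((q0 - p0) << 2) + p1 - q1 + 4) >> 3), added to p0 and subtracted
         * from q0 below. */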
        DUP4_ARG2(__lsx_vld, p1_ptr, 0, p0_ptr, 0, q0_ptr, 0, q1_ptr, 0,
                  p1, p0, q0, q1);

        temp0 = __lsx_vslli_h(temp0, 2);
        temp0 = __lsx_vadd_h(temp0, temp1);
        delta = __lsx_vsrari_h(temp0, 3);

        temp0 = __lsx_vadd_h(p0, delta);
        temp0 = __lsx_vclip255_h(temp0);
        p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
        temp0 = __lsx_vbitsel_v(temp0, p0, p_is_pcm_vec);

        temp1 = __lsx_vsub_h(q0, delta);
        temp1 = __lsx_vclip255_h(temp1);
        q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
        temp1 = __lsx_vbitsel_v(temp1, q0, q_is_pcm_vec);

        tc_pos = __lsx_vslei_d(tc_pos, 0);
        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
                  temp0, temp1);

        temp0 = __lsx_vpickev_b(temp1, temp0);
        __lsx_vstelm_d(temp0, p0_ptr, 0, 0);
        __lsx_vstelm_d(temp0, p0_ptr + stride, 0, 1);
    }
}
                                     const int32_t *tc, const uint8_t *p_is_pcm,
                                     const uint8_t *q_is_pcm)
{
    ptrdiff_t stride_2x = (stride << 1);
    ptrdiff_t stride_4x = (stride << 2);
    ptrdiff_t stride_3x = stride_2x + stride;
    __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    __m128i p1, p0, q0, q1;
    __m128i tc_pos, tc_neg;
    __m128i temp0, temp1, delta;

    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        tc_neg = __lsx_vneg_h(tc_pos);

        DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
        p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
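        /* The chroma columns are gathered with an 8x4 byte transpose so the
         * same delta computation as in the horizontal case can be applied to
         * the vertical edge. */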
                  src + stride_3x, 0, src4, src5, src6, src7);
        LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7,
                           p1, p0, q0, q1);

        temp0 = __lsx_vslli_h(temp0, 2);
        temp0 = __lsx_vadd_h(temp0, temp1);
        delta = __lsx_vsrari_h(temp0, 3);

        temp0 = __lsx_vadd_h(p0, delta);
        temp1 = __lsx_vsub_h(q0, delta);
        DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1);
        DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
                  q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0,
                  q_is_pcm_vec, temp0, temp1);

        tc_pos = __lsx_vslei_d(tc_pos, 0);
        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
                  temp0, temp1);

        temp0 = __lsx_vpackev_b(temp1, temp0);

        __lsx_vstelm_h(temp0, src, 0, 0);
        __lsx_vstelm_h(temp0, src + stride, 0, 1);
        __lsx_vstelm_h(temp0, src + stride_2x, 0, 2);
        __lsx_vstelm_h(temp0, src + stride_3x, 0, 3);
        src += stride_4x;
        __lsx_vstelm_h(temp0, src, 0, 4);
        __lsx_vstelm_h(temp0, src + stride, 0, 5);
        __lsx_vstelm_h(temp0, src + stride_2x, 0, 6);
        __lsx_vstelm_h(temp0, src + stride_3x, 0, 7);
    }
}
                                             const int16_t *sao_offset_val,

    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
    __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0;
    __m128i const1 = __lsx_vldi(1);

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
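    /* SAO edge offset, horizontal direction: for every pixel the signs of
     * (center - left) and (center - right) are summed, biased by 2 and mapped
     * through edge_idx to pick one of the five SAO offsets. */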
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);

        src += src_stride_2x;
        src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
        src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
        src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);

                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        offset = __lsx_vadd_b(diff_minus10, diff_minus11);
                  src_minus10, src_minus11);

        dst0 = __lsx_vxori_b(dst0, 128);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;

    src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
    src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
    src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);

    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    offset = __lsx_vadd_b(diff_minus10, diff_minus11);

    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset,
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
                                             const int16_t *sao_offset_val,

    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
    __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);

        src += src_stride_2x;
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros,
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros,
                  src_minus11, shuf2, src_plus10, src_plus11);
        DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
                  src_plus10, src_minus10, src_plus10);

                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        offset = __lsx_vadd_b(diff_minus10, diff_minus11);
                  src_minus10, src_minus11);

        dst0 = __lsx_vxori_b(dst0, 128);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11,
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_plus10, src_plus11);
    DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
              src_plus10, src_minus10, src_plus10);

    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    offset = __lsx_vadd_b(diff_minus10, diff_minus11);

    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
                                                const int16_t *sao_offset_val,

    const uint8_t *src_minus1;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);

    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13;
    __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3;
    __m128i src_minus10, src_minus11, src_minus12, src_minus13;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    __m128i src_zero0, src_zero1, src_zero2, src_zero3;
    __m128i src_plus10, src_plus11, src_plus12, src_plus13;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
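    /* Wide blocks are processed in 16-column stripes, four rows at a time;
     * shuf1/shuf2 rebuild the center and right-neighbour vectors from the
     * unaligned load that starts at src - 1. */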
    src_minus1 = src - 1;
    src_minus10 = __lsx_vld(src_minus1, 0);
    DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
              src_stride_2x, src_minus11, src_minus12);
    src_minus13 = __lsx_vldx(src_minus1, src_stride_3x);

    for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
        dst_ptr = dst + v_cnt;
        src10 = __lsx_vld(src_minus1, 0);
        DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
                  src_stride_2x, src11, src12);
        src13 = __lsx_vldx(src_minus1, src_stride_3x);
        DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11,
                  src_minus11, shuf1, src12, src_minus12, shuf1, src13,
                  src_minus13, shuf1, src_zero0, src_zero1,
                  src_zero2, src_zero3);
        DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf2, src11,
                  src_minus11, shuf2, src12, src_minus12, shuf2, src13,
                  src_minus13, shuf2, src_plus10, src_plus11,
                  src_plus12, src_plus13);
        DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                  src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
                  cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
        DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                  src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
                  cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
        DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                  cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                  diff_plus11);
        DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                  cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                  diff_plus13);
        DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                  src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
                  cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
        DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                  src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
                  cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
        DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                  cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                  cmp_plus11);
        DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                  cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                  cmp_plus13);
        DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                  cmp_minus11, diff_plus11, const1, cmp_plus11,
                  diff_minus10, diff_plus10, diff_minus11, diff_plus11);
        DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                  diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                  cmp_minus13, diff_plus13, const1, cmp_plus13,
                  diff_minus12, diff_plus12, diff_minus13, diff_plus13);

        DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                  diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                  diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                  offset_mask3);
        DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask2, 2, offset_mask3, 2, offset_mask0,
                  offset_mask1, offset_mask2, offset_mask3);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                  sao_offset, sao_offset, offset_mask0, offset_mask0,
                  offset_mask0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                  sao_offset, sao_offset, offset_mask1, offset_mask1,
                  offset_mask1);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                  sao_offset, sao_offset, offset_mask2, offset_mask2,
                  offset_mask2);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                  sao_offset, sao_offset, offset_mask3, offset_mask3,
                  offset_mask3);

        DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
                  src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
                  src_zero2, src_zero3);
        DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                  offset_mask1, src_zero2, offset_mask2, src_zero3,
                  offset_mask3, dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                  128, dst0, dst1, dst2, dst3);

        src_minus10 = src10;
        src_minus11 = src11;
        src_minus12 = src12;
        src_minus13 = src13;

        __lsx_vst(dst0, dst_ptr, 0);
        __lsx_vst(dst1, dst_ptr + dst_stride, 0);
        __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0);
        __lsx_vst(dst3, dst_ptr + dst_stride_3x, 0);
    }
    src += src_stride_4x;
    dst += dst_stride_4x;
                                              const int16_t *sao_offset_val,

    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src_minus11, src10, src11;
    __m128i src_zero0, src_zero1;
    __m128i offset_mask0, offset_mask1;

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
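    /* Vertical-direction edge offset: each pixel is compared against the
     * pixels directly above and below, two output rows per iteration. */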
              src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11);

        src += src_stride_2x;
        DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
                  src11, src_minus11, src10, src10, src_minus10, src_zero0,
                  src_minus11, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);

        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;

    DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
              src11, src_minus11, src10, src10, src_minus10, src_zero0,
              src_minus11, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
              offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
                                              const int16_t *sao_offset_val,

    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_zero0, src_zero1, dst0;
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src_minus11, src10, src11;
    __m128i offset_mask0, offset_mask1;

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11);

        src += src_stride_2x;
        DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
                  src11, src_minus11, src10, src10, src_minus10, src_zero0,
                  src_minus11, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);

        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;

    DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
              src11, src_minus11, src10, src10, src_minus10, src_zero0,
              src_minus11, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
              offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
    const uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;

    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13;
    __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
    __m128i src12, dst2, src13, dst3;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
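    /* 16-column stripes, four rows per iteration; the rows above and below the
     * current group act as the comparison neighbours, so only two extra rows
     * are carried over between iterations. */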
    for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
        src = src_orig + v_cnt;
        dst = dst_orig + v_cnt;

                  src_minus10, src_minus11);

        for (h_cnt = (height >> 2); h_cnt--;) {
                      src, src_stride_3x, src, src_stride_4x,
                      src10, src11, src12, src13);
            DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11,
                      src10, src10, src_minus11, src10, src11, cmp_minus10,
                      cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11,
                      src12, src13, cmp_minus12, cmp_plus12,
                      cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11,
                      src10, src10, src_minus11, src10, src11, cmp_minus10,
                      cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11,
                      src12, src13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0,
                      offset_mask0, offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

            src_minus10 = src12;
            DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128,
                      src12, 128, src_minus11, src10, src11, src12);
            DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10,
                      offset_mask1, src11, offset_mask2, src12,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);
            src_minus11 = src13;

            __lsx_vst(dst0, dst, 0);
            __lsx_vstx(dst1, dst, dst_stride);
            __lsx_vstx(dst2, dst, dst_stride_2x);
            __lsx_vstx(dst3, dst, dst_stride_3x);
            src += src_stride_4x;
            dst += dst_stride_4x;
        }
    }
                                              const int16_t *sao_offset_val,

    const uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus11, src10, src11;
    __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
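    /* Diagonal edge offset: each pixel is compared with its upper-left and
     * lower-right neighbours; shuf1/shuf2 shift the loaded rows so the three
     * diagonal samples line up in the same byte lanes. */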
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
                  src_plus0, src_plus1);

        DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1,
                  src_minus11, src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1,
                  src_zero1, src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);

        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
              src_plus0, src_plus1);

    DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
                                   const int16_t *sao_offset_val,

    const uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10,
              src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    src_orig += src_stride_2x;

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
              shuf1, src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
              src_plus10, src_plus11);

    DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11,
              src_minus11, src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
              src_minus11, cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
              diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
              offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    src_minus10 = src10;
    src_minus11 = src11;

    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
    dst += dst_stride_2x;

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
              src_plus10, src_plus11);
    DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);

    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
    const uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;

    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13, src_minus14, src_plus13;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
    __m128i src12, src_minus12, dst2, src13, src_minus13, dst3;
    __m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2;
    __m128i src_zero3, sao_offset, src_plus12;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    src_minus11 = __lsx_vld(src_orig, 0);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src_minus12, src_minus13);
    src_minus14 = __lsx_vldx(src_orig, src_stride_3x);
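
    /* Each pass of the loop below filters a 16-pixel-wide, four-row block:
     * the four rows of the current 16-byte column are loaded and stitched
     * with vshuf_b (shuf1/shuf2) to the rows kept from the previous column
     * (src_minus11..src_minus14), so every pixel's neighbours along the
     * filter direction are available without per-pixel unaligned loads. */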
    for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
        src_minus10 = __lsx_vld(src_orig - src_stride, 0);

        src10 = __lsx_vld(src_orig, 0);
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig,
                  src_stride_2x, src11, src12);
        src13 = __lsx_vldx(src_orig, src_stride_3x);
        src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1);

        DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
                  src_minus12, shuf1, src12, src_minus13, shuf1,
                  src13, src_minus14, shuf1, src_zero0, src_zero1,
                  src_zero2, src_zero3);
        DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12,
                  src_minus13, shuf2, src_plus10, src_plus11);
        src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2);

        DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                  src_plus10, src_zero1, src_minus11, src_zero1,
                  src_plus11, cmp_minus10, cmp_plus10,
                  cmp_minus11, cmp_plus11);
        DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                  src_plus12, src_zero3, src_minus13, src_zero3,
                  src_plus13, cmp_minus12, cmp_plus12,
                  cmp_minus13, cmp_plus13);
        DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                  cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                  diff_plus11);
        DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                  cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                  diff_plus13);
        DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                  src_plus10, src_zero1, src_minus11, src_zero1,
                  src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                  cmp_plus11);
        DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                  src_plus12, src_zero3, src_minus13, src_zero3,
                  src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                  cmp_plus13);
        DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                  cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                  cmp_plus11);
        DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                  cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                  cmp_plus13);
        DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                  cmp_minus11, diff_plus11, const1, cmp_plus11,
                  diff_minus10, diff_plus10, diff_minus11, diff_plus11);
        DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                  diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                  cmp_minus13, diff_plus13, const1, cmp_plus13,
                  diff_minus12, diff_plus12, diff_minus13, diff_plus13);

        DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                  diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                  diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                  offset_mask3);
        DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask2, 2, offset_mask3, 2, offset_mask0,
                  offset_mask1, offset_mask2, offset_mask3);
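
        /* offset_mask0..3 now hold the per-pixel class 0..4; the paired
         * vshuf_b calls below first translate the class through edge_idx
         * (the HEVC mapping {1, 2, 0, 3, 4}) and then pick the matching
         * byte from the replicated sao_offset table, leaving the final
         * per-pixel offsets in offset_mask0..3. */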
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                  sao_offset, sao_offset, offset_mask0, offset_mask0,
                  offset_mask0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                  sao_offset, sao_offset, offset_mask1, offset_mask1,
                  offset_mask1);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                  sao_offset, sao_offset, offset_mask2, offset_mask2,
                  offset_mask2);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                  sao_offset, sao_offset, offset_mask3, offset_mask3,
                  offset_mask3);

        DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2,
                  128, src_zero3, 128, src_zero0, src_zero1, src_zero2,
                  src_zero3);
        DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                  offset_mask1, src_zero2, offset_mask2, src_zero3,
                  offset_mask3, dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                  128, dst0, dst1, dst2, dst3);
        src_minus11 = src10;
        src_minus12 = src11;
        src_minus13 = src12;
        src_minus14 = src13;

        __lsx_vst(dst0, dst_orig, 0);
        __lsx_vstx(dst1, dst_orig, dst_stride);
        __lsx_vstx(dst2, dst_orig, dst_stride_2x);
        __lsx_vstx(dst3, dst_orig, dst_stride_3x);

    src += src_stride_4x;
    dst += dst_stride_4x;
                                   const int16_t *sao_offset_val,

    const uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_zero0, src_zero1, dst0;
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    src_orig += src_stride_2x;

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
              shuf1, src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_minus10, src_minus11);

    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
              src_minus11, cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
              diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
              offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    src_minus10 = src10;
    src_minus11 = src11;

    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
    dst += dst_stride_2x;

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_minus10, src_minus11);

    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
    dst += dst_stride_2x;
                                   const int16_t *sao_offset_val,

    const uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i src_zero0, src_zero1, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    src_orig += src_stride_2x;

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
              shuf1, src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_minus10, src_minus11);

    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
              src_minus11, cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
              diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
              offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    src_minus10 = src10;
    src_minus11 = src11;

    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
    dst += dst_stride_2x;

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_minus10, src_minus11);

    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
                                   const int16_t *sao_offset_val,

    const uint8_t *src_orig;

    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i dst0, dst1, dst2, dst3;
    __m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
    __m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
    __m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
    __m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
    __m128i src_plus10, src_plus11, src_plus12, src_plus13;
    __m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    src_minus11 = __lsx_vld(src_orig, 0);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src_plus10, src_plus11);
    src_plus12 = __lsx_vldx(src_orig, src_stride_3x);
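
    /* Second 16-pixel-wide variant: the classification below is the same
     * as in the previous loop, but the above/below neighbours are paired
     * along the other diagonal.  Here the row-above load uses a byte
     * offset of 2, the "minus" neighbours of rows 1..3 are assembled with
     * shuf2 from the rows kept in src_plus10..src_plus12, and the "plus"
     * neighbour of the last row is the directly loaded row below. */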
    for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
        src_minus10 = __lsx_vld(src_orig - src_stride, 2);
        src_plus13 = __lsx_vldx(src_orig, src_stride_4x);

        src10 = __lsx_vld(src_orig, 0);
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src11, src12);
        src13 = __lsx_vldx(src_orig, src_stride_3x);

        DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
                  src_plus10, shuf1, src12, src_plus11, shuf1, src13,
                  src_plus12, shuf1, src_zero0, src_zero1, src_zero2,
                  src_zero3);
        src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2);
        DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12,
                  src_plus11, shuf2, src_minus12, src_minus13);

        DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                  src_plus10, src_zero1, src_minus11, src_zero1,
                  src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                  cmp_plus11);
        DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                  src_plus12, src_zero3, src_minus13, src_zero3,
                  src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                  cmp_plus13);
        DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                  cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                  diff_plus11);
        DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                  cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                  diff_plus13);
        DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                  src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
                  cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
        DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                  src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
                  cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
        DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                  cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                  cmp_plus11);
        DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                  cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                  cmp_plus13);
        DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                  cmp_minus11, diff_plus11, const1, cmp_plus11,
                  diff_minus10, diff_plus10, diff_minus11, diff_plus11);
        DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                  diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                  cmp_minus13, diff_plus13, const1, cmp_plus13,
                  diff_minus12, diff_plus12, diff_minus13, diff_plus13);

        DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                  diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                  diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                  offset_mask3);
        DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask2, 2, offset_mask3, 2, offset_mask0,
                  offset_mask1, offset_mask2, offset_mask3);

        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                  sao_offset, sao_offset, offset_mask0, offset_mask0,
                  offset_mask0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                  sao_offset, sao_offset, offset_mask1, offset_mask1,
                  offset_mask1);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                  sao_offset, sao_offset, offset_mask2, offset_mask2,
                  offset_mask2);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                  sao_offset, sao_offset, offset_mask3, offset_mask3,
                  offset_mask3);

        DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
                  src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
                  src_zero2, src_zero3);
        DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                  offset_mask1, src_zero2, offset_mask2, src_zero3,
                  offset_mask3, dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                  128, dst0, dst1, dst2, dst3);

        src_minus11 = src10;

        __lsx_vst(dst0, dst_orig, 0);
        __lsx_vstx(dst1, dst_orig, dst_stride);
        __lsx_vstx(dst2, dst_orig, dst_stride_2x);
        __lsx_vstx(dst3, dst_orig, dst_stride_3x);

    src += src_stride_4x;
    dst += dst_stride_4x;

                                   ptrdiff_t stride_dst,
                                   const int16_t *sao_offset_val,