26 const uint8_t *p_is_pcm,
const uint8_t *q_is_pcm)
37 int32_t dp00, dq00, dp30, dq30, d00, d30;
39 int32_t dp04, dq04, dp34, dq34, d04, d34;
40 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
42 uint64_t dst_val0, dst_val1;
43 v16u8 dst0, dst1, dst2, dst3, dst4, dst5;
44 v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
49 v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
51 v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
53 dp00 =
abs(p2[0] - (p1[0] << 1) + p0[0]);
54 dq00 =
abs(q2[0] - (
q1[0] << 1) +
q0[0]);
55 dp30 =
abs(p2[3] - (p1[3] << 1) + p0[3]);
56 dq30 =
abs(q2[3] - (
q1[3] << 1) +
q0[3]);
59 dp04 =
abs(p2[4] - (p1[4] << 1) + p0[4]);
60 dq04 =
abs(q2[4] - (
q1[4] << 1) +
q0[4]);
61 dp34 =
abs(p2[7] - (p1[7] << 1) + p0[7]);
62 dq34 =
abs(q2[7] - (
q1[7] << 1) +
q0[7]);
66 p_is_pcm0 = p_is_pcm[0];
67 p_is_pcm4 = p_is_pcm[1];
68 q_is_pcm0 = q_is_pcm[0];
69 q_is_pcm4 = q_is_pcm[1];
71 cmp0 = __msa_fill_d(p_is_pcm0);
72 cmp1 = __msa_fill_d(p_is_pcm4);
73 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
74 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
76 d0030 = (d00 + d30) >= beta;
77 d0434 = (d04 + d34) >= beta;
79 cmp0 = (v2i64) __msa_fill_w(d0030);
80 cmp1 = (v2i64) __msa_fill_w(d0434);
81 cmp3 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
82 cmp3 = (v2i64) __msa_ceqi_w((v4i32) cmp3, 0);
84 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
91 cmp0 = __msa_fill_d(q_is_pcm0);
92 cmp1 = __msa_fill_d(q_is_pcm4);
93 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
94 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
99 tc250 = ((tc0 * 5 + 1) >> 1);
101 tc254 = ((tc4 * 5 + 1) >> 1);
103 cmp0 = (v2i64) __msa_fill_h(tc0);
104 cmp1 = (v2i64) __msa_fill_h(tc4);
107 p3_src, p2_src, p1_src, p0_src);
113 flag0 =
abs(p3[0] - p0[0]) +
abs(q3[0] -
q0[0]) < beta30 &&
114 abs(p0[0] -
q0[0]) < tc250;
115 flag0 = flag0 && (
abs(p3[3] - p0[3]) +
abs(q3[3] -
q0[3]) < beta30 &&
116 abs(p0[3] -
q0[3]) < tc250 && (d00 << 1) < beta20 &&
117 (d30 << 1) < beta20);
119 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
121 q0_src, q1_src, q2_src, q3_src);
122 flag1 =
abs(p3[4] - p0[4]) +
abs(q3[4] -
q0[4]) < beta30 &&
123 abs(p0[4] -
q0[4]) < tc254;
124 flag1 = flag1 && (
abs(p3[7] - p0[7]) +
abs(q3[7] -
q0[7]) < beta30 &&
125 abs(p0[7] -
q0[7]) < tc254 && (d04 << 1) < beta20 &&
126 (d34 << 1) < beta20);
128 cmp0 = (v2i64) __msa_fill_w(flag0);
129 cmp1 = (v2i64) __msa_fill_w(flag1);
130 cmp2 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
131 cmp2 = (v2i64) __msa_ceqi_w((v4i32) cmp2, 0);
133 if (flag0 && flag1) {
139 temp0 = (p1_src + p0_src + q0_src);
140 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
141 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
142 temp2 = (v8i16) (temp1 - p2_src);
143 CLIP_SH(temp2, tc_neg, tc_pos);
144 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
146 temp1 = temp0 + p2_src;
147 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
148 temp2 = (v8i16) (temp1 - p1_src);
149 CLIP_SH(temp2, tc_neg, tc_pos);
150 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
152 temp1 = (temp0 << 1) + p2_src + q1_src;
153 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
154 temp2 = (v8i16) (temp1 - p0_src);
155 CLIP_SH(temp2, tc_neg, tc_pos);
156 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
158 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
159 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
160 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
163 temp0 = (q1_src + p0_src + q0_src);
165 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
166 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
167 temp2 = (v8i16) (temp1 - q2_src);
168 CLIP_SH(temp2, tc_neg, tc_pos);
169 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
171 temp1 = temp0 + q2_src;
172 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
173 temp2 = (v8i16) (temp1 - q1_src);
174 CLIP_SH(temp2, tc_neg, tc_pos);
175 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
177 temp1 = (temp0 << 1) + p1_src + q2_src;
178 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
179 temp2 = (v8i16) (temp1 - q0_src);
180 CLIP_SH(temp2, tc_neg, tc_pos);
181 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
183 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
184 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
185 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
189 dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
192 PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
193 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);
195 dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
196 dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
197 dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);
199 dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
200 dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
206 }
else if (flag0 == flag1) {
210 diff0 = (v8i16) (q0_src - p0_src);
211 diff1 = (v8i16) (q1_src - p1_src);
212 diff0 = (diff0 << 3) + diff0;
213 diff1 = (diff1 << 1) + diff1;
214 delta0 = diff0 - diff1;
215 delta0 = __msa_srari_h(delta0, 4);
217 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
218 abs_delta0 = __msa_add_a_h(delta0, (v8i16)
zero);
219 abs_delta0 = (v8u16) abs_delta0 < temp1;
221 CLIP_SH(delta0, tc_neg, tc_pos);
223 temp2 = (v8i16) (delta0 + p0_src);
225 temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
226 (v16u8) p_is_pcm_vec);
228 temp2 = (v8i16) (q0_src - delta0);
230 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
231 (v16u8) q_is_pcm_vec);
233 p_is_pcm_vec = ~p_is_pcm_vec;
234 q_is_pcm_vec = ~q_is_pcm_vec;
235 tmp = (beta + (beta >> 1)) >> 3;
236 cmp0 = __msa_fill_d(dp00 + dp30 <
tmp);
237 cmp1 = __msa_fill_d(dp04 + dp34 <
tmp);
238 cmp0 = __msa_ilvev_d(cmp1, cmp0);
239 cmp0 = __msa_ceqi_d(cmp0, 0);
240 p_is_pcm_vec = p_is_pcm_vec | cmp0;
242 cmp0 = __msa_fill_d(dq00 + dq30 <
tmp);
243 cmp1 = __msa_fill_d(dq04 + dq34 <
tmp);
244 cmp0 = __msa_ilvev_d(cmp1, cmp0);
245 cmp0 = __msa_ceqi_d(cmp0, 0);
246 q_is_pcm_vec = q_is_pcm_vec | cmp0;
251 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
252 delta1 -= (v8i16) p1_src;
255 CLIP_SH(delta1, tc_neg, tc_pos);
256 delta1 = (v8i16) p1_src + (v8i16) delta1;
258 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
259 (v16u8) p_is_pcm_vec);
261 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
262 delta2 = delta2 - (v8i16) q1_src;
263 delta2 = delta2 - delta0;
264 delta2 = delta2 >> 1;
265 CLIP_SH(delta2, tc_neg, tc_pos);
266 delta2 = (v8i16) q1_src + (v8i16) delta2;
268 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
269 (v16u8) q_is_pcm_vec);
271 dst1 = (v16u8) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
273 dst2 = (v16u8) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
275 dst3 = (v16u8) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
277 dst4 = (v16u8) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
283 PCKEV_B2_UB(p0_src, p1_src, q1_src, q0_src, dst2, dst3);
285 dst0 = __msa_bmz_v(dst0, dst2, (v16u8) cmp3);
286 dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);
297 temp0 = (p1_src + p0_src + q0_src);
298 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
299 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
300 temp2 = (v8i16) (temp1 - p2_src);
301 CLIP_SH(temp2, tc_neg, tc_pos);
302 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
304 temp1 = temp0 + p2_src;
305 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
306 temp2 = (v8i16) (temp1 - p1_src);
307 CLIP_SH(temp2, tc_neg, tc_pos);
308 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
310 temp1 = (temp0 << 1) + p2_src + q1_src;
311 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
312 temp2 = (v8i16) (temp1 - p0_src);
313 CLIP_SH(temp2, tc_neg, tc_pos);
314 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
316 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
317 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
318 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
321 temp0 = (q1_src + p0_src + q0_src);
323 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
324 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
325 temp2 = (v8i16) (temp1 - q2_src);
326 CLIP_SH(temp2, tc_neg, tc_pos);
327 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
329 temp1 = temp0 + q2_src;
330 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
331 temp2 = (v8i16) (temp1 - q1_src);
332 CLIP_SH(temp2, tc_neg, tc_pos);
333 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
335 temp1 = (temp0 << 1) + p1_src + q2_src;
336 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
337 temp2 = (v8i16) (temp1 - q0_src);
338 CLIP_SH(temp2, tc_neg, tc_pos);
339 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
341 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
342 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
343 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
347 dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
354 diff0 = (v8i16) (q0_src - p0_src);
355 diff1 = (v8i16) (q1_src - p1_src);
356 diff0 = (diff0 << 3) + diff0;
357 diff1 = (diff1 << 1) + diff1;
358 delta0 = diff0 - diff1;
359 delta0 = __msa_srari_h(delta0, 4);
361 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
362 abs_delta0 = __msa_add_a_h(delta0, (v8i16)
zero);
363 abs_delta0 = (v8u16) abs_delta0 < temp1;
365 CLIP_SH(delta0, tc_neg, tc_pos);
367 temp2 = (v8i16) (delta0 + p0_src);
369 temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
370 (v16u8) p_is_pcm_vec);
372 temp2 = (v8i16) (q0_src - delta0);
374 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
375 (v16u8) q_is_pcm_vec);
377 p_is_pcm_vec = ~p_is_pcm_vec;
378 q_is_pcm_vec = ~q_is_pcm_vec;
379 tmp = (beta + (beta >> 1)) >> 3;
380 cmp0 = __msa_fill_d(dp00 + dp30 <
tmp);
381 cmp1 = __msa_fill_d(dp04 + dp34 <
tmp);
382 cmp0 = __msa_ilvev_d(cmp1, cmp0);
383 p_is_pcm_vec = p_is_pcm_vec | __msa_ceqi_d(cmp0, 0);
385 cmp0 = __msa_fill_d(dq00 + dq30 <
tmp);
386 cmp1 = __msa_fill_d(dq04 + dq34 <
tmp);
387 cmp0 = __msa_ilvev_d(cmp1, cmp0);
388 q_is_pcm_vec = q_is_pcm_vec | __msa_ceqi_d(cmp0, 0);
393 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
394 delta1 -= (v8i16) p1_src;
397 CLIP_SH(delta1, tc_neg, tc_pos);
398 delta1 = (v8i16) p1_src + (v8i16) delta1;
400 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
401 (v16u8) p_is_pcm_vec);
403 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
404 delta2 = delta2 - (v8i16) q1_src;
405 delta2 = delta2 - delta0;
406 delta2 = delta2 >> 1;
407 CLIP_SH(delta2, tc_neg, tc_pos);
408 delta2 = (v8i16) q1_src + (v8i16) delta2;
410 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
411 (v16u8) q_is_pcm_vec);
413 delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
415 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
417 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
419 delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
424 PCKEV_B2_UB(delta1, p2_src, temp2, temp0, dst3, dst4);
425 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) delta2);
428 dst0 = __msa_bmnz_v(dst0, dst3, (v16u8) cmp2);
429 dst1 = __msa_bmnz_v(dst1, dst4, (v16u8) cmp2);
430 dst2 = __msa_bmnz_v(dst2, dst5, (v16u8) cmp2);
433 PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
434 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);
436 dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
437 dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
438 dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);
440 dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
441 dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
452 const uint8_t *p_is_pcm,
const uint8_t *q_is_pcm)
458 uint8_t flag0, flag1;
461 int32_t dp00, dq00, dp30, dq30, d00, d30;
463 int32_t dp04, dq04, dp34, dq34, d04, d34;
464 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
465 int32_t tc4, p_is_pcm4, q_is_pcm4, tc254,
tmp;
466 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
467 v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
471 v8i16 tc_pos, tc_neg;
472 v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
474 v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
476 dp00 =
abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
477 dq00 =
abs(p3[2] - (p3[1] << 1) + p3[0]);
478 dp30 =
abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
479 dq30 =
abs(p2[2] - (p2[1] << 1) + p2[0]);
482 p_is_pcm0 = p_is_pcm[0];
483 q_is_pcm0 = q_is_pcm[0];
485 dp04 =
abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
486 dq04 =
abs(p1[2] - (p1[1] << 1) + p1[0]);
487 dp34 =
abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
488 dq34 =
abs(p0[2] - (p0[1] << 1) + p0[0]);
491 p_is_pcm4 = p_is_pcm[1];
492 q_is_pcm4 = q_is_pcm[1];
494 cmp0 = __msa_fill_d(p_is_pcm0);
495 cmp1 = __msa_fill_d(p_is_pcm4);
496 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
497 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
499 d0030 = (d00 + d30) >= beta;
500 d0434 = (d04 + d34) >= beta;
502 cmp0 = __msa_fill_d(d0030);
503 cmp1 = __msa_fill_d(d0434);
504 cmp3 = __msa_ilvev_d(cmp1, cmp0);
505 cmp3 = (v2i64) __msa_ceqi_d(cmp3, 0);
507 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
508 (!d0030 || !d0434)) {
513 cmp0 = __msa_fill_d(q_is_pcm0);
514 cmp1 = __msa_fill_d(q_is_pcm4);
515 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
516 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
521 tc250 = ((tc0 * 5 + 1) >> 1);
524 tc254 = ((tc4 * 5 + 1) >> 1);
525 cmp0 = (v2i64) __msa_fill_h(tc0 << 1);
526 cmp1 = (v2i64) __msa_fill_h(tc4 << 1);
527 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
530 q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
531 q0_src, q1_src, q2_src, q3_src);
533 flag0 =
abs(p3[-4] - p3[-1]) +
abs(p3[3] - p3[0]) < beta30 &&
534 abs(p3[-1] - p3[0]) < tc250;
535 flag0 = flag0 && (
abs(p2[-4] - p2[-1]) +
abs(p2[3] - p2[0]) < beta30 &&
536 abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
537 (d30 << 1) < beta20);
538 cmp0 = __msa_fill_d(flag0);
540 p3_src, p2_src, p1_src, p0_src);
542 flag1 =
abs(p1[-4] - p1[-1]) +
abs(p1[3] - p1[0]) < beta30 &&
543 abs(p1[-1] - p1[0]) < tc254;
544 flag1 = flag1 && (
abs(p0[-4] - p0[-1]) +
abs(p0[3] - p0[0]) < beta30 &&
545 abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
546 (d34 << 1) < beta20);
548 q0_src, q1_src, q2_src, q3_src);
550 cmp1 = __msa_fill_d(flag1);
551 cmp2 = __msa_ilvev_d(cmp1, cmp0);
552 cmp2 = __msa_ceqi_d(cmp2, 0);
554 if (flag0 && flag1) {
559 temp0 = (p1_src + p0_src + q0_src);
561 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
562 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
563 temp2 = (v8i16) (temp1 - p2_src);
564 CLIP_SH(temp2, tc_neg, tc_pos);
565 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
567 temp1 = temp0 + p2_src;
568 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
569 temp2 = (v8i16) (temp1 - p1_src);
570 CLIP_SH(temp2, tc_neg, tc_pos);
571 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
573 temp1 = (temp0 << 1) + p2_src + q1_src;
574 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
575 temp2 = (v8i16) (temp1 - p0_src);
576 CLIP_SH(temp2, tc_neg, tc_pos);
577 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
579 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
580 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
581 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
584 temp0 = (q1_src + p0_src + q0_src);
585 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
586 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
587 temp2 = (v8i16) (temp1 - q2_src);
588 CLIP_SH(temp2, tc_neg, tc_pos);
589 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
591 temp1 = temp0 + q2_src;
592 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
593 temp2 = (v8i16) (temp1 - q1_src);
594 CLIP_SH(temp2, tc_neg, tc_pos);
595 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
597 temp1 = (temp0 << 1) + p1_src + q2_src;
598 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
599 temp2 = (v8i16) (temp1 - q0_src);
600 CLIP_SH(temp2, tc_neg, tc_pos);
601 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
603 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
604 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
605 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
607 }
else if (flag0 == flag1) {
612 diff0 = (v8i16) (q0_src - p0_src);
613 diff1 = (v8i16) (q1_src - p1_src);
614 diff0 = (diff0 << 3) + diff0;
615 diff1 = (diff1 << 1) + diff1;
616 delta0 = diff0 - diff1;
617 delta0 = __msa_srari_h(delta0, 4);
619 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
620 abs_delta0 = __msa_add_a_h(delta0, (v8i16)
zero);
621 abs_delta0 = (v8u16) abs_delta0 < temp1;
623 CLIP_SH(delta0, tc_neg, tc_pos);
624 temp2 = (v8i16) (delta0 + p0_src);
626 temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
627 (v16u8) p_is_pcm_vec);
629 temp2 = (v8i16) (q0_src - delta0);
631 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
632 (v16u8) q_is_pcm_vec);
634 tmp = ((beta + (beta >> 1)) >> 3);
635 cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) <
tmp));
636 cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) <
tmp));
637 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
638 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
640 cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 <
tmp));
641 cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 <
tmp));
642 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
643 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
648 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
649 delta1 -= (v8i16) p1_src;
652 CLIP_SH(delta1, tc_neg, tc_pos);
653 delta1 = (v8i16) p1_src + (v8i16) delta1;
655 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
656 (v16u8) p_is_pcm_vec);
658 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
659 delta2 = delta2 - (v8i16) q1_src;
660 delta2 = delta2 - delta0;
661 delta2 = delta2 >> 1;
662 CLIP_SH(delta2, tc_neg, tc_pos);
663 delta2 = (v8i16) q1_src + (v8i16) delta2;
665 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
666 (v16u8) q_is_pcm_vec);
668 dst0 = __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
670 dst1 = __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
672 dst2 = __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
674 dst3 = __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
678 dst0 = __msa_bmz_v(dst0, (v16u8) p1_src, (v16u8) cmp3);
679 dst1 = __msa_bmz_v(dst1, (v16u8) p0_src, (v16u8) cmp3);
680 dst2 = __msa_bmz_v(dst2, (v16u8) q0_src, (v16u8) cmp3);
681 dst3 = __msa_bmz_v(dst3, (v16u8) q1_src, (v16u8) cmp3);
691 tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
692 tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
698 tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
699 tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
705 tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
706 tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
712 tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
713 tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
724 temp0 = (p1_src + p0_src + q0_src);
726 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
727 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
728 temp2 = (v8i16) (temp1 - p2_src);
729 CLIP_SH(temp2, tc_neg, tc_pos);
730 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
732 temp1 = temp0 + p2_src;
733 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
734 temp2 = (v8i16) (temp1 - p1_src);
735 CLIP_SH(temp2, tc_neg, tc_pos);
736 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
738 temp1 = (temp0 << 1) + p2_src + q1_src;
739 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
740 temp2 = (v8i16) (temp1 - p0_src);
741 CLIP_SH(temp2, tc_neg, tc_pos);
742 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
744 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
745 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
746 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
749 temp0 = (q1_src + p0_src + q0_src);
750 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
751 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
752 temp2 = (v8i16) (temp1 - q2_src);
753 CLIP_SH(temp2, tc_neg, tc_pos);
754 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
756 temp1 = temp0 + q2_src;
757 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
758 temp2 = (v8i16) (temp1 - q1_src);
759 CLIP_SH(temp2, tc_neg, tc_pos);
760 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
762 temp1 = (temp0 << 1) + p1_src + q2_src;
763 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
764 temp2 = (v8i16) (temp1 - q0_src);
765 CLIP_SH(temp2, tc_neg, tc_pos);
766 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
768 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
769 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
770 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
777 diff0 = (v8i16) (q0_src - p0_src);
778 diff1 = (v8i16) (q1_src - p1_src);
779 diff0 = (diff0 << 3) + diff0;
780 diff1 = (diff1 << 1) + diff1;
781 delta0 = diff0 - diff1;
782 delta0 = __msa_srari_h(delta0, 4);
784 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
785 abs_delta0 = __msa_add_a_h(delta0, (v8i16)
zero);
786 abs_delta0 = (v8u16) abs_delta0 < temp1;
788 CLIP_SH(delta0, tc_neg, tc_pos);
790 temp2 = (v8i16) (delta0 + p0_src);
792 temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
793 (v16u8) p_is_pcm_vec);
795 temp2 = (v8i16) (q0_src - delta0);
797 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
798 (v16u8) q_is_pcm_vec);
800 tmp = (beta + (beta >> 1)) >> 3;
801 cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) <
tmp));
802 cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) <
tmp));
803 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
804 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
806 cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 <
tmp));
807 cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 <
tmp));
808 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
809 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
814 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
815 delta1 -= (v8i16) p1_src;
818 CLIP_SH(delta1, tc_neg, tc_pos);
819 delta1 = (v8i16) p1_src + (v8i16) delta1;
821 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
822 (v16u8) p_is_pcm_vec);
824 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
825 delta2 = delta2 - (v8i16) q1_src;
826 delta2 = delta2 - delta0;
827 delta2 = delta2 >> 1;
828 CLIP_SH(delta2, tc_neg, tc_pos);
829 delta2 = (v8i16) q1_src + (v8i16) delta2;
831 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
832 (v16u8) q_is_pcm_vec);
833 delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
835 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
837 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
839 delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
844 dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
845 dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
846 dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
847 dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
848 dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
849 dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
852 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp3);
853 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp3);
854 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp3);
855 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp3);
856 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp3);
857 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp3);
860 PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst4, dst4, dst5, dst5, dst0, dst1,
871 tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
872 tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
873 tmp0 = __msa_copy_u_h((v8i16) dst2, 0);
874 tmp1 = __msa_copy_u_h((v8i16) dst2, 2);
882 tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
883 tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
884 tmp0 = __msa_copy_u_h((v8i16) dst2, 4);
885 tmp1 = __msa_copy_u_h((v8i16) dst2, 6);
893 tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
894 tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
895 tmp0 = __msa_copy_u_h((v8i16) dst3, 0);
896 tmp1 = __msa_copy_u_h((v8i16) dst3, 2);
904 tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
905 tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
906 tmp0 = __msa_copy_u_h((v8i16) dst3, 4);
907 tmp1 = __msa_copy_u_h((v8i16) dst3, 6);
917 const int32_t *tc,
const uint8_t *p_is_pcm,
918 const uint8_t *q_is_pcm)
922 uint8_t *q0_ptr =
src;
924 v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
925 v8u16 p1, p0,
q0,
q1;
926 v8i16 tc_pos, tc_neg;
928 v8i16 temp0, temp1,
delta;
930 if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
931 cmp0 = (v2i64) __msa_fill_h(tc[0]);
932 cmp1 = (v2i64) __msa_fill_h(tc[1]);
933 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
936 cmp0 = __msa_fill_d(p_is_pcm[0]);
937 cmp1 = __msa_fill_d(p_is_pcm[1]);
938 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
939 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
941 cmp0 = __msa_fill_d(q_is_pcm[0]);
942 cmp1 = __msa_fill_d(q_is_pcm[1]);
943 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
944 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
951 ILVR_B4_UH(
zero, p1,
zero, p0,
zero,
q0,
zero,
q1, p1, p0,
q0,
q1);
953 temp0 = (v8i16) (
q0 - p0);
954 temp1 = (v8i16) (p1 -
q1);
957 delta = __msa_srari_h((v8i16) temp0, 3);
960 temp0 = (v8i16) ((v8i16) p0 +
delta);
962 temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
963 (v16u8) p_is_pcm_vec);
965 temp1 = (v8i16) ((v8i16)
q0 -
delta);
967 temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8)
q0,
968 (v16u8) q_is_pcm_vec);
970 tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
971 temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
972 temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8)
q0, (v16u8) tc_pos);
974 temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
980 const int32_t *tc,
const uint8_t *p_is_pcm,
981 const uint8_t *q_is_pcm)
983 v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
985 v8u16 p1, p0,
q0,
q1;
986 v8i16 tc_pos, tc_neg;
988 v8i16 temp0, temp1,
delta;
990 if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
991 cmp0 = (v2i64) __msa_fill_h(tc[0]);
992 cmp1 = (v2i64) __msa_fill_h(tc[1]);
993 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
996 cmp0 = __msa_fill_d(p_is_pcm[0]);
997 cmp1 = __msa_fill_d(p_is_pcm[1]);
998 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
999 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
1001 cmp0 = __msa_fill_d(q_is_pcm[0]);
1002 cmp1 = __msa_fill_d(q_is_pcm[1]);
1003 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
1004 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
1010 ILVR_B4_UH(
zero, p1,
zero, p0,
zero,
q0,
zero,
q1, p1, p0,
q0,
q1);
1012 temp0 = (v8i16) (
q0 - p0);
1013 temp1 = (v8i16) (p1 -
q1);
1016 delta = __msa_srari_h((v8i16) temp0, 3);
1019 temp0 = (v8i16) ((v8i16) p0 +
delta);
1021 temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
1022 (v16u8) p_is_pcm_vec);
1024 temp1 = (v8i16) ((v8i16)
q0 -
delta);
1026 temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8)
q0,
1027 (v16u8) q_is_pcm_vec);
1029 tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
1030 temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
1031 temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8)
q0, (v16u8) tc_pos);
1033 temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);
1036 ST_H8(temp0, 0, 1, 2, 3, 4, 5, 6, 7,
src,
stride);
1043 const int16_t *sao_offset_val,
1047 v16i8 src0_r, src1_r;
1049 v16i8 dst0, offset0, offset1;
1052 offset_val =
LD_SB(sao_offset_val + 1);
1053 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
1055 offset_val = __msa_pckev_b(offset_val, offset_val);
1056 offset1 = (v16i8) __msa_insve_w((v4i32)
zero, 3, (v4i32) offset_val);
1057 offset0 = __msa_sld_b(offset1,
zero, 28 - ((sao_left_class) & 31));
1058 offset1 = __msa_sld_b(
zero, offset1, 28 - ((sao_left_class) & 31));
1063 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
1064 SWAP(offset0, offset1);
1068 src += (4 * src_stride);
1072 src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
1073 mask = __msa_srli_b(src0_r, 3);
1074 offset = __msa_vshf_b(
mask, offset1, offset0);
1076 src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
1077 dst0 = __msa_adds_s_b(src0_r,
offset);
1078 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1084 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
1085 dst += (4 * dst_stride);
1090 src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
1091 mask = __msa_srli_b(src0_r, 3);
1092 offset = __msa_vshf_b(
mask, offset1, offset0);
1094 src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
1095 dst0 = __msa_adds_s_b(src0_r,
offset);
1096 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1099 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
1105 const int16_t *sao_offset_val,
1109 v16i8 src0_r, src1_r, mask0, mask1;
1110 v16i8 offset_mask0, offset_mask1, offset_val;
1111 v16i8 offset0, offset1, dst0, dst1;
1114 offset_val =
LD_SB(sao_offset_val + 1);
1115 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
1116 offset_val = __msa_pckev_b(offset_val, offset_val);
1117 offset1 = (v16i8) __msa_insve_w((v4i32)
zero, 3, (v4i32) offset_val);
1118 offset0 = __msa_sld_b(offset1,
zero, 28 - ((sao_left_class) & 31));
1119 offset1 = __msa_sld_b(
zero, offset1, 28 - ((sao_left_class) & 31));
1124 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
1125 SWAP(offset0, offset1);
1129 src += src_stride << 2;
1133 mask0 = __msa_srli_b(src0_r, 3);
1134 mask1 = __msa_srli_b(src1_r, 3);
1136 offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
1137 offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);
1144 dst0 = __msa_adds_s_b(src0_r, offset_mask0);
1145 dst1 = __msa_adds_s_b(src1_r, offset_mask1);
1150 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
1151 dst += dst_stride << 2;
1156 mask0 = __msa_srli_b(src0_r, 3);
1157 mask1 = __msa_srli_b(src1_r, 3);
1159 offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
1160 offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);
1164 dst0 = __msa_adds_s_b(src0_r, offset_mask0);
1165 dst1 = __msa_adds_s_b(src1_r, offset_mask1);
1170 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
1178 const int16_t *sao_offset_val,
1183 v16i8 out0, out1, out2, out3;
1184 v16i8 mask0, mask1, mask2, mask3;
1185 v16i8 tmp0, tmp1, tmp2, tmp3, offset_val;
1186 v16i8 offset0, offset1;
1189 offset_val =
LD_SB(sao_offset_val + 1);
1190 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
1191 offset_val = __msa_pckev_b(offset_val, offset_val);
1192 offset1 = (v16i8) __msa_insve_w((v4i32)
zero, 3, (v4i32) offset_val);
1193 offset0 = __msa_sld_b(offset1,
zero, 28 - ((sao_left_class) & 31));
1194 offset1 = __msa_sld_b(
zero, offset1, 28 - ((sao_left_class) & 31));
1196 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
1197 SWAP(offset0, offset1);
1204 for (w_cnt = 16; w_cnt <
width; w_cnt += 16) {
1205 mask0 = __msa_srli_b((v16i8)
src0, 3);
1206 mask1 = __msa_srli_b((v16i8)
src1, 3);
1207 mask2 = __msa_srli_b((v16i8)
src2, 3);
1208 mask3 = __msa_srli_b((v16i8) src3, 3);
1210 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1,
1212 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3,
1216 out0 = __msa_adds_s_b((v16i8)
src0, tmp0);
1217 out1 = __msa_adds_s_b((v16i8)
src1, tmp1);
1218 out2 = __msa_adds_s_b((v16i8)
src2, tmp2);
1219 out3 = __msa_adds_s_b((v16i8) src3, tmp3);
1226 ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);
1229 mask0 = __msa_srli_b((v16i8)
src0, 3);
1230 mask1 = __msa_srli_b((v16i8)
src1, 3);
1231 mask2 = __msa_srli_b((v16i8)
src2, 3);
1232 mask3 = __msa_srli_b((v16i8) src3, 3);
1234 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1, tmp0,
1236 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3, tmp2,
1240 out0 = __msa_adds_s_b((v16i8)
src0, tmp0);
1241 out1 = __msa_adds_s_b((v16i8)
src1, tmp1);
1242 out2 = __msa_adds_s_b((v16i8)
src2, tmp2);
1243 out3 = __msa_adds_s_b((v16i8) src3, tmp3);
1247 ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);
1249 src += src_stride << 2;
1250 dst += dst_stride << 2;
1259 const int16_t *sao_offset_val,
1262 uint32_t dst_val0, dst_val1;
1263 v16u8 cmp_minus10, diff_minus10, diff_minus11, src_minus10, src_minus11;
1264 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1265 v16i8 sao_offset =
LD_SB(sao_offset_val);
1267 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1270 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1274 LD_UB2(
src, src_stride, src_minus10, src_minus11);
1277 src += (2 * src_stride);
1279 src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
1280 (v2i64) src_minus10);
1282 src0 = (v16i8) __msa_sldi_b(
zero, (v16i8) src_minus10, 1);
1283 src_plus10 = (v16i8) __msa_sldi_b(
zero, (v16i8) src_minus10, 2);
1285 cmp_minus10 = ((v16u8)
src0 == src_minus10);
1286 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1287 cmp_minus10 = (src_minus10 < (v16u8)
src0);
1288 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1290 cmp_minus10 = ((v16u8)
src0 == (v16u8) src_plus10);
1291 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1292 cmp_minus10 = ((v16u8) src_plus10 < (v16u8)
src0);
1293 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1295 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1298 LD_UB2(
src, src_stride, src_minus10, src_minus11);
1303 src0 = (v16i8) __msa_xori_b((v16u8)
src0, 128);
1305 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1307 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1308 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1315 src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
1316 (v2i64) src_minus10);
1318 src0 = (v16i8) __msa_sldi_b(
zero, (v16i8) src_minus10, 1);
1319 src_plus10 = (v16i8) __msa_sldi_b(
zero, (v16i8) src_minus10, 2);
1321 cmp_minus10 = ((v16u8)
src0 == src_minus10);
1322 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1323 cmp_minus10 = (src_minus10 < (v16u8)
src0);
1324 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1326 cmp_minus10 = ((v16u8)
src0 == (v16u8) src_plus10);
1327 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1328 cmp_minus10 = ((v16u8) src_plus10 < (v16u8)
src0);
1329 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1331 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1335 src0 = (v16i8) __msa_xori_b((v16u8)
src0, 128);
1337 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1339 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1340 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1351 const int16_t *sao_offset_val,
1354 uint64_t dst_val0, dst_val1;
1355 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1356 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1357 v16u8 cmp_minus10, diff_minus10, diff_minus11;
1358 v16u8
src0,
src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
1360 v16i8 zeros = { 0 };
1362 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1366 LD_UB2(
src, src_stride, src_minus10, src_minus11);
1369 src += (src_stride << 1);
1372 SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10, src_plus11);
1374 PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10,
1375 src_minus10, src_plus10);
1376 src0 = (v16u8) __msa_pckev_d((v2i64)
src1, (v2i64)
src0);
1378 cmp_minus10 = (
src0 == src_minus10);
1379 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1380 cmp_minus10 = (src_minus10 <
src0);
1381 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1383 cmp_minus10 = (
src0 == src_plus10);
1384 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1385 cmp_minus10 = (src_plus10 <
src0);
1386 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1388 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1391 LD_UB2(
src, src_stride, src_minus10, src_minus11);
1397 dst0 = (v16u8) __msa_adds_s_b((v16i8)
src0,
offset);
1398 dst0 = __msa_xori_b(dst0, 128);
1400 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1401 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1409 SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10, src_plus11);
1411 PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, src_minus10,
1413 src0 = (v16u8) __msa_pckev_d((v2i64)
src1, (v2i64)
src0);
1415 cmp_minus10 = ((v16u8)
src0 == src_minus10);
1416 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1417 cmp_minus10 = (src_minus10 < (v16u8)
src0);
1418 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1420 cmp_minus10 = (
src0 == src_plus10);
1421 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1422 cmp_minus10 = (src_plus10 <
src0);
1423 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1425 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1431 dst0 = (v16u8) __msa_adds_s_b((v16i8)
src0,
offset);
1432 dst0 = __msa_xori_b(dst0, 128);
1434 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1435 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1445 const int16_t *sao_offset_val,
1449 const uint8_t *src_minus1;
1452 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1453 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1455 v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1456 v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1457 v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1459 v16u8 src10, src11, src12, src13, dst0, dst1, dst2, dst3;
1460 v16u8 src_minus10, src_minus11, src_minus12, src_minus13;
1461 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
1462 v16i8 src_zero0, src_zero1, src_zero2, src_zero3;
1463 v16i8 src_plus10, src_plus11, src_plus12, src_plus13;
1465 sao_offset =
LD_SB(sao_offset_val);
1466 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1469 src_minus1 =
src - 1;
1470 LD_UB4(src_minus1, src_stride,
1471 src_minus10, src_minus11, src_minus12, src_minus13);
1473 for (v_cnt = 0; v_cnt <
width; v_cnt += 16) {
1475 dst_ptr = dst + v_cnt;
1476 LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);
1478 SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
1479 src12, src_minus12, src13, src_minus13, 1,
1480 src_zero0, src_zero1, src_zero2, src_zero3);
1481 SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
1482 src12, src_minus12, src13, src_minus13, 2,
1483 src_plus10, src_plus11, src_plus12, src_plus13);
1485 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1486 cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
1487 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1488 cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
1489 cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
1490 cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
1491 cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
1492 cmp_plus13 = ((v16u8) src_zero3 == (v16u8) src_plus13);
1494 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1495 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
1496 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1497 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
1498 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
1499 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
1500 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
1501 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
1503 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1504 cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
1505 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1506 cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
1507 cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
1508 cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
1509 cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
1510 cmp_plus13 = ((v16u8) src_plus13 < (v16u8) src_zero3);
1512 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1513 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
1514 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1515 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
1516 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
1517 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
1518 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
1519 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
1521 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
1522 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
1523 offset_mask0, offset_mask0, offset_mask0);
1524 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
1525 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
1526 offset_mask1, offset_mask1, offset_mask1);
1527 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
1528 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask2,
1529 offset_mask2, offset_mask2, offset_mask2);
1530 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
1531 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask3,
1532 offset_mask3, offset_mask3, offset_mask3);
1536 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
1537 dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
1538 dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
1539 dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
1543 src_minus10 = src10;
1544 ST_UB(dst0, dst_ptr);
1545 src_minus11 = src11;
1546 ST_UB(dst1, dst_ptr + dst_stride);
1547 src_minus12 = src12;
1548 ST_UB(dst2, dst_ptr + (dst_stride << 1));
1549 src_minus13 = src13;
1550 ST_UB(dst3, dst_ptr + (dst_stride * 3));
1553 src += (src_stride << 2);
1554 dst += (dst_stride << 2);
1562 const int16_t *sao_offset_val,
1565 uint32_t dst_val0, dst_val1;
1566 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1567 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1569 v16i8 sao_offset =
LD_SB(sao_offset_val);
1570 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1571 v16u8 src_minus10, src_minus11, src10, src11;
1572 v16i8 src_zero0, src_zero1;
1574 v8i16 offset_mask0, offset_mask1;
1576 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1579 LD_UB2(
src - src_stride, src_stride, src_minus10, src_minus11);
1580 LD_UB2(
src + src_stride, src_stride, src10, src11);
1583 src += (src_stride << 1);
1585 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1586 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1587 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1588 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1590 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1591 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1592 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1593 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1595 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1596 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1597 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1598 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1600 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1601 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1603 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1604 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1609 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1610 dst0 = __msa_adds_s_b(dst0,
offset);
1611 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1613 src_minus10 = src10;
1614 src_minus11 = src11;
1617 LD_UB2(
src + src_stride, src_stride, src10, src11);
1619 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1620 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1628 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1629 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1630 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1631 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1633 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1634 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1635 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1636 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1638 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1639 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1640 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1641 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1643 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1644 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1646 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1647 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1652 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1653 dst0 = __msa_adds_s_b(dst0,
offset);
1654 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1656 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1657 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1667 const int16_t *sao_offset_val,
1670 uint64_t dst_val0, dst_val1;
1671 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1672 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1674 v16i8 src_zero0, src_zero1, dst0;
1675 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1676 v16u8 src_minus10, src_minus11, src10, src11;
1677 v8i16 offset_mask0, offset_mask1;
1679 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1682 LD_UB2(
src - src_stride, src_stride, src_minus10, src_minus11);
1683 LD_UB2(
src + src_stride, src_stride, src10, src11);
1686 src += (src_stride << 1);
1688 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1689 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1690 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1691 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1693 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1694 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1695 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1696 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1698 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1699 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1700 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1701 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1703 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1704 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1706 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1707 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1712 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1713 dst0 = __msa_adds_s_b(dst0,
offset);
1714 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1716 src_minus10 = src10;
1717 src_minus11 = src11;
1720 LD_UB2(
src + src_stride, src_stride, src10, src11);
1722 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1723 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1730 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1731 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1732 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1733 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1735 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1736 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1737 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1738 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1740 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1741 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1742 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1743 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1745 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1746 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1748 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1749 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1754 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1755 dst0 = __msa_adds_s_b(dst0,
offset);
1756 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1758 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1759 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1774 const uint8_t *src_orig =
src;
1775 uint8_t *dst_orig = dst;
1777 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1778 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1779 v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1780 v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1781 v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1783 v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
1784 v16u8 src12, dst2, src13, dst3;
1785 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
1787 sao_offset =
LD_SB(sao_offset_val);
1788 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1790 for (v_cnt = 0; v_cnt <
width; v_cnt += 16) {
1791 src = src_orig + v_cnt;
1792 dst = dst_orig + v_cnt;
1794 LD_UB2(
src - src_stride, src_stride, src_minus10, src_minus11);
1796 for (h_cnt = (
height >> 2); h_cnt--;) {
1797 LD_UB4(
src + src_stride, src_stride, src10, src11, src12, src13);
1799 cmp_minus10 = (src_minus11 == src_minus10);
1800 cmp_plus10 = (src_minus11 == src10);
1801 cmp_minus11 = (src10 == src_minus11);
1802 cmp_plus11 = (src10 == src11);
1803 cmp_minus12 = (src11 == src10);
1804 cmp_plus12 = (src11 == src12);
1805 cmp_minus13 = (src12 == src11);
1806 cmp_plus13 = (src12 == src13);
1808 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1809 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
1810 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1811 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
1812 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
1813 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
1814 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
1815 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
1817 cmp_minus10 = (src_minus10 < src_minus11);
1818 cmp_plus10 = (src10 < src_minus11);
1819 cmp_minus11 = (src_minus11 < src10);
1820 cmp_plus11 = (src11 < src10);
1821 cmp_minus12 = (src10 < src11);
1822 cmp_plus12 = (src12 < src11);
1823 cmp_minus13 = (src11 < src12);
1824 cmp_plus13 = (src13 < src12);
1826 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1827 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
1828 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1829 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
1830 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
1831 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
1832 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
1833 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
1835 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
1836 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1837 offset_mask0, offset_mask0, offset_mask0, offset_mask0);
1838 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
1839 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1840 offset_mask1, offset_mask1, offset_mask1, offset_mask1);
1841 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
1842 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1843 offset_mask2, offset_mask2, offset_mask2, offset_mask2);
1844 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
1845 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1846 offset_mask3, offset_mask3, offset_mask3, offset_mask3);
1848 src_minus10 = src12;
1851 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_minus11, offset_mask0);
1852 dst1 = (v16u8) __msa_adds_s_b((v16i8) src10, offset_mask1);
1853 dst2 = (v16u8) __msa_adds_s_b((v16i8) src11, offset_mask2);
1854 dst3 = (v16u8) __msa_adds_s_b((v16i8) src12, offset_mask3);
1857 src_minus11 = src13;
1859 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
1861 src += (src_stride << 2);
1862 dst += (dst_stride << 2);
1871 const int16_t *sao_offset_val,
1874 const uint8_t *src_orig;
1875 uint32_t dst_val0, dst_val1;
1876 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1877 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1879 v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
1880 v16u8 src_minus11, src10, src11;
1881 v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
1882 v8i16 offset_mask0, offset_mask1;
1883 v16i8 zeros = { 0 };
1885 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1890 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
1891 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
1894 src_orig += (src_stride << 1);
1896 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
1897 SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);
1899 ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
1901 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
1904 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1905 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1906 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1907 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1909 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1910 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1911 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1912 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1914 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1915 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1917 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1918 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1923 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1924 dst0 = __msa_adds_s_b(dst0,
offset);
1925 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1927 src_minus10 = src10;
1928 src_minus11 = src11;
1931 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
1933 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1934 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1942 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
1943 SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);
1945 ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
1947 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
1950 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1951 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1952 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1953 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1955 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1956 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1957 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1958 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1960 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1961 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1963 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1964 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1969 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1970 dst0 = __msa_adds_s_b(dst0,
offset);
1971 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1973 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1974 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1984 const int16_t *sao_offset_val,
1987 const uint8_t *src_orig;
1988 uint64_t dst_val0, dst_val1;
1989 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1990 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1992 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1993 v16u8 src_minus10, src10, src_minus11, src11;
1994 v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0;
1995 v8i16 offset_mask0, offset_mask1;
1996 v16i8 zeros = { 0 };
1998 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2002 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2003 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2006 src_orig += (src_stride << 1);
2008 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
2009 SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);
2011 ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
2012 src_minus10, src_minus11);
2013 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
2014 src_zero0, src_zero1);
2016 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2017 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2018 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2019 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2021 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2022 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2023 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2024 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2026 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2027 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2029 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2030 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2035 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2036 dst0 = __msa_adds_s_b(dst0,
offset);
2037 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2039 src_minus10 = src10;
2040 src_minus11 = src11;
2043 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2045 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2046 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
2053 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
2054 SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);
2055 ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, src_minus10,
2057 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2060 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2061 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2062 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2063 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2065 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2066 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2067 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2068 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2070 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2071 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2073 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2074 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2079 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2080 dst0 = __msa_adds_s_b(dst0,
offset);
2081 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2083 src_minus10 = src10;
2084 src_minus11 = src11;
2087 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2089 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2090 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
2105 const uint8_t *src_orig =
src;
2106 uint8_t *dst_orig = dst;
2108 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2109 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2110 v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
2111 v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
2112 v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
2113 v16u8 diff_plus13, src_minus14, src_plus13;
2114 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
2115 v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
2116 v16u8 src12, src_minus12, dst2, src13, src_minus13, dst3;
2117 v16i8 src_zero0, src_plus10, src_zero1, src_plus11, src_zero2, src_plus12;
2118 v16i8 src_zero3, sao_offset;
2120 sao_offset =
LD_SB(sao_offset_val);
2121 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2126 LD_UB4(src_orig, src_stride, src_minus11, src_minus12, src_minus13,
2129 for (v_cnt = 0; v_cnt <
width; v_cnt += 16) {
2130 src_minus10 =
LD_UB(src_orig - src_stride);
2131 LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
2132 src_plus13 =
LD_UB(
src + 1 + v_cnt + (src_stride << 2));
2135 SLDI_B4_SB(src10, src_minus11, src11, src_minus12,
2136 src12, src_minus13, src13, src_minus14, 1,
2137 src_zero0, src_zero1, src_zero2, src_zero3);
2138 SLDI_B2_SB(src11, src_minus12, src12, src_minus13, 2, src_plus10,
2141 src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);
2143 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2144 cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
2145 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2146 cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
2147 cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
2148 cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
2149 cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
2150 cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
2152 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2153 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
2154 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2155 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
2156 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
2157 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
2158 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
2159 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
2161 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2162 cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
2163 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2164 cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
2165 cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
2166 cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
2167 cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
2168 cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
2170 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2171 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
2172 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2173 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
2174 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
2175 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
2176 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
2177 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
2179 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
2180 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
2181 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
2182 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
2184 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2185 offset_mask0, offset_mask0, offset_mask0, offset_mask0);
2186 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2187 offset_mask1, offset_mask1, offset_mask1, offset_mask1);
2188 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2189 offset_mask2, offset_mask2, offset_mask2, offset_mask2);
2190 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2191 offset_mask3, offset_mask3, offset_mask3, offset_mask3);
2195 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
2196 dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
2197 dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
2198 dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
2202 src_minus11 = src10;
2203 src_minus12 = src11;
2204 src_minus13 = src12;
2205 src_minus14 = src13;
2207 ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
2211 src += (src_stride << 2);
2212 dst += (dst_stride << 2);
2220 const int16_t *sao_offset_val,
/*
 * NOTE(review): this listing is partial -- the function header, braces, the
 * loop statement, the declarations of `offset`/`sao_offset`, the use of
 * `edge_idx` and the final stores are not visible here.  The comments below
 * describe only what the visible statements do.
 *
 * Visible behaviour: two rows are processed per pass (src_orig advances by
 * 2 * src_stride), each pixel is compared against one neighbour taken from
 * the row above and one taken from the row below, and one 32-bit word per
 * row is extracted from the result (dst_val0 / dst_val1).
 *
 * Assumes SLDI_B2_xx / ILVR_B2_xx / LD_UB2 are the usual pairwise wrappers
 * around __msa_sldi_b / __msa_ilvr_b / vector loads -- they are defined
 * outside this chunk, TODO confirm.
 */
2223 const uint8_t *src_orig;
2224 uint32_t dst_val0, dst_val1;
2225 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
/* Every byte lane set to 1; used below as the "neighbour is smaller" code. */
2226 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2228 v16i8 src_zero0, src_zero1, dst0;
2229 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2230 v16u8 src_minus10, src10, src_minus11, src11;
2231 v8i16 offset_mask0, offset_mask1;
2232 v16i8 zeros = { 0 };
/*
 * Keep only the even-indexed bytes of sao_offset (same vector used for both
 * halves).  Presumably this narrows int16 offsets loaded from
 * sao_offset_val to bytes -- the load itself is not visible here.
 */
2234 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
/*
 * Load ahead: the rows at src_orig - src_stride and src_orig, then the rows
 * at src_orig + src_stride and src_orig + 2 * src_stride.
 */
2238 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2239 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2242 src_orig += (src_stride << 1);
/*
 * Centre pixels are the current rows shifted by one byte; the upper
 * neighbours are the rows above shifted by two bytes.  The lower neighbours
 * (src10 / src11) are used unshifted.
 */
2244 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
2245 SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
/*
 * Interleave upper and lower neighbour bytes, and duplicate each centre
 * byte, so both comparisons for one pixel sit in adjacent byte lanes.
 * (The continuation lines holding the last output operand are not visible.)
 */
2247 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2249 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
/*
 * Per byte lane: 0x00 if neighbour == centre, 0x01 if neighbour < centre,
 * 0xFF otherwise (neighbour > centre).  The nor of the equality mask with
 * itself is its complement; bmnz then overwrites the "smaller" lanes with 1.
 */
2252 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2253 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2254 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2255 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
/* Same classification for the second row. */
2257 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2258 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2259 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2260 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
/*
 * Sum the two neighbour codes of each pixel (adjacent byte lanes) and add 2.
 * Only the low byte of each 16-bit sum is kept by the pckev_b below, so the
 * unsigned 0xFF codes act as -1 and the result is a class index in 0..4.
 */
2262 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2263 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
/* Pack class indices and centre pixels of both rows into one vector each. */
2265 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2266 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
/*
 * Flip the top bit to move unsigned pixels into signed range, add `offset`
 * with signed saturation, then flip back.
 * NOTE(review): the step that turns the class index in `offset` into an
 * actual offset value is not visible in this listing.
 */
2271 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2272 dst0 = __msa_adds_s_b(dst0,
offset);
2273 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
/* Slide the row window down by two rows and load the next pair ahead. */
2275 src_minus10 = src10;
2276 src_minus11 = src11;
2279 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
/* Word lane 0 holds four bytes of the first row, word lane 2 of the second. */
2281 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
2282 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
/*
 * Same sequence repeated once more -- presumably the final row pair handled
 * after the loop (loop boundaries are not visible, TODO confirm).
 */
2291 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
2292 SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
2294 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2296 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2299 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2300 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2301 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2302 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2304 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2305 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2306 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2307 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2309 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2310 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2312 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2313 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2318 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2319 dst0 = __msa_adds_s_b(dst0,
offset);
2320 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2322 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
2323 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
2335 const int16_t *sao_offset_val,
/*
 * NOTE(review): this listing is partial -- the function header, braces, the
 * loop statement, the declarations of `offset`/`sao_offset`, the use of
 * `edge_idx` and the final stores are not visible here.  The comments below
 * describe only what the visible statements do.
 *
 * Visible behaviour: two rows are processed per pass (src_orig advances by
 * 2 * src_stride), each pixel is compared against one neighbour taken from
 * the row above and one taken from the row below, and one 64-bit value per
 * row is extracted from the result (dst_val0 / dst_val1), i.e. eight bytes
 * per row.
 *
 * Assumes SLDI_B2_xx / ILVR_B2_xx / LD_UB2 are the usual pairwise wrappers
 * around __msa_sldi_b / __msa_ilvr_b / vector loads -- they are defined
 * outside this chunk, TODO confirm.
 */
2338 const uint8_t *src_orig;
2339 uint64_t dst_val0, dst_val1;
2340 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
/* Every byte lane set to 1; used below as the "neighbour is smaller" code. */
2341 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2343 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2344 v16u8 src_minus10, src10, src_minus11, src11;
2345 v16i8 src_zero0, src_zero1, dst0;
2346 v8i16 offset_mask0, offset_mask1;
2347 v16i8 zeros = { 0 };
/*
 * Keep only the even-indexed bytes of sao_offset (same vector used for both
 * halves).  Presumably this narrows int16 offsets loaded from
 * sao_offset_val to bytes -- the load itself is not visible here.
 */
2349 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
/*
 * Load ahead: the rows at src_orig - src_stride and src_orig, then the rows
 * at src_orig + src_stride and src_orig + 2 * src_stride.
 */
2353 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2354 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2357 src_orig += (src_stride << 1);
/*
 * Centre pixels are the current rows shifted by one byte; the upper
 * neighbours are the rows above shifted by two bytes.  The lower neighbours
 * (src10 / src11) are used unshifted.
 */
2359 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
2360 SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
/*
 * Interleave upper and lower neighbour bytes, and duplicate each centre
 * byte, so both comparisons for one pixel sit in adjacent byte lanes.
 * (The continuation lines holding the last output operand are not visible.)
 */
2361 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2363 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
/*
 * Per byte lane: 0x00 if neighbour == centre, 0x01 if neighbour < centre,
 * 0xFF otherwise (neighbour > centre).  The nor of the equality mask with
 * itself is its complement; bmnz then overwrites the "smaller" lanes with 1.
 */
2366 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2367 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2368 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2369 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
/* Same classification for the second row. */
2371 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2372 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2373 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2374 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
/*
 * Sum the two neighbour codes of each pixel (adjacent byte lanes) and add 2.
 * Only the low byte of each 16-bit sum is kept by the pckev_b below, so the
 * unsigned 0xFF codes act as -1 and the result is a class index in 0..4.
 */
2376 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2377 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
/* Pack class indices and centre pixels of both rows into one vector each. */
2379 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2380 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
/*
 * Flip the top bit to move unsigned pixels into signed range, add `offset`
 * with signed saturation, then flip back.
 * NOTE(review): the step that turns the class index in `offset` into an
 * actual offset value is not visible in this listing.
 */
2385 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2386 dst0 = __msa_adds_s_b(dst0,
offset);
2387 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
/* Slide the row window down by two rows and load the next pair ahead. */
2389 src_minus10 = src10;
2390 src_minus11 = src11;
2393 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
/* Doubleword lane 0 holds eight bytes of the first row, lane 1 of the second. */
2395 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2396 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
/*
 * Same sequence repeated once more -- presumably the final row pair handled
 * after the loop (loop boundaries are not visible, TODO confirm).
 */
2404 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
2405 SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
2406 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2408 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2411 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2412 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2413 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2414 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2416 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2417 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2418 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2419 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2421 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2422 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2424 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2425 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2430 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2431 dst0 = __msa_adds_s_b(dst0,
offset);
2432 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2434 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2435 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
/*
 * NOTE(review): this listing is partial -- the function header, braces, the
 * outer row loop, several declarations and some statements inside the
 * column loop are not visible here.  The comments below describe only what
 * the visible statements do.
 *
 * Visible behaviour: four rows are processed per pass (src and dst advance
 * by 4 * stride), 16 columns per inner-loop iteration.  Each pixel is
 * compared against one neighbour from the row above and one from the row
 * below, the two comparison results select an offset, and the offset is
 * added to the pixel with signed saturation.
 *
 * Assumes LD_SB / LD_UB / LD_UB4 / ST_UB4 / VSHF_B2_SB are the usual vector
 * load / store / shuffle wrappers -- defined outside this chunk, TODO confirm.
 */
2452 const uint8_t *src_orig;
2455 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
/* Every byte lane set to 1; used below as the "neighbour is smaller" code. */
2456 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2457 v16u8 dst0, dst1, dst2, dst3;
2458 v16u8 cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
2459 v16u8 cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
2460 v16u8 diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
2461 v16u8 diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
2462 v16u8 src_plus10, src_plus11, src_plus12, src_plus13;
2463 v16i8 src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
2464 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
/*
 * Load the offset table and keep only its even-indexed bytes.  Presumably
 * this narrows the int16 entries to their low bytes on a little-endian
 * target -- TODO confirm the offsets are guaranteed to fit in int8.
 */
2466 sao_offset =
LD_SB(sao_offset_val);
2467 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
/*
 * Preload the first 16-byte column block of the rows starting at src_orig
 * (the line naming the last output vector is not visible), then walk the
 * row group 16 columns at a time.
 */
2473 LD_UB4(src_orig, src_stride, src_minus11, src_plus10, src_plus11,
2476 for (v_cnt = 0; v_cnt <
width; v_cnt += 16) {
/* Row above the group, two bytes to the right of src_orig. */
2477 src_minus10 =
LD_UB(src_orig + 2 - src_stride);
/* Next 16-byte column block of the four rows; supplies the bytes shifted in below. */
2478 LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
/* Row just below the group, unshifted: lower neighbour for the fourth row. */
2479 src_plus13 =
LD_UB(src_orig + (src_stride << 2));
/*
 * For each of the four rows:
 *   src_zeroN  = current block shifted by one byte  (centre pixels),
 *   src_minusN = previous row shifted by two bytes  (upper neighbour),
 *   src_plusN  = next row, unshifted                (lower neighbour).
 * __msa_sldi_b(hi, lo, n) yields bytes n..15 of `lo` followed by the first
 * n bytes of `hi`.  Note that src_minus11 / src_plus1x hold the current
 * rows on entry and are reused as neighbour vectors afterwards.
 */
2482 src_zero0 = __msa_sldi_b((v16i8) src10, (v16i8) src_minus11, 1);
2483 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2484 cmp_plus10 = ((v16u8) src_zero0 == src_plus10);
2486 src_zero1 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 1);
2487 src_minus11 = (v16u8) __msa_sldi_b((v16i8) src10,
2488 (v16i8) src_minus11, 2);
2489 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2490 cmp_plus11 = ((v16u8) src_zero1 == src_plus11);
2492 src_zero2 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 1);
2493 src_minus12 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 2);
2494 cmp_minus12 = ((v16u8) src_zero2 == (v16u8) src_minus12);
2495 cmp_plus12 = ((v16u8) src_zero2 == src_plus12);
2497 src_zero3 = __msa_sldi_b((v16i8) src13, (v16i8) src_plus12, 1);
2498 src_minus13 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 2);
2499 cmp_minus13 = ((v16u8) src_zero3 == (v16u8) src_minus13);
2500 cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
/* Complement of the equality masks: 0xFF where neighbour != centre. */
2502 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2503 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
2504 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2505 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
2506 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
2507 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
2508 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
2509 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
/* Unsigned "neighbour < centre" masks. */
2511 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2512 cmp_plus10 = (src_plus10 < (v16u8) src_zero0);
2513 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2514 cmp_plus11 = (src_plus11 < (v16u8) src_zero1);
2515 cmp_minus12 = ((v16u8) src_minus12 < (v16u8) src_zero2);
2516 cmp_plus12 = (src_plus12 < (v16u8) src_zero2);
2517 cmp_minus13 = ((v16u8) src_minus13 < (v16u8) src_zero3);
2518 cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
/*
 * Overwrite the "smaller" lanes with 1.  Each diff lane is now 0 (equal),
 * 1 (neighbour < centre) or 0xFF, i.e. -1 as a signed byte (neighbour >
 * centre).
 */
2520 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2521 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
2522 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2523 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
2524 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
2525 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
2526 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
2527 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
/* Class index per pixel: 2 + sign(upper) + sign(lower), in the range 0..4. */
2529 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
2530 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
2531 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
2532 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
/*
 * Presumably two chained byte-table lookups performed in place: the class
 * index is first remapped through edge_idx, and the result then selects a
 * byte of sao_offset -- depends on the VSHF_B2_SB expansion, TODO confirm.
 */
2534 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2535 offset_mask0, offset_mask0, offset_mask0, offset_mask0);
2536 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2537 offset_mask1, offset_mask1, offset_mask1, offset_mask1);
2538 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2539 offset_mask2, offset_mask2, offset_mask2, offset_mask2);
2540 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2541 offset_mask3, offset_mask3, offset_mask3, offset_mask3);
/*
 * Signed saturating add of the selected offset to the centre pixels.
 * NOTE(review): lines between the lookups and these adds, and after them,
 * are missing from this listing; any unsigned/signed re-biasing of the
 * pixels would live there -- check the full file.
 */
2545 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
2546 dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
2547 dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
2548 dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
/*
 * Carry the look-ahead block of the first row into the next column
 * iteration (the matching hand-over for the other rows is not visible).
 */
2552 src_minus11 = src10;
/* Write the four filtered 16-byte rows. */
2557 ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
/* Advance both pointers by four rows for the next row group. */
2561 src += (src_stride << 2);
2562 dst += (dst_stride << 2);
2567 ptrdiff_t src_stride,
2569 const uint8_t *no_p,
const uint8_t *no_q)
2575 ptrdiff_t src_stride,
2577 const uint8_t *no_p,
const uint8_t *no_q)
2583 ptrdiff_t src_stride,
2584 const int32_t *tc,
const uint8_t *no_p,
2585 const uint8_t *no_q)
2591 ptrdiff_t src_stride,
2592 const int32_t *tc,
const uint8_t *no_p,
2593 const uint8_t *no_q)
2599 ptrdiff_t stride_dst, ptrdiff_t stride_src,
2600 const int16_t *sao_offset_val,
int sao_left_class,
2605 sao_left_class, sao_offset_val,
2614 sao_left_class, sao_offset_val,
height);
2622 sao_left_class, sao_offset_val,
height);
2627 ptrdiff_t stride_dst,
2628 const int16_t *sao_offset_val,