37 int32_t dp00, dq00, dp30, dq30, d00, d30;
39 int32_t dp04, dq04, dp34, dq34, d04, d34;
40 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
42 uint64_t dst_val0, dst_val1;
43 v16u8 dst0, dst1, dst2, dst3, dst4, dst5;
44 v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
49 v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
51 v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
53 dp00 =
abs(p2[0] - (p1[0] << 1) + p0[0]);
54 dq00 =
abs(q2[0] - (
q1[0] << 1) +
q0[0]);
55 dp30 =
abs(p2[3] - (p1[3] << 1) + p0[3]);
56 dq30 =
abs(q2[3] - (
q1[3] << 1) +
q0[3]);
59 dp04 =
abs(p2[4] - (p1[4] << 1) + p0[4]);
60 dq04 =
abs(q2[4] - (
q1[4] << 1) +
q0[4]);
61 dp34 =
abs(p2[7] - (p1[7] << 1) + p0[7]);
62 dq34 =
abs(q2[7] - (
q1[7] << 1) +
q0[7]);
66 p_is_pcm0 = p_is_pcm[0];
67 p_is_pcm4 = p_is_pcm[1];
68 q_is_pcm0 = q_is_pcm[0];
69 q_is_pcm4 = q_is_pcm[1];
71 cmp0 = __msa_fill_d(p_is_pcm0);
72 cmp1 = __msa_fill_d(p_is_pcm4);
73 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
74 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
76 d0030 = (d00 + d30) >= beta;
77 d0434 = (d04 + d34) >= beta;
79 cmp0 = (v2i64) __msa_fill_w(d0030);
80 cmp1 = (v2i64) __msa_fill_w(d0434);
81 cmp3 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
82 cmp3 = (v2i64) __msa_ceqi_w((v4i32) cmp3, 0);
84 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
91 cmp0 = __msa_fill_d(q_is_pcm0);
92 cmp1 = __msa_fill_d(q_is_pcm4);
93 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
94 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
99 tc250 = ((tc0 * 5 + 1) >> 1);
101 tc254 = ((tc4 * 5 + 1) >> 1);
103 cmp0 = (v2i64) __msa_fill_h(tc0);
104 cmp1 = (v2i64) __msa_fill_h(tc4);
107 p3_src, p2_src, p1_src, p0_src);
113 flag0 =
abs(p3[0] - p0[0]) +
abs(q3[0] -
q0[0]) < beta30 &&
114 abs(p0[0] -
q0[0]) < tc250;
115 flag0 = flag0 && (
abs(p3[3] - p0[3]) +
abs(q3[3] -
q0[3]) < beta30 &&
116 abs(p0[3] -
q0[3]) < tc250 && (d00 << 1) < beta20 &&
117 (d30 << 1) < beta20);
119 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
121 q0_src, q1_src, q2_src, q3_src);
122 flag1 =
abs(p3[4] - p0[4]) +
abs(q3[4] -
q0[4]) < beta30 &&
123 abs(p0[4] -
q0[4]) < tc254;
124 flag1 = flag1 && (
abs(p3[7] - p0[7]) +
abs(q3[7] -
q0[7]) < beta30 &&
125 abs(p0[7] -
q0[7]) < tc254 && (d04 << 1) < beta20 &&
126 (d34 << 1) < beta20);
128 cmp0 = (v2i64) __msa_fill_w(flag0);
129 cmp1 = (v2i64) __msa_fill_w(flag1);
130 cmp2 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
131 cmp2 = (v2i64) __msa_ceqi_w((v4i32) cmp2, 0);
133 if (flag0 && flag1) {
139 temp0 = (p1_src + p0_src + q0_src);
140 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
141 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
142 temp2 = (v8i16) (temp1 - p2_src);
143 CLIP_SH(temp2, tc_neg, tc_pos);
144 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
146 temp1 = temp0 + p2_src;
147 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
148 temp2 = (v8i16) (temp1 - p1_src);
149 CLIP_SH(temp2, tc_neg, tc_pos);
150 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
152 temp1 = (temp0 << 1) + p2_src + q1_src;
153 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
154 temp2 = (v8i16) (temp1 - p0_src);
155 CLIP_SH(temp2, tc_neg, tc_pos);
156 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
158 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
159 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
160 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
163 temp0 = (q1_src + p0_src + q0_src);
165 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
166 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
167 temp2 = (v8i16) (temp1 - q2_src);
168 CLIP_SH(temp2, tc_neg, tc_pos);
169 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
171 temp1 = temp0 + q2_src;
172 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
173 temp2 = (v8i16) (temp1 - q1_src);
174 CLIP_SH(temp2, tc_neg, tc_pos);
175 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
177 temp1 = (temp0 << 1) + p1_src + q2_src;
178 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
179 temp2 = (v8i16) (temp1 - q0_src);
180 CLIP_SH(temp2, tc_neg, tc_pos);
181 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
183 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
184 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
185 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
189 dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
192 PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
193 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);
195 dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
196 dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
197 dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);
199 dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
200 dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
206 }
else if (flag0 == flag1) {
210 diff0 = (v8i16) (q0_src - p0_src);
211 diff1 = (v8i16) (q1_src - p1_src);
212 diff0 = (diff0 << 3) + diff0;
213 diff1 = (diff1 << 1) + diff1;
214 delta0 = diff0 - diff1;
215 delta0 = __msa_srari_h(delta0, 4);
217 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
218 abs_delta0 = __msa_add_a_h(delta0, (v8i16)
zero);
219 abs_delta0 = (v8u16) abs_delta0 < temp1;
221 CLIP_SH(delta0, tc_neg, tc_pos);
223 temp2 = (v8i16) (delta0 + p0_src);
225 temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
226 (v16u8) p_is_pcm_vec);
228 temp2 = (v8i16) (q0_src - delta0);
230 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
231 (v16u8) q_is_pcm_vec);
233 p_is_pcm_vec = ~p_is_pcm_vec;
234 q_is_pcm_vec = ~q_is_pcm_vec;
235 tmp = (beta + (beta >> 1)) >> 3;
236 cmp0 = __msa_fill_d(dp00 + dp30 <
tmp);
237 cmp1 = __msa_fill_d(dp04 + dp34 <
tmp);
238 cmp0 = __msa_ilvev_d(cmp1, cmp0);
239 cmp0 = __msa_ceqi_d(cmp0, 0);
240 p_is_pcm_vec = p_is_pcm_vec | cmp0;
242 cmp0 = __msa_fill_d(dq00 + dq30 <
tmp);
243 cmp1 = __msa_fill_d(dq04 + dq34 <
tmp);
244 cmp0 = __msa_ilvev_d(cmp1, cmp0);
245 cmp0 = __msa_ceqi_d(cmp0, 0);
246 q_is_pcm_vec = q_is_pcm_vec | cmp0;
251 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
252 delta1 -= (v8i16) p1_src;
255 CLIP_SH(delta1, tc_neg, tc_pos);
256 delta1 = (v8i16) p1_src + (v8i16) delta1;
258 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
259 (v16u8) p_is_pcm_vec);
261 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
262 delta2 = delta2 - (v8i16) q1_src;
263 delta2 = delta2 - delta0;
264 delta2 = delta2 >> 1;
265 CLIP_SH(delta2, tc_neg, tc_pos);
266 delta2 = (v8i16) q1_src + (v8i16) delta2;
268 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
269 (v16u8) q_is_pcm_vec);
271 dst1 = (v16u8) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
273 dst2 = (v16u8) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
275 dst3 = (v16u8) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
277 dst4 = (v16u8) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
283 PCKEV_B2_UB(p0_src, p1_src, q1_src, q0_src, dst2, dst3);
285 dst0 = __msa_bmz_v(dst0, dst2, (v16u8) cmp3);
286 dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);
297 temp0 = (p1_src + p0_src + q0_src);
298 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
299 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
300 temp2 = (v8i16) (temp1 - p2_src);
301 CLIP_SH(temp2, tc_neg, tc_pos);
302 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
304 temp1 = temp0 + p2_src;
305 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
306 temp2 = (v8i16) (temp1 - p1_src);
307 CLIP_SH(temp2, tc_neg, tc_pos);
308 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
310 temp1 = (temp0 << 1) + p2_src + q1_src;
311 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
312 temp2 = (v8i16) (temp1 - p0_src);
313 CLIP_SH(temp2, tc_neg, tc_pos);
314 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
316 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
317 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
318 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
321 temp0 = (q1_src + p0_src + q0_src);
323 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
324 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
325 temp2 = (v8i16) (temp1 - q2_src);
326 CLIP_SH(temp2, tc_neg, tc_pos);
327 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
329 temp1 = temp0 + q2_src;
330 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
331 temp2 = (v8i16) (temp1 - q1_src);
332 CLIP_SH(temp2, tc_neg, tc_pos);
333 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
335 temp1 = (temp0 << 1) + p1_src + q2_src;
336 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
337 temp2 = (v8i16) (temp1 - q0_src);
338 CLIP_SH(temp2, tc_neg, tc_pos);
339 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
341 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
342 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
343 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
347 dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
354 diff0 = (v8i16) (q0_src - p0_src);
355 diff1 = (v8i16) (q1_src - p1_src);
356 diff0 = (diff0 << 3) + diff0;
357 diff1 = (diff1 << 1) + diff1;
358 delta0 = diff0 - diff1;
359 delta0 = __msa_srari_h(delta0, 4);
361 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
362 abs_delta0 = __msa_add_a_h(delta0, (v8i16)
zero);
363 abs_delta0 = (v8u16) abs_delta0 < temp1;
365 CLIP_SH(delta0, tc_neg, tc_pos);
367 temp2 = (v8i16) (delta0 + p0_src);
369 temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
370 (v16u8) p_is_pcm_vec);
372 temp2 = (v8i16) (q0_src - delta0);
374 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
375 (v16u8) q_is_pcm_vec);
377 p_is_pcm_vec = ~p_is_pcm_vec;
378 q_is_pcm_vec = ~q_is_pcm_vec;
379 tmp = (beta + (beta >> 1)) >> 3;
380 cmp0 = __msa_fill_d(dp00 + dp30 <
tmp);
381 cmp1 = __msa_fill_d(dp04 + dp34 <
tmp);
382 cmp0 = __msa_ilvev_d(cmp1, cmp0);
383 p_is_pcm_vec = p_is_pcm_vec | __msa_ceqi_d(cmp0, 0);
385 cmp0 = __msa_fill_d(dq00 + dq30 <
tmp);
386 cmp1 = __msa_fill_d(dq04 + dq34 <
tmp);
387 cmp0 = __msa_ilvev_d(cmp1, cmp0);
388 q_is_pcm_vec = q_is_pcm_vec | __msa_ceqi_d(cmp0, 0);
393 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
394 delta1 -= (v8i16) p1_src;
397 CLIP_SH(delta1, tc_neg, tc_pos);
398 delta1 = (v8i16) p1_src + (v8i16) delta1;
400 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
401 (v16u8) p_is_pcm_vec);
403 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
404 delta2 = delta2 - (v8i16) q1_src;
405 delta2 = delta2 - delta0;
406 delta2 = delta2 >> 1;
407 CLIP_SH(delta2, tc_neg, tc_pos);
408 delta2 = (v8i16) q1_src + (v8i16) delta2;
410 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
411 (v16u8) q_is_pcm_vec);
413 delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
415 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
417 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
419 delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
424 PCKEV_B2_UB(delta1, p2_src, temp2, temp0, dst3, dst4);
425 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) delta2);
428 dst0 = __msa_bmnz_v(dst0, dst3, (v16u8) cmp2);
429 dst1 = __msa_bmnz_v(dst1, dst4, (v16u8) cmp2);
430 dst2 = __msa_bmnz_v(dst2, dst5, (v16u8) cmp2);
433 PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
434 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);
436 dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
437 dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
438 dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);
440 dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
441 dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
461 int32_t dp00, dq00, dp30, dq30, d00, d30;
463 int32_t dp04, dq04, dp34, dq34, d04, d34;
464 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
465 int32_t tc4, p_is_pcm4, q_is_pcm4, tc254,
tmp;
466 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
467 v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
471 v8i16 tc_pos, tc_neg;
472 v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
474 v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
476 dp00 =
abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
477 dq00 =
abs(p3[2] - (p3[1] << 1) + p3[0]);
478 dp30 =
abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
479 dq30 =
abs(p2[2] - (p2[1] << 1) + p2[0]);
482 p_is_pcm0 = p_is_pcm[0];
483 q_is_pcm0 = q_is_pcm[0];
485 dp04 =
abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
486 dq04 =
abs(p1[2] - (p1[1] << 1) + p1[0]);
487 dp34 =
abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
488 dq34 =
abs(p0[2] - (p0[1] << 1) + p0[0]);
491 p_is_pcm4 = p_is_pcm[1];
492 q_is_pcm4 = q_is_pcm[1];
494 cmp0 = __msa_fill_d(p_is_pcm0);
495 cmp1 = __msa_fill_d(p_is_pcm4);
496 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
497 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
499 d0030 = (d00 + d30) >= beta;
500 d0434 = (d04 + d34) >= beta;
502 cmp0 = __msa_fill_d(d0030);
503 cmp1 = __msa_fill_d(d0434);
504 cmp3 = __msa_ilvev_d(cmp1, cmp0);
505 cmp3 = (v2i64) __msa_ceqi_d(cmp3, 0);
507 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
508 (!d0030 || !d0434)) {
513 cmp0 = __msa_fill_d(q_is_pcm0);
514 cmp1 = __msa_fill_d(q_is_pcm4);
515 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
516 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
521 tc250 = ((tc0 * 5 + 1) >> 1);
524 tc254 = ((tc4 * 5 + 1) >> 1);
525 cmp0 = (v2i64) __msa_fill_h(tc0 << 1);
526 cmp1 = (v2i64) __msa_fill_h(tc4 << 1);
527 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
530 q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
531 q0_src, q1_src, q2_src, q3_src);
533 flag0 =
abs(p3[-4] - p3[-1]) +
abs(p3[3] - p3[0]) < beta30 &&
534 abs(p3[-1] - p3[0]) < tc250;
535 flag0 = flag0 && (
abs(p2[-4] - p2[-1]) +
abs(p2[3] - p2[0]) < beta30 &&
536 abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
537 (d30 << 1) < beta20);
538 cmp0 = __msa_fill_d(flag0);
540 p3_src, p2_src, p1_src, p0_src);
542 flag1 =
abs(p1[-4] - p1[-1]) +
abs(p1[3] - p1[0]) < beta30 &&
543 abs(p1[-1] - p1[0]) < tc254;
544 flag1 = flag1 && (
abs(p0[-4] - p0[-1]) +
abs(p0[3] - p0[0]) < beta30 &&
545 abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
546 (d34 << 1) < beta20);
548 q0_src, q1_src, q2_src, q3_src);
550 cmp1 = __msa_fill_d(flag1);
551 cmp2 = __msa_ilvev_d(cmp1, cmp0);
552 cmp2 = __msa_ceqi_d(cmp2, 0);
554 if (flag0 && flag1) {
559 temp0 = (p1_src + p0_src + q0_src);
561 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
562 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
563 temp2 = (v8i16) (temp1 - p2_src);
564 CLIP_SH(temp2, tc_neg, tc_pos);
565 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
567 temp1 = temp0 + p2_src;
568 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
569 temp2 = (v8i16) (temp1 - p1_src);
570 CLIP_SH(temp2, tc_neg, tc_pos);
571 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
573 temp1 = (temp0 << 1) + p2_src + q1_src;
574 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
575 temp2 = (v8i16) (temp1 - p0_src);
576 CLIP_SH(temp2, tc_neg, tc_pos);
577 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
579 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
580 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
581 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
584 temp0 = (q1_src + p0_src + q0_src);
585 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
586 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
587 temp2 = (v8i16) (temp1 - q2_src);
588 CLIP_SH(temp2, tc_neg, tc_pos);
589 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
591 temp1 = temp0 + q2_src;
592 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
593 temp2 = (v8i16) (temp1 - q1_src);
594 CLIP_SH(temp2, tc_neg, tc_pos);
595 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
597 temp1 = (temp0 << 1) + p1_src + q2_src;
598 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
599 temp2 = (v8i16) (temp1 - q0_src);
600 CLIP_SH(temp2, tc_neg, tc_pos);
601 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
603 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
604 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
605 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
607 }
else if (flag0 == flag1) {
612 diff0 = (v8i16) (q0_src - p0_src);
613 diff1 = (v8i16) (q1_src - p1_src);
614 diff0 = (diff0 << 3) + diff0;
615 diff1 = (diff1 << 1) + diff1;
616 delta0 = diff0 - diff1;
617 delta0 = __msa_srari_h(delta0, 4);
619 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
620 abs_delta0 = __msa_add_a_h(delta0, (v8i16)
zero);
621 abs_delta0 = (v8u16) abs_delta0 < temp1;
623 CLIP_SH(delta0, tc_neg, tc_pos);
624 temp2 = (v8i16) (delta0 + p0_src);
626 temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
627 (v16u8) p_is_pcm_vec);
629 temp2 = (v8i16) (q0_src - delta0);
631 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
632 (v16u8) q_is_pcm_vec);
634 tmp = ((beta + (beta >> 1)) >> 3);
635 cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) <
tmp));
636 cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) <
tmp));
637 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
638 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
640 cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 <
tmp));
641 cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 <
tmp));
642 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
643 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
648 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
649 delta1 -= (v8i16) p1_src;
652 CLIP_SH(delta1, tc_neg, tc_pos);
653 delta1 = (v8i16) p1_src + (v8i16) delta1;
655 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
656 (v16u8) p_is_pcm_vec);
658 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
659 delta2 = delta2 - (v8i16) q1_src;
660 delta2 = delta2 - delta0;
661 delta2 = delta2 >> 1;
662 CLIP_SH(delta2, tc_neg, tc_pos);
663 delta2 = (v8i16) q1_src + (v8i16) delta2;
665 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
666 (v16u8) q_is_pcm_vec);
668 dst0 = __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
670 dst1 = __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
672 dst2 = __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
674 dst3 = __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
678 dst0 = __msa_bmz_v(dst0, (v16u8) p1_src, (v16u8) cmp3);
679 dst1 = __msa_bmz_v(dst1, (v16u8) p0_src, (v16u8) cmp3);
680 dst2 = __msa_bmz_v(dst2, (v16u8) q0_src, (v16u8) cmp3);
681 dst3 = __msa_bmz_v(dst3, (v16u8) q1_src, (v16u8) cmp3);
691 tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
692 tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
698 tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
699 tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
705 tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
706 tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
712 tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
713 tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
724 temp0 = (p1_src + p0_src + q0_src);
726 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
727 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
728 temp2 = (v8i16) (temp1 - p2_src);
729 CLIP_SH(temp2, tc_neg, tc_pos);
730 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
732 temp1 = temp0 + p2_src;
733 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
734 temp2 = (v8i16) (temp1 - p1_src);
735 CLIP_SH(temp2, tc_neg, tc_pos);
736 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
738 temp1 = (temp0 << 1) + p2_src + q1_src;
739 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
740 temp2 = (v8i16) (temp1 - p0_src);
741 CLIP_SH(temp2, tc_neg, tc_pos);
742 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
744 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
745 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
746 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
749 temp0 = (q1_src + p0_src + q0_src);
750 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
751 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
752 temp2 = (v8i16) (temp1 - q2_src);
753 CLIP_SH(temp2, tc_neg, tc_pos);
754 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
756 temp1 = temp0 + q2_src;
757 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
758 temp2 = (v8i16) (temp1 - q1_src);
759 CLIP_SH(temp2, tc_neg, tc_pos);
760 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
762 temp1 = (temp0 << 1) + p1_src + q2_src;
763 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
764 temp2 = (v8i16) (temp1 - q0_src);
765 CLIP_SH(temp2, tc_neg, tc_pos);
766 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
768 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
769 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
770 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
777 diff0 = (v8i16) (q0_src - p0_src);
778 diff1 = (v8i16) (q1_src - p1_src);
779 diff0 = (diff0 << 3) + diff0;
780 diff1 = (diff1 << 1) + diff1;
781 delta0 = diff0 - diff1;
782 delta0 = __msa_srari_h(delta0, 4);
784 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
785 abs_delta0 = __msa_add_a_h(delta0, (v8i16)
zero);
786 abs_delta0 = (v8u16) abs_delta0 < temp1;
788 CLIP_SH(delta0, tc_neg, tc_pos);
790 temp2 = (v8i16) (delta0 + p0_src);
792 temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
793 (v16u8) p_is_pcm_vec);
795 temp2 = (v8i16) (q0_src - delta0);
797 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
798 (v16u8) q_is_pcm_vec);
800 tmp = (beta + (beta >> 1)) >> 3;
801 cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) <
tmp));
802 cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) <
tmp));
803 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
804 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
806 cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 <
tmp));
807 cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 <
tmp));
808 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
809 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
814 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
815 delta1 -= (v8i16) p1_src;
818 CLIP_SH(delta1, tc_neg, tc_pos);
819 delta1 = (v8i16) p1_src + (v8i16) delta1;
821 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
822 (v16u8) p_is_pcm_vec);
824 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
825 delta2 = delta2 - (v8i16) q1_src;
826 delta2 = delta2 - delta0;
827 delta2 = delta2 >> 1;
828 CLIP_SH(delta2, tc_neg, tc_pos);
829 delta2 = (v8i16) q1_src + (v8i16) delta2;
831 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
832 (v16u8) q_is_pcm_vec);
833 delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
835 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
837 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
839 delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
844 dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
845 dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
846 dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
847 dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
848 dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
849 dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
852 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp3);
853 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp3);
854 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp3);
855 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp3);
856 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp3);
857 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp3);
860 PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst4, dst4, dst5, dst5, dst0, dst1,
871 tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
872 tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
873 tmp0 = __msa_copy_u_h((v8i16) dst2, 0);
874 tmp1 = __msa_copy_u_h((v8i16) dst2, 2);
882 tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
883 tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
884 tmp0 = __msa_copy_u_h((v8i16) dst2, 4);
885 tmp1 = __msa_copy_u_h((v8i16) dst2, 6);
893 tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
894 tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
895 tmp0 = __msa_copy_u_h((v8i16) dst3, 0);
896 tmp1 = __msa_copy_u_h((v8i16) dst3, 2);
904 tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
905 tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
906 tmp0 = __msa_copy_u_h((v8i16) dst3, 4);
907 tmp1 = __msa_copy_u_h((v8i16) dst3, 6);
924 v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
925 v8u16 p1, p0,
q0,
q1;
926 v8i16 tc_pos, tc_neg;
928 v8i16 temp0, temp1,
delta;
930 if (!(
tc[0] <= 0) || !(
tc[1] <= 0)) {
931 cmp0 = (v2i64) __msa_fill_h(
tc[0]);
932 cmp1 = (v2i64) __msa_fill_h(
tc[1]);
933 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
936 cmp0 = __msa_fill_d(p_is_pcm[0]);
937 cmp1 = __msa_fill_d(p_is_pcm[1]);
938 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
939 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
941 cmp0 = __msa_fill_d(q_is_pcm[0]);
942 cmp1 = __msa_fill_d(q_is_pcm[1]);
943 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
944 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
951 ILVR_B4_UH(
zero, p1,
zero, p0,
zero,
q0,
zero,
q1, p1, p0,
q0,
q1);
953 temp0 = (v8i16) (
q0 - p0);
954 temp1 = (v8i16) (p1 -
q1);
957 delta = __msa_srari_h((v8i16) temp0, 3);
960 temp0 = (v8i16) ((v8i16) p0 +
delta);
962 temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
963 (v16u8) p_is_pcm_vec);
965 temp1 = (v8i16) ((v8i16)
q0 -
delta);
967 temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8)
q0,
968 (v16u8) q_is_pcm_vec);
970 tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
971 temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
972 temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8)
q0, (v16u8) tc_pos);
974 temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
983 v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
984 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
985 v8u16 p1, p0,
q0,
q1;
986 v8i16 tc_pos, tc_neg;
988 v8i16 temp0, temp1,
delta;
990 if (!(
tc[0] <= 0) || !(
tc[1] <= 0)) {
991 cmp0 = (v2i64) __msa_fill_h(
tc[0]);
992 cmp1 = (v2i64) __msa_fill_h(
tc[1]);
993 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
996 cmp0 = __msa_fill_d(p_is_pcm[0]);
997 cmp1 = __msa_fill_d(p_is_pcm[1]);
998 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
999 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
1001 cmp0 = __msa_fill_d(q_is_pcm[0]);
1002 cmp1 = __msa_fill_d(q_is_pcm[1]);
1003 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
1004 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
1010 ILVR_B4_UH(
zero, p1,
zero, p0,
zero,
q0,
zero,
q1, p1, p0,
q0,
q1);
1012 temp0 = (v8i16) (
q0 - p0);
1013 temp1 = (v8i16) (p1 -
q1);
1016 delta = __msa_srari_h((v8i16) temp0, 3);
1019 temp0 = (v8i16) ((v8i16) p0 +
delta);
1021 temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
1022 (v16u8) p_is_pcm_vec);
1024 temp1 = (v8i16) ((v8i16)
q0 -
delta);
1026 temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8)
q0,
1027 (v16u8) q_is_pcm_vec);
1029 tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
1030 temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
1031 temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8)
q0, (v16u8) tc_pos);
1033 temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);
1036 ST_H8(temp0, 0, 1, 2, 3, 4, 5, 6, 7,
src,
stride);
1043 int16_t *sao_offset_val,
1047 v16i8 src0_r, src1_r;
1049 v16i8 dst0, offset0, offset1;
1052 offset_val =
LD_SB(sao_offset_val + 1);
1053 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
1055 offset_val = __msa_pckev_b(offset_val, offset_val);
1056 offset1 = (v16i8) __msa_insve_w((v4i32)
zero, 3, (v4i32) offset_val);
1057 offset0 = __msa_sld_b(offset1,
zero, 28 - ((sao_left_class) & 31));
1058 offset1 = __msa_sld_b(
zero, offset1, 28 - ((sao_left_class) & 31));
1063 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
1064 SWAP(offset0, offset1);
1068 src += (4 * src_stride);
1072 src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
1073 mask = __msa_srli_b(src0_r, 3);
1074 offset = __msa_vshf_b(
mask, offset1, offset0);
1076 src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
1077 dst0 = __msa_adds_s_b(src0_r,
offset);
1078 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1084 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
1085 dst += (4 * dst_stride);
1090 src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
1091 mask = __msa_srli_b(src0_r, 3);
1092 offset = __msa_vshf_b(
mask, offset1, offset0);
1094 src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
1095 dst0 = __msa_adds_s_b(src0_r,
offset);
1096 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1099 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
1105 int16_t *sao_offset_val,
1109 v16i8 src0_r, src1_r, mask0, mask1;
1110 v16i8 offset_mask0, offset_mask1, offset_val;
1111 v16i8 offset0, offset1, dst0, dst1;
1114 offset_val =
LD_SB(sao_offset_val + 1);
1115 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
1116 offset_val = __msa_pckev_b(offset_val, offset_val);
1117 offset1 = (v16i8) __msa_insve_w((v4i32)
zero, 3, (v4i32) offset_val);
1118 offset0 = __msa_sld_b(offset1,
zero, 28 - ((sao_left_class) & 31));
1119 offset1 = __msa_sld_b(
zero, offset1, 28 - ((sao_left_class) & 31));
1124 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
1125 SWAP(offset0, offset1);
1129 src += src_stride << 2;
1133 mask0 = __msa_srli_b(src0_r, 3);
1134 mask1 = __msa_srli_b(src1_r, 3);
1136 offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
1137 offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);
1144 dst0 = __msa_adds_s_b(src0_r, offset_mask0);
1145 dst1 = __msa_adds_s_b(src1_r, offset_mask1);
1150 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
1151 dst += dst_stride << 2;
1156 mask0 = __msa_srli_b(src0_r, 3);
1157 mask1 = __msa_srli_b(src1_r, 3);
1159 offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
1160 offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);
1164 dst0 = __msa_adds_s_b(src0_r, offset_mask0);
1165 dst1 = __msa_adds_s_b(src1_r, offset_mask1);
1170 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
1178 int16_t *sao_offset_val,
1183 v16i8 out0, out1, out2, out3;
1184 v16i8 mask0, mask1, mask2, mask3;
1185 v16i8 tmp0, tmp1, tmp2, tmp3, offset_val;
1186 v16i8 offset0, offset1;
1189 offset_val =
LD_SB(sao_offset_val + 1);
1190 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
1191 offset_val = __msa_pckev_b(offset_val, offset_val);
1192 offset1 = (v16i8) __msa_insve_w((v4i32)
zero, 3, (v4i32) offset_val);
1193 offset0 = __msa_sld_b(offset1,
zero, 28 - ((sao_left_class) & 31));
1194 offset1 = __msa_sld_b(
zero, offset1, 28 - ((sao_left_class) & 31));
1196 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
1197 SWAP(offset0, offset1);
1204 for (w_cnt = 16; w_cnt <
width; w_cnt += 16) {
1205 mask0 = __msa_srli_b((v16i8)
src0, 3);
1206 mask1 = __msa_srli_b((v16i8)
src1, 3);
1207 mask2 = __msa_srli_b((v16i8) src2, 3);
1208 mask3 = __msa_srli_b((v16i8) src3, 3);
1210 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1,
1212 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3,
1216 out0 = __msa_adds_s_b((v16i8)
src0, tmp0);
1217 out1 = __msa_adds_s_b((v16i8)
src1, tmp1);
1218 out2 = __msa_adds_s_b((v16i8) src2, tmp2);
1219 out3 = __msa_adds_s_b((v16i8) src3, tmp3);
1226 ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);
1229 mask0 = __msa_srli_b((v16i8)
src0, 3);
1230 mask1 = __msa_srli_b((v16i8)
src1, 3);
1231 mask2 = __msa_srli_b((v16i8) src2, 3);
1232 mask3 = __msa_srli_b((v16i8) src3, 3);
1234 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1, tmp0,
1236 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3, tmp2,
1240 out0 = __msa_adds_s_b((v16i8)
src0, tmp0);
1241 out1 = __msa_adds_s_b((v16i8)
src1, tmp1);
1242 out2 = __msa_adds_s_b((v16i8) src2, tmp2);
1243 out3 = __msa_adds_s_b((v16i8) src3, tmp3);
1247 ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);
1249 src += src_stride << 2;
1250 dst += dst_stride << 2;
1259 int16_t *sao_offset_val,
1262 uint32_t dst_val0, dst_val1;
1263 v16u8 cmp_minus10, diff_minus10, diff_minus11, src_minus10, src_minus11;
1264 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1265 v16i8 sao_offset =
LD_SB(sao_offset_val);
1267 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1270 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1274 LD_UB2(
src, src_stride, src_minus10, src_minus11);
1277 src += (2 * src_stride);
1279 src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
1280 (v2i64) src_minus10);
1282 src0 = (v16i8) __msa_sldi_b(
zero, (v16i8) src_minus10, 1);
1283 src_plus10 = (v16i8) __msa_sldi_b(
zero, (v16i8) src_minus10, 2);
1285 cmp_minus10 = ((v16u8)
src0 == src_minus10);
1286 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1287 cmp_minus10 = (src_minus10 < (v16u8)
src0);
1288 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1290 cmp_minus10 = ((v16u8)
src0 == (v16u8) src_plus10);
1291 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1292 cmp_minus10 = ((v16u8) src_plus10 < (v16u8)
src0);
1293 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1295 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1298 LD_UB2(
src, src_stride, src_minus10, src_minus11);
1303 src0 = (v16i8) __msa_xori_b((v16u8)
src0, 128);
1305 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1307 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1308 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1315 src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
1316 (v2i64) src_minus10);
1318 src0 = (v16i8) __msa_sldi_b(
zero, (v16i8) src_minus10, 1);
1319 src_plus10 = (v16i8) __msa_sldi_b(
zero, (v16i8) src_minus10, 2);
1321 cmp_minus10 = ((v16u8)
src0 == src_minus10);
1322 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1323 cmp_minus10 = (src_minus10 < (v16u8)
src0);
1324 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1326 cmp_minus10 = ((v16u8)
src0 == (v16u8) src_plus10);
1327 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1328 cmp_minus10 = ((v16u8) src_plus10 < (v16u8)
src0);
1329 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1331 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1335 src0 = (v16i8) __msa_xori_b((v16u8)
src0, 128);
1337 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1339 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1340 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1351 int16_t *sao_offset_val,
1354 uint64_t dst_val0, dst_val1;
1355 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1356 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1357 v16u8 cmp_minus10, diff_minus10, diff_minus11;
1358 v16u8
src0,
src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
1360 v16i8 zeros = { 0 };
1362 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1366 LD_UB2(
src, src_stride, src_minus10, src_minus11);
1369 src += (src_stride << 1);
1372 SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10, src_plus11);
1374 PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10,
1375 src_minus10, src_plus10);
1376 src0 = (v16u8) __msa_pckev_d((v2i64)
src1, (v2i64)
src0);
1378 cmp_minus10 = (
src0 == src_minus10);
1379 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1380 cmp_minus10 = (src_minus10 <
src0);
1381 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1383 cmp_minus10 = (
src0 == src_plus10);
1384 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1385 cmp_minus10 = (src_plus10 <
src0);
1386 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1388 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1391 LD_UB2(
src, src_stride, src_minus10, src_minus11);
1397 dst0 = (v16u8) __msa_adds_s_b((v16i8)
src0,
offset);
1398 dst0 = __msa_xori_b(dst0, 128);
1400 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1401 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1409 SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10, src_plus11);
1411 PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, src_minus10,
1413 src0 = (v16u8) __msa_pckev_d((v2i64)
src1, (v2i64)
src0);
1415 cmp_minus10 = ((v16u8)
src0 == src_minus10);
1416 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1417 cmp_minus10 = (src_minus10 < (v16u8)
src0);
1418 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1420 cmp_minus10 = (
src0 == src_plus10);
1421 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1422 cmp_minus10 = (src_plus10 <
src0);
1423 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1425 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1431 dst0 = (v16u8) __msa_adds_s_b((v16i8)
src0,
offset);
1432 dst0 = __msa_xori_b(dst0, 128);
1434 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1435 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1445 int16_t *sao_offset_val,
1449 uint8_t *dst_ptr, *src_minus1;
1451 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1452 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1454 v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1455 v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1456 v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1458 v16u8 src10, src11, src12, src13, dst0, dst1, dst2, dst3;
1459 v16u8 src_minus10, src_minus11, src_minus12, src_minus13;
1460 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
1461 v16i8 src_zero0, src_zero1, src_zero2, src_zero3;
1462 v16i8 src_plus10, src_plus11, src_plus12, src_plus13;
1464 sao_offset =
LD_SB(sao_offset_val);
1465 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1468 src_minus1 =
src - 1;
1469 LD_UB4(src_minus1, src_stride,
1470 src_minus10, src_minus11, src_minus12, src_minus13);
1472 for (v_cnt = 0; v_cnt <
width; v_cnt += 16) {
1474 dst_ptr = dst + v_cnt;
1475 LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);
1477 SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
1478 src12, src_minus12, src13, src_minus13, 1,
1479 src_zero0, src_zero1, src_zero2, src_zero3);
1480 SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
1481 src12, src_minus12, src13, src_minus13, 2,
1482 src_plus10, src_plus11, src_plus12, src_plus13);
1484 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1485 cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
1486 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1487 cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
1488 cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
1489 cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
1490 cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
1491 cmp_plus13 = ((v16u8) src_zero3 == (v16u8) src_plus13);
1493 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1494 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
1495 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1496 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
1497 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
1498 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
1499 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
1500 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
1502 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1503 cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
1504 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1505 cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
1506 cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
1507 cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
1508 cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
1509 cmp_plus13 = ((v16u8) src_plus13 < (v16u8) src_zero3);
1511 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1512 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
1513 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1514 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
1515 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
1516 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
1517 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
1518 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
1520 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
1521 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
1522 offset_mask0, offset_mask0, offset_mask0);
1523 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
1524 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
1525 offset_mask1, offset_mask1, offset_mask1);
1526 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
1527 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask2,
1528 offset_mask2, offset_mask2, offset_mask2);
1529 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
1530 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask3,
1531 offset_mask3, offset_mask3, offset_mask3);
1535 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
1536 dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
1537 dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
1538 dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
1542 src_minus10 = src10;
1543 ST_UB(dst0, dst_ptr);
1544 src_minus11 = src11;
1545 ST_UB(dst1, dst_ptr + dst_stride);
1546 src_minus12 = src12;
1547 ST_UB(dst2, dst_ptr + (dst_stride << 1));
1548 src_minus13 = src13;
1549 ST_UB(dst3, dst_ptr + (dst_stride * 3));
1552 src += (src_stride << 2);
1553 dst += (dst_stride << 2);
1561 int16_t *sao_offset_val,
1564 uint32_t dst_val0, dst_val1;
1565 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1566 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1568 v16i8 sao_offset =
LD_SB(sao_offset_val);
1569 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1570 v16u8 src_minus10, src_minus11, src10, src11;
1571 v16i8 src_zero0, src_zero1;
1573 v8i16 offset_mask0, offset_mask1;
1575 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1578 LD_UB2(
src - src_stride, src_stride, src_minus10, src_minus11);
1579 LD_UB2(
src + src_stride, src_stride, src10, src11);
1582 src += (src_stride << 1);
1584 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1585 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1586 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1587 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1589 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1590 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1591 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1592 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1594 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1595 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1596 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1597 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1599 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1600 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1602 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1603 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1608 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1609 dst0 = __msa_adds_s_b(dst0,
offset);
1610 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1612 src_minus10 = src10;
1613 src_minus11 = src11;
1616 LD_UB2(
src + src_stride, src_stride, src10, src11);
1618 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1619 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1627 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1628 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1629 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1630 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1632 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1633 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1634 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1635 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1637 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1638 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1639 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1640 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1642 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1643 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1645 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1646 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1651 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1652 dst0 = __msa_adds_s_b(dst0,
offset);
1653 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1655 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1656 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1666 int16_t *sao_offset_val,
1669 uint64_t dst_val0, dst_val1;
1670 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1671 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1673 v16i8 src_zero0, src_zero1, dst0;
1674 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1675 v16u8 src_minus10, src_minus11, src10, src11;
1676 v8i16 offset_mask0, offset_mask1;
1678 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1681 LD_UB2(
src - src_stride, src_stride, src_minus10, src_minus11);
1682 LD_UB2(
src + src_stride, src_stride, src10, src11);
1685 src += (src_stride << 1);
1687 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1688 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1689 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1690 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1692 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1693 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1694 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1695 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1697 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1698 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1699 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1700 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1702 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1703 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1705 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1706 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1711 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1712 dst0 = __msa_adds_s_b(dst0,
offset);
1713 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1715 src_minus10 = src10;
1716 src_minus11 = src11;
1719 LD_UB2(
src + src_stride, src_stride, src10, src11);
1721 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1722 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1729 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1730 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1731 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1732 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1734 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1735 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1736 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1737 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1739 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1740 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1741 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1742 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1744 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1745 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1747 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1748 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1753 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1754 dst0 = __msa_adds_s_b(dst0,
offset);
1755 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1757 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1758 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1776 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1777 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1778 v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1779 v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1780 v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1782 v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
1783 v16u8 src12, dst2, src13, dst3;
1784 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
1786 sao_offset =
LD_SB(sao_offset_val);
1787 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1789 for (v_cnt = 0; v_cnt <
width; v_cnt += 16) {
1790 src = src_orig + v_cnt;
1791 dst = dst_orig + v_cnt;
1793 LD_UB2(
src - src_stride, src_stride, src_minus10, src_minus11);
1795 for (h_cnt = (
height >> 2); h_cnt--;) {
1796 LD_UB4(
src + src_stride, src_stride, src10, src11, src12, src13);
1798 cmp_minus10 = (src_minus11 == src_minus10);
1799 cmp_plus10 = (src_minus11 == src10);
1800 cmp_minus11 = (src10 == src_minus11);
1801 cmp_plus11 = (src10 == src11);
1802 cmp_minus12 = (src11 == src10);
1803 cmp_plus12 = (src11 == src12);
1804 cmp_minus13 = (src12 == src11);
1805 cmp_plus13 = (src12 == src13);
1807 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1808 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
1809 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1810 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
1811 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
1812 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
1813 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
1814 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
1816 cmp_minus10 = (src_minus10 < src_minus11);
1817 cmp_plus10 = (src10 < src_minus11);
1818 cmp_minus11 = (src_minus11 < src10);
1819 cmp_plus11 = (src11 < src10);
1820 cmp_minus12 = (src10 < src11);
1821 cmp_plus12 = (src12 < src11);
1822 cmp_minus13 = (src11 < src12);
1823 cmp_plus13 = (src13 < src12);
1825 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1826 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
1827 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1828 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
1829 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
1830 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
1831 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
1832 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
1834 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
1835 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1836 offset_mask0, offset_mask0, offset_mask0, offset_mask0);
1837 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
1838 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1839 offset_mask1, offset_mask1, offset_mask1, offset_mask1);
1840 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
1841 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1842 offset_mask2, offset_mask2, offset_mask2, offset_mask2);
1843 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
1844 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1845 offset_mask3, offset_mask3, offset_mask3, offset_mask3);
1847 src_minus10 = src12;
1850 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_minus11, offset_mask0);
1851 dst1 = (v16u8) __msa_adds_s_b((v16i8) src10, offset_mask1);
1852 dst2 = (v16u8) __msa_adds_s_b((v16i8) src11, offset_mask2);
1853 dst3 = (v16u8) __msa_adds_s_b((v16i8) src12, offset_mask3);
1856 src_minus11 = src13;
1858 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
1860 src += (src_stride << 2);
1861 dst += (dst_stride << 2);
1870 int16_t *sao_offset_val,
1874 uint32_t dst_val0, dst_val1;
1875 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1876 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1878 v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
1879 v16u8 src_minus11, src10, src11;
1880 v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
1881 v8i16 offset_mask0, offset_mask1;
1882 v16i8 zeros = { 0 };
1884 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1889 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
1890 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
1893 src_orig += (src_stride << 1);
1895 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
1896 SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);
1898 ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
1900 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
1903 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1904 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1905 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1906 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1908 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1909 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1910 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1911 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1913 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1914 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1916 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1917 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1922 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1923 dst0 = __msa_adds_s_b(dst0,
offset);
1924 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1926 src_minus10 = src10;
1927 src_minus11 = src11;
1930 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
1932 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1933 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1941 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
1942 SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);
1944 ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
1946 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
1949 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1950 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1951 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1952 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1954 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1955 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1956 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1957 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1959 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1960 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1962 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1963 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1968 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1969 dst0 = __msa_adds_s_b(dst0,
offset);
1970 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1972 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1973 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1983 int16_t *sao_offset_val,
1987 uint64_t dst_val0, dst_val1;
1988 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1989 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1991 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1992 v16u8 src_minus10, src10, src_minus11, src11;
1993 v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0;
1994 v8i16 offset_mask0, offset_mask1;
1995 v16i8 zeros = { 0 };
1997 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2001 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2002 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2005 src_orig += (src_stride << 1);
2007 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
2008 SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);
2010 ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
2011 src_minus10, src_minus11);
2012 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
2013 src_zero0, src_zero1);
2015 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2016 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2017 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2018 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2020 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2021 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2022 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2023 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2025 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2026 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2028 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2029 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2034 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2035 dst0 = __msa_adds_s_b(dst0,
offset);
2036 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2038 src_minus10 = src10;
2039 src_minus11 = src11;
2042 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2044 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2045 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
2052 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
2053 SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);
2054 ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, src_minus10,
2056 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2059 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2060 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2061 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2062 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2064 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2065 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2066 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2067 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2069 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2070 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2072 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2073 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2078 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2079 dst0 = __msa_adds_s_b(dst0,
offset);
2080 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2082 src_minus10 = src10;
2083 src_minus11 = src11;
2086 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2088 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2089 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
2107 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2108 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2109 v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
2110 v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
2111 v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
2112 v16u8 diff_plus13, src_minus14, src_plus13;
2113 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
2114 v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
2115 v16u8 src12, src_minus12, dst2, src13, src_minus13, dst3;
2116 v16i8 src_zero0, src_plus10, src_zero1, src_plus11, src_zero2, src_plus12;
2117 v16i8 src_zero3, sao_offset;
2119 sao_offset =
LD_SB(sao_offset_val);
2120 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2125 LD_UB4(src_orig, src_stride, src_minus11, src_minus12, src_minus13,
2128 for (v_cnt = 0; v_cnt <
width; v_cnt += 16) {
2129 src_minus10 =
LD_UB(src_orig - src_stride);
2130 LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
2131 src_plus13 =
LD_UB(
src + 1 + v_cnt + (src_stride << 2));
2134 SLDI_B4_SB(src10, src_minus11, src11, src_minus12,
2135 src12, src_minus13, src13, src_minus14, 1,
2136 src_zero0, src_zero1, src_zero2, src_zero3);
2137 SLDI_B2_SB(src11, src_minus12, src12, src_minus13, 2, src_plus10,
2140 src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);
2142 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2143 cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
2144 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2145 cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
2146 cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
2147 cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
2148 cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
2149 cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
2151 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2152 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
2153 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2154 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
2155 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
2156 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
2157 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
2158 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
2160 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2161 cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
2162 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2163 cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
2164 cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
2165 cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
2166 cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
2167 cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
2169 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2170 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
2171 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2172 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
2173 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
2174 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
2175 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
2176 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
2178 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
2179 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
2180 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
2181 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
2183 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2184 offset_mask0, offset_mask0, offset_mask0, offset_mask0);
2185 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2186 offset_mask1, offset_mask1, offset_mask1, offset_mask1);
2187 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2188 offset_mask2, offset_mask2, offset_mask2, offset_mask2);
2189 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2190 offset_mask3, offset_mask3, offset_mask3, offset_mask3);
2194 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
2195 dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
2196 dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
2197 dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
2201 src_minus11 = src10;
2202 src_minus12 = src11;
2203 src_minus13 = src12;
2204 src_minus14 = src13;
2206 ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
2210 src += (src_stride << 2);
2211 dst += (dst_stride << 2);
/* NOTE(review): fragment of a SAO edge-offset routine.  The function header,
 * braces, several macro continuation lines and the final stores are not
 * visible in this excerpt (see the gaps in the leading line numbers), so the
 * comments below describe only what the visible statements do.
 * Visible behaviour: two rows are processed per pass and 32-bit lanes 0 and 2
 * of the result vector are extracted at the end of each pass. */
2219 int16_t *sao_offset_val,
2223 uint32_t dst_val0, dst_val1;
/* Lookup table; its use is not visible in this fragment (presumably remaps
 * the 0..4 edge class before the offset lookup -- TODO confirm). */
2224 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
/* Every byte lane holds 1; used as the "+1" sign value below. */
2225 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2227 v16i8 src_zero0, src_zero1, dst0;
2228 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2229 v16u8 src_minus10, src10, src_minus11, src11;
2230 v8i16 offset_mask0, offset_mask1;
2231 v16i8 zeros = { 0 };
/* Keep the even-indexed bytes of sao_offset (sao_offset_val is int16_t, so
 * presumably the low byte of each 16-bit offset -- the load is not shown). */
2233 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
/* Load the row above and the current row, then the two rows that follow. */
2237 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2238 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
/* Step the read pointer down by two rows. */
2241 src_orig += (src_stride << 1);
/* Centre pixels: the current row and the next row, each slid by 1 byte. */
2243 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
/* First neighbour of each centre row: the row above it, slid by 2 bytes. */
2244 SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
/* Interleave the second neighbour (row below, not slid) with the first one,
 * and pair every centre byte with itself, so that the two neighbours of a
 * pixel sit in adjacent byte lanes (continuation lines not visible). */
2246 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2248 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
/* Per-byte sign of (centre - neighbour): 0 when equal, 1 when the neighbour
 * is smaller, 0xFF (i.e. -1) when the neighbour is larger. */
2251 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2252 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2253 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2254 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
/* Same sign computation for the second row. */
2256 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2257 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2258 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2259 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
/* Sum the two neighbour signs of each pixel (adjacent byte pairs) and add 2;
 * only the low byte of each 16-bit sum is kept by the pack below. */
2261 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2262 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
/* Pack both rows into one vector: row 0 data first, row 1 data second.
 * NOTE(review): the statements that turn `offset` from an edge class into an
 * actual offset value are not visible between here and the add below. */
2264 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2265 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
/* Flip the top bit so the unsigned pixels can go through a signed saturating
 * add, then flip it back. */
2270 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2271 dst0 = __msa_adds_s_b(dst0,
offset);
2272 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
/* Reuse the two lower rows as the upper rows of the next pass and load the
 * two rows that follow. */
2274 src_minus10 = src10;
2275 src_minus11 = src11;
2278 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
/* Extract 32-bit lanes 0 and 2 of the result (the stores are not visible). */
2280 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
2281 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
/* Second occurrence of the same sequence (the enclosing loop/tail structure
 * is not visible): slide, interleave, classify, add, extract. */
2290 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
2291 SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
2293 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2295 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
/* Sign of (centre - neighbour) per byte, as above. */
2298 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2299 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2300 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2301 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2303 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2304 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2305 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2306 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
/* Edge class = 2 + sum of the two neighbour signs. */
2308 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2309 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2311 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2312 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
/* Signed saturating add on bias-flipped pixels, then undo the bias. */
2317 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2318 dst0 = __msa_adds_s_b(dst0,
offset);
2319 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
/* Extract 32-bit lanes 0 and 2 of the result. */
2321 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
2322 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
/* NOTE(review): fragment of a SAO edge-offset routine.  The function header,
 * braces, several macro continuation lines and the final stores are not
 * visible in this excerpt (see the gaps in the leading line numbers), so the
 * comments below describe only what the visible statements do.
 * Visible behaviour: two rows are processed per pass and the two 64-bit
 * halves of the result vector are extracted at the end of each pass. */
2334 int16_t *sao_offset_val,
2338 uint64_t dst_val0, dst_val1;
/* Lookup table; its use is not visible in this fragment (presumably remaps
 * the 0..4 edge class before the offset lookup -- TODO confirm). */
2339 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
/* Every byte lane holds 1; used as the "+1" sign value below. */
2340 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2342 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2343 v16u8 src_minus10, src10, src_minus11, src11;
2344 v16i8 src_zero0, src_zero1, dst0;
2345 v8i16 offset_mask0, offset_mask1;
2346 v16i8 zeros = { 0 };
/* Keep the even-indexed bytes of sao_offset (sao_offset_val is int16_t, so
 * presumably the low byte of each 16-bit offset -- the load is not shown). */
2348 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
/* Load the row above and the current row, then the two rows that follow. */
2352 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2353 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
/* Step the read pointer down by two rows. */
2356 src_orig += (src_stride << 1);
/* Centre pixels: the current row and the next row, each slid by 1 byte. */
2358 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
/* First neighbour of each centre row: the row above it, slid by 2 bytes. */
2359 SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
/* Interleave the second neighbour (row below, not slid) with the first one,
 * and pair every centre byte with itself, so that the two neighbours of a
 * pixel sit in adjacent byte lanes (continuation lines not visible). */
2360 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2362 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
/* Per-byte sign of (centre - neighbour): 0 when equal, 1 when the neighbour
 * is smaller, 0xFF (i.e. -1) when the neighbour is larger. */
2365 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2366 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2367 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2368 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
/* Same sign computation for the second row. */
2370 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2371 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2372 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2373 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
/* Sum the two neighbour signs of each pixel (adjacent byte pairs) and add 2;
 * only the low byte of each 16-bit sum is kept by the pack below. */
2375 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2376 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
/* Pack both rows into one vector: row 0 data first, row 1 data second.
 * NOTE(review): the statements that turn `offset` from an edge class into an
 * actual offset value are not visible between here and the add below. */
2378 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2379 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
/* Flip the top bit so the unsigned pixels can go through a signed saturating
 * add, then flip it back. */
2384 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2385 dst0 = __msa_adds_s_b(dst0,
offset);
2386 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
/* Reuse the two lower rows as the upper rows of the next pass and load the
 * two rows that follow. */
2388 src_minus10 = src10;
2389 src_minus11 = src11;
2392 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
/* Extract the low and high 64-bit halves of the result (one per row); the
 * stores themselves are not visible. */
2394 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2395 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
/* Second occurrence of the same sequence (the enclosing loop/tail structure
 * is not visible): slide, interleave, classify, add, extract. */
2403 SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
2404 SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
2405 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2407 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
/* Sign of (centre - neighbour) per byte, as above. */
2410 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2411 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2412 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2413 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2415 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2416 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2417 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2418 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
/* Edge class = 2 + sum of the two neighbour signs. */
2420 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2421 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2423 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2424 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
/* Signed saturating add on bias-flipped pixels, then undo the bias. */
2429 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2430 dst0 = __msa_adds_s_b(dst0,
offset);
2431 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
/* Extract the low and high 64-bit halves of the result. */
2433 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2434 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
/* NOTE(review): fragment of a SAO edge-offset routine.  The function header,
 * the outer row loop, closing braces, several macro continuation lines and
 * some carry-over assignments are not visible in this excerpt (see the gaps
 * in the leading line numbers).  The visible code handles four rows at a
 * time, 16 columns per inner-loop iteration. */
/* Remapping table applied to the 0..4 edge class by the first shuffle in the
 * VSHF_B2_SB calls below. */
2453 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
/* Every byte lane holds 1; used as the "+1" sign value below. */
2454 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2455 v16u8 dst0, dst1, dst2, dst3;
2456 v16u8 cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
2457 v16u8 cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
2458 v16u8 diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
2459 v16u8 diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
2460 v16u8 src_plus10, src_plus11, src_plus12, src_plus13;
2461 v16i8 src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
2462 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
/* Load the 16-bit offsets and keep only their even-indexed bytes, giving a
 * byte-wide offset table. */
2464 sao_offset =
LD_SB(sao_offset_val);
2465 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
/* Pre-load four rows for the first 16-column group (continuation line with
 * the last output operand is not visible). */
2471 LD_UB4(src_orig, src_stride, src_minus11, src_plus10, src_plus11,
/* Walk across the width in 16-column steps. */
2474 for (v_cnt = 0; v_cnt <
width; v_cnt += 16) {
/* Row above the group at byte offset +2, the next 16 bytes of each of the
 * four rows, and the row just below the group at offset 0. */
2475 src_minus10 =
LD_UB(src_orig + 2 - src_stride);
2476 LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
2477 src_plus13 =
LD_UB(src_orig + (src_stride << 2));
/* For each of the four rows: the centre pixels are the row slid by 1 byte
 * (spanning the current and next 16-byte vectors), the "minus" neighbour is
 * the previous row slid by 2 bytes, and the "plus" neighbour is the following
 * row taken unshifted.  Equality masks are computed first. */
2480 src_zero0 = __msa_sldi_b((v16i8) src10, (v16i8) src_minus11, 1);
2481 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2482 cmp_plus10 = ((v16u8) src_zero0 == src_plus10);
2484 src_zero1 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 1);
2485 src_minus11 = (v16u8) __msa_sldi_b((v16i8) src10,
2486 (v16i8) src_minus11, 2);
2487 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2488 cmp_plus11 = ((v16u8) src_zero1 == src_plus11);
2490 src_zero2 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 1);
2491 src_minus12 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 2);
2492 cmp_minus12 = ((v16u8) src_zero2 == (v16u8) src_minus12);
2493 cmp_plus12 = ((v16u8) src_zero2 == src_plus12);
2495 src_zero3 = __msa_sldi_b((v16i8) src13, (v16i8) src_plus12, 1);
2496 src_minus13 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 2);
2497 cmp_minus13 = ((v16u8) src_zero3 == (v16u8) src_minus13);
2498 cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
/* Invert the equality masks: 0xFF (-1) where centre and neighbour differ,
 * 0 where they are equal. */
2500 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2501 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
2502 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2503 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
2504 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
2505 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
2506 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
2507 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
/* Unsigned "neighbour < centre" masks. */
2509 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2510 cmp_plus10 = (src_plus10 < (v16u8) src_zero0);
2511 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2512 cmp_plus11 = (src_plus11 < (v16u8) src_zero1);
2513 cmp_minus12 = ((v16u8) src_minus12 < (v16u8) src_zero2);
2514 cmp_plus12 = (src_plus12 < (v16u8) src_zero2);
2515 cmp_minus13 = ((v16u8) src_minus13 < (v16u8) src_zero3);
2516 cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
/* Where the neighbour is smaller, replace -1 with +1, so each diff_* byte
 * ends up as the sign of (centre - neighbour): -1, 0 or +1. */
2518 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2519 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
2520 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2521 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
2522 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
2523 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
2524 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
2525 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
/* Edge class per pixel = 2 + both neighbour signs, i.e. a value in 0..4. */
2527 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
2528 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
2529 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
2530 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
/* Turn each edge class into an offset in place.  Presumably two chained byte
 * shuffles (edge_idx first, then sao_offset) -- VSHF_B2_SB is defined
 * elsewhere, TODO confirm operand order. */
2532 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2533 offset_mask0, offset_mask0, offset_mask0, offset_mask0);
2534 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2535 offset_mask1, offset_mask1, offset_mask1, offset_mask1);
2536 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2537 offset_mask2, offset_mask2, offset_mask2, offset_mask2);
2538 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2539 offset_mask3, offset_mask3, offset_mask3, offset_mask3);
/* Signed saturating add of the offsets to the centre pixels.
 * NOTE(review): any sign-bias conversion around this add is on lines that
 * are not visible in this excerpt. */
2543 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
2544 dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
2545 dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
2546 dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
/* Carry the look-ahead vector into the next column step (the remaining
 * carry-over assignments are not visible). */
2550 src_minus11 = src10;
/* Store the four result rows. */
2555 ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
/* Move both base pointers down by four rows. */
2559 src += (src_stride << 2);
2560 dst += (dst_stride << 2);
2565 ptrdiff_t src_stride,
2573 ptrdiff_t src_stride,
2581 ptrdiff_t src_stride,
2589 ptrdiff_t src_stride,
2597 ptrdiff_t stride_dst, ptrdiff_t stride_src,
2598 int16_t *sao_offset_val,
int sao_left_class,
2603 sao_left_class, sao_offset_val,
2612 sao_left_class, sao_offset_val,
height);
2620 sao_left_class, sao_offset_val,
height);
2625 ptrdiff_t stride_dst,
2626 int16_t *sao_offset_val,