FFmpeg
hevc_lpf_sao_lsx.c
1 /*
2  * Copyright (c) 2022 Loongson Technology Corporation Limited
3  * Contributed by Lu Wang <wanglu@loongson.cn>
4  * Hao Chen <chenhao@loongson.cn>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "libavutil/loongarch/loongson_intrinsics.h"
24 #include "hevcdsp_lsx.h"
25 
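/*
 * Horizontal luma deblocking filter (8-bit) using LSX vectors. One call
 * processes an 8-pixel segment across a horizontal edge: rows p3..p0 lie
 * above src and q0..q3 below it. The two 4-pixel halves carry independent
 * filter-on/off, strong/weak and PCM-bypass decisions, controlled by beta,
 * tc[0]/tc[1], p_is_pcm[] and q_is_pcm[].
 */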
26 void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
27  int32_t beta, const int32_t *tc,
28  const uint8_t *p_is_pcm, const uint8_t *q_is_pcm)
29 {
30  ptrdiff_t stride_2x = (stride << 1);
31  ptrdiff_t stride_4x = (stride << 2);
32  ptrdiff_t stride_3x = stride_2x + stride;
33  uint8_t *p3 = src - stride_4x;
34  uint8_t *p2 = src - stride_3x;
35  uint8_t *p1 = src - stride_2x;
36  uint8_t *p0 = src - stride;
37  uint8_t *q0 = src;
38  uint8_t *q1 = src + stride;
39  uint8_t *q2 = src + stride_2x;
40  uint8_t *q3 = src + stride_3x;
41  uint8_t flag0, flag1;
42  int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434;
43  int32_t dp04, dq04, dp34, dq34, d04, d34;
44  int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
45  int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
46 
47  __m128i dst0, dst1, dst2, dst3, dst4, dst5;
48  __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
49  __m128i temp0, temp1;
50  __m128i temp2, tc_pos, tc_neg;
51  __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
52  __m128i zero = {0};
53  __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
54 
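    /*
     * Per-segment activity measures of the HEVC deblocking decision
     * (second differences across the edge): dpX = |p2 - 2*p1 + p0| and
     * dqX = |q2 - 2*q1 + q0| for columns 0 and 3 of each 4-pixel half.
     * A half is filtered only when its d0 + d3 sum is below beta.
     */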
55  dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
56  dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
57  dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
58  dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
59  d00 = dp00 + dq00;
60  d30 = dp30 + dq30;
61  dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
62  dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
63  dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
64  dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
65  d04 = dp04 + dq04;
66  d34 = dp34 + dq34;
67 
68  p_is_pcm0 = p_is_pcm[0];
69  p_is_pcm4 = p_is_pcm[1];
70  q_is_pcm0 = q_is_pcm[0];
71  q_is_pcm4 = q_is_pcm[1];
72 
73  DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
74  p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
75  p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
76  d0030 = (d00 + d30) >= beta;
77  d0434 = (d04 + d34) >= beta;
78  DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1);
79  cmp3 = __lsx_vpackev_w(cmp1, cmp0);
80  cmp3 = __lsx_vseqi_w(cmp3, 0);
81 
82  if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
83  (!d0030 || !d0434)) {
84  DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0,
85  p3_src, p2_src, p1_src, p0_src);
86  DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
87  q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
88  q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
89 
90  tc0 = tc[0];
91  beta30 = beta >> 3;
92  beta20 = beta >> 2;
93  tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
94  tc4 = tc[1];
95  tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
96 
97  DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1);
98  DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
99  p0_src, p3_src, p2_src, p1_src, p0_src);
100  DUP4_ARG2(__lsx_vld, q0, 0, q1, 0, q2, 0, q3, 0,
101  q0_src, q1_src, q2_src, q3_src);
102  flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
103  abs(p0[0] - q0[0]) < tc250;
104  flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
105  abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
106  (d30 << 1) < beta20);
107  tc_pos = __lsx_vpackev_d(cmp1, cmp0);
108  DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src,
109  zero, q3_src, q0_src, q1_src, q2_src, q3_src);
110 
111  flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
112  abs(p0[4] - q0[4]) < tc254;
113  flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
114  abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
115  (d34 << 1) < beta20);
116  DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1);
117  cmp2 = __lsx_vpackev_w(cmp1, cmp0);
118  cmp2 = __lsx_vseqi_w(cmp2, 0);
119 
120  if (flag0 && flag1) { /* strong only */
121  /* strong filter */
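            /*
             * HEVC strong filter: p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3,
             * p1' = (p2 + p1 + p0 + q0 + 2) >> 2 and
             * p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3, each change clipped
             * to +/-2*tc (hence the tc_pos << 1 below); the q side is symmetric.
             * __lsx_vsrari_h supplies the rounding term.
             */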
122  tc_pos = __lsx_vslli_h(tc_pos, 1);
123  tc_neg = __lsx_vneg_h(tc_pos);
124 
125  /* p part */
126  DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
127  temp0, temp0);
128  temp1 = __lsx_vadd_h(p3_src, p2_src);
129  temp1 = __lsx_vslli_h(temp1, 1);
130  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
131  temp1 = __lsx_vsrari_h(temp1, 3);
132  temp2 = __lsx_vsub_h(temp1, p2_src);
133  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
134  dst0 = __lsx_vadd_h(temp2, p2_src);
135 
136  temp1 = __lsx_vadd_h(temp0, p2_src);
137  temp1 = __lsx_vsrari_h(temp1, 2);
138  temp2 = __lsx_vsub_h(temp1, p1_src);
139  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
140  dst1 = __lsx_vadd_h(temp2, p1_src);
141 
142  temp1 = __lsx_vslli_h(temp0, 1);
143  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src,
144  temp1, temp1);
145  temp1 = __lsx_vsrari_h(temp1, 3);
146  temp2 = __lsx_vsub_h(temp1, p0_src);
147  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
148  dst2 = __lsx_vadd_h(temp2, p0_src);
149 
150  p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
151  DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
152  p1_src, p_is_pcm_vec, dst0, dst1);
153  dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
154 
155  /* q part */
156  DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
157  temp0, temp0);
158  temp1 = __lsx_vadd_h(q3_src, q2_src);
159  temp1 = __lsx_vslli_h(temp1, 1);
160  DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
161  temp1 = __lsx_vsrari_h(temp1, 3);
162  temp2 = __lsx_vsub_h(temp1, q2_src);
163  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
164  dst5 = __lsx_vadd_h(temp2, q2_src);
165 
166  temp1 = __lsx_vadd_h(temp0, q2_src);
167  temp1 = __lsx_vsrari_h(temp1, 2);
168  temp2 = __lsx_vsub_h(temp1, q1_src);
169  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
170  dst4 = __lsx_vadd_h(temp2, q1_src);
171 
172  temp0 = __lsx_vslli_h(temp0, 1);
173  DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src,
174  temp1, temp1);
175  temp1 = __lsx_vsrari_h(temp1, 3);
176  temp2 = __lsx_vsub_h(temp1, q0_src);
177  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
178  dst3 = __lsx_vadd_h(temp2, q0_src);
179 
180  q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
181  DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
182  q1_src, q_is_pcm_vec, dst3, dst4);
183  dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
184 
185  /* pack results to 8 bit */
186  DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
187  dst2 = __lsx_vpickev_b(dst5, dst4);
188 
189  /* pack src to 8 bit */
190  DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
191  dst3, dst4);
192  dst5 = __lsx_vpickev_b(q2_src, q1_src);
193 
194  cmp3 = __lsx_vnor_v(cmp3, cmp3);
195  DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
196  dst0, dst1);
197  dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
198 
199  __lsx_vstelm_d(dst0, p2, 0, 0);
200  __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
201  __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
202  __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
203  __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
204  __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
205  /* strong filter ends */
206  } else if (flag0 == flag1) { /* weak only */
207  /* weak filter */
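            /*
             * HEVC weak filter: delta0 = (9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4
             * is applied only where |delta0| < 10*tc, clipped to +/-tc, then
             * added to p0 and subtracted from q0. When the side condition
             * dp0 + dp3 < (beta + (beta >> 1)) >> 3 holds (and likewise on the
             * q side), p1/q1 get a second correction clipped to +/-(tc >> 1).
             */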
208  tc_neg = __lsx_vneg_h(tc_pos);
209  DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
210  diff0, diff1);
211  DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
212  __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
213  delta0 = __lsx_vsub_h(diff0, diff1);
214  delta0 = __lsx_vsrari_h(delta0, 4);
215  temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
216  __lsx_vslli_h(tc_pos, 1));
217  abs_delta0 = __lsx_vadda_h(delta0, zero);
218  abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
219  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
220 
221  delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
222  temp2 = __lsx_vadd_h(delta0, p0_src);
223  temp2 = __lsx_vclip255_h(temp2);
224  temp0 = __lsx_vbitsel_v(temp2, p0_src,
225  __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec));
226  temp2 = __lsx_vsub_h(q0_src, delta0);
227  temp2 = __lsx_vclip255_h(temp2);
228  temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec,
229  q_is_pcm_vec));
230  DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
231  q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
232 
233  tmp = (beta + (beta >> 1)) >> 3;
234  DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
235  cmp0, cmp1);
236  cmp0 = __lsx_vpackev_d(cmp1, cmp0);
237  cmp0 = __lsx_vseqi_d(cmp0, 0);
238  p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0);
239 
240  DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
241  cmp0, cmp1);
242  cmp0 = __lsx_vpackev_d(cmp1, cmp0);
243  cmp0 = __lsx_vseqi_d(cmp0, 0);
244  q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0);
245  tc_pos = __lsx_vsrai_h(tc_pos, 1);
246  tc_neg = __lsx_vneg_h(tc_pos);
247 
248  DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
249  delta1, delta2);
250  DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
251  delta1, delta2);
252  delta1 = __lsx_vadd_h(delta1, delta0);
253  delta2 = __lsx_vsub_h(delta2, delta0);
254  DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
255  DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2,
256  tc_neg, tc_pos, delta1, delta2);
257  DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
258  delta1, delta2);
259  DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
260  DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
261  q1_src, q_is_pcm_vec, delta1, delta2);
262 
263  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
264  DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
265  p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
266  q1_src, abs_delta0, dst1, dst2, dst3, dst4);
267  /* pack results to 8 bit */
268  DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1);
269  /* pack src to 8 bit */
270  DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src,
271  dst2, dst3);
272  cmp3 = __lsx_vnor_v(cmp3, cmp3);
273  DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3,
274  dst0, dst1);
275 
276  p2 += stride;
277  __lsx_vstelm_d(dst0, p2, 0, 0);
278  __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
279  __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
280  __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
281  /* weak filter ends */
282  } else { /* strong + weak */
283  /* strong filter */
284  tc_pos = __lsx_vslli_h(tc_pos, 1);
285  tc_neg = __lsx_vneg_h(tc_pos);
286 
287  /* p part */
288  DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
289  temp0, temp0);
290  temp1 = __lsx_vadd_h(p3_src, p2_src);
291  temp1 = __lsx_vslli_h(temp1, 1);
292  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
293  temp1 = __lsx_vsrari_h(temp1, 3);
294  temp2 = __lsx_vsub_h(temp1, p2_src);
295  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
296  dst0 = __lsx_vadd_h(temp2, p2_src);
297 
298  temp1 = __lsx_vadd_h(temp0, p2_src);
299  temp1 = __lsx_vsrari_h(temp1, 2);
300  temp2 = __lsx_vsub_h(temp1, p1_src);
301  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
302  dst1 = __lsx_vadd_h(temp2, p1_src);
303 
304  temp1 = __lsx_vslli_h(temp0, 1);
305  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
306  temp1 = __lsx_vsrari_h(temp1, 3);
307  temp2 = __lsx_vsub_h(temp1, p0_src);
308  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
309  dst2 = __lsx_vadd_h(temp2, p0_src);
310 
311  p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
312  DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
313  p1_src, p_is_pcm_vec, dst0, dst1);
314  dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
315 
316  /* q part */
317  DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
318  temp0, temp0);
319  temp1 = __lsx_vadd_h(q3_src, q2_src);
320  temp1 = __lsx_vslli_h(temp1, 1);
321  DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
322  temp1 = __lsx_vsrari_h(temp1, 3);
323  temp2 = __lsx_vsub_h(temp1, q2_src);
324  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
325  dst5 = __lsx_vadd_h(temp2, q2_src);
326 
327  temp1 = __lsx_vadd_h(temp0, q2_src);
328  temp1 = __lsx_vsrari_h(temp1, 2);
329  temp2 = __lsx_vsub_h(temp1, q1_src);
330  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
331  dst4 = __lsx_vadd_h(temp2, q1_src);
332 
333  temp1 = __lsx_vslli_h(temp0, 1);
334  DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
335  temp1 = __lsx_vsrari_h(temp1, 3);
336  temp2 = __lsx_vsub_h(temp1, q0_src);
337  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
338  dst3 = __lsx_vadd_h(temp2, q0_src);
339 
340  q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
341  DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
342  q1_src, q_is_pcm_vec, dst3, dst4);
343  dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
344 
345  /* pack strong results to 8 bit */
346  DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
347  dst2 = __lsx_vpickev_b(dst5, dst4);
348  /* strong filter ends */
349 
350  /* weak filter */
351  tc_pos = __lsx_vsrai_h(tc_pos, 1);
352  tc_neg = __lsx_vneg_h(tc_pos);
353 
354  DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
355  diff0, diff1);
356  DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
357  __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
358  delta0 = __lsx_vsub_h(diff0, diff1);
359  delta0 = __lsx_vsrari_h(delta0, 4);
360  temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
361  __lsx_vslli_h(tc_pos, 1));
362  abs_delta0 = __lsx_vadda_h(delta0, zero);
363  abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
364  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
365 
366  delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
367  temp2 = __lsx_vadd_h(delta0, p0_src);
368  temp2 = __lsx_vclip255_h(temp2);
369  temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
370 
371  temp2 = __lsx_vsub_h(q0_src, delta0);
372  temp2 = __lsx_vclip255_h(temp2);
373  temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
374 
375  tmp = (beta + (beta >> 1)) >> 3;
376  DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
377  cmp0, cmp1);
378  cmp0 = __lsx_vpackev_d(cmp1, cmp0);
379  p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
380  DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
381  cmp0, cmp1);
382  cmp0 = __lsx_vpackev_d(cmp1, cmp0);
383  q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
384 
385  tc_pos = __lsx_vsrai_h(tc_pos, 1);
386  tc_neg = __lsx_vneg_h(tc_pos);
387 
388  DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
389  delta1, delta2);
390  DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
391  delta1, delta2);
392  delta1 = __lsx_vadd_h(delta1, delta0);
393  delta2 = __lsx_vsub_h(delta2, delta0);
394  DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
395  DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
396  tc_pos, delta1, delta2);
397  DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
398  delta1, delta2);
399  DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
400  DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
401  q1_src, q_is_pcm_vec, delta1, delta2);
402  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
403  DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
404  q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
405  q0_src, abs_delta0, delta1, delta2, temp0, temp2);
406  /* weak filter ends */
407 
408  /* pack weak results to 8 bit */
409  DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0,
410  dst3, dst4);
411  dst5 = __lsx_vpickev_b(q2_src, delta2);
412 
413  /* select between weak or strong */
414  DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2,
415  dst0, dst1);
416  dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2);
417 
418  /* pack src to 8 bit */
419  DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
420  dst3, dst4);
421  dst5 = __lsx_vpickev_b(q2_src, q1_src);
422 
423  cmp3 = __lsx_vnor_v(cmp3, cmp3);
424  DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
425  dst0, dst1);
426  dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
427 
428  __lsx_vstelm_d(dst0, p2, 0, 0);
429  __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
430  __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
431  __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
432  __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
433  __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
434  }
435  }
436 }
437 
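/*
 * Vertical (column-edge) luma deblocking filter (8-bit). Same decisions and
 * filter equations as the horizontal version above, but the edge is vertical:
 * eight rows around src are loaded and transposed with LSX_TRANSPOSE8x8_B so
 * the filter can run on the same row-vector layout, and the results are
 * transposed back as they are stored with __lsx_vstelm_*.
 */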
438 void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
439  int32_t beta, const int32_t *tc,
440  const uint8_t *p_is_pcm, const uint8_t *q_is_pcm)
441 {
442  ptrdiff_t stride_2x = (stride << 1);
443  ptrdiff_t stride_4x = (stride << 2);
444  ptrdiff_t stride_3x = stride_2x + stride;
445  uint8_t *p3 = src;
446  uint8_t *p2 = src + stride_3x;
447  uint8_t *p1 = src + stride_4x;
448  uint8_t *p0 = src + stride_4x + stride_3x;
449  uint8_t flag0, flag1;
450  int32_t dp00, dq00, dp30, dq30, d00, d30;
451  int32_t d0030, d0434;
452  int32_t dp04, dq04, dp34, dq34, d04, d34;
453  int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
454  int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
455 
456  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
457  __m128i cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
458  __m128i cmp3;
459  __m128i temp0, temp1;
460  __m128i temp2;
461  __m128i tc_pos, tc_neg;
462  __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
463  __m128i zero = {0};
464  __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
465 
466  dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
467  dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
468  dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
469  dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
470  d00 = dp00 + dq00;
471  d30 = dp30 + dq30;
472  p_is_pcm0 = p_is_pcm[0];
473  q_is_pcm0 = q_is_pcm[0];
474 
475  dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
476  dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
477  dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
478  dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
479  d04 = dp04 + dq04;
480  d34 = dp34 + dq34;
481  p_is_pcm4 = p_is_pcm[1];
482  q_is_pcm4 = q_is_pcm[1];
483 
484  DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
485  p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
486  p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
487 
488  d0030 = (d00 + d30) >= beta;
489  d0434 = (d04 + d34) >= beta;
490 
491  DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1);
492  cmp3 = __lsx_vpackev_d(cmp1, cmp0);
493  cmp3 = __lsx_vseqi_d(cmp3, 0);
494 
495  if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
496  (!d0030 || !d0434)) {
497  src -= 4;
498  DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
499  src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src);
500  src += stride_4x;
501  DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
502  src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src);
503  src -= stride_4x;
504 
505  DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
506  q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
507  q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
508 
509  tc0 = tc[0];
510  beta30 = beta >> 3;
511  beta20 = beta >> 2;
512  tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
513  tc4 = tc[1];
514  tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
515  DUP2_ARG1( __lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1);
516  tc_pos = __lsx_vpackev_d(cmp1, cmp0);
517  LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
518  q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
519  q0_src, q1_src, q2_src, q3_src);
520 
521  flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
522  abs(p3[-1] - p3[0]) < tc250;
523  flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
524  abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
525  (d30 << 1) < beta20);
526  cmp0 = __lsx_vreplgr2vr_d(flag0);
527  DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
528  p0_src, p3_src, p2_src, p1_src, p0_src);
529 
530  flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
531  abs(p1[-1] - p1[0]) < tc254;
532  flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
533  abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
534  (d34 << 1) < beta20);
535  DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, zero,
536  q3_src, q0_src, q1_src, q2_src, q3_src);
537 
538  cmp1 = __lsx_vreplgr2vr_d(flag1);
539  cmp2 = __lsx_vpackev_d(cmp1, cmp0);
540  cmp2 = __lsx_vseqi_d(cmp2, 0);
541 
542  if (flag0 && flag1) { /* strong only */
543  /* strong filter */
544  tc_neg = __lsx_vneg_h(tc_pos);
545  /* p part */
546  DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
547  temp0, temp0);
548  temp1 = __lsx_vadd_h(p3_src, p2_src);
549  temp1 = __lsx_vslli_h(temp1, 1);
550  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
551  temp1 = __lsx_vsrari_h(temp1, 3);
552  temp2 = __lsx_vsub_h(temp1, p2_src);
553  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
554  dst0 = __lsx_vadd_h(temp2, p2_src);
555 
556  temp1 = __lsx_vadd_h(temp0, p2_src);
557  temp1 = __lsx_vsrari_h(temp1, 2);
558  temp2 = __lsx_vsub_h(temp1, p1_src);
559  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
560  dst1 = __lsx_vadd_h(temp2, p1_src);
561 
562  temp1 = __lsx_vslli_h(temp0, 1);
563  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
564  temp1 = __lsx_vsrari_h(temp1, 3);
565  temp2 = __lsx_vsub_h(temp1, p0_src);
566  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
567  dst2 = __lsx_vadd_h(temp2, p0_src);
568 
569  p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
570  DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
571  p_is_pcm_vec, dst0, dst1);
572  dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
573 
574  /* q part */
575  DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
576  temp0, temp0);
577  temp1 = __lsx_vadd_h(q3_src, q2_src);
578  temp1 = __lsx_vslli_h(temp1, 1);
579  DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
580  temp1 = __lsx_vsrari_h(temp1, 3);
581  temp2 = __lsx_vsub_h(temp1, q2_src);
582  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
583  dst5 = __lsx_vadd_h(temp2, q2_src);
584 
585  temp1 = __lsx_vadd_h(temp0, q2_src);
586  temp1 = __lsx_vsrari_h(temp1, 2);
587  temp2 = __lsx_vsub_h(temp1, q1_src);
588  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
589  dst4 = __lsx_vadd_h(temp2, q1_src);
590 
591  temp1 = __lsx_vslli_h(temp0, 1);
592  DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
593  temp1 = __lsx_vsrari_h(temp1, 3);
594  temp2 = __lsx_vsub_h(temp1, q0_src);
595  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
596  dst3 = __lsx_vadd_h(temp2, q0_src);
597 
598  q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
599  DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
600  q_is_pcm_vec, dst3, dst4);
601  dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
602  /* strong filter ends */
603  } else if (flag0 == flag1) { /* weak only */
604  /* weak filter */
605  tc_pos = __lsx_vsrai_h(tc_pos, 1);
606  tc_neg = __lsx_vneg_h(tc_pos);
607 
608  DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
609  diff0, diff1);
610  DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
611  __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
612  delta0 = __lsx_vsub_h(diff0, diff1);
613  delta0 = __lsx_vsrari_h(delta0, 4);
614  temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
615  __lsx_vslli_h(tc_pos, 1));
616  abs_delta0 = __lsx_vadda_h(delta0, zero);
617  abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
618  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
619 
620  delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
621  temp2 = __lsx_vadd_h(delta0, p0_src);
622  temp2 = __lsx_vclip255_h(temp2);
623  p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
624  temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
625 
626  temp2 = __lsx_vsub_h(q0_src, delta0);
627  temp2 = __lsx_vclip255_h(temp2);
628  q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
629  temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
630 
631  tmp = ((beta + (beta >> 1)) >> 3);
632  DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
633  !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
634  p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
635  p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
636 
637  DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
638  (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
639  q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
640  q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
641  tc_pos = __lsx_vsrai_h(tc_pos, 1);
642  tc_neg = __lsx_vneg_h(tc_pos);
643 
644  DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
645  delta1, delta2);
646  DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
647  delta1, delta2);
648  delta1 = __lsx_vadd_h(delta1, delta0);
649  delta2 = __lsx_vsub_h(delta2, delta0);
650  DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
651  DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
652  tc_pos, delta1, delta2);
653  DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
654  delta1, delta2);
655  DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
656  DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
657  q1_src, q_is_pcm_vec, delta1, delta2);
658 
659  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
660  DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
661  p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
662  q1_src, abs_delta0, dst0, dst1, dst2, dst3);
663  /* weak filter ends */
664 
665  cmp3 = __lsx_vnor_v(cmp3, cmp3);
666  DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src,
667  cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3,
668  dst0, dst1, dst2, dst3);
669  DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1);
670 
671  /* transpose */
672  dst4 = __lsx_vilvl_b(dst1, dst0);
673  dst5 = __lsx_vilvh_b(dst1, dst0);
674  dst0 = __lsx_vilvl_h(dst5, dst4);
675  dst1 = __lsx_vilvh_h(dst5, dst4);
676 
677  src += 2;
678  __lsx_vstelm_w(dst0, src, 0, 0);
679  __lsx_vstelm_w(dst0, src + stride, 0, 1);
680  __lsx_vstelm_w(dst0, src + stride_2x, 0, 2);
681  __lsx_vstelm_w(dst0, src + stride_3x, 0, 3);
682  src += stride_4x;
683  __lsx_vstelm_w(dst1, src, 0, 0);
684  __lsx_vstelm_w(dst1, src + stride, 0, 1);
685  __lsx_vstelm_w(dst1, src + stride_2x, 0, 2);
686  __lsx_vstelm_w(dst1, src + stride_3x, 0, 3);
687  return;
688  } else { /* strong + weak */
689  /* strong filter */
690  tc_neg = __lsx_vneg_h(tc_pos);
691 
692  /* p part */
693  DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
694  temp0, temp0);
695 
696  temp1 = __lsx_vadd_h(p3_src, p2_src);
697  temp1 = __lsx_vslli_h(temp1, 1);
698  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
699  temp1 = __lsx_vsrari_h(temp1, 3);
700  temp2 = __lsx_vsub_h(temp1, p2_src);
701  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
702  dst0 = __lsx_vadd_h(temp2, p2_src);
703 
704  temp1 = __lsx_vadd_h(temp0, p2_src);
705  temp1 = __lsx_vsrari_h(temp1, 2);
706  temp2 = __lsx_vsub_h(temp1, p1_src);
707  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
708  dst1 = __lsx_vadd_h(temp2, p1_src);
709 
710  temp1 = __lsx_vslli_h(temp0, 1);
711  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
712  temp1 = __lsx_vsrari_h(temp1, 3);
713  temp2 = __lsx_vsub_h(temp1, p0_src);
714  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
715  dst2 = __lsx_vadd_h(temp2, p0_src);
716 
717  p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
718  DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
719  p_is_pcm_vec, dst0, dst1);
720  dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
721 
722  /* q part */
723  DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, temp0, temp0);
724  temp1 = __lsx_vadd_h(q3_src, q2_src);
725  temp1 = __lsx_vslli_h(temp1, 1);
726  DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
727  temp1 = __lsx_vsrari_h(temp1, 3);
728  temp2 = __lsx_vsub_h(temp1, q2_src);
729  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
730  dst5 = __lsx_vadd_h(temp2, q2_src);
731 
732  temp1 = __lsx_vadd_h(temp0, q2_src);
733  temp1 = __lsx_vsrari_h(temp1, 2);
734  temp2 = __lsx_vsub_h(temp1, q1_src);
735  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
736  dst4 = __lsx_vadd_h(temp2, q1_src);
737 
738  temp1 = __lsx_vslli_h(temp0, 1);
739  DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
740  temp1 = __lsx_vsrari_h(temp1, 3);
741  temp2 = __lsx_vsub_h(temp1, q0_src);
742  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
743  dst3 = __lsx_vadd_h(temp2, q0_src);
744 
745  q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
746  DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
747  q_is_pcm_vec, dst3, dst4);
748  dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
749  /* strong filter ends */
750 
751  /* weak filter */
752  tc_pos = __lsx_vsrai_h(tc_pos, 1);
753  tc_neg = __lsx_vneg_h(tc_pos);
754 
755  DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
756  diff0, diff1);
757  DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
758  __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
759  delta0 = __lsx_vsub_h(diff0, diff1);
760  delta0 = __lsx_vsrari_h(delta0, 4);
761 
762  temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
763  __lsx_vslli_h(tc_pos, 1));
764  abs_delta0 = __lsx_vadda_h(delta0, zero);
765  abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
766  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
767  delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
768  temp2 = __lsx_vadd_h(delta0, p0_src);
769  temp2 = __lsx_vclip255_h(temp2);
770  temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
771  temp2 = __lsx_vsub_h(q0_src, delta0);
772  temp2 = __lsx_vclip255_h(temp2);
773  temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
774 
775  tmp = (beta + (beta >> 1)) >> 3;
776  DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
777  !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
778  p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
779  p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
780 
781  DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
782  (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
783  q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
784  q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
785  tc_pos = __lsx_vsrai_h(tc_pos, 1);
786  tc_neg = __lsx_vneg_h(tc_pos);
787 
788  DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
789  delta1, delta2);
790  DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
791  delta1, delta2);
792  delta1 = __lsx_vadd_h(delta1, delta0);
793  delta2 = __lsx_vsub_h(delta2, delta0);
794  DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
795  DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
796  tc_pos, delta1, delta2);
797  DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
798  delta1, delta2);
799  DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
800  DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
801  q1_src, q_is_pcm_vec, delta1, delta2);
802 
803  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
804  DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
805  q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
806  q0_src, abs_delta0, delta1, delta2, temp0, temp2);
807  /* weak filter ends*/
808 
809  /* select between weak or strong */
810  DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1,
811  cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2,
812  dst0, dst1, dst2, dst3);
813  DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2,
814  dst4, dst5);
815  }
816 
817  cmp3 = __lsx_vnor_v(cmp3, cmp3);
818  DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3, dst2,
819  p0_src, cmp3, dst3, q0_src, cmp3, dst0, dst1, dst2, dst3);
820  DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3,
821  dst4, dst5);
822 
823  /* pack results to 8 bit */
824  DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5,
825  dst5, dst0, dst1, dst2, dst3);
826 
827  /* transpose */
828  DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6);
829  DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7);
830  DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2);
831  DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3);
832 
833  src += 1;
834  __lsx_vstelm_w(dst0, src, 0, 0);
835  __lsx_vstelm_h(dst2, src, 4, 0);
836  src += stride;
837  __lsx_vstelm_w(dst0, src, 0, 1);
838  __lsx_vstelm_h(dst2, src, 4, 2);
839  src += stride;
840 
841  __lsx_vstelm_w(dst0, src, 0, 2);
842  __lsx_vstelm_h(dst2, src, 4, 4);
843  src += stride;
844  __lsx_vstelm_w(dst0, src, 0, 3);
845  __lsx_vstelm_h(dst2, src, 4, 6);
846  src += stride;
847 
848  __lsx_vstelm_w(dst1, src, 0, 0);
849  __lsx_vstelm_h(dst3, src, 4, 0);
850  src += stride;
851  __lsx_vstelm_w(dst1, src, 0, 1);
852  __lsx_vstelm_h(dst3, src, 4, 2);
853  src += stride;
854 
855  __lsx_vstelm_w(dst1, src, 0, 2);
856  __lsx_vstelm_h(dst3, src, 4, 4);
857  src += stride;
858  __lsx_vstelm_w(dst1, src, 0, 3);
859  __lsx_vstelm_h(dst3, src, 4, 6);
860  }
861 }
862 
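/*
 * Horizontal chroma deblocking filter (8-bit). Chroma uses the one-pixel
 * filter: delta = clip3(-tc, tc, (((q0 - p0) << 2) + p1 - q1 + 4) >> 3),
 * p0' = clip255(p0 + delta), q0' = clip255(q0 - delta), with PCM bypass and
 * a per-half skip when tc <= 0.
 */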
863 void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
864  const int32_t *tc, const uint8_t *p_is_pcm,
865  const uint8_t *q_is_pcm)
866 {
867  uint8_t *p1_ptr = src - (stride << 1);
868  uint8_t *p0_ptr = src - stride;
869  uint8_t *q0_ptr = src;
870  uint8_t *q1_ptr = src + stride;
871  __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
872  __m128i p1, p0, q0, q1;
873  __m128i tc_pos, tc_neg;
874  __m128i zero = {0};
875  __m128i temp0, temp1, delta;
876 
877  if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
878  DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
879  tc_pos = __lsx_vpackev_d(cmp1, cmp0);
880  tc_neg = __lsx_vneg_h(tc_pos);
881  DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
882  p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
883  p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
884 
885  DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
886  q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
887  q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
888 
889  DUP4_ARG2(__lsx_vld, p1_ptr, 0, p0_ptr, 0, q0_ptr, 0, q1_ptr, 0,
890  p1, p0, q0, q1);
891  DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
892  p1, p0, q0, q1);
893  DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
894  temp0 = __lsx_vslli_h(temp0, 2);
895  temp0 = __lsx_vadd_h(temp0, temp1);
896  delta = __lsx_vsrari_h(temp0, 3);
897  delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
898  temp0 = __lsx_vadd_h(p0, delta);
899  temp0 = __lsx_vclip255_h(temp0);
900  p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
901  temp0 = __lsx_vbitsel_v(temp0, p0, p_is_pcm_vec);
902 
903  temp1 = __lsx_vsub_h(q0, delta);
904  temp1 = __lsx_vclip255_h(temp1);
905  q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
906  temp1 = __lsx_vbitsel_v(temp1, q0, q_is_pcm_vec);
907 
908  tc_pos = __lsx_vslei_d(tc_pos, 0);
909  DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
910  temp0, temp1);
911  temp0 = __lsx_vpickev_b(temp1, temp0);
912  __lsx_vstelm_d(temp0, p0_ptr, 0, 0);
913  __lsx_vstelm_d(temp0, p0_ptr + stride, 0, 1);
914  }
915 }
916 
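/*
 * Vertical chroma deblocking filter (8-bit): the transposed counterpart of
 * the horizontal chroma filter above. Eight 4-pixel rows around the edge are
 * loaded, transposed with LSX_TRANSPOSE8x4_B, filtered, and written back two
 * bytes (p0', q0') per row with __lsx_vstelm_h.
 */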
917 void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
918  const int32_t *tc, const uint8_t *p_is_pcm,
919  const uint8_t *q_is_pcm)
920 {
921  ptrdiff_t stride_2x = (stride << 1);
922  ptrdiff_t stride_4x = (stride << 2);
923  ptrdiff_t stride_3x = stride_2x + stride;
924  __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
925  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
926  __m128i p1, p0, q0, q1;
927  __m128i tc_pos, tc_neg;
928  __m128i zero = {0};
929  __m128i temp0, temp1, delta;
930 
931  if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
932  DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
933  tc_pos = __lsx_vpackev_d(cmp1, cmp0);
934  tc_neg = __lsx_vneg_h(tc_pos);
935 
936  DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
937  p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
938  p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
939  DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
940  q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
941  q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
942 
943  src -= 2;
944  DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
945  src + stride_3x, 0, src0, src1, src2, src3);
946  src += stride_4x;
947  DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
948  src + stride_3x, 0, src4, src5, src6, src7);
949  src -= stride_4x;
950  LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7,
951  p1, p0, q0, q1);
952  DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
953  p1, p0, q0, q1);
954 
955  DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
956  temp0 = __lsx_vslli_h(temp0, 2);
957  temp0 = __lsx_vadd_h(temp0, temp1);
958  delta = __lsx_vsrari_h(temp0, 3);
959  delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
960 
961  temp0 = __lsx_vadd_h(p0, delta);
962  temp1 = __lsx_vsub_h(q0, delta);
963  DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1);
964  DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
965  q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
966  DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0,
967  q_is_pcm_vec, temp0, temp1);
968 
969  tc_pos = __lsx_vslei_d(tc_pos, 0);
970  DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
971  temp0, temp1);
972  temp0 = __lsx_vpackev_b(temp1, temp0);
973 
974  src += 1;
975  __lsx_vstelm_h(temp0, src, 0, 0);
976  __lsx_vstelm_h(temp0, src + stride, 0, 1);
977  __lsx_vstelm_h(temp0, src + stride_2x, 0, 2);
978  __lsx_vstelm_h(temp0, src + stride_3x, 0, 3);
979  src += stride_4x;
980  __lsx_vstelm_h(temp0, src, 0, 4);
981  __lsx_vstelm_h(temp0, src + stride, 0, 5);
982  __lsx_vstelm_h(temp0, src + stride_2x, 0, 6);
983  __lsx_vstelm_h(temp0, src + stride_3x, 0, 7);
984  src -= stride_4x;
985  }
986 }
987 
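/*
 * SAO edge-offset filtering, 0-degree (horizontal) class, 4-pixel-wide
 * blocks. For every pixel c with left/right neighbours a and b, the edge
 * index is sign(c - a) + sign(c - b) + 2, remapped through the edge_idx
 * table {1, 2, 0, 3, 4} and used to select one of the five sao_offset_val
 * entries, which is added to c with saturation. shuf1 and shuf2 shift the
 * row so that the centre and right-neighbour pixels line up with the
 * left-neighbour load, and the xori(..., 128) pair biases the unsigned
 * bytes into signed range so __lsx_vsadd_b can do the saturating add.
 *
 * A scalar sketch of the per-pixel operation (illustrative only, not part
 * of the original file):
 *
 *     static const int edge_idx[5] = { 1, 2, 0, 3, 4 };
 *     int sgn_l = (c > a) - (c < a);
 *     int sgn_r = (c > b) - (c < b);
 *     int off   = sao_offset_val[edge_idx[sgn_l + sgn_r + 2]];
 *     dst_pix   = av_clip_uint8(c + off);
 */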
988 static void hevc_sao_edge_filter_0degree_4width_lsx(uint8_t *dst,
989  int32_t dst_stride,
990  const uint8_t *src,
991  int32_t src_stride,
992  const int16_t *sao_offset_val,
993  int32_t height)
994 {
995  const int32_t src_stride_2x = (src_stride << 1);
996  const int32_t dst_stride_2x = (dst_stride << 1);
997  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
998  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
999  __m128i edge_idx = {0x403000201, 0x0};
1000  __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
1001  __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
1002  __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0;
1003  __m128i const1 = __lsx_vldi(1);
1004  __m128i zero = {0};
1005 
1006  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1007  src -= 1;
1008 
1009  /* load in advance */
1010  DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
1011 
1012  for (height -= 2; height; height -= 2) {
1013  src += src_stride_2x;
1014  src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
1015  src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
1016  src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
1017 
1018  DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
1019  cmp_minus10, cmp_minus11);
1020  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1021  cmp_minus11, diff_minus10, diff_minus11);
1022  DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
1023  cmp_minus10, cmp_minus11);
1024  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1025  cmp_minus11, cmp_minus10, cmp_minus11);
1026  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1027  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1028 
1029  offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1030  offset = __lsx_vaddi_bu(offset, 2);
1031 
1032  /* load in advance */
1033  DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
1034  src_minus10, src_minus11);
1035  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset,
1036  sao_offset, sao_offset, offset, offset, offset);
1037  src0 = __lsx_vxori_b(src0, 128);
1038  dst0 = __lsx_vsadd_b(src0, offset);
1039  dst0 = __lsx_vxori_b(dst0, 128);
1040 
1041  __lsx_vstelm_w(dst0, dst, 0, 0);
1042  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1043  dst += dst_stride_2x;
1044  }
1045 
1046  src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
1047  src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
1048  src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
1049 
1050  DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
1051  cmp_minus11);
1052  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1053  diff_minus10, diff_minus11);
1054  DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
1055  cmp_minus11);
1056  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1057  cmp_minus10, cmp_minus11);
1058  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1059  const1, cmp_minus11, diff_minus10, diff_minus11);
1060 
1061  offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1062  offset = __lsx_vaddi_bu(offset, 2);
1063  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset,
1064  offset, offset, offset);
1065  src0 = __lsx_vxori_b(src0, 128);
1066  dst0 = __lsx_vsadd_b(src0, offset);
1067  dst0 = __lsx_vxori_b(dst0, 128);
1068 
1069  __lsx_vstelm_w(dst0, dst, 0, 0);
1070  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1071 }
1072 
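/*
 * Same 0-degree SAO edge-offset filter as above, for 8-pixel-wide blocks:
 * two rows are packed into one vector per iteration and stored with
 * __lsx_vstelm_d instead of __lsx_vstelm_w.
 */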
1073 static void hevc_sao_edge_filter_0degree_8width_lsx(uint8_t *dst,
1074  int32_t dst_stride,
1075  const uint8_t *src,
1076  int32_t src_stride,
1077  const int16_t *sao_offset_val,
1078  int32_t height)
1079 {
1080  const int32_t src_stride_2x = (src_stride << 1);
1081  const int32_t dst_stride_2x = (dst_stride << 1);
1082  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1083  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1084  __m128i edge_idx = {0x403000201, 0x0};
1085  __m128i const1 = __lsx_vldi(1);
1086  __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
1087  __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
1088  __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1089  __m128i zeros = {0};
1090 
1091  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1092  src -= 1;
1093 
1094  /* load in advance */
1095  DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
1096 
1097  for (height -= 2; height; height -= 2) {
1098  src += src_stride_2x;
1099  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros,
1100  src_minus11, shuf1, src0, src1);
1101  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros,
1102  src_minus11, shuf2, src_plus10, src_plus11);
1103  DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
1104  src_plus10, src_minus10, src_plus10);
1105  src0 = __lsx_vpickev_d(src1, src0);
1106 
1107  DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
1108  cmp_minus10, cmp_minus11);
1109  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1110  cmp_minus11, diff_minus10, diff_minus11);
1111  DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
1112  cmp_minus10, cmp_minus11);
1113  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1114  cmp_minus11, cmp_minus10, cmp_minus11);
1115  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1116  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1117 
1118  offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1119  offset = __lsx_vaddi_bu(offset, 2);
1120 
1121  /* load in advance */
1122  DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
1123  src_minus10, src_minus11);
1124  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1125  sao_offset, offset, offset, offset);
1126  src0 = __lsx_vxori_b(src0, 128);
1127  dst0 = __lsx_vsadd_b(src0, offset);
1128  dst0 = __lsx_vxori_b(dst0, 128);
1129 
1130  __lsx_vstelm_d(dst0, dst, 0, 0);
1131  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1132  dst += dst_stride_2x;
1133  }
1134 
1135  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11,
1136  shuf1, src0, src1);
1137  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
1138  shuf2, src_plus10, src_plus11);
1139  DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
1140  src_plus10, src_minus10, src_plus10);
1141  src0 = __lsx_vpickev_d(src1, src0);
1142 
1143  DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
1144  cmp_minus11);
1145  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1146  diff_minus10, diff_minus11);
1147  DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
1148  cmp_minus11);
1149  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1150  cmp_minus10, cmp_minus11);
1151  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1152  const1, cmp_minus11, diff_minus10, diff_minus11);
1153 
1154  offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1155  offset = __lsx_vaddi_bu(offset, 2);
1156  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1157  sao_offset, offset, offset, offset);
1158  src0 = __lsx_vxori_b(src0, 128);
1159  dst0 = __lsx_vsadd_b(src0, offset);
1160  dst0 = __lsx_vxori_b(dst0, 128);
1161 
1162  __lsx_vstelm_d(dst0, dst, 0, 0);
1163  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1164 }
1165 
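/*
 * 0-degree SAO edge-offset filter for widths that are a multiple of 16:
 * four rows are processed per outer iteration and the inner loop walks each
 * row in 16-byte vectors, carrying the current vectors (src1x) over as the
 * left-neighbour reference (src_minus1x) for the next 16-byte chunk.
 */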
1166 static void hevc_sao_edge_filter_0degree_16multiple_lsx(uint8_t *dst,
1167  int32_t dst_stride,
1168  const uint8_t *src,
1169  int32_t src_stride,
1170  const int16_t *sao_offset_val,
1171  int32_t width,
1172  int32_t height)
1173 {
1174  uint8_t *dst_ptr;
1175  const uint8_t *src_minus1;
1176  int32_t v_cnt;
1177  const int32_t src_stride_2x = (src_stride << 1);
1178  const int32_t dst_stride_2x = (dst_stride << 1);
1179  const int32_t src_stride_4x = (src_stride << 2);
1180  const int32_t dst_stride_4x = (dst_stride << 2);
1181  const int32_t src_stride_3x = src_stride_2x + src_stride;
1182  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1183 
1184  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1185  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1186  __m128i edge_idx = {0x403000201, 0x0};
1187  __m128i const1 = __lsx_vldi(1);
1188  __m128i sao_offset;
1189  __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1190  __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1191  __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1192  __m128i diff_plus13;
1193  __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3;
1194  __m128i src_minus10, src_minus11, src_minus12, src_minus13;
1195  __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
1196  __m128i src_zero0, src_zero1, src_zero2, src_zero3;
1197  __m128i src_plus10, src_plus11, src_plus12, src_plus13;
1198 
1199  sao_offset = __lsx_vld(sao_offset_val, 0);
1200  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1201 
1202  for (; height; height -= 4) {
1203  src_minus1 = src - 1;
1204  src_minus10 = __lsx_vld(src_minus1, 0);
1205  DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
1206  src_stride_2x, src_minus11, src_minus12);
1207  src_minus13 = __lsx_vldx(src_minus1, src_stride_3x);
1208 
1209  for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1210  src_minus1 += 16;
1211  dst_ptr = dst + v_cnt;
1212  src10 = __lsx_vld(src_minus1, 0);
1213  DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
1214  src_stride_2x, src11, src12);
1215  src13 = __lsx_vldx(src_minus1, src_stride_3x);
1216  DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11,
1217  src_minus11, shuf1, src12, src_minus12, shuf1, src13,
1218  src_minus13, shuf1, src_zero0, src_zero1,
1219  src_zero2, src_zero3);
1220  DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf2, src11,
1221  src_minus11, shuf2, src12, src_minus12, shuf2, src13,
1222  src_minus13, shuf2, src_plus10, src_plus11,
1223  src_plus12, src_plus13);
1224  DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
1225  src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
1226  cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
1227  DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
1228  src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
1229  cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
1230  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1231  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1232  cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
1233  diff_plus11);
1234  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1235  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1236  cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
1237  diff_plus13);
1238  DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
1239  src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
1240  cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
1241  DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
1242  src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
1243  cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
1244  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1245  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1246  cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1247  cmp_plus11);
1248  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1249  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1250  cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1251  cmp_plus13);
1252  DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1253  diff_plus10, const1, cmp_plus10, diff_minus11, const1,
1254  cmp_minus11, diff_plus11, const1, cmp_plus11,
1255  diff_minus10, diff_plus10, diff_minus11, diff_plus11);
1256  DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
1257  diff_plus12, const1, cmp_plus12, diff_minus13, const1,
1258  cmp_minus13, diff_plus13, const1, cmp_plus13,
1259  diff_minus12, diff_plus12, diff_minus13, diff_plus13);
1260 
1261  DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
1262  diff_plus11, diff_minus12, diff_plus12, diff_minus13,
1263  diff_plus13, offset_mask0, offset_mask1, offset_mask2,
1264  offset_mask3);
1265  DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1266  offset_mask2, 2, offset_mask3, 2, offset_mask0,
1267  offset_mask1, offset_mask2, offset_mask3);
1268  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
1269  sao_offset, sao_offset, offset_mask0, offset_mask0,
1270  offset_mask0);
1271  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
1272  sao_offset, sao_offset, offset_mask1, offset_mask1,
1273  offset_mask1);
1274  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
1275  sao_offset, sao_offset, offset_mask2, offset_mask2,
1276  offset_mask2);
1277  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
1278  sao_offset, sao_offset, offset_mask3, offset_mask3,
1279  offset_mask3);
1280 
1281  DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
1282  src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
1283  src_zero2, src_zero3);
1284  DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
1285  offset_mask1, src_zero2, offset_mask2, src_zero3,
1286  offset_mask3, dst0, dst1, dst2, dst3);
1287  DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
1288  128, dst0, dst1, dst2, dst3);
1289 
1290  src_minus10 = src10;
1291  src_minus11 = src11;
1292  src_minus12 = src12;
1293  src_minus13 = src13;
1294 
1295  __lsx_vst(dst0, dst_ptr, 0);
1296  __lsx_vst(dst1, dst_ptr + dst_stride, 0);
1297  __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0);
1298  __lsx_vst(dst3, dst_ptr + dst_stride_3x, 0);
1299  }
1300  src += src_stride_4x;
1301  dst += dst_stride_4x;
1302  }
1303 }
1304 
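/*
 * SAO edge-offset filtering, 90-degree (vertical) class, 4-pixel-wide
 * blocks: the two neighbours of each pixel are the rows directly above and
 * below, so no byte shuffling is needed. Centre and neighbour rows are
 * interleaved, the two sign contributions are summed with
 * __lsx_vhaddw_hu_bu, and the rows already loaded are carried over as the
 * "above" reference for the next iteration.
 */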
1305 static void hevc_sao_edge_filter_90degree_4width_lsx(uint8_t *dst,
1306  int32_t dst_stride,
1307  const uint8_t *src,
1308  int32_t src_stride,
1309  const int16_t *sao_offset_val,
1310  int32_t height)
1311 {
1312  const int32_t src_stride_2x = (src_stride << 1);
1313  const int32_t dst_stride_2x = (dst_stride << 1);
1314  __m128i edge_idx = {0x403000201, 0x0};
1315  __m128i const1 = __lsx_vldi(1);
1316  __m128i dst0;
1317  __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
1318  __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1319  __m128i src_minus10, src_minus11, src10, src11;
1320  __m128i src_zero0, src_zero1;
1321  __m128i offset;
1322  __m128i offset_mask0, offset_mask1;
1323 
1324  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1325 
1326  /* load in advance */
1327  DUP4_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src + src_stride, 0,
1328  src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11);
1329 
1330  for (height -= 2; height; height -= 2) {
1331  src += src_stride_2x;
1332  DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1333  src11, src_minus11, src10, src10, src_minus10, src_zero0,
1334  src_minus11, src_zero1);
1335  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1336  cmp_minus10, cmp_minus11);
1337  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1338  cmp_minus11, diff_minus10, diff_minus11);
1339  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1340  src_minus11, cmp_minus10, cmp_minus11);
1341  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1342  cmp_minus11, cmp_minus10, cmp_minus11);
1343  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1344  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1345 
1346  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1347  diff_minus11, offset_mask0, offset_mask1);
1348  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1349  offset_mask0, offset_mask1);
1350  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1351  src_zero0, offset, dst0);
1352  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1353  sao_offset, offset, offset, offset);
1354 
1355  dst0 = __lsx_vxori_b(dst0, 128);
1356  dst0 = __lsx_vsadd_b(dst0, offset);
1357  dst0 = __lsx_vxori_b(dst0, 128);
1358  src_minus10 = src10;
1359  src_minus11 = src11;
1360 
1361  /* load in advance */
1362  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
1363  src10, src11);
1364 
1365  __lsx_vstelm_w(dst0, dst, 0, 0);
1366  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1367  dst += dst_stride_2x;
1368  }
1369 
1370  DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1371  src11, src_minus11, src10, src10, src_minus10, src_zero0,
1372  src_minus11, src_zero1);
1373  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1374  cmp_minus10, cmp_minus11);
1375  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1376  diff_minus10, diff_minus11);
1377  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1378  cmp_minus10, cmp_minus11);
1379  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1380  cmp_minus10, cmp_minus11);
1381  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1382  const1, cmp_minus11, diff_minus10, diff_minus11);
1383 
1384  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1385  diff_minus11, offset_mask0, offset_mask1);
1386  DUP2_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1387  offset_mask0, offset_mask1);
1388  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1389  src_zero0, offset, dst0);
1390  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1391  sao_offset, offset, offset, offset);
1392  dst0 = __lsx_vxori_b(dst0, 128);
1393  dst0 = __lsx_vsadd_b(dst0, offset);
1394  dst0 = __lsx_vxori_b(dst0, 128);
1395 
1396  __lsx_vstelm_w(dst0, dst, 0, 0);
1397  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1398 }
1399 
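 /* SAO edge filter, vertical (90 degree) direction, 8-pixel wide columns:
  * each pixel is compared with the pixels directly above and below, two
  * rows per loop iteration, the last two rows handled after the loop */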
1400 static void hevc_sao_edge_filter_90degree_8width_lsx(uint8_t *dst,
1401  int32_t dst_stride,
1402  const uint8_t *src,
1403  int32_t src_stride,
1404  const int16_t *sao_offset_val,
1405  int32_t height)
1406 {
1407  const int32_t src_stride_2x = (src_stride << 1);
1408  const int32_t dst_stride_2x = (dst_stride << 1);
1409  __m128i edge_idx = {0x403000201, 0x0};
1410  __m128i const1 = __lsx_vldi(1);
1411  __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1412  __m128i src_zero0, src_zero1, dst0;
1413  __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1414  __m128i src_minus10, src_minus11, src10, src11;
1415  __m128i offset_mask0, offset_mask1;
1416 
1417  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1418 
1419  /* load in advance */
1420  DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11);
1421  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11);
1422 
1423  for (height -= 2; height; height -= 2) {
1424  src += src_stride_2x;
1425  DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1426  src11, src_minus11, src10, src10, src_minus10, src_zero0,
1427  src_minus11, src_zero1);
1428  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1429  cmp_minus10, cmp_minus11);
1430  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1431  cmp_minus11, diff_minus10, diff_minus11);
1432  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1433  src_minus11, cmp_minus10, cmp_minus11);
1434  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1435  cmp_minus11, cmp_minus10, cmp_minus11);
1436  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1437  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1438 
1439  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1440  diff_minus11, offset_mask0, offset_mask1);
1441  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1442  offset_mask0, offset_mask1);
1443  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1444  src_zero0, offset, dst0);
1445  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1446  sao_offset, offset, offset, offset);
1447 
1448  dst0 = __lsx_vxori_b(dst0, 128);
1449  dst0 = __lsx_vsadd_b(dst0, offset);
1450  dst0 = __lsx_vxori_b(dst0, 128);
1451  src_minus10 = src10;
1452  src_minus11 = src11;
1453 
1454  /* load in advance */
1455  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
1456  src10, src11);
1457 
1458  __lsx_vstelm_d(dst0, dst, 0, 0);
1459  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1460  dst += dst_stride_2x;
1461  }
1462 
1463  DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1464  src11, src_minus11, src10, src10, src_minus10, src_zero0,
1465  src_minus11, src_zero1);
1466  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1467  cmp_minus10, cmp_minus11);
1468  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1469  diff_minus10, diff_minus11);
1470  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1471  cmp_minus10, cmp_minus11);
1472  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1473  cmp_minus10, cmp_minus11);
1474  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1475  const1, cmp_minus11, diff_minus10, diff_minus11);
1476 
1477  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1478  diff_minus11, offset_mask0, offset_mask1);
1479  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1480  offset_mask0, offset_mask1);
1481  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1482  src_zero0, offset, dst0);
1483  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1484  sao_offset, offset, offset, offset);
1485  dst0 = __lsx_vxori_b(dst0, 128);
1486  dst0 = __lsx_vsadd_b(dst0, offset);
1487  dst0 = __lsx_vxori_b(dst0, 128);
1488 
1489  __lsx_vstelm_d(dst0, dst, 0, 0);
1490  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1491 }
1492 
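 /* vertical (90 degree) SAO edge filter for widths that are a multiple of
  * 16, processing four 16-byte rows per loop iteration */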
1493 static void hevc_sao_edge_filter_90degree_16multiple_lsx(uint8_t *dst,
1494  int32_t dst_stride,
1495  const uint8_t *src,
1496  int32_t src_stride,
1497  const int16_t *
1498  sao_offset_val,
1499  int32_t width,
1500  int32_t height)
1501 {
1502  const uint8_t *src_orig = src;
1503  uint8_t *dst_orig = dst;
1504  int32_t h_cnt, v_cnt;
1505  const int32_t src_stride_2x = (src_stride << 1);
1506  const int32_t dst_stride_2x = (dst_stride << 1);
1507  const int32_t src_stride_4x = (src_stride << 2);
1508  const int32_t dst_stride_4x = (dst_stride << 2);
1509  const int32_t src_stride_3x = src_stride_2x + src_stride;
1510  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1511  __m128i edge_idx = {0x403000201, 0x0};
1512  __m128i const1 = __lsx_vldi(1);
1513  __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1514  __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1515  __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1516  __m128i diff_plus13;
1517  __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
1518  __m128i src12, dst2, src13, dst3;
1519  __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
1520 
1521  sao_offset = __lsx_vld(sao_offset_val, 0);
1522  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1523 
1524  for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1525  src = src_orig + v_cnt;
1526  dst = dst_orig + v_cnt;
1527 
1528  DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0,
1529  src_minus10, src_minus11);
1530 
1531  for (h_cnt = (height >> 2); h_cnt--;) {
1532  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
1533  src, src_stride_3x, src, src_stride_4x,
1534  src10, src11, src12, src13);
1535  DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11,
1536  src10, src10, src_minus11, src10, src11, cmp_minus10,
1537  cmp_plus10, cmp_minus11, cmp_plus11);
1538  DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11,
1539  src12, src13, cmp_minus12, cmp_plus12,
1540  cmp_minus13, cmp_plus13);
1541  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1542  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1543  cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
1544  diff_plus11);
1545  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1546  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1547  cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
1548  diff_plus13);
1549  DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11,
1550  src10, src10, src_minus11, src10, src11, cmp_minus10,
1551  cmp_plus10, cmp_minus11, cmp_plus11);
1552  DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11,
1553  src12, src13, cmp_minus12, cmp_plus12, cmp_minus13,
1554  cmp_plus13);
1555  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1556  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1557  cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1558  cmp_plus11);
1559  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1560  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1561  cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1562  cmp_plus13);
1563  DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1564  diff_plus10, const1, cmp_plus10, diff_minus11, const1,
1565  cmp_minus11, diff_plus11, const1, cmp_plus11,
1566  diff_minus10, diff_plus10, diff_minus11, diff_plus11);
1567  DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
1568  diff_plus12, const1, cmp_plus12, diff_minus13, const1,
1569  cmp_minus13, diff_plus13, const1, cmp_plus13,
1570  diff_minus12, diff_plus12, diff_minus13, diff_plus13);
1571 
1572  DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
1573  diff_plus11, diff_minus12, diff_plus12, diff_minus13,
1574  diff_plus13, offset_mask0, offset_mask1, offset_mask2,
1575  offset_mask3);
1576  DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1577  offset_mask2, 2, offset_mask3, 2, offset_mask0,
1578  offset_mask1, offset_mask2, offset_mask3);
1579  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
1580  sao_offset, sao_offset, offset_mask0,
1581  offset_mask0, offset_mask0);
1582  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
1583  sao_offset, sao_offset, offset_mask1, offset_mask1,
1584  offset_mask1);
1585  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
1586  sao_offset, sao_offset, offset_mask2, offset_mask2,
1587  offset_mask2);
1588  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
1589  sao_offset, sao_offset, offset_mask3, offset_mask3,
1590  offset_mask3);
1591 
1592  src_minus10 = src12;
1593  DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128,
1594  src12, 128, src_minus11, src10, src11, src12);
1595  DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10,
1596  offset_mask1, src11, offset_mask2, src12,
1597  offset_mask3, dst0, dst1, dst2, dst3);
1598  DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
1599  128, dst0, dst1, dst2, dst3);
1600  src_minus11 = src13;
1601 
1602  __lsx_vst(dst0, dst, 0);
1603  __lsx_vstx(dst1, dst, dst_stride);
1604  __lsx_vstx(dst2, dst, dst_stride_2x);
1605  __lsx_vstx(dst3, dst, dst_stride_3x);
1606  src += src_stride_4x;
1607  dst += dst_stride_4x;
1608  }
1609  }
1610 }
1611 
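 /* diagonal SAO edge filter, 4-pixel wide columns: each pixel is compared
  * with its upper-left and lower-right neighbours (src_orig = src - 1) */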
1612 static void hevc_sao_edge_filter_45degree_4width_lsx(uint8_t *dst,
1613  int32_t dst_stride,
1614  const uint8_t *src,
1615  int32_t src_stride,
1616  const int16_t *sao_offset_val,
1617  int32_t height)
1618 {
1619  const uint8_t *src_orig;
1620  const int32_t src_stride_2x = (src_stride << 1);
1621  const int32_t dst_stride_2x = (dst_stride << 1);
1622  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1623  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1624  __m128i edge_idx = {0x403000201, 0x0};
1625  __m128i const1 = __lsx_vldi(1);
1626  __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1627  __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
1628  __m128i src_minus11, src10, src11;
1629  __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0;
1630  __m128i offset_mask0, offset_mask1;
1631  __m128i zeros = {0};
1632 
1633  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1634  src_orig = src - 1;
1635 
1636  /* load in advance */
1637  DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
1638  src_minus10, src_minus11);
1639  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1640  src10, src11);
1641 
1642  for (height -= 2; height; height -= 2) {
1643  src_orig += src_stride_2x;
1644 
1645  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
1646  shuf1, src_zero0, src_zero1);
1647  DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1648  src_plus0, src_plus1);
1649 
1650  DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1,
1651  src_minus11, src_minus10, src_minus11);
1652  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1,
1653  src_zero1, src_zero0, src_zero1);
1654  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
1655  src_minus11, cmp_minus10, cmp_minus11);
1656  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1657  cmp_minus11, diff_minus10, diff_minus11);
1658  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1659  src_minus11, cmp_minus10, cmp_minus11);
1660  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1661  cmp_minus11, cmp_minus10, cmp_minus11);
1662  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1663  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1664 
1665  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1666  diff_minus11, offset_mask0, offset_mask1);
1667  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1668  offset_mask0, offset_mask1);
1669  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1670  src_zero0, offset, dst0);
1671  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1672  sao_offset, offset, offset, offset);
1673  dst0 = __lsx_vxori_b(dst0, 128);
1674  dst0 = __lsx_vsadd_b(dst0, offset);
1675  dst0 = __lsx_vxori_b(dst0, 128);
1676 
1677  src_minus10 = src10;
1678  src_minus11 = src11;
1679 
1680  /* load in advance */
1681  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1682  src10, src11);
1683 
1684  __lsx_vstelm_w(dst0, dst, 0, 0);
1685  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1686  dst += dst_stride_2x;
1687  }
1688 
1689  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
1690  src_zero0, src_zero1);
1691  DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1692  src_plus0, src_plus1);
1693 
1694  DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11,
1695  src_minus10, src_minus11);
1696  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
1697  src_zero0, src_zero1);
1698  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1699  cmp_minus10, cmp_minus11);
1700  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1701  diff_minus10, diff_minus11);
1702  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1703  cmp_minus10, cmp_minus11);
1704  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1705  cmp_minus10, cmp_minus11);
1706  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1707  const1, cmp_minus11, diff_minus10, diff_minus11);
1708 
1709  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1710  diff_minus11, offset_mask0, offset_mask1);
1711  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
1712  offset_mask1);
1713  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1714  src_zero0, offset, dst0);
1715  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1716  sao_offset, offset, offset, offset);
1717  dst0 = __lsx_vxori_b(dst0, 128);
1718  dst0 = __lsx_vsadd_b(dst0, offset);
1719  dst0 = __lsx_vxori_b(dst0, 128);
1720 
1721  __lsx_vstelm_w(dst0, dst, 0, 0);
1722  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1723 }
1724 
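 /* upper-left / lower-right diagonal SAO edge filter, 8-pixel wide columns,
  * two rows per loop iteration */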
1725 static void hevc_sao_edge_filter_45degree_8width_lsx(uint8_t *dst,
1726  int32_t dst_stride,
1727  const uint8_t *src,
1728  int32_t src_stride,
1729  const int16_t *sao_offset_val,
1730  int32_t height)
1731 {
1732  const uint8_t *src_orig;
1733  const int32_t src_stride_2x = (src_stride << 1);
1734  const int32_t dst_stride_2x = (dst_stride << 1);
1735  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1736  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1737  __m128i edge_idx = {0x403000201, 0x0};
1738  __m128i const1 = __lsx_vldi(1);
1739  __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1740  __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1741  __m128i src_minus10, src10, src_minus11, src11;
1742  __m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0;
1743  __m128i offset_mask0, offset_mask1;
1744  __m128i zeros = {0};
1745 
1746  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1747  src_orig = src - 1;
1748 
1749  /* load in advance */
1750  DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10,
1751  src_minus11);
1752  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1753  src10, src11);
1754 
1755  for (height -= 2; height; height -= 2) {
1756  src_orig += src_stride_2x;
1757 
1758  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
1759  shuf1, src_zero0, src_zero1);
1760  DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1761  src_plus10, src_plus11);
1762 
1763  DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11,
1764  src_minus11, src_minus10, src_minus11);
1765  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
1766  src_zero0, src_zero1);
1767  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1768  cmp_minus10, cmp_minus11);
1769  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1770  cmp_minus11, diff_minus10, diff_minus11);
1771  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1772  src_minus11, cmp_minus10, cmp_minus11);
1773  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1774  cmp_minus11, cmp_minus10, cmp_minus11);
1775  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1776  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1777 
1778  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1779  diff_minus11, offset_mask0, offset_mask1);
1780  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1781  offset_mask0, offset_mask1);
1782  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1783  src_zero0, offset, dst0);
1784  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1785  sao_offset, offset, offset, offset);
1786  dst0 = __lsx_vxori_b(dst0, 128);
1787  dst0 = __lsx_vsadd_b(dst0, offset);
1788  dst0 = __lsx_vxori_b(dst0, 128);
1789 
1790  src_minus10 = src10;
1791  src_minus11 = src11;
1792 
1793  /* load in advance */
1794  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1795  src10, src11);
1796  __lsx_vstelm_d(dst0, dst, 0, 0);
1797  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1798  dst += dst_stride_2x;
1799  }
1800 
1801  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
1802  src_zero0, src_zero1);
1803  DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1804  src_plus10, src_plus11);
1805  DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11,
1806  src_minus10, src_minus11);
1807  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
1808  src_zero0, src_zero1);
1809 
1810  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1811  cmp_minus10, cmp_minus11);
1812  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1813  cmp_minus11, diff_minus10, diff_minus11);
1814  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1815  cmp_minus10, cmp_minus11);
1816  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1817  cmp_minus10, cmp_minus11);
1818  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1819  const1, cmp_minus11, diff_minus10, diff_minus11);
1820 
1821  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1822  diff_minus11, offset_mask0, offset_mask1);
1823  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
1824  offset_mask1);
1825  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1826  src_zero0, offset, dst0);
1827  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1828  sao_offset, offset, offset, offset);
1829  dst0 = __lsx_vxori_b(dst0, 128);
1830  dst0 = __lsx_vsadd_b(dst0, offset);
1831  dst0 = __lsx_vxori_b(dst0, 128);
1832 
1833  src_minus10 = src10;
1834  src_minus11 = src11;
1835 
1836  /* load in advance */
1837  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1838  src10, src11);
1839 
1840  __lsx_vstelm_d(dst0, dst, 0, 0);
1841  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1842 }
1843 
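 /* upper-left / lower-right diagonal SAO edge filter for widths that are a
  * multiple of 16, four 16-byte rows per loop iteration */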
1844 static void hevc_sao_edge_filter_45degree_16multiple_lsx(uint8_t *dst,
1845  int32_t dst_stride,
1846  const uint8_t *src,
1847  int32_t src_stride,
1848  const int16_t *
1849  sao_offset_val,
1850  int32_t width,
1851  int32_t height)
1852 {
1853  const uint8_t *src_orig = src;
1854  uint8_t *dst_orig = dst;
1855  int32_t v_cnt;
1856  const int32_t src_stride_2x = (src_stride << 1);
1857  const int32_t dst_stride_2x = (dst_stride << 1);
1858  const int32_t src_stride_4x = (src_stride << 2);
1859  const int32_t dst_stride_4x = (dst_stride << 2);
1860  const int32_t src_stride_3x = src_stride_2x + src_stride;
1861  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1862 
1863  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1864  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1865  __m128i edge_idx = {0x403000201, 0x0};
1866  __m128i const1 = __lsx_vldi(1);
1867  __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1868  __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1869  __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1870  __m128i diff_plus13, src_minus14, src_plus13;
1871  __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
1872  __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
1873  __m128i src12, src_minus12, dst2, src13, src_minus13, dst3;
1874  __m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2;
1875  __m128i src_zero3, sao_offset, src_plus12;
1876 
1877  sao_offset = __lsx_vld(sao_offset_val, 0);
1878  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1879 
1880  for (; height; height -= 4) {
1881  src_orig = src - 1;
1882  dst_orig = dst;
1883  src_minus11 = __lsx_vld(src_orig, 0);
1884  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1885  src_minus12, src_minus13);
1886  src_minus14 = __lsx_vldx(src_orig, src_stride_3x);
1887 
1888  for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1889  src_minus10 = __lsx_vld(src_orig - src_stride, 0);
1890  src_orig += 16;
1891  src10 = __lsx_vld(src_orig, 0);
1892  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig,
1893  src_stride_2x, src11, src12);
1894  src13 = __lsx_vldx(src_orig, src_stride_3x);
1895  src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1);
1896 
1897  DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
1898  src_minus12, shuf1, src12, src_minus13, shuf1,
1899  src13, src_minus14, shuf1, src_zero0, src_zero1,
1900  src_zero2, src_zero3);
1901  DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12,
1902  src_minus13, shuf2, src_plus10, src_plus11);
1903  src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2);
1904 
1905  DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
1906  src_plus10, src_zero1, src_minus11, src_zero1,
1907  src_plus11, cmp_minus10, cmp_plus10,
1908  cmp_minus11, cmp_plus11);
1909  DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
1910  src_plus12, src_zero3, src_minus13, src_zero3,
1911  src_plus13, cmp_minus12, cmp_plus12,
1912  cmp_minus13, cmp_plus13);
1913  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1914  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1915  cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
1916  diff_plus11);
1917  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1918  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1919  cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
1920  diff_plus13);
1921  DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
1922  src_plus10, src_zero1, src_minus11, src_zero1,
1923  src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1924  cmp_plus11);
1925  DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
1926  src_plus12, src_zero3, src_minus13, src_zero3,
1927  src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1928  cmp_plus13);
1929  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1930  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1931  cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1932  cmp_plus11);
1933  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1934  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1935  cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1936  cmp_plus13);
1937  DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1938  diff_plus10, const1, cmp_plus10, diff_minus11, const1,
1939  cmp_minus11, diff_plus11, const1, cmp_plus11,
1940  diff_minus10, diff_plus10, diff_minus11, diff_plus11);
1941  DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
1942  diff_plus12, const1, cmp_plus12, diff_minus13, const1,
1943  cmp_minus13, diff_plus13, const1, cmp_plus13,
1944  diff_minus12, diff_plus12, diff_minus13, diff_plus13);
1945 
1946  DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
1947  diff_plus11, diff_minus12, diff_plus12, diff_minus13,
1948  diff_plus13, offset_mask0, offset_mask1, offset_mask2,
1949  offset_mask3);
1950  DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1951  offset_mask2, 2, offset_mask3, 2, offset_mask0,
1952  offset_mask1, offset_mask2, offset_mask3);
1953 
1954  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
1955  sao_offset, sao_offset, offset_mask0, offset_mask0,
1956  offset_mask0);
1957  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
1958  sao_offset, sao_offset, offset_mask1, offset_mask1,
1959  offset_mask1);
1960  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
1961  sao_offset, sao_offset, offset_mask2, offset_mask2,
1962  offset_mask2);
1963  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
1964  sao_offset, sao_offset, offset_mask3, offset_mask3,
1965  offset_mask3);
1966 
1967  DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2,
1968  128, src_zero3, 128, src_zero0, src_zero1, src_zero2,
1969  src_zero3);
1970  DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
1971  offset_mask1, src_zero2, offset_mask2, src_zero3,
1972  offset_mask3, dst0, dst1, dst2, dst3);
1973  DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
1974  128, dst0, dst1, dst2, dst3);
1975 
1976  src_minus11 = src10;
1977  src_minus12 = src11;
1978  src_minus13 = src12;
1979  src_minus14 = src13;
1980 
1981  __lsx_vst(dst0, dst_orig, 0);
1982  __lsx_vstx(dst1, dst_orig, dst_stride);
1983  __lsx_vstx(dst2, dst_orig, dst_stride_2x);
1984  __lsx_vstx(dst3, dst_orig, dst_stride_3x);
1985  dst_orig += 16;
1986  }
1987  src += src_stride_4x;
1988  dst += dst_stride_4x;
1989  }
1990 }
1991 
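 /* opposite-diagonal SAO edge filter, 4-pixel wide columns: each pixel is
  * compared with its upper-right and lower-left neighbours */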
1992 static void hevc_sao_edge_filter_135degree_4width_lsx(uint8_t *dst,
1993  int32_t dst_stride,
1994  const uint8_t *src,
1995  int32_t src_stride,
1996  const int16_t *sao_offset_val,
1997  int32_t height)
1998 {
1999  const uint8_t *src_orig;
2000  const int32_t src_stride_2x = (src_stride << 1);
2001  const int32_t dst_stride_2x = (dst_stride << 1);
2002 
2003  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
2004  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
2005  __m128i edge_idx = {0x403000201, 0x0};
2006  __m128i const1 = __lsx_vldi(1);
2007  __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
2008  __m128i src_zero0, src_zero1, dst0;
2009  __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2010  __m128i src_minus10, src10, src_minus11, src11;
2011  __m128i offset_mask0, offset_mask1;
2012  __m128i zeros = {0};
2013 
2014  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
2015  src_orig = src - 1;
2016 
2017  /* load in advance */
2018  DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
2019  src_minus10, src_minus11);
2020  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2021  src10, src11);
2022 
2023  for (height -= 2; height; height -= 2) {
2024  src_orig += src_stride_2x;
2025 
2026  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
2027  shuf1, src_zero0, src_zero1);
2028  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
2029  shuf2, src_minus10, src_minus11);
2030 
2031  DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
2032  src_minus10, src_minus11);
2033  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
2034  src_zero0, src_zero1);
2035  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
2036  cmp_minus10, cmp_minus11);
2037  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2038  cmp_minus11, diff_minus10, diff_minus11);
2039  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
2040  src_minus11, cmp_minus10, cmp_minus11);
2041  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2042  cmp_minus11, cmp_minus10, cmp_minus11);
2043  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
2044  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
2045 
2046  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
2047  diff_minus11, offset_mask0, offset_mask1);
2048  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
2049  offset_mask0, offset_mask1);
2050  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
2051  src_zero0, offset, dst0);
2052  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
2053  sao_offset, offset, offset, offset);
2054  dst0 = __lsx_vxori_b(dst0, 128);
2055  dst0 = __lsx_vsadd_b(dst0, offset);
2056  dst0 = __lsx_vxori_b(dst0, 128);
2057 
2058  src_minus10 = src10;
2059  src_minus11 = src11;
2060 
2061  /* load in advance */
2062  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2063  src10, src11);
2064 
2065  __lsx_vstelm_w(dst0, dst, 0, 0);
2066  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
2067  dst += dst_stride_2x;
2068  }
2069 
2070  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
2071  src_zero0, src_zero1);
2072  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
2073  shuf2, src_minus10, src_minus11);
2074 
2075  DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
2076  src_minus10, src_minus11);
2077  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
2078  src_zero0, src_zero1);
2079  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
2080  cmp_minus10, cmp_minus11);
2081  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2082  cmp_minus11, diff_minus10, diff_minus11);
2083  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
2084  cmp_minus10, cmp_minus11);
2085  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
2086  cmp_minus10, cmp_minus11);
2087  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
2088  const1, cmp_minus11, diff_minus10, diff_minus11);
2089 
2090  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
2091  diff_minus11, offset_mask0, offset_mask1);
2092  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
2093  offset_mask1);
2094  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
2095  src_zero0, offset, dst0);
2096  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
2097  sao_offset, offset, offset, offset);
2098  dst0 = __lsx_vxori_b(dst0, 128);
2099  dst0 = __lsx_vsadd_b(dst0, offset);
2100  dst0 = __lsx_vxori_b(dst0, 128);
2101 
2102  __lsx_vstelm_w(dst0, dst, 0, 0);
2103  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
2104  dst += dst_stride_2x;
2105 }
2106 
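 /* upper-right / lower-left diagonal SAO edge filter, 8-pixel wide columns,
  * two rows per loop iteration */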
2107 static void hevc_sao_edge_filter_135degree_8width_lsx(uint8_t *dst,
2108  int32_t dst_stride,
2109  const uint8_t *src,
2110  int32_t src_stride,
2111  const int16_t *sao_offset_val,
2112  int32_t height)
2113 {
2114  const uint8_t *src_orig;
2115  const int32_t src_stride_2x = (src_stride << 1);
2116  const int32_t dst_stride_2x = (dst_stride << 1);
2117 
2118  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
2119  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
2120  __m128i edge_idx = {0x403000201, 0x0};
2121  __m128i const1 = __lsx_vldi(1);
2122  __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
2123  __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2124  __m128i src_minus10, src10, src_minus11, src11;
2125  __m128i src_zero0, src_zero1, dst0;
2126  __m128i offset_mask0, offset_mask1;
2127  __m128i zeros = {0};
2128 
2129  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
2130  src_orig = src - 1;
2131 
2132  /* load in advance */
2133  DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
2134  src_minus10, src_minus11);
2135  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2136  src10, src11);
2137 
2138  for (height -= 2; height; height -= 2) {
2139  src_orig += src_stride_2x;
2140 
2141  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
2142  shuf1, src_zero0, src_zero1);
2143  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
2144  shuf2, src_minus10, src_minus11);
2145 
2146  DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
2147  src_minus10, src_minus11);
2148  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
2149  src_zero0, src_zero1);
2150  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
2151  cmp_minus10, cmp_minus11);
2152  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2153  cmp_minus11, diff_minus10, diff_minus11);
2154  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
2155  src_minus11, cmp_minus10, cmp_minus11);
2156  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2157  cmp_minus11, cmp_minus10, cmp_minus11);
2158  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
2159  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
2160 
2161  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
2162  diff_minus11, offset_mask0, offset_mask1);
2163  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
2164  offset_mask0, offset_mask1);
2165  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
2166  src_zero0, offset, dst0);
2167  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
2168  sao_offset, offset, offset, offset);
2169  dst0 = __lsx_vxori_b(dst0, 128);
2170  dst0 = __lsx_vsadd_b(dst0, offset);
2171  dst0 = __lsx_vxori_b(dst0, 128);
2172 
2173  src_minus10 = src10;
2174  src_minus11 = src11;
2175 
2176  /* load in advance */
2177  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2178  src10, src11);
2179 
2180  __lsx_vstelm_d(dst0, dst, 0, 0);
2181  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
2182  dst += dst_stride_2x;
2183  }
2184 
2185  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
2186  src_zero0, src_zero1);
2187  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
2188  shuf2, src_minus10, src_minus11);
2189 
2190  DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
2191  src_minus10, src_minus11);
2192  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
2193  src_zero0, src_zero1);
2194  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
2195  cmp_minus10, cmp_minus11);
2196  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
2197  diff_minus10, diff_minus11);
2198  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
2199  cmp_minus10, cmp_minus11);
2200  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
2201  cmp_minus10, cmp_minus11);
2202  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
2203  const1, cmp_minus11, diff_minus10, diff_minus11);
2204 
2205  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
2206  diff_minus11, offset_mask0, offset_mask1);
2207  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
2208  offset_mask1);
2209  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
2210  src_zero0, offset, dst0);
2211  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
2212  sao_offset, offset, offset, offset);
2213  dst0 = __lsx_vxori_b(dst0, 128);
2214  dst0 = __lsx_vsadd_b(dst0, offset);
2215  dst0 = __lsx_vxori_b(dst0, 128);
2216 
2217  __lsx_vstelm_d(dst0, dst, 0, 0);
2218  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
2219 }
2220 
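 /* upper-right / lower-left diagonal SAO edge filter for widths that are a
  * multiple of 16, four 16-byte rows per loop iteration */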
2221 static void hevc_sao_edge_filter_135degree_16multiple_lsx(uint8_t *dst,
2222  int32_t dst_stride,
2223  const uint8_t *src,
2224  int32_t src_stride,
2225  const int16_t *sao_offset_val,
2226  int32_t width,
2227  int32_t height)
2228 {
2229  const uint8_t *src_orig;
2230  uint8_t *dst_orig;
2231  int32_t v_cnt;
2232  const int32_t src_stride_2x = (src_stride << 1);
2233  const int32_t dst_stride_2x = (dst_stride << 1);
2234  const int32_t src_stride_4x = (src_stride << 2);
2235  const int32_t dst_stride_4x = (dst_stride << 2);
2236  const int32_t src_stride_3x = src_stride_2x + src_stride;
2237  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
2238 
2239  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
2240  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
2241  __m128i edge_idx = {0x403000201, 0x0};
2242  __m128i const1 = __lsx_vldi(1);
2243  __m128i dst0, dst1, dst2, dst3;
2244  __m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
2245  __m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
2246  __m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
2247  __m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
2248  __m128i src_plus10, src_plus11, src_plus12, src_plus13;
2249  __m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
2250  __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
2251 
2252  sao_offset = __lsx_vld(sao_offset_val, 0);
2253  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
2254 
2255  for (; height; height -= 4) {
2256  src_orig = src - 1;
2257  dst_orig = dst;
2258 
2259  src_minus11 = __lsx_vld(src_orig, 0);
2260  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2261  src_plus10, src_plus11);
2262  src_plus12 = __lsx_vldx(src_orig, src_stride_3x);
2263 
2264  for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
2265  src_minus10 = __lsx_vld(src_orig - src_stride, 2);
2266  src_plus13 = __lsx_vldx(src_orig, src_stride_4x);
2267  src_orig += 16;
2268  src10 = __lsx_vld(src_orig, 0);
2269  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2270  src11, src12);
2271  src13 = __lsx_vldx(src_orig, src_stride_3x);
2272 
2273  DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
2274  src_plus10, shuf1, src12, src_plus11, shuf1, src13,
2275  src_plus12, shuf1, src_zero0, src_zero1, src_zero2,
2276  src_zero3);
2277  src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2);
2278  DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12,
2279  src_plus11, shuf2, src_minus12, src_minus13);
2280 
2281  DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
2282  src_plus10, src_zero1, src_minus11, src_zero1,
2283  src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
2284  cmp_plus11);
2285  DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
2286  src_plus12, src_zero3, src_minus13, src_zero3,
2287  src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
2288  cmp_plus13);
2289  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
2290  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
2291  cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
2292  diff_plus11);
2293  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
2294  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
2295  cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
2296  diff_plus13);
2297  DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
2298  src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
2299  cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
2300  DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
2301  src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
2302  cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
2303  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
2304  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
2305  cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
2306  cmp_plus11);
2307  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
2308  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
2309  cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
2310  cmp_plus13);
2311  DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
2312  diff_plus10, const1, cmp_plus10, diff_minus11, const1,
2313  cmp_minus11, diff_plus11, const1, cmp_plus11,
2314  diff_minus10, diff_plus10, diff_minus11, diff_plus11);
2315  DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
2316  diff_plus12, const1, cmp_plus12, diff_minus13, const1,
2317  cmp_minus13, diff_plus13, const1, cmp_plus13,
2318  diff_minus12, diff_plus12, diff_minus13, diff_plus13);
2319 
2320  DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
2321  diff_plus11, diff_minus12, diff_plus12, diff_minus13,
2322  diff_plus13, offset_mask0, offset_mask1, offset_mask2,
2323  offset_mask3);
2324  DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
2325  offset_mask2, 2, offset_mask3, 2, offset_mask0,
2326  offset_mask1, offset_mask2, offset_mask3);
2327 
2328  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
2329  sao_offset, sao_offset, offset_mask0, offset_mask0,
2330  offset_mask0);
2331  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
2332  sao_offset, sao_offset, offset_mask1, offset_mask1,
2333  offset_mask1);
2334  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
2335  sao_offset, sao_offset, offset_mask2, offset_mask2,
2336  offset_mask2);
2337  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
2338  sao_offset, sao_offset, offset_mask3, offset_mask3,
2339  offset_mask3);
2340 
2341  DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
2342  src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
2343  src_zero2, src_zero3);
2344  DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
2345  offset_mask1, src_zero2, offset_mask2, src_zero3,
2346  offset_mask3, dst0, dst1, dst2, dst3);
2347  DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
2348  128, dst0, dst1, dst2, dst3);
2349 
2350  src_minus11 = src10;
2351  src_plus10 = src11;
2352  src_plus11 = src12;
2353  src_plus12 = src13;
2354 
2355  __lsx_vst(dst0, dst_orig, 0);
2356  __lsx_vstx(dst1, dst_orig, dst_stride);
2357  __lsx_vstx(dst2, dst_orig, dst_stride_2x);
2358  __lsx_vstx(dst3, dst_orig, dst_stride_3x);
2359  dst_orig += 16;
2360  }
2361 
2362  src += src_stride_4x;
2363  dst += dst_stride_4x;
2364  }
2365 }
2366 
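 /* dispatch on the edge-offset class: the 16-aligned part of each row goes
  * to the 16-multiple kernel, then an 8-pixel and a 4-pixel tail kernel
  * handle the remaining width; stride_src is the fixed SAO buffer stride */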
2367 void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, const uint8_t *src,
2368  ptrdiff_t stride_dst,
2369  const int16_t *sao_offset_val,
2370  int eo, int width, int height)
2371 {
2372  ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
2373 
2374  switch (eo) {
2375  case 0:
2376  if (width >> 4) {
2377  hevc_sao_edge_filter_0degree_16multiple_lsx(dst, stride_dst,
2378  src, stride_src,
2379  sao_offset_val,
2380  width - (width & 0x0F),
2381  height);
2382  dst += width & 0xFFFFFFF0;
2383  src += width & 0xFFFFFFF0;
2384  width &= 0x0F;
2385  }
2386 
2387  if (width >> 3) {
2388  hevc_sao_edge_filter_0degree_8width_lsx(dst, stride_dst,
2389  src, stride_src,
2390  sao_offset_val, height);
2391  dst += 8;
2392  src += 8;
2393  width &= 0x07;
2394  }
2395 
2396  if (width) {
2397  hevc_sao_edge_filter_0degree_4width_lsx(dst, stride_dst,
2398  src, stride_src,
2399  sao_offset_val, height);
2400  }
2401  break;
2402 
2403  case 1:
2404  if (width >> 4) {
2405  hevc_sao_edge_filter_90degree_16multiple_lsx(dst, stride_dst,
2406  src, stride_src,
2407  sao_offset_val,
2408  width - (width & 0x0F),
2409  height);
2410  dst += width & 0xFFFFFFF0;
2411  src += width & 0xFFFFFFF0;
2412  width &= 0x0F;
2413  }
2414 
2415  if (width >> 3) {
2416  hevc_sao_edge_filter_90degree_8width_lsx(dst, stride_dst,
2417  src, stride_src,
2418  sao_offset_val, height);
2419  dst += 8;
2420  src += 8;
2421  width &= 0x07;
2422  }
2423 
2424  if (width) {
2425  hevc_sao_edge_filter_90degree_4width_lsx(dst, stride_dst,
2426  src, stride_src,
2427  sao_offset_val, height);
2428  }
2429  break;
2430 
2431  case 2:
2432  if (width >> 4) {
2433  hevc_sao_edge_filter_45degree_16multiple_lsx(dst, stride_dst,
2434  src, stride_src,
2435  sao_offset_val,
2436  width - (width & 0x0F),
2437  height);
2438  dst += width & 0xFFFFFFF0;
2439  src += width & 0xFFFFFFF0;
2440  width &= 0x0F;
2441  }
2442 
2443  if (width >> 3) {
2444  hevc_sao_edge_filter_45degree_8width_lsx(dst, stride_dst,
2445  src, stride_src,
2446  sao_offset_val, height);
2447  dst += 8;
2448  src += 8;
2449  width &= 0x07;
2450  }
2451 
2452  if (width) {
2453  hevc_sao_edge_filter_45degree_4width_lsx(dst, stride_dst,
2454  src, stride_src,
2455  sao_offset_val, height);
2456  }
2457  break;
2458 
2459  case 3:
2460  if (width >> 4) {
2461  hevc_sao_edge_filter_135degree_16multiple_lsx(dst, stride_dst,
2462  src, stride_src,
2463  sao_offset_val,
2464  width - (width & 0x0F),
2465  height);
2466  dst += width & 0xFFFFFFF0;
2467  src += width & 0xFFFFFFF0;
2468  width &= 0x0F;
2469  }
2470 
2471  if (width >> 3) {
2472  hevc_sao_edge_filter_135degree_8width_lsx(dst, stride_dst,
2473  src, stride_src,
2474  sao_offset_val, height);
2475  dst += 8;
2476  src += 8;
2477  width &= 0x07;
2478  }
2479 
2480  if (width) {
2481  hevc_sao_edge_filter_135degree_4width_lsx(dst, stride_dst,
2482  src, stride_src,
2483  sao_offset_val, height);
2484  }
2485  break;
2486  }
2487 }