/*
 * APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2)
 *
 * Horizontal filter producing 16 output bytes in `out`.
 * What the visible statements do:
 *   - tmp0 / tmp1 are inp0 / inp1 shuffled with themselves through `mask`.
 *   - data0..data2 come from __msa_sldi_b on (inp0, tmp0) with shift
 *     amounts 15, 14, 13; data3..data5 from (tmp1, inp1) with 1, 2, 3.
 *   - Four byte-pair vectors are interleaved: (inp1, inp0),
 *     (data3, data0), (data4, data1), (data5, data2), each as right/left
 *     halves (_r / _l).
 *   - sum0 = HADD(pair0) * coef0, then DPADD(pair2, coef2) accumulates
 *     into it.
 *   - sum3 = HADD(pair3), then DPADD(pair1, coef1) accumulates into it.
 *   - result = sum0 - sum3, passed to SRARI_H2_SH with 5, then
 *     CLIP_SH2_0_255, then packed to bytes with __msa_pckev_b.
 *
 * coef0 is cast to v8u16 and multiplied element-wise with the halfword
 * sums; coef1 / coef2 are handed to DPADD_UB2_UH.
 * The helper macros (VSHF_B2_UB, ILVRL_B2_UH, HADD_UB2_UH, DPADD_UB2_UH,
 * SRARI_H2_SH, CLIP_SH2_0_255) are defined outside this listing;
 * presumably pairwise-add / dot-product-accumulate / rounding shift /
 * clamp to 0..255 as their names suggest -- TODO confirm.
 *
 * NOTE(review): res_r / res_l are assigned below but no declaration is
 * visible in this macro (the NO_ROUND twin declares them as v8i16), and
 * neither the statement-expression braces nor the line yielding `out`
 * appear.  The embedded numbering skips 25, 28, 31 and 54-57, so these
 * lines look lost from this listing -- confirm against the original.
 */
24 #define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2) \
26 v16u8 out, tmp0, tmp1; \
27 v16u8 data0, data1, data2, data3, data4, data5; \
29 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
30 v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
32 VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1); \
33 ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \
34 data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \
35 data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \
36 HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \
37 ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \
38 data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \
39 data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \
40 sum0_r *= (v8u16) (coef0); \
41 sum0_l *= (v8u16) (coef0); \
42 ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \
43 data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \
44 data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \
45 DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
46 ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \
47 HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
48 DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
49 res_r = (v8i16) (sum0_r - sum3_r); \
50 res_l = (v8i16) (sum0_l - sum3_l); \
51 SRARI_H2_SH(res_r, res_l, 5); \
52 CLIP_SH2_0_255(res_r, res_l); \
53 out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
/*
 * APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0..mask3,
 *                               coef0, coef1, coef2)
 *
 * Horizontal filter applied to two input vectors at once.
 * What the visible statements do:
 *   - Each input is shuffled with itself through mask0..mask3, giving
 *     sum0..sum3 for inp0 and sum4..sum7 for inp1.
 *   - sum0 / sum4 = DOTP(mask0 shuffle, coef0), then
 *     DPADD(mask2 shuffle, coef2) accumulates into them.
 *   - sum3 / sum7 = HADD(mask3 shuffle), then
 *     DPADD(mask1 shuffle, coef1) accumulates into them.
 *   - res0_r = sum0 - sum3 and res1_r = sum4 - sum7 go through
 *     SRARI_H2_SH with 5 and CLIP_SH2_0_255.
 *   - __msa_pckev_b(res1_r, res0_r) packs both results into `out`.
 *
 * NOTE(review): `out` is assigned but no declaration, opening brace or
 * yielded value is visible here; the embedded numbering skips 61-62,
 * 66 and 80-83, so those lines look lost from this listing -- confirm.
 */
58 #define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, \
59 mask0, mask1, mask2, mask3, \
60 coef0, coef1, coef2) \
63 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
64 v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \
65 v8i16 res0_r, res1_r; \
67 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \
68 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \
69 HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \
70 DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \
71 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \
72 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \
73 DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \
74 DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \
75 res0_r = (v8i16) (sum0_r - sum3_r); \
76 res1_r = (v8i16) (sum4_r - sum7_r); \
77 SRARI_H2_SH(res0_r, res1_r, 5); \
78 CLIP_SH2_0_255(res0_r, res1_r); \
79 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \
/*
 * APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0..mask3,
 *                                    coef0, coef1, coef2)
 *
 * Single-input form of the horizontal filter.
 * What the visible statements do:
 *   - inp0 is shuffled with itself through mask0 and mask3 (into sum0_r,
 *     sum3_r), then through mask2 and mask1 (into sum2_r, sum1_r).
 *   - sum3_r = __msa_hadd_u_h(sum3_r, sum3_r);
 *     sum0_r = __msa_dotp_u_h(sum0_r, coef0).
 *   - One DPADD_UB2_UH call accumulates (sum2_r, coef2) into sum0_r and
 *     (sum1_r, coef1) into sum3_r.
 *   - res0_r = sum0_r - sum3_r, then __msa_srari_h by 5, then
 *     CLIP_SH_0_255.
 *   - __msa_pckev_b is given res0_r for both operands, so the same
 *     source feeds both halves of `out`.
 *
 * NOTE(review): res0_r and `out` are used without a visible declaration
 * and no closing lines are shown; the embedded numbering skips 87-89, 91
 * and 101-104 -- likely lost from this listing, confirm.
 */
84 #define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, \
85 mask0, mask1, mask2, mask3, \
86 coef0, coef1, coef2) \
90 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
92 VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \
93 sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \
94 sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0); \
95 VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \
96 DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \
97 res0_r = (v8i16) (sum0_r - sum3_r); \
98 res0_r = __msa_srari_h(res0_r, 5); \
99 res0_r = CLIP_SH_0_255(res0_r); \
100 out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \
/*
 * APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
 *                                             mask2, mask3, coef0, ...)
 *
 * "No round" counterpart of the single-input horizontal filter.
 * The visible statements match the rounding variant up to the
 * subtraction: mask0/mask3 and mask2/mask1 shuffles, __msa_hadd_u_h on
 * sum3_r, __msa_dotp_u_h of sum0_r with coef0, one DPADD_UB2_UH call
 * (coef2 into sum0_r, coef1 into sum3_r), res0_r = sum0_r - sum3_r,
 * CLIP_SH_0_255, and __msa_pckev_b into `out`.
 *
 * NOTE(review): the parameter list is cut after `coef0,` although the
 * body uses coef1 and coef2.  No shift or bias step is visible between
 * the subtraction and the clip (embedded numbering jumps 118 -> 121),
 * and res0_r / `out` have no visible declaration.  These lines look
 * lost from this listing -- confirm against the original file.
 */
105 #define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, \
106 mask2, mask3, coef0, \
111 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
113 VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \
114 sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \
115 sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0); \
116 VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \
117 DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \
118 res0_r = (v8i16) (sum0_r - sum3_r); \
121 res0_r = CLIP_SH_0_255(res0_r); \
122 out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \
/*
 * APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, coef0, coef1, coef2)
 *
 * "No round" counterpart of the 16-byte horizontal filter.
 * What the visible statements do:
 *   - tmp0 / tmp1 are inp0 / inp1 shuffled with themselves through `mask`.
 *   - data0..data2 come from __msa_sldi_b on (inp0, tmp0) with shift
 *     amounts 15, 14, 13; data3..data5 from (tmp1, inp1) with 1, 2, 3.
 *   - Byte pairs (inp1, inp0), (data3, data0), (data4, data1),
 *     (data5, data2) are interleaved into right/left halves.
 *   - sum0 = HADD(pair0) * coef0, then DPADD(pair2, coef2) accumulates
 *     into it.
 *   - sum3 = HADD(pair3), then DPADD(pair1, coef1) accumulates into it.
 *   - res = sum0 - sum3, then CLIP_SH2_0_255, then packed into `out`
 *     with __msa_pckev_b.
 *
 * NOTE(review): unlike the rounding variant, no shift is visible between
 * the subtraction and the clip.  The embedded numbering jumps
 * 154 -> 159, so the scaling step looks lost from this listing, as do
 * the closing lines -- confirm against the original file.
 */
127 #define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, \
128 coef0, coef1, coef2) \
130 v16u8 out, tmp0, tmp1; \
131 v16u8 data0, data1, data2, data3, data4, data5; \
132 v8i16 res_r, res_l; \
133 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
134 v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
136 VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1); \
137 ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \
138 data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \
139 data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \
140 HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \
141 ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \
142 data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \
143 data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \
144 sum0_r *= (v8u16) (coef0); \
145 sum0_l *= (v8u16) (coef0); \
146 ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \
147 data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \
148 data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \
149 DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
150 ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \
151 HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
152 DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
153 res_r = (v8i16) (sum0_r - sum3_r); \
154 res_l = (v8i16) (sum0_l - sum3_l); \
159 CLIP_SH2_0_255(res_r, res_l); \
160 out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
/*
 * APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0..mask3,
 *                                        coef0, coef1, coef2)
 *
 * "No round" counterpart of the two-input horizontal filter.
 * What the visible statements do:
 *   - Each input is shuffled with itself through mask0..mask3, giving
 *     sum0..sum3 for inp0 and sum4..sum7 for inp1.
 *   - sum0 / sum4 = DOTP(mask0 shuffle, coef0), then
 *     DPADD(mask2 shuffle, coef2) accumulates into them.
 *   - sum3 / sum7 = HADD(mask3 shuffle), then
 *     DPADD(mask1 shuffle, coef1) accumulates into them.
 *   - res0_r = sum0 - sum3 and res1_r = sum4 - sum7 are clipped with
 *     CLIP_SH2_0_255 and packed into `out` with __msa_pckev_b.
 *
 * NOTE(review): no shift is visible between the subtraction and the
 * clip (embedded numbering jumps 183 -> 188), and `out` has no visible
 * declaration.  Lines look lost from this listing -- confirm.
 */
165 #define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, \
166 mask0, mask1, mask2, mask3, \
167 coef0, coef1, coef2) \
170 v8i16 res0_r, res1_r; \
171 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
172 v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \
174 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \
175 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \
176 HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \
177 DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \
178 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \
179 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \
180 DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \
181 DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \
182 res0_r = (v8i16) (sum0_r - sum3_r); \
183 res1_r = (v8i16) (sum4_r - sum7_r); \
188 CLIP_SH2_0_255(res0_r, res1_r); \
189 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \
/*
 * APPLY_VERT_QPEL_FILTER(inp0..inp7, coef0, coef1, coef2)
 *
 * Vertical filter over eight input vectors.
 * What the visible statements do:
 *   - Inputs are paired by interleaving: (inp4, inp0), (inp5, inp1),
 *     (inp6, inp2), (inp7, inp3), each as right/left halves (_r / _l).
 *   - sum0 = DOTP(pair(inp4, inp0), coef0), then
 *     DPADD(pair(inp6, inp2), coef2) accumulates into it.
 *   - sum3 = HADD(pair(inp7, inp3)), then
 *     DPADD(pair(inp5, inp1), coef1) accumulates into it.
 *   - res_r / res_l = sum0 - sum3, passed to SRARI_H2_SH with 5, then
 *     CLIP_SH2_0_255, then packed into `res` with __msa_pckev_b.
 *
 * Which rows callers pass as inp0..inp7 is not visible at this
 * definition.
 *
 * NOTE(review): `res` is assigned without a visible declaration and no
 * opening/closing lines are shown (embedded numbering skips 197-198,
 * 202 and 216-219) -- likely lost from this listing, confirm.
 */
194 #define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3, \
195 inp4, inp5, inp6, inp7, \
196 coef0, coef1, coef2) \
199 v8i16 res_r, res_l; \
200 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
201 v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
203 ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \
204 ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \
205 DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l); \
206 HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
207 ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \
208 ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \
209 DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
210 DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
211 res_r = (v8i16) (sum0_r - sum3_r); \
212 res_l = (v8i16) (sum0_l - sum3_l); \
213 SRARI_H2_SH(res_r, res_l, 5); \
214 CLIP_SH2_0_255(res_r, res_l); \
215 res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
/*
 * APPLY_VERT_QPEL_FILTER_8BYTE(inp00..inp07, inp10..inp17,
 *                              coef0, coef1, coef2)
 *
 * Vertical filter evaluated for two sets of eight input vectors at once
 * (set 0: inp00..inp07, set 1: inp10..inp17).
 * What the visible statements do:
 *   - ILVR_B4_UH interleaves (inpX4, inpX0) and (inpX7, inpX3) into
 *     sumX0 / sumX3, and (inpX6, inpX2) and (inpX5, inpX1) into
 *     sumX2 / sumX1, for X = 0 and 1.  Presumably only the right halves
 *     are used, going by the ILVR name -- TODO confirm.
 *   - sumX0 = DOTP(sumX0, coef0), then DPADD(sumX2, coef2) accumulates
 *     into it.
 *   - sumX3 = HADD(sumX3), then DPADD(sumX1, coef1) accumulates into it.
 *   - val0 = sum00 - sum03 and val1 = sum10 - sum13 go through
 *     SRARI_H2_SH with 5 and CLIP_SH2_0_255.
 *   - __msa_pckev_b(val1, val0) packs both results into `res`.
 *
 * NOTE(review): val0, val1 and `res` are used without a visible
 * declaration and no closing lines are shown (embedded numbering skips
 * 225-227, 230 and 244-247) -- likely lost from this listing, confirm.
 */
220 #define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03, \
221 inp04, inp05, inp06, inp07, \
222 inp10, inp11, inp12, inp13, \
223 inp14, inp15, inp16, inp17, \
224 coef0, coef1, coef2) \
228 v8u16 sum00, sum01, sum02, sum03; \
229 v8u16 sum10, sum11, sum12, sum13; \
231 ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \
232 sum00, sum10, sum03, sum13); \
233 DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \
234 HADD_UB2_UH(sum03, sum13, sum03, sum13); \
235 ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \
236 sum02, sum12, sum01, sum11); \
237 DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \
238 DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \
239 val0 = (v8i16) (sum00 - sum03); \
240 val1 = (v8i16) (sum10 - sum13); \
241 SRARI_H2_SH(val0, val1, 5); \
242 CLIP_SH2_0_255(val0, val1); \
243 res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \
/*
 * APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0..inp7, coef0, coef1, coef2)
 *
 * "No round" counterpart of the vertical filter.
 * What the visible statements do:
 *   - Inputs are paired by interleaving: (inp4, inp0), (inp5, inp1),
 *     (inp6, inp2), (inp7, inp3), each as right/left halves.
 *   - sum0 = DOTP(pair(inp4, inp0), coef0), then
 *     DPADD(pair(inp6, inp2), coef2) accumulates into it.
 *   - sum3 = HADD(pair(inp7, inp3)), then
 *     DPADD(pair(inp5, inp1), coef1) accumulates into it.
 *   - res_r / res_l = sum0 - sum3, clipped with CLIP_SH2_0_255 and
 *     packed into `res` with __msa_pckev_b.
 *
 * NOTE(review): no shift is visible between the subtraction and the
 * clip (embedded numbering jumps 266 -> 271), and `res` has no visible
 * declaration.  Lines look lost from this listing -- confirm against
 * the original file.
 */
248 #define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3, \
249 inp4, inp5, inp6, inp7, \
250 coef0, coef1, coef2) \
253 v8i16 res_r, res_l; \
254 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
255 v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
257 ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \
258 ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \
259 DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l); \
260 HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
261 ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \
262 ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \
263 DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
264 DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
265 res_r = (v8i16) (sum0_r - sum3_r); \
266 res_l = (v8i16) (sum0_l - sum3_l); \
271 CLIP_SH2_0_255(res_r, res_l); \
272 res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
/*
 * APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00..inp07, inp10..inp17,
 *                                       coef0, coef1, coef2)
 *
 * "No round" counterpart of the two-set vertical filter
 * (set 0: inp00..inp07, set 1: inp10..inp17).
 * What the visible statements do:
 *   - ILVR_B4_UH interleaves (inpX4, inpX0) and (inpX7, inpX3) into
 *     sumX0 / sumX3, and (inpX6, inpX2) and (inpX5, inpX1) into
 *     sumX2 / sumX1, for X = 0 and 1.
 *   - sumX0 = DOTP(sumX0, coef0), then DPADD(sumX2, coef2) accumulates
 *     into it.
 *   - sumX3 = HADD(sumX3), then DPADD(sumX1, coef1) accumulates into it.
 *   - val0 = sum00 - sum03 and val1 = sum10 - sum13 are clipped with
 *     CLIP_SH2_0_255 and packed into `res` with __msa_pckev_b.
 *
 * NOTE(review): no shift is visible between the subtraction and the
 * clip (embedded numbering jumps 297 -> 302), and val0 / val1 / `res`
 * have no visible declaration.  Lines look lost from this listing --
 * confirm against the original file.
 */
277 #define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03, \
278 inp04, inp05, inp06, inp07, \
279 inp10, inp11, inp12, inp13, \
280 inp14, inp15, inp16, inp17, \
281 coef0, coef1, coef2) \
285 v8u16 sum00, sum01, sum02, sum03; \
286 v8u16 sum10, sum11, sum12, sum13; \
288 ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \
289 sum00, sum10, sum03, sum13); \
290 DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \
291 HADD_UB2_UH(sum03, sum13, sum03, sum13); \
292 ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \
293 sum02, sum12, sum01, sum11); \
294 DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \
295 DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \
296 val0 = (v8i16) (sum00 - sum03); \
297 val1 = (v8i16) (sum10 - sum13); \
302 CLIP_SH2_0_255(val0, val1); \
303 res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \
315 v16u8 inp0, inp1, inp2, inp3;
317 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
318 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
319 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
320 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
321 v16u8 const20 = (v16u8) __msa_ldi_b(20);
322 v16u8 const6 = (v16u8) __msa_ldi_b(6);
323 v16u8 const3 = (v16u8) __msa_ldi_b(3);
325 for (loop_count = (
height >> 2); loop_count--;) {
326 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
327 src += (4 * src_stride);
329 mask0, mask1, mask2, mask3,
330 const20, const6, const3);
332 mask0, mask1, mask2, mask3,
333 const20, const6, const3);
334 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
335 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
337 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
338 dst += (4 * dst_stride);
349 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
351 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
352 v16u8 const6 = (v16u8) __msa_ldi_b(6);
353 v16u8 const3 = (v16u8) __msa_ldi_b(3);
354 v8u16 const20 = (v8u16) __msa_ldi_h(20);
356 for (loop_count = (
height >> 2); loop_count--;) {
357 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
358 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
359 src += (4 * src_stride);
361 const20, const6, const3);
362 res = __msa_aver_u_b(inp0, res);
367 const20, const6, const3);
368 res = __msa_aver_u_b(inp2, res);
373 const20, const6, const3);
374 res = __msa_aver_u_b(inp4, res);
379 const20, const6, const3);
380 res = __msa_aver_u_b(inp6, res);
393 v16u8 inp0, inp1, inp2, inp3;
395 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
396 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
397 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
398 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
399 v16u8 const20 = (v16u8) __msa_ldi_b(20);
400 v16u8 const6 = (v16u8) __msa_ldi_b(6);
401 v16u8 const3 = (v16u8) __msa_ldi_b(3);
403 for (loop_count = (
height >> 2); loop_count--;) {
404 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
405 src += (4 * src_stride);
407 mask0, mask1, mask2, mask3,
408 const20, const6, const3);
410 mask0, mask1, mask2, mask3,
411 const20, const6, const3);
412 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
413 dst += (4 * dst_stride);
424 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
426 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
427 v8u16 const20 = (v8u16) __msa_ldi_h(20);
428 v16u8 const6 = (v16u8) __msa_ldi_b(6);
429 v16u8 const3 = (v16u8) __msa_ldi_b(3);
431 for (loop_count = (
height >> 2); loop_count--;) {
432 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
433 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
434 src += (4 * src_stride);
436 const20, const6, const3);
441 const20, const6, const3);
446 const20, const6, const3);
451 const20, const6, const3);
464 v16u8 inp0, inp1, inp2, inp3;
466 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
467 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
468 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
469 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
470 v16u8 const20 = (v16u8) __msa_ldi_b(20);
471 v16u8 const6 = (v16u8) __msa_ldi_b(6);
472 v16u8 const3 = (v16u8) __msa_ldi_b(3);
474 for (loop_count = (
height >> 2); loop_count--;) {
475 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
476 src += (4 * src_stride);
478 mask0, mask1, mask2, mask3,
479 const20, const6, const3);
481 mask0, mask1, mask2, mask3,
482 const20, const6, const3);
483 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
484 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
485 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
486 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
488 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
489 dst += (4 * dst_stride);
500 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
502 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
503 v8u16 const20 = (v8u16) __msa_ldi_h(20);
504 v16u8 const6 = (v16u8) __msa_ldi_b(6);
505 v16u8 const3 = (v16u8) __msa_ldi_b(3);
507 for (loop_count = (
height >> 2); loop_count--;) {
508 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
509 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
510 src += (4 * src_stride);
512 const20, const6, const3);
513 res = __msa_aver_u_b(res, inp1);
518 const20, const6, const3);
519 res = __msa_aver_u_b(res, inp3);
524 const20, const6, const3);
525 res = __msa_aver_u_b(res, inp5);
530 const20, const6, const3);
531 res = __msa_aver_u_b(res, inp7);
544 v16u8 inp0, inp1, inp2, inp3;
546 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
547 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
548 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
549 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
550 v16u8 const20 = (v16u8) __msa_ldi_b(20);
551 v16u8 const6 = (v16u8) __msa_ldi_b(6);
552 v16u8 const3 = (v16u8) __msa_ldi_b(3);
554 for (loop_count = (
height >> 2); loop_count--;) {
555 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
556 src += (4 * src_stride);
558 mask2, mask3, const20,
561 mask2, mask3, const20,
563 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
564 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
565 res0 = __msa_ave_u_b(inp0, res0);
566 res1 = __msa_ave_u_b(inp2, res1);
567 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
568 dst += (4 * dst_stride);
579 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
581 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
582 v8u16 const20 = (v8u16) __msa_ldi_h(20);
583 v16u8 const6 = (v16u8) __msa_ldi_b(6);
584 v16u8 const3 = (v16u8) __msa_ldi_b(3);
586 for (loop_count = (
height >> 2); loop_count--;) {
587 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
588 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
589 src += (4 * src_stride);
591 const20, const6, const3);
592 res = __msa_ave_u_b(inp0, res);
597 const20, const6, const3);
598 res = __msa_ave_u_b(inp2, res);
603 const20, const6, const3);
604 res = __msa_ave_u_b(inp4, res);
609 const20, const6, const3);
610 res = __msa_ave_u_b(inp6, res);
623 v16u8 inp0, inp1, inp2, inp3;
625 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
626 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
627 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
628 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
629 v16u8 const20 = (v16u8) __msa_ldi_b(20);
630 v16u8 const6 = (v16u8) __msa_ldi_b(6);
631 v16u8 const3 = (v16u8) __msa_ldi_b(3);
633 for (loop_count = (
height >> 2); loop_count--;) {
634 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
635 src += (4 * src_stride);
637 mask2, mask3, const20,
640 mask2, mask3, const20,
642 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
643 dst += (4 * dst_stride);
654 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
656 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
657 v16u8 const6 = (v16u8) __msa_ldi_b(6);
658 v16u8 const3 = (v16u8) __msa_ldi_b(3);
659 v8u16 const20 = (v8u16) __msa_ldi_h(20);
661 for (loop_count = (
height >> 2); loop_count--;) {
662 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
663 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
664 src += (4 * src_stride);
666 const20, const6, const3);
671 const20, const6, const3);
676 const20, const6, const3);
681 const20, const6, const3);
694 v16u8 inp0, inp1, inp2, inp3;
696 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
697 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
698 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
699 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
700 v16u8 const20 = (v16u8) __msa_ldi_b(20);
701 v16u8 const6 = (v16u8) __msa_ldi_b(6);
702 v16u8 const3 = (v16u8) __msa_ldi_b(3);
704 for (loop_count = (
height >> 2); loop_count--;) {
705 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
706 src += (4 * src_stride);
708 mask2, mask3, const20,
711 mask2, mask3, const20,
713 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
714 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
715 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
716 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
717 res0 = __msa_ave_u_b(inp0, res0);
718 res1 = __msa_ave_u_b(inp2, res1);
719 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
720 dst += (4 * dst_stride);
731 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
733 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
734 v16u8 const6 = (v16u8) __msa_ldi_b(6);
735 v16u8 const3 = (v16u8) __msa_ldi_b(3);
736 v8u16 const20 = (v8u16) __msa_ldi_h(20);
738 for (loop_count = (
height >> 2); loop_count--;) {
739 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
740 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
741 src += (4 * src_stride);
743 const20, const6, const3);
744 res = __msa_ave_u_b(res, inp1);
749 const20, const6, const3);
750 res = __msa_ave_u_b(res, inp3);
755 const20, const6, const3);
756 res = __msa_ave_u_b(res, inp5);
761 const20, const6, const3);
762 res = __msa_ave_u_b(res, inp7);
775 v16u8 inp0, inp1, inp2, inp3;
776 v16u8 dst0, dst1, dst2, dst3;
778 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
779 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
780 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
781 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
782 v16u8 const20 = (v16u8) __msa_ldi_b(20);
783 v16u8 const6 = (v16u8) __msa_ldi_b(6);
784 v16u8 const3 = (v16u8) __msa_ldi_b(3);
786 for (loop_count = (
height >> 2); loop_count--;) {
787 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
788 src += (4 * src_stride);
790 mask0, mask1, mask2, mask3,
791 const20, const6, const3);
793 mask0, mask1, mask2, mask3,
794 const20, const6, const3);
795 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
796 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
797 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
798 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
799 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
802 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
803 dst += (4 * dst_stride);
814 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
817 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
818 v16u8 const6 = (v16u8) __msa_ldi_b(6);
819 v16u8 const3 = (v16u8) __msa_ldi_b(3);
820 v8u16 const20 = (v8u16) __msa_ldi_h(20);
822 for (loop_count = (
height >> 2); loop_count--;) {
823 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
824 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
825 src += (4 * src_stride);
827 const20, const6, const3);
829 const20, const6, const3);
830 LD_UB2(dst, dst_stride, dst0, dst1);
833 ST_UB2(res0, res1, dst, dst_stride);
834 dst += (2 * dst_stride);
837 const20, const6, const3);
839 const20, const6, const3);
840 LD_UB2(dst, dst_stride, dst0, dst1);
843 ST_UB2(res0, res1, dst, dst_stride);
844 dst += (2 * dst_stride);
855 v16u8 inp0, inp1, inp2, inp3;
856 v16u8 dst0, dst1, dst2, dst3;
858 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
859 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
860 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
861 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
862 v16u8 const20 = (v16u8) __msa_ldi_b(20);
863 v16u8 const6 = (v16u8) __msa_ldi_b(6);
864 v16u8 const3 = (v16u8) __msa_ldi_b(3);
866 for (loop_count = (
height >> 2); loop_count--;) {
867 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
868 src += (4 * src_stride);
870 mask0, mask1, mask2, mask3,
871 const20, const6, const3);
873 mask0, mask1, mask2, mask3,
874 const20, const6, const3);
875 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
876 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
877 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
879 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
880 dst += (4 * dst_stride);
891 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
894 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
895 v16u8 const6 = (v16u8) __msa_ldi_b(6);
896 v16u8 const3 = (v16u8) __msa_ldi_b(3);
897 v8u16 const20 = (v8u16) __msa_ldi_h(20);
899 for (loop_count = (
height >> 2); loop_count--;) {
900 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
901 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
902 src += (4 * src_stride);
904 const20, const6, const3);
906 const20, const6, const3);
907 LD_UB2(dst, dst_stride, dst0, dst1);
909 ST_UB2(res0, res1, dst, dst_stride);
910 dst += (2 * dst_stride);
913 const20, const6, const3);
915 const20, const6, const3);
916 LD_UB2(dst, dst_stride, dst0, dst1);
918 ST_UB2(res0, res1, dst, dst_stride);
919 dst += (2 * dst_stride);
930 v16u8 inp0, inp1, inp2, inp3;
931 v16u8 dst0, dst1, dst2, dst3;
933 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
934 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
935 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
936 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
937 v16u8 const20 = (v16u8) __msa_ldi_b(20);
938 v16u8 const6 = (v16u8) __msa_ldi_b(6);
939 v16u8 const3 = (v16u8) __msa_ldi_b(3);
941 for (loop_count = (
height >> 2); loop_count--;) {
942 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
943 src += (4 * src_stride);
945 mask0, mask1, mask2, mask3,
946 const20, const6, const3);
948 mask0, mask1, mask2, mask3,
949 const20, const6, const3);
950 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
951 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
952 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
953 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
954 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
955 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
956 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
959 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
960 dst += (4 * dst_stride);
971 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
972 v16u8 res0, res1, dst0, dst1;
973 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
974 v16u8 const6 = (v16u8) __msa_ldi_b(6);
975 v16u8 const3 = (v16u8) __msa_ldi_b(3);
976 v8u16 const20 = (v8u16) __msa_ldi_h(20);
978 for (loop_count = (
height >> 2); loop_count--;) {
979 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
980 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
981 src += (4 * src_stride);
983 const20, const6, const3);
985 const20, const6, const3);
986 LD_UB2(dst, dst_stride, dst0, dst1);
989 ST_UB2(res0, res1, dst, dst_stride);
990 dst += (2 * dst_stride);
992 const20, const6, const3);
994 const20, const6, const3);
995 LD_UB2(dst, dst_stride, dst0, dst1);
998 ST_UB2(res0, res1, dst, dst_stride);
999 dst += (2 * dst_stride);
1009 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1010 v16u8 tmp0, tmp1, res0, res1;
1011 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1012 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1013 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1015 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
1016 src += (4 * src_stride);
1018 src += (2 * src_stride);
1020 inp1, inp2, inp3, inp4,
1021 inp1, inp0, inp0, inp1,
1022 inp2, inp3, inp4, inp5,
1023 const20, const6, const3);
1025 src += (2 * src_stride);
1027 inp3, inp4, inp5, inp6,
1028 inp3, inp2, inp1, inp0,
1029 inp4, inp5, inp6, inp7,
1030 const20, const6, const3);
1031 tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1032 tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
1034 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1038 inp5, inp6, inp7, inp8,
1039 inp5, inp4, inp3, inp2,
1040 inp6, inp7, inp8, inp8,
1041 const20, const6, const3);
1043 inp7, inp8, inp8, inp7,
1044 inp7, inp6, inp5, inp4,
1045 inp8, inp8, inp7, inp6,
1046 const20, const6, const3);
1047 tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1048 tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1050 ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1058 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1059 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1061 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1062 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1063 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1065 LD_UB5(
src, src_stride, inp0, inp1, inp2, inp3, inp4);
1066 src += (5 * src_stride);
1068 inp1, inp2, inp3, inp4,
1069 const20, const6, const3);
1070 res0 = __msa_aver_u_b(res0, inp0);
1077 inp2, inp3, inp4, inp5,
1078 const20, const6, const3);
1079 res0 = __msa_aver_u_b(res0, inp1);
1086 inp3, inp4, inp5, inp6,
1087 const20, const6, const3);
1088 res0 = __msa_aver_u_b(res0, inp2);
1095 inp4, inp5, inp6, inp7,
1096 const20, const6, const3);
1097 res0 = __msa_aver_u_b(res0, inp3);
1102 src += (2 * src_stride);
1104 inp5, inp6, inp7, inp8,
1105 const20, const6, const3);
1106 res0 = __msa_aver_u_b(res0, inp4);
1111 inp6, inp7, inp8, inp9,
1112 const20, const6, const3);
1113 res0 = __msa_aver_u_b(res0, inp5);
1118 src += (2 * src_stride);
1120 inp7, inp8, inp9, inp10,
1121 const20, const6, const3);
1122 res0 = __msa_aver_u_b(res0, inp6);
1127 inp8, inp9, inp10, inp11,
1128 const20, const6, const3);
1129 res0 = __msa_aver_u_b(res0, inp7);
1134 src += (2 * src_stride);
1136 inp9, inp10, inp11, inp12,
1137 const20, const6, const3);
1138 res0 = __msa_aver_u_b(res0, inp8);
1143 inp10, inp11, inp12, inp13,
1144 const20, const6, const3);
1145 res0 = __msa_aver_u_b(res0, inp9);
1150 src += (2 * src_stride);
1152 inp11, inp12, inp13, inp14,
1153 const20, const6, const3);
1154 res0 = __msa_aver_u_b(res0, inp10);
1159 inp12, inp13, inp14, inp15,
1160 const20, const6, const3);
1161 res0 = __msa_aver_u_b(res0, inp11);
1167 inp13, inp14, inp15, inp16,
1168 const20, const6, const3);
1169 res0 = __msa_aver_u_b(res0, inp12);
1174 inp14, inp15, inp16, inp16,
1175 const20, const6, const3);
1176 res0 = __msa_aver_u_b(res0, inp13);
1181 inp15, inp16, inp16, inp15,
1182 const20, const6, const3);
1183 res0 = __msa_aver_u_b(res0, inp14);
1188 inp16, inp16, inp15, inp14,
1189 const20, const6, const3);
1190 res0 = __msa_aver_u_b(res0, inp15);
1199 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1201 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1202 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1203 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1205 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
1206 src += (4 * src_stride);
1208 src += (2 * src_stride);
1210 inp1, inp2, inp3, inp4,
1211 inp1, inp0, inp0, inp1,
1212 inp2, inp3, inp4, inp5,
1213 const20, const6, const3);
1215 src += (2 * src_stride);
1217 inp3, inp4, inp5, inp6,
1218 inp3, inp2, inp1, inp0,
1219 inp4, inp5, inp6, inp7,
1220 const20, const6, const3);
1221 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1225 inp5, inp6, inp7, inp8,
1226 inp5, inp4, inp3, inp2,
1227 inp6, inp7, inp8, inp8,
1228 const20, const6, const3);
1230 inp7, inp8, inp8, inp7,
1231 inp7, inp6, inp5, inp4,
1232 inp8, inp8, inp7, inp6,
1233 const20, const6, const3);
1234 ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1242 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1243 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1245 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1246 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1247 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1249 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
1250 src += (4 * src_stride);
1254 inp1, inp2, inp3, inp4,
1255 const20, const6, const3);
1262 inp2, inp3, inp4, inp5,
1263 const20, const6, const3);
1270 inp3, inp4, inp5, inp6,
1271 const20, const6, const3);
1278 inp4, inp5, inp6, inp7,
1279 const20, const6, const3);
1286 inp5, inp6, inp7, inp8,
1287 const20, const6, const3);
1294 inp6, inp7, inp8, inp9,
1295 const20, const6, const3);
1302 inp7, inp8, inp9, inp10,
1303 const20, const6, const3);
1310 inp8, inp9, inp10, inp11,
1311 const20, const6, const3);
1318 inp9, inp10, inp11, inp12,
1319 const20, const6, const3);
1326 inp10, inp11, inp12, inp13,
1327 const20, const6, const3);
1334 inp11, inp12, inp13, inp14,
1335 const20, const6, const3);
1342 inp12, inp13, inp14, inp15,
1343 const20, const6, const3);
1349 inp13, inp14, inp15, inp16,
1350 const20, const6, const3);
1355 inp14, inp15, inp16, inp16,
1356 const20, const6, const3);
1361 inp15, inp16, inp16, inp15,
1362 const20, const6, const3);
1367 inp16, inp16, inp15, inp14,
1368 const20, const6, const3);
1378 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1379 v16u8 tmp0, tmp1, res0, res1;
1380 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1381 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1382 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1384 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
1385 src += (4 * src_stride);
1387 src += (2 * src_stride);
1389 inp1, inp2, inp3, inp4,
1390 inp1, inp0, inp0, inp1,
1391 inp2, inp3, inp4, inp5,
1392 const20, const6, const3);
1395 src += (2 * src_stride);
1397 inp3, inp4, inp5, inp6,
1398 inp3, inp2, inp1, inp0,
1399 inp4, inp5, inp6, inp7,
1400 const20, const6, const3);
1401 tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1402 tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
1404 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1408 inp5, inp6, inp7, inp8,
1409 inp5, inp4, inp3, inp2,
1410 inp6, inp7, inp8, inp8,
1411 const20, const6, const3);
1413 inp7, inp8, inp8, inp7,
1414 inp7, inp6, inp5, inp4,
1415 inp8, inp8, inp7, inp6,
1416 const20, const6, const3);
1417 tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
1418 tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
1420 ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
/* Seventeen input vectors inp0..inp16 are kept live in this block. */
1428 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1429 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
/* The constants 20, 6 and 3 replicated into every byte lane; they are the
 * last three arguments of every filter invocation in this block. */
1431 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1432 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1433 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* inp0..inp3 come from src with a pitch of src_stride (presumably one
 * vector per picture row - confirm against the LD_UB4 macro). */
1435 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
1436 src += (4 * src_stride);
1440 inp1, inp2, inp3, inp4,
1441 const20, const6, const3);
/* aver_u_b is the per-byte unsigned average with rounding,
 * (a + b + 1) >> 1.  Each step below averages res0 with the next input
 * index, running from inp1 up to inp16. */
1442 res0 = __msa_aver_u_b(res0, inp1);
1449 inp2, inp3, inp4, inp5,
1450 const20, const6, const3);
1451 res0 = __msa_aver_u_b(res0, inp2);
1458 inp3, inp4, inp5, inp6,
1459 const20, const6, const3);
1460 res0 = __msa_aver_u_b(res0, inp3);
1467 inp4, inp5, inp6, inp7,
1468 const20, const6, const3);
1469 res0 = __msa_aver_u_b(res0, inp4);
1476 inp5, inp6, inp7, inp8,
1477 const20, const6, const3);
1478 res0 = __msa_aver_u_b(res0, inp5);
1485 inp6, inp7, inp8, inp9,
1486 const20, const6, const3);
1487 res0 = __msa_aver_u_b(res0, inp6);
1494 inp7, inp8, inp9, inp10,
1495 const20, const6, const3);
1496 res0 = __msa_aver_u_b(res0, inp7);
1503 inp8, inp9, inp10, inp11,
1504 const20, const6, const3);
1505 res0 = __msa_aver_u_b(res0, inp8);
1512 inp9, inp10, inp11, inp12,
1513 const20, const6, const3);
1514 res0 = __msa_aver_u_b(res0, inp9);
1521 inp10, inp11, inp12, inp13,
1522 const20, const6, const3);
1523 res0 = __msa_aver_u_b(res0, inp10);
1530 inp11, inp12, inp13, inp14,
1531 const20, const6, const3);
1532 res0 = __msa_aver_u_b(res0, inp11);
1539 inp12, inp13, inp14, inp15,
1540 const20, const6, const3);
1541 res0 = __msa_aver_u_b(res0, inp12);
1547 inp13, inp14, inp15, inp16,
1548 const20, const6, const3);
1549 res0 = __msa_aver_u_b(res0, inp13);
/* From here on the argument lists repeat inp16 and then step back
 * (inp15, inp14) instead of naming inputs past inp16; this looks like
 * bottom-edge mirroring - TODO confirm against the filter macro. */
1554 inp14, inp15, inp16, inp16,
1555 const20, const6, const3);
1556 res0 = __msa_aver_u_b(res0, inp14);
1561 inp15, inp16, inp16, inp15,
1562 const20, const6, const3);
1563 res0 = __msa_aver_u_b(res0, inp15);
1568 inp16, inp16, inp15, inp14,
1569 const20, const6, const3);
1570 res0 = __msa_aver_u_b(res0, inp16);
/* Nine input vectors (inp0..inp8), two temporaries that hold packed
 * input pairs, and the two result vectors handed to ST_D4 below. */
1579 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1580 v16u8 tmp0, tmp1, res0, res1;
/* The constants 20, 6 and 3 replicated into every byte lane; they are the
 * last three arguments of every filter invocation in this block. */
1581 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1582 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1583 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* inp0..inp3 come from src with a pitch of src_stride (presumably one
 * vector per picture row - confirm against the LD_UB4 macro). */
1585 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
1586 src += (4 * src_stride);
1588 src += (2 * src_stride);
1590 inp1, inp2, inp3, inp4,
1591 inp1, inp0, inp0, inp1,
1592 inp2, inp3, inp4, inp5,
1593 const20, const6, const3);
1595 src += (2 * src_stride);
1597 inp3, inp4, inp5, inp6,
1598 inp3, inp2, inp1, inp0,
1599 inp4, inp5, inp6, inp7,
1600 const20, const6, const3);
/* insve_d copies doubleword 0 of the last operand into doubleword 1 of the
 * first, so tmp0 = {inp0.lo, inp1.lo} and tmp1 = {inp2.lo, inp3.lo}. */
1601 tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1602 tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
/* ave_u_b is the truncating per-byte unsigned average, (a + b) >> 1,
 * i.e. no +1 rounding term. */
1603 res0 = __msa_ave_u_b(res0, tmp0);
1604 res1 = __msa_ave_u_b(res1, tmp1);
/* Doublewords 0,1 of res0 and 0,1 of res1 are written out - presumably one
 * 8-byte row each, dst_stride apart (confirm against ST_D4). */
1605 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/* The argument lists below repeat inp8 and then step back (inp7, inp6)
 * instead of naming inputs past inp8; this looks like bottom-edge
 * mirroring - TODO confirm against the filter macro. */
1609 inp5, inp6, inp7, inp8,
1610 inp5, inp4, inp3, inp2,
1611 inp6, inp7, inp8, inp8,
1612 const20, const6, const3);
1614 inp7, inp8, inp8, inp7,
1615 inp7, inp6, inp5, inp4,
1616 inp8, inp8, inp7, inp6,
1617 const20, const6, const3);
/* Second half: average against {inp4.lo, inp5.lo} and {inp6.lo, inp7.lo}. */
1618 tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1619 tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1620 res0 = __msa_ave_u_b(res0, tmp0);
1621 res1 = __msa_ave_u_b(res1, tmp1);
/* Second group of four stores starts four rows further down in dst. */
1622 ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
/* Seventeen input vectors inp0..inp16 are kept live in this block. */
1630 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1631 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
/* The constants 20, 6 and 3 replicated into every byte lane; they are the
 * last three arguments of every filter invocation in this block. */
1633 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1634 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1635 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* inp0..inp4 come from src with a pitch of src_stride (presumably one
 * vector per picture row - confirm against the LD_UB5 macro). */
1637 LD_UB5(
src, src_stride, inp0, inp1, inp2, inp3, inp4);
1638 src += (5 * src_stride);
1640 inp1, inp2, inp3, inp4,
1641 const20, const6, const3);
/* ave_u_b is the truncating per-byte unsigned average, (a + b) >> 1.
 * Each step below averages res0 with the next input index, running from
 * inp0 up to inp15. */
1642 res0 = __msa_ave_u_b(res0, inp0);
1649 inp2, inp3, inp4, inp5,
1650 const20, const6, const3);
1651 res0 = __msa_ave_u_b(res0, inp1);
1658 inp3, inp4, inp5, inp6,
1659 const20, const6, const3);
1660 res0 = __msa_ave_u_b(res0, inp2);
1667 inp4, inp5, inp6, inp7,
1668 const20, const6, const3);
1669 res0 = __msa_ave_u_b(res0, inp3);
1676 inp5, inp6, inp7, inp8,
1677 const20, const6, const3);
1678 res0 = __msa_ave_u_b(res0, inp4);
1685 inp6, inp7, inp8, inp9,
1686 const20, const6, const3);
1687 res0 = __msa_ave_u_b(res0, inp5);
1694 inp7, inp8, inp9, inp10,
1695 const20, const6, const3);
1696 res0 = __msa_ave_u_b(res0, inp6);
1703 inp8, inp9, inp10, inp11,
1704 const20, const6, const3);
1705 res0 = __msa_ave_u_b(res0, inp7);
1712 inp9, inp10, inp11, inp12,
1713 const20, const6, const3);
1714 res0 = __msa_ave_u_b(res0, inp8);
1721 inp10, inp11, inp12, inp13,
1722 const20, const6, const3);
1723 res0 = __msa_ave_u_b(res0, inp9);
1730 inp11, inp12, inp13, inp14,
1731 const20, const6, const3);
1732 res0 = __msa_ave_u_b(res0, inp10);
1739 inp12, inp13, inp14, inp15,
1740 const20, const6, const3);
1741 res0 = __msa_ave_u_b(res0, inp11);
1747 inp13, inp14, inp15, inp16,
1748 const20, const6, const3);
1749 res0 = __msa_ave_u_b(res0, inp12);
/* From here on the argument lists repeat inp16 and then step back
 * (inp15, inp14) instead of naming inputs past inp16; this looks like
 * bottom-edge mirroring - TODO confirm against the filter macro. */
1754 inp14, inp15, inp16, inp16,
1755 const20, const6, const3);
1756 res0 = __msa_ave_u_b(res0, inp13);
1761 inp15, inp16, inp16, inp15,
1762 const20, const6, const3);
1763 res0 = __msa_ave_u_b(res0, inp14);
1768 inp16, inp16, inp15, inp14,
1769 const20, const6, const3);
1770 res0 = __msa_ave_u_b(res0, inp15);
/* Nine input vectors inp0..inp8 feed the filter invocations below. */
1780 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
/* The constants 20, 6 and 3 replicated into every byte lane; they are the
 * last three arguments of every filter invocation in this block. */
1782 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1783 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1784 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* inp0..inp3 come from src with a pitch of src_stride (presumably one
 * vector per picture row - confirm against the LD_UB4 macro). */
1786 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
1787 src += (4 * src_stride);
1789 src += (2 * src_stride);
1791 inp1, inp2, inp3, inp4,
1792 inp1, inp0, inp0, inp1,
1793 inp2, inp3, inp4, inp5,
1794 const20, const6, const3);
1796 src += (2 * src_stride);
1798 inp3, inp4, inp5, inp6,
1799 inp3, inp2, inp1, inp0,
1800 inp4, inp5, inp6, inp7,
1801 const20, const6, const3);
/* The results are stored directly; no averaging step appears between the
 * filter arguments and this store.  Doublewords 0,1 of res0 and res1 are
 * written - presumably one 8-byte row each (confirm against ST_D4). */
1802 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/* The argument lists below repeat inp8 and then step back (inp7, inp6)
 * instead of naming inputs past inp8; this looks like bottom-edge
 * mirroring - TODO confirm against the filter macro. */
1806 inp5, inp6, inp7, inp8,
1807 inp5, inp4, inp3, inp2,
1808 inp6, inp7, inp8, inp8,
1809 const20, const6, const3);
1811 inp7, inp8, inp8, inp7,
1812 inp7, inp6, inp5, inp4,
1813 inp8, inp8, inp7, inp6,
1814 const20, const6, const3);
/* Second group of four stores starts four rows further down in dst. */
1815 ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
/* Seventeen input vectors inp0..inp16 are kept live in this block. */
1823 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1824 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
/* The constants 20, 6 and 3 replicated into every byte lane; they are the
 * last three arguments of every filter invocation in this block. */
1826 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1827 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1828 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* inp0..inp4 come from src with a pitch of src_stride (presumably one
 * vector per picture row - confirm against the LD_UB5 macro). */
1830 LD_UB5(
src, src_stride, inp0, inp1, inp2, inp3, inp4);
1831 src += (5 * src_stride);
/* Each argument group below slides the four-input window down by one
 * index (inp1..inp4, inp2..inp5, ...). */
1833 inp1, inp2, inp3, inp4,
1834 const20, const6, const3);
1841 inp2, inp3, inp4, inp5,
1842 const20, const6, const3);
1849 inp3, inp4, inp5, inp6,
1850 const20, const6, const3);
1857 inp4, inp5, inp6, inp7,
1858 const20, const6, const3);
1865 inp5, inp6, inp7, inp8,
1866 const20, const6, const3);
1873 inp6, inp7, inp8, inp9,
1874 const20, const6, const3);
1881 inp7, inp8, inp9, inp10,
1882 const20, const6, const3);
1889 inp8, inp9, inp10, inp11,
1890 const20, const6, const3);
1897 inp9, inp10, inp11, inp12,
1898 const20, const6, const3);
1905 inp10, inp11, inp12, inp13,
1906 const20, const6, const3);
1913 inp11, inp12, inp13, inp14,
1914 const20, const6, const3);
1921 inp12, inp13, inp14, inp15,
1922 const20, const6, const3);
1928 inp13, inp14, inp15, inp16,
1929 const20, const6, const3);
/* From here on the argument lists repeat inp16 and then step back
 * (inp15, inp14) instead of naming inputs past inp16; this looks like
 * bottom-edge mirroring - TODO confirm against the filter macro. */
1934 inp14, inp15, inp16, inp16,
1935 const20, const6, const3);
1940 inp15, inp16, inp16, inp15,
1941 const20, const6, const3);
1946 inp16, inp16, inp15, inp14,
1947 const20, const6, const3);
/* Nine input vectors (inp0..inp8), two temporaries that hold packed
 * input pairs, and the two result vectors handed to ST_D4 below. */
1956 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1957 v16u8 tmp0, tmp1, res0, res1;
/* The constants 20, 6 and 3 replicated into every byte lane; they are the
 * last three arguments of every filter invocation in this block. */
1958 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1959 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1960 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* inp0..inp3 come from src with a pitch of src_stride (presumably one
 * vector per picture row - confirm against the LD_UB4 macro). */
1962 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
1963 src += (4 * src_stride);
1965 src += (2 * src_stride);
1967 inp1, inp2, inp3, inp4,
1968 inp1, inp0, inp0, inp1,
1969 inp2, inp3, inp4, inp5,
1970 const20, const6, const3);
1972 src += (2 * src_stride);
1974 inp3, inp4, inp5, inp6,
1975 inp3, inp2, inp1, inp0,
1976 inp4, inp5, inp6, inp7,
1977 const20, const6, const3);
/* insve_d copies doubleword 0 of the last operand into doubleword 1 of the
 * first, so tmp0 = {inp1.lo, inp2.lo} and tmp1 = {inp3.lo, inp4.lo}. */
1978 tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1979 tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
/* ave_u_b is the truncating per-byte unsigned average, (a + b) >> 1,
 * i.e. no +1 rounding term. */
1980 res0 = __msa_ave_u_b(res0, tmp0);
1981 res1 = __msa_ave_u_b(res1, tmp1);
/* Doublewords 0,1 of res0 and 0,1 of res1 are written out - presumably one
 * 8-byte row each, dst_stride apart (confirm against ST_D4). */
1982 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/* The argument lists below repeat inp8 and then step back (inp7, inp6)
 * instead of naming inputs past inp8; this looks like bottom-edge
 * mirroring - TODO confirm against the filter macro. */
1986 inp5, inp6, inp7, inp8,
1987 inp5, inp4, inp3, inp2,
1988 inp6, inp7, inp8, inp8,
1989 const20, const6, const3);
1991 inp7, inp8, inp8, inp7,
1992 inp7, inp6, inp5, inp4,
1993 inp8, inp8, inp7, inp6,
1994 const20, const6, const3);
/* Second half: average against {inp5.lo, inp6.lo} and {inp7.lo, inp8.lo}. */
1995 tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
1996 tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
1997 res0 = __msa_ave_u_b(res0, tmp0);
1998 res1 = __msa_ave_u_b(res1, tmp1);
/* Second group of four stores starts four rows further down in dst. */
1999 ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
/* Seventeen input vectors inp0..inp16 are kept live in this block. */
2007 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2008 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
/* The constants 20, 6 and 3 replicated into every byte lane; they are the
 * last three arguments of every filter invocation in this block. */
2010 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2011 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2012 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* inp0..inp4 come from src with a pitch of src_stride (presumably one
 * vector per picture row - confirm against the LD_UB5 macro). */
2014 LD_UB5(
src, src_stride, inp0, inp1, inp2, inp3, inp4);
2015 src += (5 * src_stride);
2017 inp1, inp2, inp3, inp4,
2018 const20, const6, const3);
/* ave_u_b is the truncating per-byte unsigned average, (a + b) >> 1.
 * Each step below averages res0 with the next input index, running from
 * inp1 up to inp16. */
2019 res0 = __msa_ave_u_b(res0, inp1);
2026 inp2, inp3, inp4, inp5,
2027 const20, const6, const3);
2028 res0 = __msa_ave_u_b(res0, inp2);
2035 inp3, inp4, inp5, inp6,
2036 const20, const6, const3);
2037 res0 = __msa_ave_u_b(res0, inp3);
2044 inp4, inp5, inp6, inp7,
2045 const20, const6, const3);
2046 res0 = __msa_ave_u_b(res0, inp4);
2053 inp5, inp6, inp7, inp8,
2054 const20, const6, const3);
2055 res0 = __msa_ave_u_b(res0, inp5);
2062 inp6, inp7, inp8, inp9,
2063 const20, const6, const3);
2064 res0 = __msa_ave_u_b(res0, inp6);
2071 inp7, inp8, inp9, inp10,
2072 const20, const6, const3);
2073 res0 = __msa_ave_u_b(res0, inp7);
2080 inp8, inp9, inp10, inp11,
2081 const20, const6, const3);
2082 res0 = __msa_ave_u_b(res0, inp8);
2089 inp9, inp10, inp11, inp12,
2090 const20, const6, const3);
2091 res0 = __msa_ave_u_b(res0, inp9);
2098 inp10, inp11, inp12, inp13,
2099 const20, const6, const3);
2100 res0 = __msa_ave_u_b(res0, inp10);
2107 inp11, inp12, inp13, inp14,
2108 const20, const6, const3);
2109 res0 = __msa_ave_u_b(res0, inp11);
2116 inp12, inp13, inp14, inp15,
2117 const20, const6, const3);
2118 res0 = __msa_ave_u_b(res0, inp12);
2124 inp13, inp14, inp15, inp16,
2125 const20, const6, const3);
2126 res0 = __msa_ave_u_b(res0, inp13);
/* From here on the argument lists repeat inp16 and then step back
 * (inp15, inp14) instead of naming inputs past inp16; this looks like
 * bottom-edge mirroring - TODO confirm against the filter macro. */
2131 inp14, inp15, inp16, inp16,
2132 const20, const6, const3);
2133 res0 = __msa_ave_u_b(res0, inp14);
2138 inp15, inp16, inp16, inp15,
2139 const20, const6, const3);
2140 res0 = __msa_ave_u_b(res0, inp15);
2145 inp16, inp16, inp15, inp14,
2146 const20, const6, const3);
2147 res0 = __msa_ave_u_b(res0, inp16);
/* Nine input vectors (inp0..inp8), four vectors read back from dst,
 * two packing temporaries and the two result vectors stored by ST_D4. */
2156 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2157 v16u8 dst0, dst1, dst2, dst3;
2158 v16u8 tmp0, tmp1, res0, res1;
/* The constants 20, 6 and 3 replicated into every byte lane; they are the
 * last three arguments of every filter invocation in this block. */
2159 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2160 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2161 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* inp0..inp3 come from src with a pitch of src_stride (presumably one
 * vector per picture row - confirm against the LD_UB4 macro). */
2163 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
2164 src += (4 * src_stride);
2166 src += (2 * src_stride);
2168 inp1, inp2, inp3, inp4,
2169 inp1, inp0, inp0, inp1,
2170 inp2, inp3, inp4, inp5,
2171 const20, const6, const3);
2174 src += (2 * src_stride);
2176 inp3, inp4, inp5, inp6,
2177 inp3, inp2, inp1, inp0,
2178 inp4, inp5, inp6, inp7,
2179 const20, const6, const3);
/* Read back what is currently in dst (four vectors, dst_stride apart). */
2181 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* insve_d copies doubleword 0 of the last operand into doubleword 1 of the
 * first: tmp0 = {inp0.lo, inp1.lo}, tmp1 = {inp2.lo, inp3.lo}, and the
 * destination data is packed the same way into dst0 and dst2. */
2182 tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
2183 tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
2184 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2185 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
/* Doublewords 0,1 of res0 and 0,1 of res1 are written out - presumably one
 * 8-byte row each, dst_stride apart (confirm against ST_D4). */
2188 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2189 dst += (4 * dst_stride);
/* The argument lists below repeat inp8 and then step back (inp7, inp6)
 * instead of naming inputs past inp8; this looks like bottom-edge
 * mirroring - TODO confirm against the filter macro. */
2193 inp5, inp6, inp7, inp8,
2194 inp5, inp4, inp3, inp2,
2195 inp6, inp7, inp8, inp8,
2196 const20, const6, const3);
2198 inp7, inp8, inp8, inp7,
2199 inp7, inp6, inp5, inp4,
2200 inp8, inp8, inp7, inp6,
2201 const20, const6, const3);
/* Second half: dst has already been advanced by four rows above. */
2203 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2204 tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
2205 tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
2206 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2207 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2210 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/* Seventeen input vectors inp0..inp16, two result vectors and two vectors
 * read back from dst; output is produced two vectors at a time. */
2218 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2219 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2220 v16u8 res0, res1, dst0, dst1;
/* The constants 20, 6 and 3 replicated into every byte lane; they are the
 * last three arguments of every filter invocation in this block. */
2221 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2222 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2223 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* inp0..inp4 come from src with a pitch of src_stride (presumably one
 * vector per picture row - confirm against the LD_UB5 macro). */
2225 LD_UB5(
src, src_stride, inp0, inp1, inp2, inp3, inp4);
2226 src += (5 * src_stride);
2228 inp1, inp2, inp3, inp4,
2229 const20, const6, const3);
2234 inp2, inp3, inp4, inp5,
2235 const20, const6, const3);
/* Each pair follows the same rhythm: read back two vectors from dst,
 * store res0/res1 over them, then advance dst by two rows. */
2237 LD_UB2(dst, dst_stride, dst0, dst1);
2240 ST_UB2(res0, res1, dst, dst_stride);
2241 dst += (2 * dst_stride);
2246 inp3, inp4, inp5, inp6,
2247 const20, const6, const3);
2252 inp4, inp5, inp6, inp7,
2253 const20, const6, const3);
2255 LD_UB2(dst, dst_stride, dst0, dst1);
2258 ST_UB2(res0, res1, dst, dst_stride);
2259 dst += (2 * dst_stride);
2262 src += (2 * src_stride);
2264 inp5, inp6, inp7, inp8,
2265 const20, const6, const3);
2267 inp6, inp7, inp8, inp9,
2268 const20, const6, const3);
2270 LD_UB2(dst, dst_stride, dst0, dst1);
2273 ST_UB2(res0, res1, dst, dst_stride);
2274 dst += (2 * dst_stride);
2277 src += (2 * src_stride);
2279 inp7, inp8, inp9, inp10,
2280 const20, const6, const3);
2282 inp8, inp9, inp10, inp11,
2283 const20, const6, const3);
2285 LD_UB2(dst, dst_stride, dst0, dst1);
2288 ST_UB2(res0, res1, dst, dst_stride);
2289 dst += (2 * dst_stride);
2292 src += (2 * src_stride);
2294 inp9, inp10, inp11, inp12,
2295 const20, const6, const3);
2297 inp10, inp11, inp12, inp13,
2298 const20, const6, const3);
2299 LD_UB2(dst, dst_stride, dst0, dst1);
2302 ST_UB2(res0, res1, dst, dst_stride);
2303 dst += (2 * dst_stride);
2306 src += (2 * src_stride);
2308 inp11, inp12, inp13, inp14,
2309 const20, const6, const3);
2311 inp12, inp13, inp14, inp15,
2312 const20, const6, const3);
2314 LD_UB2(dst, dst_stride, dst0, dst1);
/* Presumably res0 = aver(res0, inp10) and res1 = aver(res1, inp11), i.e.
 * each result is paired with the input of the same index as the row being
 * produced - confirm against the AVER_UB2_UB macro. */
2315 AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1);
2317 ST_UB2(res0, res1, dst, dst_stride);
2318 dst += (2 * dst_stride);
2322 inp13, inp14, inp15, inp16,
2323 const20, const6, const3);
/* From here on the argument lists repeat inp16 and then step back
 * (inp15, inp14) instead of naming inputs past inp16; this looks like
 * bottom-edge mirroring - TODO confirm against the filter macro. */
2325 inp14, inp15, inp16, inp16,
2326 const20, const6, const3);
2327 LD_UB2(dst, dst_stride, dst0, dst1);
2328 AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1);
2330 ST_UB2(res0, res1, dst, dst_stride);
2331 dst += (2 * dst_stride);
2334 inp15, inp16, inp16, inp15,
2335 const20, const6, const3);
2337 inp16, inp16, inp15, inp14,
2338 const20, const6, const3);
2339 LD_UB2(dst, dst_stride, dst0, dst1);
2340 AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1);
2342 ST_UB2(res0, res1, dst, dst_stride);
/* Nine input vectors inp0..inp8 and four vectors read back from dst. */
2350 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2351 v16u8 dst0, dst1, dst2, dst3;
/* The constants 20, 6 and 3 replicated into every byte lane; they are the
 * last three arguments of every filter invocation in this block. */
2353 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2354 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2355 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* inp0..inp3 come from src with a pitch of src_stride (presumably one
 * vector per picture row - confirm against the LD_UB4 macro). */
2357 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
2358 src += (4 * src_stride);
2360 src += (2 * src_stride);
2362 inp1, inp2, inp3, inp4,
2363 inp1, inp0, inp0, inp1,
2364 inp2, inp3, inp4, inp5,
2365 const20, const6, const3);
2367 src += (2 * src_stride);
2369 inp3, inp4, inp5, inp6,
2370 inp3, inp2, inp1, inp0,
2371 inp4, inp5, inp6, inp7,
2372 const20, const6, const3);
/* Read back what is currently in dst and pack it two vectors into one:
 * insve_d copies doubleword 0 of the last operand into doubleword 1 of the
 * first, giving dst0 = {dst0.lo, dst1.lo} and dst2 = {dst2.lo, dst3.lo}. */
2373 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2374 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2375 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
/* Doublewords 0,1 of res0 and 0,1 of res1 are written out - presumably one
 * 8-byte row each, dst_stride apart (confirm against ST_D4). */
2377 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2378 dst += (4 * dst_stride);
/* The argument lists below repeat inp8 and then step back (inp7, inp6)
 * instead of naming inputs past inp8; this looks like bottom-edge
 * mirroring - TODO confirm against the filter macro. */
2382 inp5, inp6, inp7, inp8,
2383 inp5, inp4, inp3, inp2,
2384 inp6, inp7, inp8, inp8,
2385 const20, const6, const3);
2387 inp7, inp8, inp8, inp7,
2388 inp7, inp6, inp5, inp4,
2389 inp8, inp8, inp7, inp6,
2390 const20, const6, const3);
/* Second half: dst has already been advanced by four rows above. */
2391 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2392 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2393 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2395 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/* Seventeen input vectors inp0..inp16, two result vectors and two vectors
 * read back from dst; output is produced two vectors at a time. */
2403 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2404 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2405 v16u8 res0, res1, dst0, dst1;
/* The constants 20, 6 and 3 replicated into every byte lane; they are the
 * last three arguments of every filter invocation in this block. */
2406 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2407 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2408 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* inp0..inp4 come from src with a pitch of src_stride (presumably one
 * vector per picture row - confirm against the LD_UB5 macro). */
2410 LD_UB5(
src, src_stride, inp0, inp1, inp2, inp3, inp4);
2411 src += (5 * src_stride);
2413 inp1, inp2, inp3, inp4,
2414 const20, const6, const3);
2418 inp2, inp3, inp4, inp5,
2419 const20, const6, const3);
/* Each pair follows the same rhythm: read back two vectors from dst,
 * store res0/res1 over them, then advance dst by two rows. */
2420 LD_UB2(dst, dst_stride, dst0, dst1);
2422 ST_UB2(res0, res1, dst, dst_stride);
2423 dst += (2 * dst_stride);
2428 inp3, inp4, inp5, inp6,
2429 const20, const6, const3);
2433 inp4, inp5, inp6, inp7,
2434 const20, const6, const3);
2435 LD_UB2(dst, dst_stride, dst0, dst1);
2437 ST_UB2(res0, res1, dst, dst_stride);
2438 dst += (2 * dst_stride);
2443 inp5, inp6, inp7, inp8,
2444 const20, const6, const3);
2448 inp6, inp7, inp8, inp9,
2449 const20, const6, const3);
2450 LD_UB2(dst, dst_stride, dst0, dst1);
2452 ST_UB2(res0, res1, dst, dst_stride);
2453 dst += (2 * dst_stride);
2458 inp7, inp8, inp9, inp10,
2459 const20, const6, const3);
2463 inp8, inp9, inp10, inp11,
2464 const20, const6, const3);
2465 LD_UB2(dst, dst_stride, dst0, dst1);
2467 ST_UB2(res0, res1, dst, dst_stride);
2468 dst += (2 * dst_stride);
2473 inp9, inp10, inp11, inp12,
2474 const20, const6, const3);
2478 inp10, inp11, inp12, inp13,
2479 const20, const6, const3);
2480 LD_UB2(dst, dst_stride, dst0, dst1);
2482 ST_UB2(res0, res1, dst, dst_stride);
2483 dst += (2 * dst_stride);
2488 inp11, inp12, inp13, inp14,
2489 const20, const6, const3);
2493 inp12, inp13, inp14, inp15,
2494 const20, const6, const3);
2495 LD_UB2(dst, dst_stride, dst0, dst1);
2497 ST_UB2(res0, res1, dst, dst_stride);
2498 dst += (2 * dst_stride);
2502 inp13, inp14, inp15, inp16,
2503 const20, const6, const3);
/* From here on the argument lists repeat inp16 and then step back
 * (inp15, inp14) instead of naming inputs past inp16; this looks like
 * bottom-edge mirroring - TODO confirm against the filter macro. */
2505 inp14, inp15, inp16, inp16,
2506 const20, const6, const3);
2507 LD_UB2(dst, dst_stride, dst0, dst1);
2509 ST_UB2(res0, res1, dst, dst_stride);
2510 dst += (2 * dst_stride);
2513 inp15, inp16, inp16, inp15,
2514 const20, const6, const3);
2516 inp16, inp16, inp15, inp14,
2517 const20, const6, const3);
2518 LD_UB2(dst, dst_stride, dst0, dst1);
2520 ST_UB2(res0, res1, dst, dst_stride);
/* Nine input vectors (inp0..inp8), four vectors read back from dst,
 * two packing temporaries and the two result vectors stored by ST_D4. */
2528 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2529 v16u8 dst0, dst1, dst2, dst3;
2530 v16u8 tmp0, tmp1, res0, res1;
/* The constants 20, 6 and 3 replicated into every byte lane; they are the
 * last three arguments of every filter invocation in this block. */
2531 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2532 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2533 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* inp0..inp3 come from src with a pitch of src_stride (presumably one
 * vector per picture row - confirm against the LD_UB4 macro). */
2535 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
2536 src += (4 * src_stride);
2538 src += (2 * src_stride);
2540 inp1, inp2, inp3, inp4,
2541 inp1, inp0, inp0, inp1,
2542 inp2, inp3, inp4, inp5,
2543 const20, const6, const3);
2545 src += (2 * src_stride);
2547 inp3, inp4, inp5, inp6,
2548 inp3, inp2, inp1, inp0,
2549 inp4, inp5, inp6, inp7,
2550 const20, const6, const3);
/* Read back what is currently in dst (four vectors, dst_stride apart). */
2551 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* insve_d copies doubleword 0 of the last operand into doubleword 1 of the
 * first: tmp0 = {inp1.lo, inp2.lo}, tmp1 = {inp3.lo, inp4.lo}, and the
 * destination data is packed the same way into dst0 and dst2. */
2552 tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
2553 tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
2554 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2555 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
/* Doublewords 0,1 of res0 and 0,1 of res1 are written out - presumably one
 * 8-byte row each, dst_stride apart (confirm against ST_D4). */
2558 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2559 dst += (4 * dst_stride);
/* The argument lists below repeat inp8 and then step back (inp7, inp6)
 * instead of naming inputs past inp8; this looks like bottom-edge
 * mirroring - TODO confirm against the filter macro. */
2563 inp5, inp6, inp7, inp8,
2564 inp5, inp4, inp3, inp2,
2565 inp6, inp7, inp8, inp8,
2566 const20, const6, const3);
2568 inp7, inp8, inp8, inp7,
2569 inp7, inp6, inp5, inp4,
2570 inp8, inp8, inp7, inp6,
2571 const20, const6, const3);
/* Second half: dst has already been advanced by four rows above. */
2572 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2573 tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
2574 tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
2575 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2576 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2579 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/* Seventeen input vectors inp0..inp16, two result vectors and two vectors
 * read back from dst; output is produced two vectors at a time. */
2587 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2588 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2589 v16u8 res0, res1, dst0, dst1;
/* The constants 20, 6 and 3 replicated into every byte lane; they are the
 * last three arguments of every filter invocation in this block. */
2590 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2591 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2592 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* inp0..inp4 come from src with a pitch of src_stride (presumably one
 * vector per picture row - confirm against the LD_UB5 macro). */
2594 LD_UB5(
src, src_stride, inp0, inp1, inp2, inp3, inp4);
2595 src += (5 * src_stride);
2597 inp1, inp2, inp3, inp4,
2598 const20, const6, const3);
2602 inp2, inp3, inp4, inp5,
2603 const20, const6, const3);
/* Each pair follows the same rhythm: read back two vectors from dst,
 * store res0/res1 over them, then advance dst by two rows. */
2604 LD_UB2(dst, dst_stride, dst0, dst1);
2607 ST_UB2(res0, res1, dst, dst_stride);
2608 dst += (2 * dst_stride);
2613 inp3, inp4, inp5, inp6,
2614 const20, const6, const3);
2618 inp4, inp5, inp6, inp7,
2619 const20, const6, const3);
2620 LD_UB2(dst, dst_stride, dst0, dst1);
2623 ST_UB2(res0, res1, dst, dst_stride);
2624 dst += (2 * dst_stride);
2629 inp5, inp6, inp7, inp8,
2630 const20, const6, const3);
2634 inp6, inp7, inp8, inp9,
2635 const20, const6, const3);
2636 LD_UB2(dst, dst_stride, dst0, dst1);
2639 ST_UB2(res0, res1, dst, dst_stride);
2640 dst += (2 * dst_stride);
2645 inp7, inp8, inp9, inp10,
2646 const20, const6, const3);
2650 inp8, inp9, inp10, inp11,
2651 const20, const6, const3);
2652 LD_UB2(dst, dst_stride, dst0, dst1);
2655 ST_UB2(res0, res1, dst, dst_stride);
2656 dst += (2 * dst_stride);
2661 inp9, inp10, inp11, inp12,
2662 const20, const6, const3);
2666 inp10, inp11, inp12, inp13,
2667 const20, const6, const3);
2668 LD_UB2(dst, dst_stride, dst0, dst1);
2671 ST_UB2(res0, res1, dst, dst_stride);
2672 dst += (2 * dst_stride);
2677 inp11, inp12, inp13, inp14,
2678 const20, const6, const3);
2682 inp12, inp13, inp14, inp15,
2683 const20, const6, const3);
2684 LD_UB2(dst, dst_stride, dst0, dst1);
/* Presumably res0 = aver(res0, inp11) and res1 = aver(res1, inp12), i.e.
 * each result is paired with the input one index below the row being
 * produced - confirm against the AVER_UB2_UB macro. */
2685 AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1);
2687 ST_UB2(res0, res1, dst, dst_stride);
2688 dst += (2 * dst_stride);
2692 inp13, inp14, inp15, inp16,
2693 const20, const6, const3);
/* From here on the argument lists repeat inp16 and then step back
 * (inp15, inp14) instead of naming inputs past inp16; this looks like
 * bottom-edge mirroring - TODO confirm against the filter macro. */
2695 inp14, inp15, inp16, inp16,
2696 const20, const6, const3);
2697 LD_UB2(dst, dst_stride, dst0, dst1);
2698 AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1);
2700 ST_UB2(res0, res1, dst, dst_stride);
2701 dst += (2 * dst_stride);
2704 inp15, inp16, inp16, inp15,
2705 const20, const6, const3);
2707 inp16, inp16, inp15, inp14,
2708 const20, const6, const3);
2709 LD_UB2(dst, dst_stride, dst0, dst1);
2710 AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1);
2712 ST_UB2(res0, res1, dst, dst_stride);
/* Eight input vectors: the even-numbered ones are loaded from src and the
 * odd-numbered ones from src + 1 (see the two LD_UB4 calls in the loop). */
2722 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
/* Byte indices in descending order, 15 down to 0. */
2724 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
/* 6 and 3 are replicated per byte lane, whereas 20 is replicated per
 * halfword lane (ldi_h), so const20 is a v8u16 here. */
2725 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2726 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2727 v8u16 const20 = (v8u16) __msa_ldi_h(20);
/* The loop runs height / 4 times and consumes four src rows per pass. */
2729 for (loop_count = (
height >> 2); loop_count--;) {
2730 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
2731 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
2732 src += (4 * src_stride);
2734 const20, const6, const3);
/* ave_u_b is the truncating per-byte unsigned average, (a + b) >> 1;
 * each result is averaged with the vector that was loaded from src
 * (inp0, inp2, inp4, inp6), not the one from src + 1. */
2735 res = __msa_ave_u_b(inp0, res);
2740 const20, const6, const3);
2741 res = __msa_ave_u_b(inp2, res);
2746 const20, const6, const3);
2747 res = __msa_ave_u_b(inp4, res);
2752 const20, const6, const3);
2753 res = __msa_ave_u_b(inp6, res);
2760 const20, const6, const3);
2761 res = __msa_ave_u_b(inp0, res);
/* Four input vectors, two result vectors with their averaging partners
 * (avg0/avg1), and nine intermediate vectors horiz0..horiz8 that feed the
 * second set of filter argument lists further down. */
2781 v16u8 inp0, inp1, inp2, inp3;
2782 v16u8 res0, res1, avg0, avg1;
2783 v16u8 horiz0, horiz1, horiz2, horiz3;
2784 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Four byte-index tables (values 0..8); mask2 and mask3 appear in the
 * filter argument lists below. */
2785 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2786 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
2787 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
2788 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* The constants 20, 6 and 3 replicated into every byte lane. */
2789 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2790 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2791 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2794 src += (2 * src_stride);
2796 mask2, mask3, const20,
/* ilvr_d(inp1, inp0) yields {inp0.lo, inp1.lo}: two 8-byte halves in one
 * vector.  ave_u_b then takes the truncating average (a + b) >> 1 with
 * res0, and splati_d(.., 1) broadcasts the upper doubleword so that
 * horiz1 holds the second half on its own. */
2798 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
2799 horiz0 = __msa_ave_u_b(inp0, res0);
2800 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
2802 src += (2 * src_stride);
2804 mask2, mask3, const20,
2806 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
2807 horiz2 = __msa_ave_u_b(inp2, res1);
2808 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
2810 src += (2 * src_stride);
2812 mask2, mask3, const20,
2814 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
2815 horiz4 = __msa_ave_u_b(inp0, res0);
2816 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
2818 horiz1, horiz2, horiz3, horiz4,
2819 horiz1, horiz0, horiz0, horiz1,
2820 horiz2, horiz3, horiz4, horiz5,
2821 const20, const6, const3);
/* Average the result with {horiz0.lo, horiz1.lo} before storing
 * doublewords 0 and 1 of res0 (presumably two 8-byte rows). */
2822 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2823 res0 = __msa_ave_u_b(avg0, res0);
2824 ST_D2(res0, 0, 1, dst, dst_stride);
2825 dst += (2 * dst_stride);
2828 src += (2 * src_stride);
2830 mask2, mask3, const20,
2832 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
2833 horiz6 = __msa_ave_u_b(inp2, res1);
2834 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
2837 mask2, mask3, const20,
2839 horiz8 = __msa_ave_u_b(inp0, res0);
2841 horiz3, horiz4, horiz5, horiz6,
2842 horiz3, horiz2, horiz1, horiz0,
2843 horiz4, horiz5, horiz6, horiz7,
2844 const20, const6, const3);
2845 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
2846 res1 = __msa_ave_u_b(avg1, res1);
/* The argument lists below repeat horiz8 and then step back (horiz7,
 * horiz6) instead of naming vectors past horiz8; this looks like
 * bottom-edge mirroring - TODO confirm against the filter macro. */
2848 horiz5, horiz6, horiz7, horiz8,
2849 horiz5, horiz4, horiz3, horiz2,
2850 horiz6, horiz7, horiz8, horiz8,
2851 const20, const6, const3);
2852 ST_D2(res1, 0, 1, dst, dst_stride);
2853 dst += 2 * dst_stride;
2855 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
2856 res0 = __msa_ave_u_b(avg0, res0);
2858 horiz7, horiz8, horiz8, horiz7,
2859 horiz7, horiz6, horiz5, horiz4,
2860 horiz8, horiz8, horiz7, horiz6,
2861 const20, const6, const3);
2862 ST_D2(res0, 0, 1, dst, dst_stride);
2863 dst += 2 * dst_stride;
2865 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
2866 res1 = __msa_ave_u_b(avg1, res1);
2867 ST_D2(res1, 0, 1, dst, dst_stride);
/* Eight input vectors: the even-numbered ones are loaded from src and the
 * odd-numbered ones from src + 1 (see the two LD_UB4 calls in the loop). */
2877 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
/* Byte indices in descending order, 15 down to 0. */
2879 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
/* 6 and 3 are replicated per byte lane, whereas 20 is replicated per
 * halfword lane (ldi_h), so const20 is a v8u16 here. */
2880 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2881 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2882 v8u16 const20 = (v8u16) __msa_ldi_h(20);
/* The loop runs height / 4 times and consumes four src rows per pass. */
2884 for (loop_count = (
height >> 2); loop_count--;) {
2885 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
2886 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
2887 src += (4 * src_stride);
/* No averaging statements appear between the filter argument lists
 * below. */
2889 const20, const6, const3);
2894 const20, const6, const3);
2899 const20, const6, const3);
2904 const20, const6, const3);
2911 const20, const6, const3);
/* Four input vectors, two result vectors with their averaging partners
 * (avg0/avg1), and nine intermediate vectors horiz0..horiz8 that feed the
 * second set of filter argument lists further down. */
2931 v16u8 inp0, inp1, inp2, inp3;
2932 v16u8 res0, res1, avg0, avg1;
2933 v16u8 horiz0, horiz1, horiz2, horiz3;
2934 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Four byte-index tables (values 0..8); mask2 and mask3 appear in the
 * filter argument lists below. */
2935 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2936 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
2937 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
2938 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* The constants 20, 6 and 3 replicated into every byte lane. */
2939 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2940 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2941 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2944 src += (2 * src_stride);
2946 mask2, mask3, const20,
/* splati_d(.., 1) broadcasts the upper doubleword, so each odd-numbered
 * horiz vector holds the upper half of the preceding even-numbered one. */
2948 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
2951 src += (2 * src_stride);
2953 mask2, mask3, const20,
2955 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
2957 src += (2 * src_stride);
2959 mask2, mask3, const20,
2961 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
2963 horiz1, horiz2, horiz3, horiz4,
2964 horiz1, horiz0, horiz0, horiz1,
2965 horiz2, horiz3, horiz4, horiz5,
2966 const20, const6, const3);
/* ilvr_d(horiz1, horiz0) yields {horiz0.lo, horiz1.lo}; ave_u_b is the
 * truncating per-byte unsigned average, (a + b) >> 1.  Doublewords 0 and
 * 1 of the result are then stored (presumably two 8-byte rows). */
2967 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2968 res0 = __msa_ave_u_b(avg0, res0);
2969 ST_D2(res0, 0, 1, dst, dst_stride);
2970 dst += (2 * dst_stride);
2973 src += (2 * src_stride);
2975 mask2, mask3, const20,
2977 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
2980 mask2, mask3, const20,
2983 horiz3, horiz4, horiz5, horiz6,
2984 horiz3, horiz2, horiz1, horiz0,
2985 horiz4, horiz5, horiz6, horiz7,
2986 const20, const6, const3);
2987 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
2988 res1 = __msa_ave_u_b(avg1, res1);
/* NOTE(review): res0 was already averaged with {horiz0.lo, horiz1.lo} and
 * stored above; this second average looks redundant if the filter
 * invocation that follows reassigns res0 before its next store - confirm
 * and drop if so. */
2989 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2990 res0 = __msa_ave_u_b(avg0, res0);
2991 ST_D2(res1, 0, 1, dst, dst_stride);
2992 dst += (2 * dst_stride);
/* The argument lists below repeat horiz8 and then step back (horiz7,
 * horiz6) instead of naming vectors past horiz8; this looks like
 * bottom-edge mirroring - TODO confirm against the filter macro. */
2995 horiz5, horiz6, horiz7, horiz8,
2996 horiz5, horiz4, horiz3, horiz2,
2997 horiz6, horiz7, horiz8, horiz8,
2998 const20, const6, const3);
2999 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3000 res0 = __msa_ave_u_b(avg0, res0);
3001 ST_D2(res0, 0, 1, dst, dst_stride);
3002 dst += (2 * dst_stride);
3005 horiz7, horiz8, horiz8, horiz7,
3006 horiz7, horiz6, horiz5, horiz4,
3007 horiz8, horiz8, horiz7, horiz6,
3008 const20, const6, const3);
3009 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3010 res1 = __msa_ave_u_b(avg1, res1);
3011 ST_D2(res1, 0, 1, dst, dst_stride);
/* Eight input vectors: the even-numbered ones are loaded from src and the
 * odd-numbered ones from src + 1 (see the two LD_UB4 calls in the loop). */
3021 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
/* Byte indices in descending order, 15 down to 0. */
3023 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
/* 6 and 3 are replicated per byte lane, whereas 20 is replicated per
 * halfword lane (ldi_h), so const20 is a v8u16 here. */
3024 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3025 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3026 v8u16 const20 = (v8u16) __msa_ldi_h(20);
/* The loop runs height / 4 times and consumes four src rows per pass. */
3028 for (loop_count = (
height >> 2); loop_count--;) {
3029 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
3030 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
3031 src += (4 * src_stride);
3033 const20, const6, const3);
/* ave_u_b is the truncating per-byte unsigned average, (a + b) >> 1;
 * each result is averaged with the vector that was loaded from src + 1
 * (inp1, inp3, inp5, inp7). */
3034 res = __msa_ave_u_b(res, inp1);
3039 const20, const6, const3);
3040 res = __msa_ave_u_b(res, inp3);
3045 const20, const6, const3);
3046 res = __msa_ave_u_b(res, inp5);
3051 const20, const6, const3);
3052 res = __msa_ave_u_b(res, inp7);
3059 const20, const6, const3);
3060 res = __msa_ave_u_b(inp1, res);
/* Four input vectors, two result vectors with their averaging partners
 * (avg0/avg1), and nine intermediate vectors horiz0..horiz8 that feed the
 * second set of filter argument lists further down. */
3080 v16u8 inp0, inp1, inp2, inp3;
3081 v16u8 res0, res1, avg0, avg1;
3082 v16u8 horiz0, horiz1, horiz2, horiz3;
3083 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Four byte-index tables (values 0..8); mask2 and mask3 appear in the
 * filter argument lists below. */
3084 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3085 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3086 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3087 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* The constants 20, 6 and 3 replicated into every byte lane. */
3088 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3089 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3090 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3093 src += (2 * src_stride);
3095 mask2, mask3, const20,
/* Presumably slides inp0 and inp1 by one byte each so every lane holds
 * its right-hand neighbour - confirm against the SLDI_B2_UB macro. */
3097 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
/* insve_d copies doubleword 0 of inp1 into doubleword 1 of inp0, packing
 * two 8-byte halves into one vector.  ave_u_b then takes the truncating
 * average (a + b) >> 1 with res0, and splati_d(.., 1) broadcasts the
 * upper doubleword into horiz1. */
3099 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3100 horiz0 = __msa_ave_u_b(inp0, res0);
3101 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3103 src += (2 * src_stride);
3105 mask2, mask3, const20,
3107 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3109 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3110 horiz2 = __msa_ave_u_b(inp2, res1);
3111 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3113 src += (2 * src_stride);
3115 mask2, mask3, const20,
3117 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3119 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3120 horiz4 = __msa_ave_u_b(inp0, res0);
3121 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3123 horiz1, horiz2, horiz3, horiz4,
3124 horiz1, horiz0, horiz0, horiz1,
3125 horiz2, horiz3, horiz4, horiz5,
3126 const20, const6, const3);
/* ilvr_d(horiz1, horiz0) yields {horiz0.lo, horiz1.lo}; the result is
 * averaged with it before doublewords 0 and 1 are stored (presumably two
 * 8-byte rows). */
3127 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
3128 res0 = __msa_ave_u_b(avg0, res0);
3129 ST_D2(res0, 0, 1, dst, dst_stride);
3130 dst += (2 * dst_stride);
3133 src += (2 * src_stride);
3135 mask2, mask3, const20,
3137 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3139 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3140 horiz6 = __msa_ave_u_b(inp2, res1);
3141 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3144 mask2, mask3, const20,
/* Both sldi_b operands are inp0, so this is a one-byte rotation: lane i
 * receives byte i + 1 and the last lane wraps around to byte 0. */
3146 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3147 horiz8 = __msa_ave_u_b(inp0, res0);
3149 horiz3, horiz4, horiz5, horiz6,
3150 horiz3, horiz2, horiz1, horiz0,
3151 horiz4, horiz5, horiz6, horiz7,
3152 const20, const6, const3);
3153 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
3154 res1 = __msa_ave_u_b(avg1, res1);
3155 ST_D2(res1, 0, 1, dst, dst_stride);
3156 dst += (2 * dst_stride);
/* The argument lists below repeat horiz8 and then step back (horiz7,
 * horiz6) instead of naming vectors past horiz8; this looks like
 * bottom-edge mirroring - TODO confirm against the filter macro. */
3159 horiz5, horiz6, horiz7, horiz8,
3160 horiz5, horiz4, horiz3, horiz2,
3161 horiz6, horiz7, horiz8, horiz8,
3162 const20, const6, const3);
3163 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3164 res0 = __msa_ave_u_b(avg0, res0);
3165 ST_D2(res0, 0, 1, dst, dst_stride);
3166 dst += (2 * dst_stride);
3169 horiz7, horiz8, horiz8, horiz7,
3170 horiz7, horiz6, horiz5, horiz4,
3171 horiz8, horiz8, horiz7, horiz6,
3172 const20, const6, const3);
3173 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3174 res1 = __msa_ave_u_b(avg1, res1);
3175 ST_D2(res1, 0, 1, dst, dst_stride);
/*
 * Body fragment of an 8-byte-wide two-pass filter routine.  The signature,
 * the row loads, the res0/res1 declaration and the opening line of every
 * multi-line macro invocation are not part of this excerpt; lines holding
 * only an argument list are continuations of those elided calls.
 * Here each packed input row pair is averaged with its filter result using
 * __msa_ave_u_b (truncating average); no further average is visible before
 * the stores.
 */
3194 v16u8 inp0, inp1, inp2, inp3;
3196 v16u8 horiz0, horiz1, horiz2, horiz3;
3197 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte shuffle index tables forwarded to the macro calls below. */
3198 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3199 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3200 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3201 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Coefficients 20, 6 and 3 replicated into every byte lane. */
3202 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3203 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3204 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* First row pair: __msa_ilvr_d puts doubleword 0 of inp0 in the low half and
 * doubleword 0 of inp1 in the high half before averaging with res0. */
3207 src += (2 * src_stride);
3209 mask2, mask3, const20,
3211 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3212 horiz0 = __msa_ave_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated (second row of the pair). */
3213 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Second row pair. */
3215 src += (2 * src_stride);
3217 mask2, mask3, const20,
3219 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3220 horiz2 = __msa_ave_u_b(inp2, res1);
3221 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Third row pair. */
3223 src += (2 * src_stride);
3225 mask2, mask3, const20,
3227 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3228 horiz4 = __msa_ave_u_b(inp0, res0);
3229 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Continuation of an elided macro call over the horiz rows (result
 * presumably assigned to res0 -- confirm). */
3231 horiz1, horiz2, horiz3, horiz4,
3232 horiz1, horiz0, horiz0, horiz1,
3233 horiz2, horiz3, horiz4, horiz5,
3234 const20, const6, const3);
/* NOTE(review): ST_D2 is a project macro, presumably storing doublewords 0
 * and 1 of res0 to two dst rows; dst then moves down two rows. */
3237 src += (2 * src_stride);
3238 ST_D2(res0, 0, 1, dst, dst_stride);
3239 dst += 2 * dst_stride;
/* Fourth row pair. */
3242 mask2, mask3, const20,
3244 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3245 horiz6 = __msa_ave_u_b(inp2, res1);
3246 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3249 mask2, mask3, const20,
/* horiz8: inp0 is used as loaded (no packing) and averaged with res0. */
3251 horiz8 = __msa_ave_u_b(inp0, res0);
3253 horiz3, horiz4, horiz5, horiz6,
3254 horiz3, horiz2, horiz1, horiz0,
3255 horiz4, horiz5, horiz6, horiz7,
3256 const20, const6, const3);
3258 horiz5, horiz6, horiz7, horiz8,
3259 horiz5, horiz4, horiz3, horiz2,
3260 horiz6, horiz7, horiz8, horiz8,
3261 const20, const6, const3);
/* Four-row store; note the argument order is res1 first, then res0. */
3262 ST_D4(res1, res0, 0, 1, 0, 1, dst, dst_stride);
3263 dst += (4 * dst_stride);
3266 horiz7, horiz8, horiz8, horiz7,
3267 horiz7, horiz6, horiz5, horiz4,
3268 horiz8, horiz8, horiz7, horiz6,
3269 const20, const6, const3);
/* Last store of the fragment. */
3270 ST_D2(res1, 0, 1, dst, dst_stride);
/*
 * Body fragment of an 8-byte-wide two-pass filter routine.  The signature,
 * the row loads, the res0/res1 declaration and the opening line of every
 * multi-line macro invocation are not part of this excerpt; lines holding
 * only an argument list are continuations of those elided calls.
 * No averaging step is visible in this fragment: horiz0/2/4/6/8 are
 * presumably assigned directly on the elided opening lines -- confirm.
 */
3289 v16u8 inp0, inp1, inp2, inp3;
3291 v16u8 horiz0, horiz1, horiz2, horiz3;
3292 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte shuffle index tables forwarded to the macro calls below. */
3293 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3294 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3295 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3296 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Coefficients 20, 6 and 3 replicated into every byte lane. */
3297 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3298 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3299 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* First row pair; horiz1 = doubleword 1 of horiz0 replicated. */
3302 src += (2 * src_stride);
3304 mask2, mask3, const20,
3306 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Second row pair. */
3308 src += (2 * src_stride);
3310 mask2, mask3, const20,
3312 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Third row pair. */
3314 src += (2 * src_stride);
3316 mask2, mask3, const20,
3318 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Continuation of an elided macro call over the horiz rows (result
 * presumably assigned to res0 -- confirm). */
3320 horiz1, horiz2, horiz3, horiz4,
3321 horiz1, horiz0, horiz0, horiz1,
3322 horiz2, horiz3, horiz4, horiz5,
3323 const20, const6, const3);
/* NOTE(review): ST_D2 is a project macro, presumably storing doublewords 0
 * and 1 of res0 to two dst rows; dst then moves down two rows. */
3325 src += (2 * src_stride);
3326 ST_D2(res0, 0, 1, dst, dst_stride);
3327 dst += 2 * dst_stride;
/* Fourth row pair. */
3330 mask2, mask3, const20,
3332 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3335 mask2, mask3, const20,
3338 horiz3, horiz4, horiz5, horiz6,
3339 horiz3, horiz2, horiz1, horiz0,
3340 horiz4, horiz5, horiz6, horiz7,
3341 const20, const6, const3);
3343 horiz5, horiz6, horiz7, horiz8,
3344 horiz5, horiz4, horiz3, horiz2,
3345 horiz6, horiz7, horiz8, horiz8,
3346 const20, const6, const3);
3347 ST_D2(res1, 0, 1, dst, dst_stride);
3348 dst += 2 * dst_stride;
3352 horiz7, horiz8, horiz8, horiz7,
3353 horiz7, horiz6, horiz5, horiz4,
3354 horiz8, horiz8, horiz7, horiz6,
3355 const20, const6, const3);
/* Final four-row store: res0 first, then res1. */
3356 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/*
 * Body fragment of an 8-byte-wide two-pass filter routine.  The signature,
 * the row loads, the res0/res1 declaration and the opening line of every
 * multi-line macro invocation are not part of this excerpt; lines holding
 * only an argument list are continuations of those elided calls.
 * Each input row pair is slid by one byte, packed and averaged with its
 * filter result using __msa_ave_u_b (truncating average); no further average
 * is visible before the stores.
 */
3375 v16u8 inp0, inp1, inp2, inp3;
3377 v16u8 horiz0, horiz1, horiz2, horiz3;
3378 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte shuffle index tables forwarded to the macro calls below. */
3379 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3380 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3381 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3382 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Coefficients 20, 6 and 3 replicated into every byte lane. */
3383 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3384 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3385 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* First row pair.  NOTE(review): SLDI_B2_UB is a project macro, presumably a
 * one-byte slide of each row -- confirm.  __msa_insve_d copies doubleword 0
 * of inp1 into doubleword 1 of inp0 before the average with res0. */
3388 src += (2 * src_stride);
3390 mask2, mask3, const20,
3392 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3394 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3395 horiz0 = __msa_ave_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated (second row of the pair). */
3396 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Second row pair. */
3398 src += (2 * src_stride);
3400 mask2, mask3, const20,
3402 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3404 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3405 horiz2 = __msa_ave_u_b(inp2, res1);
3406 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Third row pair. */
3408 src += (2 * src_stride);
3410 mask2, mask3, const20,
3412 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3414 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3415 horiz4 = __msa_ave_u_b(inp0, res0);
3416 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Continuation of an elided macro call over the horiz rows (result
 * presumably assigned to res0 -- confirm). */
3418 horiz1, horiz2, horiz3, horiz4,
3419 horiz1, horiz0, horiz0, horiz1,
3420 horiz2, horiz3, horiz4, horiz5,
3421 const20, const6, const3);
/* NOTE(review): ST_D2 is a project macro, presumably storing doublewords 0
 * and 1 of res0 to two dst rows; dst then moves down two rows. */
3423 src += (2 * src_stride);
3424 ST_D2(res0, 0, 1, dst, dst_stride);
3425 dst += 2 * dst_stride;
/* Fourth row pair. */
3428 mask2, mask3, const20,
3430 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3432 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3433 horiz6 = __msa_ave_u_b(inp2, res1);
3434 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3437 mask2, mask3, const20,
/* horiz8: inp0 rotated by one byte (lane i takes byte i + 1), averaged with
 * res0. */
3439 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3440 horiz8 = __msa_ave_u_b(inp0, res0);
3442 horiz3, horiz4, horiz5, horiz6,
3443 horiz3, horiz2, horiz1, horiz0,
3444 horiz4, horiz5, horiz6, horiz7,
3445 const20, const6, const3);
3447 horiz5, horiz6, horiz7, horiz8,
3448 horiz5, horiz4, horiz3, horiz2,
3449 horiz6, horiz7, horiz8, horiz8,
3450 const20, const6, const3);
3451 ST_D2(res1, 0, 1, dst, dst_stride);
3452 dst += 2 * dst_stride;
3455 horiz7, horiz8, horiz8, horiz7,
3456 horiz7, horiz6, horiz5, horiz4,
3457 horiz8, horiz8, horiz7, horiz6,
3458 const20, const6, const3);
/* Final four-row store: res0 first, then res1. */
3459 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/*
 * Body fragment of an 8-byte-wide two-pass filter routine.  The signature,
 * the row loads and the opening line of every multi-line macro invocation
 * are not part of this excerpt; lines holding only an argument list are
 * continuations of those elided calls.
 * Every average uses __msa_ave_u_b (truncating).  The second-stage results
 * are averaged with the horiz pairs (1,2), (3,4), (5,6) and (7,8).
 */
3478 v16u8 inp0, inp1, inp2, inp3;
3479 v16u8 res0, res1, avg0, avg1;
3480 v16u8 horiz0, horiz1, horiz2, horiz3;
3481 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte shuffle index tables forwarded to the macro calls below. */
3482 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3483 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3484 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3485 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Coefficients 20, 6 and 3 replicated into every byte lane. */
3486 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3487 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3488 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* First row pair: __msa_ilvr_d puts doubleword 0 of inp0 in the low half and
 * doubleword 0 of inp1 in the high half before averaging with res0. */
3491 src += (2 * src_stride);
3493 mask2, mask3, const20,
3495 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3496 horiz0 = __msa_ave_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated (second row of the pair). */
3497 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Second row pair. */
3499 src += (2 * src_stride);
3501 mask2, mask3, const20,
3503 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3504 horiz2 = __msa_ave_u_b(inp2, res1);
3505 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Third row pair. */
3507 src += (2 * src_stride);
3509 mask2, mask3, const20,
3511 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3512 horiz4 = __msa_ave_u_b(inp0, res0);
3513 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Continuation of an elided macro call over the horiz rows (result
 * presumably assigned to res0 -- confirm). */
3515 horiz1, horiz2, horiz3, horiz4,
3516 horiz1, horiz0, horiz0, horiz1,
3517 horiz2, horiz3, horiz4, horiz5,
3518 const20, const6, const3);
/* Pack horiz1 (low doubleword) with horiz2 (high doubleword) and average the
 * pair with res0. */
3519 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3520 res0 = __msa_ave_u_b(avg0, res0);
/* NOTE(review): ST_D2 is a project macro, presumably storing doublewords 0
 * and 1 of res0 to two dst rows; dst then moves down two rows. */
3521 ST_D2(res0, 0, 1, dst, dst_stride);
3522 dst += (2 * dst_stride);
/* Fourth row pair. */
3525 src += (2 * src_stride);
3527 mask2, mask3, const20,
3529 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3530 horiz6 = __msa_ave_u_b(inp2, res1);
3531 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3534 mask2, mask3, const20,
/* horiz8: inp0 is used as loaded (no packing) and averaged with res0. */
3536 horiz8 = __msa_ave_u_b(inp0, res0);
3538 horiz3, horiz4, horiz5, horiz6,
3539 horiz3, horiz2, horiz1, horiz0,
3540 horiz4, horiz5, horiz6, horiz7,
3541 const20, const6, const3);
/* Average with the horiz3/horiz4 pair. */
3542 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3543 res1 = __msa_ave_u_b(avg1, res1);
3545 horiz5, horiz6, horiz7, horiz8,
3546 horiz5, horiz4, horiz3, horiz2,
3547 horiz6, horiz7, horiz8, horiz8,
3548 const20, const6, const3);
3549 ST_D2(res1, 0, 1, dst, dst_stride);
3550 dst += 2 * dst_stride;
/* Average with the horiz5/horiz6 pair. */
3552 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3553 res0 = __msa_ave_u_b(avg0, res0);
3556 horiz7, horiz8, horiz8, horiz7,
3557 horiz7, horiz6, horiz5, horiz4,
3558 horiz8, horiz8, horiz7, horiz6,
3559 const20, const6, const3);
3560 ST_D2(res0, 0, 1, dst, dst_stride);
3561 dst += 2 * dst_stride;
/* Average with the horiz7/horiz8 pair; last store of the fragment. */
3563 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3564 res1 = __msa_ave_u_b(avg1, res1);
3565 ST_D2(res1, 0, 1, dst, dst_stride);
/*
 * Body fragment of an 8-byte-wide two-pass filter routine.  The signature,
 * the row loads and the opening line of every multi-line macro invocation
 * are not part of this excerpt; lines holding only an argument list are
 * continuations of those elided calls.
 * No average with the input rows is visible: horiz0/2/4/6/8 are presumably
 * assigned directly on the elided opening lines -- confirm.  The
 * second-stage results are averaged (__msa_ave_u_b, truncating) with the
 * horiz pairs (1,2), (3,4), (5,6) and (7,8).
 */
3584 v16u8 inp0, inp1, inp2, inp3;
3585 v16u8 res0, res1, avg0, avg1;
3586 v16u8 horiz0, horiz1, horiz2, horiz3;
3587 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte shuffle index tables forwarded to the macro calls below. */
3588 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3589 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3590 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3591 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Coefficients 20, 6 and 3 replicated into every byte lane. */
3592 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3593 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3594 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* First row pair; horiz1 = doubleword 1 of horiz0 replicated. */
3597 src += (2 * src_stride);
3599 mask2, mask3, const20,
3601 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Second row pair. */
3603 src += (2 * src_stride);
3605 mask2, mask3, const20,
3607 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Third row pair. */
3609 src += (2 * src_stride);
3611 mask2, mask3, const20,
3613 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Continuation of an elided macro call over the horiz rows (result
 * presumably assigned to res0 -- confirm). */
3615 horiz1, horiz2, horiz3, horiz4,
3616 horiz1, horiz0, horiz0, horiz1,
3617 horiz2, horiz3, horiz4, horiz5,
3618 const20, const6, const3);
/* Pack horiz1 (low doubleword) with horiz2 (high doubleword) and average the
 * pair with res0. */
3619 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3620 res0 = __msa_ave_u_b(avg0, res0);
/* NOTE(review): ST_D2 is a project macro, presumably storing doublewords 0
 * and 1 of res0 to two dst rows; dst then moves down two rows. */
3622 src += (2 * src_stride);
3623 ST_D2(res0, 0, 1, dst, dst_stride);
3624 dst += 2 * dst_stride;
/* Fourth row pair. */
3627 mask2, mask3, const20,
3629 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3631 horiz3, horiz4, horiz5, horiz6,
3632 horiz3, horiz2, horiz1, horiz0,
3633 horiz4, horiz5, horiz6, horiz7,
3634 const20, const6, const3);
/* Average with the horiz3/horiz4 pair. */
3635 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3636 res1 = __msa_ave_u_b(avg1, res1);
3639 mask2, mask3, const20,
3641 ST_D2(res1, 0, 1, dst, dst_stride);
3642 dst += 2 * dst_stride;
3645 horiz5, horiz6, horiz7, horiz8,
3646 horiz5, horiz4, horiz3, horiz2,
3647 horiz6, horiz7, horiz8, horiz8,
3648 const20, const6, const3);
/* Average with the horiz5/horiz6 pair. */
3649 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3650 res0 = __msa_ave_u_b(avg0, res0);
3652 horiz7, horiz8, horiz8, horiz7,
3653 horiz7, horiz6, horiz5, horiz4,
3654 horiz8, horiz8, horiz7, horiz6,
3655 const20, const6, const3);
/* Average with the horiz7/horiz8 pair, then the final four-row store. */
3656 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3657 res1 = __msa_ave_u_b(avg1, res1);
3658 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/*
 * Body fragment of an 8-byte-wide two-pass filter routine.  The signature,
 * the row loads and the opening line of every multi-line macro invocation
 * are not part of this excerpt; lines holding only an argument list are
 * continuations of those elided calls.
 * Every average uses __msa_ave_u_b (truncating).  Input rows are slid by one
 * byte before the first-stage average, and the second-stage results are
 * averaged with the horiz pairs (1,2), (3,4), (5,6) and (7,8).
 */
3677 v16u8 inp0, inp1, inp2, inp3;
3678 v16u8 res0, res1, avg0, avg1;
3679 v16u8 horiz0, horiz1, horiz2, horiz3;
3680 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte shuffle index tables forwarded to the macro calls below. */
3681 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3682 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3683 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3684 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Coefficients 20, 6 and 3 replicated into every byte lane. */
3685 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3686 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3687 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* First row pair.  NOTE(review): SLDI_B2_UB is a project macro, presumably a
 * one-byte slide of each row -- confirm.  __msa_insve_d copies doubleword 0
 * of inp1 into doubleword 1 of inp0 before the average with res0. */
3690 src += (2 * src_stride);
3692 mask2, mask3, const20,
3694 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3696 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3697 horiz0 = __msa_ave_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated (second row of the pair). */
3698 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Second row pair. */
3700 src += (2 * src_stride);
3702 mask2, mask3, const20,
3704 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3706 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3707 horiz2 = __msa_ave_u_b(inp2, res1);
3708 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Third row pair. */
3710 src += (2 * src_stride);
3712 mask2, mask3, const20,
3715 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3716 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3717 horiz4 = __msa_ave_u_b(inp0, res0);
3718 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Continuation of an elided macro call over the horiz rows (result
 * presumably assigned to res0 -- confirm). */
3720 horiz1, horiz2, horiz3, horiz4,
3721 horiz1, horiz0, horiz0, horiz1,
3722 horiz2, horiz3, horiz4, horiz5,
3723 const20, const6, const3);
/* Pack horiz1 (low doubleword) with horiz2 (high doubleword) and average the
 * pair with res0. */
3724 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3725 res0 = __msa_ave_u_b(avg0, res0);
/* NOTE(review): ST_D2 is a project macro, presumably storing doublewords 0
 * and 1 of res0 to two dst rows; dst then moves down two rows. */
3726 ST_D2(res0, 0, 1, dst, dst_stride);
3727 dst += (2 * dst_stride);
/* Fourth row pair. */
3730 src += (2 * src_stride);
3732 mask2, mask3, const20,
3734 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3736 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3737 horiz6 = __msa_ave_u_b(inp2, res1);
3738 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3740 horiz3, horiz4, horiz5, horiz6,
3741 horiz3, horiz2, horiz1, horiz0,
3742 horiz4, horiz5, horiz6, horiz7,
3743 const20, const6, const3);
/* Average with the horiz3/horiz4 pair, then store. */
3744 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3745 res1 = __msa_ave_u_b(avg1, res1);
3746 ST_D2(res1, 0, 1, dst, dst_stride);
3747 dst += (2 * dst_stride);
3751 mask2, mask3, const20,
/* horiz8: inp0 rotated by one byte (lane i takes byte i + 1), averaged with
 * res0. */
3753 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3754 horiz8 = __msa_ave_u_b(inp0, res0);
3756 horiz5, horiz6, horiz7, horiz8,
3757 horiz5, horiz4, horiz3, horiz2,
3758 horiz6, horiz7, horiz8, horiz8,
3759 const20, const6, const3);
3761 horiz7, horiz8, horiz8, horiz7,
3762 horiz7, horiz6, horiz5, horiz4,
3763 horiz8, horiz8, horiz7, horiz6,
3764 const20, const6, const3);
/* Average with the horiz5/horiz6 and horiz7/horiz8 pairs, then the final
 * four-row store. */
3765 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3766 res0 = __msa_ave_u_b(avg0, res0);
3767 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3768 res1 = __msa_ave_u_b(avg1, res1);
3769 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/*
 * Body fragment of a 16-byte-wide row filter loop.  The signature, the
 * declaration of res/loop_count, the opening line of each macro invocation,
 * the stores and the closing brace of the loop are not part of this excerpt.
 * Averages here use __msa_aver_u_b, the rounding unsigned byte average.
 */
3779 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
/* Byte-reversal index table (15 down to 0). */
3781 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
/* 6 and 3 are replicated per byte; 20 is replicated per halfword. */
3782 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3783 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3784 v8u16 const20 = (v8u16) __msa_ldi_h(20);
/* height / 4 iterations; each loads four rows from src (even-numbered inp
 * vectors) and four from src + 1 (odd-numbered), then steps src four rows. */
3786 for (loop_count = (
height >> 2); loop_count--;) {
3787 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
3788 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
3789 src += (4 * src_stride);
/* Each filter result is averaged with the row loaded from src. */
3791 const20, const6, const3);
3792 res = __msa_aver_u_b(inp0, res);
3797 const20, const6, const3);
3798 res = __msa_aver_u_b(inp2, res);
3803 const20, const6, const3);
3804 res = __msa_aver_u_b(inp4, res);
3809 const20, const6, const3);
3810 res = __msa_aver_u_b(inp6, res);
/* NOTE(review): presumably a trailing row handled after the loop; the loop's
 * closing brace is not visible here -- confirm. */
3817 res = __msa_aver_u_b(inp0, res);
/*
 * Body fragment of an 8-byte-wide two-pass filter routine.  The signature,
 * some row loads and the opening line of every multi-line macro invocation
 * are not part of this excerpt; lines holding only an argument list are
 * continuations of those elided calls.
 * Every average uses __msa_aver_u_b (rounding).  The second-stage results
 * are averaged with the horiz pairs (0,1), (2,3), (4,5) and (6,7).
 */
3837 v16u8 inp0, inp1, inp2, inp3;
3838 v16u8 res0, res1, avg0, avg1;
3839 v16u8 horiz0, horiz1, horiz2, horiz3;
3840 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte shuffle index tables (NOTE(review): no use is visible in this
 * fragment; presumably passed on the elided opening lines). */
3841 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3842 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3843 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3844 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Coefficients 20, 6 and 3 replicated into every byte lane. */
3845 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3846 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3847 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load the first four rows in one go and step src past them. */
3849 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
3850 src += (4 * src_stride);
3852 const20, const6, const3);
3854 const20, const6, const3);
/* Pack each row pair (first row in the low doubleword, second in the high)
 * and average with the matching filter result. */
3855 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3856 horiz0 = __msa_aver_u_b(inp0, res0);
3857 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3858 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3859 horiz2 = __msa_aver_u_b(inp2, res1);
3860 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Third row pair. */
3862 src += (2 * src_stride);
3864 const20, const6, const3);
3865 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3866 horiz4 = __msa_aver_u_b(inp0, res0);
3867 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Continuation of an elided macro call over the horiz rows (result
 * presumably assigned to res0 -- confirm). */
3869 horiz1, horiz2, horiz3, horiz4,
3870 horiz1, horiz0, horiz0, horiz1,
3871 horiz2, horiz3, horiz4, horiz5,
3872 const20, const6, const3);
/* Average with the horiz0/horiz1 pair, then store.  NOTE(review): ST_D2 is a
 * project macro, presumably writing two 8-byte rows. */
3873 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
3874 res0 = __msa_aver_u_b(avg0, res0);
3875 ST_D2(res0, 0, 1, dst, dst_stride);
3876 dst += (2 * dst_stride);
/* Fourth row pair. */
3879 src += (2 * src_stride);
3881 const20, const6, const3);
3882 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3883 horiz6 = __msa_aver_u_b(inp2, res1);
3884 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3886 horiz3, horiz4, horiz5, horiz6,
3887 horiz3, horiz2, horiz1, horiz0,
3888 horiz4, horiz5, horiz6, horiz7,
3889 const20, const6, const3);
/* Average with the horiz2/horiz3 pair. */
3890 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
3891 res1 = __msa_aver_u_b(avg1, res1);
3895 const20, const6, const3);
/* horiz8: inp0 is used as loaded and averaged with res0. */
3896 horiz8 = __msa_aver_u_b(inp0, res0);
3897 ST_D2(res1, 0, 1, dst, dst_stride);
3898 dst += 2 * dst_stride;
3901 horiz5, horiz6, horiz7, horiz8,
3902 horiz5, horiz4, horiz3, horiz2,
3903 horiz6, horiz7, horiz8, horiz8,
3904 const20, const6, const3);
/* Average with the horiz4/horiz5 pair. */
3905 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3906 res0 = __msa_aver_u_b(avg0, res0);
3908 horiz7, horiz8, horiz8, horiz7,
3909 horiz7, horiz6, horiz5, horiz4,
3910 horiz8, horiz8, horiz7, horiz6,
3911 const20, const6, const3);
/* Average with the horiz6/horiz7 pair, then the final four-row store. */
3912 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3913 res1 = __msa_aver_u_b(avg1, res1);
3914 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/*
 * Body fragment of a 16-byte-wide row filter loop.  The signature, the
 * declaration of loop_count, the opening line of each macro invocation, the
 * stores and the closing brace of the loop are not part of this excerpt.
 * No averaging step is visible in this fragment.
 */
3924 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
/* Byte-reversal index table (15 down to 0). */
3926 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
/* 6 and 3 are replicated per byte; 20 is replicated per halfword. */
3927 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3928 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3929 v8u16 const20 = (v8u16) __msa_ldi_h(20);
/* height / 4 iterations; each loads four rows from src (even-numbered inp
 * vectors) and four from src + 1 (odd-numbered), then steps src four rows. */
3931 for (loop_count = (
height >> 2); loop_count--;) {
3932 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
3933 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
3934 src += (4 * src_stride);
/* Tails of four elided filter macro invocations, one per loaded row
 * (presumably -- the opening lines are not visible). */
3936 const20, const6, const3);
3941 const20, const6, const3);
3946 const20, const6, const3);
3951 const20, const6, const3);
/*
 * Body fragment of an 8-byte-wide two-pass filter routine.  The signature,
 * the row loads and the opening line of every multi-line macro invocation
 * are not part of this excerpt; lines holding only an argument list are
 * continuations of those elided calls.
 * No average with the input rows is visible: horiz0/2/4/6/8 are presumably
 * assigned directly on the elided opening lines -- confirm.  The
 * second-stage results are averaged (__msa_aver_u_b, rounding) with the
 * horiz pairs (0,1), (2,3), (4,5) and (6,7).
 */
3977 v16u8 inp0, inp1, inp2, inp3;
3978 v16u8 res0, res1, avg0, avg1;
3979 v16u8 horiz0, horiz1, horiz2, horiz3;
3980 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte shuffle index tables forwarded to the macro calls below. */
3981 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3982 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3983 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3984 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Coefficients 20, 6 and 3 replicated into every byte lane. */
3985 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3986 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3987 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* First row pair; horiz1 = doubleword 1 of horiz0 replicated. */
3990 src += (2 * src_stride);
3992 mask0, mask1, mask2, mask3,
3993 const20, const6, const3);
3994 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Second row pair. */
3996 src += (2 * src_stride);
3998 mask0, mask1, mask2, mask3,
3999 const20, const6, const3);
4000 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Third row pair. */
4002 src += (2 * src_stride);
4004 mask0, mask1, mask2, mask3,
4005 const20, const6, const3);
4006 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Continuation of an elided macro call over the horiz rows (result
 * presumably assigned to res0 -- confirm). */
4008 horiz1, horiz2, horiz3, horiz4,
4009 horiz1, horiz0, horiz0, horiz1,
4010 horiz2, horiz3, horiz4, horiz5,
4011 const20, const6, const3);
/* Average with the horiz0/horiz1 pair, then store.  NOTE(review): ST_D2 is a
 * project macro, presumably writing two 8-byte rows. */
4012 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4013 res0 = __msa_aver_u_b(avg0, res0);
4014 ST_D2(res0, 0, 1, dst, dst_stride);
4015 dst += (2 * dst_stride);
/* Fourth row pair. */
4018 src += (2 * src_stride);
4020 mask0, mask1, mask2, mask3,
4021 const20, const6, const3);
4022 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4024 horiz3, horiz4, horiz5, horiz6,
4025 horiz3, horiz2, horiz1, horiz0,
4026 horiz4, horiz5, horiz6, horiz7,
4027 const20, const6, const3);
4030 mask0, mask1, mask2, mask3,
4031 const20, const6, const3);
/* Average with the horiz2/horiz3 pair. */
4032 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4033 res1 = __msa_aver_u_b(avg1, res1);
4035 horiz5, horiz6, horiz7, horiz8,
4036 horiz5, horiz4, horiz3, horiz2,
4037 horiz6, horiz7, horiz8, horiz8,
4038 const20, const6, const3);
4039 ST_D2(res1, 0, 1, dst, dst_stride);
4040 dst += 2 * dst_stride;
/* Average with the horiz4/horiz5 pair. */
4042 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4043 res0 = __msa_aver_u_b(avg0, res0);
4045 horiz7, horiz8, horiz8, horiz7,
4046 horiz7, horiz6, horiz5, horiz4,
4047 horiz8, horiz8, horiz7, horiz6,
4048 const20, const6, const3);
/* Average with the horiz6/horiz7 pair, then the final four-row store. */
4049 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4050 res1 = __msa_aver_u_b(avg1, res1);
4051 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/*
 * Body fragment of a 16-byte-wide row filter loop.  The signature, the
 * declaration of res/loop_count, the opening line of each macro invocation,
 * the stores and the closing brace of the loop are not part of this excerpt.
 * Averages here use __msa_aver_u_b, the rounding unsigned byte average.
 */
4061 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
/* Byte-reversal index table (15 down to 0). */
4063 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
/* 6 and 3 are replicated per byte; 20 is replicated per halfword. */
4064 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4065 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4066 v8u16 const20 = (v8u16) __msa_ldi_h(20);
/* height / 4 iterations; each loads four rows from src (even-numbered inp
 * vectors) and four from src + 1 (odd-numbered), then steps src four rows. */
4068 for (loop_count = (
height >> 2); loop_count--;) {
4069 LD_UB4(
src, src_stride, inp0, inp2, inp4, inp6);
4070 LD_UB4((
src + 1), src_stride, inp1, inp3, inp5, inp7);
4071 src += (4 * src_stride);
/* Each filter result is averaged with the row loaded from src + 1. */
4073 const20, const6, const3);
4074 res = __msa_aver_u_b(res, inp1);
4079 const20, const6, const3);
4080 res = __msa_aver_u_b(res, inp3);
4085 const20, const6, const3);
4086 res = __msa_aver_u_b(res, inp5);
4091 const20, const6, const3);
4092 res = __msa_aver_u_b(res, inp7);
/* NOTE(review): presumably a trailing row handled after the loop; the loop's
 * closing brace is not visible here -- confirm. */
4099 res = __msa_aver_u_b(inp1, res);
/*
 * Body fragment of an 8-byte-wide two-pass filter routine.  The signature,
 * some row loads and the opening line of every multi-line macro invocation
 * are not part of this excerpt; lines holding only an argument list are
 * continuations of those elided calls.
 * Every average uses __msa_aver_u_b (rounding).  Input rows are slid by one
 * byte before the first-stage average, and the second-stage results are
 * averaged with the horiz pairs (0,1), (2,3), (4,5) and (6,7).
 */
4119 v16u8 inp0, inp1, inp2, inp3;
4120 v16u8 res0, res1, avg0, avg1;
4121 v16u8 horiz0, horiz1, horiz2, horiz3;
4122 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte shuffle index tables (NOTE(review): no use is visible in this
 * fragment; presumably passed on the elided opening lines). */
4123 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4124 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4125 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4126 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Coefficients 20, 6 and 3 replicated into every byte lane. */
4127 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4128 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4129 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load the first four rows in one go and step src past them. */
4131 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
4132 src += (4 * src_stride);
4134 const20, const6, const3);
4136 const20, const6, const3);
/* NOTE(review): SLDI_B2_UB is a project macro, presumably a one-byte slide
 * of each row -- confirm.  __msa_insve_d copies doubleword 0 of the second
 * row into doubleword 1 of the first before the average. */
4137 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4139 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4140 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated (second row of the pair). */
4141 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4142 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4144 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4145 horiz2 = __msa_aver_u_b(inp2, res1);
4146 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Next four rows, processed the same way. */
4147 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
4148 src += (4 * src_stride);
4150 const20, const6, const3);
4152 const20, const6, const3);
4153 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4155 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4156 horiz4 = __msa_aver_u_b(inp0, res0);
4157 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4158 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4160 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4161 horiz6 = __msa_aver_u_b(inp2, res1);
4162 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Continuation of an elided macro call over the horiz rows (result
 * presumably assigned to res0 -- confirm). */
4164 horiz1, horiz2, horiz3, horiz4,
4165 horiz1, horiz0, horiz0, horiz1,
4166 horiz2, horiz3, horiz4, horiz5,
4167 const20, const6, const3);
/* Average with the horiz0/horiz1 pair. */
4168 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4169 res0 = __msa_aver_u_b(avg0, res0);
4171 horiz3, horiz4, horiz5, horiz6,
4172 horiz3, horiz2, horiz1, horiz0,
4173 horiz4, horiz5, horiz6, horiz7,
4174 const20, const6, const3);
/* NOTE(review): ST_D2 is a project macro, presumably storing doublewords 0
 * and 1 of res0 to two dst rows; dst then moves down two rows. */
4175 ST_D2(res0, 0, 1, dst, dst_stride);
4176 dst += 2 * dst_stride;
4180 const20, const6, const3);
/* Average with the horiz2/horiz3 pair. */
4181 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4182 res1 = __msa_aver_u_b(avg1, res1);
/* horiz8: inp0 rotated by one byte (lane i takes byte i + 1), averaged with
 * res0. */
4183 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4184 horiz8 = __msa_aver_u_b(inp0, res0);
4186 horiz5, horiz6, horiz7, horiz8,
4187 horiz5, horiz4, horiz3, horiz2,
4188 horiz6, horiz7, horiz8, horiz8,
4189 const20, const6, const3);
4190 ST_D2(res1, 0, 1, dst, dst_stride);
4191 dst += 2 * dst_stride;
/* Average with the horiz4/horiz5 pair. */
4193 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4194 res0 = __msa_aver_u_b(avg0, res0);
4196 horiz7, horiz8, horiz8, horiz7,
4197 horiz7, horiz6, horiz5, horiz4,
4198 horiz8, horiz8, horiz7, horiz6,
4199 const20, const6, const3);
/* Average with the horiz6/horiz7 pair, then the final four-row store. */
4200 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4201 res1 = __msa_aver_u_b(avg1, res1);
4202 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/*
 * Body fragment of an 8-byte-wide two-pass filter routine.  The signature,
 * the row loads, the res0/res1 declaration and the opening line of every
 * multi-line macro invocation are not part of this excerpt; lines holding
 * only an argument list are continuations of those elided calls.
 * Each packed input row pair is averaged with its filter result using
 * __msa_aver_u_b (rounding average); no further average is visible before
 * the stores.
 */
4221 v16u8 inp0, inp1, inp2, inp3;
4223 v16u8 horiz0, horiz1, horiz2, horiz3;
4224 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte shuffle index tables (NOTE(review): no use is visible in this
 * fragment; presumably passed on the elided opening lines). */
4225 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4226 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4227 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4228 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Coefficients 20, 6 and 3 replicated into every byte lane. */
4229 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4230 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4231 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* First row pair: __msa_ilvr_d puts doubleword 0 of inp0 in the low half and
 * doubleword 0 of inp1 in the high half before averaging with res0. */
4234 src += (2 * src_stride);
4236 const20, const6, const3);
4237 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4238 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated (second row of the pair). */
4239 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Second row pair. */
4242 src += (2 * src_stride);
4244 const20, const6, const3);
4245 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4246 horiz2 = __msa_aver_u_b(inp2, res1);
4247 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Third row pair. */
4249 src += (2 * src_stride);
4251 const20, const6, const3);
4252 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4253 horiz4 = __msa_aver_u_b(inp0, res0);
4254 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Continuation of an elided macro call over the horiz rows (result
 * presumably assigned to res0 -- confirm). */
4256 horiz1, horiz2, horiz3, horiz4,
4257 horiz1, horiz0, horiz0, horiz1,
4258 horiz2, horiz3, horiz4, horiz5,
4259 const20, const6, const3);
/* NOTE(review): ST_D2 is a project macro, presumably storing doublewords 0
 * and 1 of res0 to two dst rows; dst then moves down two rows. */
4260 ST_D2(res0, 0, 1, dst, dst_stride);
4261 dst += (2 * dst_stride);
/* Fourth row pair. */
4264 src += (2 * src_stride);
4266 const20, const6, const3);
4267 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4268 horiz6 = __msa_aver_u_b(inp2, res1);
4269 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4271 horiz3, horiz4, horiz5, horiz6,
4272 horiz3, horiz2, horiz1, horiz0,
4273 horiz4, horiz5, horiz6, horiz7,
4274 const20, const6, const3);
4277 const20, const6, const3);
/* horiz8: inp0 is used as loaded (no packing) and averaged with res0. */
4278 horiz8 = __msa_aver_u_b(inp0, res0);
4280 horiz5, horiz6, horiz7, horiz8,
4281 horiz5, horiz4, horiz3, horiz2,
4282 horiz6, horiz7, horiz8, horiz8,
4283 const20, const6, const3);
4284 ST_D2(res1, 0, 1, dst, dst_stride);
4285 dst += 2 * dst_stride;
4288 horiz7, horiz8, horiz8, horiz7,
4289 horiz7, horiz6, horiz5, horiz4,
4290 horiz8, horiz8, horiz7, horiz6,
4291 const20, const6, const3);
/* Final four-row store: res0 first, then res1. */
4292 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/*
 * Body fragment of an 8-byte-wide two-pass filter routine.  The signature,
 * the row loads, the res0/res1 declaration and the opening line of every
 * multi-line macro invocation are not part of this excerpt; lines holding
 * only an argument list are continuations of those elided calls.
 * No averaging step is visible in this fragment: horiz0/2/4/6/8 are
 * presumably assigned directly on the elided opening lines -- confirm.
 */
4309 v16u8 inp0, inp1, inp2, inp3;
4311 v16u8 horiz0, horiz1, horiz2, horiz3;
4312 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte shuffle index tables forwarded to the macro calls below. */
4313 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4314 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4315 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4316 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Coefficients 20, 6 and 3 replicated into every byte lane. */
4317 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4318 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4319 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* First row pair; horiz1 = doubleword 1 of horiz0 replicated. */
4322 src += (2 * src_stride);
4324 mask0, mask1, mask2, mask3,
4325 const20, const6, const3);
4326 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Second row pair. */
4328 src += (2 * src_stride);
4330 mask0, mask1, mask2, mask3,
4331 const20, const6, const3);
4332 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Third row pair. */
4334 src += (2 * src_stride);
4336 mask0, mask1, mask2, mask3,
4337 const20, const6, const3);
4338 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Continuation of an elided macro call over the horiz rows (result
 * presumably assigned to res0 -- confirm). */
4340 horiz1, horiz2, horiz3, horiz4,
4341 horiz1, horiz0, horiz0, horiz1,
4342 horiz2, horiz3, horiz4, horiz5,
4343 const20, const6, const3);
/* NOTE(review): ST_D2 is a project macro, presumably storing doublewords 0
 * and 1 of res0 to two dst rows; dst then moves down two rows. */
4344 ST_D2(res0, 0, 1, dst, dst_stride);
4345 dst += (2 * dst_stride);
/* Fourth row pair. */
4348 src += (2 * src_stride);
4350 mask0, mask1, mask2, mask3,
4351 const20, const6, const3);
4352 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4354 horiz3, horiz4, horiz5, horiz6,
4355 horiz3, horiz2, horiz1, horiz0,
4356 horiz4, horiz5, horiz6, horiz7,
4357 const20, const6, const3);
4360 mask0, mask1, mask2, mask3,
4361 const20, const6, const3);
4362 ST_D2(res1, 0, 1, dst, dst_stride);
4363 dst += 2 * dst_stride;
4366 horiz5, horiz6, horiz7, horiz8,
4367 horiz5, horiz4, horiz3, horiz2,
4368 horiz6, horiz7, horiz8, horiz8,
4369 const20, const6, const3);
4371 horiz7, horiz8, horiz8, horiz7,
4372 horiz7, horiz6, horiz5, horiz4,
4373 horiz8, horiz8, horiz7, horiz6,
4374 const20, const6, const3);
/* Final four-row store: res0 first, then res1. */
4375 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/*
 * Body fragment of an 8-byte-wide two-pass filter routine.  The signature,
 * some row loads, the res0/res1 declaration and the opening line of every
 * multi-line macro invocation are not part of this excerpt; lines holding
 * only an argument list are continuations of those elided calls.
 * Each input row pair is slid by one byte, packed and averaged with its
 * filter result using __msa_aver_u_b (rounding average); no further average
 * is visible before the stores.
 */
4394 v16u8 inp0, inp1, inp2, inp3;
4396 v16u8 horiz0, horiz1, horiz2, horiz3;
4397 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte shuffle index tables (NOTE(review): no use is visible in this
 * fragment; presumably passed on the elided opening lines). */
4398 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4399 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4400 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4401 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Coefficients 20, 6 and 3 replicated into every byte lane. */
4402 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4403 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4404 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load the first four rows in one go and step src past them. */
4406 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
4407 src += (4 * src_stride);
4410 const20, const6, const3);
4412 const20, const6, const3);
/* NOTE(review): SLDI_B2_UB is a project macro, presumably a one-byte slide
 * of each row -- confirm.  __msa_insve_d copies doubleword 0 of the second
 * row into doubleword 1 of the first before the average. */
4413 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4415 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4416 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated (second row of the pair). */
4417 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4418 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4420 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4421 horiz2 = __msa_aver_u_b(inp2, res1);
4422 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Next four rows, processed the same way. */
4423 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
4424 src += (4 * src_stride);
4426 const20, const6, const3);
4428 const20, const6, const3);
4429 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4431 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4432 horiz4 = __msa_aver_u_b(inp0, res0);
4433 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4434 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4436 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4437 horiz6 = __msa_aver_u_b(inp2, res1);
4438 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4441 const20, const6, const3);
/* horiz8: inp0 rotated by one byte (lane i takes byte i + 1), averaged with
 * res0. */
4442 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4443 horiz8 = __msa_aver_u_b(inp0, res0);
/* Continuations of two elided macro calls over the horiz rows (results
 * presumably assigned to res0 and res1 -- confirm). */
4445 horiz1, horiz2, horiz3, horiz4,
4446 horiz1, horiz0, horiz0, horiz1,
4447 horiz2, horiz3, horiz4, horiz5,
4448 const20, const6, const3);
4450 horiz3, horiz4, horiz5, horiz6,
4451 horiz3, horiz2, horiz1, horiz0,
4452 horiz4, horiz5, horiz6, horiz7,
4453 const20, const6, const3);
/* NOTE(review): ST_D4 is a project macro, presumably writing four 8-byte
 * rows (res0 then res1); dst then moves down four rows. */
4454 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4455 dst += (4 * dst_stride);
4458 horiz5, horiz6, horiz7, horiz8,
4459 horiz5, horiz4, horiz3, horiz2,
4460 horiz6, horiz7, horiz8, horiz8,
4461 const20, const6, const3);
4463 horiz7, horiz8, horiz8, horiz7,
4464 horiz7, horiz6, horiz5, horiz4,
4465 horiz8, horiz8, horiz7, horiz6,
4466 const20, const6, const3);
/* Second and final four-row store. */
4467 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/*
 * NOTE(review): the signature of this routine and the first lines of its
 * filter-macro invocations are outside the visible lines; this summary only
 * covers what is shown.
 *
 * Visible data flow:
 *  - each pair of source rows is merged with a doubleword interleave
 *    (no byte shift) and byte-averaged with res0/res1 to form
 *    horiz0..horiz8;
 *  - each second-stage result is additionally averaged with a pair of
 *    horizontal rows (horiz1/horiz2, horiz3/horiz4, horiz5/horiz6,
 *    horiz7/horiz8) before being stored.
 */
4486 v16u8 inp0, inp1, inp2, inp3;
4487 v16u8 res0, res1, avg0, avg1;
4488 v16u8 horiz0, horiz1, horiz2, horiz3;
4489 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables; presumably the shuffle masks handed to the horizontal
 * filter macro (their use is not visible in this body) -- TODO confirm. */
4490 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4491 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4492 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4493 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter coefficients 20, 6 and 3 replicated into every byte lane. */
4494 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4495 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4496 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4498 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
4499 src += (4 * src_stride);
4502 const20, const6, const3);
4504 const20, const6, const3);
/* Merge the two unshifted rows into one vector and average with res0. */
4505 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4506 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated across the vector. */
4507 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4508 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4509 horiz2 = __msa_aver_u_b(inp2, res1);
4510 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4512 src += (2 * src_stride);
4515 const20, const6, const3);
4516 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4517 horiz4 = __msa_aver_u_b(inp0, res0);
4518 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4520 horiz1, horiz2, horiz3, horiz4,
4521 horiz1, horiz0, horiz0, horiz1,
4522 horiz2, horiz3, horiz4, horiz5,
4523 const20, const6, const3);
/* Blend the second-stage result with rows horiz1/horiz2 before storing. */
4524 avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4525 res0 = __msa_aver_u_b(avg0, res0);
4526 ST_D2(res0, 0, 1, dst, dst_stride);
4527 dst += (2 * dst_stride);
4530 src += (2 * src_stride);
4532 const20, const6, const3);
4533 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4534 horiz6 = __msa_aver_u_b(inp2, res1);
4535 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4538 const20, const6, const3);
/* Ninth row: inp0 is averaged as-is (no interleave with a second row). */
4539 horiz8 = __msa_aver_u_b(inp0, res0);
4541 horiz3, horiz4, horiz5, horiz6,
4542 horiz3, horiz2, horiz1, horiz0,
4543 horiz4, horiz5, horiz6, horiz7,
4544 const20, const6, const3);
4545 avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4546 res1 = __msa_aver_u_b(avg1, res1);
4548 horiz5, horiz6, horiz7, horiz8,
4549 horiz5, horiz4, horiz3, horiz2,
4550 horiz6, horiz7, horiz8, horiz8,
4551 const20, const6, const3);
4552 ST_D2(res1, 0, 1, dst, dst_stride);
4553 dst += 2 * dst_stride;
4555 avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4556 res0 = __msa_aver_u_b(avg0, res0);
4558 horiz7, horiz8, horiz8, horiz7,
4559 horiz7, horiz6, horiz5, horiz4,
4560 horiz8, horiz8, horiz7, horiz6,
4561 const20, const6, const3);
4562 avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4563 res1 = __msa_aver_u_b(avg1, res1);
4564 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/*
 * NOTE(review): the signature of this routine and the first lines of its
 * filter-macro invocations are outside the visible lines; this summary only
 * covers what is shown.
 *
 * Visible data flow:
 *  - no source averaging is visible in the first stage: the even horizN
 *    vectors are presumably assigned directly by the invocations that end in
 *    "mask0, mask1, mask2, mask3, const20, const6, const3)", and the odd
 *    ones are doubleword 1 of their predecessor;
 *  - each second-stage result is averaged with a pair of horizontal rows
 *    (horiz1/horiz2, horiz3/horiz4, horiz5/horiz6, horiz7/horiz8) before
 *    being stored.
 */
4583 v16u8 inp0, inp1, inp2, inp3;
4584 v16u8 res0, res1, avg0, avg1;
4585 v16u8 horiz0, horiz1, horiz2, horiz3;
4586 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables passed (as mask0..mask3) to the invocations below. */
4587 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4588 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4589 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4590 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter coefficients 20, 6 and 3 replicated into every byte lane. */
4591 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4592 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4593 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4596 src += (2 * src_stride);
4598 mask0, mask1, mask2, mask3,
4599 const20, const6, const3);
/* horiz1 = doubleword 1 of horiz0 replicated across the vector. */
4600 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4602 src += (2 * src_stride);
4604 mask0, mask1, mask2, mask3,
4605 const20, const6, const3);
4606 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4608 src += (2 * src_stride);
4610 mask0, mask1, mask2, mask3,
4611 const20, const6, const3);
4612 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* NOTE(review): the next statement repeats the previous one verbatim; the
 * second assignment writes the same value again and looks redundant. */
4613 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4615 horiz1, horiz2, horiz3, horiz4,
4616 horiz1, horiz0, horiz0, horiz1,
4617 horiz2, horiz3, horiz4, horiz5,
4618 const20, const6, const3);
/* Blend the second-stage result with rows horiz1/horiz2 before storing. */
4619 avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4620 res0 = __msa_aver_u_b(avg0, res0);
4621 ST_D2(res0, 0, 1, dst, dst_stride);
4622 dst += (2 * dst_stride);
4625 src += (2 * src_stride);
4627 mask0, mask1, mask2, mask3,
4628 const20, const6, const3);
4629 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4631 horiz3, horiz4, horiz5, horiz6,
4632 horiz3, horiz2, horiz1, horiz0,
4633 horiz4, horiz5, horiz6, horiz7,
4634 const20, const6, const3);
4637 mask0, mask1, mask2, mask3,
4638 const20, const6, const3);
4639 avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4640 res1 = __msa_aver_u_b(avg1, res1);
4642 horiz5, horiz6, horiz7, horiz8,
4643 horiz5, horiz4, horiz3, horiz2,
4644 horiz6, horiz7, horiz8, horiz8,
4645 const20, const6, const3);
4646 ST_D2(res1, 0, 1, dst, dst_stride);
4647 dst += 2 * dst_stride;
4648 avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4649 res0 = __msa_aver_u_b(avg0, res0);
4652 horiz7, horiz8, horiz8, horiz7,
4653 horiz7, horiz6, horiz5, horiz4,
4654 horiz8, horiz8, horiz7, horiz6,
4655 const20, const6, const3);
4656 avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4657 res1 = __msa_aver_u_b(avg1, res1);
4658 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/*
 * NOTE(review): the signature of this routine and the first lines of its
 * filter-macro invocations are outside the visible lines; this summary only
 * covers what is shown.
 *
 * Visible data flow:
 *  - each source row pair is slid by one byte (SLDI_B2_UB ..., 1), merged
 *    into one vector and byte-averaged with res0/res1 to form
 *    horiz0..horiz8;
 *  - each second-stage result is averaged with a pair of horizontal rows
 *    (horiz1/horiz2, horiz3/horiz4, horiz5/horiz6, horiz7/horiz8) before
 *    being stored.
 */
4676 v16u8 inp0, inp1, inp2, inp3;
4677 v16u8 res0, res1, avg0, avg1;
4678 v16u8 horiz0, horiz1, horiz2, horiz3;
4679 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables passed (as mask0..mask3) to the invocation below. */
4680 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4681 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4682 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4683 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter coefficients 20, 6 and 3 replicated into every byte lane. */
4684 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4685 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4686 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4688 LD_UB4(
src, src_stride, inp0, inp1, inp2, inp3);
4689 src += (4 * src_stride);
4691 mask0, mask1, mask2, mask3,
4692 const20, const6, const3);
/* Slide both rows by one byte, merge inp1 into doubleword 1 of inp0, then
 * average the shifted source with res0. */
4693 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4695 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4696 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated across the vector. */
4697 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4699 const20, const6, const3);
4700 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4702 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4703 horiz2 = __msa_aver_u_b(inp2, res1);
4704 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4706 src += (2 * src_stride);
4708 const20, const6, const3);
4709 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4711 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4712 horiz4 = __msa_aver_u_b(inp0, res0);
4713 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4715 horiz1, horiz2, horiz3, horiz4,
4716 horiz1, horiz0, horiz0, horiz1,
4717 horiz2, horiz3, horiz4, horiz5,
4718 const20, const6, const3);
/* Blend the second-stage result with rows horiz1/horiz2 before storing. */
4719 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
4720 res0 = __msa_aver_u_b(avg0, res0);
4722 src += (2 * src_stride);
4723 ST_D2(res0, 0, 1, dst, dst_stride);
4724 dst += 2 * dst_stride;
4727 const20, const6, const3);
4728 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4730 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4731 horiz6 = __msa_aver_u_b(inp2, res1);
4732 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4734 horiz3, horiz4, horiz5, horiz6,
4735 horiz3, horiz2, horiz1, horiz0,
4736 horiz4, horiz5, horiz6, horiz7,
4737 const20, const6, const3);
4738 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
4739 res1 = __msa_aver_u_b(avg1, res1);
4742 const20, const6, const3);
/* Ninth row: only a single vector is shifted and averaged here. */
4743 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4744 horiz8 = __msa_aver_u_b(inp0, res0);
4746 horiz5, horiz6, horiz7, horiz8,
4747 horiz5, horiz4, horiz3, horiz2,
4748 horiz6, horiz7, horiz8, horiz8,
4749 const20, const6, const3);
4750 ST_D2(res1, 0, 1, dst, dst_stride);
4751 dst += 2 * dst_stride;
4753 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
4754 res0 = __msa_aver_u_b(avg0, res0);
4756 horiz7, horiz8, horiz8, horiz7,
4757 horiz7, horiz6, horiz5, horiz4,
4758 horiz8, horiz8, horiz7, horiz6,
4759 const20, const6, const3);
4760 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
4761 res1 = __msa_aver_u_b(avg1, res1);
4762 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
/*
 * NOTE(review): the signature of this routine, the declaration of dst0/dst1
 * and the first lines of its filter-macro invocations are outside the
 * visible lines; this summary only covers what is shown.
 *
 * Visible data flow:
 *  - each source row pair is merged with a doubleword interleave (no byte
 *    shift) and byte-averaged with res0/res1 to form horiz0..horiz8;
 *  - each second-stage result is averaged with a pair of horizontal rows
 *    (horiz0/horiz1, horiz2/horiz3, horiz4/horiz5, horiz6/horiz7);
 *  - that value is then averaged with the two rows already present in dst
 *    (LD_UB2 from dst) and written back with ST_D2.
 */
4781 v16u8 inp0, inp1, inp2, inp3;
4782 v16u8 res0, res1, avg0, avg1;
4783 v16u8 horiz0, horiz1, horiz2, horiz3;
4784 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables; presumably the shuffle masks handed to the horizontal
 * filter macro (their use is not visible in this body) -- TODO confirm. */
4786 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4787 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4788 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4789 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter coefficients 20, 6 and 3 replicated into every byte lane. */
4790 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4791 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4792 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4795 src += (2 * src_stride);
4797 const20, const6, const3);
4799 src += (2 * src_stride);
/* Merge the two unshifted rows into one vector and average with res0. */
4800 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4801 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated across the vector. */
4802 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4804 const20, const6, const3);
4806 src += (2 * src_stride);
4807 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4808 horiz2 = __msa_aver_u_b(inp2, res1);
4809 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4811 const20, const6, const3);
4812 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4813 horiz4 = __msa_aver_u_b(inp0, res0);
4814 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Fetch the two destination rows that will be blended into the output. */
4815 LD_UB2(dst, dst_stride, dst0, dst1);
4816 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4818 horiz1, horiz2, horiz3, horiz4,
4819 horiz1, horiz0, horiz0, horiz1,
4820 horiz2, horiz3, horiz4, horiz5,
4821 const20, const6, const3);
/* First average: with rows horiz0/horiz1; second: with existing dst rows. */
4822 res0 = __msa_aver_u_b(avg0, res0);
4823 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4824 res0 = __msa_aver_u_b(avg0, res0);
4825 ST_D2(res0, 0, 1, dst, dst_stride);
4826 dst += (2 * dst_stride);
4829 src += (2 * src_stride);
4831 const20, const6, const3);
4832 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4833 horiz6 = __msa_aver_u_b(inp2, res1);
4834 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4835 LD_UB2(dst, dst_stride, dst0, dst1);
4836 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4838 horiz3, horiz4, horiz5, horiz6,
4839 horiz3, horiz2, horiz1, horiz0,
4840 horiz4, horiz5, horiz6, horiz7,
4841 const20, const6, const3);
4842 res1 = __msa_aver_u_b(avg1, res1);
4843 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4844 res1 = __msa_aver_u_b(avg1, res1);
4845 ST_D2(res1, 0, 1, dst, dst_stride);
4846 dst += (2 * dst_stride);
4850 const20, const6, const3);
/* Ninth row: inp0 is averaged as-is (no interleave with a second row). */
4851 horiz8 = __msa_aver_u_b(inp0, res0);
4852 LD_UB2(dst, dst_stride, dst0, dst1);
4853 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4855 horiz5, horiz6, horiz7, horiz8,
4856 horiz5, horiz4, horiz3, horiz2,
4857 horiz6, horiz7, horiz8, horiz8,
4858 const20, const6, const3);
4859 res0 = __msa_aver_u_b(avg0, res0);
4860 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4861 res0 = __msa_aver_u_b(avg0, res0);
4862 ST_D2(res0, 0, 1, dst, dst_stride);
4863 dst += (2 * dst_stride);
4865 LD_UB2(dst, dst_stride, dst0, dst1);
4866 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4868 horiz7, horiz8, horiz8, horiz7,
4869 horiz7, horiz6, horiz5, horiz4,
4870 horiz8, horiz8, horiz7, horiz6,
4871 const20, const6, const3);
4872 res1 = __msa_aver_u_b(avg1, res1);
4873 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4874 res1 = __msa_aver_u_b(avg1, res1);
4875 ST_D2(res1, 0, 1, dst, dst_stride);
/*
 * NOTE(review): the signature of this routine, the declaration of dst0/dst1
 * and the first lines of its filter-macro invocations are outside the
 * visible lines; this summary only covers what is shown.
 *
 * Visible data flow:
 *  - no source averaging is visible in the first stage: the even horizN
 *    vectors are presumably assigned directly by the invocations that end in
 *    "mask0, mask1, mask2, mask3, const20, const6, const3)", and the odd
 *    ones are doubleword 1 of their predecessor;
 *  - each second-stage result is averaged with a pair of horizontal rows
 *    (horiz0/horiz1, horiz2/horiz3, horiz4/horiz5, horiz6/horiz7);
 *  - that value is then averaged with the two rows already present in dst
 *    (LD_UB2 from dst) and written back with ST_D2.
 */
4894 v16u8 inp0, inp1, inp2, inp3;
4895 v16u8 res0, res1, avg0, avg1;
4896 v16u8 horiz0, horiz1, horiz2, horiz3;
4897 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables passed (as mask0..mask3) to the invocations below. */
4899 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4900 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4901 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4902 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter coefficients 20, 6 and 3 replicated into every byte lane. */
4903 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4904 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4905 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4908 src += (2 * src_stride);
4910 mask0, mask1, mask2, mask3,
4911 const20, const6, const3);
4913 src += (2 * src_stride);
/* horiz1 = doubleword 1 of horiz0 replicated across the vector. */
4914 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4916 mask0, mask1, mask2, mask3,
4917 const20, const6, const3);
4919 src += (2 * src_stride);
4920 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4922 mask0, mask1, mask2, mask3,
4923 const20, const6, const3);
4924 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Fetch the two destination rows that will be blended into the output. */
4925 LD_UB2(dst, dst_stride, dst0, dst1);
4926 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4928 horiz1, horiz2, horiz3, horiz4,
4929 horiz1, horiz0, horiz0, horiz1,
4930 horiz2, horiz3, horiz4, horiz5,
4931 const20, const6, const3);
/* First average: with rows horiz0/horiz1; second: with existing dst rows. */
4932 res0 = __msa_aver_u_b(avg0, res0);
4933 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4934 res0 = __msa_aver_u_b(avg0, res0);
4935 ST_D2(res0, 0, 1, dst, dst_stride);
4936 dst += (2 * dst_stride);
4939 src += (2 * src_stride);
4941 mask0, mask1, mask2, mask3,
4942 const20, const6, const3);
4943 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4944 LD_UB2(dst, dst_stride, dst0, dst1);
4945 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4947 horiz3, horiz4, horiz5, horiz6,
4948 horiz3, horiz2, horiz1, horiz0,
4949 horiz4, horiz5, horiz6, horiz7,
4950 const20, const6, const3);
4951 res1 = __msa_aver_u_b(avg1, res1);
4952 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4953 res1 = __msa_aver_u_b(avg1, res1);
4954 ST_D2(res1, 0, 1, dst, dst_stride);
4955 dst += (2 * dst_stride);
4959 mask0, mask1, mask2, mask3,
4960 const20, const6, const3);
4961 LD_UB2(dst, dst_stride, dst0, dst1);
4962 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4964 horiz5, horiz6, horiz7, horiz8,
4965 horiz5, horiz4, horiz3, horiz2,
4966 horiz6, horiz7, horiz8, horiz8,
4967 const20, const6, const3);
4968 res0 = __msa_aver_u_b(avg0, res0);
4969 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4970 res0 = __msa_aver_u_b(avg0, res0);
4971 ST_D2(res0, 0, 1, dst, dst_stride);
4972 dst += (2 * dst_stride);
4974 LD_UB2(dst, dst_stride, dst0, dst1);
4975 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4977 horiz7, horiz8, horiz8, horiz7,
4978 horiz7, horiz6, horiz5, horiz4,
4979 horiz8, horiz8, horiz7, horiz6,
4980 const20, const6, const3);
4981 res1 = __msa_aver_u_b(avg1, res1);
4982 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4983 res1 = __msa_aver_u_b(avg1, res1);
4984 ST_D2(res1, 0, 1, dst, dst_stride);
/*
 * NOTE(review): the signature of this routine, the declaration of dst0/dst1
 * and the first lines of its filter-macro invocations are outside the
 * visible lines; this summary only covers what is shown.
 *
 * Visible data flow:
 *  - each source row pair is slid by one byte (SLDI_B2_UB ..., 1), merged
 *    with a doubleword interleave and byte-averaged with res0/res1 to form
 *    horiz0..horiz8;
 *  - each second-stage result is averaged with a pair of horizontal rows
 *    (horiz0/horiz1, horiz2/horiz3, horiz4/horiz5, horiz6/horiz7);
 *  - that value is then averaged with the two rows already present in dst
 *    (LD_UB2 from dst) and written back with ST_D2.
 */
5003 v16u8 inp0, inp1, inp2, inp3;
5004 v16u8 res0, res1, avg0, avg1;
5005 v16u8 horiz0, horiz1, horiz2, horiz3;
5006 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables; presumably the shuffle masks handed to the horizontal
 * filter macro (their use is not visible in this body) -- TODO confirm. */
5008 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5009 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5010 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5011 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter coefficients 20, 6 and 3 replicated into every byte lane. */
5012 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5013 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5014 v16u8 const3 = (v16u8) __msa_ldi_b(3);
5017 src += (2 * src_stride);
5019 const20, const6, const3);
5022 src += (2 * src_stride);
/* Slide both rows by one byte, merge them into one vector, then average the
 * shifted source with res0. */
5023 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5025 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5026 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated across the vector. */
5027 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5029 const20, const6, const3);
5031 src += (2 * src_stride);
5032 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5034 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5035 horiz2 = __msa_aver_u_b(inp2, res1);
5036 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5038 const20, const6, const3);
5040 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5042 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5043 horiz4 = __msa_aver_u_b(inp0, res0);
5044 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Fetch the two destination rows that will be blended into the output. */
5045 LD_UB2(dst, dst_stride, dst0, dst1);
5046 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
5048 horiz1, horiz2, horiz3, horiz4,
5049 horiz1, horiz0, horiz0, horiz1,
5050 horiz2, horiz3, horiz4, horiz5,
5051 const20, const6, const3);
/* First average: with rows horiz0/horiz1; second: with existing dst rows. */
5052 res0 = __msa_aver_u_b(avg0, res0);
5053 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5054 res0 = __msa_aver_u_b(avg0, res0);
5055 ST_D2(res0, 0, 1, dst, dst_stride);
5056 dst += (2 * dst_stride);
5059 src += (2 * src_stride);
5061 const20, const6, const3);
5063 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5065 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5066 horiz6 = __msa_aver_u_b(inp2, res1);
5067 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5068 LD_UB2(dst, dst_stride, dst0, dst1);
5069 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
5071 horiz3, horiz4, horiz5, horiz6,
5072 horiz3, horiz2, horiz1, horiz0,
5073 horiz4, horiz5, horiz6, horiz7,
5074 const20, const6, const3);
5075 res1 = __msa_aver_u_b(avg1, res1);
5076 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5077 res1 = __msa_aver_u_b(avg1, res1);
5078 ST_D2(res1, 0, 1, dst, dst_stride);
5079 dst += (2 * dst_stride);
5083 const20, const6, const3);
/* Ninth row: only a single vector is shifted and averaged here. */
5084 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5085 horiz8 = __msa_aver_u_b(inp0, res0);
5086 LD_UB2(dst, dst_stride, dst0, dst1);
5087 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
5089 horiz5, horiz6, horiz7, horiz8,
5090 horiz5, horiz4, horiz3, horiz2,
5091 horiz6, horiz7, horiz8, horiz8,
5092 const20, const6, const3);
5093 res0 = __msa_aver_u_b(avg0, res0);
5094 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5095 res0 = __msa_aver_u_b(avg0, res0);
5096 ST_D2(res0, 0, 1, dst, dst_stride);
5097 dst += (2 * dst_stride);
5099 LD_UB2(dst, dst_stride, dst0, dst1);
5100 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
5102 horiz7, horiz8, horiz8, horiz7,
5103 horiz7, horiz6, horiz5, horiz4,
5104 horiz8, horiz8, horiz7, horiz6,
5105 const20, const6, const3);
5106 res1 = __msa_aver_u_b(avg1, res1);
5107 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5108 res1 = __msa_aver_u_b(avg1, res1);
5109 ST_D2(res1, 0, 1, dst, dst_stride);
/*
 * NOTE(review): the signature of this routine, the declaration of dst0/dst1
 * and the first lines of its filter-macro invocations are outside the
 * visible lines; this summary only covers what is shown.
 *
 * Visible data flow:
 *  - each source row pair is merged with a doubleword interleave (no byte
 *    shift) and byte-averaged with res0/res1 to form horiz0..horiz8;
 *  - the second-stage results are not averaged with any horizN rows;
 *  - each result is averaged once with the two rows already present in dst
 *    (LD_UB2 from dst) and written back with ST_D2.
 */
5128 v16u8 inp0, inp1, inp2, inp3;
5129 v16u8 res0, res1, avg0, avg1;
5130 v16u8 horiz0, horiz1, horiz2, horiz3;
5131 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables; presumably the shuffle masks handed to the horizontal
 * filter macro (their use is not visible in this body) -- TODO confirm. */
5133 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5134 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5135 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5136 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter coefficients 20, 6 and 3 replicated into every byte lane. */
5137 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5138 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5139 v16u8 const3 = (v16u8) __msa_ldi_b(3);
5142 src += (2 * src_stride);
5144 const20, const6, const3);
5146 src += (2 * src_stride);
/* Merge the two unshifted rows into one vector and average with res0. */
5147 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5148 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated across the vector. */
5149 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5151 const20, const6, const3);
5153 src += (2 * src_stride);
5154 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5155 horiz2 = __msa_aver_u_b(inp2, res1);
5156 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5158 const20, const6, const3);
5159 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5160 horiz4 = __msa_aver_u_b(inp0, res0);
5161 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Fetch the two destination rows that will be blended into the output. */
5162 LD_UB2(dst, dst_stride, dst0, dst1);
5164 horiz1, horiz2, horiz3, horiz4,
5165 horiz1, horiz0, horiz0, horiz1,
5166 horiz2, horiz3, horiz4, horiz5,
5167 const20, const6, const3);
/* Average the second-stage result with the existing dst rows, then store. */
5168 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5169 res0 = __msa_aver_u_b(avg0, res0);
5170 ST_D2(res0, 0, 1, dst, dst_stride);
5171 dst += (2 * dst_stride);
5174 src += (2 * src_stride);
5176 const20, const6, const3);
5177 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5178 horiz6 = __msa_aver_u_b(inp2, res1);
5179 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5180 LD_UB2(dst, dst_stride, dst0, dst1);
5182 horiz3, horiz4, horiz5, horiz6,
5183 horiz3, horiz2, horiz1, horiz0,
5184 horiz4, horiz5, horiz6, horiz7,
5185 const20, const6, const3);
5186 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5187 res1 = __msa_aver_u_b(avg1, res1);
5188 ST_D2(res1, 0, 1, dst, dst_stride);
5189 dst += (2 * dst_stride);
5193 const20, const6, const3);
/* Ninth row: inp0 is averaged as-is (no interleave with a second row). */
5194 horiz8 = __msa_aver_u_b(inp0, res0);
5195 LD_UB2(dst, dst_stride, dst0, dst1);
5197 horiz5, horiz6, horiz7, horiz8,
5198 horiz5, horiz4, horiz3, horiz2,
5199 horiz6, horiz7, horiz8, horiz8,
5200 const20, const6, const3);
5201 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5202 res0 = __msa_aver_u_b(avg0, res0);
5203 ST_D2(res0, 0, 1, dst, dst_stride);
5204 dst += (2 * dst_stride);
5206 LD_UB2(dst, dst_stride, dst0, dst1);
5208 horiz7, horiz8, horiz8, horiz7,
5209 horiz7, horiz6, horiz5, horiz4,
5210 horiz8, horiz8, horiz7, horiz6,
5211 const20, const6, const3);
5212 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5213 res1 = __msa_aver_u_b(avg1, res1);
5214 ST_D2(res1, 0, 1, dst, dst_stride);
/*
 * NOTE(review): the signature of this routine, the declaration of dst0/dst1
 * and the first lines of its filter-macro invocations are outside the
 * visible lines; this summary only covers what is shown.
 *
 * Visible data flow:
 *  - no source averaging is visible in the first stage: the even horizN
 *    vectors are presumably assigned directly by the invocations that end in
 *    "mask0, mask1, mask2, mask3, const20, const6, const3)", and the odd
 *    ones are doubleword 1 of their predecessor;
 *  - all of horiz0..horiz8 are prepared before any output is produced;
 *  - each second-stage result is averaged once with the two rows already
 *    present in dst (LD_UB2 from dst) and written back with ST_D2.
 */
5230 v16u8 inp0, inp1, inp2, inp3;
5231 v16u8 res0, res1, avg0, avg1;
5232 v16u8 horiz0, horiz1, horiz2, horiz3;
5233 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables passed (as mask0..mask3) to the invocations below. */
5235 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5236 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5237 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5238 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter coefficients 20, 6 and 3 replicated into every byte lane. */
5239 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5240 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5241 v16u8 const3 = (v16u8) __msa_ldi_b(3);
5244 src += (2 * src_stride);
5246 mask0, mask1, mask2, mask3,
5247 const20, const6, const3);
5249 src += (2 * src_stride);
/* horiz1 = doubleword 1 of horiz0 replicated across the vector. */
5250 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5252 mask0, mask1, mask2, mask3,
5253 const20, const6, const3);
5255 src += (2 * src_stride);
5256 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5258 mask0, mask1, mask2, mask3,
5259 const20, const6, const3);
5260 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5262 src += (2 * src_stride);
5264 mask0, mask1, mask2, mask3,
5265 const20, const6, const3);
5266 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5269 mask0, mask1, mask2, mask3,
5270 const20, const6, const3);
/* Fetch the two destination rows that will be blended into the output. */
5271 LD_UB2(dst, dst_stride, dst0, dst1);
5273 horiz1, horiz2, horiz3, horiz4,
5274 horiz1, horiz0, horiz0, horiz1,
5275 horiz2, horiz3, horiz4, horiz5,
5276 const20, const6, const3);
/* Average the second-stage result with the existing dst rows, then store. */
5277 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5278 res0 = __msa_aver_u_b(avg0, res0);
5279 ST_D2(res0, 0, 1, dst, dst_stride);
5280 dst += (2 * dst_stride);
5282 LD_UB2(dst, dst_stride, dst0, dst1);
5284 horiz3, horiz4, horiz5, horiz6,
5285 horiz3, horiz2, horiz1, horiz0,
5286 horiz4, horiz5, horiz6, horiz7,
5287 const20, const6, const3);
5288 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5289 res1 = __msa_aver_u_b(avg1, res1);
5290 ST_D2(res1, 0, 1, dst, dst_stride);
5291 dst += (2 * dst_stride);
5293 LD_UB2(dst, dst_stride, dst0, dst1);
5295 horiz5, horiz6, horiz7, horiz8,
5296 horiz5, horiz4, horiz3, horiz2,
5297 horiz6, horiz7, horiz8, horiz8,
5298 const20, const6, const3);
5299 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5300 res0 = __msa_aver_u_b(avg0, res0);
5301 ST_D2(res0, 0, 1, dst, dst_stride);
5302 dst += (2 * dst_stride);
5304 LD_UB2(dst, dst_stride, dst0, dst1);
5306 horiz7, horiz8, horiz8, horiz7,
5307 horiz7, horiz6, horiz5, horiz4,
5308 horiz8, horiz8, horiz7, horiz6,
5309 const20, const6, const3);
5310 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5311 res1 = __msa_aver_u_b(avg1, res1);
5312 ST_D2(res1, 0, 1, dst, dst_stride);
/*
 * NOTE(review): the signature of this routine, the declaration of dst0/dst1
 * and the first lines of its filter-macro invocations are outside the
 * visible lines; this summary only covers what is shown.
 *
 * Visible data flow:
 *  - each source row pair is slid by one byte (SLDI_B2_UB ..., 1), merged
 *    with a doubleword interleave and byte-averaged with res0/res1 to form
 *    horiz0..horiz8;
 *  - the second-stage results are not averaged with any horizN rows;
 *  - each result is averaged once with the two rows already present in dst
 *    (LD_UB2 from dst) and written back with ST_D2.
 */
5331 v16u8 inp0, inp1, inp2, inp3;
5332 v16u8 res0, res1, avg0, avg1;
5333 v16u8 horiz0, horiz1, horiz2, horiz3;
5334 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables; presumably the shuffle masks handed to the horizontal
 * filter macro (their use is not visible in this body) -- TODO confirm. */
5336 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5337 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5338 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5339 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter coefficients 20, 6 and 3 replicated into every byte lane. */
5340 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5341 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5342 v16u8 const3 = (v16u8) __msa_ldi_b(3);
5345 src += (2 * src_stride);
5347 const20, const6, const3);
5349 src += (2 * src_stride);
/* Slide both rows by one byte, merge them into one vector, then average the
 * shifted source with res0. */
5350 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5352 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5353 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated across the vector. */
5354 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5356 const20, const6, const3);
5358 src += (2 * src_stride);
5359 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5361 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5362 horiz2 = __msa_aver_u_b(inp2, res1);
5363 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5365 const20, const6, const3);
5367 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5369 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5370 horiz4 = __msa_aver_u_b(inp0, res0);
5371 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Fetch the two destination rows that will be blended into the output. */
5372 LD_UB2(dst, dst_stride, dst0, dst1);
5374 horiz1, horiz2, horiz3, horiz4,
5375 horiz1, horiz0, horiz0, horiz1,
5376 horiz2, horiz3, horiz4, horiz5,
5377 const20, const6, const3);
/* Average the second-stage result with the existing dst rows, then store. */
5378 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5379 res0 = __msa_aver_u_b(avg0, res0);
5380 ST_D2(res0, 0, 1, dst, dst_stride);
5381 dst += (2 * dst_stride);
5384 src += (2 * src_stride);
5386 const20, const6, const3);
5388 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5390 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5391 horiz6 = __msa_aver_u_b(inp2, res1);
5392 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5393 LD_UB2(dst, dst_stride, dst0, dst1);
5395 horiz3, horiz4, horiz5, horiz6,
5396 horiz3, horiz2, horiz1, horiz0,
5397 horiz4, horiz5, horiz6, horiz7,
5398 const20, const6, const3);
5399 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5400 res1 = __msa_aver_u_b(avg1, res1);
5401 ST_D2(res1, 0, 1, dst, dst_stride);
5402 dst += (2 * dst_stride);
5406 const20, const6, const3);
/* Ninth row: only a single vector is shifted and averaged here. */
5407 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5408 horiz8 = __msa_aver_u_b(inp0, res0);
5409 LD_UB2(dst, dst_stride, dst0, dst1);
5411 horiz5, horiz6, horiz7, horiz8,
5412 horiz5, horiz4, horiz3, horiz2,
5413 horiz6, horiz7, horiz8, horiz8,
5414 const20, const6, const3);
5415 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5416 res0 = __msa_aver_u_b(avg0, res0);
5417 ST_D2(res0, 0, 1, dst, dst_stride);
5418 dst += (2 * dst_stride);
5420 LD_UB2(dst, dst_stride, dst0, dst1);
5422 horiz7, horiz8, horiz8, horiz7,
5423 horiz7, horiz6, horiz5, horiz4,
5424 horiz8, horiz8, horiz7, horiz6,
5425 const20, const6, const3);
5426 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5427 res1 = __msa_aver_u_b(avg1, res1);
5428 ST_D2(res1, 0, 1, dst, dst_stride);
/*
 * NOTE(review): the signature of this routine, the declaration of dst0/dst1
 * and the first lines of its filter-macro invocations are outside the
 * visible lines; this summary only covers what is shown.
 *
 * Visible data flow:
 *  - each source row pair is merged with a doubleword interleave (no byte
 *    shift) and byte-averaged with res0/res1 to form horiz0..horiz8;
 *  - each second-stage result is averaged with a pair of horizontal rows
 *    (horiz1/horiz2, horiz3/horiz4, horiz5/horiz6, horiz7/horiz8);
 *  - that value is then averaged with the two rows already present in dst
 *    (LD_UB2 from dst) and written back with ST_D2.
 */
5447 v16u8 inp0, inp1, inp2, inp3;
5448 v16u8 res0, res1, avg0, avg1;
5449 v16u8 horiz0, horiz1, horiz2, horiz3;
5450 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables; presumably the shuffle masks handed to the horizontal
 * filter macro (their use is not visible in this body) -- TODO confirm. */
5452 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5453 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5454 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5455 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter coefficients 20, 6 and 3 replicated into every byte lane. */
5456 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5457 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5458 v16u8 const3 = (v16u8) __msa_ldi_b(3);
5461 src += (2 * src_stride);
5464 const20, const6, const3);
/* Merge the two unshifted rows into one vector and average with res0. */
5465 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5466 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 = doubleword 1 of horiz0 replicated across the vector. */
5467 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5469 src += (2 * src_stride);
5471 const20, const6, const3);
5472 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5473 horiz2 = __msa_aver_u_b(inp2, res1);
5474 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Fetch the two destination rows that will be blended into the output. */
5475 LD_UB2(dst, dst_stride, dst0, dst1);
5477 src += (2 * src_stride);
5479 const20, const6, const3);
5480 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5481 horiz4 = __msa_aver_u_b(inp0, res0);
5482 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5484 horiz1, horiz2, horiz3, horiz4,
5485 horiz1, horiz0, horiz0, horiz1,
5486 horiz2, horiz3, horiz4, horiz5,
5487 const20, const6, const3);
/* First average: with rows horiz1/horiz2; second: with existing dst rows. */
5488 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5489 res0 = __msa_aver_u_b(avg0, res0);
5490 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5491 res0 = __msa_aver_u_b(avg0, res0);
5492 ST_D2(res0, 0, 1, dst, dst_stride);
5493 dst += (2 * dst_stride);
5495 LD_UB2(dst, dst_stride, dst0, dst1);
5497 src += (2 * src_stride);
5499 const20, const6, const3);
5500 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5501 horiz6 = __msa_aver_u_b(inp2, res1);
5502 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5504 horiz3, horiz4, horiz5, horiz6,
5505 horiz3, horiz2, horiz1, horiz0,
5506 horiz4, horiz5, horiz6, horiz7,
5507 const20, const6, const3);
5508 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5509 res1 = __msa_aver_u_b(avg1, res1);
5510 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5511 res1 = __msa_aver_u_b(avg1, res1);
5512 ST_D2(res1, 0, 1, dst, dst_stride);
5513 dst += (2 * dst_stride);
5517 const20, const6, const3);
/* Ninth row: inp0 is averaged as-is (no interleave with a second row). */
5518 horiz8 = __msa_aver_u_b(inp0, res0);
5520 horiz5, horiz6, horiz7, horiz8,
5521 horiz5, horiz4, horiz3, horiz2,
5522 horiz6, horiz7, horiz8, horiz8,
5523 const20, const6, const3);
5525 horiz7, horiz8, horiz8, horiz7,
5526 horiz7, horiz6, horiz5, horiz4,
5527 horiz8, horiz8, horiz7, horiz6,
5528 const20, const6, const3);
5529 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5530 res0 = __msa_aver_u_b(avg0, res0);
5531 LD_UB2(dst, dst_stride, dst0, dst1);
5532 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5533 res0 = __msa_aver_u_b(avg0, res0);
5534 ST_D2(res0, 0, 1, dst, dst_stride);
5535 dst += (2 * dst_stride);
5537 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5538 res1 = __msa_aver_u_b(avg1, res1);
5539 LD_UB2(dst, dst_stride, dst0, dst1);
5540 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5541 res1 = __msa_aver_u_b(avg1, res1);
5542 ST_D2(res1, 0, 1, dst, dst_stride);
/*
 * NOTE(review): fragment of a function body. The function signature, the
 * declaration of dst0/dst1, the loads that fill inp0..inp3 and the first
 * line of every multi-line filter-macro invocation are not visible in this
 * excerpt. The comments below describe only the statements that are present.
 *
 * Visible flow: intermediate rows horiz0..horiz8 are built two at a time
 * while src advances two rows per step; each result (res0/res1) is averaged
 * with a pair of horiz rows, then with the rows currently in dst, and two
 * rows are written back to dst per step.
 */
5561 v16u8 inp0, inp1, inp2, inp3;
5562 v16u8 res0, res1, avg0, avg1;
5563 v16u8 horiz0, horiz1, horiz2, horiz3;
5564 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables passed to the invocations whose tails appear below;
 * presumably the mask0..mask3 arguments of APPLY_HORIZ_QPEL_FILTER_8BYTE
 * (defined near the top of the file) -- TODO confirm, heads not visible. */
5566 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5567 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5568 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5569 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Byte vectors built from the immediates 20, 6 and 3; they are the last
 * three arguments of every filter invocation in this fragment. */
5570 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5571 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5572 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Step src forward by two rows. */
5575 src += (2 * src_stride);
5577 mask0, mask1, mask2, mask3,
5578 const20, const6, const3);
/* horiz1 takes doubleword element 1 of horiz0. */
5579 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5581 src += (2 * src_stride);
5583 mask0, mask1, mask2, mask3,
5584 const20, const6, const3);
/* horiz3 takes doubleword element 1 of horiz2. */
5585 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Fetch the two dst rows that are blended into the first output pair. */
5586 LD_UB2(dst, dst_stride, dst0, dst1);
5588 src += (2 * src_stride);
5590 mask0, mask1, mask2, mask3,
5591 const20, const6, const3);
/* horiz5 takes doubleword element 1 of horiz4. */
5592 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Tail of a second-pass filter invocation over horiz0..horiz5; presumably
 * it assigns res0 -- TODO confirm, head not visible. */
5594 horiz1, horiz2, horiz3, horiz4,
5595 horiz1, horiz0, horiz0, horiz1,
5596 horiz2, horiz3, horiz4, horiz5,
5597 const20, const6, const3);
/* Combine the low doublewords of horiz1 and horiz2 and average into res0. */
5598 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5599 res0 = __msa_aver_u_b(avg0, res0);
/* Average again with the two dst rows loaded above, then store two rows. */
5600 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5601 res0 = __msa_aver_u_b(avg0, res0);
5602 ST_D2(res0, 0, 1, dst, dst_stride);
5603 dst += (2 * dst_stride);
/* Fetch the next two dst rows for the second output pair. */
5605 LD_UB2(dst, dst_stride, dst0, dst1);
5607 src += (2 * src_stride);
5609 mask0, mask1, mask2, mask3,
5610 const20, const6, const3);
/* horiz7 takes doubleword element 1 of horiz6. */
5611 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Tail of a second-pass filter invocation over horiz0..horiz7; presumably
 * it assigns res1 -- TODO confirm. */
5613 horiz3, horiz4, horiz5, horiz6,
5614 horiz3, horiz2, horiz1, horiz0,
5615 horiz4, horiz5, horiz6, horiz7,
5616 const20, const6, const3);
/* Same blend as above, using horiz3/horiz4 and the freshly loaded dst rows. */
5617 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5618 res1 = __msa_aver_u_b(avg1, res1);
5619 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5620 res1 = __msa_aver_u_b(avg1, res1);
5621 ST_D2(res1, 0, 1, dst, dst_stride);
5622 dst += (2 * dst_stride);
/* Tail of a mask-based filter invocation; presumably it produces horiz8
 * -- TODO confirm. src is not advanced in the visible lines here. */
5626 mask0, mask1, mask2, mask3,
5627 const20, const6, const3);
/* Tails of the last two second-pass invocations. horiz8 is repeated in the
 * argument lists and the later rows are listed in descending order;
 * NOTE(review): looks like bottom-edge handling -- confirm against the
 * macro definition. */
5629 horiz6, horiz7, horiz8, horiz5, horiz4,
5630 horiz3, horiz2, horiz6, horiz7, horiz8,
5631 horiz8, const20, const6, const3);
5633 horiz8, horiz8, horiz7, horiz7, horiz6,
5634 horiz5, horiz4, horiz8, horiz8, horiz7,
5635 horiz6, const20, const6, const3);
/* Third output pair: blend res0 with horiz5/horiz6, then with dst. */
5636 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5637 res0 = __msa_aver_u_b(avg0, res0);
5638 LD_UB2(dst, dst_stride, dst0, dst1);
5639 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5640 res0 = __msa_aver_u_b(avg0, res0);
5641 ST_D2(res0, 0, 1, dst, dst_stride);
5642 dst += (2 * dst_stride);
/* Fourth output pair: blend res1 with horiz7/horiz8, then with dst. */
5644 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5645 res1 = __msa_aver_u_b(avg1, res1);
5646 LD_UB2(dst, dst_stride, dst0, dst1);
5647 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5648 res1 = __msa_aver_u_b(avg1, res1);
5649 ST_D2(res1, 0, 1, dst, dst_stride);
/*
 * NOTE(review): fragment of a function body. The function signature, the
 * declaration of dst0/dst1, the loads that fill inp0..inp3 and the first
 * line of every multi-line filter-macro invocation are not visible in this
 * excerpt. The comments below describe only the statements that are present.
 *
 * Visible flow: each first-pass result (res0/res1) is averaged with a
 * one-byte-shifted copy of the input rows to form horiz0..horiz8. Each
 * second-pass result is then averaged with a pair of horiz rows and with
 * the rows currently in dst before two rows are stored.
 */
5668 v16u8 inp0, inp1, inp2, inp3;
5669 v16u8 res0, res1, avg0, avg1;
5670 v16u8 horiz0, horiz1, horiz2, horiz3;
5671 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* NOTE(review): mask0..mask3 are not referenced by any line visible in this
 * fragment; they may be used on the missing invocation heads -- confirm. */
5673 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5674 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5675 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5676 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Byte vectors built from the immediates 20, 6 and 3; they are the last
 * three arguments of every filter invocation in this fragment. */
5677 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5678 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5679 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Step src forward by two rows. */
5682 src += (2 * src_stride);
5684 const20, const6, const3);
5686 src += (2 * src_stride);
/* Shift inp0/inp1 by one byte (SLDI_B2_UB is defined elsewhere; exact
 * operand convention -- TODO confirm), pack their low doublewords into one
 * vector and average it with res0 to form horiz0. */
5687 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5689 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5690 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 takes doubleword element 1 of horiz0. */
5691 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5693 const20, const6, const3);
5695 src += (2 * src_stride);
/* Same shift / pack / average sequence for inp2/inp3 and res1. */
5696 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5698 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5699 horiz2 = __msa_aver_u_b(inp2, res1);
5700 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5702 const20, const6, const3);
/* Same sequence again for the next inp0/inp1 pair, giving horiz4/horiz5. */
5703 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5705 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5706 horiz4 = __msa_aver_u_b(inp0, res0);
5707 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Fetch the two dst rows blended into the first output pair. */
5708 LD_UB2(dst, dst_stride, dst0, dst1);
/* avg0 holds the low doublewords of horiz1 and horiz2; it is prepared
 * before the second-pass invocation whose tail follows. */
5709 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5711 horiz2, horiz3, horiz4, horiz1, horiz0,
5712 horiz0, horiz1, horiz2, horiz3, horiz4,
5713 horiz5, const20, const6, const3);
/* Blend res0 with avg0, then with the dst rows, and store two rows. */
5714 res0 = __msa_aver_u_b(avg0, res0);
5715 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5716 res0 = __msa_aver_u_b(avg0, res0);
5717 ST_D2(res0, 0, 1, dst, dst_stride);
5718 dst += (2 * dst_stride);
5721 src += (2 * src_stride);
5723 const20, const6, const3);
/* Shift / pack / average for inp2/inp3, giving horiz6/horiz7. */
5724 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5726 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5727 horiz6 = __msa_aver_u_b(inp2, res1);
5728 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Second output pair: horiz3/horiz4 blended with res1 and with dst. */
5729 LD_UB2(dst, dst_stride, dst0, dst1);
5730 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5732 horiz4, horiz5, horiz6, horiz3, horiz2,
5733 horiz1, horiz0, horiz4, horiz5, horiz6,
5734 horiz7, const20, const6, const3);
5735 res1 = __msa_aver_u_b(avg1, res1);
5736 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5737 res1 = __msa_aver_u_b(avg1, res1);
5738 ST_D2(res1, 0, 1, dst, dst_stride);
5739 dst += (2 * dst_stride);
5743 const20, const6, const3);
/* Only one input vector is handled here: inp0 is slid by one byte against
 * itself and averaged with res0 to form horiz8. */
5744 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5745 horiz8 = __msa_aver_u_b(inp0, res0);
/* Third output pair: horiz5/horiz6 blended with res0 and with dst. */
5746 LD_UB2(dst, dst_stride, dst0, dst1);
5747 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
/* horiz8 is repeated in the argument list; NOTE(review): looks like
 * bottom-edge handling -- confirm against the macro definition. */
5749 horiz6, horiz7, horiz8, horiz5, horiz4,
5750 horiz3, horiz2, horiz6, horiz7, horiz8,
5751 horiz8, const20, const6, const3);
5752 res0 = __msa_aver_u_b(avg0, res0);
5753 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5754 res0 = __msa_aver_u_b(avg0, res0);
5755 ST_D2(res0, 0, 1, dst, dst_stride);
5756 dst += (2 * dst_stride);
/* Fourth output pair: horiz7/horiz8 blended with res1 and with dst. */
5758 LD_UB2(dst, dst_stride, dst0, dst1);
5759 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5761 horiz8, horiz8, horiz7, horiz7, horiz6,
5762 horiz5, horiz4, horiz8, horiz8, horiz7,
5763 horiz6, const20, const6, const3);
5764 res1 = __msa_aver_u_b(avg1, res1);
5765 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5766 res1 = __msa_aver_u_b(avg1, res1);
5767 ST_D2(res1, 0, 1, dst, dst_stride);
/* Loop header whose body runs four times (loop_cnt counts down from 4).
 * NOTE(review): the enclosing function and the loop body are not visible
 * in this excerpt. */
5776 for (loop_cnt = 4; loop_cnt--;) {
/*
 * NOTE(review): fragment of a function body. The signature, the head of the
 * load invocation(s) and the end of the final store are not visible in this
 * excerpt. The visible statements hold sixteen vectors (src0..src15) and
 * write them to dst in two groups of eight rows.
 */
5792 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
5793 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
/* Step src forward by eight rows. */
5796 src += (8 * src_stride);
/* Tail of an invocation taking src8..src15; presumably the load of the
 * second group of eight rows -- TODO confirm, head not visible. */
5798 src8, src9, src10, src11, src12, src13, src14, src15);
/* Store src0..src7 to dst, one vector per row, then step dst by eight rows. */
5800 ST_UB8(
src0,
src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
5801 dst += (8 * dst_stride);
/* Store of src8..src15; the remaining arguments are on a line that is not
 * visible here. */
5802 ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
/*
 * NOTE(review): fragment of a function body. The signature, the declarations
 * of cnt and the source vectors, the source-row load and the head of the
 * invocation that combines source with dst are not visible in this excerpt.
 * The visible loop handles four rows per iteration and writes the low eight
 * bytes of each resulting vector back to dst.
 */
5811 uint64_t out0, out1, out2, out3;
5813 v16u8 dst0, dst1, dst2, dst3;
/* One iteration per group of four rows (height / 4 iterations). */
5815 for (cnt = (
height / 4); cnt--;) {
5817 src += (4 * src_stride);
/* Fetch the four rows currently in dst. */
5818 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* Tail of an invocation whose outputs are dst0..dst3; presumably a
 * byte-wise average of the source rows with dst -- TODO confirm. */
5821 dst0, dst1, dst2, dst3);
/* Extract doubleword element 0 of each result vector. */
5823 out0 = __msa_copy_u_d((v2i64) dst0, 0);
5824 out1 = __msa_copy_u_d((v2i64) dst1, 0);
5825 out2 = __msa_copy_u_d((v2i64) dst2, 0);
5826 out3 = __msa_copy_u_d((v2i64) dst3, 0);
/* Store the four 64-bit values to four consecutive dst rows and advance. */
5827 SD4(out0, out1, out2, out3, dst, dst_stride);
5828 dst += (4 * dst_stride);
/*
 * NOTE(review): fragment of a function body. The signature, the declaration
 * of cnt, the source-row load, the head of the first averaging invocation
 * and the closing braces are not visible in this excerpt. The visible loop
 * handles eight rows per iteration: it loads eight dst rows, combines them
 * with src0..src7 and stores the eight resulting vectors back to dst.
 */
5837 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
5838 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
/* One iteration per group of eight rows (height / 8 iterations). */
5840 for (cnt = (
height / 8); cnt--;) {
5842 src += (8 * src_stride);
/* Fetch the eight rows currently in dst. */
5843 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
/* Tail of an invocation whose outputs are dst0..dst3; presumably the same
 * AVER_UB4_UB form as the next statement, applied to src0..src3 -- TODO
 * confirm, head not visible. */
5846 dst0, dst1, dst2, dst3);
/* Pair each of src4..src7 with the matching dst row; results go to
 * dst4..dst7. */
5847 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
5848 dst4, dst5, dst6, dst7);
/* Write all eight result vectors back and step dst by eight rows. */
5849 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
5850 dst += (8 * dst_stride);