FFmpeg
hevcpred_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavcodec/hevc/dec.h"
23 #include "hevcpred_mips.h"
24 
/*
 * Prediction angles for the 17 "upper" angular modes.
 * The angular functions in this file index it with (mode - 18).
 */
static const int8_t intra_pred_angle_up[17] = {
    -32, -26, -21, -17, -13, -9, -5, -2,
    0,
    2, 5, 9, 13, 17, 21, 26, 32
};
28 
/*
 * Prediction angles for the 16 "lower" angular modes.
 * NOTE(review): the code that indexes this table is not visible in this
 * part of the file; presumably it mirrors intra_pred_angle_up - confirm.
 */
static const int8_t intra_pred_angle_low[16] = {
    32, 26, 21, 17, 13, 9, 5, 2,
    0,
    -2, -5, -9, -13, -17, -21, -26
};
32 
/*
 * Compute two consecutive 16-pixel rows of a planar prediction.
 *
 * Each row is produced as two 8-lane halves, one built from src0_r and one
 * from src0_l.  Row 0 uses vec0, row 1 uses vec1.  The 16-bit sums are:
 *
 *   row 0, src0_r half: mul_val_h0 * vec0 + mul_val_h1 * tmp0
 *                       + mul_val_b0 * src0_r + mul_val_b1 * tmp1
 *   row 0, src0_l half: mul_val_h2 * vec0 + mul_val_h3 * tmp0
 *                       + mul_val_b0 * src0_l + mul_val_b1 * tmp1
 *   row 1             : as row 0, but with vec1, (mul_val_b0 - 1) and
 *                       (mul_val_b1 + 1)
 *
 * The four sums go through SRARI_H4_SH with `round` and are packed by
 * PCKEV_B2_SH into res0 (row 0) and res1 (row 1).
 *
 * Every argument that takes part in arithmetic is parenthesized so that an
 * expression passed as an argument keeps its intended precedence.
 */
#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,         \
                              mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3, \
                              res0, res1, mul_val_b0, mul_val_b1, round)      \
{                                                                             \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                     \
                                                                              \
    MUL4((mul_val_h0), (vec0), (mul_val_h2), (vec0), (mul_val_h0), (vec1),    \
         (mul_val_h2), (vec1), res0_m, res1_m, res2_m, res3_m);               \
                                                                              \
    res0_m += (mul_val_h1) * (tmp0);                                          \
    res1_m += (mul_val_h3) * (tmp0);                                          \
    res2_m += (mul_val_h1) * (tmp0);                                          \
    res3_m += (mul_val_h3) * (tmp0);                                          \
                                                                              \
    res0_m += (mul_val_b0) * (src0_r);                                        \
    res1_m += (mul_val_b0) * (src0_l);                                        \
    res2_m += ((mul_val_b0) - 1) * (src0_r);                                  \
    res3_m += ((mul_val_b0) - 1) * (src0_l);                                  \
                                                                              \
    res0_m += (mul_val_b1) * (tmp1);                                          \
    res1_m += (mul_val_b1) * (tmp1);                                          \
    res2_m += ((mul_val_b1) + 1) * (tmp1);                                    \
    res3_m += ((mul_val_b1) + 1) * (tmp1);                                    \
                                                                              \
    SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                       \
    PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                  \
}
60 
61 static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
62  const uint8_t *src_left,
63  uint8_t *dst, int32_t stride,
64  int32_t flag)
65 {
66  uint32_t col;
67  uint32_t src_data;
68  v8i16 vec0, vec1, vec2;
69  v16i8 zero = { 0 };
70 
71  src_data = LW(src_top);
72  SW4(src_data, src_data, src_data, src_data, dst, stride);
73 
74  if (0 == flag) {
75  src_data = LW(src_left);
76 
77  vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);
78 
79  vec0 = __msa_fill_h(src_left[-1]);
80  vec1 = __msa_fill_h(src_top[0]);
81 
82  vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
83  vec2 -= vec0;
84  vec2 >>= 1;
85  vec2 += vec1;
86  CLIP_SH_0_255(vec2);
87 
88  for (col = 0; col < 4; col++) {
89  dst[stride * col] = (uint8_t) vec2[col];
90  }
91  }
92 }
93 
94 static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
95  const uint8_t *src_left,
96  uint8_t *dst, int32_t stride,
97  int32_t flag)
98 {
99  uint8_t *tmp_dst = dst;
100  uint32_t row;
101  uint16_t val0, val1, val2, val3;
102  uint64_t src_data1;
103  v8i16 vec0, vec1, vec2;
104  v16i8 zero = { 0 };
105 
106  src_data1 = LD(src_top);
107 
108  for (row = 8; row--;) {
109  SD(src_data1, tmp_dst);
110  tmp_dst += stride;
111  }
112 
113  if (0 == flag) {
114  src_data1 = LD(src_left);
115 
116  vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);
117 
118  vec0 = __msa_fill_h(src_left[-1]);
119  vec1 = __msa_fill_h(src_top[0]);
120 
121  vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
122  vec2 -= vec0;
123  vec2 >>= 1;
124  vec2 += vec1;
125  CLIP_SH_0_255(vec2);
126 
127  val0 = vec2[0];
128  val1 = vec2[1];
129  val2 = vec2[2];
130  val3 = vec2[3];
131 
132  dst[0] = val0;
133  dst[stride] = val1;
134  dst[2 * stride] = val2;
135  dst[3 * stride] = val3;
136 
137  val0 = vec2[4];
138  val1 = vec2[5];
139  val2 = vec2[6];
140  val3 = vec2[7];
141 
142  dst[4 * stride] = val0;
143  dst[5 * stride] = val1;
144  dst[6 * stride] = val2;
145  dst[7 * stride] = val3;
146  }
147 }
148 
149 static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
150  const uint8_t *src_left,
151  uint8_t *dst, int32_t stride,
152  int32_t flag)
153 {
154  int32_t col;
155  uint8_t *tmp_dst = dst;
156  uint32_t row;
157  v16u8 src;
158  v8i16 vec0, vec1, vec2, vec3;
159 
160  src = LD_UB(src_top);
161 
162  for (row = 16; row--;) {
163  ST_UB(src, tmp_dst);
164  tmp_dst += stride;
165  }
166 
167  if (0 == flag) {
168  src = LD_UB(src_left);
169 
170  vec0 = __msa_fill_h(src_left[-1]);
171  vec1 = __msa_fill_h(src_top[0]);
172 
173  UNPCK_UB_SH(src, vec2, vec3);
174  SUB2(vec2, vec0, vec3, vec0, vec2, vec3);
175 
176  vec2 >>= 1;
177  vec3 >>= 1;
178 
179  ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
180  CLIP_SH2_0_255(vec2, vec3);
181 
182  src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
183 
184  for (col = 0; col < 16; col++) {
185  dst[stride * col] = src[col];
186  }
187  }
188 }
189 
190 static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
191  const uint8_t *src_left,
192  uint8_t *dst, int32_t stride,
193  int32_t flag)
194 {
195  uint32_t val0, val1, val2, val3;
196  v16i8 src0;
197  v8i16 src0_r, src_top_val, src_left_val;
198  v16i8 zero = { 0 };
199 
200  val0 = src_left[0] * 0x01010101;
201  val1 = src_left[1] * 0x01010101;
202  val2 = src_left[2] * 0x01010101;
203  val3 = src_left[3] * 0x01010101;
204  SW4(val0, val1, val2, val3, dst, stride);
205 
206  if (0 == flag) {
207  val0 = LW(src_top);
208  src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
209  src_top_val = __msa_fill_h(src_top[-1]);
210  src_left_val = __msa_fill_h(src_left[0]);
211 
212  src0_r = (v8i16) __msa_ilvr_b(zero, src0);
213 
214  src0_r -= src_top_val;
215  src0_r >>= 1;
216  src0_r += src_left_val;
217  CLIP_SH_0_255(src0_r);
218  src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
219  val0 = __msa_copy_s_w((v4i32) src0, 0);
220  SW(val0, dst);
221  }
222 }
223 
224 static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
225  const uint8_t *src_left,
226  uint8_t *dst, int32_t stride,
227  int32_t flag)
228 {
229  uint64_t val0, val1, val2, val3;
230  v16i8 src0;
231  v8i16 src0_r, src_top_val, src_left_val;
232  v16i8 zero = { 0 };
233 
234  val0 = src_left[0] * 0x0101010101010101;
235  val1 = src_left[1] * 0x0101010101010101;
236  val2 = src_left[2] * 0x0101010101010101;
237  val3 = src_left[3] * 0x0101010101010101;
238  SD4(val0, val1, val2, val3, dst, stride);
239 
240  val0 = src_left[4] * 0x0101010101010101;
241  val1 = src_left[5] * 0x0101010101010101;
242  val2 = src_left[6] * 0x0101010101010101;
243  val3 = src_left[7] * 0x0101010101010101;
244  SD4(val0, val1, val2, val3, dst + 4 * stride, stride);
245 
246  if (0 == flag) {
247  val0 = LD(src_top);
248  src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
249  src_top_val = __msa_fill_h(src_top[-1]);
250  src_left_val = __msa_fill_h(src_left[0]);
251 
252  src0_r = (v8i16) __msa_ilvr_b(zero, src0);
253 
254  src0_r -= src_top_val;
255  src0_r >>= 1;
256  src0_r += src_left_val;
257  CLIP_SH_0_255(src0_r);
258  src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
259  val0 = __msa_copy_s_d((v2i64) src0, 0);
260  SD(val0, dst);
261  }
262 }
263 
264 static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
265  const uint8_t *src_left,
266  uint8_t *dst, int32_t stride,
267  int32_t flag)
268 {
269  uint8_t *tmp_dst = dst;
270  uint32_t row;
271  uint8_t inp0, inp1, inp2, inp3;
272  v16i8 src0, src1, src2, src3;
273  v8i16 src0_r, src0_l, src_left_val, src_top_val;
274 
275  src_left_val = __msa_fill_h(src_left[0]);
276 
277  for (row = 4; row--;) {
278  inp0 = src_left[0];
279  inp1 = src_left[1];
280  inp2 = src_left[2];
281  inp3 = src_left[3];
282  src_left += 4;
283 
284  src0 = __msa_fill_b(inp0);
285  src1 = __msa_fill_b(inp1);
286  src2 = __msa_fill_b(inp2);
287  src3 = __msa_fill_b(inp3);
288 
289  ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
290  tmp_dst += (4 * stride);
291  }
292 
293  if (0 == flag) {
294  src0 = LD_SB(src_top);
295  src_top_val = __msa_fill_h(src_top[-1]);
296 
297  UNPCK_UB_SH(src0, src0_r, src0_l);
298  SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);
299 
300  src0_r >>= 1;
301  src0_l >>= 1;
302 
303  ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
304  CLIP_SH2_0_255(src0_r, src0_l);
305  src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
306  ST_SB(src0, dst);
307  }
308 }
309 
310 static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
311  const uint8_t *src_left,
312  uint8_t *dst, int32_t stride)
313 {
314  uint32_t row;
315  uint8_t inp0, inp1, inp2, inp3;
316  v16i8 src0, src1, src2, src3;
317 
318  for (row = 0; row < 8; row++) {
319  inp0 = src_left[row * 4];
320  inp1 = src_left[row * 4 + 1];
321  inp2 = src_left[row * 4 + 2];
322  inp3 = src_left[row * 4 + 3];
323 
324  src0 = __msa_fill_b(inp0);
325  src1 = __msa_fill_b(inp1);
326  src2 = __msa_fill_b(inp2);
327  src3 = __msa_fill_b(inp3);
328 
329  ST_SB2(src0, src0, dst, 16);
330  dst += stride;
331  ST_SB2(src1, src1, dst, 16);
332  dst += stride;
333  ST_SB2(src2, src2, dst, 16);
334  dst += stride;
335  ST_SB2(src3, src3, dst, 16);
336  dst += stride;
337  }
338 }
339 
340 static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
341  const uint8_t *src_left,
342  uint8_t *dst, int32_t stride,
343  int32_t flag)
344 {
345  uint8_t *tmp_dst = dst;
346  uint32_t addition = 0;
347  uint32_t val0, val1, val2;
348  v16i8 src = { 0 };
349  v16u8 store;
350  v16i8 zero = { 0 };
351  v8u16 sum, vec0, vec1;
352 
353  val0 = LW(src_top);
354  val1 = LW(src_left);
355  INSERT_W2_SB(val0, val1, src);
356  sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
357  sum = (v8u16) __msa_hadd_u_w(sum, sum);
358  sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
359  sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
360  addition = __msa_copy_u_w((v4i32) sum, 0);
361  store = (v16u8) __msa_fill_b(addition);
362  val0 = __msa_copy_u_w((v4i32) store, 0);
363  SW4(val0, val0, val0, val0, dst, stride)
364 
365  if (0 == flag) {
366  ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
367 
368  vec1 += vec0;
369  vec0 += vec0;
370  vec1 += vec0;
371 
372  vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
373  store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
374  val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
375  store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
376  val0 = __msa_copy_u_w((v4i32) store, 0);
377  SW(val0, tmp_dst);
378 
379  val0 = src_left[1];
380  val1 = src_left[2];
381  val2 = src_left[3];
382 
383  addition *= 3;
384 
385  ADD2(val0, addition, val1, addition, val0, val1);
386  val2 += addition;
387 
388  val0 += 2;
389  val1 += 2;
390  val2 += 2;
391  val0 >>= 2;
392  val1 >>= 2;
393  val2 >>= 2;
394 
395  tmp_dst[stride * 1] = val0;
396  tmp_dst[stride * 2] = val1;
397  tmp_dst[stride * 3] = val2;
398  }
399 }
400 
401 static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
402  const uint8_t *src_left,
403  uint8_t *dst, int32_t stride,
404  int32_t flag)
405 {
406  uint8_t *tmp_dst = dst;
407  uint32_t row, col, val;
408  uint32_t addition = 0;
409  uint64_t val0, val1;
410  v16u8 src = { 0 };
411  v16u8 store;
412  v8u16 sum, vec0, vec1;
413  v16i8 zero = { 0 };
414 
415  val0 = LD(src_top);
416  val1 = LD(src_left);
417  INSERT_D2_UB(val0, val1, src);
418  sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
419  sum = (v8u16) __msa_hadd_u_w(sum, sum);
420  sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
421  sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
422  sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
423  sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
424  addition = __msa_copy_u_w((v4i32) sum, 0);
425  store = (v16u8) __msa_fill_b(addition);
426  val0 = __msa_copy_u_d((v2i64) store, 0);
427 
428  for (row = 8; row--;) {
429  SD(val0, dst);
430  dst += stride;
431  }
432 
433  if (0 == flag) {
434  ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
435 
436  vec1 += vec0;
437  vec0 += vec0;
438  vec1 += vec0;
439  vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
440  store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
441  val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
442  store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
443  val0 = __msa_copy_u_d((v2i64) store, 0);
444  SD(val0, tmp_dst);
445 
446  val0 = LD(src_left);
447  src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
448  vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
449  vec0 = (v8u16) __msa_fill_h(addition);
450  vec0 *= 3;
451  vec1 += vec0;
452  vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
453 
454  for (col = 1; col < 8; col++) {
455  tmp_dst[stride * col] = vec1[col];
456  }
457  }
458 }
459 
460 static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
461  const uint8_t *src_left,
462  uint8_t *dst, int32_t stride,
463  int32_t flag)
464 {
465  uint8_t *tmp_dst = dst;
466  uint32_t row, col, val;
467  uint32_t addition = 0;
468  v16u8 src_above1, store, src_left1;
469  v8u16 sum, sum_above, sum_left;
470  v8u16 vec0, vec1, vec2;
471  v16i8 zero = { 0 };
472 
473  src_above1 = LD_UB(src_top);
474  src_left1 = LD_UB(src_left);
475 
476  HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
477  sum = sum_above + sum_left;
478  sum = (v8u16) __msa_hadd_u_w(sum, sum);
479  sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
480  sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
481  sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
482  sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
483  addition = __msa_copy_u_w((v4i32) sum, 0);
484  store = (v16u8) __msa_fill_b(addition);
485 
486  for (row = 16; row--;) {
487  ST_UB(store, dst);
488  dst += stride;
489  }
490 
491  if (0 == flag) {
492  vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
493  ILVRL_B2_UH(zero, src_above1, vec1, vec2);
494  ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
495  vec0 += vec0;
496  ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
497  SRARI_H2_UH(vec1, vec2, 2);
498  store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
499  val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
500  store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
501  ST_UB(store, tmp_dst);
502 
503  ILVRL_B2_UH(zero, src_left1, vec1, vec2);
504  vec0 = (v8u16) __msa_fill_h(addition);
505  vec0 *= 3;
506  ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
507  SRARI_H2_UH(vec1, vec2, 2);
508  store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
509 
510  for (col = 1; col < 16; col++) {
511  tmp_dst[stride * col] = store[col];
512  }
513  }
514 }
515 
516 static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
517  const uint8_t *src_left,
518  uint8_t *dst, int32_t stride)
519 {
520  uint32_t row;
521  v16u8 src_above1, src_above2, store, src_left1, src_left2;
522  v8u16 sum_above1, sum_above2;
523  v8u16 sum_left1, sum_left2;
524  v8u16 sum, sum_above, sum_left;
525 
526  LD_UB2(src_top, 16, src_above1, src_above2);
527  LD_UB2(src_left, 16, src_left1, src_left2);
528  HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
529  HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
530  sum_above = sum_above1 + sum_above2;
531  sum_left = sum_left1 + sum_left2;
532  sum = sum_above + sum_left;
533  sum = (v8u16) __msa_hadd_u_w(sum, sum);
534  sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
535  sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
536  sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
537  sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
538  store = (v16u8) __msa_splati_b((v16i8) sum, 0);
539 
540  for (row = 16; row--;) {
541  ST_UB2(store, store, dst, 16);
542  dst += stride;
543  ST_UB2(store, store, dst, 16);
544  dst += stride;
545  }
546 }
547 
548 static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
549  const uint8_t *src_left,
550  uint8_t *dst, int32_t stride)
551 {
552  uint32_t src0, src1;
553  v16i8 src_vec0, src_vec1;
554  v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
555  v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
556  v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
557  v16i8 zero = { 0 };
558 
559  src0 = LW(src_top);
560  src1 = LW(src_left);
561 
562  mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);
563 
564  src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
565  src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);
566 
567  ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
568  SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
569 
570  tmp0 = __msa_fill_h(src_top[4]);
571  tmp1 = __msa_fill_h(src_left[4]);
572 
573  MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
574  res0, res1, res2, res3);
575 
576  res0 += mul_val1 * tmp0;
577  res1 += mul_val1 * tmp0;
578  res2 += mul_val1 * tmp0;
579  res3 += mul_val1 * tmp0;
580 
581  res0 += 3 * src_vec0_r;
582  res1 += 2 * src_vec0_r;
583  res2 += src_vec0_r;
584  res0 += tmp1;
585  res1 += 2 * tmp1;
586  res2 += 3 * tmp1;
587  res3 += 4 * tmp1;
588 
589  PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
590  SRARI_H2_SH(res0, res1, 3);
591  src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
592  ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
593 }
594 
595 static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
596  const uint8_t *src_left,
597  uint8_t *dst, int32_t stride)
598 {
599  uint64_t src0, src1;
600  v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
601  v8i16 src_vec0_r, src_vec1_r;
602  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
603  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
604  v8i16 tmp0, tmp1, tmp2;
605  v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
606  v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
607  v16i8 zero = { 0 };
608 
609  src0 = LD(src_top);
610  src1 = LD(src_left);
611 
612  src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
613  src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);
614 
615  ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
616  SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
617  SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);
618 
619  tmp0 = __msa_fill_h(src_top[8]);
620  tmp1 = __msa_fill_h(src_left[8]);
621 
622  MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
623  res0, res1, res2, res3);
624  MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
625  res4, res5, res6, res7);
626 
627  tmp2 = mul_val1 * tmp0;
628  res0 += tmp2;
629  res1 += tmp2;
630  res2 += tmp2;
631  res3 += tmp2;
632  res4 += tmp2;
633  res5 += tmp2;
634  res6 += tmp2;
635  res7 += tmp2;
636 
637  res0 += 7 * src_vec0_r;
638  res1 += 6 * src_vec0_r;
639  res2 += 5 * src_vec0_r;
640  res3 += 4 * src_vec0_r;
641  res4 += 3 * src_vec0_r;
642  res5 += 2 * src_vec0_r;
643  res6 += src_vec0_r;
644 
645  res0 += tmp1;
646  res1 += 2 * tmp1;
647  res2 += 3 * tmp1;
648  res3 += 4 * tmp1;
649  res4 += 5 * tmp1;
650  res5 += 6 * tmp1;
651  res6 += 7 * tmp1;
652  res7 += 8 * tmp1;
653 
654  SRARI_H4_SH(res0, res1, res2, res3, 4);
655  SRARI_H4_SH(res4, res5, res6, res7, 4);
656  PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
657  src_vec0, src_vec1, src_vec2, src_vec3);
658 
659  ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
660  0, 1, 0, 1, dst, stride);
661 }
662 
663 static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
664  const uint8_t *src_left,
665  uint8_t *dst, int32_t stride)
666 {
667  v16u8 src0, src1;
668  v8i16 src0_r, src1_r, src0_l, src1_l;
669  v8i16 vec0, vec1;
670  v8i16 res0, res1, tmp0, tmp1;
671  v8i16 mul_val2, mul_val3;
672  v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
673  v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };
674 
675  src0 = LD_UB(src_top);
676  src1 = LD_UB(src_left);
677 
678  UNPCK_UB_SH(src0, src0_r, src0_l);
679  UNPCK_UB_SH(src1, src1_r, src1_l);
680 
681  mul_val2 = mul_val0 - 8;
682  mul_val3 = mul_val1 + 8;
683 
684  tmp0 = __msa_fill_h(src_top[16]);
685  tmp1 = __msa_fill_h(src_left[16]);
686 
687  SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
688  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
689  mul_val0, mul_val1, mul_val2, mul_val3,
690  res0, res1, 15, 1, 5);
691  ST_SH2(res0, res1, dst, stride);
692  dst += (2 * stride);
693 
694  SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
695  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
696  mul_val0, mul_val1, mul_val2, mul_val3,
697  res0, res1, 13, 3, 5);
698  ST_SH2(res0, res1, dst, stride);
699  dst += (2 * stride);
700 
701  SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
702  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
703  mul_val0, mul_val1, mul_val2, mul_val3,
704  res0, res1, 11, 5, 5);
705  ST_SH2(res0, res1, dst, stride);
706  dst += (2 * stride);
707 
708  SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
709  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
710  mul_val0, mul_val1, mul_val2, mul_val3,
711  res0, res1, 9, 7, 5);
712  ST_SH2(res0, res1, dst, stride);
713  dst += (2 * stride);
714 
715  SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
716  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
717  mul_val0, mul_val1, mul_val2, mul_val3,
718  res0, res1, 7, 9, 5);
719  ST_SH2(res0, res1, dst, stride);
720  dst += (2 * stride);
721 
722  SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
723  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
724  mul_val0, mul_val1, mul_val2, mul_val3,
725  res0, res1, 5, 11, 5);
726  ST_SH2(res0, res1, dst, stride);
727  dst += (2 * stride);
728 
729  SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
730  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
731  mul_val0, mul_val1, mul_val2, mul_val3,
732  res0, res1, 3, 13, 5);
733  ST_SH2(res0, res1, dst, stride);
734  dst += (2 * stride);
735 
736  SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
737  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
738  mul_val0, mul_val1, mul_val2, mul_val3,
739  res0, res1, 1, 15, 5);
740  ST_SH2(res0, res1, dst, stride);
741 }
742 
743 static void process_intra_upper_16x16_msa(const uint8_t *src_top,
744  const uint8_t *src_left,
745  uint8_t *dst, int32_t stride,
746  uint8_t offset)
747 {
748  v16i8 src0, src1;
749  v8i16 src0_r, src1_r, src0_l, src1_l;
750  v8i16 vec0, vec1, res0, res1;
751  v8i16 tmp0, tmp1;
752  v8i16 mul_val2, mul_val3;
753  v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
754  v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
755 
756  tmp0 = __msa_fill_h(src_top[32 - offset]);
757  tmp1 = __msa_fill_h(src_left[32]);
758 
759  src0 = LD_SB(src_top);
760  src1 = LD_SB(src_left);
761 
762  UNPCK_UB_SH(src0, src0_r, src0_l);
763  UNPCK_UB_SH(src1, src1_r, src1_l);
764 
765  mul_val1 += offset;
766  mul_val0 -= offset;
767  mul_val2 = mul_val0 - 8;
768  mul_val3 = mul_val1 + 8;
769 
770  SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
771  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
772  mul_val0, mul_val1, mul_val2, mul_val3,
773  res0, res1, 31, 1, 6);
774  ST_SH2(res0, res1, dst, stride);
775  dst += (2 * stride);
776 
777  SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
778  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
779  mul_val0, mul_val1, mul_val2, mul_val3,
780  res0, res1, 29, 3, 6);
781  ST_SH2(res0, res1, dst, stride);
782  dst += (2 * stride);
783 
784  SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
785  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
786  mul_val0, mul_val1, mul_val2, mul_val3,
787  res0, res1, 27, 5, 6);
788  ST_SH2(res0, res1, dst, stride);
789  dst += (2 * stride);
790 
791  SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
792  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
793  mul_val0, mul_val1, mul_val2, mul_val3,
794  res0, res1, 25, 7, 6);
795  ST_SH2(res0, res1, dst, stride);
796  dst += (2 * stride);
797 
798  SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
799  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
800  mul_val0, mul_val1, mul_val2, mul_val3,
801  res0, res1, 23, 9, 6);
802  ST_SH2(res0, res1, dst, stride);
803  dst += (2 * stride);
804 
805  SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
806  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
807  mul_val0, mul_val1, mul_val2, mul_val3,
808  res0, res1, 21, 11, 6);
809  ST_SH2(res0, res1, dst, stride);
810  dst += (2 * stride);
811 
812  SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
813  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
814  mul_val0, mul_val1, mul_val2, mul_val3,
815  res0, res1, 19, 13, 6);
816  ST_SH2(res0, res1, dst, stride);
817  dst += (2 * stride);
818 
819  SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
820  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
821  mul_val0, mul_val1, mul_val2, mul_val3,
822  res0, res1, 17, 15, 6);
823  ST_SH2(res0, res1, dst, stride);
824 }
825 
826 static void process_intra_lower_16x16_msa(const uint8_t *src_top,
827  const uint8_t *src_left,
828  uint8_t *dst, int32_t stride,
829  uint8_t offset)
830 {
831  v16i8 src0, src1;
832  v8i16 src0_r, src1_r, src0_l, src1_l;
833  v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
834  v8i16 mul_val2, mul_val3;
835  v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
836  v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
837 
838  tmp0 = __msa_fill_h(src_top[32 - offset]);
839  tmp1 = __msa_fill_h(src_left[16]);
840 
841  src0 = LD_SB(src_top);
842  src1 = LD_SB(src_left);
843 
844  UNPCK_UB_SH(src0, src0_r, src0_l);
845  UNPCK_UB_SH(src1, src1_r, src1_l);
846 
847  mul_val1 += offset;
848  mul_val0 -= offset;
849  mul_val2 = mul_val0 - 8;
850  mul_val3 = mul_val1 + 8;
851 
852  SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
853  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
854  mul_val0, mul_val1, mul_val2, mul_val3,
855  res0, res1, 15, 17, 6);
856  ST_SH2(res0, res1, dst, stride);
857  dst += (2 * stride);
858 
859  SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
860  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
861  mul_val0, mul_val1, mul_val2, mul_val3,
862  res0, res1, 13, 19, 6);
863  ST_SH2(res0, res1, dst, stride);
864  dst += (2 * stride);
865 
866  SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
867  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
868  mul_val0, mul_val1, mul_val2, mul_val3,
869  res0, res1, 11, 21, 6);
870  ST_SH2(res0, res1, dst, stride);
871  dst += (2 * stride);
872 
873  SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
874  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
875  mul_val0, mul_val1, mul_val2, mul_val3,
876  res0, res1, 9, 23, 6);
877  ST_SH2(res0, res1, dst, stride);
878  dst += (2 * stride);
879 
880  SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
881  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
882  mul_val0, mul_val1, mul_val2, mul_val3,
883  res0, res1, 7, 25, 6);
884  ST_SH2(res0, res1, dst, stride);
885  dst += (2 * stride);
886 
887  SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
888  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
889  mul_val0, mul_val1, mul_val2, mul_val3,
890  res0, res1, 5, 27, 6);
891  ST_SH2(res0, res1, dst, stride);
892  dst += (2 * stride);
893 
894  SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
895  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
896  mul_val0, mul_val1, mul_val2, mul_val3,
897  res0, res1, 3, 29, 6);
898  ST_SH2(res0, res1, dst, stride);
899  dst += (2 * stride);
900 
901  SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
902  HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
903  mul_val0, mul_val1, mul_val2, mul_val3,
904  res0, res1, 1, 31, 6);
905  ST_SH2(res0, res1, dst, stride);
906 }
907 
/*
 * 32x32 planar intra prediction, assembled from four 16x16 tiles.
 *
 * The two upper tiles use process_intra_upper_16x16_msa, the two lower
 * tiles use process_intra_lower_16x16_msa with src_left advanced by 16.
 * The right-hand tiles pass src_top + 16, dst + 16 and an offset of 16.
 */
static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    uint8_t *dst_lower = dst + (16 * stride);
    const uint8_t *left_lower = src_left + 16;

    /* Rows 0..15 */
    process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_upper_16x16_msa(src_top + 16, src_left,
                                  dst + 16, stride, 16);

    /* Rows 16..31 */
    process_intra_lower_16x16_msa(src_top, left_lower, dst_lower, stride, 0);
    process_intra_lower_16x16_msa(src_top + 16, left_lower,
                                  dst_lower + 16, stride, 16);
}
922 
923 static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
924  const uint8_t *src_left,
925  uint8_t *dst,
926  int32_t stride,
927  int32_t mode)
928 {
929  int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
930  uint8_t ref_array[3 * 32 + 4];
931  uint8_t *ref_tmp = ref_array + 4;
932  const uint8_t *ref;
933  int32_t last;
934  int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
935  int32_t idx2, fact_val2, idx3, fact_val3;
936  int32_t angle, angle_loop;
937  int32_t inv_angle_val, offset;
938  uint64_t tmp0;
939  v16i8 top0, top1, top2, top3;
940  v16i8 dst_val0;
941  v16i8 zero = { 0 };
942  v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
943  v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
944 
945  angle = intra_pred_angle_up[mode - 18];
946  inv_angle_val = inv_angle[mode - 18];
947  last = (angle) >> 3;
948  angle_loop = angle;
949 
950  ref = src_top - 1;
951  if (angle < 0 && last < -1) {
952  inv_angle_val = inv_angle[mode - 18];
953 
954  tmp0 = LD(ref);
955  SD(tmp0, ref_tmp);
956 
957  for (h_cnt = last; h_cnt <= -1; h_cnt++) {
958  offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
959  ref_tmp[h_cnt] = src_left[offset];
960  }
961 
962  ref = ref_tmp;
963  }
964 
965  idx0 = angle_loop >> 5;
966  fact_val0 = angle_loop & 31;
967  angle_loop += angle;
968 
969  idx1 = angle_loop >> 5;
970  fact_val1 = angle_loop & 31;
971  angle_loop += angle;
972 
973  idx2 = angle_loop >> 5;
974  fact_val2 = angle_loop & 31;
975  angle_loop += angle;
976 
977  idx3 = angle_loop >> 5;
978  fact_val3 = angle_loop & 31;
979 
980  top0 = LD_SB(ref + idx0 + 1);
981  top1 = LD_SB(ref + idx1 + 1);
982  top2 = LD_SB(ref + idx2 + 1);
983  top3 = LD_SB(ref + idx3 + 1);
984 
985  fact0 = __msa_fill_h(fact_val0);
986  fact1 = __msa_fill_h(32 - fact_val0);
987 
988  fact2 = __msa_fill_h(fact_val1);
989  fact3 = __msa_fill_h(32 - fact_val1);
990 
991  fact4 = __msa_fill_h(fact_val2);
992  fact5 = __msa_fill_h(32 - fact_val2);
993 
994  fact6 = __msa_fill_h(fact_val3);
995  fact7 = __msa_fill_h(32 - fact_val3);
996 
997  ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
998  ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
999  ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
1000  diff0, diff2, diff4, diff6);
1001  SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
1002  diff1, diff3, diff5, diff7);
1003  ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1004  ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1005  MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1006 
1007  diff1 += diff0 * fact1;
1008  diff3 += diff2 * fact3;
1009 
1010  SRARI_H2_SH(diff1, diff3, 5);
1011  dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
1012  ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
1013 }
1014 
1015 static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
1016  const uint8_t *src_left,
1017  uint8_t *dst,
1018  int32_t stride,
1019  int32_t mode)
1020 {
1021  int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1022  uint8_t ref_array[3 * 32 + 4];
1023  uint8_t *ref_tmp = ref_array + 8;
1024  const uint8_t *ref;
1025  const uint8_t *src_left_tmp = src_left - 1;
1026  int32_t last, offset;
1027  int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1028  int32_t idx2, fact_val2, idx3, fact_val3;
1029  int32_t angle, angle_loop;
1030  int32_t inv_angle_val, inv_angle_val_loop;
1031  int32_t tmp0, tmp1, tmp2;
1032  v16i8 top0, top1, top2, top3;
1033  v16u8 dst_val0, dst_val1;
1034  v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1035  v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1036 
1037  angle = intra_pred_angle_up[mode - 18];
1038  inv_angle_val = inv_angle[mode - 18];
1039  last = (angle) >> 2;
1040  angle_loop = angle;
1041 
1042  ref = src_top - 1;
1043  if (last < -1) {
1044  inv_angle_val_loop = inv_angle_val * last;
1045 
1046  tmp0 = LW(ref);
1047  tmp1 = LW(ref + 4);
1048  tmp2 = LW(ref + 8);
1049  SW(tmp0, ref_tmp);
1050  SW(tmp1, ref_tmp + 4);
1051  SW(tmp2, ref_tmp + 8);
1052 
1053  for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1054  offset = (inv_angle_val_loop + 128) >> 8;
1055  ref_tmp[h_cnt] = src_left_tmp[offset];
1056  inv_angle_val_loop += inv_angle_val;
1057  }
1058  ref = ref_tmp;
1059  }
1060 
1061  for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1062  idx0 = (angle_loop) >> 5;
1063  fact_val0 = (angle_loop) & 31;
1064  angle_loop += angle;
1065 
1066  idx1 = (angle_loop) >> 5;
1067  fact_val1 = (angle_loop) & 31;
1068  angle_loop += angle;
1069 
1070  idx2 = (angle_loop) >> 5;
1071  fact_val2 = (angle_loop) & 31;
1072  angle_loop += angle;
1073 
1074  idx3 = (angle_loop) >> 5;
1075  fact_val3 = (angle_loop) & 31;
1076  angle_loop += angle;
1077 
1078  top0 = LD_SB(ref + idx0 + 1);
1079  top1 = LD_SB(ref + idx1 + 1);
1080  top2 = LD_SB(ref + idx2 + 1);
1081  top3 = LD_SB(ref + idx3 + 1);
1082 
1083  fact0 = __msa_fill_h(fact_val0);
1084  fact1 = __msa_fill_h(32 - fact_val0);
1085  fact2 = __msa_fill_h(fact_val1);
1086  fact3 = __msa_fill_h(32 - fact_val1);
1087  fact4 = __msa_fill_h(fact_val2);
1088  fact5 = __msa_fill_h(32 - fact_val2);
1089  fact6 = __msa_fill_h(fact_val3);
1090  fact7 = __msa_fill_h(32 - fact_val3);
1091 
1092  UNPCK_UB_SH(top0, diff0, diff1);
1093  UNPCK_UB_SH(top1, diff2, diff3);
1094  UNPCK_UB_SH(top2, diff4, diff5);
1095  UNPCK_UB_SH(top3, diff6, diff7);
1096 
1097  SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
1098  diff1, diff3, diff5, diff7);
1099  MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1100  diff1, diff3, diff5, diff7);
1101 
1102  diff1 += diff0 * fact1;
1103  diff3 += diff2 * fact3;
1104  diff5 += diff4 * fact5;
1105  diff7 += diff6 * fact7;
1106 
1107  SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
1108  PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
1109  ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
1110  dst += (4 * stride);
1111  }
1112 }
1113 
1114 static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
1115  const uint8_t *src_left,
1116  uint8_t *dst,
1117  int32_t stride,
1118  int32_t mode)
1119 {
1120  int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1121  int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1122  int32_t idx2, fact_val2, idx3, fact_val3;
1123  int32_t tmp0;
1124  int32_t angle, angle_loop, offset;
1125  int32_t inv_angle_val, inv_angle_val_loop;
1126  uint8_t ref_array[3 * 32 + 4];
1127  uint8_t *ref_tmp = ref_array + 16;
1128  const uint8_t *ref;
1129  const uint8_t *src_left_tmp = src_left - 1;
1130  int32_t last;
1131  v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1132  v16i8 dst0, dst1, dst2, dst3;
1133  v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1134  v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1135  v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1136 
1137  angle = intra_pred_angle_up[mode - 18];
1138  inv_angle_val = inv_angle[mode - 18];
1139  last = angle >> 1;
1140  angle_loop = angle;
1141 
1142  ref = src_top - 1;
1143  if (last < -1) {
1144  inv_angle_val_loop = inv_angle_val * last;
1145 
1146  top0 = LD_UB(ref);
1147  tmp0 = LW(ref + 16);
1148  ST_UB(top0, ref_tmp);
1149  SW(tmp0, ref_tmp + 16);
1150 
1151  for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1152  offset = (inv_angle_val_loop + 128) >> 8;
1153  ref_tmp[h_cnt] = src_left_tmp[offset];
1154  inv_angle_val_loop += inv_angle_val;
1155  }
1156  ref = ref_tmp;
1157  }
1158 
1159  for (v_cnt = 4; v_cnt--;) {
1160  idx0 = (angle_loop) >> 5;
1161  fact_val0 = (angle_loop) & 31;
1162  angle_loop += angle;
1163 
1164  idx1 = (angle_loop) >> 5;
1165  fact_val1 = (angle_loop) & 31;
1166  angle_loop += angle;
1167 
1168  idx2 = (angle_loop) >> 5;
1169  fact_val2 = (angle_loop) & 31;
1170  angle_loop += angle;
1171 
1172  idx3 = (angle_loop) >> 5;
1173  fact_val3 = (angle_loop) & 31;
1174  angle_loop += angle;
1175 
1176  LD_UB2(ref + idx0 + 1, 16, top0, top1);
1177  LD_UB2(ref + idx1 + 1, 16, top2, top3);
1178  LD_UB2(ref + idx2 + 1, 16, top4, top5);
1179  LD_UB2(ref + idx3 + 1, 16, top6, top7);
1180 
1181  fact0 = __msa_fill_h(fact_val0);
1182  fact1 = __msa_fill_h(32 - fact_val0);
1183  fact2 = __msa_fill_h(fact_val1);
1184  fact3 = __msa_fill_h(32 - fact_val1);
1185  fact4 = __msa_fill_h(fact_val2);
1186  fact5 = __msa_fill_h(32 - fact_val2);
1187  fact6 = __msa_fill_h(fact_val3);
1188  fact7 = __msa_fill_h(32 - fact_val3);
1189 
1190  SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1191  top1, top3, top5, top7);
1192  UNPCK_UB_SH(top0, diff0, diff1);
1193  UNPCK_UB_SH(top1, diff2, diff3);
1194  UNPCK_UB_SH(top2, diff4, diff5);
1195  UNPCK_UB_SH(top3, diff6, diff7);
1196  UNPCK_UB_SH(top4, diff8, diff9);
1197  UNPCK_UB_SH(top5, diff10, diff11);
1198  UNPCK_UB_SH(top6, diff12, diff13);
1199  UNPCK_UB_SH(top7, diff14, diff15);
1200 
1201  MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1202  diff2, diff3, diff6, diff7);
1203  MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1204  diff10, diff11, diff14, diff15);
1205 
1206  diff2 += diff0 * fact1;
1207  diff3 += diff1 * fact1;
1208  diff6 += diff4 * fact3;
1209  diff7 += diff5 * fact3;
1210  diff10 += diff8 * fact5;
1211  diff11 += diff9 * fact5;
1212  diff14 += diff12 * fact7;
1213  diff15 += diff13 * fact7;
1214 
1215  SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1216  SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1217  PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1218  dst0, dst1, dst2, dst3);
1219  ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
1220  dst += (4 * stride);
1221  }
1222 }
1223 
1224 static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
1225  const uint8_t *src_left,
1226  uint8_t *dst,
1227  int32_t stride,
1228  int32_t mode)
1229 {
1230  int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1231  uint8_t ref_array[3 * 32 + 4];
1232  uint8_t *ref_tmp;
1233  const uint8_t *ref;
1234  const uint8_t *src_left_tmp = src_left - 1;
1235  int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1236  int32_t tmp0, tmp1, tmp2, tmp3;
1237  int32_t angle, angle_loop;
1238  int32_t inv_angle_val, inv_angle_val_loop;
1239  int32_t last, offset;
1240  v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1241  v16i8 dst0, dst1, dst2, dst3;
1242  v8i16 fact0, fact1, fact2, fact3;
1243  v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1244  v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1245 
1246  ref_tmp = ref_array + 32;
1247 
1248  angle = intra_pred_angle_up[mode - 18];
1249  inv_angle_val = inv_angle[mode - 18];
1250  last = angle;
1251  angle_loop = angle;
1252 
1253  ref = src_top - 1;
1254  if (last < -1) {
1255  inv_angle_val_loop = inv_angle_val * last;
1256  LD_UB2(ref, 16, top0, top1);
1257  tmp0 = ref[32];
1258  tmp1 = ref[33];
1259  tmp2 = ref[34];
1260  tmp3 = ref[35];
1261 
1262  ST_UB2(top0, top1, ref_tmp, 16);
1263  ref_tmp[32] = tmp0;
1264  ref_tmp[33] = tmp1;
1265  ref_tmp[34] = tmp2;
1266  ref_tmp[35] = tmp3;
1267 
1268  for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1269  offset = (inv_angle_val_loop + 128) >> 8;
1270  ref_tmp[h_cnt] = src_left_tmp[offset];
1271  inv_angle_val_loop += inv_angle_val;
1272  }
1273 
1274  ref = ref_tmp;
1275  }
1276 
1277  for (v_cnt = 16; v_cnt--;) {
1278  idx0 = (angle_loop) >> 5;
1279  fact_val0 = (angle_loop) & 31;
1280  angle_loop += angle;
1281 
1282  idx1 = (angle_loop) >> 5;
1283  fact_val1 = (angle_loop) & 31;
1284  angle_loop += angle;
1285 
1286  top0 = LD_UB(ref + idx0 + 1);
1287  top4 = LD_UB(ref + idx1 + 1);
1288  top1 = LD_UB(ref + idx0 + 17);
1289  top5 = LD_UB(ref + idx1 + 17);
1290  top3 = LD_UB(ref + idx0 + 33);
1291  top7 = LD_UB(ref + idx1 + 33);
1292 
1293  fact0 = __msa_fill_h(fact_val0);
1294  fact1 = __msa_fill_h(32 - fact_val0);
1295  fact2 = __msa_fill_h(fact_val1);
1296  fact3 = __msa_fill_h(32 - fact_val1);
1297 
1298  top2 = top1;
1299  top6 = top5;
1300 
1301  SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1302  top1, top3, top5, top7);
1303  UNPCK_UB_SH(top0, diff0, diff1);
1304  UNPCK_UB_SH(top1, diff2, diff3);
1305  UNPCK_UB_SH(top2, diff4, diff5);
1306  UNPCK_UB_SH(top3, diff6, diff7);
1307  UNPCK_UB_SH(top4, diff8, diff9);
1308  UNPCK_UB_SH(top5, diff10, diff11);
1309  UNPCK_UB_SH(top6, diff12, diff13);
1310  UNPCK_UB_SH(top7, diff14, diff15);
1311 
1312  MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1313  diff2, diff3, diff6, diff7);
1314  MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1315  diff10, diff11, diff14, diff15);
1316 
1317  diff2 += diff0 * fact1;
1318  diff3 += diff1 * fact1;
1319  diff6 += diff4 * fact1;
1320  diff7 += diff5 * fact1;
1321  diff10 += diff8 * fact3;
1322  diff11 += diff9 * fact3;
1323  diff14 += diff12 * fact3;
1324  diff15 += diff13 * fact3;
1325 
1326  SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1327  SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1328  PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1329  dst0, dst1, dst2, dst3);
1330 
1331  ST_SB2(dst0, dst1, dst, 16);
1332  dst += stride;
1333  ST_SB2(dst2, dst3, dst, 16);
1334  dst += stride;
1335  }
1336 }
1337 
1338 static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
1339  const uint8_t *src_left,
1340  uint8_t *dst,
1341  int32_t stride,
1342  int32_t mode)
1343 {
1344  int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1345  uint8_t ref_array[3 * 32 + 4];
1346  uint8_t *ref_tmp = ref_array + 4;
1347  const uint8_t *ref;
1348  int32_t last, offset;
1349  int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
1350  int32_t idx2, fact_val2, idx3, fact_val3;
1351  int32_t angle, angle_loop, inv_angle_val;
1352  uint64_t tmp0;
1353  v16i8 dst_val0, dst_val1;
1354  v16u8 top0, top1, top2, top3;
1355  v16u8 zero = { 0 };
1356  v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1357  v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1358 
1359  angle = intra_pred_angle_low[mode - 2];
1360  last = angle >> 3;
1361  angle_loop = angle;
1362 
1363  ref = src_left - 1;
1364  if (last < -1) {
1365  inv_angle_val = inv_angle[mode - 11];
1366 
1367  tmp0 = LD(ref);
1368  SD(tmp0, ref_tmp);
1369 
1370  for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1371  offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
1372  ref_tmp[h_cnt] = src_top[offset];
1373  }
1374 
1375  ref = ref_tmp;
1376  }
1377 
1378  idx0 = angle_loop >> 5;
1379  fact_val0 = angle_loop & 31;
1380  angle_loop += angle;
1381 
1382  idx1 = angle_loop >> 5;
1383  fact_val1 = angle_loop & 31;
1384  angle_loop += angle;
1385 
1386  idx2 = angle_loop >> 5;
1387  fact_val2 = angle_loop & 31;
1388  angle_loop += angle;
1389 
1390  idx3 = angle_loop >> 5;
1391  fact_val3 = angle_loop & 31;
1392 
1393  top0 = LD_UB(ref + idx0 + 1);
1394  top1 = LD_UB(ref + idx1 + 1);
1395  top2 = LD_UB(ref + idx2 + 1);
1396  top3 = LD_UB(ref + idx3 + 1);
1397 
1398  fact0 = __msa_fill_h(fact_val0);
1399  fact1 = __msa_fill_h(32 - fact_val0);
1400  fact2 = __msa_fill_h(fact_val1);
1401  fact3 = __msa_fill_h(32 - fact_val1);
1402  fact4 = __msa_fill_h(fact_val2);
1403  fact5 = __msa_fill_h(32 - fact_val2);
1404  fact6 = __msa_fill_h(fact_val3);
1405  fact7 = __msa_fill_h(32 - fact_val3);
1406 
1407  ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
1408  ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
1409  ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
1410  diff0, diff2, diff4, diff6);
1411  SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
1412  diff1, diff3, diff5, diff7);
1413  ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1414  ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1415  MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1416 
1417  diff1 += diff0 * fact1;
1418  diff3 += diff2 * fact3;
1419 
1420  SRARI_H2_SH(diff1, diff3, 5);
1421  PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);
1422 
1423  diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
1424  diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);
1425 
1426  diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);
1427 
1428  dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
1429  dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
1430 
1431  ST_W2(dst_val0, 0, 1, dst, stride);
1432  ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
1433 }
1434 
1435 static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
1436  const uint8_t *src_left,
1437  uint8_t *dst,
1438  int32_t stride,
1439  int32_t mode)
1440 {
1441  int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1442  uint8_t ref_array[3 * 32 + 4];
1443  uint8_t *ref_tmp = ref_array + 8;
1444  const uint8_t *ref;
1445  const uint8_t *src_top_tmp = src_top - 1;
1446  uint8_t *dst_org;
1447  int32_t last, offset, tmp0, tmp1, tmp2;
1448  int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1449  int32_t idx2, fact_val2, idx3, fact_val3;
1450  int32_t angle, angle_loop, inv_angle_val;
1451  v16i8 top0, top1, top2, top3;
1452  v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
1453  v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1454  v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1455 
1456  angle = intra_pred_angle_low[mode - 2];
1457  last = (angle) >> 2;
1458  angle_loop = angle;
1459 
1460  ref = src_left - 1;
1461  if (last < -1) {
1462  inv_angle_val = inv_angle[mode - 11];
1463 
1464  tmp0 = LW(ref);
1465  tmp1 = LW(ref + 4);
1466  tmp2 = LW(ref + 8);
1467  SW(tmp0, ref_tmp);
1468  SW(tmp1, ref_tmp + 4);
1469  SW(tmp2, ref_tmp + 8);
1470 
1471  for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1472  offset = (h_cnt * inv_angle_val + 128) >> 8;
1473  ref_tmp[h_cnt] = src_top_tmp[offset];
1474  }
1475 
1476  ref = ref_tmp;
1477  }
1478 
1479  for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1480  dst_org = dst;
1481 
1482  idx0 = angle_loop >> 5;
1483  fact_val0 = angle_loop & 31;
1484  angle_loop += angle;
1485 
1486  idx1 = angle_loop >> 5;
1487  fact_val1 = angle_loop & 31;
1488  angle_loop += angle;
1489 
1490  idx2 = angle_loop >> 5;
1491  fact_val2 = angle_loop & 31;
1492  angle_loop += angle;
1493 
1494  idx3 = angle_loop >> 5;
1495  fact_val3 = angle_loop & 31;
1496  angle_loop += angle;
1497 
1498  top0 = LD_SB(ref + idx0 + 1);
1499  top1 = LD_SB(ref + idx1 + 1);
1500  top2 = LD_SB(ref + idx2 + 1);
1501  top3 = LD_SB(ref + idx3 + 1);
1502 
1503  fact0 = __msa_fill_h(fact_val0);
1504  fact1 = __msa_fill_h(32 - fact_val0);
1505  fact2 = __msa_fill_h(fact_val1);
1506  fact3 = __msa_fill_h(32 - fact_val1);
1507  fact4 = __msa_fill_h(fact_val2);
1508  fact5 = __msa_fill_h(32 - fact_val2);
1509  fact6 = __msa_fill_h(fact_val3);
1510  fact7 = __msa_fill_h(32 - fact_val3);
1511 
1512  UNPCK_UB_SH(top0, diff0, diff1);
1513  UNPCK_UB_SH(top1, diff2, diff3);
1514  UNPCK_UB_SH(top2, diff4, diff5);
1515  UNPCK_UB_SH(top3, diff6, diff7);
1516  SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
1517  diff1, diff3, diff5, diff7);
1518  MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1519  diff1, diff3, diff5, diff7);
1520 
1521  diff1 += diff0 * fact1;
1522  diff3 += diff2 * fact3;
1523  diff5 += diff4 * fact5;
1524  diff7 += diff6 * fact7;
1525 
1526  SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
1527  PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
1528  dst_val0, dst_val1, dst_val2, dst_val3);
1529  ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1530  ILVRL_H2_SH(diff1, diff0, diff3, diff4);
1531  ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1532  dst += 4;
1533  }
1534 }
1535 
1536 static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
1537  const uint8_t *src_left,
1538  uint8_t *dst,
1539  int32_t stride,
1540  int32_t mode)
1541 {
1542  int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1543  int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1544  int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
1545  v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1546  v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1547  v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1548  v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1549  v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1550  int32_t angle, angle_loop, inv_angle_val, offset;
1551  uint8_t ref_array[3 * 32 + 4];
1552  uint8_t *ref_tmp = ref_array + 16;
1553  const uint8_t *ref, *src_top_tmp = src_top - 1;
1554  uint8_t *dst_org;
1555  int32_t last;
1556 
1557  angle = intra_pred_angle_low[mode - 2];
1558  last = (angle) >> 1;
1559  angle_loop = angle;
1560 
1561  ref = src_left - 1;
1562  if (last < -1) {
1563  inv_angle_val = inv_angle[mode - 11];
1564 
1565  top0 = LD_SB(ref);
1566  tmp0 = LW(ref + 16);
1567  ST_SB(top0, ref_tmp);
1568  SW(tmp0, ref_tmp + 16);
1569 
1570  for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1571  offset = (h_cnt * inv_angle_val + 128) >> 8;
1572  ref_tmp[h_cnt] = src_top_tmp[offset];
1573  }
1574 
1575  ref = ref_tmp;
1576  }
1577 
1578  for (v_cnt = 0; v_cnt < 4; v_cnt++) {
1579  dst_org = dst;
1580 
1581  idx0 = angle_loop >> 5;
1582  fact_val0 = angle_loop & 31;
1583  angle_loop += angle;
1584 
1585  idx1 = angle_loop >> 5;
1586  fact_val1 = angle_loop & 31;
1587  angle_loop += angle;
1588 
1589  idx2 = angle_loop >> 5;
1590  fact_val2 = angle_loop & 31;
1591  angle_loop += angle;
1592 
1593  idx3 = angle_loop >> 5;
1594  fact_val3 = angle_loop & 31;
1595  angle_loop += angle;
1596 
1597  LD_SB2(ref + idx0 + 1, 16, top0, top1);
1598  LD_SB2(ref + idx1 + 1, 16, top2, top3);
1599  LD_SB2(ref + idx2 + 1, 16, top4, top5);
1600  LD_SB2(ref + idx3 + 1, 16, top6, top7);
1601 
1602  fact0 = __msa_fill_h(fact_val0);
1603  fact1 = __msa_fill_h(32 - fact_val0);
1604  fact2 = __msa_fill_h(fact_val1);
1605  fact3 = __msa_fill_h(32 - fact_val1);
1606  fact4 = __msa_fill_h(fact_val2);
1607  fact5 = __msa_fill_h(32 - fact_val2);
1608  fact6 = __msa_fill_h(fact_val3);
1609  fact7 = __msa_fill_h(32 - fact_val3);
1610 
1611  SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1612  top1, top3, top5, top7);
1613 
1614  UNPCK_UB_SH(top0, diff0, diff1);
1615  UNPCK_UB_SH(top1, diff2, diff3);
1616  UNPCK_UB_SH(top2, diff4, diff5);
1617  UNPCK_UB_SH(top3, diff6, diff7);
1618  UNPCK_UB_SH(top4, diff8, diff9);
1619  UNPCK_UB_SH(top5, diff10, diff11);
1620  UNPCK_UB_SH(top6, diff12, diff13);
1621  UNPCK_UB_SH(top7, diff14, diff15);
1622 
1623  MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1624  diff2, diff3, diff6, diff7);
1625  MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1626  diff10, diff11, diff14, diff15);
1627 
1628  diff2 += diff0 * fact1;
1629  diff3 += diff1 * fact1;
1630  diff6 += diff4 * fact3;
1631  diff7 += diff5 * fact3;
1632  diff10 += diff8 * fact5;
1633  diff11 += diff9 * fact5;
1634  diff14 += diff12 * fact7;
1635  diff15 += diff13 * fact7;
1636 
1637  SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1638  SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1639  PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1640  dst_val0, dst_val1, dst_val2, dst_val3);
1641  ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1642  ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
1643  ILVRL_H2_SH(diff1, diff0, diff4, diff5);
1644  ILVRL_H2_SH(diff3, diff2, diff6, diff7);
1645  ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1646  dst_org += (8 * stride);
1647  ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1648  dst += 4;
1649  }
1650 }
1651 
1652 static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
1653  const uint8_t *src_left,
1654  uint8_t *dst,
1655  int32_t stride,
1656  int32_t mode)
1657 {
1658  int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1659  int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
1660  v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1661  v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1662  v8i16 fact0, fact1, fact2, fact3;
1663  v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1664  v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1665  int32_t angle, angle_loop, inv_angle_val, offset;
1666  uint8_t ref_array[3 * 32 + 4];
1667  uint8_t *ref_tmp = ref_array + 32;
1668  const uint8_t *ref, *src_top_tmp = src_top - 1;
1669  uint8_t *dst_org;
1670  int32_t last;
1671 
1672  angle = intra_pred_angle_low[mode - 2];
1673  last = angle;
1674  angle_loop = angle;
1675 
1676  ref = src_left - 1;
1677  if (last < -1) {
1678  inv_angle_val = inv_angle[mode - 11];
1679 
1680  LD_SB2(ref, 16, top0, top1);
1681  tmp0 = LW(ref + 32);
1682  ST_SB2(top0, top1, ref_tmp, 16);
1683  SW(tmp0, ref_tmp + 32);
1684 
1685  for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1686  offset = (h_cnt * inv_angle_val + 128) >> 8;
1687  ref_tmp[h_cnt] = src_top_tmp[offset];
1688  }
1689 
1690  ref = ref_tmp;
1691  }
1692 
1693  for (v_cnt = 0; v_cnt < 16; v_cnt++) {
1694  dst_org = dst;
1695  idx0 = angle_loop >> 5;
1696  fact_val0 = angle_loop & 31;
1697  angle_loop += angle;
1698 
1699  idx1 = angle_loop >> 5;
1700  fact_val1 = angle_loop & 31;
1701  angle_loop += angle;
1702 
1703  top0 = LD_SB(ref + idx0 + 1);
1704  top4 = LD_SB(ref + idx1 + 1);
1705  top1 = LD_SB(ref + idx0 + 17);
1706  top5 = LD_SB(ref + idx1 + 17);
1707  top3 = LD_SB(ref + idx0 + 33);
1708  top7 = LD_SB(ref + idx1 + 33);
1709 
1710  fact0 = __msa_fill_h(fact_val0);
1711  fact1 = __msa_fill_h(32 - fact_val0);
1712  fact2 = __msa_fill_h(fact_val1);
1713  fact3 = __msa_fill_h(32 - fact_val1);
1714 
1715  top2 = top1;
1716  top6 = top5;
1717 
1718  SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1719  top1, top3, top5, top7);
1720 
1721  UNPCK_UB_SH(top0, diff0, diff1);
1722  UNPCK_UB_SH(top1, diff2, diff3);
1723  UNPCK_UB_SH(top2, diff4, diff5);
1724  UNPCK_UB_SH(top3, diff6, diff7);
1725  UNPCK_UB_SH(top4, diff8, diff9);
1726  UNPCK_UB_SH(top5, diff10, diff11);
1727  UNPCK_UB_SH(top6, diff12, diff13);
1728  UNPCK_UB_SH(top7, diff14, diff15);
1729 
1730  MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1731  diff2, diff3, diff6, diff7);
1732  MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1733  diff10, diff11, diff14, diff15);
1734 
1735  diff2 += diff0 * fact1;
1736  diff3 += diff1 * fact1;
1737  diff6 += diff4 * fact1;
1738  diff7 += diff5 * fact1;
1739  diff10 += diff8 * fact3;
1740  diff11 += diff9 * fact3;
1741  diff14 += diff12 * fact3;
1742  diff15 += diff13 * fact3;
1743 
1744  SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1745  SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1746  PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1747  dst_val0, dst_val1, dst_val2, dst_val3);
1748  ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
1749  ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);
1750 
1751  ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1752  dst_org += (8 * stride);
1753  ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1754  dst_org += (8 * stride);
1755  ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1756  dst_org += (8 * stride);
1757  ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1758  dst_org += (8 * stride);
1759 
1760  dst += 2;
1761  }
1762 }
1763 
1764 static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
1765  int32_t dst_stride)
1766 {
1767  uint32_t row;
1768  v16u8 src1, src2;
1769 
1770  src1 = LD_UB(src);
1771  src2 = LD_UB(src + 16);
1772 
1773  for (row = 32; row--;) {
1774  ST_UB2(src1, src2, dst, 16);
1775  dst += dst_stride;
1776  }
1777 }
1778 
/*
 * Planar intra prediction entry point for 4x4 blocks: forwards to
 * hevc_intra_pred_plane_4x4_msa() with the arguments in that helper's order.
 */
void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
}
1786 
/*
 * Planar intra prediction entry point for 8x8 blocks: forwards to
 * hevc_intra_pred_plane_8x8_msa() with the arguments in that helper's order.
 */
void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
}
1794 
/*
 * Planar intra prediction entry point for 16x16 blocks: forwards to
 * hevc_intra_pred_plane_16x16_msa() with the arguments in that helper's order.
 */
void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
}
1802 
/*
 * Planar intra prediction entry point for 32x32 blocks: forwards to
 * hevc_intra_pred_plane_32x32_msa() with the arguments in that helper's order.
 */
void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
}
1810 
/*
 * DC intra prediction dispatcher.
 *
 * log2 selects the block-size specific helper: 2 -> 4x4, 3 -> 8x8,
 * 4 -> 16x16, 5 -> 32x32.  Any other value leaves dst untouched.
 * c_idx is passed on to every helper except the 32x32 one, which does not
 * take it.
 */
void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
                               const uint8_t *src_left,
                               ptrdiff_t stride, int log2, int c_idx)
{
    if (log2 == 2) {
        hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
    } else if (log2 == 3) {
        hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
    } else if (log2 == 4) {
        hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
    } else if (log2 == 5) {
        hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
    }
}
1833 
/*
 * Angular intra prediction entry point for 4x4 blocks.
 *
 * mode 10 selects the horizontal helper and mode 26 the vertical helper
 * (both receive c_idx); any other mode >= 18 goes to the "upper" angular
 * helper and the remaining modes to the "lower" one.
 */
void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    if (mode == 10) {
        hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
    } else if (mode == 26) {
        hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
    } else if (mode >= 18) {
        hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
                                                 dst, stride, mode);
    } else {
        hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
                                                 dst, stride, mode);
    }
}
1851 
/*
 * Angular intra prediction entry point for 8x8 blocks.
 *
 * mode 10 selects the horizontal helper and mode 26 the vertical helper
 * (both receive c_idx); any other mode >= 18 goes to the "upper" angular
 * helper and the remaining modes to the "lower" one.
 */
void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    if (mode == 10) {
        hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
    } else if (mode == 26) {
        hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
    } else if (mode >= 18) {
        hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
                                                 dst, stride, mode);
    } else {
        hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
                                                 dst, stride, mode);
    }
}
1869 
/*
 * Angular intra prediction entry point for 16x16 blocks.
 *
 * mode 10 selects the horizontal helper and mode 26 the vertical helper
 * (both receive c_idx); any other mode >= 18 goes to the "upper" angular
 * helper and the remaining modes to the "lower" one.
 */
void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    if (mode == 10) {
        hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
    } else if (mode == 26) {
        hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
    } else if (mode >= 18) {
        hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
                                                  dst, stride, mode);
    } else {
        hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
                                                  dst, stride, mode);
    }
}
1887 
/*
 * Angular intra prediction entry point for 32x32 blocks.
 *
 * mode 10 selects the horizontal helper and mode 26 the plain vertical copy
 * (neither takes c_idx at this size); any other mode >= 18 goes to the
 * "upper" angular helper and the remaining modes to the "lower" one.
 */
void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    if (mode == 10) {
        hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
    } else if (mode == 26) {
        intra_predict_vert_32x32_msa(src_top, dst, stride);
    } else if (mode >= 18) {
        hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
                                                  dst, stride, mode);
    } else {
        hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
                                                  dst, stride, mode);
    }
}
1905 
1907  int x0, int y0, int c_idx)
1908 {
1909  v16u8 vec0;
1910  const HEVCSPS *const sps = pps->sps;
1911  const HEVCContext *const s = lc->parent;
1912  int i;
1913  int hshift = sps->hshift[c_idx];
1914  int vshift = sps->vshift[c_idx];
1915  int size_in_luma_h = 16 << hshift;
1916  int size_in_tbs_h = size_in_luma_h >> sps->log2_min_tb_size;
1917  int size_in_luma_v = 16 << vshift;
1918  int size_in_tbs_v = size_in_luma_v >> sps->log2_min_tb_size;
1919  int x = x0 >> hshift;
1920  int y = y0 >> vshift;
1921  int x_tb = (x0 >> sps->log2_min_tb_size) & sps->tb_mask;
1922  int y_tb = (y0 >> sps->log2_min_tb_size) & sps->tb_mask;
1923 
1924  int cur_tb_addr =
1925  pps->min_tb_addr_zs[(y_tb) * (sps->tb_mask + 2) + (x_tb)];
1926 
1927  ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
1928  uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
1929 
1930  int min_pu_width = sps->min_pu_width;
1931 
1932  enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
1933  lc->tu.intra_pred_mode;
1934  uint32_t a;
1935  uint8_t left_array[2 * 32 + 1];
1936  uint8_t filtered_left_array[2 * 32 + 1];
1937  uint8_t top_array[2 * 32 + 1];
1938  uint8_t filtered_top_array[2 * 32 + 1];
1939 
1940  uint8_t *left = left_array + 1;
1941  uint8_t *top = top_array + 1;
1942  uint8_t *filtered_left = filtered_left_array + 1;
1943  uint8_t *filtered_top = filtered_top_array + 1;
1944  int cand_bottom_left = lc->na.cand_bottom_left
1945  && cur_tb_addr >
1946  pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & sps->tb_mask) *
1947  (sps->tb_mask + 2) + (x_tb - 1)];
1948  int cand_left = lc->na.cand_left;
1949  int cand_up_left = lc->na.cand_up_left;
1950  int cand_up = lc->na.cand_up;
1951  int cand_up_right = lc->na.cand_up_right
1952  && cur_tb_addr >
1953  pps->min_tb_addr_zs[(y_tb - 1) * (sps->tb_mask + 2) +
1954  ((x_tb + size_in_tbs_h) & sps->tb_mask)];
1955 
1956  int bottom_left_size =
1957  (((y0 + 2 * size_in_luma_v) >
1958  (sps->height) ? (sps->height) : (y0 +
1959  2 * size_in_luma_v)) -
1960  (y0 + size_in_luma_v)) >> vshift;
1961  int top_right_size =
1962  (((x0 + 2 * size_in_luma_h) >
1963  (sps->width) ? (sps->width) : (x0 + 2 * size_in_luma_h)) -
1964  (x0 + size_in_luma_h)) >> hshift;
1965 
1966  if (pps->constrained_intra_pred_flag == 1) {
1967  int size_in_luma_pu_v = ((size_in_luma_v) >> sps->log2_min_pu_size);
1968  int size_in_luma_pu_h = ((size_in_luma_h) >> sps->log2_min_pu_size);
1969  int on_pu_edge_x = !(x0 & ((1 << sps->log2_min_pu_size) - 1));
1970  int on_pu_edge_y = !(y0 & ((1 << sps->log2_min_pu_size) - 1));
1971  if (!size_in_luma_pu_h)
1972  size_in_luma_pu_h++;
1973  if (cand_bottom_left == 1 && on_pu_edge_x) {
1974  int x_left_pu = ((x0 - 1) >> sps->log2_min_pu_size);
1975  int y_bottom_pu =
1976  ((y0 + size_in_luma_v) >> sps->log2_min_pu_size);
1977  int max =
1978  ((size_in_luma_pu_v) >
1979  (sps->min_pu_height -
1980  y_bottom_pu) ? (sps->min_pu_height -
1981  y_bottom_pu) : (size_in_luma_pu_v));
1982  cand_bottom_left = 0;
1983  for (i = 0; i < max; i += 2)
1984  cand_bottom_left |=
1985  ((s->cur_frame->tab_mvf[(x_left_pu) +
1986  (y_bottom_pu +
1987  i) * min_pu_width]).pred_flag ==
1988  PF_INTRA);
1989  }
1990  if (cand_left == 1 && on_pu_edge_x) {
1991  int x_left_pu = ((x0 - 1) >> sps->log2_min_pu_size);
1992  int y_left_pu = ((y0) >> sps->log2_min_pu_size);
1993  int max =
1994  ((size_in_luma_pu_v) >
1995  (sps->min_pu_height -
1996  y_left_pu) ? (sps->min_pu_height -
1997  y_left_pu) : (size_in_luma_pu_v));
1998  cand_left = 0;
1999  for (i = 0; i < max; i += 2)
2000  cand_left |=
2001  ((s->cur_frame->tab_mvf[(x_left_pu) +
2002  (y_left_pu +
2003  i) * min_pu_width]).pred_flag ==
2004  PF_INTRA);
2005  }
2006  if (cand_up_left == 1) {
2007  int x_left_pu = ((x0 - 1) >> sps->log2_min_pu_size);
2008  int y_top_pu = ((y0 - 1) >> sps->log2_min_pu_size);
2009  cand_up_left =
2010  (s->cur_frame->tab_mvf[(x_left_pu) +
2011  (y_top_pu) * min_pu_width]).pred_flag ==
2012  PF_INTRA;
2013  }
2014  if (cand_up == 1 && on_pu_edge_y) {
2015  int x_top_pu = ((x0) >> sps->log2_min_pu_size);
2016  int y_top_pu = ((y0 - 1) >> sps->log2_min_pu_size);
2017  int max =
2018  ((size_in_luma_pu_h) >
2019  (sps->min_pu_width -
2020  x_top_pu) ? (sps->min_pu_width -
2021  x_top_pu) : (size_in_luma_pu_h));
2022  cand_up = 0;
2023  for (i = 0; i < max; i += 2)
2024  cand_up |=
2025  ((s->cur_frame->tab_mvf[(x_top_pu + i) +
2026  (y_top_pu) *
2027  min_pu_width]).pred_flag == PF_INTRA);
2028  }
2029  if (cand_up_right == 1 && on_pu_edge_y) {
2030  int y_top_pu = ((y0 - 1) >> sps->log2_min_pu_size);
2031  int x_right_pu =
2032  ((x0 + size_in_luma_h) >> sps->log2_min_pu_size);
2033  int max =
2034  ((size_in_luma_pu_h) >
2035  (sps->min_pu_width -
2036  x_right_pu) ? (sps->min_pu_width -
2037  x_right_pu) : (size_in_luma_pu_h));
2038  cand_up_right = 0;
2039  for (i = 0; i < max; i += 2)
2040  cand_up_right |=
2041  ((s->cur_frame->tab_mvf[(x_right_pu + i) +
2042  (y_top_pu) *
2043  min_pu_width]).pred_flag == PF_INTRA);
2044  }
2045 
2046  vec0 = (v16u8) __msa_ldi_b(128);
2047 
2048  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2049 
2050  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2051 
2052  top[-1] = 128;
2053  }
2054  if (cand_up_left) {
2055  left[-1] = src[(-1) + stride * (-1)];
2056  top[-1] = left[-1];
2057  }
2058  if (cand_up) {
2059  vec0 = LD_UB(src - stride);
2060  ST_UB(vec0, top);
2061  }
2062  if (cand_up_right) {
2063  vec0 = LD_UB(src - stride + 16);
2064  ST_UB(vec0, (top + 16));
2065 
2066  do {
2067  uint32_t pix =
2068  ((src[(16 + top_right_size - 1) + stride * (-1)]) *
2069  0x01010101U);
2070  for (i = 0; i < (16 - top_right_size); i += 4)
2071  ((((union unaligned_32 *) (top + 16 + top_right_size +
2072  i))->l) = (pix));
2073  } while (0);
2074  }
2075  if (cand_left)
2076  for (i = 0; i < 16; i++)
2077  left[i] = src[(-1) + stride * (i)];
2078  if (cand_bottom_left) {
2079  for (i = 16; i < 16 + bottom_left_size; i++)
2080  left[i] = src[(-1) + stride * (i)];
2081  do {
2082  uint32_t pix =
2083  ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
2084  0x01010101U);
2085  for (i = 0; i < (16 - bottom_left_size); i += 4)
2086  ((((union unaligned_32 *) (left + 16 + bottom_left_size +
2087  i))->l) = (pix));
2088  } while (0);
2089  }
2090 
2091  if (pps->constrained_intra_pred_flag == 1) {
2092  if (cand_bottom_left || cand_left || cand_up_left || cand_up
2093  || cand_up_right) {
2094  int size_max_x =
2095  x0 + ((2 * 16) << hshift) <
2096  sps->width ? 2 * 16 : (sps->width - x0) >> hshift;
2097  int size_max_y =
2098  y0 + ((2 * 16) << vshift) <
2099  sps->height ? 2 * 16 : (sps->height - y0) >> vshift;
2100  int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2101  if (!cand_up_right) {
2102  size_max_x = x0 + ((16) << hshift) < sps->width ?
2103  16 : (sps->width - x0) >> hshift;
2104  }
2105  if (!cand_bottom_left) {
2106  size_max_y = y0 + ((16) << vshift) < sps->height ?
2107  16 : (sps->height - y0) >> vshift;
2108  }
2109  if (cand_bottom_left || cand_left || cand_up_left) {
2110  while (j > -1
2111  &&
2112  !((s->cur_frame->tab_mvf[(((x0 +
2113  ((-1) << hshift)) >> sps->
2114  log2_min_pu_size)) + (((y0 +
2115  ((j) <<
2116  vshift))
2117  >> sps->
2118  log2_min_pu_size))
2119  * min_pu_width]).pred_flag ==
2120  PF_INTRA))
2121  j--;
2122  if (!
2123  ((s->cur_frame->tab_mvf[(((x0 +
2124  ((-1) << hshift)) >> sps->
2125  log2_min_pu_size)) + (((y0 + ((j)
2126  <<
2127  vshift))
2128  >> sps->
2129  log2_min_pu_size))
2130  * min_pu_width]).pred_flag == PF_INTRA)) {
2131  j = 0;
2132  while (j < size_max_x
2133  &&
2134  !((s->cur_frame->tab_mvf[(((x0 +
2135  ((j) << hshift)) >> sps->
2136  log2_min_pu_size)) + (((y0 +
2137  ((-1) <<
2138  vshift))
2139  >> sps->
2140  log2_min_pu_size))
2141  * min_pu_width]).pred_flag ==
2142  PF_INTRA))
2143  j++;
2144  for (i = j; i > (j) - (j + 1); i--)
2145  if (!
2146  ((s->cur_frame->tab_mvf[(((x0 +
2147  ((i -
2148  1) << hshift)) >> sps->
2149  log2_min_pu_size)) + (((y0 +
2150  ((-1) <<
2151  vshift))
2152  >> sps->
2153  log2_min_pu_size))
2154  * min_pu_width]).pred_flag ==
2155  PF_INTRA))
2156  top[i - 1] = top[i];
2157  left[-1] = top[-1];
2158  }
2159  } else {
2160  j = 0;
2161  while (j < size_max_x
2162  &&
2163  !((s->cur_frame->tab_mvf[(((x0 +
2164  ((j) << hshift)) >> sps->
2165  log2_min_pu_size)) + (((y0 + ((-1)
2166  <<
2167  vshift))
2168  >> sps->
2169  log2_min_pu_size))
2170  * min_pu_width]).pred_flag ==
2171  PF_INTRA))
2172  j++;
2173  if (j > 0)
2174  if (x0 > 0) {
2175  for (i = j; i > (j) - (j + 1); i--)
2176  if (!
2177  ((s->cur_frame->tab_mvf[(((x0 +
2178  ((i -
2179  1) << hshift)) >>
2180  sps->log2_min_pu_size))
2181  + (((y0 + ((-1)
2182  << vshift))
2183  >>
2184  sps->log2_min_pu_size))
2185  *
2186  min_pu_width]).pred_flag ==
2187  PF_INTRA))
2188  top[i - 1] = top[i];
2189  } else {
2190  for (i = j; i > (j) - (j); i--)
2191  if (!
2192  ((s->cur_frame->tab_mvf[(((x0 +
2193  ((i -
2194  1) << hshift)) >>
2195  sps->log2_min_pu_size))
2196  + (((y0 + ((-1)
2197  << vshift))
2198  >>
2199  sps->log2_min_pu_size))
2200  *
2201  min_pu_width]).pred_flag ==
2202  PF_INTRA))
2203  top[i - 1] = top[i];
2204  top[-1] = top[0];
2205  }
2206  left[-1] = top[-1];
2207  }
2208  left[-1] = top[-1];
2209  if (cand_bottom_left || cand_left) {
2210  a = ((left[-1]) * 0x01010101U);
2211  for (i = 0; i < (0) + (size_max_y); i += 4)
2212  if (!
2213  ((s->cur_frame->tab_mvf[(((x0 +
2214  ((-1) << hshift)) >> sps->
2215  log2_min_pu_size)) + (((y0 +
2216  ((i) <<
2217  vshift))
2218  >> sps->
2219  log2_min_pu_size))
2220  * min_pu_width]).pred_flag ==
2221  PF_INTRA))
2222  ((((union unaligned_32 *) (&left[i]))->l) = (a));
2223  else
2224  a = ((left[i + 3]) * 0x01010101U);
2225  }
2226  if (!cand_left) {
2227  vec0 = (v16u8) __msa_fill_b(left[-1]);
2228 
2229  ST_UB(vec0, left);
2230  }
2231  if (!cand_bottom_left) {
2232 
2233  vec0 = (v16u8) __msa_fill_b(left[15]);
2234 
2235  ST_UB(vec0, (left + 16));
2236  }
2237  if (x0 != 0 && y0 != 0) {
2238  a = ((left[size_max_y - 1]) * 0x01010101U);
2239  for (i = (size_max_y - 1);
2240  i > (size_max_y - 1) - (size_max_y); i -= 4)
2241  if (!
2242  ((s->cur_frame->tab_mvf[(((x0 +
2243  ((-1) << hshift)) >> sps->
2244  log2_min_pu_size)) + (((y0 +
2245  ((i -
2246  3) <<
2247  vshift))
2248  >> sps->
2249  log2_min_pu_size))
2250  * min_pu_width]).pred_flag ==
2251  PF_INTRA))
2252  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2253  else
2254  a = ((left[i - 3]) * 0x01010101U);
2255  if (!
2256  ((s->cur_frame->tab_mvf[(((x0 +
2257  ((-1) << hshift)) >> sps->
2258  log2_min_pu_size)) + (((y0 + ((-1)
2259  <<
2260  vshift))
2261  >> sps->
2262  log2_min_pu_size))
2263  * min_pu_width]).pred_flag == PF_INTRA))
2264  left[-1] = left[0];
2265  } else if (x0 == 0) {
2266  do {
2267  uint32_t pix = ((0) * 0x01010101U);
2268  for (i = 0; i < (size_max_y); i += 4)
2269  ((((union unaligned_32 *) (left + i))->l) = (pix));
2270  } while (0);
2271  } else {
2272  a = ((left[size_max_y - 1]) * 0x01010101U);
2273  for (i = (size_max_y - 1);
2274  i > (size_max_y - 1) - (size_max_y); i -= 4)
2275  if (!
2276  ((s->cur_frame->tab_mvf[(((x0 +
2277  ((-1) << hshift)) >> sps->
2278  log2_min_pu_size)) + (((y0 +
2279  ((i -
2280  3) <<
2281  vshift))
2282  >> sps->
2283  log2_min_pu_size))
2284  * min_pu_width]).pred_flag ==
2285  PF_INTRA))
2286  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2287  else
2288  a = ((left[i - 3]) * 0x01010101U);
2289  }
2290  top[-1] = left[-1];
2291  if (y0 != 0) {
2292  a = ((left[-1]) * 0x01010101U);
2293  for (i = 0; i < (0) + (size_max_x); i += 4)
2294  if (!
2295  ((s->cur_frame->tab_mvf[(((x0 +
2296  ((i) << hshift)) >> sps->
2297  log2_min_pu_size)) + (((y0 + ((-1)
2298  <<
2299  vshift))
2300  >> sps->
2301  log2_min_pu_size))
2302  * min_pu_width]).pred_flag ==
2303  PF_INTRA))
2304  ((((union unaligned_32 *) (&top[i]))->l) = (a));
2305  else
2306  a = ((top[i + 3]) * 0x01010101U);
2307  }
2308  }
2309  }
2310 
2311  if (!cand_bottom_left) {
2312  if (cand_left) {
2313  vec0 = (v16u8) __msa_fill_b(left[15]);
2314 
2315  ST_UB(vec0, (left + 16));
2316 
2317  } else if (cand_up_left) {
2318  vec0 = (v16u8) __msa_fill_b(left[-1]);
2319 
2320  ST_UB2(vec0, vec0, left, 16);
2321 
2322  cand_left = 1;
2323  } else if (cand_up) {
2324  left[-1] = top[0];
2325 
2326  vec0 = (v16u8) __msa_fill_b(left[-1]);
2327 
2328  ST_UB2(vec0, vec0, left, 16);
2329 
2330  cand_up_left = 1;
2331  cand_left = 1;
2332  } else if (cand_up_right) {
2333  vec0 = (v16u8) __msa_fill_b(top[16]);
2334 
2335  ST_UB(vec0, top);
2336 
2337  left[-1] = top[16];
2338 
2339  ST_UB2(vec0, vec0, left, 16);
2340 
2341  cand_up = 1;
2342  cand_up_left = 1;
2343  cand_left = 1;
2344  } else {
2345  left[-1] = 128;
2346  vec0 = (v16u8) __msa_ldi_b(128);
2347 
2348  ST_UB2(vec0, vec0, top, 16);
2349  ST_UB2(vec0, vec0, left, 16);
2350  }
2351  }
2352 
2353  if (!cand_left) {
2354  vec0 = (v16u8) __msa_fill_b(left[16]);
2355  ST_UB(vec0, left);
2356  }
2357  if (!cand_up_left) {
2358  left[-1] = left[0];
2359  }
2360  if (!cand_up) {
2361  vec0 = (v16u8) __msa_fill_b(left[-1]);
2362  ST_UB(vec0, top);
2363  }
2364  if (!cand_up_right) {
2365  vec0 = (v16u8) __msa_fill_b(top[15]);
2366  ST_UB(vec0, (top + 16));
2367  }
2368 
2369  top[-1] = left[-1];
2370 
2371 
2372  if (!sps->intra_smoothing_disabled
2373  && (c_idx == 0 || sps->chroma_format_idc == 3)) {
2374  if (mode != INTRA_DC && 16 != 4) {
2375  int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2376  int min_dist_vert_hor =
2377  (((((int) (mode - 26U)) >=
2378  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2379  ((((int) (mode - 10U)) >=
2380  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2381  ? ((((int) (mode - 10U)) >=
2382  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2383  : ((((int) (mode - 26U)) >=
2384  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2385  if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
2386  filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
2387  filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
2388  for (i = 2 * 16 - 2; i >= 0; i--)
2389  filtered_left[i] = (left[i + 1] + 2 * left[i] +
2390  left[i - 1] + 2) >> 2;
2391  filtered_top[-1] =
2392  filtered_left[-1] =
2393  (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
2394  for (i = 2 * 16 - 2; i >= 0; i--)
2395  filtered_top[i] = (top[i + 1] + 2 * top[i] +
2396  top[i - 1] + 2) >> 2;
2397  left = filtered_left;
2398  top = filtered_top;
2399  }
2400  }
2401  }
2402 
2403  switch (mode) {
2404  case INTRA_PLANAR:
2405  s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2406  (uint8_t *) left, stride);
2407  break;
2408  case INTRA_DC:
2409  s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
2410  (uint8_t *) left, stride, 4, c_idx);
2411  break;
2412  default:
2413  s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2414  (uint8_t *) left, stride, c_idx, mode);
2415  break;
2416  }
2417 }
2418 
2419 void ff_intra_pred_8_32x32_msa(HEVCLocalContext *lc, int x0, int y0, int c_idx)
2420 {
2421  v16u8 vec0, vec1;
2422  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2423  v8i16 res0, res1, res2, res3;
2424  v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
2425  v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
2426  const HEVCSPS *const sps = pps->sps;
2427  const HEVCContext *const s = lc->parent;
2428  int i;
2429  int hshift = sps->hshift[c_idx];
2430  int vshift = sps->vshift[c_idx];
2431  int size_in_luma_h = 32 << hshift;
2432  int size_in_tbs_h = size_in_luma_h >> sps->log2_min_tb_size;
2433  int size_in_luma_v = 32 << vshift;
2434  int size_in_tbs_v = size_in_luma_v >> sps->log2_min_tb_size;
2435  int x = x0 >> hshift;
2436  int y = y0 >> vshift;
2437  int x_tb = (x0 >> sps->log2_min_tb_size) & sps->tb_mask;
2438  int y_tb = (y0 >> sps->log2_min_tb_size) & sps->tb_mask;
2439 
2440  int cur_tb_addr =
2441  pps->min_tb_addr_zs[(y_tb) * (sps->tb_mask + 2) + (x_tb)];
2442 
2443  ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
2444  uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
2445 
2446  int min_pu_width = sps->min_pu_width;
2447 
2448  enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
2449  lc->tu.intra_pred_mode;
2450  uint32_t a;
2451  uint8_t left_array[2 * 32 + 1];
2452  uint8_t filtered_left_array[2 * 32 + 1];
2453  uint8_t top_array[2 * 32 + 1];
2454  uint8_t filtered_top_array[2 * 32 + 1];
2455 
2456  uint8_t *left = left_array + 1;
2457  uint8_t *top = top_array + 1;
2458  uint8_t *filtered_left = filtered_left_array + 1;
2459  uint8_t *filtered_top = filtered_top_array + 1;
2460  int cand_bottom_left = lc->na.cand_bottom_left
2461  && cur_tb_addr >
2462  pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & sps->tb_mask) *
2463  (sps->tb_mask + 2) + (x_tb - 1)];
2464  int cand_left = lc->na.cand_left;
2465  int cand_up_left = lc->na.cand_up_left;
2466  int cand_up = lc->na.cand_up;
2467  int cand_up_right = lc->na.cand_up_right
2468  && cur_tb_addr >
2469  pps->min_tb_addr_zs[(y_tb - 1) * (sps->tb_mask + 2) +
2470  ((x_tb + size_in_tbs_h) & sps->tb_mask)];
2471 
2472  int bottom_left_size =
2473  (((y0 + 2 * size_in_luma_v) >
2474  (sps->height) ? (sps->height) : (y0 +
2475  2 * size_in_luma_v)) -
2476  (y0 + size_in_luma_v)) >> vshift;
2477  int top_right_size =
2478  (((x0 + 2 * size_in_luma_h) >
2479  (sps->width) ? (sps->width) : (x0 + 2 * size_in_luma_h)) -
2480  (x0 + size_in_luma_h)) >> hshift;
2481 
2482  if (pps->constrained_intra_pred_flag == 1) {
2483  int size_in_luma_pu_v = ((size_in_luma_v) >> sps->log2_min_pu_size);
2484  int size_in_luma_pu_h = ((size_in_luma_h) >> sps->log2_min_pu_size);
2485  int on_pu_edge_x = !(x0 & ((1 << sps->log2_min_pu_size) - 1));
2486  int on_pu_edge_y = !(y0 & ((1 << sps->log2_min_pu_size) - 1));
2487  if (!size_in_luma_pu_h)
2488  size_in_luma_pu_h++;
2489  if (cand_bottom_left == 1 && on_pu_edge_x) {
2490  int x_left_pu = ((x0 - 1) >> sps->log2_min_pu_size);
2491  int y_bottom_pu =
2492  ((y0 + size_in_luma_v) >> sps->log2_min_pu_size);
2493  int max =
2494  ((size_in_luma_pu_v) >
2495  (sps->min_pu_height -
2496  y_bottom_pu) ? (sps->min_pu_height -
2497  y_bottom_pu) : (size_in_luma_pu_v));
2498  cand_bottom_left = 0;
2499  for (i = 0; i < max; i += 2)
2500  cand_bottom_left |=
2501  ((s->cur_frame->tab_mvf[(x_left_pu) +
2502  (y_bottom_pu +
2503  i) * min_pu_width]).pred_flag ==
2504  PF_INTRA);
2505  }
2506  if (cand_left == 1 && on_pu_edge_x) {
2507  int x_left_pu = ((x0 - 1) >> sps->log2_min_pu_size);
2508  int y_left_pu = ((y0) >> sps->log2_min_pu_size);
2509  int max =
2510  ((size_in_luma_pu_v) >
2511  (sps->min_pu_height -
2512  y_left_pu) ? (sps->min_pu_height -
2513  y_left_pu) : (size_in_luma_pu_v));
2514  cand_left = 0;
2515  for (i = 0; i < max; i += 2)
2516  cand_left |=
2517  ((s->cur_frame->tab_mvf[(x_left_pu) +
2518  (y_left_pu +
2519  i) * min_pu_width]).pred_flag ==
2520  PF_INTRA);
2521  }
2522  if (cand_up_left == 1) {
2523  int x_left_pu = ((x0 - 1) >> sps->log2_min_pu_size);
2524  int y_top_pu = ((y0 - 1) >> sps->log2_min_pu_size);
2525  cand_up_left =
2526  (s->cur_frame->tab_mvf[(x_left_pu) +
2527  (y_top_pu) * min_pu_width]).pred_flag ==
2528  PF_INTRA;
2529  }
2530  if (cand_up == 1 && on_pu_edge_y) {
2531  int x_top_pu = ((x0) >> sps->log2_min_pu_size);
2532  int y_top_pu = ((y0 - 1) >> sps->log2_min_pu_size);
2533  int max =
2534  ((size_in_luma_pu_h) >
2535  (sps->min_pu_width -
2536  x_top_pu) ? (sps->min_pu_width -
2537  x_top_pu) : (size_in_luma_pu_h));
2538  cand_up = 0;
2539  for (i = 0; i < max; i += 2)
2540  cand_up |=
2541  ((s->cur_frame->tab_mvf[(x_top_pu + i) +
2542  (y_top_pu) *
2543  min_pu_width]).pred_flag == PF_INTRA);
2544  }
2545  if (cand_up_right == 1 && on_pu_edge_y) {
2546  int y_top_pu = ((y0 - 1) >> sps->log2_min_pu_size);
2547  int x_right_pu =
2548  ((x0 + size_in_luma_h) >> sps->log2_min_pu_size);
2549  int max =
2550  ((size_in_luma_pu_h) >
2551  (sps->min_pu_width -
2552  x_right_pu) ? (sps->min_pu_width -
2553  x_right_pu) : (size_in_luma_pu_h));
2554  cand_up_right = 0;
2555  for (i = 0; i < max; i += 2)
2556  cand_up_right |=
2557  ((s->cur_frame->tab_mvf[(x_right_pu + i) +
2558  (y_top_pu) *
2559  min_pu_width]).pred_flag == PF_INTRA);
2560  }
2561  vec0 = (v16u8) __msa_ldi_b(128);
2562 
2563  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2564  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2565 
2566  top[-1] = 128;
2567  }
2568  if (cand_up_left) {
2569  left[-1] = src[(-1) + stride * (-1)];
2570  top[-1] = left[-1];
2571  }
2572  if (cand_up) {
2573  LD_UB2(src - stride, 16, vec0, vec1);
2574  ST_UB2(vec0, vec1, top, 16);
2575  }
2576 
2577  if (cand_up_right) {
2578  LD_UB2(src - stride + 32, 16, vec0, vec1);
2579  ST_UB2(vec0, vec1, (top + 32), 16);
2580  do {
2581  uint32_t pix =
2582  ((src[(32 + top_right_size - 1) + stride * (-1)]) *
2583  0x01010101U);
2584  for (i = 0; i < (32 - top_right_size); i += 4)
2585  ((((union unaligned_32 *) (top + 32 + top_right_size +
2586  i))->l) = (pix));
2587  } while (0);
2588  }
2589  if (cand_left)
2590  for (i = 0; i < 32; i++)
2591  left[i] = src[(-1) + stride * (i)];
2592  if (cand_bottom_left) {
2593  for (i = 32; i < 32 + bottom_left_size; i++)
2594  left[i] = src[(-1) + stride * (i)];
2595  do {
2596  uint32_t pix =
2597  ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
2598  0x01010101U);
2599  for (i = 0; i < (32 - bottom_left_size); i += 4)
2600  ((((union unaligned_32 *) (left + 32 + bottom_left_size +
2601  i))->l) = (pix));
2602  } while (0);
2603  }
2604 
2605  if (pps->constrained_intra_pred_flag == 1) {
2606  if (cand_bottom_left || cand_left || cand_up_left || cand_up
2607  || cand_up_right) {
2608  int size_max_x =
2609  x0 + ((2 * 32) << hshift) <
2610  sps->width ? 2 * 32 : (sps->width - x0) >> hshift;
2611  int size_max_y =
2612  y0 + ((2 * 32) << vshift) <
2613  sps->height ? 2 * 32 : (sps->height - y0) >> vshift;
2614  int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2615  if (!cand_up_right) {
2616  size_max_x = x0 + ((32) << hshift) < sps->width ?
2617  32 : (sps->width - x0) >> hshift;
2618  }
2619  if (!cand_bottom_left) {
2620  size_max_y = y0 + ((32) << vshift) < sps->height ?
2621  32 : (sps->height - y0) >> vshift;
2622  }
2623  if (cand_bottom_left || cand_left || cand_up_left) {
2624  while (j > -1
2625  &&
2626  !((s->cur_frame->tab_mvf[(((x0 +
2627  ((-1) << hshift)) >> sps->
2628  log2_min_pu_size)) + (((y0 +
2629  ((j) <<
2630  vshift))
2631  >> sps->
2632  log2_min_pu_size))
2633  * min_pu_width]).pred_flag ==
2634  PF_INTRA))
2635  j--;
2636  if (!
2637  ((s->cur_frame->tab_mvf[(((x0 +
2638  ((-1) << hshift)) >> sps->
2639  log2_min_pu_size)) + (((y0 + ((j)
2640  <<
2641  vshift))
2642  >> sps->
2643  log2_min_pu_size))
2644  * min_pu_width]).pred_flag == PF_INTRA)) {
2645  j = 0;
2646  while (j < size_max_x
2647  &&
2648  !((s->cur_frame->tab_mvf[(((x0 +
2649  ((j) << hshift)) >> sps->
2650  log2_min_pu_size)) + (((y0 +
2651  ((-1) <<
2652  vshift))
2653  >> sps->
2654  log2_min_pu_size))
2655  * min_pu_width]).pred_flag ==
2656  PF_INTRA))
2657  j++;
2658  for (i = j; i > (j) - (j + 1); i--)
2659  if (!
2660  ((s->cur_frame->tab_mvf[(((x0 +
2661  ((i -
2662  1) << hshift)) >> sps->
2663  log2_min_pu_size)) + (((y0 +
2664  ((-1) <<
2665  vshift))
2666  >> sps->
2667  log2_min_pu_size))
2668  * min_pu_width]).pred_flag ==
2669  PF_INTRA))
2670  top[i - 1] = top[i];
2671  left[-1] = top[-1];
2672  }
2673  } else {
2674  j = 0;
2675  while (j < size_max_x
2676  &&
2677  !((s->cur_frame->tab_mvf[(((x0 +
2678  ((j) << hshift)) >> sps->
2679  log2_min_pu_size)) + (((y0 + ((-1)
2680  <<
2681  vshift))
2682  >> sps->
2683  log2_min_pu_size))
2684  * min_pu_width]).pred_flag ==
2685  PF_INTRA))
2686  j++;
2687  if (j > 0)
2688  if (x0 > 0) {
2689  for (i = j; i > (j) - (j + 1); i--)
2690  if (!
2691  ((s->cur_frame->tab_mvf[(((x0 +
2692  ((i -
2693  1) << hshift)) >>
2694  sps->log2_min_pu_size))
2695  + (((y0 + ((-1)
2696  << vshift))
2697  >>
2698  sps->log2_min_pu_size))
2699  *
2700  min_pu_width]).pred_flag ==
2701  PF_INTRA))
2702  top[i - 1] = top[i];
2703  } else {
2704  for (i = j; i > (j) - (j); i--)
2705  if (!
2706  ((s->cur_frame->tab_mvf[(((x0 +
2707  ((i -
2708  1) << hshift)) >>
2709  sps->log2_min_pu_size))
2710  + (((y0 + ((-1)
2711  << vshift))
2712  >>
2713  sps->log2_min_pu_size))
2714  *
2715  min_pu_width]).pred_flag ==
2716  PF_INTRA))
2717  top[i - 1] = top[i];
2718  top[-1] = top[0];
2719  }
2720  left[-1] = top[-1];
2721  }
2722  left[-1] = top[-1];
2723  if (cand_bottom_left || cand_left) {
2724  a = ((left[-1]) * 0x01010101U);
2725  for (i = 0; i < (0) + (size_max_y); i += 4)
2726  if (!
2727  ((s->cur_frame->tab_mvf[(((x0 +
2728  ((-1) << hshift)) >> sps->
2729  log2_min_pu_size)) + (((y0 +
2730  ((i) <<
2731  vshift))
2732  >> sps->
2733  log2_min_pu_size))
2734  * min_pu_width]).pred_flag ==
2735  PF_INTRA))
2736  ((((union unaligned_32 *) (&left[i]))->l) = (a));
2737  else
2738  a = ((left[i + 3]) * 0x01010101U);
2739  }
2740  if (!cand_left) {
2741  vec0 = (v16u8) __msa_fill_b(left[-1]);
2742 
2743  ST_UB2(vec0, vec0, left, 16);
2744  }
2745  if (!cand_bottom_left) {
2746  vec0 = (v16u8) __msa_fill_b(left[31]);
2747 
2748  ST_UB2(vec0, vec0, (left + 32), 16);
2749  }
2750  if (x0 != 0 && y0 != 0) {
2751  a = ((left[size_max_y - 1]) * 0x01010101U);
2752  for (i = (size_max_y - 1);
2753  i > (size_max_y - 1) - (size_max_y); i -= 4)
2754  if (!
2755  ((s->cur_frame->tab_mvf[(((x0 +
2756  ((-1) << hshift)) >> sps->
2757  log2_min_pu_size)) + (((y0 +
2758  ((i -
2759  3) <<
2760  vshift))
2761  >> sps->
2762  log2_min_pu_size))
2763  * min_pu_width]).pred_flag ==
2764  PF_INTRA))
2765  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2766  else
2767  a = ((left[i - 3]) * 0x01010101U);
2768  if (!
2769  ((s->cur_frame->tab_mvf[(((x0 +
2770  ((-1) << hshift)) >> sps->
2771  log2_min_pu_size)) + (((y0 + ((-1)
2772  <<
2773  vshift))
2774  >> sps->
2775  log2_min_pu_size))
2776  * min_pu_width]).pred_flag == PF_INTRA))
2777  left[-1] = left[0];
2778  } else if (x0 == 0) {
2779  do {
2780  uint32_t pix = ((0) * 0x01010101U);
2781  for (i = 0; i < (size_max_y); i += 4)
2782  ((((union unaligned_32 *) (left + i))->l) = (pix));
2783  } while (0);
2784  } else {
2785  a = ((left[size_max_y - 1]) * 0x01010101U);
2786  for (i = (size_max_y - 1);
2787  i > (size_max_y - 1) - (size_max_y); i -= 4)
2788  if (!
2789  ((s->cur_frame->tab_mvf[(((x0 +
2790  ((-1) << hshift)) >> sps->
2791  log2_min_pu_size)) + (((y0 +
2792  ((i -
2793  3) <<
2794  vshift))
2795  >> sps->
2796  log2_min_pu_size))
2797  * min_pu_width]).pred_flag ==
2798  PF_INTRA))
2799  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2800  else
2801  a = ((left[i - 3]) * 0x01010101U);
2802  }
2803  top[-1] = left[-1];
2804  if (y0 != 0) {
2805  a = ((left[-1]) * 0x01010101U);
2806  for (i = 0; i < (0) + (size_max_x); i += 4)
2807  if (!
2808  ((s->cur_frame->tab_mvf[(((x0 +
2809  ((i) << hshift)) >> sps->
2810  log2_min_pu_size)) + (((y0 + ((-1)
2811  <<
2812  vshift))
2813  >> sps->
2814  log2_min_pu_size))
2815  * min_pu_width]).pred_flag ==
2816  PF_INTRA))
2817  ((((union unaligned_32 *) (&top[i]))->l) = (a));
2818  else
2819  a = ((top[i + 3]) * 0x01010101U);
2820  }
2821  }
2822  }
2823 
2824  if (!cand_bottom_left) {
2825  if (cand_left) {
2826  vec0 = (v16u8) __msa_fill_b(left[31]);
2827 
2828  ST_UB2(vec0, vec0, (left + 32), 16);
2829  } else if (cand_up_left) {
2830  vec0 = (v16u8) __msa_fill_b(left[-1]);
2831 
2832  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2833 
2834  cand_left = 1;
2835  } else if (cand_up) {
2836  left[-1] = top[0];
2837 
2838  vec0 = (v16u8) __msa_fill_b(left[-1]);
2839 
2840  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2841 
2842  cand_up_left = 1;
2843  cand_left = 1;
2844  } else if (cand_up_right) {
2845  vec0 = (v16u8) __msa_fill_b(top[32]);
2846 
2847  ST_UB2(vec0, vec0, top, 16);
2848 
2849  left[-1] = top[32];
2850 
2851  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2852 
2853  cand_up = 1;
2854  cand_up_left = 1;
2855  cand_left = 1;
2856  } else {
2857  left[-1] = 128;
2858 
2859  vec0 = (v16u8) __msa_ldi_b(128);
2860 
2861  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2862  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2863  }
2864  }
2865 
2866  if (!cand_left) {
2867  vec0 = (v16u8) __msa_fill_b(left[32]);
2868 
2869  ST_UB2(vec0, vec0, left, 16);
2870  }
2871  if (!cand_up_left) {
2872  left[-1] = left[0];
2873  }
2874  if (!cand_up) {
2875  vec0 = (v16u8) __msa_fill_b(left[-1]);
2876 
2877  ST_UB2(vec0, vec0, top, 16);
2878  }
2879  if (!cand_up_right) {
2880  vec0 = (v16u8) __msa_fill_b(top[31]);
2881 
2882  ST_UB2(vec0, vec0, (top + 32), 16);
2883  }
2884 
2885  top[-1] = left[-1];
2886 
2887 
2888  if (!sps->intra_smoothing_disabled
2889  && (c_idx == 0 || sps->chroma_format_idc == 3)) {
2890  if (mode != INTRA_DC && 32 != 4) {
2891  int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2892  int min_dist_vert_hor =
2893  (((((int) (mode - 26U)) >=
2894  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2895  ((((int) (mode - 10U)) >=
2896  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2897  ? ((((int) (mode - 10U)) >=
2898  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2899  : ((((int) (mode - 26U)) >=
2900  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2901  if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
2902  int threshold = 1 << (8 - 5);
2903  if (sps->strong_intra_smoothing_enabled
2904  && c_idx == 0
2905  && ((top[-1] + top[63] - 2 * top[31]) >=
2906  0 ? (top[-1] + top[63] -
2907  2 * top[31]) : (-(top[-1] + top[63] -
2908  2 * top[31]))) < threshold
2909  && ((left[-1] + left[63] - 2 * left[31]) >=
2910  0 ? (left[-1] + left[63] -
2911  2 * left[31]) : (-(left[-1] + left[63] -
2912  2 * left[31]))) < threshold) {
2913 
2914 
2915  filtered_top[-1] = top[-1];
2916  filtered_top[63] = top[63];
2917 
2918 
2919  for (i = 0; i < 63; i++) {
2920  filtered_top[i] =
2921  ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
2922  }
2923 
2924  tmp0 = __msa_fill_h(top[-1]);
2925  tmp1 = __msa_fill_h(top[63]);
2926 
2927  tmp2 = mul_val0 - 8;
2928  tmp3 = mul_val0 - 16;
2929  tmp4 = mul_val0 - 24;
2930  tmp5 = mul_val1 + 8;
2931  tmp6 = mul_val1 + 16;
2932  tmp7 = mul_val1 + 24;
2933 
2934  res0 = mul_val0 * tmp0;
2935  res1 = tmp2 * tmp0;
2936  res2 = tmp3 * tmp0;
2937  res3 = tmp4 * tmp0;
2938  res0 += mul_val1 * tmp1;
2939  res1 += tmp5 * tmp1;
2940  res2 += tmp6 * tmp1;
2941  res3 += tmp7 * tmp1;
2942 
2943  res0 = __msa_srari_h(res0, 6);
2944  res1 = __msa_srari_h(res1, 6);
2945  res2 = __msa_srari_h(res2, 6);
2946  res3 = __msa_srari_h(res3, 6);
2947 
2948  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2949  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2950 
2951  ST_UB2(vec0, vec1, filtered_top, 16);
2952 
2953  res0 = mul_val0 - 32;
2954  tmp2 = mul_val0 - 40;
2955  tmp3 = mul_val0 - 48;
2956  tmp4 = mul_val0 - 56;
2957  res3 = mul_val1 + 32;
2958  tmp5 = mul_val1 + 40;
2959  tmp6 = mul_val1 + 48;
2960  tmp7 = mul_val1 + 56;
2961 
2962  res0 = res0 * tmp0;
2963  res1 = tmp2 * tmp0;
2964  res2 = tmp3 * tmp0;
2965  res0 += res3 * tmp1;
2966  res3 = tmp4 * tmp0;
2967  res1 += tmp5 * tmp1;
2968  res2 += tmp6 * tmp1;
2969  res3 += tmp7 * tmp1;
2970 
2971  res0 = __msa_srari_h(res0, 6);
2972  res1 = __msa_srari_h(res1, 6);
2973  res2 = __msa_srari_h(res2, 6);
2974  res3 = __msa_srari_h(res3, 6);
2975 
2976  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2977  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2978 
2979  ST_UB2(vec0, vec1, (filtered_top + 32), 16);
2980 
2981  filtered_top[63] = top[63];
2982 
2983  tmp0 = __msa_fill_h(left[-1]);
2984  tmp1 = __msa_fill_h(left[63]);
2985 
2986  tmp2 = mul_val0 - 8;
2987  tmp3 = mul_val0 - 16;
2988  tmp4 = mul_val0 - 24;
2989  tmp5 = mul_val1 + 8;
2990  tmp6 = mul_val1 + 16;
2991  tmp7 = mul_val1 + 24;
2992 
2993  res0 = mul_val0 * tmp0;
2994  res1 = tmp2 * tmp0;
2995  res2 = tmp3 * tmp0;
2996  res3 = tmp4 * tmp0;
2997  res0 += mul_val1 * tmp1;
2998  res1 += tmp5 * tmp1;
2999  res2 += tmp6 * tmp1;
3000  res3 += tmp7 * tmp1;
3001 
3002  res0 = __msa_srari_h(res0, 6);
3003  res1 = __msa_srari_h(res1, 6);
3004  res2 = __msa_srari_h(res2, 6);
3005  res3 = __msa_srari_h(res3, 6);
3006 
3007  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3008  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3009 
3010  ST_UB2(vec0, vec1, left, 16);
3011 
3012  res0 = mul_val0 - 32;
3013  tmp2 = mul_val0 - 40;
3014  tmp3 = mul_val0 - 48;
3015  tmp4 = mul_val0 - 56;
3016  res3 = mul_val1 + 32;
3017  tmp5 = mul_val1 + 40;
3018  tmp6 = mul_val1 + 48;
3019  tmp7 = mul_val1 + 56;
3020 
3021  res0 = res0 * tmp0;
3022  res1 = tmp2 * tmp0;
3023  res2 = tmp3 * tmp0;
3024  res0 += res3 * tmp1;
3025  res3 = tmp4 * tmp0;
3026  res1 += tmp5 * tmp1;
3027  res2 += tmp6 * tmp1;
3028  res3 += tmp7 * tmp1;
3029 
3030  res0 = __msa_srari_h(res0, 6);
3031  res1 = __msa_srari_h(res1, 6);
3032  res2 = __msa_srari_h(res2, 6);
3033  res3 = __msa_srari_h(res3, 6);
3034 
3035  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3036  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3037 
3038  ST_UB2(vec0, vec1, (left + 32), 16);
3039 
3040  left[63] = tmp1[0];
3041 
3042  top = filtered_top;
3043  } else {
3044  filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
3045  filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
3046  for (i = 2 * 32 - 2; i >= 0; i--)
3047  filtered_left[i] = (left[i + 1] + 2 * left[i] +
3048  left[i - 1] + 2) >> 2;
3049  filtered_top[-1] =
3050  filtered_left[-1] =
3051  (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
3052  for (i = 2 * 32 - 2; i >= 0; i--)
3053  filtered_top[i] = (top[i + 1] + 2 * top[i] +
3054  top[i - 1] + 2) >> 2;
3055  left = filtered_left;
3056  top = filtered_top;
3057  }
3058  }
3059  }
3060  }
3061 
3062  switch (mode) {
3063  case INTRA_PLANAR:
3064  s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
3065  (uint8_t *) left, stride);
3066  break;
3067  case INTRA_DC:
3068  s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
3069  (uint8_t *) left, stride, 5, c_idx);
3070  break;
3071  default:
3072  s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
3073  (uint8_t *) left, stride, c_idx, mode);
3074  break;
3075  }
3076 }
HEVCLocalContext::na
NeighbourAvailable na
Definition: hevcdec.h:427
HEVCLocalContext
Definition: hevcdec.h:381
SRARI_H2_SH
#define SRARI_H2_SH(...)
Definition: generic_macros_msa.h:2059
ff_intra_pred_8_16x16_msa
void ff_intra_pred_8_16x16_msa(HEVCLocalContext *lc, const HEVCPPS *pps, int x0, int y0, int c_idx)
Definition: hevcpred_msa.c:1906
PCKEV_B4_SB
#define PCKEV_B4_SB(...)
Definition: generic_macros_msa.h:1738
SPLATI_H4_SH
#define SPLATI_H4_SH(...)
Definition: generic_macros_msa.h:1674
ST_UB2
#define ST_UB2(...)
Definition: generic_macros_msa.h:363
src1
const pixel * src1
Definition: h264pred_template.c:421
SRARI_H4_SH
#define SRARI_H4_SH(...)
Definition: generic_macros_msa.h:2067
NeighbourAvailable::cand_left
int cand_left
Definition: hevcdec.h:311
NeighbourAvailable::cand_up
int cand_up
Definition: hevcdec.h:312
INTRA_DC
@ INTRA_DC
Definition: hevcdec.h:122
ILVR_B2_UH
#define ILVR_B2_UH(...)
Definition: generic_macros_msa.h:1339
NeighbourAvailable::cand_up_right
int cand_up_right
Definition: hevcdec.h:314
hevc_intra_pred_angular_upper_4width_msa
static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
Definition: hevcpred_msa.c:923
max
#define max(a, b)
Definition: cuda_runtime.h:33
hevc_intra_pred_plane_32x32_msa
static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
Definition: hevcpred_msa.c:908
ST_UB4
#define ST_UB4(...)
Definition: generic_macros_msa.h:374
PF_INTRA
@ PF_INTRA
Definition: hevcdec.h:114
INSERT_W2_SB
#define INSERT_W2_SB(...)
Definition: generic_macros_msa.h:1144
hevc_intra_pred_horiz_8x8_msa
static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
Definition: hevcpred_msa.c:224
intra_pred_angle_low
static const int8_t intra_pred_angle_low[16]
Definition: hevcpred_msa.c:29
ADD2
#define ADD2(in0, in1, in2, in3, out0, out1)
Definition: generic_macros_msa.h:2118
HADD_UB2_UH
#define HADD_UB2_UH(...)
Definition: generic_macros_msa.h:1067
generic_macros_msa.h
ST_W4
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: vp8_lpf_lsx.c:234
intra_predict_vert_32x32_msa
static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, int32_t dst_stride)
Definition: hevcpred_msa.c:1764
val
static double val(void *priv, double ch)
Definition: aeval.c:77
LD_SB
#define LD_SB(...)
Definition: generic_macros_msa.h:33
intra_pred_angle_up
static const int8_t intra_pred_angle_up[17]
Definition: hevcpred_msa.c:25
LD_UB
#define LD_UB(...)
Definition: generic_macros_msa.h:32
hevc_intra_pred_dc_8x8_msa
static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
Definition: hevcpred_msa.c:401
TransformUnit::intra_pred_mode
int intra_pred_mode
Definition: hevcdec.h:334
SW
#define SW(val, pdst)
Definition: generic_macros_msa.h:167
hevc_intra_pred_vert_8x8_msa
static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
Definition: hevcpred_msa.c:94
MUL2
#define MUL2(in0, in1, in2, in3, out0, out1)
Definition: generic_macros_msa.h:2101
hevc_intra_pred_angular_lower_16width_msa
static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
Definition: hevcpred_msa.c:1536
HEVCLocalContext::parent
const struct HEVCContext * parent
Definition: hevcdec.h:389
s
#define s(width, name)
Definition: cbs_vp9.c:198
NeighbourAvailable::cand_bottom_left
int cand_bottom_left
Definition: hevcdec.h:310
ST_H8
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:429
hevc_intra_pred_plane_16x16_msa
static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
Definition: hevcpred_msa.c:663
SLDI_B4_UB
#define SLDI_B4_UB(...)
Definition: generic_macros_msa.h:643
hevc_intra_pred_plane_4x4_msa
static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
Definition: hevcpred_msa.c:548
hevcpred_mips.h
hevc_intra_pred_angular_upper_8width_msa
static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
Definition: hevcpred_msa.c:1015
ST_D8
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:511
ILVL_B2_SH
#define ILVL_B2_SH(...)
Definition: generic_macros_msa.h:1265
PCKEV_B2_UB
#define PCKEV_B2_UB(...)
Definition: generic_macros_msa.h:1720
ST_SH2
#define ST_SH2(...)
Definition: generic_macros_msa.h:366
ff_hevc_intra_pred_dc_msa
void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int log2, int c_idx)
Definition: hevcpred_msa.c:1811
ILVRL_H2_SH
#define ILVRL_H2_SH(...)
Definition: generic_macros_msa.h:1508
ff_hevc_intra_pred_planar_1_msa
void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride)
Definition: hevcpred_msa.c:1787
unaligned_32
Definition: intreadwrite.h:217
hevc_intra_pred_horiz_16x16_msa
static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
Definition: hevcpred_msa.c:264
ST_W2
#define ST_W2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:450
ILVRL_B2_UH
#define ILVRL_B2_UH(...)
Definition: generic_macros_msa.h:1497
CLIP_SH2_0_255
#define CLIP_SH2_0_255(in0, in1)
Definition: generic_macros_msa.h:941
LW
#define LW(psrc)
Definition: generic_macros_msa.h:104
hevc_intra_pred_plane_8x8_msa
static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
Definition: hevcpred_msa.c:595
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:83
INTRA_PLANAR
@ INTRA_PLANAR
Definition: hevcdec.h:121
SD4
#define SD4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:256
hevc_intra_pred_dc_32x32_msa
static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
Definition: hevcpred_msa.c:516
ST_SB4
#define ST_SB4(...)
Definition: generic_macros_msa.h:375
ff_pred_intra_pred_angular_1_msa
void ff_pred_intra_pred_angular_1_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int c_idx, int mode)
Definition: hevcpred_msa.c:1852
hevc_intra_pred_angular_lower_32width_msa
static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
Definition: hevcpred_msa.c:1652
hevc_intra_pred_angular_lower_4width_msa
static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
Definition: hevcpred_msa.c:1338
hevc_intra_pred_angular_upper_32width_msa
static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
Definition: hevcpred_msa.c:1224
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
ST_SB2
#define ST_SB2(...)
Definition: generic_macros_msa.h:364
HEVC_PRED_PLANAR_16x2
#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3, res0, res1, mul_val_b0, mul_val_b1, round)
Definition: hevcpred_msa.c:33
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
SW4
#define SW4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:241
SPLATI_H2_SH
#define SPLATI_H2_SH(...)
Definition: generic_macros_msa.h:1656
zero
static int zero(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:121
SUB2
#define SUB2(in0, in1, in2, in3, out0, out1)
Definition: generic_macros_msa.h:2135
hevc_intra_pred_dc_4x4_msa
static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
Definition: hevcpred_msa.c:340
hevc_intra_pred_horiz_32x32_msa
static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
Definition: hevcpred_msa.c:310
flag
#define flag(name)
Definition: cbs_av1.c:474
ff_hevc_intra_pred_planar_0_msa
void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride)
Definition: hevcpred_msa.c:1779
SRARI_H2_UH
#define SRARI_H2_UH(...)
Definition: generic_macros_msa.h:2058
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
NeighbourAvailable::cand_up_left
int cand_up_left
Definition: hevcdec.h:313
ST_SB
#define ST_SB(...)
Definition: generic_macros_msa.h:41
src2
const pixel * src2
Definition: h264pred_template.c:422
CLIP_SH_0_255
#define CLIP_SH_0_255(in)
Definition: generic_macros_msa.h:935
ST_W8
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:470
UNPCK_UB_SH
#define UNPCK_UB_SH(in, out0, out1)
Definition: generic_macros_msa.h:2206
hevc_intra_pred_dc_16x16_msa
static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
Definition: hevcpred_msa.c:460
log2
#define log2(x)
Definition: libm.h:404
stride
#define stride
Definition: h264pred_template.c:537
IntraPredMode
IntraPredMode
Definition: hevcdec.h:120
SLDI_B4_SH
#define SLDI_B4_SH(...)
Definition: generic_macros_msa.h:645
ST_UB
#define ST_UB(...)
Definition: generic_macros_msa.h:40
LD_UB2
#define LD_UB2(...)
Definition: generic_macros_msa.h:277
hevc_intra_pred_angular_lower_8width_msa
static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
Definition: hevcpred_msa.c:1435
sps
static int FUNC() sps(CodedBitstreamContext *ctx, RWContext *rw, H264RawSPS *current)
Definition: cbs_h264_syntax_template.c:260
left
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left
Definition: snow.txt:386
U
#define U(x)
Definition: vpx_arith.h:37
SLDI_B4_SB
#define SLDI_B4_SB(...)
Definition: generic_macros_msa.h:644
ff_hevc_intra_pred_planar_3_msa
void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride)
Definition: hevcpred_msa.c:1803
process_intra_lower_16x16_msa
static void process_intra_lower_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, uint8_t offset)
Definition: hevcpred_msa.c:826
ILVR_D2_SH
#define ILVR_D2_SH(...)
Definition: generic_macros_msa.h:1445
mode
mode
Definition: ebur128.h:83
HEVCContext
Definition: hevcdec.h:479
ff_pred_intra_pred_angular_0_msa
void ff_pred_intra_pred_angular_0_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int c_idx, int mode)
Definition: hevcpred_msa.c:1834
hevc_intra_pred_vert_16x16_msa
static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
Definition: hevcpred_msa.c:149
ref
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:112
pps
uint64_t pps
Definition: dovi_rpuenc.c:35
ST_D4
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:499
HEVCLocalContext::tu
TransformUnit tu
Definition: hevcdec.h:410
ILVR_B4_SH
#define ILVR_B4_SH(...)
Definition: generic_macros_msa.h:1362
src0
const pixel *const src0
Definition: h264pred_template.c:420
process_intra_upper_16x16_msa
static void process_intra_upper_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, uint8_t offset)
Definition: hevcpred_msa.c:743
ff_intra_pred_8_32x32_msa
void ff_intra_pred_8_32x32_msa(HEVCLocalContext *lc, int x0, int y0, int c_idx)
Definition: hevcpred_msa.c:2419
HEVCSPS
Definition: ps.h:190
HEVCPPS
Definition: ps.h:309
hevc_intra_pred_vert_4x4_msa
static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
Definition: hevcpred_msa.c:61
ILVRL_B2_SH
#define ILVRL_B2_SH(...)
Definition: generic_macros_msa.h:1498
hevc_intra_pred_horiz_4x4_msa
static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
Definition: hevcpred_msa.c:190
PCKEV_B2_SB
#define PCKEV_B2_SB(...)
Definition: generic_macros_msa.h:1719
MUL4
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
Definition: generic_macros_msa.h:2106
int32_t
int32_t
Definition: audioconvert.c:56
hevc_intra_pred_angular_upper_16width_msa
static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
Definition: hevcpred_msa.c:1114
ff_pred_intra_pred_angular_3_msa
void ff_pred_intra_pred_angular_3_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int c_idx, int mode)
Definition: hevcpred_msa.c:1888
LD
#define LD(psrc)
Definition: generic_macros_msa.h:137
ILVR_B2_SH
#define ILVR_B2_SH(...)
Definition: generic_macros_msa.h:1340
TransformUnit::intra_pred_mode_c
int intra_pred_mode_c
Definition: hevcdec.h:335
PCKEV_D2_SH
#define PCKEV_D2_SH(...)
Definition: generic_macros_msa.h:1789
INSERT_D2_UB
#define INSERT_D2_UB(...)
Definition: generic_macros_msa.h:1169
ff_pred_intra_pred_angular_2_msa
void ff_pred_intra_pred_angular_2_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int c_idx, int mode)
Definition: hevcpred_msa.c:1870
SD
#define SD
Definition: ccaption_dec.c:940
ff_hevc_intra_pred_planar_2_msa
void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride)
Definition: hevcpred_msa.c:1795
src
#define src
Definition: vp8dsp.c:248
LD_SB2
#define LD_SB2(...)
Definition: generic_macros_msa.h:278