#ifdef DEBUG
#define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F) == 0);
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
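/* vec_ld/vec_st silently ignore the low four address bits, so dst must be
 * 16-byte aligned; the debug variant of ASSERT_ALIGNED checks this before
 * the aligned stores below. */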
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride,
                                                 int srcStride)
{
    register int i;

    LOAD_ZERO; /* provides the zero vector aliased as zero_s16v below */
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));
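    /* The 6-tap luma half-pel filter taps are (1, -5, 20, 20, -5, 1):
     * v20ss = 5 << 2 = 20 and v5ss = 5 are the multipliers, v16ss = 1 << 4
     * is the rounding term and v5us the shift count for the final
     * (x + 16) >> 5. */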
    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);
    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;
    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);
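        /* Each row is read with two (or three) aligned 16-byte loads and the
         * six shifted windows src-2 .. src+3 are assembled with vec_perm and
         * the vec_lvsl masks above.  Alignments 11-15 are special-cased
         * because one window coincides exactly with srcR2 and the later
         * windows spill into a third load (srcR3). */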
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
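        /* Each byte vector is then zero-extended into two vec_s16 halves
         * (srcM2A/srcM2B .. srcP3A/srcP3B) so the filter can be evaluated in
         * 16-bit arithmetic without overflow. */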
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);
        sum = vec_packsu(sumA, sumB);
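        /* sumA/sumB hold (20*(p0+p1) - 5*(m1+p2) + (m2+p3) + 16) >> 5 per
         * pixel and packsu saturates that to 0..255.  OP_U8_ALTIVEC is the
         * put/avg macro supplied by the including file: a plain assignment
         * for the put_ variants, an average with the current dst for avg_. */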
        ASSERT_ALIGNED(dst);
        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_h_lowpass_altivec */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride,
                                                 int srcStride)
{
    register int i;

    LOAD_ZERO; /* provides the zero vector aliased as zero_s16v below */
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));
    const uint8_t *srcbis = src - (srcStride * 2);
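    /* The vertical filter needs the two rows above and the three rows below
     * each output row, so reading starts two rows early and the first five
     * rows of the window are loaded before the loop. */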
    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2  = vec_perm(srcM2a, srcM2b, perm);

    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1  = vec_perm(srcM1a, srcM1b, perm);

    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0  = vec_perm(srcP0a, srcP0b, perm);

    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1  = vec_perm(srcP1a, srcP1b, perm);

    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2  = vec_perm(srcP2a, srcP2b, perm);
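    /* Each preloaded row is zero-extended into two vec_s16 halves
     * (srcM2ssA/srcM2ssB .. srcP2ssA/srcP2ssB); inside the loop the six-row
     * window slides down by one row per iteration once the pairwise sums
     * have been taken. */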
    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;
    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3  = vec_perm(srcP3a, srcP3b, perm);
        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);
        sum = vec_packsu(sumA, sumB);
        ASSERT_ALIGNED(dst);
        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_v_lowpass_altivec */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;

    LOAD_ZERO; /* provides the zero vector aliased as zero_s16v below */
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss  = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u32 v10ui  = vec_splat_u32(10);
    const vec_s16 v5ss   = vec_splat_s16(5);
    const vec_s16 v1ss   = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9));
    const vec_u32 v16ui  = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
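    /* Two-pass filter constants: v20ss/v5ss/v1ss are the 6-tap coefficients,
     * v512si = 1 << 9 is the rounding term and v10ui the shift count for the
     * final (x + 512) >> 10 (both passes' /32 combined), and v16ui = 16 is
     * the shift used to extract the even 16-bit lanes as 32-bit values. */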
    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;
    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
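    /* mperm re-interleaves the packed even/odd bytes (0,8,1,9,...) back into
     * pixel order after the even/odd split computation in the second pass. */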
    int16_t *tmpbis = tmp;
    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;
    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;

    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;
    src -= (2 * srcStride);
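    /* First pass: run the horizontal filter over 16 + 5 = 21 rows (two above
     * and three below the block) and store the unrounded, unshifted 16-bit
     * intermediates into tmp for the vertical pass. */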
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);
        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);
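        /* psum = 20*(p0+p1) + (m2+p3) - 5*(m1+p2); no rounding or shift yet,
         * the intermediate is kept at full 16-bit precision for pass two. */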
        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride;
    }
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
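    /* Second pass: vertical 6-tap filter over the 16-bit intermediates in
     * tmp.  The first five rows of the six-row window are preloaded and the
     * window slides down one row per iteration; products are widened to
     * 32 bits (even and odd lanes handled separately) because 20 * psum can
     * overflow 16 bits. */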
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
        tmpbis += tmpStride;
        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);
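        /* vec_mule/vec_mulo give 32-bit products of the even/odd 16-bit
         * lanes.  For the coefficient-1 term the even lanes are recovered by
         * an arithmetic right shift of 16 on the 32-bit view (sign-extending
         * the high halfword) and the odd lanes by multiplying by 1. */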
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);
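        /* sum = 20*(p0+p1) - 5*(m1+p2) + (m2+p3) + 512 over the pass-one
         * intermediates; shifting right by 10 folds in the /32 of both
         * passes, then the results are packed and saturated to 0..255. */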
        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum  = vec_perm(sumv, sumv, mperm);
        ASSERT_ALIGNED(dst);
        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);