#include "libavutil/mem.h"

#ifdef DEBUG
#include <assert.h>
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

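/* CHROMA_MC8_ALTIVEC_CORE emits one row of the 8-wide bilinear chroma
 * interpolation.  With the weights in vA..vD it computes, per pixel, roughly
 *
 *     dst[i] = op((A*cur[i] + B*cur[i+1] + C*next[i] + D*next[i+1] + bias) >> 6)
 *
 * where BIAS1 is folded into the first multiply-add, BIAS2 is applied after
 * the accumulation (see noop/add28 below) and OP_U8_ALTIVEC supplies the
 * put/avg behaviour.  The result is merged with vdst through fperm so the
 * other 8 bytes of the aligned 16-byte destination block keep their previous
 * contents.  vsrc2/3 become vsrc0/1 for the next row, so each iteration only
 * has to load one new source row.
 */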
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

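/* CHROMA_MC8_ALTIVEC_CORE_SIMPLE is the degenerate variant used when one of
 * the two fractional offsets is zero, so the bilinear filter collapses to a
 * single weight pair: dst[i] = op((vA*s0[i] + vE*s1[i] + 32) >> 6), where
 * vE = vB + vC and s0/s1 are either horizontally or vertically adjacent
 * samples depending on which offset survived.
 */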
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

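/* Bias plumbing for CHROMA_MC8_ALTIVEC_CORE: the H.264 flavour passes v32ss
 * as BIAS1 and 'noop' as BIAS2 (round with +32 before the >>6), while the
 * no-rounding VC-1 flavour passes 0 and 'add28', adding 28 after the
 * multiply-adds instead.
 */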
#define noop(a) a
#define add28(a) vec_add(v28ss, a)

#ifdef PREFIX_h264_chroma_mc8_altivec
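/* H.264 8xh chroma motion compensation.  x and y are the fractional offsets
 * (0..7); the bilinear weights are
 *     A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y
 * and, in scalar terms, each output sample is roughly
 *     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 * (put or avg, depending on how OP_U8_ALTIVEC is defined by the including
 * file).  The code below keys off D and C to pick either the full bilinear
 * core or the simplified one, and off the source alignment to decide how
 * many aligned loads it needs per row.
 */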
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
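    /* If src starts more than half way into a 16-byte block, the 9 bytes
     * needed per row spill into the next block and a second aligned load is
     * required.  When src % 16 == 15, vec_lvsl(1, src) wraps to offset 0 and
     * would select the first vector again, so that case uses the second
     * vector directly instead of a permute. */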
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

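    /* ABCD[3] (= x*y) nonzero means both offsets are fractional, so the full
     * four-tap bilinear core is needed.  Otherwise one direction drops out
     * and the two surviving weights are merged into vE = vB + vC for the
     * simplified core. */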
    if (ABCD[3]) {
        if (!loadSecond) {
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) {
            if (!loadSecond) {
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else {
            if (!loadSecond) {
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif


#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
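/* VC-1/WMV3 8xh "no rounding" chroma MC.  Same structure as the H.264
 * version above, but the rounding term is 28 instead of 32 and is added
 * after the four multiply-adds (BIAS1 = 0, BIAS2 = add28), i.e. roughly
 *     dst[i] = op((A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 28) >> 6)
 * Note that this variant always runs the full bilinear core; there is no
 * simplified path for x == 0 or y == 0.
 */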
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE


#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
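/* H.264 16-wide horizontal 6-tap lowpass (half-pel) filter.  For each of the
 * 16 rows it computes, per pixel, roughly
 *     dst[i] = clip(((src[i] + src[i+1])*20 - (src[i-1] + src[i+2])*5
 *                    + (src[i-2] + src[i+3]) + 16) >> 5)
 * and then applies OP_U8_ALTIVEC (put or avg) against the existing dst.
 * Each row is processed as two 8-element halves (the A/B variable pairs).
 */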
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

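        /* The 21 bytes needed for this row (src-2 .. src+18) start at offset
         * 'align' in the block loaded into srcR1.  In the default case all
         * six shifted copies come from srcR1/srcR2; cases 11-15 handle
         * offsets where one of the lvsl permutes wraps to 0 (that vector is
         * then used directly) and, for align >= 12, where a third load srcR3
         * is needed because the row spans three 16-byte blocks. */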
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif


#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
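/* H.264 16-wide vertical 6-tap lowpass (half-pel) filter.  Same kernel as the
 * horizontal version, applied down each column:
 *     dst[i] = clip(((P0 + P1)*20 - (M1 + P2)*5 + (M2 + P3) + 16) >> 5)
 * where M2..P3 are the six rows around the output row.  A single lvsl
 * permute computed from src is reused for every row, which relies on
 * srcStride keeping the 16-byte alignment constant.
 */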
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);

    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);

    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);

    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);

    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

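    /* Each iteration loads row P3, filters rows M2..P3 into one output row,
     * then slides the window: M2 <- M1 <- P0 <- P1 <- P2 <- P3, so only one
     * new row is loaded per output row. */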
    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif


#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
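/* H.264 16-wide horizontal+vertical 6-tap lowpass (the centre half-pel
 * sample).  Two passes: first, 21 rows are horizontally filtered (without
 * rounding or shifting) into the 16-bit buffer tmp; second, the same 6-tap
 * kernel is run vertically over tmp with 32-bit intermediates, the +512
 * rounding term is added and the result is shifted down by 10 before being
 * packed back to bytes.
 */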
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride;
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

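    /* Vertical pass over tmp.  Because the intermediate values no longer fit
     * in 16 bits, each 8-element half is split into even and odd lanes and
     * processed in 32 bits: vec_mule/vec_mulo give the *20 and *5 products,
     * sum3 is widened via an arithmetic shift (even lanes) and a multiply by
     * 1 (odd lanes), 512 is added for rounding, and the >>10 results are
     * packed back and re-interleaved into pixel order with mperm. */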
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif