00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "config.h"
00024 #if HAVE_ALTIVEC_H
00025 #include <altivec.h>
00026 #endif
00027 #include "libavcodec/dsputil.h"
00028 #include "dsputil_ppc.h"
00029 #include "util_altivec.h"
00030 #include "types_altivec.h"
00031 #include "dsputil_altivec.h"
00032
/**
 * SAD of a 16-wide block, h rows, between pix1 and pix2 half-pel
 * interpolated horizontally: compares pix1 against avg(pix2[x], pix2[x+1]).
 * @return scalar sum of absolute differences
 * (void *v is an unused context parameter kept for the dsputil
 *  function-pointer signature.)
 */
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into our vectors. The vectors are:
           pix1v:  pix1[0]  - pix1[15]
           pix2v:  pix2[0]  - pix2[15]
           pix2iv: pix2[1]  - pix2[16]  (one-pixel horizontal offset) */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector (rounded byte average). */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum-of-abs-differences vector: |pix1v - avgv|
           computed as max - min to stay in unsigned bytes. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4-pixel group together and accumulate 4 partial sums. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums and store the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
00077
/**
 * SAD of a 16-wide block, h rows, between pix1 and pix2 half-pel
 * interpolated vertically: compares pix1 against avg(pix2, pix2+line_size).
 * @return scalar sum of absolute differences
 */
static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /* Because pix3 = pix2 + line_size, the pix3 row of one iteration is
       the pix2 row of the next. Load pix2 once here and then carry the
       previous pix3v forward (see "pix2v = pix3v" below) to avoid one
       unaligned load per iteration. */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels:
           pix1v: pix1[0] - pix1[15]
           pix3v: pix3[0] - pix3[15] (row below) */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector (rounded byte average). */
        avgv = vec_avg(pix2v, pix3v);

        /* |pix1v - avgv| via max - min, staying in unsigned bytes. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4-pixel group together and accumulate 4 partial sums. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;       /* reuse this row's load as next row's pix2 */
        pix3 += line_size;

    }

    /* Sum up the four partial sums and store the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}
00133
/**
 * SAD of a 16-wide block, h rows, against the 4-tap half-pel interpolation
 * (pix2[x] + pix2[x+1] + pix3[x] + pix3[x+1] + 2) >> 2, where
 * pix3 = pix2 + line_size. The sums are kept in 16-bit lanes (high/low
 * halves of the row) so the +2 rounding and >>2 cannot overflow a byte.
 * @return scalar sum of absolute differences
 */
static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /* As in sad16_y2: pix3 of one iteration is pix2 of the next, so only
       the first pix2 row (and its +1 shifted copy) is loaded here; inside
       the loop the row sums t3/t4 are carried forward as t1/t2.
       pix2v:  pix2[0] - pix2[15],  pix2iv: pix2[1] - pix2[16] */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    /* Widen bytes to shorts (merge with zero) and form the horizontal
       pair sums pix2[x] + pix2[x+1] for the high (t1) and low (t2) halves. */
    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels:
           pix1v:  pix1[0] - pix1[15]
           pix3v:  pix3[0] - pix3[15],  pix3iv: pix3[1] - pix3[16] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /* Widen the pix3 rows to shorts, high and low halves. */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Horizontal pair sums of the pix3 row. */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        /* Rounded 4-pixel average: (t1 + t3 + 2) >> 2, per 16-bit lane. */
        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the high/low short halves back into one byte vector. */
        avgv = vec_pack(avghv, avglv);

        /* |pix1v - avgv| via max - min, staying in unsigned bytes. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4-pixel group together and accumulate 4 partial sums. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;

        /* This row's pair sums become next row's "pix2" pair sums. */
        t1 = t3;
        t2 = t4;
    }

    /* Sum up the four partial sums and store the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
00228
/**
 * Plain SAD of a 16-wide block, h rows: sum of |pix1[x] - pix2[x]|.
 * Neither pointer needs to be 16-byte aligned (vec_perm/vec_lvsl realign).
 * @return scalar sum of absolute differences
 */
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);


    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* |t1 - t2| via max - min, staying in unsigned bytes. */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4-pixel group together and accumulate 4 partial sums. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums and store the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
00270
/**
 * Plain SAD of an 8-wide block, h rows. A full 16-byte vector is loaded
 * per row; the upper 8 bytes are zeroed with the permclear mask so they
 * contribute nothing to the sum.
 * @return scalar sum of absolute differences
 */
static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    /* Mask: keep the first 8 bytes, clear the last 8. */
    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2, and
           clear the bytes that do not belong to the 8-wide block. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* |t1 - t2| via max - min, staying in unsigned bytes. */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4-pixel group together and accumulate 4 partial sums. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums and store the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
00315
/**
 * Sum of squares of all pixels of a 16x16 block (fixed 16 rows).
 * Uses vec_msum to multiply-accumulate bytes into 32-bit partial sums.
 * @return scalar sum of pix[x]^2 over the block
 */
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values and accumulate into 4 partial sums. */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums and store the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
00346
/**
 * Sum of squared errors of an 8-wide block, h rows:
 * sum of (pix1[x] - pix2[x])^2. As in sad8, a full 16-byte vector is
 * loaded per row and the upper 8 bytes are cleared by permclear.
 * @return scalar sum of squared differences
 */
static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    /* Mask: keep the first 8 bytes, clear the last 8. */
    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};


    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2, and
           clear the bytes that do not belong to the 8-wide block. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* |t1 - t2| via max - min, staying in unsigned bytes. */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and accumulate into 4 partial sums. */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums and store the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
00400
/**
 * Sum of squared errors of a 16-wide block, h rows:
 * sum of (pix1[x] - pix2[x])^2.
 * @return scalar sum of squared differences
 */
static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* |t1 - t2| via max - min, staying in unsigned bytes. */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and accumulate into 4 partial sums. */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums and store the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
00449
/**
 * Sum of all pixels of a 16x16 block (fixed 16 rows).
 * @return scalar sum of the 256 pixel values
 */
static int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1. */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4-pixel group together and accumulate 4 partial sums. */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums and store the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
00482
/**
 * Copy an 8x8 block of unsigned-byte pixels into a block of signed
 * 16-bit DCT coefficients (zero-extended). block is assumed 16-byte
 * aligned for the vec_st stores -- TODO confirm with callers.
 */
static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels.
           We're reading 16 pixels, but only keep the 8 we actually want. */
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        /* Convert the bytes into shorts (high half only = first 8 pixels). */
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        /* Save the data to the block, one row (16 bytes) per iteration. */
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
00507
/**
 * Compute an 8x8 block of differences: block[x] = s1[x] - s2[x],
 * widened to signed 16-bit. The loop is manually unrolled by two
 * (4 iterations, 2 rows each) to process all 8 rows.
 * block is assumed 16-byte aligned for the vec_st stores -- TODO confirm.
 */
static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        /* --- first row of this pair --- */
        /* Read potentially unaligned pixels.
           We're reading 16 pixels, but only keep the 8 we actually want. */
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        /* Convert the bytes into shorts (high half = first 8 pixels). */
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        /* Same for the second source. */
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        /* Convert the bytes into shorts. */
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        /* The difference of the two rows. */
        shorts1 = vec_sub(shorts1, shorts2);

        /* Save the data to the block; block advances 8 shorts per row. */
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        /* --- second row of this pair (manual unroll) --- */
        /* The code below is a copy of the code above...
           This is a manual unroll. */

        /* Read potentially unaligned pixels.
           We're reading 16 pixels, but only keep the 8 we actually want. */
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        /* Convert the bytes into shorts. */
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        /* Same for the second source. */
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        /* Convert the bytes into shorts. */
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        /* The difference of the two rows. */
        shorts1 = vec_sub(shorts1, shorts2);

        /* Save the data to the block. */
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
00578
00579
00580 static void clear_block_altivec(DCTELEM *block) {
00581 LOAD_ZERO;
00582 vec_st(zero_s16v, 0, block);
00583 vec_st(zero_s16v, 16, block);
00584 vec_st(zero_s16v, 32, block);
00585 vec_st(zero_s16v, 48, block);
00586 vec_st(zero_s16v, 64, block);
00587 vec_st(zero_s16v, 80, block);
00588 vec_st(zero_s16v, 96, block);
00589 vec_st(zero_s16v, 112, block);
00590 }
00591
00592
00593 static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
00594 register int i;
00595 register vector unsigned char vdst, vsrc;
00596
00597
00598 for (i = 0 ; (i + 15) < w ; i+=16) {
00599 vdst = vec_ld(i, (unsigned char*)dst);
00600 vsrc = vec_ld(i, (unsigned char*)src);
00601 vdst = vec_add(vsrc, vdst);
00602 vec_st(vdst, i, (unsigned char*)dst);
00603 }
00604
00605 for (; (i < w) ; i++) {
00606 dst[i] = src[i];
00607 }
00608 }
00609
00610
/**
 * Copy a 16xh block from pixels (possibly unaligned) to block.
 * block is assumed 16-byte aligned for the vec_st stores -- TODO confirm.
 * Non-static: declared in dsputil_altivec.h for use by other units.
 */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    /* The alignment of pixels is constant across rows (line_size is
       assumed 16-byte-multiple-friendly for this perm reuse -- TODO
       confirm), so the realignment permutation is computed only once. */
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    /* Two variants: the simple one-row-per-iteration version (disabled)
       and the 4x-unrolled version actually compiled. */
#if 0
    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, block);
        pixels+=line_size;
        block +=line_size;
    }
#else
    for (i = 0; i < h; i += 4) {
        /* Second load uses offset 15 (last byte of the unaligned span)
           so the pair always covers the 16 wanted bytes. */
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(15, pixels);
        pixelsv1B = vec_ld(line_size, pixels);
        pixelsv2B = vec_ld(15 + line_size, pixels);
        pixelsv1C = vec_ld(line_size_2, pixels);
        pixelsv2C = vec_ld(15 + line_size_2, pixels);
        pixelsv1D = vec_ld(line_size_3, pixels);
        pixelsv2D = vec_ld(15 + line_size_3, pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels+=line_size_4;
        block +=line_size_4;
    }
#endif
    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
}
00664
00665
00666 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
/**
 * Average a 16xh block of pixels into block:
 * block[x] = rounded_avg(block[x], pixels[x]), via vec_avg.
 * block is assumed 16-byte aligned -- TODO confirm with callers.
 * Non-static: declared in dsputil_altivec.h for use by other units.
 */
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    /* Source alignment is constant across rows, so compute the
       realignment permutation once. */
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        /* Load two aligned vectors and realign to get the 16 source bytes. */
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16,pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        /* Rounded byte average with the destination, then store back. */
        blockv = vec_avg(blockv,pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
}
00689
00690
/**
 * Average an 8xh block of pixels into block:
 * block[x] = rounded_avg(block[x], pixels[x]).
 * Since vec_st always writes 16 bytes, the 8 source bytes are merged
 * into the correct half of the loaded destination vector (chosen by
 * the low 4 bits of the block address) so the other half is rewritten
 * unchanged.
 */
static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        /* block is 8-byte aligned: nonzero low nibble means the 8 wanted
           bytes are the upper half of the 16-byte vector. */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        /* Splice the 8 source bytes into the half of blockv that will
           be averaged; keep the other half equal to blockv so vec_avg
           leaves it unchanged. */
        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
}
00725
00726
/**
 * Store an 8xh block of half-pel (x+1/2, y+1/2) interpolated pixels:
 * block[x] = (p[x] + p[x+1] + p_below[x] + p_below[x+1] + 2) >> 2.
 * The row pair sums are kept in 16-bit lanes; each iteration reuses the
 * previous row's sum (pixelssum1, carried with the +2 rounding already
 * added) so each source row is loaded only once.
 */
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline: load row 0 and its one-pixel-right copy. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* When pixels ends exactly at byte 15 of its vector, lvsl(1, ...)
       would wrap; the shifted row is then exactly temp2. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* Widen to shorts and form p[x] + p[x+1] (+2 rounding carry-in). */
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        /* block is 8-byte aligned; pick which half of the 16-byte store
           the 8 result bytes go into. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Load the row below and its one-pixel-right copy. */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        /* Pair sum of the new row, then 4-tap average with the carried
           previous-row sum: (prev + cur + 2) >> 2. */
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* Carry this row's sum (with rounding pre-added) to next row. */
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Splice the 8 result bytes into the correct half of blockv. */
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
}
00788
00789
/**
 * No-rounding variant of put_pixels8_xy2:
 * block[x] = (p[x] + p[x+1] + p_below[x] + p_below[x+1] + 1) >> 2.
 * Identical structure to put_pixels8_xy2_altivec except the rounding
 * carry-in is vcone (+1) instead of vctwo (+2).
 */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline: load row 0 and its one-pixel-right copy. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* When pixels ends exactly at byte 15 of its vector, lvsl(1, ...)
       would wrap; the shifted row is then exactly temp2. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* Widen to shorts and form p[x] + p[x+1] (+1 carry-in: no rounding). */
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        /* block is 8-byte aligned; pick which half of the 16-byte store
           the 8 result bytes go into. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Load the row below and its one-pixel-right copy. */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        /* Pair sum of the new row, then (prev + cur + 1) >> 2. */
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* Carry this row's sum (+1 pre-added) to the next row. */
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Splice the 8 result bytes into the correct half of blockv. */
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
}
00852
00853
/**
 * Store a 16xh block of half-pel (x+1/2, y+1/2) interpolated pixels:
 * block[x] = (p[x] + p[x+1] + p_below[x] + p_below[x+1] + 2) >> 2.
 * Like put_pixels8_xy2 but processes the full 16-byte row, so the
 * widened pair sums are kept in two short vectors: pixelssum1/2 for
 * the high (first 8) pixels and pixelssum3/4 for the low (last 8).
 * block is assumed 16-byte aligned -- TODO confirm with callers.
 */
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    /* Prime the pipeline: load row 0 and its one-pixel-right copy. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* When pixels ends exactly at byte 15 of its vector, lvsl(1, ...)
       would wrap; the shifted row is then exactly temp2. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* Widen to shorts: v3/v4 = low 8 pixels, v1/v2 = high 8 pixels;
       then form p[x] + p[x+1] with the +2 rounding carry-in. */
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        /* Load the row below and its one-pixel-right copy. */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        /* Widen the new row the same way. */
        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        /* Pair sums of the new row, then (prev + cur + 2) >> 2
           for both halves. */
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        /* Carry this row's sums (rounding pre-added) to the next row. */
        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        /* Pack both short halves back into 16 result bytes. */
        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
}
00925
00926
/**
 * No-rounding variant of put_pixels16_xy2:
 * block[x] = (p[x] + p[x+1] + p_below[x] + p_below[x+1] + 1) >> 2.
 * Identical structure to put_pixels16_xy2_altivec except the rounding
 * carry-in is vcone (+1) instead of vctwo (+2).
 * block is assumed 16-byte aligned -- TODO confirm with callers.
 */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    /* Prime the pipeline: load row 0 and its one-pixel-right copy. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* When pixels ends exactly at byte 15 of its vector, lvsl(1, ...)
       would wrap; the shifted row is then exactly temp2. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* Widen to shorts (low and high halves) and form p[x] + p[x+1]
       with the +1 carry-in (no rounding). */
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        /* Load the row below and its one-pixel-right copy. */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        /* Widen the new row the same way. */
        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        /* Pair sums of the new row, then (prev + cur + 1) >> 2
           for both halves. */
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        /* Carry this row's sums (+1 pre-added) to the next row. */
        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        /* Pack both short halves back into 16 result bytes. */
        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
}
00999
/*
 * SATD (sum of absolute values of the Hadamard-transformed difference)
 * of one 8x8 block: src and dst point at 8-pixel-wide rows, stride bytes
 * apart.  The opaque context s and the row count h are unused here
 * (the function always processes exactly 8 rows).  Returns the scalar sum.
 */
static int hadamard8_diff8x8_altivec( void *s, uint8_t *dst, uint8_t *src, int stride, int h){
POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
    int sum;
    register const vector unsigned char vzero =
        (const vector unsigned char)vec_splat_u8(0);
    /* temp0..temp7: the eight horizontally transformed difference rows */
    register vector signed short temp0, temp1, temp2, temp3, temp4,
        temp5, temp6, temp7;
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
    {
    /* +1/-1 sign vectors and the matching element-swap permutations for the
     * three stages of the 8-point horizontal butterfly (pair distance 1, 2,
     * then 4).  Each stage computes but +/- swapped(but) in one step as
     * vec_mladd(but, vprod, vec_perm(but, but, perm)). */
    register const vector signed short vprod1 =(const vector signed short)
        { 1,-1, 1,-1, 1,-1, 1,-1 };
    register const vector signed short vprod2 =(const vector signed short)
        { 1, 1,-1,-1, 1, 1,-1,-1 };
    register const vector signed short vprod3 =(const vector signed short)
        { 1, 1, 1, 1,-1,-1,-1,-1 };
    register const vector unsigned char perm1 = (const vector unsigned char)
        {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
         0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
    register const vector unsigned char perm2 = (const vector unsigned char)
        {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
         0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
    register const vector unsigned char perm3 = (const vector unsigned char)
        {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};

    /* One row i: load 8 src and 8 dst pixels with the classic unaligned
     * vec_ld/vec_lvsl/vec_perm idiom, zero-extend them to signed 16 bit,
     * subtract, and run the 3-stage horizontal butterfly into res. */
#define ONEITERBUTTERFLY(i, res) \
    { \
    register vector unsigned char src1, src2, srcO; \
    register vector unsigned char dst1, dst2, dstO; \
    register vector signed short srcV, dstV; \
    register vector signed short but0, but1, but2, op1, op2, op3; \
    src1 = vec_ld(stride * i, src); \
    src2 = vec_ld((stride * i) + 15, src); \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    dst2 = vec_ld((stride * i) + 15, dst); \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    \
    /* mergeh with zero zero-extends the 8 low bytes to 16-bit lanes */ \
    /* (big-endian: the zero byte lands in the high half of each lane) */ \
    srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
                                           (vector signed char)srcO); \
    dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
                                           (vector signed char)dstO); \
    \
    but0 = vec_sub(srcV, dstV); \
    op1 = vec_perm(but0, but0, perm1); \
    but1 = vec_mladd(but0, vprod1, op1); \
    op2 = vec_perm(but1, but1, perm2); \
    but2 = vec_mladd(but1, vprod2, op2); \
    op3 = vec_perm(but2, but2, perm3); \
    res = vec_mladd(but2, vprod3, op3); \
    }
    ONEITERBUTTERFLY(0, temp0);
    ONEITERBUTTERFLY(1, temp1);
    ONEITERBUTTERFLY(2, temp2);
    ONEITERBUTTERFLY(3, temp3);
    ONEITERBUTTERFLY(4, temp4);
    ONEITERBUTTERFLY(5, temp5);
    ONEITERBUTTERFLY(6, temp6);
    ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
    /* Vertical 8-point butterfly across the eight row vectors (three
     * stages: distance 1, 2, 4), then sum |result| over all 64 lanes. */
    register vector signed int vsum;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    /* accumulate per-word absolute sums, then reduce to one scalar */
    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
    return sum;
}
01106
01107
01108
01109
01110
01111
01112
01113
01114
01115
01116
01117
01118
01119
01120
01121
01122
01123
01124
01125
01126 static int hadamard8_diff16x8_altivec( void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
01127 int sum;
01128 register vector signed short
01129 temp0 __asm__ ("v0"),
01130 temp1 __asm__ ("v1"),
01131 temp2 __asm__ ("v2"),
01132 temp3 __asm__ ("v3"),
01133 temp4 __asm__ ("v4"),
01134 temp5 __asm__ ("v5"),
01135 temp6 __asm__ ("v6"),
01136 temp7 __asm__ ("v7");
01137 register vector signed short
01138 temp0S __asm__ ("v8"),
01139 temp1S __asm__ ("v9"),
01140 temp2S __asm__ ("v10"),
01141 temp3S __asm__ ("v11"),
01142 temp4S __asm__ ("v12"),
01143 temp5S __asm__ ("v13"),
01144 temp6S __asm__ ("v14"),
01145 temp7S __asm__ ("v15");
01146 register const vector unsigned char vzero __asm__ ("v31") =
01147 (const vector unsigned char)vec_splat_u8(0);
01148 {
01149 register const vector signed short vprod1 __asm__ ("v16") =
01150 (const vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1 };
01151 register const vector signed short vprod2 __asm__ ("v17") =
01152 (const vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1 };
01153 register const vector signed short vprod3 __asm__ ("v18") =
01154 (const vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1 };
01155 register const vector unsigned char perm1 __asm__ ("v19") =
01156 (const vector unsigned char)
01157 {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
01158 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
01159 register const vector unsigned char perm2 __asm__ ("v20") =
01160 (const vector unsigned char)
01161 {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
01162 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
01163 register const vector unsigned char perm3 __asm__ ("v21") =
01164 (const vector unsigned char)
01165 {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
01166 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
01167
01168 #define ONEITERBUTTERFLY(i, res1, res2) \
01169 { \
01170 register vector unsigned char src1 __asm__ ("v22"), \
01171 src2 __asm__ ("v23"), \
01172 dst1 __asm__ ("v24"), \
01173 dst2 __asm__ ("v25"), \
01174 srcO __asm__ ("v22"), \
01175 dstO __asm__ ("v23"); \
01176 \
01177 register vector signed short srcV __asm__ ("v24"), \
01178 dstV __asm__ ("v25"), \
01179 srcW __asm__ ("v26"), \
01180 dstW __asm__ ("v27"), \
01181 but0 __asm__ ("v28"), \
01182 but0S __asm__ ("v29"), \
01183 op1 __asm__ ("v30"), \
01184 but1 __asm__ ("v22"), \
01185 op1S __asm__ ("v23"), \
01186 but1S __asm__ ("v24"), \
01187 op2 __asm__ ("v25"), \
01188 but2 __asm__ ("v26"), \
01189 op2S __asm__ ("v27"), \
01190 but2S __asm__ ("v28"), \
01191 op3 __asm__ ("v29"), \
01192 op3S __asm__ ("v30"); \
01193 \
01194 src1 = vec_ld(stride * i, src); \
01195 src2 = vec_ld((stride * i) + 16, src); \
01196 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
01197 dst1 = vec_ld(stride * i, dst); \
01198 dst2 = vec_ld((stride * i) + 16, dst); \
01199 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
01200 \
01201 srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
01202 (vector signed char)srcO); \
01203 dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
01204 (vector signed char)dstO); \
01205 srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
01206 (vector signed char)srcO); \
01207 dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
01208 (vector signed char)dstO); \
01209 \
01210 but0 = vec_sub(srcV, dstV); \
01211 but0S = vec_sub(srcW, dstW); \
01212 op1 = vec_perm(but0, but0, perm1); \
01213 but1 = vec_mladd(but0, vprod1, op1); \
01214 op1S = vec_perm(but0S, but0S, perm1); \
01215 but1S = vec_mladd(but0S, vprod1, op1S); \
01216 op2 = vec_perm(but1, but1, perm2); \
01217 but2 = vec_mladd(but1, vprod2, op2); \
01218 op2S = vec_perm(but1S, but1S, perm2); \
01219 but2S = vec_mladd(but1S, vprod2, op2S); \
01220 op3 = vec_perm(but2, but2, perm3); \
01221 res1 = vec_mladd(but2, vprod3, op3); \
01222 op3S = vec_perm(but2S, but2S, perm3); \
01223 res2 = vec_mladd(but2S, vprod3, op3S); \
01224 }
01225 ONEITERBUTTERFLY(0, temp0, temp0S);
01226 ONEITERBUTTERFLY(1, temp1, temp1S);
01227 ONEITERBUTTERFLY(2, temp2, temp2S);
01228 ONEITERBUTTERFLY(3, temp3, temp3S);
01229 ONEITERBUTTERFLY(4, temp4, temp4S);
01230 ONEITERBUTTERFLY(5, temp5, temp5S);
01231 ONEITERBUTTERFLY(6, temp6, temp6S);
01232 ONEITERBUTTERFLY(7, temp7, temp7S);
01233 }
01234 #undef ONEITERBUTTERFLY
01235 {
01236 register vector signed int vsum;
01237 register vector signed short line0S, line1S, line2S, line3S, line4S,
01238 line5S, line6S, line7S, line0BS,line2BS,
01239 line1BS,line3BS,line4BS,line6BS,line5BS,
01240 line7BS,line0CS,line4CS,line1CS,line5CS,
01241 line2CS,line6CS,line3CS,line7CS;
01242
01243 register vector signed short line0 = vec_add(temp0, temp1);
01244 register vector signed short line1 = vec_sub(temp0, temp1);
01245 register vector signed short line2 = vec_add(temp2, temp3);
01246 register vector signed short line3 = vec_sub(temp2, temp3);
01247 register vector signed short line4 = vec_add(temp4, temp5);
01248 register vector signed short line5 = vec_sub(temp4, temp5);
01249 register vector signed short line6 = vec_add(temp6, temp7);
01250 register vector signed short line7 = vec_sub(temp6, temp7);
01251
01252 register vector signed short line0B = vec_add(line0, line2);
01253 register vector signed short line2B = vec_sub(line0, line2);
01254 register vector signed short line1B = vec_add(line1, line3);
01255 register vector signed short line3B = vec_sub(line1, line3);
01256 register vector signed short line4B = vec_add(line4, line6);
01257 register vector signed short line6B = vec_sub(line4, line6);
01258 register vector signed short line5B = vec_add(line5, line7);
01259 register vector signed short line7B = vec_sub(line5, line7);
01260
01261 register vector signed short line0C = vec_add(line0B, line4B);
01262 register vector signed short line4C = vec_sub(line0B, line4B);
01263 register vector signed short line1C = vec_add(line1B, line5B);
01264 register vector signed short line5C = vec_sub(line1B, line5B);
01265 register vector signed short line2C = vec_add(line2B, line6B);
01266 register vector signed short line6C = vec_sub(line2B, line6B);
01267 register vector signed short line3C = vec_add(line3B, line7B);
01268 register vector signed short line7C = vec_sub(line3B, line7B);
01269
01270 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
01271 vsum = vec_sum4s(vec_abs(line1C), vsum);
01272 vsum = vec_sum4s(vec_abs(line2C), vsum);
01273 vsum = vec_sum4s(vec_abs(line3C), vsum);
01274 vsum = vec_sum4s(vec_abs(line4C), vsum);
01275 vsum = vec_sum4s(vec_abs(line5C), vsum);
01276 vsum = vec_sum4s(vec_abs(line6C), vsum);
01277 vsum = vec_sum4s(vec_abs(line7C), vsum);
01278
01279 line0S = vec_add(temp0S, temp1S);
01280 line1S = vec_sub(temp0S, temp1S);
01281 line2S = vec_add(temp2S, temp3S);
01282 line3S = vec_sub(temp2S, temp3S);
01283 line4S = vec_add(temp4S, temp5S);
01284 line5S = vec_sub(temp4S, temp5S);
01285 line6S = vec_add(temp6S, temp7S);
01286 line7S = vec_sub(temp6S, temp7S);
01287
01288 line0BS = vec_add(line0S, line2S);
01289 line2BS = vec_sub(line0S, line2S);
01290 line1BS = vec_add(line1S, line3S);
01291 line3BS = vec_sub(line1S, line3S);
01292 line4BS = vec_add(line4S, line6S);
01293 line6BS = vec_sub(line4S, line6S);
01294 line5BS = vec_add(line5S, line7S);
01295 line7BS = vec_sub(line5S, line7S);
01296
01297 line0CS = vec_add(line0BS, line4BS);
01298 line4CS = vec_sub(line0BS, line4BS);
01299 line1CS = vec_add(line1BS, line5BS);
01300 line5CS = vec_sub(line1BS, line5BS);
01301 line2CS = vec_add(line2BS, line6BS);
01302 line6CS = vec_sub(line2BS, line6BS);
01303 line3CS = vec_add(line3BS, line7BS);
01304 line7CS = vec_sub(line3BS, line7BS);
01305
01306 vsum = vec_sum4s(vec_abs(line0CS), vsum);
01307 vsum = vec_sum4s(vec_abs(line1CS), vsum);
01308 vsum = vec_sum4s(vec_abs(line2CS), vsum);
01309 vsum = vec_sum4s(vec_abs(line3CS), vsum);
01310 vsum = vec_sum4s(vec_abs(line4CS), vsum);
01311 vsum = vec_sum4s(vec_abs(line5CS), vsum);
01312 vsum = vec_sum4s(vec_abs(line6CS), vsum);
01313 vsum = vec_sum4s(vec_abs(line7CS), vsum);
01314 vsum = vec_sums(vsum, (vector signed int)vzero);
01315 vsum = vec_splat(vsum, 3);
01316 vec_ste(vsum, 0, &sum);
01317 }
01318 return sum;
01319 }
01320
01321 static int hadamard8_diff16_altivec( void *s, uint8_t *dst, uint8_t *src, int stride, int h){
01322 POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
01323 int score;
01324 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
01325 score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
01326 if (h==16) {
01327 dst += 8*stride;
01328 src += 8*stride;
01329 score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
01330 }
01331 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
01332 return score;
01333 }
01334
/*
 * Vorbis inverse channel coupling, AltiVec: converts (magnitude, angle)
 * residue pairs in mag[]/ang[] back to the two channel spectra in place,
 * four floats per iteration.  Assumes blocksize is a multiple of 4 and
 * mag/ang are 16-byte aligned (vec_ld/vec_stl are used with no
 * misalignment handling) -- NOTE(review): confirm both at the caller.
 */
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                            int blocksize)
{
    int i;
    vector float m, a;
    vector bool int t0, t1;
    /* build the constant 31 (sign-bit shift count) without a memory load:
     * vec_splat_u32 only takes a 5-bit signed immediate, so 15+15+1 */
    const vector unsigned int v_31 =
        vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
    for (i = 0; i < blocksize; i += 4) {
        m = vec_ld(0, mag+i);
        a = vec_ld(0, ang+i);
        t0 = vec_cmple(m, (vector float)vec_splat_u32(0)); /* lanes where mag <= 0 */
        t1 = vec_cmple(a, (vector float)vec_splat_u32(0)); /* lanes where ang <= 0 */
        /* flip the sign of ang in every lane where mag <= 0
         * (all-ones mask << 31 leaves exactly the float sign bit set) */
        a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
        /* split the adjusted angle by the original sign of ang:
         * t0 = a in lanes where ang <= 0, t1 = a in lanes where ang > 0 */
        t0 = (vector bool int)vec_and(a, t1);
        t1 = (vector bool int)vec_andc(a, t1);
        /* new ang = mag - (ang>0 part); new mag = mag + (ang<=0 part) */
        a = vec_sub(m, (vector float)t1);
        m = vec_add(m, (vector float)t0);
        /* vec_stl: store with LRU hint, the data will not be re-read soon */
        vec_stl(a, 0, ang+i);
        vec_stl(m, 0, mag+i);
    }
}
01357
01358
/*
 * 8xh averaging motion compensation with half-pel interpolation in both
 * directions: block[] = avg(block[], rounded 4-tap average of pixels,
 * pixels+1, pixels+line_size, pixels+line_size+1).  The +2 rounding
 * constant and >>2 implement (a+b+c+d+2)/4.  block is assumed 8-byte
 * aligned so each 16-byte line holds it entirely in one half.
 */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    /* pixelssum1/2: 16-bit horizontal sums (p[x] + p[x+1]) of two
     * consecutive source rows; temp3: their vertical sum */
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
        vec_splat_u16(2);

    /* prime the pipeline with the horizontal sum of row 0 */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        /* pixels+1 is exactly 16-byte aligned: lvsl(1,...) would wrap to a
         * zero shift and select the wrong quadword, so take temp2 directly */
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* zero-extend the low 8 bytes to 16-bit lanes and add p[x] + p[x+1] */
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        /* which 8-byte half of the 16-byte line the block occupies */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* horizontal sum of the next source row (pixels has not been
         * advanced yet, hence the line_size offset) */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            /* same aligned-wrap special case as above */
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* (row_i sum + row_i+1 sum + 2) >> 2, rounding carried in sum1 */
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* carry this row's sum (plus rounding) into the next iteration */
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* splice the 8 interpolated bytes into the correct half of blockv */
        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        /* final averaging with the existing block contents */
        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
}
01424
01425 void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
01426 {
01427 c->pix_abs[0][1] = sad16_x2_altivec;
01428 c->pix_abs[0][2] = sad16_y2_altivec;
01429 c->pix_abs[0][3] = sad16_xy2_altivec;
01430 c->pix_abs[0][0] = sad16_altivec;
01431 c->pix_abs[1][0] = sad8_altivec;
01432 c->sad[0]= sad16_altivec;
01433 c->sad[1]= sad8_altivec;
01434 c->pix_norm1 = pix_norm1_altivec;
01435 c->sse[1]= sse8_altivec;
01436 c->sse[0]= sse16_altivec;
01437 c->pix_sum = pix_sum_altivec;
01438 c->diff_pixels = diff_pixels_altivec;
01439 c->get_pixels = get_pixels_altivec;
01440 c->clear_block = clear_block_altivec;
01441 c->add_bytes= add_bytes_altivec;
01442 c->put_pixels_tab[0][0] = put_pixels16_altivec;
01443
01444 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
01445 c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
01446 c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
01447 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
01448 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
01449 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
01450 c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
01451 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
01452
01453 c->hadamard8_diff[0] = hadamard8_diff16_altivec;
01454 c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
01455 if (CONFIG_VORBIS_DECODER)
01456 c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
01457 }