00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00028 #include <stdlib.h>
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <sys/time.h>
00032 #include <unistd.h>
00033 #include <math.h>
00034
00035 #include "libavutil/common.h"
00036 #include "libavutil/lfg.h"
00037
00038 #include "simple_idct.h"
00039 #include "aandcttab.h"
00040 #include "faandct.h"
00041 #include "faanidct.h"
00042 #include "x86/idct_xvid.h"
00043 #include "dctref.h"
00044
00045 #undef printf
00046
/*
 * Prototypes for platform-specific transforms that are declared locally
 * rather than pulled in from a header. Each one is only referenced from
 * algos[] inside the matching preprocessor conditional.
 */

/* Used by the "LIBMPEG2-MMX" / "LIBMPEG2-MMX2" entries (HAVE_MMX && CONFIG_GPL). */
00047 void ff_mmx_idct(DCTELEM *data);
00048 void ff_mmxext_idct(DCTELEM *data);
00049
/* Declared here but not referenced anywhere else in this file. */
00050 void odivx_idct_c(short *block);
00051
00052
/* Used by the "BFINidct" / "BFINfdct" entries (ARCH_BFIN). */
00053 void ff_bfin_idct(DCTELEM *block);
00054 void ff_bfin_fdct(DCTELEM *block);
00055
00056
/* Used by the "altivecfdct" entry (HAVE_ALTIVEC). */
00057 void fdct_altivec(DCTELEM *block);
00058
00059
00060
/* Used by the "*-ARM*" / "SIMPLE-NEON" entries (ARCH_ARM and its sub-conditions). */
00061 void ff_j_rev_dct_arm(DCTELEM *data);
00062 void ff_simple_idct_arm(DCTELEM *data);
00063 void ff_simple_idct_armv5te(DCTELEM *data);
00064 void ff_simple_idct_armv6(DCTELEM *data);
00065 void ff_simple_idct_neon(DCTELEM *data);
00066
/* Used by the "SIMPLE-ALPHA" entry (ARCH_ALPHA). */
00067 void ff_simple_idct_axp(DCTELEM *data);
00068
00069 struct algo {
00070 const char *name;
00071 enum { FDCT, IDCT } is_idct;
00072 void (* func) (DCTELEM *block);
00073 void (* ref) (DCTELEM *block);
00074 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
00075 int mm_support;
00076 };
00077
/*
 * Output convention of the "FAAN" forward DCT entry in algos[]:
 * without FAAN_POSTSCALE its result is tagged SCALE_PERM, which makes
 * dct_error() rescale every coefficient with ff_aanscales before comparing;
 * with FAAN_POSTSCALE defined the result is compared as-is (NO_PERM).
 */
00078 #ifndef FAAN_POSTSCALE
00079 #define FAAN_SCALE SCALE_PERM
00080 #else
00081 #define FAAN_SCALE NO_PERM
00082 #endif
00083
/*
 * CPU feature bits, set once in main() from mm_support(). Read by
 * mmx_emms() and by main() when filtering algos[] on mm_support.
 */
00084 static int cpu_flags;
00085
00086 struct algo algos[] = {
00087 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM},
00088 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE},
00089 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM},
00090 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM},
00091 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
00092 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM},
00093 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM},
00094 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM},
00095
00096 #if HAVE_MMX
00097 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, FF_MM_MMX},
00098 #if HAVE_MMX2
00099 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, FF_MM_MMX2},
00100 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, FF_MM_SSE2},
00101 #endif
00102
00103 #if CONFIG_GPL
00104 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, FF_MM_MMX},
00105 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, FF_MM_MMX2},
00106 #endif
00107 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, FF_MM_MMX},
00108 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, FF_MM_MMX},
00109 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, FF_MM_MMX2},
00110 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, FF_MM_SSE2},
00111 #endif
00112
00113 #if HAVE_ALTIVEC
00114 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, FF_MM_ALTIVEC},
00115 #endif
00116
00117 #if ARCH_BFIN
00118 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM},
00119 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM},
00120 #endif
00121
00122 #if ARCH_ARM
00123 {"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
00124 {"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM },
00125 #if HAVE_ARMV5TE
00126 {"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
00127 #endif
00128 #if HAVE_ARMV6
00129 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
00130 #endif
00131 #if HAVE_NEON
00132 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
00133 #endif
00134 #endif
00135
00136 #if ARCH_ALPHA
00137 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM },
00138 #endif
00139
00140 { 0 }
00141 };
00142
/* Fixed-point precision (in bits) used by dct_error() when it rescales SCALE_PERM output. */
00143 #define AANSCALE_BITS 12
00144
/*
 * Clamp-to-[0,255] lookup table with MAX_NEG_CROP guard entries on each
 * side; filled in main(). Not read in this file, so it is presumably
 * consumed by the transform code linked into the test -- TODO confirm.
 */
00145 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
00146
/*
 * Current wall-clock time in microseconds, as reported by gettimeofday().
 * Only used to measure elapsed time in the speed tests.
 */
static int64_t gettime(void)
{
    struct timeval now;
    int64_t usec;

    gettimeofday(&now, NULL);

    /* widen before multiplying so the seconds part cannot overflow */
    usec  = (int64_t)now.tv_sec * 1000000;
    usec += now.tv_usec;
    return usec;
}
00153
/* Number of random blocks checked per algorithm in the accuracy tests. */
00154 #define NB_ITS 20000
/* Number of transform calls per batch between clock reads in the speed tests. */
00155 #define NB_ITS_SPEED 50000
00156
/*
 * Coefficient permutation tables. Entry [i] is the position in the permuted
 * block that receives source coefficient i.
 */

/* MMX_PERM layout; computed at start-up by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* MMX_SIMPLE_PERM layout. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* SSE2_PERM layout: column reordering applied inside every row. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

/*
 * Fill idct_mmx_perm[]. The row bits (3..5) of the index are kept and the
 * three column bits are rotated right by one position.
 */
static void idct_mmx_init(void)
{
    int idx = 0;

    while (idx < 64) {
        int row_bits  = idx & 0x38;
        int upper_col = (idx & 6) >> 1;  /* column bits 2,1 -> bits 1,0 */
        int lower_col = (idx & 1) << 2;  /* column bit 0    -> bit 2    */

        idct_mmx_perm[idx] = row_bits | upper_col | lower_col;
        idx++;
    }
}
00182
/*
 * Shared 8x8 work buffers (DECLARE_ALIGNED is a project macro; its first
 * argument is presumably the requested byte alignment -- TODO confirm).
 */
/* Buffer handed to the function under test; holds its output afterwards. */
00183 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
/* Generated input; overwritten in place with the reference transform's output. */
00184 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/* Untouched copy of the generated input; only read by the disabled debug dump in dct_error(). */
00185 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
00186
/*
 * Issue the x86 "emms" instruction when the build has MMX support and the
 * running CPU reports FF_MM_MMX; a no-op otherwise. Called after running
 * the functions under test (presumably so that later floating-point code
 * sees clean x87 state -- TODO confirm).
 */
static inline void mmx_emms(void)
{
#if HAVE_MMX
    if (!(cpu_flags & FF_MM_MMX))
        return;

    __asm__ volatile ("emms\n\t");
#endif
}
00194
00195 static void dct_error(const char *name, int is_idct,
00196 void (*fdct_func)(DCTELEM *block),
00197 void (*fdct_ref)(DCTELEM *block), int form, int test)
00198 {
00199 int it, i, scale;
00200 int err_inf, v;
00201 int64_t err2, ti, ti1, it1;
00202 int64_t sysErr[64], sysErrMax=0;
00203 int maxout=0;
00204 int blockSumErrMax=0, blockSumErr;
00205 AVLFG prng;
00206
00207 av_lfg_init(&prng, 1);
00208
00209 err_inf = 0;
00210 err2 = 0;
00211 for(i=0; i<64; i++) sysErr[i]=0;
00212 for(it=0;it<NB_ITS;it++) {
00213 for(i=0;i<64;i++)
00214 block1[i] = 0;
00215 switch(test){
00216 case 0:
00217 for(i=0;i<64;i++)
00218 block1[i] = (av_lfg_get(&prng) % 512) -256;
00219 if (is_idct){
00220 ff_ref_fdct(block1);
00221
00222 for(i=0;i<64;i++)
00223 block1[i]>>=3;
00224 }
00225 break;
00226 case 1:{
00227 int num = av_lfg_get(&prng) % 10 + 1;
00228 for(i=0;i<num;i++)
00229 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % 512 -256;
00230 }break;
00231 case 2:
00232 block1[0] = av_lfg_get(&prng) % 4096 - 2048;
00233 block1[63]= (block1[0]&1)^1;
00234 break;
00235 }
00236
00237 #if 0 // simulate mismatch control
00238 { int sum=0;
00239 for(i=0;i<64;i++)
00240 sum+=block1[i];
00241
00242 if((sum&1)==0) block1[63]^=1;
00243 }
00244 #endif
00245
00246 for(i=0; i<64; i++)
00247 block_org[i]= block1[i];
00248
00249 if (form == MMX_PERM) {
00250 for(i=0;i<64;i++)
00251 block[idct_mmx_perm[i]] = block1[i];
00252 } else if (form == MMX_SIMPLE_PERM) {
00253 for(i=0;i<64;i++)
00254 block[idct_simple_mmx_perm[i]] = block1[i];
00255
00256 } else if (form == SSE2_PERM) {
00257 for(i=0; i<64; i++)
00258 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
00259 } else if (form == PARTTRANS_PERM) {
00260 for(i=0; i<64; i++)
00261 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
00262 } else {
00263 for(i=0; i<64; i++)
00264 block[i]= block1[i];
00265 }
00266 #if 0 // simulate mismatch control for tested IDCT but not the ref
00267 { int sum=0;
00268 for(i=0;i<64;i++)
00269 sum+=block[i];
00270
00271 if((sum&1)==0) block[63]^=1;
00272 }
00273 #endif
00274
00275 fdct_func(block);
00276 mmx_emms();
00277
00278 if (form == SCALE_PERM) {
00279 for(i=0; i<64; i++) {
00280 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
00281 block[i] = (block[i] * scale ) >> AANSCALE_BITS;
00282 }
00283 }
00284
00285 fdct_ref(block1);
00286
00287 blockSumErr=0;
00288 for(i=0;i<64;i++) {
00289 v = abs(block[i] - block1[i]);
00290 if (v > err_inf)
00291 err_inf = v;
00292 err2 += v * v;
00293 sysErr[i] += block[i] - block1[i];
00294 blockSumErr += v;
00295 if( abs(block[i])>maxout) maxout=abs(block[i]);
00296 }
00297 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
00298 #if 0 // print different matrix pairs
00299 if(blockSumErr){
00300 printf("\n");
00301 for(i=0; i<64; i++){
00302 if((i&7)==0) printf("\n");
00303 printf("%4d ", block_org[i]);
00304 }
00305 for(i=0; i<64; i++){
00306 if((i&7)==0) printf("\n");
00307 printf("%4d ", block[i] - block1[i]);
00308 }
00309 }
00310 #endif
00311 }
00312 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
00313
00314 #if 1 // dump systematic errors
00315 for(i=0; i<64; i++){
00316 if(i%8==0) printf("\n");
00317 printf("%7d ", (int)sysErr[i]);
00318 }
00319 printf("\n");
00320 #endif
00321
00322 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
00323 is_idct ? "IDCT" : "DCT",
00324 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
00325 #if 1 //Speed test
00326
00327 for(i=0;i<64;i++)
00328 block1[i] = 0;
00329 switch(test){
00330 case 0:
00331 for(i=0;i<64;i++)
00332 block1[i] = av_lfg_get(&prng) % 512 -256;
00333 if (is_idct){
00334 ff_ref_fdct(block1);
00335
00336 for(i=0;i<64;i++)
00337 block1[i]>>=3;
00338 }
00339 break;
00340 case 1:{
00341 case 2:
00342 block1[0] = av_lfg_get(&prng) % 512 -256;
00343 block1[1] = av_lfg_get(&prng) % 512 -256;
00344 block1[2] = av_lfg_get(&prng) % 512 -256;
00345 block1[3] = av_lfg_get(&prng) % 512 -256;
00346 }break;
00347 }
00348
00349 if (form == MMX_PERM) {
00350 for(i=0;i<64;i++)
00351 block[idct_mmx_perm[i]] = block1[i];
00352 } else if(form == MMX_SIMPLE_PERM) {
00353 for(i=0;i<64;i++)
00354 block[idct_simple_mmx_perm[i]] = block1[i];
00355 } else {
00356 for(i=0; i<64; i++)
00357 block[i]= block1[i];
00358 }
00359
00360 ti = gettime();
00361 it1 = 0;
00362 do {
00363 for(it=0;it<NB_ITS_SPEED;it++) {
00364 for(i=0; i<64; i++)
00365 block[i]= block1[i];
00366
00367
00368 fdct_func(block);
00369 }
00370 it1 += NB_ITS_SPEED;
00371 ti1 = gettime() - ti;
00372 } while (ti1 < 1000000);
00373 mmx_emms();
00374
00375 printf("%s %s: %0.1f kdct/s\n",
00376 is_idct ? "IDCT" : "DCT",
00377 name, (double)it1 * 1000.0 / (double)ti1);
00378 #endif
00379 }
00380
/* 8x8 pixel output of the idct248 implementation under test (see idct248_error()). */
00381 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
/* 8x8 pixel output of the floating-point reference idct248_ref(). */
00382 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
00383
00384 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
00385 {
00386 static int init;
00387 static double c8[8][8];
00388 static double c4[4][4];
00389 double block1[64], block2[64], block3[64];
00390 double s, sum, v;
00391 int i, j, k;
00392
00393 if (!init) {
00394 init = 1;
00395
00396 for(i=0;i<8;i++) {
00397 sum = 0;
00398 for(j=0;j<8;j++) {
00399 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
00400 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
00401 sum += c8[i][j] * c8[i][j];
00402 }
00403 }
00404
00405 for(i=0;i<4;i++) {
00406 sum = 0;
00407 for(j=0;j<4;j++) {
00408 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
00409 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
00410 sum += c4[i][j] * c4[i][j];
00411 }
00412 }
00413 }
00414
00415
00416 s = 0.5 * sqrt(2.0);
00417 for(i=0;i<4;i++) {
00418 for(j=0;j<8;j++) {
00419 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
00420 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
00421 }
00422 }
00423
00424
00425 for(i=0;i<8;i++) {
00426 for(j=0;j<8;j++) {
00427 sum = 0;
00428 for(k=0;k<8;k++)
00429 sum += c8[k][j] * block1[8*i+k];
00430 block2[8*i+j] = sum;
00431 }
00432 }
00433
00434
00435 for(i=0;i<8;i++) {
00436 for(j=0;j<4;j++) {
00437
00438 sum = 0;
00439 for(k=0;k<4;k++)
00440 sum += c4[k][j] * block2[8*(2*k)+i];
00441 block3[8*(2*j)+i] = sum;
00442
00443
00444 sum = 0;
00445 for(k=0;k<4;k++)
00446 sum += c4[k][j] * block2[8*(2*k+1)+i];
00447 block3[8*(2*j+1)+i] = sum;
00448 }
00449 }
00450
00451
00452 for(i=0;i<8;i++) {
00453 for(j=0;j<8;j++) {
00454 v = block3[8*i+j];
00455 if (v < 0)
00456 v = 0;
00457 else if (v > 255)
00458 v = 255;
00459 dest[i * linesize + j] = (int)rint(v);
00460 }
00461 }
00462 }
00463
00464 static void idct248_error(const char *name,
00465 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
00466 {
00467 int it, i, it1, ti, ti1, err_max, v;
00468
00469 AVLFG prng;
00470
00471 av_lfg_init(&prng, 1);
00472
00473
00474
00475 err_max = 0;
00476 for(it=0;it<NB_ITS;it++) {
00477
00478
00479 for(i=0;i<64;i++)
00480 block1[i] = av_lfg_get(&prng) % 256 - 128;
00481 block1[0] += 1024;
00482
00483 for(i=0; i<64; i++)
00484 block[i]= block1[i];
00485 idct248_ref(img_dest1, 8, block);
00486
00487 for(i=0; i<64; i++)
00488 block[i]= block1[i];
00489 idct248_put(img_dest, 8, block);
00490
00491 for(i=0;i<64;i++) {
00492 v = abs((int)img_dest[i] - (int)img_dest1[i]);
00493 if (v == 255)
00494 printf("%d %d\n", img_dest[i], img_dest1[i]);
00495 if (v > err_max)
00496 err_max = v;
00497 }
00498 #if 0
00499 printf("ref=\n");
00500 for(i=0;i<8;i++) {
00501 int j;
00502 for(j=0;j<8;j++) {
00503 printf(" %3d", img_dest1[i*8+j]);
00504 }
00505 printf("\n");
00506 }
00507
00508 printf("out=\n");
00509 for(i=0;i<8;i++) {
00510 int j;
00511 for(j=0;j<8;j++) {
00512 printf(" %3d", img_dest[i*8+j]);
00513 }
00514 printf("\n");
00515 }
00516 #endif
00517 }
00518 printf("%s %s: err_inf=%d\n",
00519 1 ? "IDCT248" : "DCT248",
00520 name, err_max);
00521
00522 ti = gettime();
00523 it1 = 0;
00524 do {
00525 for(it=0;it<NB_ITS_SPEED;it++) {
00526 for(i=0; i<64; i++)
00527 block[i]= block1[i];
00528
00529
00530 idct248_put(img_dest, 8, block);
00531 }
00532 it1 += NB_ITS_SPEED;
00533 ti1 = gettime() - ti;
00534 } while (ti1 < 1000000);
00535 mmx_emms();
00536
00537 printf("%s %s: %0.1f kdct/s\n",
00538 1 ? "IDCT248" : "DCT248",
00539 name, (double)it1 * 1000.0 / (double)ti1);
00540 }
00541
/* Print the command-line synopsis to stdout. */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>]\n"
        "test-number 0 -> test with random matrixes\n"
        " 1 -> test with random sparse matrixes\n"
        " 2 -> do 3. test from mpeg4 std\n"
        "-i test IDCT implementations\n"
        "-4 test IDCT248 implementations\n";

    printf("%s", usage);
}
00551
00552 int main(int argc, char **argv)
00553 {
00554 int test_idct = 0, test_248_dct = 0;
00555 int c,i;
00556 int test=1;
00557 cpu_flags = mm_support();
00558
00559 ff_ref_dct_init();
00560 idct_mmx_init();
00561
00562 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
00563 for(i=0;i<MAX_NEG_CROP;i++) {
00564 cropTbl[i] = 0;
00565 cropTbl[i + MAX_NEG_CROP + 256] = 255;
00566 }
00567
00568 for(;;) {
00569 c = getopt(argc, argv, "ih4");
00570 if (c == -1)
00571 break;
00572 switch(c) {
00573 case 'i':
00574 test_idct = 1;
00575 break;
00576 case '4':
00577 test_248_dct = 1;
00578 break;
00579 default :
00580 case 'h':
00581 help();
00582 return 0;
00583 }
00584 }
00585
00586 if(optind <argc) test= atoi(argv[optind]);
00587
00588 printf("ffmpeg DCT/IDCT test\n");
00589
00590 if (test_248_dct) {
00591 idct248_error("SIMPLE-C", ff_simple_idct248_put);
00592 } else {
00593 for (i=0;algos[i].name;i++)
00594 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
00595 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test);
00596 }
00597 }
00598 return 0;
00599 }