00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00028 #include <stdlib.h>
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <sys/time.h>
00032 #include <unistd.h>
00033 #include <math.h>
00034
00035 #include "libavutil/cpu.h"
00036 #include "libavutil/common.h"
00037 #include "libavutil/lfg.h"
00038
00039 #include "simple_idct.h"
00040 #include "aandcttab.h"
00041 #include "faandct.h"
00042 #include "faanidct.h"
00043 #include "x86/idct_xvid.h"
00044 #include "dctref.h"
00045
00046 #undef printf
00047
00048 void ff_mmx_idct(DCTELEM *data);
00049 void ff_mmxext_idct(DCTELEM *data);
00050
00051 void odivx_idct_c(short *block);
00052
00053
00054 void ff_bfin_idct(DCTELEM *block);
00055 void ff_bfin_fdct(DCTELEM *block);
00056
00057
00058 void fdct_altivec(DCTELEM *block);
00059
00060
00061
00062 void ff_j_rev_dct_arm(DCTELEM *data);
00063 void ff_simple_idct_arm(DCTELEM *data);
00064 void ff_simple_idct_armv5te(DCTELEM *data);
00065 void ff_simple_idct_armv6(DCTELEM *data);
00066 void ff_simple_idct_neon(DCTELEM *data);
00067
00068 void ff_simple_idct_axp(DCTELEM *data);
00069
00070 struct algo {
00071 const char *name;
00072 void (*func)(DCTELEM *block);
00073 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
00074 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
00075 int mm_support;
00076 int nonspec;
00077 };
00078
/*
 * Format tag used for the "FAAN" forward DCT entry: when FAAN_POSTSCALE
 * is defined the test applies no rescaling to its output, otherwise the
 * output is rescaled like the other SCALE_PERM transforms.
 */
#ifdef FAAN_POSTSCALE
#define FAAN_SCALE NO_PERM
#else
#define FAAN_SCALE SCALE_PERM
#endif

/* CPU capability bits; set once in main() from av_get_cpu_flags(). */
static int cpu_flags;
00086
00087 static const struct algo fdct_tab[] = {
00088 { "REF-DBL", ff_ref_fdct, NO_PERM },
00089 { "FAAN", ff_faandct, FAAN_SCALE },
00090 { "IJG-AAN-INT", fdct_ifast, SCALE_PERM },
00091 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
00092
00093 #if HAVE_MMX
00094 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
00095 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
00096 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
00097 #endif
00098
00099 #if HAVE_ALTIVEC
00100 { "altivecfdct", fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
00101 #endif
00102
00103 #if ARCH_BFIN
00104 { "BFINfdct", ff_bfin_fdct, NO_PERM },
00105 #endif
00106
00107 { 0 }
00108 };
00109
#if HAVE_MMX
void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
                                DCTELEM *block, int16_t *qmat);

/*
 * Adapter that lets the ProRes "IDCT + put" routine be driven like the
 * plain in-place transforms of idct_tab[].
 *
 * The 64 values found in dst are copied to a scratch coefficient block,
 * a flat quantisation matrix (every entry 4) is built, and the SSE2
 * routine writes its result back into dst using a line size of 16.
 *
 * The scratch arrays are declared with 16-byte alignment, like the
 * global test block: plain automatic arrays of int16_t carry no such
 * guarantee, and SIMD code may rely on it (NOTE(review): the exact
 * alignment requirement of the assembly is not visible from this file).
 * They are static, so the wrapper is not reentrant; the test program
 * calls it from a single thread only.
 */
static void ff_prores_idct_put_10_sse2_wrap(uint16_t *dst)
{
    DECLARE_ALIGNED(16, static int16_t, qmat)[64];
    DECLARE_ALIGNED(16, static int16_t, tmp)[64];
    int i;

    for (i = 0; i < 64; i++) {
        qmat[i] = 4;
        tmp[i]  = dst[i];
    }
    ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
}
#endif
00125
00126 static const struct algo idct_tab[] = {
00127 { "FAANI", ff_faanidct, NO_PERM },
00128 { "REF-DBL", ff_ref_idct, NO_PERM },
00129 { "INT", j_rev_dct, MMX_PERM },
00130 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
00131
00132 #if HAVE_MMX
00133 #if CONFIG_GPL
00134 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
00135 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
00136 #endif
00137 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
00138 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
00139 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
00140 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
00141 #if ARCH_X86_64
00142 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
00143 #endif
00144 #endif
00145
00146 #if ARCH_BFIN
00147 { "BFINidct", ff_bfin_idct, NO_PERM },
00148 #endif
00149
00150 #if ARCH_ARM
00151 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
00152 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
00153 #endif
00154 #if HAVE_ARMV5TE
00155 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
00156 #endif
00157 #if HAVE_ARMV6
00158 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
00159 #endif
00160 #if HAVE_NEON
00161 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
00162 #endif
00163
00164 #if ARCH_ALPHA
00165 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
00166 #endif
00167
00168 { 0 }
00169 };
00170
/* Fixed-point precision used by dct_error() when undoing the AAN scaling. */
#define AANSCALE_BITS 12

/**
 * Read the wall clock.
 *
 * @return the time reported by gettimeofday(), in microseconds
 */
static int64_t gettime(void)
{
    struct timeval now;
    int64_t usec;

    gettimeofday(&now, NULL);

    usec  = (int64_t)now.tv_sec * 1000000;
    usec += now.tv_usec;
    return usec;
}
00179
/* Number of blocks used for the accuracy statistics. */
#define NB_ITS 20000
/* Number of transforms per timing batch in the speed tests. */
#define NB_ITS_SPEED 50000

/* Destination index for MMX_PERM; filled in by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Destination index for MMX_SIMPLE_PERM: source coefficient i is stored
 * at position idct_simple_mmx_perm[i]. */
static short idct_simple_mmx_perm[64] = {
    /* i =  0.. 7 */ 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    /* i =  8..15 */ 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    /* i = 16..23 */ 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    /* i = 24..31 */ 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    /* i = 32..39 */ 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    /* i = 40..47 */ 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    /* i = 48..55 */ 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    /* i = 56..63 */ 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Column reordering used for SSE2_PERM; the row bits of the index are
 * left untouched by permute(). */
static const uint8_t idct_sse2_row_perm[8] = {
    0, 4, 1, 5,
    2, 6, 3, 7
};
00197
00198 static void idct_mmx_init(void)
00199 {
00200 int i;
00201
00202
00203 for (i = 0; i < 64; i++) {
00204 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
00205 }
00206 }
00207
00208 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
00209 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
00210
00211 static inline void mmx_emms(void)
00212 {
00213 #if HAVE_MMX
00214 if (cpu_flags & AV_CPU_FLAG_MMX)
00215 __asm__ volatile ("emms\n\t");
00216 #endif
00217 }
00218
00219 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
00220 {
00221 int i, j;
00222
00223 memset(block, 0, 64 * sizeof(*block));
00224
00225 switch (test) {
00226 case 0:
00227 for (i = 0; i < 64; i++)
00228 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
00229 if (is_idct) {
00230 ff_ref_fdct(block);
00231 for (i = 0; i < 64; i++)
00232 block[i] >>= 3;
00233 }
00234 break;
00235 case 1:
00236 j = av_lfg_get(prng) % 10 + 1;
00237 for (i = 0; i < j; i++)
00238 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
00239 break;
00240 case 2:
00241 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
00242 block[63] = (block[0] & 1) ^ 1;
00243 break;
00244 }
00245 }
00246
00247 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
00248 {
00249 int i;
00250
00251 if (perm == MMX_PERM) {
00252 for (i = 0; i < 64; i++)
00253 dst[idct_mmx_perm[i]] = src[i];
00254 } else if (perm == MMX_SIMPLE_PERM) {
00255 for (i = 0; i < 64; i++)
00256 dst[idct_simple_mmx_perm[i]] = src[i];
00257 } else if (perm == SSE2_PERM) {
00258 for (i = 0; i < 64; i++)
00259 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
00260 } else if (perm == PARTTRANS_PERM) {
00261 for (i = 0; i < 64; i++)
00262 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
00263 } else if (perm == TRANSPOSE_PERM) {
00264 for (i = 0; i < 64; i++)
00265 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
00266 } else {
00267 for (i = 0; i < 64; i++)
00268 dst[i] = src[i];
00269 }
00270 }
00271
00272 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
00273 {
00274 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
00275 int it, i, scale;
00276 int err_inf, v;
00277 int64_t err2, ti, ti1, it1, err_sum = 0;
00278 int64_t sysErr[64], sysErrMax = 0;
00279 int maxout = 0;
00280 int blockSumErrMax = 0, blockSumErr;
00281 AVLFG prng;
00282 const int vals=1<<bits;
00283 double omse, ome;
00284 int spec_err;
00285
00286 av_lfg_init(&prng, 1);
00287
00288 err_inf = 0;
00289 err2 = 0;
00290 for (i = 0; i < 64; i++)
00291 sysErr[i] = 0;
00292 for (it = 0; it < NB_ITS; it++) {
00293 init_block(block1, test, is_idct, &prng, vals);
00294 permute(block, block1, dct->format);
00295
00296 dct->func(block);
00297 mmx_emms();
00298
00299 if (dct->format == SCALE_PERM) {
00300 for (i = 0; i < 64; i++) {
00301 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
00302 block[i] = (block[i] * scale) >> AANSCALE_BITS;
00303 }
00304 }
00305
00306 ref(block1);
00307
00308 blockSumErr = 0;
00309 for (i = 0; i < 64; i++) {
00310 int err = block[i] - block1[i];
00311 err_sum += err;
00312 v = abs(err);
00313 if (v > err_inf)
00314 err_inf = v;
00315 err2 += v * v;
00316 sysErr[i] += block[i] - block1[i];
00317 blockSumErr += v;
00318 if (abs(block[i]) > maxout)
00319 maxout = abs(block[i]);
00320 }
00321 if (blockSumErrMax < blockSumErr)
00322 blockSumErrMax = blockSumErr;
00323 }
00324 for (i = 0; i < 64; i++)
00325 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
00326
00327 for (i = 0; i < 64; i++) {
00328 if (i % 8 == 0)
00329 printf("\n");
00330 printf("%7d ", (int) sysErr[i]);
00331 }
00332 printf("\n");
00333
00334 omse = (double) err2 / NB_ITS / 64;
00335 ome = (double) err_sum / NB_ITS / 64;
00336
00337 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
00338
00339 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
00340 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
00341 omse, ome, (double) sysErrMax / NB_ITS,
00342 maxout, blockSumErrMax);
00343
00344 if (spec_err && !dct->nonspec)
00345 return 1;
00346
00347 if (!speed)
00348 return 0;
00349
00350
00351
00352 init_block(block, test, is_idct, &prng, vals);
00353 permute(block1, block, dct->format);
00354
00355 ti = gettime();
00356 it1 = 0;
00357 do {
00358 for (it = 0; it < NB_ITS_SPEED; it++) {
00359 memcpy(block, block1, sizeof(block));
00360 dct->func(block);
00361 }
00362 it1 += NB_ITS_SPEED;
00363 ti1 = gettime() - ti;
00364 } while (ti1 < 1000000);
00365 mmx_emms();
00366
00367 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
00368 (double) it1 * 1000.0 / (double) ti1);
00369
00370 return 0;
00371 }
00372
00373 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
00374 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
00375
00376 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
00377 {
00378 static int init;
00379 static double c8[8][8];
00380 static double c4[4][4];
00381 double block1[64], block2[64], block3[64];
00382 double s, sum, v;
00383 int i, j, k;
00384
00385 if (!init) {
00386 init = 1;
00387
00388 for (i = 0; i < 8; i++) {
00389 sum = 0;
00390 for (j = 0; j < 8; j++) {
00391 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
00392 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
00393 sum += c8[i][j] * c8[i][j];
00394 }
00395 }
00396
00397 for (i = 0; i < 4; i++) {
00398 sum = 0;
00399 for (j = 0; j < 4; j++) {
00400 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
00401 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
00402 sum += c4[i][j] * c4[i][j];
00403 }
00404 }
00405 }
00406
00407
00408 s = 0.5 * sqrt(2.0);
00409 for (i = 0; i < 4; i++) {
00410 for (j = 0; j < 8; j++) {
00411 block1[8 * (2 * i) + j] =
00412 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
00413 block1[8 * (2 * i + 1) + j] =
00414 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
00415 }
00416 }
00417
00418
00419 for (i = 0; i < 8; i++) {
00420 for (j = 0; j < 8; j++) {
00421 sum = 0;
00422 for (k = 0; k < 8; k++)
00423 sum += c8[k][j] * block1[8 * i + k];
00424 block2[8 * i + j] = sum;
00425 }
00426 }
00427
00428
00429 for (i = 0; i < 8; i++) {
00430 for (j = 0; j < 4; j++) {
00431
00432 sum = 0;
00433 for (k = 0; k < 4; k++)
00434 sum += c4[k][j] * block2[8 * (2 * k) + i];
00435 block3[8 * (2 * j) + i] = sum;
00436
00437
00438 sum = 0;
00439 for (k = 0; k < 4; k++)
00440 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
00441 block3[8 * (2 * j + 1) + i] = sum;
00442 }
00443 }
00444
00445
00446 for (i = 0; i < 8; i++) {
00447 for (j = 0; j < 8; j++) {
00448 v = block3[8 * i + j];
00449 if (v < 0) v = 0;
00450 else if (v > 255) v = 255;
00451 dest[i * linesize + j] = (int) rint(v);
00452 }
00453 }
00454 }
00455
00456 static void idct248_error(const char *name,
00457 void (*idct248_put)(uint8_t *dest, int line_size,
00458 int16_t *block),
00459 int speed)
00460 {
00461 int it, i, it1, ti, ti1, err_max, v;
00462 AVLFG prng;
00463
00464 av_lfg_init(&prng, 1);
00465
00466
00467
00468 err_max = 0;
00469 for (it = 0; it < NB_ITS; it++) {
00470
00471 for (i = 0; i < 64; i++)
00472 block1[i] = av_lfg_get(&prng) % 256 - 128;
00473 block1[0] += 1024;
00474
00475 for (i = 0; i < 64; i++)
00476 block[i] = block1[i];
00477 idct248_ref(img_dest1, 8, block);
00478
00479 for (i = 0; i < 64; i++)
00480 block[i] = block1[i];
00481 idct248_put(img_dest, 8, block);
00482
00483 for (i = 0; i < 64; i++) {
00484 v = abs((int) img_dest[i] - (int) img_dest1[i]);
00485 if (v == 255)
00486 printf("%d %d\n", img_dest[i], img_dest1[i]);
00487 if (v > err_max)
00488 err_max = v;
00489 }
00490 #if 0
00491 printf("ref=\n");
00492 for(i=0;i<8;i++) {
00493 int j;
00494 for(j=0;j<8;j++) {
00495 printf(" %3d", img_dest1[i*8+j]);
00496 }
00497 printf("\n");
00498 }
00499
00500 printf("out=\n");
00501 for(i=0;i<8;i++) {
00502 int j;
00503 for(j=0;j<8;j++) {
00504 printf(" %3d", img_dest[i*8+j]);
00505 }
00506 printf("\n");
00507 }
00508 #endif
00509 }
00510 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
00511
00512 if (!speed)
00513 return;
00514
00515 ti = gettime();
00516 it1 = 0;
00517 do {
00518 for (it = 0; it < NB_ITS_SPEED; it++) {
00519 for (i = 0; i < 64; i++)
00520 block[i] = block1[i];
00521 idct248_put(img_dest, 8, block);
00522 }
00523 it1 += NB_ITS_SPEED;
00524 ti1 = gettime() - ti;
00525 } while (ti1 < 1000000);
00526 mmx_emms();
00527
00528 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
00529 (double) it1 * 1000.0 / (double) ti1);
00530 }
00531
/* Print the command line synopsis to standard output. */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>] [<bits>]\n"
        "test-number 0 -> test with random matrixes\n"
        " 1 -> test with random sparse matrixes\n"
        " 2 -> do 3. test from mpeg4 std\n"
        "bits Number of time domain bits to use, 8 is default\n"
        "-i test IDCT implementations\n"
        "-4 test IDCT248 implementations\n"
        "-t speed test\n";

    printf("%s", usage);
}
00543
00544 int main(int argc, char **argv)
00545 {
00546 int test_idct = 0, test_248_dct = 0;
00547 int c, i;
00548 int test = 1;
00549 int speed = 0;
00550 int err = 0;
00551 int bits=8;
00552
00553 cpu_flags = av_get_cpu_flags();
00554
00555 ff_ref_dct_init();
00556 idct_mmx_init();
00557
00558 for (;;) {
00559 c = getopt(argc, argv, "ih4t");
00560 if (c == -1)
00561 break;
00562 switch (c) {
00563 case 'i':
00564 test_idct = 1;
00565 break;
00566 case '4':
00567 test_248_dct = 1;
00568 break;
00569 case 't':
00570 speed = 1;
00571 break;
00572 default:
00573 case 'h':
00574 help();
00575 return 0;
00576 }
00577 }
00578
00579 if (optind < argc)
00580 test = atoi(argv[optind]);
00581 if(optind+1 < argc) bits= atoi(argv[optind+1]);
00582
00583 printf("ffmpeg DCT/IDCT test\n");
00584
00585 if (test_248_dct) {
00586 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
00587 } else {
00588 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
00589 for (i = 0; algos[i].name; i++)
00590 if (!(~cpu_flags & algos[i].mm_support)) {
00591 err |= dct_error(&algos[i], test, test_idct, speed, bits);
00592 }
00593 }
00594
00595 return err;
00596 }