00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "libavutil/common.h"
00024 #include "libavcodec/dsputil.h"
00025
00026 #include "dsputil_mmx.h"
00027 #include "mmx.h"
00028
/* Right shift applied to the 32-bit results of the row pass. */
#define ROW_SHIFT 11
/* Right shift applied to the 16-bit results of the column pass. */
#define COL_SHIFT 6

/*
 * round(bias):   (bias + 0.5) scaled by 2^ROW_SHIFT and truncated to int.
 * rounder(bias): brace initializer holding that value twice, i.e. one copy
 *                for each 32-bit lane of a 64-bit MMX register.
 */
#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
#define rounder(bias) {round (bias), round (bias)}
00034
00035
#if 0
/*
 * Scalar row transform. Compiled out: it is kept only as a readable
 * description of the arithmetic performed by the MMXEXT and MMX row code.
 *
 * row     - block data; the eight values at row[offset..offset+7] are
 *           transformed in place
 * offset  - index of the first value of the row inside `row`
 * table   - coefficients; entries 1..7 are read as C1..C7, entry 0 is unused
 * rounder - bias added to every even-part sum before the final shift
 */
static inline void idct_row (int16_t * row, int offset,
                             int16_t * table, int32_t * rounder)
{
    int16_t * const r = row + offset;

    const int C1 = table[1];
    const int C2 = table[2];
    const int C3 = table[3];
    const int C4 = table[4];
    const int C5 = table[5];
    const int C6 = table[6];
    const int C7 = table[7];

    const int bias = *rounder;

    /* Even part: built from inputs 0, 2, 4, 6 plus the rounding bias. */
    const int a0 = C4*r[0] + C2*r[2] + C4*r[4] + C6*r[6] + bias;
    const int a1 = C4*r[0] + C6*r[2] - C4*r[4] - C2*r[6] + bias;
    const int a2 = C4*r[0] - C6*r[2] - C4*r[4] + C2*r[6] + bias;
    const int a3 = C4*r[0] - C2*r[2] + C4*r[4] - C6*r[6] + bias;

    /* Odd part: built from inputs 1, 3, 5, 7. */
    const int b0 = C1*r[1] + C3*r[3] + C5*r[5] + C7*r[7];
    const int b1 = C3*r[1] - C7*r[3] - C1*r[5] - C5*r[7];
    const int b2 = C5*r[1] - C1*r[3] + C7*r[5] + C3*r[7];
    const int b3 = C7*r[1] - C5*r[3] + C3*r[5] - C1*r[7];

    /* Butterfly: outputs k and 7-k are the sum and difference of ak, bk. */
    r[0] = (a0 + b0) >> ROW_SHIFT;
    r[1] = (a1 + b1) >> ROW_SHIFT;
    r[2] = (a2 + b2) >> ROW_SHIFT;
    r[3] = (a3 + b3) >> ROW_SHIFT;
    r[4] = (a3 - b3) >> ROW_SHIFT;
    r[5] = (a2 - b2) >> ROW_SHIFT;
    r[6] = (a1 - b1) >> ROW_SHIFT;
    r[7] = (a0 - b0) >> ROW_SHIFT;
}
#endif
00074
00075
00076
00077
/*
 * Coefficient table for the MMXEXT row code: 32 words built from c1..c7,
 * laid out in groups of four so that each group can be fetched with one
 * 64-bit load at table+0, +4, ... +28.
 * Every argument is parenthesized so that expression arguments negate
 * correctly.
 */
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7)      {  (c4),  (c2), -(c4), -(c2),   \
                                                   (c4),  (c6),  (c4),  (c6),   \
                                                   (c1),  (c3), -(c1), -(c5),   \
                                                   (c5),  (c7),  (c3), -(c7),   \
                                                   (c4), -(c6),  (c4), -(c6),   \
                                                  -(c4),  (c2),  (c4), -(c2),   \
                                                   (c5), -(c1),  (c3), -(c1),   \
                                                   (c7),  (c3),  (c7), -(c5) }
00086
00087 static inline void mmxext_row_head (int16_t * const row, const int offset,
00088 const int16_t * const table)
00089 {
00090 movq_m2r (*(row+offset), mm2);
00091
00092 movq_m2r (*(row+offset+4), mm5);
00093 movq_r2r (mm2, mm0);
00094
00095 movq_m2r (*table, mm3);
00096 movq_r2r (mm5, mm6);
00097
00098 movq_m2r (*(table+4), mm4);
00099 pmaddwd_r2r (mm0, mm3);
00100
00101 pshufw_r2r (mm2, mm2, 0x4e);
00102 }
00103
00104 static inline void mmxext_row (const int16_t * const table,
00105 const int32_t * const rounder)
00106 {
00107 movq_m2r (*(table+8), mm1);
00108 pmaddwd_r2r (mm2, mm4);
00109
00110 pmaddwd_m2r (*(table+16), mm0);
00111 pshufw_r2r (mm6, mm6, 0x4e);
00112
00113 movq_m2r (*(table+12), mm7);
00114 pmaddwd_r2r (mm5, mm1);
00115
00116 paddd_m2r (*rounder, mm3);
00117 pmaddwd_r2r (mm6, mm7);
00118
00119 pmaddwd_m2r (*(table+20), mm2);
00120 paddd_r2r (mm4, mm3);
00121
00122 pmaddwd_m2r (*(table+24), mm5);
00123 movq_r2r (mm3, mm4);
00124
00125 pmaddwd_m2r (*(table+28), mm6);
00126 paddd_r2r (mm7, mm1);
00127
00128 paddd_m2r (*rounder, mm0);
00129 psubd_r2r (mm1, mm3);
00130
00131 psrad_i2r (ROW_SHIFT, mm3);
00132 paddd_r2r (mm4, mm1);
00133
00134 paddd_r2r (mm2, mm0);
00135 psrad_i2r (ROW_SHIFT, mm1);
00136
00137 paddd_r2r (mm6, mm5);
00138 movq_r2r (mm0, mm4);
00139
00140 paddd_r2r (mm5, mm0);
00141 psubd_r2r (mm5, mm4);
00142 }
00143
00144 static inline void mmxext_row_tail (int16_t * const row, const int store)
00145 {
00146 psrad_i2r (ROW_SHIFT, mm0);
00147
00148 psrad_i2r (ROW_SHIFT, mm4);
00149
00150 packssdw_r2r (mm0, mm1);
00151
00152 packssdw_r2r (mm3, mm4);
00153
00154 movq_r2m (mm1, *(row+store));
00155 pshufw_r2r (mm4, mm4, 0xb1);
00156
00157
00158
00159 movq_r2m (mm4, *(row+store+4));
00160 }
00161
00162 static inline void mmxext_row_mid (int16_t * const row, const int store,
00163 const int offset,
00164 const int16_t * const table)
00165 {
00166 movq_m2r (*(row+offset), mm2);
00167 psrad_i2r (ROW_SHIFT, mm0);
00168
00169 movq_m2r (*(row+offset+4), mm5);
00170 psrad_i2r (ROW_SHIFT, mm4);
00171
00172 packssdw_r2r (mm0, mm1);
00173 movq_r2r (mm5, mm6);
00174
00175 packssdw_r2r (mm3, mm4);
00176 movq_r2r (mm2, mm0);
00177
00178 movq_r2m (mm1, *(row+store));
00179 pshufw_r2r (mm4, mm4, 0xb1);
00180
00181 movq_m2r (*table, mm3);
00182 movq_r2m (mm4, *(row+store+4));
00183
00184 pmaddwd_r2r (mm0, mm3);
00185
00186 movq_m2r (*(table+4), mm4);
00187 pshufw_r2r (mm2, mm2, 0x4e);
00188 }
00189
00190
00191
00192
/*
 * Coefficient table for the plain-MMX row code: 32 words built from
 * c1..c7, laid out in groups of four so that each group can be fetched
 * with one 64-bit load at table+0, +4, ... +28.
 * Every argument is parenthesized so that expression arguments negate
 * correctly.
 */
#define mmx_table(c1,c2,c3,c4,c5,c6,c7) {  (c4),  (c2),  (c4),  (c6),   \
                                           (c4),  (c6), -(c4), -(c2),   \
                                           (c1),  (c3),  (c3), -(c7),   \
                                           (c5),  (c7), -(c1), -(c5),   \
                                           (c4), -(c6),  (c4), -(c2),   \
                                          -(c4),  (c2),  (c4), -(c6),   \
                                           (c5), -(c1),  (c7), -(c5),   \
                                           (c7),  (c3),  (c3), -(c1) }
00201
00202 static inline void mmx_row_head (int16_t * const row, const int offset,
00203 const int16_t * const table)
00204 {
00205 movq_m2r (*(row+offset), mm2);
00206
00207 movq_m2r (*(row+offset+4), mm5);
00208 movq_r2r (mm2, mm0);
00209
00210 movq_m2r (*table, mm3);
00211 movq_r2r (mm5, mm6);
00212
00213 punpckldq_r2r (mm0, mm0);
00214
00215 movq_m2r (*(table+4), mm4);
00216 pmaddwd_r2r (mm0, mm3);
00217
00218 movq_m2r (*(table+8), mm1);
00219 punpckhdq_r2r (mm2, mm2);
00220 }
00221
00222 static inline void mmx_row (const int16_t * const table,
00223 const int32_t * const rounder)
00224 {
00225 pmaddwd_r2r (mm2, mm4);
00226 punpckldq_r2r (mm5, mm5);
00227
00228 pmaddwd_m2r (*(table+16), mm0);
00229 punpckhdq_r2r (mm6, mm6);
00230
00231 movq_m2r (*(table+12), mm7);
00232 pmaddwd_r2r (mm5, mm1);
00233
00234 paddd_m2r (*rounder, mm3);
00235 pmaddwd_r2r (mm6, mm7);
00236
00237 pmaddwd_m2r (*(table+20), mm2);
00238 paddd_r2r (mm4, mm3);
00239
00240 pmaddwd_m2r (*(table+24), mm5);
00241 movq_r2r (mm3, mm4);
00242
00243 pmaddwd_m2r (*(table+28), mm6);
00244 paddd_r2r (mm7, mm1);
00245
00246 paddd_m2r (*rounder, mm0);
00247 psubd_r2r (mm1, mm3);
00248
00249 psrad_i2r (ROW_SHIFT, mm3);
00250 paddd_r2r (mm4, mm1);
00251
00252 paddd_r2r (mm2, mm0);
00253 psrad_i2r (ROW_SHIFT, mm1);
00254
00255 paddd_r2r (mm6, mm5);
00256 movq_r2r (mm0, mm7);
00257
00258 paddd_r2r (mm5, mm0);
00259 psubd_r2r (mm5, mm7);
00260 }
00261
00262 static inline void mmx_row_tail (int16_t * const row, const int store)
00263 {
00264 psrad_i2r (ROW_SHIFT, mm0);
00265
00266 psrad_i2r (ROW_SHIFT, mm7);
00267
00268 packssdw_r2r (mm0, mm1);
00269
00270 packssdw_r2r (mm3, mm7);
00271
00272 movq_r2m (mm1, *(row+store));
00273 movq_r2r (mm7, mm4);
00274
00275 pslld_i2r (16, mm7);
00276
00277 psrld_i2r (16, mm4);
00278
00279 por_r2r (mm4, mm7);
00280
00281
00282
00283 movq_r2m (mm7, *(row+store+4));
00284 }
00285
00286 static inline void mmx_row_mid (int16_t * const row, const int store,
00287 const int offset, const int16_t * const table)
00288 {
00289 movq_m2r (*(row+offset), mm2);
00290 psrad_i2r (ROW_SHIFT, mm0);
00291
00292 movq_m2r (*(row+offset+4), mm5);
00293 psrad_i2r (ROW_SHIFT, mm7);
00294
00295 packssdw_r2r (mm0, mm1);
00296 movq_r2r (mm5, mm6);
00297
00298 packssdw_r2r (mm3, mm7);
00299 movq_r2r (mm2, mm0);
00300
00301 movq_r2m (mm1, *(row+store));
00302 movq_r2r (mm7, mm1);
00303
00304 punpckldq_r2r (mm0, mm0);
00305 psrld_i2r (16, mm7);
00306
00307 movq_m2r (*table, mm3);
00308 pslld_i2r (16, mm1);
00309
00310 movq_m2r (*(table+4), mm4);
00311 por_r2r (mm1, mm7);
00312
00313 movq_m2r (*(table+8), mm1);
00314 punpckhdq_r2r (mm2, mm2);
00315
00316 movq_r2m (mm7, *(row+store+4));
00317 pmaddwd_r2r (mm0, mm3);
00318 }
00319
00320
#if 0
/*
 * Scalar column transform. Compiled out: it is kept only as a readable
 * description of the arithmetic performed by the MMX idct_col.
 *
 * col    - block data; the eight values col[offset + k*8], k = 0..7, are
 *          transformed in place
 * offset - index of the column inside the first row
 *
 * T1, T2, T3, C4 and COL_SHIFT are taken from the surrounding file.
 */
static inline void idct_col (int16_t * col, int offset)
{
/* Fixed-point multiply: keeps the product shifted right by 16 bits. */
#define F(c,x) (((c) * (x)) >> 16)

/* Clamp to the signed 16-bit range. */
#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))

    int16_t * const p = col + offset;

    const int16_t x0 = p[0*8];
    const int16_t x1 = p[1*8];
    const int16_t x2 = p[2*8];
    const int16_t x3 = p[3*8];
    const int16_t x4 = p[4*8];
    const int16_t x5 = p[5*8];
    const int16_t x6 = p[6*8];
    const int16_t x7 = p[7*8];

    /* Even part: inputs 0, 2, 4, 6. */
    const int16_t u04 = S (x0 + x4);
    const int16_t v04 = S (x0 - x4);
    const int16_t u26 = S (F (T2, x6) + x2);
    const int16_t v26 = S (F (T2, x2) - x6);

    const int16_t a0 = S (u04 + u26);
    const int16_t a1 = S (v04 + v26);
    const int16_t a2 = S (v04 - v26);
    const int16_t a3 = S (u04 - u26);

    /* Odd part: inputs 1, 3, 5, 7. */
    const int16_t u17 = S (F (T1, x7) + x1);
    const int16_t v17 = S (F (T1, x1) - x7);
    const int16_t u35 = S (F (T3, x5) + x3);
    const int16_t v35 = S (F (T3, x3) - x5);

    const int16_t b0 = S (u17 + u35);
    const int16_t b3 = S (v17 - v35);

    int16_t u12 = S (u17 - u35);
    int16_t v12 = S (v17 + v35);
    u12 = S (2 * F (C4, u12));
    v12 = S (2 * F (C4, v12));

    const int16_t b1 = S (u12 + v12);
    const int16_t b2 = S (u12 - v12);

    /* Butterfly: outputs k and 7-k are the sum and difference of ak, bk. */
    const int16_t y0 = S (a0 + b0) >> COL_SHIFT;
    const int16_t y1 = S (a1 + b1) >> COL_SHIFT;
    const int16_t y2 = S (a2 + b2) >> COL_SHIFT;
    const int16_t y3 = S (a3 + b3) >> COL_SHIFT;

    const int16_t y4 = S (a3 - b3) >> COL_SHIFT;
    const int16_t y5 = S (a2 - b2) >> COL_SHIFT;
    const int16_t y6 = S (a1 - b1) >> COL_SHIFT;
    const int16_t y7 = S (a0 - b0) >> COL_SHIFT;

    p[0*8] = y0;
    p[1*8] = y1;
    p[2*8] = y2;
    p[3*8] = y3;
    p[4*8] = y4;
    p[5*8] = y5;
    p[6*8] = y6;
    p[7*8] = y7;
}
#endif
00391
00392
00393
00394 static inline void idct_col (int16_t * const col, const int offset)
00395 {
00396 #define T1 13036
00397 #define T2 27146
00398 #define T3 43790
00399 #define C4 23170
00400
00401 DECLARE_ALIGNED(8, static const short, t1_vector)[] = {T1,T1,T1,T1};
00402 DECLARE_ALIGNED(8, static const short, t2_vector)[] = {T2,T2,T2,T2};
00403 DECLARE_ALIGNED(8, static const short, t3_vector)[] = {T3,T3,T3,T3};
00404 DECLARE_ALIGNED(8, static const short, c4_vector)[] = {C4,C4,C4,C4};
00405
00406
00407
00408
00409 movq_m2r (*t1_vector, mm0);
00410
00411 movq_m2r (*(col+offset+1*8), mm1);
00412 movq_r2r (mm0, mm2);
00413
00414 movq_m2r (*(col+offset+7*8), mm4);
00415 pmulhw_r2r (mm1, mm0);
00416
00417 movq_m2r (*t3_vector, mm5);
00418 pmulhw_r2r (mm4, mm2);
00419
00420 movq_m2r (*(col+offset+5*8), mm6);
00421 movq_r2r (mm5, mm7);
00422
00423 movq_m2r (*(col+offset+3*8), mm3);
00424 psubsw_r2r (mm4, mm0);
00425
00426 movq_m2r (*t2_vector, mm4);
00427 pmulhw_r2r (mm3, mm5);
00428
00429 paddsw_r2r (mm2, mm1);
00430 pmulhw_r2r (mm6, mm7);
00431
00432
00433
00434 movq_r2r (mm4, mm2);
00435 paddsw_r2r (mm3, mm5);
00436
00437 pmulhw_m2r (*(col+offset+2*8), mm4);
00438 paddsw_r2r (mm6, mm7);
00439
00440 psubsw_r2r (mm6, mm5);
00441 paddsw_r2r (mm3, mm7);
00442
00443 movq_m2r (*(col+offset+6*8), mm3);
00444 movq_r2r (mm0, mm6);
00445
00446 pmulhw_r2r (mm3, mm2);
00447 psubsw_r2r (mm5, mm0);
00448
00449 psubsw_r2r (mm3, mm4);
00450 paddsw_r2r (mm6, mm5);
00451
00452 movq_r2m (mm0, *(col+offset+3*8));
00453 movq_r2r (mm1, mm6);
00454
00455 paddsw_m2r (*(col+offset+2*8), mm2);
00456 paddsw_r2r (mm7, mm6);
00457
00458 psubsw_r2r (mm7, mm1);
00459 movq_r2r (mm1, mm7);
00460
00461 movq_m2r (*(col+offset+0*8), mm3);
00462 paddsw_r2r (mm5, mm1);
00463
00464 movq_m2r (*c4_vector, mm0);
00465 psubsw_r2r (mm5, mm7);
00466
00467 movq_r2m (mm6, *(col+offset+5*8));
00468 pmulhw_r2r (mm0, mm1);
00469
00470 movq_r2r (mm4, mm6);
00471 pmulhw_r2r (mm0, mm7);
00472
00473 movq_m2r (*(col+offset+4*8), mm5);
00474 movq_r2r (mm3, mm0);
00475
00476 psubsw_r2r (mm5, mm3);
00477 paddsw_r2r (mm5, mm0);
00478
00479 paddsw_r2r (mm3, mm4);
00480 movq_r2r (mm0, mm5);
00481
00482 psubsw_r2r (mm6, mm3);
00483 paddsw_r2r (mm2, mm5);
00484
00485 paddsw_r2r (mm1, mm1);
00486 psubsw_r2r (mm2, mm0);
00487
00488 paddsw_r2r (mm7, mm7);
00489 movq_r2r (mm3, mm2);
00490
00491 movq_r2r (mm4, mm6);
00492 paddsw_r2r (mm7, mm3);
00493
00494 psraw_i2r (COL_SHIFT, mm3);
00495 paddsw_r2r (mm1, mm4);
00496
00497 psraw_i2r (COL_SHIFT, mm4);
00498 psubsw_r2r (mm1, mm6);
00499
00500 movq_m2r (*(col+offset+5*8), mm1);
00501 psubsw_r2r (mm7, mm2);
00502
00503 psraw_i2r (COL_SHIFT, mm6);
00504 movq_r2r (mm5, mm7);
00505
00506 movq_r2m (mm4, *(col+offset+1*8));
00507 psraw_i2r (COL_SHIFT, mm2);
00508
00509 movq_r2m (mm3, *(col+offset+2*8));
00510 paddsw_r2r (mm1, mm5);
00511
00512 movq_m2r (*(col+offset+3*8), mm4);
00513 psubsw_r2r (mm1, mm7);
00514
00515 psraw_i2r (COL_SHIFT, mm5);
00516 movq_r2r (mm0, mm3);
00517
00518 movq_r2m (mm2, *(col+offset+5*8));
00519 psubsw_r2r (mm4, mm3);
00520
00521 psraw_i2r (COL_SHIFT, mm7);
00522 paddsw_r2r (mm0, mm4);
00523
00524 movq_r2m (mm5, *(col+offset+0*8));
00525 psraw_i2r (COL_SHIFT, mm3);
00526
00527 movq_r2m (mm6, *(col+offset+6*8));
00528 psraw_i2r (COL_SHIFT, mm4);
00529
00530 movq_r2m (mm7, *(col+offset+7*8));
00531
00532 movq_r2m (mm3, *(col+offset+4*8));
00533
00534 movq_r2m (mm4, *(col+offset+3*8));
00535
00536 #undef T1
00537 #undef T2
00538 #undef T3
00539 #undef C4
00540 }
00541
00542
00543 DECLARE_ALIGNED(8, static const int32_t, rounder0)[] =
00544 rounder ((1 << (COL_SHIFT - 1)) - 0.5);
00545 DECLARE_ALIGNED(8, static const int32_t, rounder4)[] = rounder (0);
00546 DECLARE_ALIGNED(8, static const int32_t, rounder1)[] =
00547 rounder (1.25683487303);
00548 DECLARE_ALIGNED(8, static const int32_t, rounder7)[] =
00549 rounder (-0.25);
00550 DECLARE_ALIGNED(8, static const int32_t, rounder2)[] =
00551 rounder (0.60355339059);
00552 DECLARE_ALIGNED(8, static const int32_t, rounder6)[] =
00553 rounder (-0.25);
00554 DECLARE_ALIGNED(8, static const int32_t, rounder3)[] =
00555 rounder (0.087788325588);
00556 DECLARE_ALIGNED(8, static const int32_t, rounder5)[] =
00557 rounder (-0.441341716183);
00558
00559 #undef COL_SHIFT
00560 #undef ROW_SHIFT
00561
/*
 * declare_idct: defines `void idct (int16_t * const block)`, an in-place
 * transform of `block` made of a row pass followed by a column pass.
 *
 *   idct          - name of the function to define
 *   table         - table-building macro (mmxext_table or mmx_table)
 *   idct_row_head - row prologue   (load first row)
 *   idct_row      - row core       (needs a table and a rounder)
 *   idct_row_tail - row epilogue   (store last row)
 *   idct_row_mid  - store previous row + load next row
 *
 * Rows are processed in the order 0, 4, 1, 7, 2, 6, 3, 5; each pair
 * (0,4), (1,7), (2,6), (3,5) shares one coefficient table. The column
 * pass then handles columns 0..3 and 4..7.
 */
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
void idct (int16_t * const block)                                       \
{                                                                       \
    DECLARE_ALIGNED(16, static const int16_t, table04)[] =              \
        table (22725, 21407, 19266, 16384, 12873,  8867, 4520);         \
    DECLARE_ALIGNED(16, static const int16_t, table17)[] =              \
        table (31521, 29692, 26722, 22725, 17855, 12299, 6270);         \
    DECLARE_ALIGNED(16, static const int16_t, table26)[] =              \
        table (29692, 27969, 25172, 21407, 16819, 11585, 5906);         \
    DECLARE_ALIGNED(16, static const int16_t, table35)[] =              \
        table (26722, 25172, 22654, 19266, 15137, 10426, 5315);         \
                                                                        \
    /* row pass: head, then (row, mid) per row, then tail */            \
    idct_row_head (block, 0*8, table04);                                \
    idct_row (table04, rounder0);                                       \
    idct_row_mid (block, 0*8, 4*8, table04);                            \
    idct_row (table04, rounder4);                                       \
    idct_row_mid (block, 4*8, 1*8, table17);                            \
    idct_row (table17, rounder1);                                       \
    idct_row_mid (block, 1*8, 7*8, table17);                            \
    idct_row (table17, rounder7);                                       \
    idct_row_mid (block, 7*8, 2*8, table26);                            \
    idct_row (table26, rounder2);                                       \
    idct_row_mid (block, 2*8, 6*8, table26);                            \
    idct_row (table26, rounder6);                                       \
    idct_row_mid (block, 6*8, 3*8, table35);                            \
    idct_row (table35, rounder3);                                       \
    idct_row_mid (block, 3*8, 5*8, table35);                            \
    idct_row (table35, rounder5);                                       \
    idct_row_tail (block, 5*8);                                         \
                                                                        \
    /* column pass: four columns per call */                            \
    idct_col (block, 0);                                                \
    idct_col (block, 4);                                                \
}
00595
00596 declare_idct (ff_mmxext_idct, mmxext_table,
00597 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
00598
00599 declare_idct (ff_mmx_idct, mmx_table,
00600 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
00601