/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include <assert.h>

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"

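/* Convert an 8x8 block of unsigned bytes at pixels/line_size into 16-bit
 * coefficients in block[0..63]; two rows per iteration, addressed with a
 * negative index that counts up from -128 to 0. */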
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        "movq (%0), %%mm0 \n\t"
        "movq (%0, %2), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "movq %%mm1, 8(%1, %%"REG_a") \n\t"
        "movq %%mm2, 16(%1, %%"REG_a") \n\t"
        "movq %%mm3, 24(%1, %%"REG_a") \n\t"
        "add %3, %0 \n\t"
        "add $32, %%"REG_a" \n\t"
        "js 1b \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}

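/* Same conversion as get_pixels_mmx, fully unrolled with SSE2: eight rows,
 * each loaded as 8 bytes, zero-extended with punpcklbw and stored as one
 * 16-byte row of words. */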
static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm7, %%xmm7 \n\t"
        "movq (%0), %%xmm0 \n\t"
        "movq (%0, %2), %%xmm1 \n\t"
        "movq (%0, %2,2), %%xmm2 \n\t"
        "movq (%0, %3), %%xmm3 \n\t"
        "lea (%0,%2,4), %0 \n\t"
        "punpcklbw %%xmm7, %%xmm0 \n\t"
        "punpcklbw %%xmm7, %%xmm1 \n\t"
        "punpcklbw %%xmm7, %%xmm2 \n\t"
        "punpcklbw %%xmm7, %%xmm3 \n\t"
        "movdqa %%xmm0, (%1) \n\t"
        "movdqa %%xmm1, 16(%1) \n\t"
        "movdqa %%xmm2, 32(%1) \n\t"
        "movdqa %%xmm3, 48(%1) \n\t"
        "movq (%0), %%xmm0 \n\t"
        "movq (%0, %2), %%xmm1 \n\t"
        "movq (%0, %2,2), %%xmm2 \n\t"
        "movq (%0, %3), %%xmm3 \n\t"
        "punpcklbw %%xmm7, %%xmm0 \n\t"
        "punpcklbw %%xmm7, %%xmm1 \n\t"
        "punpcklbw %%xmm7, %%xmm2 \n\t"
        "punpcklbw %%xmm7, %%xmm3 \n\t"
        "movdqa %%xmm0, 64(%1) \n\t"
        "movdqa %%xmm1, 80(%1) \n\t"
        "movdqa %%xmm2, 96(%1) \n\t"
        "movdqa %%xmm3, 112(%1) \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}

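/* block[i] = s1[i] - s2[i] for an 8x8 block, widened to 16-bit words;
 * one row per iteration. */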
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "mov $-128, %%"REG_a" \n\t"
        ASMALIGN(4)
        "1: \n\t"
        "movq (%0), %%mm0 \n\t"
        "movq (%1), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "psubw %%mm2, %%mm0 \n\t"
        "psubw %%mm3, %%mm1 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm1, 8(%2, %%"REG_a") \n\t"
        "add %3, %0 \n\t"
        "add %3, %1 \n\t"
        "add $16, %%"REG_a" \n\t"
        "jnz 1b \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}

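/* Sum of all pixels of a 16x16 block; word sums are accumulated in %mm6
 * and folded horizontally at the end. The result fits in 16 bits
 * (255*256 max), hence the final mask. */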
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    __asm__ volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "pxor %%mm6, %%mm6 \n\t"
        "1: \n\t"
        "movq (%2, %1), %%mm0 \n\t"
        "movq (%2, %1), %%mm1 \n\t"
        "movq 8(%2, %1), %%mm2 \n\t"
        "movq 8(%2, %1), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "paddw %%mm1, %%mm3 \n\t"
        "paddw %%mm3, %%mm6 \n\t"
        "add %3, %1 \n\t"
        "js 1b \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        "andl $0xFFFF, %0 \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((x86_reg)line_size)
    );

    return sum;
}

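/* Sum of squares of all pixels of a 16x16 block: the zero-extended pixel
 * values are squared with pmaddwd and accumulated as dwords in %mm7. */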
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    __asm__ volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm2\n"
        "movq 8(%0),%%mm3\n"

        "movq %%mm2,%%mm1\n"

        "punpckhbw %%mm0,%%mm1\n"
        "punpcklbw %%mm0,%%mm2\n"

        "movq %%mm3,%%mm4\n"
        "punpckhbw %%mm0,%%mm3\n"
        "punpcklbw %%mm0,%%mm4\n"

        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm2,%%mm2\n"

        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"

        "paddd %%mm1,%%mm2\n"

        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        "add %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}

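/* Sum of squared errors between two 8-pixel-wide blocks: |a-b| is formed
 * with two saturated subtractions ORed together, then squared with
 * pmaddwd. Two rows per iteration, so h must be even. */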
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %4,%%ecx\n"
        "shr $1,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm1\n"
        "movq (%1),%%mm2\n"
        "movq (%0,%3),%%mm3\n"
        "movq (%1,%3),%%mm4\n"

        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"
        "punpcklbw %%mm0,%%mm3\n"

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "lea (%0,%3,2), %0\n"
        "lea (%1,%3,2), %1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size), "m" (h)
        : "%ecx");
    return tmp;
}

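/* Sum of squared errors between two 16-pixel-wide blocks, one row per
 * iteration; same |a-b| trick as sse8_mmx. */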
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm1\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm3\n"
        "movq 8(%1),%%mm4\n"

        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"
        "punpcklbw %%mm0,%%mm3\n"

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "add %3,%0\n"
        "add %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size), "m" (h)
        : "%ecx");
    return tmp;
}

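/* SSE2 version of sse16: unaligned 16-byte loads, two rows per iteration
 * (h is halved up front), dword sums folded with psrldq at the end. */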
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "shr $1,%2\n"
        "pxor %%xmm0,%%xmm0\n"
        "pxor %%xmm7,%%xmm7\n"
        "1:\n"
        "movdqu (%0),%%xmm1\n"
        "movdqu (%1),%%xmm2\n"
        "movdqu (%0,%4),%%xmm3\n"
        "movdqu (%1,%4),%%xmm4\n"

        "movdqa %%xmm1,%%xmm5\n"
        "movdqa %%xmm3,%%xmm6\n"
        "psubusb %%xmm2,%%xmm1\n"
        "psubusb %%xmm4,%%xmm3\n"
        "psubusb %%xmm5,%%xmm2\n"
        "psubusb %%xmm6,%%xmm4\n"

        "por %%xmm1,%%xmm2\n"
        "por %%xmm3,%%xmm4\n"

        "movdqa %%xmm2,%%xmm1\n"
        "movdqa %%xmm4,%%xmm3\n"

        "punpckhbw %%xmm0,%%xmm2\n"
        "punpckhbw %%xmm0,%%xmm4\n"
        "punpcklbw %%xmm0,%%xmm1\n"
        "punpcklbw %%xmm0,%%xmm3\n"

        "pmaddwd %%xmm2,%%xmm2\n"
        "pmaddwd %%xmm4,%%xmm4\n"
        "pmaddwd %%xmm1,%%xmm1\n"
        "pmaddwd %%xmm3,%%xmm3\n"

        "lea (%0,%4,2), %0\n"
        "lea (%1,%4,2), %1\n"

        "paddd %%xmm2,%%xmm1\n"
        "paddd %%xmm4,%%xmm3\n"
        "paddd %%xmm1,%%xmm7\n"
        "paddd %%xmm3,%%xmm7\n"

        "decl %2\n"
        "jnz 1b\n"

        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $8, %%xmm7\n"
        "paddd %%xmm1,%%xmm7\n"
        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $4, %%xmm7\n"
        "paddd %%xmm1,%%xmm7\n"
        "movd %%xmm7,%3\n"
        : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
        : "r" ((x86_reg)line_size));
    return tmp;
}

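/* High-frequency noise measure over an 8-pixel-wide block: the sum of
 * absolute vertical differences of the horizontal pixel differences
 * (the horizontal differences are built with byte shifts instead of an
 * unaligned load). Used as the noise term of the NSSE comparators. */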
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n"
        "pcmpgtw %%mm2, %%mm1\n"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n"
        "pcmpgtw %%mm5, %%mm1\n"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n"
        "pcmpgtw %%mm2, %%mm1\n"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size), "g" (h-2)
        : "%ecx");
    return tmp;
}

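/* As hf_noise8_mmx, but covering the left half of a 16-pixel-wide block
 * with unaligned loads at offset 1; the remaining columns are handled by
 * the hf_noise8_mmx call at the return. */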
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n"
        "pcmpgtw %%mm2, %%mm1\n"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n"
        "pcmpgtw %%mm5, %%mm1\n"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n"
        "pcmpgtw %%mm2, %%mm1\n"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size), "g" (h-2)
        : "%ecx");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

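/* Noise-preserving SSE: plain SSE plus a weighted penalty on the
 * difference in high-frequency noise between the two blocks. */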
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

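/* Sum of absolute vertical differences |pix[x] - pix[x+line_size]| over a
 * 16-pixel-wide block; two rows per iteration, so h must be even. */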
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert(((intptr_t)pix & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n"\
    "movq 8(%0), %%mm3\n"\
    "add %2,%0\n"\
    "movq %%mm2, " #out0 "\n"\
    "movq %%mm3, " #out1 "\n"\
    "psubusb " #in0 ", %%mm2\n"\
    "psubusb " #in1 ", %%mm3\n"\
    "psubusb " #out0 ", " #in0 "\n"\
    "psubusb " #out1 ", " #in1 "\n"\
    "por %%mm2, " #in0 "\n"\
    "por %%mm3, " #in1 "\n"\
    "movq " #in0 ", %%mm2\n"\
    "movq " #in1 ", %%mm3\n"\
    "punpcklbw %%mm7, " #in0 "\n"\
    "punpcklbw %%mm7, " #in1 "\n"\
    "punpckhbw %%mm7, %%mm2\n"\
    "punpckhbw %%mm7, %%mm3\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw %%mm3, %%mm2\n"\
    "paddw %%mm2, " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size), "m" (h)
        : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

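/* MMX2 version of vsad_intra16: psadbw computes the absolute difference
 * and its horizontal sum in one instruction. */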
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert(((intptr_t)pix & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n"\
    "movq 8(%0), " #out1 "\n"\
    "add %2,%0\n"\
    "psadbw " #out0 ", " #in0 "\n"\
    "psadbw " #out1 ", " #in1 "\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size), "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

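/* Vertical SAD of the difference signal d = pix1 - pix2: sums
 * |d(x,y) - d(x,y+1)|. The bytewise differences are biased by 0x80
 * (the pxor with %mm7) so they can be compared with unsigned
 * saturating arithmetic. */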
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert(((intptr_t)pix1 & 7) == 0);
    assert(((intptr_t)pix2 & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0),%%mm2\n"\
    "movq (%1)," #out0 "\n"\
    "movq 8(%0),%%mm3\n"\
    "movq 8(%1)," #out1 "\n"\
    "add %3,%0\n"\
    "add %3,%1\n"\
    "psubb " #out0 ", %%mm2\n"\
    "psubb " #out1 ", %%mm3\n"\
    "pxor %%mm7, %%mm2\n"\
    "pxor %%mm7, %%mm3\n"\
    "movq %%mm2, " #out0 "\n"\
    "movq %%mm3, " #out1 "\n"\
    "psubusb " #in0 ", %%mm2\n"\
    "psubusb " #in1 ", %%mm3\n"\
    "psubusb " #out0 ", " #in0 "\n"\
    "psubusb " #out1 ", " #in1 "\n"\
    "por %%mm2, " #in0 "\n"\
    "por %%mm3, " #in1 "\n"\
    "movq " #in0 ", %%mm2\n"\
    "movq " #in1 ", %%mm3\n"\
    "punpcklbw %%mm7, " #in0 "\n"\
    "punpcklbw %%mm7, " #in1 "\n"\
    "punpckhbw %%mm7, %%mm2\n"\
    "punpckhbw %%mm7, %%mm3\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw %%mm3, %%mm2\n"\
    "paddw %%mm2, " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size), "m" (h)
        : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

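/* MMX2 version of vsad16, using psadbw on the biased difference signal. */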
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert(((intptr_t)pix1 & 7) == 0);
    assert(((intptr_t)pix2 & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0)," #out0 "\n"\
    "movq (%1),%%mm2\n"\
    "movq 8(%0)," #out1 "\n"\
    "movq 8(%1),%%mm3\n"\
    "add %3,%0\n"\
    "add %3,%1\n"\
    "psubb %%mm2, " #out0 "\n"\
    "psubb %%mm3, " #out1 "\n"\
    "pxor %%mm7, " #out0 "\n"\
    "pxor %%mm7, " #out1 "\n"\
    "psadbw " #out0 ", " #in0 "\n"\
    "psadbw " #out1 ", " #in1 "\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size), "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

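/* dst[i] = src1[i] - src2[i] bytewise; 16 bytes per iteration with a
 * scalar loop for the remaining tail. */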
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    __asm__ volatile(
        "1: \n\t"
        "movq (%2, %0), %%mm0 \n\t"
        "movq (%1, %0), %%mm1 \n\t"
        "psubb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%3, %0) \n\t"
        "movq 8(%2, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
        "psubb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%3, %0) \n\t"
        "add $16, %0 \n\t"
        "cmp %4, %0 \n\t"
        "jb 1b \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0] - src2[i+0];
}

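/* HuffYUV median prediction, subtract direction:
 * dst[i] = src2[i] - median(L, T, L+T-LT) with L = src2[i-1], T = src1[i],
 * LT = src1[i-1], vectorized with pmaxub/pminub. Element 0 is handled in
 * C, and *left / *left_top are updated for the next call. */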
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1: \n\t"
        "movq -1(%1, %0), %%mm0 \n\t" // LT
        "movq (%1, %0), %%mm1 \n\t"   // T
        "movq -1(%2, %0), %%mm2 \n\t" // L
        "movq (%2, %0), %%mm3 \n\t"   // X
        "movq %%mm2, %%mm4 \n\t"      // L
        "psubb %%mm0, %%mm2 \n\t"
        "paddb %%mm1, %%mm2 \n\t"     // L + T - LT
        "movq %%mm4, %%mm5 \n\t"      // L
        "pmaxub %%mm1, %%mm4 \n\t"    // max(T, L)
        "pminub %%mm5, %%mm1 \n\t"    // min(T, L)
        "pminub %%mm2, %%mm4 \n\t"
        "pmaxub %%mm1, %%mm4 \n\t"    // median(T, L, L+T-LT)
        "psubb %%mm4, %%mm3 \n\t"     // X - pred
        "movq %%mm3, (%3, %0) \n\t"
        "add $8, %0 \n\t"
        "cmp %4, %0 \n\t"
        "jb 1b \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

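/* DIFF_PIXELS_1 computes the word differences p1[i]-p2[i] of one row
 * without needing a zero register: both inputs are expanded with
 * punpcklbw, so the duplicated high bytes cancel in the subtraction.
 * DIFF_PIXELS_8 fills registers 0-7 with eight rows of differences,
 * spilling one row through temp since only eight registers exist. */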
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a" \n\t"\
    "mov"#m" "#p2", "#t" \n\t"\
    "punpcklbw "#a", "#t" \n\t"\
    "punpcklbw "#a", "#a" \n\t"\
    "psubw "#t", "#a" \n\t"

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    __asm__ volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1 \n\t"\
        "add %4, %2 \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0 \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0 \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

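/* LBUTTERFLY2 performs two butterflies: (a, b) -> (a+b, b-a), via
 * a += b; b += b; b -= a. HADAMARD8 applies three butterfly stages to
 * registers 0-7, producing an 8-point Walsh-Hadamard transform. */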
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 " \n\t"\
    "paddw " #b2 ", " #a2 " \n\t"\
    "paddw " #b1 ", " #b1 " \n\t"\
    "paddw " #b2 ", " #b2 " \n\t"\
    "psubw " #a1 ", " #b1 " \n\t"\
    "psubw " #a2 ", " #b2 " \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m1, m2, m3)\
    LBUTTERFLY2(m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m2, m1, m3)\
    LBUTTERFLY2(m4, m6, m5, m7)\
    LBUTTERFLY2(m0, m4, m1, m5)\
    LBUTTERFLY2(m2, m6, m3, m7)

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

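/* Absolute value of packed words: MMX uses the sign-mask compare/xor/
 * subtract sequence, MMX2 uses pmaxsw(a, -a), SSSE3 uses pabsw.
 * MMABS_SUM adds the result to an accumulator with unsigned saturation;
 * the 8x8 variants sum eight registers, spilling one through memory on
 * 32-bit hosts where only eight XMM registers are available. */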
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "pcmpgtw " #a ", " #z " \n\t"\
    "pxor " #z ", " #a " \n\t"\
    "psubw " #z ", " #a " \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "psubw " #a ", " #z " \n\t"\
    "pmaxsw " #z ", " #a " \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a " \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum " \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0 \n\t"

#if ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1) \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2 \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0 \n\t"
#endif

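/* Horizontal sum of packed words into a GPR, again with unsigned
 * saturation: plain MMX folds with shifts, MMX2 with pshufw, SSE2 with
 * movhlps/pshuflw. */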
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movq "#a", "#t" \n\t"\
    "psrlq $16, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshufw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"

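/* Hadamard-transformed difference (SATD): compute the 8x8 difference
 * block, transform rows and columns with HADAMARD8 plus a transpose,
 * then sum the absolute values of the coefficients. The MMX version
 * works on two 4x8 halves with spills to temp; the SSE2 version keeps
 * the whole 8x8 block in registers. */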
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(8, uint64_t, temp)[13];\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1) \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7 \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1) \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7 \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5 \n\t"\
        "movq %%mm6, %%mm7 \n\t"\
        "movq %%mm0, %%mm6 \n\t"\
\
        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1) \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2 \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0 \n\t"\
        "movq %%mm0, 64(%1) \n\t"\
\
        LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1) \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2 \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0 \n\t"\
        "paddusw %%mm1, %%mm0 \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(16, uint64_t, temp)[4];\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)    MMABS_MMX(a,z)
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)    MMABS_MMX2(a,z)
#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#if HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

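/* sum_abs_dctelem: sum of the absolute values of the 64 coefficients of
 * a DCT block, accumulated with unsigned saturation. */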
#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0 \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0 \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0 \n\t"\
    "pxor %%xmm1, %%xmm1 \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0 \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD      DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)   MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)   MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD      DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3
#define MMABS(a,z)   MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

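/* Sum of squared differences between an int8_t array and an int16_t
 * array of the same length; size must be a multiple of 8. The int8
 * values are sign-extended by interleaving into the high byte of each
 * word and shifting right arithmetically by 8. */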
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}

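/* Helper macros for the QNS basis functions included below: PHADDD folds
 * the two dwords of a register; PMULHRW is a rounded high multiply
 * (emulated with pmulhw on plain MMX, native pmulhrw on 3DNow!,
 * pmulhrsw on SSSE3). Each inclusion of the template instantiates
 * try_8x8basis and add_8x8basis for one variant. */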
#define PHADDD(a, t)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddd "#t", "#a" \n\t"

/*
 * pmulhw:   dst[0-15] = (src[0-15] * dst[0-15])[16-31]
 * pmulhrw:  dst[0-15] = (src[0-15] * dst[0-15] + 0x8000)[16-31]
 * pmulhrsw: dst[0-15] = (src[0-15] * dst[0-15] + 0x4000)[15-30]
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x " \n\t"\
    "pmulhw " #s ", "#y " \n\t"\
    "paddw " #o ", "#x " \n\t"\
    "paddw " #o ", "#y " \n\t"\
    "psraw $1, "#x " \n\t"\
    "psraw $1, "#y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x " \n\t"\
    "pmulhrw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddd "#t", "#a" \n\t"
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x " \n\t"\
    "pmulhrsw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3

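/* Install the function pointers for the variants supported by the host
 * CPU; more specific variants override the baseline MMX ones. */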
void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & FF_MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & FF_MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & FF_MM_MMX2){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels  = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum     = pix_sum16_mmx;

        c->diff_bytes      = diff_bytes_mmx;
        c->sum_abs_dctelem = sum_abs_dctelem_mmx;

        c->hadamard8_diff[0] = hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0]    = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1]    = sse8_mmx;
        c->vsad[4]   = vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0]      = vsad16_mmx;
            c->try_8x8basis = try_8x8basis_mmx;
        }
        c->add_8x8basis = add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

        if (mm_flags & FF_MM_MMX2) {
            c->sum_abs_dctelem   = sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0] = hadamard8_diff16_mmx2;
            c->hadamard8_diff[1] = hadamard8_diff_mmx2;
            c->vsad[4]           = vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & FF_MM_SSE2){
            c->get_pixels        = get_pixels_sse2;
            c->sum_abs_dctelem   = sum_abs_dctelem_sse2;
            c->hadamard8_diff[0] = hadamard8_diff16_sse2;
            c->hadamard8_diff[1] = hadamard8_diff_sse2;
#if CONFIG_LPC
            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
#endif
        }

#if HAVE_SSSE3
        if(mm_flags & FF_MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis = try_8x8basis_ssse3;
            }
            c->add_8x8basis      = add_8x8basis_ssse3;
            c->sum_abs_dctelem   = sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0] = hadamard8_diff16_ssse3;
            c->hadamard8_diff[1] = hadamard8_diff_ssse3;
        }
#endif

        if(mm_flags & FF_MM_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis = try_8x8basis_3dnow;
            }
            c->add_8x8basis = add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}