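/*
 * MMX/MMX2/SSE2 optimized H.264 DSP functions: 4x4 and 8x8 IDCT + add,
 * in-loop deblocking filter, and quarter-pel motion-compensation filters.
 */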
00021 #include "dsputil_mmx.h"
00022
00023 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
00024 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
00025
00026
00027
00028
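/* ---- IDCT ---- */

// butterfly on two register pairs (using the original values):
// a = a+b, b = b-a;  c = c+d, d = d-c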
00029 #define SUMSUB_BADC( a, b, c, d ) \
00030 "paddw "#b", "#a" \n\t"\
00031 "paddw "#d", "#c" \n\t"\
00032 "paddw "#b", "#b" \n\t"\
00033 "paddw "#d", "#d" \n\t"\
00034 "psubw "#a", "#b" \n\t"\
00035 "psubw "#c", "#d" \n\t"
00036
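// b = a + (b>>1);  a = (a>>1) - b   (original b)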
00037 #define SUMSUBD2_AB( a, b, t ) \
00038 "movq "#b", "#t" \n\t"\
00039 "psraw $1 , "#b" \n\t"\
00040 "paddw "#a", "#b" \n\t"\
00041 "psraw $1 , "#a" \n\t"\
00042 "psubw "#t", "#a" \n\t"
00043
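// one 1-D pass of the 4-point H.264 inverse transform; t is a scratch register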
00044 #define IDCT4_1D( s02, s13, d02, d13, t ) \
00045 SUMSUB_BA ( s02, d02 )\
00046 SUMSUBD2_AB( s13, d13, t )\
00047 SUMSUB_BADC( d13, s02, s13, d02 )
00048
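// descale by 6 and add the four results to the pixels at (%0), with saturation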
00049 #define STORE_DIFF_4P( p, t, z ) \
00050 "psraw $6, "#p" \n\t"\
00051 "movd (%0), "#t" \n\t"\
00052 "punpcklbw "#z", "#t" \n\t"\
00053 "paddsw "#t", "#p" \n\t"\
00054 "packuswb "#z", "#p" \n\t"\
00055 "movd "#p", (%0) \n\t"
00056
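// inverse-transform one 4x4 coefficient block and add it to dst:
// two 1-D passes with a transpose in between; the rounding bias of 32
// (ff_pw_32) is added between the passes so the final >>6 rounds correctly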
00057 static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
00058 {
00059
00060 __asm__ volatile(
00061 "movq (%0), %%mm0 \n\t"
00062 "movq 8(%0), %%mm1 \n\t"
00063 "movq 16(%0), %%mm2 \n\t"
00064 "movq 24(%0), %%mm3 \n\t"
00065 :: "r"(block) );
00066
00067 __asm__ volatile(
00068
00069 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
00070
00071 "movq %0, %%mm6 \n\t"
00072
00073 TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
00074
00075 "paddw %%mm6, %%mm3 \n\t"
00076
00077
00078 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
00079
00080 "pxor %%mm7, %%mm7 \n\t"
00081 :: "m"(ff_pw_32));
00082
00083 __asm__ volatile(
00084 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
00085 "add %1, %0 \n\t"
00086 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
00087 "add %1, %0 \n\t"
00088 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
00089 "add %1, %0 \n\t"
00090 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
00091 : "+r"(dst)
00092 : "r" ((x86_reg)stride)
00093 );
00094 }
00095
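// one 1-D pass of the 8-point IDCT over a 4-column slice of the block
// (one MMX register per row); the eight results are left in mm0-mm7
// in the permuted order expected by the callers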
00096 static inline void h264_idct8_1d(int16_t *block)
00097 {
00098 __asm__ volatile(
00099 "movq 112(%0), %%mm7 \n\t"
00100 "movq 80(%0), %%mm0 \n\t"
00101 "movq 48(%0), %%mm3 \n\t"
00102 "movq 16(%0), %%mm5 \n\t"
00103
00104 "movq %%mm0, %%mm4 \n\t"
00105 "movq %%mm5, %%mm1 \n\t"
00106 "psraw $1, %%mm4 \n\t"
00107 "psraw $1, %%mm1 \n\t"
00108 "paddw %%mm0, %%mm4 \n\t"
00109 "paddw %%mm5, %%mm1 \n\t"
00110 "paddw %%mm7, %%mm4 \n\t"
00111 "paddw %%mm0, %%mm1 \n\t"
00112 "psubw %%mm5, %%mm4 \n\t"
00113 "paddw %%mm3, %%mm1 \n\t"
00114
00115 "psubw %%mm3, %%mm5 \n\t"
00116 "psubw %%mm3, %%mm0 \n\t"
00117 "paddw %%mm7, %%mm5 \n\t"
00118 "psubw %%mm7, %%mm0 \n\t"
00119 "psraw $1, %%mm3 \n\t"
00120 "psraw $1, %%mm7 \n\t"
00121 "psubw %%mm3, %%mm5 \n\t"
00122 "psubw %%mm7, %%mm0 \n\t"
00123
00124 "movq %%mm4, %%mm3 \n\t"
00125 "movq %%mm1, %%mm7 \n\t"
00126 "psraw $2, %%mm1 \n\t"
00127 "psraw $2, %%mm3 \n\t"
00128 "paddw %%mm5, %%mm3 \n\t"
00129 "psraw $2, %%mm5 \n\t"
00130 "paddw %%mm0, %%mm1 \n\t"
00131 "psraw $2, %%mm0 \n\t"
00132 "psubw %%mm4, %%mm5 \n\t"
00133 "psubw %%mm0, %%mm7 \n\t"
00134
00135 "movq 32(%0), %%mm2 \n\t"
00136 "movq 96(%0), %%mm6 \n\t"
00137 "movq %%mm2, %%mm4 \n\t"
00138 "movq %%mm6, %%mm0 \n\t"
00139 "psraw $1, %%mm4 \n\t"
00140 "psraw $1, %%mm6 \n\t"
00141 "psubw %%mm0, %%mm4 \n\t"
00142 "paddw %%mm2, %%mm6 \n\t"
00143
00144 "movq (%0), %%mm2 \n\t"
00145 "movq 64(%0), %%mm0 \n\t"
00146 SUMSUB_BA( %%mm0, %%mm2 )
00147 SUMSUB_BA( %%mm6, %%mm0 )
00148 SUMSUB_BA( %%mm4, %%mm2 )
00149 SUMSUB_BA( %%mm7, %%mm6 )
00150 SUMSUB_BA( %%mm5, %%mm4 )
00151 SUMSUB_BA( %%mm3, %%mm2 )
00152 SUMSUB_BA( %%mm1, %%mm0 )
00153 :: "r"(block)
00154 );
00155 }
00156
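// MMX registers hold only 4 coefficients, so the 8x8 transform is done as
// two 4-column halves: transform, transpose into the temporary b2, transform
// again, descale by 6, then add to dst with add_pixels_clamped_mmx
// (the rounding bias of 32 is added to the DC coefficient up front)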
00157 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
00158 {
00159 int i;
00160 DECLARE_ALIGNED(8, int16_t, b2)[64];
00161
00162 block[0] += 32;
00163
00164 for(i=0; i<2; i++){
00165 DECLARE_ALIGNED(8, uint64_t, tmp);
00166
00167 h264_idct8_1d(block+4*i);
00168
00169 __asm__ volatile(
00170 "movq %%mm7, %0 \n\t"
00171 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
00172 "movq %%mm0, 8(%1) \n\t"
00173 "movq %%mm6, 24(%1) \n\t"
00174 "movq %%mm7, 40(%1) \n\t"
00175 "movq %%mm4, 56(%1) \n\t"
00176 "movq %0, %%mm7 \n\t"
00177 TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
00178 "movq %%mm7, (%1) \n\t"
00179 "movq %%mm1, 16(%1) \n\t"
00180 "movq %%mm0, 32(%1) \n\t"
00181 "movq %%mm3, 48(%1) \n\t"
00182 : "=m"(tmp)
00183 : "r"(b2+32*i)
00184 : "memory"
00185 );
00186 }
00187
00188 for(i=0; i<2; i++){
00189 h264_idct8_1d(b2+4*i);
00190
00191 __asm__ volatile(
00192 "psraw $6, %%mm7 \n\t"
00193 "psraw $6, %%mm6 \n\t"
00194 "psraw $6, %%mm5 \n\t"
00195 "psraw $6, %%mm4 \n\t"
00196 "psraw $6, %%mm3 \n\t"
00197 "psraw $6, %%mm2 \n\t"
00198 "psraw $6, %%mm1 \n\t"
00199 "psraw $6, %%mm0 \n\t"
00200
00201 "movq %%mm7, (%0) \n\t"
00202 "movq %%mm5, 16(%0) \n\t"
00203 "movq %%mm3, 32(%0) \n\t"
00204 "movq %%mm1, 48(%0) \n\t"
00205 "movq %%mm0, 64(%0) \n\t"
00206 "movq %%mm2, 80(%0) \n\t"
00207 "movq %%mm4, 96(%0) \n\t"
00208 "movq %%mm6, 112(%0) \n\t"
00209 :: "r"(b2+4*i)
00210 : "memory"
00211 );
00212 }
00213
00214 add_pixels_clamped_mmx(b2, dst, stride);
00215 }
00216
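// descale by 6 and add eight results to the pixels at d, with saturation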
00217 #define STORE_DIFF_8P( p, d, t, z )\
00218 "movq "#d", "#t" \n"\
00219 "psraw $6, "#p" \n"\
00220 "punpcklbw "#z", "#t" \n"\
00221 "paddsw "#t", "#p" \n"\
00222 "packuswb "#p", "#p" \n"\
00223 "movq "#p", "#d" \n"
00224
00225 #define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
00226 "movdqa "#c", "#a" \n"\
00227 "movdqa "#g", "#e" \n"\
00228 "psraw $1, "#c" \n"\
00229 "psraw $1, "#g" \n"\
00230 "psubw "#e", "#c" \n"\
00231 "paddw "#a", "#g" \n"\
00232 "movdqa "#b", "#e" \n"\
00233 "psraw $1, "#e" \n"\
00234 "paddw "#b", "#e" \n"\
00235 "paddw "#d", "#e" \n"\
00236 "paddw "#f", "#e" \n"\
00237 "movdqa "#f", "#a" \n"\
00238 "psraw $1, "#a" \n"\
00239 "paddw "#f", "#a" \n"\
00240 "paddw "#h", "#a" \n"\
00241 "psubw "#b", "#a" \n"\
00242 "psubw "#d", "#b" \n"\
00243 "psubw "#d", "#f" \n"\
00244 "paddw "#h", "#b" \n"\
00245 "psubw "#h", "#f" \n"\
00246 "psraw $1, "#d" \n"\
00247 "psraw $1, "#h" \n"\
00248 "psubw "#d", "#b" \n"\
00249 "psubw "#h", "#f" \n"\
00250 "movdqa "#e", "#d" \n"\
00251 "movdqa "#a", "#h" \n"\
00252 "psraw $2, "#d" \n"\
00253 "psraw $2, "#h" \n"\
00254 "paddw "#f", "#d" \n"\
00255 "paddw "#b", "#h" \n"\
00256 "psraw $2, "#f" \n"\
00257 "psraw $2, "#b" \n"\
00258 "psubw "#f", "#e" \n"\
00259 "psubw "#a", "#b" \n"\
00260 "movdqa 0x00(%1), "#a" \n"\
00261 "movdqa 0x40(%1), "#f" \n"\
00262 SUMSUB_BA(f, a)\
00263 SUMSUB_BA(g, f)\
00264 SUMSUB_BA(c, a)\
00265 SUMSUB_BA(e, g)\
00266 SUMSUB_BA(b, c)\
00267 SUMSUB_BA(h, a)\
00268 SUMSUB_BA(d, f)
00269
00270 static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
00271 {
00272 __asm__ volatile(
00273 "movdqa 0x10(%1), %%xmm1 \n"
00274 "movdqa 0x20(%1), %%xmm2 \n"
00275 "movdqa 0x30(%1), %%xmm3 \n"
00276 "movdqa 0x50(%1), %%xmm5 \n"
00277 "movdqa 0x60(%1), %%xmm6 \n"
00278 "movdqa 0x70(%1), %%xmm7 \n"
00279 H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
00280 TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
00281 "paddw %4, %%xmm4 \n"
00282 "movdqa %%xmm4, 0x00(%1) \n"
00283 "movdqa %%xmm2, 0x40(%1) \n"
00284 H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
00285 "movdqa %%xmm6, 0x60(%1) \n"
00286 "movdqa %%xmm7, 0x70(%1) \n"
00287 "pxor %%xmm7, %%xmm7 \n"
00288 STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7)
00289 STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7)
00290 STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
00291 STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7)
00292 "lea (%0,%2,4), %0 \n"
00293 STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7)
00294 STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7)
00295 "movdqa 0x60(%1), %%xmm0 \n"
00296 "movdqa 0x70(%1), %%xmm1 \n"
00297 STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
00298 STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7)
00299 :"+r"(dst)
00300 :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
00301 );
00302 }
00303
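// DC-only 4x4 block: dc = (block[0]+32)>>6 is added to all 16 pixels;
// the add is split into a saturating add of max(dc,0) and a saturating
// subtract of max(-dc,0) so it clips correctly for either sign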
00304 static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
00305 {
00306 int dc = (block[0] + 32) >> 6;
00307 __asm__ volatile(
00308 "movd %0, %%mm0 \n\t"
00309 "pshufw $0, %%mm0, %%mm0 \n\t"
00310 "pxor %%mm1, %%mm1 \n\t"
00311 "psubw %%mm0, %%mm1 \n\t"
00312 "packuswb %%mm0, %%mm0 \n\t"
00313 "packuswb %%mm1, %%mm1 \n\t"
00314 ::"r"(dc)
00315 );
00316 __asm__ volatile(
00317 "movd %0, %%mm2 \n\t"
00318 "movd %1, %%mm3 \n\t"
00319 "movd %2, %%mm4 \n\t"
00320 "movd %3, %%mm5 \n\t"
00321 "paddusb %%mm0, %%mm2 \n\t"
00322 "paddusb %%mm0, %%mm3 \n\t"
00323 "paddusb %%mm0, %%mm4 \n\t"
00324 "paddusb %%mm0, %%mm5 \n\t"
00325 "psubusb %%mm1, %%mm2 \n\t"
00326 "psubusb %%mm1, %%mm3 \n\t"
00327 "psubusb %%mm1, %%mm4 \n\t"
00328 "psubusb %%mm1, %%mm5 \n\t"
00329 "movd %%mm2, %0 \n\t"
00330 "movd %%mm3, %1 \n\t"
00331 "movd %%mm4, %2 \n\t"
00332 "movd %%mm5, %3 \n\t"
00333 :"+m"(*(uint32_t*)(dst+0*stride)),
00334 "+m"(*(uint32_t*)(dst+1*stride)),
00335 "+m"(*(uint32_t*)(dst+2*stride)),
00336 "+m"(*(uint32_t*)(dst+3*stride))
00337 );
00338 }
00339
00340 static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
00341 {
00342 int dc = (block[0] + 32) >> 6;
00343 int y;
00344 __asm__ volatile(
00345 "movd %0, %%mm0 \n\t"
00346 "pshufw $0, %%mm0, %%mm0 \n\t"
00347 "pxor %%mm1, %%mm1 \n\t"
00348 "psubw %%mm0, %%mm1 \n\t"
00349 "packuswb %%mm0, %%mm0 \n\t"
00350 "packuswb %%mm1, %%mm1 \n\t"
00351 ::"r"(dc)
00352 );
00353 for(y=2; y--; dst += 4*stride){
00354 __asm__ volatile(
00355 "movq %0, %%mm2 \n\t"
00356 "movq %1, %%mm3 \n\t"
00357 "movq %2, %%mm4 \n\t"
00358 "movq %3, %%mm5 \n\t"
00359 "paddusb %%mm0, %%mm2 \n\t"
00360 "paddusb %%mm0, %%mm3 \n\t"
00361 "paddusb %%mm0, %%mm4 \n\t"
00362 "paddusb %%mm0, %%mm5 \n\t"
00363 "psubusb %%mm1, %%mm2 \n\t"
00364 "psubusb %%mm1, %%mm3 \n\t"
00365 "psubusb %%mm1, %%mm4 \n\t"
00366 "psubusb %%mm1, %%mm5 \n\t"
00367 "movq %%mm2, %0 \n\t"
00368 "movq %%mm3, %1 \n\t"
00369 "movq %%mm4, %2 \n\t"
00370 "movq %%mm5, %3 \n\t"
00371 :"+m"(*(uint64_t*)(dst+0*stride)),
00372 "+m"(*(uint64_t*)(dst+1*stride)),
00373 "+m"(*(uint64_t*)(dst+2*stride)),
00374 "+m"(*(uint64_t*)(dst+3*stride))
00375 );
00376 }
00377 }
00378
00379
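// maps a block index (0-15 luma, 16-23 chroma) to its position in the
// 8-entries-per-row non_zero_count cache used by the H.264 decoder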
00380 static const uint8_t scan8[16 + 2*4]={
00381 4+1*8, 5+1*8, 4+2*8, 5+2*8,
00382 6+1*8, 7+1*8, 6+2*8, 7+2*8,
00383 4+3*8, 5+3*8, 4+4*8, 5+4*8,
00384 6+3*8, 7+3*8, 6+4*8, 7+4*8,
00385 1+1*8, 2+1*8,
00386 1+2*8, 2+2*8,
00387 1+4*8, 2+4*8,
00388 1+5*8, 2+5*8,
00389 };
00390
00391 static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00392 int i;
00393 for(i=0; i<16; i++){
00394 if(nnzc[ scan8[i] ])
00395 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
00396 }
00397 }
00398
00399 static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00400 int i;
00401 for(i=0; i<16; i+=4){
00402 if(nnzc[ scan8[i] ])
00403 ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride);
00404 }
00405 }
00406
00407
00408 static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00409 int i;
00410 for(i=0; i<16; i++){
00411 int nnz = nnzc[ scan8[i] ];
00412 if(nnz){
00413 if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
00414 else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
00415 }
00416 }
00417 }
00418
00419 static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00420 int i;
00421 for(i=0; i<16; i++){
00422 if(nnzc[ scan8[i] ] || block[i*16])
00423 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
00424 }
00425 }
00426
00427 static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00428 int i;
00429 for(i=0; i<16; i++){
00430 if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
00431 else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
00432 }
00433 }
00434
00435 static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00436 int i;
00437 for(i=0; i<16; i+=4){
00438 int nnz = nnzc[ scan8[i] ];
00439 if(nnz){
00440 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
00441 else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride);
00442 }
00443 }
00444 }
00445
00446 static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00447 int i;
00448 for(i=0; i<16; i+=4){
00449 int nnz = nnzc[ scan8[i] ];
00450 if(nnz){
00451 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
00452 else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride);
00453 }
00454 }
00455 }
00456
00457 static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00458 int i;
00459 for(i=16; i<16+8; i++){
00460 if(nnzc[ scan8[i] ] || block[i*16])
00461 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00462 }
00463 }
00464
00465 static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00466 int i;
00467 for(i=16; i<16+8; i++){
00468 if(nnzc[ scan8[i] ])
00469 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00470 else if(block[i*16])
00471 ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00472 }
00473 }
00474
00475 #if CONFIG_GPL && HAVE_YASM
00476 static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride)
00477 {
00478 __asm__ volatile(
00479 "movd %0, %%mm0 \n\t"
00480 "punpcklwd %1, %%mm0 \n\t"
00481 "paddsw %2, %%mm0 \n\t"
00482 "psraw $6, %%mm0 \n\t"
00483 "punpcklwd %%mm0, %%mm0 \n\t"
00484 "pxor %%mm1, %%mm1 \n\t"
00485 "psubw %%mm0, %%mm1 \n\t"
00486 "packuswb %%mm1, %%mm0 \n\t"
00487 "pshufw $0xFA, %%mm0, %%mm1 \n\t"
00488 "punpcklwd %%mm0, %%mm0 \n\t"
00489 ::"m"(block[ 0]),
00490 "m"(block[16]),
00491 "m"(ff_pw_32)
00492 );
00493 __asm__ volatile(
00494 "movq %0, %%mm2 \n\t"
00495 "movq %1, %%mm3 \n\t"
00496 "movq %2, %%mm4 \n\t"
00497 "movq %3, %%mm5 \n\t"
00498 "paddusb %%mm0, %%mm2 \n\t"
00499 "paddusb %%mm0, %%mm3 \n\t"
00500 "paddusb %%mm0, %%mm4 \n\t"
00501 "paddusb %%mm0, %%mm5 \n\t"
00502 "psubusb %%mm1, %%mm2 \n\t"
00503 "psubusb %%mm1, %%mm3 \n\t"
00504 "psubusb %%mm1, %%mm4 \n\t"
00505 "psubusb %%mm1, %%mm5 \n\t"
00506 "movq %%mm2, %0 \n\t"
00507 "movq %%mm3, %1 \n\t"
00508 "movq %%mm4, %2 \n\t"
00509 "movq %%mm5, %3 \n\t"
00510 :"+m"(*(uint64_t*)(dst+0*stride)),
00511 "+m"(*(uint64_t*)(dst+1*stride)),
00512 "+m"(*(uint64_t*)(dst+2*stride)),
00513 "+m"(*(uint64_t*)(dst+3*stride))
00514 );
00515 }
00516
00517 extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride);
00518
00519 static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00520 int i;
00521 for(i=0; i<16; i+=2)
00522 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
00523 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
00524 }
00525
00526 static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00527 int i;
00528 for(i=0; i<16; i+=2){
00529 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
00530 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
00531 else if(block[i*16]|block[i*16+16])
00532 ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride);
00533 }
00534 }
00535
00536 static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00537 int i;
00538 for(i=16; i<16+8; i+=2){
00539 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
00540 ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00541 else if(block[i*16]|block[i*16+16])
00542 ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00543 }
00544 }
00545 #endif
00546
00547
00548
00549
00550
00551
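/* ---- deblocking ---- */

// out: o = nonzero bytes where |x - y| > a;  clobbers t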
00552 #define DIFF_GT_MMX(x,y,a,o,t)\
00553 "movq "#y", "#t" \n\t"\
00554 "movq "#x", "#o" \n\t"\
00555 "psubusb "#x", "#t" \n\t"\
00556 "psubusb "#y", "#o" \n\t"\
00557 "por "#t", "#o" \n\t"\
00558 "psubusb "#a", "#o" \n\t"
00559
00560
00561
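// out: o = 0xFF bytes where |x - y| <= a, 0 where |x - y| > a;  clobbers t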
00562 #define DIFF_GT2_MMX(x,y,a,o,t)\
00563 "movq "#y", "#t" \n\t"\
00564 "movq "#x", "#o" \n\t"\
00565 "psubusb "#x", "#t" \n\t"\
00566 "psubusb "#y", "#o" \n\t"\
00567 "psubusb "#a", "#t" \n\t"\
00568 "psubusb "#a", "#o" \n\t"\
00569 "pcmpeqb "#t", "#o" \n\t"\
00570
00571
00572
00573
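// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1, alpha1/beta1 = alpha-1/beta-1 as words
// out: mm7 = 0xFF where |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta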
00574 #define H264_DEBLOCK_MASK(alpha1, beta1) \
00575 "pshufw $0, "#alpha1", %%mm4 \n\t"\
00576 "pshufw $0, "#beta1 ", %%mm5 \n\t"\
00577 "packuswb %%mm4, %%mm4 \n\t"\
00578 "packuswb %%mm5, %%mm5 \n\t"\
00579 DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) \
00580 DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) \
00581 "por %%mm4, %%mm7 \n\t"\
00582 DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) \
00583 "por %%mm4, %%mm7 \n\t"\
00584 "pxor %%mm6, %%mm6 \n\t"\
00585 "pcmpeqb %%mm6, %%mm7 \n\t"
00586
00587
00588
00589
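// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1, mm7 = tc (already masked)
// out: mm1 = filtered p0, mm2 = filtered q0, delta clipped to +/-tc;
// clobbers mm3-mm6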
00590 #define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
00591 "movq %%mm1 , %%mm5 \n\t"\
00592 "pxor %%mm2 , %%mm5 \n\t" \
00593 "pand "#pb_01" , %%mm5 \n\t" \
00594 "pcmpeqb %%mm4 , %%mm4 \n\t"\
00595 "pxor %%mm4 , %%mm3 \n\t"\
00596 "pavgb %%mm0 , %%mm3 \n\t" \
00597 "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" \
00598 "pxor %%mm1 , %%mm4 \n\t"\
00599 "pavgb %%mm2 , %%mm4 \n\t" \
00600 "pavgb %%mm5 , %%mm3 \n\t"\
00601 "paddusb %%mm4 , %%mm3 \n\t" \
00602 "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
00603 "psubusb %%mm3 , %%mm6 \n\t"\
00604 "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
00605 "pminub %%mm7 , %%mm6 \n\t"\
00606 "pminub %%mm7 , %%mm3 \n\t"\
00607 "psubusb %%mm6 , %%mm1 \n\t"\
00608 "psubusb %%mm3 , %%mm2 \n\t"\
00609 "paddusb %%mm3 , %%mm1 \n\t"\
00610 "paddusb %%mm6 , %%mm2 \n\t"
00611
00612
00613
00614
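// in: mm1=p0 mm2=q0; the caller's %9 operand must hold 0x01 bytes (ff_bone)
// out: *q1addr = clip((q2 + ((p0+q0+1)>>1)) >> 1, p1-tc0, p1+tc0);
// clobbers q2, tc0, tmp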
00615 #define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
00616 "movq %%mm1, "#tmp" \n\t"\
00617 "pavgb %%mm2, "#tmp" \n\t"\
00618 "pavgb "#tmp", "#q2" \n\t" \
00619 "pxor "q2addr", "#tmp" \n\t"\
00620 "pand %9, "#tmp" \n\t" \
00621 "psubusb "#tmp", "#q2" \n\t" \
00622 "movq "#p1", "#tmp" \n\t"\
00623 "psubusb "#tc0", "#tmp" \n\t"\
00624 "paddusb "#p1", "#tc0" \n\t"\
00625 "pmaxub "#tmp", "#q2" \n\t"\
00626 "pminub "#tc0", "#q2" \n\t"\
00627 "movq "#q2", "q1addr" \n\t"
00628
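// filter 8 pixels along one luma edge (two 4-pixel segments using tc0[0]
// and tc0[1]); pix points at the first q0 sample, alpha1/beta1 are
// alpha-1 and beta-1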
00629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
00630 {
00631 DECLARE_ALIGNED(8, uint64_t, tmp0)[2];
00632
00633 __asm__ volatile(
00634 "movq (%2,%4), %%mm0 \n\t"
00635 "movq (%2,%4,2), %%mm1 \n\t"
00636 "movq (%3), %%mm2 \n\t"
00637 "movq (%3,%4), %%mm3 \n\t"
00638 H264_DEBLOCK_MASK(%7, %8)
00639
00640 "movd %6, %%mm4 \n\t"
00641 "punpcklbw %%mm4, %%mm4 \n\t"
00642 "punpcklwd %%mm4, %%mm4 \n\t"
00643 "pcmpeqb %%mm3, %%mm3 \n\t"
00644 "movq %%mm4, %%mm6 \n\t"
00645 "pcmpgtb %%mm3, %%mm4 \n\t"
00646 "movq %%mm6, %1 \n\t"
00647 "pand %%mm4, %%mm7 \n\t"
00648 "movq %%mm7, %0 \n\t"
00649
00650
00651 "movq (%2), %%mm3 \n\t"
00652 DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4)
00653 "pand %%mm7, %%mm6 \n\t"
00654 "pand %1, %%mm7 \n\t"
00655 "movq %%mm7, %%mm4 \n\t"
00656 "psubb %%mm6, %%mm7 \n\t"
00657 "pand %%mm4, %%mm6 \n\t"
00658 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4)
00659
00660
00661 "movq (%3,%4,2), %%mm4 \n\t"
00662 DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3)
00663 "pand %0, %%mm6 \n\t"
00664 "movq %1, %%mm5 \n\t"
00665 "pand %%mm6, %%mm5 \n\t"
00666 "psubb %%mm6, %%mm7 \n\t"
00667 "movq (%3,%4), %%mm3 \n\t"
00668 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6)
00669
00670
00671 H264_DEBLOCK_P0_Q0(%9, unused)
00672 "movq %%mm1, (%2,%4,2) \n\t"
00673 "movq %%mm2, (%3) \n\t"
00674
00675 : "=m"(tmp0[0]), "=m"(tmp0[1])
00676 : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
00677 "m"(*tmp0), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
00678 "m"(ff_bone)
00679 );
00680 }
00681
00682 static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00683 {
00684 if((tc0[0] & tc0[1]) >= 0)
00685 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
00686 if((tc0[2] & tc0[3]) >= 0)
00687 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
00688 }
00689 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00690 {
00691
00692
00693 DECLARE_ALIGNED(8, uint8_t, trans)[8*8];
00694 int i;
00695 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
00696 if((tc0[0] & tc0[1]) < 0)
00697 continue;
00698 transpose4x4(trans, pix-4, 8, stride);
00699 transpose4x4(trans +4*8, pix, 8, stride);
00700 transpose4x4(trans+4, pix-4+4*stride, 8, stride);
00701 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);
00702 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
00703 transpose4x4(pix-2, trans +2*8, stride, 8);
00704 transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
00705 }
00706 }
00707
00708 static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
00709 {
00710 __asm__ volatile(
00711 "movq (%0), %%mm0 \n\t"
00712 "movq (%0,%2), %%mm1 \n\t"
00713 "movq (%1), %%mm2 \n\t"
00714 "movq (%1,%2), %%mm3 \n\t"
00715 H264_DEBLOCK_MASK(%4, %5)
00716 "movd %3, %%mm6 \n\t"
00717 "punpcklbw %%mm6, %%mm6 \n\t"
00718 "pand %%mm6, %%mm7 \n\t"
00719 H264_DEBLOCK_P0_Q0(%6, %7)
00720 "movq %%mm1, (%0,%2) \n\t"
00721 "movq %%mm2, (%1) \n\t"
00722
00723 :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
00724 "r"(*(uint32_t*)tc0),
00725 "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
00726 );
00727 }
00728
00729 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00730 {
00731 h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
00732 }
00733
00734 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00735 {
00736
00737 DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
00738 transpose4x4(trans, pix-2, 8, stride);
00739 transpose4x4(trans+4, pix-2+4*stride, 8, stride);
00740 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
00741 transpose4x4(pix-2, trans, stride, 8);
00742 transpose4x4(pix-2+4*stride, trans+4, stride, 8);
00743 }
00744
00745
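// intra chroma filter: p0 = (2*p1 + p0 + q1 + 2) >> 2, built from pavgb;
// the (p0^q1)&one term corrects the rounding of the first average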
00746 #define H264_FILTER_CHROMA4(p0, p1, q1, one) \
00747 "movq "#p0", %%mm4 \n\t"\
00748 "pxor "#q1", %%mm4 \n\t"\
00749 "pand "#one", %%mm4 \n\t" \
00750 "pavgb "#q1", "#p0" \n\t"\
00751 "psubusb %%mm4, "#p0" \n\t"\
00752 "pavgb "#p1", "#p0" \n\t" \
00753
00754 static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
00755 {
00756 __asm__ volatile(
00757 "movq (%0), %%mm0 \n\t"
00758 "movq (%0,%2), %%mm1 \n\t"
00759 "movq (%1), %%mm2 \n\t"
00760 "movq (%1,%2), %%mm3 \n\t"
00761 H264_DEBLOCK_MASK(%3, %4)
00762 "movq %%mm1, %%mm5 \n\t"
00763 "movq %%mm2, %%mm6 \n\t"
00764 H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5)
00765 H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5)
00766 "psubb %%mm5, %%mm1 \n\t"
00767 "psubb %%mm6, %%mm2 \n\t"
00768 "pand %%mm7, %%mm1 \n\t"
00769 "pand %%mm7, %%mm2 \n\t"
00770 "paddb %%mm5, %%mm1 \n\t"
00771 "paddb %%mm6, %%mm2 \n\t"
00772 "movq %%mm1, (%0,%2) \n\t"
00773 "movq %%mm2, (%1) \n\t"
00774 :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
00775 "m"(alpha1), "m"(beta1), "m"(ff_bone)
00776 );
00777 }
00778
00779 static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
00780 {
00781 h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
00782 }
00783
00784 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
00785 {
00786
00787 DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
00788 transpose4x4(trans, pix-2, 8, stride);
00789 transpose4x4(trans+4, pix-2+4*stride, 8, stride);
00790 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
00791 transpose4x4(pix-2, trans, stride, 8);
00792 transpose4x4(pix-2+4*stride, trans+4, stride, 8);
00793 }
00794
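// compute the deblocking boundary strengths bS[dir][edge] from the
// non-zero-count, reference-index and motion-vector caches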
00795 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
00796 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
00797 int dir;
00798 __asm__ volatile(
00799 "movq %0, %%mm7 \n"
00800 "movq %1, %%mm6 \n"
00801 ::"m"(ff_pb_1), "m"(ff_pb_3)
00802 );
00803 if(field)
00804 __asm__ volatile(
00805 "movq %0, %%mm6 \n"
00806 ::"m"(ff_pb_3_1)
00807 );
00808 __asm__ volatile(
00809 "movq %%mm6, %%mm5 \n"
00810 "paddb %%mm5, %%mm5 \n"
00811 :);
00812
00813
00814
00815 for( dir=1; dir>=0; dir-- ) {
00816 const x86_reg d_idx = dir ? -8 : -1;
00817 const int mask_mv = dir ? mask_mv1 : mask_mv0;
00818 DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
00819 int b_idx, edge;
00820 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
00821 __asm__ volatile(
00822 "pand %0, %%mm0 \n\t"
00823 ::"m"(mask_dir)
00824 );
00825 if(!(mask_mv & edge)) {
00826 if(bidir) {
00827 __asm__ volatile(
00828 "movd (%1,%0), %%mm2 \n"
00829 "punpckldq 40(%1,%0), %%mm2 \n"
00830 "pshufw $0x44, (%1), %%mm0 \n"
00831 "pshufw $0x44, 40(%1), %%mm1 \n"
00832 "pshufw $0x4E, %%mm2, %%mm3 \n"
00833 "psubb %%mm2, %%mm0 \n"
00834 "psubb %%mm3, %%mm1 \n"
00835 "1: \n"
00836 "por %%mm1, %%mm0 \n"
00837 "movq (%2,%0,4), %%mm1 \n"
00838 "movq 8(%2,%0,4), %%mm2 \n"
00839 "movq %%mm1, %%mm3 \n"
00840 "movq %%mm2, %%mm4 \n"
00841 "psubw (%2), %%mm1 \n"
00842 "psubw 8(%2), %%mm2 \n"
00843 "psubw 160(%2), %%mm3 \n"
00844 "psubw 168(%2), %%mm4 \n"
00845 "packsswb %%mm2, %%mm1 \n"
00846 "packsswb %%mm4, %%mm3 \n"
00847 "paddb %%mm6, %%mm1 \n"
00848 "paddb %%mm6, %%mm3 \n"
00849 "psubusb %%mm5, %%mm1 \n"
00850 "psubusb %%mm5, %%mm3 \n"
00851 "packsswb %%mm3, %%mm1 \n"
00852 "add $40, %0 \n"
00853 "cmp $40, %0 \n"
00854 "jl 1b \n"
00855 "sub $80, %0 \n"
00856 "pshufw $0x4E, %%mm1, %%mm1 \n"
00857 "por %%mm1, %%mm0 \n"
00858 "pshufw $0x4E, %%mm0, %%mm1 \n"
00859 "pminub %%mm1, %%mm0 \n"
00860 ::"r"(d_idx),
00861 "r"(ref[0]+b_idx),
00862 "r"(mv[0]+b_idx)
00863 );
00864 } else {
00865 __asm__ volatile(
00866 "movd (%1), %%mm0 \n"
00867 "psubb (%1,%0), %%mm0 \n"
00868 "movq (%2), %%mm1 \n"
00869 "movq 8(%2), %%mm2 \n"
00870 "psubw (%2,%0,4), %%mm1 \n"
00871 "psubw 8(%2,%0,4), %%mm2 \n"
00872 "packsswb %%mm2, %%mm1 \n"
00873 "paddb %%mm6, %%mm1 \n"
00874 "psubusb %%mm5, %%mm1 \n"
00875 "packsswb %%mm1, %%mm1 \n"
00876 "por %%mm1, %%mm0 \n"
00877 ::"r"(d_idx),
00878 "r"(ref[0]+b_idx),
00879 "r"(mv[0]+b_idx)
00880 );
00881 }
00882 }
00883 __asm__ volatile(
00884 "movd %0, %%mm1 \n"
00885 "por %1, %%mm1 \n"
00886 ::"m"(nnz[b_idx]),
00887 "m"(nnz[b_idx+d_idx])
00888 );
00889 __asm__ volatile(
00890 "pminub %%mm7, %%mm1 \n"
00891 "pminub %%mm7, %%mm0 \n"
00892 "psllw $1, %%mm1 \n"
00893 "pxor %%mm2, %%mm2 \n"
00894 "pmaxub %%mm0, %%mm1 \n"
00895 "punpcklbw %%mm2, %%mm1 \n"
00896 "movq %%mm1, %0 \n"
00897 :"=m"(*bS[dir][edge])
00898 ::"memory"
00899 );
00900 }
00901 edges = 4;
00902 step = 1;
00903 }
00904 __asm__ volatile(
00905 "movq (%0), %%mm0 \n\t"
00906 "movq 8(%0), %%mm1 \n\t"
00907 "movq 16(%0), %%mm2 \n\t"
00908 "movq 24(%0), %%mm3 \n\t"
00909 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
00910 "movq %%mm0, (%0) \n\t"
00911 "movq %%mm3, 8(%0) \n\t"
00912 "movq %%mm4, 16(%0) \n\t"
00913 "movq %%mm2, 24(%0) \n\t"
00914 ::"r"(bS[0])
00915 :"memory"
00916 );
00917 }
00918
00919
00920
00921
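/* ---- quarter-pel motion compensation: 6-tap filter (1,-5,20,20,-5,1) ---- */

// one output row of the vertical lowpass, from six source rows A..F:
// (A + F + 20*(C+D) - 5*(B+E) + 16) >> 5, packed and stored through OP;
// the newest source row is loaded into F and the pointers are advanced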
00922 #define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
00923 "mov"#q" "#C", "#T" \n\t"\
00924 "mov"#d" (%0), "#F" \n\t"\
00925 "paddw "#D", "#T" \n\t"\
00926 "psllw $2, "#T" \n\t"\
00927 "psubw "#B", "#T" \n\t"\
00928 "psubw "#E", "#T" \n\t"\
00929 "punpcklbw "#Z", "#F" \n\t"\
00930 "pmullw %4, "#T" \n\t"\
00931 "paddw %5, "#A" \n\t"\
00932 "add %2, %0 \n\t"\
00933 "paddw "#F", "#A" \n\t"\
00934 "paddw "#A", "#T" \n\t"\
00935 "psraw $5, "#T" \n\t"\
00936 "packuswb "#T", "#T" \n\t"\
00937 OP(T, (%1), A, d)\
00938 "add %3, %1 \n\t"
00939
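// same 6-tap column filter, but the 16-bit intermediate (including the +16
// bias, without the >>5) is stored at OF(%1) for the second, horizontal pass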
00940 #define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
00941 "mov"#q" "#C", "#T" \n\t"\
00942 "mov"#d" (%0), "#F" \n\t"\
00943 "paddw "#D", "#T" \n\t"\
00944 "psllw $2, "#T" \n\t"\
00945 "paddw %4, "#A" \n\t"\
00946 "psubw "#B", "#T" \n\t"\
00947 "psubw "#E", "#T" \n\t"\
00948 "punpcklbw "#Z", "#F" \n\t"\
00949 "pmullw %3, "#T" \n\t"\
00950 "paddw "#F", "#A" \n\t"\
00951 "add %2, %0 \n\t"\
00952 "paddw "#A", "#T" \n\t"\
00953 "mov"#q" "#T", "#OF"(%1) \n\t"
00954
00955 #define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
00956 #define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
00957 #define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
00958 #define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
00959
00960
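// generates put/avg variants of the 4-, 8- and 16-wide h, v and hv qpel
// lowpass filters; OP is the final store supplied by the caller
// (a plain move for "put", an average with the existing dst for "avg")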
00961 #define QPEL_H264(OPNAME, OP, MMX)\
00962 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00963 int h=4;\
00964 \
00965 __asm__ volatile(\
00966 "pxor %%mm7, %%mm7 \n\t"\
00967 "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
00968 "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
00969 "1: \n\t"\
00970 "movd -1(%0), %%mm1 \n\t"\
00971 "movd (%0), %%mm2 \n\t"\
00972 "movd 1(%0), %%mm3 \n\t"\
00973 "movd 2(%0), %%mm0 \n\t"\
00974 "punpcklbw %%mm7, %%mm1 \n\t"\
00975 "punpcklbw %%mm7, %%mm2 \n\t"\
00976 "punpcklbw %%mm7, %%mm3 \n\t"\
00977 "punpcklbw %%mm7, %%mm0 \n\t"\
00978 "paddw %%mm0, %%mm1 \n\t"\
00979 "paddw %%mm3, %%mm2 \n\t"\
00980 "movd -2(%0), %%mm0 \n\t"\
00981 "movd 3(%0), %%mm3 \n\t"\
00982 "punpcklbw %%mm7, %%mm0 \n\t"\
00983 "punpcklbw %%mm7, %%mm3 \n\t"\
00984 "paddw %%mm3, %%mm0 \n\t"\
00985 "psllw $2, %%mm2 \n\t"\
00986 "psubw %%mm1, %%mm2 \n\t"\
00987 "pmullw %%mm4, %%mm2 \n\t"\
00988 "paddw %%mm5, %%mm0 \n\t"\
00989 "paddw %%mm2, %%mm0 \n\t"\
00990 "psraw $5, %%mm0 \n\t"\
00991 "packuswb %%mm0, %%mm0 \n\t"\
00992 OP(%%mm0, (%1),%%mm6, d)\
00993 "add %3, %0 \n\t"\
00994 "add %4, %1 \n\t"\
00995 "decl %2 \n\t"\
00996 " jnz 1b \n\t"\
00997 : "+a"(src), "+c"(dst), "+g"(h)\
00998 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
00999 : "memory"\
01000 );\
01001 }\
01002 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01003 int h=4;\
01004 __asm__ volatile(\
01005 "pxor %%mm7, %%mm7 \n\t"\
01006 "movq %0, %%mm4 \n\t"\
01007 "movq %1, %%mm5 \n\t"\
01008 :: "m"(ff_pw_5), "m"(ff_pw_16)\
01009 );\
01010 do{\
01011 __asm__ volatile(\
01012 "movd -1(%0), %%mm1 \n\t"\
01013 "movd (%0), %%mm2 \n\t"\
01014 "movd 1(%0), %%mm3 \n\t"\
01015 "movd 2(%0), %%mm0 \n\t"\
01016 "punpcklbw %%mm7, %%mm1 \n\t"\
01017 "punpcklbw %%mm7, %%mm2 \n\t"\
01018 "punpcklbw %%mm7, %%mm3 \n\t"\
01019 "punpcklbw %%mm7, %%mm0 \n\t"\
01020 "paddw %%mm0, %%mm1 \n\t"\
01021 "paddw %%mm3, %%mm2 \n\t"\
01022 "movd -2(%0), %%mm0 \n\t"\
01023 "movd 3(%0), %%mm3 \n\t"\
01024 "punpcklbw %%mm7, %%mm0 \n\t"\
01025 "punpcklbw %%mm7, %%mm3 \n\t"\
01026 "paddw %%mm3, %%mm0 \n\t"\
01027 "psllw $2, %%mm2 \n\t"\
01028 "psubw %%mm1, %%mm2 \n\t"\
01029 "pmullw %%mm4, %%mm2 \n\t"\
01030 "paddw %%mm5, %%mm0 \n\t"\
01031 "paddw %%mm2, %%mm0 \n\t"\
01032 "movd (%2), %%mm3 \n\t"\
01033 "psraw $5, %%mm0 \n\t"\
01034 "packuswb %%mm0, %%mm0 \n\t"\
01035 PAVGB" %%mm3, %%mm0 \n\t"\
01036 OP(%%mm0, (%1),%%mm6, d)\
01037 "add %4, %0 \n\t"\
01038 "add %4, %1 \n\t"\
01039 "add %3, %2 \n\t"\
01040 : "+a"(src), "+c"(dst), "+d"(src2)\
01041 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
01042 : "memory"\
01043 );\
01044 }while(--h);\
01045 }\
01046 static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01047 src -= 2*srcStride;\
01048 __asm__ volatile(\
01049 "pxor %%mm7, %%mm7 \n\t"\
01050 "movd (%0), %%mm0 \n\t"\
01051 "add %2, %0 \n\t"\
01052 "movd (%0), %%mm1 \n\t"\
01053 "add %2, %0 \n\t"\
01054 "movd (%0), %%mm2 \n\t"\
01055 "add %2, %0 \n\t"\
01056 "movd (%0), %%mm3 \n\t"\
01057 "add %2, %0 \n\t"\
01058 "movd (%0), %%mm4 \n\t"\
01059 "add %2, %0 \n\t"\
01060 "punpcklbw %%mm7, %%mm0 \n\t"\
01061 "punpcklbw %%mm7, %%mm1 \n\t"\
01062 "punpcklbw %%mm7, %%mm2 \n\t"\
01063 "punpcklbw %%mm7, %%mm3 \n\t"\
01064 "punpcklbw %%mm7, %%mm4 \n\t"\
01065 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
01066 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
01067 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
01068 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
01069 \
01070 : "+a"(src), "+c"(dst)\
01071 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01072 : "memory"\
01073 );\
01074 }\
01075 static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
01076 int h=4;\
01077 int w=3;\
01078 src -= 2*srcStride+2;\
01079 while(w--){\
01080 __asm__ volatile(\
01081 "pxor %%mm7, %%mm7 \n\t"\
01082 "movd (%0), %%mm0 \n\t"\
01083 "add %2, %0 \n\t"\
01084 "movd (%0), %%mm1 \n\t"\
01085 "add %2, %0 \n\t"\
01086 "movd (%0), %%mm2 \n\t"\
01087 "add %2, %0 \n\t"\
01088 "movd (%0), %%mm3 \n\t"\
01089 "add %2, %0 \n\t"\
01090 "movd (%0), %%mm4 \n\t"\
01091 "add %2, %0 \n\t"\
01092 "punpcklbw %%mm7, %%mm0 \n\t"\
01093 "punpcklbw %%mm7, %%mm1 \n\t"\
01094 "punpcklbw %%mm7, %%mm2 \n\t"\
01095 "punpcklbw %%mm7, %%mm3 \n\t"\
01096 "punpcklbw %%mm7, %%mm4 \n\t"\
01097 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
01098 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
01099 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
01100 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
01101 \
01102 : "+a"(src)\
01103 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01104 : "memory"\
01105 );\
01106 tmp += 4;\
01107 src += 4 - 9*srcStride;\
01108 }\
01109 tmp -= 3*4;\
01110 __asm__ volatile(\
01111 "1: \n\t"\
01112 "movq (%0), %%mm0 \n\t"\
01113 "paddw 10(%0), %%mm0 \n\t"\
01114 "movq 2(%0), %%mm1 \n\t"\
01115 "paddw 8(%0), %%mm1 \n\t"\
01116 "movq 4(%0), %%mm2 \n\t"\
01117 "paddw 6(%0), %%mm2 \n\t"\
01118 "psubw %%mm1, %%mm0 \n\t"\
01119 "psraw $2, %%mm0 \n\t"\
01120 "psubw %%mm1, %%mm0 \n\t"\
01121 "paddsw %%mm2, %%mm0 \n\t"\
01122 "psraw $2, %%mm0 \n\t"\
01123 "paddw %%mm2, %%mm0 \n\t"\
01124 "psraw $6, %%mm0 \n\t"\
01125 "packuswb %%mm0, %%mm0 \n\t"\
01126 OP(%%mm0, (%1),%%mm7, d)\
01127 "add $24, %0 \n\t"\
01128 "add %3, %1 \n\t"\
01129 "decl %2 \n\t"\
01130 " jnz 1b \n\t"\
01131 : "+a"(tmp), "+c"(dst), "+g"(h)\
01132 : "S"((x86_reg)dstStride)\
01133 : "memory"\
01134 );\
01135 }\
01136 \
01137 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01138 int h=8;\
01139 __asm__ volatile(\
01140 "pxor %%mm7, %%mm7 \n\t"\
01141 "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
01142 "1: \n\t"\
01143 "movq (%0), %%mm0 \n\t"\
01144 "movq 1(%0), %%mm2 \n\t"\
01145 "movq %%mm0, %%mm1 \n\t"\
01146 "movq %%mm2, %%mm3 \n\t"\
01147 "punpcklbw %%mm7, %%mm0 \n\t"\
01148 "punpckhbw %%mm7, %%mm1 \n\t"\
01149 "punpcklbw %%mm7, %%mm2 \n\t"\
01150 "punpckhbw %%mm7, %%mm3 \n\t"\
01151 "paddw %%mm2, %%mm0 \n\t"\
01152 "paddw %%mm3, %%mm1 \n\t"\
01153 "psllw $2, %%mm0 \n\t"\
01154 "psllw $2, %%mm1 \n\t"\
01155 "movq -1(%0), %%mm2 \n\t"\
01156 "movq 2(%0), %%mm4 \n\t"\
01157 "movq %%mm2, %%mm3 \n\t"\
01158 "movq %%mm4, %%mm5 \n\t"\
01159 "punpcklbw %%mm7, %%mm2 \n\t"\
01160 "punpckhbw %%mm7, %%mm3 \n\t"\
01161 "punpcklbw %%mm7, %%mm4 \n\t"\
01162 "punpckhbw %%mm7, %%mm5 \n\t"\
01163 "paddw %%mm4, %%mm2 \n\t"\
01164 "paddw %%mm3, %%mm5 \n\t"\
01165 "psubw %%mm2, %%mm0 \n\t"\
01166 "psubw %%mm5, %%mm1 \n\t"\
01167 "pmullw %%mm6, %%mm0 \n\t"\
01168 "pmullw %%mm6, %%mm1 \n\t"\
01169 "movd -2(%0), %%mm2 \n\t"\
01170 "movd 7(%0), %%mm5 \n\t"\
01171 "punpcklbw %%mm7, %%mm2 \n\t"\
01172 "punpcklbw %%mm7, %%mm5 \n\t"\
01173 "paddw %%mm3, %%mm2 \n\t"\
01174 "paddw %%mm5, %%mm4 \n\t"\
01175 "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
01176 "paddw %%mm5, %%mm2 \n\t"\
01177 "paddw %%mm5, %%mm4 \n\t"\
01178 "paddw %%mm2, %%mm0 \n\t"\
01179 "paddw %%mm4, %%mm1 \n\t"\
01180 "psraw $5, %%mm0 \n\t"\
01181 "psraw $5, %%mm1 \n\t"\
01182 "packuswb %%mm1, %%mm0 \n\t"\
01183 OP(%%mm0, (%1),%%mm5, q)\
01184 "add %3, %0 \n\t"\
01185 "add %4, %1 \n\t"\
01186 "decl %2 \n\t"\
01187 " jnz 1b \n\t"\
01188 : "+a"(src), "+c"(dst), "+g"(h)\
01189 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
01190 : "memory"\
01191 );\
01192 }\
01193 \
01194 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01195 int h=8;\
01196 __asm__ volatile(\
01197 "pxor %%mm7, %%mm7 \n\t"\
01198 "movq %0, %%mm6 \n\t"\
01199 :: "m"(ff_pw_5)\
01200 );\
01201 do{\
01202 __asm__ volatile(\
01203 "movq (%0), %%mm0 \n\t"\
01204 "movq 1(%0), %%mm2 \n\t"\
01205 "movq %%mm0, %%mm1 \n\t"\
01206 "movq %%mm2, %%mm3 \n\t"\
01207 "punpcklbw %%mm7, %%mm0 \n\t"\
01208 "punpckhbw %%mm7, %%mm1 \n\t"\
01209 "punpcklbw %%mm7, %%mm2 \n\t"\
01210 "punpckhbw %%mm7, %%mm3 \n\t"\
01211 "paddw %%mm2, %%mm0 \n\t"\
01212 "paddw %%mm3, %%mm1 \n\t"\
01213 "psllw $2, %%mm0 \n\t"\
01214 "psllw $2, %%mm1 \n\t"\
01215 "movq -1(%0), %%mm2 \n\t"\
01216 "movq 2(%0), %%mm4 \n\t"\
01217 "movq %%mm2, %%mm3 \n\t"\
01218 "movq %%mm4, %%mm5 \n\t"\
01219 "punpcklbw %%mm7, %%mm2 \n\t"\
01220 "punpckhbw %%mm7, %%mm3 \n\t"\
01221 "punpcklbw %%mm7, %%mm4 \n\t"\
01222 "punpckhbw %%mm7, %%mm5 \n\t"\
01223 "paddw %%mm4, %%mm2 \n\t"\
01224 "paddw %%mm3, %%mm5 \n\t"\
01225 "psubw %%mm2, %%mm0 \n\t"\
01226 "psubw %%mm5, %%mm1 \n\t"\
01227 "pmullw %%mm6, %%mm0 \n\t"\
01228 "pmullw %%mm6, %%mm1 \n\t"\
01229 "movd -2(%0), %%mm2 \n\t"\
01230 "movd 7(%0), %%mm5 \n\t"\
01231 "punpcklbw %%mm7, %%mm2 \n\t"\
01232 "punpcklbw %%mm7, %%mm5 \n\t"\
01233 "paddw %%mm3, %%mm2 \n\t"\
01234 "paddw %%mm5, %%mm4 \n\t"\
01235 "movq %5, %%mm5 \n\t"\
01236 "paddw %%mm5, %%mm2 \n\t"\
01237 "paddw %%mm5, %%mm4 \n\t"\
01238 "paddw %%mm2, %%mm0 \n\t"\
01239 "paddw %%mm4, %%mm1 \n\t"\
01240 "psraw $5, %%mm0 \n\t"\
01241 "psraw $5, %%mm1 \n\t"\
01242 "movq (%2), %%mm4 \n\t"\
01243 "packuswb %%mm1, %%mm0 \n\t"\
01244 PAVGB" %%mm4, %%mm0 \n\t"\
01245 OP(%%mm0, (%1),%%mm5, q)\
01246 "add %4, %0 \n\t"\
01247 "add %4, %1 \n\t"\
01248 "add %3, %2 \n\t"\
01249 : "+a"(src), "+c"(dst), "+d"(src2)\
01250 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
01251 "m"(ff_pw_16)\
01252 : "memory"\
01253 );\
01254 }while(--h);\
01255 }\
01256 \
01257 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01258 int w= 2;\
01259 src -= 2*srcStride;\
01260 \
01261 while(w--){\
01262 __asm__ volatile(\
01263 "pxor %%mm7, %%mm7 \n\t"\
01264 "movd (%0), %%mm0 \n\t"\
01265 "add %2, %0 \n\t"\
01266 "movd (%0), %%mm1 \n\t"\
01267 "add %2, %0 \n\t"\
01268 "movd (%0), %%mm2 \n\t"\
01269 "add %2, %0 \n\t"\
01270 "movd (%0), %%mm3 \n\t"\
01271 "add %2, %0 \n\t"\
01272 "movd (%0), %%mm4 \n\t"\
01273 "add %2, %0 \n\t"\
01274 "punpcklbw %%mm7, %%mm0 \n\t"\
01275 "punpcklbw %%mm7, %%mm1 \n\t"\
01276 "punpcklbw %%mm7, %%mm2 \n\t"\
01277 "punpcklbw %%mm7, %%mm3 \n\t"\
01278 "punpcklbw %%mm7, %%mm4 \n\t"\
01279 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
01280 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
01281 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
01282 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
01283 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
01284 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
01285 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
01286 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
01287 \
01288 : "+a"(src), "+c"(dst)\
01289 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01290 : "memory"\
01291 );\
01292 if(h==16){\
01293 __asm__ volatile(\
01294 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
01295 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
01296 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
01297 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
01298 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
01299 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
01300 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
01301 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
01302 \
01303 : "+a"(src), "+c"(dst)\
01304 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01305 : "memory"\
01306 );\
01307 }\
01308 src += 4-(h+5)*srcStride;\
01309 dst += 4-h*dstStride;\
01310 }\
01311 }\
01312 static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
01313 int w = (size+8)>>2;\
01314 src -= 2*srcStride+2;\
01315 while(w--){\
01316 __asm__ volatile(\
01317 "pxor %%mm7, %%mm7 \n\t"\
01318 "movd (%0), %%mm0 \n\t"\
01319 "add %2, %0 \n\t"\
01320 "movd (%0), %%mm1 \n\t"\
01321 "add %2, %0 \n\t"\
01322 "movd (%0), %%mm2 \n\t"\
01323 "add %2, %0 \n\t"\
01324 "movd (%0), %%mm3 \n\t"\
01325 "add %2, %0 \n\t"\
01326 "movd (%0), %%mm4 \n\t"\
01327 "add %2, %0 \n\t"\
01328 "punpcklbw %%mm7, %%mm0 \n\t"\
01329 "punpcklbw %%mm7, %%mm1 \n\t"\
01330 "punpcklbw %%mm7, %%mm2 \n\t"\
01331 "punpcklbw %%mm7, %%mm3 \n\t"\
01332 "punpcklbw %%mm7, %%mm4 \n\t"\
01333 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
01334 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
01335 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
01336 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
01337 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
01338 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
01339 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
01340 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
01341 : "+a"(src)\
01342 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01343 : "memory"\
01344 );\
01345 if(size==16){\
01346 __asm__ volatile(\
01347 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
01348 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
01349 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
01350 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
01351 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
01352 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
01353 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
01354 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
01355 : "+a"(src)\
01356 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01357 : "memory"\
01358 );\
01359 }\
01360 tmp += 4;\
01361 src += 4 - (size+5)*srcStride;\
01362 }\
01363 }\
01364 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
01365 int w = size>>4;\
01366 do{\
01367 int h = size;\
01368 __asm__ volatile(\
01369 "1: \n\t"\
01370 "movq (%0), %%mm0 \n\t"\
01371 "movq 8(%0), %%mm3 \n\t"\
01372 "movq 2(%0), %%mm1 \n\t"\
01373 "movq 10(%0), %%mm4 \n\t"\
01374 "paddw %%mm4, %%mm0 \n\t"\
01375 "paddw %%mm3, %%mm1 \n\t"\
01376 "paddw 18(%0), %%mm3 \n\t"\
01377 "paddw 16(%0), %%mm4 \n\t"\
01378 "movq 4(%0), %%mm2 \n\t"\
01379 "movq 12(%0), %%mm5 \n\t"\
01380 "paddw 6(%0), %%mm2 \n\t"\
01381 "paddw 14(%0), %%mm5 \n\t"\
01382 "psubw %%mm1, %%mm0 \n\t"\
01383 "psubw %%mm4, %%mm3 \n\t"\
01384 "psraw $2, %%mm0 \n\t"\
01385 "psraw $2, %%mm3 \n\t"\
01386 "psubw %%mm1, %%mm0 \n\t"\
01387 "psubw %%mm4, %%mm3 \n\t"\
01388 "paddsw %%mm2, %%mm0 \n\t"\
01389 "paddsw %%mm5, %%mm3 \n\t"\
01390 "psraw $2, %%mm0 \n\t"\
01391 "psraw $2, %%mm3 \n\t"\
01392 "paddw %%mm2, %%mm0 \n\t"\
01393 "paddw %%mm5, %%mm3 \n\t"\
01394 "psraw $6, %%mm0 \n\t"\
01395 "psraw $6, %%mm3 \n\t"\
01396 "packuswb %%mm3, %%mm0 \n\t"\
01397 OP(%%mm0, (%1),%%mm7, q)\
01398 "add $48, %0 \n\t"\
01399 "add %3, %1 \n\t"\
01400 "decl %2 \n\t"\
01401 " jnz 1b \n\t"\
01402 : "+a"(tmp), "+c"(dst), "+g"(h)\
01403 : "S"((x86_reg)dstStride)\
01404 : "memory"\
01405 );\
01406 tmp += 8 - size*24;\
01407 dst += 8 - size*dstStride;\
01408 }while(w--);\
01409 }\
01410 \
01411 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01412 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
01413 }\
01414 static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01415 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
01416 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
01417 }\
01418 \
01419 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01420 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
01421 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
01422 src += 8*srcStride;\
01423 dst += 8*dstStride;\
01424 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
01425 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
01426 }\
01427 \
01428 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01429 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
01430 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
01431 src += 8*dstStride;\
01432 dst += 8*dstStride;\
01433 src2 += 8*src2Stride;\
01434 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
01435 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
01436 }\
01437 \
01438 static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
01439 put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
01440 OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
01441 }\
01442 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
01443 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
01444 }\
01445 \
01446 static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
01447 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
01448 }\
01449 \
01450 static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
01451 {\
01452 __asm__ volatile(\
01453 "movq (%1), %%mm0 \n\t"\
01454 "movq 24(%1), %%mm1 \n\t"\
01455 "psraw $5, %%mm0 \n\t"\
01456 "psraw $5, %%mm1 \n\t"\
01457 "packuswb %%mm0, %%mm0 \n\t"\
01458 "packuswb %%mm1, %%mm1 \n\t"\
01459 PAVGB" (%0), %%mm0 \n\t"\
01460 PAVGB" (%0,%3), %%mm1 \n\t"\
01461 OP(%%mm0, (%2), %%mm4, d)\
01462 OP(%%mm1, (%2,%4), %%mm5, d)\
01463 "lea (%0,%3,2), %0 \n\t"\
01464 "lea (%2,%4,2), %2 \n\t"\
01465 "movq 48(%1), %%mm0 \n\t"\
01466 "movq 72(%1), %%mm1 \n\t"\
01467 "psraw $5, %%mm0 \n\t"\
01468 "psraw $5, %%mm1 \n\t"\
01469 "packuswb %%mm0, %%mm0 \n\t"\
01470 "packuswb %%mm1, %%mm1 \n\t"\
01471 PAVGB" (%0), %%mm0 \n\t"\
01472 PAVGB" (%0,%3), %%mm1 \n\t"\
01473 OP(%%mm0, (%2), %%mm4, d)\
01474 OP(%%mm1, (%2,%4), %%mm5, d)\
01475 :"+a"(src8), "+c"(src16), "+d"(dst)\
01476 :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
01477 :"memory");\
01478 }\
01479 static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
01480 {\
01481 do{\
01482 __asm__ volatile(\
01483 "movq (%1), %%mm0 \n\t"\
01484 "movq 8(%1), %%mm1 \n\t"\
01485 "movq 48(%1), %%mm2 \n\t"\
01486 "movq 8+48(%1), %%mm3 \n\t"\
01487 "psraw $5, %%mm0 \n\t"\
01488 "psraw $5, %%mm1 \n\t"\
01489 "psraw $5, %%mm2 \n\t"\
01490 "psraw $5, %%mm3 \n\t"\
01491 "packuswb %%mm1, %%mm0 \n\t"\
01492 "packuswb %%mm3, %%mm2 \n\t"\
01493 PAVGB" (%0), %%mm0 \n\t"\
01494 PAVGB" (%0,%3), %%mm2 \n\t"\
01495 OP(%%mm0, (%2), %%mm5, q)\
01496 OP(%%mm2, (%2,%4), %%mm5, q)\
01497 ::"a"(src8), "c"(src16), "d"(dst),\
01498 "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
01499 :"memory");\
01500 src8 += 2L*src8Stride;\
01501 src16 += 48;\
01502 dst += 2L*dstStride;\
01503 }while(h-=2);\
01504 }\
01505 static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
01506 {\
01507 OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
01508 OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
01509 }\
01510
01511
01512 #if ARCH_X86_64
01513 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
01514 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01515 int h=16;\
01516 __asm__ volatile(\
01517 "pxor %%xmm15, %%xmm15 \n\t"\
01518 "movdqa %6, %%xmm14 \n\t"\
01519 "movdqa %7, %%xmm13 \n\t"\
01520 "1: \n\t"\
01521 "lddqu 6(%0), %%xmm1 \n\t"\
01522 "lddqu -2(%0), %%xmm7 \n\t"\
01523 "movdqa %%xmm1, %%xmm0 \n\t"\
01524 "punpckhbw %%xmm15, %%xmm1 \n\t"\
01525 "punpcklbw %%xmm15, %%xmm0 \n\t"\
01526 "punpcklbw %%xmm15, %%xmm7 \n\t"\
01527 "movdqa %%xmm1, %%xmm2 \n\t"\
01528 "movdqa %%xmm0, %%xmm6 \n\t"\
01529 "movdqa %%xmm1, %%xmm3 \n\t"\
01530 "movdqa %%xmm0, %%xmm8 \n\t"\
01531 "movdqa %%xmm1, %%xmm4 \n\t"\
01532 "movdqa %%xmm0, %%xmm9 \n\t"\
01533 "movdqa %%xmm0, %%xmm12 \n\t"\
01534 "movdqa %%xmm1, %%xmm11 \n\t"\
01535 "palignr $10,%%xmm0, %%xmm11\n\t"\
01536 "palignr $10,%%xmm7, %%xmm12\n\t"\
01537 "palignr $2, %%xmm0, %%xmm4 \n\t"\
01538 "palignr $2, %%xmm7, %%xmm9 \n\t"\
01539 "palignr $4, %%xmm0, %%xmm3 \n\t"\
01540 "palignr $4, %%xmm7, %%xmm8 \n\t"\
01541 "palignr $6, %%xmm0, %%xmm2 \n\t"\
01542 "palignr $6, %%xmm7, %%xmm6 \n\t"\
01543 "paddw %%xmm0 ,%%xmm11 \n\t"\
01544 "palignr $8, %%xmm0, %%xmm1 \n\t"\
01545 "palignr $8, %%xmm7, %%xmm0 \n\t"\
01546 "paddw %%xmm12,%%xmm7 \n\t"\
01547 "paddw %%xmm3, %%xmm2 \n\t"\
01548 "paddw %%xmm8, %%xmm6 \n\t"\
01549 "paddw %%xmm4, %%xmm1 \n\t"\
01550 "paddw %%xmm9, %%xmm0 \n\t"\
01551 "psllw $2, %%xmm2 \n\t"\
01552 "psllw $2, %%xmm6 \n\t"\
01553 "psubw %%xmm1, %%xmm2 \n\t"\
01554 "psubw %%xmm0, %%xmm6 \n\t"\
01555 "paddw %%xmm13,%%xmm11 \n\t"\
01556 "paddw %%xmm13,%%xmm7 \n\t"\
01557 "pmullw %%xmm14,%%xmm2 \n\t"\
01558 "pmullw %%xmm14,%%xmm6 \n\t"\
01559 "lddqu (%2), %%xmm3 \n\t"\
01560 "paddw %%xmm11,%%xmm2 \n\t"\
01561 "paddw %%xmm7, %%xmm6 \n\t"\
01562 "psraw $5, %%xmm2 \n\t"\
01563 "psraw $5, %%xmm6 \n\t"\
01564 "packuswb %%xmm2,%%xmm6 \n\t"\
01565 "pavgb %%xmm3, %%xmm6 \n\t"\
01566 OP(%%xmm6, (%1), %%xmm4, dqa)\
01567 "add %5, %0 \n\t"\
01568 "add %5, %1 \n\t"\
01569 "add %4, %2 \n\t"\
01570 "decl %3 \n\t"\
01571 "jg 1b \n\t"\
01572 : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
01573 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
01574 "m"(ff_pw_5), "m"(ff_pw_16)\
01575 : "memory"\
01576 );\
01577 }
01578 #else // ARCH_X86_64
01579 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
01580 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01581 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
01582 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
01583 src += 8*dstStride;\
01584 dst += 8*dstStride;\
01585 src2 += 8*src2Stride;\
01586 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
01587 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
01588 }
01589 #endif // ARCH_X86_64
01590
01591 #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
01592 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01593 int h=8;\
01594 __asm__ volatile(\
01595 "pxor %%xmm7, %%xmm7 \n\t"\
01596 "movdqa %0, %%xmm6 \n\t"\
01597 :: "m"(ff_pw_5)\
01598 );\
01599 do{\
01600 __asm__ volatile(\
01601 "lddqu -2(%0), %%xmm1 \n\t"\
01602 "movdqa %%xmm1, %%xmm0 \n\t"\
01603 "punpckhbw %%xmm7, %%xmm1 \n\t"\
01604 "punpcklbw %%xmm7, %%xmm0 \n\t"\
01605 "movdqa %%xmm1, %%xmm2 \n\t"\
01606 "movdqa %%xmm1, %%xmm3 \n\t"\
01607 "movdqa %%xmm1, %%xmm4 \n\t"\
01608 "movdqa %%xmm1, %%xmm5 \n\t"\
01609 "palignr $2, %%xmm0, %%xmm4 \n\t"\
01610 "palignr $4, %%xmm0, %%xmm3 \n\t"\
01611 "palignr $6, %%xmm0, %%xmm2 \n\t"\
01612 "palignr $8, %%xmm0, %%xmm1 \n\t"\
01613 "palignr $10,%%xmm0, %%xmm5 \n\t"\
01614 "paddw %%xmm5, %%xmm0 \n\t"\
01615 "paddw %%xmm3, %%xmm2 \n\t"\
01616 "paddw %%xmm4, %%xmm1 \n\t"\
01617 "psllw $2, %%xmm2 \n\t"\
01618 "movq (%2), %%xmm3 \n\t"\
01619 "psubw %%xmm1, %%xmm2 \n\t"\
01620 "paddw %5, %%xmm0 \n\t"\
01621 "pmullw %%xmm6, %%xmm2 \n\t"\
01622 "paddw %%xmm0, %%xmm2 \n\t"\
01623 "psraw $5, %%xmm2 \n\t"\
01624 "packuswb %%xmm2, %%xmm2 \n\t"\
01625 "pavgb %%xmm3, %%xmm2 \n\t"\
01626 OP(%%xmm2, (%1), %%xmm4, q)\
01627 "add %4, %0 \n\t"\
01628 "add %4, %1 \n\t"\
01629 "add %3, %2 \n\t"\
01630 : "+a"(src), "+c"(dst), "+d"(src2)\
01631 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
01632 "m"(ff_pw_16)\
01633 : "memory"\
01634 );\
01635 }while(--h);\
01636 }\
01637 QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
01638 \
01639 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01640 int h=8;\
01641 __asm__ volatile(\
01642 "pxor %%xmm7, %%xmm7 \n\t"\
01643 "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
01644 "1: \n\t"\
01645 "lddqu -2(%0), %%xmm1 \n\t"\
01646 "movdqa %%xmm1, %%xmm0 \n\t"\
01647 "punpckhbw %%xmm7, %%xmm1 \n\t"\
01648 "punpcklbw %%xmm7, %%xmm0 \n\t"\
01649 "movdqa %%xmm1, %%xmm2 \n\t"\
01650 "movdqa %%xmm1, %%xmm3 \n\t"\
01651 "movdqa %%xmm1, %%xmm4 \n\t"\
01652 "movdqa %%xmm1, %%xmm5 \n\t"\
01653 "palignr $2, %%xmm0, %%xmm4 \n\t"\
01654 "palignr $4, %%xmm0, %%xmm3 \n\t"\
01655 "palignr $6, %%xmm0, %%xmm2 \n\t"\
01656 "palignr $8, %%xmm0, %%xmm1 \n\t"\
01657 "palignr $10,%%xmm0, %%xmm5 \n\t"\
01658 "paddw %%xmm5, %%xmm0 \n\t"\
01659 "paddw %%xmm3, %%xmm2 \n\t"\
01660 "paddw %%xmm4, %%xmm1 \n\t"\
01661 "psllw $2, %%xmm2 \n\t"\
01662 "psubw %%xmm1, %%xmm2 \n\t"\
01663 "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
01664 "pmullw %%xmm6, %%xmm2 \n\t"\
01665 "paddw %%xmm0, %%xmm2 \n\t"\
01666 "psraw $5, %%xmm2 \n\t"\
01667 "packuswb %%xmm2, %%xmm2 \n\t"\
01668 OP(%%xmm2, (%1), %%xmm4, q)\
01669 "add %3, %0 \n\t"\
01670 "add %4, %1 \n\t"\
01671 "decl %2 \n\t"\
01672 " jnz 1b \n\t"\
01673 : "+a"(src), "+c"(dst), "+g"(h)\
01674 : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
01675 : "memory"\
01676 );\
01677 }\
01678 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01679 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
01680 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
01681 src += 8*srcStride;\
01682 dst += 8*dstStride;\
01683 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
01684 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
01685 }\
01686
#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
src -= 2*srcStride;\
\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movq (%0), %%xmm0 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm1 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm2 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm3 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm2 \n\t"\
"punpcklbw %%xmm7, %%xmm3 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
if(h==16){\
__asm__ volatile(\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
}\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}

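/* First pass of the centre (half-pel, half-pel) positions: the vertical
 * 6-tap filter is run over 8-pixel-wide column strips ((size+8)>>3 of them,
 * enough to cover the size+5 columns the second pass needs) and the
 * unclipped 16-bit sums are stored in tmp, rows spaced 48 bytes apart.
 * A scalar sketch of one stored value (the +16 rounding bias for the final
 * >>10 appears to be folded in here via ff_pw_16, so the second pass only
 * has to shift):
 *   tmp[y][x] = src[y-2][x] - 5*src[y-1][x] + 20*src[y  ][x]
 *             + 20*src[y+1][x] - 5*src[y+2][x] +    src[y+3][x] + 16;
 */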
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
int w = (size+8)>>3;
src -= 2*srcStride+2;
while(w--){
__asm__ volatile(
"pxor %%xmm7, %%xmm7 \n\t"
"movq (%0), %%xmm0 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm1 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm2 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm3 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm4 \n\t"
"add %2, %0 \n\t"
"punpcklbw %%xmm7, %%xmm0 \n\t"
"punpcklbw %%xmm7, %%xmm1 \n\t"
"punpcklbw %%xmm7, %%xmm2 \n\t"
"punpcklbw %%xmm7, %%xmm3 \n\t"
"punpcklbw %%xmm7, %%xmm4 \n\t"
QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
: "+a"(src)
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
: "memory"
);
if(size==16){
__asm__ volatile(
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
: "+a"(src)
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
: "memory"
);
}
tmp += 8;
src += 8 - (size+5)*srcStride;
}
}

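/* Second pass (QPEL_H264_HV2_XMM): the horizontal 6-tap filter applied to
 * the 16-bit rows produced above, reading 48 bytes per row.  The -5/+20 taps
 * are built without multiplies: with A = a+f, B = b+e, C = c+d the code
 * evaluates
 *   (((((A - B) >> 2) - B + C) >> 2) + C) >> 6
 * which matches (A - 5*B + 20*C) >> 10 up to the truncation of the
 * intermediate shifts, then packs and stores the clipped bytes via OP(). */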
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int h = size;\
if(size == 16){\
__asm__ volatile(\
"1: \n\t"\
"movdqa 32(%0), %%xmm4 \n\t"\
"movdqa 16(%0), %%xmm5 \n\t"\
"movdqa (%0), %%xmm7 \n\t"\
"movdqa %%xmm4, %%xmm3 \n\t"\
"movdqa %%xmm4, %%xmm2 \n\t"\
"movdqa %%xmm4, %%xmm1 \n\t"\
"movdqa %%xmm4, %%xmm0 \n\t"\
"palignr $10, %%xmm5, %%xmm0 \n\t"\
"palignr $8, %%xmm5, %%xmm1 \n\t"\
"palignr $6, %%xmm5, %%xmm2 \n\t"\
"palignr $4, %%xmm5, %%xmm3 \n\t"\
"palignr $2, %%xmm5, %%xmm4 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"movdqa %%xmm5, %%xmm6 \n\t"\
"movdqa %%xmm5, %%xmm4 \n\t"\
"movdqa %%xmm5, %%xmm3 \n\t"\
"palignr $8, %%xmm7, %%xmm4 \n\t"\
"palignr $2, %%xmm7, %%xmm6 \n\t"\
"palignr $10, %%xmm7, %%xmm3 \n\t"\
"paddw %%xmm6, %%xmm4 \n\t"\
"movdqa %%xmm5, %%xmm6 \n\t"\
"palignr $6, %%xmm7, %%xmm5 \n\t"\
"palignr $4, %%xmm7, %%xmm6 \n\t"\
"paddw %%xmm7, %%xmm3 \n\t"\
"paddw %%xmm6, %%xmm5 \n\t"\
\
"psubw %%xmm1, %%xmm0 \n\t"\
"psubw %%xmm4, %%xmm3 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psraw $2, %%xmm3 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"psubw %%xmm4, %%xmm3 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"paddw %%xmm5, %%xmm3 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psraw $2, %%xmm3 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"paddw %%xmm5, %%xmm3 \n\t"\
"psraw $6, %%xmm0 \n\t"\
"psraw $6, %%xmm3 \n\t"\
"packuswb %%xmm0, %%xmm3 \n\t"\
OP(%%xmm3, (%1), %%xmm7, dqa)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
}else{\
__asm__ volatile(\
"1: \n\t"\
"movdqa 16(%0), %%xmm1 \n\t"\
"movdqa (%0), %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $10, %%xmm0, %%xmm5 \n\t"\
"palignr $8, %%xmm0, %%xmm4 \n\t"\
"palignr $6, %%xmm0, %%xmm3 \n\t"\
"palignr $4, %%xmm0, %%xmm2 \n\t"\
"palignr $2, %%xmm0, %%xmm1 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"psraw $6, %%xmm0 \n\t"\
"packuswb %%xmm0, %%xmm0 \n\t"\
OP(%%xmm0, (%1), %%xmm7, q)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
}

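/* QPEL_H264_HV_XMM glues the two passes together: hv1 fills the int16_t
 * scratch buffer, hv2 filters it horizontally and writes the final pixels. */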
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

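/* There are no dedicated SSE2/SSSE3 versions of the l2/shift5 averaging
 * helpers in this file, so map those names onto the MMX2 implementations;
 * this lets the H264_MC_* macros below instantiate the SSE2/SSSE3 flavours
 * unchanged. */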
#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2

#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2

#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2

#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2

#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2

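/* H264_MC expands to the full set of sixteen quarter-pel motion compensation
 * entry points (mc00 .. mc33) for one block size and one CPU flavour; the
 * four sub-macros group them by how the quarter-pel sample is built. */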
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2

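/* In the mcXY names X is the horizontal and Y the vertical quarter-pel
 * offset: mc00 is a plain copy, mc20/mc02 are the half-pel filters, and the
 * remaining positions average two of {the unfiltered source, the
 * horizontally filtered plane, the vertically filtered plane, the
 * 2D-filtered plane}, as required by the H.264 interpolation rules. */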
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((intptr_t)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((intptr_t)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((intptr_t)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((intptr_t)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

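/* H264_MC_4816 instantiates put/avg MC functions for the 4x4, 8x8 and 16x16
 * block sizes of one MMX-register implementation; H264_MC_816 instantiates a
 * single sub-group at 8/16 only, which is how the XMM paths are filled in
 * below. */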
#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\


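/* Store/average operations plugged into the QPEL templates: PUT_OP (defined
 * earlier in this file) performs a plain store, the AVG variants average
 * with the bytes already at the destination using the packed byte-average
 * instruction available on each CPU. */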
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"

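/* Instantiate the QPEL code for each CPU generation.  PAVGB names the packed
 * byte-average instruction used inside the shared macros: pavgusb on 3DNow!,
 * pavgb from MMX2 onwards (the SSE2/SSSE3 expansions reuse the latter). */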
#define PAVGB "pavgusb"
QPEL_H264(put_, PUT_OP, 3dnow)
QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
#undef PAVGB
#define PAVGB "pavgb"
QPEL_H264(put_, PUT_OP, mmx2)
QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
#if HAVE_SSSE3
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
#endif
#undef PAVGB

H264_MC_4816(3dnow)
H264_MC_4816(mmx2)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
#if HAVE_SSSE3
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif


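/* Rounding constants for the chroma MC templates: 32 and 4 are the H.264
 * biases (2D and 1D filter cases respectively); 28 and 3 are the reduced
 * biases selected via h264_rnd_reg+2 for VC-1's "no rounding" mode. */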
DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = {
0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
};

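/* Chroma motion compensation via template inclusion: dsputil_h264_template_mmx.c
 * is compiled once per operation, with H264_CHROMA_OP/OP4 selecting "put"
 * (no-op) or "avg" (packed average) behaviour.  The underlying filter is the
 * H.264 bilinear chroma interpolation, roughly
 *   dst = ((8-x)*(8-y)*A + x*(8-y)*B + (8-x)*y*C + x*y*D + 32) >> 6
 * with A..D the four neighbouring source pixels and (x, y) the eighth-pel
 * offsets. */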
#define H264_CHROMA_OP(S,D)
#define H264_CHROMA_OP4(S,D,T)
#define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx
#define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx
#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
#include "dsputil_h264_template_mmx.c"

static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
}
static void put_vc1_chroma_mc8_mmx_nornd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg+2);
}
static void put_h264_chroma_mc4_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
}

#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC2_TMPL
#undef H264_CHROMA_MC8_MV0

#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
"pavgb " #T ", " #D " \n\t"
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_mmx2
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_mmx2
#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
#include "dsputil_h264_template_mmx.c"
static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
}
static void avg_vc1_chroma_mc8_mmx2_nornd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg+2);
}
static void avg_h264_chroma_mc4_mmx2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
}
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC2_TMPL
#undef H264_CHROMA_MC8_MV0

#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
"pavgusb " #T ", " #D " \n\t"
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_3dnow
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_3dnow
#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
#include "dsputil_h264_template_mmx.c"
static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
}
static void avg_h264_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
}
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0

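/* The SSSE3 chroma MC code uses a separate template; instead of a pointer
 * into the rounding table it takes the rounding mode as a final integer
 * argument (1 = H.264 rounding, 0 = the VC-1 no-rounding variant). */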
#if HAVE_SSSE3
#define AVG_OP(X)
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
#include "dsputil_h264_template_ssse3.c"
static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
}
static void put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
}

#undef AVG_OP
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0
#define AVG_OP(X) X
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
#include "dsputil_h264_template_ssse3.c"
static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
}
static void avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
}
#undef AVG_OP
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0
#endif




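/* Explicit weighted prediction with a single reference.  Per the H.264 spec
 * each sample becomes roughly
 *   dst = clip(((dst * weight + (1 << (log2_denom - 1))) >> log2_denom) + offset)
 * which the code below computes after pre-scaling the offset so that one
 * arithmetic shift (psraw by log2_denom) suffices. */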
static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
{
int x, y;
offset <<= log2_denom;
offset += (1 << log2_denom) >> 1;
__asm__ volatile(
"movd %0, %%mm4 \n\t"
"movd %1, %%mm5 \n\t"
"movd %2, %%mm6 \n\t"
"pshufw $0, %%mm4, %%mm4 \n\t"
"pshufw $0, %%mm5, %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
:: "g"(weight), "g"(offset), "g"(log2_denom)
);
for(y=0; y<h; y+=2){
for(x=0; x<w; x+=4){
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"movd %1, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"pmullw %%mm4, %%mm0 \n\t"
"pmullw %%mm4, %%mm1 \n\t"
"paddsw %%mm5, %%mm0 \n\t"
"paddsw %%mm5, %%mm1 \n\t"
"psraw %%mm6, %%mm0 \n\t"
"psraw %%mm6, %%mm1 \n\t"
"packuswb %%mm7, %%mm0 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
"movd %%mm0, %0 \n\t"
"movd %%mm1, %1 \n\t"
: "+m"(*(uint32_t*)(dst+x)),
"+m"(*(uint32_t*)(dst+x+stride))
);
}
dst += 2*stride;
}
}

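/* Bidirectional weighted prediction: combines the prediction already in dst
 * with the one in src, roughly
 *   dst = clip((dst*weightd + src*weights + offset') >> (log2_denom + 1))
 * where offset' is the pre-scaled, odd-rounded offset computed below. */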
static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
{
int x, y;
offset = ((offset + 1) | 1) << log2_denom;
__asm__ volatile(
"movd %0, %%mm3 \n\t"
"movd %1, %%mm4 \n\t"
"movd %2, %%mm5 \n\t"
"movd %3, %%mm6 \n\t"
"pshufw $0, %%mm3, %%mm3 \n\t"
"pshufw $0, %%mm4, %%mm4 \n\t"
"pshufw $0, %%mm5, %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
:: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
);
for(y=0; y<h; y++){
for(x=0; x<w; x+=4){
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"movd %1, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"pmullw %%mm3, %%mm0 \n\t"
"pmullw %%mm4, %%mm1 \n\t"
"paddsw %%mm1, %%mm0 \n\t"
"paddsw %%mm5, %%mm0 \n\t"
"psraw %%mm6, %%mm0 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"movd %%mm0, %0 \n\t"
: "+m"(*(uint32_t*)(dst+x))
: "m"(*(uint32_t*)(src+x))
);
}
src += stride;
dst += stride;
}
}

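/* Stamp out the fixed-size wrappers (16x16 down to 4x2) for weighted and
 * bidirectionally weighted prediction; these are the functions that get
 * hooked into the DSP function tables. */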
#define H264_WEIGHT(W,H) \
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
} \
static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16, 8)
H264_WEIGHT( 8,16)
H264_WEIGHT( 8, 8)
H264_WEIGHT( 8, 4)
H264_WEIGHT( 4, 8)
H264_WEIGHT( 4, 4)
H264_WEIGHT( 4, 2)
