/* Excerpt: the enclosing function header and the first asm statement's
 * operand list are outside this view. Two identical MMX passes follow; each
 * narrows 32 16-bit values read from %3 into 4 rows of 8 bytes written
 * through %0. */
/* Pass 1: load 64 bytes (8 quadwords) of 16-bit input from %3. */
47 "movq (%3), %%mm0 \n\t"
48 "movq 8(%3), %%mm1 \n\t"
49 "movq 16(%3), %%mm2 \n\t"
50 "movq 24(%3), %%mm3 \n\t"
51 "movq 32(%3), %%mm4 \n\t"
52 "movq 40(%3), %%mm5 \n\t"
53 "movq 48(%3), %%mm6 \n\t"
54 "movq 56(%3), %%mm7 \n\t"
/* packuswb narrows signed words to unsigned bytes with saturation, so each
 * register pair collapses into one 8-byte output row. */
55 "packuswb %%mm1, %%mm0 \n\t"
56 "packuswb %%mm3, %%mm2 \n\t"
57 "packuswb %%mm5, %%mm4 \n\t"
58 "packuswb %%mm7, %%mm6 \n\t"
/* Store rows at %0, %0 + %1, %0 + 2 * %1 and %0 + %2. */
59 "movq %%mm0, (%0) \n\t"
60 "movq %%mm2, (%0, %1) \n\t"
61 "movq %%mm4, (%0, %1, 2) \n\t"
62 "movq %%mm6, (%0, %2) \n\t"
/* Pass 2: the same load / pack / store sequence again. How pix and p are
 * advanced between the two passes is not visible here. */
73 "movq (%3), %%mm0 \n\t"
74 "movq 8(%3), %%mm1 \n\t"
75 "movq 16(%3), %%mm2 \n\t"
76 "movq 24(%3), %%mm3 \n\t"
77 "movq 32(%3), %%mm4 \n\t"
78 "movq 40(%3), %%mm5 \n\t"
79 "movq 48(%3), %%mm6 \n\t"
80 "movq 56(%3), %%mm7 \n\t"
81 "packuswb %%mm1, %%mm0 \n\t"
82 "packuswb %%mm3, %%mm2 \n\t"
83 "packuswb %%mm5, %%mm4 \n\t"
84 "packuswb %%mm7, %%mm6 \n\t"
85 "movq %%mm0, (%0) \n\t"
86 "movq %%mm2, (%0, %1) \n\t"
87 "movq %%mm4, (%0, %1, 2) \n\t"
88 "movq %%mm6, (%0, %2) \n\t"
89 ::
/* Inputs only: %0 = pix, %1 = line_size, %2 = 3 * line_size, %3 = p. */
"r"(pix),
"r"((
x86_reg)line_size),
"r"((
x86_reg)line_size * 3),
"r"(p)
/* Asm text for one group of four 8-byte rows.
 *
 * off is a byte offset applied to every load from %2. For each row the macro
 * loads 4 words, packs them together with the following 4 words using
 * packsswb (signed saturation to bytes), adds the byte vector in %%mm0, and
 * stores 8 bytes. Rows are written at %0, %0 + %3, %0 + 2 * %3 and %0 + %1,
 * so the user must supply the row stride in %3 and three times the stride in
 * %1. %%mm0 must be set up by the user before the macro text runs; its value
 * is not defined in this excerpt (presumably a per-byte bias -- confirm).
 * Uses %%mm1..%%mm4 as temporaries. */
93 #define put_signed_pixels_clamped_mmx_half(off) \
94 "movq "#off"(%2), %%mm1 \n\t" \
95 "movq 16 + "#off"(%2), %%mm2 \n\t" \
96 "movq 32 + "#off"(%2), %%mm3 \n\t" \
97 "movq 48 + "#off"(%2), %%mm4 \n\t" \
98 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
99 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
100 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
101 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
102 "paddb %%mm0, %%mm1 \n\t" \
103 "paddb %%mm0, %%mm2 \n\t" \
104 "paddb %%mm0, %%mm3 \n\t" \
105 "paddb %%mm0, %%mm4 \n\t" \
106 "movq %%mm1, (%0) \n\t" \
107 "movq %%mm2, (%0, %3) \n\t" \
108 "movq %%mm3, (%0, %3, 2) \n\t" \
109 "movq %%mm4, (%0, %1) \n\t"
/* Excerpt of an asm statement (function header and the %%mm0 setup are
 * outside this view). Emits two groups of four rows using
 * put_signed_pixels_clamped_mmx_half. */
/* %1 = %3 + 2 * %3, i.e. three times line_skip, as the macro expects. */
119 "lea (%3, %3, 2), %1 \n\t"
/* Rows 0..3 from byte offset 0 of the block. */
120 put_signed_pixels_clamped_mmx_half(0)
/* Advance the destination pointer by four rows (%0 += 4 * %3). */
121 "lea (%0, %3, 4), %0 \n\t"
/* Rows 4..7 from byte offset 64 of the block. */
122 put_signed_pixels_clamped_mmx_half(64)
123 :
/* Outputs: %0 = pixels (read and written, early-clobber),
 *          %1 = line_skip3 (write-only scratch, early-clobber). */
"+&r"(pixels),
"=&r"(line_skip3)
124 :
/* Inputs: %2 = block, %3 = line_skip. */
"r"(block),
"r"(line_skip)
/* Excerpt of a loop body (function header, loop construct and input operand
 * list are outside this view). Adds 16 16-bit values from %2 to two 8-byte
 * rows of pixels in place, with saturation.
 * NOTE(review): %%mm7 is used as the zero register for byte->word unpacking;
 * its initialisation is not visible here -- confirm it is cleared earlier. */
/* Load 16 words (two rows of 8) from %2. */
142 "movq (%2), %%mm0 \n\t"
143 "movq 8(%2), %%mm1 \n\t"
144 "movq 16(%2), %%mm2 \n\t"
145 "movq 24(%2), %%mm3 \n\t"
/* Load the two current pixel rows (memory operands %0 and %1). */
146 "movq %0, %%mm4 \n\t"
147 "movq %1, %%mm6 \n\t"
/* Row 0: widen bytes to words by interleaving with %%mm7, then add with
 * signed saturation. */
148 "movq %%mm4, %%mm5 \n\t"
149 "punpcklbw %%mm7, %%mm4 \n\t"
150 "punpckhbw %%mm7, %%mm5 \n\t"
151 "paddsw %%mm4, %%mm0 \n\t"
152 "paddsw %%mm5, %%mm1 \n\t"
/* Row 1: same widening and saturating add. */
153 "movq %%mm6, %%mm5 \n\t"
154 "punpcklbw %%mm7, %%mm6 \n\t"
155 "punpckhbw %%mm7, %%mm5 \n\t"
156 "paddsw %%mm6, %%mm2 \n\t"
157 "paddsw %%mm5, %%mm3 \n\t"
/* Narrow back to unsigned bytes with saturation and write both rows. */
158 "packuswb %%mm1, %%mm0 \n\t"
159 "packuswb %%mm3, %%mm2 \n\t"
160 "movq %%mm0, %0 \n\t"
161 "movq %%mm2, %1 \n\t"
162 :
/* Read-write memory operands: %0 = row at pix, %1 = row at pix + line_size. */
"+m"(*pix),
"+m"(*(pix + line_size))
/* Two rows were processed; step the destination down by two rows. */
165 pix += line_size * 2;
/* Generates a function `name(int16_t *blocks)` that stores zeros using MMX.
 *
 * Visible parts of the generated asm: %%mm7 is zeroed, REG_a is loaded from
 * operand %1, 32 bytes of zeros are stored at %0 + REG_a, and REG_a is
 * advanced by 32. Operand %0 is the byte address blocks + 128 * n.
 * NOTE(review): the loop label, the loop branch and operand %1 are missing
 * from this excerpt; presumably %1 is a negative byte count so that REG_a
 * counts up towards zero -- confirm against the full macro. */
170 #define CLEAR_BLOCKS(name, n) \
171 void name(int16_t *blocks) \
174 "pxor %%mm7, %%mm7 \n\t" \
175 "mov %1, %%"REG_a" \n\t" \
177 "movq %%mm7, (%0, %%"REG_a") \n\t" \
178 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
179 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
180 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
181 "add $32, %%"REG_a" \n\t" \
183 :: "r"(((uint8_t *)blocks) + 128 * n), \
194 "xorps %%xmm0, %%xmm0 \n"
/* Excerpt (function header and operand list are outside this view).
 * %%xmm0 was zeroed by the xorps above; the eight 16-byte stores below write
 * 128 bytes of zeros starting at %0. movaps faults on addresses that are not
 * 16-byte aligned, so %0 must be 16-byte aligned. */
195 "movaps %%xmm0, (%0) \n"
196 "movaps %%xmm0, 16(%0) \n"
197 "movaps %%xmm0, 32(%0) \n"
198 "movaps %%xmm0, 48(%0) \n"
199 "movaps %%xmm0, 64(%0) \n"
200 "movaps %%xmm0, 80(%0) \n"
201 "movaps %%xmm0, 96(%0) \n"
202 "movaps %%xmm0, 112(%0) \n"
/* Excerpt (function header, loop label, loop branch and operand %1 are
 * outside this view). Zeroes memory 128 bytes per iteration with SSE stores
 * addressed as %0 + REG_a. */
/* %%xmm0 = 0. */
211 "xorps %%xmm0, %%xmm0 \n"
/* REG_a = initial byte index taken from operand %1.
 * NOTE(review): presumably a negative count relative to %0 -- confirm. */
212 "mov %1, %%"REG_a
" \n"
/* Eight aligned 16-byte stores = 128 bytes of zeros at %0 + REG_a. */
214 "movaps %%xmm0, (%0, %%"REG_a
") \n"
215 "movaps %%xmm0, 16(%0, %%"REG_a
") \n"
216 "movaps %%xmm0, 32(%0, %%"REG_a
") \n"
217 "movaps %%xmm0, 48(%0, %%"REG_a
") \n"
218 "movaps %%xmm0, 64(%0, %%"REG_a
") \n"
219 "movaps %%xmm0, 80(%0, %%"REG_a
") \n"
220 "movaps %%xmm0, 96(%0, %%"REG_a
") \n"
221 "movaps %%xmm0, 112(%0, %%"REG_a
") \n"
/* Advance the byte index by 128. */
222 "add $128, %%"REG_a
" \n"
224 ::
/* %0 = byte address 128 * 6 = 768 bytes past blocks. */
"r"(((
uint8_t *)blocks) + 128 * 6),
/* Excerpt (function header, loop label/branch and the output operand list are
 * outside this view). Adds 16 bytes per iteration from one buffer into
 * another with wrapping byte addition (paddb).
 * NOTE(review): %0 is presumably the running byte index declared as the
 * output operand on a line not shown here -- confirm. */
/* Bytes 0..7: (%2 + %0)[0..7] += (%1 + %0)[0..7]. */
236 "movq (%1, %0), %%mm0 \n\t"
237 "movq (%2, %0), %%mm1 \n\t"
238 "paddb %%mm0, %%mm1 \n\t"
239 "movq %%mm1, (%2, %0) \n\t"
/* Bytes 8..15: same operation at offset 8. */
240 "movq 8(%1, %0), %%mm0 \n\t"
241 "movq 8(%2, %0), %%mm1 \n\t"
242 "paddb %%mm0, %%mm1 \n\t"
243 "movq %%mm1, 8(%2, %0) \n\t"
249 :
/* Inputs, in order: src, dst, and w - 15. With one preceding output operand
 * these would be %1, %2 and %3. */
"r"(src),
"r"(dst),
"r"((
x86_reg)w - 15)
/* Scalar form of the same update for a single element. */
252 dst[i + 0] += src[i + 0];
/* Excerpt: only the last two parameters, two local initialisers, three asm
 * instructions and the operand lists are visible. */
258 int *left,
int *left_top)
/* Start from the low 8 bits of the caller's *left and *left_top values. */
262 int l = *left & 0xff;
263 int tl = *left_top & 0xff;
/* Zero-extending byte load from %3 + %4 into %2 (t). */
268 "movzbl (%3, %4), %2 \n"
/* Add the byte at %6 + %4 (diff + w, indexed by w2) to the low byte of %0
 * (l), then store that byte at %5 + %4 (dst + w, indexed by w2). */
281 "add (%6, %4), %b0 \n"
282 "mov %b0, (%5, %4) \n"
285 :
/* Outputs: %0 = l, %1 = tl, %2 = t, %3 = x, %4 = w2. The "q" constraint
 * requests a register whose low byte is addressable, as needed by %b0. */
"+&q"(l),
"+&q"(tl),
"=&r"(
t),
"=&q"(x),
"+&r"(w2)
286 :
/* Inputs: %5 = dst + w, %6 = diff + w, %7 = top + w. */
"r"(dst + w),
"r"(diff + w),
"rm"(top + w)
/* Excerpt: the function name, leading parameters, branch conditions, loop
 * labels and most operand lists are outside this view. The visible pieces
 * replicate border pixels to the left/right of each row and copy rows above
 * and below the picture. */
296 int w,
int h,
int sides)
/* Address of the first pixel of the bottom row. */
301 last_line = buf + (height - 1) * wrap;
/* 8-byte variant: broadcast the byte at (%0) to all 8 lanes and store it in
 * the 8 bytes before %0; then broadcast the top byte of the quadword ending
 * at %0 + %2 and store it in the 8 bytes starting at %0 + %2. */
307 "movd (%0), %%mm0 \n\t"
308 "punpcklbw %%mm0, %%mm0 \n\t"
309 "punpcklwd %%mm0, %%mm0 \n\t"
310 "punpckldq %%mm0, %%mm0 \n\t"
311 "movq %%mm0, -8(%0) \n\t"
312 "movq -8(%0, %2), %%mm1 \n\t"
313 "punpckhbw %%mm1, %%mm1 \n\t"
314 "punpckhwd %%mm1, %%mm1 \n\t"
315 "punpckhdq %%mm1, %%mm1 \n\t"
316 "movq %%mm1, (%0, %2) \n\t"
/* 16-byte variant: same broadcasts, each written twice to cover 16 bytes on
 * either side. */
326 "movd (%0), %%mm0 \n\t"
327 "punpcklbw %%mm0, %%mm0 \n\t"
328 "punpcklwd %%mm0, %%mm0 \n\t"
329 "punpckldq %%mm0, %%mm0 \n\t"
330 "movq %%mm0, -8(%0) \n\t"
331 "movq %%mm0, -16(%0) \n\t"
332 "movq -8(%0, %2), %%mm1 \n\t"
333 "punpckhbw %%mm1, %%mm1 \n\t"
334 "punpckhwd %%mm1, %%mm1 \n\t"
335 "punpckhdq %%mm1, %%mm1 \n\t"
336 "movq %%mm1, (%0, %2) \n\t"
337 "movq %%mm1, 8(%0, %2) \n\t"
/* 4-byte variant: the low dword receives four copies of the byte at (%0) and
 * is stored in the 4 bytes before %0; on the other side only 4 bytes are
 * loaded, the last of them is broadcast, and 4 bytes are stored at %0 + %2. */
348 "movd (%0), %%mm0 \n\t"
349 "punpcklbw %%mm0, %%mm0 \n\t"
350 "punpcklwd %%mm0, %%mm0 \n\t"
351 "movd %%mm0, -4(%0) \n\t"
352 "movd -4(%0, %2), %%mm1 \n\t"
353 "punpcklbw %%mm1, %%mm1 \n\t"
354 "punpckhwd %%mm1, %%mm1 \n\t"
355 "punpckhdq %%mm1, %%mm1 \n\t"
356 "movd %%mm1, (%0, %2) \n\t"
/* Rows above the picture, four rows per outer iteration. ptr starts w bytes
 * left of the row (i + 1) rows above buf. */
367 for (i = 0; i < h; i += 4) {
368 ptr = buf - (i + 1) * wrap - w;
/* Load 8 bytes from %1 + %0 and store them to four rows: %0, %0 + %2,
 * %0 + 2 * %2 and %0 + %3. */
371 "movq (%1, %0), %%mm0 \n\t"
372 "movq %%mm0, (%0) \n\t"
373 "movq %%mm0, (%0, %2) \n\t"
374 "movq %%mm0, (%0, %2, 2) \n\t"
375 "movq %%mm0, (%0, %3) \n\t"
/* Visible operands: -3 * wrap and ptr + width + 2 * w. The other operands
 * and their numbering are outside this excerpt. */
381 "r"((
x86_reg) -wrap * 3),
"r"(ptr + width + 2 * w)
/* Rows below the picture: same four-row copy, with ptr starting w bytes left
 * of the row (i + 1) rows below last_line. */
387 for (i = 0; i < h; i += 4) {
388 ptr = last_line + (i + 1) * wrap - w;
391 "movq (%1, %0), %%mm0 \n\t"
392 "movq %%mm0, (%0) \n\t"
393 "movq %%mm0, (%0, %2) \n\t"
394 "movq %%mm0, (%0, %2, 2) \n\t"
395 "movq %%mm0, (%0, %3) \n\t"
402 "r"(ptr + width + 2 * w)
/* Function type (not a pointer type) for the edge-emulation helper that
 * gmc() receives as `emulated_edge_mc_func *emu_edge_fn`.
 *
 * gmc() invokes it as
 *   emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height)
 * so the arguments are: destination buffer, source pointer, line size in
 * bytes, block width and height, block position (src_x, src_y), and the
 * source picture dimensions (w, h). */
408 typedef void emulated_edge_mc_func(
uint8_t *dst,
const uint8_t *src,
409 ptrdiff_t linesize,
int block_w,
int block_h,
410 int src_x,
int src_y,
int w,
int h);
/* gmc: MMX global motion compensation for one block (excerpt; the function
 * name, its first parameters, the declaration of the block width `w`, MAX_H
 * and several braces / operand lists are outside this view).
 *
 * ox, oy             start position of the block in the source
 * dxx, dxy, dyx, dyy per-pixel increments of the source position
 * shift, r           precision shift and rounding value of the interpolation
 * width, height      dimensions of the source picture
 * emu_edge_fn        helper that builds a padded copy of the source area in
 *                    edge_buf when the block reaches outside the picture
 *
 * Cases the MMX path cannot handle are delegated to ff_gmc_c(). */
413 int stride,
int h,
int ox,
int oy,
414 int dxx,
int dxy,
int dyx,
int dyy,
415 int shift,
int r,
int width,
int height,
416 emulated_edge_mc_func *emu_edge_fn)
/* Integer pixel position of the block origin. */
419 const int ix = ox >> (16 +
shift);
420 const int iy = oy >> (16 +
shift);
/* Positions and increments with the low 4 bits dropped; these are the values
 * the 16-bit MMX lanes work with. */
421 const int oxs = ox >> 4;
422 const int oys = oy >> 4;
423 const int dxxs = dxx >> 4;
424 const int dxys = dxy >> 4;
425 const int dyxs = dyx >> 4;
426 const int dyys = dyy >> 4;
/* Four-lane copies of the rounding value and of the per-row increments. */
427 const uint16_t r4[4] = {
r,
r,
r, r };
428 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
429 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
/* Largest stride for which edge_buf can hold an emulated copy. */
431 #define MAX_STRIDE 4096U
433 uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
/* Drift of the source position, relative to a plain one-pixel step, across
 * the block's width (dxw, dyw) and height (dxh, dyh). */
436 const int dxw = (dxx - (1 << (16 +
shift))) * (w - 1);
437 const int dyh = (dyy - (1 << (16 +
shift))) * (h - 1);
438 const int dxh = dxy * (h - 1);
439 const int dyw = dyx * (w - 1);
/* The block needs edge emulation when its origin lies outside
 * [0, width - w) x [0, height - h). The explicit width < w / height < h terms
 * are required: when the picture is smaller than the block, width - w (or
 * height - h) is negative and is converted to a huge unsigned value by the
 * comparison, so the range test alone would wrongly report that no emulation
 * is needed and the code below would read outside the picture. */
440 int need_emu = (unsigned)ix >= width - w || width < w ||
441 (
unsigned)iy >= height - h || height < h;
/* Fallback conditions (the `if (` line itself is outside this excerpt):
 *  - the integer part of the position differs at any block corner,
 *  - any increment has non-zero low 4 bits (lost by the >> 4 above),
 *  - emulation is needed but the block does not fit into edge_buf. */
444 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
445 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 +
shift)
447 || (dxx | dxy | dyx | dyy) & 15
448 || (need_emu && (h > MAX_H ||
stride > MAX_STRIDE))) {
450 ff_gmc_c(dst, src,
stride, h, ox, oy, dxx, dxy, dyx, dyy,
/* Build a (w + 1) x (h + 1) padded copy of the source area in edge_buf. */
457 emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
/* %%mm6 = operand %0 broadcast to all four words, %%mm7 = 0.
 * NOTE(review): operand %0 is not visible here; presumably the weight scale
 * derived from shift -- confirm. */
462 "movd %0, %%mm6 \n\t"
463 "pxor %%mm7, %%mm7 \n\t"
464 "punpcklwd %%mm6, %%mm6 \n\t"
465 "punpcklwd %%mm6, %%mm6 \n\t"
/* Process the block in columns of 4 pixels. dx4 / dy4 start one row step
 * back (- dxys / - dyys) because the row loop adds the row increment before
 * using the value. */
469 for (x = 0; x < w; x += 4) {
470 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
471 oxs - dxys + dxxs * (x + 1),
472 oxs - dxys + dxxs * (x + 2),
473 oxs - dxys + dxxs * (x + 3) };
474 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
475 oys - dyys + dyxs * (x + 1),
476 oys - dyys + dyxs * (x + 2),
477 oys - dyys + dyxs * (x + 3) };
479 for (
y = 0;
y < h;
y++) {
/* Advance dx4 / dy4 by one row, write them back, and keep the top 4 bits of
 * each 16-bit lane as this row's weights (%%mm4 = dx, %%mm5 = dy). */
481 "movq %0, %%mm4 \n\t"
482 "movq %1, %%mm5 \n\t"
483 "paddw %2, %%mm4 \n\t"
484 "paddw %3, %%mm5 \n\t"
485 "movq %%mm4, %0 \n\t"
486 "movq %%mm5, %1 \n\t"
487 "psrlw $12, %%mm4 \n\t"
488 "psrlw $12, %%mm5 \n\t"
489 :
"+m"(*dx4),
"+m"(*dy4)
490 :
"m"(*dxy4),
"m"(*dyy4)
/* Bilinear weights, with s denoting the value broadcast in %%mm6:
 *   %%mm0 = (s - dx) * (s - dy)   %%mm1 = dx * (s - dy)
 *   %%mm2 = (s - dx) * dy         %%mm3 = dx * dy */
494 "movq %%mm6, %%mm2 \n\t"
495 "movq %%mm6, %%mm1 \n\t"
496 "psubw %%mm4, %%mm2 \n\t"
497 "psubw %%mm5, %%mm1 \n\t"
498 "movq %%mm2, %%mm0 \n\t"
499 "movq %%mm4, %%mm3 \n\t"
500 "pmullw %%mm1, %%mm0 \n\t"
501 "pmullw %%mm5, %%mm3 \n\t"
502 "pmullw %%mm5, %%mm2 \n\t"
503 "pmullw %%mm4, %%mm1 \n\t"
/* Lower row: 4 pixels from %4 (src[stride + 1]) and %3 (src[stride]),
 * widened to words and multiplied by their weights. */
505 "movd %4, %%mm5 \n\t"
506 "movd %3, %%mm4 \n\t"
507 "punpcklbw %%mm7, %%mm5 \n\t"
508 "punpcklbw %%mm7, %%mm4 \n\t"
509 "pmullw %%mm5, %%mm3 \n\t"
510 "pmullw %%mm4, %%mm2 \n\t"
/* Upper row: 4 pixels from %2 (src[1]) and %1 (src[0]); then add operand %5
 * and sum the four weighted terms. */
512 "movd %2, %%mm5 \n\t"
513 "movd %1, %%mm4 \n\t"
514 "punpcklbw %%mm7, %%mm5 \n\t"
515 "punpcklbw %%mm7, %%mm4 \n\t"
516 "pmullw %%mm5, %%mm1 \n\t"
517 "pmullw %%mm4, %%mm0 \n\t"
518 "paddw %5, %%mm1 \n\t"
519 "paddw %%mm3, %%mm2 \n\t"
520 "paddw %%mm1, %%mm0 \n\t"
521 "paddw %%mm2, %%mm0 \n\t"
/* Scale down by operand %6, saturate to bytes, store 4 pixels to operand %0.
 * Operands %0, %5 and %6 are declared on lines outside this excerpt. */
523 "psrlw %6, %%mm0 \n\t"
524 "packuswb %%mm0, %%mm0 \n\t"
525 "movd %%mm0, %0 \n\t"
528 :
"m"(src[0]),
"m"(src[1]),
529 "m"(src[stride]),
"m"(src[stride + 1]),
/* First of three visible wrappers around gmc() (its name and leading
 * parameters are outside this excerpt). Forwards every argument unchanged and
 * selects ff_emulated_edge_mc_8 as the edge-emulation helper. */
542 int stride,
int h,
int ox,
int oy,
543 int dxx,
int dxy,
int dyx,
int dyy,
544 int shift,
int r,
int width,
int height)
546 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
547 width, height, &ff_emulated_edge_mc_8);
/* Second visible wrapper around gmc() (its name and leading parameters are
 * outside this excerpt). As visible here it forwards every argument
 * unchanged with ff_emulated_edge_mc_8 as the edge-emulation helper. */
551 int stride,
int h,
int ox,
int oy,
552 int dxx,
int dxy,
int dyx,
int dyy,
553 int shift,
int r,
int width,
int height)
555 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
556 width, height, &ff_emulated_edge_mc_8);
/* Third visible wrapper around gmc() (its name and leading parameters are
 * outside this excerpt). As visible here it forwards every argument
 * unchanged with ff_emulated_edge_mc_8 as the edge-emulation helper. */
560 int stride,
int h,
int ox,
int oy,
561 int dxx,
int dxy,
int dyx,
int dyy,
562 int shift,
int r,
int width,
int height)
564 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
565 width, height, &ff_emulated_edge_mc_8);
/* Dirac pixel-operation wrappers, compiled only when CONFIG_DIRAC_DECODER is
 * set.
 *
 * DIRAC_PIXOP(OPNAME2, OPNAME, EXT) defines three functions,
 * ff_<OPNAME2>_dirac_pixels{8,16,32}_<EXT>(dst, src[5], stride, h). Each one
 * contains a call to the matching ff_<OPNAME2>_dirac_pixelsN_c() function and
 * a call to <OPNAME>_pixelsN_<EXT>() on src[0]; the 32-wide version issues
 * two 16-wide calls, at dst and at dst + 16.
 * NOTE(review): the condition that selects between the _c call and the
 * optimised call is on lines missing from this excerpt -- confirm against the
 * full macro. */
570 #if CONFIG_DIRAC_DECODER
571 #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
572 void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
575 ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
577 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
579 void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
582 ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
584 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
586 void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
589 ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
591 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
592 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
597 PIXELS16(
static, ff_avg, , , _mmxext)
/* Instantiations: put/avg for mmx and avg for mmxext are visible here. */
598 DIRAC_PIXOP(put, ff_put, mmx)
599 DIRAC_PIXOP(avg, ff_avg, mmx)
603 DIRAC_PIXOP(avg, ff_avg, mmxext)
/* Calls to the C versions taken from four further functions whose headers
 * and remaining bodies are outside this excerpt. */
608 ff_put_dirac_pixels16_c(dst, src, stride, h);
615 ff_avg_dirac_pixels16_c(dst, src, stride, h);
622 ff_put_dirac_pixels32_c(dst, src, stride, h);
631 ff_avg_dirac_pixels32_c(dst, src, stride, h);
/* Excerpt: the function name, leading parameters, loop label/branch and the
 * output operand list are outside this view. Clamps floats to [min, max],
 * 16 values (64 bytes) per iteration, using SSE. */
641 float min,
float max,
int len)
/* Broadcast min into all four lanes of %%xmm4 and max into %%xmm5. */
645 "movss %3, %%xmm4 \n\t"
646 "movss %4, %%xmm5 \n\t"
647 "shufps $0, %%xmm4, %%xmm4 \n\t"
648 "shufps $0, %%xmm5, %%xmm5 \n\t"
/* Aligned loads of 4 x 16 bytes from %2 + %0. */
650 "movaps (%2, %0), %%xmm0 \n\t"
651 "movaps 16(%2, %0), %%xmm1 \n\t"
652 "movaps 32(%2, %0), %%xmm2 \n\t"
653 "movaps 48(%2, %0), %%xmm3 \n\t"
/* Lower bound: lane-wise max with the broadcast min. */
654 "maxps %%xmm4, %%xmm0 \n\t"
655 "maxps %%xmm4, %%xmm1 \n\t"
656 "maxps %%xmm4, %%xmm2 \n\t"
657 "maxps %%xmm4, %%xmm3 \n\t"
/* Upper bound: lane-wise min with the broadcast max. */
658 "minps %%xmm5, %%xmm0 \n\t"
659 "minps %%xmm5, %%xmm1 \n\t"
660 "minps %%xmm5, %%xmm2 \n\t"
661 "minps %%xmm5, %%xmm3 \n\t"
/* Aligned stores of the clamped values to %1 + %0. */
662 "movaps %%xmm0, (%1, %0) \n\t"
663 "movaps %%xmm1, 16(%1, %0) \n\t"
664 "movaps %%xmm2, 32(%1, %0) \n\t"
665 "movaps %%xmm3, 48(%1, %0) \n\t"
669 :
/* Inputs, in order: dst, src, min, max. With one preceding output operand
 * (%0, presumably the byte index -- not visible here) these are %1..%4,
 * which matches the loads from %2 and the stores to %1 above. */
"r"(dst),
"r"(
src),
"m"(min),
"m"(max)