[FFmpeg-devel] [PATCH] x86/dsputil: port clear_block functions to yasm
James Almer
jamrial at gmail.com
Wed May 21 08:53:31 CEST 2014
Signed-off-by: James Almer <jamrial at gmail.com>
---
libavcodec/x86/dsputil.asm | 60 +++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/dsputil_init.c | 13 ++++++----
libavcodec/x86/dsputil_mmx.c | 55 ---------------------------------------
3 files changed, 68 insertions(+), 60 deletions(-)
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index c1ea9bf..c91dd8e 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -513,3 +513,63 @@ BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF
+
+;----------------------------------------
+; void ff_clear_block(int16_t *blocks);
+;----------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline store loops
+%macro CLEAR_BLOCK 2
+cglobal clear_block, 1, 1, %1, blocks
+ ZERO m0, m0 ; m0 = all-zero; ZERO is %defined (pxor / xorps) before each CLEAR_BLOCK instantiation
+%assign %%i 0
+%rep %2
+ mova [blocksq+mmsize*(0+%%i)], m0 ; each %rep pass emits 8 stores of mmsize bytes each,
+ mova [blocksq+mmsize*(1+%%i)], m0 ; unrolled at assembly time (no runtime loop);
+ mova [blocksq+mmsize*(2+%%i)], m0 ; in total %2 * 8 * mmsize bytes starting at blocksq are zeroed
+ mova [blocksq+mmsize*(3+%%i)], m0
+ mova [blocksq+mmsize*(4+%%i)], m0
+ mova [blocksq+mmsize*(5+%%i)], m0
+ mova [blocksq+mmsize*(6+%%i)], m0
+ mova [blocksq+mmsize*(7+%%i)], m0 ; mova = aligned store, so blocksq is presumably required to be mmsize-aligned
+%assign %%i %%i+8
+%endrep
+ RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCK 0, 2
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCK 1, 1
+
+;-----------------------------------------
+; void ff_clear_blocks(int16_t *blocks);
+;-----------------------------------------
+; %1 = number of xmm registers used
+%macro CLEAR_BLOCKS 1
+cglobal clear_blocks, 1, 2, %1, blocks, len
+ add blocksq, 768 ; point blocksq one past the 768-byte region to be cleared
+ mov lenq, -768 ; negative byte offset from that end; counts up towards zero
+ ZERO m0, m0 ; m0 = all-zero; ZERO is %defined (pxor / xorps) before each instantiation
+.loop:
+ mova [blocksq+lenq+mmsize*0], m0 ; 8 aligned stores of mmsize bytes per iteration
+ mova [blocksq+lenq+mmsize*1], m0
+ mova [blocksq+lenq+mmsize*2], m0
+ mova [blocksq+lenq+mmsize*3], m0
+ mova [blocksq+lenq+mmsize*4], m0
+ mova [blocksq+lenq+mmsize*5], m0
+ mova [blocksq+lenq+mmsize*6], m0
+ mova [blocksq+lenq+mmsize*7], m0
+ add lenq, mmsize*8 ; advance by the bytes just written; sets SF while the offset is still negative
+ js .loop ; repeat until the offset reaches zero (whole 768 bytes cleared)
+ RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCKS 0
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCKS 1
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 414da14..4461ae4 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -534,8 +534,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
if (!high_bit_depth) {
- c->clear_block = ff_clear_block_mmx;
- c->clear_blocks = ff_clear_blocks_mmx;
c->draw_edges = ff_draw_edges_mmx;
}
@@ -547,6 +545,10 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMX_EXTERNAL
+ if (!high_bit_depth) {
+ c->clear_block = ff_clear_block_mmx;
+ c->clear_blocks = ff_clear_blocks_mmx;
+ }
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_MMX_EXTERNAL */
}
@@ -585,7 +587,10 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
{
#if HAVE_SSE_INLINE
c->vector_clipf = ff_vector_clipf_sse;
+#endif /* HAVE_SSE_INLINE */
+#if HAVE_YASM
+#if HAVE_SSE_EXTERNAL
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
return;
@@ -594,9 +599,7 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
c->clear_block = ff_clear_block_sse;
c->clear_blocks = ff_clear_blocks_sse;
}
-#endif /* HAVE_SSE_INLINE */
-
-#if HAVE_YASM
+#endif
#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
c->gmc = ff_gmc_sse;
#endif
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 42e25c4..a9c584d 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -172,61 +172,6 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
} while (--i);
}
-#define CLEAR_BLOCKS(name, n) \
-void name(int16_t *blocks) \
-{ \
- __asm__ volatile ( \
- "pxor %%mm7, %%mm7 \n\t" \
- "mov $-"#n", %%"REG_a" \n\t" \
- "1: \n\t" \
- "movq %%mm7, (%0, %%"REG_a") \n\t" \
- "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
- "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
- "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
- "add $32, %%"REG_a" \n\t" \
- "js 1b \n\t" \
- :: "r"(((uint8_t *) blocks) + n) \
- : "%"REG_a); \
-}
-CLEAR_BLOCKS(ff_clear_blocks_mmx, 768)
-CLEAR_BLOCKS(ff_clear_block_mmx, 128)
-
-void ff_clear_block_sse(int16_t *block)
-{
- __asm__ volatile (
- "xorps %%xmm0, %%xmm0 \n"
- "movaps %%xmm0, (%0) \n"
- "movaps %%xmm0, 16(%0) \n"
- "movaps %%xmm0, 32(%0) \n"
- "movaps %%xmm0, 48(%0) \n"
- "movaps %%xmm0, 64(%0) \n"
- "movaps %%xmm0, 80(%0) \n"
- "movaps %%xmm0, 96(%0) \n"
- "movaps %%xmm0, 112(%0) \n"
- :: "r" (block)
- : "memory");
-}
-
-void ff_clear_blocks_sse(int16_t *blocks)
-{
- __asm__ volatile (
- "xorps %%xmm0, %%xmm0 \n"
- "mov $-768, %%"REG_a" \n"
- "1: \n"
- "movaps %%xmm0, (%0, %%"REG_a") \n"
- "movaps %%xmm0, 16(%0, %%"REG_a") \n"
- "movaps %%xmm0, 32(%0, %%"REG_a") \n"
- "movaps %%xmm0, 48(%0, %%"REG_a") \n"
- "movaps %%xmm0, 64(%0, %%"REG_a") \n"
- "movaps %%xmm0, 80(%0, %%"REG_a") \n"
- "movaps %%xmm0, 96(%0, %%"REG_a") \n"
- "movaps %%xmm0, 112(%0, %%"REG_a") \n"
- "add $128, %%"REG_a" \n"
- "js 1b \n"
- :: "r"(((uint8_t *) blocks) + 128 * 6)
- : "%"REG_a);
-}
-
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
x86_reg i = 0;
--
1.8.5.5
More information about the ffmpeg-devel
mailing list