[FFmpeg-devel] [PATCH] dnxhdenc: get_pixels_8x4_sym_10bit_sse2
Timothy Gu
timothygu99 at gmail.com
Wed Apr 9 05:42:56 CEST 2014
Before:
3383 decicycles in dnxhd_10bit_get_pixels_8x4_sym, 130910 runs, 162 skips
After:
750 decicycles in ff_get_pixels_8x4_sym_10bit_sse2, 130999 runs, 73 skips
The overall performance impact is negligible.
Signed-off-by: Timothy Gu <timothygu99 at gmail.com>
---
libavcodec/x86/dnxhdenc.asm | 41 +++++++++++++++++++++++++++++------------
libavcodec/x86/dnxhdenc_init.c | 4 ++++
2 files changed, 33 insertions(+), 12 deletions(-)
diff --git a/libavcodec/x86/dnxhdenc.asm b/libavcodec/x86/dnxhdenc.asm
index 9dd6d51..d42530b 100644
--- a/libavcodec/x86/dnxhdenc.asm
+++ b/libavcodec/x86/dnxhdenc.asm
@@ -26,18 +26,30 @@ section .text
; void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
; ptrdiff_t line_size)
-INIT_XMM sse2
-cglobal get_pixels_8x4_sym, 3,3,5, block, pixels, linesize
- pxor m4, m4
- movq m0, [pixelsq]
- add pixelsq, linesizeq
- movq m1, [pixelsq]
- movq m2, [pixelsq+linesizeq]
- movq m3, [pixelsq+linesizeq*2]
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpcklbw m3, m4
+
+%macro GET_PIXELS 1
+%if %1 == 8
+cglobal get_pixels_8x4_sym, 3,3,5, block, pixels, linesize
+%elif %1 == 16
+cglobal get_pixels_8x4_sym_10bit, 3,3,4, block, pixels, linesize
+%endif
+ %if %1 == mmsize/2
+ pxor m4, m4
+ %define LOAD movh
+ %elif %1 == mmsize && %1 == 16
+ %define LOAD movu
+ %endif
+ LOAD m0, [pixelsq]
+ add pixelsq, linesizeq
+ LOAD m1, [pixelsq]
+ LOAD m2, [pixelsq+linesizeq]
+ LOAD m3, [pixelsq+linesizeq*2]
+ %if %1 == mmsize/2
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ %endif
mova [blockq ], m0
mova [blockq+16 ], m1
mova [blockq+32 ], m2
@@ -47,3 +59,8 @@ cglobal get_pixels_8x4_sym, 3,3,5, block, pixels, linesize
mova [blockq+96 ], m1
mova [blockq+112], m0
RET
+%endmacro
+
+INIT_XMM sse2
+GET_PIXELS 8
+GET_PIXELS 16
diff --git a/libavcodec/x86/dnxhdenc_init.c b/libavcodec/x86/dnxhdenc_init.c
index 3b02264..c1c4a8b 100644
--- a/libavcodec/x86/dnxhdenc_init.c
+++ b/libavcodec/x86/dnxhdenc_init.c
@@ -27,6 +27,8 @@
void ff_get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
ptrdiff_t line_size);
+void ff_get_pixels_8x4_sym_10bit_sse2(int16_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size);
av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
{
@@ -34,6 +36,8 @@ av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
if (EXTERNAL_SSE2(av_get_cpu_flags())) {
if (ctx->cid_table->bit_depth == 8)
ctx->get_pixels_8x4_sym = ff_get_pixels_8x4_sym_sse2;
+ else if (ctx->cid_table->bit_depth == 10)
+ ctx->get_pixels_8x4_sym = ff_get_pixels_8x4_sym_10bit_sse2;
}
#endif /* HAVE_SSE2_EXTERNAL */
}
--
1.8.3.2
More information about the ffmpeg-devel mailing list