[FFmpeg-devel] [PATCH 3/4] huffyuvencdsp: Add ff_diff_bytes_sse2

Timothy Gu timothygu99 at gmail.com
Mon Oct 19 22:00:45 CEST 2015


ff_diff_bytes_sse2 is 4% to 35% faster, depending on the width.
---
 libavcodec/x86/huffyuvencdsp.asm   | 31 ++++++++++++++++++++-----------
 libavcodec/x86/huffyuvencdsp_mmx.c |  8 +++++++-
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index 97de7e9..9625fbe 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -27,27 +27,27 @@
 
 section .text
 
-INIT_MMX mmx
 ; void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
 ;                        intptr_t w);
-cglobal diff_bytes, 4,6,0, dst, src1, src2, w, i
+%macro DIFF_BYTES 0
+cglobal diff_bytes, 4,6,2, dst, src1, src2, w, i
     xor               iq, iq
-    cmp               wq, 16
+    cmp               wq, mmsize * 2
         jb        .loop2
-    sub               wq, 15
+    sub               wq, mmsize * 2 - 1
 .loop:
-    mova              m0, [src2q + iq]
-    mova              m1, [src1q + iq]
+    movu              m0, [src2q + iq]
+    movu              m1, [src1q + iq]
     psubb             m1, m0
     mova     [iq + dstq], m1
-    mova              m0, [src2q + iq + 8]
-    mova              m1, [src1q + iq + 8]
+    movu              m0, [src2q + iq + mmsize]
+    movu              m1, [src1q + iq + mmsize]
     psubb             m1, m0
-    mova [8 + iq + dstq], m1
-    add               iq, 16
+    mova [mmsize + iq + dstq], m1
+    add               iq, mmsize * 2
     cmp               iq, wq
         jb         .loop
-    add               wq, 15
+    add               wq, mmsize * 2 - 1
 .loop2:
     mov              r6b, byte [src1q + iq]
     sub              r6b, byte [src2q + iq]
@@ -56,3 +56,12 @@ cglobal diff_bytes, 4,6,0, dst, src1, src2, w, i
     cmp               iq, wq
         jb        .loop2
     REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_BYTES
+%endif
+
+INIT_XMM sse2
+DIFF_BYTES
diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/huffyuvencdsp_mmx.c
index c5f81c8..9af5305 100644
--- a/libavcodec/x86/huffyuvencdsp_mmx.c
+++ b/libavcodec/x86/huffyuvencdsp_mmx.c
@@ -31,6 +31,8 @@
 
 void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                        intptr_t w);
+void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                        intptr_t w);
 
 #if HAVE_INLINE_ASM
 
@@ -80,11 +82,15 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
 {
     av_unused int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(cpu_flags)) {
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
         c->diff_bytes = ff_diff_bytes_mmx;
     }
 
     if (INLINE_MMXEXT(cpu_flags)) {
         c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext;
     }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->diff_bytes = ff_diff_bytes_sse2;
+    }
 }
-- 
1.9.1



More information about the ffmpeg-devel mailing list