[FFmpeg-devel] [PATCH] x86/mpegvideoencdsp: improve ff_pix_sum16_sse2

James Almer jamrial at gmail.com
Wed Oct 1 03:24:28 CEST 2014


~15% faster.

Also add an mmxext version that takes advantage of the new code, and
build it alongside the mmx version only on x86_32.

Signed-off-by: James Almer <jamrial at gmail.com>
---
 libavcodec/x86/mpegvideoencdsp.asm    | 51 +++++++++++++++++++++++------------
 libavcodec/x86/mpegvideoencdsp_init.c |  7 +++++
 2 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 4fe6cfe..aec73f8 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -29,16 +29,16 @@ cextern pw_1
 
 SECTION .text
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-; %1 = number of xmm registers used
-; %2 = number of loops
-; %3 = number of GPRs used
-%macro PIX_SUM16 4
-cglobal pix_sum16, 2, %3, %1
+; %1 = number of loops
+; %2 = number of GPRs used
+%macro PIX_SUM16 3
+cglobal pix_sum16, 2, %2, 6
     movsxdifnidn r1, r1d
-    mov          r2, %2
-%if cpuflag(xop)
+    mov          r2, %1
+%if mmsize == 16
     lea          r3, [r1*3]
-%else
+%endif
+%if notcpuflag(xop)
     pxor         m5, m5
 %endif
     pxor         m4, m4
@@ -52,42 +52,59 @@ cglobal pix_sum16, 2, %3, %1
     mova         m0, [r0]
 %if mmsize == 8
     mova         m1, [r0+8]
-%else
+%if cpuflag(mmxext)
+    mova         m2, [r0+r1]
+    mova         m3, [r0+r1+8]
+%endif
+%else ; sse2
     mova         m1, [r0+r1]
+    mova         m2, [r0+r1*2]
+    mova         m3, [r0+r3]
 %endif
+%if cpuflag(mmxext)
+    psadbw       m0, m5
+    psadbw       m1, m5
+    psadbw       m2, m5
+    psadbw       m3, m5
+%else ; mmx
     punpckhbw    m2, m0, m5
     punpcklbw    m0, m5
     punpckhbw    m3, m1, m5
     punpcklbw    m1, m5
+%endif ; cpuflag(mmxext)
 %endif ; cpuflag(xop)
     paddw        m1, m0
     paddw        m3, m2
     paddw        m3, m1
     paddw        m4, m3
-%if mmsize == 8
-    add          r0, r1
+%if cpuflag(mmxext)
+    lea          r0, [r0+r1*%3]
 %else
-    lea          r0, [r0+r1*%4]
+    add          r0, r1
 %endif
     dec r2
     jne .loop
-%if cpuflag(xop)
+%if mmsize == 16
     pshufd       m0, m4, q0032
     paddd        m4, m0
-%else
+%elif notcpuflag(mmxext)
     HADDW        m4, m5
 %endif
     movd        eax, m4
     RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
-PIX_SUM16 0, 16, 3, 0
+PIX_SUM16 16, 3, 0
+INIT_MMX mmxext
+PIX_SUM16  8, 4, 2
+%endif
 INIT_XMM sse2
-PIX_SUM16 6, 8,  3, 2
+PIX_SUM16  4, 4, 4
 %if HAVE_XOP_EXTERNAL
 INIT_XMM xop
-PIX_SUM16 5, 4,  4, 4
+PIX_SUM16  4, 4, 4
 %endif
 
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index d91b902..2a4db61 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -24,6 +24,7 @@
 #include "libavcodec/mpegvideoencdsp.h"
 
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
+int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
 int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
 int ff_pix_sum16_xop(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
@@ -218,11 +219,17 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags)) {
         c->pix_sum   = ff_pix_sum16_mmx;
         c->pix_norm1 = ff_pix_norm1_mmx;
     }
 
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->pix_sum     = ff_pix_sum16_mmxext;
+    }
+#endif
+
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->pix_sum     = ff_pix_sum16_sse2;
         c->pix_norm1   = ff_pix_norm1_sse2;
-- 
1.8.5.5



More information about the ffmpeg-devel mailing list