[FFmpeg-devel] [PATCH 2/2] avfilter/x86/vf_ssim: add ff_ssim_4x4_line_xop

James Almer jamrial at gmail.com
Mon Jul 20 05:53:37 CEST 2015


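About 20% faster than the SSSE3 version. Unlike the SSSE3 version, which
needs 16 XMM registers and is therefore x86_64-only, the XOP version uses
only 8 and is also enabled on x86_32.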

Signed-off-by: James Almer <jamrial at gmail.com>
---
 libavfilter/x86/vf_ssim.asm    | 62 ++++++++++++++++++++++++++++++++++++++++--
 libavfilter/x86/vf_ssim_init.c |  5 ++++
 2 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/libavfilter/x86/vf_ssim.asm b/libavfilter/x86/vf_ssim.asm
index 6661987..3293e66 100644
--- a/libavfilter/x86/vf_ssim.asm
+++ b/libavfilter/x86/vf_ssim.asm
@@ -30,16 +30,50 @@ ssim_c2: times 4 dd 235963 ;(.03*.03*255*255*64*63 + .5)
 
 SECTION .text
 
+%macro SSIM_4X4_LINE 1
 %if ARCH_X86_64
-
-INIT_XMM ssse3
-cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_stride3, ref_stride3
+cglobal ssim_4x4_line, 6, 8, %1, buf, buf_stride, ref, ref_stride, sums, w, buf_stride3, ref_stride3
+%else
+cglobal ssim_4x4_line, 5, 7, %1, buf, buf_stride, ref, ref_stride, sums, buf_stride3, ref_stride3
+%define wd r5mp
+%endif
     lea     ref_stride3q, [ref_strideq*3]
     lea     buf_stride3q, [buf_strideq*3]
+%if notcpuflag(xop)
     pxor              m7, m7
     mova             m15, [pw_1]
+%endif
 
 .loop:
+%if cpuflag(xop)
+    pmovzxbw          m0, [bufq+buf_strideq*0]
+    pmovzxbw          m1, [refq+ref_strideq*0]
+    pmaddwd           m4, m0, m0
+    pmaddwd           m6, m0, m1
+    pmovzxbw          m2, [bufq+buf_strideq*1]
+    vpmadcswd         m4, m1, m1, m4
+    pmovzxbw          m3, [refq+ref_strideq*1]
+    paddw             m0, m2
+    vpmadcswd         m4, m2, m2, m4
+    vpmadcswd         m6, m2, m3, m6
+    paddw             m1, m3
+    vpmadcswd         m4, m3, m3, m4
+
+    pmovzxbw          m2, [bufq+buf_strideq*2]
+    pmovzxbw          m3, [refq+ref_strideq*2]
+    vpmadcswd         m4, m2, m2, m4
+    vpmadcswd         m6, m2, m3, m6
+    pmovzxbw          m5, [bufq+buf_stride3q]
+    pmovzxbw          m7, [refq+ref_stride3q]
+    vpmadcswd         m4, m3, m3, m4
+    vpmadcswd         m6, m5, m7, m6
+    paddw             m0, m2
+    paddw             m1, m3
+    vpmadcswd         m4, m5, m5, m4
+    paddw             m0, m5
+    paddw             m1, m7
+    vpmadcswd         m4, m7, m7, m4
+%else
     movh              m0, [bufq+buf_strideq*0]  ; a1
     movh              m1, [refq+ref_strideq*0]  ; b1
     movh              m2, [bufq+buf_strideq*1]  ; a2
@@ -85,12 +119,25 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_
     paddd             m4, m9
     paddd             m6, m14
     paddd             m4, m12
+%endif
 
     ; m0 = [word] s1 a,a,a,a,b,b,b,b
     ; m1 = [word] s2 a,a,a,a,b,b,b,b
     ; m4 = [dword] ss a,a,b,b
     ; m6 = [dword] s12 a,a,b,b
 
+%if cpuflag(xop)
+    vphaddwq          m0, m0                    ; [dword] s1  a, 0, b, 0
+    vphaddwq          m1, m1                    ; [dword] s2  a, 0, b, 0
+    vphadddq          m4, m4                    ; [dword] ss  a, 0, b, 0
+    vphadddq          m6, m6                    ; [dword] s12 a, 0, b, 0
+    punpckhdq     m2, m0, m1                    ; [dword] s1  b, s2 b, 0, 0
+    punpckldq         m0, m1                    ; [dword] s1  a, s2 a, 0, 0
+    punpckhdq     m3, m4, m6                    ; [dword] ss  b, s12 b, 0, 0
+    punpckldq         m4, m6                    ; [dword] ss  a, s12 a, 0, 0
+    punpcklqdq    m1, m2, m3                    ; [dword] b s1, s2, ss, s12
+    punpcklqdq        m0, m4                    ; [dword] a s1, s2, ss, s12
+%else
     pmaddwd           m0, m15                   ; [dword] s1 a,a,b,b
     pmaddwd           m1, m15                   ; [dword] s2 a,a,b,b
     phaddd            m0, m4                    ; [dword] s1 a, b, ss a, b
@@ -99,6 +146,7 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_
     punpckldq         m0, m1                    ; [dword] s1 a, s2 a, s1 b, s2 b
     punpckhqdq    m1, m0, m2                    ; [dword] b s1, s2, ss, s12
     punpcklqdq        m0, m2                    ; [dword] a s1, s2, ss, s12
+%endif
 
     mova  [sumsq+     0], m0
     mova  [sumsq+mmsize], m1
@@ -109,7 +157,15 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_
     sub               wd, mmsize/8
     jg .loop
     RET
+%endmacro
 
+%if ARCH_X86_64
+INIT_XMM ssse3
+SSIM_4X4_LINE 16
+%endif
+%if HAVE_XOP_EXTERNAL
+INIT_XMM xop
+SSIM_4X4_LINE 8
 %endif
 
 INIT_XMM sse4
diff --git a/libavfilter/x86/vf_ssim_init.c b/libavfilter/x86/vf_ssim_init.c
index 9514b25..599c928 100644
--- a/libavfilter/x86/vf_ssim_init.c
+++ b/libavfilter/x86/vf_ssim_init.c
@@ -25,6 +25,9 @@
 void ff_ssim_4x4_line_ssse3(const uint8_t *buf, ptrdiff_t buf_stride,
                             const uint8_t *ref, ptrdiff_t ref_stride,
                             int (*sums)[4], int w);
+void ff_ssim_4x4_line_xop  (const uint8_t *buf, ptrdiff_t buf_stride,
+                            const uint8_t *ref, ptrdiff_t ref_stride,
+                            int (*sums)[4], int w);
 float ff_ssim_end_line_sse4(const int (*sum0)[4], const int (*sum1)[4], int w);
 
 void ff_ssim_init_x86(SSIMDSPContext *dsp)
@@ -35,4 +38,6 @@ void ff_ssim_init_x86(SSIMDSPContext *dsp)
         dsp->ssim_4x4_line = ff_ssim_4x4_line_ssse3;
     if (EXTERNAL_SSE4(cpu_flags))
         dsp->ssim_end_line = ff_ssim_end_line_sse4;
+    if (EXTERNAL_XOP(cpu_flags))
+        dsp->ssim_4x4_line = ff_ssim_4x4_line_xop;
 }
-- 
2.4.5



More information about the ffmpeg-devel mailing list