[FFmpeg-devel] [PATCH 3/4 v2] x86/swr: add ff_resample_{common, linear}_float_fma

James Almer jamrial at gmail.com
Mon Jun 30 18:06:01 CEST 2014


Signed-off-by: James Almer <jamrial at gmail.com>
---
 libswresample/x86/resample.asm       | 43 ++++++++++++++++++++++++++----------
 libswresample/x86/resample_x86_dsp.c | 43 +++++++++++++++++-------------------
 2 files changed, 51 insertions(+), 35 deletions(-)

diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
index bce1389..17f6169 100644
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@@ -179,17 +179,16 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
     pmaddwd                       m1, [filterq+min_filter_count_x4q*1]
     paddd                         m0, m1
 %else ; float/double
+%if cpuflag(fma4) || cpuflag(fma3)
+    fmaddp%4                      m0, m1, [filterq+min_filter_count_x4q*1], m0
+%else
     mulp%4                        m1, m1, [filterq+min_filter_count_x4q*1]
     addp%4                        m0, m0, m1
+%endif ; cpuflag
 %endif
     add         min_filter_count_x4q, mmsize
     js .inner_loop
 
-%if cpuflag(avx)
-    vextractf128                 xm1, m0, 0x1
-    addps                        xm0, xm1
-%endif
-
 %ifidn %1, int16
 %if mmsize == 16
     pshufd                        m1, m0, q0032
@@ -206,6 +205,10 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
     movd                      [dstq], m0
 %else ; float/double
     ; horizontal sum & store
+%if mmsize == 32
+    vextractf128                 xm1, m0, 0x1
+    addps                        xm0, xm1
+%endif
     movhlps                      xm1, xm0
 %ifidn %1, float
     addps                        xm0, xm1
@@ -429,21 +432,19 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
     paddd                         m2, m3
     paddd                         m0, m1
 %else ; float/double
+%if cpuflag(fma4) || cpuflag(fma3)
+    fmaddp%4                      m2, m1, [filter2q+min_filter_count_x4q*1], m2
+    fmaddp%4                      m0, m1, [filter1q+min_filter_count_x4q*1], m0
+%else
     mulp%4                        m3, m1, [filter2q+min_filter_count_x4q*1]
     mulp%4                        m1, m1, [filter1q+min_filter_count_x4q*1]
     addp%4                        m2, m2, m3
     addp%4                        m0, m0, m1
+%endif ; cpuflag
 %endif
     add         min_filter_count_x4q, mmsize
     js .inner_loop
 
-%if cpuflag(avx)
-    vextractf128                 xm1, m0, 0x1
-    vextractf128                 xm3, m2, 0x1
-    addps                        xm0, xm1
-    addps                        xm2, xm3
-%endif
-
 %ifidn %1, int16
 %if mmsize == 16
     pshufd                        m3, m2, q0032
@@ -479,12 +480,22 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
     ; - unix64: eax=r6[filter1], edx=r2[todo]
 %else ; float/double
     ; val += (v2 - val) * (FELEML) frac / c->src_incr;
+%if mmsize == 32
+    vextractf128                 xm1, m0, 0x1
+    vextractf128                 xm3, m2, 0x1
+    addps                        xm0, xm1
+    addps                        xm2, xm3
+%endif
     cvtsi2s%4                    xm1, fracd
     subp%4                       xm2, xm0
     mulp%4                       xm1, xm4
     shufp%4                      xm1, xm1, q0000
+%if cpuflag(fma4) || cpuflag(fma3)
+    fmaddp%4                     xm0, xm2, xm1, xm0
+%else
     mulp%4                       xm2, xm1
     addp%4                       xm0, xm2
+%endif ; cpuflag
 
     ; horizontal sum & store
     movhlps                      xm1, xm0
@@ -564,6 +575,14 @@ RESAMPLE_FNS float, 4, 2, s, pf_1
 INIT_YMM avx
 RESAMPLE_FNS float, 4, 2, s, pf_1
 %endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+RESAMPLE_FNS float, 4, 2, s, pf_1
+%endif
+%if HAVE_FMA4_EXTERNAL
+INIT_XMM fma4
+RESAMPLE_FNS float, 4, 2, s, pf_1
+%endif
 
 %if ARCH_X86_32
 INIT_MMX mmxext
diff --git a/libswresample/x86/resample_x86_dsp.c b/libswresample/x86/resample_x86_dsp.c
index 9049da6..ff9f1ec 100644
--- a/libswresample/x86/resample_x86_dsp.c
+++ b/libswresample/x86/resample_x86_dsp.c
@@ -27,30 +27,19 @@
 
 #include "libswresample/resample.h"
 
-int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst,
-                                    const uint8_t *src, int sz, int upd);
-int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst,
-                                    const uint8_t *src, int sz, int upd);
+#define RESAMPLE_FUNCS(type, opt) \
+int ff_resample_common_##type##_##opt(ResampleContext *c, uint8_t *dst, \
+                                      const uint8_t *src, int sz, int upd); \
+int ff_resample_linear_##type##_##opt(ResampleContext *c, uint8_t *dst, \
+                                      const uint8_t *src, int sz, int upd)
 
-int ff_resample_common_int16_sse2(ResampleContext *c, uint8_t *dst,
-                                  const uint8_t *src, int sz, int upd);
-int ff_resample_linear_int16_sse2(ResampleContext *c, uint8_t *dst,
-                                  const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst,
-                                 const uint8_t *src, int sz, int upd);
-int ff_resample_linear_float_sse(ResampleContext *c, uint8_t *dst,
-                                 const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst,
-                                 const uint8_t *src, int sz, int upd);
-int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst,
-                                 const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_double_sse2(ResampleContext *c, uint8_t *dst,
-                                   const uint8_t *src, int sz, int upd);
-int ff_resample_linear_double_sse2(ResampleContext *c, uint8_t *dst,
-                                   const uint8_t *src, int sz, int upd);
+RESAMPLE_FUNCS(int16,  mmxext);
+RESAMPLE_FUNCS(int16,  sse2);
+RESAMPLE_FUNCS(float,  sse);
+RESAMPLE_FUNCS(float,  avx);
+RESAMPLE_FUNCS(float,  fma3);
+RESAMPLE_FUNCS(float,  fma4);
+RESAMPLE_FUNCS(double, sse2);
 
 void swresample_dsp_x86_init(ResampleContext *c)
 {
@@ -76,4 +65,12 @@ void swresample_dsp_x86_init(ResampleContext *c)
         c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;
         c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_avx;
     }
+    if (HAVE_FMA3_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA3) {
+        c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma3;
+        c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma3;
+    }
+    if (HAVE_FMA4_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA4) {
+        c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma4;
+        c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma4;
+    }
 }
-- 
1.8.5.5


More information about the ffmpeg-devel mailing list