[FFmpeg-devel] [PATCH 3/4] swr: add ff_resample_{common, linear}_float_fma

James Almer jamrial at gmail.com
Mon Jun 30 01:19:05 CEST 2014


Signed-off-by: James Almer <jamrial at gmail.com>
---
 libswresample/x86/resample.asm       | 25 +++++++++++++++++++--
 libswresample/x86/resample_x86_dsp.c | 43 +++++++++++++++++-------------------
 2 files changed, 43 insertions(+), 25 deletions(-)

diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
index b3c431e..2cb656e 100644
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@@ -179,13 +179,17 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
     pmaddwd                       m1, [filterq+min_filter_count_x4q*1]
     paddd                         m0, m1
 %else ; float/double
+%if cpuflag(fma4) || cpuflag(fma3)
+    fmaddp%4                      m0, m1, [filterq+min_filter_count_x4q*1], m0
+%else
     mulp%4                        m1, m1, [filterq+min_filter_count_x4q*1]
     addp%4                        m0, m0, m1
+%endif ; cpuflag
 %endif
     add         min_filter_count_x4q, mmsize
     js .inner_loop
 
-%if cpuflag(avx)
+%if mmsize == 32
     vextractf128                 xm1, m0, 0x1
     addps                        xm0, xm1
 %endif
@@ -429,15 +433,20 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
     paddd                         m2, m3
     paddd                         m0, m1
 %else ; float/double
+%if cpuflag(fma4) || cpuflag(fma3)
+    fmaddp%4                      m2, m1, [filter2q+min_filter_count_x4q*1], m2
+    fmaddp%4                      m0, m1, [filter1q+min_filter_count_x4q*1], m0
+%else
     mulp%4                        m3, m1, [filter2q+min_filter_count_x4q*1]
     mulp%4                        m1, m1, [filter1q+min_filter_count_x4q*1]
     addp%4                        m2, m2, m3
     addp%4                        m0, m0, m1
+%endif ; cpuflag
 %endif
     add         min_filter_count_x4q, mmsize
     js .inner_loop
 
-%if cpuflag(avx)
+%if mmsize == 32
     vextractf128                 xm1, m0, 0x1
     vextractf128                 xm3, m2, 0x1
     addps                        xm0, xm1
@@ -483,8 +492,12 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
     subp%4                       xm2, xm0
     mulp%4                       xm1, xm4
     shufp%4                      xm1, xm1, q0000
+%if cpuflag(fma4) || cpuflag(fma3)
+    fmaddp%4                     xm0, xm2, xm1, xm0
+%else
     mulp%4                       xm2, xm1
     addp%4                       xm0, xm2
+%endif ; cpuflag
 
     ; horizontal sum & store
     movhlps                      xm1, xm0
@@ -564,6 +577,14 @@ RESAMPLE_FNS float, 4, 2, s, pf_1
 INIT_YMM avx
 RESAMPLE_FNS float, 4, 2, s, pf_1
 %endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+RESAMPLE_FNS float, 4, 2, s, pf_1
+%endif
+%if HAVE_FMA4_EXTERNAL
+INIT_XMM fma4
+RESAMPLE_FNS float, 4, 2, s, pf_1
+%endif
 
 %if ARCH_X86_32
 INIT_MMX mmxext
diff --git a/libswresample/x86/resample_x86_dsp.c b/libswresample/x86/resample_x86_dsp.c
index 9049da6..c3d8578 100644
--- a/libswresample/x86/resample_x86_dsp.c
+++ b/libswresample/x86/resample_x86_dsp.c
@@ -27,30 +27,19 @@
 
 #include "libswresample/resample.h"
 
-int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst,
-                                    const uint8_t *src, int sz, int upd);
-int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst,
-                                    const uint8_t *src, int sz, int upd);
+#define RESAMPLE_FUNCS(type, opt) \
+int ff_resample_common_##type##_##opt(ResampleContext *c, uint8_t *dst, \
+                                      const uint8_t *src, int sz, int upd); \
+int ff_resample_linear_##type##_##opt(ResampleContext *c, uint8_t *dst, \
+                                      const uint8_t *src, int sz, int upd)
 
-int ff_resample_common_int16_sse2(ResampleContext *c, uint8_t *dst,
-                                  const uint8_t *src, int sz, int upd);
-int ff_resample_linear_int16_sse2(ResampleContext *c, uint8_t *dst,
-                                  const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst,
-                                 const uint8_t *src, int sz, int upd);
-int ff_resample_linear_float_sse(ResampleContext *c, uint8_t *dst,
-                                 const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst,
-                                 const uint8_t *src, int sz, int upd);
-int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst,
-                                 const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_double_sse2(ResampleContext *c, uint8_t *dst,
-                                   const uint8_t *src, int sz, int upd);
-int ff_resample_linear_double_sse2(ResampleContext *c, uint8_t *dst,
-                                   const uint8_t *src, int sz, int upd);
+RESAMPLE_FUNCS(int16, mmxext);
+RESAMPLE_FUNCS(int16, sse2);
+RESAMPLE_FUNCS(float, sse);
+RESAMPLE_FUNCS(float, avx);
+RESAMPLE_FUNCS(float, fma3);
+RESAMPLE_FUNCS(float, fma4);
+RESAMPLE_FUNCS(double, sse2);
 
 void swresample_dsp_x86_init(ResampleContext *c)
 {
@@ -76,4 +65,12 @@ void swresample_dsp_x86_init(ResampleContext *c)
         c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;
         c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_avx;
     }
+    if (HAVE_FMA3_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA3) {
+        c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma3;
+        c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma3;
+    }
+    if (HAVE_FMA4_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA4) {
+        c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma4;
+        c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma4;
+    }
 }
-- 
1.9.4.msysgit.0




More information about the ffmpeg-devel mailing list