[FFmpeg-devel] [PATCH 2/2] swresample: add swri_resample_float_avx

James Almer jamrial at gmail.com
Fri May 16 00:54:00 CEST 2014


Signed-off-by: James Almer <jamrial at gmail.com>
---
I go and mention how AVX lets you avoid having to copy stuff with mova,
and then I send a patch where I ignore my own advice right after that.
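
To spell out the mova remark: SSE arithmetic is two-operand and destructive,
so when a loaded block is needed twice it has to be duplicated with a movaps
first, while the VEX encoding is three-operand and leaves the source register
intact. A rough sketch, not taken from this patch (register operands are
placeholders):

    # SSE: the loaded block feeds two multiplies, so it must be
    # duplicated before the first mulps overwrites it
    movups  (%rsi), %xmm1
    movaps  %xmm1,  %xmm3        # extra register copy
    mulps   (%rdi), %xmm1
    mulps   (%rdx), %xmm3

    # AVX/VEX: three operands, the source register survives,
    # so no copy is needed
    vmovups (%rsi), %ymm1
    vmulps  (%rdi), %ymm1, %ymm0
    vmulps  (%rdx), %ymm1, %ymm2

VEX encoding also lifts the 16-byte alignment requirement on memory operands,
so an unaligned source can in principle be used directly as an operand of
vmulps instead of being loaded with vmovups first.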
---
 libswresample/resample.c          | 10 +++++++
 libswresample/resample_template.c |  7 ++++-
 libswresample/x86/resample_mmx.h  | 61 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/libswresample/resample.c b/libswresample/resample.c
index 0ce74d0..0fef283 100644
--- a/libswresample/resample.c
+++ b/libswresample/resample.c
@@ -323,6 +323,12 @@ static int set_compensation(ResampleContext *c, int sample_delta, int compensati
 #undef TEMPLATE_RESAMPLE_DBL_SSE2
 #endif
 
+#if HAVE_AVX_INLINE
+#define TEMPLATE_RESAMPLE_FLT_AVX
+#include "resample_template.c"
+#undef TEMPLATE_RESAMPLE_FLT_AVX
+#endif
+
 #endif // HAVE_MMXEXT_INLINE
 
 static int multiple_resample(ResampleContext *c, AudioData *dst, int dst_size, AudioData *src, int src_size, int *consumed){
@@ -343,6 +349,10 @@ static int multiple_resample(ResampleContext *c, AudioData *dst, int dst_size, A
 #endif
              if(c->format == AV_SAMPLE_FMT_S16P) ret= swri_resample_int16(c, (int16_t*)dst->ch[i], (const int16_t*)src->ch[i], consumed, src_size, dst_size, i+1==dst->ch_count);
         else if(c->format == AV_SAMPLE_FMT_S32P) ret= swri_resample_int32(c, (int32_t*)dst->ch[i], (const int32_t*)src->ch[i], consumed, src_size, dst_size, i+1==dst->ch_count);
+#if HAVE_AVX_INLINE
+        else if(c->format == AV_SAMPLE_FMT_FLTP && (mm_flags&AV_CPU_FLAG_AVX))
+                                                 ret= swri_resample_float_avx (c, (float*)dst->ch[i], (const float*)src->ch[i], consumed, src_size, dst_size, i+1==dst->ch_count);
+#endif
 #if HAVE_SSE_INLINE
         else if(c->format == AV_SAMPLE_FMT_FLTP && (mm_flags&AV_CPU_FLAG_SSE))
                                                  ret= swri_resample_float_sse (c, (float*)dst->ch[i], (const float*)src->ch[i], consumed, src_size, dst_size, i+1==dst->ch_count);
diff --git a/libswresample/resample_template.c b/libswresample/resample_template.c
index 7624d92..becff12 100644
--- a/libswresample/resample_template.c
+++ b/libswresample/resample_template.c
@@ -44,7 +44,8 @@
 #    endif
 
 #elif    defined(TEMPLATE_RESAMPLE_FLT)     \
-      || defined(TEMPLATE_RESAMPLE_FLT_SSE)
+      || defined(TEMPLATE_RESAMPLE_FLT_SSE) \
+      || defined(TEMPLATE_RESAMPLE_FLT_AVX)
 
 #    define FILTER_SHIFT 0
 #    define DELEM  float
@@ -59,6 +60,10 @@
 #        define COMMON_CORE COMMON_CORE_FLT_SSE
 #        define LINEAR_CORE LINEAR_CORE_FLT_SSE
 #        define RENAME(N) N ## _float_sse
+#    elif defined(TEMPLATE_RESAMPLE_FLT_AVX)
+#        define COMMON_CORE COMMON_CORE_FLT_AVX
+#        define LINEAR_CORE LINEAR_CORE_FLT_AVX
+#        define RENAME(N) N ## _float_avx
 #    endif
 
 #elif defined(TEMPLATE_RESAMPLE_S32)
diff --git a/libswresample/x86/resample_mmx.h b/libswresample/x86/resample_mmx.h
index 2bd48a9..36c7a06 100644
--- a/libswresample/x86/resample_mmx.h
+++ b/libswresample/x86/resample_mmx.h
@@ -25,6 +25,7 @@
 int swri_resample_int16_mmx2 (struct ResampleContext *c, int16_t *dst, const int16_t *src, int *consumed, int src_size, int dst_size, int update_ctx);
 int swri_resample_int16_sse2 (struct ResampleContext *c, int16_t *dst, const int16_t *src, int *consumed, int src_size, int dst_size, int update_ctx);
 int swri_resample_float_sse  (struct ResampleContext *c,   float *dst, const   float *src, int *consumed, int src_size, int dst_size, int update_ctx);
+int swri_resample_float_avx  (struct ResampleContext *c,   float *dst, const   float *src, int *consumed, int src_size, int dst_size, int update_ctx);
 int swri_resample_double_sse2(struct ResampleContext *c,  double *dst, const  double *src, int *consumed, int src_size, int dst_size, int update_ctx);
 
 DECLARE_ALIGNED(16, const uint64_t, ff_resample_int16_rounder)[2]    = { 0x0000000000004000ULL, 0x0000000000000000ULL};
@@ -195,6 +196,66 @@ __asm__ volatile(\
     XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
 );
 
+#define COMMON_CORE_FLT_AVX \
+    x86_reg len= -4*c->filter_length;\
+__asm__ volatile(\
+    "vxorps     %%ymm0, %%ymm0, %%ymm0    \n\t"\
+    "1:                                   \n\t"\
+    "vmovups  (%1, %0), %%ymm1            \n\t"\
+    "vmulps   (%2, %0), %%ymm1, %%ymm1    \n\t"\
+    "vaddps     %%ymm1, %%ymm0, %%ymm0    \n\t"\
+    "add           $32, %0                \n\t"\
+    " js 1b                               \n\t"\
+    "vextractf128   $1, %%ymm0, %%xmm1    \n\t"\
+    "vaddps     %%xmm1, %%xmm0, %%xmm0    \n\t"\
+    "vmovhlps   %%xmm0, %%xmm1, %%xmm1    \n\t"\
+    "vaddps     %%xmm1, %%xmm0, %%xmm0    \n\t"\
+    "vshufps $1, %%xmm0, %%xmm0, %%xmm1   \n\t"\
+    "vaddss     %%xmm1, %%xmm0, %%xmm0    \n\t"\
+    "vmovss     %%xmm0, (%3)              \n\t"\
+    : "+r" (len)\
+    : "r" (((uint8_t*)(src+sample_index))-len),\
+      "r" (((uint8_t*)filter)-len),\
+      "r" (dst+dst_index)\
+    XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\
+);
+
+#define LINEAR_CORE_FLT_AVX \
+    x86_reg len= -4*c->filter_length;\
+__asm__ volatile(\
+    "vxorps      %%ymm0, %%ymm0, %%ymm0   \n\t"\
+    "vxorps      %%ymm2, %%ymm2, %%ymm2   \n\t"\
+    "1:                                   \n\t"\
+    "vmovups   (%3, %0), %%ymm1           \n\t"\
+    "vmulps    (%5, %0), %%ymm1, %%ymm3   \n\t"\
+    "vmulps    (%4, %0), %%ymm1, %%ymm1   \n\t"\
+    "vaddps      %%ymm1, %%ymm0, %%ymm0   \n\t"\
+    "vaddps      %%ymm3, %%ymm2, %%ymm2   \n\t"\
+    "add            $32, %0               \n\t"\
+    " js 1b                               \n\t"\
+    "vextractf128    $1, %%ymm0, %%xmm1   \n\t"\
+    "vextractf128    $1, %%ymm2, %%xmm3   \n\t"\
+    "vaddps      %%xmm1, %%xmm0, %%xmm0   \n\t"\
+    "vaddps      %%xmm3, %%xmm2, %%xmm2   \n\t"\
+    "vmovhlps    %%xmm0, %%xmm1, %%xmm1   \n\t"\
+    "vmovhlps    %%xmm2, %%xmm3, %%xmm3   \n\t"\
+    "vaddps      %%xmm1, %%xmm0, %%xmm0   \n\t"\
+    "vaddps      %%xmm3, %%xmm2, %%xmm2   \n\t"\
+    "vshufps $1, %%xmm0, %%xmm0, %%xmm1   \n\t"\
+    "vshufps $1, %%xmm2, %%xmm2, %%xmm3   \n\t"\
+    "vaddss      %%xmm1, %%xmm0, %%xmm0   \n\t"\
+    "vaddss      %%xmm3, %%xmm2, %%xmm2   \n\t"\
+    "vmovss      %%xmm0, %1               \n\t"\
+    "vmovss      %%xmm2, %2               \n\t"\
+    : "+r" (len),\
+      "=m" (val),\
+      "=m" (v2)\
+    : "r" (((uint8_t*)(src+sample_index))-len),\
+      "r" (((uint8_t*)filter)-len),\
+      "r" (((uint8_t*)(filter+c->filter_alloc))-len)\
+    XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
+);
+
 #define COMMON_CORE_DBL_SSE2 \
     x86_reg len= -8*c->filter_length;\
 __asm__ volatile(\
-- 
1.8.5.5


