[FFmpeg-cvslog] float_dsp: add x86-optimized functions for vector_fmac_scalar()
Justin Ruggles
git at videolan.org
Tue Jun 19 21:13:52 CEST 2012
ffmpeg | branch: master | Justin Ruggles <justin.ruggles at gmail.com> | Fri Jun 8 23:20:59 2012 -0400| [82b2df979069063beb14be340350501c8340f9cd] | committer: Justin Ruggles
float_dsp: add x86-optimized functions for vector_fmac_scalar()
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=82b2df979069063beb14be340350501c8340f9cd
---
libavutil/float_dsp.h | 6 ++---
libavutil/x86/float_dsp.asm | 47 ++++++++++++++++++++++++++++++++++++++++
libavutil/x86/float_dsp_init.c | 7 ++++++
3 files changed, 57 insertions(+), 3 deletions(-)
diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h
index 4e26630..95cef62 100644
--- a/libavutil/float_dsp.h
+++ b/libavutil/float_dsp.h
@@ -42,12 +42,12 @@ typedef struct AVFloatDSPContext {
* overlap exactly or not at all.
*
* @param dst result vector
- * constraints: 16-byte aligned
+ * constraints: 32-byte aligned
* @param src input vector
- * constraints: 16-byte aligned
+ * constraints: 32-byte aligned
* @param mul scalar value
* @param len length of vector
- * constraints: multiple of 4
+ * constraints: multiple of 16
*/
void (*vector_fmac_scalar)(float *dst, const float *src, float mul,
int len);
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 53be7ab..66ef093 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -19,6 +19,7 @@
;******************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
SECTION .text
@@ -53,3 +54,49 @@ VECTOR_FMUL
INIT_YMM avx
VECTOR_FMUL
%endif
+
+;------------------------------------------------------------------------------
+; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
+;------------------------------------------------------------------------------
+
+%macro VECTOR_FMAC_SCALAR 0 ; emits dst[i] += src[i] * mul for the current INIT_XMM/INIT_YMM setup
+%if UNIX64
+cglobal vector_fmac_scalar, 3,3,3, dst, src, len
+%else
+cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
+%endif
+%if WIN64
+    movaps      xmm0, xmm2 ; Win64 passes mul in xmm2; copy it to the physical xmm0 that the splat below names
+%endif
+%if ARCH_X86_32
+    VBROADCASTSS m0, mulm ; on x86-32 mul is read from its memory argument slot
+%else
+    shufps      xmm0, xmm0, 0 ; replicate the scalar into all four lanes of the low 128 bits
+%if cpuflag(avx)
+    vinsertf128 m0, m0, xmm0, 1 ; copy the splatted low half into the upper 128 bits of ymm0
+%endif
+%endif
+    lea         lenq, [lend*4-2*mmsize] ; byte offset of the last pair of vectors
+.loop:
+    mulps       m1, m0, [srcq+lenq       ]
+    mulps       m2, m0, [srcq+lenq+mmsize]
+    addps       m1, m1, [dstq+lenq       ]
+    addps       m2, m2, [dstq+lenq+mmsize]
+    mova        [dstq+lenq       ], m1
+    mova        [dstq+lenq+mmsize], m2
+    sub         lenq, 2*mmsize ; step backwards by two vectors per iteration
+    jge         .loop ; keep going until the offset drops below zero
+%if mmsize == 32
+    vzeroupper ; clear the upper ymm halves before returning from the AVX version
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+INIT_XMM sse
+VECTOR_FMAC_SCALAR
+%if HAVE_AVX
+INIT_YMM avx
+VECTOR_FMAC_SCALAR
+%endif
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index 10bb226..d259a36 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -26,6 +26,11 @@ extern void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
extern void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
int len);
+extern void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
+ int len);
+extern void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
+ int len);
+
void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
#if HAVE_YASM
@@ -33,9 +38,11 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
fdsp->vector_fmul = ff_vector_fmul_sse;
+ fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
}
if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
fdsp->vector_fmul = ff_vector_fmul_avx;
+ fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
}
#endif
}
More information about the ffmpeg-cvslog mailing list