00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavutil/x86_cpu.h"
00023 #include "libavcodec/dsputil.h"
00024 #include "fft.h"
00025 #include "config.h"
00026
00027 DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
00028 { 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
00029
/*
 * Prototypes for FFT kernels that are defined outside this file
 * (NOTE(review): presumably in the accompanying x86 assembly source --
 * confirm).  Each takes the data buffer z and nbits; the callers in this
 * file pass s->nbits, from which they also derive the point count as
 * 1 << nbits.  ff_fft_dispatch_sse() is declared here but is not called in
 * the visible part of this file.
 */
00030 void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
00031 void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
00032 void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
00033
#if HAVE_AVX
/**
 * AVX entry point for the FFT: hands the buffer and the size exponent kept
 * in the context straight to ff_fft_dispatch_interleave_avx().  No further
 * processing is done here.
 *
 * @param s FFT context; only s->nbits is read
 * @param z buffer passed on to the AVX kernel
 */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int log2_len = s->nbits;

    ff_fft_dispatch_interleave_avx(z, log2_len);
}
#endif
00040
00041 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
00042 {
00043 int n = 1 << s->nbits;
00044
00045 ff_fft_dispatch_interleave_sse(z, s->nbits);
00046
00047 if(n <= 16) {
00048 x86_reg i = -8*n;
00049 __asm__ volatile(
00050 "1: \n"
00051 "movaps (%0,%1), %%xmm0 \n"
00052 "movaps %%xmm0, %%xmm1 \n"
00053 "unpcklps 16(%0,%1), %%xmm0 \n"
00054 "unpckhps 16(%0,%1), %%xmm1 \n"
00055 "movaps %%xmm0, (%0,%1) \n"
00056 "movaps %%xmm1, 16(%0,%1) \n"
00057 "add $32, %0 \n"
00058 "jl 1b \n"
00059 :"+r"(i)
00060 :"r"(z+n)
00061 :"memory"
00062 );
00063 }
00064 }
00065
00066 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
00067 {
00068 int n = 1 << s->nbits;
00069 int i;
00070 for(i=0; i<n; i+=2) {
00071 __asm__ volatile(
00072 "movaps %2, %%xmm0 \n"
00073 "movlps %%xmm0, %0 \n"
00074 "movhps %%xmm0, %1 \n"
00075 :"=m"(s->tmp_buf[s->revtab[i]]),
00076 "=m"(s->tmp_buf[s->revtab[i+1]])
00077 :"m"(z[i])
00078 );
00079 }
00080 memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
00081 }
00082
00083 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
00084 {
00085 x86_reg j, k;
00086 long n = s->mdct_size;
00087 long n4 = n >> 2;
00088
00089 s->imdct_half(s, output + n4, input);
00090
00091 j = -n;
00092 k = n-16;
00093 __asm__ volatile(
00094 "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
00095 "1: \n"
00096 "movaps (%2,%1), %%xmm0 \n"
00097 "movaps (%3,%0), %%xmm1 \n"
00098 "shufps $0x1b, %%xmm0, %%xmm0 \n"
00099 "shufps $0x1b, %%xmm1, %%xmm1 \n"
00100 "xorps %%xmm7, %%xmm0 \n"
00101 "movaps %%xmm1, (%3,%1) \n"
00102 "movaps %%xmm0, (%2,%0) \n"
00103 "sub $16, %1 \n"
00104 "add $16, %0 \n"
00105 "jl 1b \n"
00106 :"+r"(j), "+r"(k)
00107 :"r"(output+n4), "r"(output+n4*3)
00108 XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")
00109 );
00110 }