00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavutil/x86_cpu.h"
00023 #include "libavcodec/dsputil.h"
00024 #include "fft.h"
00025
00026 DECLARE_ASM_CONST(16, int, ff_m1m1m1m1)[4] =
00027 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
00028
/*
 * FFT kernels that are only declared here; no definition is visible in this
 * part of the file. The callers below pass the data array as z and
 * s->nbits (log2 of the element count they use, n = 1 << nbits) as nbits.
 *
 * ff_fft_dispatch_sse()            is called by ff_imdct_half_sse().
 * ff_fft_dispatch_interleave_sse() is called by ff_fft_calc_sse(), which
 *     additionally runs its own interleaving pass when n <= 16.
 * NOTE(review): judging by the name, the _interleave variant already
 * interleaves its output for the larger sizes -- confirm against the
 * definition.
 */
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
00031
00032 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
00033 {
00034 int n = 1 << s->nbits;
00035
00036 ff_fft_dispatch_interleave_sse(z, s->nbits);
00037
00038 if(n <= 16) {
00039 x86_reg i = -8*n;
00040 __asm__ volatile(
00041 "1: \n"
00042 "movaps (%0,%1), %%xmm0 \n"
00043 "movaps %%xmm0, %%xmm1 \n"
00044 "unpcklps 16(%0,%1), %%xmm0 \n"
00045 "unpckhps 16(%0,%1), %%xmm1 \n"
00046 "movaps %%xmm0, (%0,%1) \n"
00047 "movaps %%xmm1, 16(%0,%1) \n"
00048 "add $32, %0 \n"
00049 "jl 1b \n"
00050 :"+r"(i)
00051 :"r"(z+n)
00052 :"memory"
00053 );
00054 }
00055 }
00056
00057 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
00058 {
00059 int n = 1 << s->nbits;
00060 int i;
00061 for(i=0; i<n; i+=2) {
00062 __asm__ volatile(
00063 "movaps %2, %%xmm0 \n"
00064 "movlps %%xmm0, %0 \n"
00065 "movhps %%xmm0, %1 \n"
00066 :"=m"(s->tmp_buf[s->revtab[i]]),
00067 "=m"(s->tmp_buf[s->revtab[i+1]])
00068 :"m"(z[i])
00069 );
00070 }
00071 memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
00072 }
00073
/**
 * SSE implementation of the "half" inverse MDCT (per the function name).
 *
 * Three stages are visible in the body:
 *  1. a loop that multiplies samples taken from input by factors from
 *     s->tcos / s->tsin and scatters the products into output through
 *     s->revtab,
 *  2. an in-place call to ff_fft_dispatch_sse() on output,
 *  3. a second multiply by tcos/tsin (the CMUL macro) that walks output from
 *     both ends towards the middle and stores the results back.
 *
 * @param s      context supplying mdct_bits, nbits, revtab, tcos and tsin
 * @param output destination buffer, also used as the FFT work area (it is
 *               accessed as FFTComplex through z); stage 3 uses movaps on
 *               it, so it must be 16-byte aligned
 * @param input  source samples; input + n4 is read with movaps and must be
 *               16-byte aligned
 */
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    /* i and l are declared but not used in the code visible here. */
    av_unused x86_reg i, j, k, l;
    long n = 1 << s->mdct_bits;
    long n2 = n >> 1;
    long n4 = n >> 2;
    long n8 = n >> 3;
    /* revtab points into the middle of the table so that it can be indexed
     * with both k and -k-1 below. */
    const uint16_t *revtab = s->revtab + n8;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    FFTComplex *z = (FFTComplex *)output;

    /*
     * Stage 1. Each iteration handles two elements above and two below the
     * centre pointers (input+n4, tcos+n8, tsin+n8). Operands:
     *   %0 = -4*k, %1 = 4*k  (byte offsets; scaled by 2 when indexing input)
     *   %2 = input+n4, %3 = tcos+n8, %4 = tsin+n8
     */
    for(k=n8-2; k>=0; k-=2) {
        __asm__ volatile(
            /* a = 16 bytes at %2 + 8k, b = 16 bytes at %2 - 8k - 16 */
            "movaps (%2,%1,2), %%xmm0 \n"
            "movaps -16(%2,%0,2), %%xmm1 \n"
            "movaps %%xmm0, %%xmm2 \n"
            /* xmm0 = {a0,a2,b0,b2}, xmm1 = {b3,b1,a3,a1} */
            "shufps $0x88, %%xmm1, %%xmm0 \n"
            "shufps $0x77, %%xmm2, %%xmm1 \n"
            /* xmm4 = tcos values, xmm5 = tsin values: low half from
             * offset +4k, high half from offset -4k-8 */
            "movlps (%3,%1), %%xmm4 \n"
            "movlps (%4,%1), %%xmm5 \n"
            "movhps -8(%3,%0), %%xmm4 \n"
            "movhps -8(%4,%0), %%xmm5 \n"
            /* With E = xmm0 and O = xmm1 as shuffled above:
             *   xmm1 = O*tcos - E*tsin
             *   xmm2 = E*tcos + O*tsin */
            "movaps %%xmm0, %%xmm2 \n"
            "movaps %%xmm1, %%xmm3 \n"
            "mulps %%xmm5, %%xmm0 \n"
            "mulps %%xmm4, %%xmm1 \n"
            "mulps %%xmm4, %%xmm2 \n"
            "mulps %%xmm5, %%xmm3 \n"
            "subps %%xmm0, %%xmm1 \n"
            "addps %%xmm3, %%xmm2 \n"
            /* Interleave the two result vectors into (xmm1,xmm2) pairs:
             * lanes 0-1 end up in xmm1, lanes 2-3 in xmm0. */
            "movaps %%xmm1, %%xmm0 \n"
            "unpcklps %%xmm2, %%xmm1 \n"
            "unpckhps %%xmm2, %%xmm0 \n"
            ::"r"(-4*k), "r"(4*k),
              "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8)
        );
        /*
         * The asm above has no output operands: its results stay in
         * xmm0/xmm1 and are picked up by the store statements below.
         * NOTE(review): this relies on the compiler neither touching
         * xmm0/xmm1 nor moving code between these asm statements; the
         * stores are not marked volatile and no xmm clobbers are declared.
         */
#if ARCH_X86_64
        /* x86-64: one asm with all four memory destinations live at once.
         * Presumably chosen because enough address registers are available
         * there -- TODO confirm. */
        __asm__("movlps %%xmm0, %0 \n"
                "movhps %%xmm0, %1 \n"
                "movlps %%xmm1, %2 \n"
                "movhps %%xmm1, %3 \n"
                :"=m"(z[revtab[-k-2]]),
                 "=m"(z[revtab[-k-1]]),
                 "=m"(z[revtab[ k ]]),
                 "=m"(z[revtab[ k+1]])
        );
#else
        /* Otherwise: one asm per store, so only one destination address has
         * to be materialised at a time. */
        __asm__("movlps %%xmm0, %0" :"=m"(z[revtab[-k-2]]));
        __asm__("movhps %%xmm0, %0" :"=m"(z[revtab[-k-1]]));
        __asm__("movlps %%xmm1, %0" :"=m"(z[revtab[ k ]]));
        __asm__("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]]));
#endif
    }

    /* Stage 2: in-place FFT on the scattered data. */
    ff_fft_dispatch_sse(z, s->nbits);

    /*
     * Stage 3 helper. For byte offset j, with
     *   A = 16 bytes at %2 + 2*j,  B = 16 bytes at %2 + 2*j + 16,
     *   C = 16 bytes at %3 + j,    S = 16 bytes at %4 + j
     * CMUL leaves  xmm0 = B*S - A*C  and  xmm1 = A*S + B*C  in the two
     * registers named by its arguments. xmm6 and xmm7 are used as scratch.
     * NOTE(review): CMUL is not #undef'd in the code visible here.
     */
#define CMUL(j,xmm0,xmm1)\
        "movaps (%2,"#j",2), %%xmm6 \n"\
        "movaps 16(%2,"#j",2), "#xmm0"\n"\
        "movaps %%xmm6, "#xmm1"\n"\
        "movaps "#xmm0",%%xmm7 \n"\
        "mulps (%3,"#j"), %%xmm6 \n"\
        "mulps (%4,"#j"), "#xmm0"\n"\
        "mulps (%4,"#j"), "#xmm1"\n"\
        "mulps (%3,"#j"), %%xmm7 \n"\
        "subps %%xmm6, "#xmm0"\n"\
        "addps %%xmm7, "#xmm1"\n"

    /*
     * j counts up from -n2 to 0 and k counts down from n2-16, both in steps
     * of 16, so the loop consumes one 32-byte group from each side of z+n8
     * per iteration. Operands: %0 = j, %1 = k, %2 = z+n8, %3 = tcos+n8,
     * %4 = tsin+n8.
     */
    j = -n2;
    k = n2-16;
    __asm__ volatile(
        "1: \n"
        CMUL(%0, %%xmm0, %%xmm1)
        CMUL(%1, %%xmm4, %%xmm5)
        /* reverse the lane order of both xmm1 and xmm5 */
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "shufps $0x1b, %%xmm5, %%xmm5 \n"
        /* interleave results from the two sides before storing them */
        "movaps %%xmm4, %%xmm6 \n"
        "unpckhps %%xmm1, %%xmm4 \n"
        "unpcklps %%xmm1, %%xmm6 \n"
        "movaps %%xmm0, %%xmm2 \n"
        "unpcklps %%xmm5, %%xmm0 \n"
        "unpckhps %%xmm5, %%xmm2 \n"
        "movaps %%xmm6, (%2,%1,2) \n"
        "movaps %%xmm4, 16(%2,%1,2) \n"
        "movaps %%xmm0, (%2,%0,2) \n"
        "movaps %%xmm2, 16(%2,%0,2) \n"
        "sub $16, %1 \n"
        /* the add on j is last so that jl tests j, not k */
        "add $16, %0 \n"
        "jl 1b \n"
        :"+&r"(j), "+&r"(k)
        :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
        :"memory"
    );
}
00174
00175 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
00176 {
00177 x86_reg j, k;
00178 long n = 1 << s->mdct_bits;
00179 long n4 = n >> 2;
00180
00181 ff_imdct_half_sse(s, output+n4, input);
00182
00183 j = -n;
00184 k = n-16;
00185 __asm__ volatile(
00186 "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
00187 "1: \n"
00188 "movaps (%2,%1), %%xmm0 \n"
00189 "movaps (%3,%0), %%xmm1 \n"
00190 "shufps $0x1b, %%xmm0, %%xmm0 \n"
00191 "shufps $0x1b, %%xmm1, %%xmm1 \n"
00192 "xorps %%xmm7, %%xmm0 \n"
00193 "movaps %%xmm1, (%3,%1) \n"
00194 "movaps %%xmm0, (%2,%0) \n"
00195 "sub $16, %1 \n"
00196 "add $16, %0 \n"
00197 "jl 1b \n"
00198 :"+r"(j), "+r"(k)
00199 :"r"(output+n4), "r"(output+n4*3)
00200 );
00201 }
00202