[FFmpeg-devel] Fixpoint FFT optimization, with MDCT and IMDCT wrappers for audio optimization

Michael Niedermayer michaelni
Sat Jul 28 03:08:41 CEST 2007


Hi

On Fri, Jul 27, 2007 at 05:40:14PM -0400, mmh wrote:
[...]
> +/*
> +  This is a fixedpoint inplace 32bit FFT which accepts 3 arguments:
> +
> +  @param X   - input signal in format 1.31
> +  @param W   - phase factors in 1.31 format
> +  @param lgN - log_2(N) where N is the size of the input data set.
> +
> +*/
> +void ff_fft32 (FFTContext *c, FFTComplex32 *X)
> +{
> +    FFTComplex16 *W = c->exptab16;
> +    int lgN    = c->nbits;
> +    int N      = 1<<lgN;
> +    int s, j, k, m, w, ws;
> +    int64_t wwr,wwi;
> +
> +    for (s=1; s<=lgN; s++) {
> +        m  = 1<<s;
> +        ws = 1<<(lgN-s);
> +        w=0;
> +        for (j=0; j<m/2; j++) {
> +            wwr=W[w].re;
> +            wwi=W[w].im;
> +            w+=ws;
> +            for (k=j; k<N; k+=m) {
> +                long tr,ti;
> +                int k2 = k+m/2;

signed and /2 is slow either changed to unsiged or use >>1


> +
> +                tr        = (X[k2].re*wwr - X[k2].im*wwi + 0x4000)>>15;
> +                ti        = (X[k2].re*wwi + X[k2].im*wwr + 0x4000)>>15;
> +
> +                X[k2].re  = (X[k].re - tr);
> +                X[k2].im  = (X[k].im - ti);
> +
> +                X[k].re   = (X[k].re + tr);
> +                X[k].im   = (X[k].im + ti);
> +            }
> +        }
> +    }
> +}

what about implementing a split radix fft?
(http://en.wikipedia.org/wiki/Split-radix_FFT)

also several iterations have trivial (1/-1/0) w* these should not be done
in this generic way


> +
> +/*
> +  use same number of bits of precision as floating point.
> +*/
> +#define PRECISION 23
> +#define HALF (1<<PRECISION)
> +void ff_fft32x32 (FFTContext *c, FFTComplex32 *X)
> +{
> +    FFTComplex32 *W = c->exptab32;
> +    int lgN = c->nbits;
> +    int N   = 1<<lgN;
> +    int s, j, k, m, w, ws;
> +    int64_t wwr,wwi;
> +
> +
> +    for (s=1; s<=lgN; s++) {
> +        m  = 1<<s;
> +        ws = 1<<(lgN-s);
> +        w=0;
> +        for (j=0; j<m/2; j++) {
> +            wwr=W[w].re;
> +            wwi=W[w].im;
> +            w+=ws;
> +            for (k=j; k<N; k+=m) {
> +                long tr,ti;
> +                int k2 = k+m/2;
> +
> +                tr        = (X[k2].re*wwr - X[k2].im*wwi + HALF)>>PRECISION;
> +                ti        = (X[k2].re*wwi + X[k2].im*wwr + HALF)>>PRECISION;

what about a 

re+=re;
im+=im;
tr= ((re*wwr)>>32) - ((im*wwi)>>32);
ti= ((re*wwi)>>32) + ((im*wwr)>>32);

this should be much faster if a 32*32->high 32 instruction is available



> +
> +                X[k2].re  = (X[k].re - tr);
> +                X[k2].im  = (X[k].im - ti);
> +
> +                X[k].re   = (X[k].re + tr);
> +                X[k].im   = (X[k].im + ti);
> +            }
> +        }
> +    }
> +}
> +
> +
> +

> +  

trailing whitespace


> +/*
> +  Generate 16b coefficient table from a predefined floating point table.
> +*/

not a proper doxygen comment


> +static FFTComplex16 *stwids (FFTComplex *w, int n) {
> +    int i;
> +    FFTComplex16 *v = av_malloc (sizeof (short)*n);
> +    for (i = 0; i < n/2; i++) {
> +        v[i].re = w[i].re*32767;
> +        v[i].im = w[i].im*32767;
> +    }

this should be 32768 with proper cliping


[...]
> +  s->fft_calc = (void *)ff_fft32;

unneeded cast or your code is buggy


> +}

inconsistant indention 


[...]
> +/* complex multiplication: p = a * b */
> +#define CMUL(pre, pim, are, aim, bre, bim) \
> +{\
> +    int32_t _are = (are);\
> +    int32_t _aim = (aim);\
> +    int64_t _bre = (bre);\
> +    int64_t _bim = (bim);\
> +    (pre) = (_are * _bre - _aim * _bim + 0x4000)>>15;\
> +    (pim) = (_are * _bim + _aim * _bre + 0x4000)>>15;\

stuff begining with _ is reserved


[...]

> -
> +#undef random
>  float frandom(void)

doesnt belong in this patch

[...]
> Index: libavcodec/fft.c
> ===================================================================
> --- libavcodec/fft.c	(revision 9807)
> +++ libavcodec/fft.c	(working copy)
> @@ -109,7 +109,7 @@
>                  }
>                  nblocks = nblocks >> 1;
>              } while (nblocks != 0);
> -            av_freep(&s->exptab);
> +//            av_freep(&s->exptab);
>          }

hmm

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Democracy is the form of government in which you can choose your dictator
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20070728/ace41a30/attachment.pgp>



More information about the ffmpeg-devel mailing list