[FFmpeg-devel] [PATCH 3/4] mips: Optimization of fixed point FFT

Nedeljko Babic nbabic at mips.com
Tue Sep 25 16:10:56 CEST 2012


Signed-off-by: Nedeljko Babic <nbabic at mips.com>
---
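Notes (for review only, not part of the commit message):

All of the DSPr2 blocks follow the same pattern: two Q15 halfwords are
packed into one 32-bit register with "append", and mulsaq_s.w.ph /
dpaqx_s.w.ph then produce the real and imaginary parts of a complex
multiply in the DSP accumulators.  A rough scalar C equivalent of one
such step, for reference only (it ignores the exact saturation and
rounding behaviour of the DSP instructions and is not a code path used
by this patch):

    #include <stdint.h>

    /* (a_re + j*a_im) * (w_cos + j*w_sin) in Q15, round-to-nearest */
    static inline void cmul_q15(int16_t *re, int16_t *im,
                                int16_t a_re, int16_t a_im,
                                int16_t w_cos, int16_t w_sin)
    {
        /* Q15 * Q15 products fit in 32 bits */
        int32_t acc_re = (int32_t)a_re * w_cos - (int32_t)a_im * w_sin;
        int32_t acc_im = (int32_t)a_re * w_sin + (int32_t)a_im * w_cos;

        *re = (int16_t)((acc_re + (1 << 14)) >> 15);
        *im = (int16_t)((acc_im + (1 << 14)) >> 15);
    }

The assembly computes two such multiplies per accumulator group and
folds the final rounding into extr_r.w.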
 libavcodec/fft.c                 |    1 +
 libavcodec/fft.h                 |    1 +
 libavcodec/fft_ac3_fixed.c       |    4 +-
 libavcodec/mips/Makefile         |    1 +
 libavcodec/mips/fft_mips_fixed.c |  562 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 568 insertions(+), 1 deletions(-)
 create mode 100644 libavcodec/mips/fft_mips_fixed.c

diff --git a/libavcodec/fft.c b/libavcodec/fft.c
index e5bdcbd..a6ce1db 100644
--- a/libavcodec/fft.c
+++ b/libavcodec/fft.c
@@ -166,6 +166,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
 #else
     if (CONFIG_MDCT)  s->mdct_calcw = ff_mdct_calcw_c;
     if (ARCH_ARM)     ff_fft_fixed_init_arm(s);
+    if (HAVE_MIPSDSPR2) ff_fft_fixed_init_mips(s);
 #endif
 
     for(j=4; j<=nbits; j++) {
diff --git a/libavcodec/fft.h b/libavcodec/fft.h
index c7d2cfb..01ac4f9 100644
--- a/libavcodec/fft.h
+++ b/libavcodec/fft.h
@@ -143,6 +143,7 @@ void ff_fft_init_mips(FFTContext *s);
 #else
 void ff_fft_fixed_init_arm(FFTContext *s);
 void ff_ac3_fft_init_fixed(FFTContext *s);
+void ff_fft_fixed_init_mips(FFTContext *s);
 #endif
 
 void ff_fft_end(FFTContext *s);
diff --git a/libavcodec/fft_ac3_fixed.c b/libavcodec/fft_ac3_fixed.c
index 2796cb5..53968af 100644
--- a/libavcodec/fft_ac3_fixed.c
+++ b/libavcodec/fft_ac3_fixed.c
@@ -256,10 +256,12 @@ static void ff_fft_fixed_calc_mips(FFTContext *s, FFTComplex *z) {
 void ff_ac3_fft_init_fixed(FFTContext *s) {
 
     int n=0;
-    ff_fft_lut_init(fft_offsets_lut, 0, 1 << 16, &n);
 
 #if CONFIG_MDCT
     s->imdct_half_fixed = ff_imdct_fixed_half_mips;
 #endif /* CONFIG_MDCT */
     s->fft_fixed_calc   = ff_fft_fixed_calc_mips;
+
+    if (HAVE_MIPSDSPR2) ff_fft_fixed_init_mips(s);
+    else ff_fft_lut_init(fft_offsets_lut, 0, 1 << 16, &n);
 }
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 17f6b13..2e6f4b7 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -15,6 +15,7 @@ MIPSFPU-OBJS-$(CONFIG_MPEGAUDIODSP)       += mips/mpegaudiodsp_mips_float.o
 MIPSDSPR1-OBJS-$(CONFIG_MPEGAUDIODSP)     += mips/mpegaudiodsp_mips_fixed.o
 OBJS-$(CONFIG_FFT)                        += mips/fft_init_table.o
 MIPSFPU-OBJS-$(CONFIG_FFT)                += mips/fft_mips.o
+MIPSDSPR2-OBJS-$(CONFIG_FFT)              += mips/fft_mips_fixed.o
 MIPSFPU-OBJS-$(HAVE_INLINE_ASM)           += mips/fmtconvert_mips.o
 OBJS-$(HAVE_INLINE_ASM)                   += mips/dsputil_mips.o
 MIPSDSPR1-OBJS-$(HAVE_INLINE_ASM)         += mips/fmtconvert_mips_fixed.o
diff --git a/libavcodec/mips/fft_mips_fixed.c b/libavcodec/mips/fft_mips_fixed.c
new file mode 100644
index 0000000..299119a
--- /dev/null
+++ b/libavcodec/mips/fft_mips_fixed.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Stanislav Ocovaj (socovaj at mips.com)
+ *           Dragan Mrdjan    (dmrdjan at mips.com)
+ *           Zoran Lukic      (zlukic at mips.com)
+ *           Bojan Zivkovic   (bojan at mips.com)
+ *
+ * Optimization of FFT and MDCT/IMDCT transforms for MIPS fixed-point
+ * architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define CONFIG_FFT_FLOAT 0
+#include "libavcodec/fft.h"
+#include "libavcodec/mips/fft_table.h"
+
+#if HAVE_INLINE_ASM
+static void ff_imdct_fixed_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    int k, n8, n4, n2, n, j, j2;
+    int ax0, ax1, ax2, ax3;
+    const uint16_t *revtab = s->revtab;
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+    const FFTSample *in1, *in2;
+    FFTComplex *z = (FFTComplex *)output;
+
+    FFTSample t0, t1, t2, t3, t01, t11, t21, t31;
+
+    n = 1 << s->mdct_bits;
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    /* pre rotation */
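+    /*
+     * Each block below packs two Q15 halfwords into one 32-bit register
+     * with "append" so that mulsaq_s.w.ph / dpaqx_s.w.ph produce the real
+     * and imaginary parts of the twiddle multiply (in2*cos - in1*sin and
+     * in2*sin + in1*cos) in one accumulator pass; extr_r.w rounds the
+     * accumulators back down to Q15.
+     */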
+    in1 = input;
+    in2 = input + n2 - 1;
+
+    for(k=0; k<n4; k+=4) {
+        int k1 = k * 2;
+        int k2 = k1 + 2;
+
+        __asm__ volatile (
+            "lh             %[ax0],     0(%[in2])                   \n\t"
+            "lh             %[ax1],     0(%[in1])                   \n\t"
+            "lhx            %[ax2],     %[k1](%[tcos])              \n\t"
+            "lhx            %[ax3],     %[k1](%[tsin])              \n\t"
+            "multu          $ac0,       $0,             $0          \n\t"
+            "multu          $ac1,       $0,             $0          \n\t"
+            "append         %[ax0],     %[ax1],         16          \n\t"
+            "append         %[ax2],     %[ax3],         16          \n\t"
+            "multu          $ac2,       $0,             $0          \n\t"
+            "mulsaq_s.w.ph  $ac0,       %[ax0],         %[ax2]      \n\t"
+            "dpaqx_s.w.ph   $ac1,       %[ax0],         %[ax2]      \n\t"
+            "lh             %[ax0],     -4(%[in2])                  \n\t"
+            "lh             %[ax1],     4(%[in1])                   \n\t"
+            "lhx            %[ax2],     %[k2](%[tcos])              \n\t"
+            "lhx            %[ax3],     %[k2](%[tsin])              \n\t"
+            "append         %[ax0],     %[ax1],         16          \n\t"
+            "append         %[ax2],     %[ax3],         16          \n\t"
+            "mulsaq_s.w.ph  $ac2,       %[ax0],         %[ax2]      \n\t"
+            "multu          $ac3,       $0,             $0          \n\t"
+            "dpaqx_s.w.ph   $ac3,       %[ax0],         %[ax2]      \n\t"
+            "extr_r.w       %[t0],      $ac0,           16          \n\t"
+            "extr_r.w       %[t2],      $ac1,           16          \n\t"
+            "extr_r.w       %[t1],      $ac2,           16          \n\t"
+            "extr_r.w       %[t3],      $ac3,           16          \n\t"
+
+            : [ax0] "=&r" (ax0), [ax1] "=&r" (ax1), [ax2] "=&r" (ax2), [ax3] "=&r" (ax3),
+              [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+            : [in1] "r" (in1), [in2] "r" (in2), [tcos] "r" (tcos),
+              [tsin] "r" (tsin), [k1] "r" (k1), [k2] "r" (k2)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+              "$ac3hi", "$ac3lo"
+        );
+
+        j  = revtab[k];
+        j2 = revtab[k+1];
+
+        z[j].re = t0;
+        z[j].im = t2;
+        z[j2].re = t1;
+        z[j2].im = t3;
+
+        k1 += 4;
+        k2 += 4;
+
+        __asm__ volatile (
+            "lh             %[ax0],     -8(%[in2])                  \n\t"
+            "lh             %[ax1],     8(%[in1])                   \n\t"
+            "lhx            %[ax2],     %[k1](%[tcos])              \n\t"
+            "lhx            %[ax3],     %[k1](%[tsin])              \n\t"
+            "multu          $ac0,       $0,             $0          \n\t"
+            "multu          $ac1,       $0,             $0          \n\t"
+            "append         %[ax0],     %[ax1],         16          \n\t"
+            "append         %[ax2],     %[ax3],         16          \n\t"
+            "multu          $ac2,       $0,             $0          \n\t"
+            "mulsaq_s.w.ph  $ac0,       %[ax0],         %[ax2]      \n\t"
+            "dpaqx_s.w.ph   $ac1,       %[ax0],         %[ax2]      \n\t"
+            "lh             %[ax0],     -12(%[in2])                 \n\t"
+            "lh             %[ax1],     12(%[in1])                  \n\t"
+            "lhx            %[ax2],     %[k2](%[tcos])              \n\t"
+            "lhx            %[ax3],     %[k2](%[tsin])              \n\t"
+            "append         %[ax0],     %[ax1],         16          \n\t"
+            "append         %[ax2],     %[ax3],         16          \n\t"
+            "mulsaq_s.w.ph  $ac2,       %[ax0],         %[ax2]      \n\t"
+            "multu          $ac3,       $0,             $0          \n\t"
+            "dpaqx_s.w.ph   $ac3,       %[ax0],         %[ax2]      \n\t"
+            "extr_r.w       %[t0],      $ac0,           16          \n\t"
+            "extr_r.w       %[t2],      $ac1,           16          \n\t"
+            "extr_r.w       %[t1],      $ac2,           16          \n\t"
+            "extr_r.w       %[t3],      $ac3,           16          \n\t"
+
+            : [ax0] "=&r" (ax0), [ax1] "=&r" (ax1), [ax2] "=&r" (ax2), [ax3] "=&r" (ax3),
+              [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+            : [in1] "r" (in1), [in2] "r" (in2), [tcos] "r" (tcos),
+              [tsin] "r" (tsin), [k1] "r" (k1), [k2] "r" (k2)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+              "$ac3hi", "$ac3lo"
+        );
+
+        j  = revtab[k+2];
+        j2 = revtab[k+3];
+
+        z[j ].re = t0;
+        z[j ].im = t2;
+        z[j2].re = t1;
+        z[j2].im = t3;
+        in1 += 8;
+        in2 -= 8;
+    }
+
+    s->fft_fixed_calc(s, z);
+
+    /* post rotation + reordering */
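+    /*
+     * Same packed complex multiply as in the pre-rotation, walking the
+     * buffer from both ends (n8-k-1 downwards, n8+k upwards) so that the
+     * reordered result can be written back in place.
+     */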
+
+    for(k=0; k<n8; k+=2) {
+        int k1 = 2 * (n8 - k - 1), k2 = k1 - 2;
+        int k11 = 2 * (n8 + k), k21 = k11 + 2;
+        in1 = (const FFTSample*)(z + (n8 - k - 1));
+        in2 = (const FFTSample*)(z + (n8 + k));
+
+        __asm__ volatile (
+            "lh             %[ax0],     2(%[in1])                   \n\t"
+            "lh             %[ax1],     0(%[in1])                   \n\t"
+            "lhx            %[ax2],     %[k1](%[tsin])              \n\t"
+            "lhx            %[ax3],     %[k1](%[tcos])              \n\t"
+            "multu          $ac0,       $0,             $0          \n\t"
+            "multu          $ac1,       $0,             $0          \n\t"
+            "append         %[ax0],     %[ax1],         16          \n\t"
+            "append         %[ax2],     %[ax3],         16          \n\t"
+            "mulsaq_s.w.ph  $ac0,       %[ax0],         %[ax2]      \n\t"
+            "dpaqx_s.w.ph   $ac1,       %[ax0],         %[ax2]      \n\t"
+            "lh             %[ax0],     -2(%[in1])                  \n\t"
+            "lh             %[ax1],     -4(%[in1])                  \n\t"
+            "lhx            %[ax2],     %[k2](%[tsin])              \n\t"
+            "lhx            %[ax3],     %[k2](%[tcos])              \n\t"
+            "append         %[ax0],     %[ax1],         16          \n\t"
+            "append         %[ax2],     %[ax3],         16          \n\t"
+            "multu          $ac2,       $0,             $0          \n\t"
+            "mulsaq_s.w.ph  $ac2,       %[ax0],         %[ax2]      \n\t"
+            "multu          $ac3,       $0,             $0          \n\t"
+            "dpaqx_s.w.ph   $ac3,       %[ax0],         %[ax2]      \n\t"
+            "extr_r.w       %[t0],      $ac0,           16          \n\t"
+            "extr_r.w       %[t2],      $ac1,           16          \n\t"
+            "extr_r.w       %[t1],      $ac2,           16          \n\t"
+            "extr_r.w       %[t3],      $ac3,           16          \n\t"
+
+            : [ax0] "=&r" (ax0), [ax1] "=&r" (ax1), [ax2] "=&r" (ax2), [ax3] "=&r" (ax3),
+              [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+            : [in1] "r" (in1), [k1] "r" (k1), [k2] "r" (k2),
+              [tsin] "r" (tsin), [tcos] "r" (tcos)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+              "$ac3hi", "$ac3lo"
+        );
+
+        __asm__ volatile (
+            "lh             %[ax0],     2(%[in2])                   \n\t"
+            "lh             %[ax1],     0(%[in2])                   \n\t"
+            "lhx            %[ax2],     %[k11](%[tsin])             \n\t"
+            "lhx            %[ax3],     %[k11](%[tcos])             \n\t"
+            "multu          $ac0,       $0,             $0          \n\t"
+            "multu          $ac1,       $0,             $0          \n\t"
+            "append         %[ax0],     %[ax1],         16          \n\t"
+            "append         %[ax2],     %[ax3],         16          \n\t"
+            "mulsaq_s.w.ph  $ac0,       %[ax0],         %[ax2]      \n\t"
+            "dpaqx_s.w.ph   $ac1,       %[ax0],         %[ax2]      \n\t"
+            "lh             %[ax0],     6(%[in2])                   \n\t"
+            "lh             %[ax1],     4(%[in2])                   \n\t"
+            "lhx            %[ax2],     %[k21](%[tsin])             \n\t"
+            "lhx            %[ax3],     %[k21](%[tcos])             \n\t"
+            "append         %[ax0],     %[ax1],         16          \n\t"
+            "append         %[ax2],     %[ax3],         16          \n\t"
+            "multu          $ac2,       $0,             $0          \n\t"
+            "mulsaq_s.w.ph  $ac2,       %[ax0],         %[ax2]      \n\t"
+            "multu          $ac3,       $0,             $0          \n\t"
+            "dpaqx_s.w.ph   $ac3,       %[ax0],         %[ax2]      \n\t"
+            "extr_r.w       %[t01],     $ac0,           16          \n\t"
+            "extr_r.w       %[t21],     $ac1,           16          \n\t"
+            "extr_r.w       %[t11],     $ac2,           16          \n\t"
+            "extr_r.w       %[t31],     $ac3,           16          \n\t"
+
+            : [ax0] "=&r" (ax0), [ax1] "=&r" (ax1), [ax2] "=&r" (ax2), [ax3] "=&r" (ax3),
+              [t01] "=&r" (t01), [t11] "=&r" (t11), [t21] "=&r" (t21), [t31] "=&r" (t31)
+            : [in2] "r" (in2), [k11] "r" (k11), [k21] "r" (k21),
+              [tsin] "r" (tsin), [tcos] "r" (tcos)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+              "$ac3hi", "$ac3lo"
+        );
+
+        z[n8-k-1].re = t0;
+        z[n8+k  ].im = t2;
+        z[n8-k-1].im = t21;
+        z[n8+k  ].re = t01;
+
+        z[n8-k-2].re = t1;
+        z[n8+k+1].im = t3;
+        z[n8-k-2].im = t31;
+        z[n8+k+1].re = t11;
+    }
+}
+
+static void ff_fft_fixed_calc_mips(FFTContext *s, FFTComplex *z)
+{
+
+    int nbits, i, n, num_transforms, offset, step;
+    int n4, n2, n34;
+    FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    int step2;
+    int temp1, temp2, temp3, temp4;
+    int z0, z1, z2, z3;
+    int t12, t34, t56, t78, t0a, t1a, t2a, t3a;
+    int in1, in2, in3, in4;
+    FFTComplex *tmpz, *addr1, *addr2, *addr3;
+    int w_re, w_im;
+    FFTSample *w_re_ptr, *w_im_ptr;
+    int pom;
+    const int fft_size = (1 << s->nbits);
+
+    FFTComplex *tmpz_n2, *tmpz_n34, *tmpz_n4;
+    FFTComplex *tmpz_n2_i, *tmpz_n34_i, *tmpz_n4_i, *tmpz_i;
+
+    int z_re_n2, z_im_n2, z_re_n34, z_im_n34, z_re, z_im, z_re_n4, z_im_n4;
+
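+    /* Number of 4-point base transforms in the split-radix decomposition;
+       0x2aab is the count for a 2^16-point FFT, shifted down for smaller
+       sizes (same scheme as the float fft_mips.c). */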
+    num_transforms = (0x2aab >> (16 - s->nbits)) | 1;
+    for (n=0; n<num_transforms; n++)
+    {
+        offset = fft_offsets_lut[n] << 2;
+        tmpz = z + offset;
+
+        /* fft4 */
+        __asm__ volatile (
+            "lw         %[z0],      0(%[tmpz])              \n\t"
+            "lw         %[z1],      4(%[tmpz])              \n\t"
+            "lw         %[z2],      8(%[tmpz])              \n\t"
+            "lw         %[z3],      12(%[tmpz])             \n\t"
+            "addq.ph    %[t12],     %[z0],      %[z1]       \n\t"
+            "subq.ph    %[t34],     %[z0],      %[z1]       \n\t"
+            "addq.ph    %[t56],     %[z2],      %[z3]       \n\t"
+            "subq.ph    %[t78],     %[z2],      %[z3]       \n\t"
+            "addq.ph    %[t0a],     %[t12],     %[t56]      \n\t"
+            "packrl.ph  %[t78],     %[t78],     %[t78]      \n\t"
+            "subq.ph    %[t2a],     %[t12],     %[t56]      \n\t"
+            "addq.ph    %[t1a],     %[t34],     %[t78]      \n\t"
+            "subq.ph    %[t3a],     %[t34],     %[t78]      \n\t"
+            "packrl.ph  %[t1a],     %[t1a],     %[t1a]      \n\t"
+            "packrl.ph  %[t3a],     %[t3a],     %[t3a]      \n\t"
+            "sw         %[t0a],     0(%[tmpz])              \n\t"
+            "packrl.ph  %[z1],      %[t1a],     %[t3a]      \n\t"
+            "packrl.ph  %[z3],      %[t3a],     %[t1a]      \n\t"
+            "sw         %[t2a],     8(%[tmpz])              \n\t"
+            "sw         %[z3],      4(%[tmpz])              \n\t"
+            "sw         %[z1],      12(%[tmpz])             \n\t"
+
+            : [z0] "=&r" (z0), [z1] "=&r" (z1), [t12] "=&r" (t12),
+              [z2] "=&r" (z2), [z3] "=&r" (z3), [t34] "=&r" (t34),
+              [t56] "=&r" (t56), [t78] "=&r" (t78), [t0a] "=&r" (t0a),
+              [t1a] "=&r" (t1a), [t2a] "=&r" (t2a), [t3a] "=&r" (t3a)
+            : [tmpz] "r" (tmpz)
+            : "memory"
+        );
+    }
+
+    if (fft_size < 8)
+        return;
+
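+    /* cos(pi/4) in Q15: 23170 = round(sqrt(1/2) * 32768), the twiddle
+       used by the odd-index fft8 butterflies */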
+    pom = 23170;
+
+    num_transforms = (num_transforms >> 1) | 1;
+    for (n=0; n<num_transforms; n++)
+    {
+        offset = fft_offsets_lut[n] << 3;
+        tmpz = z + offset;
+
+        /* fft8 */
+        __asm__ volatile (
+            "lw         %[in1],     16(%[tmpz])             \t\n"
+            "lw         %[in2],     20(%[tmpz])             \t\n"
+            "lw         %[in3],     24(%[tmpz])             \t\n"
+            "lw         %[in4],     28(%[tmpz])             \t\n"
+            "addq.ph    %[temp1],   %[in1],     %[in2]      \t\n"
+            "subq.ph    %[temp3],   %[in1],     %[in2]      \t\n"
+            "seh        %[tmp1],    %[temp1]                \t\n"
+            "sra        %[temp1],   %[temp1],   16          \t\n"
+            "seh        %[tmp2],    %[temp1]                \t\n"
+            "addq.ph    %[temp2],   %[in3],     %[in4]      \t\n"
+            "subq.ph    %[temp4],   %[in3],     %[in4]      \t\n"
+            "seh        %[tmp3],    %[temp2]                \t\n"
+            "sra        %[temp2],   %[temp2],   16          \t\n"
+            "seh        %[tmp4],    %[temp2]                \t\n"
+            "add        %[tmp5],    %[tmp1],    %[tmp3]     \t\n"
+            "sub        %[tmp7],    %[tmp1],    %[tmp3]     \t\n"
+            "add        %[tmp6],    %[tmp2],    %[tmp4]     \t\n"
+            "sub        %[tmp8],    %[tmp2],    %[tmp4]     \t\n"
+            "seh        %[tmp1],    %[temp3]                \t\n"
+            "sra        %[temp3],   %[temp3],   16          \t\n"
+            "seh        %[tmp2],    %[temp3]                \t\n"
+            "seh        %[tmp3],    %[temp4]                \t\n"
+            "sra        %[temp4],   %[temp4],   16          \t\n"
+            "seh        %[tmp4],    %[temp4]                \t\n"
+            "lw         %[in1],     0(%[tmpz])              \t\n"
+            "move       %[temp1],   %[tmp6]                 \t\n"
+            "append     %[temp1],   %[tmp5],    16          \t\n"
+            "subq.ph    %[temp3],   %[in1],     %[temp1]    \t\n"
+            "addq.ph    %[temp4],   %[in1],     %[temp1]    \t\n"
+            "sw         %[temp3],   16(%[tmpz])             \t\n"
+            "sw         %[temp4],   0(%[tmpz])              \t\n"
+            "lw         %[in2],     8(%[tmpz])              \t\n"
+            "negu       %[temp1],   %[tmp7]                 \t\n"
+            "append     %[temp1],   %[tmp8],    16          \t\n"
+            "subq.ph    %[temp2],   %[in2],     %[temp1]    \t\n"
+            "addq.ph    %[temp3],   %[in2],     %[temp1]    \t\n"
+            "sw         %[temp2],   24(%[tmpz])             \t\n"
+            "sw         %[temp3],   8(%[tmpz])              \t\n"
+            "add        %[tmp5],    %[tmp1],    %[tmp2]     \t\n"
+            "mul        %[tmp5],    %[tmp5],    %[pom]      \t\n"
+            "sub        %[tmp6],    %[tmp2],    %[tmp1]     \t\n"
+            "mul        %[tmp6],    %[tmp6],    %[pom]      \t\n"
+            "sub        %[tmp7],    %[tmp3],    %[tmp4]     \t\n"
+            "mul        %[tmp7],    %[tmp7],    %[pom]      \t\n"
+            "add        %[tmp8],    %[tmp3],    %[tmp4]     \t\n"
+            "mul        %[tmp8],    %[tmp8],    %[pom]      \t\n"
+            "shra_r.w   %[tmp5],    %[tmp5],    15          \t\n"
+            "lw         %[in1],     4(%[tmpz])              \t\n"
+            "shra_r.w   %[tmp6],    %[tmp6],    15          \t\n"
+            "lw         %[in2],     12(%[tmpz])             \t\n"
+            "shra_r.w   %[tmp7],    %[tmp7],    15          \t\n"
+            "add        %[tmp1],    %[tmp5],    %[tmp7]     \t\n"
+            "shra_r.w   %[tmp8],    %[tmp8],    15          \t\n"
+            "add        %[tmp2],    %[tmp6],    %[tmp8]     \t\n"
+            "sub        %[tmp3],    %[tmp5],    %[tmp7]     \t\n"
+            "sub        %[tmp4],    %[tmp6],    %[tmp8]     \t\n"
+            "move       %[temp1],   %[tmp2]                 \t\n"
+            "append     %[temp1],   %[tmp1],    16          \t\n"
+            "subq.ph    %[temp2],   %[in1],     %[temp1]    \t\n"
+            "addq.ph    %[temp3],   %[in1],     %[temp1]    \t\n"
+            "sw         %[temp2],   20(%[tmpz])             \t\n"
+            "sw         %[temp3],   4(%[tmpz])              \t\n"
+            "negu       %[temp1],   %[tmp3]                 \t\n"
+            "append     %[temp1],   %[tmp4],    16          \t\n"
+            "subq.ph    %[temp2],   %[in2],     %[temp1]    \t\n"
+            "addq.ph    %[temp3],   %[in2],     %[temp1]    \t\n"
+            "sw         %[temp2],   28(%[tmpz])             \t\n"
+            "sw         %[temp3],   12(%[tmpz])             \t\n"
+
+            : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3),
+              [tmp4] "=&r" (tmp4), [tmp5] "=&r" (tmp5), [tmp6] "=&r" (tmp6),
+              [tmp7] "=&r" (tmp7), [tmp8] "=&r" (tmp8), [temp1] "=&r" (temp1),
+              [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp4] "=&r" (temp4),
+              [in1] "=&r" (in1), [in2] "=&r" (in2), [in3] "=&r" (in3),
+              [in4] "=&r" (in4)
+            : [tmpz] "r" (tmpz), [pom] "r" (pom)
+            : "memory"
+        );
+    }
+
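+    /* Stride through the ff_cos_65536_fixed twiddle table; it halves on
+       every pass as the transform length doubles. */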
+    step = 1 << (MAX_LOG2_NFFT - 4);
+    n4 = 4;
+
+    for (nbits=4; nbits<=s->nbits; nbits++)
+    {
+        n2  = 2*n4;
+        n34 = 3*n4;
+        num_transforms = (num_transforms >> 1) | 1;
+        for (n=0; n<num_transforms; n++)
+        {
+            offset = fft_offsets_lut[n] << nbits;
+            tmpz = z + offset;
+
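+            /* i == 0 butterfly: the twiddle is (1, 0), so the complex
+               multiplies reduce to packed adds/subs, with packrl.ph
+               halfword swaps standing in for multiplication by ±j. */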
+            __asm__ volatile (
+                "sll        %[z0],      %[n2],          2           \n\t"
+                "sll        %[z1],      %[n34],         2           \n\t"
+                "sll        %[z2],      %[n4],          2           \n\t"
+                "addu       %[addr1],   %[tmpz],        %[z0]       \n\t"
+                "addu       %[addr2],   %[tmpz],        %[z1]       \n\t"
+                "addu       %[addr3],   %[tmpz],        %[z2]       \n\t"
+                "lw         %[z0],      0(%[addr1])                 \n\t"
+                "lw         %[z1],      0(%[addr2])                 \n\t"
+                "lw         %[z2],      0(%[tmpz])                  \n\t"
+                "sll        %[step2],   %[step],        2           \n\t"
+                "lw         %[z3],      0(%[addr3])                 \n\t"
+                "addq.ph    %[t56],     %[z0],          %[z1]       \n\t"
+                "subq.ph    %[t12],     %[z0],          %[z1]       \n\t"
+                "addq.ph    %[t0a],     %[z2],          %[t56]      \n\t"
+                "packrl.ph  %[z3],      %[z3],          %[z3]       \n\t"
+                "subq.ph    %[t2a],     %[z2],          %[t56]      \n\t"
+                "addq.ph    %[t1a],     %[z3],          %[t12]      \n\t"
+                "subq.ph    %[t3a],     %[z3],          %[t12]      \n\t"
+                "sw         %[t0a],     0(%[tmpz])                  \n\t"
+                "sw         %[t2a],     0(%[addr1])                 \n\t"
+                "packrl.ph  %[z0],      %[t1a],         %[t3a]      \n\t"
+                "packrl.ph  %[z1],      %[t3a],         %[t1a]      \n\t"
+                "sw         %[z0],      0(%[addr2])                 \n\t"
+                "sw         %[z1],      0(%[addr3])                 \n\t"
+
+                : [z0] "=&r" (z0), [z1] "=&r" (z1), [t12] "=&r" (t12),
+                  [z2] "=&r" (z2), [z3] "=&r" (z3), [step2] "=&r" (step2),
+                  [t56] "=&r" (t56), [t0a] "=&r" (t0a), [t1a] "=&r" (t1a),
+                  [t2a] "=&r" (t2a), [t3a] "=&r" (t3a), [addr1] "=&r" (addr1),
+                  [addr2] "=&r" (addr2), [addr3] "=&r" (addr3)
+                : [n2] "r" (n2), [n34] "r" (n34), [n4] "r" (n4), [tmpz] "r" (tmpz),
+                  [step] "r" (step)
+                : "memory"
+            );
+
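+            /* Twiddles: cosines are read forwards from the table, sines
+               backwards, using sin(x) = cos(pi/2 - x). */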
+            w_re_ptr = (FFTSample*)(ff_cos_65536_fixed + step);
+            w_im_ptr = (FFTSample*)(ff_cos_65536_fixed + MAX_FFT_SIZE/4 - step);
+
+            for (i=1; i<n4; i++)
+            {
+                w_re = w_re_ptr[0];
+                w_im = w_im_ptr[0];
+
+                tmpz_n2  = tmpz + n2;
+                tmpz_n4  = tmpz + n4;
+                tmpz_n34 = tmpz + n34;
+
+                tmpz_n2_i  = tmpz_n2  + i;
+                tmpz_n4_i  = tmpz_n4  + i;
+                tmpz_n34_i = tmpz_n34 + i;
+                tmpz_i     = tmpz     + i;
+
+                __asm__ volatile (
+                    "lh         %[z_re_n2],     0(%[tmpz_n2_i])                     \n\t"
+                    "lh         %[z_im_n2],     2(%[tmpz_n2_i])                     \n\t"
+                    "lh         %[z_re_n34],    0(%[tmpz_n34_i])                    \n\t"
+                    "lh         %[z_im_n34],    2(%[tmpz_n34_i])                    \n\t"
+                    "mult       $ac0,           %[w_re],            %[z_re_n2]      \n\t"
+                    "mult       $ac2,           %[w_re],            %[z_re_n34]     \n\t"
+                    "mult       $ac1,           %[w_re],            %[z_im_n2]      \n\t"
+                    "mult       $ac3,           %[w_re],            %[z_im_n34]     \n\t"
+                    "madd       $ac0,           %[w_im],            %[z_im_n2]      \n\t"
+                    "msub       $ac2,           %[w_im],            %[z_im_n34]     \n\t"
+                    "msub       $ac1,           %[w_im],            %[z_re_n2]      \n\t"
+                    "madd       $ac3,           %[w_im],            %[z_re_n34]     \n\t"
+                    "lh         %[z_re],        0(%[tmpz_i])                        \n\t"
+                    "extr_r.w   %[tmp1],        $ac0, 15                            \n\t"
+                    "extr_r.w   %[tmp3],        $ac2, 15                            \n\t"
+                    "extr_r.w   %[tmp2],        $ac1, 15                            \n\t"
+                    "extr_r.w   %[tmp4],        $ac3, 15                            \n\t"
+                    "lh         %[z_im],        2(%[tmpz_i])                        \n\t"
+                    "lh         %[z_re_n4],     0(%[tmpz_n4_i])                     \n\t"
+                    "lh         %[z_im_n4],     2(%[tmpz_n4_i])                     \n\t"
+                    "add        %[tmp5],        %[tmp1],            %[tmp3]         \n\t"
+                    "sub        %[tmp1],        %[tmp1],            %[tmp3]         \n\t"
+                    "add        %[tmp6],        %[tmp2],            %[tmp4]         \n\t"
+                    "sub        %[tmp2],        %[tmp2],            %[tmp4]         \n\t"
+                    "subq_s.ph  %[z_re_n2],     %[z_re],            %[tmp5]         \n\t"
+                    "addq_s.ph  %[z_re],        %[z_re],            %[tmp5]         \n\t"
+                    "subq_s.ph  %[z_im_n2],     %[z_im],            %[tmp6]         \n\t"
+                    "addq_s.ph  %[z_im],        %[z_im],            %[tmp6]         \n\t"
+                    "sh         %[z_re_n2],     0(%[tmpz_n2_i])                     \n\t"
+                    "sh         %[z_re],        0(%[tmpz_i])                        \n\t"
+                    "sh         %[z_im_n2],     2(%[tmpz_n2_i])                     \n\t"
+                    "sh         %[z_im],        2(%[tmpz_i])                        \n\t"
+                    "subq_s.ph  %[z_re_n34],    %[z_re_n4],         %[tmp2]         \n\t"
+                    "addq_s.ph  %[z_re_n4],     %[z_re_n4],         %[tmp2]         \n\t"
+                    "addq_s.ph  %[z_im_n34],    %[z_im_n4],         %[tmp1]         \n\t"
+                    "subq_s.ph  %[z_im_n4],     %[z_im_n4],         %[tmp1]         \n\t"
+                    "sh         %[z_re_n34],    0(%[tmpz_n34_i])                    \n\t"
+                    "sh         %[z_re_n4],     0(%[tmpz_n4_i])                     \n\t"
+                    "sh         %[z_im_n34],    2(%[tmpz_n34_i])                    \n\t"
+                    "sh         %[z_im_n4],     2(%[tmpz_n4_i])                     \n\t"
+
+                    : [z_re_n2] "=&r" (z_re_n2), [z_re] "=&r" (z_re), [z_im] "=&r" (z_im),
+                      [z_im_n2] "=&r" (z_im_n2), [z_re_n34] "=&r" (z_re_n34),
+                      [z_im_n4] "=&r" (z_im_n4), [z_re_n4] "=&r" (z_re_n4),
+                      [z_im_n34] "=&r" (z_im_n34), [tmp1] "=&r" (tmp1),
+                      [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
+                      [tmp5] "=&r" (tmp5), [tmp6] "=&r" (tmp6)
+                    : [w_re] "r" (w_re), [w_im] "r" (w_im), [tmpz_n2_i] "r" (tmpz_n2_i),
+                      [tmpz_n34_i] "r" (tmpz_n34_i), [tmpz_n4_i] "r" (tmpz_n4_i),
+                      [tmpz_i] "r" (tmpz_i)
+                    : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+                      "$ac3hi", "$ac3lo"
+                );
+                w_re_ptr += step;
+                w_im_ptr -= step;
+            }
+        }
+        step >>= 1;
+        n4   <<= 1;
+    }
+}
+#endif /* HAVE_INLINE_ASM */
+
+void ff_fft_fixed_init_mips(FFTContext *s)
+{
+#if HAVE_INLINE_ASM
+    int n = 0;
+    ff_fft_lut_init(fft_offsets_lut, 0, 1 << 16, &n);
+
+#if CONFIG_MDCT
+    s->imdct_half_fixed = ff_imdct_fixed_half_mips;
+#endif /* CONFIG_MDCT */
+    s->fft_fixed_calc   = ff_fft_fixed_calc_mips;
+#endif /* HAVE_INLINE_ASM */
+}
-- 
1.7.3.4


