[FFmpeg-devel] [PATCH 2/4] mips: Optimization of AC3 fixed point decoder

Tue Sep 25 16:10:55 CEST 2012

Signed-off-by: Nedeljko Babic <nbabic at mips.com>
---
 libavcodec/dsputil.c                    |    2 +-
 libavcodec/fmtconvert.c                 |    1 +
 libavcodec/fmtconvert.h                 |    1 +
 libavcodec/mips/Makefile                |    3 +-
 libavcodec/mips/dsputil_mips.c          |  102 ++++++++++++++
 libavcodec/mips/fmtconvert_mips_fixed.c |  226 +++++++++++++++++++++++++++++++
 6 files changed, 333 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/mips/fmtconvert_mips_fixed.c

diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index f813bb8..9e06050 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -3188,7 +3188,7 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
     if (HAVE_MMI)        ff_dsputil_init_mmi   (c, avctx);
     if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
     if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
-    if (HAVE_MIPSFPU)    ff_dsputil_init_mips  (c, avctx);
+    if (ARCH_MIPS)       ff_dsputil_init_mips  (c, avctx);
 
     for (i = 0; i < 4; i++) {
         for (j = 0; j < 16; j++) {
diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
index 951a2e5..c3b4544 100644
--- a/libavcodec/fmtconvert.c
+++ b/libavcodec/fmtconvert.c
@@ -159,6 +159,7 @@ av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
     if (HAVE_ALTIVEC) ff_fmt_convert_init_altivec(c, avctx);
     if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx);
     if (HAVE_MIPSFPU) ff_fmt_convert_init_mips(c);
+    if (HAVE_MIPSDSPR1) ff_fmt_convert_init_mips_fixed(c, avctx);
 }
 
 /* ffdshow custom code */
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
index 8bda1e7..cc088b8 100644
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -145,6 +145,7 @@ void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_altivec(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_mips(FmtConvertContext *c);
+void ff_fmt_convert_init_mips_fixed(FmtConvertContext *c, AVCodecContext *avctx);
 
 /* ffdshow custom code */
 void float_interleave(float *dst, const float **src, long len, int channels);
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index ff46768..17f6b13 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -16,4 +16,5 @@ MIPSDSPR1-OBJS-$(CONFIG_MPEGAUDIODSP)     += mips/mpegaudiodsp_mips_fixed.o
 OBJS-$(CONFIG_FFT)                        += mips/fft_init_table.o
 MIPSFPU-OBJS-$(CONFIG_FFT)                += mips/fft_mips.o
 MIPSFPU-OBJS-$(HAVE_INLINE_ASM)           += mips/fmtconvert_mips.o
-MIPSFPU-OBJS-$(HAVE_INLINE_ASM)           += mips/dsputil_mips.o
+OBJS-$(HAVE_INLINE_ASM)                   += mips/dsputil_mips.o
+MIPSDSPR1-OBJS-$(HAVE_INLINE_ASM)         += mips/fmtconvert_mips_fixed.o
diff --git a/libavcodec/mips/dsputil_mips.c b/libavcodec/mips/dsputil_mips.c
index e46a0a9..4e4ee61 100644
--- a/libavcodec/mips/dsputil_mips.c
+++ b/libavcodec/mips/dsputil_mips.c
@@ -47,6 +47,7 @@
 #include "config.h"
 #include "libavcodec/dsputil.h"
 
+#if HAVE_MIPSFPU
 static void vector_fmul_window_mips(float *dst, const float *src0,
         const float *src1, const float *win, int len)
 {
@@ -157,8 +158,109 @@ static void vector_fmul_window_mips(float *dst, const float *src0,
         );
     }
 }
+#endif
+
+#if HAVE_MIPSDSPR2
+static void vector_fmul_window_mips_fixed(int *dst, const int16_t *src0, const int16_t *src1, const int16_t *win, int len)
+{ /* Fixed-point windowed overlap: fills dst[0..2*len) from src0[0..len), src1[0..len) and win[0..2*len), four i/j pairs per iteration. */
+    int i,j; /* i walks the first half upwards (negative index), j walks the second half downwards */
+    int *dst_i, *dst_j;
+    const int16_t * src0_i, *src1_j;
+    const int16_t *win_i, *win_j;
+    int16_t s0, s01, s02, s03, s1, s11, s12, s13; /* asm scratch; NOTE(review): declared int16_t but stored with sw as full words */
+    int16_t wi, wi1, wi2, wi3, wj, wj1, wj2, wj3; /* asm scratch for window values and dst[j] results */
+
+    dst += len; /* rebase to the midpoint so dst[i] (i < 0) and dst[j] (j >= 0) address the two halves */
+    win += len;
+    src0 += len;
+
+    for(i=-len, j=len-1; i<0; i+=4, j-=4) { /* assumes len is a multiple of 4 -- TODO confirm with callers */
+        dst_i = dst + i;
+        dst_j = dst + j;
+        src0_i = src0 + i;
+        src1_j = src1 + j;
+        win_i = win + i;
+        win_j = win + j;
+
+        __asm__ volatile ( /* instruction semantics noted below follow the MIPS DSP ASE manual; loads/stores of later pairs are interleaved with earlier arithmetic */
+            "lh             %[s0],      0(%[src0_i])                \n\t" /* pair 0: src0[i] */
+            "lh             %[s1],      0(%[src1_j])                \n\t" /* pair 0: src1[j] */
+            "lh             %[wi],      0(%[win_i])                 \n\t" /* pair 0: win[i] */
+            "lh             %[wj],      0(%[win_j])                 \n\t" /* pair 0: win[j] */
+            "append         %[s0],      %[s1],          16          \n\t" /* s0 = (s0 << 16) | (s1 & 0xffff) */
+            "append         %[wj],      %[wi],          16          \n\t" /* wj = (wj << 16) | (wi & 0xffff) */
+            "mult           $ac0,       $0,             $0          \n\t" /* ac0 = 0 * 0: clear accumulator */
+            "mulsaq_s.w.ph  $ac0,       %[s0],          %[wj]       \n\t" /* ac0 = s0*wj - s1*wi (Q15 fractional products) */
+            "mult           $ac1,       $0,             $0          \n\t" /* clear ac1 */
+            "dpaqx_s.w.ph   $ac1,       %[s0],          %[wj]       \n\t" /* ac1 = s0*wi + s1*wj (crossed halves) */
+            "lh             %[s01],     2(%[src0_i])                \n\t" /* pair 1: src0[i+1] */
+            "lh             %[s11],     -2(%[src1_j])               \n\t" /* pair 1: src1[j-1] */
+            "extr_r.w       %[s1],      $ac0,           16          \n\t" /* rounded ac0 >> 16: value for dst[i] */
+            "lh             %[wi1],     2(%[win_i])                 \n\t" /* pair 1: win[i+1] */
+            "lh             %[wj1],     -2(%[win_j])                \n\t" /* pair 1: win[j-1] */
+            "extr_r.w       %[wj],      $ac1,           16          \n\t" /* rounded ac1 >> 16: value for dst[j] */
+            "append         %[s01],     %[s11],         16          \n\t"
+            "append         %[wj1],     %[wi1],         16          \n\t"
+            "mult           $ac2,       $0,             $0          \n\t" /* clear ac2 */
+            "mulsaq_s.w.ph  $ac2,       %[s01],         %[wj1]      \n\t" /* pair 1 difference term */
+            "sw             %[s1],      0(%[dst_i])                 \n\t" /* dst[i] */
+            "sw             %[wj],       0(%[dst_j])                \n\t" /* dst[j] */
+            "mult           $ac3,       $0,             $0          \n\t" /* clear ac3 */
+            "dpaqx_s.w.ph   $ac3,       %[s01],         %[wj1]      \n\t" /* pair 1 sum term */
+            "extr_r.w       %[s11],     $ac2,           16          \n\t"
+            "extr_r.w       %[wj1],     $ac3,           16          \n\t"
+            "lh             %[s02],     4(%[src0_i])                \n\t" /* pair 2: src0[i+2] */
+            "lh             %[s12],     -4(%[src1_j])               \n\t" /* pair 2: src1[j-2] */
+            "lh             %[wi2],     4(%[win_i])                 \n\t" /* pair 2: win[i+2] */
+            "lh             %[wj2],     -4(%[win_j])                \n\t" /* pair 2: win[j-2] */
+            "append         %[s02],     %[s12],         16          \n\t"
+            "append         %[wj2],     %[wi2],         16          \n\t"
+            "mult           $ac0,       $0,             $0          \n\t" /* ac0 is free again: its pair 0 result was extracted above */
+            "mulsaq_s.w.ph  $ac0,       %[s02],         %[wj2]      \n\t" /* pair 2 difference term */
+            "sw             %[s11],     4(%[dst_i])                 \n\t" /* dst[i+1] */
+            "sw             %[wj1],     -4(%[dst_j])                \n\t" /* dst[j-1] */
+            "mult           $ac1,       $0,             $0          \n\t" /* ac1 is free again */
+            "dpaqx_s.w.ph   $ac1,       %[s02],         %[wj2]      \n\t" /* pair 2 sum term */
+            "extr_r.w       %[s12],     $ac0,           16          \n\t"
+            "lh             %[s03],     6(%[src0_i])                \n\t" /* pair 3: src0[i+3] */
+            "lh             %[s13],     -6(%[src1_j])               \n\t" /* pair 3: src1[j-3] */
+            "lh             %[wi3],     6(%[win_i])                 \n\t" /* pair 3: win[i+3] */
+            "lh             %[wj3],     -6(%[win_j])                \n\t" /* pair 3: win[j-3] */
+            "append         %[s03],     %[s13],         16          \n\t"
+            "append         %[wj3],     %[wi3],         16          \n\t"
+            "mult           $ac2,       $0,             $0          \n\t" /* ac2 is free again */
+            "mulsaq_s.w.ph  $ac2,       %[s03],         %[wj3]      \n\t" /* pair 3 difference term */
+            "sw             %[s12],     8(%[dst_i])                 \n\t" /* dst[i+2] */
+            "extr_r.w       %[wj2],     $ac1,           16          \n\t"
+            "mult           $ac3,       $0,             $0          \n\t" /* ac3 is free again */
+            "dpaqx_s.w.ph   $ac3,       %[s03],         %[wj3]      \n\t" /* pair 3 sum term */
+            "extr_r.w       %[s13],     $ac2,           16          \n\t"
+            "extr_r.w       %[wj3],     $ac3,           16          \n\t"
+            "sw             %[wj2],     -8(%[dst_j])                \n\t" /* dst[j-2] */
+            "sw             %[s13],     12(%[dst_i])                \n\t" /* dst[i+3] */
+            "sw             %[wj3],     -12(%[dst_j])               \n\t" /* dst[j-3] */
+
+            : [s0] "=&r" (s0), [s1] "=&r" (s1), [wi] "=&r" (wi), /* all outputs are early-clobber scratch registers */
+              [wj] "=&r" (wj), [s03] "=&r" (s03), [s01] "=&r" (s01),
+              [s11] "=&r" (s11), [wi1] "=&r" (wi1), [wj1] "=&r" (wj1),
+              [s13] "=&r" (s13), [s02] "=&r" (s02), [s12] "=&r" (s12),
+              [wi2] "=&r" (wi2), [wj2] "=&r" (wj2), [wi3] "=&r" (wi3),
+              [wj3] "=&r" (wj3)
+            : [src0_i] "r" (src0_i), [win_j] "r" (win_j ), [src1_j] "r" (src1_j),
+              [win_i] "r" (win_i), [dst_i] "r" (dst_i), [dst_j] "r" (dst_j)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo", /* "hi"/"lo" cover $ac0; dst is written via sw, hence "memory" */
+              "$ac3hi", "$ac3lo"
+        );
+    }
+}
+#endif
 
 av_cold void ff_dsputil_init_mips( DSPContext* c, AVCodecContext *avctx )
 {
+#if HAVE_MIPSFPU /* float window routine is only compiled when the FPU is available */
     c->vector_fmul_window = vector_fmul_window_mips;
+#endif /* HAVE_MIPSFPU */
+#if HAVE_MIPSDSPR2 /* same guard as the definition of vector_fmul_window_mips_fixed */
+    c->vector_fmul_window_fixed = vector_fmul_window_mips_fixed;
+#endif /* HAVE_MIPSDSPR2 */
 }
diff --git a/libavcodec/mips/fmtconvert_mips_fixed.c b/libavcodec/mips/fmtconvert_mips_fixed.c
new file mode 100644
index 0000000..bc3ada0
--- /dev/null
+++ b/libavcodec/mips/fmtconvert_mips_fixed.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Zoran Lukic (zlukic at mips.com)
+ *
+ * Format Conversion Utils optimized for MIPS fixed-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/fmtconvert.c
+ */
+
+#include "libavcodec/fmtconvert.h"
+
+static void int32_to_fixed_fmul_scalar_mips(int16_t *dst, const int *src,
+                                            int mul, int len)
+{ /* dst[k] = low 16 bits of ((src[k] * mul + 0x8000) >> 16), eight elements per asm block. */
+    int i; /* advances by 8: assumes len is a multiple of 8 -- TODO confirm with callers */
+    int temp1, temp3, temp5, temp7, temp9, temp11, temp13, temp15; /* word-sized scratch: they hold lw results and 32-bit products */
+
+    for (i=0; i<len; i+=8) {
+        __asm__ volatile (
+            "lw     %[temp1],   0(%[src_i])         \n\t" /* load src[i..i+7] */
+            "lw     %[temp3],   4(%[src_i])         \n\t"
+            "lw     %[temp5],   8(%[src_i])         \n\t"
+            "lw     %[temp7],   12(%[src_i])        \n\t"
+            "lw     %[temp9],   16(%[src_i])        \n\t"
+            "lw     %[temp11],  20(%[src_i])        \n\t"
+            "lw     %[temp13],  24(%[src_i])        \n\t"
+            "lw     %[temp15],  28(%[src_i])        \n\t"
+            "mul    %[temp1],   %[temp1],   %[mul]  \n\t" /* scale (low 32 bits of the product); mul leaves HI/LO unpredictable */
+            "mul    %[temp3],   %[temp3],   %[mul]  \n\t"
+            "mul    %[temp5],   %[temp5],   %[mul]  \n\t"
+            "mul    %[temp7],   %[temp7],   %[mul]  \n\t"
+            "mul    %[temp9],   %[temp9],   %[mul]  \n\t"
+            "mul    %[temp11],  %[temp11],  %[mul]  \n\t"
+            "mul    %[temp13],  %[temp13],  %[mul]  \n\t"
+            "mul    %[temp15],  %[temp15],  %[mul]  \n\t"
+            "addiu  %[temp1],   %[temp1],   0x8000  \n\t" /* rounding bias: half of 1 << 16 */
+            "addiu  %[temp3],   %[temp3],   0x8000  \n\t"
+            "addiu  %[temp5],   %[temp5],   0x8000  \n\t"
+            "addiu  %[temp7],   %[temp7],   0x8000  \n\t"
+            "addiu  %[temp9],   %[temp9],   0x8000  \n\t"
+            "addiu  %[temp11],  %[temp11],  0x8000  \n\t"
+            "addiu  %[temp13],  %[temp13],  0x8000  \n\t"
+            "addiu  %[temp15],  %[temp15],  0x8000  \n\t"
+            "sra    %[temp1],   %[temp1],   0x10    \n\t" /* arithmetic shift right by 16 */
+            "sra    %[temp3],   %[temp3],   0x10    \n\t"
+            "sra    %[temp5],   %[temp5],   0x10    \n\t"
+            "sra    %[temp7],   %[temp7],   0x10    \n\t"
+            "sra    %[temp9],   %[temp9],   0x10    \n\t"
+            "sra    %[temp11],  %[temp11],  0x10    \n\t"
+            "sra    %[temp13],  %[temp13],  0x10    \n\t"
+            "sra    %[temp15],  %[temp15],  0x10    \n\t"
+            "sh     %[temp1],   0(%[dst_i])         \n\t" /* store the low halfwords to dst[i..i+7] */
+            "sh     %[temp3],   2(%[dst_i])         \n\t"
+            "sh     %[temp5],   4(%[dst_i])         \n\t"
+            "sh     %[temp7],   6(%[dst_i])         \n\t"
+            "sh     %[temp9],   8(%[dst_i])         \n\t"
+            "sh     %[temp11],  10(%[dst_i])        \n\t"
+            "sh     %[temp13],  12(%[dst_i])        \n\t"
+            "sh     %[temp15],  14(%[dst_i])        \n\t"
+
+            : [temp1] "=&r" (temp1),   [temp11] "=&r" (temp11), /* early-clobber: src_i, mul and dst_i are read after outputs are written */
+              [temp13] "=&r" (temp13), [temp15] "=&r" (temp15),
+              [temp3] "=&r" (temp3),   [temp5] "=&r" (temp5),
+              [temp7] "=&r" (temp7),   [temp9] "=&r" (temp9)
+            : [dst_i] "r" (dst+i),  [src_i] "r" (src+i),
+              [mul] "r" (mul)
+            : "memory", "hi", "lo" /* dst is written via sh; the mul instructions may trash HI/LO */
+        );
+    }
+}
+
+static inline int fixed_to_int16_one_mips(const int *src)
+{ /* Converts one fixed-point sample at *src to a sign-extended 16-bit value, with an upper clamp. */
+    int16_t ret;
+    int temp1, temp7, temp8; /* temp1: loaded sample, temp7: compare flag, temp8: threshold */
+    __asm__ volatile (
+        "lw     %[temp1],   0(%[src_i1])            \n\t" /* temp1 = *src */
+        "li     %[temp8],   0xf000                  \n\t" /* threshold */
+        "li     %[ret1],    0xefff                  \n\t" /* default result when the sample is >= threshold */
+        "slt    %[temp7],   %[temp1],   %[temp8]    \n\t" /* temp7 = (temp1 < temp8), signed compare */
+        "movn   %[ret1],    %[temp1],   %[temp7]    \n\t" /* keep the sample itself when it is below the threshold */
+        "seh    %[ret1],    %[ret1]                 \n\t" /* sign-extend the low halfword */
+        : [temp1] "=r" (temp1), [temp7] "=r" (temp7),
+          [temp8] "=r" (temp8), [ret1] "=r" (ret)
+        : [src_i1] "r" (src)
+        : "memory"
+    ); /* NOTE(review): 0xf000/0xefff and the missing lower bound look unusual for an int16 clip -- confirm against the C reference */
+    return (int16_t) ret;
+}
+
+static void fixed_to_int16_interleave_mips(int16_t *dst, const int **src,
+                                    long len, int channels)
+{ /* Interleaves `channels` planar int buffers src[c][0..len) into dst; 2 and 6 channels have unrolled asm paths. */
+    int i,j,c; /* i: sample index, c: channel, j: dst index in the generic path */
+    if(channels==2) {
+        for(i=0; i<len; i++) { /* stereo: one sample of each channel per iteration */
+            int temp, temp1, temp7, temp8; /* temp/temp1: loaded samples, temp7: compare flag, temp8: threshold */
+            __asm__ volatile (
+                "lw     %[temp],    0(%[src_i])             \n\t" /* src[0][i] */
+                "lw     %[temp1],   0(%[src_i1])            \n\t" /* src[1][i] */
+                "li     %[temp8],   0xf000                  \n\t" /* threshold */
+                "li     %[ret],     0xefff                  \n\t" /* default results when a sample is >= threshold */
+                "li     %[ret1],    0xefff                  \n\t"
+                "slt    %[temp7],   %[temp],    %[temp8]    \n\t" /* temp7 = (sample < threshold), signed */
+                "movn   %[ret],     %[temp],    %[temp7]    \n\t" /* keep the sample when it is below the threshold */
+                "slt    %[temp7],   %[temp1],   %[temp8]    \n\t"
+                "movn   %[ret1],    %[temp1],   %[temp7]    \n\t"
+                "seh    %[ret],     %[ret]                  \n\t" /* sign-extend the low halfwords */
+                "seh    %[ret1],    %[ret1]                 \n\t"
+
+                : [temp] "=&r" (temp),    [temp1] "=&r" (temp1),
+                  [temp7] "=&r" (temp7),  [temp8] "=&r" (temp8),
+                  [ret] "=&r" (dst[2*i]), [ret1] "=&r" (dst[2*i+1]) /* results land directly in the interleaved slots */
+                : [src_i] "r" (src[0]+i), [src_i1] "r" (src[1]+i)
+                : "memory"
+            );
+        }
+    }
+    else {
+        if(channels==6) {
+            for(i=0; i<len; i++) { /* 5.1: one sample of each of the six channels per iteration */
+                int temp, temp1, temp2, temp3, temp4, temp5, temp7, temp8; /* temp..temp5: samples, temp7: flag, temp8: threshold */
+                __asm__ volatile (
+                    "lw     %[temp],    0(%[src_i])             \n\t" /* src[0..5][i] */
+                    "lw     %[temp1],   0(%[src_i1])            \n\t"
+                    "lw     %[temp2],   0(%[src_i2])            \n\t"
+                    "lw     %[temp3],   0(%[src_i3])            \n\t"
+                    "lw     %[temp4],   0(%[src_i4])            \n\t"
+                    "lw     %[temp5],   0(%[src_i5])            \n\t"
+                    "li     %[temp8],   0xf000                  \n\t" /* threshold */
+                    "li     %[ret],     0xefff                  \n\t" /* default results when a sample is >= threshold */
+                    "li     %[ret1],    0xefff                  \n\t"
+                    "li     %[ret2],    0xefff                  \n\t"
+                    "li     %[ret3],    0xefff                  \n\t"
+                    "li     %[ret4],    0xefff                  \n\t"
+                    "li     %[ret5],    0xefff                  \n\t"
+                    "slt    %[temp7],   %[temp],    %[temp8]    \n\t" /* per channel: compare, then conditionally keep the sample */
+                    "movn   %[ret],     %[temp],    %[temp7]    \n\t"
+                    "slt    %[temp7],   %[temp1],   %[temp8]    \n\t"
+                    "movn   %[ret1],    %[temp1],   %[temp7]    \n\t"
+                    "slt    %[temp7],   %[temp2],   %[temp8]    \n\t"
+                    "movn   %[ret2],    %[temp2],   %[temp7]    \n\t"
+                    "slt    %[temp7],   %[temp3],   %[temp8]    \n\t"
+                    "movn   %[ret3],    %[temp3],   %[temp7]    \n\t"
+                    "slt    %[temp7],   %[temp4],   %[temp8]    \n\t"
+                    "movn   %[ret4],    %[temp4],   %[temp7]    \n\t"
+                    "slt    %[temp7],   %[temp5],   %[temp8]    \n\t"
+                    "movn   %[ret5],    %[temp5],   %[temp7]    \n\t"
+                    "seh    %[ret],     %[ret]                  \n\t" /* sign-extend the low halfwords */
+                    "seh    %[ret1],    %[ret1]                 \n\t"
+                    "seh    %[ret2],    %[ret2]                 \n\t"
+                    "seh    %[ret5],    %[ret5]                 \n\t"
+                    "seh    %[ret3],    %[ret3]                 \n\t"
+                    "seh    %[ret4],    %[ret4]                 \n\t"
+
+                    : [temp] "=&r" (temp),       [temp1] "=&r" (temp1),
+                      [temp2] "=&r" (temp2),     [temp3] "=&r" (temp3),
+                      [temp4] "=&r" (temp4),     [temp5] "=&r" (temp5),
+                      [temp7] "=&r" (temp7),     [temp8] "=&r" (temp8),
+                      [ret] "=&r" (dst[6*i]),    [ret1] "=&r" (dst[6*i+1]), /* results land directly in the interleaved slots */
+                      [ret2] "=&r" (dst[6*i+2]), [ret3] "=&r" (dst[6*i+3]),
+                      [ret4] "=&r" (dst[6*i+4]), [ret5] "=&r" (dst[6*i+5])
+                    : [src_i] "r" (src[0]+i),    [src_i1] "r" (src[1]+i),
+                      [src_i2] "r" (src[2]+i),   [src_i3] "r" (src[3]+i),
+                      [src_i4] "r" (src[4]+i),   [src_i5] "r" (src[5]+i)
+                    : "memory"
+                );
+            }
+        }
+        else { /* generic channel count: one sample at a time through the scalar helper */
+            for(c=0; c<channels; c++)
+                for(i=0, j=c; i<len; i++, j+=channels) /* j strides through dst by the channel count */
+                    dst[j] = fixed_to_int16_one_mips(src[c]+i);
+        }
+    }
+} /* NOTE(review): 0xf000/0xefff and the missing lower bound look unusual for an int16 clip -- confirm against the C reference */
+
+void ff_fmt_convert_init_mips_fixed(FmtConvertContext *c, AVCodecContext *avctx) { /* installs the MIPS fixed-point converters into c; avctx is not used here */
+    c->int32_to_fixed_fmul_scalar = int32_to_fixed_fmul_scalar_mips; /* int32 -> scaled int16 */
+    c->fixed_to_int16_interleave  = fixed_to_int16_interleave_mips; /* planar int -> interleaved int16 */
+}
-- 
1.7.3.4