[FFmpeg-devel] [PATCH 1/2] Optimization of AC3 floating point decoder for MIPS

Vitor Sessak vitor1001 at gmail.com
Thu Jun 21 14:26:48 CEST 2012


Hi

On 06/21/2012 12:04 PM, Nedeljko Babic wrote:
> FFT in MIPS implementation is working iteratively instead
>   of "recursively" calling functions for smaller FFT sizes.
> Some of DSP and format convert utils functions are also optimized.

Some comments

>   libavcodec/dsputil.c              |    1 +
>   libavcodec/dsputil.h              |    1 +
>   libavcodec/fft.c                  |    1 +
>   libavcodec/fft.h                  |   11 +
>   libavcodec/fmtconvert.c           |    1 +
>   libavcodec/fmtconvert.h           |    1 +
>   libavcodec/mips/Makefile          |    4 +
>   libavcodec/mips/dsputil_mips.c    |  168 +++++++++
>   libavcodec/mips/fft_mips.c        |  689 +++++++++++++++++++++++++++++++++++++
>   libavcodec/mips/fft_table.h       |  482 ++++++++++++++++++++++++++
>   libavcodec/mips/fmtconvert_mips.c |  336 ++++++++++++++++++
>   11 files changed, 1695 insertions(+), 0 deletions(-)
>   create mode 100644 libavcodec/mips/dsputil_mips.c
>   create mode 100644 libavcodec/mips/fft_mips.c
>   create mode 100644 libavcodec/mips/fft_table.h
>   create mode 100644 libavcodec/mips/fmtconvert_mips.c
>
> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
> index 442b900..b7d928f 100644
> --- a/libavcodec/dsputil.c
> +++ b/libavcodec/dsputil.c
> @@ -3161,6 +3161,7 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
>       if (HAVE_MMI)        ff_dsputil_init_mmi   (c, avctx);
>       if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
>       if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
> +    if (HAVE_MIPSFPU)    ff_dsputil_init_mips  (c, avctx);

> --- a/libavcodec/fft.h
> +++ b/libavcodec/fft.h
> @@ -38,6 +38,16 @@
>
>   typedef float FFTDouble;
>
> +#if ARCH_MIPS
> +enum _fftConsts{
> +    MIN_LOG2_NFFT = 5, //!<  Specifies miniumum allowed fft size
> +    MAX_LOG2_NFFT = 12 //!<  Specifies maxiumum allowed fft size
> +};
> +
> +#define MAX_FFT_SIZE (1<<  MAX_LOG2_NFFT)
> +#define MIN_FFT_SIZE (1<<  MAX_LOG2_NFFT)
> +
> +#endif

MIPS-specific code should not be in common code.

> diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c
> new file mode 100644

Nice, can you post the benchmarks results of "fft-
test -s"?

> index 0000000..286c67f
> --- /dev/null
> +++ b/libavcodec/mips/fft_mips.c
> @@ -0,0 +1,689 @@
> +/*
> + * Copyright (c) 2012
> + *      MIPS Technologies, Inc., California.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
> + *    contributors may be used to endorse or promote products derived from
> + *    this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * Author:  Stanisalv Ocovaj (socovaj at mips.com)
> + * Author:  Zoran Lukic (zoranl at mips.com)
> + *
> + * Optimized MDCT/IMDCT and FFT transforms
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +#include "config.h"
> +#include "libavcodec/fft.h"
> +#include "fft_table.h"
> +
> +/**
> + * FFT transform
> + */
> +
> +#if HAVE_INLINE_ASM
> +static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) {
> +
> +    int nbits, i, n, num_transforms, offset, step;
> +    int n4, n2, n34;
> +    FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
> +    FFTComplex *tmpz;
> +    float w_re, w_im;
> +    float *w_re_ptr;
> +    const int fft_size = (1<<  s->nbits);
> +    int s_n = s->nbits;
> +    int tem1, tem2;
> +    float pom,  pom1,  pom2,  pom3;
> +    float temp, temp1, temp3, temp4;
> +    FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
> +    FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
> +
> +    /**
> +    *num_transforms = (0x2aab>>  (16 - s->nbits)) | 1;
> +    */
> +    __asm__ __volatile__ (
> +        "li   %[tem1], 16                                      \n\t"
> +        "sub  %[s_n],  %[tem1], %[s_n]                         \n\t"
> +        "li   %[tem2], 10923                                   \n\t"
> +        "srav %[tem2], %[tem2], %[s_n]                         \n\t"
> +        "ori  %[num_t],%[tem2], 1                              \n\t"
> +        : [num_t]"=r"(num_transforms), [s_n]"+r"(s_n),
> +          [tem1]"=&r"(tem1), [tem2]"=&r"(tem2)
> +    );
> +
> +
> +    for (n=0; n<num_transforms; n++)
> +    {
> +        offset = fft_offsets_lut[n]<<  2;
> +        tmpz = z + offset;

What is the point of this LUT? If you want your input permutated in some 
particular order, you can just use revtab struct.

> +
> +        tmp1 = tmpz[0].re + tmpz[1].re;
> +        tmp5 = tmpz[2].re + tmpz[3].re;
> +        tmp2 = tmpz[0].im + tmpz[1].im;
> +        tmp6 = tmpz[2].im + tmpz[3].im;
> +        tmp3 = tmpz[0].re - tmpz[1].re;
> +        tmp8 = tmpz[2].im - tmpz[3].im;
> +        tmp4 = tmpz[0].im - tmpz[1].im;
> +        tmp7 = tmpz[2].re - tmpz[3].re;
> +
> +        tmpz[0].re = tmp1 + tmp5;
> +        tmpz[2].re = tmp1 - tmp5;
> +        tmpz[0].im = tmp2 + tmp6;
> +        tmpz[2].im = tmp2 - tmp6;
> +        tmpz[1].re = tmp3 + tmp8;
> +        tmpz[3].re = tmp3 - tmp8;
> +        tmpz[1].im = tmp4 - tmp7;
> +        tmpz[3].im = tmp4 + tmp7;
> +
> +}
> +
> +    if (fft_size<  8)
> +        return;
> +
> +    num_transforms = (num_transforms>>  1) | 1;
> +    for (n=0; n<num_transforms; n++)
> +    {
> +        offset = fft_offsets_lut[n]<<  3;
> +        tmpz = z + offset;
> +
> +        __asm__ __volatile__ (
> +            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
> +            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
> +            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
> +            "lwc1  %[pom1], 56(%[tmpz])                     \n\t"
> +            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
> +            "lwc1  %[pom2], 44(%[tmpz])                     \n\t"
> +            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
> +            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
> +            "add.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re + tmpz[5].re;
> +            "add.s %[tmp3], %[tmp3],    %[pom1]             \n\t"  // tmp3 = tmpz[6].re + tmpz[7].re;
> +            "add.s %[tmp2], %[tmp2],    %[pom2]             \n\t"  // tmp2 = tmpz[4].im + tmpz[5].im;
> +            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
> +            "add.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im + tmpz[7].im;
> +            "add.s %[tmp5], %[tmp1],    %[tmp3]             \n\t"  // tmp5 = tmp1 + tmp3;
> +            "sub.s %[tmp7], %[tmp1],    %[tmp3]             \n\t"  // tmp7 = tmp1 - tmp3;
> +            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
> +            "lwc1  %[pom1], 44(%[tmpz])                     \n\t"
> +            "add.s %[tmp6], %[tmp2],    %[tmp4]             \n\t"  // tmp6 = tmp2 + tmp4;
> +            "sub.s %[tmp8], %[tmp2],    %[tmp4]             \n\t"  // tmp8 = tmp2 - tmp4;
> +            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
> +            "lwc1  %[pom2], 56(%[tmpz])                     \n\t"
> +            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
> +            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
> +            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
> +            "sub.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re - tmpz[5].re;
> +            "lwc1  %[pom],  0(%[tmpz])                      \n\t"
> +            "sub.s %[tmp2], %[tmp2],    %[pom1]             \n\t"  // tmp2 = tmpz[4].im - tmpz[5].im;
> +            "sub.s %[tmp3], %[tmp3],    %[pom2]             \n\t"  // tmp3 = tmpz[6].re - tmpz[7].re;
> +            "lwc1  %[pom2], 4(%[tmpz])                      \n\t"
> +            "sub.s %[pom1], %[pom],     %[tmp5]             \n\t"
> +            "sub.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im - tmpz[7].im;
> +            "add.s %[pom3], %[pom],     %[tmp5]             \n\t"
> +            "sub.s %[pom],  %[pom2],    %[tmp6]             \n\t"
> +            "add.s %[pom2], %[pom2],    %[tmp6]             \n\t"
> +            "swc1  %[pom1], 32(%[tmpz])                     \n\t"  // tmpz[4].re = tmpz[0].re - tmp5;
> +            "swc1  %[pom3], 0(%[tmpz])                      \n\t"  // tmpz[0].re = tmpz[0].re + tmp5;
> +            "swc1  %[pom],  36(%[tmpz])                     \n\t"  // tmpz[4].im = tmpz[0].im - tmp6;
> +            "swc1  %[pom2], 4(%[tmpz])                      \n\t"  // tmpz[0].im = tmpz[0].im + tmp6;
> +            "lwc1  %[pom1], 16(%[tmpz])                     \n\t"
> +            "lwc1  %[pom3], 20(%[tmpz])                     \n\t"
> +            "li.s  %[pom],  0.7071067812                    \n\t"  // float pom = 0.7071067812f;
> +            "add.s %[temp1],%[tmp1],    %[tmp2]             \n\t"
> +            "sub.s %[temp], %[pom1],    %[tmp8]             \n\t"
> +            "add.s %[pom2], %[pom3],    %[tmp7]             \n\t"
> +            "sub.s %[temp3],%[tmp3],    %[tmp4]             \n\t"
> +            "sub.s %[temp4],%[tmp2],    %[tmp1]             \n\t"
> +            "swc1  %[temp], 48(%[tmpz])                     \n\t"  // tmpz[6].re = tmpz[2].re - tmp8;
> +            "swc1  %[pom2], 52(%[tmpz])                     \n\t"  // tmpz[6].im = tmpz[2].im + tmp7;
> +            "add.s %[pom1], %[pom1],    %[tmp8]             \n\t"
> +            "sub.s %[pom3], %[pom3],    %[tmp7]             \n\t"
> +            "add.s %[tmp3], %[tmp3],    %[tmp4]             \n\t"
> +            "mul.s %[tmp5], %[pom],     %[temp1]            \n\t"  // tmp5 = pom * (tmp1 + tmp2);
> +            "mul.s %[tmp7], %[pom],     %[temp3]            \n\t"  // tmp7 = pom * (tmp3 - tmp4);
> +            "mul.s %[tmp6], %[pom],     %[temp4]            \n\t"  // tmp6 = pom * (tmp2 - tmp1);
> +            "mul.s %[tmp8], %[pom],     %[tmp3]             \n\t"  // tmp8 = pom * (tmp3 + tmp4);
> +            "swc1  %[pom1], 16(%[tmpz])                     \n\t"  // tmpz[2].re = tmpz[2].re + tmp8;
> +            "swc1  %[pom3], 20(%[tmpz])                     \n\t"  // tmpz[2].im = tmpz[2].im - tmp7;
> +            "add.s %[tmp1], %[tmp5],    %[tmp7]             \n\t"  // tmp1 = tmp5 + tmp7;
> +            "sub.s %[tmp3], %[tmp5],    %[tmp7]             \n\t"  // tmp3 = tmp5 - tmp7;
> +            "add.s %[tmp2], %[tmp6],    %[tmp8]             \n\t"  // tmp2 = tmp6 + tmp8;
> +            "sub.s %[tmp4], %[tmp6],    %[tmp8]             \n\t"  // tmp4 = tmp6 - tmp8;
> +            "lwc1  %[temp], 8(%[tmpz])                      \n\t"
> +            "lwc1  %[temp1],12(%[tmpz])                     \n\t"
> +            "lwc1  %[pom],  24(%[tmpz])                     \n\t"
> +            "lwc1  %[pom2], 28(%[tmpz])                     \n\t"
> +            "sub.s %[temp4],%[temp],    %[tmp1]             \n\t"
> +            "sub.s %[temp3],%[temp1],   %[tmp2]             \n\t"
> +            "add.s %[temp], %[temp],    %[tmp1]             \n\t"
> +            "add.s %[temp1],%[temp1],   %[tmp2]             \n\t"
> +            "sub.s %[pom1], %[pom],     %[tmp4]             \n\t"
> +            "add.s %[pom3], %[pom2],    %[tmp3]             \n\t"
> +            "add.s %[pom],  %[pom],     %[tmp4]             \n\t"
> +            "sub.s %[pom2], %[pom2],    %[tmp3]             \n\t"
> +            "swc1  %[temp4],40(%[tmpz])                     \n\t"  // tmpz[5].re = tmpz[1].re - tmp1;
> +            "swc1  %[temp3],44(%[tmpz])                     \n\t"  // tmpz[5].im = tmpz[1].im - tmp2;
> +            "swc1  %[temp], 8(%[tmpz])                      \n\t"  // tmpz[1].re = tmpz[1].re + tmp1;
> +            "swc1  %[temp1],12(%[tmpz])                     \n\t"  // tmpz[1].im = tmpz[1].im + tmp2;
> +            "swc1  %[pom1], 56(%[tmpz])                     \n\t"  // tmpz[7].re = tmpz[3].re - tmp4;
> +            "swc1  %[pom3], 60(%[tmpz])                     \n\t"  // tmpz[7].im = tmpz[3].im + tmp3;
> +            "swc1  %[pom],  24(%[tmpz])                     \n\t"  // tmpz[3].re = tmpz[3].re + tmp4;
> +            "swc1  %[pom2], 28(%[tmpz])                     \n\t"  // tmpz[3].im = tmpz[3].im - tmp3;
> +            : [tmpz]"+r"(tmpz), [tmp1]"=f"(tmp1), [pom]"=f"(pom),   [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
> +              [tmp3]"=f"(tmp3), [tmp2]"=f"(tmp2), [tmp4]"=f"(tmp4), [tmp5]"=f"(tmp5),  [tmp7]"=f"(tmp7),
> +              [tmp6]"=f"(tmp6), [tmp8]"=f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
> +              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
> +            :
> +            : "memory"
> +        );
> +    }
> +
> +    step = 1<<  (MAX_LOG2_NFFT - 4);
> +    n4 = 4;
> +    for (nbits=4; nbits<=s->nbits; nbits++)
> +    {
> +        /*
> +        * num_transforms = (num_transforms>>  1) | 1;
> +        */
> +        __asm__ __volatile__ (
> +            "sra %[num_t], %[num_t], 1               \n\t"
> +            "ori %[num_t], %[num_t], 1               \n\t"
> +
> +            : [num_t] "+r" (num_transforms)
> +        );
> +        n2  = 2 * n4;
> +        n34 = 3 * n4;
> +
> +        for (n=0; n<num_transforms; n++)
> +        {
> +            offset = fft_offsets_lut[n]<<  nbits;
> +            tmpz = z + offset;
> +
> +            tmpz_n2  = tmpz +  n2;
> +            tmpz_n4  = tmpz +  n4;
> +            tmpz_n34 = tmpz +  n34;
> +
> +            __asm__ __volatile__ (
> +                "lwc1  %[pom1], 0(%[tmpz_n2])            \n\t"
> +                "lwc1  %[pom],  0(%[tmpz_n34])           \n\t"
> +                "lwc1  %[pom2], 4(%[tmpz_n2])            \n\t"
> +                "lwc1  %[pom3], 4(%[tmpz_n34])           \n\t"
> +                "lwc1  %[temp1],0(%[tmpz])               \n\t"
> +                "lwc1  %[temp3],4(%[tmpz])               \n\t"
> +                "add.s %[tmp5], %[pom1],      %[pom]     \n\t"   //  tmp5 = tmpz[ n2].re + tmpz[n34].re;
> +                "sub.s %[tmp1], %[pom1],      %[pom]     \n\t"   //  tmp1 = tmpz[ n2].re - tmpz[n34].re;
> +                "add.s %[tmp6], %[pom2],      %[pom3]    \n\t"   //  tmp6 = tmpz[ n2].im + tmpz[n34].im;
> +                "sub.s %[tmp2], %[pom2],      %[pom3]    \n\t"   //  tmp2 = tmpz[ n2].im - tmpz[n34].im;
> +                "sub.s %[temp], %[temp1],     %[tmp5]    \n\t"
> +                "add.s %[temp1],%[temp1],     %[tmp5]    \n\t"
> +                "sub.s %[temp4],%[temp3],     %[tmp6]    \n\t"
> +                "add.s %[temp3],%[temp3],     %[tmp6]    \n\t"
> +                "swc1  %[temp], 0(%[tmpz_n2])            \n\t"   //  tmpz[ n2].re = tmpz[ 0].re - tmp5;
> +                "swc1  %[temp1],0(%[tmpz])               \n\t"   //  tmpz[  0].re = tmpz[ 0].re + tmp5;
> +                "lwc1  %[pom1], 0(%[tmpz_n4])            \n\t"
> +                "swc1  %[temp4],4(%[tmpz_n2])            \n\t"   //  tmpz[ n2].im = tmpz[ 0].im - tmp6;
> +                "lwc1  %[temp], 4(%[tmpz_n4])            \n\t"
> +                "swc1  %[temp3],4(%[tmpz])               \n\t"   //  tmpz[  0].im = tmpz[ 0].im + tmp6;
> +                "sub.s %[pom],  %[pom1],      %[tmp2]    \n\t"
> +                "add.s %[pom1], %[pom1],      %[tmp2]    \n\t"
> +                "add.s %[temp1],%[temp],      %[tmp1]    \n\t"
> +                "sub.s %[temp], %[temp],      %[tmp1]    \n\t"
> +                "swc1  %[pom],  0(%[tmpz_n34])           \n\t"   //  tmpz[n34].re = tmpz[n4].re - tmp2;
> +                "swc1  %[pom1], 0(%[tmpz_n4])            \n\t"   //  tmpz[ n4].re = tmpz[n4].re + tmp2;
> +                "swc1  %[temp1],4(%[tmpz_n34])           \n\t"   //  tmpz[n34].im = tmpz[n4].im + tmp1;
> +                "swc1  %[temp], 4(%[tmpz_n4])            \n\t"   //  tmpz[ n4].im = tmpz[n4].im - tmp1;
> +                : [tmpz]"+r"(tmpz), [tmpz_n2]"+r"(tmpz_n2), [tmpz_n34]"+r"(tmpz_n34), [tmp5]"=f"(tmp5),
> +                  [tmp1]"=f"(tmp1), [pom]"=&f"(pom),        [pom1]"=&f"(pom1),        [pom2]"=&f"(pom2),
> +                  [tmp2]"=f"(tmp2), [tmp6]"=f"(tmp6),       [tmpz_n4]"+r"(tmpz_n4),   [pom3]"=&f"(pom3),
> +                  [temp]"=f"(temp), [temp1]"=f"(temp1),     [temp3]"=f"(temp3),       [temp4]"=f"(temp4)
> +                :
> +                : "memory"
> +            );
> +
> +            w_re_ptr = w_tab + step;
> +
> +            for (i=1; i<n4; i++)
> +            {
> +                w_re = w_re_ptr[0];
> +                w_im = w_re_ptr[MAX_FFT_SIZE/4];

Can you explain why you cannot use the same cos/sin tab that the C 
version uses?

-Vitor


More information about the ffmpeg-devel mailing list