[FFmpeg-devel] [PATCH 1/2] Optimization of AC3 floating point decoder for MIPS
Vitor Sessak
vitor1001 at gmail.com
Thu Jun 21 14:26:48 CEST 2012
Hi
On 06/21/2012 12:04 PM, Nedeljko Babic wrote:
> FFT in MIPS implementation is working iteratively instead
> of "recursively" calling functions for smaller FFT sizes.
> Some of DSP and format convert utils functions are also optimized.
Some comments
> libavcodec/dsputil.c | 1 +
> libavcodec/dsputil.h | 1 +
> libavcodec/fft.c | 1 +
> libavcodec/fft.h | 11 +
> libavcodec/fmtconvert.c | 1 +
> libavcodec/fmtconvert.h | 1 +
> libavcodec/mips/Makefile | 4 +
> libavcodec/mips/dsputil_mips.c | 168 +++++++++
> libavcodec/mips/fft_mips.c | 689 +++++++++++++++++++++++++++++++++++++
> libavcodec/mips/fft_table.h | 482 ++++++++++++++++++++++++++
> libavcodec/mips/fmtconvert_mips.c | 336 ++++++++++++++++++
> 11 files changed, 1695 insertions(+), 0 deletions(-)
> create mode 100644 libavcodec/mips/dsputil_mips.c
> create mode 100644 libavcodec/mips/fft_mips.c
> create mode 100644 libavcodec/mips/fft_table.h
> create mode 100644 libavcodec/mips/fmtconvert_mips.c
>
> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
> index 442b900..b7d928f 100644
> --- a/libavcodec/dsputil.c
> +++ b/libavcodec/dsputil.c
> @@ -3161,6 +3161,7 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
> if (HAVE_MMI) ff_dsputil_init_mmi (c, avctx);
> if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
> if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
> + if (HAVE_MIPSFPU) ff_dsputil_init_mips (c, avctx);
> --- a/libavcodec/fft.h
> +++ b/libavcodec/fft.h
> @@ -38,6 +38,16 @@
>
> typedef float FFTDouble;
>
> +#if ARCH_MIPS
> +enum _fftConsts{
> + MIN_LOG2_NFFT = 5, //!< Specifies miniumum allowed fft size
> + MAX_LOG2_NFFT = 12 //!< Specifies maxiumum allowed fft size
> +};
> +
> +#define MAX_FFT_SIZE (1<< MAX_LOG2_NFFT)
> +#define MIN_FFT_SIZE (1<< MAX_LOG2_NFFT)
> +
> +#endif
MIPS-specific code should not be in common code.
> diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c
> new file mode 100644
Nice, can you post the benchmarks results of "fft-
test -s"?
> index 0000000..286c67f
> --- /dev/null
> +++ b/libavcodec/mips/fft_mips.c
> @@ -0,0 +1,689 @@
> +/*
> + * Copyright (c) 2012
> + * MIPS Technologies, Inc., California.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
> + * contributors may be used to endorse or promote products derived from
> + * this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * Author: Stanisalv Ocovaj (socovaj at mips.com)
> + * Author: Zoran Lukic (zoranl at mips.com)
> + *
> + * Optimized MDCT/IMDCT and FFT transforms
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +#include "config.h"
> +#include "libavcodec/fft.h"
> +#include "fft_table.h"
> +
> +/**
> + * FFT transform
> + */
> +
> +#if HAVE_INLINE_ASM
> +static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) {
> +
> + int nbits, i, n, num_transforms, offset, step;
> + int n4, n2, n34;
> + FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
> + FFTComplex *tmpz;
> + float w_re, w_im;
> + float *w_re_ptr;
> + const int fft_size = (1<< s->nbits);
> + int s_n = s->nbits;
> + int tem1, tem2;
> + float pom, pom1, pom2, pom3;
> + float temp, temp1, temp3, temp4;
> + FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
> + FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
> +
> + /**
> + *num_transforms = (0x2aab>> (16 - s->nbits)) | 1;
> + */
> + __asm__ __volatile__ (
> + "li %[tem1], 16 \n\t"
> + "sub %[s_n], %[tem1], %[s_n] \n\t"
> + "li %[tem2], 10923 \n\t"
> + "srav %[tem2], %[tem2], %[s_n] \n\t"
> + "ori %[num_t],%[tem2], 1 \n\t"
> + : [num_t]"=r"(num_transforms), [s_n]"+r"(s_n),
> + [tem1]"=&r"(tem1), [tem2]"=&r"(tem2)
> + );
> +
> +
> + for (n=0; n<num_transforms; n++)
> + {
> + offset = fft_offsets_lut[n]<< 2;
> + tmpz = z + offset;
What is the point of this LUT? If you want your input permutated in some
particular order, you can just use revtab struct.
> +
> + tmp1 = tmpz[0].re + tmpz[1].re;
> + tmp5 = tmpz[2].re + tmpz[3].re;
> + tmp2 = tmpz[0].im + tmpz[1].im;
> + tmp6 = tmpz[2].im + tmpz[3].im;
> + tmp3 = tmpz[0].re - tmpz[1].re;
> + tmp8 = tmpz[2].im - tmpz[3].im;
> + tmp4 = tmpz[0].im - tmpz[1].im;
> + tmp7 = tmpz[2].re - tmpz[3].re;
> +
> + tmpz[0].re = tmp1 + tmp5;
> + tmpz[2].re = tmp1 - tmp5;
> + tmpz[0].im = tmp2 + tmp6;
> + tmpz[2].im = tmp2 - tmp6;
> + tmpz[1].re = tmp3 + tmp8;
> + tmpz[3].re = tmp3 - tmp8;
> + tmpz[1].im = tmp4 - tmp7;
> + tmpz[3].im = tmp4 + tmp7;
> +
> +}
> +
> + if (fft_size< 8)
> + return;
> +
> + num_transforms = (num_transforms>> 1) | 1;
> + for (n=0; n<num_transforms; n++)
> + {
> + offset = fft_offsets_lut[n]<< 3;
> + tmpz = z + offset;
> +
> + __asm__ __volatile__ (
> + "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
> + "lwc1 %[pom], 40(%[tmpz]) \n\t"
> + "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
> + "lwc1 %[pom1], 56(%[tmpz]) \n\t"
> + "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
> + "lwc1 %[pom2], 44(%[tmpz]) \n\t"
> + "lwc1 %[pom3], 60(%[tmpz]) \n\t"
> + "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
> + "add.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re + tmpz[5].re;
> + "add.s %[tmp3], %[tmp3], %[pom1] \n\t" // tmp3 = tmpz[6].re + tmpz[7].re;
> + "add.s %[tmp2], %[tmp2], %[pom2] \n\t" // tmp2 = tmpz[4].im + tmpz[5].im;
> + "lwc1 %[pom], 40(%[tmpz]) \n\t"
> + "add.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im + tmpz[7].im;
> + "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
> + "sub.s %[tmp7], %[tmp1], %[tmp3] \n\t" // tmp7 = tmp1 - tmp3;
> + "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
> + "lwc1 %[pom1], 44(%[tmpz]) \n\t"
> + "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
> + "sub.s %[tmp8], %[tmp2], %[tmp4] \n\t" // tmp8 = tmp2 - tmp4;
> + "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
> + "lwc1 %[pom2], 56(%[tmpz]) \n\t"
> + "lwc1 %[pom3], 60(%[tmpz]) \n\t"
> + "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
> + "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
> + "sub.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re - tmpz[5].re;
> + "lwc1 %[pom], 0(%[tmpz]) \n\t"
> + "sub.s %[tmp2], %[tmp2], %[pom1] \n\t" // tmp2 = tmpz[4].im - tmpz[5].im;
> + "sub.s %[tmp3], %[tmp3], %[pom2] \n\t" // tmp3 = tmpz[6].re - tmpz[7].re;
> + "lwc1 %[pom2], 4(%[tmpz]) \n\t"
> + "sub.s %[pom1], %[pom], %[tmp5] \n\t"
> + "sub.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im - tmpz[7].im;
> + "add.s %[pom3], %[pom], %[tmp5] \n\t"
> + "sub.s %[pom], %[pom2], %[tmp6] \n\t"
> + "add.s %[pom2], %[pom2], %[tmp6] \n\t"
> + "swc1 %[pom1], 32(%[tmpz]) \n\t" // tmpz[4].re = tmpz[0].re - tmp5;
> + "swc1 %[pom3], 0(%[tmpz]) \n\t" // tmpz[0].re = tmpz[0].re + tmp5;
> + "swc1 %[pom], 36(%[tmpz]) \n\t" // tmpz[4].im = tmpz[0].im - tmp6;
> + "swc1 %[pom2], 4(%[tmpz]) \n\t" // tmpz[0].im = tmpz[0].im + tmp6;
> + "lwc1 %[pom1], 16(%[tmpz]) \n\t"
> + "lwc1 %[pom3], 20(%[tmpz]) \n\t"
> + "li.s %[pom], 0.7071067812 \n\t" // float pom = 0.7071067812f;
> + "add.s %[temp1],%[tmp1], %[tmp2] \n\t"
> + "sub.s %[temp], %[pom1], %[tmp8] \n\t"
> + "add.s %[pom2], %[pom3], %[tmp7] \n\t"
> + "sub.s %[temp3],%[tmp3], %[tmp4] \n\t"
> + "sub.s %[temp4],%[tmp2], %[tmp1] \n\t"
> + "swc1 %[temp], 48(%[tmpz]) \n\t" // tmpz[6].re = tmpz[2].re - tmp8;
> + "swc1 %[pom2], 52(%[tmpz]) \n\t" // tmpz[6].im = tmpz[2].im + tmp7;
> + "add.s %[pom1], %[pom1], %[tmp8] \n\t"
> + "sub.s %[pom3], %[pom3], %[tmp7] \n\t"
> + "add.s %[tmp3], %[tmp3], %[tmp4] \n\t"
> + "mul.s %[tmp5], %[pom], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2);
> + "mul.s %[tmp7], %[pom], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4);
> + "mul.s %[tmp6], %[pom], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1);
> + "mul.s %[tmp8], %[pom], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4);
> + "swc1 %[pom1], 16(%[tmpz]) \n\t" // tmpz[2].re = tmpz[2].re + tmp8;
> + "swc1 %[pom3], 20(%[tmpz]) \n\t" // tmpz[2].im = tmpz[2].im - tmp7;
> + "add.s %[tmp1], %[tmp5], %[tmp7] \n\t" // tmp1 = tmp5 + tmp7;
> + "sub.s %[tmp3], %[tmp5], %[tmp7] \n\t" // tmp3 = tmp5 - tmp7;
> + "add.s %[tmp2], %[tmp6], %[tmp8] \n\t" // tmp2 = tmp6 + tmp8;
> + "sub.s %[tmp4], %[tmp6], %[tmp8] \n\t" // tmp4 = tmp6 - tmp8;
> + "lwc1 %[temp], 8(%[tmpz]) \n\t"
> + "lwc1 %[temp1],12(%[tmpz]) \n\t"
> + "lwc1 %[pom], 24(%[tmpz]) \n\t"
> + "lwc1 %[pom2], 28(%[tmpz]) \n\t"
> + "sub.s %[temp4],%[temp], %[tmp1] \n\t"
> + "sub.s %[temp3],%[temp1], %[tmp2] \n\t"
> + "add.s %[temp], %[temp], %[tmp1] \n\t"
> + "add.s %[temp1],%[temp1], %[tmp2] \n\t"
> + "sub.s %[pom1], %[pom], %[tmp4] \n\t"
> + "add.s %[pom3], %[pom2], %[tmp3] \n\t"
> + "add.s %[pom], %[pom], %[tmp4] \n\t"
> + "sub.s %[pom2], %[pom2], %[tmp3] \n\t"
> + "swc1 %[temp4],40(%[tmpz]) \n\t" // tmpz[5].re = tmpz[1].re - tmp1;
> + "swc1 %[temp3],44(%[tmpz]) \n\t" // tmpz[5].im = tmpz[1].im - tmp2;
> + "swc1 %[temp], 8(%[tmpz]) \n\t" // tmpz[1].re = tmpz[1].re + tmp1;
> + "swc1 %[temp1],12(%[tmpz]) \n\t" // tmpz[1].im = tmpz[1].im + tmp2;
> + "swc1 %[pom1], 56(%[tmpz]) \n\t" // tmpz[7].re = tmpz[3].re - tmp4;
> + "swc1 %[pom3], 60(%[tmpz]) \n\t" // tmpz[7].im = tmpz[3].im + tmp3;
> + "swc1 %[pom], 24(%[tmpz]) \n\t" // tmpz[3].re = tmpz[3].re + tmp4;
> + "swc1 %[pom2], 28(%[tmpz]) \n\t" // tmpz[3].im = tmpz[3].im - tmp3;
> + : [tmpz]"+r"(tmpz), [tmp1]"=f"(tmp1), [pom]"=f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
> + [tmp3]"=f"(tmp3), [tmp2]"=f"(tmp2), [tmp4]"=f"(tmp4), [tmp5]"=f"(tmp5), [tmp7]"=f"(tmp7),
> + [tmp6]"=f"(tmp6), [tmp8]"=f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
> + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
> + :
> + : "memory"
> + );
> + }
> +
> + step = 1<< (MAX_LOG2_NFFT - 4);
> + n4 = 4;
> + for (nbits=4; nbits<=s->nbits; nbits++)
> + {
> + /*
> + * num_transforms = (num_transforms>> 1) | 1;
> + */
> + __asm__ __volatile__ (
> + "sra %[num_t], %[num_t], 1 \n\t"
> + "ori %[num_t], %[num_t], 1 \n\t"
> +
> + : [num_t] "+r" (num_transforms)
> + );
> + n2 = 2 * n4;
> + n34 = 3 * n4;
> +
> + for (n=0; n<num_transforms; n++)
> + {
> + offset = fft_offsets_lut[n]<< nbits;
> + tmpz = z + offset;
> +
> + tmpz_n2 = tmpz + n2;
> + tmpz_n4 = tmpz + n4;
> + tmpz_n34 = tmpz + n34;
> +
> + __asm__ __volatile__ (
> + "lwc1 %[pom1], 0(%[tmpz_n2]) \n\t"
> + "lwc1 %[pom], 0(%[tmpz_n34]) \n\t"
> + "lwc1 %[pom2], 4(%[tmpz_n2]) \n\t"
> + "lwc1 %[pom3], 4(%[tmpz_n34]) \n\t"
> + "lwc1 %[temp1],0(%[tmpz]) \n\t"
> + "lwc1 %[temp3],4(%[tmpz]) \n\t"
> + "add.s %[tmp5], %[pom1], %[pom] \n\t" // tmp5 = tmpz[ n2].re + tmpz[n34].re;
> + "sub.s %[tmp1], %[pom1], %[pom] \n\t" // tmp1 = tmpz[ n2].re - tmpz[n34].re;
> + "add.s %[tmp6], %[pom2], %[pom3] \n\t" // tmp6 = tmpz[ n2].im + tmpz[n34].im;
> + "sub.s %[tmp2], %[pom2], %[pom3] \n\t" // tmp2 = tmpz[ n2].im - tmpz[n34].im;
> + "sub.s %[temp], %[temp1], %[tmp5] \n\t"
> + "add.s %[temp1],%[temp1], %[tmp5] \n\t"
> + "sub.s %[temp4],%[temp3], %[tmp6] \n\t"
> + "add.s %[temp3],%[temp3], %[tmp6] \n\t"
> + "swc1 %[temp], 0(%[tmpz_n2]) \n\t" // tmpz[ n2].re = tmpz[ 0].re - tmp5;
> + "swc1 %[temp1],0(%[tmpz]) \n\t" // tmpz[ 0].re = tmpz[ 0].re + tmp5;
> + "lwc1 %[pom1], 0(%[tmpz_n4]) \n\t"
> + "swc1 %[temp4],4(%[tmpz_n2]) \n\t" // tmpz[ n2].im = tmpz[ 0].im - tmp6;
> + "lwc1 %[temp], 4(%[tmpz_n4]) \n\t"
> + "swc1 %[temp3],4(%[tmpz]) \n\t" // tmpz[ 0].im = tmpz[ 0].im + tmp6;
> + "sub.s %[pom], %[pom1], %[tmp2] \n\t"
> + "add.s %[pom1], %[pom1], %[tmp2] \n\t"
> + "add.s %[temp1],%[temp], %[tmp1] \n\t"
> + "sub.s %[temp], %[temp], %[tmp1] \n\t"
> + "swc1 %[pom], 0(%[tmpz_n34]) \n\t" // tmpz[n34].re = tmpz[n4].re - tmp2;
> + "swc1 %[pom1], 0(%[tmpz_n4]) \n\t" // tmpz[ n4].re = tmpz[n4].re + tmp2;
> + "swc1 %[temp1],4(%[tmpz_n34]) \n\t" // tmpz[n34].im = tmpz[n4].im + tmp1;
> + "swc1 %[temp], 4(%[tmpz_n4]) \n\t" // tmpz[ n4].im = tmpz[n4].im - tmp1;
> + : [tmpz]"+r"(tmpz), [tmpz_n2]"+r"(tmpz_n2), [tmpz_n34]"+r"(tmpz_n34), [tmp5]"=f"(tmp5),
> + [tmp1]"=f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
> + [tmp2]"=f"(tmp2), [tmp6]"=f"(tmp6), [tmpz_n4]"+r"(tmpz_n4), [pom3]"=&f"(pom3),
> + [temp]"=f"(temp), [temp1]"=f"(temp1), [temp3]"=f"(temp3), [temp4]"=f"(temp4)
> + :
> + : "memory"
> + );
> +
> + w_re_ptr = w_tab + step;
> +
> + for (i=1; i<n4; i++)
> + {
> + w_re = w_re_ptr[0];
> + w_im = w_re_ptr[MAX_FFT_SIZE/4];
Can you explain why you cannot use the same cos/sin tab that the C
version uses?
-Vitor
More information about the ffmpeg-devel
mailing list