[Ffmpeg-devel] [PATCH] Little optimization to fft_sse.c

Zuxy Meng zuxy.meng
Mon Mar 6 09:30:16 CET 2006


Hi,

2006/3/6, Diego Biurrun <diego at biurrun.de>:
> On Mon, Mar 06, 2006 at 03:15:55AM +0800, Zuxy Meng wrote:
> >
> > I have also written a 3DNow! version of fft. Is that still needed?
>
> Sure, those old processors have not all been thrown away yet...
>

Attached are FFT routines that can be used just the same as
ff_fft_calc_sse (the external interfaces are all the same).

For my Athlon XP 2800+, the 3DNow! version is about 5% slower than the
SSE version but 50% faster than the FPU version. The speedup might be
more prominent in a K6-2/III for its lack of a fully pipelined fpu.
However, the fastest is the Extended 3DNow! version, which is yet 33%
faster than the SSE version. So for an "original" K7 without SSE, the
speedup is beyond 100%.

Two reasons why I send complete source files instead of patches:

1. I'm not very familiar with ffmpeg's policy on dealing with ISA-specific
optimizations. Are macros like HAVE_3DNOWEX, HAVE_SSE,
RUNTIME_CPUDETECTION etc. valid here? Changes must be made to
libavcodec/fft.c, but I don't know what the proper way is.

2. These two files are written in intrinsics like the original
fft_sse.c. However, they require the mm3dnow.h instead of xmmintrin.h.
The former seems to be absent in gcc3. Although gcc4's mm3dnow.h can
be used with any gcc version that supports 3DNow! builtins, subsequent
changes must be made to the configure script, or we could include the
header in the ffmpeg package itself. Again, I don't know what the
proper way is.

And more tests are of course necessary.

--
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
/*
 * FFT/MDCT transform with 3DNow! optimizations
 * Copyright (c) 2002 Fabrice Bellard.
 * Copyright (c) 2006 Zuxy MENG Jie.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "../dsputil.h"
#include <math.h>

#ifdef HAVE_BUILTIN_VECTOR

#include <mm3dnow.h>

/* 64-bit sign masks applied with PXOR to negate one float of a __m64
 * pair: p1m1 flips the sign of the high (second) float, m1p1 the low
 * (first) one.  1U << 31 avoids the undefined behavior of left-shifting
 * a signed 1 into the sign bit (C99 6.5.7). */
static const int p1m1[2] __attribute__((aligned(8))) =
    { 0, (int)(1U << 31) };

static const int m1p1[2] __attribute__((aligned(8))) =
    { (int)(1U << 31), 0 };

/**
 * In-place FFT (inverse FFT if s->inverse is set) using 3DNow!
 * intrinsics.  Drop-in replacement for ff_fft_calc_sse: same interface.
 *
 * @param s  context providing nbits (log2 of transform length), the
 *           inverse flag and the twiddle-factor table exptab1
 * @param z  array of 2^s->nbits complex values, transformed in place;
 *           assumed 8-byte aligned -- TODO confirm against callers
 */
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
{
    int ln = s->nbits;
    int j, np;
    int nblocks, nloops;
    register FFTComplex *p, *q;
    FFTComplex *cptr, *cptr1;
    int k;

    np = 1 << ln;
    /* FEMMS is not a must here but recommended by AMD */
    _m_femms();

    /* passes 0 and 1: radix-4 butterflies over the whole buffer */
    {
        __m64 *r, a0, a1, b0, b1, c;

        r = (__m64 *)&z[0];
        /* sign mask for the multiply-by-(-i)/(+i) step below */
        if (s->inverse)
            c = *(__m64 *)m1p1;
        else
            c = *(__m64 *)p1m1;

        j = np >> 2;
        do {
            /* pass 0 butterfly on the first pair */
            a0 = _m_pfadd(r[0], r[1]);
            a1 = _m_pfsub(r[0], r[1]);

            /* pass 0 butterfly on the second pair */
            b0 = _m_pfadd(r[2], r[3]);
            b1 = _m_pfsub(r[2], r[3]);

            /* multiply third term by -i: swap the two floats, then flip
             * one sign.  punpckhdq gives {hi,hi}; punpckldq of that with
             * the original gives {hi,lo} -- two ops instead of the three
             * (with a temporary) in the original code. */
            b1 = _m_punpckldq(_m_punpckhdq(b1, b1), b1);
            b1 = _m_pxor(b1, c);

            /* pass 1 butterfly */
            r[0] = _m_pfadd(a0, b0);
            r[1] = _m_pfadd(a1, b1);
            r[2] = _m_pfsub(a0, b0);
            r[3] = _m_pfsub(a1, b1);
            r += 4;
        } while (--j != 0);
    }

    /* pass 2 .. ln-1 */
    nblocks = np >> 3;
    nloops = 1 << 2;

    cptr1 = s->exptab1;
    do {
        p = z;
        q = z + nloops;
        j = nblocks;
        do {
            cptr = cptr1;
            k = nloops >> 1;
            do {
                __m64 a0, a1, b0, b1, c0, c1, t10, t11, t20, t21;

                a0 = *(__m64 *)&p[0];
                a1 = *(__m64 *)&p[1];
                b0 = *(__m64 *)&q[0];
                b1 = *(__m64 *)&q[1];

                /* complex multiply of q[0..1] by the twiddle factors */
                c0 = *(__m64 *)&cptr[0];
                c1 = *(__m64 *)&cptr[1];
                /* cre*re cim*re */
                t10 = _m_pfmul(c0, _m_punpckldq(b0, b0));
                t11 = _m_pfmul(c1, _m_punpckldq(b1, b1));
                c0 = *(__m64 *)&cptr[2];
                c1 = *(__m64 *)&cptr[3];
                /* -cim*im cre*im */
                t20 = _m_pfmul(c0, _m_punpckhdq(b0, b0));
                t21 = _m_pfmul(c1, _m_punpckhdq(b1, b1));
                b0 = _m_pfadd(t10, t20);
                b1 = _m_pfadd(t11, t21);

                /* butterfly */
                *(__m64 *)&p[0] = _m_pfadd(a0, b0);
                *(__m64 *)&p[1] = _m_pfadd(a1, b1);
                *(__m64 *)&q[0] = _m_pfsub(a0, b0);
                *(__m64 *)&q[1] = _m_pfsub(a1, b1);

                p += 2;
                q += 2;
                cptr += 4;
            } while (--k);

            p += nloops;
            q += nloops;
        } while (--j);
        cptr1 += nloops * 2;
        nblocks >>= 1;
        nloops <<= 1;
    } while (nblocks != 0);
    /* leave the MMX state clean for subsequent FPU code */
    _m_femms();
}

#endif











-------------- next part --------------
/*
 * FFT/MDCT transform with Extended 3DNow! optimizations
 * Copyright (c) 2002 Fabrice Bellard.
 * Copyright (c) 2006 Zuxy MENG Jie.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "../dsputil.h"
#include <math.h>

#ifdef HAVE_BUILTIN_VECTOR

#include <mm3dnow.h>

/* 64-bit sign masks applied with PXOR to negate one float of a __m64
 * pair: p1m1 flips the sign of the high (second) float, m1p1 the low
 * (first) one.  1U << 31 avoids the undefined behavior of left-shifting
 * a signed 1 into the sign bit (C99 6.5.7). */
static const int p1m1[2] __attribute__((aligned(8))) =
    { 0, (int)(1U << 31) };

static const int m1p1[2] __attribute__((aligned(8))) =
    { (int)(1U << 31), 0 };

/**
 * In-place FFT (inverse FFT if s->inverse is set) using Extended
 * 3DNow! intrinsics (PSWAPD/PFPNACC, available on Athlon / "3DNow!
 * Professional" cores).  Drop-in replacement for ff_fft_calc_sse:
 * same interface.
 *
 * @param s  context providing nbits (log2 of transform length), the
 *           inverse flag and the twiddle-factor table exptab1
 * @param z  array of 2^s->nbits complex values, transformed in place;
 *           assumed 8-byte aligned -- TODO confirm against callers
 */
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
{
    int ln = s->nbits;
    int j, np;
    int nblocks, nloops;
    register FFTComplex *p, *q;
    FFTComplex *cptr, *cptr1;
    int k;

    np = 1 << ln;
    /* FEMMS is not a must here but recommended by AMD */
    _m_femms();

    /* passes 0 and 1: radix-4 butterflies over the whole buffer */
    {
        __m64 *r, a0, a1, b0, b1, c;

        r = (__m64 *)&z[0];
        /* sign mask for the multiply-by-(-i)/(+i) step below */
        if (s->inverse)
            c = *(__m64 *)m1p1;
        else
            c = *(__m64 *)p1m1;

        j = np >> 2;
        do {
            /* pass 0 butterfly on the first pair */
            a0 = _m_pfadd(r[0], r[1]);
            a1 = _m_pfsub(r[0], r[1]);

            /* pass 0 butterfly on the second pair */
            b0 = _m_pfadd(r[2], r[3]);
            b1 = _m_pfsub(r[2], r[3]);

            /* multiply third term by -i: PSWAPD swaps the two floats,
             * PXOR with the mask flips one sign */
            b1 = _m_pswapd(b1);
            b1 = _m_pxor(b1, c);

            /* pass 1 butterfly */
            r[0] = _m_pfadd(a0, b0);
            r[1] = _m_pfadd(a1, b1);
            r[2] = _m_pfsub(a0, b0);
            r[3] = _m_pfsub(a1, b1);
            r += 4;
        } while (--j != 0);
    }

    /* pass 2 .. ln-1 */
    nblocks = np >> 3;
    nloops = 1 << 2;

    cptr1 = s->exptab1;
    do {
        p = z;
        q = z + nloops;
        j = nblocks;
        do {
            cptr = cptr1;
            k = nloops >> 1;
            do {
                __m64 a0, a1, b0, b1, c0, c1, t10, t11, t20, t21;

                a0 = *(__m64 *)&p[0];
                a1 = *(__m64 *)&p[1];
                b0 = *(__m64 *)&q[0];
                b1 = *(__m64 *)&q[1];

                /* complex multiply of q[0..1] by the twiddle factors */
                c0 = *(__m64 *)&cptr[0];
                c1 = *(__m64 *)&cptr[1];
                /* cre*re cim*im */
                t10 = _m_pfmul(c0, b0);
                t11 = _m_pfmul(c1, b1);
                /* no need to access cptr[2] & cptr[3] */
                c0 = _m_pswapd(c0);
                c1 = _m_pswapd(c1);
                /* cim*re cre*im */
                t20 = _m_pfmul(c0, b0);
                t21 = _m_pfmul(c1, b1);

                /* cre*re-cim*im cim*re+cre*im */
                b0 = _m_pfpnacc(t10, t20);
                b1 = _m_pfpnacc(t11, t21);

                /* butterfly */
                *(__m64 *)&p[0] = _m_pfadd(a0, b0);
                *(__m64 *)&p[1] = _m_pfadd(a1, b1);
                *(__m64 *)&q[0] = _m_pfsub(a0, b0);
                *(__m64 *)&q[1] = _m_pfsub(a1, b1);

                p += 2;
                q += 2;
                cptr += 4;
            } while (--k);

            p += nloops;
            q += nloops;
        } while (--j);
        cptr1 += nloops * 2;
        nblocks >>= 1;
        nloops <<= 1;
    } while (nblocks != 0);
    /* leave the MMX state clean for subsequent FPU code */
    _m_femms();
}

#endif











-------------- next part --------------
/* Copyright (C) 2004 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the mm3dnow.h (of supposedly AMD origin) included with
   MSVC 7.1.  */

#ifndef _MM3DNOW_H_INCLUDED
#define _MM3DNOW_H_INCLUDED

#ifdef __3dNOW__

#include <mmintrin.h>

/* Internal data types for implementing the intrinsics.  */
/* Two packed single-precision floats in one 64-bit MMX register. */
typedef float __v2sf __attribute__ ((__vector_size__ (8)));

/* FEMMS: fast clear of the MMX state (3DNow! replacement for EMMS). */
static __inline void
_m_femms (void)
{
  __builtin_ia32_femms();
}

/* PAVGUSB: packed average of 8 unsigned bytes. */
static __inline __m64
_m_pavgusb (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pavgusb ((__v8qi)__A, (__v8qi)__B);
}

/* PF2ID: convert packed floats to packed 32-bit integers. */
static __inline __m64
_m_pf2id (__m64 __A)
{
  return (__m64)__builtin_ia32_pf2id ((__v2sf)__A);
}

/* PFACC: packed float horizontal accumulate. */
static __inline __m64
_m_pfacc (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfacc ((__v2sf)__A, (__v2sf)__B);
}

/* PFADD: packed single-precision add. */
static __inline __m64
_m_pfadd (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfadd ((__v2sf)__A, (__v2sf)__B);
}

/* PFCMPEQ: packed float compare for equality (all-ones mask on true). */
static __inline __m64
_m_pfcmpeq (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfcmpeq ((__v2sf)__A, (__v2sf)__B);
}

/* PFCMPGE: packed float compare, greater-than-or-equal. */
static __inline __m64
_m_pfcmpge (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfcmpge ((__v2sf)__A, (__v2sf)__B);
}

/* PFCMPGT: packed float compare, greater-than. */
static __inline __m64
_m_pfcmpgt (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfcmpgt ((__v2sf)__A, (__v2sf)__B);
}

/* PFMAX: packed float maximum. */
static __inline __m64
_m_pfmax (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfmax ((__v2sf)__A, (__v2sf)__B);
}

/* PFMIN: packed float minimum. */
static __inline __m64
_m_pfmin (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfmin ((__v2sf)__A, (__v2sf)__B);
}

/* PFMUL: packed single-precision multiply. */
static __inline __m64
_m_pfmul (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfmul ((__v2sf)__A, (__v2sf)__B);
}

/* PFRCP: packed float reciprocal approximation. */
static __inline __m64
_m_pfrcp (__m64 __A)
{
  return (__m64)__builtin_ia32_pfrcp ((__v2sf)__A);
}

/* PFRCPIT1: first Newton-Raphson refinement step for PFRCP. */
static __inline __m64
_m_pfrcpit1 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfrcpit1 ((__v2sf)__A, (__v2sf)__B);
}

/* PFRCPIT2: second refinement step for PFRCP/PFRSQRT. */
static __inline __m64
_m_pfrcpit2 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfrcpit2 ((__v2sf)__A, (__v2sf)__B);
}

/* PFRSQRT: packed float reciprocal square-root approximation. */
static __inline __m64
_m_pfrsqrt (__m64 __A)
{
  return (__m64)__builtin_ia32_pfrsqrt ((__v2sf)__A);
}

/* PFRSQIT1: first refinement step for PFRSQRT. */
static __inline __m64
_m_pfrsqit1 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfrsqit1 ((__v2sf)__A, (__v2sf)__B);
}

/* PFSUB: packed single-precision subtract (A - B). */
static __inline __m64
_m_pfsub (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfsub ((__v2sf)__A, (__v2sf)__B);
}

/* PFSUBR: packed single-precision reverse subtract (B - A). */
static __inline __m64
_m_pfsubr (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfsubr ((__v2sf)__A, (__v2sf)__B);
}

/* PI2FD: convert packed 32-bit integers to packed floats. */
static __inline __m64
_m_pi2fd (__m64 __A)
{
  return (__m64)__builtin_ia32_pi2fd ((__v2si)__A);
}

/* PMULHRW: packed 16-bit multiply, high word with rounding. */
static __inline __m64
_m_pmulhrw (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pmulhrw ((__v4hi)__A, (__v4hi)__B);
}

/* PREFETCH: hint a read of the cache line containing __P. */
static __inline void
_m_prefetch (void *__P)
{
  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
}

/* PREFETCHW: hint a write to the cache line containing __P. */
static __inline void
_m_prefetchw (void *__P)
{
  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
}

/* Load a single float into the low half of an __m64, zeroing the high. */
static __inline __m64
_m_from_float (float __A)
{
  return (__m64)(__v2sf){ __A, 0 };
}

/* Extract the low float of an __m64 (union avoids aliasing issues). */
static __inline float
_m_to_float (__m64 __A)
{
  union { __v2sf v; float a[2]; } __tmp = { (__v2sf)__A };
  return __tmp.a[0];
}

#ifdef __3dNOW_A__

/* PF2IW: convert packed floats to packed 16-bit integers (Extended 3DNow!). */
static __inline __m64
_m_pf2iw (__m64 __A)
{
  return (__m64)__builtin_ia32_pf2iw ((__v2sf)__A);
}

/* PFNACC: packed float negative horizontal accumulate. */
static __inline __m64
_m_pfnacc (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfnacc ((__v2sf)__A, (__v2sf)__B);
}

/* PFPNACC: mixed negative/positive horizontal accumulate
   (useful for complex arithmetic). */
static __inline __m64
_m_pfpnacc (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfpnacc ((__v2sf)__A, (__v2sf)__B);
}

/* PI2FW: convert packed 16-bit integers to packed floats. */
static __inline __m64
_m_pi2fw (__m64 __A)
{
  return (__m64)__builtin_ia32_pi2fw ((__v2si)__A);
}

/* PSWAPD: swap the two 32-bit halves of an __m64. */
static __inline __m64
_m_pswapd (__m64 __A)
{
  return (__m64)__builtin_ia32_pswapdsf ((__v2sf)__A);
}

#endif /* __3dNOW_A__ */
#endif /* __3dNOW__ */

#endif /* _MM3DNOW_H_INCLUDED */









More information about the ffmpeg-devel mailing list