[FFmpeg-devel] [PATCH] VC-1 MMX DSP functions

Zuxy Meng zuxy.meng
Sun Jul 8 17:14:17 CEST 2007


Hi,

2007/7/8, Zuxy Meng <zuxy.meng at gmail.com>:
> Hi,
>
> 2007/7/7, Christophe GISQUET <christophe.gisquet at free.fr>:
> > Hello,
> >
> > here are the MMX functions now licensed under the MIT license.
> >
> > Zuxy Meng has been working on SSE2 versions of those; I'm not sure if he
> > would agree to contribute to this file using MIT license. In that case,
> > I don't mind the license being changed, but I would prefer having the
> > MIT licensing available in the svn history.
>
> I care less about license issues than raw performance :-)
>
> I did a quick test on 64-bit K8 tonight thanks to Stephan's testbed.
> The result wasn't promising. In short, from fastest to slowest:
> MMX > SSE2 w/o sw pipelining > SSE2 w/ sw pipelining
>
> The reason may be that on K8, SSE2 is throughput bound (K8 can decode 3
> MMX instructions per cycle, but only 1.5 SSE2 ones), and software
> pipelining increases the number of instructions per loop. If AMD does
> what they've promised on their upcoming K10, I guess the result will be:
> SSE2 w/o sw pipelining > SSE2 w/ sw pipelining > MMX
>
> And IIRC on your 32-bit Conroe, where SSE2 is latency bound (punpcklbw
> and unaligned movq are slow), the list is somewhat different:
> SSE2 w/ sw pipelining > MMX > SSE2 w/o sw pipelining
>
> On my Dothan:
> MMX > SSE2 w/ sw pipelining > SSE2 w/o sw pipelining
>
> So the conclusion is that I can't draw a conclusion. Any suggestions?

I just tried unrolling the loop so that the number of instructions per
iteration stays the same after software pipelining, and the speed
improves a little:

Now SSE2 is about the same speed as MMX (+- 0.5%) both on my Dothan
and Stephan's 64-bit K8.
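
For illustration only, here is the idea in plain C on a trivial 2-tap
average (not the actual VC-1 DSP loop, names are mine): unrolling by two
lets the next iteration's load be hoisted above the current store without
raising the per-iteration instruction count.

    #include <stdint.h>

    static void avg_plain(uint8_t *dst, const uint8_t *src, int n)
    {
        int i;
        for (i = 0; i < n; i++)
            dst[i] = (src[i] + src[i+1] + 1) >> 1;
    }

    /* Unrolled by two; 'a' carries the value already loaded for the next
     * step, so each half-iteration still does one load, one add and one
     * store (n must be even; reads src[0..n], like the plain loop). */
    static void avg_unrolled(uint8_t *dst, const uint8_t *src, int n)
    {
        int i, b, a = src[0];
        for (i = 0; i < n; i += 2) {
            b        = src[i+1];
            dst[i]   = (a + b + 1) >> 1;
            a        = src[i+2];
            dst[i+1] = (b + a + 1) >> 1;
        }
    }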

The attached patch isn't against Christophe's newest version and may look
ugly, but it serves as a base for further improvement.
-- 
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
/*
 * VC-1 and WMV3 decoder - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License version 2.0 as published by the Free Software Foundation.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include "dsputil.h"
#include "x86_cpu.h"

DECLARE_ALIGNED_16(static const uint64_t,ff_fact_53[2] ) = { 0x0035003500350035ULL, 0x0035003500350035ULL };
DECLARE_ALIGNED_16(static const uint64_t,ff_fact_18[2] ) = { 0x0012001200120012ULL, 0x0012001200120012ULL };

/** Add rounder from mm7 to mm3 and mm4, then pack and store the result at the destination */
#define NORMALIZE_MMX(SHIFT)                                               \
     "paddsw    %%mm7, %%mm3               \n\t" /* +bias-r */             \
     "paddsw    %%mm7, %%mm4               \n\t" /* +bias-r */             \
     "psraw     $"SHIFT", %%mm3            \n\t"                           \
     "psraw     $"SHIFT", %%mm4            \n\t"                           \
     "packuswb  %%mm4, %%mm3               \n\t"                           \
     "movq      %%mm3, (%2)                \n\t"

/** Compute the rounder 32-r or 8-r and unpack it to mm7 */
#define LOAD_ROUNDER_MMX                                                   \
     "movd       %7, %%mm7                 \n\t"                           \
     "punpcklwd  %%mm7, %%mm7              \n\t"                           \
     "punpckldq  %%mm7, %%mm7              \n\t" /* pshufw */

/** Add rounder from xmm7 to the given XMM register, then pack and store the result at the destination */
#define NORMALIZE_SSE2(SHIFT, XMM)                                         \
     "paddsw    %%xmm7, "XMM"             \n\t" /* +bias-r */             \
     "psraw     $"SHIFT", "XMM"           \n\t"                           \
     "packuswb  "XMM", "XMM"             \n\t"                           \
     "movq      "XMM", (%2)               \n\t"

/** Compute the rounder 32-r or 8-r and unpack it to xmm7 */
#define LOAD_ROUNDER_SSE2                                                  \
     "movd       %7, %%xmm7                \n\t"                           \
     "punpcklwd  %%xmm7, %%xmm7            \n\t"                           \
     "pshufd     $0, %%xmm7, %%xmm7        \n\t"

/** 1/2 shift for MMX instruction set */
static void vc1_put_shift2_mmx(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset)
{
    src -= offset;
    rnd = 8-rnd;
    asm volatile(
        LOAD_ROUNDER_MMX
        ASMALIGN(3)
        "1:                                \n\t"
        "movd      0(%1,%5  ), %%mm3       \n\t"
        "movd      4(%1,%5  ), %%mm4       \n\t"
        "movd      0(%1,%5,2), %%mm1        \n\t"
        "movd      4(%1,%5,2), %%mm2        \n\t"
        "punpcklbw %%mm0, %%mm3            \n\t"
        "punpcklbw %%mm0, %%mm4            \n\t"
        "punpcklbw %%mm0, %%mm1            \n\t"
        "punpcklbw %%mm0, %%mm2            \n\t"
        "paddw     %%mm1, %%mm3            \n\t"
        "paddw     %%mm2, %%mm4            \n\t"
        "movq      %%mm3, %%mm1            \n\t"
        "movq      %%mm4, %%mm2            \n\t"
        "psllw     $3, %%mm3               \n\t" /* 8* */
        "psllw     $3, %%mm4               \n\t" /* 8* */
        "paddw     %%mm1, %%mm3            \n\t" /* 9,9 */
        "paddw     %%mm2, %%mm4            \n\t" /* 9,9 */
        "movd      0(%1     ), %%mm1       \n\t"
        "movd      4(%1     ), %%mm2       \n\t"
        "punpcklbw %%mm0, %%mm1            \n\t"
        "punpcklbw %%mm0, %%mm2            \n\t"
        "psubsw    %%mm1, %%mm3            \n\t" /* -1,9,9 */
        "psubsw    %%mm2, %%mm4            \n\t" /* -1,9,9 */
        "movd      0(%1,%6  ), %%mm1       \n\t"
        "movd      4(%1,%6  ), %%mm2       \n\t"
        "punpcklbw %%mm0, %%mm1            \n\t"
        "punpcklbw %%mm0, %%mm2            \n\t"
        "psubsw    %%mm1, %%mm3            \n\t"
        "psubsw    %%mm2, %%mm4            \n\t"
        NORMALIZE_MMX("4")
        "add  %3, %1                       \n\t"
        "add  %4, %2                       \n\t"
        "dec  %0                           \n\t"
        "jnz 1b                            \n\t"
        : "+g"(h), "+r" (src),  "+r" (dst)
        : "g"((intptr_t)sstr), "g"((intptr_t)dstr), "r"((intptr_t)offset), "r"((intptr_t)(3*offset)), "g"(rnd)
        : "memory"
    );
}
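
/* For illustration only: an approximate scalar equivalent of the
 * (-1, 9, 9, -1)/16 half-pel filter computed above; 'offset' selects the
 * direction (1 = horizontal, stride = vertical). Hypothetical sketch: it
 * ignores the intermediate signed 16-bit saturation of the MMX code and
 * is not used by the decoder. */
static void vc1_put_shift2_c_sketch(uint8_t *dst, int dstr, const uint8_t *src,
                                    int sstr, int h, int rnd, int offset)
{
    int x, y, v;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++) {
            v = 9*(src[x] + src[x + offset])
                - src[x - offset] - src[x + 2*offset] + 8 - rnd;
            v >>= 4;
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        src += sstr;
        dst += dstr;
    }
}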


/**
 * Macro to build vc1_put_shift[13].
 * Parameters passed must use %5 (offset) and %6 (3*offset).
 *
 * @param  NAME   Either 1 or 3
 * @param  A1     Offset for tap having coefficient -3
 * @param  A2     Offset for tap having coefficient 18
 * @param  A3     Offset for tap having coefficient 53
 * @param  A4     Offset for tap having coefficient -4
 */
#ifndef ARCH_X86_64
#define MSPEL_FILTER13(NAME, A1, A2, A3, A4)                            \
static void vc1_put_shift ## NAME ## _mmx(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset) \
{                                                                       \
    src -= offset;                                                      \
    rnd = 32-rnd;                                                       \
    asm volatile(                                                       \
        LOAD_ROUNDER_MMX                                                \
        ASMALIGN(3)                                                     \
        "1:                                \n\t"                        \
        "movd      0"A1", %%mm1            \n\t"                        \
        "movd      4"A1", %%mm2            \n\t"                        \
        "punpcklbw %%mm0, %%mm1            \n\t"                        \
        "punpcklbw %%mm0, %%mm2            \n\t"                        \
        "movq      %%mm1, %%mm3            \n\t"                        \
        "movq      %%mm2, %%mm4            \n\t"                        \
        "paddw     %%mm1, %%mm1            \n\t"                        \
        "paddw     %%mm2, %%mm2            \n\t"                        \
        "paddsw    %%mm3, %%mm1            \n\t" /* 3* */               \
        "paddsw    %%mm4, %%mm2            \n\t" /* 3* */               \
        "movd      0"A2", %%mm3            \n\t"                        \
        "movd      4"A2", %%mm4            \n\t"                        \
        "punpcklbw %%mm0, %%mm3            \n\t"                        \
        "punpcklbw %%mm0, %%mm4            \n\t"                        \
        "pmullw    %%mm6, %%mm3            \n\t" /* *18 */              \
        "pmullw    %%mm6, %%mm4            \n\t" /* *18 */              \
        "psubsw    %%mm1, %%mm3            \n\t" /*18,-3 */             \
        "psubsw    %%mm2, %%mm4            \n\t" /*18,-3 */             \
        "movd      0"A3", %%mm1            \n\t"                        \
        "movd      4"A3", %%mm2            \n\t"                        \
        "punpcklbw %%mm0, %%mm1            \n\t"                        \
        "punpcklbw %%mm0, %%mm2            \n\t"                        \
        "pmullw    %%mm5, %%mm1            \n\t" /* *53 */              \
        "pmullw    %%mm5, %%mm2            \n\t" /* *53 */              \
        "paddsw    %%mm1, %%mm3            \n\t" /*53,18,-3 */          \
        "paddsw    %%mm2, %%mm4            \n\t" /*53,18,-3 */          \
        "movd      0"A4", %%mm1            \n\t"                        \
        "movd      4"A4", %%mm2            \n\t"                        \
        "punpcklbw %%mm0, %%mm1            \n\t"                        \
        "punpcklbw %%mm0, %%mm2            \n\t"                        \
        "psllw     $2, %%mm1               \n\t" /* 4* */               \
        "psllw     $2, %%mm2               \n\t" /* 4* */               \
        "psubsw    %%mm1, %%mm3            \n\t"                        \
        "psubsw    %%mm2, %%mm4            \n\t"                        \
        NORMALIZE_MMX("6")                                              \
        "add       %3, %1                  \n\t"                        \
        "add       %4, %2                  \n\t"                        \
        "dec       %0                      \n\t"                        \
        "jnz 1b                            \n\t"                        \
        : "+g"(h), "+r" (src),  "+r" (dst)                              \
        : "g"((intptr_t)sstr), "g"((intptr_t)dstr), "r"((intptr_t)offset), "r"((intptr_t)(3*offset)), "g"(rnd)    \
        : "memory"                                                      \
    );                                                                  \
};                                                                      \
                                                                        \
static void vc1_put_shift ## NAME ## _sse2(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset) \
{                                                                       \
    src -= offset;                                                      \
    rnd = 32-rnd;                                                       \
    asm volatile(                                                       \
        LOAD_ROUNDER_SSE2                                               \
        "movq      "A2", %%xmm5            \n\t"                        \
        "movq      "A1", %%xmm6            \n\t"                        \
        "punpcklbw %%xmm0, %%xmm5          \n\t"\
        "punpcklbw %%xmm0, %%xmm6          \n\t"\
        ASMALIGN(3)                                                     \
        "1:                                \n\t"                        \
        "movq      "A4", %%xmm1            \n\t"                        \
        "movq      "A3", %%xmm2            \n\t"                        \
        "punpcklbw %%xmm0, %%xmm1          \n\t"                        \
        "punpcklbw %%xmm0, %%xmm2          \n\t"                        \
        "add       %3, %1                  \n\t"                        \
	"movdqa %%xmm5, %%xmm3\n\t"\
	"movdqa %%xmm6, %%xmm4\n\t"\
        "movq      "A2", %%xmm5            \n\t"                        \
        "movq      "A1", %%xmm6            \n\t"                        \
        "punpcklbw %%xmm0, %%xmm5          \n\t"                        \
        "punpcklbw %%xmm0, %%xmm6          \n\t"                        \
        "pmullw    %8, %%xmm2          \n\t" /* *53 */              \
        "psllw     $2, %%xmm1              \n\t" /* *4  */              \
        "pmullw    %9, %%xmm3          \n\t" /* *18 */              \
        "psubsw    %%xmm4, %%xmm2          \n\t" /* 53,-1 */            \
        "paddsw    %%xmm4, %%xmm1          \n\t" /* 4,1 */              \
        "psubsw    %%xmm4, %%xmm3          \n\t" /* 18,-1 */            \
        "psubsw    %%xmm1, %%xmm2          \n\t" /* -4,53,-2 */         \
        "paddsw    %%xmm2, %%xmm3          \n\t" /* -4,53,18,-3 */      \
        NORMALIZE_SSE2("6", "%%xmm3")                                   \
        "add       %4, %2                  \n\t"                        \
        "dec       %0                      \n\t"                        \
        "jnz 1b                            \n\t"                        \
        : "+g"(h), "+r" (src),  "+r" (dst)                              \
        : "g"((intptr_t)sstr), "g"((intptr_t)dstr), "r"((intptr_t)offset), "r"((intptr_t)(3*offset)), "g"(rnd), "m"(*ff_fact_53), "m"(*ff_fact_18)    \
        : "memory"                                                      \
    );                                                                  \
}
#else
#define MSPEL_FILTER13(NAME, A1, A2, A3, A4)                            \
static void vc1_put_shift ## NAME ## _mmx(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset) {}\
static void vc1_put_shift ## NAME ## _sse2(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset) \
{                                                                       \
    src -= offset;                                                      \
    rnd = 32-rnd;                                                       \
    asm volatile(                                                       \
        LOAD_ROUNDER_SSE2                                               \
        "movq      "A4", %%xmm11           \n\t"                        \
        "movq      "A3", %%xmm12           \n\t"                        \
        "movq      "A2", %%xmm5            \n\t"                        \
        "movq      "A1", %%xmm6            \n\t"                        \
        "punpcklbw %%xmm0, %%xmm11         \n\t"                        \
        "punpcklbw %%xmm0, %%xmm12         \n\t"                        \
        "punpcklbw %%xmm0, %%xmm5          \n\t"\
        "punpcklbw %%xmm0, %%xmm6          \n\t"\
        ASMALIGN(3)                                                     \
        "1:                                \n\t"                        \
        "add       %3, %1                  \n\t"                        \
        "movq      "A4", %%xmm1           \n\t"                        \
        "movq      "A3", %%xmm2           \n\t"                        \
        "movq      "A2", %%xmm3            \n\t"                        \
        "movq      "A1", %%xmm4            \n\t"                        \
        "punpcklbw %%xmm0, %%xmm1         \n\t"                        \
        "punpcklbw %%xmm0, %%xmm2         \n\t"                        \
        "punpcklbw %%xmm0, %%xmm3          \n\t"                        \
        "punpcklbw %%xmm0, %%xmm4          \n\t"                        \
        "pmullw    %%xmm8, %%xmm12          \n\t" /* *53 */              \
        "psllw     $2, %%xmm11              \n\t" /* *4  */              \
        "pmullw    %%xmm9, %%xmm5          \n\t" /* *18 */              \
        "psubsw    %%xmm6, %%xmm12          \n\t" /* 53,-1 */            \
        "paddsw    %%xmm6, %%xmm11          \n\t" /* 4,1 */              \
        "psubsw    %%xmm6, %%xmm5          \n\t" /* 18,-1 */            \
        "psubsw    %%xmm11, %%xmm12          \n\t" /* -4,53,-2 */         \
        "paddsw    %%xmm12, %%xmm5          \n\t" /* -4,53,18,-3 */      \
        NORMALIZE_SSE2("6", "%%xmm5")                                   \
        "add       %4, %2                  \n\t"                        \
        "dec       %0                      \n\t"                        \
        "jz 1f                            \n\t"                        \
        "add       %3, %1                  \n\t"                        \
        "movq      "A4", %%xmm11           \n\t"                        \
        "movq      "A3", %%xmm12           \n\t"                        \
        "movq      "A2", %%xmm5            \n\t"                        \
        "movq      "A1", %%xmm6            \n\t"                        \
        "punpcklbw %%xmm0, %%xmm11         \n\t"                        \
        "punpcklbw %%xmm0, %%xmm12         \n\t"                        \
        "punpcklbw %%xmm0, %%xmm5          \n\t"                        \
        "punpcklbw %%xmm0, %%xmm6          \n\t"                        \
        "pmullw    %%xmm8, %%xmm2          \n\t" /* *53 */              \
        "psllw     $2, %%xmm1              \n\t" /* *4  */              \
        "pmullw    %%xmm9, %%xmm3          \n\t" /* *18 */              \
        "psubsw    %%xmm4, %%xmm2          \n\t" /* 53,-1 */            \
        "paddsw    %%xmm4, %%xmm1          \n\t" /* 4,1 */              \
        "psubsw    %%xmm4, %%xmm3          \n\t" /* 18,-1 */            \
        "psubsw    %%xmm1, %%xmm2          \n\t" /* -4,53,-2 */         \
        "paddsw    %%xmm2, %%xmm3          \n\t" /* -4,53,18,-3 */      \
        NORMALIZE_SSE2("6", "%%xmm3")                                   \
        "add       %4, %2                  \n\t"                        \
        "dec       %0                      \n\t"                        \
	"jnz 1b\n\t"\
	"1:\n\t"\
	"nop\n\t"\
        : "+g"(h), "+r" (src),  "+r" (dst)                              \
        : "g"((intptr_t)sstr), "g"((intptr_t)dstr), "r"((intptr_t)offset), "r"((intptr_t)(3*offset)), "g"(rnd)    \
        : "memory"                                                      \
    );                                                                  \
}
#endif

/** 1/4 shift MMX and SSE2 */
MSPEL_FILTER13(1, "(%1,%6  )", "(%1,%5,2)", "(%1,%5  )", "(%1     )")
/** 3/4 shift MMX and SSE2 */
MSPEL_FILTER13(3, "(%1     )", "(%1,%5  )", "(%1,%5,2)", "(%1,%6  )")
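
/* For illustration only: an approximate scalar equivalent of the two
 * quarter-pel filters instantiated above. vc1_put_shift1 uses the taps
 * (-4, 53, 18, -3)/64 and vc1_put_shift3 the reversed set; as with the
 * half-pel sketch, the intermediate signed 16-bit saturation of the SIMD
 * code is ignored and this hypothetical helper is not used by the decoder. */
static void vc1_put_shift13_c_sketch(uint8_t *dst, int dstr, const uint8_t *src,
                                     int sstr, int h, int rnd, int offset,
                                     int three_quarters)
{
    static const int taps[2][4] = { { -4, 53, 18, -3 },   /* 1/4 shift */
                                    { -3, 18, 53, -4 } }; /* 3/4 shift */
    const int *t = taps[!!three_quarters];
    int x, y, v;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++) {
            v = t[0]*src[x - offset] + t[1]*src[x]
              + t[2]*src[x + offset] + t[3]*src[x + 2*offset] + 32 - rnd;
            v >>= 6;
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        src += sstr;
        dst += dstr;
    }
}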

/** 1/2 shift for SSE2 instruction set */
static void vc1_put_shift2_sse2(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset)
{
    src -= offset;
    rnd = 8-rnd;
    asm volatile(
        LOAD_ROUNDER_SSE2
#ifdef ARCH_X86_64
        "movq      (%1     ), %%xmm11       \n\t"
        "movq      (%1,%5  ), %%xmm12       \n\t"
#endif
        "movq      (%1,%5,2), %%xmm5       \n\t"
        "movq      (%1,%6  ), %%xmm6       \n\t"
#ifdef ARCH_X86_64
        "punpcklbw %%xmm0, %%xmm11          \n\t"
        "punpcklbw %%xmm0, %%xmm12          \n\t"
#endif
        "punpcklbw %%xmm0, %%xmm5          \n\t"
        "punpcklbw %%xmm0, %%xmm6          \n\t"
        ASMALIGN(3)
        "1:                                \n\t"
#ifndef ARCH_X86_64
        "movq      (%1     ), %%xmm1       \n\t"
        "movq      (%1,%5  ), %%xmm2       \n\t"
        "punpcklbw %%xmm0, %%xmm1          \n\t"
        "punpcklbw %%xmm0, %%xmm2          \n\t"
#endif
        "add  %3, %1                       \n\t"
#ifdef ARCH_X86_64
        "movq      (%1     ), %%xmm1       \n\t"
        "movq      (%1,%5  ), %%xmm2       \n\t"
#endif
        "movq      (%1,%5,2), %%xmm3       \n\t"
        "movq      (%1,%6  ), %%xmm4       \n\t"
#ifdef ARCH_X86_64
        "punpcklbw %%xmm0, %%xmm1          \n\t"
        "punpcklbw %%xmm0, %%xmm2          \n\t"
#endif
        "punpcklbw %%xmm0, %%xmm3          \n\t"
        "punpcklbw %%xmm0, %%xmm4          \n\t"
#ifndef ARCH_X86_64
        "paddsw    %%xmm2, %%xmm5          \n\t"
        "paddsw    %%xmm1, %%xmm6          \n\t"
        "movdqa    %%xmm5, %%xmm2          \n\t"
        "psllw     $3, %%xmm5              \n\t" /* 8* */
        "paddw     %%xmm2, %%xmm5          \n\t" /* 9,9 */
        "psubsw    %%xmm6, %%xmm5          \n\t"
#else
        "paddsw    %%xmm12, %%xmm5          \n\t"
        "paddsw    %%xmm11, %%xmm6          \n\t"
        "movdqa    %%xmm5, %%xmm12          \n\t"
        "psllw     $3, %%xmm5              \n\t" /* 8* */
        "paddw     %%xmm12, %%xmm5          \n\t" /* 9,9 */
        "psubsw    %%xmm6, %%xmm5          \n\t"
#endif
        NORMALIZE_SSE2("4", "%%xmm5")
        "add  %4, %2                       \n\t"
        "dec  %0                           \n\t"
        "jz 1f                            \n\t"
#ifndef ARCH_X86_64
        "movq      (%1     ), %%xmm1       \n\t"
        "movq      (%1,%5  ), %%xmm2       \n\t"
        "punpcklbw %%xmm0, %%xmm1          \n\t"
        "punpcklbw %%xmm0, %%xmm2          \n\t"
#endif
        "add  %3, %1                       \n\t"
#ifdef ARCH_X86_64
        "movq      (%1     ), %%xmm11       \n\t"
        "movq      (%1,%5  ), %%xmm12       \n\t"
#endif
        "movq      (%1,%5,2), %%xmm5       \n\t"
        "movq      (%1,%6  ), %%xmm6       \n\t"
#ifdef ARCH_X86_64
        "punpcklbw %%xmm0, %%xmm11          \n\t"
        "punpcklbw %%xmm0, %%xmm12          \n\t"
#endif
        "punpcklbw %%xmm0, %%xmm5          \n\t"
        "punpcklbw %%xmm0, %%xmm6          \n\t"
        "paddsw    %%xmm2, %%xmm3          \n\t"
        "paddsw    %%xmm1, %%xmm4          \n\t"
        "movdqa    %%xmm3, %%xmm2          \n\t"
        "psllw     $3, %%xmm3              \n\t" /* 8* */
        "paddw     %%xmm2, %%xmm3          \n\t" /* 9,9 */
        "psubsw    %%xmm4, %%xmm3          \n\t"
        NORMALIZE_SSE2("4", "%%xmm3")
        "add  %4, %2                       \n\t"
        "dec  %0                           \n\t"
        "jnz 1b                            \n\t"
	"1:\n\t"
	"nop\n\t"
        : "+g"(h), "+r" (src),  "+r" (dst)
        : "g"((intptr_t)sstr), "g"((intptr_t)dstr), "r"((intptr_t)offset), "r"((intptr_t)(3*offset)), "g"(rnd)
        : "memory"
    );
}


extern void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h);
typedef void (*vc1_mspel_mc_filter)(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset);

/** Interpolate fractional pel values using MMX or SSE2 */
static inline void vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride, int mode, int rnd, const int sse2)
{
    const uint8_t *tptr;
    int           tptrstr;
    int           mode1 = mode & 3;
    int           mode2 = (mode >> 2) & 3;
    DECLARE_ALIGNED_16(uint8_t, tmp[8*11]);
    vc1_mspel_mc_filter vc1_put_shift[4];

    if (sse2) {

        vc1_put_shift[1] = vc1_put_shift1_sse2;
        vc1_put_shift[2] = vc1_put_shift2_sse2;
        vc1_put_shift[3] = vc1_put_shift3_sse2;

        asm volatile(
            "pxor       %%xmm0, %%xmm0 \n\t"
#ifdef ARCH_X86_64
	    "movdqa     %0, %%xmm8 \n\t"
	    "movdqa     %1, %%xmm9 \n\t"
#endif
            :: "m"(*ff_fact_53), "m"(*ff_fact_18)
        );
    } else {

        vc1_put_shift[1] = vc1_put_shift1_mmx;
        vc1_put_shift[2] = vc1_put_shift2_mmx;
        vc1_put_shift[3] = vc1_put_shift3_mmx;

        asm volatile(
            "pxor %%mm0, %%mm0         \n\t"
            "movq %0, %%mm5            \n\t"
            "movq %1, %%mm6            \n\t"
            :: "m"(*ff_fact_53), "m"(*ff_fact_18)
        );
    }

    /* Translation: tmp=src-stride, tmp+8=src, ... */
    if (mode1) { /* Horizontal filter to apply */
        if (mode2) { /* Vertical filter to apply, output to tmp */
            vc1_put_shift[mode1](tmp, 8, src-stride, stride, 11, rnd, 1);
            tptr = tmp+8;
            tptrstr = 8;
        } else { /* No vertical filter, output 8 lines to dst */
            vc1_put_shift[mode1](dst, stride, src, stride, 8, rnd, 1);
            return;
        }
    } else {
        /* No horizontal filter, use directly src as input */
        tptr = src;
        tptrstr = stride;
        /* put_vc1_mspel_mc00_mmx directly calls put_pixels8_mmx */
    }

    vc1_put_shift[mode2](dst, stride, tptr, tptrstr, 8, 1-rnd, tptrstr);
}
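
/* For illustration only: the wrappers generated below pack the horizontal
 * quarter-pel shift 'a' into bits 0-1 of 'mode' and the vertical shift 'b'
 * into bits 2-3, which is what mode1/mode2 above extract. Hypothetical
 * helper mirroring one DECLARE_FUNCTIONS expansion; not used by the decoder. */
static inline void vc1_mspel_mc_ab(uint8_t *dst, const uint8_t *src,
                                   int stride, int a, int b,
                                   int rnd, int sse2)
{
    vc1_mspel_mc(dst, src, stride, a + (b << 2), rnd, sse2);
}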

static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_mmx(dst, src, stride, 8);
}

#define DECLARE_FUNCTIONS(a, b)                 \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
    vc1_mspel_mc(dst, src, stride, a + (b<<2), rnd, 0);                 \
};                                                                      \
static void put_vc1_mspel_mc ## a ## b ## _sse2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
    vc1_mspel_mc(dst, src, stride, a + (b<<2), rnd, 1);                 \
}

DECLARE_FUNCTIONS(0, 1)
DECLARE_FUNCTIONS(0, 2)
DECLARE_FUNCTIONS(0, 3)

DECLARE_FUNCTIONS(1, 0)
DECLARE_FUNCTIONS(1, 1)
DECLARE_FUNCTIONS(1, 2)
DECLARE_FUNCTIONS(1, 3)

DECLARE_FUNCTIONS(2, 0)
DECLARE_FUNCTIONS(2, 1)
DECLARE_FUNCTIONS(2, 2)
DECLARE_FUNCTIONS(2, 3)

DECLARE_FUNCTIONS(3, 0)
DECLARE_FUNCTIONS(3, 1)
DECLARE_FUNCTIONS(3, 2)
DECLARE_FUNCTIONS(3, 3)

void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
    dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;

    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
}

void ff_vc1dsp_init_sse2(DSPContext* dsp, AVCodecContext *avctx) {
    dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_sse2;
    dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_sse2;
    dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_sse2;

    dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_sse2;
    dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_sse2;
    dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_sse2;
    dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_sse2;

    dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_sse2;
    dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_sse2;
    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_sse2;
    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_sse2;

    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_sse2;
    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_sse2;
    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_sse2;
    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_sse2;
}


