[FFmpeg-devel] [PATCH 02/11] x86: dcadsp: implement int8x8_fmul_int32

Loren Merritt lorenm at u.washington.edu
Fri Feb 7 03:06:26 CET 2014


On Thu, 6 Feb 2014, Christophe Gisquet wrote:

> +; void int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale)
> +%macro INT8X8_FMUL_INT32  1
> +cglobal int8x8_fmul_int32, 3,3,%1, dst, src, scale
> +    cvtsi2ss    m0, scalem
> +    mulss       m0, [pf_inv16]
> +    shufps      m0, m0, 0
> +%if cpuflag(sse2)
> +%if cpuflag(sse4)
> +    pmovsxbd    m1, [srcq+0]
> +    pmovsxbd    m2, [srcq+4]
> +%else
> +    movq        m1, [srcq]
> +    punpcklbw   m1, m1
> +    mova        m2, m1
> +    punpcklwd   m1, m1
> +    punpckhwd   m2, m2
> +    psrad       m1, 24
> +    psrad       m2, 24
> +%endif
> +    cvtdq2ps    m1, m1
> +    cvtdq2ps    m2, m2
> +%else
> +    movd       mm0, [srcq+0]
> +    movd       mm1, [srcq+4]
> +    punpcklbw  mm0, mm0
> +    punpcklbw  mm1, mm1
> +    movq       mm2, mm0
> +    movq       mm3, mm1
> +    punpcklwd  mm0, mm0
> +    punpcklwd  mm1, mm1
> +    punpckhwd  mm2, mm2
> +    punpckhwd  mm3, mm3
> +    psrad      mm0, 24
> +    psrad      mm1, 24
> +    psrad      mm2, 24
> +    psrad      mm3, 24
> +    cvtpi2ps    m1, mm0
> +    cvtpi2ps    m2, mm1
> +    cvtpi2ps    m3, mm2
> +    cvtpi2ps    m4, mm3
> +    shufps      m0, m0, 0
> +    emms
> +    unpcklpd    m1, m3
> +    unpcklpd    m2, m4

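unpcklpd is an SSE2 instruction, so it can't be used in this SSE-only branch.
Use shufps instead: shufps m2, m4, q1010 (and likewise shufps m1, m3, q1010).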

> +%endif
> +    mulps       m1, m0
> +    mulps       m2, m0
> +    mova [dstq+ 0], m1
> +    mova [dstq+16], m2
> +    REP_RET
> +%endmacro
> +
> +%if ARCH_X86_32
> +INIT_XMM sse
> +INT8X8_FMUL_INT32  5
> +%endif
> +
> +INIT_XMM sse2
> +INT8X8_FMUL_INT32  3
> +
> +INIT_XMM sse4
> +INT8X8_FMUL_INT32  3

You don't need an exact count of xmmregs unless some version of the
function uses >=7 of them. Just round it up to 5 and eliminate the macro
parameter.

--Loren Merritt


More information about the ffmpeg-devel mailing list