[FFmpeg-devel] [PATCH] x86/aacpsdsp: add SSE and SSE3 optimized functions

Michael Niedermayer michael at niedermayer.cc
Sat Jul 25 14:39:20 CEST 2015


On Fri, Jul 24, 2015 at 11:00:55PM -0300, James Almer wrote:
> Between 1.5 and 2.5 times faster
> 
> Signed-off-by: James Almer <jamrial at gmail.com>
> ---
> There are a couple missing, like ps_stereo_interpolate_ipdopd, which I wanted to write
> but couldn't test because it was not used by any of the samples I tried.
> 
>  libavcodec/aacps.c             |   4 +-
>  libavcodec/aacpsdsp.h          |   1 +
>  libavcodec/aacpsdsp_template.c |   2 +
>  libavcodec/x86/Makefile        |   6 +-
>  libavcodec/x86/aacpsdsp.asm    | 212 +++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/aacpsdsp_init.c |  55 +++++++++++
>  6 files changed, 276 insertions(+), 4 deletions(-)
>  create mode 100644 libavcodec/x86/aacpsdsp.asm
>  create mode 100644 libavcodec/x86/aacpsdsp_init.c
> 
> diff --git a/libavcodec/aacps.c b/libavcodec/aacps.c
> index bf60475..eec6e30 100644
> --- a/libavcodec/aacps.c
> +++ b/libavcodec/aacps.c
> @@ -936,8 +936,8 @@ static void stereo_processing(PSContext *ps, INTFLOAT (*l)[32][2], INTFLOAT (*r)
>              H22[0][e+1][b] = h22;
>          }
>          for (k = 0; k < NR_BANDS[is34]; k++) {
> -            INTFLOAT h[2][4];
> -            INTFLOAT h_step[2][4];
> +            LOCAL_ALIGNED_16(INTFLOAT, h, [2], [4]);
> +            LOCAL_ALIGNED_16(INTFLOAT, h_step, [2], [4]);
>              int start = ps->border_position[e];
>              int stop  = ps->border_position[e+1];
>              INTFLOAT width = Q30(1.f) / (stop - start);
> diff --git a/libavcodec/aacpsdsp.h b/libavcodec/aacpsdsp.h
> index 9e3c5aa..c194bbe 100644
> --- a/libavcodec/aacpsdsp.h
> +++ b/libavcodec/aacpsdsp.h
> @@ -52,5 +52,6 @@ typedef struct PSDSPContext {
>  void AAC_RENAME(ff_psdsp_init)(PSDSPContext *s);
>  void ff_psdsp_init_arm(PSDSPContext *s);
>  void ff_psdsp_init_mips(PSDSPContext *s);
> +void ff_psdsp_init_x86(PSDSPContext *s);
>  
>  #endif /* LIBAVCODEC_AACPSDSP_H */
> diff --git a/libavcodec/aacpsdsp_template.c b/libavcodec/aacpsdsp_template.c
> index bfec828..3049ce8 100644
> --- a/libavcodec/aacpsdsp_template.c
> +++ b/libavcodec/aacpsdsp_template.c
> @@ -224,5 +224,7 @@ av_cold void AAC_RENAME(ff_psdsp_init)(PSDSPContext *s)
>          ff_psdsp_init_arm(s);
>      if (ARCH_MIPS)
>          ff_psdsp_init_mips(s);
> +    if (ARCH_X86)
> +        ff_psdsp_init_x86(s);
>  #endif /* !USE_FIXED */
>  }
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index a515ebd..c403770 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -38,7 +38,8 @@ OBJS-$(CONFIG_VP8DSP)                  += x86/vp8dsp_init.o
>  OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
>  
>  # decoders/encoders
> -OBJS-$(CONFIG_AAC_DECODER)             += x86/sbrdsp_init.o
> +OBJS-$(CONFIG_AAC_DECODER)             += x86/aacpsdsp_init.o          \
> +                                          x86/sbrdsp_init.o
>  OBJS-$(CONFIG_ADPCM_G722_DECODER)      += x86/g722dsp_init.o
>  OBJS-$(CONFIG_ADPCM_G722_ENCODER)      += x86/g722dsp_init.o
>  OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
> @@ -130,7 +131,8 @@ YASM-OBJS-$(CONFIG_VP8DSP)             += x86/vp8dsp.o                  \
>                                            x86/vp8dsp_loopfilter.o
>  
>  # decoders/encoders
> -YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
> +YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/aacpsdsp.o                \
> +                                          x86/sbrdsp.o
>  YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
>  YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
>  YASM-OBJS-$(CONFIG_APNG_DECODER)       += x86/pngdsp.o
> diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
> new file mode 100644
> index 0000000..d416944
> --- /dev/null
> +++ b/libavcodec/x86/aacpsdsp.asm
> @@ -0,0 +1,212 @@
> +;******************************************************************************
> +;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
> +;*
> +;* Copyright (C) 2015 James Almer
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
> +
> +SECTION_TEXT
> +
> +;*************************************************************************
> +;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
> +;*************************************************************************
> +%macro PS_ADD_SQUARES 1
> +cglobal ps_add_squares, 3, 3, %1, dst, src, n
> +.loop:
> +    movaps m0, [srcq]
> +    movaps m1, [srcq+mmsize]
> +    mulps  m0, m0
> +    mulps  m1, m1
> +%if cpuflag(sse3)
> +    haddps m0, m1
> +%else
> +    movaps m3, m0
> +    movaps m4, m1
> +    shufps m3, m3, q0301
> +    shufps m4, m4, q0301
> +    addps  m0, m3
> +    addps  m1, m4
> +    shufps m0, m1, q2020
> +%endif
> +    addps  m0, [dstq]
> +    movaps [dstq], m0
> +    add  dstq, mmsize
> +    add  srcq, mmsize*2
> +    sub    nd, mmsize/4
> +    jg .loop
> +    REP_RET
> +%endmacro
> +
> +INIT_XMM sse
> +PS_ADD_SQUARES 3
> +INIT_XMM sse3
> +PS_ADD_SQUARES 5
> +
> +;*******************************************************************
> +;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
> +;                                   float *src1, int n);
> +;*******************************************************************
> +INIT_XMM sse
> +cglobal ps_mul_pair_single, 4, 5, 4, dst, src1, src2, n
> +    xor r4q, r4q
> +
> +.loop:
> +    movu     m0, [src1q+r4q]
> +    movu     m1, [src1q+r4q+mmsize]
> +    mova     m2, [src2q]
> +    mova     m3, m2
> +    unpcklps m2, m2
> +    unpckhps m3, m3
> +    mulps    m0, m2
> +    mulps    m1, m3
> +    mova [dstq+r4q], m0
> +    mova [dstq+r4q+mmsize], m1
> +    add   src2q, mmsize
> +    add     r4q, mmsize*2
> +    sub      nd, mmsize/4
> +    jg .loop
> +    REP_RET
> +

> +;***********************************************************************
> +;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
> +;                                   float h[2][4], float h_step[2][4],
> +;                                   int len);
> +;***********************************************************************
> +INIT_XMM sse3
> +cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
> +    movaps   m0, [hq]
> +    movaps   m1, [h_stepq]
> +    shl      nd, 3
> +    add      lq, nq
> +    add      rq, nq
> +    neg      nq
> +
> +align 16
> +.loop:

This assumes n >= 0.
I don't think the calling code guarantees this.
Either the calling code should be changed, or this should be checked
for.

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

In a rich man's house there is no place to spit but his face.
-- Diogenes of Sinope
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20150725/b32076ec/attachment.sig>


More information about the ffmpeg-devel mailing list