[FFmpeg-devel] [PATCH 5/9] SBR DSP x86: implement SSE hf_apply_noise

Michael Niedermayer michaelni at gmx.at
Fri Apr 5 16:36:15 CEST 2013


On Thu, Apr 04, 2013 at 07:45:49PM +0000, Christophe Gisquet wrote:
> 497 to 255 cycles on Arrandale and Win64.
> Replacing the multiplication by s_m[m] by an andps and an xorps with
> appropriate vectors is slower. Unrolling is a 15 cycles win.
> ---
>  libavcodec/x86/sbrdsp.asm    | 100 +++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/sbrdsp_init.c |  16 +++++++
>  2 files changed, 116 insertions(+)
> 
> diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
> index 573981a..fdeaa2b 100644
> --- a/libavcodec/x86/sbrdsp.asm
> +++ b/libavcodec/x86/sbrdsp.asm
> @@ -25,6 +25,12 @@ SECTION_RODATA
>  ; mask equivalent for multiply by -1.0 1.0
>  ps_mask         times 2 dd 1<<31, 0
>  ps_neg          times 4 dd 1<<31
> +ps_noise0       times 2 dd  1.0,  0.0,
> +ps_noise2       times 2 dd -1.0,  0.0
> +ps_noise13      dd  0.0,  1.0, 0.0, -1.0
> +                dd  0.0, -1.0, 0.0,  1.0
> +                dd  0.0,  1.0, 0.0, -1.0
> +cextern         sbr_noise_table
>  
>  SECTION_TEXT
>  
> @@ -301,3 +307,97 @@ cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
>      sub        cq, 2*mmsize
>      jge     .loop
>      REP_RET
> +
> +INIT_XMM sse
> +%ifdef PIC

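I suspect you meant %ifndef here: as written, the PIC build uses the direct sbr_noise_table reference while the non-PIC build reserves the extra register, which looks inverted.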


> +%define NPICREGS 0
> +%define NOISE_TABLE sbr_noise_table
> +%else
> +%define NPICREGS 1
> +%define NOISE_TABLE r5q
> +%endif
> +; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
> +;                      const float *q_filt, int noise,
> +;                      int kx, int m_max)
> +cglobal sbr_hf_apply_noise_0, 5,5+NPICREGS,8, Y,s_m,q_filt,noise,kx,m_max
> +    mova       m0, [ps_noise0]
> +    jmp apply_noise_main
> +
> +; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
> +;                      const float *q_filt, int noise,
> +;                      int kx, int m_max)
> +cglobal sbr_hf_apply_noise_1, 5,5+NPICREGS,8, Y,s_m,q_filt,noise,kx,m_max
> +    and       kxq, 1
> +    shl       kxq, 4

> +    mova       m0, [kxq + ps_noise13]

[...]
> +    mova       m0, [kxq + ps_noise13 + 16]

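These two loads (from [kxq + ps_noise13] and [kxq + ps_noise13 + 16]) will break PIC too, presumably because a symbol combined with an index register cannot be addressed RIP-relative.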

[..]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Everything should be made as simple as possible, but not simpler.
-- Albert Einstein
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130405/2cd5f8aa/attachment.asc>


More information about the ffmpeg-devel mailing list