[FFmpeg-devel] [PATCH 4/7] x86: sbrdsp: implement SSE hf_apply_noise

Michael Niedermayer michaelni at gmx.at
Sat Apr 6 17:19:01 CEST 2013


On Sat, Apr 06, 2013 at 04:56:24PM +0200, Michael Niedermayer wrote:
> On Sat, Apr 06, 2013 at 10:52:11AM +0000, Christophe Gisquet wrote:
> > 233 to 115(sse)/110(sse2) cycles on Arrandale and Win64.
> > Replacing the multiplication by s_m[m] by an andps and an xorps with
> > appropriate vectors is slower. Unrolling is a 15 cycles win.
> > ---
> >  libavcodec/x86/sbrdsp.asm    | 145 +++++++++++++++++++++++++++++++++++++++++++
> >  libavcodec/x86/sbrdsp_init.c |  32 ++++++++++
> >  2 files changed, 177 insertions(+)
> > 
> > diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
> > index 65c972e..a7998fa 100644
> > --- a/libavcodec/x86/sbrdsp.asm
> > +++ b/libavcodec/x86/sbrdsp.asm
> > @@ -26,6 +26,12 @@ SECTION_RODATA
> >  ps_mask         times 2 dd 1<<31, 0
> >  ps_mask2        times 2 dd 0, 1<<31
> >  ps_neg          times 4 dd 1<<31
> > +ps_noise0       times 2 dd  1.0,  0.0,
> > +ps_noise2       times 2 dd -1.0,  0.0
> > +ps_noise13      dd  0.0,  1.0, 0.0, -1.0
> > +                dd  0.0, -1.0, 0.0,  1.0
> > +                dd  0.0,  1.0, 0.0, -1.0
> > +cextern         sbr_noise_table
> >  
> >  SECTION_TEXT
> >  
> > @@ -358,3 +364,142 @@ SBR_QMF_DEINT_BFLY
> >      
> >  INIT_XMM sse2
> >  SBR_QMF_DEINT_BFLY
> > +
> > +%if WIN64
> > +%define NREGS 0
> > +%else
> > +%ifndef PIC
> > +%define NREGS 1
> > +%else
> > +%define NREGS 0
> > +%endif
> > +%endif
> > +
> > +%macro SBUTTERFLY_F128 3
> > +    vperm2i128  m%3, m%1, m%2, q0301 ; punpckh
> > +    vinserti128 m%1, m%1, xm%2, 1    ; punpckl
> > +    SWAP %2, %3
> > +%endmacro
> > +
> > +%macro SBR_HF_APPLY_NOISE 0
> > +; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
> > +;                      const float *q_filt, int noise,
> > +;                      int kx, int m_max)
> > +cglobal sbr_hf_apply_noise_0, 5,5+NREGS,8, Y,s_m,q_filt,noise,kx,m_max
> > +    mova       m0, [ps_noise0]
> > +    jmp apply_noise_main %+ SUFFIX
> > +
> > +; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
> > +;                      const float *q_filt, int noise,
> > +;                      int kx, int m_max)
> > +cglobal sbr_hf_apply_noise_1, 5,5+NREGS,8, Y,s_m,q_filt,noise,kx,m_max
> > +    and       kxq, 1
> > +    shl       kxq, 4
> 
> > +%if NREGS
> > +    lea       r5q, [ps_noise13]
> > +    mova       m0, [kxq + r5q]
> > +%else
> > +    mova       m0, [kxq + ps_noise13]
> > +%endif
> 
> This could probably be simplified with some kind of generic macro,
> for example
> FOO mova, r5q, m0, ps_noise13, kxq
> or something similar ...

Also, changes to libavutil/x86/x86*.asm should be kept in sync with
x264 if possible.

[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

The real ebay dictionary, page 1
"Used only once"    - "Some unspecified defect prevented a second use"
"In good condition" - "Can be repaired by an experienced expert"
"As is" - "You wouldn't want it even if you were paid for it, if you knew ..."
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130406/73bc923f/attachment.asc>


More information about the ffmpeg-devel mailing list