[FFmpeg-devel] [PATCH] split-radix FFT

Michael Niedermayer michaelni
Tue Jul 29 13:43:43 CEST 2008


On Tue, Jul 29, 2008 at 12:22:59AM -0600, Loren Merritt wrote:
> On Tue, 29 Jul 2008, Michael Niedermayer wrote:
> > On Fri, Jul 25, 2008 at 08:14:00PM -0600, Loren Merritt wrote:
> >
> >> +#ifdef EMULATE_3DNOWEXT
> >> +#define PSWAPD(s,d)\
> >> +    "movq "#s","#d"\n"\
> >> +    "psrlq $32,"#d"\n"\
> >> +    "punpckldq "#s","#d"\n"
> >
> >> +#define PSWAPD_UNARY(s)\
> >> +    "sub $8, %%"REG_SP"\n"\
> >> +    "movd "#s", 4(%%"REG_SP")\n"\
> >> +    "punpckhdq (%%"REG_SP"), "#s"\n"\
> >> +    "add $8, %%"REG_SP"\n"
> >
> > Gcc failed with a "+m" ?
> 
> No, I just designed the 3dn1 emulation of 3dn2 for simplicity (including 
> code locality) rather than speed. I wouldn't have written it at all 
> except that then I wouldn't be able to delete the radix-2 init code. 
> (I still can't delete it until someone ports split-radix to altivec, 
> but I assume that'll happen.)
> 
> >> +static void fft4(FFTComplex *z)
> >>  {
> >> -    int ln = s->nbits;
> >> -    long j;
> >> -    x86_reg i;
> >> -    long nblocks, nloops;
> >> -    FFTComplex *p, *cptr;
> >> +    T2(z[0], z[1], %%mm0, %%mm1);
> >> +    LOAD(z[2], %%mm2);
> >> +    LOAD(z[3], %%mm3);
> >> +    T4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5);
> >> +    PUNPCK(%%mm0, %%mm1, %%mm4);
> >> +    PUNPCK(%%mm2, %%mm3, %%mm5);
> >> +    SAVE(z[0], %%mm0);
> >> +    SAVE(z[1], %%mm4);
> >> +    SAVE(z[2], %%mm2);
> >> +    SAVE(z[3], %%mm5);
> >> +}
> >
> > Is there any reason why separate asm() are chained? I think a single
> > asm block, or even nasm/yasm if you prefer, would be better.
> 
> Because it works for me, and I don't see any alternatives that are as 
> concise.
> yasm, ok.

I prefer code that is easy to maintain over concise code, and code that gcc
can silently pessimize is not easy to maintain IMHO. It easily can cost
someone quite some time to debug why some codec is slower on some gcc
version or compiled with different flags ...

It would of course be different if such "silent pessimization" were just
hypothetical, but it isn't; gcc is really following Murphy's law here: if it
can mess up, it does.
That's why I would strongly prefer if gcc couldn't put anything at all between
the asm parts ...


> 
> > The way it's written is almost asking for gcc to put something in between;
> > I am especially concerned about the -fPIC case and gcc putting all the GOT
> > "magic" in between the asms ...
> 
> Is gcc so stupid as to emit GOT stuff when dereferencing a pointer that's 
> already in a register, no global variables involved?

Yes, examples below with gcc 4.3.1:

fft8:
.LFB92:
        .loc 1 97 0
.LVL1:
        call    __i686.get_pc_thunk.cx  # 30    set_got [length = 12]
        addl    $_GLOBAL_OFFSET_TABLE_, %ecx
        .loc 1 97 0
        movl    4(%esp), %eax   # 2     *movsi_1/1      [length = 4]
        .loc 1 98 0
#APP
# 98 "libavcodec/i386/fft_sse.c" 1
        movaps (%eax), %xmm0
# 0 "" 2
        .loc 1 99 0
# 99 "libavcodec/i386/fft_sse.c" 1
        movaps 16(%eax), %xmm2
# 0 "" 2
        .loc 1 100 0
# 100 "libavcodec/i386/fft_sse.c" 1
        movaps        %xmm0, %xmm1 
shufps $0x64, %xmm2, %xmm0 
shufps $0xce, %xmm2, %xmm1 
movaps        %xmm0, %xmm2 
addps         %xmm1, %xmm0 

....
fft16:
.LFB93:
        .loc 1 111 0
.LVL2:
        call    __i686.get_pc_thunk.cx  # 38    set_got [length = 12]
        addl    $_GLOBAL_OFFSET_TABLE_, %ecx
        .loc 1 111 0
        movl    4(%esp), %eax   # 2     *movsi_1/1      [length = 4]
        .loc 1 112 0
#APP
# 112 "libavcodec/i386/fft_sse.c" 1
        movaps (%eax), %xmm0
# 0 "" 2
        .loc 1 113 0
# 113 "libavcodec/i386/fft_sse.c" 1
        movaps 16(%eax), %xmm2
# 0 "" 2
        .loc 1 114 0
# 114 "libavcodec/i386/fft_sse.c" 1
        movaps        %xmm0, %xmm1 
shufps $0x64, %xmm2, %xmm0 
shufps $0xce, %xmm2, %xmm1 

.....

addps         %xmm7, %xmm6 
subps         %xmm7, %xmm0 
movaps        %xmm6, %xmm7 
shufps $0x88, %xmm0, %xmm6 
shufps $0xdd, %xmm0, %xmm7 

# 0 "" 2
        .loc 1 129 0
#NO_APP
        movl    ff_cos_16@GOT(%ecx), %edx       # 27    *movsi_1/1      [length = 6]
#APP
# 129 "libavcodec/i386/fft_sse.c" 1
        movaps          (%edx), %xmm0 
movaps      %xmm4, %xmm2 
movaps       16+(%edx), %xmm1 
movaps      %xmm5, %xmm3 
mulps       %xmm0, %xmm2 
mulps       %xmm1, %xmm3 
mulps       %xmm1, %xmm4 

....
subps         %xmm1, %xmm2 
movaps        %xmm0, %xmm1 
shufps $0x88, %xmm2, %xmm0 
shufps $0xdd, %xmm2, %xmm1 

# 0 "" 2
        .loc 1 115 0
# 115 "libavcodec/i386/fft_sse.c" 1
        movaps 32(%eax), %xmm2
# 0 "" 2
        .loc 1 116 0
# 116 "libavcodec/i386/fft_sse.c" 1
        movaps 48(%eax), %xmm3
# 0 "" 2
        .loc 1 117 0
#NO_APP
        leal    root2@GOTOFF, %esi      # 14    *lea_1  [length = 6]
        leal    root2mppm@GOTOFF, %ecx  # 15    *lea_1  [length = 6]
#APP
# 117 "libavcodec/i386/fft_sse.c" 1
        movaps        %xmm2, %xmm4 
shufps $0x44, %xmm3, %xmm2 
shufps $0xee, %xmm3, %xmm4 
movaps        %xmm2, %xmm5 


[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

I know you won't believe me, but the highest form of Human Excellence is
to question oneself and others. -- Socrates
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080729/0bdd08fc/attachment.pgp>



More information about the ffmpeg-devel mailing list