[Ffmpeg-devel] gcc4 support & MMX fixups (from Debian)

Paweł Sikora pluto
Tue Jan 31 23:37:04 CET 2006


Dnia Tuesday, 31 of January 2006 21:25, matthieu castet napisał:
> Hi Paweł,
>
> Paweł Sikora wrote:
> > Hi all,
> >
> > I have an implementation of transpose4x4 in C which uses gcc's vector
> > extensions. It puts less pressure on the register allocator and allows
> > optimal code scheduling.
> >
> > Instantiation of attached patch e.g. in foo(dst, src, 4, 4)
> > gives a nice piece of code:
> >
> > [ x86-64 example ]
> >
> > foo:    movd        4(%rsi), %mm0
> >         movd        (%rsi), %mm1
> >         movd        8(%rsi), %mm2
> >         movd        12(%rsi), %mm3
> >         punpcklbw   %mm0, %mm1
> >         punpcklbw   %mm3, %mm2
> >         movq        %mm1, %mm0
> >         punpckhwd   %mm2, %mm1
> >         punpcklwd   %mm2, %mm0
> >         movd        %mm1, 8(%rdi)
> >         punpckhdq   %mm1, %mm1
> >         movd        %mm0, (%rdi)
> >         punpckhdq   %mm0, %mm0
> >         movd        %mm1, 12(%rdi)
> >         movd        %mm0, 4(%rdi)
> >         ret
> >
> > Actually, gcc-4.1 has a good optimizer and happy asm. Hardcoding
> > doesn't bring an incredible performance boost, only a degradation
> > of code scheduling.
>
> Could you post a benchmark between the 2 versions?

I did a simple benchmark with transpose4x4 marked with attribute noinline.

results:

orig:  iters = 1000000000, dt = 7.92 [avg]
fixed: iters = 1000000000, dt = 7.35 [avg]

we gain: ~7.2%

hardware:
cpu: athlon64-3000+
ram: 2x512MB geil.
mb:  gigabyte K8U-939 ULi socket 939

Moreover, for the x86-64 version we can speed up transpose4x4 further.
Simply changing the `stride` parameters from `int` to `long` changes the code from:

orig_transpose4x4:
	leal	(%rdx,%rdx), %r9d
	leal	(%rcx,%rcx), %eax
	movslq	%edx,%r11
	movslq	%ecx,%r8
	movslq	%r9d,%r10
	addl	%edx, %r9d
	movslq	%eax,%rdx
	addl	%ecx, %eax
	movslq	%r9d,%r9
	cltq
#APP
	movd  (%rsi), %mm0                
	movd  (%rsi,%r8), %mm1                
	movd  (%rsi,%rdx), %mm2                
	movd  (%rsi,%rax), %mm3                
	punpcklbw %mm1, %mm0         
	punpcklbw %mm3, %mm2         
	movq %mm0, %mm1              
	punpcklwd %mm2, %mm0         
	punpckhwd %mm2, %mm1         
	movd  %mm0, (%rdi)                
	punpckhdq %mm0, %mm0         
	movd  %mm0, (%rdi,%r11)                
	movd  %mm1, (%rdi,%r10)                
	punpckhdq %mm1, %mm1         
	movd  %mm1, (%rdi,%r9)                
#NO_APP
	ret

fixed_transpose4x4:
	movslq	%ecx,%rax
	movd	(%rsi), %mm1
	movd	(%rsi,%rax), %mm3
	leal	(%rcx,%rcx), %eax
	movslq	%eax,%r8
	addl	%ecx, %eax
	punpcklbw	%mm3, %mm1
	cltq
	movd	(%rsi,%r8), %mm2
	movd	(%rsi,%rax), %mm0
	movslq	%edx,%rax
	punpcklbw	%mm0, %mm2
	movq	%mm1, %mm0
	punpcklwd	%mm2, %mm0
	punpckhwd	%mm2, %mm1
	movd	%mm0, (%rdi)
	punpckhdq	%mm0, %mm0
	movd	%mm0, (%rdi,%rax)
	leal	(%rdx,%rdx), %eax
	movslq	%eax,%rcx
	addl	%edx, %eax
	movd	%mm1, (%rdi,%rcx)
	punpckhdq	%mm1, %mm1
	cltq
	movd	%mm1, (%rdi,%rax)
	ret

to:

orig_transpose4x4:
	leaq	(%rdx,%rdx,2), %r8
	leaq	(%rcx,%rcx,2), %rax
#APP
	movd  (%rsi), %mm0                
	movd  (%rsi,%rcx), %mm1                
	movd  (%rsi,%rcx,2), %mm2                
	movd  (%rax,%rsi), %mm3                
	punpcklbw %mm1, %mm0         
	punpcklbw %mm3, %mm2         
	movq %mm0, %mm1              
	punpcklwd %mm2, %mm0         
	punpckhwd %mm2, %mm1         
	movd  %mm0, (%rdi)                
	punpckhdq %mm0, %mm0         
	movd  %mm0, (%rdi,%rdx)                
	movd  %mm1, (%rdi,%rdx,2)                
	punpckhdq %mm1, %mm1         
	movd  %mm1, (%r8,%rdi)                
#NO_APP
	ret

fixed_transpose4x4:
	movd	(%rsi,%rcx), %mm3
	movd	(%rsi,%rcx,2), %mm2
	leaq	(%rcx,%rcx,2), %rcx
	movd	(%rsi), %mm1
	movd	(%rcx,%rsi), %mm0
	punpcklbw	%mm3, %mm1
	punpcklbw	%mm0, %mm2
	movq	%mm1, %mm0
	punpckhwd	%mm2, %mm1
	punpcklwd	%mm2, %mm0
	movd	%mm0, (%rdi)
	punpckhdq	%mm0, %mm0
	movd	%mm0, (%rdi,%rdx)
	movd	%mm1, (%rdi,%rdx,2)
	punpckhdq	%mm1, %mm1
	leaq	(%rdx,%rdx,2), %rdx
	movd	%mm1, (%rdx,%rdi)
	ret

As you can see, the function is smaller and faster.
This change reduces the total time from 7.35s to 6.24s,
and the difference between the two implementations becomes unmeasurable.

-- 
to_be || !to_be == 1, to_be | ~to_be == -1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: test.c
Type: text/x-csrc
Size: 2766 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20060131/580c3c80/attachment.c>



More information about the ffmpeg-devel mailing list