[FFmpeg-cvslog] r20739 - in trunk/libavcodec: apedec.c dsputil.c dsputil.h ppc/int_altivec.c x86/dsputil_mmx.c x86/dsputil_yasm.asm

Ramiro Polla ramiro.polla
Mon Dec 7 23:53:46 CET 2009


On Sun, Dec 6, 2009 at 5:02 PM, Reimar Döffinger
<Reimar.Doeffinger at gmx.de> wrote:
> On Sat, Dec 05, 2009 at 02:38:05PM -0200, Ramiro Polla wrote:
>> On Sat, Dec 5, 2009 at 1:09 PM, lorenm <subversion at mplayerhq.hu> wrote:
>> > Author: lorenm
>> > Date: Sat Dec  5 16:09:10 2009
>> > New Revision: 20739
>> >
>> > Log:
>> > refactor and optimize scalarproduct
>> > 29-105% faster apply_filter, 6-90% faster ape decoding on core2
>> > (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.)
>> > 9-123% faster ape decoding on G4.
>> >
>> > Modified:
>> >   trunk/libavcodec/apedec.c
>> >   trunk/libavcodec/dsputil.c
>> >   trunk/libavcodec/dsputil.h
>> >   trunk/libavcodec/ppc/int_altivec.c
>> >   trunk/libavcodec/x86/dsputil_mmx.c
>> >   trunk/libavcodec/x86/dsputil_yasm.asm
>>
>> It seems this broke x86_32 and x86_64 mac os x.
>> http://fate.multimedia.cx/index.php?stderr=143835 ends in:
>
> In addition, gcc 4.2.4 MinGW and Cygwin builds seem to decode incorrectly:
> http://fate.multimedia.cx/index.php?build_record=144294
>
> Though consistently between them:
> http://fate.multimedia.cx/index.php?test_result=33060104
> http://fate.multimedia.cx/index.php?test_result=33055764

Also reproducible on Ubuntu Linux 9.04 (gcc: Ubuntu 4.3.3-5ubuntu4,
yasm: 0.7.1.2093) compiling for 32-bit (--cc='ccache gcc -m32'
--arch=i686 --cpu=i686), forcing ff_scalarproduct_and_madd_int16_sse2
to be used on a q6600.

It crashes at either 2fa or 2ff in the listing below:
000002d0 <_ff_scalarproduct_and_madd_int16_sse2>:
 2d0:	53                   	push   %ebx
 2d1:	8b 44 24 08          	mov    0x8(%esp),%eax
 2d5:	8b 4c 24 0c          	mov    0xc(%esp),%ecx
 2d9:	8b 54 24 10          	mov    0x10(%esp),%edx
 2dd:	d1 e3                	shl    %ebx
 2df:	66 0f 6e 7c 24 18    	movd   0x18(%esp),%xmm7
 2e5:	f2 0f 70 ff 00       	pshuflw $0x0,%xmm7,%xmm7
 2ea:	66 0f 6c ff          	punpcklqdq %xmm7,%xmm7
 2ee:	66 0f ef f6          	pxor   %xmm6,%xmm6
 2f2:	01 d8                	add    %ebx,%eax
 2f4:	01 d9                	add    %ebx,%ecx
 2f6:	01 da                	add    %ebx,%edx
 2f8:	f7 db                	neg    %ebx
 2fa:	f3 0f 6f 04 19       	movdqu (%ecx,%ebx,1),%xmm0
 2ff:	f3 0f 6f 4c 19 10    	movdqu 0x10(%ecx,%ebx,1),%xmm1
 305:	66 0f 6f 24 18       	movdqa (%eax,%ebx,1),%xmm4
 30a:	66 0f 6f 6c 18 10    	movdqa 0x10(%eax,%ebx,1),%xmm5
 310:	f3 0f 6f 14 1a       	movdqu (%edx,%ebx,1),%xmm2
 315:	f3 0f 6f 5c 1a 10    	movdqu 0x10(%edx,%ebx,1),%xmm3
 31b:	66 0f f5 c4          	pmaddwd %xmm4,%xmm0
 31f:	66 0f f5 cd          	pmaddwd %xmm5,%xmm1
 323:	66 0f d5 d7          	pmullw %xmm7,%xmm2
 327:	66 0f d5 df          	pmullw %xmm7,%xmm3
 32b:	66 0f fe f0          	paddd  %xmm0,%xmm6
 32f:	66 0f fe f1          	paddd  %xmm1,%xmm6
 333:	66 0f fd d4          	paddw  %xmm4,%xmm2
 337:	66 0f fd dd          	paddw  %xmm5,%xmm3
 33b:	66 0f 7f 14 18       	movdqa %xmm2,(%eax,%ebx,1)
 340:	66 0f 7f 5c 18 10    	movdqa %xmm3,0x10(%eax,%ebx,1)
 346:	83 c3 20             	add    $0x20,%ebx
 349:	7c af                	jl     2fa <_ff_scalarproduct_and_madd_int16_sse2+0x2a>
 34b:	0f 12 c6             	movhlps %xmm6,%xmm0
 34e:	66 0f fe f0          	paddd  %xmm0,%xmm6
 352:	f2 0f 70 c6 4e       	pshuflw $0x4e,%xmm6,%xmm0
 357:	66 0f fe f0          	paddd  %xmm0,%xmm6
 35b:	66 0f 7e f0          	movd   %xmm6,%eax
 35f:	5b                   	pop    %ebx
 360:	c3                   	ret
 361:	eb 0d                	jmp    370 <_ff_scalarproduct_and_madd_int16_ssse3>
 363:	90                   	nop
 364:	90                   	nop
 365:	90                   	nop
 366:	90                   	nop
 367:	90                   	nop
 368:	90                   	nop
 369:	90                   	nop
 36a:	90                   	nop
 36b:	90                   	nop
 36c:	90                   	nop
 36d:	90                   	nop
 36e:	90                   	nop
 36f:	90                   	nop



More information about the ffmpeg-cvslog mailing list