[FFmpeg-devel] suggestions for debugging ff_fft_calc_altivec?

Pavel Koshevoy pkoshevoy at gmail.com
Sun Aug 11 07:10:30 CEST 2013


On 8/10/13 10:27 PM, Pavel Koshevoy wrote:
> Hi,
>
> I've decided to take a closer look at OSX 10.5 PPC build with
> gas-preprocessor.pl.  The last time I tried it more than half a year
> ago the result was that some audio decoders didn't sound right (AAC,
> AC3 -- both sound like loud white noise).
>
> I tried again today and compared decoder output from builds with and
> without gas-preprocessor.  I've narrowed it down to output of
> fft_calc_c (called in ff_imdct_half_c) being different from output of
> ff_fft_calc_altivec (called in imdct_half_altivec).
>
> I am not a PPC assembly expert, nor am I familiar with FFT
> implementation details to be able to easily spot what the problem is
> with ff_fft_calc_altivec.  Can anyone offer some suggestions to
> proceed further?
>

I've attached the gas-preprocessed fft_altivec_s.S temporary assembly file
for reference.

     Pavel.

-------------- next part --------------
# 1 "/nfs/scratch/Developer/ffmpeg-git-src/libavcodec/ppc/fft_altivec_s.S"
# 1 "/Developer/ppc-debug-gas/ffmpeg-git-build-nfs-gas-debug//"
# 1 "<built-in>"
# 1 "<command line>"
# 1 "/nfs/scratch/Developer/ffmpeg-git-src/libavcodec/ppc/fft_altivec_s.S"
# 41 "/nfs/scratch/Developer/ffmpeg-git-src/libavcodec/ppc/fft_altivec_s.S"
# 1 "./config.h" 1
# 42 "/nfs/scratch/Developer/ffmpeg-git-src/libavcodec/ppc/fft_altivec_s.S" 2
# 1 "/nfs/scratch/Developer/ffmpeg-git-src/libavcodec/ppc/asm.S" 1
# 64 "/nfs/scratch/Developer/ffmpeg-git-src/libavcodec/ppc/asm.S"
/*
 * extfunc name
 * Opens an exported function at the current location.
 * Declares the underscore-prefixed symbol global, tags it as a function
 * (STT_FUNC), and defines two labels for the same address: the prefixed
 * one and the plain one. Only the prefixed symbol is made global.
 * The function body is expected to follow the macro invocation.
 */
.macro extfunc name
    .global _\name
    .type _\name, STT_FUNC
_\name:
\name:
.endm

/*
 * movrel rd, sym, gp
 * Load the absolute address of sym into rd with a lis/la pair.
 *   rd  - destination general-purpose register
 *   sym - symbol whose address is wanted
 *   gp  - accepted so all call sites share one signature; this expansion
 *         does not reference it
 * The relocation operators "@ha" and "@l" had been rewritten to " at ha" and
 * " at l" by the mailing-list archive's address obfuscation, which made the
 * operands unparseable. They are restored here.
 */
.macro movrel rd, sym, gp
    lis \rd, \sym@ha        /* upper half, adjusted for the sign of the low half */
    la \rd, \sym@l(\rd)     /* add the sign-extended low 16 bits */
.endm

/*
 * get_got rd
 * Expands to nothing in this preprocessed build: the body is empty, so rd
 * is left untouched by the invocation in fft_calc.
 * NOTE(review): the blank lines suggest the body was removed by a
 * preprocessor conditional - confirm against the unpreprocessed asm.S.
 */
.macro get_got rd







.endm
# 43 "/nfs/scratch/Developer/ffmpeg-git-src/libavcodec/ppc/fft_altivec_s.S" 2

.text

/*
 * addi2 ra, imm
 * Add a constant of up to 32 bits to register ra using at most two
 * instructions. Each half is emitted only when it contributes:
 *   - addi  with imm@l  when the low 16 bits are non-zero
 *   - addis with imm@ha when the high-adjusted half is non-zero
 * "@ha" compensates for addi sign-extending its 16-bit operand, which is
 * why the second test adds 0x8000 before shifting.
 *   ra  - register updated in place
 *   imm - assemble-time constant expression (DECL_FFT passes e.g. n*4, n*-6)
 * The "@l"/"@ha" operators had been rewritten to " at l"/" at ha" by the
 * mailing-list archive's address obfuscation; they are restored here.
 */
.macro addi2 ra, imm
.if \imm & 0xffff
    addi \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
    addis \ra, \ra, \imm@ha
.endif
.endm

/*
 * FFT4 a0, a1, a2, a3
 * Two rounds of permute + add/subtract on the pair a0:a1.
 *   a0, a1 - input vectors (overwritten with intermediate values)
 *   a2, a3 - scratch during the macro; hold the final result on exit
 * Relies on the permute-control vectors v20-v24 having been loaded from
 * fft_data (done in fft_calc). In the word lists below, 0-3 are words of
 * the first vperm source and s0-s3 words of the second.
 * NOTE(review): presumably a 4-point complex FFT on interleaved re/im
 * floats - confirm against the C reference implementation.
 */
.macro FFT4 a0, a1, a2, a3
    /* a2 = {0,1,s2,s1}, a3 = {2,3,s0,s3} of a0:a1 */
    vperm \a2,\a0,\a1,v20
    vperm \a3,\a0,\a1,v21
    /* first round: a0 = sums, a1 = differences */
    vaddfp \a0,\a2,\a3
    vsubfp \a1,\a2,\a3
    /* a2 = {a0[0],a1[0],a0[1],a1[1]}, a3 = {2,s3,3,s2} of a0:a1 */
    vmrghw \a2,\a0,\a1
    vperm \a3,\a0,\a1,v22
    /* second round: a0 = sums, a1 = differences */
    vaddfp \a0,\a2,\a3
    vsubfp \a1,\a2,\a3
    /* final reorder into the output registers: {0,1,s0,s1} and {2,3,s2,s3} */
    vperm \a2,\a0,\a1,v23
    vperm \a3,\a0,\a1,v24
.endm

/*
 * FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
 * Performs the FFT4 instruction sequence on two independent register
 * sets, with the a-stream and b-stream instructions interleaved.
 *   a0, a1 / b0, b1 - inputs of the two streams (overwritten)
 *   a2, a3 / b2, b3 - scratch during the macro; final results on exit
 * Uses the same permute-control vectors as FFT4 (v20-v24).
 */
.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
    /* initial permutes for both streams */
    vperm \a2,\a0,\a1,v20
    vperm \a3,\a0,\a1,v21
    vperm \b2,\b0,\b1,v20
    vperm \b3,\b0,\b1,v21
    /* first add/subtract round */
    vaddfp \a0,\a2,\a3
    vsubfp \a1,\a2,\a3
    vaddfp \b0,\b2,\b3
    vsubfp \b1,\b2,\b3
    /* regroup the intermediate values */
    vmrghw \a2,\a0,\a1
    vperm \a3,\a0,\a1,v22
    vmrghw \b2,\b0,\b1
    vperm \b3,\b0,\b1,v22
    /* second add/subtract round */
    vaddfp \a0,\a2,\a3
    vsubfp \a1,\a2,\a3
    vaddfp \b0,\b2,\b3
    vsubfp \b1,\b2,\b3
    /* final reorder: results land in a2,a3 and b2,b3 */
    vperm \a2,\a0,\a1,v23
    vperm \a3,\a0,\a1,v24
    vperm \b2,\b0,\b1,v23
    vperm \b3,\b0,\b1,v24
.endm

/*
 * FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4
 * Processes four input vectors and leaves the results in a0, a1, b0, b1.
 *   a0, a1     - first half of the input; receive the final sums
 *   b0, b1     - second half of the input; receive the final differences
 *   a2, a3, b2, b3, b4 - scratch
 * The a-stream runs the same instruction sequence as FFT4. The b-stream
 * is additionally scaled with the constant rows v17 and v18 of fft_data.
 * Constants used: v14 (zeros), v17, v18, and permute controls v20-v29.
 * NOTE(review): presumably an 8-point complex FFT - confirm against the
 * C reference implementation.
 */
.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4
    /* b-stream: merge high and low word pairs of b0:b1 */
    vmrghw \b2,\b0,\b1
    vmrglw \b3,\b0,\b1
    /* a-stream: initial permutes, as in FFT4 */
    vperm \a2,\a0,\a1,v20
    vperm \a3,\a0,\a1,v21
    vaddfp \b0,\b2,\b3
    vsubfp \b1,\b2,\b3
    /* b4 = b1 with its two halves swapped (v25 selects {2,3,0,1}) */
    vperm \b4,\b1,\b1,v25
    vaddfp \a0,\a2,\a3
    vsubfp \a1,\a2,\a3
    /* b1 = b1*v17 + 0, then b1 = b4*v18 + b1 */
    vmaddfp \b1,\b1,v17,v14
    vmaddfp \b1,\b4,v18,\b1
    vmrghw \a2,\a0,\a1
    vperm \a3,\a0,\a1,v22
    /* b-stream regroup: {1,2,s3,s0} and {0,3,s2,s1} of b0:b1 */
    vperm \b2,\b0,\b1,v26
    vperm \b3,\b0,\b1,v27
    vaddfp \a0,\a2,\a3
    vsubfp \a1,\a2,\a3
    vaddfp \b0,\b2,\b3
    vsubfp \b1,\b2,\b3
    /* final reorders of both streams before they are combined */
    vperm \a2,\a0,\a1,v23
    vperm \a3,\a0,\a1,v24
    vperm \b2,\b0,\b1,v28
    vperm \b3,\b0,\b1,v29
    /* combine: b0,b1 = a - b ; a0,a1 = a + b */
    vsubfp \b0,\a2,\b2
    vsubfp \b1,\a3,\b3
    vaddfp \a0,\a2,\b2
    vaddfp \a1,\a3,\b3
.endm

/*
 * BF d0, d1, s0, s1
 * Butterfly: d1 = s0 - s1, d0 = s0 + s1 (element-wise, single precision).
 * The subtraction is issued first so that d0 may alias s0, as several
 * callers in this file do (e.g. "BF v10,v12,v10,v8"). d1 must not alias
 * either source, because it is written before the addition reads them.
 */
.macro BF d0,d1,s0,s1
    vsubfp \d1,\s0,\s1
    vaddfp \d0,\s0,\s1
.endm

/*
 * zip d0, d1, s0, s1
 * Interleave the 32-bit words of s0 and s1:
 *   d0 = {s0[0], s1[0], s0[1], s1[1]}   (merge high words)
 *   d1 = {s0[2], s1[2], s0[3], s1[3]}   (merge low words)
 * d0 must not alias s0 or s1, since both sources are read again for d1.
 */
.macro zip d0,d1,s0,s1
    vmrghw \d0,\s0,\s1
    vmrglw \d1,\s0,\s1
.endm

/*
 * def_fft4 interleave
 * Defines the leaf routine fft4<interleave>_altivec, which transforms the
 * 32 bytes at r3 in place with FFT4.
 *   interleave - label infix; DECL_FFTS passes either nothing or
 *                "_interleave". When non-blank, the result is passed
 *                through zip before being stored.
 * Expects r3 = data pointer and r9 = 16 (set in fft_calc).
 * Clobbers v0-v3.
 */
.macro def_fft4 interleave
fft4\interleave\()_altivec:
    /* load the two input vectors at r3 and r3+16 */
    lvx v0, 0,r3
    lvx v1,r9,r3
    FFT4 v0,v1,v2,v3
.ifnb \interleave
    /* interleaved variant: zip the FFT4 outputs v2,v3 before storing */
    zip v0,v1,v2,v3
    stvx v0, 0,r3
    stvx v1,r9,r3
.else
    /* plain variant: FFT4 leaves its result in v2,v3 */
    stvx v2, 0,r3
    stvx v3,r9,r3
.endif
    blr
.endm

/*
 * def_fft8 interleave
 * Defines the leaf routine fft8<interleave>_altivec, which transforms the
 * 64 bytes at r3 in place with FFT8.
 *   interleave - label infix; when non-blank the results are zipped in
 *                pairs before being stored.
 * Expects r3 = data pointer and r9 = 16 (set in fft_calc).
 * Clobbers r4 and v0-v8.
 */
.macro def_fft8 interleave
fft8\interleave\()_altivec:
    /* r4 addresses the second 32-byte half */
    addi r4,r3,32
    lvx v0, 0,r3
    lvx v1,r9,r3
    lvx v2, 0,r4
    lvx v3,r9,r4
    /* results come back in v0-v3; v4-v8 are scratch */
    FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
    zip v4,v5,v0,v1
    zip v6,v7,v2,v3
    stvx v4, 0,r3
    stvx v5,r9,r3
    stvx v6, 0,r4
    stvx v7,r9,r4
.else
    stvx v0, 0,r3
    stvx v1,r9,r3
    stvx v2, 0,r4
    stvx v3,r9,r4
.endif
    blr
.endm

/*
 * def_fft16 interleave
 * Defines the leaf routine fft16<interleave>_altivec, which transforms the
 * 128 bytes at r3 in place.
 * Steps: FFT4x2 on the upper 64 bytes, FFT8 on the lower 64 bytes, a
 * multiply of the FFT4x2 results by the constant rows v15/v16, then
 * butterflies combining both halves.
 *   interleave - label infix; when non-blank the results are zipped in
 *                pairs before being stored.
 * Expects r3 = data pointer, r9 = 16, and v14-v16 / v20-v29 loaded from
 * fft_data (all set up in fft_calc).
 * Clobbers r4-r6 and v0-v13. The interleaved variant also overwrites the
 * constants v14 and v15. Nothing in this file calls an interleaved
 * routine from another transform (DECL_FFT only calls unsuffixed ones),
 * so it is reached only via the dispatch table and returns to fft_calc.
 */
.macro def_fft16 interleave
fft16\interleave\()_altivec:
    /* r4, r5, r6 address the 2nd, 3rd and 4th 32-byte quarters */
    addi r5,r3,64
    addi r6,r3,96
    addi r4,r3,32
    /* upper half: two FFT4s, results in v4-v7 */
    lvx v0, 0,r5
    lvx v1,r9,r5
    lvx v2, 0,r6
    lvx v3,r9,r6
    FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
    /* lower half: FFT8, results in v0-v3 (v8-v12 are scratch) */
    lvx v0, 0,r3
    lvx v1,r9,r3
    lvx v2, 0,r4
    lvx v3,r9,r4
    FFT8 v0,v1,v2,v3,v8,v9,v10,v11,v12
    /* v8..v11 = v4..v7 * v15 (v14 is the zero addend) */
    vmaddfp v8,v4,v15,v14
    vmaddfp v9,v5,v15,v14
    vmaddfp v10,v6,v15,v14
    vmaddfp v11,v7,v15,v14
    /* add or subtract the cross terms scaled by v16 */
    vmaddfp v8,v5,v16,v8
    vnmsubfp v9,v4,v16,v9
    vnmsubfp v10,v7,v16,v10
    vmaddfp v11,v6,v16,v11
    /* butterflies among the scaled upper-half values */
    BF v10,v12,v10,v8
    BF v11,v13,v9,v11
    /* butterflies combining lower-half (v0-v3) and upper-half values */
    BF v0,v4,v0,v10
    BF v3,v7,v3,v12
    BF v1,v5,v1,v11
    BF v2,v6,v2,v13
.ifnb \interleave
    /* interleaved variant: zip pairs, then store all eight vectors in order */
    zip v8, v9,v0,v1
    zip v10,v11,v2,v3
    zip v12,v13,v4,v5
    zip v14,v15,v6,v7
    stvx v8, 0,r3
    stvx v9,r9,r3
    stvx v10, 0,r4
    stvx v11,r9,r4
    stvx v12, 0,r5
    stvx v13,r9,r5
    stvx v14, 0,r6
    stvx v15,r9,r6
.else
    /* plain variant: v0-v3 go to the lower 64 bytes, v4-v7 to the upper */
    stvx v0, 0,r3
    stvx v4, 0,r5
    stvx v3,r9,r4
    stvx v7,r9,r6
    stvx v1,r9,r3
    stvx v5,r9,r5
    stvx v2, 0,r4
    stvx v6, 0,r6
.endif
    blr
.endm


/*
 * PASS interleave, suffix
 * Defines fft_pass<suffix>_altivec, the combining pass that every
 * DECL_FFT routine tail-branches to after its three sub-transforms.
 *   interleave - 0: store results as computed; non-zero: merge register
 *                pairs with vmrghw/vmrglw before storing
 *   suffix     - text inserted into the label name
 * Register contract at entry (as set up by DECL_FFT and fft_calc):
 *   r3  = data pointer
 *   r4  = table pointer loaded from the ff_cos_tabs array
 *   r5  = loop count (n/16)
 *   r9  = 16
 *   v14 = zeros, v19 = permute control {s0,3,2,1}
 * Each iteration handles 32 bytes from each of four equally sized
 * sections of the data and 16 bytes of the table.
 * Clobbers r0, r4-r8, r10, r11, ctr and v0-v13. r3 is restored to its
 * entry value before returning.
 */
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
    /* ctr = iteration count; derive byte offsets from the count in r5 */
    mtctr r5
    slwi r0,r5,4
    /* r7 = count*64 (offset of the 3rd section), r5 = count*32 (2nd section) */
    slwi r7,r5,6
    slwi r5,r5,5
    /* r10 = offset of the 4th section */
    add r10,r5,r7
    /* r0 = r4 + count*16: second read position in the table, walked backwards */
    add r0,r4,r0
    /* r6, r8, r11 = the three section offsets plus 16 */
    addi r6,r5,16
    addi r8,r7,16
    addi r11,r10,16
1:
    /* v8 = table values read forwards from r4 */
    lvx v8, 0,r4
    /* v9 = {v10[0], v9[3], v9[2], v9[1]}: table values read backwards from r0 */
    lvx v10, 0,r0
    sub r0,r0,r9
    lvx v9, 0,r0
    vperm v9,v9,v10,v19
    /* load the 3rd and 4th sections */
    lvx v4,r3,r7
    lvx v5,r3,r8
    lvx v6,r3,r10
    lvx v7,r3,r11
    /* v10..v13 = v4..v7 * v8 (v14 is the zero addend) */
    vmaddfp v10,v4,v8,v14
    vmaddfp v11,v5,v8,v14
    vmaddfp v12,v6,v8,v14
    vmaddfp v13,v7,v8,v14
    lvx v0, 0,r3
    lvx v3,r3,r6
    /* add or subtract the cross terms scaled by v9 */
    vmaddfp v10,v5,v9,v10
    vnmsubfp v11,v4,v9,v11
    vnmsubfp v12,v7,v9,v12
    vmaddfp v13,v6,v9,v13
    lvx v1,r3,r9
    lvx v2,r3,r5
    /* butterflies; the structure mirrors the tail of def_fft16 */
    BF v12,v8,v12,v10
    BF v13,v9,v11,v13
    BF v0,v4,v0,v12
    BF v3,v7,v3,v8
.if !\interleave
    /* plain variant: store the first four results straight away */
    stvx v0, 0,r3
    stvx v4,r3,r7
    stvx v3,r3,r6
    stvx v7,r3,r11
.endif
    BF v1,v5,v1,v13
    BF v2,v6,v2,v9
.if !\interleave
    stvx v1,r3,r9
    stvx v2,r3,r5
    stvx v5,r3,r8
    stvx v6,r3,r10
.else
    /* interleaved variant: merge each register pair, reusing v8/v9 as scratch */
    vmrghw v8,v0,v1
    vmrglw v9,v0,v1
    stvx v8, 0,r3
    stvx v9,r3,r9
    vmrghw v8,v2,v3
    vmrglw v9,v2,v3
    stvx v8,r3,r5
    stvx v9,r3,r6
    vmrghw v8,v4,v5
    vmrglw v9,v4,v5
    stvx v8,r3,r7
    stvx v9,r3,r8
    vmrghw v8,v6,v7
    vmrglw v9,v6,v7
    stvx v8,r3,r10
    stvx v9,r3,r11
.endif
    /* advance 32 data bytes and 16 table bytes per iteration */
    addi r3,r3,32
    addi r4,r4,16
    bdnz 1b
    /* the loop added count*32 to r3 and r5 holds count*32: undo it */
    sub r3,r3,r5
    blr
.endm
# 315 "/nfs/scratch/Developer/ffmpeg-git-src/libavcodec/ppc/fft_altivec_s.S"
/*
 * fft_data: sixteen 16-byte constant rows, loaded in order into v14-v29
 * by the two lvm invocations in fft_calc.
 * The first five rows are float constants; the remaining eleven are
 * vperm control vectors. In the word lists below, 0-3 name the 32-bit
 * words of the first vperm source (bytes 0x00-0x0f) and s0-s3 those of
 * the second source (bytes 0x10-0x1f).
 */
    .rodata
    .align 4
fft_data:
    /* v14: zeros, used as the addend that turns vmaddfp into a multiply */
    .float 0, 0, 0, 0
    /* v15: cos(k*pi/8) for k = 0..3 */
    .float 1, 0.92387953, 0.70710678118654752440, 0.38268343
    /* v16: sin(k*pi/8) for k = 0..3 */
    .float 0, 0.38268343, 0.70710678118654752440, 0.92387953
    /* v17: sqrt(1/2) with signs {-,+,+,-} */
    .float -0.70710678118654752440, 0.70710678118654752440, 0.70710678118654752440,-0.70710678118654752440
    /* v18: sqrt(1/2) in every lane */
    .float 0.70710678118654752440, 0.70710678118654752440, 0.70710678118654752440, 0.70710678118654752440
    /* v19: {s0,3,2,1} */
    .byte 0x10,0x11,0x12,0x13, 0x0c,0x0d,0x0e,0x0f, 0x08,0x09,0x0a,0x0b, 0x04,0x05,0x06,0x07
    /* v20: {0,1,s2,s1} */
    .byte 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07, 0x18,0x19,0x1a,0x1b, 0x14,0x15,0x16,0x17
    /* v21: {2,3,s0,s3} */
    .byte 0x08,0x09,0x0a,0x0b, 0x0c,0x0d,0x0e,0x0f, 0x10,0x11,0x12,0x13, 0x1c,0x1d,0x1e,0x1f
    /* v22: {2,s3,3,s2} */
    .byte 0x08,0x09,0x0a,0x0b, 0x1c,0x1d,0x1e,0x1f, 0x0c,0x0d,0x0e,0x0f, 0x18,0x19,0x1a,0x1b
    /* v23: {0,1,s0,s1} */
    .byte 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07, 0x10,0x11,0x12,0x13, 0x14,0x15,0x16,0x17
    /* v24: {2,3,s2,s3} */
    .byte 0x08,0x09,0x0a,0x0b, 0x0c,0x0d,0x0e,0x0f, 0x18,0x19,0x1a,0x1b, 0x1c,0x1d,0x1e,0x1f
    /* v25: {2,3,0,1} */
    .byte 0x08,0x09,0x0a,0x0b, 0x0c,0x0d,0x0e,0x0f, 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07
    /* v26: {1,2,s3,s0} */
    .byte 0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b, 0x1c,0x1d,0x1e,0x1f, 0x10,0x11,0x12,0x13
    /* v27: {0,3,s2,s1} */
    .byte 0x00,0x01,0x02,0x03, 0x0c,0x0d,0x0e,0x0f, 0x18,0x19,0x1a,0x1b, 0x14,0x15,0x16,0x17
    /* v28: {0,2,s1,s3} */
    .byte 0x00,0x01,0x02,0x03, 0x08,0x09,0x0a,0x0b, 0x14,0x15,0x16,0x17, 0x1c,0x1d,0x1e,0x1f
    /* v29: {1,3,s0,s2} */
    .byte 0x04,0x05,0x06,0x07, 0x0c,0x0d,0x0e,0x0f, 0x10,0x11,0x12,0x13, 0x18,0x19,0x1a,0x1b

/*
 * lvm b, r, regs...
 * Load a list of vector registers from consecutive 16-byte slots.
 *   b    - base address register; advanced by 16 for every register
 *          loaded, so it is modified by the macro
 *   r    - first vector register to load
 *   regs - optional further registers, handled by recursive expansion
 */
.macro lvm b, r, regs:vararg
    lvx \r, 0, \b
    addi \b, \b, 16
  .ifnb \regs
    lvm \b, \regs
  .endif
.endm

/*
 * stvm b, r, regs...
 * Store a list of vector registers to consecutive 16-byte slots.
 *   b    - base address register; advanced by 16 for every register
 *          stored, so it is modified by the macro
 *   r    - first vector register to store
 *   regs - optional further registers, handled by recursive expansion
 */
.macro stvm b, r, regs:vararg
    stvx \r, 0, \b
    addi \b, \b, 16
  .ifnb \regs
    stvm \b, \regs
  .endif
.endm

/*
 * fft_calc interleave
 * Defines the exported entry point ff_fft_calc<interleave>_altivec.
 *   interleave - label infix (blank or "_interleave"); also selects
 *                which dispatch table is used
 * At entry r3 points to a structure whose first 32-bit word selects the
 * transform size, and r4 is the data pointer passed on in r3.
 * NOTE(review): presumably r3 is the FFT context and the word is its
 * "nbits" field - confirm against the C prototype.
 * The routine builds a 224-byte stack frame, saves v20-v29 and VRSAVE,
 * and loads the constants (v14-v29, r9 = 16, r12 = &ff_cos_tabs) that
 * all transform routines rely on. It then calls the selected routine
 * through the dispatch table and undoes the frame.
 */
.macro fft_calc interleave
extfunc ff_fft_calc\interleave\()_altivec
    /* save LR in the caller's frame, then allocate 160+64 bytes */
    mflr r0
    stw r0, 2*4(r1)
    stwu r1, -(160+16*4)(r1)
    get_got r11
    /* save v20-v29 at r1+64 (10 vectors = 160 bytes); stvm advances r6 */
    addi r6, r1, 16*4
    stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    /* keep the caller's VRSAVE at r1+60 and install a new mask */
    mfvrsave r0
    stw r0, 15*4(r1)
    /* NOTE(review): the mask has its 30 top bits set, presumably marking v0-v29 live - confirm */
    li r6, 0xfffffffc
    mtvrsave r6

    /* load the sixteen rows of fft_data into v14-v29 */
    movrel r6, fft_data, r11
    lvm r6, v14, v15, v16, v17, v18, v19, v20, v21
    lvm r6, v22, v23, v24, v25, v26, v27, v28, v29

    /* r9 = 16 is the "+16" index register used by every lvx/stvx pair */
    li r9, 16
    movrel r12, _ff_cos_tabs, r11

    /* dispatch index = (size selector - 2) * 4; the table starts at fft4 */
    movrel r6, fft_dispatch_tab\interleave\()_altivec, r11
    lwz r3, 0(r3)
    subi r3, r3, 2
    slwi r3, r3, 2+0
    lwzx r3, r3, r6
    mtctr r3
    /* data pointer becomes the first argument of the transform routine */
    mr r3, r4
    bctrl

    /* restore v20-v29 and VRSAVE, pop the frame via the back chain, return */
    addi r6, r1, 16*4
    lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    lwz r6, 15*4(r1)
    mtvrsave r6
    lwz r1, 0(r1)
    lwz r0, 2*4(r1)
    mtlr r0
    blr
.endm

/*
 * DECL_FFT suffix, bits, n, n2, n4
 * Defines fft<n><suffix>_altivec by composing smaller transforms:
 * one of size n2 at r3, then two of size n4 at r3 + n*4 and r3 + n*6
 * bytes, followed by the combining pass fft_pass<suffix>_altivec.
 *   suffix - label infix; applied to this routine and the final pass only.
 *            The sub-transforms called are always the unsuffixed ones.
 *   bits   - log2 of n; selects the LR save slot and the table entry
 *   n      - transform size
 *   n2, n4 - n/2 and n/4 as literal text, used to form the callee names
 * Expects r3 = data pointer, r12 = address of the ff_cos_tabs array, and
 * r1 = the frame created in fft_calc.
 * NOTE(review): the byte offsets imply 8-byte elements - confirm.
 */
.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
    /*
     * Each size saves LR in its own word of the frame (offsets 8..52 for
     * bits 5..16), so nested calls of different sizes do not overwrite
     * one another.
     */
    mflr r0
    stw r0,4*(\bits-3)(r1)
    bl fft\n2\()_altivec
    addi2 r3,\n*4
    bl fft\n4\()_altivec
    addi2 r3,\n*2
    bl fft\n4\()_altivec
    /* step back by n*6 bytes to the start of this transform's data */
    addi2 r3,\n*-6
    lwz r0,4*(\bits-3)(r1)
    /* r4 = table pointer for this size, r5 = loop count for the pass */
    lwz r4,\bits*4(r12)
    mtlr r0
    li r5,\n/16
    /* tail branch: the pass returns directly to this routine's caller */
    b fft_pass\suffix\()_altivec
.endm

/*
 * DECL_FFTS interleave, suffix
 * Emits one complete family of transform routines plus its entry point
 * and dispatch table.
 *   interleave - 0 or 1, forwarded to PASS to select the store variant
 *   suffix     - label infix for every routine in the family; blank for
 *                the plain family
 * Emitted into .text: fft4/fft8/fft16 leaves, the pass routine, fft32
 * through fft65536, and ff_fft_calc<suffix>_altivec.
 * Emitted into .rodata: fft_dispatch_tab<suffix>_altivec, fifteen 32-bit
 * code addresses ordered from fft4 to fft65536. fft_calc indexes it with
 * (size selector - 2).
 * The leaf macros and fft_calc receive the suffix, not the 0/1 flag,
 * because they test their argument for blankness.
 */
.macro DECL_FFTS interleave, suffix
    .text
    def_fft4 \suffix
    def_fft8 \suffix
    def_fft16 \suffix
    PASS \interleave, \suffix
    DECL_FFT \suffix, 5, 32, 16, 8
    DECL_FFT \suffix, 6, 64, 32, 16
    DECL_FFT \suffix, 7, 128, 64, 32
    DECL_FFT \suffix, 8, 256, 128, 64
    DECL_FFT \suffix, 9, 512, 256, 128
    DECL_FFT \suffix,10, 1024, 512, 256
    DECL_FFT \suffix,11, 2048, 1024, 512
    DECL_FFT \suffix,12, 4096, 2048, 1024
    DECL_FFT \suffix,13, 8192, 4096, 2048
    DECL_FFT \suffix,14,16384, 8192, 4096
    DECL_FFT \suffix,15,32768,16384, 8192
    DECL_FFT \suffix,16,65536,32768,16384

    fft_calc \suffix

    .rodata
    .align 3
fft_dispatch_tab\suffix\()_altivec:
    .int fft4\suffix\()_altivec
    .int fft8\suffix\()_altivec
    .int fft16\suffix\()_altivec
    .int fft32\suffix\()_altivec
    .int fft64\suffix\()_altivec
    .int fft128\suffix\()_altivec
    .int fft256\suffix\()_altivec
    .int fft512\suffix\()_altivec
    .int fft1024\suffix\()_altivec
    .int fft2048\suffix\()_altivec
    .int fft4096\suffix\()_altivec
    .int fft8192\suffix\()_altivec
    .int fft16384\suffix\()_altivec
    .int fft32768\suffix\()_altivec
    .int fft65536\suffix\()_altivec
.endm

/*
 * Instantiate both families. The plain one must exist because the
 * "_interleave" family's DECL_FFT routines call the unsuffixed
 * sub-transforms.
 */
DECL_FFTS 0
DECL_FFTS 1, _interleave


More information about the ffmpeg-devel mailing list