[FFmpeg-devel] [PATCH] Fix non-rounding up to next 16-bit aligned bug in IFF decoder

Ronald S. Bultje rsbultje
Thu Apr 29 18:51:44 CEST 2010


Hi,

On Thu, Apr 29, 2010 at 10:37 AM, Sebastian Vater
<cdgs.basty at googlemail.com> wrote:
> ? ?START_TIMER;
> ? ?const uint8_t *end = dst + (buf_size * 8);
> ? ?const uint32_t *lut = plane8_lut[plane];
> ? ?for(; dst < end; dst += 8) {
> ? ? ? ?const uint8_t lut_offsets = *buf++;
> ? ? ? ?uint32_t v ?= AV_RN32A(dst) | lut[lut_offsets >> 4];
> ? ? ? ?uint32_t v2 ?= AV_RN32A(dst + 4) | lut[lut_offsets & 0x0F];
> ? ? ? ?AV_WN32A(dst, v);
> ? ? ? ?AV_WN32A(dst + 4, v2);
> ? ?}
> ? ?STOP_TIMER("decodeplane8");
[..]
> ? ? ?52: ? ? ? 0f b6 06 ? ? ? ? ? ? ? ?movzbl (%esi),%eax
> ? ? ?55: ? ? ? 83 c6 01 ? ? ? ? ? ? ? ?add ? ?$0x1,%esi

const uint8_t lut_offsets = *buf++;

> ? ? ?58: ? ? ? 8b 53 04 ? ? ? ? ? ? ? ?mov ? ?0x4(%ebx),%edx

read dst+4

> ? ? ?5b: ? ? ? 89 c1 ? ? ? ? ? ? ? ? ? mov ? ?%eax,%ecx

move lut_offset from eac->ecx

> ? ? ?5d: ? ? ? 83 e1 0f ? ? ? ? ? ? ? ?and ? ?$0xf,%ecx

ecx has lower 4 bits

> ? ? ?60: ? ? ? 0b 14 8f ? ? ? ? ? ? ? ?or ? ? (%edi,%ecx,4),%edx

edx |= table

> ? ? ?63: ? ? ? c0 e8 04 ? ? ? ? ? ? ? ?shr ? ?$0x4,%al

eax has higher 4 bits

> ? ? ?66: ? ? ? 0f b6 c0 ? ? ? ? ? ? ? ?movzbl %al,%eax

Err...?

> ? ? ?69: ? ? ? 8b 04 87 ? ? ? ? ? ? ? ?mov ? ?(%edi,%eax,4),%eax

table value

> ? ? ?6c: ? ? ? 09 03 ? ? ? ? ? ? ? ? ? or ? ? %eax,(%ebx)

read/write dest (note how this is a single instruction).

> ? ? ?6e: ? ? ? 89 53 04 ? ? ? ? ? ? ? ?mov ? ?%edx,0x4(%ebx)

write dst+4 (note how this isn't a single instruction)

> ? ? ?71: ? ? ? 83 c3 08 ? ? ? ? ? ? ? ?add ? ?$0x8,%ebx
> ? ? ?74: ? ? ? 39 dd ? ? ? ? ? ? ? ? ? cmp ? ?%ebx,%ebp
> ? ? ?76: ? ? ? 77 da ? ? ? ? ? ? ? ? ? ja ? ? 52 <decodeplane8+0x52>

loop end

You probably see from the disassembly you could remove another few
instructions by doing the calculations serially instead of parallel.
For example, it reads/writes dst in a single instruction, but uses
several instructions for dst+4. Serializing this might improve the
code another few instructions.

for() {
uint32_t v;
v = AV_RN32A(..) | lut[x>>4];
AV_WN32A(dst, v);
v = AV_RN32A(..) | lut[x&0xf];
AV_WN32A(dst, v+4);
}

Ideally this becomes (12 instructions instead of original 14):

movzbl (%esi),%eax
add    $0x1,%esi
mov    %eax,%ecx
shr    $0x4,%eax
and    $0xf,%ecx
mov    (%edi,%eax,4),%eax
mov    (%edi,%ecx,4),%ecx
or     %eax,(%ebx)
or     %ecx,(%ebx, 4) <- not sure if that's possible
add    $0x8,%ebx
cmp    %ebx,%ebp
ja     52 <decodeplane8+0x52>

Ronald



More information about the ffmpeg-devel mailing list