[FFmpeg-devel] [PATCH] Add add_pixels4/8() to h264dsp, and remove add_pixels4 from dsputil.

Mon Feb 11 01:12:55 CET 2013

Hi,

On Sat, Feb 9, 2013 at 5:49 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
> On Sat, Feb 09, 2013 at 03:43:56PM -0800, Ronald S. Bultje wrote:
>> From: "Ronald S. Bultje" <rsbultje at gmail.com>
>>
>> These functions are mostly H264-specific (the only other user I can
>> spot is bink), and this allows us to special-case some functionality
>> for H264. Also remove the 16-bit-coeff with >8bpp versions (unused)
>> and merge the duplicate 32-bit-coeff for >8bpp (identical).
>
> [...]
>
>> +
>> +#include "bit_depth_template.c"
>> +
>> +static void FUNCC(ff_h264_add_pixels4)(uint8_t *_dst, int16_t *_src, int stride)
>> +{
>> +    int i;
>> +    pixel *dst = (pixel *) _dst;
>> +    dctcoef *src = (dctcoef *) _src;
>
>> +    stride /= sizeof(pixel);
>
> a >> should be faster

It's used as an increment for type int16_t, so it's actually undone in
the assembly. Example disassembly on x86-32:

_ff_h264_add_pixels4_8_c:
0000bc20        pushl   %ebx
0000bc21        pushl   %esi
0000bc22        movl    0x0c(%esp),%eax ; dst
0000bc26        addl    $0x03,%eax
0000bc29        xorl    %ecx,%ecx
0000bc2b        movl    0x14(%esp),%edx ; linesize
0000bc2f        movl    0x10(%esp),%esi ; block
0000bc33        nopw    _ff_h264dsp_init(%eax,%eax)
0000bc39        nopl    _ff_h264dsp_init(%eax)
0000bc40        movb    (%esi,%ecx,8),%bl ; load
0000bc43        addb    %bl,0xfd(%eax) ; add
0000bc46        movb    0x02(%esi,%ecx,8),%bl ; load
0000bc4a        addb    %bl,0xfe(%eax) ; add
0000bc4d        movb    0x04(%esi,%ecx,8),%bl ; load
0000bc51        addb    %bl,0xff(%eax) ; add
0000bc54        movb    0x06(%esi,%ecx,8),%bl ; load
0000bc58        addb    %bl,(%eax) ; add
0000bc5a        addl    %edx,%eax ; += linesize
0000bc5c        incl    %ecx ; block increment
0000bc5d        cmpl    $0x04,%ecx ; next line
0000bc60        jne     0x0000bc40 ; jump
0000bc62        movl    $_ff_h264dsp_init,0x04(%esi) ; $_... is
actually zero, so this zeroes the block
0000bc69        movl    $_ff_h264dsp_init,(%esi)
0000bc6f        movl    $_ff_h264dsp_init,0x0c(%esi)
0000bc76        movl    $_ff_h264dsp_init,0x08(%esi)
0000bc7d        movl    $_ff_h264dsp_init,0x14(%esi)
0000bc84        movl    $_ff_h264dsp_init,0x10(%esi)
0000bc8b        movl    $_ff_h264dsp_init,0x1c(%esi)
0000bc92        movl    $_ff_h264dsp_init,0x18(%esi)
0000bc99        popl    %esi
0000bc9a        popl    %ebx
0000bc9b        ret
0000bc9c        nopl    _ff_h264dsp_init(%eax)

As you see, no division or anything weird, the compiler knows what to do.

>> @@ -121,6 +121,10 @@ typedef struct H264DSPContext {
>>      void (*h264_luma_dc_dequant_idct)(int16_t *output,
>>                                        int16_t *input /*align 16*/, int qmul);
>>      void (*h264_chroma_dc_dequant_idct)(int16_t *block, int qmul);
>> +
>> +    /* bypass-transform */
>> +    void (*h264_add_pixels8)(uint8_t *dst, int16_t *block, int stride);
>> +    void (*h264_add_pixels4)(uint8_t *dst, int16_t *block, int stride);
>
> maybe a function name like add_and_clear_pixels or something similar
> is better
>
> also iam not sure the fields of the H264DSPContext context should have
> h264_ prefixes but thats quite seperate of this patch, i just realized
> now

Agreed, will rename to add_pixels_and_clear (since we clear block, not
pixels), and will remove h264 prefixes in a future patch.

Ronald