[FFmpeg-devel] [PATCH] VBLE Decoder

Michael Niedermayer michaelni at gmx.at
Wed Nov 9 15:14:13 CET 2011


Hi

On Wed, Nov 09, 2011 at 12:39:32AM -0500, Derek Buitenhuis wrote:
[...]

> +typedef struct {
> +    AVCodecContext *avctx;
> +    
> +    int size;

please remove trailing whitespace, they arent allowed in ffmpeg nor
the fork.


> +    uint8_t *len;
> +    uint32_t *val;
> +} VBLEContext;
> +

> +static uint8_t vble_bitscan(GetBitContext *gb)
> +{
> +    uint32_t src = get_bits_long(gb, 32);
> +
> +    skip_bits_long(gb, -32);
> +
> +    for(int i = 0; i < 32; i++) {
> +        if(src & (1 << i)) {
> +            skip_bits_long(gb, i + 1);
> +            return i;
> +        }
> +    }

the following should be quite a bit faster and prettier:

static av_always_inline int vble_bitscan(GetBitContext *gb)
{
    static const uint8_t len[]={ ... };
    int i= len[ show_bits(gb, 8) ];
    skip_bits(gb, i+1);
    return i;
}


> +
> +    return -1;
> +}
> +
> +static void vble_unpack(VBLEContext *ctx, GetBitContext *gb)
> +{
> +    uint8_t *lengths = ctx->len;
> +    int sum;
> +
> +    /* Read all the lengths in first */
> +    for (int i = 0; i < ctx->size * 4; i++)
> +        lengths[i] = vble_bitscan(gb);
> +

the sum of lengths should be checked against get_bits_left()


> +    /* Pixels are encoded in groups of 4 */
> +    for (int i = 0; i < ctx->size; i++) {
> +        lengths = ctx->len + i * 4;
> +
> +        sum = lengths[0] + lengths[1] + lengths[2] + lengths[3];
> +
> +        if(sum != 0)
> +            ctx->val[i] = get_bits_long(gb, sum);

reading 4 at a time with get_bits_long() is likely slower than
reading 2 at a time of just 1, later also would simplify the code


> +        else
> +            ctx->val[i] = 0;
> +    }
> +}
> +
> +static void vble_decode(VBLEContext *ctx)
> +{
> +    AVFrame *pic = ctx->avctx->coded_frame;
> +    uint8_t *out;
> +    uint8_t *lengths;
> +    uint8_t pix, len_sum;
> +    int pos;
> +    int h = ctx->avctx->height;
> +
> +    for (int i = 0; i < ctx->size; i++) {
> +        len_sum = 0;
> +        lengths = ctx->len + i * 4;
> +
> +        for (int p = 0; p < 4; p++) {
> +            pos = i * 4 + p; 
> +

> +            if (pos < pic->linesize[0] * h) {
> +                /* Y */
> +                out = pic->data[0];
> +            }
> +            else if (pos < pic->linesize[0] * h + pic->linesize[1] * h / 2) {
> +                /* U */
> +                out = pic->data[1];
> +                pos -= pic->linesize[0] * h;
> +            }
> +            else {
> +                /* V */
> +                out = pic->data[2];
> +                pos -= pic->linesize[0] * h + pic->linesize[1] * h / 2;
> +            }

these should be outside the innermost loop, which should be easy
to do when 1 value is read at a time but can be done with multiple
values too, though then messier



> +
> +            pix = (uint8_t)(ctx->val[i] >> len_sum);
> +            pix &= (1 << (lengths[p])) - 1;

if(lengths[p]) pix=get_bits(gb, lengths[p])
(for the 1 sample at a time case)


> +            pix += (1 << (lengths[p])) - 1;
> +

> +            out[pos] = pix & 1 ? 255 - (pix >> 1) : pix >> 1;

in case gcc compiles this to a conditional branch then the
following should be faster:
(pix >> 1) ^ -(pix&1);

Note: you can check speed with START/STOP_TIMER


> +
> +            len_sum += lengths[p];
> +        }




> +    }
> +}
> +
> +static void vble_median_restore(uint8_t *plane, int width, int height)
> +{
> +    uint8_t *dst = plane;
> +    uint8_t a, b, c;
> +
> +    for(int i = 1; i < width; i++)
> +        dst[i] += dst[i - 1];
> +
> +    dst += width;
> +
> +    for(int i = 1; i < height; i++) {
    
> +        for(int j = 1; j < width; j++) {
> +            a = dst[j - 1];
> +            b = dst[j - width];
> +            c = a + b - dst[j - 1 - width];
> +            dst[j] += mid_pred(a, b, c);
> +        }

this could use  add_hfyu_median_prediction()
or is there some reason why its use is not possible ?


> +        dst += width;
> +    }
> +}
> +
> +static int vble_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
> +                             AVPacket *avpkt)
> +{
> +    VBLEContext *ctx = avctx->priv_data;
> +    AVFrame *pic = avctx->coded_frame;
> +    GetBitContext gb;
> +    const uint8_t *src = avpkt->data;
> +    int version;
> +    int w = avctx->width, h = avctx->height;
> +
> +    /* Clear buffer if need be */
> +    if(pic->data[0])
> +        avctx->release_buffer(avctx, pic);
> +
> +    /* Allocate buffer */
> +    if(avctx->get_buffer(avctx, pic) < 0) {
> +        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
> +        return -1;
> +    }
> +

> +    /* Set flags */
> +    pic->reference = 0;

this has to be set before get_buffer() as it may affect the kind
of buffer you get


> +    pic->key_frame = 1;
> +    pic->pict_type = FF_I_TYPE;
> +

> +    /* Codec is YV12-only */
> +    pic->linesize[0] = w;
> +    pic->linesize[1] = pic->linesize[2] = w / 2;

you cannot change the linesize of buffers allocated by get_buffer()
it also would break alignment for outside code like some video filter


> +
> +    /* Version should always be 1 */
> +    version = AV_RL32(src);
> +    av_log(avctx, AV_LOG_DEBUG, "Version: %d\n", version);
> +
> +    if (version != 1) {
> +        av_log(avctx, AV_LOG_ERROR, "Unsupported VBLE Version: %d\n", version);
> +        return -1;
> +    }
> +    
> +    /* Move past version */
> +    src += 4;
> +
> +    init_get_bits(&gb, src, (avpkt->size - 4) * 8);

you should check that the input contains at least 4 + (w*h*3)/16 bytes


> +
> +    /* Unpack and decode */
> +    vble_unpack(ctx, &gb);
> +    vble_decode(ctx);
> +
> +    /* Restore planes. Should be almost identical to Huffyuv's. */
> +    vble_median_restore(pic->data[0], pic->linesize[0], h);
> +    vble_median_restore(pic->data[1], pic->linesize[1], h / 2);
> +    vble_median_restore(pic->data[2], pic->linesize[2], h / 2);
> +

it should be faster to interleave the last get_bits() vble_decode() and
vble_median_restore() linewise due to better L1 data cache use
alternatively the 3 median_restore could be interleaved linewise with
vble_decode() and draw_horiz_band() be implemented
(CODEC_CAP_DRAW_HORIZ_BAND)

also you could add CODEC_FLAG_GRAY support to allow faster but
grayscale only decoding.

These are all of course just ideas, its perfectly fine if you dont
implement them or do them as seperate patches. 



> +    *data_size = sizeof(AVFrame);
> +    *(AVFrame *)data = *pic;
> +
> +    return avpkt->size;
> +}
> +
> +static av_cold int vble_decode_init(AVCodecContext *avctx)
> +{
> +    VBLEContext *ctx = avctx->priv_data;
> +
> +    ctx->avctx = avctx;
> +    
> +    /* Number of samples divided by 4 */
> +    ctx->size = (avctx->width * avctx->height * 3) / 8;
> +
> +    ctx->len = (uint8_t *)av_malloc(ctx->size * 4 * sizeof(uint8_t));

> +    ctx->val = (uint32_t *)av_malloc(ctx->size * sizeof(uint32_t));

the reading and use can be combined making this array unneeded
the casts are also unneeded

If you want you can also add yourself to the MAINTAINERS file,
that is if you want to maintain this code in/for ffmpeg. And it would
be very welcome if you maintain it!


[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

I have often repented speaking, but never of holding my tongue.
-- Xenocrates
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20111109/3e11406a/attachment.asc>


More information about the ffmpeg-devel mailing list