[FFmpeg-devel] [PATCH] VP8 decoder

Vitor Sessak vitor1001
Tue Jun 22 18:50:01 CEST 2010


On 06/22/2010 04:43 PM, Ronald S. Bultje wrote:
> Hi,
>
> On Tue, Jun 15, 2010 at 10:22 PM, Michael Niedermayer<michaelni at gmx.at>  wrote:
>> On Tue, Jun 15, 2010 at 09:12:30AM -0400, David Conrad wrote:
>>> +static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
>>> +                           uint8_t *intra4x4)
>>> +{
>>> +    VP56RangeCoder *c =&s->c;
>>> +    int n;
>>> +
>>> +    if (s->segmentation.update_map)
>>> +        mb->segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
>>> +
>>> +    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
>>> +
>>> +    if (s->keyframe) {
>>> +        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
>>> +
>>
>>> +        if (mb->mode == MODE_I4x4)
>>> +            decode_intra4x4_modes(c, intra4x4, s->b4_stride, 1);
>>> +        else
>>
>> {}
>
> Done.
>
>> also it might be worth to try patcheck
>
> Tried, didn't give me anything (I hope that's good).
>
>>> +static void vp8_luma_dc_wht_c(DCTELEM block[4][4][16], DCTELEM dc[16])
>>> +{
>>> +    int i, t0, t1, t2, t3;
>>> +
>>> +    for (i = 0; i<  4; i++) {
>>> +        t0 = dc[0*4+i] + dc[3*4+i];
>>> +        t1 = dc[1*4+i] + dc[2*4+i];
>>> +        t2 = dc[1*4+i] - dc[2*4+i];
>>> +        t3 = dc[0*4+i] - dc[3*4+i];
>>> +
>>> +        dc[0*4+i] = t0 + t1;
>>> +        dc[1*4+i] = t3 + t2;
>>> +        dc[2*4+i] = t0 - t1;
>>> +        dc[3*4+i] = t3 - t2;
>>> +    }
>>> +
>>> +    for (i = 0; i<  4; i++) {
>>> +        t0 = dc[i*4+0] + dc[i*4+3];
>>> +        t1 = dc[i*4+1] + dc[i*4+2];
>>> +        t2 = dc[i*4+1] - dc[i*4+2];
>>> +        t3 = dc[i*4+0] - dc[i*4+3];
>>> +
>>> +        *block[i][0] = (t0 + t1 + 3)>>  3;
>>> +        *block[i][1] = (t3 + t2 + 3)>>  3;
>>> +        *block[i][2] = (t0 - t1 + 3)>>  3;
>>> +        *block[i][3] = (t3 - t2 + 3)>>  3;
>>
>> the +3 can be moved up and factored
>
> Done.
>
>>> +static av_always_inline void filter_common(uint8_t *p, int stride, int is4tap)
>>> +{
>>> +    LOAD_PIXELS
>>> +    int a, f1, f2;
>>> +
>>> +    a = 3*(q0 - p0);
>>> +
>>> +    if (is4tap)
>>> +        a += clip_int8(p1 - q1);
>>> +
>>> +    a = clip_int8(a);
>>> +
>>
>>> +    // We deviate from the spec here with c(a+3)>>  3
>>> +    // since that's what libvpx does.
>>> +    f1 = clip_int8(a+4)>>  3;
>>> +    f2 = clip_int8(a+3)>>  3;
>>
>> these only need cliping in one direction
>> and or the clip_int8() could be avoided
>
> Changed to FFMIN(x, 127).
>
>>> +    // Despite what the spec says, we do need to clamp here to
>>> +    // be bitexact with libvpx.
>>> +    p[-1*stride] = av_clip_uint8(p0 + f2);
>>> +    p[ 0*stride] = av_clip_uint8(q0 - f1);
>>
>> for the !is4tap i think theres a bit more cliping done than needed
>
> I looked through it and don't think this one can strictly be prevented.
>
>>> +static av_always_inline int simple_limit(uint8_t *p, int stride, int flim)
>>> +{
>>> +    LOAD_PIXELS
>>> +    return 2*FFABS(p0-q0) + FFABS(p1-q1)/2<= flim;
>>
>>>> 1 instead of /2 may be faster
>
> Changed.
>
>>> +static av_always_inline void filter_mbedge(uint8_t *p, int stride)
>>> +{
>>> +    int a0, a1, a2, w;
>>> +
>>> +    LOAD_PIXELS
>>> +
>>> +    w = clip_int8(p1-q1);
>>> +    w = clip_int8(w + 3*(q0-p0));
>>> +
>>
>>> +    a0 = clip_int8((27*w + 63)>>  7);
>>> +    a1 = clip_int8((18*w + 63)>>  7);
>>> +    a2 = clip_int8(( 9*w + 63)>>  7);
>>
>> the cliping is unneeded
>
> Indeed, removed.
>
>>> +static const uint8_t subpel_filters[7][6] = {
>>> +    { 0,   6, 123,  12,   1,   0 },
>>> +    { 2,  11, 108,  36,   8,   1 },
>>> +    { 0,   9,  93,  50,   6,   0 },
>>> +    { 3,  16,  77,  77,  16,   3 },
>>> +    { 0,   6,  50,  93,   9,   0 },
>>> +    { 1,   8,  36, 108,  11,   2 },
>>> +    { 0,   1,  12, 123,   6,   0 },
>>> +};
>>> +
>>> +
>>> +#define FILTER_6TAP(src, F, stride) \
>>> +    av_clip_uint8((F[2]*src[x+0*stride] - F[1]*src[x-1*stride] + F[0]*src[x-2*stride] + \
>>> +                   F[3]*src[x+1*stride] - F[4]*src[x+2*stride] + F[5]*src[x+3*stride] + 64)>>  7)
>>> +
>>> +#define VP8_EPEL(SIZE) \
>>> +static void put_vp8_epel ## SIZE ## _h_c(uint8_t *dst, uint8_t *src, int stride, int h, int mx, int my) \
>>> +{ \
>>> +    const uint8_t *filter = subpel_filters[mx-1]; \
>>> +    int x, y; \
>>> +\
>>> +    for (y = 0; y<  h; y++) { \
>>> +        for (x = 0; x<  SIZE; x++) \
>>> +            dst[x] = FILTER_6TAP(src, filter, 1); \
>>> +        dst += stride; \
>>> +        src += stride; \
>>> +    } \
>>> +} \
>>> +\
>>
>> maybe it makes sense to write seperate functions for thr 4tap cases instead of 0*
>
> Added as suggested.
>
>> also i dont mind if you commit it to svn to continue development there.
>> It surely is usefull alraedy to people if its faster than libvpx
>
> I have some x86 mmx optimizations already so I think this is a good
> idea. Please let me know if the current patch is OK to apply.

I risk to start a bikeshed, but should we mark it as experimental until 
it pass the conformance test suite?

-Vitor



More information about the ffmpeg-devel mailing list