[FFmpeg-devel] [PATCH] VP8 decoder

Michael Niedermayer michaelni
Wed Jun 16 04:22:15 CEST 2010


On Tue, Jun 15, 2010 at 09:12:30AM -0400, David Conrad wrote:
> Hi,
> 
> Since I might not have much time to work on this soon, I figure a round of review might be good.
> 
> It's bitexact for all streams I have that do not use the bilinear filter (so not test vectors 3,4,7)
> 
> Speed is currently ~10% faster than libvpx for intra-only, and 40-50% faster for normal video, probably mostly due to faster C loop filter functions.
> 
> I tried implementing the border saving in h264 so that the loop filter could be run on the same row that was just decoded, but that isn't working yet.
> 
> Edge emulation isn't fully supported as of yet since intra pred relies on pixels outside the frame being 127/129.
> 

>  arm/h264pred_init_arm.c |    7 +-
>  h264pred.c              |  130 ++++++++++++++++++++++++++++++++++++++++++++++--
>  h264pred.h              |    2 
>  3 files changed, 131 insertions(+), 8 deletions(-)
> 6af3e7c9c48fce13890c77b5dd4ffd12e788efd8  0001-VP8-intra-pred-functions.patch
> From b34add5a2859966864346b189bb493f5b773a662 Mon Sep 17 00:00:00 2001
> From: David Conrad <lessen42 at gmail.com>
> Date: Tue, 15 Jun 2010 08:57:35 -0400
> Subject: [PATCH 1/2] VP8 intra pred functions
> 
> ---
>  libavcodec/arm/h264pred_init_arm.c |    7 +-
>  libavcodec/h264pred.c              |  130 ++++++++++++++++++++++++++++++++++--
>  libavcodec/h264pred.h              |    2 +
>  3 files changed, 131 insertions(+), 8 deletions(-)
> 
[...]

> +static void pred4x4_vertical_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
> +    const int lt= src[-1-1*stride];
> +    LOAD_TOP_EDGE
> +    LOAD_TOP_RIGHT_EDGE
> +
> +    // 32-bit stores maybe

yes
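e.g. something along these lines (untested sketch only; it keeps the usual
(a + 2*b + c + 2) >> 2 smoothing of the top edge, the byte packing assumes
little endian so a real version would want an endian-aware pack macro;
AV_WN32A is from libavutil/intreadwrite.h):

static void pred4x4_vertical_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE
    // pack the four smoothed top pixels once, then store the same
    // 32-bit word into all four rows
    const uint32_t v = ((uint32_t)((lt + 2*t0 + t1 + 2) >> 2)      ) |
                       ((uint32_t)((t0 + 2*t1 + t2 + 2) >> 2) <<  8) |
                       ((uint32_t)((t1 + 2*t2 + t3 + 2) >> 2) << 16) |
                       ((uint32_t)((t2 + 2*t3 + t4 + 2) >> 2) << 24);

    AV_WN32A(src+0*stride, v);
    AV_WN32A(src+1*stride, v);
    AV_WN32A(src+2*stride, v);
    AV_WN32A(src+3*stride, v);
}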


[...]
> @@ -539,6 +610,20 @@ static void pred16x16_plane_rv40_c(uint8_t *src, int stride){
>      pred16x16_plane_compat_c(src, stride, 0, 1);
>  }
>  
> +static void pred16x16_tm_vp8_c(uint8_t *src, int stride){
> +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
> +    int tl = src[-1-stride];
> +    uint8_t *top = src-stride;
> +    int x, y;
> +
> +    for (y = 0; y < 16; y++) {
> +        int l = src[-1];
> +        for (x = 0; x < 16; x++)
> +            src[x] = cm[l + top[x] - tl];

&cm[l - tl] can be factored out of the inner loop
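i.e. something like (untested sketch of the loop only):

    for (y = 0; y < 16; y++) {
        // l - tl is constant within a row, so index the crop table once
        const uint8_t *cm_row = cm + src[-1] - tl;
        for (x = 0; x < 16; x++)
            src[x] = cm_row[top[x]];
        src += stride;
    }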

[...]
> +static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
> +                           uint8_t *intra4x4)
> +{
> +    VP56RangeCoder *c = &s->c;
> +    int n;
> +
> +    if (s->segmentation.update_map)
> +        mb->segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
> +
> +    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
> +
> +    if (s->keyframe) {
> +        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
> +

> +        if (mb->mode == MODE_I4x4)
> +            decode_intra4x4_modes(c, intra4x4, s->b4_stride, 1);
> +        else

{} (the else branch should be braced too)
also it might be worth trying patcheck


[...]
> +static void vp8_luma_dc_wht_c(DCTELEM block[4][4][16], DCTELEM dc[16])
> +{
> +    int i, t0, t1, t2, t3;
> +
> +    for (i = 0; i < 4; i++) {
> +        t0 = dc[0*4+i] + dc[3*4+i];
> +        t1 = dc[1*4+i] + dc[2*4+i];
> +        t2 = dc[1*4+i] - dc[2*4+i];
> +        t3 = dc[0*4+i] - dc[3*4+i];
> +
> +        dc[0*4+i] = t0 + t1;
> +        dc[1*4+i] = t3 + t2;
> +        dc[2*4+i] = t0 - t1;
> +        dc[3*4+i] = t3 - t2;
> +    }
> +
> +    for (i = 0; i < 4; i++) {
> +        t0 = dc[i*4+0] + dc[i*4+3];
> +        t1 = dc[i*4+1] + dc[i*4+2];
> +        t2 = dc[i*4+1] - dc[i*4+2];
> +        t3 = dc[i*4+0] - dc[i*4+3];
> +
> +        *block[i][0] = (t0 + t1 + 3) >> 3;
> +        *block[i][1] = (t3 + t2 + 3) >> 3;
> +        *block[i][2] = (t0 - t1 + 3) >> 3;
> +        *block[i][3] = (t3 - t2 + 3) >> 3;

the +3 can be moved up into t0/t3 and factored

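e.g. (untested):

    for (i = 0; i < 4; i++) {
        // fold the +3 rounding into t0/t3 so it is added twice per row
        // instead of four times
        t0 = dc[i*4+0] + dc[i*4+3] + 3;
        t1 = dc[i*4+1] + dc[i*4+2];
        t2 = dc[i*4+1] - dc[i*4+2];
        t3 = dc[i*4+0] - dc[i*4+3] + 3;

        *block[i][0] = (t0 + t1) >> 3;
        *block[i][1] = (t3 + t2) >> 3;
        *block[i][2] = (t0 - t1) >> 3;
        *block[i][3] = (t3 - t2) >> 3;
    }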
[...]
> +
> +#define clip_int8(x) av_clip(x, -128, 127)
> +

> +static av_always_inline void filter_common(uint8_t *p, int stride, int is4tap)
> +{
> +    LOAD_PIXELS
> +    int a, f1, f2;
> +
> +    a = 3*(q0 - p0);
> +
> +    if (is4tap)
> +        a += clip_int8(p1 - q1);
> +
> +    a = clip_int8(a);
> +

> +    // We deviate from the spec here with c(a+3) >> 3
> +    // since that's what libvpx does.
> +    f1 = clip_int8(a+4) >> 3;
> +    f2 = clip_int8(a+3) >> 3;

these only need clipping in one direction,
and/or the clip_int8() could be avoided
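e.g. (untested; a is already in [-128,127] after the clip above, so a+4
and a+3 can only overflow upwards):

    f1 = FFMIN(a+4, 127) >> 3;
    f2 = FFMIN(a+3, 127) >> 3;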

> +
> +    // Despite what the spec says, we do need to clamp here to
> +    // be bitexact with libvpx.
> +    p[-1*stride] = av_clip_uint8(p0 + f2);
> +    p[ 0*stride] = av_clip_uint8(q0 - f1);

for the !is4tap case I think there's a bit more clipping done than needed


> +
> +    // only used for _inner on blocks without high edge variance
> +    if (!is4tap) {
> +        a = (f1+1)>>1;
> +        p[-2*stride] = av_clip_uint8(p1 + a);
> +        p[ 1*stride] = av_clip_uint8(q1 - a);
> +    }
> +}
> +

> +static av_always_inline int simple_limit(uint8_t *p, int stride, int flim)
> +{
> +    LOAD_PIXELS
> +    return 2*FFABS(p0-q0) + FFABS(p1-q1)/2 <= flim;

>>1 instead of /2 may be faster

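i.e. (the FFABS() results are never negative, so the shift gives the same result):

    return 2*FFABS(p0-q0) + (FFABS(p1-q1) >> 1) <= flim;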

> +}
> +
> +/**
> + * E - limit at the macroblock edge
> + * I - limit for interior difference
> + */
> +static av_always_inline int normal_limit(uint8_t *p, int stride, int E, int I)
> +{
> +    LOAD_PIXELS
> +    return simple_limit(p, stride, 2*E+I)
> +        && FFABS(p3-p2) <= I && FFABS(p2-p1) <= I && FFABS(p1-p0) <= I
> +        && FFABS(q3-q2) <= I && FFABS(q2-q1) <= I && FFABS(q1-q0) <= I;
> +}
> +
> +// high edge variance
> +static av_always_inline int hev(uint8_t *p, int stride, int thresh)
> +{
> +    LOAD_PIXELS
> +    return FFABS(p1-p0) > thresh || FFABS(q1-q0) > thresh;
> +}
> +

> +static av_always_inline void filter_mbedge(uint8_t *p, int stride)
> +{
> +    int a0, a1, a2, w;
> +
> +    LOAD_PIXELS
> +
> +    w = clip_int8(p1-q1);
> +    w = clip_int8(w + 3*(q0-p0));
> +

> +    a0 = clip_int8((27*w + 63) >> 7);
> +    a1 = clip_int8((18*w + 63) >> 7);
> +    a2 = clip_int8(( 9*w + 63) >> 7);

the clipping is unneeded

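e.g. (w is clipped to [-128,127] just above, so (27*w + 63) >> 7 is already
within [-27,27], and the smaller multipliers stay within that range too):

    a0 = (27*w + 63) >> 7;
    a1 = (18*w + 63) >> 7;
    a2 = ( 9*w + 63) >> 7;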

[...]

> +static const uint8_t subpel_filters[7][6] = {
> +    { 0,   6, 123,  12,   1,   0 },
> +    { 2,  11, 108,  36,   8,   1 },
> +    { 0,   9,  93,  50,   6,   0 },
> +    { 3,  16,  77,  77,  16,   3 },
> +    { 0,   6,  50,  93,   9,   0 },
> +    { 1,   8,  36, 108,  11,   2 },
> +    { 0,   1,  12, 123,   6,   0 },
> +};
> +
> +
> +#define FILTER_6TAP(src, F, stride) \
> +    av_clip_uint8((F[2]*src[x+0*stride] - F[1]*src[x-1*stride] + F[0]*src[x-2*stride] + \
> +                   F[3]*src[x+1*stride] - F[4]*src[x+2*stride] + F[5]*src[x+3*stride] + 64) >> 7)
> +
> +#define VP8_EPEL(SIZE) \
> +static void put_vp8_epel ## SIZE ## _h_c(uint8_t *dst, uint8_t *src, int stride, int h, int mx, int my) \
> +{ \
> +    const uint8_t *filter = subpel_filters[mx-1]; \
> +    int x, y; \
> +\
> +    for (y = 0; y < h; y++) { \
> +        for (x = 0; x < SIZE; x++) \
> +            dst[x] = FILTER_6TAP(src, filter, 1); \
> +        dst += stride; \
> +        src += stride; \
> +    } \
> +} \
> +\

maybe it makes sense to write separate functions for the 4-tap cases instead of multiplying by 0
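e.g. something like (untested sketch; the function name is made up, the
caller would pick this variant for the table rows whose first and last
taps are zero):

#define FILTER_4TAP(src, F, stride) \
    av_clip_uint8((F[2]*src[x+0*stride] - F[1]*src[x-1*stride] + \
                   F[3]*src[x+1*stride] - F[4]*src[x+2*stride] + 64) >> 7)

static void put_vp8_epel4_h4_c(uint8_t *dst, uint8_t *src, int stride, int h, int mx, int my)
{
    const uint8_t *filter = subpel_filters[mx-1];
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_4TAP(src, filter, 1);
        dst += stride;
        src += stride;
    }
}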

Also, I don't mind if you commit it to svn to continue development there.
It surely is already useful to people if it's faster than libvpx.

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Opposition brings concord. Out of discord comes the fairest harmony.
-- Heraclitus