[FFmpeg-devel] [PATCH] RV40 Loop Filter

Sat Oct 25 09:08:44 CEST 2008

On Wed, Oct 22, 2008 at 10:53:23AM +0200, Michael Niedermayer wrote:
> On Tue, Oct 21, 2008 at 09:23:21AM +0300, Kostya wrote:
> > $subj
> > I've tested it and looks like filter is invoked properly now,
> > filtering workflow should be also correct, but there is some
> > non-bitexactness elsewhere, so result picture is still a bit
> > wrong.
> 
> [...]
> > +    if(!edge)
> > +        flag0 = flag1 = 0;
> > +    else{
> > +        flag0 = (llim0 == 3);
> > +        flag1 = (llim1 == 3);
> > +    }
> > +    if(flag0 && FFABS(s2) >= thr1)
> > +        flag0 = 0;
> > +    if(flag1 && FFABS(s3) >= thr1)
> > +        flag1 = 0;
> 
> if(!edge)
>     flag0 = flag1 = 0;
> else{
>     flag0 = llim0 == 3 && FFABS(s2) < thr1;
>     flag1 = llim1 == 3 && FFABS(s3) < thr1;
> }
> 
> also the naming of variables like lim0 llim0 lim1 llim1 thr0 thr1 and so on
> is very poor.
> Where they correspond to positions numbers are fine but where they do not
> this kind of naming makes the code undeciperable and unreviewable.
 
I've tried to give the variables more meaningful names
(or at least more distinct ones).
 
> > +
> > +    lims = (lim0 + lim1 + llim0 + llim1) >> 1;
> > +    if(flag0 && flag1){ /* strong filtering */
> > +        for(i = 0; i < 4; i++, src += stride){
> > +            t = src[0*step] - src[-1*step];
> > +            if(!t) continue;
> > +            sflag = (mult * FFABS(t)) >> 7;
> > +            if(sflag > 1) continue;
> > +
> > +            p0 = (RV40_STRONG_FILTER(src, step, -3, 1, -3) + rv40_dither_l[dmode + i]) >> 7;
> > +            p1 = (RV40_STRONG_FILTER(src, step, -2, 2, -2) + rv40_dither_r[dmode + i]) >> 7;
> > +            diff[0] = src[-1*step];
> > +            diff[1] = src[ 0*step];
> 
> the variable diff can be moved into the for loop, this may also apply to
> others

done 
 
> [...]
> > +static int rv40_set_deblock_coef(RV34DecContext *r)
> > +{
> > +    MpegEncContext *s = &r->s;
> > +    int mvmask = 0, i, j, dx, dy;
> > +    int midx = s->mb_x * 2 + s->mb_y * 2 * s->b8_stride;
> 
> > +    if(s->pict_type == FF_I_TYPE)
> > +        return 0;
> 
> why is this even called for i frames?

I intend to use it for calculating macroblock-specific deblock
strength in RV30.
 
> > +    for(j = 0; j < 2; j++){
> > +        for(i = 0; i < 2; i++){
> > +            if(i || s->mb_x){
> > +                dx = FFABS(s->current_picture_ptr->motion_val[0][midx + i][0] - s->current_picture_ptr->motion_val[0][midx + i - 1][0]);
> > +                dy = FFABS(s->current_picture_ptr->motion_val[0][midx + i][1] - s->current_picture_ptr->motion_val[0][midx + i - 1][1]);
> > +                if(dx > 3 || dy > 3){
> > +                    mvmask |= 0x11 << (i*2 + j*8);
> > +                }
> > +            }
> > +            if(j || !s->first_slice_line){
> > +                dx = FFABS(s->current_picture_ptr->motion_val[0][midx + i][0] - s->current_picture_ptr->motion_val[0][midx + i - s->b8_stride][0]);
> > +                dy = FFABS(s->current_picture_ptr->motion_val[0][midx + i][1] - s->current_picture_ptr->motion_val[0][midx + i - s->b8_stride][1]);
> 
> s->current_picture_ptr->motion_val[0][midx + i] is duplicated all over the 151 char long lines

contracted 
 
> > +                if(dx > 3 || dy > 3){
> > +                    mvmask |= 0x03 << (i*2 + j*8);
> > +                }
> > +            }
> > +        }
> > +        midx += s->b8_stride;
> > +    }
> 
> i think the if() can be moved out of the loop like
> if(first_slice_line)
>     mvmask &= 123;

IMO it can't.
After all, it constructs the mask based on the motion vector differences
between horizontally/vertically neighbouring blocks.
 
> > +    return mvmask;
> > +}
> > +
> > +static void rv40_loop_filter(RV34DecContext *r)
> > +{
> > +    MpegEncContext *s = &r->s;
> > +    int mb_pos;
> > +    int i, j;
> > +    uint8_t *Y, *C;
> > +    int alpha, beta, betaY, betaC;
> > +    int q;
> > +    // 0 - cur block, 1 - top, 2 - left, 3 - bottom
> > +    int btype[4], clip[4], mvmasks[4], cbps[4], uvcbps[4][2];
> > +
> 
> > +    if(s->pict_type == FF_B_TYPE)
> > +        return;
> 
> why is this even called for b frames?

Because the spec says so :)
RV40 has many special cases for B-frame loop filter which
I didn't care to implement.

> > +
> > +    for(s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++){
> > +        mb_pos = s->mb_y * s->mb_stride;
> > +        for(s->mb_x = 0; s->mb_x < s->mb_width; s->mb_x++, mb_pos++){
> > +            int btype = s->current_picture_ptr->mb_type[mb_pos];
> 
> > +            if(IS_INTRA(btype) || IS_SEPARATE_DC(btype)){
> > +                r->cbp_luma  [mb_pos] = 0xFFFF;
> > +            }
> > +            if(IS_INTRA(btype))
> > +                r->cbp_chroma[mb_pos] = 0xFF;
> 
> inconsistant {}

added braces 
 
> [...]
> > +            mvmasks[0] = r->deblock_coefs[mb_pos];
> > +            btype  [0] = s->current_picture_ptr->mb_type[mb_pos];
> > +            cbps   [0] = r->cbp_luma[mb_pos];
> > +            uvcbps[0][0] = r->cbp_chroma[mb_pos] & 0xF;
> > +            uvcbps[0][1] = r->cbp_chroma[mb_pos] >> 4;
> > +            if(s->mb_y){
> > +                mvmasks[1] = r->deblock_coefs[mb_pos - s->mb_stride] & 0xF000;
> > +                btype  [1] = s->current_picture_ptr->mb_type[mb_pos - s->mb_stride];
> > +                cbps   [1] = r->cbp_luma[mb_pos - s->mb_stride] & 0xF000;
> > +                uvcbps[1][0] =  r->cbp_chroma[mb_pos - s->mb_stride]       & 0xC;
> > +                uvcbps[1][1] = (r->cbp_chroma[mb_pos - s->mb_stride] >> 4) & 0xC;
> > +            }else{
> > +                mvmasks[1] = 0;
> > +                btype  [1] = btype[0];
> > +                cbps   [1] = 0;
> > +                uvcbps[1][0] = uvcbps[1][1] = 0;
> > +            }
> > +            if(s->mb_x){
> > +                mvmasks[2] = r->deblock_coefs[mb_pos - 1] & 0x8888;
> > +                btype  [2] = s->current_picture_ptr->mb_type[mb_pos - 1];
> > +                cbps   [2] = r->cbp_luma[mb_pos - 1] & 0x8888;
> > +                uvcbps[2][0] =  r->cbp_chroma[mb_pos - 1]       & 0xA;
> > +                uvcbps[2][1] = (r->cbp_chroma[mb_pos - 1] >> 4) & 0xA;
> > +            }else{
> > +                mvmasks[2] = 0;
> > +                btype  [2] = btype[0];
> > +                cbps   [2] = 0;
> > +                uvcbps[2][0] = uvcbps[2][1] = 0;
> > +            }
> > +            if(s->mb_y < s->mb_height - 1){
> > +                mvmasks[3] = r->deblock_coefs[mb_pos + s->mb_stride] & 0x000F;
> > +                btype  [3] = s->current_picture_ptr->mb_type[mb_pos + s->mb_stride];
> > +                cbps   [3] = r->cbp_luma[mb_pos + s->mb_stride] & 0x000F;
> > +                uvcbps[3][0] =  r->cbp_chroma[mb_pos + s->mb_stride]       & 0x3;
> > +                uvcbps[3][1] = (r->cbp_chroma[mb_pos + s->mb_stride] >> 4) & 0x3;
> > +            }else{
> > +                mvmasks[3] = 0;
> > +                btype  [3] = btype[0];
> > +                cbps   [3] = 0;
> > +                uvcbps[3][0] = uvcbps[3][1] = 0;
> > +            }
> 
> lots of duplicated code
> btype holds the macro block type thus the 'b' seems wrong
> also the plural 's' in the names seems wrong, we have a mask, a pattern
> not masks and patterns

factored out common part and renamed variables 

[lots of loop filter invoking] 
> 
> the word mess is probably the best way to describe this
> as far as i can tell you are packing all the bits related to deblocking
> and then later duplicate code each with hardcoded masks to extract them
> again.

We have a saying here, "To make a candy from crap", which I think describes
the current situation. I'd like to shoot the group of men who proposed the loop
filter in the form RV40 has it.

The problem is that the edges should be filtered in that order, with the
clipping values selected depending on whether the neighbouring block is
coded or not and on whether it belongs to the same MB or not.
It's possible to put all of that into a loop, but it would have too many
additional conditions for my taste. I've merged some of them though.
 
> this likely could be reduced in size by 3/4 and made much more readable
> 
> 
> [...]
> -- 
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
-------------- next part --------------
Index: libavcodec/rv40.c
===================================================================

--- libavcodec/rv40.c	(revision 15305)
+++ libavcodec/rv40.c	(working copy)
@@ -247,7 +247,449 @@
     return 0;
 }
 
+#define CLIP_SYMM(a, b) av_clip(a, -(b), b) /* av_clip() of a with the bounds -(b) and b; note that b is expanded twice */
 /**
+ * weaker deblocking of one line of pixels across an edge, very similar to the one described in 4.4.2 of JVT-A003r1
+ */
+static inline void rv40_weak_loop_filter(uint8_t *src, const int step,       /* src[-1*step] and src[0] are the two pixels next to the edge */
+                                         const int flag0, const int flag1,  /* enable the update of src[-2*step] / src[1*step] */
+                                         const int alpha,                   /* scales |src[0] - src[-1*step]| for the early-out test */
+                                         const int lim0, const int lim1,    /* clip bounds for the src[1*step] / src[-2*step] corrections */
+                                         const int difflim, const int beta, /* clip bound for the edge-pixel correction; threshold on |S2| and |S3| */
+                                         const int S0, const int S1,        /* caller-supplied pixel differences: S0 and S2 feed the */
+                                         const int S2, const int S3)        /* src[-2*step] correction, S1 and S3 the src[1*step] one */
+{
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* lookup table used for every store; presumably clamps to the 8-bit range (defined elsewhere) */
+    int t, u, diff;
+
+    t = src[0*step] - src[-1*step];
+    if(!t){
+        return;
+    }
+    u = (alpha * FFABS(t)) >> 7;
+    if(u > 3 - (flag0 && flag1)){ /* step across the edge too large: leave the line untouched */
+        return;
+    }
+
+    t *= 4; /* multiply instead of "<<= 2": t may be negative and left-shifting a negative int is undefined */
+    if(flag0 && flag1)
+        t += src[-2*step] - src[1*step];
+    diff = CLIP_SYMM((t + 4) >> 3, difflim);
+    src[-1*step] = cm[src[-1*step] + diff];
+    src[ 0*step] = cm[src[ 0*step] - diff];
+    if(FFABS(S2) <= beta && flag0){
+        t = (S0 + S2 - diff) >> 1;
+        src[-2*step] = cm[src[-2*step] - CLIP_SYMM(t, lim1)];
+    }
+    if(FFABS(S3) <= beta && flag1){
+        t = (S1 + S3 + diff) >> 1;
+        src[ 1*step] = cm[src[ 1*step] - CLIP_SYMM(t, lim0)];
+    }
+}
+
+/**
+ * Expands to 25*x0+26*x1+26*x2+26*x3+25*x4 or 25*x0+26*x1+51*x2+26*x3, taken over
+ * src[start*step] .. src[(start+3)*step] plus src[last*step]; fully parenthesized, so safe in any expression.
+ * @param  sub - index of the value with coefficient = 25
+ * @param last - index of the value with coefficient 25 or 51
+ */
+#define RV40_STRONG_FILTER(src, step, start, last, sub) \
+    (26*((src)[ (start)   *(step)] + (src)[((start)+1)*(step)] + (src)[((start)+2)*(step)] \
+       + (src)[((start)+3)*(step)] + (src)[ (last)    *(step)]) - (src)[(last)*(step)] \
+       - (src)[ (sub)     *(step)])
+
+/**
+ * Deblocking filter, the altered version from JVT-A003r1 H.26L draft; filters four pixel lines across one edge in place.
+ */
+static inline void rv40_adaptive_loop_filter(uint8_t *src, const int step,      /* step: distance between neighbouring taps across the edge */
+                                             const int stride, const int dmode, /* stride: distance between the 4 filtered lines; dmode: base index into the dither tables */
+                                             const int lim0, const int lim1,    /* clipping limits; both also feed the combined limit "lims" */
+                                             const int alpha,                   /* scales the step across the edge in the per-line skip tests */
+                                             const int beta, const int beta2,   /* thresholds: 4*beta against |s0|, |s1|; beta2 against |s2|, |s3| */
+                                             const int chroma, const int edge)  /* chroma: skip the outermost taps; edge: permit strong filtering */
+{
+    int diffs[4][4];                    /* [line][kind] pixel differences, later handed to the weak filter */
+    int s0 = 0, s1 = 0, s2 = 0, s3 = 0; /* each difference kind summed over the 4 lines */
+    uint8_t *ptr;
+    int flag0 = 1, flag1 = 1;
+    int strength0 = 3, strength1 = 3;   /* per-side strength: 3, or 1 once the summed difference reaches 4*beta */
+    int i;
+    int lims;
+
+    for(i = 0, ptr = src; i < 4; i++, ptr += stride){ /* first-order differences on each side of the edge */
+        diffs[i][0] = ptr[-2*step] - ptr[-1*step];
+        diffs[i][1] = ptr[ 1*step] - ptr[ 0*step];
+        s0 += diffs[i][0];
+        s1 += diffs[i][1];
+    }
+    if(FFABS(s0) >= (beta<<2)){
+        strength0 = 1;
+    }
+    if(FFABS(s1) >= (beta<<2)){
+        strength1 = 1;
+    }
+    if(strength0 + strength1 <= 2){ /* both sides dropped to strength 1: nothing is filtered */
+        return;
+    }
+
+    for(i = 0, ptr = src; i < 4; i++, ptr += stride){ /* differences one tap further away from the edge */
+        diffs[i][2] = ptr[-2*step] - ptr[-3*step];
+        diffs[i][3] = ptr[ 1*step] - ptr[ 2*step];
+        s2 += diffs[i][2];
+        s3 += diffs[i][3];
+    }
+
+    if(!edge)
+        flag0 = flag1 = 0; /* strong filtering is only considered when the caller sets edge */
+    else{
+        flag0 = (strength0 == 3) && (FFABS(s2) < beta2);
+        flag1 = (strength1 == 3) && (FFABS(s3) < beta2);
+    }
+
+    lims = (lim0 + lim1 + strength0 + strength1) >> 1;
+    if(flag0 && flag1){ /* strong filtering */
+        for(i = 0; i < 4; i++, src += stride){
+            int diff[2], sflag, p0, p1;
+            int t = src[0*step] - src[-1*step];
+
+            if(!t) continue;
+            sflag = (alpha * FFABS(t)) >> 7;
+            if(sflag > 1) continue; /* step too large for this line; sflag == 1 means "filter, but clip to +-lims" */
+
+            p0 = (RV40_STRONG_FILTER(src, step, -3, 1, -3) + rv40_dither_l[dmode + i]) >> 7;
+            p1 = (RV40_STRONG_FILTER(src, step, -2, 2, -2) + rv40_dither_r[dmode + i]) >> 7;
+            diff[0] = src[-1*step]; /* remember the old edge pixels ... */
+            diff[1] = src[ 0*step];
+            src[-1*step] = sflag ? av_clip(p0, src[-1*step] - lims, src[-1*step] + lims) : p0;
+            src[ 0*step] = sflag ? av_clip(p1, src[ 0*step] - lims, src[ 0*step] + lims) : p1;
+            diff[0] -= src[-1*step]; /* ... so diff[] becomes old - new for each of them */
+            diff[1] -= src[ 0*step];
+            p0 = (RV40_STRONG_FILTER(src, step, -4, 0, -4) + rv40_dither_l[dmode + i] + diff[1]*25) >> 7; /* reads the already updated edge pixels; diff[]*25 adds back the change of the tap weighted 25 */
+            p1 = (RV40_STRONG_FILTER(src, step, -1, 3, -1) + rv40_dither_r[dmode + i] + diff[0]*25) >> 7;
+            src[-2*step] = sflag ? av_clip(p0, src[-2*step] - lims, src[-2*step] + lims) : p0;
+            src[ 1*step] = sflag ? av_clip(p1, src[ 1*step] - lims, src[ 1*step] + lims) : p1;
+            if(!chroma){ /* the outermost taps are only rewritten when chroma is 0 */
+                src[-3*step] = (RV40_STRONG_FILTER(src, step, -4, -3, -1) + 64) >> 7;
+                src[ 2*step] = (RV40_STRONG_FILTER(src, step,  0,  2,  0) + 64) >> 7;
+            }
+        }
+    }else if(strength0 == 3 && strength1 == 3){ /* weak filter on both sides with the full limits */
+        for(i = 0; i < 4; i++, src += stride)
+            rv40_weak_loop_filter(src, step, 1, 1, alpha, lim0, lim1, lims, beta,
+                                  diffs[i][0], diffs[i][1], diffs[i][2], diffs[i][3]);
+    }else{ /* only one side at strength 3: weak filter with all limits halved */
+        for(i = 0; i < 4; i++, src += stride)
+            rv40_weak_loop_filter(src, step, strength0==3, strength1==3,
+                                  alpha, lim0>>1, lim1>>1, lims>>1, beta,
+                                  diffs[i][0], diffs[i][1], diffs[i][2], diffs[i][3]);
+    }
+}
+
+static void rv40_v_loop_filter(uint8_t *src, int stride, int dmode, int lim0, int lim1, /* wrapper: filter across a vertical edge */
+                               int alpha, int beta, int beta2, int chroma, int edge){
+    rv40_adaptive_loop_filter(src, 1, stride, dmode, lim0, lim1, alpha, beta, beta2, chroma, edge); /* taps 1 byte apart, the 4 lines stride bytes apart */
+}
+static void rv40_h_loop_filter(uint8_t *src, int stride, int dmode, int lim0, int lim1, /* wrapper: filter across a horizontal edge */
+                               int alpha, int beta, int beta2, int chroma, int edge){
+    rv40_adaptive_loop_filter(src, stride, 1, dmode, lim0, lim1, alpha, beta, beta2, chroma, edge); /* taps stride bytes apart, the 4 lines 1 byte apart */
+}
+
+static int rv40_set_deblock_coef(RV34DecContext *r) /* returns a bit mask marking blocks whose motion vector differs from a neighbour's by more than 3 */
+{
+    MpegEncContext *s = &r->s;
+    int mvmask = 0, i, j, dx, dy;
+    int midx = s->mb_x * 2 + s->mb_y * 2 * s->b8_stride; /* index of the top-left of the macroblock's 2x2 motion vectors */
+    int16_t (*motion_val)[2] = &s->current_picture_ptr->motion_val[0][midx]; /* address-of is required: the element itself would decay to int16_t* */
+    if(s->pict_type == FF_I_TYPE) /* I-frames yield an empty mask */
+        return 0;
+    for(j = 0; j < 2; j++){
+        for(i = 0; i < 2; i++){
+            if(i || s->mb_x){ /* a left neighbour exists (inside this MB, or in the MB to the left) */
+                dx = FFABS(motion_val[i][0] - motion_val[i - 1][0]);
+                dy = FFABS(motion_val[i][1] - motion_val[i - 1][1]);
+                if(dx > 3 || dy > 3){
+                    mvmask |= 0x11 << (i*2 + j*8);
+                }
+            }
+            if(j || !s->first_slice_line){ /* a top neighbour exists (inside this MB, or above it in the same slice) */
+                dx = FFABS(motion_val[i][0] - motion_val[i - s->b8_stride][0]);
+                dy = FFABS(motion_val[i][1] - motion_val[i - s->b8_stride][1]);
+                if(dx > 3 || dy > 3){
+                    mvmask |= 0x03 << (i*2 + j*8);
+                }
+            }
+        }
+        motion_val += s->b8_stride; /* advance to the next row of motion vectors */
+    }
+    return mvmask;
+}
+
+static void rv40_loop_filter(RV34DecContext *r) /* deblocks every macroblock of the current picture; returns at once for B-frames */
+{
+    MpegEncContext *s = &r->s;
+    int mb_pos;
+    int i, j;
+    uint8_t *Y, *C;
+    int alpha, beta, betaY, betaC;
+    int q;
+    // 0 - cur block, 1 - top, 2 - left, 3 - bottom
+    int mbtype[4], clip[4], mvmasks[4], cbp[4], uvcbp[4][2];
+
+    if(s->pict_type == FF_B_TYPE)
+        return;
+
+    for(s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++){ /* pass 1: force full coded-block patterns for intra / separate-DC macroblocks */
+        mb_pos = s->mb_y * s->mb_stride;
+        for(s->mb_x = 0; s->mb_x < s->mb_width; s->mb_x++, mb_pos++){
+            int btype = s->current_picture_ptr->mb_type[mb_pos];
+            if(IS_INTRA(btype) || IS_SEPARATE_DC(btype)){
+                r->cbp_luma  [mb_pos] = 0xFFFF;
+            }
+            if(IS_INTRA(btype)){
+                r->cbp_chroma[mb_pos] = 0xFF;
+            }
+        }
+    }
+    for(s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++){ /* pass 2: filter each macroblock */
+        mb_pos = s->mb_y * s->mb_stride;
+        for(s->mb_x = 0; s->mb_x < s->mb_width; s->mb_x++, mb_pos++){
+            int y_h_deblock, y_v_deblock;
+            int c_v_deblock[2], c_h_deblock[2];
+            int cur_clip, left_clip;
+
+            ff_init_block_index(s);
+            ff_update_block_index(s);
+            Y = s->dest[0];
+            q = s->current_picture_ptr->qscale_table[mb_pos];
+            alpha = rv40_alpha_tab[q];
+            beta  = rv40_beta_tab [q];
+            betaY = betaC = beta * 3;
+            if(s->width * s->height <= 0x6300){ /* small pictures use a larger luma threshold */
+                betaY += beta;
+            }
+
+            mvmasks[0] = r->deblock_coefs[mb_pos];
+            mbtype [0] = s->current_picture_ptr->mb_type[mb_pos];
+            cbp    [0] = r->cbp_luma[mb_pos];
+            uvcbp[0][0] = r->cbp_chroma[mb_pos] & 0xF;
+            uvcbp[0][1] = r->cbp_chroma[mb_pos] >> 4;
+            for(i = 1; i < 4; i++){ /* defaults for every neighbour slot; they stay in effect when the neighbour is missing */
+                mvmasks[i] = 0;
+                mbtype [i] = mbtype[0];
+                cbp    [i] = 0;
+                uvcbp[i][0] = uvcbp[i][1] = 0; /* must index with i: slots 2 and 3 are read below even without a left/bottom neighbour */
+            }
+            if(s->mb_y){ /* top neighbour: keep only the bits masked below */
+                mvmasks[1] = r->deblock_coefs[mb_pos - s->mb_stride] & 0xF000;
+                mbtype [1] = s->current_picture_ptr->mb_type[mb_pos - s->mb_stride];
+                cbp    [1] = r->cbp_luma[mb_pos - s->mb_stride] & 0xF000;
+                uvcbp[1][0] =  r->cbp_chroma[mb_pos - s->mb_stride]       & 0xC;
+                uvcbp[1][1] = (r->cbp_chroma[mb_pos - s->mb_stride] >> 4) & 0xC;
+            }
+            if(s->mb_x){ /* left neighbour */
+                mvmasks[2] = r->deblock_coefs[mb_pos - 1] & 0x8888;
+                mbtype [2] = s->current_picture_ptr->mb_type[mb_pos - 1];
+                cbp    [2] = r->cbp_luma[mb_pos - 1] & 0x8888;
+                uvcbp[2][0] =  r->cbp_chroma[mb_pos - 1]       & 0xA;
+                uvcbp[2][1] = (r->cbp_chroma[mb_pos - 1] >> 4) & 0xA;
+            }
+            if(s->mb_y < s->mb_height - 1){ /* bottom neighbour */
+                mvmasks[3] = r->deblock_coefs[mb_pos + s->mb_stride] & 0x000F;
+                mbtype [3] = s->current_picture_ptr->mb_type[mb_pos + s->mb_stride];
+                cbp    [3] = r->cbp_luma[mb_pos + s->mb_stride] & 0x000F;
+                uvcbp[3][0] =  r->cbp_chroma[mb_pos + s->mb_stride]       & 0x3;
+                uvcbp[3][1] = (r->cbp_chroma[mb_pos + s->mb_stride] >> 4) & 0x3;
+            }
+            for(i = 0; i < 4; i++){ /* from here on mbtype[] holds 2 for intra / separate-DC macroblocks and 1 otherwise */
+                mbtype[i] = (IS_INTRA(mbtype[i]) || IS_SEPARATE_DC(mbtype[i])) ? 2 : 1;
+                clip[i] = rv40_filter_clip_tbl[mbtype[i]][q];
+            }
+            y_h_deblock = cbp[0] | ((cbp[0] << 4) & ~0x000F) | (cbp[1] >> 12)
+                        | ((cbp[3] << 20) & ~0x000F) | (cbp[3] << 16)
+                        | mvmasks[0] | (mvmasks[3] << 16);
+            y_v_deblock = ((cbp[0] << 1) & ~0x1111) | (cbp[2] >> 3)
+                        | cbp[0] | (cbp[3] << 16)
+                        | mvmasks[0] | (mvmasks[3] << 16);
+            if(!s->mb_x){ /* no left picture edge to filter */
+                y_v_deblock &= ~0x1111;
+            }
+            if(!s->mb_y){ /* no top picture edge to filter */
+                y_h_deblock &= ~0x000F;
+            }
+            if(s->mb_y == s->mb_height - 1 || (mbtype[0] == 2 || mbtype[3] == 2)){
+                y_h_deblock &= ~0xF0000;
+            }
+            cbp[0] = cbp[0] | (cbp[3] << 16)
+                   | mvmasks[0] | (mvmasks[3] << 16);
+            for(i = 0; i < 2; i++){ /* same mask construction for the two chroma planes */
+                c_v_deblock[i] = ((uvcbp[0][i] << 1) & ~0x5) | (uvcbp[2][i] >> 1)
+                               | (uvcbp[3][i] << 4) | uvcbp[0][i];
+                c_h_deblock[i] = (uvcbp[3][i] << 4) | uvcbp[0][i] | (uvcbp[1][i] >> 2)
+                               | (uvcbp[3][i] << 6) | (uvcbp[0][i] << 2);
+                uvcbp[0][i] = (uvcbp[3][i] << 4) | uvcbp[0][i];
+                if(!s->mb_x){
+                    c_v_deblock[i] &= ~0x5;
+                }
+                if(!s->mb_y){
+                    c_h_deblock[i] &= ~0x3;
+                }
+                if(s->mb_y == s->mb_height - 1 || mbtype[0] == 2 || mbtype[3] == 2){
+                    c_h_deblock[i] &= ~0x30;
+                }
+            }
+
+            Y = s->dest[0]; /* luma, top-left block of the macroblock */
+            cur_clip  = cbp[0] & 0x0001 ? clip[0] : 0;
+            left_clip = (cbp[2] | mvmasks[2]) & 0x0008 ? clip[2] : 0;
+            if((y_h_deblock & 0x0010)){
+                rv40_h_loop_filter(Y+4*s->linesize, s->linesize, 0,
+                                   cbp[0] & 0x0010 ? clip[0] : 0,
+                                   cur_clip,
+                                   alpha, beta, betaY, 0, 0);
+            }
+            if((y_v_deblock & 0x0001) && !(mbtype[0] == 2 || mbtype[2] == 2)){
+                rv40_v_loop_filter(Y, s->linesize, 0,
+                                   cur_clip,
+                                   left_clip,
+                                   alpha, beta, betaY, 0, 0);
+            }
+            if((y_h_deblock & 0x0001) &&  (mbtype[0] == 2 || mbtype[1] == 2)){
+                rv40_h_loop_filter(Y, s->linesize, 0,
+                                   cur_clip,
+                                   (cbp[1] | mvmasks[1]) & 0x1000 ? clip[1] : 0,
+                                   alpha, beta, betaY, 0, 1);
+            }
+            if((y_v_deblock & 0x0001) &&  (mbtype[0] == 2 || mbtype[2] == 2)){
+                rv40_v_loop_filter(Y, s->linesize, 0,
+                                   cur_clip,
+                                   left_clip,
+                                   alpha, beta, betaY, 0, 1);
+            }
+            for(i = 1; i < 4; i++){ /* remaining blocks of the first luma row */
+                Y += 4;
+                cur_clip = cbp[0] & (0x0001<<i) ? clip[0] : 0;
+                if((y_h_deblock & (0x0010<<i))){
+                    rv40_h_loop_filter(Y+4*s->linesize, s->linesize, i*4,
+                                       cbp[0] & (0x0010<<i) ? clip[0] : 0,
+                                       cur_clip,
+                                       alpha, beta, betaY, 0, 0);
+                }
+                if((y_v_deblock & (0x0001<<i))){
+                    rv40_v_loop_filter(Y, s->linesize, i*4,
+                                       cur_clip,
+                                       cbp[0] & (0x0001<<(i-1)) ? clip[0] : 0,
+                                       alpha, beta, betaY, 0, 0);
+                }
+                if((y_h_deblock & (0x0001<<i)) && (mbtype[0] == 2 || mbtype[1] == 2)){
+                    rv40_h_loop_filter(Y, s->linesize, i*4,
+                                       cur_clip,
+                                       (cbp[1] | mvmasks[1]) & (0x1000<<i) ? clip[1] : 0,
+                                       alpha, beta, betaY, 0, 1);
+                }
+            }
+            for(j = 4; j < 16; j += 4){ /* luma rows 1..3; j doubles as pixel row offset and mask bit offset */
+                Y = s->dest[0] + j*s->linesize;
+                for(i = 0; i < 4; i++){
+                    int ij = i+j;
+                    cur_clip = cbp[0] & (0x0001<<ij) ? clip[0] : 0;
+                    if((y_h_deblock & (0x0010<<ij))){
+                        rv40_h_loop_filter(Y+4*s->linesize, s->linesize, ij,
+                                           cbp[0] & (0x0010<<ij) ? clip[0] : 0,
+                                           cur_clip,
+                                           alpha, beta, betaY, 0, 0);
+                    }
+                    if((y_v_deblock & (0x0001<<ij))){
+                        int left_clip; /* shadows the outer left_clip, which is not needed any more at this point */
+                        if(i){
+                            left_clip = cbp[0] & (0x0001<<(ij-1)) ? clip[0] : 0;
+                        }else{
+                            left_clip = (cbp[2] | mvmasks[2]) & (0x0008<<ij) ? clip[2] : 0;
+                        }
+                        rv40_v_loop_filter(Y, s->linesize, ij,
+                                           cur_clip,
+                                           left_clip,
+                                           alpha, beta, betaY, 0, !i && (mbtype[0] == 2 || mbtype[2] == 2));
+                    }
+                    Y += 4;
+                }
+            }
+            for(i = 0; i < 2; i++){ /* the two chroma planes, four blocks each, unrolled */
+                C = s->dest[i+1];
+                if((c_h_deblock[i] & 0x4)){
+                    rv40_h_loop_filter(C+4*s->uvlinesize, s->uvlinesize, 0,
+                                       uvcbp[0][i] & 0x4 ? clip[0] : 0,
+                                       uvcbp[0][i] & 0x1 ? clip[0] : 0,
+                                       alpha, beta, betaC, 1, 0);
+                }
+                if((c_v_deblock[i] & 0x1) && !(mbtype[0] == 2 || mbtype[2] == 2)){
+                    rv40_v_loop_filter(C, s->uvlinesize, 0,
+                                       uvcbp[0][i] & 0x1 ? clip[0] : 0,
+                                       uvcbp[2][i] & 0x2 ? clip[2] : 0,
+                                       alpha, beta, betaC, 1, 0);
+                }
+                if((c_h_deblock[i] & 0x1) &&  (mbtype[0] == 2 || mbtype[1] == 2)){
+                    rv40_h_loop_filter(C, s->uvlinesize, 0,
+                                       uvcbp[0][i] & 0x1 ? clip[0] : 0,
+                                       uvcbp[1][i] & 0x4 ? clip[1] : 0,
+                                       alpha, beta, betaC, 1, 1);
+                }
+                if((c_v_deblock[i] & 0x1) &&  (mbtype[0] == 2 || mbtype[2] == 2)){
+                    rv40_v_loop_filter(C, s->uvlinesize, 0,
+                                       uvcbp[0][i] & 0x1 ? clip[0] : 0,
+                                       uvcbp[2][i] & 0x2 ? clip[2] : 0,
+                                       alpha, beta, betaC, 1, 1);
+                }
+                C += 4; /* top-right chroma block */
+                if((c_h_deblock[i] & 0x8)){
+                    rv40_h_loop_filter(C+4*s->uvlinesize, s->uvlinesize, 8,
+                                       uvcbp[0][i] & 0x8 ? clip[0] : 0,
+                                       uvcbp[0][i] & 0x2 ? clip[0] : 0,
+                                       alpha, beta, betaC, 1, 0);
+                }
+                if((c_v_deblock[i] & 0x2)){
+                    rv40_v_loop_filter(C, s->uvlinesize, 0,
+                                       uvcbp[0][i] & 0x2 ? clip[0] : 0,
+                                       uvcbp[0][i] & 0x1 ? clip[0] : 0,
+                                       alpha, beta, betaC, 1, 0);
+                }
+                if((c_h_deblock[i] & 0x2) && (mbtype[0] == 2 || mbtype[1] == 2)){
+                    rv40_h_loop_filter(C, s->uvlinesize, 8,
+                                       uvcbp[0][i] & 0x2 ? clip[0] : 0,
+                                       uvcbp[1][i] & 0x8 ? clip[1] : 0,
+                                       alpha, beta, betaC, 1, 1);
+                }
+                C = s->dest[i+1] + 4*s->uvlinesize; /* bottom-left chroma block */
+                if((c_h_deblock[i] & 0x10)){
+                    rv40_h_loop_filter(C+4*s->uvlinesize, s->uvlinesize, 0,
+                                       uvcbp[0][i] & 0x10 ? clip[0] : 0,
+                                       uvcbp[0][i] & 0x04 ? clip[0] : 0,
+                                       alpha, beta, betaC, 1, 0);
+                }
+                if((c_v_deblock[i] & 0x4)){
+                    rv40_v_loop_filter(C, s->uvlinesize, 8,
+                                       uvcbp[0][i] & 0x4 ? clip[0] : 0,
+                                       uvcbp[2][i] & 0x8 ? clip[2] : 0,
+                                       alpha, beta, betaC, 1, (mbtype[0] == 2 || mbtype[2] == 2));
+                }
+                C += 4; /* bottom-right chroma block */
+                if((c_h_deblock[i] & 0x20)){
+                    rv40_h_loop_filter(C+4*s->uvlinesize, s->uvlinesize, 8,
+                                       uvcbp[0][i] & 0x20 ? clip[3] : 0, /* NOTE(review): the matching 0x10 case above uses clip[0], not clip[3] - confirm which is intended */
+                                       uvcbp[0][i] & 0x08 ? clip[0] : 0,
+                                       alpha, beta, betaC, 1, 0);
+                }
+                if((c_v_deblock[i] & 0x8)){
+                    rv40_v_loop_filter(C, s->uvlinesize, 8,
+                                       uvcbp[0][i] & 0x8 ? clip[0] : 0,
+                                       uvcbp[0][i] & 0x4 ? clip[0] : 0,
+                                       alpha, beta, betaC, 1, 0);
+                }
+            }
+        }
+    }
+}
+
+/**
  * Initialize decoder.
  */
 static av_cold int rv40_decode_init(AVCodecContext *avctx)
@@ -261,6 +703,8 @@
     r->parse_slice_header = rv40_parse_slice_header;
     r->decode_intra_types = rv40_decode_intra_types;
     r->decode_mb_info     = rv40_decode_mb_info;
+    r->loop_filter        = rv40_loop_filter;
+    r->set_deblock_coef   = rv40_set_deblock_coef;
     r->luma_dc_quant_i = rv40_luma_dc_quant[0];
     r->luma_dc_quant_p = rv40_luma_dc_quant[1];
     return 0;