[FFmpeg-cvslog] vp3: integrate clear_blocks with idct of previous block.

Ronald S. Bultje git at videolan.org
Sun Jan 20 14:02:00 CET 2013


ffmpeg | branch: master | Ronald S. Bultje <rsbultje at gmail.com> | Fri Jan 18 16:43:04 2013 +0100| [aeaf268e52fc11c1f64914a319e0edddf1346d6a] | committer: Ronald S. Bultje

vp3: integrate clear_blocks with idct of previous block.

This is identical to what e.g. vp8 does, and prevents the function call
overhead (plus dependency on dsputil for this particular function).

Arm asm updated by Janne Grunau <janne-libav at jannau.net>.

Signed-off-by: Janne Grunau <janne-libav at jannau.net>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=aeaf268e52fc11c1f64914a319e0edddf1346d6a
---

 libavcodec/arm/vp3dsp_neon.S    |   22 +++++++++++++++-------
 libavcodec/ppc/vp3dsp_altivec.c |    2 ++
 libavcodec/vp3.c                |    5 ++---
 libavcodec/vp3dsp.c             |    5 ++++-
 libavcodec/vp3dsp.h             |    2 +-
 libavcodec/x86/vp3dsp.asm       |   27 ++++++++++++++++++++-------
 libavcodec/x86/vp3dsp_init.c    |    2 +-
 7 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
index e09de57..e5ecfc3 100644
--- a/libavcodec/arm/vp3dsp_neon.S
+++ b/libavcodec/arm/vp3dsp_neon.S
@@ -108,14 +108,20 @@ endfunc
 
 function vp3_idct_start_neon
     vpush           {d8-d15}
+    vmov.i16        q4,  #0
+    vmov.i16        q5,  #0
     movrel          r3,  vp3_idct_constants
     vld1.64         {d0-d1},   [r3,:128]
-    vld1.64         {d16-d19}, [r2,:128]!
-    vld1.64         {d20-d23}, [r2,:128]!
-    vld1.64         {d24-d27}, [r2,:128]!
+    vld1.64         {d16-d19}, [r2,:128]
+    vst1.64         {q4-q5},   [r2,:128]!
+    vld1.64         {d20-d23}, [r2,:128]
+    vst1.64         {q4-q5},   [r2,:128]!
+    vld1.64         {d24-d27}, [r2,:128]
+    vst1.64         {q4-q5},   [r2,:128]!
     vadd.s16        q1,  q8,  q12
     vsub.s16        q8,  q8,  q12
-    vld1.64         {d28-d31}, [r2,:128]!
+    vld1.64         {d28-d31}, [r2,:128]
+    vst1.64         {q4-q5},   [r2,:128]!
 
 vp3_idct_core_neon:
     vmull.s16       q2,  d18, xC1S7     // (ip[1] * C1) << 16
@@ -345,10 +351,12 @@ function ff_vp3_idct_add_neon, export=1
 endfunc
 
 function ff_vp3_idct_dc_add_neon, export=1
-    ldrsh           r2,  [r2]
+    ldrsh           r12, [r2]
     mov             r3,  r0
-    add             r2,  r2,  #15
-    vdup.16         q15, r2
+    add             r12, r12,  #15
+    vdup.16         q15, r12
+    mov             r12, 0
+    strh            r12, [r2]
     vshr.s16        q15, q15, #5
 
     vld1.8          {d0}, [r0,:64], r1
diff --git a/libavcodec/ppc/vp3dsp_altivec.c b/libavcodec/ppc/vp3dsp_altivec.c
index 75a3677..6adf9ae 100644
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@@ -140,6 +140,7 @@ static void vp3_idct_put_altivec(uint8_t *dst, int stride, DCTELEM block[64])
     PUT(b5)     dst += stride;
     PUT(b6)     dst += stride;
     PUT(b7)
+    memset(block, 0, sizeof(*block) * 64);
 }
 
 static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
@@ -171,6 +172,7 @@ static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
     ADD(b5)     dst += stride;
     ADD(b6)     dst += stride;
     ADD(b7)
+    memset(block, 0, sizeof(*block) * 64);
 }
 
 #endif /* HAVE_ALTIVEC */
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 0340c22..9417535 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -138,6 +138,7 @@ typedef struct Vp3DecodeContext {
     DSPContext dsp;
     VideoDSPContext vdsp;
     VP3DSPContext vp3dsp;
+    DECLARE_ALIGNED(16, DCTELEM, block)[64];
     int flipped_image;
     int last_slice_end;
     int skip_loop_filter;
@@ -1458,7 +1459,7 @@ static void await_reference_row(Vp3DecodeContext *s, Vp3Fragment *fragment, int
 static void render_slice(Vp3DecodeContext *s, int slice)
 {
     int x, y, i, j, fragment;
-    LOCAL_ALIGNED_16(DCTELEM, block, [64]);
+    DCTELEM *block = s->block;
     int motion_x = 0xdeadbeef, motion_y = 0xdeadbeef;
     int motion_halfpel_index;
     uint8_t *motion_source;
@@ -1571,8 +1572,6 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                         }
                     }
 
-                        s->dsp.clear_block(block);
-
                     /* invert DCT and place (or add) in final output */
 
                     if (s->all_fragments[i].coding_method == MODE_INTRA) {
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index 9b0b5d0..9e6209d 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -215,14 +215,16 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
 
 static void vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){
     idct(dest, line_size, block, 1);
+    memset(block, 0, sizeof(*block) * 64);
 }
 
 static void vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){
     idct(dest, line_size, block, 2);
+    memset(block, 0, sizeof(*block) * 64);
 }
 
 static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size,
-                              const DCTELEM *block/*align 16*/){
+                              DCTELEM *block/*align 16*/){
     int i, dc = (block[0] + 15) >> 5;
 
     for(i = 0; i < 8; i++){
@@ -236,6 +238,7 @@ static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size,
         dest[7] = av_clip_uint8(dest[7] + dc);
         dest += line_size;
     }
+    block[0] = 0;
 }
 
 static void vp3_v_loop_filter_c(uint8_t *first_pixel, int stride,
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h
index 3781bbf..feb3000 100644
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@@ -25,7 +25,7 @@
 typedef struct VP3DSPContext {
     void (*idct_put)(uint8_t *dest, int line_size, DCTELEM *block);
     void (*idct_add)(uint8_t *dest, int line_size, DCTELEM *block);
-    void (*idct_dc_add)(uint8_t *dest, int line_size, const DCTELEM *block);
+    void (*idct_dc_add)(uint8_t *dest, int line_size, DCTELEM *block);
     void (*v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
     void (*h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
 
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index fc1e776..d2c464c 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -562,6 +562,13 @@ cglobal vp3_idct_put, 3, 4, 9
 %endif
 %assign %%i %%i+64
 %endrep
+
+    pxor          m0, m0
+%assign %%offset 0
+%rep 128/mmsize
+    mova [r2+%%offset], m0
+%assign %%offset %%offset+mmsize
+%endrep
     RET
 
 cglobal vp3_idct_add, 3, 4, 9
@@ -600,6 +607,11 @@ cglobal vp3_idct_add, 3, 4, 9
     movhps   [r0+r1], m0
 %endif
     lea           r0, [r0+r1*2]
+%assign %%offset 0
+%rep 32/mmsize
+    mova [r2+%%offset], m4
+%assign %%offset %%offset+mmsize
+%endrep
     add           r2, 32
     dec           r3
     jg .loop
@@ -620,7 +632,7 @@ vp3_idct_funcs
     paddusb       m2, m0
     movq          m4, [r0+r1*2]
     paddusb       m3, m0
-    movq          m5, [r0+r3  ]
+    movq          m5, [r0+r2  ]
     paddusb       m4, m0
     paddusb       m5, m0
     psubusb       m2, m1
@@ -630,7 +642,7 @@ vp3_idct_funcs
     movq   [r0+r1  ], m3
     psubusb       m5, m1
     movq   [r0+r1*2], m4
-    movq   [r0+r3  ], m5
+    movq   [r0+r2  ], m5
 %endmacro
 
 INIT_MMX mmxext
@@ -638,11 +650,12 @@ cglobal vp3_idct_dc_add, 3, 4
 %if ARCH_X86_64
     movsxd        r1, r1d
 %endif
-    lea           r3, [r1*3]
-    movsx         r2, word [r2]
-    add           r2, 15
-    sar           r2, 5
-    movd          m0, r2d
+    movsx         r3, word [r2]
+    mov    word [r2], 0
+    lea           r2, [r1*3]
+    add           r3, 15
+    sar           r3, 5
+    movd          m0, r3d
     pshufw        m0, m0, 0x0
     pxor          m1, m1
     psubw         m1, m0
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index bbe74ba..95beeab 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -32,7 +32,7 @@ void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
 
 void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size,
-                               const DCTELEM *block);
+                               DCTELEM *block);
 
 void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
                                  int *bounding_values);



More information about the ffmpeg-cvslog mailing list