[FFmpeg-cvslog] dirac: Fix mmx/sse haar wavelet compose

Michael Niedermayer git at videolan.org
Tue Nov 1 22:03:58 CET 2011


ffmpeg | branch: master | Michael Niedermayer <michaelni at gmx.at> | Tue Nov  1 21:41:01 2011 +0100| [754539a4095a40b111c40c169ba079c3e0018e74] | committer: Michael Niedermayer

dirac: Fix mmx/sse haar wavelet compose

Signed-off-by: Michael Niedermayer <michaelni at gmx.at>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=754539a4095a40b111c40c169ba079c3e0018e74
---

 libavcodec/x86/dwt.c        |   53 ++++++++++++++++++++++--------------------
 libavcodec/x86/dwt_yasm.asm |   20 +++++++---------
 2 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/libavcodec/x86/dwt.c b/libavcodec/x86/dwt.c
index cc0a711..1d04c7d 100644
--- a/libavcodec/x86/dwt.c
+++ b/libavcodec/x86/dwt.c
@@ -30,6 +30,8 @@ void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b
 void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
 void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
 void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \
+void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
+void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
 \
 static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
 { \
@@ -83,6 +85,28 @@ static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \
 \
     ff_vertical_compose_haar##ext(b0, b1, width_align); \
 } \
+static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    ff_horizontal_compose_haar0i##ext(b, tmp, w);\
+\
+    for (; x < w2; x++) {\
+        b[2*x  ] = tmp[x];\
+        b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
+    }\
+}\
+static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    ff_horizontal_compose_haar1i##ext(b, tmp, w);\
+\
+    for (; x < w2; x++) {\
+        b[2*x  ] = (tmp[x] + 1)>>1;\
+        b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
+    }\
+}\
 \
 
 #if HAVE_YASM
@@ -95,11 +119,6 @@ COMPOSE_VERTICAL(_sse2, 8)
 
 void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w);
 
-void ff_horizontal_compose_haar0i_mmx(IDWTELEM *b, IDWTELEM *tmp, int w);
-void ff_horizontal_compose_haar1i_mmx(IDWTELEM *b, IDWTELEM *tmp, int w);
-void ff_horizontal_compose_haar0i_sse2(IDWTELEM *b, IDWTELEM *tmp, int w);
-void ff_horizontal_compose_haar1i_sse2(IDWTELEM *b, IDWTELEM *tmp, int w);
-
 void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x)
 {
     for (; x < w2; x++) {
@@ -108,22 +127,6 @@ void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x
     }
 }
 
-void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x)
-{
-    for (; x < w2; x++) {
-        b[2*x  ] = tmp[x];
-        b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);
-    }
-}
-
-void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x)
-{
-    for (; x < w2; x++) {
-        b[2*x  ] = (tmp[x] + 1)>>1;
-        b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;
-    }
-}
-
 void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
 {
 #if HAVE_YASM
@@ -148,11 +151,11 @@ void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
         break;
     case DWT_DIRAC_HAAR0:
         d->vertical_compose   = vertical_compose_haar_mmx;
-        d->horizontal_compose = ff_horizontal_compose_haar0i_mmx;
+        d->horizontal_compose = horizontal_compose_haar0i_mmx;
         break;
     case DWT_DIRAC_HAAR1:
         d->vertical_compose   = vertical_compose_haar_mmx;
-        d->horizontal_compose = ff_horizontal_compose_haar1i_mmx;
+        d->horizontal_compose = horizontal_compose_haar1i_mmx;
         break;
     }
 #endif
@@ -175,11 +178,11 @@ void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
         break;
     case DWT_DIRAC_HAAR0:
         d->vertical_compose   = vertical_compose_haar_sse2;
-//MMXDISABLED         d->horizontal_compose = ff_horizontal_compose_haar0i_sse2;
+        d->horizontal_compose = horizontal_compose_haar0i_sse2;
         break;
     case DWT_DIRAC_HAAR1:
         d->vertical_compose   = vertical_compose_haar_sse2;
-        d->horizontal_compose = ff_horizontal_compose_haar1i_sse2;
+        d->horizontal_compose = horizontal_compose_haar1i_sse2;
         break;
     }
 
diff --git a/libavcodec/x86/dwt_yasm.asm b/libavcodec/x86/dwt_yasm.asm
index b008906..7d7471c 100644
--- a/libavcodec/x86/dwt_yasm.asm
+++ b/libavcodec/x86/dwt_yasm.asm
@@ -22,8 +22,6 @@
 %include "x86inc.asm"
 
 cextern horizontal_compose_dd97i_end_c
-cextern horizontal_compose_haar0i_end_c
-cextern horizontal_compose_haar1i_end_c
 
 SECTION_RODATA
 pw_1: times 8 dw 1
@@ -188,7 +186,7 @@ cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
 ; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
 cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
     mov    w2d, wd
-    xor     xd, xd
+    xor     xq, xq
     shr    w2d, 1
     lea  b_w2q, [bq+wq]
     mova    m3, [pw_1]
@@ -199,13 +197,13 @@ cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
     psraw   m1, 1
     psubw   m0, m1
     mova    [tmpq + 2*xq], m0
-    add     xd, mmsize/2
-    cmp     xd, w2d
+    add     xq, mmsize/2
+    cmp     xq, w2q
     jl      .lowpass_loop
 
-    xor     xd, xd
-    and    w2d, ~(mmsize/2 - 1)
-    cmp    w2d, mmsize/2
+    xor     xq, xq
+    and    w2q, ~(mmsize/2 - 1)
+    cmp    w2q, mmsize/2
     jl      .end
 
 .highpass_loop:
@@ -226,11 +224,11 @@ cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
     mova    [bq+4*xq], m0
     mova    [bq+4*xq+mmsize], m2
 
-    add     xd, mmsize/2
-    cmp     xd, w2d
+    add     xq, mmsize/2
+    cmp     xq, w2q
     jl      .highpass_loop
 .end:
-    END_HORIZONTAL horizontal_compose_haar%2i_end_c
+    REP_RET
 %endmacro
 
 



More information about the ffmpeg-cvslog mailing list