[FFmpeg-devel] [PATCH 3/4] avcodec/x86: avg_pixels16_y2_sse2

Sun Feb 3 16:31:08 CET 2013

Add an SSE2 version of avg_pixels16_y2. This makes bidirectional motion
compensation about 1% faster for matrixbench on an i7.

Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
---
 libavcodec/x86/dsputil_mmx.c |    3 +++
 libavcodec/x86/hpeldsp.asm   |   22 ++++++++++++----------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 047504e..1c796ae 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -1529,6 +1529,8 @@ void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                               int line_size, int h);
 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                           int line_size, int h);
+void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
 
 void ff_put_h264_chroma_mc8_rnd_mmx  (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
@@ -2043,6 +2045,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
 
             c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
             c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
+            c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_sse2;
         }
     }
 
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 088f811..a151834 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -388,31 +388,31 @@ AVG_PIXELS8_X2
 
 ; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 %macro AVG_PIXELS8_Y2 0
-cglobal avg_pixels8_y2, 4,5
+cglobal avg_pixels %+ mmsize %+ _y2, 4,5
     movsxdifnidn r2, r2d
     lea          r4, [r2*2]
-    mova         m0, [r1]
+    movu         m0, [r1]
     sub          r0, r2
 .loop:
-    mova         m1, [r1+r2]
-    mova         m2, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
     add          r1, r4
     PAVGB        m0, m1
     PAVGB        m1, m2
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
+    movu         m3, [r0+r2]
+    movu         m4, [r0+r4]
     PAVGB        m0, m3
     PAVGB        m1, m4
     mova    [r0+r2], m0
     mova    [r0+r4], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
     PAVGB        m2, m1
     PAVGB        m1, m0
     add          r0, r4
     add          r1, r4
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
+    movu         m3, [r0+r2]
+    movu         m4, [r0+r4]
     PAVGB        m2, m3
     PAVGB        m1, m4
     mova    [r0+r2], m2
@@ -427,6 +427,8 @@ INIT_MMX mmxext
 AVG_PIXELS8_Y2
 INIT_MMX 3dnow
 AVG_PIXELS8_Y2
+INIT_XMM sse2
+AVG_PIXELS8_Y2
 
 
 ; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-- 
1.7.9.5