[FFmpeg-devel] [PATCH] Dsputilize some functions from APE decode 2/2 - SSE2

Kostya kostya.shishkov
Thu Jul 10 17:17:59 CEST 2008


On Thu, Jul 10, 2008 at 01:20:18PM +0200, Michael Niedermayer wrote:
> On Thu, Jul 10, 2008 at 01:48:46PM +0300, Kostya wrote:
> > On Thu, Jul 10, 2008 at 11:48:14AM +0200, Michael Niedermayer wrote:
> > > On Thu, Jul 10, 2008 at 11:16:01AM +0300, Kostya wrote:
> > > [...]
> > > > +static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
> > > > +{
> > > > +    x86_reg o = -(order << 1);
> > > > +    v1 += order;
> > > > +    v2 += order;
> > > > +    asm volatile(
> > > > +        "1:                       \n\t"
> > > 
> > > > +        "movdqu  (%1,%2), %%xmm0  \n\t"
> > > > +        "paddw   (%0,%2), %%xmm0  \n\t"
> > > > +        "movdqa  %%xmm0,  (%0,%2) \n\t"
> > > > +        "add     $16,     %2      \n\t"
> > > > +        "movdqu  (%1,%2), %%xmm0  \n\t"
> > > > +        "paddw   (%0,%2), %%xmm0  \n\t"
> > > > +        "movdqa  %%xmm0,  (%0,%2) \n\t"
> > > > +        "add     $16,     %2      \n\t"
> > > 
> > > is that faster than:
> > > "movdqu    (%1,%2), %%xmm0  \n\t"
> > > "paddw     (%0,%2), %%xmm0  \n\t"
> > > "movdqa  %%xmm0,    (%0,%2) \n\t"
> > > "movdqu  16(%1,%2), %%xmm0  \n\t"
> > > "paddw   16(%0,%2), %%xmm0  \n\t"
> > > "movdqa  %%xmm0,  16(%0,%2) \n\t"
> > > "add     $32,     %2      \n\t"
> > > 
> > > ?
> >  
> > It was the first thing I tried. It was slower (on Core2).
> 
> and:
> 
> "movdqu    (%1,%2), %%xmm0  \n\t"
> "movdqu  16(%1,%2), %%xmm1  \n\t"
> "paddw     (%0,%2), %%xmm0  \n\t"
> "paddw   16(%0,%2), %%xmm1  \n\t"
> "movdqa  %%xmm0,    (%0,%2) \n\t"
> "movdqa  %%xmm1,  16(%0,%2) \n\t"
> "add     $32,     %2      \n\t"
 
It's on par. Patch attached for reference.
 
> [...] 
> 
> -- 
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> Those who are too smart to engage in politics are punished by being
> governed by those who are dumber. -- Plato 
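
For the record, a scalar sketch of what the new loops compute (the _c
suffix and the body below are only an illustration, not part of the
patch). It mirrors the negative-byte-offset/js idiom the asm uses: both
pointers are advanced past the end and a single negative index counts up
to zero, which saves a separate compare. sub_int16 is the same with -=.
The SSE2 add/sub loops additionally assume order is a multiple of 16 and
that v1 is 16-byte aligned (movdqa stores, paddw/psubw with memory
operands), while v2 may be unaligned (movdqu loads).

static void add_int16_c(int16_t *v1, const int16_t *v2, int order)
{
    int i;
    /* walk both arrays with one negative index, like the asm does
     * with the byte offset o = -(order << 1) */
    v1 += order;
    v2 += order;
    for (i = -order; i < 0; i++)
        v1[i] += v2[i];
}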
-------------- next part --------------
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
--- libavcodec/i386/dsputil_mmx.c	(revision 14100)
+++ libavcodec/i386/dsputil_mmx.c	(working copy)
@@ -2061,6 +2061,79 @@
 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
 
+
+static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+    x86_reg o = -(order << 1);
+    v1 += order;
+    v2 += order;
+    asm volatile(
+        "1:                          \n\t"
+        "movdqu   (%1,%2),   %%xmm0  \n\t"
+        "movdqu 16(%1,%2),   %%xmm1  \n\t"
+        "paddw    (%0,%2),   %%xmm0  \n\t"
+        "paddw  16(%0,%2),   %%xmm1  \n\t"
+        "movdqa   %%xmm0,    (%0,%2) \n\t"
+        "movdqa   %%xmm1,  16(%0,%2) \n\t"
+        "add      $32,       %2      \n\t"
+        "js       1b                 \n\t"
+        : "+r"(v1), "+r"(v2), "+r"(o)
+    );
+}
+
+static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+    x86_reg o = -(order << 1);
+    v1 += order;
+    v2 += order;
+    asm volatile(
+        "1:                           \n\t"
+        "movdqa    (%0,%2),   %%xmm0  \n\t"
+        "movdqa  16(%0,%2),   %%xmm2  \n\t"
+        "movdqu    (%1,%2),   %%xmm1  \n\t"
+        "movdqu  16(%1,%2),   %%xmm3  \n\t"
+        "psubw     %%xmm1,    %%xmm0  \n\t"
+        "psubw     %%xmm3,    %%xmm2  \n\t"
+        "movdqa    %%xmm0,    (%0,%2) \n\t"
+        "movdqa    %%xmm2,  16(%0,%2) \n\t"
+        "add       $32,       %2      \n\t"
+        "js        1b                 \n\t"
+        : "+r"(v1), "+r"(v2), "+r"(o)
+    );
+}
+
+static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+{
+    int res = 0;
+    DECLARE_ALIGNED_16(int64_t, sh);
+    x86_reg o = -(order << 1);
+
+    v1 += order;
+    v2 += order;
+    sh = shift;
+    asm volatile(
+        "pxor      %%xmm7,  %%xmm7        \n\t"
+        "1:                               \n\t"
+        "movdqu    (%0,%3), %%xmm0        \n\t"
+        "movdqu  16(%0,%3), %%xmm1        \n\t"
+        "pmaddwd   (%1,%3), %%xmm0        \n\t"
+        "pmaddwd 16(%1,%3), %%xmm1        \n\t"
+        "paddd     %%xmm0,  %%xmm7        \n\t"
+        "paddd     %%xmm1,  %%xmm7        \n\t"
+        "add       $32,     %3            \n\t"
+        "js        1b                     \n\t"
+        "movhlps   %%xmm7,  %%xmm2        \n\t"
+        "paddd     %%xmm2,  %%xmm7        \n\t"
+        "psrad     %4,      %%xmm7        \n\t"
+        "pshuflw   $0x4E,   %%xmm7,%%xmm2 \n\t"
+        "paddd     %%xmm2,  %%xmm7        \n\t"
+        "movd      %%xmm7,  %2            \n\t"
+        : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
+        : "m"(sh)
+    );
+    return res;
+}
+
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
@@ -2429,6 +2502,11 @@
         }
         if(mm_flags & MM_3DNOW)
             c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
+        if(mm_flags & MM_SSE2){
+            c->add_int16 = add_int16_sse2;
+            c->sub_int16 = sub_int16_sse2;
+            c->scalarproduct_int16 = scalarproduct_int16_sse2;
+        }
     }
 
     if (ENABLE_ENCODERS)
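
A side note on the last function: psrad is applied to the two packed
partial sums before the final horizontal paddd, so for shift > 0 the
result is not necessarily bit-exact with a version that shifts each
product or the final total. A plain C sketch of the operation, for
comparison only (the _ref name, the int64_t accumulator and the body
are illustrative and not part of the patch):

static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                       int order, int shift)
{
    int i;
    int64_t sum = 0; /* one wide accumulator; the asm keeps four 32-bit lanes */

    for (i = 0; i < order; i++)
        sum += v1[i] * v2[i];

    /* the SSE2 code shifts (s0+s2) and (s1+s3) with psrad and only then
     * adds the two halves, so truncation can differ from this version */
    return (int32_t)(sum >> shift);
}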


