[FFmpeg-devel] [PATCH] Dsputilize some functions from APE decoder 2/2 - SSE2

Kostya kostya.shishkov
Tue Jul 8 12:48:21 CEST 2008


On Tue, Jul 08, 2008 at 12:07:49AM +0200, Michael Niedermayer wrote:
> On Mon, Jul 07, 2008 at 09:21:07PM +0300, Kostya wrote:
> > $subj
> > 
> > It makes APE files with insane compression decode at only
> > 0.5x realtime instead of 0.11x realtime :).
> > 
> > Sorry for the implementation; I mostly translated it from
> > existing asm rather than writing the code from scratch.
> 
> > Index: libavcodec/i386/dsputil_mmx.c
> > ===================================================================
> > --- libavcodec/i386/dsputil_mmx.c	(revision 14100)
> > +++ libavcodec/i386/dsputil_mmx.c	(working copy)
> > @@ -2061,6 +2061,70 @@
> >  extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> >                            int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
> >  
> > +
> > +static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
> > +{
> > +    order <<= 1;
> > +    asm volatile(
> > +        "1:                     \n\t"
> > +        "movdqa  (%0),   %%xmm0 \n\t"
> > +        "movdqu  (%1),   %%xmm1 \n\t"
> > +        "paddw   %%xmm1, %%xmm0 \n\t"
> 
> Maybe the following is faster? (ignore if not)
> "movdqu  (%1),   %%xmm0 \n\t"
> "paddw   (%0),   %%xmm0 \n\t"
 
It does the same with one instruction less, so changed.
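
In SSE2 intrinsics terms the folding looks like this (a hypothetical
sketch, not part of the patch; add8_sketch is an illustrative name).
The aligned load from v1 is expressed as an operand of the add, so a
compiler is free to emit it as the memory operand of paddw and the
separate movdqa load disappears:

#include <emmintrin.h>

/* Hypothetical illustration of the folding discussed above. */
static void add8_sketch(int16_t *v1, const int16_t *v2)
{
    __m128i a = _mm_loadu_si128((const __m128i *)v2);    /* movdqu (%1), %%xmm0 */
    __m128i b = _mm_load_si128 ((const __m128i *)v1);    /* folds into paddw (%0), %%xmm0 */
    _mm_store_si128((__m128i *)v1, _mm_add_epi16(a, b)); /* movdqa %%xmm0, (%0) */
}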
 
> > +        "movdqa  %%xmm0, (%0)   \n\t"
> 
> > +        "add     $16,    %0     \n\t"
> > +        "add     $16,    %1     \n\t"
> > +        "sub     $16,    %2     \n\t"
> > +        "jnz     1b             \n\t"
> 
> 2 of the add/sub are unneeded, see my commit a moment ago to float_to_int16

changed

> also the loop will likely significantly benefit from being unrolled once

len is declared as a multiple of 8, and the loop handles 8 elements
per iteration.
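
For reference, the trick from that float_to_int16 commit in plain C (a
hypothetical sketch; add_int16_blocked is an illustrative name, the
real version is the asm in the attached patch). Indexing both arrays
off one counter that runs down through zero lets a single sub replace
the two pointer adds, and jge reuses the flags that sub just set, so no
separate compare is needed either:

/* Hypothetical C equivalent of the counted-down loop. */
static void add_int16_blocked(int16_t *v1, const int16_t *v2, int order)
{
    long o = order - 8;         /* index of the last 8-element block */
    int i;
    do {
        for (i = 0; i < 8; i++) /* one xmm-sized block, as in the asm */
            v1[o + i] += v2[o + i];
        o -= 8;                 /* the single remaining sub; jge loops while o >= 0 */
    } while (o >= 0);
}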
 
> > +        : "+r"(v1), "+r"(v2), "+r"(order)
> > +    );
> > +}
> > +
[...]
> > +
> > +static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
> > +{
> > +    int res = 0;
> > +    DECLARE_ALIGNED_16(int64_t, sh);
> > +
> > +    sh = shift;
> > +    order <<= 1;
> > +    asm volatile(
> > +        "pxor    %%xmm7, %%xmm7        \n\t"
> > +        "1:                            \n\t"
> > +        "movdqu  (%0),   %%xmm0        \n\t"
> > +        "pmaddwd (%1),   %%xmm0        \n\t"
> > +        "movhlps %%xmm0, %%xmm2        \n\t"
> > +        "paddd   %%xmm2, %%xmm0        \n\t"
> > +        "psrad   %4,     %%xmm0        \n\t"
> > +        "pshuflw $0x4E,  %%xmm0,%%xmm2 \n\t"
> > +        "paddd   %%xmm2, %%xmm0        \n\t"
> > +        "paddd   %%xmm0, %%xmm7        \n\t"
> 
> I think I've already said that values should be accumulated vertically
> before horizontally;
> the movhlps, psrad, pshuflw should be after the loop.
 
changed
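
A hypothetical plain-C sketch of the reordering (scalarproduct_int16_ref
is an illustrative name; the lane partitioning and the exact point where
the shift is applied differ slightly from the asm, which shifts between
the two horizontal adds, but the structure is the same):

/* Four partial sums stay vertical for the whole loop and are reduced
 * to one scalar only once at the end. */
static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                       int order, int shift)
{
    int32_t acc[4] = { 0, 0, 0, 0 };         /* the four dword lanes of xmm7 */
    int i, j;
    for (i = 0; i < order; i += 4)
        for (j = 0; j < 4; j++)
            acc[j] += v1[i + j] * v2[i + j]; /* vertical: no cross-lane work */
    /* the horizontal part (movhlps/paddd, psrad, pshuflw/paddd) runs once */
    return (acc[0] + acc[1] + acc[2] + acc[3]) >> shift;
}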
 
> > +        "add     $16,    %0            \n\t"
> > +        "add     $16,    %1            \n\t"
> > +        "sub     $16,    %3            \n\t"
> > +        "jnz     1b                    \n\t"
> > +        "movd    %%xmm7, %2            \n\t"
> > +        : "+r"(v1), "+r"(v2), "=r"(res), "+r"(order)
> 
> > +        : "m"(sh)
> 
> should be in a register

Why? psrad takes its shift count from either an (x)mm register, an
immediate value, or a memory operand.
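
For comparison, the register variant would hoist the count into an xmm
register once with movd (a hypothetical sketch built on the revised
loop from the attached patch, not submitted code; it trades the aligned
int64_t temporary and its memory read for one movd and the use of xmm6):

static int32_t scalarproduct_int16_regshift(int16_t *v1, int16_t *v2,
                                            int order, int shift)
{
    int res = 0;
    x86_reg o = order - 8;
    asm volatile(
        "movd    %4,        %%xmm6        \n\t" /* shift count loaded once */
        "pxor    %%xmm7,    %%xmm7        \n\t"
        "1:                               \n\t"
        "movdqu  (%0,%3,2), %%xmm0        \n\t"
        "pmaddwd (%1,%3,2), %%xmm0        \n\t"
        "paddd   %%xmm0,    %%xmm7        \n\t"
        "sub     $8,        %3            \n\t"
        "jge     1b                       \n\t"
        "movhlps %%xmm7,    %%xmm2        \n\t"
        "paddd   %%xmm2,    %%xmm7        \n\t"
        "psrad   %%xmm6,    %%xmm7        \n\t" /* count from the register */
        "pshuflw $0x4E,     %%xmm7,%%xmm2 \n\t"
        "paddd   %%xmm2,    %%xmm7        \n\t"
        "movd    %%xmm7,    %2            \n\t"
        : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
        : "r"(shift)
    );
    return res;
}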
 
> [...]
> -- 
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> Observe your enemies, for they first find out your faults. -- Antisthenes
-------------- next part --------------
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
--- libavcodec/i386/dsputil_mmx.c	(revision 14100)
+++ libavcodec/i386/dsputil_mmx.c	(working copy)
@@ -2061,6 +2061,63 @@
 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
 
+
+static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+    x86_reg o = order - 8;
+    asm volatile(
+        "1:                           \n\t"
+        "movdqu  (%1,%2,2), %%xmm0    \n\t"
+        "paddw   (%0,%2,2), %%xmm0    \n\t"
+        "movdqa  %%xmm0,    (%0,%2,2) \n\t"
+        "sub     $8,        %2        \n\t"
+        "jge     1b                   \n\t"
+        : "+r"(v1), "+r"(v2), "+r"(o)
+    );
+}
+
+static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+    x86_reg o = order - 8;
+    asm volatile(
+        "1:                           \n\t"
+        "movdqa  (%0,%2,2), %%xmm0    \n\t"
+        "movdqu  (%1,%2,2), %%xmm1    \n\t"
+        "psubw   %%xmm1,    %%xmm0    \n\t"
+        "movdqa  %%xmm0,    (%0,%2,2) \n\t"
+        "sub     $8,        %2        \n\t"
+        "jge     1b                   \n\t"
+        : "+r"(v1), "+r"(v2), "+r"(o)
+    );
+}
+
+static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+{
+    int res = 0;
+    x86_reg o = order - 8;
+    DECLARE_ALIGNED_16(int64_t, sh);
+
+    sh = shift;
+    asm volatile(
+        "pxor    %%xmm7,    %%xmm7        \n\t"
+        "1:                               \n\t"
+        "movdqu  (%0,%3,2), %%xmm0        \n\t"
+        "pmaddwd (%1,%3,2), %%xmm0        \n\t"
+        "paddd   %%xmm0,    %%xmm7        \n\t"
+        "sub     $8,        %3            \n\t"
+        "jge     1b                       \n\t"
+        "movhlps %%xmm7,    %%xmm2        \n\t"
+        "paddd   %%xmm2,    %%xmm7        \n\t"
+        "psrad   %4,        %%xmm7        \n\t"
+        "pshuflw $0x4E,     %%xmm7,%%xmm2 \n\t"
+        "paddd   %%xmm2,    %%xmm7        \n\t"
+        "movd    %%xmm7,    %2            \n\t"
+        : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
+        : "m"(sh)
+    );
+    return res;
+}
+
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
@@ -2429,6 +2486,11 @@
         }
         if(mm_flags & MM_3DNOW)
             c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
+        if(mm_flags & MM_SSE2){
+            c->add_int16 = add_int16_sse2;
+            c->sub_int16 = sub_int16_sse2;
+            c->scalarproduct_int16 = scalarproduct_int16_sse2;
+        }
     }
 
     if (ENABLE_ENCODERS)
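
For readers following along, a hypothetical caller sketch
(predict_step_sketch is an illustrative name; the add_int16, sub_int16
and scalarproduct_int16 fields of DSPContext come from patch 1/2 of
this series). Codecs call through the function pointers, so they pick
up these SSE2 versions automatically once dsputil_init() has run on a
machine with MM_SSE2 set in mm_flags:

/* Hypothetical use through DSPContext; order must be a multiple of 8
 * and v1 16-byte aligned, as the asm above assumes. */
static int32_t predict_step_sketch(DSPContext *dsp, int16_t *v1,
                                   int16_t *v2, int order, int shift)
{
    dsp->add_int16(v1, v2, order);
    return dsp->scalarproduct_int16(v1, v2, order, shift);
}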


