[FFmpeg-devel] [RFC][PATCH] DSPUtilize some functions from APE decoder

Kostya kostya.shishkov
Wed Jul 2 15:26:25 CEST 2008


I'm not satisfied with the decoding speed of the APE decoder,
so I've decided to finally dsputilize the functions marked as such.

The AltiVec version is in development.
-------------- next part --------------
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
--- libavcodec/i386/dsputil_mmx.c	(revision 14044)
+++ libavcodec/i386/dsputil_mmx.c	(working copy)
@@ -2061,6 +2061,66 @@
 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
 
+
+/* v1[i] += v2[i] for 'order' int16_t elements, 8 at a time (SSE2).
+ * v1 must be 16-byte aligned (movdqa load/store); v2 may be unaligned
+ * (movdqu).  Assumes order is a multiple of 8 -- TODO confirm callers
+ * guarantee this.
+ * NOTE(review): the asm stores through v1 but declares no "memory"
+ * clobber and no xmm clobbers; confirm the compiler cannot cache *v1
+ * or allocate xmm0/xmm1 across this statement. */
+static void vector_int16_add_sse(int16_t * v1, int16_t * v2, int order)
+{
+    int i;
+    for(i = 0; i < order; i += 8){
+        asm volatile(
+	"movdqa  (%0),   %%xmm0 \n\t"
+	"movdqu  (%1),   %%xmm1 \n\t"
+	"paddw   %%xmm1, %%xmm0 \n\t"
+	"movdqa  %%xmm0, (%0)   \n\t"
+	: "+r"(v1), "+r"(v2)
+	);
+	v1 += 8;
+	v2 += 8;
+    }
+}
+
+/* v1[i] -= v2[i] for 'order' int16_t elements, 8 at a time (SSE2).
+ * v1 must be 16-byte aligned (movdqa load/store); v2 may be unaligned
+ * (movdqu).  Assumes order is a multiple of 8 -- TODO confirm callers
+ * guarantee this.
+ * NOTE(review): same as the add variant -- no "memory" clobber and no
+ * xmm clobber list on an asm that writes through v1; verify. */
+static void vector_int16_sub_sse(int16_t * v1, int16_t * v2, int order)
+{
+    int i;
+    for(i = 0; i < order; i += 8){
+        asm volatile(
+	"movdqa  (%0),   %%xmm0 \n\t"
+	"movdqu  (%1),   %%xmm1 \n\t"
+	"psubw   %%xmm1, %%xmm0 \n\t"
+	"movdqa  %%xmm0, (%0)   \n\t"
+	: "+r"(v1), "+r"(v2)
+	);
+	v1 += 8;
+	v2 += 8;
+    }
+}
+
+/* Returns the 32-bit dot product of v1 and v2 over 'order' int16_t
+ * elements.  v1 may be unaligned (movdqu), v2 must be 16-byte aligned
+ * (movdqa).  Assumes order is a multiple of 8 -- TODO confirm callers.
+ * NOTE(review): xmm7 is zeroed in one asm statement and read/updated
+ * in separate asm statements below; nothing tells the compiler xmm7
+ * is live in between, so compiler-generated SSE code could clobber it.
+ * Also no clobber lists on any of the asm blocks.  The horizontal
+ * reduction (movhlps/pshufd/paddd) is redone every iteration; it could
+ * be hoisted to run once after the loop. */
+static int32_t vector_int16_scalarproduct_sse(int16_t * v1, int16_t * v2, int order)
+{
+    int i;
+    int res = 0, *resp=&res;
+
+    asm volatile("pxor %xmm7, %xmm7 \n\t");   /* clear the accumulator */
+
+    for(i = 0; i < order; i += 8){
+        asm volatile(
+	"movdqu   (%0),   %%xmm0 \n\t"
+	"movdqa   (%1),   %%xmm1 \n\t"
+	"pmaddwd  %%xmm1, %%xmm0 \n\t"   /* 8 16x16 products, pair-summed into 4 dwords */
+	"movhlps  %%xmm0, %%xmm2 \n\t"
+
+	"paddd    %%xmm2, %%xmm0 \n\t"
+	"pshufd  $0x01, %%xmm0,%%xmm2 \n\t"
+	"paddd    %%xmm2, %%xmm0 \n\t"   /* low dword of xmm0 = sum of the 4 dwords */
+	"paddd   %%xmm0, %%xmm7 \n\t"    /* only xmm7's low dword is read at the end */
+	: "+r"(v1), "+r"(v2)
+	);
+	v1 += 8;
+	v2 += 8;
+    }
+    asm volatile("movd %%xmm7, (%0)\n\t" : "+r"(resp));   /* NOTE(review): stores to *resp without a "memory" clobber */
+    return res;
+}
+
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
@@ -2426,6 +2486,9 @@
             c->float_to_int16 = float_to_int16_sse;
             c->vector_fmul_reverse = vector_fmul_reverse_sse;
             c->vector_fmul_add_add = vector_fmul_add_add_sse;
+            c->vector_int16_add = vector_int16_add_sse;
+            c->vector_int16_sub = vector_int16_sub_sse;
+            c->vector_int16_scalarproduct = vector_int16_scalarproduct_sse;
         }
         if(mm_flags & MM_3DNOW)
             c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
Index: libavcodec/ppc/dsputil_altivec.c
===================================================================
--- libavcodec/ppc/dsputil_altivec.c	(revision 14044)
+++ libavcodec/ppc/dsputil_altivec.c	(working copy)
@@ -1484,6 +1484,60 @@
 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
 }
 
+/* v1[i] += v2[i] for 'order' int16_t elements (AltiVec).
+ * v2 may be unaligned, so it is first memcpy'd into a 16-byte-aligned
+ * VLA on the stack; v1 is loaded/stored with vec_ld/vec_st and so must
+ * be 16-byte aligned -- TODO confirm callers guarantee this.
+ * Assumes order is a multiple of 8. */
+static void vector_int16_add_ppc(int16_t * v1, int16_t * v2, int order)
+{
+    int i;
+    register vector short vec1;
+    DECLARE_ALIGNED_16(int16_t, buf[order]);
+    int16_t *b2 = buf;
+    
+    memcpy(buf, v2, order *2);
+    for(i = 0; i < order; i += 8){
+        vec1 = vec_ld(0, v1);
+        vec1 = vec_add(vec1, vec_ld(0, b2));
+        vec_st(vec1, 0, v1);
+        v1 += 8;
+        b2 += 8;
+    }
+}
+
+/* v1[i] -= v2[i] for 'order' int16_t elements (AltiVec).
+ * Mirrors vector_int16_add_ppc: v2 is copied to an aligned stack VLA,
+ * v1 must itself be 16-byte aligned for vec_ld/vec_st -- TODO confirm.
+ * Assumes order is a multiple of 8. */
+static void vector_int16_sub_ppc(int16_t * v1, int16_t * v2, int order)
+{
+    int i;
+    register vector short vec1;
+    DECLARE_ALIGNED_16(int16_t, buf[order]);
+    int16_t *b2 = buf;
+    
+    memcpy(buf, v2, order *2);
+    for(i = 0; i < order; i += 8){
+        vec1 = vec_ld(0, v1);
+        vec1 = vec_sub(vec1, vec_ld(0, b2));
+        vec_st(vec1, 0, v1);
+        v1 += 8;
+        b2 += 8;
+    }
+}
+
+/* 32-bit dot product over 'order' int16_t elements (AltiVec).
+ * Both v1 and v2 are loaded with vec_ld here, i.e. both are assumed
+ * 16-byte aligned -- but the header/x86 contract treats v1 as
+ * unaligned; NOTE(review): confirm which alignment callers provide
+ * (vec_ld silently truncates unaligned addresses).
+ * NOTE(review): vec_sums() leaves the full sum in element 3 of the
+ * vector, yet 'ires' is a single int: vec_st writes 16 bytes past it
+ * and the return reads element 0, not the accumulated sum.  This looks
+ * incorrect -- verify before use (patch text says AltiVec is still in
+ * development). */
+static int32_t vector_int16_scalarproduct_ppc(int16_t * v1, int16_t * v2, int order)
+{
+    int i;
+    register vector short vec1;
+    register const vector short zero = vec_splat_s16(0);
+    register vector int res = vec_splat_s32(0), t;
+    DECLARE_ALIGNED_16(int, ires);
+
+    for(i = 0; i < order; i += 8){
+        vec1 = vec_ld(0, v1);
+        t = vec_msums(vec1, vec_ld(0, v2), zero);
+        res = vec_sums(t, res);
+        v1 += 8;
+        v2 += 8;
+    }
+    vec_st(res, 0, &ires);
+    return ires;
+}
+
+
 void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
 {
     c->pix_abs[0][1] = sad16_x2_altivec;
@@ -1515,4 +1569,8 @@
     c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
     if (ENABLE_VORBIS_DECODER)
         c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
+
+    c->vector_int16_add = vector_int16_add_ppc;
+    c->vector_int16_sub = vector_int16_sub_ppc;
+    c->vector_int16_scalarproduct = vector_int16_scalarproduct_ppc;
 }
Index: libavcodec/apedec.c
===================================================================
--- libavcodec/apedec.c	(revision 14044)
+++ libavcodec/apedec.c	(working copy)
@@ -161,30 +161,7 @@
 } APEContext;
 
 // TODO: dsputilize
-static inline void vector_add(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-       *v1++ += *v2++;
-}
 
-// TODO: dsputilize
-static inline void vector_sub(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-        *v1++ -= *v2++;
-}
-
-// TODO: dsputilize
-static inline int32_t scalarproduct(int16_t * v1, int16_t * v2, int order)
-{
-    int res = 0;
-
-    while (order--)
-        res += *v1++ * *v2++;
-
-    return res;
-}
-
 static av_cold int ape_decode_init(AVCodecContext * avctx)
 {
     APEContext *s = avctx->priv_data;
@@ -672,19 +649,19 @@
     do_init_filter(&f[1], buf + order * 3 + HISTORY_SIZE, order);
 }
 
-static inline void do_apply_filter(int version, APEFilter *f, int32_t *data, int count, int order, int fracbits)
+static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits)
 {
     int res;
     int absres;
 
     while (count--) {
         /* round fixedpoint scalar product */
-        res = (scalarproduct(f->delay - order, f->coeffs, order) + (1 << (fracbits - 1))) >> fracbits;
+        res = (ctx->dsp.vector_int16_scalarproduct(f->delay - order, f->coeffs, order) + (1 << (fracbits - 1))) >> fracbits;
 
         if (*data < 0)
-            vector_add(f->coeffs, f->adaptcoeffs - order, order);
+            ctx->dsp.vector_int16_add(f->coeffs, f->adaptcoeffs - order, order);
         else if (*data > 0)
-            vector_sub(f->coeffs, f->adaptcoeffs - order, order);
+            ctx->dsp.vector_int16_sub(f->coeffs, f->adaptcoeffs - order, order);
 
         res += *data;
 
@@ -736,9 +713,9 @@
                          int32_t * data0, int32_t * data1,
                          int count, int order, int fracbits)
 {
-    do_apply_filter(ctx->fileversion, &f[0], data0, count, order, fracbits);
+    do_apply_filter(ctx, ctx->fileversion, &f[0], data0, count, order, fracbits);
     if (data1)
-        do_apply_filter(ctx->fileversion, &f[1], data1, count, order, fracbits);
+        do_apply_filter(ctx, ctx->fileversion, &f[1], data1, count, order, fracbits);
 }
 
 static void ape_apply_filters(APEContext * ctx, int32_t * decoded0,
Index: libavcodec/dsputil.c
===================================================================
--- libavcodec/dsputil.c	(revision 14044)
+++ libavcodec/dsputil.c	(working copy)
@@ -3944,6 +3944,26 @@
     }
 }
 
+/* C reference implementation: v1[i] += v2[i] for 'order' int16_t
+ * elements.  No alignment or length-multiple requirements. */
+static void vector_int16_add_c(int16_t * v1, int16_t * v2, int order){
+    while (order--)
+       *v1++ += *v2++;
+}
+
+/* C reference implementation: v1[i] -= v2[i] for 'order' int16_t
+ * elements.  No alignment or length-multiple requirements. */
+static void vector_int16_sub_c(int16_t * v1, int16_t * v2, int order){
+    while (order--)
+        *v1++ -= *v2++;
+}
+
+/* C reference implementation: returns the sum of v1[i]*v2[i] over
+ * 'order' int16_t elements, accumulated in a plain int (identical to
+ * the scalarproduct() removed from apedec.c by this patch). */
+static int32_t vector_int16_scalarproduct_c(int16_t * v1, int16_t * v2, int order)
+{
+    int res = 0;
+
+    while (order--)
+        res += *v1++ * *v2++;
+
+    return res;
+}
+
 #define W0 2048
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -4430,6 +4450,10 @@
     c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
     c->float_to_int16 = ff_float_to_int16_c;
 
+    c->vector_int16_add = vector_int16_add_c;
+    c->vector_int16_sub = vector_int16_sub_c;
+    c->vector_int16_scalarproduct = vector_int16_scalarproduct_c;
+
     c->shrink[0]= ff_img_copy_plane;
     c->shrink[1]= ff_shrink22;
     c->shrink[2]= ff_shrink44;
Index: libavcodec/dsputil.h
===================================================================
--- libavcodec/dsputil.h	(revision 14044)
+++ libavcodec/dsputil.h	(working copy)
@@ -451,6 +451,13 @@
     void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
            int * range, int * sum,  int edges);
 
+    /* ape functions */
+    /* Add second vector values to the first one. v1 is aligned, v2 is not. */
+    void (*vector_int16_add)(int16_t *v1, int16_t *v2, int len);
+    /* Subtract second vector values from the first one. v1 is aligned, v2 is not. */
+    void (*vector_int16_sub)(int16_t *v1, int16_t *v2, int len);
+    /* Calculate scalar product of two vectors. v1 is unaligned, v2 is aligned. */
+    int32_t (*vector_int16_scalarproduct)(int16_t *v1, int16_t *v2, int len);
 } DSPContext;
 
 void dsputil_static_init(void);



More information about the ffmpeg-devel mailing list