[FFmpeg-devel] [PATCH 2/2] ARM: NEON optimised VP6 edge filter

Mans Rullgard mans
Fri Apr 23 16:24:34 CEST 2010


---
 libavcodec/arm/Makefile           |    6 ++
 libavcodec/arm/vp56dsp_init_arm.c |   34 ++++++++++
 libavcodec/arm/vp56dsp_neon.S     |  121 +++++++++++++++++++++++++++++++++++++
 libavcodec/vp56dsp.c              |    2 +
 libavcodec/vp56dsp.h              |    1 +
 5 files changed, 164 insertions(+), 0 deletions(-)
 create mode 100644 libavcodec/arm/vp56dsp_init_arm.c
 create mode 100644 libavcodec/arm/vp56dsp_neon.S

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 794156d..46bfc75 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -1,5 +1,8 @@
 OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o         \
 
+OBJS-$(CONFIG_VP5_DECODER)             += arm/vp56dsp_init_arm.o
+OBJS-$(CONFIG_VP6_DECODER)             += arm/vp56dsp_init_arm.o
+
 OBJS-$(CONFIG_H264DSP)                 += arm/h264dsp_init_arm.o        \
                                           arm/h264pred_init_arm.o       \
 
@@ -40,6 +43,9 @@ NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o             \
 
 NEON-OBJS-$(CONFIG_VP3_DECODER)        += arm/vp3dsp_neon.o
 
+NEON-OBJS-$(CONFIG_VP5_DECODER)        += arm/vp56dsp_neon.o
+NEON-OBJS-$(CONFIG_VP6_DECODER)        += arm/vp56dsp_neon.o
+
 OBJS-$(HAVE_NEON)                      += arm/dsputil_init_neon.o       \
                                           arm/dsputil_neon.o            \
                                           arm/int_neon.o                \
diff --git a/libavcodec/arm/vp56dsp_init_arm.c b/libavcodec/arm/vp56dsp_init_arm.c
new file mode 100644
index 0000000..ceab9a8
--- /dev/null
+++ b/libavcodec/arm/vp56dsp_init_arm.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+#include "libavcodec/avcodec.h"
+#include "libavcodec/vp56dsp.h"
+
+void ff_vp6_edge_filter_hor_neon(uint8_t *yuv, int stride, int t);
+void ff_vp6_edge_filter_ver_neon(uint8_t *yuv, int stride, int t);
+
+void ff_vp56dsp_init_arm(VP56DSPContext *s, enum CodecID codec)
+{
+    if (!HAVE_NEON || codec == CODEC_ID_VP5) /* nothing to install: no NEON, or VP5 */
+        return;
+    /* any other codec id: point both edge filter hooks at the NEON versions */
+    s->edge_filter_ver = ff_vp6_edge_filter_ver_neon;
+    s->edge_filter_hor = ff_vp6_edge_filter_hor_neon;
+}
diff --git a/libavcodec/arm/vp56dsp_neon.S b/libavcodec/arm/vp56dsp_neon.S
new file mode 100644
index 0000000..0353661
--- /dev/null
+++ b/libavcodec/arm/vp56dsp_neon.S
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+.macro  vp6_edge_filter                         @ in: q8..q11 = p[-2*s], p[-s], p[0], p[s] as u8, r2 = t; out: q9, q10 (12 valid px)
+        vdup.16         q3,  r2                 @ t in every 16-bit lane
+        vmov.i16        q13, #1                 @ constant 1 in every lane
+        vsubl.u8        q0,  d20, d18           @ p[   0] - p[-s]   (px 0-7, widened to 16 bit)
+        vsubl.u8        q1,  d16, d22           @ p[-2*s] - p[ s]   (px 0-7)
+        vsubl.u8        q14, d21, d19           @ p[   0] - p[-s]   (px 8-15; only d28 = px 8-11 is kept)
+        vsubl.u8        q15, d17, d23           @ p[-2*s] - p[ s]   (px 8-15; only d30 = px 8-11 is used)
+        vadd.i16        q2,  q0,  q0            @ 2*(p[0]-p[-s])
+        vadd.i16        d29, d28, d28           @ same, px 8-11 (overwrites upper half of q14)
+        vadd.i16        q0,  q0,  q1            @    p[0]-p[-s]  + p[-2*s]-p[s]
+        vadd.i16        d28, d28, d30           @ same, px 8-11
+        vadd.i16        q0,  q0,  q2            @ 3*(p[0]-p[-s]) + p[-2*s]-p[s]
+        vadd.i16        d28, d28, d29           @ same, px 8-11
+        vrshr.s16       q0,  q0,  #3            @ v = (sum + 4) >> 3 (rounding shift)
+        vrshr.s16       d28, d28, #3            @ v, px 8-11
+        vsub.i16        q8,  q3,  q13           @ t-1 (q8 is free: p[-2*s] was consumed above)
+        vabs.s16        q1,  q0                 @ V = abs(v)
+        vshr.s16        q2,  q0,  #15           @ s = v >> 15: 0 or -1 per lane
+        vabs.s16        d30, d28                @ V, px 8-11
+        vshr.s16        d29, d28, #15           @ s, px 8-11
+        vsub.i16        q12, q1,  q3            @ V-t
+        vsub.i16        d31, d30, d6            @ V-t, px 8-11 (d6 = low half of t vector)
+        vsub.i16        q12, q12, q13           @ V-t-1
+        vsub.i16        d31, d31, d26           @ V-t-1, px 8-11 (d26 = low half of the 1 vector)
+        vcge.u16        q12, q12, q8            @ V-t-1 >= t-1 (unsigned compare, all-ones lane when true)
+        vcge.u16        d31, d31, d16           @ same mask, px 8-11 (d16 = low half of t-1)
+        vadd.i16        q13, q3,  q3            @ 2*t
+        vadd.i16        d16, d6,  d6            @ 2*t, px 8-11 (d16 reused once the compare is done)
+        vsub.i16        q13, q13, q1            @ 2*t - V
+        vsub.i16        d16, d16, d30           @ same, px 8-11
+        vadd.i16        q13, q13, q2            @ += s
+        vadd.i16        d16, d16, d29           @ same, px 8-11
+        veor            q13, q13, q2            @ ^= s
+        veor            d16, d16, d29           @ same, px 8-11
+        vbif            q0,  q13, q12           @ lanes with mask clear take the adjusted value, others keep v
+        vbif            d28, d16, d31           @ same select, px 8-11
+        vmovl.u8        q1,  d20                @ p[0] widened to 16 bit, px 0-7
+        vmovl.u8        q15, d21                @ p[0] widened, px 8-15 (d30 = px 8-11)
+        vaddw.u8        q2,  q0,  d18           @ p[-s] + v, px 0-7
+        vaddw.u8        q3,  q14, d19           @ p[-s] + v, valid in d6 = px 8-11 (d29 held s, so d7 is not meaningful)
+        vsub.i16        q1,  q1,  q0            @ p[0] - v, px 0-7
+        vsub.i16        d30, d30, d28           @ p[0] - v, px 8-11
+        vqmovun.s16     d18, q2                 @ saturate to u8: new p[-s], px 0-7
+        vqmovun.s16     d19, q3                 @ new p[-s], first 4 bytes = px 8-11
+        vqmovun.s16     d20, q1                 @ new p[0], px 0-7
+        vqmovun.s16     d21, q15                @ new p[0], first 4 bytes = px 8-11 (d31 held the mask, rest not meaningful)
+.endm
+
+function ff_vp6_edge_filter_ver_neon, export=1  @ taps one stride apart; r0 = yuv, r1 = stride, r2 = t (per the C prototype)
+        sub             r0,  r0,  r1,  lsl #1   @ r0 = yuv - 2*stride
+        vld1.8          {q8},     [r0], r1      @ p[-2*s] (16 bytes loaded, 12 are used)
+        vld1.8          {q9},     [r0], r1      @ p[-s]
+        vld1.8          {q10},    [r0], r1      @ p[0]
+        vld1.8          {q11},    [r0]          @ p[s], no writeback: r0 stays at yuv + stride
+        vp6_edge_filter                         @ q9, q10 = filtered p[-s], p[0]
+        sub             r0,  r0,  r1,  lsl #1   @ r0 = yuv - stride (row p[-s])
+        sub             r1,  r1,  #8            @ stride - 8: compensates the 8-byte writeback below
+        vst1.8          {d18},    [r0]!         @ p[-s] px 0-7, r0 += 8
+        vst1.32         {d19[0]}, [r0], r1      @ p[-s] px 8-11, then r0 = yuv
+        vst1.8          {d20},    [r0]!         @ p[0] px 0-7, r0 += 8
+        vst1.32         {d21[0]}, [r0]          @ p[0] px 8-11
+        bx              lr                      @ return
+endfunc
+
+function ff_vp6_edge_filter_hor_neon, export=1  @ taps one byte apart, 12 rows; r0 = yuv, r1 = stride, r2 = t (per the C prototype)
+        sub             r3,  r0,  #1            @ r3 = yuv - 1: store pointer (bytes p[-1], p[0] of each row)
+        sub             r0,  r0,  #2            @ r0 = yuv - 2: load pointer (bytes p[-2] .. p[1] of each row)
+        vld1.32         {d16[0]}, [r0], r1      @ row 0: 4 bytes, then step one row
+        vld1.32         {d18[0]}, [r0], r1      @ row 1
+        vld1.32         {d20[0]}, [r0], r1      @ row 2
+        vld1.32         {d22[0]}, [r0], r1      @ row 3
+        vld1.32         {d16[1]}, [r0], r1      @ row 4
+        vld1.32         {d18[1]}, [r0], r1      @ row 5
+        vld1.32         {d20[1]}, [r0], r1      @ row 6
+        vld1.32         {d22[1]}, [r0], r1      @ row 7
+        vld1.32         {d17[0]}, [r0], r1      @ row 8
+        vld1.32         {d19[0]}, [r0], r1      @ row 9
+        vld1.32         {d21[0]}, [r0], r1      @ row 10
+        vld1.32         {d23[0]}, [r0], r1      @ row 11
+        vtrn.8          q8,  q9                 @ transpose each 4x4 byte group, step 1 of 2
+        vtrn.8          q10, q11                @ (continued)
+        vtrn.16         q8,  q10                @ step 2: q8..q11 now hold columns p[-2], p[-1], p[0], p[1]
+        vtrn.16         q9,  q11                @ (continued) byte k of each register belongs to row k
+        vp6_edge_filter                         @ q9, q10 = filtered p[-1], p[0]
+        vtrn.8          q9,  q10                @ pair up bytes: q9 16-bit lanes = even rows, q10 = odd rows
+        vst1.16         {d18[0]}, [r3], r1      @ row 0: 2 bytes, then step one row
+        vst1.16         {d20[0]}, [r3], r1      @ row 1
+        vst1.16         {d18[1]}, [r3], r1      @ row 2
+        vst1.16         {d20[1]}, [r3], r1      @ row 3
+        vst1.16         {d18[2]}, [r3], r1      @ row 4
+        vst1.16         {d20[2]}, [r3], r1      @ row 5
+        vst1.16         {d18[3]}, [r3], r1      @ row 6
+        vst1.16         {d20[3]}, [r3], r1      @ row 7
+        vst1.16         {d19[0]}, [r3], r1      @ row 8
+        vst1.16         {d21[0]}, [r3], r1      @ row 9
+        vst1.16         {d19[1]}, [r3], r1      @ row 10
+        vst1.16         {d21[1]}, [r3], r1      @ row 11
+        bx              lr                      @ return
+endfunc
diff --git a/libavcodec/vp56dsp.c b/libavcodec/vp56dsp.c
index 991c90a..ac6a400 100644
--- a/libavcodec/vp56dsp.c
+++ b/libavcodec/vp56dsp.c
@@ -83,4 +83,6 @@ void ff_vp56dsp_init(VP56DSPContext *s, enum CodecID codec)
         s->edge_filter_hor = vp6_edge_filter_hor;
         s->edge_filter_ver = vp6_edge_filter_ver;
     }
+
+    if (ARCH_ARM) ff_vp56dsp_init_arm(s, codec);
 }
diff --git a/libavcodec/vp56dsp.h b/libavcodec/vp56dsp.h
index dbc84b1..2d6941f 100644
--- a/libavcodec/vp56dsp.h
+++ b/libavcodec/vp56dsp.h
@@ -29,5 +29,6 @@ typedef struct VP56DSPContext {
 } VP56DSPContext;
 
 void ff_vp56dsp_init(VP56DSPContext *s, enum CodecID codec);
+void ff_vp56dsp_init_arm(VP56DSPContext *s, enum CodecID codec);
 
 #endif /* AVCODEC_VP56DSP_H */
-- 
1.7.0.4




More information about the ffmpeg-devel mailing list