[FFmpeg-cvslog] arm: Add VFP-accelerated version of fft16
Martin Storsjö
git at videolan.org
Mon Jul 22 12:11:34 CEST 2013
ffmpeg | branch: master | Martin Storsjö <martin at martin.st> | Fri Jul 19 11:23:57 2013 +0300| [8b9eba664edaddf9a304d3acbf0388b5c520781d] | committer: Martin Storsjö
arm: Add VFP-accelerated version of fft16
Before After
Mean StdDev Mean StdDev Change
This function 1389.3 4.2 967.8 35.1 +43.6%
Overall 15577.5 83.2 15400.0 336.4 +1.2%
Signed-off-by: Martin Storsjö <martin at martin.st>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=8b9eba664edaddf9a304d3acbf0388b5c520781d
---
libavcodec/arm/Makefile | 1 +
libavcodec/arm/fft_vfp.S | 298 +++++++++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mdct_vfp.S | 5 +-
3 files changed, 301 insertions(+), 3 deletions(-)
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 9bb8795..e941aaa 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -54,6 +54,7 @@ ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \
arm/synth_filter_vfp.o
+VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
new file mode 100644
index 0000000..7845ebb
--- /dev/null
+++ b/libavcodec/arm/fft_vfp.S
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison at riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ TODO: * FFTs wider than 16
+@ * dispatch code
+
+function fft4_vfp
+ vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
+ vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
+ vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
+ vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
+ @ stall
+ vadd.f s12, s0, s8 @ i0
+ vadd.f s13, s1, s9 @ i1
+ vadd.f s14, s2, s10 @ i2
+ vadd.f s15, s3, s11 @ i3
+ vsub.f s8, s0, s8 @ i4
+ vsub.f s9, s1, s9 @ i5
+ vsub.f s10, s2, s10 @ i6
+ vsub.f s11, s3, s11 @ i7
+ @ stall
+ @ stall
+ vadd.f s0, s12, s14 @ z[0].re
+ vsub.f s4, s12, s14 @ z[2].re
+ vadd.f s1, s13, s15 @ z[0].im
+ vsub.f s5, s13, s15 @ z[2].im
+ vadd.f s7, s9, s10 @ z[3].im
+ vsub.f s3, s9, s10 @ z[1].im
+ vadd.f s2, s8, s11 @ z[1].re
+ vsub.f s6, s8, s11 @ z[3].re
+ @ stall
+ @ stall
+ vstr d0, [a1, #0*2*4]
+ vstr d2, [a1, #2*2*4]
+ @ stall
+ @ stall
+ vstr d1, [a1, #1*2*4]
+ vstr d3, [a1, #3*2*4]
+
+ bx lr
+endfunc
+
+.macro macro_fft8_head
+ @ FFT4
+ vldr d4, [a1, #0 * 2*4]
+ vldr d6, [a1, #1 * 2*4]
+ vldr d5, [a1, #2 * 2*4]
+ vldr d7, [a1, #3 * 2*4]
+ @ BF
+ vldr d12, [a1, #4 * 2*4]
+ vadd.f s16, s8, s12 @ vector op
+ vldr d14, [a1, #5 * 2*4]
+ vldr d13, [a1, #6 * 2*4]
+ vldr d15, [a1, #7 * 2*4]
+ vsub.f s20, s8, s12 @ vector op
+ vadd.f s0, s16, s18
+ vsub.f s2, s16, s18
+ vadd.f s1, s17, s19
+ vsub.f s3, s17, s19
+ vadd.f s7, s21, s22
+ vsub.f s5, s21, s22
+ vadd.f s4, s20, s23
+ vsub.f s6, s20, s23
+ vsub.f s20, s24, s28 @ vector op
+ vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
+ vstr d1, [a1, #1 * 2*4]
+ vldr s0, cos1pi4
+ vadd.f s16, s24, s28 @ vector op
+ vstr d2, [a1, #2 * 2*4]
+ vstr d3, [a1, #3 * 2*4]
+ vldr d12, [a1, #0 * 2*4]
+ @ TRANSFORM
+ vmul.f s20, s20, s0 @ vector x scalar op
+ vldr d13, [a1, #1 * 2*4]
+ vldr d14, [a1, #2 * 2*4]
+ vldr d15, [a1, #3 * 2*4]
+ @ BUTTERFLIES
+ vadd.f s0, s18, s16
+ vadd.f s1, s17, s19
+ vsub.f s2, s17, s19
+ vsub.f s3, s18, s16
+ vadd.f s4, s21, s20
+ vsub.f s5, s21, s20
+ vadd.f s6, s22, s23
+ vsub.f s7, s22, s23
+ vadd.f s8, s0, s24 @ vector op
+ vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
+ vstr d1, [a1, #1 * 2*4]
+ vldr d6, [a1, #0 * 2*4]
+ vldr d7, [a1, #1 * 2*4]
+ vadd.f s1, s5, s6
+ vadd.f s0, s7, s4
+ vsub.f s2, s5, s6
+ vsub.f s3, s7, s4
+ vsub.f s12, s24, s12 @ vector op
+ vsub.f s5, s29, s1
+ vsub.f s4, s28, s0
+ vsub.f s6, s30, s2
+ vsub.f s7, s31, s3
+ vadd.f s16, s0, s28 @ vector op
+ vstr d6, [a1, #4 * 2*4]
+ vstr d7, [a1, #6 * 2*4]
+ vstr d4, [a1, #0 * 2*4]
+ vstr d5, [a1, #2 * 2*4]
+ vstr d2, [a1, #5 * 2*4]
+ vstr d3, [a1, #7 * 2*4]
+.endm
+
+.macro macro_fft8_tail
+ vstr d8, [a1, #1 * 2*4]
+ vstr d9, [a1, #3 * 2*4]
+.endm
+
+function fft8_vfp
+ ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
+ fmrx a2, FPSCR
+ fmxr FPSCR, a3
+ vpush {s16-s31}
+
+ macro_fft8_head
+ macro_fft8_tail
+
+ vpop {s16-s31}
+ fmxr FPSCR, a2
+ bx lr
+endfunc
+
+.align 3
+cos1pi4: @ cos(1*pi/4) = sqrt(2)
+ .float 0.707106769084930419921875
+cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
+ .float 0.92387950420379638671875
+cos3pi8: @ cos(2*pi/8) = sqrt(2-sqrt(2))/2
+ .float 0.3826834261417388916015625
+
+function ff_fft16_vfp, export=1
+ ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
+ fmrx a2, FPSCR
+ fmxr FPSCR, a3
+ vpush {s16-s31}
+
+ macro_fft8_head
+ @ FFT4(z+8)
+ vldr d10, [a1, #8 * 2*4]
+ vldr d12, [a1, #9 * 2*4]
+ vldr d11, [a1, #10 * 2*4]
+ vldr d13, [a1, #11 * 2*4]
+ macro_fft8_tail
+ vadd.f s16, s20, s24 @ vector op
+ @ FFT4(z+12)
+ vldr d4, [a1, #12 * 2*4]
+ vldr d6, [a1, #13 * 2*4]
+ vldr d5, [a1, #14 * 2*4]
+ vsub.f s20, s20, s24 @ vector op
+ vldr d7, [a1, #15 * 2*4]
+ vadd.f s0, s16, s18
+ vsub.f s4, s16, s18
+ vadd.f s1, s17, s19
+ vsub.f s5, s17, s19
+ vadd.f s7, s21, s22
+ vsub.f s3, s21, s22
+ vadd.f s2, s20, s23
+ vsub.f s6, s20, s23
+ vadd.f s16, s8, s12 @ vector op
+ vstr d0, [a1, #8 * 2*4]
+ vstr d2, [a1, #10 * 2*4]
+ vstr d1, [a1, #9 * 2*4]
+ vsub.f s20, s8, s12
+ vstr d3, [a1, #11 * 2*4]
+ @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
+ vldr d12, [a1, #10 * 2*4]
+ vadd.f s0, s16, s18
+ vadd.f s1, s17, s19
+ vsub.f s6, s16, s18
+ vsub.f s7, s17, s19
+ vsub.f s3, s21, s22
+ vadd.f s2, s20, s23
+ vadd.f s5, s21, s22
+ vsub.f s4, s20, s23
+ vstr d0, [a1, #12 * 2*4]
+ vmov s0, s6
+ @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
+ vldr d6, [a1, #9 * 2*4]
+ vstr d1, [a1, #13 * 2*4]
+ vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
+ vstr d2, [a1, #15 * 2*4]
+ vldr d7, [a1, #13 * 2*4]
+ vadd.f s4, s25, s24
+ vsub.f s5, s25, s24
+ vsub.f s6, s0, s7
+ vadd.f s7, s0, s7
+ vmul.f s20, s12, s3 @ vector op
+ @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
+ vldr d4, [a1, #11 * 2*4]
+ vldr d5, [a1, #15 * 2*4]
+ vldr s1, cos3pi8
+ vmul.f s24, s4, s2 @ vector * scalar op
+ vmul.f s28, s12, s1 @ vector * scalar op
+ vmul.f s12, s8, s1 @ vector * scalar op
+ vadd.f s4, s20, s29
+ vsub.f s5, s21, s28
+ vsub.f s6, s22, s31
+ vadd.f s7, s23, s30
+ vmul.f s8, s8, s3 @ vector * scalar op
+ vldr d8, [a1, #1 * 2*4]
+ vldr d9, [a1, #5 * 2*4]
+ vldr d10, [a1, #3 * 2*4]
+ vldr d11, [a1, #7 * 2*4]
+ vldr d14, [a1, #2 * 2*4]
+ vadd.f s0, s6, s4
+ vadd.f s1, s5, s7
+ vsub.f s2, s5, s7
+ vsub.f s3, s6, s4
+ vadd.f s4, s12, s9
+ vsub.f s5, s13, s8
+ vsub.f s6, s14, s11
+ vadd.f s7, s15, s10
+ vadd.f s12, s0, s16 @ vector op
+ vstr d0, [a1, #1 * 2*4]
+ vstr d1, [a1, #5 * 2*4]
+ vldr d4, [a1, #1 * 2*4]
+ vldr d5, [a1, #5 * 2*4]
+ vadd.f s0, s6, s4
+ vadd.f s1, s5, s7
+ vsub.f s2, s5, s7
+ vsub.f s3, s6, s4
+ vsub.f s8, s16, s8 @ vector op
+ vstr d6, [a1, #1 * 2*4]
+ vstr d7, [a1, #5 * 2*4]
+ vldr d15, [a1, #6 * 2*4]
+ vsub.f s4, s20, s0
+ vsub.f s5, s21, s1
+ vsub.f s6, s22, s2
+ vsub.f s7, s23, s3
+ vadd.f s20, s0, s20 @ vector op
+ vstr d4, [a1, #9 * 2*4]
+ @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
+ vldr d6, [a1, #8 * 2*4]
+ vstr d5, [a1, #13 * 2*4]
+ vldr d7, [a1, #12 * 2*4]
+ vstr d2, [a1, #11 * 2*4]
+ vldr d8, [a1, #0 * 2*4]
+ vstr d3, [a1, #15 * 2*4]
+ vldr d9, [a1, #4 * 2*4]
+ vadd.f s0, s26, s24
+ vadd.f s1, s25, s27
+ vsub.f s2, s25, s27
+ vsub.f s3, s26, s24
+ vadd.f s4, s14, s12
+ vadd.f s5, s13, s15
+ vsub.f s6, s13, s15
+ vsub.f s7, s14, s12
+ vadd.f s8, s0, s28 @ vector op
+ vstr d0, [a1, #3 * 2*4]
+ vstr d1, [a1, #7 * 2*4]
+ vldr d6, [a1, #3 * 2*4]
+ vldr d7, [a1, #7 * 2*4]
+ vsub.f s0, s16, s4
+ vsub.f s1, s17, s5
+ vsub.f s2, s18, s6
+ vsub.f s3, s19, s7
+ vsub.f s12, s28, s12 @ vector op
+ vadd.f s16, s4, s16 @ vector op
+ vstr d10, [a1, #3 * 2*4]
+ vstr d11, [a1, #7 * 2*4]
+ vstr d4, [a1, #2 * 2*4]
+ vstr d5, [a1, #6 * 2*4]
+ vstr d0, [a1, #8 * 2*4]
+ vstr d1, [a1, #12 * 2*4]
+ vstr d6, [a1, #10 * 2*4]
+ vstr d7, [a1, #14 * 2*4]
+ vstr d8, [a1, #0 * 2*4]
+ vstr d9, [a1, #4 * 2*4]
+
+ vpop {s16-s31}
+ fmxr FPSCR, a2
+ bx lr
+endfunc
diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
index 7413a41..0623e96 100644
--- a/libavcodec/arm/mdct_vfp.S
+++ b/libavcodec/arm/mdct_vfp.S
@@ -174,9 +174,8 @@ function ff_imdct_half_vfp, export=1
.endr
fmxr FPSCR, OLDFPSCR
- mov ORIGOUT, OUT
- ldr ip, [CONTEXT, #9*4]
- blx ip @ s->fft_calc(s, output)
+ mov a1, OUT
+ bl ff_fft16_vfp
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, lr
More information about the ffmpeg-cvslog
mailing list