[FFmpeg-cvslog] VP8: ARM optimised decode_block_coeffs_internal
Mans Rullgard
git
Sun Feb 13 01:16:14 CET 2011
ffmpeg | branch: master | Mans Rullgard <mans at mansr.com> | Tue Feb 1 22:38:15 2011 +0000| [4ae3ee4ae9d1607d772f9c8e6fe9b167e6761ae4] | committer: Michael Niedermayer
VP8: ARM optimised decode_block_coeffs_internal
Approximately 5% faster on Cortex-A8.
Signed-off-by: Mans Rullgard <mans at mansr.com>
(cherry picked from commit a7878c9f73c12cfa685bd8af8f3afcca85f56a8b)
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=4ae3ee4ae9d1607d772f9c8e6fe9b167e6761ae4
---
Makefile | 2 +-
libavcodec/arm/Makefile | 2 +
libavcodec/arm/vp8.h | 29 ++++++
libavcodec/arm/vp8_armv6.S | 220 ++++++++++++++++++++++++++++++++++++++++++++
libavcodec/vp8.c | 8 ++-
libavcodec/vp8data.h | 2 +-
6 files changed, 260 insertions(+), 3 deletions(-)
diff --git a/Makefile b/Makefile
index 356ba29..b059955 100644
--- a/Makefile
+++ b/Makefile
@@ -66,7 +66,7 @@ config.h: .config
SUBDIR_VARS := OBJS FFLIBS CLEANFILES DIRS TESTPROGS EXAMPLES SKIPHEADERS \
ALTIVEC-OBJS MMX-OBJS NEON-OBJS X86-OBJS YASM-OBJS-FFT YASM-OBJS \
- HOSTPROGS BUILT_HEADERS TESTOBJS ARCH_HEADERS
+ HOSTPROGS BUILT_HEADERS TESTOBJS ARCH_HEADERS ARMV6-OBJS
define RESET
$(1) :=
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 15269ea..d223703 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -3,6 +3,7 @@ OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o \
OBJS-$(CONFIG_VP5_DECODER) += arm/vp56dsp_init_arm.o
OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_init_arm.o
OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_init_arm.o
+ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o
OBJS-$(CONFIG_H264DSP) += arm/h264dsp_init_arm.o
OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
@@ -23,6 +24,7 @@ OBJS-$(HAVE_ARMV5TE) += arm/dsputil_init_armv5te.o \
OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \
arm/dsputil_armv6.o \
arm/simple_idct_armv6.o \
+ $(ARMV6-OBJS-yes)
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \
diff --git a/libavcodec/arm/vp8.h b/libavcodec/arm/vp8.h
new file mode 100644
index 0000000..90e7344
--- /dev/null
+++ b/libavcodec/arm/vp8.h
@@ -0,0 +1,29 @@
+/**
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_VP8_H
+#define AVCODEC_ARM_VP8_H
+
+#if HAVE_ARMV6
+#define decode_block_coeffs_internal ff_decode_block_coeffs_armv6 /* vp8.c compiles its C version only when this name is not defined */
+int ff_decode_block_coeffs_armv6(VP56RangeCoder *rc, DCTELEM block[16],
+ uint8_t probs[8][3][NUM_DCT_TOKENS-1],
+ int i, uint8_t *token_prob, int16_t qmul[2]);
+#endif /* HAVE_ARMV6 */
+
+#endif /* AVCODEC_ARM_VP8_H */
diff --git a/libavcodec/arm/vp8_armv6.S b/libavcodec/arm/vp8_armv6.S
new file mode 100644
index 0000000..54c036b
--- /dev/null
+++ b/libavcodec/arm/vp8_armv6.S
@@ -0,0 +1,220 @@
+/**
+ * Copyright (C) 2010 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+ .syntax unified
+
+.macro rac_get_prob h, bs, buf, cw, pr, t0, t1 @ read one range-coded bit with probability pr; result is left in the flags (GE = 1, LT = 0)
+ adds \bs, \bs, \t0 @ t0 holds the shift count on entry; carry out of this add gates the cs refill below
+ lsl \cw, \cw, \t0 @ shift the code word by the same count
+ lsl \t0, \h, \t0 @ t0 = high shifted by the count
+ rsb \h, \pr, #256 @ h = 256 - pr
+ ldrhcs \t1, [\buf], #2 @ refill: fetch 16 bits and advance buf by 2
+ smlabb \h, \t0, \pr, \h @ h = t0 * pr + 256 - pr (16-bit operands)
+ rev16cs \t1, \t1 @ refill: swap the two bytes of the fetched halfword
+ orrcs \cw, \cw, \t1, lsl \bs @ refill: merge the new bits into the code word
+ subcs \bs, \bs, #16 @ refill: account for the 16 bits just added
+ lsr \h, \h, #8 @ h = split point
+ cmp \cw, \h, lsl #16 @ these flags are what the caller tests after the macro
+ subge \cw, \cw, \h, lsl #16 @ GE: remove the split from the code word
+ subge \h, \t0, \h @ GE: high = t0 - split; otherwise high stays at split
+.endm
+
+.macro rac_get_128 h, bs, buf, cw, t0, t1 @ same steps as rac_get_prob with the probability fixed at 128; result in the flags (GE = 1, LT = 0)
+ adds \bs, \bs, \t0 @ t0 holds the shift count on entry; carry out of this add gates the cs refill below
+ lsl \cw, \cw, \t0 @ shift the code word by the same count
+ lsl \t0, \h, \t0 @ t0 = high shifted by the count
+ ldrhcs \t1, [\buf], #2 @ refill: fetch 16 bits and advance buf by 2
+ mov \h, #128
+ rev16cs \t1, \t1 @ refill: swap the two bytes of the fetched halfword
+ add \h, \h, \t0, lsl #7 @ h = 128 + t0 * 128
+ orrcs \cw, \cw, \t1, lsl \bs @ refill: merge the new bits into the code word
+ subcs \bs, \bs, #16 @ refill: account for the 16 bits just added
+ lsr \h, \h, #8 @ h = split point
+ cmp \cw, \h, lsl #16 @ these flags are what the caller tests after the macro
+ subge \cw, \cw, \h, lsl #16 @ GE: remove the split from the code word
+ subge \h, \t0, \h @ GE: high = t0 - split; otherwise high stays at split
+.endm
+
+function ff_decode_block_coeffs_armv6, export=1 @ args per the C prototype: r0 = rc, r1 = block, r2 = probs, r3 = i, then token_prob and qmul on the stack; returns the final index
+ push {r0,r1,r4-r11,lr} @ 11 words = 44 bytes; rc and block end up at sp and sp + 4
+ movrel lr, ff_vp56_norm_shift @ lr = table that gives the shift count for a given high
+ ldrd r4, r5, [sp, #44] @ token_prob, qmul
+ cmp r3, #0 @ flags for the pkhtbne below
+ ldr r11, [r5] @ r11 = both 16-bit qmul entries in one word
+ ldm r0, {r5-r7} @ high, bits, buf
+ pkhtbne r11, r11, r11, asr #16 @ i != 0: copy the top half of r11 into its bottom half
+ ldr r8, [r0, #16] @ code_word
+0: @ loop top: test token_prob[1]
+ ldrb r9, [lr, r5] @ r9 = shift count for the current high (t0 input of the macro)
+ add r3, r3, #1 @ advance the coefficient index
+ ldrb r0, [r4, #1]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ blt 2f @ bit 0: nothing is stored for this index
+
+ ldrb r9, [lr, r5]
+ ldrb r0, [r4, #2]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ ldrb r9, [lr, r5] @ shift count for whichever read comes next
+ bge 3f @ bit 1: larger magnitude, decoded at the first 3:
+
+ add r4, r3, r3, lsl #5 @ r4 = 33 * r3
+ sxth r12, r11 @ r12 = sign-extended low half of r11, the value stored for magnitude 1
+ add r4, r2, r4 @ r4 = probs + 33 * r3
+ adds r6, r6, r9 @ from here to the second subge: the rac_get_128 steps written inline, interleaved with other work
+ add r4, r4, #11 @ next token_prob = probs + 33 * r3 + 11
+ lsl r8, r8, r9
+ ldrhcs r10, [r7], #2
+ lsl r9, r5, r9
+ mov r5, #128
+ rev16cs r10, r10
+ add r5, r5, r9, lsl #7
+ orrcs r8, r8, r10, lsl r6
+ subcs r6, r6, #16
+ lsr r5, r5, #8
+ cmp r8, r5, lsl #16 @ GE here means the sign bit is 1
+ movrel r10, zigzag_scan-1 @ base minus 1 because r3 was already incremented
+ subge r8, r8, r5, lsl #16
+ subge r5, r9, r5
+ ldrb r10, [r10, r3] @ r10 = byte offset of this coefficient within block
+ rsbge r12, r12, #0 @ sign bit 1: negate
+ cmp r3, #16
+ strh r12, [r1, r10]
+ bge 6f @ index reached 16: finished
+5: @ after a stored coefficient: test token_prob[0]
+ ldrb r9, [lr, r5]
+ ldrb r0, [r4]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ pkhtb r11, r11, r11, asr #16 @ from now on the low half of r11 equals its high half
+ bge 0b @ bit 1: keep decoding; bit 0 falls through to the exit
+
+6: @ exit: write the range coder state back and return
+ ldr r0, [sp] @ r0 = rc as saved by the initial push
+ ldr r9, [r0, #12] @ presumably the end-of-buffer pointer in rc; confirm against the VP56RangeCoder layout
+ cmp r7, r9
+ movhi r7, r9 @ clamp buf to that value
+ stm r0, {r5-r7} @ high, bits, buf
+ str r8, [r0, #16] @ code_word
+
+ add sp, sp, #8 @ discard the saved r0 and r1
+ mov r0, r3 @ return value = final coefficient index
+ pop {r4-r11,pc}
+2: @ token_prob[1] bit was 0
+ add r4, r3, r3, lsl #5 @ r4 = 33 * r3
+ cmp r3, #16
+ add r4, r2, r4 @ next token_prob = probs + 33 * r3
+ pkhtb r11, r11, r11, asr #16 @ low half of r11 = high half
+ bne 0b @ re-enter at 0:, which does not test token_prob[0]
+ b 6b
+3: @ magnitude is at least 2
+ ldrb r0, [r4, #3]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ ldrb r9, [lr, r5]
+ bge 1f
+
+ mov r12, #2
+ ldrb r0, [r4, #4]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ addge r12, #1
+ ldrb r9, [lr, r5]
+ blt 4f @ magnitude 2
+ ldrb r0, [r4, #5]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ addge r12, #1 @ magnitude 3 or 4
+ ldrb r9, [lr, r5]
+ b 4f
+1:
+ ldrb r0, [r4, #6]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ ldrb r9, [lr, r5]
+ bge 3f @ bit 1: table-driven case at the second 3:
+
+ ldrb r0, [r4, #7]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ ldrb r9, [lr, r5]
+ bge 2f
+
+ mov r12, #5
+ mov r0, #159 @ fixed probability for the extra bit
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ addge r12, r12, #1 @ magnitude 5 or 6
+ ldrb r9, [lr, r5]
+ b 4f
+2:
+ mov r12, #7
+ mov r0, #165 @ fixed probability for the upper extra bit
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ addge r12, r12, #2
+ ldrb r9, [lr, r5]
+ mov r0, #145 @ fixed probability for the lower extra bit
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ addge r12, r12, #1 @ magnitude 7 to 10
+ ldrb r9, [lr, r5]
+ b 4f
+3:
+ ldrb r0, [r4, #8] @ bit a
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ addge r4, r4, #1 @ a = 1: bias r4 so the next load reads token_prob[10] instead of [9]
+ ldrb r9, [lr, r5]
+ movge r12, #2 @ r12 = 2 * a
+ movlt r12, #0
+ ldrb r0, [r4, #9] @ bit b
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ mov r9, #8
+ addge r12, r12, #1 @ r12 = cat = 2 * a + b
+ movrel r4, ff_vp8_dct_cat_prob
+ lsl r9, r9, r12 @ r9 = 8 << cat
+ ldr r4, [r4, r12, lsl #2] @ r4 = pointer stored in entry cat of the table
+ add r12, r9, #3 @ base magnitude = 3 + (8 << cat)
+ mov r1, #0 @ r1 is reused as the extra-bits accumulator; block is reloaded below
+ ldrb r0, [r4], #1 @ first probability of the list
+1: @ one bit per list entry, most significant first
+ ldrb r9, [lr, r5]
+ lsl r1, r1, #1
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ ldrb r0, [r4], #1 @ fetch the next probability
+ addge r1, r1, #1
+ cmp r0, #0 @ a zero byte ends the list
+ bne 1b
+ ldrb r9, [lr, r5]
+ add r12, r12, r1 @ add the extra bits to the base magnitude
+ ldr r1, [sp, #4] @ restore block as saved by the initial push
+4: @ r12 = magnitude: read the sign, scale and store
+ add r4, r3, r3, lsl #5 @ r4 = 33 * r3
+ add r4, r2, r4
+ add r4, r4, #22 @ next token_prob = probs + 33 * r3 + 22
+ rac_get_128 r5, r6, r7, r8, r9, r10
+ rsbge r12, r12, #0 @ sign bit 1: negate
+ smulbb r12, r12, r11 @ multiply by the low half of r11
+ movrel r9, zigzag_scan-1 @ base minus 1 because r3 was already incremented
+ ldrb r9, [r9, r3] @ r9 = byte offset of this coefficient within block
+ cmp r3, #16
+ strh r12, [r1, r9]
+ bge 6b @ index reached 16: finished
+ b 5b
+endfunc
+
+ .section .rodata
+zigzag_scan: @ byte offsets into block (16-bit elements), read with the coefficient index and used as the strh offset
+ .byte 0, 2, 8, 16 @ element indices 0, 1, 4, 8
+ .byte 10, 4, 6, 12 @ element indices 5, 2, 3, 6
+ .byte 18, 24, 26, 20 @ element indices 9, 12, 13, 10
+ .byte 14, 22, 28, 30 @ element indices 7, 11, 14, 15
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 8de8968..3cd7624 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -30,6 +30,10 @@
#include "h264pred.h"
#include "rectangle.h"
+#if ARCH_ARM
+# include "arm/vp8.h"
+#endif
+
typedef struct {
uint8_t filter_level;
uint8_t inner_limit;
@@ -801,6 +805,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_
}
}
+#ifndef decode_block_coeffs_internal
/**
* @param c arithmetic bitstream reader context
* @param block destination for block coefficients
@@ -854,7 +859,7 @@ skip_eob:
int b = vp56_rac_get_prob(c, token_prob[9+a]);
int cat = (a<<1) + b;
coeff = 3 + (8<<cat);
- coeff += vp8_rac_get_coeff(c, vp8_dct_cat_prob[cat]);
+ coeff += vp8_rac_get_coeff(c, ff_vp8_dct_cat_prob[cat]);
}
}
token_prob = probs[i+1][2];
@@ -864,6 +869,7 @@ skip_eob:
return i;
}
+#endif
static av_always_inline
int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
diff --git a/libavcodec/vp8data.h b/libavcodec/vp8data.h
index 5326e21..472b37a 100644
--- a/libavcodec/vp8data.h
+++ b/libavcodec/vp8data.h
@@ -313,7 +313,7 @@ static const uint8_t vp8_dct_cat5_prob[] = { 180, 157, 141, 134, 130, 0 };
static const uint8_t vp8_dct_cat6_prob[] = { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
// only used for cat3 and above; cat 1 and 2 are referenced directly
-static const uint8_t * const vp8_dct_cat_prob[] =
+const uint8_t * const ff_vp8_dct_cat_prob[] =
{
vp8_dct_cat3_prob,
vp8_dct_cat4_prob,
More information about the ffmpeg-cvslog
mailing list