[FFmpeg-devel] [PATCH v4] SH4 mpegaudio decoder optimizations

Guennadi Liakhovetski g.liakhovetski
Tue Feb 1 18:29:38 CET 2011


Hi all

This is version 4 of the following patch:

This patch implements several mpegaudio optimizations for SH4 FPU-enabled 
SoCs. Verified to provide more than 45% acceleration, when decoding a 
128kbps stereo mp3 audio file to /dev/null.
---

Yes, I know, the .S file is all indented with "evil" TABs... Please, let's 
review it this way, I'll convert it to spaces once accepted. Would have 
been too much trouble for me to develop with spaces...

v4: switched to a out-of-line .S assembly implementation

v3, v2, v1: were inline, changelog irrelevant

Thanks to all who commented!

diff --git a/libavcodec/mpc.c b/libavcodec/mpc.c
index d9a1fb7..33b1692 100644
--- a/libavcodec/mpc.c
+++ b/libavcodec/mpc.c
@@ -51,7 +51,7 @@ static void mpc_synth(MPCContext *c, int16_t *out, int channels)
     for(ch = 0;  ch < channels; ch++){
         samples_ptr = samples + ch;
         for(i = 0; i < SAMPLES_PER_BAND; i++) {
-            ff_mpa_synth_filter(c->synth_buf[ch], &(c->synth_buf_offset[ch]),
+            ff_mpa_synth_filter(NULL, c->synth_buf[ch], &(c->synth_buf_offset[ch]),
                                 ff_mpa_synth_window, &dither_state,
                                 samples_ptr, channels,
                                 c->sb_samples[ch][i]);
diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h
index 97c7855..f1a05e8 100644
--- a/libavcodec/mpegaudio.h
+++ b/libavcodec/mpegaudio.h
@@ -175,7 +175,8 @@ int ff_mpa_l2_select_table(int bitrate, int nb_channels, int freq, int lsf);
 int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bitrate);
 extern MPA_INT ff_mpa_synth_window[];
 void ff_mpa_synth_init(MPA_INT *window);
-void ff_mpa_synth_filter(MPA_INT *synth_buf_ptr, int *synth_buf_offset,
+void ff_mpa_synth_filter(MPADecodeContext *s,
+                         MPA_INT *synth_buf_ptr, int *synth_buf_offset,
                          MPA_INT *window, int *dither_state,
                          OUT_INT *samples, int incr,
                          INTFLOAT sb_samples[SBLIMIT]);
@@ -189,6 +190,7 @@ void ff_mpa_synth_filter_float(MPADecodeContext *s,
 
 void ff_mpegaudiodec_init_mmx(MPADecodeContext *s);
 void ff_mpegaudiodec_init_altivec(MPADecodeContext *s);
+void ff_mpegaudiodec_init_sh4(MPADecodeContext *s);
 
 /* fast header check for resync */
 static inline int ff_mpa_check_header(uint32_t header){
diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index 769be89..27ca635 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -329,6 +329,10 @@ static av_cold int decode_init(AVCodecContext * avctx)
 #endif
     if (HAVE_ALTIVEC && CONFIG_FLOAT) ff_mpegaudiodec_init_altivec(s);
 
+#if ARCH_SH4 && !CONFIG_FLOAT
+    ff_mpegaudiodec_init_sh4(s);
+#endif
+
     avctx->sample_fmt= OUT_FMT;
     s->error_recognition= avctx->error_recognition;
 
@@ -702,7 +706,8 @@ static void apply_window_mp3_c(MPA_INT *synth_buf, MPA_INT *window,
    32 samples. */
 /* XXX: optimize by avoiding ring buffer usage */
 #if !CONFIG_FLOAT
-void ff_mpa_synth_filter(MPA_INT *synth_buf_ptr, int *synth_buf_offset,
+void ff_mpa_synth_filter(MPADecodeContext *s,
+                         MPA_INT *synth_buf_ptr, int *synth_buf_offset,
                          MPA_INT *window, int *dither_state,
                          OUT_INT *samples, int incr,
                          INTFLOAT sb_samples[SBLIMIT])
@@ -728,7 +733,10 @@ void ff_mpa_synth_filter(MPA_INT *synth_buf_ptr, int *synth_buf_offset,
     dct32(synth_buf, sb_samples);
 #endif
 
-    apply_window_mp3_c(synth_buf, window, dither_state, samples, incr);
+    if (s)
+        s->apply_window_mp3(synth_buf, window, dither_state, samples, incr);
+    else
+        apply_window_mp3_c(synth_buf, window, dither_state, samples, incr);
 
     offset = (offset - 32) & 511;
     *synth_buf_offset = offset;
@@ -2013,9 +2021,7 @@ static int mp_decode_frame(MPADecodeContext *s,
         samples_ptr = samples + ch;
         for(i=0;i<nb_frames;i++) {
             RENAME(ff_mpa_synth_filter)(
-#if CONFIG_FLOAT
                          s,
-#endif
                          s->synth_buf[ch], &(s->synth_buf_offset[ch]),
                          RENAME(ff_mpa_synth_window), &s->dither_state,
                          samples_ptr, s->nb_channels,
diff --git a/libavcodec/qdm2.c b/libavcodec/qdm2.c
index efcf6b5..94e3b07 100644
--- a/libavcodec/qdm2.c
+++ b/libavcodec/qdm2.c
@@ -1616,7 +1616,7 @@ static void qdm2_synthesis_filter (QDM2Context *q, int index)
         OUT_INT *samples_ptr = samples + ch;
 
         for (i = 0; i < 8; i++) {
-            ff_mpa_synth_filter(q->synth_buf[ch], &(q->synth_buf_offset[ch]),
+            ff_mpa_synth_filter(NULL, q->synth_buf[ch], &(q->synth_buf_offset[ch]),
                 ff_mpa_synth_window, &dither_state,
                 samples_ptr, q->nb_channels,
                 q->sb_samples[ch][(8 * index) + i]);
diff --git a/libavcodec/sh4/Makefile b/libavcodec/sh4/Makefile
index 142cba4..26a1e83 100644
--- a/libavcodec/sh4/Makefile
+++ b/libavcodec/sh4/Makefile
@@ -1,3 +1,4 @@
 OBJS                                   += sh4/dsputil_align.o           \
                                           sh4/dsputil_sh4.o             \
                                           sh4/idct_sh4.o                \
+                                          sh4/mpegaudiodec_sh4.o
diff --git a/libavcodec/sh4/dsputil_sh4.c b/libavcodec/sh4/dsputil_sh4.c
index ec06e24..37efdf0 100644
--- a/libavcodec/sh4/dsputil_sh4.c
+++ b/libavcodec/sh4/dsputil_sh4.c
@@ -20,8 +20,11 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#undef CONFIG_FLOAT
+
 #include "libavcodec/avcodec.h"
 #include "libavcodec/dsputil.h"
+#include "libavcodec/mpegaudio.h"
 #include "dsputil_sh4.h"
 #include "sh4.h"
 
@@ -102,3 +105,32 @@ void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx)
                 c->idct_permutation_type= FF_NO_IDCT_PERM;
         }
 }
+
+void mp3_win_loop_sh4(int64_t *sum, MPA_INT *synth_buf, MPA_INT *w,
+		      OUT_INT *samples, int incr);
+int round_sample_sh4(int64_t *sum);
+void sum8_macs_sh4(int64_t *sum, int32_t *p, int32_t *w);
+
+static void apply_window_mp3_sh4(MPA_INT *synth_buf, MPA_INT *window,
+                                 int *dither_state, OUT_INT *samples, int incr)
+{
+    int64_t sum, sum2 = 0;
+
+    /* copy to avoid wrap */
+    memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
+
+    sum = *dither_state;
+    sum8_macs_sh4(&sum, synth_buf + 16, window);
+    sum8_macs_sh4(&sum2, synth_buf + 48, window + 32);
+    sum -= sum2;
+    *samples = round_sample_sh4(&sum);
+
+    mp3_win_loop_sh4(&sum, synth_buf + 32, window + 1, samples + incr, incr);
+
+    *dither_state = sum;
+}
+
+void ff_mpegaudiodec_init_sh4(MPADecodeContext *s)
+{
+    s->apply_window_mp3 = apply_window_mp3_sh4;
+}
diff --git a/libavcodec/sh4/mpegaudiodec_sh4.S b/libavcodec/sh4/mpegaudiodec_sh4.S
new file mode 100644
index 0000000..ea3b95c
--- /dev/null
+++ b/libavcodec/sh4/mpegaudiodec_sh4.S
@@ -0,0 +1,244 @@
+#define MAXSHORT (1 << 15) - 1
+#define MINSHORT -(1 << 15)
+
+#define FRAC_BITS   23   /* fractional bits for sb_samples and dct */
+#define WFRAC_BITS  16   /* fractional bits for window */
+
+#define OUT_MAX MAXSHORT
+#define OUT_MIN MINSHORT
+#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)
+
+	.macro sh_macs, wp, pp, tmp
+	add	\tmp, \wp
+	add	\tmp, \pp
+	mac.l	@\wp+, @\pp+
+	.endm
+
+	.macro sh_sum8_macs, sum, wp, pp, tmp
+	mov.l	@\sum+, \tmp
+	lds	\tmp, macl
+	mov.l	@\sum+, \tmp
+	lds	\tmp, mach
+	mov	#126, \tmp
+	shll	\tmp			/* 63 * 4 */
+	mac.l	@\wp+, @\pp+		/* 0 */
+	sh_macs	\wp, \pp, \tmp		/* 1 */
+	sh_macs	\wp, \pp, \tmp		/* 2 */
+	sh_macs	\wp, \pp, \tmp		/* 3 */
+	sh_macs	\wp, \pp, \tmp		/* 4 */
+	sh_macs	\wp, \pp, \tmp		/* 5 */
+	sh_macs	\wp, \pp, \tmp		/* 6 */
+	sh_macs	\wp, \pp, \tmp		/* 7 */
+	sts	mach, \tmp
+	mov.l	\tmp, @-\sum
+	sts	macl, \tmp
+	mov.l	\tmp, @-\sum
+	.endm
+
+	/* void sum8_macs(int64_t *sum, int32_t *p, int32_t *w) */
+
+	.globl sum8_macs_sh4
+	.balign 4
+	.type   sum8_macs_sh4, @function
+sum8_macs_sh4:
+	sh_sum8_macs	r4, r6, r5, r3
+	rts
+	nop
+	.size sum8_macs_sh4, .-sum8_macs_sh4
+
+	.macro round_sample, sum, tmp1, tmp2
+	mov.l	@\sum, \tmp1
+	mov.l	m23, \tmp2
+	and	\tmp1, \tmp2		/* hi &= (1 << 24) - 1 */
+	mov.l	\tmp2, @\sum
+	shlr16	\tmp1
+	shlr8	\tmp1			/* lo >>= 24 */
+	mov.l	@(4, \sum), \tmp2
+	shll8	\tmp2			/* hi <<= 8 */
+	or	\tmp2, \tmp1		/* lo |= hi */
+	mov	#0, \tmp2
+	mov.l	\tmp2, @(4, \sum)
+	/* av_clip */
+	mov.l	max, \tmp2
+	cmp/ge	\tmp2, \tmp1
+	bt	1f
+	mov.l	min, \tmp2
+	cmp/gt	\tmp2, \tmp1
+	bt	2f
+1:	mov	\tmp2, \tmp1
+2:	
+	.endm
+
+	/* int round_sample_asm(int64_t *sum) */
+
+	.globl round_sample_sh4
+	.balign 4
+	.type   round_sample_sh4, @function
+round_sample_sh4:
+	round_sample r4, r0, r1
+	rts
+	nop
+	.size round_sample_sh4, .-round_sample_sh4
+
+	.balign 4
+
+	/* void mp3_win_loop_sh4(&sum, synth_buf + 32, window + 1, samples + incr, incr) */
+	.globl mp3_win_loop_sh4
+	.balign 4
+	.type   mp3_win_loop_sh4, @function
+mp3_win_loop_sh4:
+	mov	r6, r0
+	add	#120, r0	/* w2 = w + 30 (* 4) */
+	mov	#60, r1		/* j = r1 - loop counter: 15 * 4 */
+
+	mov.l	@r15, r2	/* r2 = incr */
+	shll	r2		/* r2 *= 2 */
+
+	mov.l	r8, @-r15
+	mov.l	r9, @-r15
+	mov.l	r10, @-r15
+	mov.l	r11, @-r15
+	sts.l	pr, @-r15
+
+	mov	#30, r11
+	muls.w	r2, r11
+	sts	macl, r11
+	add	r7, r11		/* samples2 = samples + 30 * incr (* 2) */
+
+4:	sub	r1, r5		/* p = synth_buf - j (* 4) */
+
+	/* We'll use stack for 64-bit variables:
+	 * tmp = -sum2 */
+
+	/* SUM8P2_MACS_MLSS(sum, sum2, w, w2, p) */
+
+	mov	r5, r8
+	mov	r6, r9
+	sh_sum8_macs r4, r6, r5, r3	/* sum, window, pp, tmp */
+	mov	r8, r5		/* restore p */
+	mov	r9, r6		/* restore w */
+
+	mov	#0, r10
+	mov.l	r10, @-r15	/* sum2 = 0 */
+	mov.l	r10, @-r15
+	mov	r15, r10	/* r10 = &sum2 */
+
+	mov	r0, r9
+	sh_sum8_macs r10, r0, r5, r3	/* sum2 is now at the top of the stack */
+	mov	r9, r0		/* restore w2 */
+	mov	r8, r5		/* restore p */
+
+	/* SUM8P2_MLSS_MLSS(sum, sum2, w + 32, w2 + 32, p) */
+
+	add	r1, r5
+	add	r1, r5		/* p += 2 * j (* 4) */
+
+	mov	r5, r8		/* save r5 */
+	mov	r6, r9		/* save r6 before incrementing */
+
+	add	#64, r6
+	add	#64, r6		/* w += 32 (* 4): 128 < 0 in 8-bit arith. */
+	mov	#0, r10
+	mov.l	r10, @-r15	/* int64_t tmp = 0 */
+	mov.l	r10, @-r15
+
+	mov	r15, r10	/* r10 = &tmp */
+
+	sh_sum8_macs r10, r6, r5, r3	/* tmp is now at the top of the stack */
+	mov	r9, r6		/* restore w */
+	mov	r8, r5		/* restore p */
+
+	clrt
+
+	/* sum -= tmp - 64-bit signed */
+	mov.l	@r15+, r8	/* low 32-bits of the tmp-sum */
+	mov.l	@r4, r10	/* low 32-bits of the sum */
+	subc	r8, r10
+	mov.l	r10, @r4
+	mov.l	@r15+, r8	/* high 32-bits of the tmp-sum */
+	mov.l	@(4, r4), r10	/* high 32-bits of the sum */
+	subc	r8, r10
+	mov.l	r10, @(4, r4)
+
+	mov	r5, r8
+	mov	r0, r10
+	add	#64, r0
+	add	#64, r0		/* w2 += 32 (* 4): 128 < 0 in 8-bit arith. */
+
+	mov	r15, r9		/* r9 = &sum2 */
+	sh_sum8_macs r9, r0, r5, r3	/* tmp = -sum2 is now at the top of the stack */
+	mov	r8, r5		/* restore p */
+
+	round_sample r4, r0, r3
+
+	mov.w	r0, @r7		/* *samples = round_samples() */
+	add	r2, r7		/* samples += incr (* 2) */
+
+	clrt
+
+	mov.l	@r15+, r0	/* low 32-bits */
+	mov.l	@r4, r3
+	subc	r0, r3
+	mov.l	r3, @r4
+	mov.l	@r15+, r0	/* high 32-bits */
+	mov.l	@(4, r4), r3
+	subc	r0, r3
+
+	mov.l	r3, @(4, r4)
+	round_sample r4, r0, r3
+
+	mov.w	r0, @r11	/* *samples2 = round_samples() */
+	sub	r2, r11		/* samples2 -= incr (* 2) */
+
+	sub	r1, r5		/* synth_buf = p - j */
+	mov	r10, r0		/* restore w2 */
+	add	#-4, r0		/* w2-- */
+
+	add	#-4, r1
+	cmp/pl	r1
+	bf	5f
+	bra	4b
+	/* CAREFUL: The below w++ is still in the loop - it's in the delay window */
+5:	add	#4, r6		/* w++ */
+
+	mov	#0, r10
+	mov.l	r10, @-r15	/* int64_t tmp = 0 */
+	mov.l	r10, @-r15
+	mov	r15, r10	/* r10 = &tmp */
+
+	add	#64, r6
+	add	#64, r6		/* w += 32 (* 4): 128 < 0 in 8-bit arith. */
+
+	mov	r15, r10	/* r10 = &tmp */
+	sh_sum8_macs r10, r6, r5, r3	/* tmp = -sum2 is now at the top of the stack */
+
+	clrt
+
+	/* sum -= tmp - 64-bit signed */
+	mov.l	@r15+, r8	/* low 32-bits of the tmp-sum */
+	mov.l	@r4, r9		/* low 32-bits of the sum */
+	subc	r8, r9
+	mov.l	r9, @r4
+	mov.l	@r15+, r8	/* high 32-bits of the tmp-sum */
+	mov.l	@(4, r4), r9	/* high 32-bits of the sum */
+	subc	r8, r9
+
+	mov.l	r9, @(4, r4)
+	round_sample r4, r0, r1
+
+	mov.w	r0, @r7		/* *samples = round_samples() */
+
+	lds.l	@r15+, pr
+	mov.l	@r15+, r11
+	mov.l	@r15+, r10
+	mov.l	@r15+, r9
+
+	rts
+	mov.l	@r15+, r8	/* delay window */
+
+	.size mp3_win_loop_sh4, .-mp3_win_loop_sh4
+
+	.align 4
+max:	.long	OUT_MAX
+min:	.long	OUT_MIN
+m23:	.long	(1 << OUT_SHIFT) - 1



More information about the ffmpeg-devel mailing list