[FFmpeg-devel] [PATCH] Separate format conversion DSP functions from DSPContext.
Mans Rullgard
mans
Wed Feb 2 00:13:20 CET 2011
From: Justin Ruggles <justin.ruggles at gmail.com>
This allows the audio conversion API to use the format conversion
functions without depending on all of dsputil.
Signed-off-by: Mans Rullgard <mans at mansr.com>
---
Now with x86 yasm code removed from its old location.
---
libavcodec/Makefile | 1 +
libavcodec/aac.h | 2 +
libavcodec/aacdec.c | 4 +-
libavcodec/ac3dec.c | 5 +-
libavcodec/ac3dec.h | 2 +
libavcodec/arm/Makefile | 5 +
libavcodec/arm/dsputil_init_neon.c | 10 -
libavcodec/arm/dsputil_init_vfp.c | 4 -
libavcodec/arm/dsputil_neon.S | 365 -------------------------------
libavcodec/arm/dsputil_vfp.S | 55 -----
libavcodec/arm/fmtconvert_init_arm.c | 48 ++++
libavcodec/arm/fmtconvert_neon.S | 391 ++++++++++++++++++++++++++++++++++
libavcodec/arm/fmtconvert_vfp.S | 77 +++++++
libavcodec/binkaudio.c | 6 +-
libavcodec/dca.c | 7 +-
libavcodec/dsputil.c | 33 ---
libavcodec/dsputil.h | 5 -
libavcodec/fmtconvert.c | 68 ++++++
libavcodec/fmtconvert.h | 79 +++++++
libavcodec/nellymoserdec.c | 5 +-
libavcodec/ppc/Makefile | 1 +
libavcodec/ppc/float_altivec.c | 112 ----------
libavcodec/ppc/fmtconvert_altivec.c | 142 ++++++++++++
libavcodec/vorbis_dec.c | 6 +-
libavcodec/wma.c | 1 +
libavcodec/wma.h | 2 +
libavcodec/wmadec.c | 2 +-
libavcodec/x86/Makefile | 2 +
libavcodec/x86/dsputil_mmx.c | 220 -------------------
libavcodec/x86/dsputil_yasm.asm | 69 ------
libavcodec/x86/fmtconvert.asm | 91 ++++++++
libavcodec/x86/fmtconvert_mmx.c | 266 +++++++++++++++++++++++
32 files changed, 1204 insertions(+), 882 deletions(-)
create mode 100644 libavcodec/arm/fmtconvert_init_arm.c
create mode 100644 libavcodec/arm/fmtconvert_neon.S
create mode 100644 libavcodec/arm/fmtconvert_vfp.S
create mode 100644 libavcodec/fmtconvert.c
create mode 100644 libavcodec/fmtconvert.h
create mode 100644 libavcodec/ppc/fmtconvert_altivec.c
create mode 100644 libavcodec/x86/fmtconvert.asm
create mode 100644 libavcodec/x86/fmtconvert_mmx.c
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index de1bde0..6a0a05b 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -12,6 +12,7 @@ OBJS = allcodecs.o \
bitstream_filter.o \
dsputil.o \
faanidct.o \
+ fmtconvert.o \
imgconvert.o \
jrevdct.o \
opt.o \
diff --git a/libavcodec/aac.h b/libavcodec/aac.h
index 714e314..cff476a 100644
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@@ -35,6 +35,7 @@
#include "fft.h"
#include "mpeg4audio.h"
#include "sbr.h"
+#include "fmtconvert.h"
#include <stdint.h>
@@ -268,6 +269,7 @@ typedef struct {
FFTContext mdct;
FFTContext mdct_small;
DSPContext dsp;
+ FmtConvertContext fmt_conv;
int random_state;
/** @} */
diff --git a/libavcodec/aacdec.c b/libavcodec/aacdec.c
index 0ea7dc8..411c1df 100644
--- a/libavcodec/aacdec.c
+++ b/libavcodec/aacdec.c
@@ -85,6 +85,7 @@
#include "get_bits.h"
#include "dsputil.h"
#include "fft.h"
+#include "fmtconvert.h"
#include "lpc.h"
#include "aac.h"
@@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx)
ff_aac_sbr_init();
dsputil_init(&ac->dsp, avctx);
+ ff_fmt_convert_init(&ac->fmt_conv, avctx);
ac->random_state = 0x1f2e3d4c;
@@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
*data_size = data_size_tmp;
if (samples)
- ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
+ ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
if (ac->output_configured)
ac->output_configured = OC_LOCKED;
diff --git a/libavcodec/ac3dec.c b/libavcodec/ac3dec.c
index 8e40ce1..5ebee19 100644
--- a/libavcodec/ac3dec.c
+++ b/libavcodec/ac3dec.c
@@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
ff_kbd_window_init(s->window, 5.0, 256);
dsputil_init(&s->dsp, avctx);
+ ff_fmt_convert_init(&s->fmt_conv, avctx);
av_lfg_init(&s->dith_state, 0);
/* set scale value for float to int16 conversion */
@@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
} else {
gain *= s->dynamic_range[0];
}
- s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
+ s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
}
/* apply spectral extension to high frequency bins */
@@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
err = 1;
}
- s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
+ s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
out_samples += 256 * s->out_channels;
}
*data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t);
diff --git a/libavcodec/ac3dec.h b/libavcodec/ac3dec.h
index 55520cd..147e5e5 100644
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@@ -55,6 +55,7 @@
#include "get_bits.h"
#include "dsputil.h"
#include "fft.h"
+#include "fmtconvert.h"
/* override ac3.h to include coupling channel */
#undef AC3_MAX_CHANNELS
@@ -190,6 +191,7 @@ typedef struct {
///@defgroup opt optimization
DSPContext dsp; ///< for optimization
+ FmtConvertContext fmt_conv; ///< optimized conversion functions
float mul_bias; ///< scaling for float_to_int16 conversion
///@}
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 4c30e0a..014456e 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
OBJS += arm/dsputil_init_arm.o \
arm/dsputil_arm.o \
arm/fft_init_arm.o \
+ arm/fmtconvert_init_arm.o \
arm/jrevdct_arm.o \
arm/mpegvideo_arm.o \
arm/simple_idct_arm.o \
@@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \
arm/dsputil_armv6.o \
arm/simple_idct_armv6.o \
+VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \
+
OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \
arm/dsputil_init_vfp.o \
+ $(VFP-OBJS-yes)
OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \
arm/mpegvideo_iwmmxt.o \
@@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \
OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \
arm/dsputil_neon.o \
+ arm/fmtconvert_neon.o \
arm/int_neon.o \
arm/mpegvideo_neon.o \
arm/simple_idct_neon.o \
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 6798204..76ae632 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
int len);
void ff_butterflies_float_neon(float *v1, float *v2, int len);
float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
-void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
- float mul, int len);
void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
@@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
int len);
-void ff_float_to_int16_neon(int16_t *, const float *, long);
-void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
@@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
c->butterflies_float = ff_butterflies_float_neon;
c->scalarproduct_float = ff_scalarproduct_float_neon;
- c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
c->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
c->vector_fmul_add = ff_vector_fmul_add_neon;
c->vector_clipf = ff_vector_clipf_neon;
@@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;
- if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
- c->float_to_int16 = ff_float_to_int16_neon;
- c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
- }
-
if (CONFIG_VORBIS_DECODER)
c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
diff --git a/libavcodec/arm/dsputil_init_vfp.c b/libavcodec/arm/dsputil_init_vfp.c
index 76ef6b4..bd52315 100644
--- a/libavcodec/arm/dsputil_init_vfp.c
+++ b/libavcodec/arm/dsputil_init_vfp.c
@@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
const float *src1, int len);
-void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx)
{
c->vector_fmul = ff_vector_fmul_vfp;
c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
-#if HAVE_ARMV6
- c->float_to_int16 = ff_float_to_int16_vfp;
-#endif
}
diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S
index 8329f6c..05a9115 100644
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1
bx lr
endfunc
-function ff_float_to_int16_neon, export=1
- subs r2, r2, #8
- vld1.64 {d0-d1}, [r1,:128]!
- vcvt.s32.f32 q8, q0, #16
- vld1.64 {d2-d3}, [r1,:128]!
- vcvt.s32.f32 q9, q1, #16
- beq 3f
- bics ip, r2, #15
- beq 2f
-1: subs ip, ip, #16
- vshrn.s32 d4, q8, #16
- vld1.64 {d0-d1}, [r1,:128]!
- vcvt.s32.f32 q0, q0, #16
- vshrn.s32 d5, q9, #16
- vld1.64 {d2-d3}, [r1,:128]!
- vcvt.s32.f32 q1, q1, #16
- vshrn.s32 d6, q0, #16
- vst1.64 {d4-d5}, [r0,:128]!
- vshrn.s32 d7, q1, #16
- vld1.64 {d16-d17},[r1,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r1,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.64 {d6-d7}, [r0,:128]!
- bne 1b
- ands r2, r2, #15
- beq 3f
-2: vld1.64 {d0-d1}, [r1,:128]!
- vshrn.s32 d4, q8, #16
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r1,:128]!
- vshrn.s32 d5, q9, #16
- vcvt.s32.f32 q1, q1, #16
- vshrn.s32 d6, q0, #16
- vst1.64 {d4-d5}, [r0,:128]!
- vshrn.s32 d7, q1, #16
- vst1.64 {d6-d7}, [r0,:128]!
- bx lr
-3: vshrn.s32 d4, q8, #16
- vshrn.s32 d5, q9, #16
- vst1.64 {d4-d5}, [r0,:128]!
- bx lr
-endfunc
-
-function ff_float_to_int16_interleave_neon, export=1
- cmp r3, #2
- ldrlt r1, [r1]
- blt ff_float_to_int16_neon
- bne 4f
-
- ldr r3, [r1]
- ldr r1, [r1, #4]
-
- subs r2, r2, #8
- vld1.64 {d0-d1}, [r3,:128]!
- vcvt.s32.f32 q8, q0, #16
- vld1.64 {d2-d3}, [r3,:128]!
- vcvt.s32.f32 q9, q1, #16
- vld1.64 {d20-d21},[r1,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r1,:128]!
- vcvt.s32.f32 q11, q11, #16
- beq 3f
- bics ip, r2, #15
- beq 2f
-1: subs ip, ip, #16
- vld1.64 {d0-d1}, [r3,:128]!
- vcvt.s32.f32 q0, q0, #16
- vsri.32 q10, q8, #16
- vld1.64 {d2-d3}, [r3,:128]!
- vcvt.s32.f32 q1, q1, #16
- vld1.64 {d24-d25},[r1,:128]!
- vcvt.s32.f32 q12, q12, #16
- vld1.64 {d26-d27},[r1,:128]!
- vsri.32 q11, q9, #16
- vst1.64 {d20-d21},[r0,:128]!
- vcvt.s32.f32 q13, q13, #16
- vst1.64 {d22-d23},[r0,:128]!
- vsri.32 q12, q0, #16
- vld1.64 {d16-d17},[r3,:128]!
- vsri.32 q13, q1, #16
- vst1.64 {d24-d25},[r0,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r3,:128]!
- vcvt.s32.f32 q9, q9, #16
- vld1.64 {d20-d21},[r1,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r1,:128]!
- vcvt.s32.f32 q11, q11, #16
- vst1.64 {d26-d27},[r0,:128]!
- bne 1b
- ands r2, r2, #15
- beq 3f
-2: vsri.32 q10, q8, #16
- vld1.64 {d0-d1}, [r3,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r3,:128]!
- vcvt.s32.f32 q1, q1, #16
- vld1.64 {d24-d25},[r1,:128]!
- vcvt.s32.f32 q12, q12, #16
- vsri.32 q11, q9, #16
- vld1.64 {d26-d27},[r1,:128]!
- vcvt.s32.f32 q13, q13, #16
- vst1.64 {d20-d21},[r0,:128]!
- vsri.32 q12, q0, #16
- vst1.64 {d22-d23},[r0,:128]!
- vsri.32 q13, q1, #16
- vst1.64 {d24-d27},[r0,:128]!
- bx lr
-3: vsri.32 q10, q8, #16
- vsri.32 q11, q9, #16
- vst1.64 {d20-d23},[r0,:128]!
- bx lr
-
-4: push {r4-r8,lr}
- cmp r3, #4
- lsl ip, r3, #1
- blt 4f
-
- @ 4 channels
-5: ldmia r1!, {r4-r7}
- mov lr, r2
- mov r8, r0
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vld1.64 {d20-d21},[r6,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r7,:128]!
- vcvt.s32.f32 q11, q11, #16
-6: subs lr, lr, #8
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vsri.32 q9, q8, #16
- vld1.64 {d2-d3}, [r5,:128]!
- vcvt.s32.f32 q1, q1, #16
- vsri.32 q11, q10, #16
- vld1.64 {d4-d5}, [r6,:128]!
- vcvt.s32.f32 q2, q2, #16
- vzip.32 d18, d22
- vld1.64 {d6-d7}, [r7,:128]!
- vcvt.s32.f32 q3, q3, #16
- vzip.32 d19, d23
- vst1.64 {d18}, [r8], ip
- vsri.32 q1, q0, #16
- vst1.64 {d22}, [r8], ip
- vsri.32 q3, q2, #16
- vst1.64 {d19}, [r8], ip
- vzip.32 d2, d6
- vst1.64 {d23}, [r8], ip
- vzip.32 d3, d7
- beq 7f
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vst1.64 {d2}, [r8], ip
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.64 {d6}, [r8], ip
- vld1.64 {d20-d21},[r6,:128]!
- vcvt.s32.f32 q10, q10, #16
- vst1.64 {d3}, [r8], ip
- vld1.64 {d22-d23},[r7,:128]!
- vcvt.s32.f32 q11, q11, #16
- vst1.64 {d7}, [r8], ip
- b 6b
-7: vst1.64 {d2}, [r8], ip
- vst1.64 {d6}, [r8], ip
- vst1.64 {d3}, [r8], ip
- vst1.64 {d7}, [r8], ip
- subs r3, r3, #4
- popeq {r4-r8,pc}
- cmp r3, #4
- add r0, r0, #8
- bge 5b
-
- @ 2 channels
-4: cmp r3, #2
- blt 4f
- ldmia r1!, {r4-r5}
- mov lr, r2
- mov r8, r0
- tst lr, #8
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vld1.64 {d20-d21},[r4,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r5,:128]!
- vcvt.s32.f32 q11, q11, #16
- beq 6f
- subs lr, lr, #8
- beq 7f
- vsri.32 d18, d16, #16
- vsri.32 d19, d17, #16
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vst1.32 {d18[0]}, [r8], ip
- vsri.32 d22, d20, #16
- vst1.32 {d18[1]}, [r8], ip
- vsri.32 d23, d21, #16
- vst1.32 {d19[0]}, [r8], ip
- vst1.32 {d19[1]}, [r8], ip
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.32 {d22[0]}, [r8], ip
- vst1.32 {d22[1]}, [r8], ip
- vld1.64 {d20-d21},[r4,:128]!
- vcvt.s32.f32 q10, q10, #16
- vst1.32 {d23[0]}, [r8], ip
- vst1.32 {d23[1]}, [r8], ip
- vld1.64 {d22-d23},[r5,:128]!
- vcvt.s32.f32 q11, q11, #16
-6: subs lr, lr, #16
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vsri.32 d18, d16, #16
- vld1.64 {d2-d3}, [r5,:128]!
- vcvt.s32.f32 q1, q1, #16
- vsri.32 d19, d17, #16
- vld1.64 {d4-d5}, [r4,:128]!
- vcvt.s32.f32 q2, q2, #16
- vld1.64 {d6-d7}, [r5,:128]!
- vcvt.s32.f32 q3, q3, #16
- vst1.32 {d18[0]}, [r8], ip
- vsri.32 d22, d20, #16
- vst1.32 {d18[1]}, [r8], ip
- vsri.32 d23, d21, #16
- vst1.32 {d19[0]}, [r8], ip
- vsri.32 d2, d0, #16
- vst1.32 {d19[1]}, [r8], ip
- vsri.32 d3, d1, #16
- vst1.32 {d22[0]}, [r8], ip
- vsri.32 d6, d4, #16
- vst1.32 {d22[1]}, [r8], ip
- vsri.32 d7, d5, #16
- vst1.32 {d23[0]}, [r8], ip
- vst1.32 {d23[1]}, [r8], ip
- beq 6f
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vst1.32 {d2[0]}, [r8], ip
- vst1.32 {d2[1]}, [r8], ip
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.32 {d3[0]}, [r8], ip
- vst1.32 {d3[1]}, [r8], ip
- vld1.64 {d20-d21},[r4,:128]!
- vcvt.s32.f32 q10, q10, #16
- vst1.32 {d6[0]}, [r8], ip
- vst1.32 {d6[1]}, [r8], ip
- vld1.64 {d22-d23},[r5,:128]!
- vcvt.s32.f32 q11, q11, #16
- vst1.32 {d7[0]}, [r8], ip
- vst1.32 {d7[1]}, [r8], ip
- bgt 6b
-6: vst1.32 {d2[0]}, [r8], ip
- vst1.32 {d2[1]}, [r8], ip
- vst1.32 {d3[0]}, [r8], ip
- vst1.32 {d3[1]}, [r8], ip
- vst1.32 {d6[0]}, [r8], ip
- vst1.32 {d6[1]}, [r8], ip
- vst1.32 {d7[0]}, [r8], ip
- vst1.32 {d7[1]}, [r8], ip
- b 8f
-7: vsri.32 d18, d16, #16
- vsri.32 d19, d17, #16
- vst1.32 {d18[0]}, [r8], ip
- vsri.32 d22, d20, #16
- vst1.32 {d18[1]}, [r8], ip
- vsri.32 d23, d21, #16
- vst1.32 {d19[0]}, [r8], ip
- vst1.32 {d19[1]}, [r8], ip
- vst1.32 {d22[0]}, [r8], ip
- vst1.32 {d22[1]}, [r8], ip
- vst1.32 {d23[0]}, [r8], ip
- vst1.32 {d23[1]}, [r8], ip
-8: subs r3, r3, #2
- add r0, r0, #4
- popeq {r4-r8,pc}
-
- @ 1 channel
-4: ldr r4, [r1],#4
- tst r2, #8
- mov lr, r2
- mov r5, r0
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r4,:128]!
- vcvt.s32.f32 q1, q1, #16
- bne 8f
-6: subs lr, lr, #16
- vld1.64 {d4-d5}, [r4,:128]!
- vcvt.s32.f32 q2, q2, #16
- vld1.64 {d6-d7}, [r4,:128]!
- vcvt.s32.f32 q3, q3, #16
- vst1.16 {d0[1]}, [r5,:16], ip
- vst1.16 {d0[3]}, [r5,:16], ip
- vst1.16 {d1[1]}, [r5,:16], ip
- vst1.16 {d1[3]}, [r5,:16], ip
- vst1.16 {d2[1]}, [r5,:16], ip
- vst1.16 {d2[3]}, [r5,:16], ip
- vst1.16 {d3[1]}, [r5,:16], ip
- vst1.16 {d3[3]}, [r5,:16], ip
- beq 7f
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r4,:128]!
- vcvt.s32.f32 q1, q1, #16
-7: vst1.16 {d4[1]}, [r5,:16], ip
- vst1.16 {d4[3]}, [r5,:16], ip
- vst1.16 {d5[1]}, [r5,:16], ip
- vst1.16 {d5[3]}, [r5,:16], ip
- vst1.16 {d6[1]}, [r5,:16], ip
- vst1.16 {d6[3]}, [r5,:16], ip
- vst1.16 {d7[1]}, [r5,:16], ip
- vst1.16 {d7[3]}, [r5,:16], ip
- bgt 6b
- pop {r4-r8,pc}
-8: subs lr, lr, #8
- vst1.16 {d0[1]}, [r5,:16], ip
- vst1.16 {d0[3]}, [r5,:16], ip
- vst1.16 {d1[1]}, [r5,:16], ip
- vst1.16 {d1[3]}, [r5,:16], ip
- vst1.16 {d2[1]}, [r5,:16], ip
- vst1.16 {d2[3]}, [r5,:16], ip
- vst1.16 {d3[1]}, [r5,:16], ip
- vst1.16 {d3[3]}, [r5,:16], ip
- popeq {r4-r8,pc}
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r4,:128]!
- vcvt.s32.f32 q1, q1, #16
- b 6b
-endfunc
-
function ff_vector_fmul_neon, export=1
subs r3, r3, #8
vld1.64 {d0-d3}, [r1,:128]!
@@ -1050,34 +713,6 @@ NOVFP vmov.32 r0, d0[0]
bx lr
endfunc
-function ff_int32_to_float_fmul_scalar_neon, export=1
-VFP vdup.32 q0, d0[0]
-VFP len .req r2
-NOVFP vdup.32 q0, r2
-NOVFP len .req r3
-
- vld1.32 {q1},[r1,:128]!
- vcvt.f32.s32 q3, q1
- vld1.32 {q2},[r1,:128]!
- vcvt.f32.s32 q8, q2
-1: subs len, len, #8
- pld [r1, #16]
- vmul.f32 q9, q3, q0
- vmul.f32 q10, q8, q0
- beq 2f
- vld1.32 {q1},[r1,:128]!
- vcvt.f32.s32 q3, q1
- vld1.32 {q2},[r1,:128]!
- vcvt.f32.s32 q8, q2
- vst1.32 {q9}, [r0,:128]!
- vst1.32 {q10},[r0,:128]!
- b 1b
-2: vst1.32 {q9}, [r0,:128]!
- vst1.32 {q10},[r0,:128]!
- bx lr
- .unreq len
-endfunc
-
function ff_vector_fmul_reverse_neon, export=1
add r2, r2, r3, lsl #2
sub r2, r2, #32
diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S
index a65b69e..197d500 100644
--- a/libavcodec/arm/dsputil_vfp.S
+++ b/libavcodec/arm/dsputil_vfp.S
@@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1
vpop {d8-d15}
bx lr
endfunc
-
-#if HAVE_ARMV6
-/**
- * ARM VFP optimized float to int16 conversion.
- * Assume that len is a positive number and is multiple of 8, destination
- * buffer is at least 4 bytes aligned (8 bytes alignment is better for
- * performance), little endian byte sex
- */
-@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
-function ff_float_to_int16_vfp, export=1
- push {r4-r8,lr}
- vpush {d8-d11}
- vldmia r1!, {s16-s23}
- vcvt.s32.f32 s0, s16
- vcvt.s32.f32 s1, s17
- vcvt.s32.f32 s2, s18
- vcvt.s32.f32 s3, s19
- vcvt.s32.f32 s4, s20
- vcvt.s32.f32 s5, s21
- vcvt.s32.f32 s6, s22
- vcvt.s32.f32 s7, s23
-1:
- subs r2, r2, #8
- vmov r3, r4, s0, s1
- vmov r5, r6, s2, s3
- vmov r7, r8, s4, s5
- vmov ip, lr, s6, s7
- vldmiagt r1!, {s16-s23}
- ssat r4, #16, r4
- ssat r3, #16, r3
- ssat r6, #16, r6
- ssat r5, #16, r5
- pkhbt r3, r3, r4, lsl #16
- pkhbt r4, r5, r6, lsl #16
- vcvtgt.s32.f32 s0, s16
- vcvtgt.s32.f32 s1, s17
- vcvtgt.s32.f32 s2, s18
- vcvtgt.s32.f32 s3, s19
- vcvtgt.s32.f32 s4, s20
- vcvtgt.s32.f32 s5, s21
- vcvtgt.s32.f32 s6, s22
- vcvtgt.s32.f32 s7, s23
- ssat r8, #16, r8
- ssat r7, #16, r7
- ssat lr, #16, lr
- ssat ip, #16, ip
- pkhbt r5, r7, r8, lsl #16
- pkhbt r6, ip, lr, lsl #16
- stmia r0!, {r3-r6}
- bgt 1b
-
- vpop {d8-d11}
- pop {r4-r8,pc}
-endfunc
-#endif
diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c
new file mode 100644
index 0000000..4b6e393
--- /dev/null
+++ b/libavcodec/arm/fmtconvert_init_arm.c
@@ -0,0 +1,48 @@
+/*
+ * ARM optimized Format Conversion Utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fmtconvert.h"
+
+void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
+ float mul, int len);
+
+void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
+void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
+
+void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
+
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
+{
+ if (HAVE_ARMVFP && HAVE_ARMV6) {
+ c->float_to_int16 = ff_float_to_int16_vfp;
+ }
+
+ if (HAVE_NEON) {
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
+
+ if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+ c->float_to_int16 = ff_float_to_int16_neon;
+ c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
+ }
+ }
+}
diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S
new file mode 100644
index 0000000..359e57e
--- /dev/null
+++ b/libavcodec/arm/fmtconvert_neon.S
@@ -0,0 +1,391 @@
+/*
+ * ARM NEON optimised Format Conversion Utils
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+ preserve8
+ .text
+
+function ff_float_to_int16_neon, export=1
+ subs r2, r2, #8
+ vld1.64 {d0-d1}, [r1,:128]!
+ vcvt.s32.f32 q8, q0, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vcvt.s32.f32 q9, q1, #16
+ beq 3f
+ bics ip, r2, #15
+ beq 2f
+1: subs ip, ip, #16
+ vshrn.s32 d4, q8, #16
+ vld1.64 {d0-d1}, [r1,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vshrn.s32 d5, q9, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vshrn.s32 d6, q0, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ vshrn.s32 d7, q1, #16
+ vld1.64 {d16-d17},[r1,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r1,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.64 {d6-d7}, [r0,:128]!
+ bne 1b
+ ands r2, r2, #15
+ beq 3f
+2: vld1.64 {d0-d1}, [r1,:128]!
+ vshrn.s32 d4, q8, #16
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vshrn.s32 d5, q9, #16
+ vcvt.s32.f32 q1, q1, #16
+ vshrn.s32 d6, q0, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ vshrn.s32 d7, q1, #16
+ vst1.64 {d6-d7}, [r0,:128]!
+ bx lr
+3: vshrn.s32 d4, q8, #16
+ vshrn.s32 d5, q9, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ bx lr
+endfunc
+
+function ff_float_to_int16_interleave_neon, export=1
+ cmp r3, #2
+ ldrlt r1, [r1]
+ blt ff_float_to_int16_neon
+ bne 4f
+
+ ldr r3, [r1]
+ ldr r1, [r1, #4]
+
+ subs r2, r2, #8
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q8, q0, #16
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q9, q1, #16
+ vld1.64 {d20-d21},[r1,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r1,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ beq 3f
+ bics ip, r2, #15
+ beq 2f
+1: subs ip, ip, #16
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 q10, q8, #16
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vld1.64 {d24-d25},[r1,:128]!
+ vcvt.s32.f32 q12, q12, #16
+ vld1.64 {d26-d27},[r1,:128]!
+ vsri.32 q11, q9, #16
+ vst1.64 {d20-d21},[r0,:128]!
+ vcvt.s32.f32 q13, q13, #16
+ vst1.64 {d22-d23},[r0,:128]!
+ vsri.32 q12, q0, #16
+ vld1.64 {d16-d17},[r3,:128]!
+ vsri.32 q13, q1, #16
+ vst1.64 {d24-d25},[r0,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r3,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r1,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r1,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.64 {d26-d27},[r0,:128]!
+ bne 1b
+ ands r2, r2, #15
+ beq 3f
+2: vsri.32 q10, q8, #16
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vld1.64 {d24-d25},[r1,:128]!
+ vcvt.s32.f32 q12, q12, #16
+ vsri.32 q11, q9, #16
+ vld1.64 {d26-d27},[r1,:128]!
+ vcvt.s32.f32 q13, q13, #16
+ vst1.64 {d20-d21},[r0,:128]!
+ vsri.32 q12, q0, #16
+ vst1.64 {d22-d23},[r0,:128]!
+ vsri.32 q13, q1, #16
+ vst1.64 {d24-d27},[r0,:128]!
+ bx lr
+3: vsri.32 q10, q8, #16
+ vsri.32 q11, q9, #16
+ vst1.64 {d20-d23},[r0,:128]!
+ bx lr
+
+4: push {r4-r8,lr}
+ cmp r3, #4
+ lsl ip, r3, #1
+ blt 4f
+
+ @ 4 channels
+5: ldmia r1!, {r4-r7}
+ mov lr, r2
+ mov r8, r0
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r6,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r7,:128]!
+ vcvt.s32.f32 q11, q11, #16
+6: subs lr, lr, #8
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 q9, q8, #16
+ vld1.64 {d2-d3}, [r5,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vsri.32 q11, q10, #16
+ vld1.64 {d4-d5}, [r6,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vzip.32 d18, d22
+ vld1.64 {d6-d7}, [r7,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vzip.32 d19, d23
+ vst1.64 {d18}, [r8], ip
+ vsri.32 q1, q0, #16
+ vst1.64 {d22}, [r8], ip
+ vsri.32 q3, q2, #16
+ vst1.64 {d19}, [r8], ip
+ vzip.32 d2, d6
+ vst1.64 {d23}, [r8], ip
+ vzip.32 d3, d7
+ beq 7f
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.64 {d2}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.64 {d6}, [r8], ip
+ vld1.64 {d20-d21},[r6,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.64 {d3}, [r8], ip
+ vld1.64 {d22-d23},[r7,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.64 {d7}, [r8], ip
+ b 6b
+7: vst1.64 {d2}, [r8], ip
+ vst1.64 {d6}, [r8], ip
+ vst1.64 {d3}, [r8], ip
+ vst1.64 {d7}, [r8], ip
+ subs r3, r3, #4
+ popeq {r4-r8,pc}
+ cmp r3, #4
+ add r0, r0, #8
+ bge 5b
+
+ @ 2 channels
+4: cmp r3, #2
+ blt 4f
+ ldmia r1!, {r4-r5}
+ mov lr, r2
+ mov r8, r0
+ tst lr, #8
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ beq 6f
+ subs lr, lr, #8
+ beq 7f
+ vsri.32 d18, d16, #16
+ vsri.32 d19, d17, #16
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vst1.32 {d19[1]}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.32 {d22[0]}, [r8], ip
+ vst1.32 {d22[1]}, [r8], ip
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+6: subs lr, lr, #16
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 d18, d16, #16
+ vld1.64 {d2-d3}, [r5,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vsri.32 d19, d17, #16
+ vld1.64 {d4-d5}, [r4,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vld1.64 {d6-d7}, [r5,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vsri.32 d2, d0, #16
+ vst1.32 {d19[1]}, [r8], ip
+ vsri.32 d3, d1, #16
+ vst1.32 {d22[0]}, [r8], ip
+ vsri.32 d6, d4, #16
+ vst1.32 {d22[1]}, [r8], ip
+ vsri.32 d7, d5, #16
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+ beq 6f
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.32 {d2[0]}, [r8], ip
+ vst1.32 {d2[1]}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.32 {d3[0]}, [r8], ip
+ vst1.32 {d3[1]}, [r8], ip
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.32 {d6[0]}, [r8], ip
+ vst1.32 {d6[1]}, [r8], ip
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.32 {d7[0]}, [r8], ip
+ vst1.32 {d7[1]}, [r8], ip
+ bgt 6b
+6: vst1.32 {d2[0]}, [r8], ip
+ vst1.32 {d2[1]}, [r8], ip
+ vst1.32 {d3[0]}, [r8], ip
+ vst1.32 {d3[1]}, [r8], ip
+ vst1.32 {d6[0]}, [r8], ip
+ vst1.32 {d6[1]}, [r8], ip
+ vst1.32 {d7[0]}, [r8], ip
+ vst1.32 {d7[1]}, [r8], ip
+ b 8f
+7: vsri.32 d18, d16, #16
+ vsri.32 d19, d17, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vst1.32 {d19[1]}, [r8], ip
+ vst1.32 {d22[0]}, [r8], ip
+ vst1.32 {d22[1]}, [r8], ip
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+8: subs r3, r3, #2
+ add r0, r0, #4
+ popeq {r4-r8,pc}
+
+ @ 1 channel
+4: ldr r4, [r1],#4
+ tst r2, #8
+ mov lr, r2
+ mov r5, r0
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ bne 8f
+6: subs lr, lr, #16
+ vld1.64 {d4-d5}, [r4,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vld1.64 {d6-d7}, [r4,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vst1.16 {d0[1]}, [r5,:16], ip
+ vst1.16 {d0[3]}, [r5,:16], ip
+ vst1.16 {d1[1]}, [r5,:16], ip
+ vst1.16 {d1[3]}, [r5,:16], ip
+ vst1.16 {d2[1]}, [r5,:16], ip
+ vst1.16 {d2[3]}, [r5,:16], ip
+ vst1.16 {d3[1]}, [r5,:16], ip
+ vst1.16 {d3[3]}, [r5,:16], ip
+ beq 7f
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+7: vst1.16 {d4[1]}, [r5,:16], ip
+ vst1.16 {d4[3]}, [r5,:16], ip
+ vst1.16 {d5[1]}, [r5,:16], ip
+ vst1.16 {d5[3]}, [r5,:16], ip
+ vst1.16 {d6[1]}, [r5,:16], ip
+ vst1.16 {d6[3]}, [r5,:16], ip
+ vst1.16 {d7[1]}, [r5,:16], ip
+ vst1.16 {d7[3]}, [r5,:16], ip
+ bgt 6b
+ pop {r4-r8,pc}
+8: subs lr, lr, #8
+ vst1.16 {d0[1]}, [r5,:16], ip
+ vst1.16 {d0[3]}, [r5,:16], ip
+ vst1.16 {d1[1]}, [r5,:16], ip
+ vst1.16 {d1[3]}, [r5,:16], ip
+ vst1.16 {d2[1]}, [r5,:16], ip
+ vst1.16 {d2[3]}, [r5,:16], ip
+ vst1.16 {d3[1]}, [r5,:16], ip
+ vst1.16 {d3[3]}, [r5,:16], ip
+ popeq {r4-r8,pc}
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ b 6b
+endfunc
+
+function ff_int32_to_float_fmul_scalar_neon, export=1
+VFP vdup.32 q0, d0[0]
+VFP len .req r2
+NOVFP vdup.32 q0, r2
+NOVFP len .req r3
+
+ vld1.32 {q1},[r1,:128]!
+ vcvt.f32.s32 q3, q1
+ vld1.32 {q2},[r1,:128]!
+ vcvt.f32.s32 q8, q2
+1: subs len, len, #8
+ pld [r1, #16]
+ vmul.f32 q9, q3, q0
+ vmul.f32 q10, q8, q0
+ beq 2f
+ vld1.32 {q1},[r1,:128]!
+ vcvt.f32.s32 q3, q1
+ vld1.32 {q2},[r1,:128]!
+ vcvt.f32.s32 q8, q2
+ vst1.32 {q9}, [r0,:128]!
+ vst1.32 {q10},[r0,:128]!
+ b 1b
+2: vst1.32 {q9}, [r0,:128]!
+ vst1.32 {q10},[r0,:128]!
+ bx lr
+ .unreq len
+endfunc
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
new file mode 100644
index 0000000..1d19e77
--- /dev/null
+++ b/libavcodec/arm/fmtconvert_vfp.S
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb at users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+ .syntax unified
+
+/**
+ * ARM VFP optimized float to int16 conversion.
+ * Assumes that len is a positive multiple of 8, that the destination
+ * buffer is at least 4-byte aligned (8-byte alignment is better for
+ * performance), and that the byte order is little endian.
+ */
+@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
+function ff_float_to_int16_vfp, export=1
+ push {r4-r8,lr}
+ vpush {d8-d11} @ preserve d8-d11 (s16-s23), used as load staging below
+ vldmia r1!, {s16-s23} @ load the first 8 floats from src
+ vcvt.s32.f32 s0, s16 @ convert them to 32-bit ints in s0-s7
+ vcvt.s32.f32 s1, s17
+ vcvt.s32.f32 s2, s18
+ vcvt.s32.f32 s3, s19
+ vcvt.s32.f32 s4, s20
+ vcvt.s32.f32 s5, s21
+ vcvt.s32.f32 s6, s22
+ vcvt.s32.f32 s7, s23
+1: @ each pass packs and stores 8 samples while the next 8 are loaded and converted
+ subs r2, r2, #8 @ flags drive the gt-conditional instructions and the final bgt
+ vmov r3, r4, s0, s1 @ move the converted ints to core registers
+ vmov r5, r6, s2, s3
+ vmov r7, r8, s4, s5
+ vmov ip, lr, s6, s7
+ vldmiagt r1!, {s16-s23} @ fetch the next 8 floats unless this is the last pass
+ ssat r4, #16, r4 @ saturate to the signed 16-bit range
+ ssat r3, #16, r3
+ ssat r6, #16, r6
+ ssat r5, #16, r5
+ pkhbt r3, r3, r4, lsl #16 @ pack two samples: r3 in the low half, r4 in the high half
+ pkhbt r4, r5, r6, lsl #16
+ vcvtgt.s32.f32 s0, s16 @ convert the next batch (skipped on the last pass)
+ vcvtgt.s32.f32 s1, s17
+ vcvtgt.s32.f32 s2, s18
+ vcvtgt.s32.f32 s3, s19
+ vcvtgt.s32.f32 s4, s20
+ vcvtgt.s32.f32 s5, s21
+ vcvtgt.s32.f32 s6, s22
+ vcvtgt.s32.f32 s7, s23
+ ssat r8, #16, r8
+ ssat r7, #16, r7
+ ssat lr, #16, lr
+ ssat ip, #16, ip
+ pkhbt r5, r7, r8, lsl #16
+ pkhbt r6, ip, lr, lsl #16
+ stmia r0!, {r3-r6} @ store 8 packed int16 samples (16 bytes)
+ bgt 1b
+ @ restore the saved registers and return
+ vpop {d8-d11}
+ pop {r4-r8,pc}
+endfunc
diff --git a/libavcodec/binkaudio.c b/libavcodec/binkaudio.c
index ae2f6c8..5348465 100644
--- a/libavcodec/binkaudio.c
+++ b/libavcodec/binkaudio.c
@@ -33,6 +33,7 @@
#include "get_bits.h"
#include "dsputil.h"
#include "fft.h"
+#include "fmtconvert.h"
extern const uint16_t ff_wma_critical_freqs[25];
@@ -43,6 +44,7 @@ typedef struct {
AVCodecContext *avctx;
GetBitContext gb;
DSPContext dsp;
+ FmtConvertContext fmt_conv;
int first;
int channels;
int frame_len; ///< transform size (samples)
@@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
s->avctx = avctx;
dsputil_init(&s->dsp, avctx);
+ ff_fmt_convert_init(&s->fmt_conv, avctx);
/* determine frame length */
if (avctx->sample_rate < 22050) {
@@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct)
ff_rdft_calc(&s->trans.rdft, coeffs);
}
- s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels);
+ s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr,
+ s->frame_len, s->channels);
if (!s->first) {
int count = s->overlap_len * s->channels;
diff --git a/libavcodec/dca.c b/libavcodec/dca.c
index 3a3eb25..63ea329 100644
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@@ -40,6 +40,7 @@
#include "dca.h"
#include "synth_filter.h"
#include "dcadsp.h"
+#include "fmtconvert.h"
//#define TRACE
@@ -347,6 +348,7 @@ typedef struct {
FFTContext imdct;
SynthFilterContext synth;
DCADSPContext dcadsp;
+ FmtConvertContext fmt_conv;
} DCAContext;
static const uint16_t dca_vlc_offs[] = {
@@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index)
block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel);
}
- s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l],
+ s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l],
block, rscale, 8);
}
@@ -1802,7 +1804,7 @@ static int dca_decode_frame(AVCodecContext * avctx,
}
}
- s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
+ s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
samples += 256 * channels;
}
@@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx)
ff_mdct_init(&s->imdct, 6, 1, 1.0);
ff_synth_filter_init(&s->synth);
ff_dcadsp_init(&s->dcadsp);
+ ff_fmt_convert_init(&s->fmt_conv, avctx);
for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++)
s->samples_chanptr[i] = s->samples + i * 256;
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 2d4ec72..84714de 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len)
return p;
}
-static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
- int i;
- for(i=0; i<len; i++)
- dst[i] = src[i] * mul;
-}
-
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
uint32_t maxi, uint32_t maxisign)
{
@@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i
}
}
-static av_always_inline int float_to_int16_one(const float *src){
- return av_clip_int16(lrintf(*src));
-}
-
-static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
- int i;
- for(i=0; i<len; i++)
- dst[i] = float_to_int16_one(src+i);
-}
-
-static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
- int i,j,c;
- if(channels==2){
- for(i=0; i<len; i++){
- dst[2*i] = float_to_int16_one(src[0]+i);
- dst[2*i+1] = float_to_int16_one(src[1]+i);
- }
- }else{
- for(c=0; c<channels; c++)
- for(i=0, j=c; i<len; i++, j+=channels)
- dst[j] = float_to_int16_one(src[c]+i);
- }
-}
-
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
int res = 0;
@@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->vector_fmul_reverse = vector_fmul_reverse_c;
c->vector_fmul_add = vector_fmul_add_c;
c->vector_fmul_window = vector_fmul_window_c;
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
c->vector_clipf = vector_clipf_c;
- c->float_to_int16 = ff_float_to_int16_c;
- c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
c->scalarproduct_int16 = scalarproduct_int16_c;
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
c->scalarproduct_float = scalarproduct_float_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index b942e66..c811186 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -392,7 +392,6 @@ typedef struct DSPContext {
/* assume len is a multiple of 4, and arrays are 16-byte aligned */
void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
/* assume len is a multiple of 8, and arrays are 16-byte aligned */
- void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
/**
* Multiply a vector of floats by a scalar float. Source and
@@ -445,10 +444,6 @@ typedef struct DSPContext {
*/
void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
- /* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
- void (*float_to_int16)(int16_t *dst, const float *src, long len);
- void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);
-
/* (I)DCT */
void (*fdct)(DCTELEM *block/* align 16*/);
void (*fdct248)(DCTELEM *block/* align 16*/);
diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
new file mode 100644
index 0000000..e26b899
--- /dev/null
+++ b/libavcodec/fmtconvert.c
@@ -0,0 +1,68 @@
+/*
+ * Format Conversion Utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "fmtconvert.h"
+
+static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){ /* portable C version: dst[i] = src[i] * mul */
+ int i;
+ for(i=0; i<len; i++)
+ dst[i] = src[i] * mul; /* src[i] is converted to float by the multiplication */
+}
+
+static av_always_inline int float_to_int16_one(const float *src){ /* convert a single sample */
+ return av_clip_int16(lrintf(*src)); /* round with lrintf(), then clip to the int16 range */
+}
+
+static void float_to_int16_c(int16_t *dst, const float *src, long len)
+{ /* portable C version: convert len floats from src into dst */
+ int i; /* NOTE(review): int index against a long len; fine while len fits in int */
+ for(i=0; i<len; i++)
+ dst[i] = float_to_int16_one(src+i);
+}
+
+static void float_to_int16_interleave_c(int16_t *dst, const float **src,
+ long len, int channels)
+{ /* portable C version: src[c][i] is written to dst[i*channels + c] */
+ int i,j,c;
+ if(channels==2){ /* stereo is special-cased: both channels in one pass */
+ for(i=0; i<len; i++){
+ dst[2*i] = float_to_int16_one(src[0]+i);
+ dst[2*i+1] = float_to_int16_one(src[1]+i);
+ }
+ }else{ /* any other channel count: one channel at a time */
+ for(c=0; c<channels; c++)
+ for(i=0, j=c; i<len; i++, j+=channels) /* j walks this channel's slots in dst */
+ dst[j] = float_to_int16_one(src[c]+i);
+ }
+}
+
+av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
+{ /* fill c with the C implementations, then let arch-specific code override them */
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
+ c->float_to_int16 = float_to_int16_c;
+ c->float_to_int16_interleave = float_to_int16_interleave_c;
+ /* Each guard must match the condition under which the arch object file is built. */
+ if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
+ if (HAVE_ALTIVEC) ff_fmt_convert_init_ppc(c, avctx); /* ppc/fmtconvert_altivec.o is built only with HAVE_ALTIVEC */
+ if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx);
+}
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
new file mode 100644
index 0000000..f2ee261
--- /dev/null
+++ b/libavcodec/fmtconvert.h
@@ -0,0 +1,79 @@
+/*
+ * Format Conversion Utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_FMTCONVERT_H
+#define AVCODEC_FMTCONVERT_H
+
+#include "avcodec.h"
+
+typedef struct FmtConvertContext {
+ /**
+ * Convert an array of int32_t to float, multiplying each element by mul.
+ * @param dst destination array of float.
+ * constraints: 16-byte aligned
+ * @param src source array of int32_t.
+ * constraints: 16-byte aligned
+ * @param len number of elements to convert.
+ * constraints: multiple of 8
+ */
+ void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
+
+ /**
+ * Convert an array of float to an array of int16_t.
+ *
+ * Converts floats in the range [-32768.0,32767.0] to ints
+ * without rescaling.
+ *
+ * @param dst destination array of int16_t.
+ * constraints: 16-byte aligned
+ * @param src source array of float.
+ * constraints: 16-byte aligned
+ * @param len number of elements to convert.
+ * constraints: multiple of 8
+ */
+ void (*float_to_int16)(int16_t *dst, const float *src, long len);
+
+ /**
+ * Convert multiple arrays of float to an interleaved array of int16_t.
+ *
+ * Converts floats in the range [-32768.0,32767.0] to ints
+ * without rescaling.
+ *
+ * @param dst destination array of interleaved int16_t.
+ * constraints: 16-byte aligned
+ * @param src source array of float arrays, one for each channel.
+ * constraints: 16-byte aligned
+ * @param len number of elements to convert.
+ * constraints: multiple of 8
+ * @param channels number of channels
+ */
+ void (*float_to_int16_interleave)(int16_t *dst, const float **src,
+ long len, int channels);
+} FmtConvertContext;
+
+void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
+/* Architecture-specific initializers; ff_fmt_convert_init() calls them after setting the C versions. */
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx);
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
+
+#endif /* AVCODEC_FMTCONVERT_H */
diff --git a/libavcodec/nellymoserdec.c b/libavcodec/nellymoserdec.c
index 8b13a5d..80e04ee 100644
--- a/libavcodec/nellymoserdec.c
+++ b/libavcodec/nellymoserdec.c
@@ -38,6 +38,7 @@
#include "avcodec.h"
#include "dsputil.h"
#include "fft.h"
+#include "fmtconvert.h"
#define ALT_BITSTREAM_READER_LE
#include "get_bits.h"
@@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext {
float scale_bias;
DSPContext dsp;
FFTContext imdct_ctx;
+ FmtConvertContext fmt_conv;
DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
} NellyMoserDecodeContext;
@@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) {
ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);
dsputil_init(&s->dsp, avctx);
+ ff_fmt_convert_init(&s->fmt_conv, avctx);
s->scale_bias = 1.0/(1*8);
@@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx,
for (i=0 ; i<blocks ; i++) {
nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf);
- s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
+ s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
*data_size += NELLY_SAMPLES*sizeof(int16_t);
}
diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile
index 9b2358d..35ea0c3 100644
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -21,6 +21,7 @@ ALTIVEC-OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o \
OBJS-$(HAVE_ALTIVEC) += ppc/dsputil_altivec.o \
ppc/fdct_altivec.o \
ppc/float_altivec.o \
+ ppc/fmtconvert_altivec.o \
ppc/gmc_altivec.o \
ppc/idct_altivec.o \
ppc/int_altivec.o \
diff --git a/libavcodec/ppc/float_altivec.c b/libavcodec/ppc/float_altivec.c
index 60bae9a..ba97cbf 100644
--- a/libavcodec/ppc/float_altivec.c
+++ b/libavcodec/ppc/float_altivec.c
@@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa
}
}
-static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
-{
- union {
- vector float v;
- float s[4];
- } mul_u;
- int i;
- vector float src1, src2, dst1, dst2, mul_v, zero;
-
- zero = (vector float)vec_splat_u32(0);
- mul_u.s[0] = mul;
- mul_v = vec_splat(mul_u.v, 0);
-
- for(i=0; i<len; i+=8) {
- src1 = vec_ctf(vec_ld(0, src+i), 0);
- src2 = vec_ctf(vec_ld(16, src+i), 0);
- dst1 = vec_madd(src1, mul_v, zero);
- dst2 = vec_madd(src2, mul_v, zero);
- vec_st(dst1, 0, dst+i);
- vec_st(dst2, 16, dst+i);
- }
-}
-
-
-static vector signed short
-float_to_int16_one_altivec(const float *src)
-{
- vector float s0 = vec_ld(0, src);
- vector float s1 = vec_ld(16, src);
- vector signed int t0 = vec_cts(s0, 0);
- vector signed int t1 = vec_cts(s1, 0);
- return vec_packs(t0,t1);
-}
-
-static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
-{
- int i;
- vector signed short d0, d1, d;
- vector unsigned char align;
- if(((long)dst)&15) //FIXME
- for(i=0; i<len-7; i+=8) {
- d0 = vec_ld(0, dst+i);
- d = float_to_int16_one_altivec(src+i);
- d1 = vec_ld(15, dst+i);
- d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
- align = vec_lvsr(0, dst+i);
- d0 = vec_perm(d1, d, align);
- d1 = vec_perm(d, d1, align);
- vec_st(d0, 0, dst+i);
- vec_st(d1,15, dst+i);
- }
- else
- for(i=0; i<len-7; i+=8) {
- d = float_to_int16_one_altivec(src+i);
- vec_st(d, 0, dst+i);
- }
-}
-
-static void
-float_to_int16_interleave_altivec(int16_t *dst, const float **src,
- long len, int channels)
-{
- int i;
- vector signed short d0, d1, d2, c0, c1, t0, t1;
- vector unsigned char align;
- if(channels == 1)
- float_to_int16_altivec(dst, src[0], len);
- else
- if (channels == 2) {
- if(((long)dst)&15)
- for(i=0; i<len-7; i+=8) {
- d0 = vec_ld(0, dst + i);
- t0 = float_to_int16_one_altivec(src[0] + i);
- d1 = vec_ld(31, dst + i);
- t1 = float_to_int16_one_altivec(src[1] + i);
- c0 = vec_mergeh(t0, t1);
- c1 = vec_mergel(t0, t1);
- d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
- align = vec_lvsr(0, dst + i);
- d0 = vec_perm(d2, c0, align);
- d1 = vec_perm(c0, c1, align);
- vec_st(d0, 0, dst + i);
- d0 = vec_perm(c1, d2, align);
- vec_st(d1, 15, dst + i);
- vec_st(d0, 31, dst + i);
- dst+=8;
- }
- else
- for(i=0; i<len-7; i+=8) {
- t0 = float_to_int16_one_altivec(src[0] + i);
- t1 = float_to_int16_one_altivec(src[1] + i);
- d0 = vec_mergeh(t0, t1);
- d1 = vec_mergel(t0, t1);
- vec_st(d0, 0, dst + i);
- vec_st(d1, 16, dst + i);
- dst+=8;
- }
- } else {
- DECLARE_ALIGNED(16, int16_t, tmp)[len];
- int c, j;
- for (c = 0; c < channels; c++) {
- float_to_int16_altivec(tmp, src[c], len);
- for (i = 0, j = c; i < len; i++, j+=channels) {
- dst[j] = tmp[i];
- }
- }
- }
-}
-
void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
c->vector_fmul = vector_fmul_altivec;
c->vector_fmul_reverse = vector_fmul_reverse_altivec;
c->vector_fmul_add = vector_fmul_add_altivec;
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->vector_fmul_window = vector_fmul_window_altivec;
- c->float_to_int16 = float_to_int16_altivec;
- c->float_to_int16_interleave = float_to_int16_interleave_altivec;
}
}
diff --git a/libavcodec/ppc/fmtconvert_altivec.c b/libavcodec/ppc/fmtconvert_altivec.c
new file mode 100644
index 0000000..e5287c9
--- /dev/null
+++ b/libavcodec/ppc/fmtconvert_altivec.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2006 Luca Barbato <lu_zero at gentoo.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/fmtconvert.h"
+
+#include "dsputil_altivec.h"
+#include "util_altivec.h"
+
+static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
+{ /* AltiVec version of dst[i] = src[i] * mul */
+ union { /* lets the scalar mul be written into element 0 of a vector */
+ vector float v;
+ float s[4];
+ } mul_u;
+ int i;
+ vector float src1, src2, dst1, dst2, mul_v, zero;
+ /* zero is the addend for vec_madd below; mul_v holds mul in every element */
+ zero = (vector float)vec_splat_u32(0);
+ mul_u.s[0] = mul;
+ mul_v = vec_splat(mul_u.v, 0);
+ /* two vectors, i.e. eight elements, per iteration */
+ for(i=0; i<len; i+=8) {
+ src1 = vec_ctf(vec_ld(0, src+i), 0); /* int32 -> float, scale argument 0 */
+ src2 = vec_ctf(vec_ld(16, src+i), 0);
+ dst1 = vec_madd(src1, mul_v, zero); /* src * mul + 0 */
+ dst2 = vec_madd(src2, mul_v, zero);
+ vec_st(dst1, 0, dst+i);
+ vec_st(dst2, 16, dst+i);
+ }
+}
+
+
+static vector signed short
+float_to_int16_one_altivec(const float *src)
+{ /* convert the 8 floats starting at src into one vector of 8 int16 */
+ vector float s0 = vec_ld(0, src);
+ vector float s1 = vec_ld(16, src);
+ vector signed int t0 = vec_cts(s0, 0); /* float -> int32, scale argument 0 */
+ vector signed int t1 = vec_cts(s1, 0);
+ return vec_packs(t0,t1); /* saturating pack of 2 x 4 int32 into 8 int16 */
+}
+
+static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
+{ /* converts whole groups of 8 samples; a remainder of len % 8 is not written */
+ int i;
+ vector signed short d0, d1, d;
+ vector unsigned char align;
+ if(((long)dst)&15) //FIXME (dst is not 16-byte aligned: read-modify-write path)
+ for(i=0; i<len-7; i+=8) {
+ d0 = vec_ld(0, dst+i); /* existing contents of the two vectors that dst+i straddles */
+ d = float_to_int16_one_altivec(src+i);
+ d1 = vec_ld(15, dst+i);
+ d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
+ align = vec_lvsr(0, dst+i);
+ d0 = vec_perm(d1, d, align); /* merge the new samples with the old bytes */
+ d1 = vec_perm(d, d1, align);
+ vec_st(d0, 0, dst+i);
+ vec_st(d1,15, dst+i);
+ }
+ else /* aligned dst: one vector store per 8 samples */
+ for(i=0; i<len-7; i+=8) {
+ d = float_to_int16_one_altivec(src+i);
+ vec_st(d, 0, dst+i);
+ }
+}
+
+static void
+float_to_int16_interleave_altivec(int16_t *dst, const float **src,
+ long len, int channels)
+{ /* AltiVec float -> interleaved int16; dedicated paths for 1 and 2 channels */
+ int i;
+ vector signed short d0, d1, d2, c0, c1, t0, t1;
+ vector unsigned char align;
+ if(channels == 1) /* mono: nothing to interleave */
+ float_to_int16_altivec(dst, src[0], len);
+ else
+ if (channels == 2) { /* stereo: interleave with vec_mergeh/vec_mergel */
+ if(((long)dst)&15) /* dst is not 16-byte aligned: read-modify-write path */
+ for(i=0; i<len-7; i+=8) {
+ d0 = vec_ld(0, dst + i);
+ t0 = float_to_int16_one_altivec(src[0] + i);
+ d1 = vec_ld(31, dst + i);
+ t1 = float_to_int16_one_altivec(src[1] + i);
+ c0 = vec_mergeh(t0, t1); /* first four samples of each channel, interleaved */
+ c1 = vec_mergel(t0, t1); /* last four samples of each channel, interleaved */
+ d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
+ align = vec_lvsr(0, dst + i);
+ d0 = vec_perm(d2, c0, align);
+ d1 = vec_perm(c0, c1, align);
+ vec_st(d0, 0, dst + i);
+ d0 = vec_perm(c1, d2, align);
+ vec_st(d1, 15, dst + i);
+ vec_st(d0, 31, dst + i);
+ dst+=8; /* together with i+=8, dst + i advances 16 int16 per pass */
+ }
+ else /* aligned dst: two plain vector stores per pass */
+ for(i=0; i<len-7; i+=8) {
+ t0 = float_to_int16_one_altivec(src[0] + i);
+ t1 = float_to_int16_one_altivec(src[1] + i);
+ d0 = vec_mergeh(t0, t1);
+ d1 = vec_mergel(t0, t1);
+ vec_st(d0, 0, dst + i);
+ vec_st(d1, 16, dst + i);
+ dst+=8; /* together with i+=8, dst + i advances 16 int16 per pass */
+ }
+ } else { /* other channel counts: convert one channel, then scatter it into dst */
+ DECLARE_ALIGNED(16, int16_t, tmp)[len]; /* NOTE(review): variable-length stack buffer sized by len */
+ int c, j;
+ for (c = 0; c < channels; c++) {
+ float_to_int16_altivec(tmp, src[c], len);
+ for (i = 0, j = c; i < len; i++, j+=channels) {
+ dst[j] = tmp[i];
+ }
+ }
+ }
+}
+
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx)
+{ /* NOTE(review): no runtime AltiVec detection is visible here; confirm the caller or build guarantees it */
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; /* installed unconditionally */
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)) { /* float -> int16 versions only when bit-exact output is not requested */
+ c->float_to_int16 = float_to_int16_altivec;
+ c->float_to_int16_interleave = float_to_int16_interleave_altivec;
+ }
+}
diff --git a/libavcodec/vorbis_dec.c b/libavcodec/vorbis_dec.c
index 9fef5eb..bca56ba 100644
--- a/libavcodec/vorbis_dec.c
+++ b/libavcodec/vorbis_dec.c
@@ -31,6 +31,7 @@
#include "get_bits.h"
#include "dsputil.h"
#include "fft.h"
+#include "fmtconvert.h"
#include "vorbis.h"
#include "xiph.h"
@@ -127,6 +128,7 @@ typedef struct vorbis_context_s {
AVCodecContext *avccontext;
GetBitContext gb;
DSPContext dsp;
+ FmtConvertContext fmt_conv;
FFTContext mdct[2];
uint_fast8_t first_frame;
@@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext)
vc->avccontext = avccontext;
dsputil_init(&vc->dsp, avccontext);
+ ff_fmt_convert_init(&vc->fmt_conv, avccontext);
vc->scale_bias = 32768.0f;
@@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext,
len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i];
}
- vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels);
+ vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len,
+ vc->audio_channels);
*data_size = len * 2 * vc->audio_channels;
return buf_size ;
diff --git a/libavcodec/wma.c b/libavcodec/wma.c
index e0b9b68..a7eacb8 100644
--- a/libavcodec/wma.c
+++ b/libavcodec/wma.c
@@ -126,6 +126,7 @@ int ff_wma_init(AVCodecContext *avctx, int flags2)
s->block_align = avctx->block_align;
dsputil_init(&s->dsp, avctx);
+ ff_fmt_convert_init(&s->fmt_conv, avctx);
if (avctx->codec->id == CODEC_ID_WMAV1) {
s->version = 1;
diff --git a/libavcodec/wma.h b/libavcodec/wma.h
index 11274ad..a51b3e8 100644
--- a/libavcodec/wma.h
+++ b/libavcodec/wma.h
@@ -26,6 +26,7 @@
#include "put_bits.h"
#include "dsputil.h"
#include "fft.h"
+#include "fmtconvert.h"
/* size of blocks */
#define BLOCK_MIN_BITS 7
@@ -134,6 +135,7 @@ typedef struct WMACodecContext {
float lsp_pow_m_table1[(1 << LSP_POW_BITS)];
float lsp_pow_m_table2[(1 << LSP_POW_BITS)];
DSPContext dsp;
+ FmtConvertContext fmt_conv;
#ifdef TRACE
int frame_count;
diff --git a/libavcodec/wmadec.c b/libavcodec/wmadec.c
index d85d80d..83f8dea 100644
--- a/libavcodec/wmadec.c
+++ b/libavcodec/wmadec.c
@@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples)
incr = s->nb_channels;
for (ch = 0; ch < MAX_CHANNELS; ch++)
output[ch] = s->frame_out[ch];
- s->dsp.float_to_int16_interleave(samples, output, n, incr);
+ s->fmt_conv.float_to_int16_interleave(samples, output, n, incr);
for (ch = 0; ch < incr; ch++) {
/* prepare for next block */
memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float));
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 943edcb..83cec00 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o
MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o
MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \
x86/deinterlace.o \
+ x86/fmtconvert.o \
x86/h264_chromamc.o \
$(YASM-OBJS-yes)
@@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT) += x86/fft.o
OBJS-$(HAVE_MMX) += x86/dnxhd_mmx.o \
x86/dsputil_mmx.o \
x86/fdct_mmx.o \
+ x86/fmtconvert_mmx.o \
x86/idct_mmx_xvid.o \
x86/idct_sse2_xvid.o \
x86/motion_est_mmx.o \
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 2eb7d85..39bf3f2 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s
}
#endif /* HAVE_6REGS */
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
-{
- x86_reg i = -4*len;
- __asm__ volatile(
- "movss %3, %%xmm4 \n"
- "shufps $0, %%xmm4, %%xmm4 \n"
- "1: \n"
- "cvtpi2ps (%2,%0), %%xmm0 \n"
- "cvtpi2ps 8(%2,%0), %%xmm1 \n"
- "cvtpi2ps 16(%2,%0), %%xmm2 \n"
- "cvtpi2ps 24(%2,%0), %%xmm3 \n"
- "movlhps %%xmm1, %%xmm0 \n"
- "movlhps %%xmm3, %%xmm2 \n"
- "mulps %%xmm4, %%xmm0 \n"
- "mulps %%xmm4, %%xmm2 \n"
- "movaps %%xmm0, (%1,%0) \n"
- "movaps %%xmm2, 16(%1,%0) \n"
- "add $32, %0 \n"
- "jl 1b \n"
- :"+r"(i)
- :"r"(dst+len), "r"(src+len), "m"(mul)
- );
-}
-
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
-{
- x86_reg i = -4*len;
- __asm__ volatile(
- "movss %3, %%xmm4 \n"
- "shufps $0, %%xmm4, %%xmm4 \n"
- "1: \n"
- "cvtdq2ps (%2,%0), %%xmm0 \n"
- "cvtdq2ps 16(%2,%0), %%xmm1 \n"
- "mulps %%xmm4, %%xmm0 \n"
- "mulps %%xmm4, %%xmm1 \n"
- "movaps %%xmm0, (%1,%0) \n"
- "movaps %%xmm1, 16(%1,%0) \n"
- "add $32, %0 \n"
- "jl 1b \n"
- :"+r"(i)
- :"r"(dst+len), "r"(src+len), "m"(mul)
- );
-}
-
static void vector_clipf_sse(float *dst, const float *src, float min, float max,
int len)
{
@@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max,
);
}
-static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- // not bit-exact: pf2id uses different rounding than C and SSE
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "pf2id (%2,%0,2) , %%mm0 \n\t"
- "pf2id 8(%2,%0,2) , %%mm1 \n\t"
- "pf2id 16(%2,%0,2) , %%mm2 \n\t"
- "pf2id 24(%2,%0,2) , %%mm3 \n\t"
- "packssdw %%mm1 , %%mm0 \n\t"
- "packssdw %%mm3 , %%mm2 \n\t"
- "movq %%mm0 , (%1,%0) \n\t"
- "movq %%mm2 , 8(%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- "femms \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
-static void float_to_int16_sse(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
- "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
- "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
- "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
- "packssdw %%mm1 , %%mm0 \n\t"
- "packssdw %%mm3 , %%mm2 \n\t"
- "movq %%mm0 , (%1,%0) \n\t"
- "movq %%mm2 , 8(%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- "emms \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
-
-static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
- "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
- "packssdw %%xmm1 , %%xmm0 \n\t"
- "movdqa %%xmm0 , (%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
-
void ff_vp3_idct_mmx(int16_t *input_data);
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
@@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data);
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
-void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
@@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
-#if !HAVE_YASM
-#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#endif
-#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
-
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
-/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
-static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
- DECLARE_ALIGNED(16, int16_t, tmp)[len];\
- int i,j,c;\
- for(c=0; c<channels; c++){\
- float_to_int16_##cpu(tmp, src[c], len);\
- for(i=0, j=c; i<len; i++, j+=channels)\
- dst[j] = tmp[i];\
- }\
-}\
-\
-static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
- if(channels==1)\
- float_to_int16_##cpu(dst, src[0], len);\
- else if(channels==2){\
- x86_reg reglen = len; \
- const float *src0 = src[0];\
- const float *src1 = src[1];\
- __asm__ volatile(\
- "shl $2, %0 \n"\
- "add %0, %1 \n"\
- "add %0, %2 \n"\
- "add %0, %3 \n"\
- "neg %0 \n"\
- body\
- :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
- );\
- }else if(channels==6){\
- ff_float_to_int16_interleave6_##cpu(dst, src, len);\
- }else\
- float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
-}
-
-FLOAT_TO_INT16_INTERLEAVE(3dnow,
- "1: \n"
- "pf2id (%2,%0), %%mm0 \n"
- "pf2id 8(%2,%0), %%mm1 \n"
- "pf2id (%3,%0), %%mm2 \n"
- "pf2id 8(%3,%0), %%mm3 \n"
- "packssdw %%mm1, %%mm0 \n"
- "packssdw %%mm3, %%mm2 \n"
- "movq %%mm0, %%mm1 \n"
- "punpcklwd %%mm2, %%mm0 \n"
- "punpckhwd %%mm2, %%mm1 \n"
- "movq %%mm0, (%1,%0)\n"
- "movq %%mm1, 8(%1,%0)\n"
- "add $16, %0 \n"
- "js 1b \n"
- "femms \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse,
- "1: \n"
- "cvtps2pi (%2,%0), %%mm0 \n"
- "cvtps2pi 8(%2,%0), %%mm1 \n"
- "cvtps2pi (%3,%0), %%mm2 \n"
- "cvtps2pi 8(%3,%0), %%mm3 \n"
- "packssdw %%mm1, %%mm0 \n"
- "packssdw %%mm3, %%mm2 \n"
- "movq %%mm0, %%mm1 \n"
- "punpcklwd %%mm2, %%mm0 \n"
- "punpckhwd %%mm2, %%mm1 \n"
- "movq %%mm0, (%1,%0)\n"
- "movq %%mm1, 8(%1,%0)\n"
- "add $16, %0 \n"
- "js 1b \n"
- "emms \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse2,
- "1: \n"
- "cvtps2dq (%2,%0), %%xmm0 \n"
- "cvtps2dq (%3,%0), %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "movhlps %%xmm0, %%xmm1 \n"
- "punpcklwd %%xmm1, %%xmm0 \n"
- "movdqa %%xmm0, (%1,%0) \n"
- "add $16, %0 \n"
- "js 1b \n"
-)
-
-static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
- if(channels==6)
- ff_float_to_int16_interleave6_3dn2(dst, src, len);
- else
- float_to_int16_interleave_3dnow(dst, src, len, channels);
-}
-
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
@@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
if(mm_flags & AV_CPU_FLAG_3DNOW){
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
c->vector_fmul = vector_fmul_3dnow;
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->float_to_int16 = float_to_int16_3dnow;
- c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
- }
}
if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
- }
}
if(mm_flags & AV_CPU_FLAG_MMX2){
#if HAVE_YASM
@@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_sse;
#endif
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
c->vector_clipf = vector_clipf_sse;
- c->float_to_int16 = float_to_int16_sse;
- c->float_to_int16_interleave = float_to_int16_interleave_sse;
#if HAVE_YASM
c->scalarproduct_float = ff_scalarproduct_float_sse;
#endif
@@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
if(mm_flags & AV_CPU_FLAG_3DNOW)
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
if(mm_flags & AV_CPU_FLAG_SSE2){
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
- c->float_to_int16 = float_to_int16_sse2;
- c->float_to_int16_interleave = float_to_int16_interleave_sse2;
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 099f0a8..b1b37e1 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
section .text align=16
-%macro PSWAPD_SSE 2
- pshufw %1, %2, 0x4e
-%endmacro
-%macro PSWAPD_3DN1 2
- movq %1, %2
- psrlq %1, 32
- punpckldq %1, %2
-%endmacro
-
-%macro FLOAT_TO_INT16_INTERLEAVE6 1
-; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
-cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
-%ifdef ARCH_X86_64
- %define lend r10d
- mov lend, r2d
-%else
- %define lend dword r2m
-%endif
- mov src1q, [srcq+1*gprsize]
- mov src2q, [srcq+2*gprsize]
- mov src3q, [srcq+3*gprsize]
- mov src4q, [srcq+4*gprsize]
- mov src5q, [srcq+5*gprsize]
- mov srcq, [srcq]
- sub src1q, srcq
- sub src2q, srcq
- sub src3q, srcq
- sub src4q, srcq
- sub src5q, srcq
-.loop:
- cvtps2pi mm0, [srcq]
- cvtps2pi mm1, [srcq+src1q]
- cvtps2pi mm2, [srcq+src2q]
- cvtps2pi mm3, [srcq+src3q]
- cvtps2pi mm4, [srcq+src4q]
- cvtps2pi mm5, [srcq+src5q]
- packssdw mm0, mm3
- packssdw mm1, mm4
- packssdw mm2, mm5
- pswapd mm3, mm0
- punpcklwd mm0, mm1
- punpckhwd mm1, mm2
- punpcklwd mm2, mm3
- pswapd mm3, mm0
- punpckldq mm0, mm2
- punpckhdq mm2, mm1
- punpckldq mm1, mm3
- movq [dstq ], mm0
- movq [dstq+16], mm2
- movq [dstq+ 8], mm1
- add srcq, 8
- add dstq, 24
- sub lend, 2
- jg .loop
- emms
- RET
-%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
-
-%define pswapd PSWAPD_SSE
-FLOAT_TO_INT16_INTERLEAVE6 sse
-%define cvtps2pi pf2id
-%define pswapd PSWAPD_3DN1
-FLOAT_TO_INT16_INTERLEAVE6 3dnow
-%undef pswapd
-FLOAT_TO_INT16_INTERLEAVE6 3dn2
-%undef cvtps2pi
-
-
-
%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
new file mode 100644
index 0000000..6c744fc
--- /dev/null
+++ b/libavcodec/x86/fmtconvert.asm
@@ -0,0 +1,91 @@
+;******************************************************************************
+;* x86 optimized Format Conversion Utils
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+
+section .text align=16
+
+%macro PSWAPD_SSE 2 ; %1 = %2 with its two dwords swapped (stand-in for pswapd, built on pshufw)
+ pshufw %1, %2, 0x4e ; 0x4e picks source words 2,3,0,1
+%endmacro
+%macro PSWAPD_3DN1 2 ; same dword swap using only shift/unpack; %1 and %2 must be different registers
+ movq %1, %2
+ psrlq %1, 32 ; high dword of %2 -> low dword of %1
+ punpckldq %1, %2 ; low dword of %2 -> high dword of %1
+%endmacro
+
+%macro FLOAT_TO_INT16_INTERLEAVE6 1 ; %1 = cpu suffix appended to the function name
+; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
+cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
+%ifdef ARCH_X86_64
+ %define lend r10d
+ mov lend, r2d ; copy len (argument index 2) into r10d
+%else
+ %define lend dword r2m ; use len directly from its argument slot
+%endif
+ mov src1q, [srcq+1*gprsize] ; src1 = src[1]
+ mov src2q, [srcq+2*gprsize] ; src2 = src[2]
+ mov src3q, [srcq+3*gprsize] ; src3 = src[3]
+ mov src4q, [srcq+4*gprsize] ; src4 = src[4]
+ mov src5q, [srcq+5*gprsize] ; src5 = src[5]
+ mov srcq, [srcq] ; src = src[0] (the pointer array is no longer needed)
+ sub src1q, srcq ; src1..src5 become byte offsets relative to src[0],
+ sub src2q, srcq ; so advancing srcq alone advances all six channels
+ sub src3q, srcq
+ sub src4q, srcq
+ sub src5q, srcq
+.loop: ; each pass converts 2 samples from each of the 6 channels
+ cvtps2pi mm0, [srcq] ; 2 floats -> 2 int32 (redefined to pf2id for the 3dnow/3dn2 builds)
+ cvtps2pi mm1, [srcq+src1q]
+ cvtps2pi mm2, [srcq+src2q]
+ cvtps2pi mm3, [srcq+src3q]
+ cvtps2pi mm4, [srcq+src4q]
+ cvtps2pi mm5, [srcq+src5q]
+ packssdw mm0, mm3 ; saturate to int16; mm0 = channels 0 and 3
+ packssdw mm1, mm4 ; mm1 = channels 1 and 4
+ packssdw mm2, mm5 ; mm2 = channels 2 and 5
+ pswapd mm3, mm0 ; the unpack/swap sequence below reorders the 12 words for the stores
+ punpcklwd mm0, mm1
+ punpckhwd mm1, mm2
+ punpcklwd mm2, mm3
+ pswapd mm3, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm2, mm1
+ punpckldq mm1, mm3
+ movq [dstq ], mm0 ; three 8-byte stores = 12 int16 = 2 frames of 6 channels
+ movq [dstq+16], mm2
+ movq [dstq+ 8], mm1
+ add srcq, 8 ; 2 floats consumed per channel
+ add dstq, 24 ; 12 int16 written
+ sub lend, 2 ; 2 samples per channel done
+ jg .loop ; test is at the bottom: the body runs at least once
+ emms ; leave MMX state before returning
+ RET
+%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
+
+%define pswapd PSWAPD_SSE ; sse build: pswapd expands to the pshufw-based macro
+FLOAT_TO_INT16_INTERLEAVE6 sse
+%define cvtps2pi pf2id ; 3dnow builds: convert with pf2id instead of cvtps2pi
+%define pswapd PSWAPD_3DN1 ; 3dnow build: pswapd expands to the shift/unpack macro
+FLOAT_TO_INT16_INTERLEAVE6 3dnow
+%undef pswapd ; 3dn2 build: pswapd is assembled as the real instruction
+FLOAT_TO_INT16_INTERLEAVE6 3dn2
+%undef cvtps2pi ; restore the real mnemonic for anything assembled after this
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
new file mode 100644
index 0000000..ea41f73
--- /dev/null
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -0,0 +1,266 @@
+/*
+ * Format Conversion Utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k at mail.ru>
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/fmtconvert.h"
+
+static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
+{ /* dst[k] = (float)src[k] * mul; 8 elements per loop pass, body runs before the test */
+ x86_reg i = -4*len; /* negative byte offset from the end of both buffers, counts up to 0 */
+ __asm__ volatile(
+ "movss %3, %%xmm4 \n" /* xmm4[0] = mul */
+ "shufps $0, %%xmm4, %%xmm4 \n" /* broadcast mul to all four lanes */
+ "1: \n"
+ "cvtpi2ps (%2,%0), %%xmm0 \n" /* each cvtpi2ps converts 2 int32 into the low 2 lanes */
+ "cvtpi2ps 8(%2,%0), %%xmm1 \n"
+ "cvtpi2ps 16(%2,%0), %%xmm2 \n"
+ "cvtpi2ps 24(%2,%0), %%xmm3 \n"
+ "movlhps %%xmm1, %%xmm0 \n" /* xmm0 = elements 0..3 */
+ "movlhps %%xmm3, %%xmm2 \n" /* xmm2 = elements 4..7 */
+ "mulps %%xmm4, %%xmm0 \n"
+ "mulps %%xmm4, %%xmm2 \n"
+ "movaps %%xmm0, (%1,%0) \n" /* aligned store: dst must be 16-byte aligned */
+ "movaps %%xmm2, 16(%1,%0) \n"
+ "add $32, %0 \n" /* 32 bytes = 8 elements */
+ "jl 1b \n"
+ :"+r"(i)
+ :"r"(dst+len), "r"(src+len), "m"(mul) /* %1/%2 point one past the end of dst/src */
+ ); /* NOTE(review): no clobber list although xmm0-xmm4 and *dst are written */
+}
+
+static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
+{ /* dst[k] = (float)src[k] * mul; 8 elements per loop pass, body runs before the test */
+ x86_reg i = -4*len; /* negative byte offset from the end of both buffers, counts up to 0 */
+ __asm__ volatile(
+ "movss %3, %%xmm4 \n" /* xmm4[0] = mul */
+ "shufps $0, %%xmm4, %%xmm4 \n" /* broadcast mul to all four lanes */
+ "1: \n"
+ "cvtdq2ps (%2,%0), %%xmm0 \n" /* 4 int32 -> 4 floats; memory operand must be 16-byte aligned */
+ "cvtdq2ps 16(%2,%0), %%xmm1 \n"
+ "mulps %%xmm4, %%xmm0 \n"
+ "mulps %%xmm4, %%xmm1 \n"
+ "movaps %%xmm0, (%1,%0) \n" /* aligned store: dst must be 16-byte aligned */
+ "movaps %%xmm1, 16(%1,%0) \n"
+ "add $32, %0 \n" /* 32 bytes = 8 elements */
+ "jl 1b \n"
+ :"+r"(i)
+ :"r"(dst+len), "r"(src+len), "m"(mul) /* %1/%2 point one past the end of dst/src */
+ ); /* NOTE(review): no clobber list although xmm0, xmm1, xmm4 and *dst are written */
+}
+
+static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ /* float -> saturated int16, 8 samples per pass */
+ x86_reg reglen = len;
+ // not bit-exact: pf2id uses different rounding than C and SSE
+ __asm__ volatile(
+ "add %0 , %0 \n\t" /* reglen = 2*len = size of dst in bytes */
+ "lea (%2,%0,2) , %2 \n\t" /* src += 4*len bytes -> end of src */
+ "add %0 , %1 \n\t" /* dst -> end of dst */
+ "neg %0 \n\t" /* index runs from -2*len up to 0 */
+ "1: \n\t"
+ "pf2id (%2,%0,2) , %%mm0 \n\t" /* index is scaled by 2 for src: floats are twice as wide as int16 */
+ "pf2id 8(%2,%0,2) , %%mm1 \n\t"
+ "pf2id 16(%2,%0,2) , %%mm2 \n\t"
+ "pf2id 24(%2,%0,2) , %%mm3 \n\t"
+ "packssdw %%mm1 , %%mm0 \n\t" /* int32 -> int16 with signed saturation */
+ "packssdw %%mm3 , %%mm2 \n\t"
+ "movq %%mm0 , (%1,%0) \n\t"
+ "movq %%mm2 , 8(%1,%0) \n\t"
+ "add $16 , %0 \n\t" /* 16 bytes of dst = 8 samples */
+ " js 1b \n\t" /* loop while the index is still negative */
+ "femms \n\t" /* leave MMX state */
+ :"+r"(reglen), "+r"(dst), "+r"(src)
+ );
+}
+
+static void float_to_int16_sse(int16_t *dst, const float *src, long len){ /* float -> saturated int16, 8 samples per pass */
+ x86_reg reglen = len;
+ __asm__ volatile(
+ "add %0 , %0 \n\t" /* reglen = 2*len = size of dst in bytes */
+ "lea (%2,%0,2) , %2 \n\t" /* src += 4*len bytes -> end of src */
+ "add %0 , %1 \n\t" /* dst -> end of dst */
+ "neg %0 \n\t" /* index runs from -2*len up to 0 */
+ "1: \n\t"
+ "cvtps2pi (%2,%0,2) , %%mm0 \n\t" /* 2 floats -> 2 int32 in an MMX register */
+ "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
+ "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
+ "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
+ "packssdw %%mm1 , %%mm0 \n\t" /* int32 -> int16 with signed saturation */
+ "packssdw %%mm3 , %%mm2 \n\t"
+ "movq %%mm0 , (%1,%0) \n\t"
+ "movq %%mm2 , 8(%1,%0) \n\t"
+ "add $16 , %0 \n\t" /* 16 bytes of dst = 8 samples */
+ " js 1b \n\t" /* loop while the index is still negative */
+ "emms \n\t" /* leave MMX state */
+ :"+r"(reglen), "+r"(dst), "+r"(src)
+ );
+}
+
+static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ /* float -> saturated int16, 8 samples per pass */
+ x86_reg reglen = len;
+ __asm__ volatile(
+ "add %0 , %0 \n\t" /* reglen = 2*len = size of dst in bytes */
+ "lea (%2,%0,2) , %2 \n\t" /* src += 4*len bytes -> end of src */
+ "add %0 , %1 \n\t" /* dst -> end of dst */
+ "neg %0 \n\t" /* index runs from -2*len up to 0 */
+ "1: \n\t"
+ "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" /* 4 floats -> 4 int32; memory operand must be 16-byte aligned */
+ "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
+ "packssdw %%xmm1 , %%xmm0 \n\t" /* 8 int32 -> 8 int16 with signed saturation */
+ "movdqa %%xmm0 , (%1,%0) \n\t" /* aligned store: dst must be 16-byte aligned */
+ "add $16 , %0 \n\t" /* 16 bytes of dst = 8 samples */
+ " js 1b \n\t" /* loop while the index is still negative */
+ :"+r"(reglen), "+r"(dst), "+r"(src)
+ );
+}
+
+void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); /* 6-channel interleavers, named to match the cglobal symbols in fmtconvert.asm */
+void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
+void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+
+#if !HAVE_YASM /* without yasm, route 6-channel calls to the generic per-channel path */
+#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
+#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
+#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) /* no 3dn2-specific generic path: reuse 3dnow */
+#endif
+#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse /* no separate sse2 routine: the sse2 variant reuses the sse one */
+
+#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) /* defines float_to_int16_interleave_<cpu> and its _misc_ helper; body = asm loop for the 2-channel case */ \
+/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
+static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
+ DECLARE_ALIGNED(16, int16_t, tmp)[len]; /* variable-length scratch buffer holding one converted channel */\
+ int i,j,c;\
+ for(c=0; c<channels; c++){\
+ float_to_int16_##cpu(tmp, src[c], len); /* convert channel c into tmp */\
+ for(i=0, j=c; i<len; i++, j+=channels) /* scatter tmp into dst with a stride of `channels` */\
+ dst[j] = tmp[i];\
+ }\
+}\
+\
+static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
+ if(channels==1) /* mono: nothing to interleave */\
+ float_to_int16_##cpu(dst, src[0], len);\
+ else if(channels==2){\
+ x86_reg reglen = len; \
+ const float *src0 = src[0];\
+ const float *src1 = src[1];\
+ __asm__ volatile(\
+ "shl $2, %0 \n" /* reglen = 4*len: byte size of each src and of the stereo dst */\
+ "add %0, %1 \n" /* move dst, src0, src1 to their ends */\
+ "add %0, %2 \n"\
+ "add %0, %3 \n"\
+ "neg %0 \n" /* one negative index, counting up to 0, addresses all three buffers */\
+ body\
+ :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
+ );\
+ }else if(channels==6){\
+ ff_float_to_int16_interleave6_##cpu(dst, src, len); /* dedicated 6-channel routine (or its macro fallback) */\
+ }else\
+ float_to_int16_interleave_misc_##cpu(dst, src, len, channels); /* any other channel count: generic path */\
+}
+
+FLOAT_TO_INT16_INTERLEAVE(3dnow, /* stereo loop body: 4 samples per channel per pass */
+ "1: \n"
+ "pf2id (%2,%0), %%mm0 \n" /* %2 = src0: samples 0,1 */
+ "pf2id 8(%2,%0), %%mm1 \n" /* src0: samples 2,3 */
+ "pf2id (%3,%0), %%mm2 \n" /* %3 = src1: samples 0,1 */
+ "pf2id 8(%3,%0), %%mm3 \n" /* src1: samples 2,3 */
+ "packssdw %%mm1, %%mm0 \n" /* mm0 = 4 int16 from src0 */
+ "packssdw %%mm3, %%mm2 \n" /* mm2 = 4 int16 from src1 */
+ "movq %%mm0, %%mm1 \n"
+ "punpcklwd %%mm2, %%mm0 \n" /* interleave low halves: frames 0,1 */
+ "punpckhwd %%mm2, %%mm1 \n" /* interleave high halves: frames 2,3 */
+ "movq %%mm0, (%1,%0)\n"
+ "movq %%mm1, 8(%1,%0)\n"
+ "add $16, %0 \n" /* 16 bytes in every buffer */
+ "js 1b \n"
+ "femms \n" /* leave MMX state */
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse, /* stereo loop body: 4 samples per channel per pass */
+ "1: \n"
+ "cvtps2pi (%2,%0), %%mm0 \n" /* %2 = src0: samples 0,1 */
+ "cvtps2pi 8(%2,%0), %%mm1 \n" /* src0: samples 2,3 */
+ "cvtps2pi (%3,%0), %%mm2 \n" /* %3 = src1: samples 0,1 */
+ "cvtps2pi 8(%3,%0), %%mm3 \n" /* src1: samples 2,3 */
+ "packssdw %%mm1, %%mm0 \n" /* mm0 = 4 int16 from src0 */
+ "packssdw %%mm3, %%mm2 \n" /* mm2 = 4 int16 from src1 */
+ "movq %%mm0, %%mm1 \n"
+ "punpcklwd %%mm2, %%mm0 \n" /* interleave low halves: frames 0,1 */
+ "punpckhwd %%mm2, %%mm1 \n" /* interleave high halves: frames 2,3 */
+ "movq %%mm0, (%1,%0)\n"
+ "movq %%mm1, 8(%1,%0)\n"
+ "add $16, %0 \n" /* 16 bytes in every buffer */
+ "js 1b \n"
+ "emms \n" /* leave MMX state */
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse2, /* stereo loop body: 4 samples per channel per pass */
+ "1: \n"
+ "cvtps2dq (%2,%0), %%xmm0 \n" /* 4 floats from src0; memory operand must be 16-byte aligned */
+ "cvtps2dq (%3,%0), %%xmm1 \n" /* 4 floats from src1 */
+ "packssdw %%xmm1, %%xmm0 \n" /* xmm0 = [src0 x4 | src1 x4] as int16 */
+ "movhlps %%xmm0, %%xmm1 \n" /* low half of xmm1 = the 4 src1 samples */
+ "punpcklwd %%xmm1, %%xmm0 \n" /* interleave into 4 stereo frames */
+ "movdqa %%xmm0, (%1,%0) \n" /* aligned store: dst must be 16-byte aligned */
+ "add $16, %0 \n" /* 16 bytes in every buffer */
+ "js 1b \n"
+)
+
+static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ /* 3dn2 differs from 3dnow only for 6 channels */
+ if(channels==6)
+ ff_float_to_int16_interleave6_3dn2(dst, src, len); /* 3dn2-specific 6-channel routine */
+ else
+ float_to_int16_interleave_3dnow(dst, src, len, channels); /* every other channel count: plain 3dnow path */
+}
+
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
+{ /* installs x86 converters into c; later blocks overwrite pointers set by earlier ones */
+ int mm_flags = av_get_cpu_flags();
+
+ if (mm_flags & AV_CPU_FLAG_MMX) { /* nothing is installed unless MMX is reported */
+
+ if(mm_flags & AV_CPU_FLAG_3DNOW){
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ /* 3dnow converters are skipped when bit-exact output is requested */
+ c->float_to_int16 = float_to_int16_3dnow;
+ c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
+ }
+ }
+ if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ /* same bit-exact restriction as plain 3dnow */
+ c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
+ }
+ }
+ if(mm_flags & AV_CPU_FLAG_SSE){ /* overrides any 3dnow choice made above */
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
+ c->float_to_int16 = float_to_int16_sse;
+ c->float_to_int16_interleave = float_to_int16_interleave_sse;
+ }
+ if(mm_flags & AV_CPU_FLAG_SSE2){ /* checked last, so it wins when available */
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
+ c->float_to_int16 = float_to_int16_sse2;
+ c->float_to_int16_interleave = float_to_int16_interleave_sse2;
+ }
+ }
+}
--
1.7.3.5
More information about the ffmpeg-devel
mailing list