[FFmpeg-cvslog] r14205 - in trunk: libavcodec/dsputil.c libavcodec/dsputil.h libavcodec/i386/dsputil_mmx.c libavcodec/vorbis_dec.c libavutil/x86_cpu.h
lorenm
subversion
Sun Jul 13 16:56:01 CEST 2008
Author: lorenm
Date: Sun Jul 13 16:56:01 2008
New Revision: 14205
Log:
simplify vorbis windowing
Modified:
trunk/libavcodec/dsputil.c
trunk/libavcodec/dsputil.h
trunk/libavcodec/i386/dsputil_mmx.c
trunk/libavcodec/vorbis_dec.c
trunk/libavutil/x86_cpu.h
Modified: trunk/libavcodec/dsputil.c
==============================================================================
--- trunk/libavcodec/dsputil.c (original)
+++ trunk/libavcodec/dsputil.c Sun Jul 13 16:56:01 2008
@@ -3930,17 +3930,40 @@ void ff_vector_fmul_add_add_c(float *dst
dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}
+void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){ /* reference C windowing: dst = src0*reversed(win) + src1*win + add_bias, for len samples */
+ int i;
+ for(i=0; i<len; i++)
+ dst[i] = src0[i]*win[len-i-1] + src1[i]*win[i] + add_bias; /* src0 is weighted by the window read backwards, src1 by the window read forwards */
+}
+
+static av_always_inline int float_to_int16_one(const float *src){ /* convert one pre-biased float to a 16-bit sample using integer operations on its bit pattern */
+ int_fast32_t tmp = *(const int32_t*)src; /* reinterpret the float's bits as an integer; no float arithmetic is done */
+ if(tmp & 0xf0000){ /* bits 16-19 set: not of the 0x43c0xxxx form (384.0 up to just below 386.0) expected for in-range input, so saturate */
+ tmp = (0x43c0ffff - tmp)>>31; /* sign of the difference smeared over the word: -1 if tmp is above 0x43c0ffff, 0 otherwise (assumes arithmetic right shift) */
+ // is this faster on some gcc/cpu combinations?
+// if(tmp > 0x43c0ffff) tmp = 0xFFFF;
+// else tmp = 0;
+ }
+ return tmp - 0x8000; /* callers in this file store the result into int16_t, so only the low 16 bits matter: 0..0xffff is re-centred to -32768..32767 */
+}
+
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
int i;
- for(i=0; i<len; i++) {
- int_fast32_t tmp = ((const int32_t*)src)[i];
- if(tmp & 0xf0000){
- tmp = (0x43c0ffff - tmp)>>31;
- // is this faster on some gcc/cpu combinations?
-// if(tmp > 0x43c0ffff) tmp = 0xFFFF;
-// else tmp = 0;
+ for(i=0; i<len; i++) /* convert len samples, output order equals input order */
+ dst[i] = float_to_int16_one(src+i); /* the per-sample conversion now lives in float_to_int16_one(), which ff_float_to_int16_interleave_c() also calls */
+}
+
+void ff_float_to_int16_interleave_c(int16_t *dst, const float *src, long len, int channels){ /* convert `channels` consecutive runs of len floats in src into interleaved int16 samples in dst */
+ int i,j,c;
+ if(channels==2){ /* dedicated two-channel loop */
+ for(i=0; i<len; i++){
+ dst[2*i] = float_to_int16_one(src+i); /* channel 0, sample i */
+ dst[2*i+1] = float_to_int16_one(src+i+len); /* channel 1, sample i: its data starts len floats after channel 0 */
}
- dst[i] = tmp - 0x8000;
+ }else{
+ for(c=0; c<channels; c++, src+=len) /* src advances by one whole channel (len floats) per pass */
+ for(i=0, j=c; i<len; i++, j+=channels) /* j walks dst with a stride of `channels`, starting at slot c */
+ dst[j] = float_to_int16_one(src+i);
}
}
@@ -4450,7 +4473,9 @@ void dsputil_init(DSPContext* c, AVCodec
c->vector_fmul = vector_fmul_c;
c->vector_fmul_reverse = vector_fmul_reverse_c;
c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
+ c->vector_fmul_window = ff_vector_fmul_window_c;
c->float_to_int16 = ff_float_to_int16_c;
+ c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
c->add_int16 = add_int16_c;
c->sub_int16 = sub_int16_c;
c->scalarproduct_int16 = scalarproduct_int16_c;
Modified: trunk/libavcodec/dsputil.h
==============================================================================
--- trunk/libavcodec/dsputil.h (original)
+++ trunk/libavcodec/dsputil.h Sun Jul 13 16:56:01 2008
@@ -63,6 +63,8 @@ void ff_h264_lowres_idct_put_c(uint8_t *
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1,
const float *src2, int src3, int blocksize, int step);
+void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
+ const float *win, float add_bias, int len);
void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
/* encoding scans */
@@ -364,10 +366,13 @@ typedef struct DSPContext {
void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
/* assume len is a multiple of 8, and src arrays are 16-byte aligned */
void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step);
+ /* assume len is a multiple of 4, and arrays are 16-byte aligned */
+ void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
/* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767]
* simd versions: convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
void (*float_to_int16)(int16_t *dst, const float *src, long len);
+ void (*float_to_int16_interleave)(int16_t *dst, const float *src, long len, int channels);
/* (I)DCT */
void (*fdct)(DCTELEM *block/* align 16*/);
Modified: trunk/libavcodec/i386/dsputil_mmx.c
==============================================================================
--- trunk/libavcodec/i386/dsputil_mmx.c (original)
+++ trunk/libavcodec/i386/dsputil_mmx.c Sun Jul 13 16:56:01 2008
@@ -2022,6 +2022,39 @@ static void vector_fmul_add_add_sse(floa
ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}
+static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
+ const float *win, float add_bias, int len){ /* SSE counterpart of ff_vector_fmul_window_c(); only the add_bias == 0 case is done in asm */
+#ifdef HAVE_6REGS
+ if(add_bias == 0){
+ x86_reg i = -len*2; /* byte offset of the forward cursor relative to the array midpoints (len/2 floats = len*2 bytes) */
+ x86_reg j = len*2-16; /* byte offset of the backward cursor: the last group of 4 floats */
+ asm volatile(
+ "1: \n"
+ "movaps (%5,%0), %%xmm0 \n" /* xmm0 = win at the forward cursor */
+ "movaps (%5,%1), %%xmm1 \n" /* xmm1 = win at the backward cursor */
+ "movaps %%xmm0, %%xmm2 \n"
+ "movaps %%xmm1, %%xmm3 \n"
+ "shufps $0x1b, %%xmm2, %%xmm2 \n" /* 0x1b reverses the four floats: xmm2 = reversed forward window */
+ "shufps $0x1b, %%xmm3, %%xmm3 \n" /* xmm3 = reversed backward window */
+ "mulps (%4,%0), %%xmm0 \n" /* src1 * win, forward group */
+ "mulps (%4,%1), %%xmm1 \n" /* src1 * win, backward group */
+ "mulps (%3,%0), %%xmm3 \n" /* src0 (forward group) * mirrored window taken from the backward group */
+ "mulps (%3,%1), %%xmm2 \n" /* src0 (backward group) * mirrored window taken from the forward group */
+ "addps %%xmm3, %%xmm0 \n"
+ "addps %%xmm2, %%xmm1 \n"
+ "movaps %%xmm0, (%2,%0) \n" /* aligned store of the forward group of dst */
+ "movaps %%xmm1, (%2,%1) \n" /* aligned store of the backward group of dst */
+ "sub $16, %1 \n"
+ "add $16, %0 \n" /* the flags of this add drive the branch: loop while the forward offset is still negative */
+ "jl 1b \n"
+ :"+r"(i), "+r"(j)
+ :"r"(dst+len/2), "r"(src0+len/2), "r"(src1+len/2), "r"(win+len/2) /* all four bases point at the middle of their arrays; six register operands in total, hence HAVE_6REGS */
+ );
+ }else
+#endif
+ ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); /* used for a non-zero add_bias, and always when HAVE_6REGS is undefined */
+}
+
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
// not bit-exact: pf2id uses different rounding than C and SSE
asm volatile(
@@ -2083,6 +2116,87 @@ static void float_to_int16_sse2(int16_t
);
}
+#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
+/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
+static av_noinline void float_to_int16_interleave2_##cpu(int16_t *dst, const float *src, long len, int channels){ /* generic path: convert everything into a temporary, then interleave in plain C */\
+ DECLARE_ALIGNED_16(int16_t, tmp[len*channels]); /* variable-length stack buffer holding the converted samples of every channel */\
+ int i,j,c;\
+ float_to_int16_##cpu(tmp, src, len*channels); /* a single SIMD conversion pass over all len*channels floats */\
+ for(c=0; c<channels; c++){\
+ int16_t *ptmp = tmp+c*len; /* converted samples of channel c */\
+ for(i=0, j=c; i<len; i++, j+=channels) /* scatter them into dst with a stride of `channels` */\
+ dst[j] = ptmp[i];\
+ }\
+}\
+\
+static void float_to_int16_interleave_##cpu(int16_t *dst, const float *src, long len, int channels){ /* dispatch on the channel count; `body` is the asm loop used for the remaining (two-channel) case */\
+ if(channels==1)\
+ float_to_int16_##cpu(dst, src, len); /* mono: nothing to interleave */\
+ else if(channels>2)\
+ float_to_int16_interleave2_##cpu(dst, src, len, channels);\
+ else{\
+ float *src1;\
+ asm volatile(\
+ "shl $2, %0 \n" /* %0 = len*4: byte size of one channel of floats, and of the 2*len int16 outputs */\
+ "add %0, %1 \n" /* %1 = end of dst */\
+ "add %0, %2 \n" /* %2 = end of channel 0 */\
+ "lea (%2,%0), %3 \n" /* %3 = end of channel 1, which follows channel 0 in src */\
+ "neg %0 \n" /* %0 becomes a negative byte index that `body` counts up to zero */\
+ body\
+ :"+r"(len), "+r"(dst), "+r"(src), "=r"(src1)\
+ );\
+ }\
+}
+
+FLOAT_TO_INT16_INTERLEAVE(3dnow, /* two-channel loop: 4 samples per channel in, 8 interleaved int16 out per iteration */
+ "1: \n"
+ "pf2id (%2,%0), %%mm0 \n" /* channel 0, samples 0-1 as int32 */
+ "pf2id 8(%2,%0), %%mm1 \n" /* channel 0, samples 2-3 */
+ "pf2id (%3,%0), %%mm2 \n" /* channel 1, samples 0-1 */
+ "pf2id 8(%3,%0), %%mm3 \n" /* channel 1, samples 2-3 */
+ "packssdw %%mm1, %%mm0 \n" /* mm0 = channel 0, four saturated int16 */
+ "packssdw %%mm3, %%mm2 \n" /* mm2 = channel 1, four saturated int16 */
+ "movq %%mm0, %%mm1 \n" /* keep a copy: both unpacks need the unmodified channel 0 words */
+ "punpcklwd %%mm2, %%mm0 \n" /* mm0 = a0 b0 a1 b1 */
+ "punpckhwd %%mm2, %%mm1 \n" /* mm1 = a2 b2 a3 b3 */
+ "movq %%mm0, (%1,%0)\n"
+ "movq %%mm1, 8(%1,%0)\n" /* store the high half from mm1; storing mm0 again would duplicate samples 0-1 */
+ "add $16, %0 \n"
+ "js 1b \n" /* loop while the byte index is still negative */
+ "femms \n" /* leave MMX state before returning to code that may use the FPU */
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse, /* two-channel loop: 4 samples per channel in, 8 interleaved int16 out per iteration */
+ "1: \n"
+ "cvtps2pi (%2,%0), %%mm0 \n" /* channel 0, samples 0-1 as int32 */
+ "cvtps2pi 8(%2,%0), %%mm1 \n" /* channel 0, samples 2-3 */
+ "cvtps2pi (%3,%0), %%mm2 \n" /* channel 1, samples 0-1 */
+ "cvtps2pi 8(%3,%0), %%mm3 \n" /* channel 1, samples 2-3 */
+ "packssdw %%mm1, %%mm0 \n" /* mm0 = channel 0, four saturated int16 */
+ "packssdw %%mm3, %%mm2 \n" /* mm2 = channel 1, four saturated int16 */
+ "movq %%mm0, %%mm1 \n" /* keep a copy: both unpacks need the unmodified channel 0 words */
+ "punpcklwd %%mm2, %%mm0 \n" /* mm0 = a0 b0 a1 b1 */
+ "punpckhwd %%mm2, %%mm1 \n" /* mm1 = a2 b2 a3 b3 */
+ "movq %%mm0, (%1,%0)\n"
+ "movq %%mm1, 8(%1,%0)\n" /* store the high half from mm1; storing mm0 again would duplicate samples 0-1 */
+ "add $16, %0 \n"
+ "js 1b \n" /* loop while the byte index is still negative */
+ "emms \n" /* leave MMX state before returning to code that may use the FPU */
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse2, /* two-channel loop: 4 samples per channel in, 8 interleaved int16 out per iteration */
+ "1: \n"
+ "cvtps2dq (%2,%0), %%xmm0 \n" /* channel 0, four samples as int32 */
+ "cvtps2dq (%3,%0), %%xmm1 \n" /* channel 1, four samples as int32 */
+ "packssdw %%xmm1, %%xmm0 \n" /* xmm0 = a0 a1 a2 a3 b0 b1 b2 b3 (saturated int16) */
+ "movhlps %%xmm0, %%xmm1 \n" /* low half of xmm1 = b0 b1 b2 b3 */
+ "punpcklwd %%xmm1, %%xmm0 \n" /* xmm0 = a0 b0 a1 b1 a2 b2 a3 b3 */
+ "movdqa %%xmm0, (%1,%0) \n" /* aligned 16-byte store into dst */
+ "add $16, %0 \n"
+ "js 1b \n" /* loop while the byte index is still negative */
+)
+
+
extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
@@ -2519,8 +2633,10 @@ void dsputil_init_mmx(DSPContext* c, AVC
if(mm_flags & MM_3DNOW){
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
c->vector_fmul = vector_fmul_3dnow;
- if(!(avctx->flags & CODEC_FLAG_BITEXACT))
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->float_to_int16 = float_to_int16_3dnow;
+ c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
+ }
}
if(mm_flags & MM_3DNOWEXT)
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
@@ -2528,11 +2644,14 @@ void dsputil_init_mmx(DSPContext* c, AVC
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
c->vector_fmul = vector_fmul_sse;
c->float_to_int16 = float_to_int16_sse;
+ c->float_to_int16_interleave = float_to_int16_interleave_sse;
c->vector_fmul_reverse = vector_fmul_reverse_sse;
c->vector_fmul_add_add = vector_fmul_add_add_sse;
+ c->vector_fmul_window = vector_fmul_window_sse;
}
if(mm_flags & MM_SSE2){
c->float_to_int16 = float_to_int16_sse2;
+ c->float_to_int16_interleave = float_to_int16_interleave_sse2;
}
if(mm_flags & MM_3DNOW)
c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
Modified: trunk/libavcodec/vorbis_dec.c
==============================================================================
--- trunk/libavcodec/vorbis_dec.c (original)
+++ trunk/libavcodec/vorbis_dec.c Sun Jul 13 16:56:01 2008
@@ -149,10 +149,10 @@ typedef struct vorbis_context_s {
uint_fast8_t mode_count;
vorbis_mode *modes;
uint_fast8_t mode_number; // mode number for the current packet
+ uint_fast8_t previous_window;
float *channel_residues;
float *channel_floors;
float *saved;
- uint_fast16_t saved_start;
float *ret;
float *buf;
float *buf_tmp;
@@ -903,7 +903,7 @@ static int vorbis_parse_id_hdr(vorbis_co
vc->ret = av_malloc((vc->blocksize[1]/2)*vc->audio_channels * sizeof(float));
vc->buf = av_malloc( vc->blocksize[1] * sizeof(float));
vc->buf_tmp = av_malloc( vc->blocksize[1] * sizeof(float));
- vc->saved_start=0;
+ vc->previous_window=0;
ff_mdct_init(&vc->mdct[0], bl0, 1);
ff_mdct_init(&vc->mdct[1], bl1, 1);
@@ -1394,13 +1394,26 @@ void vorbis_inverse_coupling(float *mag,
}
}
+static void copy_normalize(float *dst, float *src, int len, int exp_bias, float add_bias) /* copy len floats from src to dst, applying either exp_bias or add_bias, never both */
+{
+ int i;
+ if(exp_bias) { /* a non-zero exp_bias takes priority; add_bias is ignored on this path */
+ for(i=0; i<len; i++)
+ ((uint32_t*)dst)[i] = ((uint32_t*)src)[i] + exp_bias; // dst[i]=src[i]*(1<<bias); integer add on the raw float bits, presumably exp_bias is pre-shifted into the exponent field (confirm where vc->exp_bias is set)
+ } else {
+ for(i=0; i<len; i++)
+ dst[i] = src[i] + add_bias; /* ordinary float offset */
+ }
+}
+
// Decode the audio packet using the functions above
static int vorbis_parse_audio_packet(vorbis_context *vc) {
GetBitContext *gb=&vc->gb;
- uint_fast8_t previous_window=0,next_window=0;
+ uint_fast8_t previous_window=vc->previous_window;
uint_fast8_t mode_number;
+ uint_fast8_t blockflag;
uint_fast16_t blocksize;
int_fast32_t i,j;
uint_fast8_t no_residue[vc->audio_channels];
@@ -1411,7 +1424,6 @@ static int vorbis_parse_audio_packet(vor
uint_fast8_t res_chan[vc->audio_channels];
uint_fast8_t res_num=0;
int_fast16_t retlen=0;
- uint_fast16_t saved_start=0;
float fadd_bias = vc->add_bias;
if (get_bits1(gb)) {
@@ -1429,12 +1441,12 @@ static int vorbis_parse_audio_packet(vor
AV_DEBUG(" Mode number: %d , mapping: %d , blocktype %d \n", mode_number, vc->modes[mode_number].mapping, vc->modes[mode_number].blockflag);
- if (vc->modes[mode_number].blockflag) {
- previous_window=get_bits1(gb);
- next_window=get_bits1(gb);
+ blockflag=vc->modes[mode_number].blockflag;
+ blocksize=vc->blocksize[blockflag];
+ if (blockflag) {
+ skip_bits(gb, 2); // previous_window, next_window
}
- blocksize=vc->blocksize[vc->modes[mode_number].blockflag];
memset(ch_res_ptr, 0, sizeof(float)*vc->audio_channels*blocksize/2); //FIXME can this be removed ?
memset(ch_floor_ptr, 0, sizeof(float)*vc->audio_channels*blocksize/2); //FIXME can this be removed ?
@@ -1504,76 +1516,31 @@ static int vorbis_parse_audio_packet(vor
// MDCT, overlap/add, save data for next overlapping FPMATH
+ retlen = (blocksize + vc->blocksize[previous_window])/4;
for(j=0;j<vc->audio_channels;++j) {
- uint_fast8_t step=vc->audio_channels;
- uint_fast16_t k;
- float *saved=vc->saved+j*vc->blocksize[1]/2;
- float *ret=vc->ret;
- const float *lwin=vc->win[1];
- const float *swin=vc->win[0];
+ uint_fast16_t bs0=vc->blocksize[0];
+ uint_fast16_t bs1=vc->blocksize[1];
+ float *saved=vc->saved+j*bs1/2;
+ float *ret=vc->ret+j*retlen;
float *buf=vc->buf;
- float *buf_tmp=vc->buf_tmp;
-
- ch_floor_ptr=vc->channel_floors+j*blocksize/2;
-
- saved_start=vc->saved_start;
+ const float *win=vc->win[blockflag&previous_window];
- vc->mdct[0].fft.imdct_calc(&vc->mdct[vc->modes[mode_number].blockflag], buf, ch_floor_ptr, buf_tmp);
+ vc->mdct[0].fft.imdct_calc(&vc->mdct[blockflag], buf, vc->channel_floors+j*blocksize/2, vc->buf_tmp);
- //FIXME process channels together, to allow faster simd vector_fmul_add_add?
- if (vc->modes[mode_number].blockflag) {
- // -- overlap/add
- if (previous_window) {
- vc->dsp.vector_fmul_add_add(ret+j, buf, lwin, saved, vc->add_bias, vc->blocksize[1]/2, step);
- retlen=vc->blocksize[1]/2;
- } else {
- int len = (vc->blocksize[1]-vc->blocksize[0])/4;
- buf += len;
- vc->dsp.vector_fmul_add_add(ret+j, buf, swin, saved, vc->add_bias, vc->blocksize[0]/2, step);
- k = vc->blocksize[0]/2*step + j;
- buf += vc->blocksize[0]/2;
- if(vc->exp_bias){
- for(i=0; i<len; i++, k+=step)
- ((uint32_t*)ret)[k] = ((uint32_t*)buf)[i] + vc->exp_bias; // ret[k]=buf[i]*(1<<bias)
- } else {
- for(i=0; i<len; i++, k+=step)
- ret[k] = buf[i] + fadd_bias;
- }
- buf=vc->buf;
- retlen=vc->blocksize[0]/2+len;
- }
- // -- save
- if (next_window) {
- buf += vc->blocksize[1]/2;
- vc->dsp.vector_fmul_reverse(saved, buf, lwin, vc->blocksize[1]/2);
- saved_start=0;
- } else {
- saved_start=(vc->blocksize[1]-vc->blocksize[0])/4;
- buf += vc->blocksize[1]/2;
- for(i=0; i<saved_start; i++)
- ((uint32_t*)saved)[i] = ((uint32_t*)buf)[i] + vc->exp_bias;
- vc->dsp.vector_fmul_reverse(saved+saved_start, buf+saved_start, swin, vc->blocksize[0]/2);
- }
+ if(blockflag == previous_window) {
+ vc->dsp.vector_fmul_window(ret, saved, buf, win, fadd_bias, blocksize/2);
+ } else if(blockflag > previous_window) {
+ vc->dsp.vector_fmul_window(ret, saved, buf+(bs1-bs0)/4, win, fadd_bias, bs0/2);
+ copy_normalize(ret+bs0/2, buf+(bs1+bs0)/4, (bs1-bs0)/4, vc->exp_bias, fadd_bias);
} else {
- // --overlap/add
- if(vc->add_bias) {
- for(k=j, i=0;i<saved_start;++i, k+=step)
- ret[k] = saved[i] + fadd_bias;
- } else {
- for(k=j, i=0;i<saved_start;++i, k+=step)
- ret[k] = saved[i];
- }
- vc->dsp.vector_fmul_add_add(ret+k, buf, swin, saved+saved_start, vc->add_bias, vc->blocksize[0]/2, step);
- retlen=saved_start+vc->blocksize[0]/2;
- // -- save
- buf += vc->blocksize[0]/2;
- vc->dsp.vector_fmul_reverse(saved, buf, swin, vc->blocksize[0]/2);
- saved_start=0;
+ copy_normalize(ret, saved, (bs1-bs0)/4, vc->exp_bias, fadd_bias);
+ vc->dsp.vector_fmul_window(ret+(bs1-bs0)/4, saved+(bs1-bs0)/4, buf, win, fadd_bias, bs0/2);
}
+ memcpy(saved, buf+blocksize/2, blocksize/2*sizeof(float));
}
- vc->saved_start=saved_start;
- return retlen*vc->audio_channels;
+ vc->previous_window = blockflag;
+ return retlen;
}
// Return the decoded audio packet through the standard api
@@ -1610,8 +1577,8 @@ static int vorbis_decode_frame(AVCodecCo
AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len);
- vc->dsp.float_to_int16(data, vc->ret, len);
- *data_size=len*2;
+ vc->dsp.float_to_int16_interleave(data, vc->ret, len, vc->audio_channels);
+ *data_size=len*2*vc->audio_channels;
return buf_size ;
}
Modified: trunk/libavutil/x86_cpu.h
==============================================================================
--- trunk/libavutil/x86_cpu.h (original)
+++ trunk/libavutil/x86_cpu.h Sun Jul 13 16:56:01 2008
@@ -68,6 +68,10 @@ typedef int32_t x86_reg;
# define HAVE_7REGS 1
#endif
+#if defined(ARCH_X86_64) || (defined(ARCH_X86_32) && (defined(HAVE_EBX_AVAILABLE) || defined(HAVE_EBP_AVAILABLE)))
+# define HAVE_6REGS 1 /* set on x86-64, or on x86-32 when ebx or ebp is available; guards inline asm that needs six register operands, such as vector_fmul_window_sse() */
+#endif
+
#if defined(ARCH_X86_64) && defined(PIC)
# define BROKEN_RELOCATIONS 1
#endif
More information about the ffmpeg-cvslog
mailing list