FFmpeg
mpegaudiodsp.c
Go to the documentation of this file.
1 /*
2  * SIMD-optimized MP3 decoding functions
3  * Copyright (c) 2010 Vitor Sessak
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"
28 
/*
 * DECL(CPU) forward-declares two routines for one instruction-set suffix:
 *   - imdct36_blocks_<CPU>():   file-local multi-block driver; its body is
 *                               generated by DECL_IMDCT_BLOCKS in this file
 *   - ff_imdct36_float_<CPU>(): single-block routine with external linkage,
 *                               declared but not defined in this file
 */
#define DECL(CPU)                                                             \
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in,         \
                                   int count, int switch_point,               \
                                   int block_type);                           \
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);

#if HAVE_X86ASM
#if ARCH_X86_32
DECL(sse)       /* the plain SSE variant is only declared on 32-bit x86 */
#endif
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)
#endif /* HAVE_X86ASM */
42 
43 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
44  float *tmpbuf);
45 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
46  float *tmpbuf);
47 
48 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
49 
50 #if HAVE_6REGS && HAVE_SSE_INLINE
51 
/*
 * Multiply-accumulate and multiply-subtract:
 *   MACS: rt += ra * rb
 *   MLSS: rt -= ra * rb
 * Every argument and the whole expansion are parenthesized so the macros
 * behave like ordinary expressions regardless of the surrounding context.
 */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op(sum, w[k * 64], p[k * 64]) for k = 0..7, i.e. fold eight
 * products taken at a stride of 64 elements into sum.
 *
 * op is expected to be MACS or MLSS. sum, w and p are each expanded eight
 * times, so they must be free of side effects.
 *
 * The body is wrapped in do { } while (0) so that "SUM8(...);" is exactly
 * one statement, including inside an unbraced if/else.
 */
#define SUM8(op, sum, w, p)                     \
do {                                            \
    op(sum, (w)[0 * 64], (p)[0 * 64]);          \
    op(sum, (w)[1 * 64], (p)[1 * 64]);          \
    op(sum, (w)[2 * 64], (p)[2 * 64]);          \
    op(sum, (w)[3 * 64], (p)[3 * 64]);          \
    op(sum, (w)[4 * 64], (p)[4 * 64]);          \
    op(sum, (w)[5 * 64], (p)[5 * 64]);          \
    op(sum, (w)[6 * 64], (p)[6 * 64]);          \
    op(sum, (w)[7 * 64], (p)[7 * 64]);          \
} while (0)
66 
67 static void apply_window(const float *buf, const float *win1,
68  const float *win2, float *sum1, float *sum2, int len)
69 {
70  x86_reg count = - 4*len;
71  const float *win1a = win1+len;
72  const float *win2a = win2+len;
73  const float *bufa = buf+len;
74  float *sum1a = sum1+len;
75  float *sum2a = sum2+len;
76 
77 
78 #define MULT(a, b) \
79  "movaps " #a "(%1,%0), %%xmm1 \n\t" \
80  "movaps " #a "(%3,%0), %%xmm2 \n\t" \
81  "mulps %%xmm2, %%xmm1 \n\t" \
82  "subps %%xmm1, %%xmm0 \n\t" \
83  "mulps " #b "(%2,%0), %%xmm2 \n\t" \
84  "subps %%xmm2, %%xmm4 \n\t" \
85 
86  __asm__ volatile(
87  "1: \n\t"
88  "xorps %%xmm0, %%xmm0 \n\t"
89  "xorps %%xmm4, %%xmm4 \n\t"
90 
91  MULT( 0, 0)
92  MULT( 256, 64)
93  MULT( 512, 128)
94  MULT( 768, 192)
95  MULT(1024, 256)
96  MULT(1280, 320)
97  MULT(1536, 384)
98  MULT(1792, 448)
99 
100  "movaps %%xmm0, (%4,%0) \n\t"
101  "movaps %%xmm4, (%5,%0) \n\t"
102  "add $16, %0 \n\t"
103  "jl 1b \n\t"
104  :"+&r"(count)
105  :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
106  );
107 
108 #undef MULT
109 }
110 
111 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
112  ptrdiff_t incr)
113 {
114  LOCAL_ALIGNED_16(float, suma, [17]);
115  LOCAL_ALIGNED_16(float, sumb, [17]);
116  LOCAL_ALIGNED_16(float, sumc, [17]);
117  LOCAL_ALIGNED_16(float, sumd, [17]);
118 
119  float sum;
120 
121  /* copy to avoid wrap */
122  __asm__ volatile(
123  "movaps 0(%0), %%xmm0 \n\t" \
124  "movaps 16(%0), %%xmm1 \n\t" \
125  "movaps 32(%0), %%xmm2 \n\t" \
126  "movaps 48(%0), %%xmm3 \n\t" \
127  "movaps %%xmm0, 0(%1) \n\t" \
128  "movaps %%xmm1, 16(%1) \n\t" \
129  "movaps %%xmm2, 32(%1) \n\t" \
130  "movaps %%xmm3, 48(%1) \n\t" \
131  "movaps 64(%0), %%xmm0 \n\t" \
132  "movaps 80(%0), %%xmm1 \n\t" \
133  "movaps 96(%0), %%xmm2 \n\t" \
134  "movaps 112(%0), %%xmm3 \n\t" \
135  "movaps %%xmm0, 64(%1) \n\t" \
136  "movaps %%xmm1, 80(%1) \n\t" \
137  "movaps %%xmm2, 96(%1) \n\t" \
138  "movaps %%xmm3, 112(%1) \n\t"
139  ::"r"(in), "r"(in+512)
140  :"memory"
141  );
142 
143  apply_window(in + 16, win , win + 512, suma, sumc, 16);
144  apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
145 
146  SUM8(MACS, suma[0], win + 32, in + 48);
147 
148  sumc[ 0] = 0;
149  sumb[16] = 0;
150  sumd[16] = 0;
151 
152 #define SUMS(suma, sumb, sumc, sumd, out1, out2) \
153  "movups " #sumd "(%4), %%xmm0 \n\t" \
154  "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
155  "subps " #suma "(%1), %%xmm0 \n\t" \
156  "movaps %%xmm0," #out1 "(%0) \n\t" \
157 \
158  "movups " #sumc "(%3), %%xmm0 \n\t" \
159  "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
160  "addps " #sumb "(%2), %%xmm0 \n\t" \
161  "movaps %%xmm0," #out2 "(%0) \n\t"
162 
163  if (incr == 1) {
164  __asm__ volatile(
165  SUMS( 0, 48, 4, 52, 0, 112)
166  SUMS(16, 32, 20, 36, 16, 96)
167  SUMS(32, 16, 36, 20, 32, 80)
168  SUMS(48, 0, 52, 4, 48, 64)
169 
170  :"+&r"(out)
171  :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
172  :"memory"
173  );
174  out += 16*incr;
175  } else {
176  int j;
177  float *out2 = out + 32 * incr;
178  out[0 ] = -suma[ 0];
179  out += incr;
180  out2 -= incr;
181  for(j=1;j<16;j++) {
182  *out = -suma[ j] + sumd[16-j];
183  *out2 = sumb[16-j] + sumc[ j];
184  out += incr;
185  out2 -= incr;
186  }
187  }
188 
189  sum = 0;
190  SUM8(MLSS, sum, win + 16 + 32, in + 32);
191  *out = sum;
192 }
193 
194 #endif /* HAVE_6REGS && HAVE_SSE_INLINE */
195 
#if HAVE_X86ASM
/*
 * Generate imdct36_blocks_<CPU1>(): process count blocks.
 *
 * Complete groups of four blocks are passed to ff_four_imdct36_float_<CPU2>()
 * with a window taken from mdct_win_sse; the remaining (at most three)
 * blocks are passed one by one to ff_imdct36_float_<CPU1>() with a window
 * taken from ff_mdct_win_float.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                         \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,        \
                                    int count, int switch_point,              \
                                    int block_type)                           \
{                                                                             \
    const int quad_end = count & ~3;                                          \
    int blk = 0;                                                              \
                                                                              \
    /* complete groups of four blocks: one 4-wide call per group */           \
    while (blk < quad_end) {                                                  \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                              \
        /* table 1 is used only for the first group when switch_point is set */ \
        const int first_group = switch_point && blk < 4;                      \
        float *win = mdct_win_sse[first_group][block_type];                   \
                                                                              \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);            \
                                                                              \
        in  += 4 * 18;                                                        \
        buf += 4 * 18;                                                        \
        out += 4;                                                             \
        blk += 4;                                                             \
    }                                                                         \
                                                                              \
    /* leftover blocks: one single-block call each */                         \
    while (blk < count) {                                                     \
        int win_idx = block_type;                                             \
        float *win;                                                           \
                                                                              \
        /* with switch_point set, blocks 0 and 1 use window 0 */              \
        if (switch_point && blk < 2)                                          \
            win_idx = 0;                                                      \
        /* odd-numbered blocks use the second half of the window table */     \
        if (blk & 1)                                                          \
            win_idx += 4;                                                     \
        win = ff_mdct_win_float[win_idx];                                     \
                                                                              \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                         \
                                                                              \
        in += 18;                                                             \
        buf++;                                                                \
        out++;                                                                \
        blk++;                                                                \
    }                                                                         \
}

#if HAVE_SSE
#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
#endif
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_X86ASM */
241 
243 {
245 
246  int i, j;
247  for (j = 0; j < 4; j++) {
248  for (i = 0; i < 40; i ++) {
249  mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
250  mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
251  mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
252  mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
253  mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
254  mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
255  mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
256  mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
257  }
258  }
259 
260 #if HAVE_6REGS && HAVE_SSE_INLINE
261  if (INLINE_SSE(cpu_flags)) {
262  s->apply_window_float = apply_window_mp3;
263  }
264 #endif /* HAVE_SSE_INLINE */
265 
266 #if HAVE_X86ASM
267 #if HAVE_SSE
268 #if ARCH_X86_32
269  if (EXTERNAL_SSE(cpu_flags)) {
270  s->imdct36_blocks_float = imdct36_blocks_sse;
271  }
272 #endif
273  if (EXTERNAL_SSE2(cpu_flags)) {
274  s->imdct36_blocks_float = imdct36_blocks_sse2;
275  }
276  if (EXTERNAL_SSE3(cpu_flags)) {
277  s->imdct36_blocks_float = imdct36_blocks_sse3;
278  }
279  if (EXTERNAL_SSSE3(cpu_flags)) {
280  s->imdct36_blocks_float = imdct36_blocks_ssse3;
281  }
282 #endif
283 #if HAVE_AVX_EXTERNAL
284  if (EXTERNAL_AVX(cpu_flags)) {
285  s->imdct36_blocks_float = imdct36_blocks_avx;
286  }
287 #endif
288 #endif /* HAVE_X86ASM */
289 }
cpu.h
r
const char * r
Definition: vf_curves.c:114
ff_four_imdct36_float_avx
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, float *tmpbuf)
out
FILE * out
Definition: movenc.c:54
av_unused
#define av_unused
Definition: attributes.h:131
MPADSPContext
Definition: mpegaudiodsp.h:27
ff_mpadsp_init_x86
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
Definition: mpegaudiodsp.c:242
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:93
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:50
win
static float win(SuperEqualizerContext *s, float n, int N)
Definition: af_superequalizer.c:119
MACS
#define MACS(rt, ra, rb)
Definition: mpegaudiodsp_template.c:63
SUM8
#define SUM8(op, sum, w, p)
Definition: mpegaudiodsp_template.c:80
av_cold
#define av_cold
Definition: attributes.h:90
s
#define s(width, name)
Definition: cbs_vp9.c:257
EXTERNAL_SSE
#define EXTERNAL_SSE(flags)
Definition: cpu.h:58
sse
static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, int stride)
Definition: mpegvideo_enc.c:2704
INLINE_SSE
#define INLINE_SSE(flags)
Definition: cpu.h:88
EXTERNAL_SSE3
#define EXTERNAL_SSE3(flags)
Definition: cpu.h:62
ff_four_imdct36_float_sse
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, float *tmpbuf)
cpu.h
MLSS
#define MLSS(rt, ra, rb)
Definition: mpegaudiodsp_template.c:64
asm.h
MULT
#define MULT(c, x, n)
Definition: xvididct.c:145
attributes.h
EXTERNAL_SSE2
#define EXTERNAL_SSE2(flags)
Definition: cpu.h:59
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:112
in
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31)))) #define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac) { } void ff_audio_convert_free(AudioConvert **ac) { if(! 
*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);} AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, int apply_map) { AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method !=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2) { ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc) { av_free(ac);return NULL;} return ac;} in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar) { ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar ? ac->channels :1;} else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;} int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in) { int use_generic=1;int len=in->nb_samples;int p;if(ac->dc) { av_log(ac->avr, AV_LOG_TRACE, "%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> in
Definition: audio_convert.c:326
DECL
#define DECL(CPU)
Definition: mpegaudiodsp.c:29
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
internal.h
apply_window
static void(*const apply_window[4])(AVFloatDSPContext *fdsp, SingleChannelElement *sce, const float *audio)
Definition: aacenc.c:193
else
else
Definition: snow.txt:125
len
int len
Definition: vorbis_enc_data.h:452
EXTERNAL_AVX
#define EXTERNAL_AVX(flags)
Definition: cpu.h:70
mdct_win_sse
static float mdct_win_sse[2][4][4 *40]
Definition: mpegaudiodsp.c:48
mpegaudiodsp.h
LOCAL_ALIGNED_16
#define LOCAL_ALIGNED_16(t, v,...)
Definition: internal.h:131
ff_mdct_win_float
float ff_mdct_win_float[8][MDCT_BUF_SIZE]
x86_reg
int x86_reg
Definition: asm.h:72
EXTERNAL_SSSE3
#define EXTERNAL_SSSE3(flags)
Definition: cpu.h:65