FFmpeg
mpegaudiodsp.c
Go to the documentation of this file.
1 /*
2  * SIMD-optimized MP3 decoding functions
3  * Copyright (c) 2010 Vitor Sessak
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <stddef.h>
23 
24 #include "config.h"
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/mem_internal.h"
28 #include "libavutil/x86/asm.h"
29 #include "libavutil/x86/cpu.h"
31 
/*
 * For one instruction-set suffix, forward-declare the static wrapper
 * imdct36_blocks_<suffix>() (defined further down through
 * DECL_IMDCT_BLOCKS) and the prototype of ff_imdct36_float_<suffix>(),
 * which is not defined in this file.
 */
#define DECL(SUFFIX)                                                       \
    static void imdct36_blocks_ ## SUFFIX(float *out, float *buf,          \
                                          float *in, int count,            \
                                          int switch_point,                \
                                          int block_type);                 \
    void ff_imdct36_float_ ## SUFFIX(float *out, float *buf, float *in,    \
                                     float *win);

#if HAVE_X86ASM
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)
#endif /* HAVE_X86ASM */
42 
43 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
44  float *tmpbuf);
45 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
46  float *tmpbuf);
47 
48 void ff_dct32_float_sse2(float *out, const float *in);
49 void ff_dct32_float_avx (float *out, const float *in);
50 
51 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
52 
53 #if HAVE_6REGS && HAVE_SSE_INLINE
54 
/* Multiply-accumulate: rt += ra * rb. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
/* Multiply-subtract: rt -= ra * rb. */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op(sum, w[k * 64], p[k * 64]) for k = 0..7, i.e. combine eight
 * taps that are 64 elements apart in both w and p.
 *
 * op is MACS or MLSS; sum must be an lvalue. The expansion is wrapped in
 * do { } while (0) so that "SUM8(...);" is a single statement and can be
 * used safely as the body of an unbraced if/else.
 */
#define SUM8(op, sum, w, p)                     \
    do {                                        \
        op(sum, (w)[0 * 64], (p)[0 * 64]);      \
        op(sum, (w)[1 * 64], (p)[1 * 64]);      \
        op(sum, (w)[2 * 64], (p)[2 * 64]);      \
        op(sum, (w)[3 * 64], (p)[3 * 64]);      \
        op(sum, (w)[4 * 64], (p)[4 * 64]);      \
        op(sum, (w)[5 * 64], (p)[5 * 64]);      \
        op(sum, (w)[6 * 64], (p)[6 * 64]);      \
        op(sum, (w)[7 * 64], (p)[7 * 64]);      \
    } while (0)
69 
70 static void apply_window(const float *buf, const float *win1,
71  const float *win2, float *sum1, float *sum2, int len)
72 {
73  x86_reg count = - 4*len;
74  const float *win1a = win1+len;
75  const float *win2a = win2+len;
76  const float *bufa = buf+len;
77  float *sum1a = sum1+len;
78  float *sum2a = sum2+len;
79 
80 
81 #define MULT(a, b) \
82  "movaps " #a "(%1,%0), %%xmm1 \n\t" \
83  "movaps " #a "(%3,%0), %%xmm2 \n\t" \
84  "mulps %%xmm2, %%xmm1 \n\t" \
85  "subps %%xmm1, %%xmm0 \n\t" \
86  "mulps " #b "(%2,%0), %%xmm2 \n\t" \
87  "subps %%xmm2, %%xmm4 \n\t" \
88 
89  __asm__ volatile(
90  "1: \n\t"
91  "xorps %%xmm0, %%xmm0 \n\t"
92  "xorps %%xmm4, %%xmm4 \n\t"
93 
94  MULT( 0, 0)
95  MULT( 256, 64)
96  MULT( 512, 128)
97  MULT( 768, 192)
98  MULT(1024, 256)
99  MULT(1280, 320)
100  MULT(1536, 384)
101  MULT(1792, 448)
102 
103  "movaps %%xmm0, (%4,%0) \n\t"
104  "movaps %%xmm4, (%5,%0) \n\t"
105  "add $16, %0 \n\t"
106  "jl 1b \n\t"
107  :"+&r"(count)
108  :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
109  );
110 
111 #undef MULT
112 }
113 
114 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
115  ptrdiff_t incr)
116 {
117  LOCAL_ALIGNED_16(float, suma, [17]);
118  LOCAL_ALIGNED_16(float, sumb, [17]);
119  LOCAL_ALIGNED_16(float, sumc, [17]);
120  LOCAL_ALIGNED_16(float, sumd, [17]);
121 
122  float sum;
123 
124  /* copy to avoid wrap */
125  __asm__ volatile(
126  "movaps 0(%0), %%xmm0 \n\t" \
127  "movaps 16(%0), %%xmm1 \n\t" \
128  "movaps 32(%0), %%xmm2 \n\t" \
129  "movaps 48(%0), %%xmm3 \n\t" \
130  "movaps %%xmm0, 0(%1) \n\t" \
131  "movaps %%xmm1, 16(%1) \n\t" \
132  "movaps %%xmm2, 32(%1) \n\t" \
133  "movaps %%xmm3, 48(%1) \n\t" \
134  "movaps 64(%0), %%xmm0 \n\t" \
135  "movaps 80(%0), %%xmm1 \n\t" \
136  "movaps 96(%0), %%xmm2 \n\t" \
137  "movaps 112(%0), %%xmm3 \n\t" \
138  "movaps %%xmm0, 64(%1) \n\t" \
139  "movaps %%xmm1, 80(%1) \n\t" \
140  "movaps %%xmm2, 96(%1) \n\t" \
141  "movaps %%xmm3, 112(%1) \n\t"
142  ::"r"(in), "r"(in+512)
143  :"memory"
144  );
145 
146  apply_window(in + 16, win , win + 512, suma, sumc, 16);
147  apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
148 
149  SUM8(MACS, suma[0], win + 32, in + 48);
150 
151  sumc[ 0] = 0;
152  sumb[16] = 0;
153  sumd[16] = 0;
154 
155 #define SUMS(suma, sumb, sumc, sumd, out1, out2) \
156  "movups " #sumd "(%4), %%xmm0 \n\t" \
157  "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
158  "subps " #suma "(%1), %%xmm0 \n\t" \
159  "movaps %%xmm0," #out1 "(%0) \n\t" \
160 \
161  "movups " #sumc "(%3), %%xmm0 \n\t" \
162  "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
163  "addps " #sumb "(%2), %%xmm0 \n\t" \
164  "movaps %%xmm0," #out2 "(%0) \n\t"
165 
166  if (incr == 1) {
167  __asm__ volatile(
168  SUMS( 0, 48, 4, 52, 0, 112)
169  SUMS(16, 32, 20, 36, 16, 96)
170  SUMS(32, 16, 36, 20, 32, 80)
171  SUMS(48, 0, 52, 4, 48, 64)
172 
173  :"+&r"(out)
174  :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
175  :"memory"
176  );
177  out += 16*incr;
178  } else {
179  int j;
180  float *out2 = out + 32 * incr;
181  out[0 ] = -suma[ 0];
182  out += incr;
183  out2 -= incr;
184  for(j=1;j<16;j++) {
185  *out = -suma[ j] + sumd[16-j];
186  *out2 = sumb[16-j] + sumc[ j];
187  out += incr;
188  out2 -= incr;
189  }
190  }
191 
192  sum = 0;
193  SUM8(MLSS, sum, win + 16 + 32, in + 32);
194  *out = sum;
195 }
196 
197 #endif /* HAVE_6REGS && HAVE_SSE_INLINE */
198 
#if HAVE_X86ASM
/*
 * Define imdct36_blocks_<CPU1>(), which runs the IMDCT36 over count blocks.
 *
 * Blocks are handled four at a time by ff_four_imdct36_float_<CPU2>() while
 * at least four remain; the remaining 0-3 blocks go one by one through
 * ff_imdct36_float_<CPU1>().
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                                    int count, int switch_point,            \
                                    int block_type)                         \
{                                                                           \
    /* count rounded down to a multiple of four */                          \
    int align_end = count - (count & 3);                                    \
    int j = 0;                                                              \
                                                                            \
    while (j < align_end) {                                                 \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
        /* select window: mixed table only for the first group */           \
        int mixed = switch_point && j < 4;                                  \
        float *win = mdct_win_sse[mixed][block_type];                       \
                                                                            \
        /* apply window & overlap with previous buffer */                   \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
                                                                            \
        in  += 4*18;                                                        \
        buf += 4*18;                                                        \
        out += 4;                                                           \
        j   += 4;                                                           \
    }                                                                       \
                                                                            \
    while (j < count) {                                                     \
        /* select window: odd blocks use the tables at index + 4 */         \
        int odd_offset = (j & 1) ? 4 : 0;                                   \
        int win_idx;                                                        \
        float *win;                                                         \
                                                                            \
        if (switch_point && j < 2)                                          \
            win_idx = 0;                                                    \
        else                                                                \
            win_idx = block_type;                                           \
        win = ff_mdct_win_float[win_idx + odd_offset];                      \
                                                                            \
        /* apply window & overlap with previous buffer */                   \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
                                                                            \
        in += 18;                                                           \
        buf++;                                                              \
        out++;                                                              \
        j++;                                                                \
    }                                                                       \
}

#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_X86ASM */
241 
243 {
244  int i, j;
245  for (j = 0; j < 4; j++) {
246  for (i = 0; i < 40; i ++) {
247  mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
248  mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
249  mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
250  mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
251  mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
252  mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
253  mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
254  mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
255  }
256  }
257 }
258 
260 {
262 
263 #if HAVE_6REGS && HAVE_SSE_INLINE
264  if (INLINE_SSE(cpu_flags)) {
265  s->apply_window_float = apply_window_mp3;
266  }
267 #endif /* HAVE_SSE_INLINE */
268 
269 #if HAVE_X86ASM
270 #if HAVE_SSE
271  if (EXTERNAL_SSE2(cpu_flags)) {
272  s->imdct36_blocks_float = imdct36_blocks_sse2;
273  s->dct32_float = ff_dct32_float_sse2;
274  }
275  if (EXTERNAL_SSE3(cpu_flags)) {
276  s->imdct36_blocks_float = imdct36_blocks_sse3;
277  }
278  if (EXTERNAL_SSSE3(cpu_flags)) {
279  s->imdct36_blocks_float = imdct36_blocks_ssse3;
280  }
281 #endif
282 #if HAVE_AVX_EXTERNAL
283  if (EXTERNAL_AVX(cpu_flags)) {
284  s->imdct36_blocks_float = imdct36_blocks_avx;
285  }
287  s->dct32_float = ff_dct32_float_avx;
288 #endif
289 #endif /* HAVE_X86ASM */
290 }
cpu.h
r
const char * r
Definition: vf_curves.c:127
ff_four_imdct36_float_avx
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, float *tmpbuf)
mem_internal.h
out
FILE * out
Definition: movenc.c:55
x86_reg
int x86_reg
Definition: asm.h:72
av_unused
#define av_unused
Definition: attributes.h:131
MPADSPContext
Definition: mpegaudiodsp.h:28
ff_mpadsp_init_x86
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
Definition: mpegaudiodsp.c:259
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
win
static float win(SuperEqualizerContext *s, float n, int N)
Definition: af_superequalizer.c:119
ff_dct32_float_sse2
void ff_dct32_float_sse2(float *out, const float *in)
MACS
#define MACS(rt, ra, rb)
Definition: mpegaudiodsp_template.c:65
SUM8
#define SUM8(op, sum, w, p)
Definition: mpegaudiodsp_template.c:82
av_cold
#define av_cold
Definition: attributes.h:90
s
#define s(width, name)
Definition: cbs_vp9.c:198
INLINE_SSE
#define INLINE_SSE(flags)
Definition: cpu.h:89
LOCAL_ALIGNED_16
#define LOCAL_ALIGNED_16(t, v,...)
Definition: mem_internal.h:128
EXTERNAL_AVX_FAST
#define EXTERNAL_AVX_FAST(flags)
Definition: cpu.h:71
asm.h
EXTERNAL_SSE3
#define EXTERNAL_SSE3(flags)
Definition: cpu.h:62
ff_four_imdct36_float_sse
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, float *tmpbuf)
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem_internal.h:102
cpu.h
MLSS
#define MLSS(rt, ra, rb)
Definition: mpegaudiodsp_template.c:66
MULT
#define MULT(c, x, n)
Definition: xvididct.c:145
attributes.h
EXTERNAL_SSE2
#define EXTERNAL_SSE2(flags)
Definition: cpu.h:59
DECL
#define DECL(CPU)
Definition: mpegaudiodsp.c:32
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
apply_window
static void(*const apply_window[4])(AVFloatDSPContext *fdsp, SingleChannelElement *sce, const float *audio)
Definition: aacenc.c:466
else
else
Definition: snow.txt:125
ff_dct32_float_avx
void ff_dct32_float_avx(float *out, const float *in)
len
int len
Definition: vorbis_enc_data.h:426
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
EXTERNAL_AVX
#define EXTERNAL_AVX(flags)
Definition: cpu.h:70
mdct_win_sse
static float mdct_win_sse[2][4][4 *40]
Definition: mpegaudiodsp.c:51
ff_mpadsp_init_x86_tabs
av_cold void ff_mpadsp_init_x86_tabs(void)
Definition: mpegaudiodsp.c:242
mpegaudiodsp.h
ff_mdct_win_float
float ff_mdct_win_float[8][MDCT_BUF_SIZE]
sse
static int sse(MpegEncContext *s, const uint8_t *src1, const uint8_t *src2, int w, int h, int stride)
Definition: mpegvideo_enc.c:2653
EXTERNAL_SSSE3
#define EXTERNAL_SSSE3(flags)
Definition: cpu.h:65