FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
wmavoice.c
Go to the documentation of this file.
1 /*
2  * Windows Media Audio Voice decoder.
3  * Copyright (c) 2009 Ronald S. Bultje
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 /**
23  * @file
24  * @brief Windows Media Audio Voice compatible decoder
25  * @author Ronald S. Bultje <rsbultje@gmail.com>
26  */
27 
28 #include <math.h>
29 
31 #include "libavutil/float_dsp.h"
32 #include "libavutil/mem.h"
33 #include "avcodec.h"
34 #include "internal.h"
35 #include "get_bits.h"
36 #include "put_bits.h"
37 #include "wmavoice_data.h"
38 #include "celp_filters.h"
39 #include "acelp_vectors.h"
40 #include "acelp_filters.h"
41 #include "lsp.h"
42 #include "dct.h"
43 #include "rdft.h"
44 #include "sinewin.h"
45 
46 #define MAX_BLOCKS 8 ///< maximum number of blocks per frame
47 #define MAX_LSPS 16 ///< maximum filter order
48 #define MAX_LSPS_ALIGN16 16 ///< same as #MAX_LSPS; needs to be multiple
49  ///< of 16 for ASM input buffer alignment
50 #define MAX_FRAMES 3 ///< maximum number of frames per superframe
51 #define MAX_FRAMESIZE 160 ///< maximum number of samples per frame
52 #define MAX_SIGNAL_HISTORY 416 ///< maximum excitation signal history
53 #define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES)
54  ///< maximum number of samples per superframe
55 #define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
56  ///< was split over two packets
57 #define VLC_NBITS 6 ///< number of bits to read per VLC iteration
58 
59 /**
60  * Frame type VLC coding.
61  */
63 
64 /**
65  * Adaptive codebook types.
66  */
67 enum {
68  ACB_TYPE_NONE = 0, ///< no adaptive codebook (only hardcoded fixed)
69  ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
70  ///< we interpolate to get a per-sample pitch.
71  ///< Signal is generated using an asymmetric sinc
72  ///< window function
73  ///< @note see #wmavoice_ipol1_coeffs
74  ACB_TYPE_HAMMING = 2 ///< Per-block pitch with signal generation using
75  ///< a Hamming sinc window function
76  ///< @note see #wmavoice_ipol2_coeffs
77 };
78 
79 /**
80  * Fixed codebook types.
81  */
82 enum {
83  FCB_TYPE_SILENCE = 0, ///< comfort noise during silence
84  ///< generated from a hardcoded (fixed) codebook
85  ///< with per-frame (low) gain values
86  FCB_TYPE_HARDCODED = 1, ///< hardcoded (fixed) codebook with per-block
87  ///< gain values
88  FCB_TYPE_AW_PULSES = 2, ///< Pitch-adaptive window (AW) pulse signals,
89  ///< used in particular for low-bitrate streams
90  FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
91  ///< combinations of either single pulses or
92  ///< pulse pairs
93 };
94 
95 /**
96  * Description of frame types.
97  */
98 static const struct frame_type_desc {
99  uint8_t n_blocks; ///< amount of blocks per frame (each block
100  ///< (contains 160/#n_blocks samples)
101  uint8_t log_n_blocks; ///< log2(#n_blocks)
102  uint8_t acb_type; ///< Adaptive codebook type (ACB_TYPE_*)
103  uint8_t fcb_type; ///< Fixed codebook type (FCB_TYPE_*)
104  uint8_t dbl_pulses; ///< how many pulse vectors have pulse pairs
105  ///< (rather than just one single pulse)
106  ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
107  uint16_t frame_size; ///< the amount of bits that make up the block
108  ///< data (per frame)
109 } frame_descs[17] = {
110  { 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0, 0 },
111  { 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0, 28 },
112  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES, 0, 46 },
113  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 80 },
114  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
115  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
116  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
117  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
118  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 64 },
119  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 80 },
120  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 104 },
121  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 108 },
122  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 132 },
123  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 168 },
124  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 176 },
125  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 208 },
126  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 256 }
127 };
128 
129 /**
130  * WMA Voice decoding context.
131  */
132 typedef struct {
133  /**
134  * @name Global values specified in the stream header / extradata or used all over.
135  * @{
136  */
137  GetBitContext gb; ///< packet bitreader. During decoder init,
138  ///< it contains the extradata from the
139  ///< demuxer. During decoding, it contains
140  ///< packet data.
141  int8_t vbm_tree[25]; ///< converts VLC codes to frame type
142 
143  int spillover_bitsize; ///< number of bits used to specify
144  ///< #spillover_nbits in the packet header
145  ///< = ceil(log2(ctx->block_align << 3))
146  int history_nsamples; ///< number of samples in history for signal
147  ///< prediction (through ACB)
148 
149  /* postfilter specific values */
150  int do_apf; ///< whether to apply the averaged
151  ///< projection filter (APF)
152  int denoise_strength; ///< strength of denoising in Wiener filter
153  ///< [0-11]
154  int denoise_tilt_corr; ///< Whether to apply tilt correction to the
155  ///< Wiener filter coefficients (postfilter)
156  int dc_level; ///< Predicted amount of DC noise, based
157  ///< on which a DC removal filter is used
158 
159  int lsps; ///< number of LSPs per frame [10 or 16]
160  int lsp_q_mode; ///< defines quantizer defaults [0, 1]
161  int lsp_def_mode; ///< defines different sets of LSP defaults
162  ///< [0, 1]
163  int frame_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
164  ///< per-frame (independent coding)
165  int sframe_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
166  ///< per superframe (residual coding)
167 
168  int min_pitch_val; ///< base value for pitch parsing code
169  int max_pitch_val; ///< max value + 1 for pitch parsing
170  int pitch_nbits; ///< number of bits used to specify the
171  ///< pitch value in the frame header
172  int block_pitch_nbits; ///< number of bits used to specify the
173  ///< first block's pitch value
174  int block_pitch_range; ///< range of the block pitch
175  int block_delta_pitch_nbits; ///< number of bits used to specify the
176  ///< delta pitch between this and the last
177  ///< block's pitch value, used in all but
178  ///< first block
179  int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
180  ///< from -this to +this-1)
181  uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
182  ///< conversion
183 
184  /**
185  * @}
186  *
187  * @name Packet values specified in the packet header or related to a packet.
188  *
189  * A packet is considered to be a single unit of data provided to this
190  * decoder by the demuxer.
191  * @{
192  */
193  int spillover_nbits; ///< number of bits of the previous packet's
194  ///< last superframe preceding this
195  ///< packet's first full superframe (useful
196  ///< for re-synchronization also)
197  int has_residual_lsps; ///< if set, superframes contain one set of
198  ///< LSPs that cover all frames, encoded as
199  ///< independent and residual LSPs; if not
200  ///< set, each frame contains its own, fully
201  ///< independent, LSPs
202  int skip_bits_next; ///< number of bits to skip at the next call
203  ///< to #wmavoice_decode_packet() (since
204  ///< they're part of the previous superframe)
205 
207  ///< cache for superframe data split over
208  ///< multiple packets
209  int sframe_cache_size; ///< set to >0 if we have data from an
210  ///< (incomplete) superframe from a previous
211  ///< packet that spilled over in the current
212  ///< packet; specifies the amount of bits in
213  ///< #sframe_cache
214  PutBitContext pb; ///< bitstream writer for #sframe_cache
215 
216  /**
217  * @}
218  *
219  * @name Frame and superframe values
220  * Superframe and frame data - these can change from frame to frame,
221  * although some of them do in that case serve as a cache / history for
222  * the next frame or superframe.
223  * @{
224  */
225  double prev_lsps[MAX_LSPS]; ///< LSPs of the last frame of the previous
226  ///< superframe
227  int last_pitch_val; ///< pitch value of the previous frame
228  int last_acb_type; ///< frame type [0-2] of the previous frame
229  int pitch_diff_sh16; ///< ((cur_pitch_val - #last_pitch_val)
230  ///< << 16) / #MAX_FRAMESIZE
231  float silence_gain; ///< set for use in blocks if #ACB_TYPE_NONE
232 
233  int aw_idx_is_ext; ///< whether the AW index was encoded in
234  ///< 8 bits (instead of 6)
235  int aw_pulse_range; ///< the range over which #aw_pulse_set1()
236  ///< can apply the pulse, relative to the
237  ///< value in aw_first_pulse_off. The exact
238  ///< position of the first AW-pulse is within
239  ///< [pulse_off, pulse_off + this], and
240  ///< depends on bitstream values; [16 or 24]
241  int aw_n_pulses[2]; ///< number of AW-pulses in each block; note
242  ///< that this number can be negative (in
243  ///< which case it basically means "zero")
244  int aw_first_pulse_off[2]; ///< index of first sample to which to
245  ///< apply AW-pulses, or -0xff if unset
246  int aw_next_pulse_off_cache; ///< the position (relative to start of the
247  ///< second block) at which pulses should
248  ///< start to be positioned, serves as a
249  ///< cache for pitch-adaptive window pulses
250  ///< between blocks
251 
252  int frame_cntr; ///< current frame index [0 - 0xFFFE]; is
253  ///< only used for comfort noise in #pRNG()
254  float gain_pred_err[6]; ///< cache for gain prediction
255  float excitation_history[MAX_SIGNAL_HISTORY];
256  ///< cache of the signal of previous
257  ///< superframes, used as a history for
258  ///< signal generation
259  float synth_history[MAX_LSPS]; ///< see #excitation_history
260  /**
261  * @}
262  *
263  * @name Postfilter values
264  *
265  * Variables used for postfilter implementation, mostly history for
266  * smoothing and so on, and context variables for FFT/iFFT.
267  * @{
268  */
269  RDFTContext rdft, irdft; ///< contexts for FFT-calculation in the
270  ///< postfilter (for denoise filter)
271  DCTContext dct, dst; ///< contexts for phase shift (in Hilbert
272  ///< transform, part of postfilter)
273  float sin[511], cos[511]; ///< 8-bit cosine/sine windows over [-pi,pi]
274  ///< range
275  float postfilter_agc; ///< gain control memory, used in
276  ///< #adaptive_gain_control()
277  float dcf_mem[2]; ///< DC filter history
278  float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
279  ///< zero filter output (i.e. excitation)
280  ///< by postfilter
281  float denoise_filter_cache[MAX_FRAMESIZE];
282  int denoise_filter_cache_size; ///< samples in #denoise_filter_cache
283  DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
284  ///< aligned buffer for LPC tilting
285  DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
286  ///< aligned buffer for denoise coefficients
287  DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
288  ///< aligned buffer for postfilter speech
289  ///< synthesis
290  /**
291  * @}
292  */
294 
295 /**
296  * Set up the variable bit mode (VBM) tree from container extradata.
297  * @param gb bit I/O context.
298  * The bit context (s->gb) should be loaded with byte 23-46 of the
299  * container extradata (i.e. the ones containing the VBM tree).
300  * @param vbm_tree pointer to array to which the decoded VBM tree will be
301  * written.
302  * @return 0 on success, <0 on error.
303  */
304 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
305 {
306  static const uint8_t bits[] = {
307  2, 2, 2, 4, 4, 4,
308  6, 6, 6, 8, 8, 8,
309  10, 10, 10, 12, 12, 12,
310  14, 14, 14, 14
311  };
312  static const uint16_t codes[] = {
313  0x0000, 0x0001, 0x0002, // 00/01/10
314  0x000c, 0x000d, 0x000e, // 11+00/01/10
315  0x003c, 0x003d, 0x003e, // 1111+00/01/10
316  0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10
317  0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10
318  0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10
319  0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
320  };
321  int cntr[8] = { 0 }, n, res;
322 
323  memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
324  for (n = 0; n < 17; n++) {
325  res = get_bits(gb, 3);
326  if (cntr[res] > 3) // should be >= 3 + (res == 7))
327  return -1;
328  vbm_tree[res * 3 + cntr[res]++] = n;
329  }
330  INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
331  bits, 1, 1, codes, 2, 2, 132);
332  return 0;
333 }
334 
335 /**
336  * Set up decoder with parameters from demuxer (extradata etc.).
337  */
339 {
340  int n, flags, pitch_range, lsp16_flag;
341  WMAVoiceContext *s = ctx->priv_data;
342 
343  /**
344  * Extradata layout:
345  * - byte 0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
346  * - byte 19-22: flags field (annoyingly in LE; see below for known
347  * values),
348  * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
349  * rest is 0).
350  */
351  if (ctx->extradata_size != 46) {
352  av_log(ctx, AV_LOG_ERROR,
353  "Invalid extradata size %d (should be 46)\n",
354  ctx->extradata_size);
355  return -1;
356  }
357  flags = AV_RL32(ctx->extradata + 18);
358  s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
359  s->do_apf = flags & 0x1;
360  if (s->do_apf) {
361  ff_rdft_init(&s->rdft, 7, DFT_R2C);
362  ff_rdft_init(&s->irdft, 7, IDFT_C2R);
363  ff_dct_init(&s->dct, 6, DCT_I);
364  ff_dct_init(&s->dst, 6, DST_I);
365 
366  ff_sine_window_init(s->cos, 256);
367  memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
368  for (n = 0; n < 255; n++) {
369  s->sin[n] = -s->sin[510 - n];
370  s->cos[510 - n] = s->cos[n];
371  }
372  }
373  s->denoise_strength = (flags >> 2) & 0xF;
374  if (s->denoise_strength >= 12) {
375  av_log(ctx, AV_LOG_ERROR,
376  "Invalid denoise filter strength %d (max=11)\n",
377  s->denoise_strength);
378  return -1;
379  }
380  s->denoise_tilt_corr = !!(flags & 0x40);
381  s->dc_level = (flags >> 7) & 0xF;
382  s->lsp_q_mode = !!(flags & 0x2000);
383  s->lsp_def_mode = !!(flags & 0x4000);
384  lsp16_flag = flags & 0x1000;
385  if (lsp16_flag) {
386  s->lsps = 16;
387  s->frame_lsp_bitsize = 34;
388  s->sframe_lsp_bitsize = 60;
389  } else {
390  s->lsps = 10;
391  s->frame_lsp_bitsize = 24;
392  s->sframe_lsp_bitsize = 48;
393  }
394  for (n = 0; n < s->lsps; n++)
395  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
396 
397  init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
398  if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
399  av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
400  return -1;
401  }
402 
403  s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8;
404  s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
405  pitch_range = s->max_pitch_val - s->min_pitch_val;
406  if (pitch_range <= 0) {
407  av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
408  return -1;
409  }
410  s->pitch_nbits = av_ceil_log2(pitch_range);
411  s->last_pitch_val = 40;
413  s->history_nsamples = s->max_pitch_val + 8;
414 
416  int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
417  max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
418 
419  av_log(ctx, AV_LOG_ERROR,
420  "Unsupported samplerate %d (min=%d, max=%d)\n",
421  ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
422 
423  return -1;
424  }
425 
426  s->block_conv_table[0] = s->min_pitch_val;
427  s->block_conv_table[1] = (pitch_range * 25) >> 6;
428  s->block_conv_table[2] = (pitch_range * 44) >> 6;
429  s->block_conv_table[3] = s->max_pitch_val - 1;
430  s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
431  if (s->block_delta_pitch_hrange <= 0) {
432  av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
433  return -1;
434  }
435  s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
437  s->block_conv_table[3] + 1 +
438  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
439  s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
440 
441  ctx->channels = 1;
444 
445  return 0;
446 }
447 
448 /**
449  * @name Postfilter functions
450  * Postfilter functions (gain control, wiener denoise filter, DC filter,
451  * kalman smoothening, plus surrounding code to wrap it)
452  * @{
453  */
454 /**
455  * Adaptive gain control (as used in postfilter).
456  *
457  * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
458  * that the energy here is calculated using sum(abs(...)), whereas the
459  * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
460  *
461  * @param out output buffer for filtered samples
462  * @param in input buffer containing the samples as they are after the
463  * postfilter steps so far
464  * @param speech_synth input buffer containing speech synth before postfilter
465  * @param size input buffer size
466  * @param alpha exponential filter factor
467  * @param gain_mem pointer to filter memory (single float)
468  */
469 static void adaptive_gain_control(float *out, const float *in,
470  const float *speech_synth,
471  int size, float alpha, float *gain_mem)
472 {
473  int i;
474  float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
475  float mem = *gain_mem;
476 
477  for (i = 0; i < size; i++) {
478  speech_energy += fabsf(speech_synth[i]);
479  postfilter_energy += fabsf(in[i]);
480  }
481  gain_scale_factor = (1.0 - alpha) * speech_energy / postfilter_energy;
482 
483  for (i = 0; i < size; i++) {
484  mem = alpha * mem + gain_scale_factor;
485  out[i] = in[i] * mem;
486  }
487 
488  *gain_mem = mem;
489 }
490 
491 /**
492  * Kalman smoothing function.
493  *
494  * This function looks back pitch +/- 3 samples back into history to find
495  * the best fitting curve (that one giving the optimal gain of the two
496  * signals, i.e. the highest dot product between the two), and then
497  * uses that signal history to smoothen the output of the speech synthesis
498  * filter.
499  *
500  * @param s WMA Voice decoding context
501  * @param pitch pitch of the speech signal
502  * @param in input speech signal
503  * @param out output pointer for smoothened signal
504  * @param size input/output buffer size
505  *
506  * @returns -1 if no smoothening took place, e.g. because no optimal
507  * fit could be found, or 0 on success.
508  */
509 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
510  const float *in, float *out, int size)
511 {
512  int n;
513  float optimal_gain = 0, dot;
514  const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
515  *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
516  *best_hist_ptr = NULL;
517 
518  /* find best fitting point in history */
519  do {
520  dot = avpriv_scalarproduct_float_c(in, ptr, size);
521  if (dot > optimal_gain) {
522  optimal_gain = dot;
523  best_hist_ptr = ptr;
524  }
525  } while (--ptr >= end);
526 
527  if (optimal_gain <= 0)
528  return -1;
529  dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
530  if (dot <= 0) // would be 1.0
531  return -1;
532 
533  if (optimal_gain <= dot) {
534  dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
535  } else
536  dot = 0.625;
537 
538  /* actual smoothing */
539  for (n = 0; n < size; n++)
540  out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
541 
542  return 0;
543 }
544 
545 /**
546  * Get the tilt factor of a formant filter from its transfer function
547  * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
548  * but somehow (??) it does a speech synthesis filter in the
549  * middle, which is missing here
550  *
551  * @param lpcs LPC coefficients
552  * @param n_lpcs Size of LPC buffer
553  * @returns the tilt factor
554  */
555 static float tilt_factor(const float *lpcs, int n_lpcs)
556 {
557  float rh0, rh1;
558 
559  rh0 = 1.0 + avpriv_scalarproduct_float_c(lpcs, lpcs, n_lpcs);
560  rh1 = lpcs[0] + avpriv_scalarproduct_float_c(lpcs, &lpcs[1], n_lpcs - 1);
561 
562  return rh1 / rh0;
563 }
564 
565 /**
566  * Derive denoise filter coefficients (in real domain) from the LPCs.
567  */
568 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
569  int fcb_type, float *coeffs, int remainder)
570 {
571  float last_coeff, min = 15.0, max = -15.0;
572  float irange, angle_mul, gain_mul, range, sq;
573  int n, idx;
574 
575  /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
576  s->rdft.rdft_calc(&s->rdft, lpcs);
577 #define log_range(var, assign) do { \
578  float tmp = log10f(assign); var = tmp; \
579  max = FFMAX(max, tmp); min = FFMIN(min, tmp); \
580  } while (0)
581  log_range(last_coeff, lpcs[1] * lpcs[1]);
582  for (n = 1; n < 64; n++)
583  log_range(lpcs[n], lpcs[n * 2] * lpcs[n * 2] +
584  lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
585  log_range(lpcs[0], lpcs[0] * lpcs[0]);
586 #undef log_range
587  range = max - min;
588  lpcs[64] = last_coeff;
589 
590  /* Now, use this spectrum to pick out these frequencies with higher
591  * (relative) power/energy (which we then take to be "not noise"),
592  * and set up a table (still in lpc[]) of (relative) gains per frequency.
593  * These frequencies will be maintained, while others ("noise") will be
594  * decreased in the filter output. */
595  irange = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
596  gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
597  (5.0 / 14.7));
598  angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
599  for (n = 0; n <= 64; n++) {
600  float pwr;
601 
602  idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
604  lpcs[n] = angle_mul * pwr;
605 
606  /* 70.57 =~ 1/log10(1.0331663) */
607  idx = (pwr * gain_mul - 0.0295) * 70.570526123;
608  if (idx > 127) { // fall back if index falls outside table range
609  coeffs[n] = wmavoice_energy_table[127] *
610  powf(1.0331663, idx - 127);
611  } else
612  coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
613  }
614 
615  /* calculate the Hilbert transform of the gains, which we do (since this
616  * is a sinus input) by doing a phase shift (in theory, H(sin())=cos()).
617  * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
618  * "moment" of the LPCs in this filter. */
619  s->dct.dct_calc(&s->dct, lpcs);
620  s->dst.dct_calc(&s->dst, lpcs);
621 
622  /* Split out the coefficient indexes into phase/magnitude pairs */
623  idx = 255 + av_clip(lpcs[64], -255, 255);
624  coeffs[0] = coeffs[0] * s->cos[idx];
625  idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
626  last_coeff = coeffs[64] * s->cos[idx];
627  for (n = 63;; n--) {
628  idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
629  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
630  coeffs[n * 2] = coeffs[n] * s->cos[idx];
631 
632  if (!--n) break;
633 
634  idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
635  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
636  coeffs[n * 2] = coeffs[n] * s->cos[idx];
637  }
638  coeffs[1] = last_coeff;
639 
640  /* move into real domain */
641  s->irdft.rdft_calc(&s->irdft, coeffs);
642 
643  /* tilt correction and normalize scale */
644  memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
645  if (s->denoise_tilt_corr) {
646  float tilt_mem = 0;
647 
648  coeffs[remainder - 1] = 0;
649  ff_tilt_compensation(&tilt_mem,
650  -1.8 * tilt_factor(coeffs, remainder - 1),
651  coeffs, remainder);
652  }
653  sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs, coeffs,
654  remainder));
655  for (n = 0; n < remainder; n++)
656  coeffs[n] *= sq;
657 }
658 
659 /**
660  * This function applies a Wiener filter on the (noisy) speech signal as
661  * a means to denoise it.
662  *
663  * - take RDFT of LPCs to get the power spectrum of the noise + speech;
664  * - using this power spectrum, calculate (for each frequency) the Wiener
665  * filter gain, which depends on the frequency power and desired level
666  * of noise subtraction (when set too high, this leads to artifacts)
667  * We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
668  * of 4-8kHz);
669  * - by doing a phase shift, calculate the Hilbert transform of this array
670  * of per-frequency filter-gains to get the filtering coefficients;
671  * - smoothen/normalize/de-tilt these filter coefficients as desired;
672  * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
673  * to get the denoised speech signal;
674  * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
675  * the frame boundary) are saved and applied to subsequent frames by an
676  * overlap-add method (otherwise you get clicking-artifacts).
677  *
678  * @param s WMA Voice decoding context
679  * @param fcb_type Frame (codebook) type
680  * @param synth_pf input: the noisy speech signal, output: denoised speech
681  * data; should be 16-byte aligned (for ASM purposes)
682  * @param size size of the speech data
683  * @param lpcs LPCs used to synthesize this frame's speech data
684  */
685 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
686  float *synth_pf, int size,
687  const float *lpcs)
688 {
689  int remainder, lim, n;
690 
691  if (fcb_type != FCB_TYPE_SILENCE) {
692  float *tilted_lpcs = s->tilted_lpcs_pf,
693  *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
694 
695  tilted_lpcs[0] = 1.0;
696  memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
697  memset(&tilted_lpcs[s->lsps + 1], 0,
698  sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
699  ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
700  tilted_lpcs, s->lsps + 2);
701 
702  /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
703  * size is applied to the next frame. All input beyond this is zero,
704  * and thus all output beyond this will go towards zero, hence we can
705  * limit to min(size-1, 127-size) as a performance consideration. */
706  remainder = FFMIN(127 - size, size - 1);
707  calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
708 
709  /* apply coefficients (in frequency spectrum domain), i.e. complex
710  * number multiplication */
711  memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
712  s->rdft.rdft_calc(&s->rdft, synth_pf);
713  s->rdft.rdft_calc(&s->rdft, coeffs);
714  synth_pf[0] *= coeffs[0];
715  synth_pf[1] *= coeffs[1];
716  for (n = 1; n < 64; n++) {
717  float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
718  synth_pf[n * 2] = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
719  synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
720  }
721  s->irdft.rdft_calc(&s->irdft, synth_pf);
722  }
723 
724  /* merge filter output with the history of previous runs */
725  if (s->denoise_filter_cache_size) {
726  lim = FFMIN(s->denoise_filter_cache_size, size);
727  for (n = 0; n < lim; n++)
728  synth_pf[n] += s->denoise_filter_cache[n];
729  s->denoise_filter_cache_size -= lim;
730  memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
732  }
733 
734  /* move remainder of filter output into a cache for future runs */
735  if (fcb_type != FCB_TYPE_SILENCE) {
736  lim = FFMIN(remainder, s->denoise_filter_cache_size);
737  for (n = 0; n < lim; n++)
738  s->denoise_filter_cache[n] += synth_pf[size + n];
739  if (lim < remainder) {
740  memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
741  sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
742  s->denoise_filter_cache_size = remainder;
743  }
744  }
745 }
746 
747 /**
748  * Averaging projection filter, the postfilter used in WMAVoice.
749  *
750  * This uses the following steps:
751  * - A zero-synthesis filter (generate excitation from synth signal)
752  * - Kalman smoothing on excitation, based on pitch
753  * - Re-synthesized smoothened output
754  * - Iterative Wiener denoise filter
755  * - Adaptive gain filter
756  * - DC filter
757  *
758  * @param s WMAVoice decoding context
759  * @param synth Speech synthesis output (before postfilter)
760  * @param samples Output buffer for filtered samples
761  * @param size Buffer size of synth & samples
762  * @param lpcs Generated LPCs used for speech synthesis
763  * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
764  * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
765  * @param pitch Pitch of the input signal
766  */
767 static void postfilter(WMAVoiceContext *s, const float *synth,
768  float *samples, int size,
769  const float *lpcs, float *zero_exc_pf,
770  int fcb_type, int pitch)
771 {
772  float synth_filter_in_buf[MAX_FRAMESIZE / 2],
773  *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
774  *synth_filter_in = zero_exc_pf;
775 
776  av_assert0(size <= MAX_FRAMESIZE / 2);
777 
778  /* generate excitation from input signal */
779  ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
780 
781  if (fcb_type >= FCB_TYPE_AW_PULSES &&
782  !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
783  synth_filter_in = synth_filter_in_buf;
784 
785  /* re-synthesize speech after smoothening, and keep history */
786  ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
787  synth_filter_in, size, s->lsps);
788  memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
789  sizeof(synth_pf[0]) * s->lsps);
790 
791  wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
792 
793  adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
794  &s->postfilter_agc);
795 
796  if (s->dc_level > 8) {
797  /* remove ultra-low frequency DC noise / highpass filter;
798  * coefficients are identical to those used in SIPR decoding,
799  * and very closely resemble those used in AMR-NB decoding. */
801  (const float[2]) { -1.99997, 1.0 },
802  (const float[2]) { -1.9330735188, 0.93589198496 },
803  0.93980580475, s->dcf_mem, size);
804  }
805 }
806 /**
807  * @}
808  */
809 
810 /**
811  * Dequantize LSPs
812  * @param lsps output pointer to the array that will hold the LSPs
813  * @param num number of LSPs to be dequantized
814  * @param values quantized values, contains n_stages values
815  * @param sizes range (i.e. max value) of each quantized value
816  * @param n_stages number of dequantization runs
817  * @param table dequantization table to be used
818  * @param mul_q LSF multiplier
819  * @param base_q base (lowest) LSF values
820  */
821 static void dequant_lsps(double *lsps, int num,
822  const uint16_t *values,
823  const uint16_t *sizes,
824  int n_stages, const uint8_t *table,
825  const double *mul_q,
826  const double *base_q)
827 {
828  int n, m;
829 
830  memset(lsps, 0, num * sizeof(*lsps));
831  for (n = 0; n < n_stages; n++) {
832  const uint8_t *t_off = &table[values[n] * num];
833  double base = base_q[n], mul = mul_q[n];
834 
835  for (m = 0; m < num; m++)
836  lsps[m] += base + mul * t_off[m];
837 
838  table += sizes[n] * num;
839  }
840 }
841 
842 /**
843  * @name LSP dequantization routines
844  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
845  * @note we assume enough bits are available, caller should check.
846  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
847  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
848  * @{
849  */
850 /**
851  * Parse 10 independently-coded LSPs.
852  */
853 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
854 {
855  static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
856  static const double mul_lsf[4] = {
857  5.2187144800e-3, 1.4626986422e-3,
858  9.6179549166e-4, 1.1325736225e-3
859  };
860  static const double base_lsf[4] = {
861  M_PI * -2.15522e-1, M_PI * -6.1646e-2,
862  M_PI * -3.3486e-2, M_PI * -5.7408e-2
863  };
864  uint16_t v[4];
865 
866  v[0] = get_bits(gb, 8);
867  v[1] = get_bits(gb, 6);
868  v[2] = get_bits(gb, 5);
869  v[3] = get_bits(gb, 5);
870 
871  dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
872  mul_lsf, base_lsf);
873 }
874 
875 /**
876  * Parse 10 independently-coded LSPs, and then derive the tables to
877  * generate LSPs for the other frames from them (residual coding).
878  */
880  double *i_lsps, const double *old,
881  double *a1, double *a2, int q_mode)
882 {
883  static const uint16_t vec_sizes[3] = { 128, 64, 64 };
884  static const double mul_lsf[3] = {
885  2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
886  };
887  static const double base_lsf[3] = {
888  M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
889  };
890  const float (*ipol_tab)[2][10] = q_mode ?
892  uint16_t interpol, v[3];
893  int n;
894 
895  dequant_lsp10i(gb, i_lsps);
896 
897  interpol = get_bits(gb, 5);
898  v[0] = get_bits(gb, 7);
899  v[1] = get_bits(gb, 6);
900  v[2] = get_bits(gb, 6);
901 
902  for (n = 0; n < 10; n++) {
903  double delta = old[n] - i_lsps[n];
904  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
905  a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
906  }
907 
908  dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
909  mul_lsf, base_lsf);
910 }
911 
912 /**
913  * Parse 16 independently-coded LSPs.
914  */
915 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
916 {
917  static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
918  static const double mul_lsf[5] = {
919  3.3439586280e-3, 6.9908173703e-4,
920  3.3216608306e-3, 1.0334960326e-3,
921  3.1899104283e-3
922  };
923  static const double base_lsf[5] = {
924  M_PI * -1.27576e-1, M_PI * -2.4292e-2,
925  M_PI * -1.28094e-1, M_PI * -3.2128e-2,
926  M_PI * -1.29816e-1
927  };
928  uint16_t v[5];
929 
930  v[0] = get_bits(gb, 8);
931  v[1] = get_bits(gb, 6);
932  v[2] = get_bits(gb, 7);
933  v[3] = get_bits(gb, 6);
934  v[4] = get_bits(gb, 7);
935 
936  dequant_lsps( lsps, 5, v, vec_sizes, 2,
937  wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
938  dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
939  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
940  dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
941  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
942 }
943 
944 /**
945  * Parse 16 independently-coded LSPs, and then derive the tables to
946  * generate LSPs for the other frames from them (residual coding).
947  */
949  double *i_lsps, const double *old,
950  double *a1, double *a2, int q_mode)
951 {
952  static const uint16_t vec_sizes[3] = { 128, 128, 128 };
953  static const double mul_lsf[3] = {
954  1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
955  };
956  static const double base_lsf[3] = {
957  M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
958  };
959  const float (*ipol_tab)[2][16] = q_mode ?
961  uint16_t interpol, v[3];
962  int n;
963 
964  dequant_lsp16i(gb, i_lsps);
965 
966  interpol = get_bits(gb, 5);
967  v[0] = get_bits(gb, 7);
968  v[1] = get_bits(gb, 7);
969  v[2] = get_bits(gb, 7);
970 
971  for (n = 0; n < 16; n++) {
972  double delta = old[n] - i_lsps[n];
973  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
974  a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
975  }
976 
977  dequant_lsps( a2, 10, v, vec_sizes, 1,
978  wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
979  dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
980  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
981  dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
982  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
983 }
984 
985 /**
986  * @}
987  * @name Pitch-adaptive window coding functions
988  * The next few functions are for pitch-adaptive window coding.
989  * @{
990  */
991 /**
992  * Parse the offset of the first pitch-adaptive window pulses, and
993  * the distribution of pulses between the two blocks in this frame.
994  * @param s WMA Voice decoding context private data
995  * @param gb bit I/O context
996  * @param pitch pitch for each block in this frame
997  */
999  const int *pitch)
1000 {
1001  static const int16_t start_offset[94] = {
1002  -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
1003  13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
1004  27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
1005  45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
1006  69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
1007  93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
1008  117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1009  141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1010  };
1011  int bits, offset;
1012 
1013  /* position of pulse */
1014  s->aw_idx_is_ext = 0;
1015  if ((bits = get_bits(gb, 6)) >= 54) {
1016  s->aw_idx_is_ext = 1;
1017  bits += (bits - 54) * 3 + get_bits(gb, 2);
1018  }
1019 
1020  /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1021  * the distribution of the pulses in each block contained in this frame. */
1022  s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1023  for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1024  s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1025  s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1026  offset += s->aw_n_pulses[0] * pitch[0];
1027  s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1028  s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1029 
1030  /* if continuing from a position before the block, reset position to
1031  * start of block (when corrected for the range over which it can be
1032  * spread in aw_pulse_set1()). */
1033  if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1034  while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1035  s->aw_first_pulse_off[1] -= pitch[1];
1036  if (start_offset[bits] < 0)
1037  while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1038  s->aw_first_pulse_off[0] -= pitch[0];
1039  }
1040 }
1041 
1042 /**
1043  * Apply second set of pitch-adaptive window pulses.
1044  * @param s WMA Voice decoding context private data
1045  * @param gb bit I/O context
1046  * @param block_idx block index in frame [0, 1]
1047  * @param fcb structure containing fixed codebook vector info
1048  */
1050  int block_idx, AMRFixed *fcb)
1051 {
1052  uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1053  uint16_t *use_mask = use_mask_mem + 2;
1054  /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1055  * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1056  * of idx are the position of the bit within a particular item in the
1057  * array (0 being the most significant bit, and 15 being the least
1058  * significant bit), and the remainder (>> 4) is the index in the
1059  * use_mask[]-array. This is faster and uses less memory than using a
1060  * 80-byte/80-int array. */
1061  int pulse_off = s->aw_first_pulse_off[block_idx],
1062  pulse_start, n, idx, range, aidx, start_off = 0;
1063 
1064  /* set offset of first pulse to within this block */
1065  if (s->aw_n_pulses[block_idx] > 0)
1066  while (pulse_off + s->aw_pulse_range < 1)
1067  pulse_off += fcb->pitch_lag;
1068 
1069  /* find range per pulse */
1070  if (s->aw_n_pulses[0] > 0) {
1071  if (block_idx == 0) {
1072  range = 32;
1073  } else /* block_idx = 1 */ {
1074  range = 8;
1075  if (s->aw_n_pulses[block_idx] > 0)
1076  pulse_off = s->aw_next_pulse_off_cache;
1077  }
1078  } else
1079  range = 16;
1080  pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1081 
1082  /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1083  * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1084  * we exclude that range from being pulsed again in this function. */
1085  memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1086  memset( use_mask, -1, 5 * sizeof(use_mask[0]));
1087  memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1088  if (s->aw_n_pulses[block_idx] > 0)
1089  for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1090  int excl_range = s->aw_pulse_range; // always 16 or 24
1091  uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1092  int first_sh = 16 - (idx & 15);
1093  *use_mask_ptr++ &= 0xFFFFu << first_sh;
1094  excl_range -= first_sh;
1095  if (excl_range >= 16) {
1096  *use_mask_ptr++ = 0;
1097  *use_mask_ptr &= 0xFFFF >> (excl_range - 16);
1098  } else
1099  *use_mask_ptr &= 0xFFFF >> excl_range;
1100  }
1101 
1102  /* find the 'aidx'th offset that is not excluded */
1103  aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1104  for (n = 0; n <= aidx; pulse_start++) {
1105  for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1106  if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1107  if (use_mask[0]) idx = 0x0F;
1108  else if (use_mask[1]) idx = 0x1F;
1109  else if (use_mask[2]) idx = 0x2F;
1110  else if (use_mask[3]) idx = 0x3F;
1111  else if (use_mask[4]) idx = 0x4F;
1112  else return;
1113  idx -= av_log2_16bit(use_mask[idx >> 4]);
1114  }
1115  if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1116  use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1117  n++;
1118  start_off = idx;
1119  }
1120  }
1121 
1122  fcb->x[fcb->n] = start_off;
1123  fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1124  fcb->n++;
1125 
1126  /* set offset for next block, relative to start of that block */
1127  n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1128  s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1129 }
1130 
1131 /**
1132  * Apply first set of pitch-adaptive window pulses.
1133  * @param s WMA Voice decoding context private data
1134  * @param gb bit I/O context
1135  * @param block_idx block index in frame [0, 1]
1136  * @param fcb storage location for fixed codebook pulse info
1137  */
1139  int block_idx, AMRFixed *fcb)
1140 {
1141  int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1142  float v;
1143 
1144  if (s->aw_n_pulses[block_idx] > 0) {
1145  int n, v_mask, i_mask, sh, n_pulses;
1146 
1147  if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1148  n_pulses = 3;
1149  v_mask = 8;
1150  i_mask = 7;
1151  sh = 4;
1152  } else { // 4 pulses, 1:sign + 2:index each
1153  n_pulses = 4;
1154  v_mask = 4;
1155  i_mask = 3;
1156  sh = 3;
1157  }
1158 
1159  for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1160  fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1161  fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1162  s->aw_first_pulse_off[block_idx];
1163  while (fcb->x[fcb->n] < 0)
1164  fcb->x[fcb->n] += fcb->pitch_lag;
1165  if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1166  fcb->n++;
1167  }
1168  } else {
1169  int num2 = (val & 0x1FF) >> 1, delta, idx;
1170 
1171  if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; }
1172  else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1173  else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1174  else { delta = 7; idx = num2 + 1 - 3 * 75; }
1175  v = (val & 0x200) ? -1.0 : 1.0;
1176 
1177  fcb->no_repeat_mask |= 3 << fcb->n;
1178  fcb->x[fcb->n] = idx - delta;
1179  fcb->y[fcb->n] = v;
1180  fcb->x[fcb->n + 1] = idx;
1181  fcb->y[fcb->n + 1] = (val & 1) ? -v : v;
1182  fcb->n += 2;
1183  }
1184 }
1185 
1186 /**
1187  * @}
1188  *
1189  * Generate a random number from frame_cntr and block_idx, which will lief
1190  * in the range [0, 1000 - block_size] (so it can be used as an index in a
1191  * table of size 1000 of which you want to read block_size entries).
1192  *
1193  * @param frame_cntr current frame number
1194  * @param block_num current block index
1195  * @param block_size amount of entries we want to read from a table
1196  * that has 1000 entries
1197  * @return a (non-)random number in the [0, 1000 - block_size] range.
1198  */
1199 static int pRNG(int frame_cntr, int block_num, int block_size)
1200 {
1201  /* array to simplify the calculation of z:
1202  * y = (x % 9) * 5 + 6;
1203  * z = (49995 * x) / y;
1204  * Since y only has 9 values, we can remove the division by using a
1205  * LUT and using FASTDIV-style divisions. For each of the 9 values
1206  * of y, we can rewrite z as:
1207  * z = x * (49995 / y) + x * ((49995 % y) / y)
1208  * In this table, each col represents one possible value of y, the
1209  * first number is 49995 / y, and the second is the FASTDIV variant
1210  * of 49995 % y / y. */
1211  static const unsigned int div_tbl[9][2] = {
1212  { 8332, 3 * 715827883U }, // y = 6
1213  { 4545, 0 * 390451573U }, // y = 11
1214  { 3124, 11 * 268435456U }, // y = 16
1215  { 2380, 15 * 204522253U }, // y = 21
1216  { 1922, 23 * 165191050U }, // y = 26
1217  { 1612, 23 * 138547333U }, // y = 31
1218  { 1388, 27 * 119304648U }, // y = 36
1219  { 1219, 16 * 104755300U }, // y = 41
1220  { 1086, 39 * 93368855U } // y = 46
1221  };
1222  unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
1223  if (x >= 0xFFFF) x -= 0xFFFF; // max value of x is 8*1877+0xFFFE=0x13AA6,
1224  // so this is effectively a modulo (%)
1225  y = x - 9 * MULH(477218589, x); // x % 9
1226  z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
1227  // z = x * 49995 / (y * 5 + 6)
1228  return z % (1000 - block_size);
1229 }
1230 
1231 /**
1232  * Parse hardcoded signal for a single block.
1233  * @note see #synth_block().
1234  */
1236  int block_idx, int size,
1237  const struct frame_type_desc *frame_desc,
1238  float *excitation)
1239 {
1240  float gain;
1241  int n, r_idx;
1242 
1243  av_assert0(size <= MAX_FRAMESIZE);
1244 
1245  /* Set the offset from which we start reading wmavoice_std_codebook */
1246  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1247  r_idx = pRNG(s->frame_cntr, block_idx, size);
1248  gain = s->silence_gain;
1249  } else /* FCB_TYPE_HARDCODED */ {
1250  r_idx = get_bits(gb, 8);
1251  gain = wmavoice_gain_universal[get_bits(gb, 6)];
1252  }
1253 
1254  /* Clear gain prediction parameters */
1255  memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1256 
1257  /* Apply gain to hardcoded codebook and use that as excitation signal */
1258  for (n = 0; n < size; n++)
1259  excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1260 }
1261 
1262 /**
1263  * Parse FCB/ACB signal for a single block.
1264  * @note see #synth_block().
1265  */
1267  int block_idx, int size,
1268  int block_pitch_sh2,
1269  const struct frame_type_desc *frame_desc,
1270  float *excitation)
1271 {
1272  static const float gain_coeff[6] = {
1273  0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1274  };
1275  float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1276  int n, idx, gain_weight;
1277  AMRFixed fcb;
1278 
1279  av_assert0(size <= MAX_FRAMESIZE / 2);
1280  memset(pulses, 0, sizeof(*pulses) * size);
1281 
1282  fcb.pitch_lag = block_pitch_sh2 >> 2;
1283  fcb.pitch_fac = 1.0;
1284  fcb.no_repeat_mask = 0;
1285  fcb.n = 0;
1286 
1287  /* For the other frame types, this is where we apply the innovation
1288  * (fixed) codebook pulses of the speech signal. */
1289  if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1290  aw_pulse_set1(s, gb, block_idx, &fcb);
1291  aw_pulse_set2(s, gb, block_idx, &fcb);
1292  } else /* FCB_TYPE_EXC_PULSES */ {
1293  int offset_nbits = 5 - frame_desc->log_n_blocks;
1294 
1295  fcb.no_repeat_mask = -1;
1296  /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1297  * (instead of double) for a subset of pulses */
1298  for (n = 0; n < 5; n++) {
1299  float sign;
1300  int pos1, pos2;
1301 
1302  sign = get_bits1(gb) ? 1.0 : -1.0;
1303  pos1 = get_bits(gb, offset_nbits);
1304  fcb.x[fcb.n] = n + 5 * pos1;
1305  fcb.y[fcb.n++] = sign;
1306  if (n < frame_desc->dbl_pulses) {
1307  pos2 = get_bits(gb, offset_nbits);
1308  fcb.x[fcb.n] = n + 5 * pos2;
1309  fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1310  }
1311  }
1312  }
1313  ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1314 
1315  /* Calculate gain for adaptive & fixed codebook signal.
1316  * see ff_amr_set_fixed_gain(). */
1317  idx = get_bits(gb, 7);
1319  gain_coeff, 6) -
1320  5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1321  acb_gain = wmavoice_gain_codebook_acb[idx];
1322  pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1323  -2.9957322736 /* log(0.05) */,
1324  1.6094379124 /* log(5.0) */);
1325 
1326  gain_weight = 8 >> frame_desc->log_n_blocks;
1327  memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1328  sizeof(*s->gain_pred_err) * (6 - gain_weight));
1329  for (n = 0; n < gain_weight; n++)
1330  s->gain_pred_err[n] = pred_err;
1331 
1332  /* Calculation of adaptive codebook */
1333  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1334  int len;
1335  for (n = 0; n < size; n += len) {
1336  int next_idx_sh16;
1337  int abs_idx = block_idx * size + n;
1338  int pitch_sh16 = (s->last_pitch_val << 16) +
1339  s->pitch_diff_sh16 * abs_idx;
1340  int pitch = (pitch_sh16 + 0x6FFF) >> 16;
1341  int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1342  idx = idx_sh16 >> 16;
1343  if (s->pitch_diff_sh16) {
1344  if (s->pitch_diff_sh16 > 0) {
1345  next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1346  } else
1347  next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1348  len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1349  1, size - n);
1350  } else
1351  len = size;
1352 
1353  ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1355  idx, 9, len);
1356  }
1357  } else /* ACB_TYPE_HAMMING */ {
1358  int block_pitch = block_pitch_sh2 >> 2;
1359  idx = block_pitch_sh2 & 3;
1360  if (idx) {
1361  ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1363  idx, 8, size);
1364  } else
1365  av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1366  sizeof(float) * size);
1367  }
1368 
1369  /* Interpolate ACB/FCB and use as excitation signal */
1370  ff_weighted_vector_sumf(excitation, excitation, pulses,
1371  acb_gain, fcb_gain, size);
1372 }
1373 
1374 /**
1375  * Parse data in a single block.
1376  * @note we assume enough bits are available, caller should check.
1377  *
1378  * @param s WMA Voice decoding context private data
1379  * @param gb bit I/O context
1380  * @param block_idx index of the to-be-read block
1381  * @param size amount of samples to be read in this block
1382  * @param block_pitch_sh2 pitch for this block << 2
1383  * @param lsps LSPs for (the end of) this frame
1384  * @param prev_lsps LSPs for the last frame
1385  * @param frame_desc frame type descriptor
1386  * @param excitation target memory for the ACB+FCB interpolated signal
1387  * @param synth target memory for the speech synthesis filter output
1388  * @return 0 on success, <0 on error.
1389  */
1391  int block_idx, int size,
1392  int block_pitch_sh2,
1393  const double *lsps, const double *prev_lsps,
1394  const struct frame_type_desc *frame_desc,
1395  float *excitation, float *synth)
1396 {
1397  double i_lsps[MAX_LSPS];
1398  float lpcs[MAX_LSPS];
1399  float fac;
1400  int n;
1401 
1402  if (frame_desc->acb_type == ACB_TYPE_NONE)
1403  synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1404  else
1405  synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1406  frame_desc, excitation);
1407 
1408  /* convert interpolated LSPs to LPCs */
1409  fac = (block_idx + 0.5) / frame_desc->n_blocks;
1410  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1411  i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1412  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1413 
1414  /* Speech synthesis */
1415  ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1416 }
1417 
1418 /**
1419  * Synthesize output samples for a single frame.
1420  * @note we assume enough bits are available, caller should check.
1421  *
1422  * @param ctx WMA Voice decoder context
1423  * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1424  * @param frame_idx Frame number within superframe [0-2]
1425  * @param samples pointer to output sample buffer, has space for at least 160
1426  * samples
1427  * @param lsps LSP array
1428  * @param prev_lsps array of previous frame's LSPs
1429  * @param excitation target buffer for excitation signal
1430  * @param synth target buffer for synthesized speech data
1431  * @return 0 on success, <0 on error.
1432  */
1433 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1434  float *samples,
1435  const double *lsps, const double *prev_lsps,
1436  float *excitation, float *synth)
1437 {
1438  WMAVoiceContext *s = ctx->priv_data;
1439  int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
1440  int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
1441 
1442  /* Parse frame type ("frame header"), see frame_descs */
1443  int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1444 
1445  if (bd_idx < 0) {
1446  av_log(ctx, AV_LOG_ERROR,
1447  "Invalid frame type VLC code, skipping\n");
1448  return -1;
1449  }
1450 
1451  block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1452 
1453  /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1454  if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1455  /* Pitch is provided per frame, which is interpreted as the pitch of
1456  * the last sample of the last block of this frame. We can interpolate
1457  * the pitch of other blocks (and even pitch-per-sample) by gradually
1458  * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1459  n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
1460  log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
1461  cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1462  cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1463  if (s->last_acb_type == ACB_TYPE_NONE ||
1464  20 * abs(cur_pitch_val - s->last_pitch_val) >
1465  (cur_pitch_val + s->last_pitch_val))
1466  s->last_pitch_val = cur_pitch_val;
1467 
1468  /* pitch per block */
1469  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1470  int fac = n * 2 + 1;
1471 
1472  pitch[n] = (MUL16(fac, cur_pitch_val) +
1473  MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1474  frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1475  }
1476 
1477  /* "pitch-diff-per-sample" for calculation of pitch per sample */
1478  s->pitch_diff_sh16 =
1479  ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1480  }
1481 
1482  /* Global gain (if silence) and pitch-adaptive window coordinates */
1483  switch (frame_descs[bd_idx].fcb_type) {
1484  case FCB_TYPE_SILENCE:
1486  break;
1487  case FCB_TYPE_AW_PULSES:
1488  aw_parse_coords(s, gb, pitch);
1489  break;
1490  }
1491 
1492  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1493  int bl_pitch_sh2;
1494 
1495  /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1496  switch (frame_descs[bd_idx].acb_type) {
1497  case ACB_TYPE_HAMMING: {
1498  /* Pitch is given per block. Per-block pitches are encoded as an
1499  * absolute value for the first block, and then delta values
1500  * relative to this value) for all subsequent blocks. The scale of
1501  * this pitch value is semi-logaritmic compared to its use in the
1502  * decoder, so we convert it to normal scale also. */
1503  int block_pitch,
1504  t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1505  t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1506  t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
1507 
1508  if (n == 0) {
1509  block_pitch = get_bits(gb, s->block_pitch_nbits);
1510  } else
1511  block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1513  /* Convert last_ so that any next delta is within _range */
1514  last_block_pitch = av_clip(block_pitch,
1516  s->block_pitch_range -
1518 
1519  /* Convert semi-log-style scale back to normal scale */
1520  if (block_pitch < t1) {
1521  bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1522  } else {
1523  block_pitch -= t1;
1524  if (block_pitch < t2) {
1525  bl_pitch_sh2 =
1526  (s->block_conv_table[1] << 2) + (block_pitch << 1);
1527  } else {
1528  block_pitch -= t2;
1529  if (block_pitch < t3) {
1530  bl_pitch_sh2 =
1531  (s->block_conv_table[2] + block_pitch) << 2;
1532  } else
1533  bl_pitch_sh2 = s->block_conv_table[3] << 2;
1534  }
1535  }
1536  pitch[n] = bl_pitch_sh2 >> 2;
1537  break;
1538  }
1539 
1540  case ACB_TYPE_ASYMMETRIC: {
1541  bl_pitch_sh2 = pitch[n] << 2;
1542  break;
1543  }
1544 
1545  default: // ACB_TYPE_NONE has no pitch
1546  bl_pitch_sh2 = 0;
1547  break;
1548  }
1549 
1550  synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1551  lsps, prev_lsps, &frame_descs[bd_idx],
1552  &excitation[n * block_nsamples],
1553  &synth[n * block_nsamples]);
1554  }
1555 
1556  /* Averaging projection filter, if applicable. Else, just copy samples
1557  * from synthesis buffer */
1558  if (s->do_apf) {
1559  double i_lsps[MAX_LSPS];
1560  float lpcs[MAX_LSPS];
1561 
1562  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1563  i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1564  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1565  postfilter(s, synth, samples, 80, lpcs,
1566  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1567  frame_descs[bd_idx].fcb_type, pitch[0]);
1568 
1569  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1570  i_lsps[n] = cos(lsps[n]);
1571  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1572  postfilter(s, &synth[80], &samples[80], 80, lpcs,
1573  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1574  frame_descs[bd_idx].fcb_type, pitch[0]);
1575  } else
1576  memcpy(samples, synth, 160 * sizeof(synth[0]));
1577 
1578  /* Cache values for next frame */
1579  s->frame_cntr++;
1580  if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1581  s->last_acb_type = frame_descs[bd_idx].acb_type;
1582  switch (frame_descs[bd_idx].acb_type) {
1583  case ACB_TYPE_NONE:
1584  s->last_pitch_val = 0;
1585  break;
1586  case ACB_TYPE_ASYMMETRIC:
1587  s->last_pitch_val = cur_pitch_val;
1588  break;
1589  case ACB_TYPE_HAMMING:
1590  s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1591  break;
1592  }
1593 
1594  return 0;
1595 }
1596 
1597 /**
1598  * Ensure minimum value for first item, maximum value for last value,
1599  * proper spacing between each value and proper ordering.
1600  *
1601  * @param lsps array of LSPs
1602  * @param num size of LSP array
1603  *
1604  * @note basically a double version of #ff_acelp_reorder_lsf(), might be
1605  * useful to put in a generic location later on. Parts are also
1606  * present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1607  * which is in float.
1608  */
1609 static void stabilize_lsps(double *lsps, int num)
1610 {
1611  int n, m, l;
1612 
1613  /* set minimum value for first, maximum value for last and minimum
1614  * spacing between LSF values.
1615  * Very similar to ff_set_min_dist_lsf(), but in double. */
1616  lsps[0] = FFMAX(lsps[0], 0.0015 * M_PI);
1617  for (n = 1; n < num; n++)
1618  lsps[n] = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
1619  lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1620 
1621  /* reorder (looks like one-time / non-recursed bubblesort).
1622  * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1623  for (n = 1; n < num; n++) {
1624  if (lsps[n] < lsps[n - 1]) {
1625  for (m = 1; m < num; m++) {
1626  double tmp = lsps[m];
1627  for (l = m - 1; l >= 0; l--) {
1628  if (lsps[l] <= tmp) break;
1629  lsps[l + 1] = lsps[l];
1630  }
1631  lsps[l + 1] = tmp;
1632  }
1633  break;
1634  }
1635  }
1636 }
1637 
1638 /**
1639  * Test if there's enough bits to read 1 superframe.
1640  *
1641  * @param orig_gb bit I/O context used for reading. This function
1642  * does not modify the state of the bitreader; it
1643  * only uses it to copy the current stream position
1644  * @param s WMA Voice decoding context private data
1645  * @return -1 if unsupported, 1 on not enough bits or 0 if OK.
1646  */
1648  WMAVoiceContext *s)
1649 {
1650  GetBitContext s_gb, *gb = &s_gb;
1651  int n, need_bits, bd_idx;
1652  const struct frame_type_desc *frame_desc;
1653 
1654  /* initialize a copy */
1655  init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
1656  skip_bits_long(gb, get_bits_count(orig_gb));
1657  av_assert1(get_bits_left(gb) == get_bits_left(orig_gb));
1658 
1659  /* superframe header */
1660  if (get_bits_left(gb) < 14)
1661  return 1;
1662  if (!get_bits1(gb))
1663  return -1; // WMAPro-in-WMAVoice superframe
1664  if (get_bits1(gb)) skip_bits(gb, 12); // number of samples in superframe
1665  if (s->has_residual_lsps) { // residual LSPs (for all frames)
1666  if (get_bits_left(gb) < s->sframe_lsp_bitsize)
1667  return 1;
1669  }
1670 
1671  /* frames */
1672  for (n = 0; n < MAX_FRAMES; n++) {
1673  int aw_idx_is_ext = 0;
1674 
1675  if (!s->has_residual_lsps) { // independent LSPs (per-frame)
1676  if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
1678  }
1679  bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
1680  if (bd_idx < 0)
1681  return -1; // invalid frame type VLC code
1682  frame_desc = &frame_descs[bd_idx];
1683  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1684  if (get_bits_left(gb) < s->pitch_nbits)
1685  return 1;
1686  skip_bits_long(gb, s->pitch_nbits);
1687  }
1688  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1689  skip_bits(gb, 8);
1690  } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1691  int tmp = get_bits(gb, 6);
1692  if (tmp >= 0x36) {
1693  skip_bits(gb, 2);
1694  aw_idx_is_ext = 1;
1695  }
1696  }
1697 
1698  /* blocks */
1699  if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
1700  need_bits = s->block_pitch_nbits +
1701  (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
1702  } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1703  need_bits = 2 * !aw_idx_is_ext;
1704  } else
1705  need_bits = 0;
1706  need_bits += frame_desc->frame_size;
1707  if (get_bits_left(gb) < need_bits)
1708  return 1;
1709  skip_bits_long(gb, need_bits);
1710  }
1711 
1712  return 0;
1713 }
1714 
1715 /**
1716  * Synthesize output samples for a single superframe. If we have any data
1717  * cached in s->sframe_cache, that will be used instead of whatever is loaded
1718  * in s->gb.
1719  *
1720  * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1721  * to give a total of 480 samples per frame. See #synth_frame() for frame
1722  * parsing. In addition to 3 frames, superframes can also contain the LSPs
1723  * (if these are globally specified for all frames (residually); they can
1724  * also be specified individually per-frame. See the s->has_residual_lsps
1725  * option), and can specify the number of samples encoded in this superframe
1726  * (if less than 480), usually used to prevent blanks at track boundaries.
1727  *
1728  * @param ctx WMA Voice decoder context
1729  * @return 0 on success, <0 on error or 1 if there was not enough data to
1730  * fully parse the superframe
1731  */
1733  int *got_frame_ptr)
1734 {
1735  WMAVoiceContext *s = ctx->priv_data;
1736  GetBitContext *gb = &s->gb, s_gb;
1737  int n, res, n_samples = 480;
1738  double lsps[MAX_FRAMES][MAX_LSPS];
1739  const double *mean_lsf = s->lsps == 16 ?
1741  float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1742  float synth[MAX_LSPS + MAX_SFRAMESIZE];
1743  float *samples;
1744 
1745  memcpy(synth, s->synth_history,
1746  s->lsps * sizeof(*synth));
1747  memcpy(excitation, s->excitation_history,
1748  s->history_nsamples * sizeof(*excitation));
1749 
1750  if (s->sframe_cache_size > 0) {
1751  gb = &s_gb;
1753  s->sframe_cache_size = 0;
1754  }
1755 
1756  if ((res = check_bits_for_superframe(gb, s)) == 1) {
1757  *got_frame_ptr = 0;
1758  return 1;
1759  }
1760 
1761  /* First bit is speech/music bit, it differentiates between WMAVoice
1762  * speech samples (the actual codec) and WMAVoice music samples, which
1763  * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1764  * the wild yet. */
1765  if (!get_bits1(gb)) {
1766  avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1767  return AVERROR_PATCHWELCOME;
1768  }
1769 
1770  /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1771  if (get_bits1(gb)) {
1772  if ((n_samples = get_bits(gb, 12)) > 480) {
1773  av_log(ctx, AV_LOG_ERROR,
1774  "Superframe encodes >480 samples (%d), not allowed\n",
1775  n_samples);
1776  return -1;
1777  }
1778  }
1779  /* Parse LSPs, if global for the superframe (can also be per-frame). */
1780  if (s->has_residual_lsps) {
1781  double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1782 
1783  for (n = 0; n < s->lsps; n++)
1784  prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1785 
1786  if (s->lsps == 10) {
1787  dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1788  } else /* s->lsps == 16 */
1789  dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1790 
1791  for (n = 0; n < s->lsps; n++) {
1792  lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
1793  lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1794  lsps[2][n] += mean_lsf[n];
1795  }
1796  for (n = 0; n < 3; n++)
1797  stabilize_lsps(lsps[n], s->lsps);
1798  }
1799 
1800  /* get output buffer */
1801  frame->nb_samples = 480;
1802  if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1803  return res;
1804  frame->nb_samples = n_samples;
1805  samples = (float *)frame->data[0];
1806 
1807  /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1808  for (n = 0; n < 3; n++) {
1809  if (!s->has_residual_lsps) {
1810  int m;
1811 
1812  if (s->lsps == 10) {
1813  dequant_lsp10i(gb, lsps[n]);
1814  } else /* s->lsps == 16 */
1815  dequant_lsp16i(gb, lsps[n]);
1816 
1817  for (m = 0; m < s->lsps; m++)
1818  lsps[n][m] += mean_lsf[m];
1819  stabilize_lsps(lsps[n], s->lsps);
1820  }
1821 
1822  if ((res = synth_frame(ctx, gb, n,
1823  &samples[n * MAX_FRAMESIZE],
1824  lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1825  &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1826  &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1827  *got_frame_ptr = 0;
1828  return res;
1829  }
1830  }
1831 
1832  /* Statistics? FIXME - we don't check for length, a slight overrun
1833  * will be caught by internal buffer padding, and anything else
1834  * will be skipped, not read. */
1835  if (get_bits1(gb)) {
1836  res = get_bits(gb, 4);
1837  skip_bits(gb, 10 * (res + 1));
1838  }
1839 
1840  *got_frame_ptr = 1;
1841 
1842  /* Update history */
1843  memcpy(s->prev_lsps, lsps[2],
1844  s->lsps * sizeof(*s->prev_lsps));
1845  memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
1846  s->lsps * sizeof(*synth));
1847  memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1848  s->history_nsamples * sizeof(*excitation));
1849  if (s->do_apf)
1850  memmove(s->zero_exc_pf, &s->zero_exc_pf[MAX_SFRAMESIZE],
1851  s->history_nsamples * sizeof(*s->zero_exc_pf));
1852 
1853  return 0;
1854 }
1855 
1856 /**
1857  * Parse the packet header at the start of each packet (input data to this
1858  * decoder).
1859  *
1860  * @param s WMA Voice decoding context private data
1861  * @return 1 if not enough bits were available, or 0 on success.
1862  */
1864 {
1865  GetBitContext *gb = &s->gb;
1866  unsigned int res;
1867 
1868  if (get_bits_left(gb) < 11)
1869  return 1;
1870  skip_bits(gb, 4); // packet sequence number
1871  s->has_residual_lsps = get_bits1(gb);
1872  do {
1873  res = get_bits(gb, 6); // number of superframes per packet
1874  // (minus first one if there is spillover)
1875  if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
1876  return 1;
1877  } while (res == 0x3F);
1879 
1880  return 0;
1881 }
1882 
1883 /**
1884  * Copy (unaligned) bits from gb/data/size to pb.
1885  *
1886  * @param pb target buffer to copy bits into
1887  * @param data source buffer to copy bits from
1888  * @param size size of the source data, in bytes
1889  * @param gb bit I/O context specifying the current position in the source.
1890  * data. This function might use this to align the bit position to
1891  * a whole-byte boundary before calling #avpriv_copy_bits() on aligned
1892  * source data
1893  * @param nbits the amount of bits to copy from source to target
1894  *
1895  * @note after calling this function, the current position in the input bit
1896  * I/O context is undefined.
1897  */
1898 static void copy_bits(PutBitContext *pb,
1899  const uint8_t *data, int size,
1900  GetBitContext *gb, int nbits)
1901 {
1902  int rmn_bytes, rmn_bits;
1903 
1904  rmn_bits = rmn_bytes = get_bits_left(gb);
1905  if (rmn_bits < nbits)
1906  return;
1907  if (nbits > pb->size_in_bits - put_bits_count(pb))
1908  return;
1909  rmn_bits &= 7; rmn_bytes >>= 3;
1910  if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1911  put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1912  avpriv_copy_bits(pb, data + size - rmn_bytes,
1913  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1914 }
1915 
1916 /**
1917  * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1918  * and we expect that the demuxer / application provides it to us as such
1919  * (else you'll probably get garbage as output). Every packet has a size of
1920  * ctx->block_align bytes, starts with a packet header (see
1921  * #parse_packet_header()), and then a series of superframes. Superframe
1922  * boundaries may exceed packets, i.e. superframes can split data over
1923  * multiple (two) packets.
1924  *
1925  * For more information about frames, see #synth_superframe().
1926  */
1928  int *got_frame_ptr, AVPacket *avpkt)
1929 {
1930  WMAVoiceContext *s = ctx->priv_data;
1931  GetBitContext *gb = &s->gb;
1932  int size, res, pos;
1933 
1934  /* Packets are sometimes a multiple of ctx->block_align, with a packet
1935  * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1936  * feeds us ASF packets, which may concatenate multiple "codec" packets
1937  * in a single "muxer" packet, so we artificially emulate that by
1938  * capping the packet size at ctx->block_align. */
1939  for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1940  if (!size) {
1941  *got_frame_ptr = 0;
1942  return 0;
1943  }
1944  init_get_bits(&s->gb, avpkt->data, size << 3);
1945 
1946  /* size == ctx->block_align is used to indicate whether we are dealing with
1947  * a new packet or a packet of which we already read the packet header
1948  * previously. */
1949  if (size == ctx->block_align) { // new packet header
1950  if ((res = parse_packet_header(s)) < 0)
1951  return res;
1952 
1953  /* If the packet header specifies a s->spillover_nbits, then we want
1954  * to push out all data of the previous packet (+ spillover) before
1955  * continuing to parse new superframes in the current packet. */
1956  if (s->spillover_nbits > 0) {
1957  if (s->sframe_cache_size > 0) {
1958  int cnt = get_bits_count(gb);
1959  copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1960  flush_put_bits(&s->pb);
1962  if ((res = synth_superframe(ctx, data, got_frame_ptr)) == 0 &&
1963  *got_frame_ptr) {
1964  cnt += s->spillover_nbits;
1965  s->skip_bits_next = cnt & 7;
1966  return cnt >> 3;
1967  } else
1968  skip_bits_long (gb, s->spillover_nbits - cnt +
1969  get_bits_count(gb)); // resync
1970  } else
1971  skip_bits_long(gb, s->spillover_nbits); // resync
1972  }
1973  } else if (s->skip_bits_next)
1974  skip_bits(gb, s->skip_bits_next);
1975 
1976  /* Try parsing superframes in current packet */
1977  s->sframe_cache_size = 0;
1978  s->skip_bits_next = 0;
1979  pos = get_bits_left(gb);
1980  if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
1981  return res;
1982  } else if (*got_frame_ptr) {
1983  int cnt = get_bits_count(gb);
1984  s->skip_bits_next = cnt & 7;
1985  return cnt >> 3;
1986  } else if ((s->sframe_cache_size = pos) > 0) {
1987  /* rewind bit reader to start of last (incomplete) superframe... */
1988  init_get_bits(gb, avpkt->data, size << 3);
1989  skip_bits_long(gb, (size << 3) - pos);
1990  av_assert1(get_bits_left(gb) == pos);
1991 
1992  /* ...and cache it for spillover in next packet */
1994  copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1995  // FIXME bad - just copy bytes as whole and add use the
1996  // skip_bits_next field
1997  }
1998 
1999  return size;
2000 }
2001 
2003 {
2004  WMAVoiceContext *s = ctx->priv_data;
2005 
2006  if (s->do_apf) {
2007  ff_rdft_end(&s->rdft);
2008  ff_rdft_end(&s->irdft);
2009  ff_dct_end(&s->dct);
2010  ff_dct_end(&s->dst);
2011  }
2012 
2013  return 0;
2014 }
2015 
2017 {
2018  WMAVoiceContext *s = ctx->priv_data;
2019  int n;
2020 
2021  s->postfilter_agc = 0;
2022  s->sframe_cache_size = 0;
2023  s->skip_bits_next = 0;
2024  for (n = 0; n < s->lsps; n++)
2025  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
2026  memset(s->excitation_history, 0,
2027  sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
2028  memset(s->synth_history, 0,
2029  sizeof(*s->synth_history) * MAX_LSPS);
2030  memset(s->gain_pred_err, 0,
2031  sizeof(s->gain_pred_err));
2032 
2033  if (s->do_apf) {
2034  memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
2035  sizeof(*s->synth_filter_out_buf) * s->lsps);
2036  memset(s->dcf_mem, 0,
2037  sizeof(*s->dcf_mem) * 2);
2038  memset(s->zero_exc_pf, 0,
2039  sizeof(*s->zero_exc_pf) * s->history_nsamples);
2040  memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
2041  }
2042 }
2043 
2045  .name = "wmavoice",
2046  .type = AVMEDIA_TYPE_AUDIO,
2047  .id = AV_CODEC_ID_WMAVOICE,
2048  .priv_data_size = sizeof(WMAVoiceContext),
2052  .capabilities = CODEC_CAP_SUBFRAMES | CODEC_CAP_DR1,
2053  .flush = wmavoice_flush,
2054  .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
2055 };