FFmpeg: libavcodec/aacpsy.c Source File

00001 /*
00002  * AAC encoder psychoacoustic model
00003  * Copyright (C) 2008 Konstantin Shishkov
00004  *
00005  * This file is part of FFmpeg.
00006  *
00007  * FFmpeg is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU Lesser General Public
00009  * License as published by the Free Software Foundation; either
00010  * version 2.1 of the License, or (at your option) any later version.
00011  *
00012  * FFmpeg is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015  * Lesser General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU Lesser General Public
00018  * License along with FFmpeg; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00020  */
00021 
00027 #include "avcodec.h"
00028 #include "aactab.h"
00029 #include "psymodel.h"
00030 
00031 /***********************************
00032  *              TODOs:
00033  * thresholds linearization after their modifications for attaining given bitrate
00034  * try other bitrate controlling mechanism (maybe use ratecontrol.c?)
00035  * control quality for quality-based output
00036  **********************************/
00037 
/*
 * Threshold spreading factors, applied per Bark of distance between
 * neighbouring bands when the spreading tables are built.
 */
/** spreading factor for ascending threshold spreading (15 dB/Bark) */
#define PSY_3GPP_SPREAD_LOW  1.5f
/** spreading factor for descending threshold spreading (30 dB/Bark) */
#define PSY_3GPP_SPREAD_HI   3.0f

/*
 * Limits used by psy_3gpp_analyze() when it bounds thr_quiet against the
 * value stored for the previous frame.
 */
/** the limited thr_quiet is never lower than this fraction of its unlimited value */
#define PSY_3GPP_RPEMIN      0.01f
/** the limited thr_quiet is never higher than this multiple of the previous frame's value */
#define PSY_3GPP_RPELEV      2.0f
00047 
/**
 * Per-band working values of the psychoacoustic model.
 */
typedef struct Psy3gppBand {
    /** band energy: sum of squared coefficients, scaled by 1/(512*512) in psy_3gpp_analyze() */
    float energy;
    /** not referenced by the code visible in this file -- NOTE(review): confirm intended use */
    float ffac;
    /** band threshold: derived from the energy, then spread and floored in psy_3gpp_analyze() */
    float thr;
    /** not referenced by the code visible in this file -- NOTE(review): confirm intended use */
    float min_snr;
    /** threshold after taking the ath[] table entry into account (and, for long windows, the previous frame) */
    float thr_quiet;
} Psy3gppBand;
00061 
00065 typedef struct Psy3gppChannel{
00066     Psy3gppBand band[128];               
00067     Psy3gppBand prev_band[128];          
00068 
00069     float       win_energy;              
00070     float       iir_state[2];            
00071     uint8_t     next_grouping;           
00072     enum WindowSequence next_window_seq; 
00073 }Psy3gppChannel;
00074 
/**
 * Per-band tables precomputed by psy_3gpp_init(), one entry per band.
 */
typedef struct Psy3gppCoeffs {
    /** minimum of ath() over the lines of the band, relative to ath(3410, ATH_ADD) */
    float ath       [64];
    /** mean of the calc_bark() values at the last line of this band and of the previous band */
    float barks     [64];
    /** factor applied when spreading a threshold from band g to band g+1 */
    float spread_low[64];
    /** factor applied when spreading a threshold from band g+1 to band g */
    float spread_hi [64];
} Psy3gppCoeffs;
00084 
00088 typedef struct Psy3gppContext{
00089     Psy3gppCoeffs psy_coef[2];
00090     Psy3gppChannel *ch;
00091 }Psy3gppContext;
00092 
00096 static av_cold float calc_bark(float f)
00097 {
00098     return 13.3f * atanf(0.00076f * f) + 3.5f * atanf((f / 7500.0f) * (f / 7500.0f));
00099 }
00100 
00101 #define ATH_ADD 4
00102 
00106 static av_cold float ath(float f, float add)
00107 {
00108     f /= 1000.0f;
00109     return    3.64 * pow(f, -0.8)
00110             - 6.8  * exp(-0.6  * (f - 3.4) * (f - 3.4))
00111             + 6.0  * exp(-0.15 * (f - 8.7) * (f - 8.7))
00112             + (0.6 + 0.04 * add) * 0.001 * f * f * f * f;
00113 }
00114 
00115 static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
00116     Psy3gppContext *pctx;
00117     float barks[1024];
00118     int i, j, g, start;
00119     float prev, minscale, minath;
00120 
00121     ctx->model_priv_data = av_mallocz(sizeof(Psy3gppContext));
00122     pctx = (Psy3gppContext*) ctx->model_priv_data;
00123 
00124     for (i = 0; i < 1024; i++)
00125         barks[i] = calc_bark(i * ctx->avctx->sample_rate / 2048.0);
00126     minath = ath(3410, ATH_ADD);
00127     for (j = 0; j < 2; j++) {
00128         Psy3gppCoeffs *coeffs = &pctx->psy_coef[j];
00129         i = 0;
00130         prev = 0.0;
00131         for (g = 0; g < ctx->num_bands[j]; g++) {
00132             i += ctx->bands[j][g];
00133             coeffs->barks[g] = (barks[i - 1] + prev) / 2.0;
00134             prev = barks[i - 1];
00135         }
00136         for (g = 0; g < ctx->num_bands[j] - 1; g++) {
00137             coeffs->spread_low[g] = pow(10.0, -(coeffs->barks[g+1] - coeffs->barks[g]) * PSY_3GPP_SPREAD_LOW);
00138             coeffs->spread_hi [g] = pow(10.0, -(coeffs->barks[g+1] - coeffs->barks[g]) * PSY_3GPP_SPREAD_HI);
00139         }
00140         start = 0;
00141         for (g = 0; g < ctx->num_bands[j]; g++) {
00142             minscale = ath(ctx->avctx->sample_rate * start / 1024.0, ATH_ADD);
00143             for (i = 1; i < ctx->bands[j][g]; i++)
00144                 minscale = FFMIN(minscale, ath(ctx->avctx->sample_rate * (start + i) / 1024.0 / 2.0, ATH_ADD));
00145             coeffs->ath[g] = minscale - minath;
00146             start += ctx->bands[j][g];
00147         }
00148     }
00149 
00150     pctx->ch = av_mallocz(sizeof(Psy3gppChannel) * ctx->avctx->channels);
00151     return 0;
00152 }
00153 
/**
 * Run one sample through a first-order IIR filter.
 *
 * Computes 0.7548 * (in - previous input) + 0.5095 * previous output and
 * updates the state in place.
 *
 * @param in    current input sample
 * @param state state[0] holds the previous input, state[1] the previous
 *              output; both are overwritten with the current values
 * @return the filtered sample
 */
static float iir_filter(int in, float state[2])
{
    const float out = 0.7548f * (in - state[0]) + 0.5095f * state[1];

    state[0] = in;
    state[1] = out;
    return out;
}
00166 
/**
 * Window grouping bit masks, indexed by the attack position computed in
 * psy_3gpp_window() (0 when no attack was found, otherwise 1..8).
 * In a mask, a clear bit i makes window i the start of a new group and a set
 * bit i adds window i to the group started most recently.
 */
static const uint8_t window_grouping[9] = {
    0xB6, /* 0 */
    0x6C, /* 1 */
    0xD8, /* 2 */
    0xB2, /* 3 */
    0x66, /* 4 */
    0xC6, /* 5 */
    0x96, /* 6 */
    0x36, /* 7 */
    0x36, /* 8 */
};
00173 
00178 static FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx,
00179                                        const int16_t *audio, const int16_t *la,
00180                                        int channel, int prev_type)
00181 {
00182     int i, j;
00183     int br               = ctx->avctx->bit_rate / ctx->avctx->channels;
00184     int attack_ratio     = br <= 16000 ? 18 : 10;
00185     Psy3gppContext *pctx = (Psy3gppContext*) ctx->model_priv_data;
00186     Psy3gppChannel *pch  = &pctx->ch[channel];
00187     uint8_t grouping     = 0;
00188     FFPsyWindowInfo wi;
00189 
00190     memset(&wi, 0, sizeof(wi));
00191     if (la) {
00192         float s[8], v;
00193         int switch_to_eight = 0;
00194         float sum = 0.0, sum2 = 0.0;
00195         int attack_n = 0;
00196         for (i = 0; i < 8; i++) {
00197             for (j = 0; j < 128; j++) {
00198                 v = iir_filter(audio[(i*128+j)*ctx->avctx->channels], pch->iir_state);
00199                 sum += v*v;
00200             }
00201             s[i]  = sum;
00202             sum2 += sum;
00203         }
00204         for (i = 0; i < 8; i++) {
00205             if (s[i] > pch->win_energy * attack_ratio) {
00206                 attack_n        = i + 1;
00207                 switch_to_eight = 1;
00208                 break;
00209             }
00210         }
00211         pch->win_energy = pch->win_energy*7/8 + sum2/64;
00212 
00213         wi.window_type[1] = prev_type;
00214         switch (prev_type) {
00215         case ONLY_LONG_SEQUENCE:
00216             wi.window_type[0] = switch_to_eight ? LONG_START_SEQUENCE : ONLY_LONG_SEQUENCE;
00217             break;
00218         case LONG_START_SEQUENCE:
00219             wi.window_type[0] = EIGHT_SHORT_SEQUENCE;
00220             grouping = pch->next_grouping;
00221             break;
00222         case LONG_STOP_SEQUENCE:
00223             wi.window_type[0] = ONLY_LONG_SEQUENCE;
00224             break;
00225         case EIGHT_SHORT_SEQUENCE:
00226             wi.window_type[0] = switch_to_eight ? EIGHT_SHORT_SEQUENCE : LONG_STOP_SEQUENCE;
00227             grouping = switch_to_eight ? pch->next_grouping : 0;
00228             break;
00229         }
00230         pch->next_grouping = window_grouping[attack_n];
00231     } else {
00232         for (i = 0; i < 3; i++)
00233             wi.window_type[i] = prev_type;
00234         grouping = (prev_type == EIGHT_SHORT_SEQUENCE) ? window_grouping[0] : 0;
00235     }
00236 
00237     wi.window_shape   = 1;
00238     if (wi.window_type[0] != EIGHT_SHORT_SEQUENCE) {
00239         wi.num_windows = 1;
00240         wi.grouping[0] = 1;
00241     } else {
00242         int lastgrp = 0;
00243         wi.num_windows = 8;
00244         for (i = 0; i < 8; i++) {
00245             if (!((grouping >> i) & 1))
00246                 lastgrp = i;
00247             wi.grouping[lastgrp]++;
00248         }
00249     }
00250 
00251     return wi;
00252 }
00253 
00257 static void psy_3gpp_analyze(FFPsyContext *ctx, int channel,
00258                              const float *coefs, FFPsyWindowInfo *wi)
00259 {
00260     Psy3gppContext *pctx = (Psy3gppContext*) ctx->model_priv_data;
00261     Psy3gppChannel *pch  = &pctx->ch[channel];
00262     int start = 0;
00263     int i, w, g;
00264     const int num_bands       = ctx->num_bands[wi->num_windows == 8];
00265     const uint8_t* band_sizes = ctx->bands[wi->num_windows == 8];
00266     Psy3gppCoeffs *coeffs     = &pctx->psy_coef[wi->num_windows == 8];
00267 
00268     //calculate energies, initial thresholds and related values - 5.4.2 "Threshold Calculation"
00269     for (w = 0; w < wi->num_windows*16; w += 16) {
00270         for (g = 0; g < num_bands; g++) {
00271             Psy3gppBand *band = &pch->band[w+g];
00272             band->energy = 0.0f;
00273             for (i = 0; i < band_sizes[g]; i++)
00274                 band->energy += coefs[start+i] * coefs[start+i];
00275             band->energy *= 1.0f / (512*512);
00276             band->thr     = band->energy * 0.001258925f;
00277             start        += band_sizes[g];
00278 
00279             ctx->psy_bands[channel*PSY_MAX_BANDS+w+g].energy = band->energy;
00280         }
00281     }
00282     //modify thresholds - spread, threshold in quiet - 5.4.3 "Spreaded Energy Calculation"
00283     for (w = 0; w < wi->num_windows*16; w += 16) {
00284         Psy3gppBand *band = &pch->band[w];
00285         for (g = 1; g < num_bands; g++)
00286             band[g].thr = FFMAX(band[g].thr, band[g-1].thr * coeffs->spread_low[g-1]);
00287         for (g = num_bands - 2; g >= 0; g--)
00288             band[g].thr = FFMAX(band[g].thr, band[g+1].thr * coeffs->spread_hi [g]);
00289         for (g = 0; g < num_bands; g++) {
00290             band[g].thr_quiet = FFMAX(band[g].thr, coeffs->ath[g]);
00291             if (wi->num_windows != 8 && wi->window_type[1] != EIGHT_SHORT_SEQUENCE)
00292                 band[g].thr_quiet = FFMAX(PSY_3GPP_RPEMIN*band[g].thr_quiet,
00293                                           FFMIN(band[g].thr_quiet,
00294                                           PSY_3GPP_RPELEV*pch->prev_band[w+g].thr_quiet));
00295             band[g].thr = FFMAX(band[g].thr, band[g].thr_quiet * 0.25);
00296 
00297             ctx->psy_bands[channel*PSY_MAX_BANDS+w+g].threshold = band[g].thr;
00298         }
00299     }
00300     memcpy(pch->prev_band, pch->band, sizeof(pch->band));
00301 }
00302 
00303 static av_cold void psy_3gpp_end(FFPsyContext *apc)
00304 {
00305     Psy3gppContext *pctx = (Psy3gppContext*) apc->model_priv_data;
00306     av_freep(&pctx->ch);
00307     av_freep(&apc->model_priv_data);
00308 }
00309 
00310 
00311 const FFPsyModel ff_aac_psy_model =
00312 {
00313     .name    = "3GPP TS 26.403-inspired model",
00314     .init    = psy_3gpp_init,
00315     .window  = psy_3gpp_window,
00316     .analyze = psy_3gpp_analyze,
00317     .end     = psy_3gpp_end,
00318 };