[FFmpeg-devel] [RFC] AAC Encoder

Michael Niedermayer michaelni
Sun Aug 17 03:08:58 CEST 2008


On Sat, Aug 16, 2008 at 06:00:39PM +0300, Kostya wrote:
> On Sat, Aug 16, 2008 at 03:57:56AM +0200, Michael Niedermayer wrote:
> > On Fri, Aug 15, 2008 at 07:59:52PM +0300, Kostya wrote:
[...]

> static void psy_test_window(AACPsyContext *apc, int16_t *audio, int16_t *la, int tag, int type, ChannelElement *cpe)
> {
>     int ch, i;
>     int chans = type == TYPE_CPE ? 2 : 1;
> 
>     for(ch = 0; ch < chans; ch++){
>         int prev_seq = cpe->ch[ch].ics.window_sequence[1];
>         cpe->ch[ch].ics.use_kb_window[1] = cpe->ch[ch].ics.use_kb_window[0];
>         cpe->ch[ch].ics.window_sequence[1] = cpe->ch[ch].ics.window_sequence[0];
>         switch(cpe->ch[ch].ics.window_sequence[0]){

>         case ONLY_LONG_SEQUENCE:   if(prev_seq == ONLY_LONG_SEQUENCE)cpe->ch[ch].ics.window_sequence[0] = LONG_START_SEQUENCE;   break;
>         case LONG_START_SEQUENCE:  cpe->ch[ch].ics.window_sequence[0] = EIGHT_SHORT_SEQUENCE; break;
>         case EIGHT_SHORT_SEQUENCE: if(prev_seq == EIGHT_SHORT_SEQUENCE)cpe->ch[ch].ics.window_sequence[0] = LONG_STOP_SEQUENCE;  break;
>         case LONG_STOP_SEQUENCE:   cpe->ch[ch].ics.window_sequence[0] = ONLY_LONG_SEQUENCE;   break;

please format this so it can be read by a human


>         }
> 
>         if(cpe->ch[ch].ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE){
>             cpe->ch[ch].ics.use_kb_window[0] = 1;
>             cpe->ch[ch].ics.num_windows = 1;
>             cpe->ch[ch].ics.swb_sizes = apc->bands1024;
>             cpe->ch[ch].ics.num_swb = apc->num_bands1024;
>             cpe->ch[ch].ics.num_window_groups = 1;
>             cpe->ch[ch].ics.group_len[0] = 1;
>         }else{
>             cpe->ch[ch].ics.use_kb_window[0] = 1;
>             cpe->ch[ch].ics.num_windows = 8;
>             cpe->ch[ch].ics.swb_sizes = apc->bands128;
>             cpe->ch[ch].ics.num_swb = apc->num_bands128;

>             cpe->ch[ch].ics.num_window_groups = 4;
>             for(i = 0; i < 4; i++)
>                 cpe->ch[ch].ics.group_len[i] = 2;

this is not optimal


>         }
>     }
>     cpe->common_window = cpe->ch[0].ics.use_kb_window[0] == cpe->ch[1].ics.use_kb_window[0];
> }
> 
[...]

> /**
>  * 3GPP TS26.403-inspired psychoacoustic model specific data
>  */
> typedef struct Psy3gppContext{

>     float       barks [1024]; ///< Bark value for each spectral line

unused outsde init


>     float       bark_l[64];   ///< Bark value for each spectral band in long frame
>     float       bark_s[16];   ///< Bark value for each spectral band in short frame
>     float       s_low_l[64];  ///< spreading factor for low-to-high threshold spreading in long frame
>     float       s_low_s[16];  ///< spreading factor for low-to-high threshold spreading in short frame
>     float       s_hi_l [64];  ///< spreading factor for high-to-low threshold spreading in long frame
>     float       s_hi_s [16];  ///< spreading factor for high-to-low threshold spreading in short frame

spread_low/hi

>     int         reservoir;    ///< bit reservoir fullness
>     int         avg_bits;     ///< average frame size of bits for CBR
>     float       ath_l[64];    ///< absolute threshold of hearing per bands in long frame
>     float       ath_s[16];    ///< absolute threshold of hearing per bands in short frame

most of these exist for long and short windows, they should be in a
array[2] maybe even a struct array[2] i dont know whats cleaner


[...]

>     for(g = 0; g < apc->num_bands1024; g++){
>         i += apc->bands1024[g];
>         pctx->bark_l[g] = (pctx->barks[i - 1] + prev) / 2.0;
>         prev = pctx->barks[i - 1];
>     }
>     for(g = 0; g < apc->num_bands1024 - 1; g++){
>         pctx->s_low_l[g] = pow(10.0, -(pctx->bark_l[g+1] - pctx->bark_l[g]) * PSY_3GPP_SPREAD_LOW);
>         pctx->s_hi_l [g] = pow(10.0, -(pctx->bark_l[g+1] - pctx->bark_l[g]) * PSY_3GPP_SPREAD_HI);
>     }
>     i = 0;
>     prev = 0.0;
>     for(g = 0; g < apc->num_bands128; g++){
>         i += apc->bands128[g];
>         pctx->bark_s[g] = (pctx->barks[i - 1] + prev) / 2.0;
>         prev = pctx->barks[i - 1];
>     }
>     for(g = 0; g < apc->num_bands128 - 1; g++){
>         pctx->s_low_s[g] = pow(10.0, -(pctx->bark_s[g+1] - pctx->bark_s[g]) * PSY_3GPP_SPREAD_LOW);
>         pctx->s_hi_s [g] = pow(10.0, -(pctx->bark_s[g+1] - pctx->bark_s[g]) * PSY_3GPP_SPREAD_HI);
>     }

duplicate



>     start = 0;
>     minath = ath(3410, ATH_ADD);
>     for(g = 0; g < apc->num_bands1024; g++){
>         minscale = ath(apc->avctx->sample_rate * start / 1024.0, ATH_ADD);
>         for(i = 1; i < apc->bands1024[g]; i++){
>             minscale = fminf(minscale, ath(apc->avctx->sample_rate * (start + i) / 1024.0 / 2.0, ATH_ADD));
>         }
>         pctx->ath_l[g] = minscale - minath;
>         start += apc->bands1024[g];
>     }
>     start = 0;
>     for(g = 0; g < apc->num_bands128; g++){
>         minscale = ath(apc->avctx->sample_rate * start / 1024.0, ATH_ADD);
>         for(i = 1; i < apc->bands128[g]; i++){
>             minscale = fminf(minscale, ath(apc->avctx->sample_rate * (start + i) / 1024.0 / 2.0, ATH_ADD));
>         }
>         pctx->ath_s[g] = minscale - minath;
>         start += apc->bands128[g];
>     }

duplicate


[...]
> /**
>  * window grouping information stored as bits (0 - new group, 1 - group continues)
>  */
> static const uint8_t window_grouping[9] = {
>     0xB6, 0x6C, 0xD8, 0xB2, 0x66, 0xC6, 0x96, 0x36, 0x36
> };
> 
> /**
>  * Tell encoder which window types to use.
>  * @see 3GPP TS26.403 5.4.1 "Blockswitching"
>  */
> static void psy_3gpp_window(AACPsyContext *apc, int16_t *audio, int16_t *la, int tag, int type, ChannelElement *cpe)
> {
>     int ch;
>     int chans = type == TYPE_CPE ? 2 : 1;
>     int i, j;
>     int br = apc->avctx->bit_rate / apc->avctx->channels;
>     int attack_ratio = (br <= 16000 + 8000*chans) ? 18 : 10;
>     Psy3gppContext *pctx = (Psy3gppContext*) apc->model_priv_data;
>     Psy3gppChannel *pch = &pctx->ch[tag];
>     uint8_t grouping[2];
>     enum WindowSequence win[2];
> 
>     if(la && !(apc->flags & PSY_MODEL_NO_SWITCH)){
>         float s[8], v;
>         for(ch = 0; ch < chans; ch++){
>             enum WindowSequence last_window_sequence = cpe->ch[ch].ics.window_sequence[0];
>             int switch_to_eight = 0;
>             float sum = 0.0, sum2 = 0.0;
>             int attack_n = 0;
>             for(i = 0; i < 8; i++){
>                 for(j = 0; j < 128; j++){
>                     v = iir_filter(audio[(i*128+j)*apc->avctx->channels+ch], pch->iir_state[ch]);
>                     sum += v*v;
>                 }
>                 s[i] = sum;
>                 sum2 += sum;
>             }
>             for(i = 0; i < 8; i++){
>                 if(s[i] > pch->win_energy[ch] * attack_ratio){
>                     attack_n = i + 1;
>                     switch_to_eight = 1;
>                     break;
>                 }
>             }
>             pch->win_energy[ch] = pch->win_energy[ch]*7/8 + sum2/64;
> 
>             switch(last_window_sequence){
>             case ONLY_LONG_SEQUENCE:
>                 win[ch] = switch_to_eight ? LONG_START_SEQUENCE : ONLY_LONG_SEQUENCE;
>                 grouping[ch] = 0;
>                 break;
>             case LONG_START_SEQUENCE:
>                 win[ch] = EIGHT_SHORT_SEQUENCE;
>                 grouping[ch] = pch->next_grouping[ch];
>                 break;
>             case LONG_STOP_SEQUENCE:
>                 win[ch] = ONLY_LONG_SEQUENCE;
>                 grouping[ch] = 0;
>                 break;
>             case EIGHT_SHORT_SEQUENCE:
>                 win[ch] = switch_to_eight ? EIGHT_SHORT_SEQUENCE : LONG_STOP_SEQUENCE;
>                 grouping[ch] = switch_to_eight ? pch->next_grouping[ch] : 0;
>                 break;
>             }
>             pch->next_grouping[ch] = window_grouping[attack_n];

this is limited to 9 of 256 possible groupings, not to mention that i have my
doubts about the optimality of the highpass based selection.



[...]
> /**
>  * Calculate perceptual entropy and its corresponding values for one band.
>  * @see 3GPP TS26.403 5.6.1.3 "Calculation of the reduction value"
>  */
> static void calc_pe(Psy3gppBand *band, int band_width)
> {
>     if(band->energy <= band->thr){
>         band->a  = 0.0f;
>         band->b  = 0.0f;
>         band->nl = 0.0f;
>         return;
>     }
>     band->nl = band->ffac / pow(band->energy/band_width, 0.25);
>     if(band->energy >= band->thr * 8.0){
>         band->a = band->nl * log2(band->energy);
>         band->b = band->nl;
>     }else{
>         band->a = band->nl * (PSY_3GPP_C2 + PSY_3GPP_C3 * log2(band->energy));
>         band->b = band->nl * PSY_3GPP_C3;
>     }
>     band->pe = band->a - band->b * log2(band->thr);
>     band->min_snr = 1.0 / (pow(2.0, band->pe / band_width) - 1.5);

>     if(band->min_snr < 1.26f)     band->min_snr = 1.26f;
>     if(band->min_snr > 316.2277f) band->min_snr = 316.2277f;

av_clipf()


> }
> 
> /**
>  * Determine scalefactors and prepare coefficients for encoding.
>  * @see 3GPP TS26.403 5.4 "Psychoacoustic model"
>  */
> static void psy_3gpp_process(AACPsyContext *apc, int tag, int type, ChannelElement *cpe)
> {
>     int start;
>     int ch, w, wg, g, i;
>     int prev_scale;
>     Psy3gppContext *pctx = (Psy3gppContext*) apc->model_priv_data;
>     float pe_target;
>     int bits_avail;
>     int chans = type == TYPE_CPE ? 2 : 1;
>     Psy3gppChannel *pch = &pctx->ch[tag];
> 
>     //calculate energies, initial thresholds and related values - 5.4.2 "Threshold Calculation"
>     memset(pch->band, 0, sizeof(pch->band));
>     for(ch = 0; ch < chans; ch++){
>         start = 0;
>         for(w = 0; w < cpe->ch[ch].ics.num_windows*16; w += 16){
>             for(g = 0; g < cpe->ch[ch].ics.num_swb; g++){
>                 for(i = 0; i < cpe->ch[ch].ics.swb_sizes[g]; i++)
>                     pch->band[ch][w+g].energy +=  cpe->ch[ch].coeffs[start+i] *  cpe->ch[ch].coeffs[start+i];
>                 pch->band[ch][w+g].energy /= 262144.0f;
>                 pch->band[ch][w+g].thr = pch->band[ch][w+g].energy * 0.001258925f;
>                 start += cpe->ch[ch].ics.swb_sizes[g];

>                 if(pch->band[ch][w+g].energy != 0.0){
>                     float ffac = 0.0;
> 
>                     for(i = 0; i < cpe->ch[ch].ics.swb_sizes[g]; i++)
>                         ffac += sqrt(FFABS(cpe->ch[ch].coeffs[start+i]));
>                     pch->band[ch][w+g].ffac = ffac / sqrt(512.0);
>                 }

apparently not used before M/S and its calculated after M/S again


>             }
>         }
>     }
> 
>     //modify thresholds - spread, threshold in quiet - 5.4.3 "Spreaded Energy Calculation"
>     for(ch = 0; ch < chans; ch++){
>         for(w = 0; w < cpe->ch[ch].ics.num_windows*16; w += 16){

>             for(g = 1; g < cpe->ch[ch].ics.num_swb; g++){
>                 if(cpe->ch[ch].ics.num_swb == apc->num_bands1024)
>                     pch->band[ch][w+g].thr = FFMAX(pch->band[ch][w+g].thr, pch->band[ch][w+g-1].thr * pctx->s_low_l[g-1]);
>                 else
>                     pch->band[ch][w+g].thr = FFMAX(pch->band[ch][w+g].thr, pch->band[ch][w+g-1].thr * pctx->s_low_s[g-1]);
>             }
>             for(g = cpe->ch[ch].ics.num_swb - 2; g >= 0; g--){
>                 if(cpe->ch[ch].ics.num_swb == apc->num_bands1024)
>                     pch->band[ch][w+g].thr = FFMAX(pch->band[ch][w+g].thr, pch->band[ch][w+g+1].thr * pctx->s_hi_l[g+1]);
>                 else
>                     pch->band[ch][w+g].thr = FFMAX(pch->band[ch][w+g].thr, pch->band[ch][w+g+1].thr * pctx->s_hi_s[g+1]);
>             }

pch->band[ch]+w occurs quite often in there that can be factored out
also the if() can be factord out and pointers be used



>             for(g = 0; g < cpe->ch[ch].ics.num_swb; g++){
>                 if(cpe->ch[ch].ics.num_swb == apc->num_bands1024)
>                     pch->band[ch][w+g].thr_quiet = FFMAX(pch->band[ch][w+g].thr, pctx->ath_l[g]);
>                 else
>                     pch->band[ch][w+g].thr_quiet = FFMAX(pch->band[ch][w+g].thr, pctx->ath_s[g]);

>                 pch->band[ch][w+g].thr_quiet = fmaxf(PSY_3GPP_RPEMIN*pch->band[ch][w+g].thr_quiet, fminf(pch->band[ch][w+g].thr_quiet, PSY_3GPP_RPELEV*pch->prev_band[ch][w+g].thr_quiet));

this line is too long
and this code needs to be cleaned up


>                 pch->band[ch][w+g].thr = FFMAX(pch->band[ch][w+g].thr, pch->band[ch][w+g].thr_quiet * 0.25);
>             }
>         }
>     }
> 
>     // M/S detection - 5.5.2 "Mid/Side Stereo"
>     if(chans > 1 && cpe->common_window){
>         start = 0;
>         for(w = 0; w < cpe->ch[0].ics.num_windows*16; w += 16){
>             for(g = 0; g < cpe->ch[0].ics.num_swb; g++){
>                 double en_m = 0.0, en_s = 0.0, ff_m = 0.0, ff_s = 0.0, minthr;
>                 float m, s;
> 
>                 cpe->ms_mask[w+g] = 0;
>                 if(pch->band[0][w+g].energy == 0.0 || pch->band[1][w+g].energy == 0.0)
>                     continue;
>                 for(i = 0; i < cpe->ch[0].ics.swb_sizes[g]; i++){
>                     m = cpe->ch[0].coeffs[start+i] + cpe->ch[1].coeffs[start+i];
>                     s = cpe->ch[0].coeffs[start+i] - cpe->ch[1].coeffs[start+i];
>                     en_m += m*m;
>                     en_s += s*s;
>                 }
>                 en_m /= 262144.0*4.0;
>                 en_s /= 262144.0*4.0;
>                 minthr = FFMIN(pch->band[0][w+g].thr, pch->band[1][w+g].thr);

>                 if(minthr * minthr * pch->band[0][w+g].energy * pch->band[1][w+g].energy  >= (pch->band[0][w+g].thr * pch->band[1][w+g].thr * en_m * en_s)){

i have in my previous review simplified this line already

anyway before the AAC encoder can reach svn it MUST be significantly improved
in terms of optimality as well as code cleanliness
basically everything should be RD optimal unless either a faster and equally
good heuristic exists or the RD optimal code is too slow.

About cleanliness
i do not mind lines >80 as its common that this improves readability but at
190 it gets a little too long
also the code is full of repeated struct->array[123]->struct->array[9]->a
that its really not easy to read. And that is then mixed with single letter
struct fields and local variables.

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

If you really think that XML is the answer, then you definitly missunderstood
the question -- Attila Kinali
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080817/4efa7624/attachment.pgp>



More information about the ffmpeg-devel mailing list