[FFmpeg-devel] [RFC] AAC Encoder

Michael Niedermayer michaelni
Mon Aug 11 20:03:49 CEST 2008


On Mon, Aug 11, 2008 at 12:18:03PM +0300, Kostya wrote:
> Here's my implementation of AAC encoder.
> 
> Since AAC decoder is not fully in SVN yet, I'm omitting data structures
> and declarations used from it, I'll sort it after decoder commit.
> The main difference is that psy model fills data structures and encoder
> writes bitstream with them, so several structures have additional members.
> 
> I've also stripped out model based on 3GPP 26.403 to make review easier.

[...]
> Index: libavcodec/aacenc.c
> ===================================================================
> --- libavcodec/aacenc.c	(revision 0)
> +++ libavcodec/aacenc.c	(revision 0)
> @@ -0,0 +1,729 @@
> +/*
> + * AAC encoder
> + * Copyright (C) 2008 Konstantin Shishkov
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file aacenc.c
> + * AAC encoder
> + */
> +
> +#include "avcodec.h"
> +#include "bitstream.h"
> +#include "dsputil.h"
> +#include "mpeg4audio.h"
> +
> +#include "aacpsy.h"
> +#include "aac.h"
> +#include "aactab.h"
> +
> +static const uint8_t swb_size_1024_96[] = {
> +    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8,
> +    12, 12, 12, 12, 12, 16, 16, 24, 28, 36, 44,
> +    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
> +};
> +
> +static const uint8_t swb_size_1024_64[] = {
> +    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8,
> +    12, 12, 12, 16, 16, 16, 20, 24, 24, 28, 36,
> +    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40
> +};
> +
> +static const uint8_t swb_size_1024_48[] = {
> +    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
> +    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
> +    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
> +    96
> +};
> +
> +static const uint8_t swb_size_1024_32[] = {
> +    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
> +    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
> +    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
> +};
> +
> +static const uint8_t swb_size_1024_24[] = {
> +    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
> +    12, 12, 12, 12, 16, 16, 16, 20, 20, 24, 24, 28, 28,
> +    32, 36, 36, 40, 44, 48, 52, 52, 64, 64, 64, 64, 64
> +};
> +
> +static const uint8_t swb_size_1024_16[] = {
> +    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
> +    12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 16, 16, 16, 20, 20, 20, 24, 24, 28, 28,
> +    32, 36, 40, 40, 44, 48, 52, 56, 60, 64, 64, 64
> +};
> +
> +static const uint8_t swb_size_1024_8[] = {
> +    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
> +    16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20, 24, 24, 24, 28, 28,
> +    32, 36, 36, 40, 44, 48, 52, 56, 60, 64, 80
> +};
> +
> +static const uint8_t *swb_size_1024[] = {
> +    swb_size_1024_96, swb_size_1024_96, swb_size_1024_64,
> +    swb_size_1024_48, swb_size_1024_48, swb_size_1024_32,
> +    swb_size_1024_24, swb_size_1024_24, swb_size_1024_16,
> +    swb_size_1024_16, swb_size_1024_16, swb_size_1024_8
> +};
> +
> +static const uint8_t swb_size_128_96[] = {
> +    4, 4, 4, 4, 4, 4, 8, 8, 8, 16, 28, 36
> +};
> +
> +static const uint8_t swb_size_128_48[] = {
> +    4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16
> +};
> +
> +static const uint8_t swb_size_128_24[] = {
> +    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 16, 16, 20
> +};
> +
> +static const uint8_t swb_size_128_16[] = {
> +    4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 12, 12, 16, 20, 20
> +};
> +
> +static const uint8_t swb_size_128_8[] = {
> +    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 12, 16, 20, 20
> +};
> +
> +static const uint8_t *swb_size_128[] = {
> +    /* the last entry on the following row is swb_size_128_64 but is a
> +       duplicate of swb_size_128_96 */
> +    swb_size_128_96, swb_size_128_96, swb_size_128_96,
> +    swb_size_128_48, swb_size_128_48, swb_size_128_48,
> +    swb_size_128_24, swb_size_128_24, swb_size_128_16,
> +    swb_size_128_16, swb_size_128_16, swb_size_128_8
> +};

ok


[...]
> +/** default channel configurations */
> +static const uint8_t aac_chan_configs[6][5] = {
> + {1, ID_SCE},                         // 1 channel  - single channel element
> + {1, ID_CPE},                         // 2 channels - channel pair
> + {2, ID_SCE, ID_CPE},                 // 3 channels - center + stereo
> + {3, ID_SCE, ID_CPE, ID_SCE},         // 4 channels - front center + stereo + back center
> + {3, ID_SCE, ID_CPE, ID_CPE},         // 5 channels - front center + stereo + back stereo
> + {4, ID_SCE, ID_CPE, ID_CPE, ID_LFE}, // 6 channels - front center + stereo + back stereo + LFE
> +};

can this maybe be used to simplify the decoder?


> +
> +typedef struct {
> +    PutBitContext pb;
> +    MDCTContext mdct1024;
> +    MDCTContext mdct128;
> +    DSPContext  dsp;

ok


[...]
> +/**
> + * Make AAC audio config object.
> + * @see 1.6.2.1
> + */
> +static void put_audio_specific_config(AVCodecContext *avctx)
> +{
> +    PutBitContext pb;
> +    AACEncContext *s = avctx->priv_data;
> +
> +    init_put_bits(&pb, avctx->extradata, avctx->extradata_size*8);
> +    put_bits(&pb, 5, 2); //object type - AAC-LC
> +    put_bits(&pb, 4, s->samplerate_index); //sample rate index
> +    put_bits(&pb, 4, avctx->channels);
> +    //GASpecificConfig
> +    put_bits(&pb, 1, 0); //frame length - 1024 samples
> +    put_bits(&pb, 1, 0); //does not depend on core coder
> +    put_bits(&pb, 1, 0); //is not extension
> +    flush_put_bits(&pb);
> +}

ok


> +
> +static av_cold int aac_encode_init(AVCodecContext *avctx)
> +{
> +    AACEncContext *s = avctx->priv_data;
> +    int i;
> +
> +    avctx->frame_size = 1024;
> +
> +    for(i = 0; i < 16; i++)
> +        if(avctx->sample_rate == ff_mpeg4audio_sample_rates[i])
> +            break;
> +    if(i == 16){
> +        av_log(avctx, AV_LOG_ERROR, "Unsupported sample rate %d\n", avctx->sample_rate);
> +        return -1;
> +    }
> +    if(avctx->channels > 6){
> +        av_log(avctx, AV_LOG_ERROR, "Unsupported number of channels: %d\n", avctx->channels);
> +        return -1;
> +    }
> +    s->samplerate_index = i;

ok


> +    s->swb_sizes1024 = swb_size_1024[i];
> +    s->swb_num1024 = ff_aac_num_swb_1024[i];
> +    s->swb_sizes128 = swb_size_128[i];
> +    s->swb_num128 = ff_aac_num_swb_128[i];

vertical align


> +
> +    dsputil_init(&s->dsp, avctx);
> +    ff_mdct_init(&s->mdct1024, 11, 0);
> +    ff_mdct_init(&s->mdct128,   8, 0);

ok


[...]
> +/**
> + * Perform windowing and MDCT.
> + */
> +static void analyze(AVCodecContext *avctx, AACEncContext *s, ChannelElement *cpe, short *audio, int channel)
> +{
> +    int i, j, k;
> +    const float * lwindow = cpe->ch[channel].ics.use_kb_window[0] ? ff_aac_kbd_long_1024 : ff_aac_sine_long_1024;
> +    const float * swindow = cpe->ch[channel].ics.use_kb_window[0] ? ff_aac_kbd_short_128 : ff_aac_sine_short_128;
> +    const float * pwindow = cpe->ch[channel].ics.use_kb_window[1] ? ff_aac_kbd_short_128 : ff_aac_sine_short_128;
> +
> +    if (cpe->ch[channel].ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
> +        memcpy(s->output, cpe->ch[channel].saved, sizeof(float)*1024);
> +        if(cpe->ch[channel].ics.window_sequence[0] == LONG_STOP_SEQUENCE){
> +            memset(s->output, 0, sizeof(s->output[0]) * 448);
> +            for(i = 448; i < 576; i++)
> +                s->output[i] = cpe->ch[channel].saved[i] * pwindow[i - 448];
> +            for(i = 576; i < 704; i++)
> +                s->output[i] = cpe->ch[channel].saved[i];
> +        }
> +        if(cpe->ch[channel].ics.window_sequence[0] != LONG_START_SEQUENCE){
> +            j = channel;
> +            for (i = 0; i < 1024; i++, j += avctx->channels){
> +                s->output[i+1024]         = audio[j] / 512.0 * lwindow[1024 - i - 1];
> +                cpe->ch[channel].saved[i] = audio[j] / 512.0 * lwindow[i];
> +            }
> +        }else{
> +            j = channel;
> +            for(i = 0; i < 448; i++, j += avctx->channels)
> +                s->output[i+1024]         = audio[j] / 512.0;
> +            for(i = 448; i < 576; i++, j += avctx->channels)
> +                s->output[i+1024]         = audio[j] / 512.0 * swindow[576 - i - 1];
> +            memset(s->output+1024+576, 0, sizeof(s->output[0]) * 448);
> +            j = channel;
> +            for(i = 0; i < 1024; i++, j += avctx->channels)
> +                cpe->ch[channel].saved[i] = audio[j] / 512.0;
> +        }
> +        ff_mdct_calc(&s->mdct1024, cpe->ch[channel].coeffs, s->output, s->tmp);
> +    }else{
> +        j = channel;
> +        for (k = 0; k < 1024; k += 128) {
> +            for(i = 448 + k; i < 448 + k + 256; i++)
> +                s->output[i - 448 - k] = (i < 1024) ? cpe->ch[channel].saved[i] : audio[channel + (i-1024)*avctx->channels] / 512.0;
> +            s->dsp.vector_fmul        (s->output,     k ?  swindow : pwindow, 128);
> +            s->dsp.vector_fmul_reverse(s->output+128, s->output+128, swindow, 128);
> +            ff_mdct_calc(&s->mdct128, cpe->ch[channel].coeffs + k, s->output, s->tmp);
> +        }
> +        j = channel;

> +        for(i = 0; i < 1024; i++, j += avctx->channels)
> +            cpe->ch[channel].saved[i] = audio[j] / 512.0;

cant all this rescaling be done at somewhere else? window / quant/dequant?


> +    }
> +}
> +
> +/**
> + * Encode ics_info element.
> + * @see Table 4.6
> + */
> +static void put_ics_info(AVCodecContext *avctx, IndividualChannelStream *info)
> +{
> +    AACEncContext *s = avctx->priv_data;
> +    int i;
> +
> +    put_bits(&s->pb, 1, 0);                // ics_reserved bit
> +    put_bits(&s->pb, 2, info->window_sequence[0]);
> +    put_bits(&s->pb, 1, info->use_kb_window[0]);
> +    if(info->window_sequence[0] != EIGHT_SHORT_SEQUENCE){
> +        put_bits(&s->pb, 6, info->max_sfb);
> +        put_bits(&s->pb, 1, 0);            // no prediction
> +    }else{
> +        put_bits(&s->pb, 4, info->max_sfb);
> +        for(i = 1; i < info->num_windows; i++)
> +            put_bits(&s->pb, 1, info->group_len[i]);
> +    }
> +}

ok


[...]
> +/**
> + * Scan spectral band and determine optimal codebook for it.
> + */
> +static int determine_section_info(AACEncContext *s, ChannelElement *cpe, int channel, int win, int band, int start, int size)

document what the parameters mean please


> +{
> +    int i, j, w;
> +    int maxval, sign;
> +    int score, best, cb, bestcb, dim, idx;
> +
> +    maxval = 0;
> +    sign = 0;
> +    w = win;
> +    do{
> +        for(i = start + (w-win)*128; i < start + (w-win)*128 + size; i++){
> +            maxval = FFMAX(maxval, FFABS(cpe->ch[channel].icoefs[i]));
> +            if(cpe->ch[channel].icoefs[i] < 0) sign = 1;
> +        }
> +        w++;
> +    }while(w < cpe->ch[channel].ics.num_windows && cpe->ch[channel].ics.group_len[w]);
> +
> +    if(maxval > 12) return 11;
> +    if(!maxval) return 0;
> +
> +    for(cb = 0; cb < 12; cb++)
> +        if(aac_cb_info[cb].maxval >= maxval)
> +            break;

> +    best = 9999;

INT_MAX


> +    bestcb = 11;
> +    for(; cb < 12; cb++){
> +        score = 0;
> +        dim = (aac_cb_info[cb].flags & CB_PAIRS) ? 2 : 4;
> +        if(!band || cpe->ch[channel].band_type[win][band - 1] != cb)
> +            score += 9; //that's for new codebook entry
> +        w = win;
> +        if(aac_cb_info[cb].flags & CB_UNSIGNED){
> +            do{
> +                for(i = start + (w-win)*128; i < start + (w-win)*128 + size; i += dim){
> +                    idx = 0;
> +                    for(j = 0; j < dim; j++)
> +                        idx = idx * aac_cb_info[cb].maxval + FFABS(cpe->ch[channel].icoefs[i+j]);
> +                    score += ff_aac_spectral_bits[aac_cb_info[cb].cb_num][idx];
> +                    for(j = 0; j < dim; j++)
> +                        if(cpe->ch[channel].icoefs[i+j])
> +                            score++;
> +                }
> +                w++;
> +            }while(w < cpe->ch[channel].ics.num_windows && cpe->ch[channel].ics.group_len[w]);
> +        }else{
> +            do{

> +                for(i = start + (w-win)*128; i < start + (w-win)*128 + size; i += dim){
> +                    idx = 0;
> +                    for(j = 0; j < dim; j++)
> +                        idx = idx * (aac_cb_info[cb].maxval*2 + 1) + cpe->ch[channel].icoefs[i+j] + aac_cb_info[cb].maxval;
> +                    score += ff_aac_spectral_bits[aac_cb_info[cb].cb_num][idx];
> +                }
> +                w++;

a int start2= start + (w-win)*128
would simplify these, i dont know if its the best solution but its better than
it is ...


> +            }while(w < cpe->ch[channel].ics.num_windows && cpe->ch[channel].ics.group_len[w]);
> +        }
> +        if(score < best){
> +            best = score;
> +            bestcb = cb;
> +        }
> +    }
> +    return bestcb;
> +}

This code does not seem to consider the distortion
please use rate distortion not just the number of bits!


> +
> +/**
> + * Encode one scalefactor band with selected codebook.
> + */
> +static void encode_codebook(AACEncContext *s, ChannelElement *cpe, int channel, int start, int size, int cb)

hmmmmmmm
encode_coeffs
encode_spectral
encode_codebook_indexes
...
but not encode_codebook


> +{
> +    const uint8_t *bits = ff_aac_spectral_bits[aac_cb_info[cb].cb_num];
> +    const uint16_t *codes = ff_aac_spectral_codes[aac_cb_info[cb].cb_num];

vertical align


> +    const int dim = (aac_cb_info[cb].flags & CB_PAIRS) ? 2 : 4;
> +    int i, j, idx;
> +

> +    if(!bits || !codes) return;

one of these is redundant i guess


[...]
> +/**
> + * Encode information about codebooks used for scalefactor bands coding.
> + */
> +static void encode_section_data(AVCodecContext *avctx, AACEncContext *s, ChannelElement *cpe, int channel)

encode_band_types() i suspect ?


> +{
> +    int i, w;
> +    int bits = cpe->ch[channel].ics.num_windows == 1 ? 5 : 3;
> +    int esc = (1 << bits) - 1;
> +    int count;
> +
> +    for(w = 0; w < cpe->ch[channel].ics.num_windows; w++){
> +        if(cpe->ch[channel].ics.group_len[w]) continue;
> +        count = 0;
> +        for(i = 0; i < cpe->ch[channel].ics.max_sfb; i++){
> +            if(!i || cpe->ch[channel].band_type[w][i] != cpe->ch[channel].band_type[w][i-1]){
> +                if(count){
> +                    while(count >= esc){
> +                        put_bits(&s->pb, bits, esc);
> +                        count -= esc;
> +                    }
> +                    put_bits(&s->pb, bits, count);
> +                }
> +                put_bits(&s->pb, 4, cpe->ch[channel].band_type[w][i]);
> +                count = 1;
> +            }else
> +                count++;

The code that selects the band types should really already know the number
of identical band types.
And i think that should be determined with the viterbi algorithm.
Its pretty trivial to encode each with several bands (thats already done
as far as i can see) and then find the global optimal path through the
table using the viterbi algorithm.


> +        }
> +        if(count){
> +            while(count >= esc){
> +                put_bits(&s->pb, bits, esc);
> +                count -= esc;
> +            }
> +            put_bits(&s->pb, bits, count);
> +        }
> +    }
> +}

> +
> +/**
> + * Encode scalefactors.
> + */
> +static void encode_scale_factor_data(AVCodecContext *avctx, AACEncContext *s, ChannelElement *cpe, int channel)

s/_data/s/


> +{
> +    int off = cpe->ch[channel].mixing_gain, diff;
> +    int i, w;
> +
> +    for(w = 0; w < cpe->ch[channel].ics.num_windows; w++){
> +        if(cpe->ch[channel].ics.group_len[w]) continue;
> +        for(i = 0; i < cpe->ch[channel].ics.max_sfb; i++){
> +            if(!cpe->ch[channel].zeroes[w][i]){
> +                diff = cpe->ch[channel].sf_idx[w][i] - off + SCALE_DIFF_ZERO;

> +                if(diff < 0 || diff > 120) av_log(avctx, AV_LOG_ERROR, "Scalefactor difference is too big to be coded\n");

assert() ?


> +                off = cpe->ch[channel].sf_idx[w][i];
> +                put_bits(&s->pb, ff_aac_scalefactor_bits[diff], ff_aac_scalefactor_code[diff]);
> +            }
> +        }
> +    }
> +}




> +
> +/**
> + * Encode pulse data.
> + */
> +static void encode_pulse_data(AVCodecContext *avctx, AACEncContext *s, ChannelElement *cpe, int channel)

s/_data/s/
and the same for all other similar trashy names


> +{
> +    int i;
> +
> +    put_bits(&s->pb, 1, cpe->ch[channel].pulse.present);
> +    if(!cpe->ch[channel].pulse.present) return;
> +
> +    put_bits(&s->pb, 2, cpe->ch[channel].pulse.num_pulse - 1);

you can use num_pulse instead of pulse.present at least, possibly theres a
even better solution.

[...]

> +
> +/**
> + * Write some auxiliary information about created AAC file.
> + */
> +static void put_bitstream_info(AVCodecContext *avctx, AACEncContext *s, const char *name)
> +{
> +    int i, namelen, padbits;
> +
> +    namelen = strlen(name) + 2;
> +    put_bits(&s->pb, 3, ID_FIL);
> +    put_bits(&s->pb, 4, FFMIN(namelen, 15));
> +    if(namelen >= 15)
> +        put_bits(&s->pb, 8, namelen - 16);
> +    put_bits(&s->pb, 4, 0); //extension type - filler
> +    padbits = 8 - (put_bits_count(&s->pb) & 7);
> +    align_put_bits(&s->pb);
> +    for(i = 0; i < namelen - 2; i++)
> +        put_bits(&s->pb, 8, name[i]);
> +    put_bits(&s->pb, 12 - padbits, 0);
> +}

ok


[...]
> +static av_cold int aac_encode_end(AVCodecContext *avctx)
> +{
> +    AACEncContext *s = avctx->priv_data;
> +
> +    ff_mdct_end(&s->mdct1024);
> +    ff_mdct_end(&s->mdct128);
> +    ff_aac_psy_end(&s->psy);
> +    av_freep(&s->samples);
> +    av_freep(&s->cpe);
> +    return 0;
> +}
> +
> +AVCodec aac_encoder = {
> +    "aac",
> +    CODEC_TYPE_AUDIO,
> +    CODEC_ID_AAC,
> +    sizeof(AACEncContext),
> +    aac_encode_init,
> +    aac_encode_frame,
> +    aac_encode_end,

ok


> +    .capabilities = CODEC_CAP_SMALL_LAST_FRAME,
> +};

iam no sure maybe, this needs CODEC_CAP_DELAY as well


> Index: libavcodec/allcodecs.c
> ===================================================================
> --- libavcodec/allcodecs.c	(revision 14690)
> +++ libavcodec/allcodecs.c	(working copy)
> @@ -60,6 +60,7 @@
>      initialized = 1;
>  
>      /* video codecs */
> +    REGISTER_ENCODER (AAC, aac);
>      REGISTER_DECODER (AASC, aasc);
>      REGISTER_DECODER (AMV, amv);
>      REGISTER_ENCDEC  (ASV1, asv1);

> Index: libavcodec/aacpsy.c
> ===================================================================
> --- libavcodec/aacpsy.c	(revision 0)
> +++ libavcodec/aacpsy.c	(revision 0)
> @@ -0,0 +1,455 @@
> +/*
> + * AAC encoder psychoacoustic model
> + * Copyright (C) 2008 Konstantin Shishkov
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file aacpsy.c
> + * AAC encoder psychoacoustic model
> + */
> +
> +#include "avcodec.h"
> +#include "dsputil.h"
> +#include "aacpsy.h"

ok


> +
> +//borrowed from aac.c
> +static float pow2sf_tab[340];

duplicate


[...]
> +// low-pass filter declarations and code
> +#define IIR_ORDER 4
> +
> +/**
> + * filter data for 4th order IIR lowpass Butterworth filter
> + *
> + * data format:
> + * normalized cutoff frequency | inverse filter gain | coefficients
> + */
> +static const float lp_filter_data[][IIR_ORDER+2] = {
> +    { 0.4535147392, 6.816645e-01, -0.4646665999, -2.2127207402, -3.9912017501, -3.2380429984 },
> +    { 0.4166666667, 4.998150e-01, -0.2498216698, -1.3392807613, -2.7693097862, -2.6386277439 },
> +    { 0.3628117914, 3.103469e-01, -0.0965076902, -0.5977763360, -1.4972580903, -1.7740085241 },
> +    { 0.3333333333, 2.346995e-01, -0.0557639007, -0.3623690447, -1.0304538354, -1.3066051440 },
> +    { 0.2916666667, 1.528432e-01, -0.0261686639, -0.1473794606, -0.6204721225, -0.6514716536 },
> +    { 0.2267573696, 6.917529e-02, -0.0202414073,  0.0780167640, -0.5277442247,  0.3631641670 },
> +    { 0.2187500000, 6.178391e-02, -0.0223681543,  0.1069446609, -0.5615167033,  0.4883976841 },
> +    { 0.2083333333, 5.298685e-02, -0.0261686639,  0.1473794606, -0.6204721225,  0.6514716536 },
> +    { 0.1587301587, 2.229030e-02, -0.0647354087,  0.4172275190, -1.1412129810,  1.4320761385 },
> +    { 0.1458333333, 1.693903e-02, -0.0823177861,  0.5192354923, -1.3444768251,  1.6365345642 },
> +    { 0.1133786848, 7.374053e-03, -0.1481421788,  0.8650973862, -1.9894244796,  2.1544844308 },
> +    { 0.1041666667, 5.541768e-03, -0.1742301048,  0.9921936565, -2.2090801108,  2.3024482658 },
> +};
> +
> +/**
> + * IIR filter state
> + */
> +typedef struct LPFilterState{
> +    float x[IIR_ORDER + 1];
> +    float y[IIR_ORDER + 1];
> +}LPFilterState;
> +

> +static av_always_inline float lowpass_iir_filter(LPFilterState *s, const float *coeffs, float in)
> +{
> +    memmove(s->x, s->x + 1, sizeof(s->x) - sizeof(s->x[0]));
> +    memmove(s->y, s->y + 1, sizeof(s->y) - sizeof(s->y[0]));

no we will not do memmoves in av_always_inline functions


> +    s->x[IIR_ORDER] = in * coeffs[1];
> +    //FIXME: made only for 4th order filter
> +    s->y[IIR_ORDER] = (s->x[0] + s->x[4])*1 + (s->x[1] + s->x[3])*4 + s->x[2]*6
> +                    + coeffs[2]*s->y[0] + coeffs[3]*s->y[1] + coeffs[4]*s->y[2] + coeffs[5]*s->y[3];
> +    return s->y[IIR_ORDER];
> +}

the lowpass code belongs in a seperate file and patch and should probably
be applied outside the codec. Unless of course it uses some feedback from the
psy model ...
besides lowpass, things should be run through a highpass filter to remove
the DC component, its inaudible and expensive to represent after MDCT transform


[...]

> +void ff_aac_psy_preprocess(AACPsyContext *ctx, int16_t *audio, int16_t *dest, int tag, int type)
> +{
> +    int chans = type == ID_CPE ? 2 : 1;
> +    const int chstride = ctx->avctx->channels;
> +    int i, ch;
> +    float t[2];
> +

> +    if(chans == 1 || (ctx->flags & PSY_MODEL_NO_PREPROC) == PSY_MODEL_NO_PREPROC){
> +        for(ch = 0; ch < chans; ch++){
> +            for(i = 0; i < 1024; i++){
> +                dest[i * chstride + ch] = audio[i * chstride + ch];
> +            }
> +        }

memcpy()


> +    }else{
> +        for(i = 0; i < 1024; i++){
> +            if(ctx->flags & PSY_MODEL_NO_ST_ATT){
> +                for(ch = 0; ch < 2; ch++)
> +                    t[ch] = audio[i * chstride + ch];

> +            }else{
> +                t[0] = audio[i * chstride + 0] * (0.5 + ctx->stereo_att) + audio[i * chstride + 1] * (0.5 - ctx->stereo_att);
> +                t[1] = audio[i * chstride + 0] * (0.5 - ctx->stereo_att) + audio[i * chstride + 1] * (0.5 + ctx->stereo_att);
> +            }

this stereo attenuation looks like it should be in a filter before the codec

[...]

> Index: libavcodec/aacpsy.h
> ===================================================================
> --- libavcodec/aacpsy.h	(revision 0)
> +++ libavcodec/aacpsy.h	(revision 0)
> @@ -0,0 +1,145 @@
> +/*
> + * AAC encoder psychoacoustic model
> + * Copyright (C) 2008 Konstantin Shishkov
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef FFMPEG_AACPSY_H
> +#define FFMPEG_AACPSY_H
> +
> +#include "avcodec.h"
> +#include "dsputil.h"
> +#include "aac.h"
> +
> +enum AACPsyModelType{
> +    AAC_PSY_NULL,              ///< do nothing on frequencies
> +    AAC_PSY_NULL8,             ///< do nothing on frequencies but work with short windows
> +    AAC_PSY_3GPP,              ///< model following recommendations from 3GPP TS 26.403
> +

ok


> +    AAC_NB_PSY_MODELS          ///< total number of psychoacoustic models
> +};

make it clear that this is not part of he API due to binary compatibility
when a new model is added


> +
> +enum AACPsyModelMode{
> +    PSY_MODE_CBR,              ///< follow bitrate as closely as possible
> +    PSY_MODE_ABR,              ///< try to achieve bitrate but actual bitrate may differ significantly
> +    PSY_MODE_QUALITY,          ///< try to achieve set quality instead of bitrate
> +};

please use the bitrate_tolerance parameter for closeness of filesize matching
and the buffer_size / max and min bitrates for strict CBR
It might also be interresting to consider using ratecontrol.c


> +
> +#define PSY_MODEL_MODE_MASK  0x0000000F ///< bit fields for storing mode (CBR, ABR, VBR)
> +#define PSY_MODEL_NO_PULSE   0x00000010 ///< disable pulse searching
> +#define PSY_MODEL_NO_SWITCH  0x00000020 ///< disable window switching
> +#define PSY_MODEL_NO_ST_ATT  0x00000040 ///< disable stereo attenuation
> +#define PSY_MODEL_NO_LOWPASS 0x00000080 ///< disable low-pass filtering
> +
> +#define PSY_MODEL_NO_PREPROC (PSY_MODEL_NO_ST_ATT | PSY_MODEL_NO_LOWPASS)
> +
> +#define PSY_MODEL_MODE(a)  ((a) & PSY_MODEL_MODE_MASK)
> +
> +/**
> + * context used by psychoacoustic model
> + */
> +typedef struct AACPsyContext {
> +    AVCodecContext *avctx;

> +    DSPContext dsp;

for what does the psy model use dsp ?
Iam not very happy about this being duplicated between encoder and psy model


> +
> +    int flags;
> +    int window_type[2];
> +    int window_shape[2];
> +    const uint8_t *bands1024;
> +    int num_bands1024;
> +    const uint8_t *bands128;
> +    int num_bands128;
> +
> +    const struct AACPsyModel *model;
> +    void* model_priv_data;
> +
> +    float stereo_att;
> +    int   cutoff;
> +    void* lp_state;
> +}AACPsyContext;

document the fields please, also true for all other structs where fields
do not have a immedeatly obvious meaning.

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Concerning the gods, I have no means of knowing whether they exist or not
or of what sort they may be, because of the obscurity of the subject, and
the brevity of human life -- Protagoras
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080811/a3886c3e/attachment.pgp>



More information about the ffmpeg-devel mailing list