[FFmpeg-devel] [RFC] Generic psychoacoustic model interface

Sat Aug 30 12:21:54 CEST 2008

On Thu, Aug 28, 2008 at 10:36:57PM +0200, Michael Niedermayer wrote:
> On Thu, Aug 28, 2008 at 08:10:26PM +0300, Kostya wrote:
[...]
> > /**
> >  * windowing related information
> >  */
> > typedef struct FFWindowInfo{
> 
> >     int window_type[2];               ///< window type (short/long/transitional, etc.) - current and previous
> 
> How is this "transitional" going to work with many different frame lengths?
> is there 1? N*N ?

That's for AAC (i.e. it requires slightly different windowing);
the encoder will set it to an internal value.

[...] 
> > /**
> >  * Get psychoacoustic model suggestion about coding two bands as M/S
> >  */
> > enum FFPsyMSDecision ff_psy_suggest_ms(FFPsyContext *ctx, FFPsyBand *left, FFPsyBand *right);
> 
> iam a little unsure about this one, but iam not objecting ...

dropped for now, may revive later

Here's another draft - it's a psychoacoustic model interface with a
partial implementation (there are some inaccuracies and debug leftovers in it,
but this is an RFC, not a final patch).

I plan to use it this way with my encoder.

General flow:

init
while(frame){
  suggest window()
  [encoder may ignore that]
  set band info() = calculate thresholds for all bands with provided window type
  psy analyze() = get distortions and weight for band quantized with a series of
                  quantizers, my encoder will use that for RD-aware quantization
}

> [...]
> -- 
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> I count him braver who overcomes his desires than him who conquers his
> enemies for the hardest victory is over self. -- Aristotle
-------------- next part --------------
/*
 * audio encoder psychoacoustic model
 * Copyright (C) 2008 Konstantin Shishkov
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef FFMPEG_PSYMODEL_H
#define FFMPEG_PSYMODEL_H

#include "avcodec.h"

/** maximum possible number of bands */
#define MAX_BANDS 128

/**
 * single band psychoacoustic information
 */
typedef struct FFPsyBand{
    int   bits;              ///< approximate number of bits needed to code the band (filled by ff_psy_analyze())
    float energy;            ///< band energy (filled by ff_psy_set_band_info())
    float threshold;         ///< band threshold (filled by ff_psy_set_band_info())
    float distortion;        ///< approximate quantization error for the tried quantizer (filled by ff_psy_analyze())
    float perceptual_weight; ///< distortion divided by threshold (filled by ff_psy_analyze())
}FFPsyBand;

/**
 * windowing related information
 */
typedef struct FFPsyWindowInfo{
    int window_type[2];               ///< window type (short/long/transitional, etc.): [0] is the current one, [1] the previous one
    int window_shape;                 ///< window shape (sine/KBD/whatever)
    int num_windows;                  ///< number of windows in a frame
    int grouping[8];                  ///< window grouping (e.g. for AAC)
    int *window_sizes;                ///< sequence of window sizes inside one frame (e.g. for WMA)
}FFPsyWindowInfo;

/**
 * context used by psychoacoustic model
 */
typedef struct FFPsyContext{
    AVCodecContext *avctx;            ///< encoder context

    FFPsyBand *psy_bands;             ///< frame bands information, MAX_BANDS entries per channel
    FFPsyWindowInfo *win_info;        ///< frame window info (NOTE(review): not assigned by the visible implementation)

    uint8_t **bands;                  ///< scalefactor band sizes for possible frame sizes (one table per frame size)
    int     *num_bands;               ///< number of scalefactor bands for possible frame sizes
    int num_lens;                     ///< number of scalefactor band sets

    void* model_priv_data;            ///< psychoacoustic model implementation private data
}FFPsyContext;

/**
 * Initialize psychoacoustic model.
 *
 * The pointer table passed in bands and the num_bands array are copied into
 * the context; the band size tables the pointers refer to are not duplicated,
 * so they have to stay valid for as long as the context is used.
 *
 * @param ctx        model context
 * @param avctx      codec context
 * @param num_lens   number of possible frame lengths
 * @param bands      scalefactor band lengths for all frame lengths
 * @param num_bands  number of scalefactor bands for all frame lengths
 *
 * @return zero if successful, a negative value if not
 */
av_cold int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx,
                        int num_lens,
                        uint8_t **bands, int* num_bands);

/**
 * Suggest window sequence for channel.
 *
 * The encoder is free to ignore the suggestion.
 *
 * @param ctx       model context
 * @param audio     samples for the current frame
 * @param la        lookahead samples (NULL when unavailable)
 * @param channel   number of the channel element to analyze
 * @param prev_type previous window type
 *
 * @return suggested window information, returned by value
 */
FFPsyWindowInfo ff_psy_suggest_window(FFPsyContext *ctx,
                                      const int16_t *audio, const int16_t *la,
                                      int channel, int prev_type);

/**
 * Perform psychoacoustic analysis and set band info (threshold, energy).
 *
 * The results are written into ctx->psy_bands; the entries belonging to
 * a channel start at index channel * MAX_BANDS.
 *
 * @param ctx     model context
 * @param channel audio channel number
 * @param coeffs  pointer to the transformed coefficients
 * @param wi      window information
 */
void ff_psy_set_band_info(FFPsyContext *ctx, int channel, const float *coeffs,
                          FFPsyWindowInfo *wi);

/**
 * Analyze band and output perceptual information for given quantizers.
 *
 * One output entry is produced per tried quantizer, i.e. bands[i] describes
 * the band as quantized with the i-th quantizer.
 *
 * @param ctx     model context
 * @param channel audio channel number
 * @param band_no number of band to analyze
 * @param coeffs  pointer to the transformed coefficients of the band
 * @param length  band length
 * @param quants  number of quantizers to try
 * @param Q       array of quantizers
 * @param qstep   quantizers stride (distance in elements between two tried quantizers in Q)
 * @param bands   output array where information will be stored, it must have room for quants entries
 */
void ff_psy_analyze(FFPsyContext *ctx, int channel, int band_no,
                    const float *coeffs, int length,
                    int quants, const float *Q, const int qstep,
                    FFPsyBand *bands);

/**
 * Cleanup model context at the end.
 *
 * Releases the arrays owned by the context and the model private data;
 * the context structure itself is not freed.
 *
 * @param ctx model context
 */
av_cold void ff_psy_end(FFPsyContext *ctx);

/**************************************************************************
 *                       Audio preprocessing stuff.                       *
 *       This should be moved into some audio filter eventually.          *
 **************************************************************************/
struct FFPsyPreprocessContext;

/**
 * psychoacoustic model audio preprocessing initialization
 *
 * @param avctx codec context, its channel count determines how many
 *              filter states are created
 *
 * @return newly allocated preprocessing context
 */
av_cold struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *avctx);

/**
 * Preprocess several channels of an audio frame in order to compress it better.
 *
 * Samples are read and written interleaved, with a stride equal to the
 * channel count of the codec context; avctx->frame_size samples are
 * handled per channel.
 *
 * @param ctx      preprocessing context
 * @param audio    samples to preprocess, pointing at the first channel to handle
 * @param dest     place to put filtered samples, pointing at the first channel to handle
 * @param tag      channel number (selects the filter state of the first processed channel)
 * @param channels number of channels to preprocess (some additional work may be done on stereo pair)
 */
void ff_psy_preprocess(struct FFPsyPreprocessContext *ctx,
                       const int16_t *audio, int16_t *dest,
                       int tag, int channels);

/**
 * Cleanup audio preprocessing module.
 *
 * Releases the filter coefficients and the per-channel filter states.
 *
 * @param ctx preprocessing context
 */
av_cold void ff_psy_preprocess_end(struct FFPsyPreprocessContext *ctx);

#endif /* FFMPEG_PSYMODEL_H */
-------------- next part --------------
/*
 * audio encoder psychoacoustic model
 * Copyright (C) 2008 Konstantin Shishkov
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "avcodec.h"
#include "psymodel.h"
#include "iirfilter.h"

#ifdef ENABLE_AAC_ENCODER
#include "aac.h"
#include "aactab.h"

/**
 * Quantize a single spectral coefficient.
 *
 * The magnitude is scaled by Q, raised to the power 3/4, offset by the
 * rounding constant 0.4054 and truncated; the result is limited to 0..8191.
 *
 * @param coef coefficient to quantize (only its magnitude is used)
 * @param Q    scale applied to the magnitude before companding
 * @return absolute value of the quantized coefficient
 * @see 3GPP TS26.403 5.6.2 "Scalefactor determination"
 */
static av_always_inline int quant(float coef, const float Q)
{
    const float scaled = fabsf(coef) * Q;
    const int   level  = (int)(pow(scaled, 0.75) + 0.4054);

    return av_clip(level, 0, 8191);
}

/**
 * Estimate the error introduced by quantizing a run of coefficients.
 *
 * Every coefficient is quantized with quant(), reconstructed as
 * level^(4/3) * IQ and compared against its original magnitude.
 *
 * @param c    coefficients
 * @param size number of coefficients
 * @param Q    scale handed to quant()
 * @param IQ   scale applied when reconstructing the value
 * @return sum of squared differences divided by 512
 */
static inline float psy_aac_get_approximate_quant_error(const float *c, int size,
                                                        const float Q, const float IQ)
{
    int n;
    float err_sum = 0.0f;

    for(n = 0; n < size; n++){
        const int   level    = quant(c[n], Q);
        const float mag      = fabs(c[n]);
        const float restored = (level * cbrt(level)) * IQ;
        const float diff     = mag - restored;

        err_sum += diff * diff;
    }
    return err_sum / 512.0;
}

//XXX: stub
/**
 * Roughly estimate the number of bits needed for a run of coefficients.
 *
 * Coefficients are handled in pairs: each nonzero quantized value costs one
 * bit, values above 16 cost extra bits depending on their magnitude, and
 * the pair itself is looked up in ff_aac_spectral_bits[10] with both values
 * limited to 16.
 *
 * @param c    coefficients (read two at a time)
 * @param size number of coefficients
 * @param Q    scale handed to quant()
 * @return estimated bit count
 */
static inline int psy_aac_get_approximate_bits(const float *c, int size, const float Q)
{
    int pos, total = 0;

    for(pos = 0; pos < size; pos += 2){
        int pair_idx = 0, k;

        for(k = 0; k < 2; k++){
            int level = quant(c[pos + k], Q);

            level = FFABS(level);
            if(level != 0)
                total++;
            if(level > 16)
                total += 2 * av_log2(level) - 3;
            pair_idx = 17 * pair_idx + FFMIN(level, 16);
        }
        total += ff_aac_spectral_bits[10][pair_idx];
    }
    return total;
}

#define PSY_3GPP_SPREAD_LOW  1.5f // spreading factor for ascending threshold spreading  (15 dB/Bark)
#define PSY_3GPP_SPREAD_HI   3.0f // spreading factor for descending threshold spreading (30 dB/Bark)
#define PSY_3GPP_RPEMIN      0.01f // lower limit, as a fraction of the current value, applied when limiting a band's thr_quiet
#define PSY_3GPP_RPELEV      2.0f // upper limit on a band's thr_quiet, as a multiple of its value in the previous frame

/**
 * information for single band used by 3GPP TS26.403-inspired psychoacoustic model
 */
typedef struct Psy3gppBand{
    float energy;    ///< band energy
    float ffac;      ///< form factor (not used by the visible code)
    float thr;       ///< energy threshold
    float pe;        ///< perceptual entropy (not used by the visible code)
    float thr_quiet; ///< threshold in quiet
}Psy3gppBand;

/**
 * single/pair channel context for psychoacoustic model
 */
typedef struct Psy3gppChannel{
    Psy3gppBand band[128];               ///< bands information
    Psy3gppBand prev_band[128];          ///< bands information from the previous frame

    float       win_energy;              ///< sliding average of channel energy
    float       iir_state[2];            ///< hi-pass IIR filter state
    uint8_t     next_grouping;           ///< stored grouping scheme for the next frame (in case of 8 short window sequence)
    enum WindowSequence next_window_seq; ///< window sequence to be used in the next frame
}Psy3gppChannel;

/**
 * psychoacoustic model frame type-dependent coefficients
 *
 * One instance exists per scalefactor band layout; all arrays are indexed
 * by band number.
 */
typedef struct Psy3gppCoeffs{
    float ath       [64]; ///< absolute threshold of hearing per band: minimum of ath() inside the band, relative to ath() at 3410 Hz
    float barks     [64]; ///< Bark value for each band: mean of the values at the end of this band and at the end of the previous one
    float spread_low[64]; ///< spreading factor for low-to-high threshold spreading between band g and band g+1
    float spread_hi [64]; ///< spreading factor for high-to-low threshold spreading between band g and band g+1
}Psy3gppCoeffs;

/**
 * 3GPP TS26.403-inspired psychoacoustic model specific data
 */
typedef struct Psy3gppContext{
    Psy3gppCoeffs psy_coef[2]; ///< coefficients per band layout: [1] is used for frames with 8 windows, [0] otherwise
    Psy3gppChannel *ch;        ///< per-channel state, one entry for each channel of the codec context
}Psy3gppContext;

/**
 * Calculate Bark value for given line.
 */
static inline float calc_bark(float f)
{
    return 13.3f * atanf(0.00076f * f) + 3.5f * atanf((f / 7500.0f) * (f / 7500.0f));
}

#define ATH_ADD 4
/**
 * Calculate ATH value for given frequency.
 * Borrowed from Lame.
 */
static inline float ath(float f, float add)
{
    f /= 1000.0f;
    return   3.64 * pow(f, -0.8)
            - 6.8  * exp(-0.6  * (f - 3.4) * (f - 3.4))
            + 6.0  * exp(-0.15 * (f - 8.7) * (f - 8.7))
            + (0.6 + 0.04 * add) * 0.001 * f * f * f * f;
}

/**
 * Set up the 3GPP TS26.403-inspired model for AAC.
 *
 * Allocates the model private data and the per-channel state and, for both
 * scalefactor band layouts in ctx->bands[0] and ctx->bands[1], precomputes
 * the per-band Bark values, spreading factors and absolute thresholds of
 * hearing.
 *
 * A spectral line is mapped to its frequency from the number of lines the
 * band layout covers (sum of its band sizes), so the same code is correct
 * both for a layout spanning 1024 lines and for a shorter one.
 *
 * @param ctx model context with avctx, bands and num_bands already set
 * @return zero on success, a negative value on allocation failure or when
 *         a layout has more bands than the coefficient tables can hold
 */
static av_cold int psy_aac_init(FFPsyContext *ctx){
    Psy3gppContext *pctx;
    int i, j, g, start;
    float prev, minscale, minath;

    pctx = av_mallocz(sizeof(Psy3gppContext));
    ctx->model_priv_data = pctx;
    if(!pctx)
        return -1;

    minath = ath(3410, ATH_ADD);
    for(j = 0; j < 2; j++){
        Psy3gppCoeffs *coeffs = &pctx->psy_coef[j];
        const uint8_t *band_sizes = ctx->bands[j];
        const int num_bands = ctx->num_bands[j];
        float line_to_freq;
        int lines = 0;

        // the coefficient tables have a fixed size, refuse layouts that do not fit
        if(num_bands > (int)(sizeof(coeffs->ath) / sizeof(coeffs->ath[0]))){
            av_freep(&ctx->model_priv_data);
            return -1;
        }

        // the layout covers the spectrum from 0 to sample_rate/2
        for(g = 0; g < num_bands; g++)
            lines += band_sizes[g];
        line_to_freq = lines ? ctx->avctx->sample_rate / (2.0f * lines) : 0.0f;

        i = 0;
        prev = 0.0;
        for(g = 0; g < num_bands; g++){
            float edge;
            i += band_sizes[g];
            edge = calc_bark((i - 1) * line_to_freq); // Bark value of the last line of the band
            coeffs->barks[g] = (edge + prev) / 2.0;
            prev = edge;
        }
        for(g = 0; g < num_bands - 1; g++){
            coeffs->spread_low[g] = pow(10.0, -(coeffs->barks[g+1] - coeffs->barks[g]) * PSY_3GPP_SPREAD_LOW);
            coeffs->spread_hi [g] = pow(10.0, -(coeffs->barks[g+1] - coeffs->barks[g]) * PSY_3GPP_SPREAD_HI);
        }
        start = 0;
        for(g = 0; g < num_bands; g++){
            // every line of the band, including the first one, uses the same line -> frequency mapping
            minscale = ath(start * line_to_freq, ATH_ADD);
            for(i = 1; i < band_sizes[g]; i++){
                minscale = fminf(minscale, ath((start + i) * line_to_freq, ATH_ADD));
            }
            coeffs->ath[g] = minscale - minath;
            start += band_sizes[g];
        }
    }

    pctx->ch = av_mallocz(sizeof(Psy3gppChannel) * ctx->avctx->channels);
    if(!pctx->ch){
        av_freep(&ctx->model_priv_data);
        return -1;
    }
    return 0;
}

/**
 * Free everything owned by the AAC psychoacoustic model.
 *
 * Safe to call when the model private data is missing (for example after
 * a failed initialization).
 *
 * @param ctx model context
 */
static av_cold void psy_aac_end(FFPsyContext *ctx)
{
    Psy3gppContext *pctx = (Psy3gppContext*) ctx->model_priv_data;
    if(pctx)
        av_freep(&pctx->ch);
    av_freep(&ctx->model_priv_data);
}

/**
 * Calculate band thresholds as suggested in 3GPP TS26.403
 *
 * For every window and band the energy and an initial threshold are
 * computed, the thresholds are then spread to neighbouring bands, limited
 * from below by the threshold in quiet and, unless short windows are
 * involved, limited against the previous frame. Energy and final threshold
 * of each band are exported to ctx->psy_bands at
 * channel*MAX_BANDS + window*16 + band.
 *
 * @param ctx     model context
 * @param channel audio channel number
 * @param coefs   transformed coefficients of all windows, one after another
 * @param wi      window information, frames with 8 windows use the second band layout
 */
static void psy_aac_calc_thresholds(FFPsyContext *ctx, int channel, const float *coefs,
                                    FFPsyWindowInfo *wi)
{
    Psy3gppContext *pctx = (Psy3gppContext*) ctx->model_priv_data;
    Psy3gppChannel *pch = &pctx->ch[channel];
    int start = 0;
    int i, w, g;
    const int num_bands = ctx->num_bands[wi->num_windows == 8];
    const uint8_t* band_sizes = ctx->bands[wi->num_windows == 8];
    Psy3gppCoeffs *coeffs = &pctx->psy_coef[wi->num_windows == 8];

    //calculate energies, initial thresholds and related values - 5.4.2 "Threshold Calculation"
    for(w = 0; w < wi->num_windows*16; w += 16){
        for(g = 0; g < num_bands; g++){
            Psy3gppBand *band = &pch->band[w+g];
            // the band state persists between frames, so start from zero
            // instead of accumulating on top of the previous frame's energy
            band->energy = 0.0f;
            for(i = 0; i < band_sizes[g]; i++)
                band->energy += coefs[start+i] * coefs[start+i];
            band->energy *= 1.0f / (512*512);
            band->thr = band->energy * 0.001258925f;
            start += band_sizes[g];

            ctx->psy_bands[channel*MAX_BANDS+w+g].energy = band->energy;
        }
    }
    //modify thresholds - spread, threshold in quiet - 5.4.3 "Spreaded Energy Calculation"
    for(w = 0; w < wi->num_windows*16; w += 16){
        Psy3gppBand *band = &pch->band[w];
        for(g = 1; g < num_bands; g++){
            band[g].thr = FFMAX(band[g].thr, band[g-1].thr * coeffs->spread_low[g-1]);
        }
        for(g = num_bands - 2; g >= 0; g--){
            // spread_hi[g] is the factor between band g and band g+1
            band[g].thr = FFMAX(band[g].thr, band[g+1].thr * coeffs->spread_hi [g]);
        }
        for(g = 0; g < num_bands; g++){
            band[g].thr_quiet = FFMAX(band[g].thr, coeffs->ath[g]);
            if(wi->num_windows != 8 && wi->window_type[1] != EIGHT_SHORT_SEQUENCE){
                // keep the value within [RPEMIN * current, RPELEV * previous frame]
                band[g].thr_quiet = fmaxf(PSY_3GPP_RPEMIN*band[g].thr_quiet,
                                          fminf(band[g].thr_quiet,
                                          PSY_3GPP_RPELEV*pch->prev_band[w+g].thr_quiet));
            }
            band[g].thr = FFMAX(band[g].thr, band[g].thr_quiet * 0.25);

            ctx->psy_bands[channel*MAX_BANDS+w+g].threshold = band[g].thr;
        }
    }
    // remember this frame for the limiting step of the next one
    memcpy(pch->prev_band, pch->band, sizeof(pch->band));
}
#endif

/**
 * Initialize psychoacoustic model.
 *
 * Stores the codec context and the number of band sets, allocates the
 * per-band information (MAX_BANDS entries per channel), copies the bands
 * pointer table and the num_bands array and then runs the codec specific
 * model initialization. The band size tables referenced by bands are not
 * duplicated and have to outlive the context.
 *
 * On failure everything allocated here is released again, so the context
 * does not own any memory afterwards.
 *
 * @param ctx        model context
 * @param avctx      codec context
 * @param num_lens   number of possible frame lengths, must be positive
 * @param bands      scalefactor band lengths for all frame lengths
 * @param num_bands  number of scalefactor bands for all frame lengths
 *
 * @return zero if successful, a negative value if not
 */
av_cold int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx,
                        int num_lens,
                        uint8_t **bands, int* num_bands)
{
    int ret = 0;

    if(num_lens <= 0 || !bands || !num_bands)
        return -1;

    ctx->avctx           = avctx;
    ctx->num_lens        = num_lens;
    ctx->model_priv_data = NULL; // stays NULL for codecs without a model
    ctx->psy_bands = av_mallocz(sizeof(FFPsyBand) * MAX_BANDS * avctx->channels);
    ctx->bands     = av_malloc(sizeof(ctx->bands[0])     * num_lens);
    ctx->num_bands = av_malloc(sizeof(ctx->num_bands[0]) * num_lens);
    if(!ctx->psy_bands || !ctx->bands || !ctx->num_bands){
        ret = -1;
        goto fail;
    }
    memcpy(ctx->bands,     bands,     sizeof(ctx->bands[0])     *  num_lens);
    memcpy(ctx->num_bands, num_bands, sizeof(ctx->num_bands[0]) *  num_lens);

    switch(ctx->avctx->codec_id){
    case CODEC_ID_AAC:
        ret = psy_aac_init(ctx);
        break;
    default:
        break;
    }
    if(ret < 0)
        goto fail;
    return ret;

fail:
    av_freep(&ctx->psy_bands);
    av_freep(&ctx->bands);
    av_freep(&ctx->num_bands);
    return ret;
}

/**
 * Suggest window sequence for channel.
 *
 * Placeholder: no analysis of the samples is done yet. Instead of reading
 * through a NULL pointer, a well-defined structure is returned that
 * describes a single window, records prev_type as the previous window type
 * and leaves every other field zero. The encoder may ignore the suggestion.
 *
 * @param ctx       model context (unused for now)
 * @param audio     samples for the current frame (unused for now)
 * @param la        lookahead samples, NULL when unavailable (unused for now)
 * @param channel   number of the channel element to analyze (unused for now)
 * @param prev_type previous window type
 *
 * @return suggested window information in a structure
 */
FFPsyWindowInfo ff_psy_suggest_window(FFPsyContext *ctx,
                                      const int16_t *audio, const int16_t *la,
                                      int channel, int prev_type)
{
    FFPsyWindowInfo wi = { { 0 } };

    wi.window_type[1] = prev_type; // [0] is current, [1] is previous
    wi.num_windows    = 1;
    return wi;
}

/**
 * Run the codec specific threshold calculation for one channel.
 *
 * Only AAC has a model; for every other codec the call does nothing.
 *
 * @param ctx     model context
 * @param channel audio channel number
 * @param coeffs  pointer to the transformed coefficients
 * @param wi      window information
 */
void ff_psy_set_band_info(FFPsyContext *ctx, int channel,
                          const float *coeffs, FFPsyWindowInfo *wi)
{
    if(ctx->avctx->codec_id == CODEC_ID_AAC)
        psy_aac_calc_thresholds(ctx, channel, coeffs, wi);
}

/**
 * Analyze band and output perceptual information for given quantizers.
 *
 * The threshold and energy of the band are taken from the values stored by
 * ff_psy_set_band_info(), i.e. from the entry at
 * channel*MAX_BANDS + band_no, so that each channel is judged against its
 * own thresholds. For every tried quantizer the approximate quantization
 * error, the approximate bit count and the error relative to the threshold
 * are written to the matching entry of bands. Only AAC is handled; other
 * codecs leave bands untouched.
 *
 * @param ctx     model context
 * @param channel audio channel number
 * @param band_no number of band to analyze
 * @param coeffs  pointer to the transformed coefficients of the band
 * @param length  band length
 * @param quants  number of quantizers to try
 * @param Q       array of quantizers
 * @param qstep   quantizers stride
 * @param bands   output array where information will be stored (quants entries)
 */
void ff_psy_analyze(FFPsyContext *ctx, int channel, int band_no,
                    const float *coeffs, int length,
                    int quants, const float *Q, const int qstep,
                    FFPsyBand *bands)
{
    const FFPsyBand *src = &ctx->psy_bands[channel*MAX_BANDS + band_no];
    int i;
    float invthr;

    invthr = 1.0 / src->threshold;
    switch(ctx->avctx->codec_id){
    case CODEC_ID_AAC:
        for(i = 0; i < quants; i++, Q += qstep){
            const float IQ = 1.0 / Q[0];
            bands[i].threshold  = src->threshold;
            bands[i].energy     = src->energy;
            // the helpers multiply by their first scale, hence the inverse is passed there
            bands[i].distortion = psy_aac_get_approximate_quant_error(coeffs, length, IQ, Q[0]);
            bands[i].bits       = psy_aac_get_approximate_bits(coeffs, length, IQ);
            bands[i].perceptual_weight = bands[i].distortion * invthr;
        }
        break;
    default:
        break;
    }
}

/**
 * Release the arrays owned by the model context and, for AAC, the model
 * private data. The context structure itself is left to the caller.
 *
 * @param ctx model context
 */
av_cold void ff_psy_end(FFPsyContext *ctx)
{
    av_freep(&ctx->bands);
    av_freep(&ctx->num_bands);
    av_freep(&ctx->psy_bands);

    if(ctx->avctx->codec_id == CODEC_ID_AAC)
        psy_aac_end(ctx);
}

/**
 * audio preprocessing context
 */
typedef struct FFPsyPreprocessContext{
    AVCodecContext *avctx;              ///< codec context (channel count and frame size are read from it)
    float stereo_att;                   ///< not used by the visible code
    struct FFIIRFilterCoeffs *fcoeffs;  ///< filter coefficients shared by all channels
    struct FFIIRFilterState **fstate;   ///< one filter state per channel, NULL when no filter is available
}FFPsyPreprocessContext;

#define FILT_ORDER 4

/**
 * psychoacoustic model audio preprocessing initialization
 *
 * Creates a Butterworth lowpass of order FILT_ORDER and one filter state
 * per channel. When the coefficients or any of the states cannot be
 * created, the context is still returned but without filter states, in
 * which case ff_psy_preprocess() just copies the samples.
 *
 * @param avctx codec context
 * @return newly allocated context, NULL if it could not be allocated
 */
av_cold struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *avctx)
{
    FFPsyPreprocessContext *ctx;
    int i;
    ctx = av_mallocz(sizeof(FFPsyPreprocessContext));
    if(!ctx)
        return NULL;
    ctx->avctx = avctx;
    ctx->fcoeffs = ff_iir_filter_init_coeffs(FF_FILTER_TYPE_BUTTERWORTH, FF_FILTER_MODE_LOWPASS,
                                           FILT_ORDER, 0.25, 0.0, 0.0);
    if(ctx->fcoeffs){
        ctx->fstate = av_mallocz(sizeof(ctx->fstate[0]) * avctx->channels);
        if(ctx->fstate){
            for(i = 0; i < avctx->channels; i++){
                ctx->fstate[i] = ff_iir_filter_init_state(FILT_ORDER);
                if(!ctx->fstate[i]) // presumably signals an allocation failure
                    break;
            }
            if(i < avctx->channels){
                // a partially initialized set cannot be used for filtering, drop it
                while(i-- > 0)
                    ff_iir_filter_free_state(ctx->fstate[i]);
                av_freep(&ctx->fstate);
            }
        }
    }
    return ctx;
}

/**
 * Filter (or, when no filter states exist, copy) the samples of several
 * channels of one frame.
 *
 * Samples are interleaved with a stride of avctx->channels and
 * avctx->frame_size samples are handled per channel.
 *
 * @param ctx      preprocessing context
 * @param audio    samples to preprocess
 * @param dest     place to put filtered samples
 * @param tag      channel number, selects the filter state of the first channel
 * @param channels number of channels to preprocess
 */
void ff_psy_preprocess(struct FFPsyPreprocessContext *ctx,
                       const int16_t *audio, int16_t *dest,
                       int tag, int channels)
{
    int ch, n;

    if(!ctx->fstate){
        // no filter available: plain copy of the selected channels
        for(ch = 0; ch < channels; ch++){
            for(n = 0; n < ctx->avctx->frame_size; n++){
                const int pos = n*ctx->avctx->channels + ch;
                dest[pos] = audio[pos];
            }
        }
        return;
    }

    for(ch = 0; ch < channels; ch++){
        ff_iir_filter(ctx->fcoeffs, ctx->fstate[tag+ch], ctx->avctx->frame_size,
                      audio + ch, ctx->avctx->channels,
                      dest  + ch, ctx->avctx->channels);
    }
}

/**
 * Cleanup audio preprocessing module.
 *
 * Frees the filter coefficients and, when present, the per-channel filter
 * states. The state array is missing whenever the filter could not be set
 * up, so it must not be walked unconditionally.
 *
 * NOTE(review): the context structure itself is not freed here; confirm
 * whether callers are expected to release it.
 *
 * @param ctx preprocessing context
 */
av_cold void ff_psy_preprocess_end(struct FFPsyPreprocessContext *ctx)
{
    int i;
    ff_iir_filter_free_coeffs(ctx->fcoeffs);
    if(ctx->fstate){
        for(i = 0; i < ctx->avctx->channels; i++){
            ff_iir_filter_free_state(ctx->fstate[i]);
        }
    }
    av_freep(&ctx->fstate);
}