[FFmpeg-cvslog] r23647 - in trunk: Changelog libavcodec/Makefile libavcodec/aacdec.c libavcodec/aacsbr.c libavcodec/avcodec.h libavcodec/mpeg4audio.c libavcodec/ps.c libavcodec/ps.h libavcodec/ps_tablegen.c libavc...

Alex Converse alex.converse
Mon Jun 21 00:16:41 CEST 2010


On Sat, Jun 19, 2010 at 10:06 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
> On Sat, Jun 19, 2010 at 04:14:51PM +0200, alexc wrote:
>> Author: alexc
>> Date: Sat Jun 19 16:14:51 2010
>> New Revision: 23647
>>
>> Log:
>> Add HE-AAC v2 support to the AAC decoder.
>>
>> Added:
>> ? ?trunk/libavcodec/ps.c
>> ? ?trunk/libavcodec/ps.h
>> ? ?trunk/libavcodec/ps_tablegen.c
>> ? ?trunk/libavcodec/ps_tablegen.h
>> ? ?trunk/libavcodec/psdata.c
>> Modified:
>> ? ?trunk/Changelog
>> ? ?trunk/libavcodec/Makefile
>> ? ?trunk/libavcodec/aacdec.c
>> ? ?trunk/libavcodec/aacsbr.c
>> ? ?trunk/libavcodec/avcodec.h
>> ? ?trunk/libavcodec/mpeg4audio.c
>> ? ?trunk/libavcodec/sbr.h
>
>
> [...]
>
>> @@ -1008,6 +1011,11 @@ static unsigned int read_sbr_data(AACCon
>> ? ? ? ? ? ? ?num_bits_left -= 2;
>> ? ? ? ? ? ? ?read_sbr_extension(ac, sbr, gb, get_bits(gb, 2), &num_bits_left); // bs_extension_id
>> ? ? ? ? ?}
>> + ? ? ? ?if (num_bits_left < 0) {
>> + ? ? ? ? ? ?av_log(ac->avctx, AV_LOG_ERROR, "SBR Extension over read.\n");
>> + ? ? ? ?}
>
> shouldnt this trigger error concealment
> or does it do it already somewhere?
>

Should be handled already be setting ps->start to zero...

int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb_host,
PSContext *ps, int bits_left)
[...]
    bits_consumed = get_bits_count(gb) - bit_count_start;
    if (bits_consumed <= bits_left) {
        skip_bits_long(gb_host, bits_consumed);
        return bits_consumed;
    }
    av_log(avctx, AV_LOG_ERROR, "Expected to read %d PS bits actually
read %d.\n", bits_left, bits_consumed);
err:
    ps->start = 0;
    skip_bits_long(gb_host, bits_left);
    return bits_left;
}

>
> [...]
>> @@ -1740,6 +1748,16 @@ void ff_sbr_apply(AACContext *ac, Spectr
>> ? ? ? ? ?/* synthesis */
>> ? ? ? ? ?sbr_x_gen(sbr, sbr->X[ch], sbr->X_low, sbr->data[ch].Y, ch);
>> ? ? ?}
>> +
>> + ? ?if (ac->m4ac.ps == 1) {
>> + ? ? ? ?if (sbr->ps.start) {
>> + ? ? ? ? ? ?ff_ps_apply(ac->avctx, &sbr->ps, sbr->X[0], sbr->X[1], sbr->kx[1] + sbr->m[1]);
>
>> + ? ? ? ?} else {
>> + ? ? ? ? ? ?memcpy(sbr->X[1], sbr->X[0], sizeof(sbr->X[0]));
>> + ? ? ? ?}
>
> if this is anything but rare then it should probably be done by changing the
> later code. if its rare it doesnt matter
>

PS is assumed for all SBR mono streams that don't explicitly signal
it's absence.

qmf_syntehsis clobbers the buffer

>
> [...]
>>
>>
>> Added: trunk/libavcodec/ps.c
>> ==============================================================================
>> --- /dev/null 00:00:00 1970 ? (empty, because file is newly added)
>> +++ trunk/libavcodec/ps.c ? ? Sat Jun 19 16:14:51 2010 ? ? ? ?(r23647)
>> @@ -0,0 +1,1124 @@
[...]
>> +
>
>> +#define PS_BASELINE 0
>
> needs doxy explaining what this does
>

Fixed

> [...]
>> +/**
>> + * Read Overall Phase Difference parameters from the bitstream.
>> + *
>> + * @param gb ? ?pointer to the input bitstream
>> + * @param ps ? ?pointer to the Parametric Stereo context
>> + * @param e ? ? envelope to decode
>> + * @param dt ? ?1: time delta-coded, 0: frequency delta-coded
>> + */
>> +static void opd_data(GetBitContext *gb, PSContext *ps, int e, int dt)
>> +{
>> + ? ?int b;
>> + ? ?int table_idx = dt ? huff_opd_dt : huff_opd_df;
>> + ? ?VLC_TYPE (*vlc_table)[2] = vlc_ps[table_idx].table;
>> + ? ?if (dt) {
>> + ? ? ? ?int e_prev = e ? e - 1 : ps->num_env_old - 1;
>> + ? ? ? ?e_prev = FFMAX(e_prev, 0);
>> + ? ? ? ?for (b = 0; b < ps->nr_ipdopd_par; b++) {
>> + ? ? ? ? ? ?ps->opd_par[e][b] = (ps->opd_par[e_prev][b] + get_vlc2(gb, vlc_table, 9, 1)) & 0x07;
>> + ? ? ? ?}
>> + ? ?} else {
>> + ? ? ? ?int prev = 0;
>> + ? ? ? ?for (b = 0; b < ps->nr_ipdopd_par; b++) {
>> + ? ? ? ? ? ?prev += get_vlc2(gb, vlc_table, 9, 3);
>> + ? ? ? ? ? ?prev &= 0x07;
>> + ? ? ? ? ? ?ps->opd_par[e][b] = prev;
>> + ? ? ? ?}
>> + ? ?}
>> +}
>> +
>> +static int ps_extension(GetBitContext *gb, PSContext *ps, int ps_extension_id)
>> +{
>> + ? ?int e;
>> + ? ?int count = get_bits_count(gb);
>> +
>> + ? ?if (ps_extension_id)
>> + ? ? ? ?return 0;
>> +
>> + ? ?ps->enable_ipdopd = get_bits1(gb);
>> + ? ?if (ps->enable_ipdopd) {
>> + ? ? ? ?for (e = 0; e < ps->num_env; e++) {
>> + ? ? ? ? ? ?int dt = get_bits1(gb);
>> + ? ? ? ? ? ?ipd_data(gb, ps, e, dt);
>> + ? ? ? ? ? ?dt = get_bits1(gb);
>> + ? ? ? ? ? ?opd_data(gb, ps, e, dt);
>
> all these functions do a dt = get_bits1(gb); before
> thus it could be merged into *_data()
> also i think read_*_data() is more understandable
> similarly read_ps_extension() would imho be clearer
>

fixed

>
>> + ? ? ? ?}
>> + ? ?}
>> + ? ?skip_bits1(gb); ? ? ?//reserved_ps
>> + ? ?return get_bits_count(gb) - count;
>> +}
>> +
>
>> +static void ipdopd_reset(int8_t *opd_hist, int8_t *ipd_hist)
>> +{
>> + ? ?int i;
>> + ? ?for (i = 0; i < PS_MAX_NR_IPDOPD; i++) {
>> + ? ? ? ?opd_hist[i] = 0;
>> + ? ? ? ?ipd_hist[i] = 0;
>> + ? ?}
>> +}
>
>> +
>> +int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps, int bits_left)
>> +{
>> + ? ?int e;
>> + ? ?int bit_count_start = get_bits_count(gb_host);
>> + ? ?int header;
>> + ? ?int bits_consumed;
>> + ? ?GetBitContext gbc = *gb_host, *gb = &gbc;
>> +
>
>> + ? ?header = get_bits1(gb);
>> + ? ?if (header) { ? ? //enable_ps_header
>
> it would be more readable if the variable was named enable_ps_header instead
> of this being mentioned in one use case of the variable
>

It's just a hint for reading the spec. Is "header" too vague? If
anything it probably makes the most sense to just eliminate that
comment.

>
>> + ? ? ? ?ps->enable_iid = get_bits1(gb);
>> + ? ? ? ?if (ps->enable_iid) {
>> + ? ? ? ? ? ?ps->iid_mode = get_bits(gb, 3);
>> + ? ? ? ? ? ?if (ps->iid_mode > 5) {
>> + ? ? ? ? ? ? ? ?av_log(avctx, AV_LOG_ERROR, "iid_mode %d is reserved.\n",
>> + ? ? ? ? ? ? ? ? ? ? ? ps->iid_mode);
>> + ? ? ? ? ? ? ? ?goto err;
>> + ? ? ? ? ? ?}
>> + ? ? ? ? ? ?ps->nr_iid_par ? ?= nr_iidicc_par_tab[ps->iid_mode];
>> + ? ? ? ? ? ?ps->iid_quant ? ? = ps->iid_mode > 2;
>> + ? ? ? ? ? ?ps->nr_ipdopd_par = nr_iidopd_par_tab[ps->iid_mode];
>
> iid_mode is only used here and thus could be a local variable
>

Fixed

>
>> + ? ? ? ?}
>> + ? ? ? ?ps->enable_icc = get_bits1(gb);
>> + ? ? ? ?if (ps->enable_icc) {
>> + ? ? ? ? ? ?ps->icc_mode = get_bits(gb, 3);
>> + ? ? ? ? ? ?if (ps->icc_mode > 5) {
>> + ? ? ? ? ? ? ? ?av_log(avctx, AV_LOG_ERROR, "icc_mode %d is reserved.\n",
>> + ? ? ? ? ? ? ? ? ? ? ? ps->icc_mode);
>> + ? ? ? ? ? ? ? ?goto err;
>
> is it safe to leave an invalid value in the context?
>

Based on how it's used (selecting mixing mode B vs mode A) it should be.

>
>> + ? ? ? ? ? ?}
>> + ? ? ? ? ? ?ps->nr_icc_par = nr_iidicc_par_tab[ps->icc_mode];
>> + ? ? ? ?}
>> + ? ? ? ?ps->enable_ext = get_bits1(gb);
>> + ? ?}
>> +
>> + ? ?ps->frame_class = get_bits1(gb);
>> + ? ?ps->num_env_old = ps->num_env;
>> + ? ?ps->num_env ? ? = num_env_tab[ps->frame_class][get_bits(gb, 2)];
>> +
>> + ? ?ps->border_position[0] = -1;
>> + ? ?if (ps->frame_class) {
>> + ? ? ? ?for (e = 1; e <= ps->num_env; e++)
>> + ? ? ? ? ? ?ps->border_position[e] = get_bits(gb, 5);
>> + ? ?} else
>> + ? ? ? ?for (e = 1; e <= ps->num_env; e++)
>> + ? ? ? ? ? ?ps->border_position[e] = e * numQMFSlots / ps->num_env - 1;
>
> the division is always by a power of 2 and can thus be a >>
>

While conceptually true we are looking at up to four divides per frame
here. Is this measurably better?

Fixed anyway

>
>> +
>> + ? ?if (ps->enable_iid) {
>> + ? ? ? ?for (e = 0; e < ps->num_env; e++) {
>> + ? ? ? ? ? ?int dt = get_bits1(gb);
>> + ? ? ? ? ? ?if (iid_data(avctx, gb, ps, e, dt))
>> + ? ? ? ? ? ? ? ?goto err;
>> + ? ? ? ?}
>> + ? ?} else
>> + ? ? ? ?memset(ps->iid_par, 0, sizeof(ps->iid_par));
>> +
>> + ? ?if (ps->enable_icc)
>> + ? ? ? ?for (e = 0; e < ps->num_env; e++) {
>> + ? ? ? ? ? ?int dt = get_bits1(gb);
>> + ? ? ? ? ? ?if (icc_data(avctx, gb, ps, e, dt))
>> + ? ? ? ? ? ? ? ?goto err;
>> + ? ? ? ?}
>> + ? ?else
>> + ? ? ? ?memset(ps->icc_par, 0, sizeof(ps->icc_par));
>> +
>> + ? ?if (ps->enable_ext) {
>> + ? ? ? ?int cnt = get_bits(gb, 4);
>> + ? ? ? ?if (cnt == 15) {
>> + ? ? ? ? ? ?cnt += get_bits(gb, 8);
>> + ? ? ? ?}
>> + ? ? ? ?cnt *= 8;
>> + ? ? ? ?while (cnt > 7) {
>> + ? ? ? ? ? ?int ps_extension_id = get_bits(gb, 2);
>> + ? ? ? ? ? ?cnt -= 2 + ps_extension(gb, ps, ps_extension_id);
>> + ? ? ? ?}
>> + ? ? ? ?if (cnt < 0) {
>> + ? ? ? ? ? ?av_log(avctx, AV_LOG_ERROR, "ps extension overflow %d", cnt);
>> + ? ? ? ? ? ?goto err;
>> + ? ? ? ?}
>> + ? ? ? ?skip_bits(gb, cnt);
>> + ? ?}
>> +
>> + ? ?ps->enable_ipdopd &= !PS_BASELINE;
>> +
>
>> + ? ?//Fix up envelopes
>> + ? ?if (!ps->num_env || ps->border_position[ps->num_env] < numQMFSlots - 1) {
>> + ? ? ? ?//Create a fake envelope
>> + ? ? ? ?int source = ps->num_env ? ps->num_env - 1 : ps->num_env_old - 1;
>> + ? ? ? ?if (source >= 0 && source != ps->num_env) {
>> + ? ? ? ? ? ?if (ps->enable_iid && ps->num_env_old > 1) {
>> + ? ? ? ? ? ? ? ?memcpy(ps->iid_par+ps->num_env, ps->iid_par+source, sizeof(ps->iid_par[0]));
>> + ? ? ? ? ? ?}
>> + ? ? ? ? ? ?if (ps->enable_icc && ps->num_env_old > 1) {
>> + ? ? ? ? ? ? ? ?memcpy(ps->icc_par+ps->num_env, ps->icc_par+source, sizeof(ps->icc_par[0]));
>> + ? ? ? ? ? ?}
>> + ? ? ? ? ? ?if (ps->enable_ipdopd && ps->num_env_old > 1) {
>> + ? ? ? ? ? ? ? ?memcpy(ps->ipd_par+ps->num_env, ps->ipd_par+source, sizeof(ps->ipd_par[0]));
>> + ? ? ? ? ? ? ? ?memcpy(ps->opd_par+ps->num_env, ps->opd_par+source, sizeof(ps->opd_par[0]));
>> + ? ? ? ? ? ?}
>
> ps->num_env_old > 1
> can be factored out into the parent if()
>

It isn't needed at all.
It is already rejected by the "source >= 0 && source != ps->num_env"
for the zero envelope case and is perfectly legally for the suppressed
final envelope case.

>> + ? ? ? ?}
>> + ? ? ? ?ps->num_env++;
>> + ? ? ? ?ps->border_position[ps->num_env] = numQMFSlots - 1;
>> + ? ?}
>> +
>> +
>> + ? ?ps->is34bands_old = ps->is34bands;
>> + ? ?if (!PS_BASELINE && (ps->enable_iid || ps->enable_icc))
>> + ? ? ? ?ps->is34bands = (ps->enable_iid && ps->nr_iid_par == 34) ||
>> + ? ? ? ? ? ? ? ? ? ? ? ?(ps->enable_icc && ps->nr_icc_par == 34);
>> +
>> + ? ?//Baseline
>> + ? ?if (!ps->enable_ipdopd) {
>> + ? ? ? ?memset(ps->ipd_par, 0, sizeof(ps->ipd_par));
>> + ? ? ? ?memset(ps->opd_par, 0, sizeof(ps->opd_par));
>> + ? ?}
>> +
>> + ? ?if (header)
>> + ? ? ? ?ps->start = 1;
>> +
>> + ? ?bits_consumed = get_bits_count(gb) - bit_count_start;
>> + ? ?if (bits_consumed <= bits_left) {
>> + ? ? ? ?skip_bits_long(gb_host, bits_consumed);
>> + ? ? ? ?return bits_consumed;
>> + ? ?}
>> + ? ?av_log(avctx, AV_LOG_ERROR, "Expected to read %d PS bits actually read %d.\n", bits_left, bits_consumed);
>> +err:
>> + ? ?ps->start = 0;
>> + ? ?skip_bits_long(gb_host, bits_left);
>> + ? ?return bits_left;
>> +}
>> +
>
>> +/** Split one subband into 2 subsubbands with a symmetric real filter.
>> + * The filter must have its non-center even coefficients equal to zero. */
>> +static void hybrid2_re(float (*in)[2], float (*out)[32][2], const float filter[7], int len, int reverse)
>> +{
>> + ? ?int i, j;
>> + ? ?for (i = 0; i < len; i++) {
>> + ? ? ? ?float re_in = filter[6] * in[6+i][0]; ? ? ? ?//real inphase
>> + ? ? ? ?float re_op = 0.0f; ? ? ? ? ? ? ? ? ? ? ? ? ?//real out of phase
>> + ? ? ? ?float im_in = filter[6] * in[6+i][1]; ? ? ? ?//imag inphase
>> + ? ? ? ?float im_op = 0.0f; ? ? ? ? ? ? ? ? ? ? ? ? ?//imag out of phase
>> + ? ? ? ?for (j = 0; j < 6; j += 2) {
>> + ? ? ? ? ? ?re_op += filter[j+1] * (in[i+j+1][0] + in[12-j-1+i][0]);
>> + ? ? ? ? ? ?im_op += filter[j+1] * (in[i+j+1][1] + in[12-j-1+i][1]);
>> + ? ? ? ?}
>> + ? ? ? ?out[ reverse][i][0] = re_in + re_op;
>> + ? ? ? ?out[ reverse][i][1] = im_in + im_op;
>> + ? ? ? ?out[!reverse][i][0] = re_in - re_op;
>> + ? ? ? ?out[!reverse][i][1] = im_in - im_op;
>
> a in++ would avoid quite a few +i
>

Fixed

>
>> + ? ?}
>> +}
>> +
>> +/** Split one subband into 6 subsubbands with a complex filter */
>> +static void hybrid6_cx(float (*in)[2], float (*out)[32][2], const float (*filter)[7][2], int len)
>> +{
>> + ? ?int i, j, ssb;
>> + ? ?int N = 8;
>> + ? ?float temp[8][2];
>> +
>> + ? ?for (i = 0; i < len; i++) {
>> + ? ? ? ?for (ssb = 0; ssb < N; ssb++) {
>
>> + ? ? ? ? ? ?float sum_re = filter[ssb][6][0] * in[i+6][0], sum_im = filter[ssb][6][0] * in[i+6][1];
>
> 0.125*
> no need to read filter[]
>

If that is the case we may as well hardcode the filter name because
the function ceases being generic.

>
>> + ? ? ? ? ? ?for (j = 0; j < 6; j++) {
>> + ? ? ? ? ? ? ? ?float in0_re = in[i+j][0];
>> + ? ? ? ? ? ? ? ?float in0_im = in[i+j][1];
>> + ? ? ? ? ? ? ? ?float in1_re = in[i+12-j][0];
>> + ? ? ? ? ? ? ? ?float in1_im = in[i+12-j][1];
>> + ? ? ? ? ? ? ? ?sum_re += filter[ssb][j][0] * (in0_re + in1_re) - filter[ssb][j][1] * (in0_im - in1_im);
>> + ? ? ? ? ? ? ? ?sum_im += filter[ssb][j][0] * (in0_im + in1_im) + filter[ssb][j][1] * (in0_re - in1_re);
>> + ? ? ? ? ? ?}
>> + ? ? ? ? ? ?temp[ssb][0] = sum_re;
>> + ? ? ? ? ? ?temp[ssb][1] = sum_im;
>> + ? ? ? ?}
>
> filter[*][0][*] has the same value for all *, so does filter[*][4][*]
> filter[*][2][X] has the same value for all *, so does filter[*][6][X]
> ...
>
> in summary you have a table of 112 values but they are the
> same 10 values repeated over and over
> please dont do 10x redundant multiplies ;)
>

There is even more redundancy than that. I just don't know the best
way to exploit it all.

> also in++ can be factored out
>
>
>> + ? ? ? ?out[0][i][0] = temp[6][0];
>> + ? ? ? ?out[0][i][1] = temp[6][1];
>> + ? ? ? ?out[1][i][0] = temp[7][0];
>> + ? ? ? ?out[1][i][1] = temp[7][1];
>> + ? ? ? ?out[2][i][0] = temp[0][0];
>> + ? ? ? ?out[2][i][1] = temp[0][1];
>> + ? ? ? ?out[3][i][0] = temp[1][0];
>> + ? ? ? ?out[3][i][1] = temp[1][1];
>
>> + ? ? ? ?out[4][i][0] = temp[2][0] + temp[5][0];
>> + ? ? ? ?out[4][i][1] = temp[2][1] + temp[5][1];
>> + ? ? ? ?out[5][i][0] = temp[3][0] + temp[4][0];
>> + ? ? ? ?out[5][i][1] = temp[3][1] + temp[4][1];
>
> this part looks like it can be merged into te filter coeffs
> reducing computations by half
>

In that case we may lose the ability to exploit some of the redundancy
mentioned above.

> also the coeffs should be sorted so that no temporary is needed

agreed

>
>
>
>> + ? ?}
>> +}
>> +
>> +static void hybrid4_8_12_cx(float (*in)[2], float (*out)[32][2], const float (*filter)[7][2], int N, int len)
>
> similar things apply here
>

Similar comments apply here

>
>> +{
>> + ? ?int i, j, ssb;
>> +
>> + ? ?for (i = 0; i < len; i++) {
>> + ? ? ? ?for (ssb = 0; ssb < N; ssb++) {
>> + ? ? ? ? ? ?float sum_re = filter[ssb][6][0] * in[i+6][0], sum_im = filter[ssb][6][0] * in[i+6][1];
>> + ? ? ? ? ? ?for (j = 0; j < 6; j++) {
>> + ? ? ? ? ? ? ? ?float in0_re = in[i+j][0];
>> + ? ? ? ? ? ? ? ?float in0_im = in[i+j][1];
>> + ? ? ? ? ? ? ? ?float in1_re = in[i+12-j][0];
>> + ? ? ? ? ? ? ? ?float in1_im = in[i+12-j][1];
>> + ? ? ? ? ? ? ? ?sum_re += filter[ssb][j][0] * (in0_re + in1_re) - filter[ssb][j][1] * (in0_im - in1_im);
>> + ? ? ? ? ? ? ? ?sum_im += filter[ssb][j][0] * (in0_im + in1_im) + filter[ssb][j][1] * (in0_re - in1_re);
>> + ? ? ? ? ? ?}
>> + ? ? ? ? ? ?out[ssb][i][0] = sum_re;
>> + ? ? ? ? ? ?out[ssb][i][1] = sum_im;
>> + ? ? ? ?}
>> + ? ?}
>> +}
>> +
>> +static void hybrid_analysis(float out[91][32][2], float in[5][44][2], float L[2][38][64], int is34, int len)
>> +{
>> + ? ?int i, j;
>
>> + ? ?for (i = 0; i < 5; i++) {
>> + ? ? ? ?for (j = 0; j < 38; j++) {
>> + ? ? ? ? ? ?in[i][j+6][0] = L[0][j][i];
>> + ? ? ? ? ? ?in[i][j+6][1] = L[1][j][i];
>> + ? ? ? ?}
>> + ? ?}
>
> i suspect this reordering copy can be avoided through restructuring of the
> code

some of the copying can be avoided. The filtering code needs the input
to be in one contiguous array.

>
>
>> + ? ?if(is34) {
>> + ? ? ? ?hybrid4_8_12_cx(in[0], out, ? ?f34_0_12, 12, len);
>> + ? ? ? ?hybrid4_8_12_cx(in[1], out+12, f34_1_8, ? 8, len);
>> + ? ? ? ?hybrid4_8_12_cx(in[2], out+20, f34_2_4, ? 4, len);
>> + ? ? ? ?hybrid4_8_12_cx(in[3], out+24, f34_2_4, ? 4, len);
>> + ? ? ? ?hybrid4_8_12_cx(in[4], out+28, f34_2_4, ? 4, len);
>> + ? ? ? ?for (i = 0; i < 59; i++) {
>> + ? ? ? ? ? ?for (j = 0; j < len; j++) {
>> + ? ? ? ? ? ? ? ?out[i+32][j][0] = L[0][j][i+5];
>> + ? ? ? ? ? ? ? ?out[i+32][j][1] = L[1][j][i+5];
>> + ? ? ? ? ? ?}
>> + ? ? ? ?}
>> + ? ?} else {
>> + ? ? ? ?hybrid6_cx(in[0], out, f20_0_8, len);
>> + ? ? ? ?hybrid2_re(in[1], out+6, g1_Q2, len, 1);
>> + ? ? ? ?hybrid2_re(in[2], out+8, g1_Q2, len, 0);
>
>> + ? ? ? ?for (i = 0; i < 61; i++) {
>> + ? ? ? ? ? ?for (j = 0; j < len; j++) {
>> + ? ? ? ? ? ? ? ?out[i+10][j][0] = L[0][j][i+3];
>> + ? ? ? ? ? ? ? ?out[i+10][j][1] = L[1][j][i+3];
>> + ? ? ? ? ? ?}
>> + ? ? ? ?}
>
> maybe later code can be run twice, that is once on out0..10 and once
> on L3.. so as to avoid this copy
> ?

That would require transposing all the later code, no?
probably needs a benchmark.

>
> also analysis splits out L -> Lbuf and synthesis moves Lbuf->L back
> both doing a copy for te higher indexes. Where they done inplace
> this copy would disappear possibly unles i miss something
>

That is true, again there is an ordering mismatch between between
optimal sbr qmf synthesis and PS causing the need for a transpose.

>
> [...]
>> +static void map_idx_34_to_20(int8_t *par_mapped, const int8_t *par, int full)
>> +{
>
>> + ? ?par_mapped[ 0] = (2*par[ 0] + ? par[ 1]) / 3;
>
> ((i+(i<0)+1)*341)>>10;
> or a LUT may be faster
>

benchmarks?

>
> [...]
>> +static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[32][2], int is34)
>> +{
>> + ? ?float power[34][PS_QMF_TIME_SLOTS] = {{0}};
>> + ? ?float transient_gain[34][PS_QMF_TIME_SLOTS];
>> + ? ?float *peak_decay_nrg = ps->peak_decay_nrg;
>> + ? ?float *power_smooth = ps->power_smooth;
>> + ? ?float *peak_decay_diff_smooth = ps->peak_decay_diff_smooth;
>> + ? ?float (*delay)[PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2] = ps->delay;
>> + ? ?float (*ap_delay)[PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2] = ps->ap_delay;
>> + ? ?const int8_t *k_to_i = is34 ? k_to_i_34 : k_to_i_20;
>> + ? ?const float peak_decay_factor = 0.76592833836465f;
>> + ? ?const float transient_impact ?= 1.5f;
>> + ? ?const float a_smooth ? ? ? ? ?= 0.25f; //< Smoothing coefficient
>> + ? ?int i, k, m, n;
>> + ? ?int n0 = 0, nL = 32;
>> + ? ?static const int link_delay[] = { 3, 4, 5 };
>> + ? ?static const float a[] = { 0.65143905753106f,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 0.56471812200776f,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 0.48954165955695f };
>> +
>> + ? ?if (is34 != ps->is34bands_old) {
>> + ? ? ? ?memset(ps->peak_decay_nrg, ? ? ? ? 0, sizeof(ps->peak_decay_nrg));
>> + ? ? ? ?memset(ps->power_smooth, ? ? ? ? ? 0, sizeof(ps->power_smooth));
>> + ? ? ? ?memset(ps->peak_decay_diff_smooth, 0, sizeof(ps->peak_decay_diff_smooth));
>> + ? ? ? ?memset(ps->delay, ? ? ? ? ? ? ? ? ?0, sizeof(ps->delay));
>> + ? ? ? ?memset(ps->ap_delay, ? ? ? ? ? ? ? 0, sizeof(ps->ap_delay));
>> + ? ?}
>> +
>> + ? ?for (n = n0; n < nL; n++) {
>> + ? ? ? ?for (k = 0; k < NR_BANDS[is34]; k++) {
>> + ? ? ? ? ? ?int i = k_to_i[k];
>> + ? ? ? ? ? ?power[i][n] += s[k][n][0] * s[k][n][0] + s[k][n][1] * s[k][n][1];
>> + ? ? ? ?}
>> + ? ?}
>> +
>> + ? ?//Transient detection
>> + ? ?for (i = 0; i < NR_PAR_BANDS[is34]; i++) {
>> + ? ? ? ?for (n = n0; n < nL; n++) {
>> + ? ? ? ? ? ?float decayed_peak = peak_decay_factor * peak_decay_nrg[i];
>> + ? ? ? ? ? ?float denom;
>> + ? ? ? ? ? ?peak_decay_nrg[i] = FFMAX(decayed_peak, power[i][n]);
>> + ? ? ? ? ? ?power_smooth[i] += a_smooth * (power[i][n] - power_smooth[i]);
>> + ? ? ? ? ? ?peak_decay_diff_smooth[i] += a_smooth * (peak_decay_nrg[i] - power[i][n] - peak_decay_diff_smooth[i]);
>> + ? ? ? ? ? ?denom = transient_impact * peak_decay_diff_smooth[i];
>> + ? ? ? ? ? ?transient_gain[i][n] ? = (denom > power_smooth[i]) ?
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? power_smooth[i] / denom : 1.0f;
>> + ? ? ? ?}
>> + ? ?}
>> +
>> + ? ?//Decorrelation and transient reduction
>> + ? ?// ? ? ? ? ? ? ? ? ? ? ? ? PS_AP_LINKS - 1
>> + ? ?// ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? -----
>> + ? ?// ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?| | ?Q_fract_allpass[k][m]*z^-link_delay[m] - a[m]*g_decay_slope[k]
>> + ? ?//H[k][z] = z^-2 * phi_fract[k] * | | ----------------------------------------------------------------
>> + ? ?// ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?| | 1 - a[m]*g_decay_slope[k]*Q_fract_allpass[k][m]*z^-link_delay[m]
>> + ? ?// ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? m = 0
>> + ? ?//d[k][z] (out) = transient_gain_mapped[k][z] * H[k][z] * s[k][z]
>> + ? ?for (k = 0; k < NR_ALLPASS_BANDS[is34]; k++) {
>> + ? ? ? ?int b = k_to_i[k];
>> + ? ? ? ?float g_decay_slope = 1.f - DECAY_SLOPE * (k - DECAY_CUTOFF[is34]);
>> + ? ? ? ?float ag[PS_AP_LINKS];
>> + ? ? ? ?g_decay_slope = av_clipf(g_decay_slope, 0.f, 1.f);
>> + ? ? ? ?memcpy(delay[k], delay[k]+nL, PS_MAX_DELAY*sizeof(delay[k][0]));
>> + ? ? ? ?memcpy(delay[k]+PS_MAX_DELAY, s[k], numQMFSlots*sizeof(delay[k][0]));
>> + ? ? ? ?for (m = 0; m < PS_AP_LINKS; m++) {
>> + ? ? ? ? ? ?memcpy(ap_delay[k][m], ? ap_delay[k][m]+numQMFSlots, ? ? ? ? ? 5*sizeof(ap_delay[k][m][0]));
>> + ? ? ? ? ? ?ag[m] = a[m] * g_decay_slope;
>> + ? ? ? ?}
>> + ? ? ? ?for (n = n0; n < nL; n++) {
>> + ? ? ? ? ? ?float in_re = delay[k][n+PS_MAX_DELAY-2][0] * phi_fract[is34][k][0] -
>> + ? ? ? ? ? ? ? ? ? ? ? ? ?delay[k][n+PS_MAX_DELAY-2][1] * phi_fract[is34][k][1];
>> + ? ? ? ? ? ?float in_im = delay[k][n+PS_MAX_DELAY-2][0] * phi_fract[is34][k][1] +
>> + ? ? ? ? ? ? ? ? ? ? ? ? ?delay[k][n+PS_MAX_DELAY-2][1] * phi_fract[is34][k][0];
>> + ? ? ? ? ? ?for (m = 0; m < PS_AP_LINKS; m++) {
>> + ? ? ? ? ? ? ? ?float a_re ? ? ? ? ? ? ? ?= ag[m] * in_re;
>> + ? ? ? ? ? ? ? ?float a_im ? ? ? ? ? ? ? ?= ag[m] * in_im;
>> + ? ? ? ? ? ? ? ?float link_delay_re ? ? ? = ap_delay[k][m][n+5-link_delay[m]][0];
>> + ? ? ? ? ? ? ? ?float link_delay_im ? ? ? = ap_delay[k][m][n+5-link_delay[m]][1];
>> + ? ? ? ? ? ? ? ?float fractional_delay_re = Q_fract_allpass[is34][k][m][0];
>> + ? ? ? ? ? ? ? ?float fractional_delay_im = Q_fract_allpass[is34][k][m][1];
>> + ? ? ? ? ? ? ? ?ap_delay[k][m][n+5][0] = in_re;
>> + ? ? ? ? ? ? ? ?ap_delay[k][m][n+5][1] = in_im;
>> + ? ? ? ? ? ? ? ?in_re = link_delay_re * fractional_delay_re - link_delay_im * fractional_delay_im - a_re;
>> + ? ? ? ? ? ? ? ?in_im = link_delay_re * fractional_delay_im + link_delay_im * fractional_delay_re - a_im;
>> + ? ? ? ? ? ? ? ?ap_delay[k][m][n+5][0] += ag[m] * in_re;
>> + ? ? ? ? ? ? ? ?ap_delay[k][m][n+5][1] += ag[m] * in_im;
>> + ? ? ? ? ? ?}
>> + ? ? ? ? ? ?out[k][n][0] = transient_gain[b][n] * in_re;
>> + ? ? ? ? ? ?out[k][n][1] = transient_gain[b][n] * in_im;
>> + ? ? ? ?}
>> + ? ?}
>> + ? ?for (; k < SHORT_DELAY_BAND[is34]; k++) {
>> + ? ? ? ?memcpy(delay[k], delay[k]+nL, PS_MAX_DELAY*sizeof(delay[k][0]));
>> + ? ? ? ?memcpy(delay[k]+PS_MAX_DELAY, s[k], numQMFSlots*sizeof(delay[k][0]));
>> + ? ? ? ?for (n = n0; n < nL; n++) {
>> + ? ? ? ? ? ?//H = delay 14
>> + ? ? ? ? ? ?out[k][n][0] = transient_gain[k_to_i[k]][n] * delay[k][n+PS_MAX_DELAY-14][0];
>> + ? ? ? ? ? ?out[k][n][1] = transient_gain[k_to_i[k]][n] * delay[k][n+PS_MAX_DELAY-14][1];
>> + ? ? ? ?}
>> + ? ?}
>> + ? ?for (; k < NR_BANDS[is34]; k++) {
>> + ? ? ? ?memcpy(delay[k], delay[k]+nL, PS_MAX_DELAY*sizeof(delay[k][0]));
>> + ? ? ? ?memcpy(delay[k]+PS_MAX_DELAY, s[k], numQMFSlots*sizeof(delay[k][0]));
>> + ? ? ? ?for (n = n0; n < nL; n++) {
>> + ? ? ? ? ? ?//H = delay 1
>> + ? ? ? ? ? ?out[k][n][0] = transient_gain[k_to_i[k]][n] * delay[k][n+PS_MAX_DELAY-1][0];
>> + ? ? ? ? ? ?out[k][n][1] = transient_gain[k_to_i[k]][n] * delay[k][n+PS_MAX_DELAY-1][1];
>> + ? ? ? ?}
>> + ? ?}
>
> ive not inbstigated now but are all the memcpy above needed?
>

Other suggestions on managing that buffer are welcome. Under current
structuring none of those memcpys are superfluous.

>
> [...]
>> +static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2], int is34)
>> +{
>> + ? ?int e, b, k, n;
>> +
>> + ? ?float (*H11)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H11;
>> + ? ?float (*H12)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H12;
>> + ? ?float (*H21)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H21;
>> + ? ?float (*H22)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H22;
>> + ? ?int8_t *opd_hist = ps->opd_hist;
>> + ? ?int8_t *ipd_hist = ps->ipd_hist;
>> + ? ?int8_t iid_mapped_buf[PS_MAX_NUM_ENV][PS_MAX_NR_IIDICC];
>> + ? ?int8_t icc_mapped_buf[PS_MAX_NUM_ENV][PS_MAX_NR_IIDICC];
>> + ? ?int8_t ipd_mapped_buf[PS_MAX_NUM_ENV][PS_MAX_NR_IIDICC];
>> + ? ?int8_t opd_mapped_buf[PS_MAX_NUM_ENV][PS_MAX_NR_IIDICC];
>> + ? ?int8_t (*iid_mapped)[PS_MAX_NR_IIDICC] = iid_mapped_buf;
>> + ? ?int8_t (*icc_mapped)[PS_MAX_NR_IIDICC] = icc_mapped_buf;
>> + ? ?int8_t (*ipd_mapped)[PS_MAX_NR_IIDICC] = ipd_mapped_buf;
>> + ? ?int8_t (*opd_mapped)[PS_MAX_NR_IIDICC] = opd_mapped_buf;
>> + ? ?const int8_t *k_to_i = is34 ? k_to_i_34 : k_to_i_20;
>> + ? ?const float (*H_LUT)[8][4] = (PS_BASELINE || ps->icc_mode < 3) ? HA : HB;
>> +
>
>> + ? ?//Remapping
>> + ? ?for (b = 0; b < PS_MAX_NR_IIDICC; b++) {
>> + ? ? ? ?H11[0][0][b] = H11[0][ps->num_env_old][b];
>> + ? ? ? ?H12[0][0][b] = H12[0][ps->num_env_old][b];
>> + ? ? ? ?H21[0][0][b] = H21[0][ps->num_env_old][b];
>> + ? ? ? ?H22[0][0][b] = H22[0][ps->num_env_old][b];
>> + ? ? ? ?H11[1][0][b] = H11[1][ps->num_env_old][b];
>> + ? ? ? ?H12[1][0][b] = H12[1][ps->num_env_old][b];
>> + ? ? ? ?H21[1][0][b] = H21[1][ps->num_env_old][b];
>> + ? ? ? ?H22[1][0][b] = H22[1][ps->num_env_old][b];
>> + ? ?}
>
> i suspect this would be faster with memcpies()
>

Fixed

>
>
> [...]
>> + ? ? ? ? ? ?//Is this necessary? ps_04_new seems unchanged
>> + ? ? ? ? ? ?if ((is34 && k <= 13 && k >= 9) || (!is34 && k <= 1)) {
>> + ? ? ? ? ? ? ? ?h11i = -H11[1][e][b];
>> + ? ? ? ? ? ? ? ?h12i = -H12[1][e][b];
>> + ? ? ? ? ? ? ? ?h21i = -H21[1][e][b];
>> + ? ? ? ? ? ? ? ?h22i = -H22[1][e][b];
>> + ? ? ? ? ? ?} else {
>> + ? ? ? ? ? ? ? ?h11i = H11[1][e][b];
>> + ? ? ? ? ? ? ? ?h12i = H12[1][e][b];
>> + ? ? ? ? ? ? ? ?h21i = H21[1][e][b];
>> + ? ? ? ? ? ? ? ?h22i = H22[1][e][b];
>> + ? ? ? ? ? ?}
>> + ? ? ? ? ? ?}
>> + ? ? ? ? ? ?//Interpolation
>> + ? ? ? ? ? ?h11r_step = (H11[0][e+1][b] - h11r) * width;
>> + ? ? ? ? ? ?h12r_step = (H12[0][e+1][b] - h12r) * width;
>> + ? ? ? ? ? ?h21r_step = (H21[0][e+1][b] - h21r) * width;
>> + ? ? ? ? ? ?h22r_step = (H22[0][e+1][b] - h22r) * width;
>> + ? ? ? ? ? ?if (!PS_BASELINE && ps->enable_ipdopd) {
>> + ? ? ? ? ? ? ? ?h11i_step = (H11[1][e+1][b] - h11i) * width;
>> + ? ? ? ? ? ? ? ?h12i_step = (H12[1][e+1][b] - h12i) * width;
>> + ? ? ? ? ? ? ? ?h21i_step = (H21[1][e+1][b] - h21i) * width;
>> + ? ? ? ? ? ? ? ?h22i_step = (H22[1][e+1][b] - h22i) * width;
>> + ? ? ? ? ? ?}
>> + ? ? ? ? ? ?for (n = start + 1; n <= stop; n++) {
>> + ? ? ? ? ? ? ? ?//l is s, r is d
>> + ? ? ? ? ? ? ? ?float l_re = l[k][n][0];
>> + ? ? ? ? ? ? ? ?float l_im = l[k][n][1];
>> + ? ? ? ? ? ? ? ?float r_re = r[k][n][0];
>> + ? ? ? ? ? ? ? ?float r_im = r[k][n][1];
>> + ? ? ? ? ? ? ? ?h11r += h11r_step;
>> + ? ? ? ? ? ? ? ?h12r += h12r_step;
>> + ? ? ? ? ? ? ? ?h21r += h21r_step;
>> + ? ? ? ? ? ? ? ?h22r += h22r_step;
>
>> + ? ? ? ? ? ? ? ?if (!PS_BASELINE && ps->enable_ipdopd) {
>> + ? ? ? ? ? ? ? ?h11i += h11i_step;
>> + ? ? ? ? ? ? ? ?h12i += h12i_step;
>> + ? ? ? ? ? ? ? ?h21i += h21i_step;
>> + ? ? ? ? ? ? ? ?h22i += h22i_step;
>> +
>> + ? ? ? ? ? ? ? ?l[k][n][0] = h11r*l_re + h21r*r_re - h11i*l_im - h21i*r_im;
>> + ? ? ? ? ? ? ? ?l[k][n][1] = h11r*l_im + h21r*r_im + h11i*l_re + h21i*r_re;
>> + ? ? ? ? ? ? ? ?r[k][n][0] = h12r*l_re + h22r*r_re - h12i*l_im - h22i*r_im;
>> + ? ? ? ? ? ? ? ?r[k][n][1] = h12r*l_im + h22r*r_im + h12i*l_re + h22i*r_re;
>> + ? ? ? ? ? ? ? ?} else {
>> + ? ? ? ? ? ? ? ?l[k][n][0] = h11r*l_re + h21r*r_re;
>> + ? ? ? ? ? ? ? ?l[k][n][1] = h11r*l_im + h21r*r_im;
>> + ? ? ? ? ? ? ? ?r[k][n][0] = h12r*l_re + h22r*r_re;
>> + ? ? ? ? ? ? ? ?r[k][n][1] = h12r*l_im + h22r*r_im;
>> + ? ? ? ? ? ? ? ?}
>
> the indention here is off
>

fixed

>
> [...]
>> +#define PS_MAX_NUM_ENV 5
>> +#define PS_MAX_NR_IIDICC 34
>> +#define PS_MAX_NR_IPDOPD 17
>> +#define PS_MAX_SSB 91
>> +#define PS_MAX_AP_BANDS 50
>> +#define PS_QMF_TIME_SLOTS 32
>> +#define PS_MAX_DELAY 14
>> +#define PS_AP_LINKS 3
>> +#define PS_MAX_AP_DELAY 5
>> +
>> +typedef struct {
>
>> + ? ?int ? ?start;
>
> needs documentation, what does this field mean.
>

It means start applying PS based on bitstream decoded parameters.
The exact same thing it means for SBR which is also undocumented.

> [...]
>> Added: trunk/libavcodec/ps_tablegen.h
>> ==============================================================================
>> --- /dev/null 00:00:00 1970 ? (empty, because file is newly added)
>> +++ trunk/libavcodec/ps_tablegen.h ? ?Sat Jun 19 16:14:51 2010 ? ? ? ?(r23647)
[...]
>> +#ifndef PS_TABLEGEN_H
>> +#define PS_TABLEGEN_H
>> +
>> +#include <stdint.h>
>> +#include <math.h>
>> +
>> +#if CONFIG_HARDCODED_TABLES
>> +#define ps_tableinit()
>> +#include "libavcodec/ps_tables.h"
>> +#else
>> +#include "../libavutil/common.h"
>> +#ifndef M_SQRT1_2
>> +#define M_SQRT1_2 ? ? ?0.70710678118654752440 ?/* 1/sqrt(2) */
>> +#endif
>> +#ifndef M_PI
>> +#define M_PI ? ? ? ? ? 3.14159265358979323846 ?/* pi */
>> +#endif
>> +#ifndef M_SQRT2
>> +#define M_SQRT2 ? ? ? ?1.41421356237309504880 ?/* sqrt(2) */
>
> these constants should be already in some libavutil header
>

Already addressed

[...]



More information about the ffmpeg-cvslog mailing list