[FFmpeg-cvslog] opus_pvq: port to allow for SIMD functions

Tue May 16 13:22:21 EEST 2017

ffmpeg | branch: master | Rostislav Pehlivanov <atomnuker at gmail.com> | Wed May 10 06:47:44 2017 +0100| [8e7e74df93d18c903164a67c861a428bd4244cb1] | committer: Rostislav Pehlivanov

opus_pvq: port to allow for SIMD functions

Signed-off-by: Rostislav Pehlivanov <atomnuker at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=8e7e74df93d18c903164a67c861a428bd4244cb1
---

 libavcodec/opus_celt.c |  17 ++++---
 libavcodec/opus_celt.h |   6 ++-
 libavcodec/opus_pvq.c  | 118 +++++++++++++++++++++++++++----------------------
 libavcodec/opus_pvq.h  |  32 ++++++++------
 libavcodec/opusenc.c   |  12 +++--
 5 files changed, 107 insertions(+), 78 deletions(-)

diff --git a/libavcodec/opus_celt.c b/libavcodec/opus_celt.c
index aee8ddc616..feb604d9af 100644
--- a/libavcodec/opus_celt.c
+++ b/libavcodec/opus_celt.c
@@ -753,15 +753,15 @@ static void celt_decode_bands(CeltFrame *f, OpusRangeCoder *rc)
         }
 
         if (f->dual_stereo) {
-            cm[0] = ff_celt_decode_band(f, rc, i, X, NULL, band_size, b / 2, f->blocks,
+            cm[0] = f->pvq->decode_band(f->pvq, f, rc, i, X, NULL, band_size, b / 2, f->blocks,
                                         effective_lowband != -1 ? norm + (effective_lowband << f->size) : NULL, f->size,
                                         norm + band_offset, 0, 1.0f, lowband_scratch, cm[0]);
 
-            cm[1] = ff_celt_decode_band(f, rc, i, Y, NULL, band_size, b/2, f->blocks,
+            cm[1] = f->pvq->decode_band(f->pvq, f, rc, i, Y, NULL, band_size, b/2, f->blocks,
                                         effective_lowband != -1 ? norm2 + (effective_lowband << f->size) : NULL, f->size,
                                         norm2 + band_offset, 0, 1.0f, lowband_scratch, cm[1]);
         } else {
-            cm[0] = ff_celt_decode_band(f, rc, i, X, Y, band_size, b, f->blocks,
+            cm[0] = f->pvq->decode_band(f->pvq, f, rc, i, X, Y, band_size, b, f->blocks,
                                         effective_lowband != -1 ? norm + (effective_lowband << f->size) : NULL, f->size,
                                         norm + band_offset, 0, 1.0f, lowband_scratch, cm[0]|cm[1]);
             cm[1] = cm[0];
@@ -984,6 +984,8 @@ void ff_celt_free(CeltFrame **f)
     for (i = 0; i < FF_ARRAY_ELEMS(frm->imdct); i++)
         ff_mdct15_uninit(&frm->imdct[i]);
 
+    ff_celt_pvq_uninit(&frm->pvq);
+
     av_freep(&frm->dsp);
     av_freep(f);
 }
@@ -1006,11 +1008,12 @@ int ff_celt_init(AVCodecContext *avctx, CeltFrame **f, int output_channels)
     frm->avctx           = avctx;
     frm->output_channels = output_channels;
 
-    for (i = 0; i < FF_ARRAY_ELEMS(frm->imdct); i++) {
-        ret = ff_mdct15_init(&frm->imdct[i], 1, i + 3, -1.0f);
-        if (ret < 0)
+    for (i = 0; i < FF_ARRAY_ELEMS(frm->imdct); i++)
+        if ((ret = ff_mdct15_init(&frm->imdct[i], 1, i + 3, -1.0f)) < 0)
             goto fail;
-    }
+
+    if ((ret = ff_celt_pvq_init(&frm->pvq)) < 0)
+        goto fail;
 
     frm->dsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
     if (!frm->dsp) {
diff --git a/libavcodec/opus_celt.h b/libavcodec/opus_celt.h
index f0d55d600b..b80ade84f2 100644
--- a/libavcodec/opus_celt.h
+++ b/libavcodec/opus_celt.h
@@ -27,6 +27,7 @@
 #include <float.h>
 
 #include "opus.h"
+#include "opus_pvq.h"
 
 #include "mdct15.h"
 #include "libavutil/float_dsp.h"
@@ -43,6 +44,8 @@
 #define CELT_POSTFILTER_MINPERIOD    15
 #define CELT_ENERGY_SILENCE          (-28.0f)
 
+typedef struct CeltPVQ CeltPVQ;
+
 enum CeltSpread {
     CELT_SPREAD_NONE,
     CELT_SPREAD_LIGHT,
@@ -92,6 +95,7 @@ struct CeltFrame {
     MDCT15Context       *imdct[4];
     AVFloatDSPContext   *dsp;
     CeltBlock           block[2];
+    CeltPVQ             *pvq;
     int channels;
     int output_channels;
 
@@ -125,8 +129,6 @@ struct CeltFrame {
     int fine_priority[CELT_MAX_BANDS];
     int pulses       [CELT_MAX_BANDS];
     int tf_change    [CELT_MAX_BANDS];
-
-    DECLARE_ALIGNED(32, float, scratch)[22 * 8]; // MAX(ff_celt_freq_range) * 1<<CELT_MAX_LOG_BLOCKS
 };
 
 /* LCG for noise generation */
diff --git a/libavcodec/opus_pvq.c b/libavcodec/opus_pvq.c
index fa349c47da..2ac66a0ede 100644
--- a/libavcodec/opus_pvq.c
+++ b/libavcodec/opus_pvq.c
@@ -363,7 +363,7 @@ static inline float celt_decode_pulses(OpusRangeCoder *rc, int *y, uint32_t N, u
  * Faster than libopus's search, operates entirely in the signed domain.
  * Slightly worse/better depending on N, K and the input vector.
  */
-static int celt_pvq_search(float *X, int *y, int K, int N)
+static float ppp_pvq_search_c(float *X, int *y, int K, int N)
 {
     int i, y_norm = 0;
     float res = 0.0f, xy_norm = 0.0f;
@@ -408,17 +408,17 @@ static int celt_pvq_search(float *X, int *y, int K, int N)
         y[max_idx] += phase;
     }
 
-    return y_norm;
+    return (float)y_norm;
 }
 
 static uint32_t celt_alg_quant(OpusRangeCoder *rc, float *X, uint32_t N, uint32_t K,
                                enum CeltSpread spread, uint32_t blocks, float gain,
-                               void *scratch)
+                               CeltPVQ *pvq)
 {
-    int *y = scratch;
+    int *y = pvq->qcoeff;
 
     celt_exp_rotation(X, N, blocks, K, spread, 1);
-    gain /= sqrtf(celt_pvq_search(X, y, K, N));
+    gain /= sqrtf(pvq->pvq_search(X, y, K, N));
     celt_encode_pulses(rc, y,  N, K);
     celt_normalize_residual(y, X, N, gain);
     celt_exp_rotation(X, N, blocks, K, spread, 0);
@@ -429,9 +429,9 @@ static uint32_t celt_alg_quant(OpusRangeCoder *rc, float *X, uint32_t N, uint32_
     the final normalised signal in the current band. */
 static uint32_t celt_alg_unquant(OpusRangeCoder *rc, float *X, uint32_t N, uint32_t K,
                                  enum CeltSpread spread, uint32_t blocks, float gain,
-                                 void *scratch)
+                                 CeltPVQ *pvq)
 {
-    int *y = scratch;
+    int *y = pvq->qcoeff;
 
     gain /= sqrtf(celt_decode_pulses(rc, y, N, K));
     celt_normalize_residual(y, X, N, gain);
@@ -477,19 +477,16 @@ static void celt_stereo_ms_decouple(float *X, float *Y, int N)
     }
 }
 
-#define QUANT_FN(name) uint32_t (*name)(CeltFrame *f, OpusRangeCoder *rc,      \
-                                        const int band, float *X, float *Y,    \
-                                        int N, int b, uint32_t blocks,         \
-                                        float *lowband, int duration,          \
-                                        float *lowband_out, int level,         \
-                                        float gain, float *lowband_scratch,    \
-                                        int fill)
-
-static av_always_inline uint32_t quant_band_template(CeltFrame *f, OpusRangeCoder *rc, const int band,
-                                                     float *X, float *Y, int N, int b, uint32_t blocks,
-                                                     float *lowband, int duration, float *lowband_out,
-                                                     int level, float gain, float *lowband_scratch,
-                                                     int fill, int quant)
+static av_always_inline uint32_t quant_band_template(CeltPVQ *pvq, CeltFrame *f,
+                                                     OpusRangeCoder *rc,
+                                                     const int band, float *X,
+                                                     float *Y, int N, int b,
+                                                     uint32_t blocks, float *lowband,
+                                                     int duration, float *lowband_out,
+                                                     int level, float gain,
+                                                     float *lowband_scratch,
+                                                     int fill, int quant,
+                                                     QUANT_FN(*rec))
 {
     int i;
     const uint8_t *cache;
@@ -505,7 +502,6 @@ static av_always_inline uint32_t quant_band_template(CeltFrame *f, OpusRangeCode
     float mid = 0, side = 0;
     int longblocks = (B0 == 1);
     uint32_t cm = 0;
-    QUANT_FN(rec) = quant ? ff_celt_encode_band : ff_celt_decode_band;
 
     if (N == 1) {
         float *x = X;
@@ -565,7 +561,7 @@ static av_always_inline uint32_t quant_band_template(CeltFrame *f, OpusRangeCode
 
         /* Reorganize the samples in time order instead of frequency order */
         if (B0 > 1 && (quant || lowband))
-            celt_deinterleave_hadamard(f->scratch, quant ? X : lowband,
+            celt_deinterleave_hadamard(pvq->hadamard_tmp, quant ? X : lowband,
                                        N_B >> recombine, B0 << recombine,
                                        longblocks);
     }
@@ -702,7 +698,7 @@ static av_always_inline uint32_t quant_band_template(CeltFrame *f, OpusRangeCode
             sign = 1 - 2 * sign;
             /* We use orig_fill here because we want to fold the side, but if
             itheta==16384, we'll have cleared the low bits of fill. */
-            cm = rec(f, rc, band, x2, NULL, N, mbits, blocks, lowband, duration,
+            cm = rec(pvq, f, rc, band, x2, NULL, N, mbits, blocks, lowband, duration,
                      lowband_out, level, gain, lowband_scratch, orig_fill);
             /* We don't split N=2 bands, so cm is either 1 or 0 (for a fold-collapse),
             and there's no need to worry about mixing with the other channel. */
@@ -755,7 +751,7 @@ static av_always_inline uint32_t quant_band_template(CeltFrame *f, OpusRangeCode
             if (mbits >= sbits) {
                 /* In stereo mode, we do not apply a scaling to the mid
                  * because we need the normalized mid for folding later */
-                cm = rec(f, rc, band, X, NULL, N, mbits, blocks, lowband,
+                cm = rec(pvq, f, rc, band, X, NULL, N, mbits, blocks, lowband,
                          duration, next_lowband_out1, next_level,
                          stereo ? 1.0f : (gain * mid), lowband_scratch, fill);
                 rebalance = mbits - (rebalance - f->remaining2);
@@ -764,14 +760,14 @@ static av_always_inline uint32_t quant_band_template(CeltFrame *f, OpusRangeCode
 
                 /* For a stereo split, the high bits of fill are always zero,
                  * so no folding will be done to the side. */
-                cmt = rec(f, rc, band, Y, NULL, N, sbits, blocks, next_lowband2,
+                cmt = rec(pvq, f, rc, band, Y, NULL, N, sbits, blocks, next_lowband2,
                           duration, NULL, next_level, gain * side, NULL,
                           fill >> blocks);
                 cm |= cmt << ((B0 >> 1) & (stereo - 1));
             } else {
                 /* For a stereo split, the high bits of fill are always zero,
                  * so no folding will be done to the side. */
-                cm = rec(f, rc, band, Y, NULL, N, sbits, blocks, next_lowband2,
+                cm = rec(pvq, f, rc, band, Y, NULL, N, sbits, blocks, next_lowband2,
                          duration, NULL, next_level, gain * side, NULL, fill >> blocks);
                 cm <<= ((B0 >> 1) & (stereo - 1));
                 rebalance = sbits - (rebalance - f->remaining2);
@@ -780,7 +776,7 @@ static av_always_inline uint32_t quant_band_template(CeltFrame *f, OpusRangeCode
 
                 /* In stereo mode, we do not apply a scaling to the mid because
                  * we need the normalized mid for folding later */
-                cm |= rec(f, rc, band, X, NULL, N, mbits, blocks, lowband, duration,
+                cm |= rec(pvq, f, rc, band, X, NULL, N, mbits, blocks, lowband, duration,
                           next_lowband_out1, next_level, stereo ? 1.0f : (gain * mid),
                           lowband_scratch, fill);
             }
@@ -802,10 +798,10 @@ static av_always_inline uint32_t quant_band_template(CeltFrame *f, OpusRangeCode
             /* Finally do the actual (de)quantization */
             if (quant) {
                 cm = celt_alg_quant(rc, X, N, (q < 8) ? q : (8 + (q & 7)) << ((q >> 3) - 1),
-                                    f->spread, blocks, gain, f->scratch);
+                                    f->spread, blocks, gain, pvq);
             } else {
                 cm = celt_alg_unquant(rc, X, N, (q < 8) ? q : (8 + (q & 7)) << ((q >> 3) - 1),
-                                      f->spread, blocks, gain, f->scratch);
+                                      f->spread, blocks, gain, pvq);
             }
         } else {
             /* If there's no pulse, fill the band anyway */
@@ -845,7 +841,7 @@ static av_always_inline uint32_t quant_band_template(CeltFrame *f, OpusRangeCode
 
         /* Undo the sample reorganization going from time order to frequency order */
         if (B0 > 1)
-            celt_interleave_hadamard(f->scratch, X, N_B >> recombine,
+            celt_interleave_hadamard(pvq->hadamard_tmp, X, N_B >> recombine,
                                      B0 << recombine, longblocks);
 
         /* Undo time-freq changes that we did earlier */
@@ -876,33 +872,28 @@ static av_always_inline uint32_t quant_band_template(CeltFrame *f, OpusRangeCode
     return cm;
 }
 
-uint32_t ff_celt_decode_band(CeltFrame *f, OpusRangeCoder *rc, const int band,
-                             float *X, float *Y, int N, int b, uint32_t blocks,
-                             float *lowband, int duration, float *lowband_out,
-                             int level, float gain, float *lowband_scratch,
-                             int fill)
+
+static QUANT_FN(pvq_decode_band)
 {
-    return quant_band_template(f, rc, band, X, Y, N, b, blocks, lowband, duration,
-                               lowband_out, level, gain, lowband_scratch, fill, 0);
+    return quant_band_template(pvq, f, rc, band, X, Y, N, b, blocks, lowband, duration,
+                               lowband_out, level, gain, lowband_scratch, fill, 0,
+                               pvq->decode_band);
 }
 
-uint32_t ff_celt_encode_band(CeltFrame *f, OpusRangeCoder *rc, const int band,
-                             float *X, float *Y, int N, int b, uint32_t blocks,
-                             float *lowband, int duration, float *lowband_out,
-                             int level, float gain, float *lowband_scratch,
-                             int fill)
+static QUANT_FN(pvq_encode_band)
 {
-    return quant_band_template(f, rc, band, X, Y, N, b, blocks, lowband, duration,
-                               lowband_out, level, gain, lowband_scratch, fill, 1);
+    return quant_band_template(pvq, f, rc, band, X, Y, N, b, blocks, lowband, duration,
+                               lowband_out, level, gain, lowband_scratch, fill, 1,
+                               pvq->encode_band);
 }
 
-float ff_celt_quant_band_cost(CeltFrame *f, OpusRangeCoder *rc, int band, float *bits,
-                              float lambda)
+static float pvq_band_cost(CeltPVQ *pvq, CeltFrame *f, OpusRangeCoder *rc, int band,
+                           float *bits, float lambda)
 {
     int i, b = 0;
     uint32_t cm[2] = { (1 << f->blocks) - 1, (1 << f->blocks) - 1 };
     const int band_size = ff_celt_freq_range[band] << f->size;
-    float buf[352], lowband_scratch[176], norm1[176], norm2[176];
+    float buf[176 * 2], lowband_scratch[176], norm1[176], norm2[176];
     float dist, cost, err_x = 0.0f, err_y = 0.0f;
     float *X = buf;
     float *X_orig = f->block[0].coeffs + (ff_celt_freq_bands[band] << f->size);
@@ -921,14 +912,14 @@ float ff_celt_quant_band_cost(CeltFrame *f, OpusRangeCoder *rc, int band, float
     }
 
     if (f->dual_stereo) {
-        ff_celt_encode_band(f, rc, band, X, NULL, band_size, b / 2, f->blocks, NULL,
-                            f->size, norm1, 0, 1.0f, lowband_scratch, cm[0]);
+        pvq->encode_band(pvq, f, rc, band, X, NULL, band_size, b / 2, f->blocks, NULL,
+                         f->size, norm1, 0, 1.0f, lowband_scratch, cm[0]);
 
-        ff_celt_encode_band(f, rc, band, Y, NULL, band_size, b / 2, f->blocks, NULL,
-                            f->size, norm2, 0, 1.0f, lowband_scratch, cm[1]);
+        pvq->encode_band(pvq, f, rc, band, Y, NULL, band_size, b / 2, f->blocks, NULL,
+                         f->size, norm2, 0, 1.0f, lowband_scratch, cm[1]);
     } else {
-        ff_celt_encode_band(f, rc, band, X, Y, band_size, b, f->blocks, NULL, f->size,
-                            norm1, 0, 1.0f, lowband_scratch, cm[0] | cm[1]);
+        pvq->encode_band(pvq, f, rc, band, X, Y, band_size, b, f->blocks, NULL, f->size,
+                         norm1, 0, 1.0f, lowband_scratch, cm[0] | cm[1]);
     }
 
     for (i = 0; i < band_size; i++) {
@@ -944,3 +935,24 @@ float ff_celt_quant_band_cost(CeltFrame *f, OpusRangeCoder *rc, int band, float
 
     return lambda*dist*cost;
 }
+
+int av_cold ff_celt_pvq_init(CeltPVQ **pvq)
+{
+    CeltPVQ *s = av_malloc(sizeof(CeltPVQ));
+    if (!s)
+        return AVERROR(ENOMEM);
+
+    s->pvq_search         = ppp_pvq_search_c;
+    s->decode_band        = pvq_decode_band;
+    s->encode_band        = pvq_encode_band;
+    s->band_cost          = pvq_band_cost;
+
+    *pvq = s;
+
+    return 0;
+}
+
+void av_cold ff_celt_pvq_uninit(CeltPVQ **pvq)
+{
+    av_freep(pvq);
+}
diff --git a/libavcodec/opus_pvq.h b/libavcodec/opus_pvq.h
index 045015406b..6691494838 100644
--- a/libavcodec/opus_pvq.h
+++ b/libavcodec/opus_pvq.h
@@ -23,22 +23,28 @@
 #ifndef AVCODEC_OPUS_PVQ_H
 #define AVCODEC_OPUS_PVQ_H
 
-#include "opus.h"
 #include "opus_celt.h"
 
-/* Decodes a band using PVQ */
-uint32_t ff_celt_decode_band(CeltFrame *f, OpusRangeCoder *rc, const int band,
-                             float *X, float *Y, int N, int b, uint32_t blocks,
-                             float *lowband, int duration, float *lowband_out, int level,
-                             float gain, float *lowband_scratch, int fill);
+#define QUANT_FN(name) uint32_t (name)(struct CeltPVQ *pvq, CeltFrame *f,            \
+                                       OpusRangeCoder *rc, const int band, float *X, \
+                                       float *Y, int N, int b, uint32_t blocks,      \
+                                       float *lowband, int duration,                 \
+                                       float *lowband_out, int level, float gain,    \
+                                       float *lowband_scratch, int fill)
 
-/* Encodes a band using PVQ */
-uint32_t ff_celt_encode_band(CeltFrame *f, OpusRangeCoder *rc, const int band,
-                             float *X, float *Y, int N, int b, uint32_t blocks,
-                             float *lowband, int duration, float *lowband_out, int level,
-                             float gain, float *lowband_scratch, int fill);
+struct CeltPVQ {
+    DECLARE_ALIGNED(32, int,   qcoeff      )[176];
+    DECLARE_ALIGNED(32, float, hadamard_tmp)[176];
 
-float ff_celt_quant_band_cost(CeltFrame *f, OpusRangeCoder *rc, int band,
-                              float *bits, float lambda);
+    float (*pvq_search)(float *X, int *y, int K, int N);
+
+    QUANT_FN(*decode_band);
+    QUANT_FN(*encode_band);
+    float (*band_cost)(struct CeltPVQ *pvq, CeltFrame *f, OpusRangeCoder *rc,
+                       int band, float *bits, float lambda);
+};
+
+int  ff_celt_pvq_init  (struct CeltPVQ **pvq);
+void ff_celt_pvq_uninit(struct CeltPVQ **pvq);
 
 #endif /* AVCODEC_OPUS_PVQ_H */
diff --git a/libavcodec/opusenc.c b/libavcodec/opusenc.c
index 41e1a3fb38..303f11f7e7 100644
--- a/libavcodec/opusenc.c
+++ b/libavcodec/opusenc.c
@@ -55,6 +55,7 @@ typedef struct OpusEncContext {
     AudioFrameQueue afq;
     AVFloatDSPContext *dsp;
     MDCT15Context *mdct[CELT_BLOCK_NB];
+    CeltPVQ *pvq;
     struct FFBufQueue bufqueue;
 
     enum OpusMode mode;
@@ -797,15 +798,15 @@ static void celt_quant_bands(OpusRangeCoder *rc, CeltFrame *f)
         }
 
         if (f->dual_stereo) {
-            cm[0] = ff_celt_encode_band(f, rc, i, X, NULL, band_size, b / 2, f->blocks,
+            cm[0] = f->pvq->encode_band(f->pvq, f, rc, i, X, NULL, band_size, b / 2, f->blocks,
                                         effective_lowband != -1 ? norm + (effective_lowband << f->size) : NULL, f->size,
                                         norm + band_offset, 0, 1.0f, lowband_scratch, cm[0]);
 
-            cm[1] = ff_celt_encode_band(f, rc, i, Y, NULL, band_size, b / 2, f->blocks,
+            cm[1] = f->pvq->encode_band(f->pvq, f, rc, i, Y, NULL, band_size, b / 2, f->blocks,
                                         effective_lowband != -1 ? norm2 + (effective_lowband << f->size) : NULL, f->size,
                                         norm2 + band_offset, 0, 1.0f, lowband_scratch, cm[1]);
         } else {
-            cm[0] = ff_celt_encode_band(f, rc, i, X, Y, band_size, b, f->blocks,
+            cm[0] = f->pvq->encode_band(f->pvq, f, rc, i, X, Y, band_size, b, f->blocks,
                                         effective_lowband != -1 ? norm + (effective_lowband << f->size) : NULL, f->size,
                                         norm + band_offset, 0, 1.0f, lowband_scratch, cm[0] | cm[1]);
             cm[1] = cm[0];
@@ -883,6 +884,7 @@ static void ff_opus_psy_celt_frame_setup(OpusEncContext *s, CeltFrame *f, int in
 
     f->avctx = s->avctx;
     f->dsp = s->dsp;
+    f->pvq = s->pvq;
     f->start_band = (s->mode == OPUS_MODE_HYBRID) ? 17 : 0;
     f->end_band = ff_celt_band_end[s->bandwidth];
     f->channels = s->channels;
@@ -1019,6 +1021,7 @@ static av_cold int opus_encode_end(AVCodecContext *avctx)
     for (i = 0; i < CELT_BLOCK_NB; i++)
         ff_mdct15_uninit(&s->mdct[i]);
 
+    ff_celt_pvq_uninit(&s->pvq);
     av_freep(&s->dsp);
     av_freep(&s->frame);
     av_freep(&s->rc);
@@ -1075,6 +1078,9 @@ static av_cold int opus_encode_init(AVCodecContext *avctx)
 
     ff_af_queue_init(avctx, &s->afq);
 
+    if ((ret = ff_celt_pvq_init(&s->pvq)) < 0)
+        return ret;
+
     if (!(s->dsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT)))
         return AVERROR(ENOMEM);