[FFmpeg-devel] One pass volume normalization (ebur128)

Nicolas George nicolas.george at normalesup.org
Sat Jul 13 21:41:52 CEST 2013


Le quintidi 25 messidor, an CCXXI, Jan Ehrhardt a écrit :
> Subject: [FFmpeg-devel] One pass volume normalization (ebur128)

Single-pass volume normalization is not possible; please do not call the
feature that way.

> I am once again proposing a patch for one pass volume normalization based
> on ebur128, as I see this still did not make it into FFMpeg 2.0. My
> patch is heavily based on Clément Bœsch's proposal in
> http://permalink.gmane.org/gmane.comp.video.ffmpeg.devel/159978
> 
> We have been using this patch now for more than 4 months and 1800+
> videos of approximately 1 hour have been transcoded with it.
> 
> Part of our FFMpeg commandline reads as
> -filter_complex \
>     "[0:v]setpts=PTS-STARTPTS[v0];[0:a]asetpts=PTS-STARTPTS,ebur128=metadata=1,volume=metadata=lavfi.r128.I,ebur128[a0]" \
>    -map [v0] -map [a0]

r128.I is not a good choice, but there is nothing better yet.

> 
> It uses the already present ebur128 metadata injection to adjust the
> volume on the fly. What would be the objection to moving this into the
> FFMpeg core, so I do not have to patch my FFMpeg every time I compile
> a new one? I applied the patch below to FFMpeg Release/v.2.0.
> 
> Jan
> 
> 
> diff --git a/libavfilter/af_volume.c b/libavfilter/af_volume.c
> index a2ac1e2..6372bb2 100644
> --- a/libavfilter/af_volume.c
> +++ b/libavfilter/af_volume.c

Missing documentation update.

> @@ -51,18 +51,24 @@ static const AVOption volume_options[] = {
>          { "fixed",  "select 8-bit fixed-point",     0, AV_OPT_TYPE_CONST, { .i64 = PRECISION_FIXED  }, INT_MIN, INT_MAX, A|F, "precision" },
>          { "float",  "select 32-bit floating-point", 0, AV_OPT_TYPE_CONST, { .i64 = PRECISION_FLOAT  }, INT_MIN, INT_MAX, A|F, "precision" },
>          { "double", "select 64-bit floating-point", 0, AV_OPT_TYPE_CONST, { .i64 = PRECISION_DOUBLE }, INT_MIN, INT_MAX, A|F, "precision" },

> +    { "metadata", "set the metadata key for loudness normalization", OFFSET(metadata), AV_OPT_TYPE_STRING, { .str = NULL }, .flags = A|F },

Inconsistent indentation.

>      { NULL },
>  };
>  
>  AVFILTER_DEFINE_CLASS(volume);
>  
> +static void set_fixed_volume(VolumeContext *vol, double volume)
> +{
> +    vol->volume_i = (int)(volume * 256 + 0.5);
> +    vol->volume   = vol->volume_i / 256.0;
> +}
> +
>  static av_cold int init(AVFilterContext *ctx)
>  {
>      VolumeContext *vol = ctx->priv;
>  
>      if (vol->precision == PRECISION_FIXED) {
> -        vol->volume_i = (int)(vol->volume * 256 + 0.5);
> -        vol->volume   = vol->volume_i / 256.0;
> +        set_fixed_volume(vol, vol->volume);
>          av_log(ctx, AV_LOG_VERBOSE, "volume:(%d/256)(%f)(%1.2fdB) precision:fixed\n",
>                 vol->volume_i, vol->volume, 20.0*log(vol->volume)/M_LN10);
>      } else {
> @@ -171,13 +177,13 @@ static av_cold void volume_init(VolumeContext *vol)
>  
>      switch (av_get_packed_sample_fmt(vol->sample_fmt)) {
>      case AV_SAMPLE_FMT_U8:
> -        if (vol->volume_i < 0x1000000)
> +        if (vol->volume_i < 0x1000000 && !vol->metadata)
>              vol->scale_samples = scale_samples_u8_small;
>          else
>              vol->scale_samples = scale_samples_u8;
>          break;
>      case AV_SAMPLE_FMT_S16:
> -        if (vol->volume_i < 0x10000)
> +        if (vol->volume_i < 0x10000 && !vol->metadata)
>              vol->scale_samples = scale_samples_s16_small;
>          else
>              vol->scale_samples = scale_samples_s16;
> @@ -216,11 +222,30 @@ static int config_output(AVFilterLink *outlink)
>  
>  static int filter_frame(AVFilterLink *inlink, AVFrame *buf)
>  {
> -    VolumeContext *vol    = inlink->dst->priv;
> -    AVFilterLink *outlink = inlink->dst->outputs[0];
> +    AVFilterContext *ctx  = inlink->dst;
> +    VolumeContext *vol    = ctx->priv;
> +    AVFilterLink *outlink = ctx->outputs[0];
>      int nb_samples        = buf->nb_samples;
>      AVFrame *out_buf;
>  
> +    if (vol->metadata) {
> +        double loudness, new_volume, timestamp, mx;
> +        AVDictionaryEntry *e;
> +        mx = 20; 
> +        timestamp = (float)(1.0 * buf->pts / outlink->sample_rate);
> +        mx = fmin(mx, timestamp);
> +        e = av_dict_get(buf->metadata, vol->metadata, NULL, 0);
> +        if (e) {
> +            loudness = av_strtod(e->value, NULL);

> +            if (loudness > -69) {
> +                new_volume = fmax(-mx,fmin(mx,(-23 - loudness)));
> +                av_log(NULL, AV_LOG_VERBOSE, "loudness=%f => %f => volume=%f\n",
> +                    loudness, new_volume, pow(10, new_volume / 20));
> +                set_fixed_volume(vol, pow(10, new_volume / 20));
> +            }

This paragraph has several problems. First, it is missing spaces around
words; that is easy to fix.

Second, it has a duplicated mathematical formula, which is pretty much a
recipe for inconsistency. That is easy to fix too.

Third, it has several hardcoded values, and that is not good design.

It seems to me that using an expression, evaluated each time the metadata
value changes and with that value available as a variable, would be a much
nicer design.

> +        }
> +    }
> +
>      if (vol->volume == 1.0 || vol->volume_i == 256)
>          return ff_filter_frame(outlink, buf);
>  
> @@ -269,6 +294,12 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *buf)
>      return ff_filter_frame(outlink, out_buf);
>  }
>  

> +static av_cold void uninit(AVFilterContext *ctx)
> +{
> +    VolumeContext *vol = ctx->priv;
> +    av_opt_free(vol);
> +}

AFAIK, this is unneeded since the "evil plan".

> +
>  static const AVFilterPad avfilter_af_volume_inputs[] = {
>      {
>          .name           = "default",
> @@ -294,6 +325,7 @@ AVFilter avfilter_af_volume = {
>      .priv_size      = sizeof(VolumeContext),
>      .priv_class     = &volume_class,
>      .init           = init,
> +    .uninit         = uninit,
>      .inputs         = avfilter_af_volume_inputs,
>      .outputs        = avfilter_af_volume_outputs,
>      .flags          = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC,
> diff --git a/libavfilter/af_volume.h b/libavfilter/af_volume.h
> index bd7932e..4deca9c 100644
> --- a/libavfilter/af_volume.h
> +++ b/libavfilter/af_volume.h
> @@ -48,6 +48,7 @@ typedef struct VolumeContext {
>      void (*scale_samples)(uint8_t *dst, const uint8_t *src, int nb_samples,
>                            int volume);
>      int samples_align;
> +    char *metadata;
>  } VolumeContext;
>  
>  void ff_volume_init_x86(VolumeContext *vol);

> diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
> index 88d37e8..f4ce6d9 100644
> --- a/libavfilter/f_ebur128.c
> +++ b/libavfilter/f_ebur128.c

Unrelated.

> @@ -410,7 +410,7 @@ static av_cold int init(AVFilterContext *ctx)
>  
>      if (ebur128->loglevel != AV_LOG_INFO &&
>          ebur128->loglevel != AV_LOG_VERBOSE) {

> -        if (ebur128->do_video || ebur128->metadata)
> +        if (ebur128->do_video)
>              ebur128->loglevel = AV_LOG_VERBOSE;
>          else
>              ebur128->loglevel = AV_LOG_INFO;
> @@ -689,7 +689,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
>                  SET_META("LRA.high", ebur128->lra_high);
>              }
>  
> -            av_log(ctx, ebur128->loglevel, "t: %-10s " LOG_FMT "\n",
> +            av_log(ctx, ebur128->metadata || !ebur128->do_video ? AV_LOG_VERBOSE : ebur128->loglevel, "t: %-10s " LOG_FMT "\n",
>                     av_ts2timestr(pts, &outlink->time_base),
>                     loudness_400, loudness_3000,
>                     ebur128->integrated_loudness, ebur128->loudness_range);
> diff --git a/libavfilter/x86/af_volume_init.c b/libavfilter/x86/af_volume_init.c
> index 81d605f..fab5a03 100644
> --- a/libavfilter/x86/af_volume_init.c
> +++ b/libavfilter/x86/af_volume_init.c
> @@ -39,7 +39,7 @@ av_cold void ff_volume_init_x86(VolumeContext *vol)
>      enum AVSampleFormat sample_fmt = av_get_packed_sample_fmt(vol->sample_fmt);
>  
>      if (sample_fmt == AV_SAMPLE_FMT_S16) {
> -        if (EXTERNAL_SSE2(mm_flags) && vol->volume_i < 32768) {
> +        if (EXTERNAL_SSE2(mm_flags) && vol->volume_i < 32768 && !vol->metadata) {
>              vol->scale_samples = ff_scale_samples_s16_sse2;
>              vol->samples_align = 8;
>          }

Regards,

-- 
  Nicolas George
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130713/ffcf7984/attachment.asc>


More information about the ffmpeg-devel mailing list