[FFmpeg-devel] [RFC][PATCH] ffmpeg: add option to transform metadata using iconv

Nicolas George george at nsup.org
Wed Sep 23 16:56:27 CEST 2015


Le primidi 1er vendémiaire, an CCXXIV, James Darnley a écrit :
> At present it only converts global metadata as that is what I wanted to do.  It
> should be possible to extend it so that the conversion can be different for
> different files or streams.
> ---
>  doc/ffmpeg.texi |   6 +++
>  ffmpeg.c        |  15 ++++++
>  ffmpeg.h        |   1 +
>  ffmpeg_opt.c    | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 169 insertions(+), 2 deletions(-)
> 

> Then I will change all instances of "code page" to "character encoding"
> even in the code, except for one in the docs which I might change to
> "character encoding (code page)" unless that will be confusing for users.

Having the words "code page" in the docs is good for users that know it by
that name. In that case, I suggest to also include "charset", which is
another incorrect way of calling it (incorrect because en encoding is not
just a set).

See the technical comments interleaved in the code.

More globally, there are several issues with the patch as is that come from
the direct use of iconv:

- it uses an optional feature, with all the code noise and ifdefry that
  implies;

- it has annoying buffer handling;

- it does not let specify the replacement character for invalid encodings.

There is already a place in lavc that uses iconv, with the exact same
drawbacks. There will possibly be other similar places later if someone
decides that vf_drawtext needs to accept legacy encodings for example.

For some time, I have had in mind a generic text encoding conversion utility
in libavutil, that would solve all this once and for all. It was rather
low-priority for me, but since the question is raised now, I can get on with
it.

I will post the API I have in mind in a separate thread.

> diff --git a/doc/ffmpeg.texi b/doc/ffmpeg.texi
> index f4ffc6c..d4c1c23 100644
> --- a/doc/ffmpeg.texi
> +++ b/doc/ffmpeg.texi
> @@ -855,6 +855,12 @@ such streams is attempted.
>  Allow input streams with unknown type to be copied instead of failing if copying
>  such streams is attempted.
>  

> + at item -metadata_iconv_code_page_list @var{code_page_list}

Can you explain the case for having a list here?

> +Force the metadata from input files to be converted through the given codepages
> +using iconv. This allows the user to correct
> + at uref{https://en.wikipedia.org/wiki/Mojibake, mojibake} providing they know the
> +correct code pages to use.
> +
>  @item -map_channel [@var{input_file_id}. at var{stream_specifier}. at var{channel_id}|-1][:@var{output_file_id}. at var{stream_specifier}]
>  Map an audio channel from a given input to an output. If
>  @var{output_file_id}. at var{stream_specifier} is not set, the audio channel will
> diff --git a/ffmpeg.c b/ffmpeg.c
> index e31a2c6..5c04571 100644
> --- a/ffmpeg.c
> +++ b/ffmpeg.c
> @@ -101,6 +101,10 @@
>  #include <pthread.h>
>  #endif
>  
> +#if CONFIG_ICONV
> +#include <iconv.h>
> +#endif
> +
>  #include <time.h>
>  
>  #include "ffmpeg.h"
> @@ -564,6 +568,17 @@ static void ffmpeg_cleanup(int ret)
>          fclose(vstats_file);
>      av_freep(&vstats_filename);
>  
> +#if CONFIG_ICONV
> +    if (metadata_iconv_contexts) {
> +        iconv_t *cd = metadata_iconv_contexts;
> +        for (i = 0; cd[i]; i++) {
> +            iconv_close(cd[i]);
> +            cd[i] = NULL;
> +        }
> +    }
> +    av_freep(&metadata_iconv_contexts);
> +#endif
> +
>      av_freep(&input_streams);
>      av_freep(&input_files);
>      av_freep(&output_streams);
> diff --git a/ffmpeg.h b/ffmpeg.h
> index 6544e6f..2b8cbd7 100644
> --- a/ffmpeg.h
> +++ b/ffmpeg.h
> @@ -495,6 +495,7 @@ extern int        nb_filtergraphs;
>  
>  extern char *vstats_filename;
>  extern char *sdp_filename;

> +extern void *metadata_iconv_contexts;

I am not very comfortable with that structure.

>  
>  extern float audio_drift_threshold;
>  extern float dts_delta_threshold;
> diff --git a/ffmpeg_opt.c b/ffmpeg_opt.c
> index 4edd118..d226f78 100644
> --- a/ffmpeg_opt.c
> +++ b/ffmpeg_opt.c
> @@ -42,6 +42,10 @@
>  #include "libavutil/pixfmt.h"
>  #include "libavutil/time_internal.h"
>  
> +#if CONFIG_ICONV
> +# include <iconv.h>
> +#endif
> +
>  #define DEFAULT_PASS_LOGFILENAME_PREFIX "ffmpeg2pass"
>  
>  #define MATCH_PER_STREAM_OPT(name, type, outvar, fmtctx, st)\
> @@ -84,6 +88,7 @@ const HWAccel hwaccels[] = {
>  
>  char *vstats_filename;
>  char *sdp_filename;
> +void *metadata_iconv_contexts;
>  
>  float audio_drift_threshold = 0.1;
>  float dts_delta_threshold   = 10;
> @@ -120,6 +125,7 @@ static int input_stream_potentially_available = 0;
>  static int ignore_unknown_streams = 0;
>  static int copy_unknown_streams = 0;
>  

> +

Stray.

>  static void uninit_options(OptionsContext *o)
>  {
>      const OptionDef *po = options;
> @@ -196,6 +202,58 @@ static AVDictionary *strip_specifiers(AVDictionary *dict)
>      return ret;
>  }
>  
> +static int opt_metadata_iconv(void *optctx, const char *opt, const char *arg)
> +{
> +#if !CONFIG_ICONV
> +    av_log(NULL, AV_LOG_ERROR, "converting metadata through codepages requires "
> +            "ffmpeg to be built with iconv support\n");

> +    return AVERROR(EINVAL);

ENOSYS?

> +#else
> +    void *temp;
> +    char **list = NULL;
> +    char *code_page_list = av_strdup(arg);
> +    char *token = av_strtok(code_page_list, ",", (char**)&temp);
> +    int i, token_count = 0;
> +    iconv_t *cd;
> +
> +    /* TODO: correct handling of memory in error case. */
> +
> +    while (token) {
> +        av_dynarray_add(&list, &token_count, token);
> +        if (!list)
> +            return AVERROR(ENOMEM);
> +        token = av_strtok(NULL, ",", (char**)&temp);
> +    }
> +
> +    if (token_count < 2) {
> +        av_log(NULL, AV_LOG_ERROR, "too few code pages (%d) in list (%s)\n", token_count, code_page_list);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cd = av_mallocz_array(sizeof(iconv_t), token_count);
> +    if (!cd)
> +        return AVERROR(ENOMEM);
> +
> +    for (i = 0; i < token_count - 1; i++) {
> +        av_log(NULL, AV_LOG_DEBUG, "Opening iconv with code pages: %s, %s\n", list[i], list[i+1]);
> +
> +        temp = iconv_open(list[i], list[i+1]);
> +        if (temp == (iconv_t)(-1)) {
> +            av_log(NULL, AV_LOG_ERROR, "error opening iconv with code pages (%s, %s)\n", list[i], list[i+1]);
> +            return AVERROR(EINVAL);
> +            /* TODO: check for other errors. */
> +        }
> +        cd[i] = temp;
> +    }
> +    metadata_iconv_contexts = cd;
> +
> +    av_freep(&code_page_list);
> +    av_freep(&list);
> +
> +    return 0;
> +#endif /* !CONFIG_ICONV */
> +}
> +
>  static int opt_sameq(void *optctx, const char *opt, const char *arg)
>  {
>      av_log(NULL, AV_LOG_ERROR, "Option '%s' was removed. "
> @@ -454,6 +512,84 @@ static void parse_meta_type(char *arg, char *type, int *index, const char **stre
>          *type = 'g';
>  }
>  
> +#if CONFIG_ICONV
> +static int run_one_iconv(iconv_t cd, char *buf1, size_t len, char *buf2, size_t buf2_len)
> +{
> +    /* TODO: maybe handle other errors. */
> +    if (iconv(cd, &buf1, &len, &buf2, &buf2_len) == (size_t)(-1)
> +            && errno == E2BIG)
> +        return E2BIG;
> +

> +    buf2[0] = 0;

May overflow.

> +    return 0;
> +}
> +
> +static int metadata_iconv_internal(AVDictionary **dst, const AVDictionary *src, int flags,
> +        iconv_t *cd)
> +{
> +    AVDictionaryEntry *t = NULL;
> +

> +    size_t BUF_LEN = 256;

Variable name in ALL CAPS is rather unusual.

> +
> +    char *buffer1 = av_realloc(NULL, BUF_LEN);
> +    char *buffer2 = av_realloc(NULL, BUF_LEN);

> +    if (!buffer1 || !buffer2)
> +        return AVERROR(ENOMEM);

Possible leak.

> +
> +    memset(buffer1, 0, BUF_LEN);
> +    memset(buffer2, 0, BUF_LEN);
> +
> +    while ((t = av_dict_get(src, "", t, AV_DICT_IGNORE_SUFFIX))) {
> +        size_t tag_len = strlen(t->value);
> +        int i;
> +

> +        while (BUF_LEN < tag_len) {
> +            buffer1 = av_realloc_f(buffer1, BUF_LEN, 2);
> +            buffer2 = av_realloc_f(buffer2, BUF_LEN, 2);
> +            if (!buffer1 || !buffer2)
> +                return AVERROR(ENOMEM);
> +            BUF_LEN *= 2;
> +        }

Reallocating in the loop is wasteful.

> +
> +        strncpy(buffer1, t->value, BUF_LEN-1);
> +
> +        for (i = 0; cd[i]; i++) {
> +            char *temp;
> +
> +            while (run_one_iconv(cd[i], buffer1, tag_len, buffer2, BUF_LEN - 1) == E2BIG) {
> +                buffer1 = av_realloc_f(buffer1, BUF_LEN, 2);
> +                buffer2 = av_realloc_f(buffer2, BUF_LEN, 2);
> +                if (!buffer1 || !buffer2)
> +                    return AVERROR(ENOMEM);
> +                BUF_LEN *= 2;
> +            }
> +
> +            tag_len = strlen(buffer2);
> +            temp = buffer1;
> +            buffer1 = buffer2;
> +            buffer2 = temp;
> +        }
> +
> +        av_dict_set(dst, t->key, buffer1, flags);
> +    }
> +
> +    return 0;
> +}
> +#endif /* CONFIG_ICONV */
> +
> +static int av_dict_copy_with_iconv(AVDictionary **dst, const AVDictionary *src, int flags,
> +        void *metadata_iconv_contexts)
> +{
> +    if (!metadata_iconv_contexts)
> +        av_dict_copy(dst, src, flags);
> +
> +#if CONFIG_ICONV
> +    return metadata_iconv_internal(dst, src, flags, metadata_iconv_contexts);
> +#endif
> +

> +    return 0;

This can not be reached -> av_assert(0);

> +}
> +
>  static int copy_metadata(char *outspec, char *inspec, AVFormatContext *oc, AVFormatContext *ic, OptionsContext *o)
>  {
>      AVDictionary **meta_in = NULL;
> @@ -2306,8 +2442,8 @@ loop_end:
>  
>      /* copy global metadata by default */
>      if (!o->metadata_global_manual && nb_input_files){
> -        av_dict_copy(&oc->metadata, input_files[0]->ctx->metadata,
> -                     AV_DICT_DONT_OVERWRITE);
> +        av_dict_copy_with_iconv(&oc->metadata, input_files[0]->ctx->metadata,
> +                     AV_DICT_DONT_OVERWRITE, metadata_iconv_contexts);
>          if(o->recording_time != INT64_MAX)
>              av_dict_set(&oc->metadata, "duration", NULL, 0);
>          av_dict_set(&oc->metadata, "creation_time", NULL, 0);
> @@ -3094,6 +3230,15 @@ const OptionDef options[] = {
>      { "map_chapters",   HAS_ARG | OPT_INT | OPT_EXPERT | OPT_OFFSET |
>                          OPT_OUTPUT,                                  { .off = OFFSET(chapters_input_file) },
>          "set chapters mapping", "input_file_index" },
> +
> +    {
> +        "metadata_iconv_code_page_list",
> +        HAS_ARG | OPT_EXPERT,
> +        { .func_arg = opt_metadata_iconv },
> +        "convert metadata through the comma-separated list of code pages",
> +        "code_page_list",
> +    },
> +
>      { "t",              HAS_ARG | OPT_TIME | OPT_OFFSET |
>                          OPT_INPUT | OPT_OUTPUT,                      { .off = OFFSET(recording_time) },
>          "record or transcode \"duration\" seconds of audio/video",

Regards,

-- 
  Nicolas George
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 819 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20150923/1931e85d/attachment.sig>


More information about the ffmpeg-devel mailing list