[FFmpeg-devel] [RFC][PATCH] ffmpeg: add option to transform metadata using iconv

Fri Sep 25 18:42:59 CEST 2015

On 2015-09-24 20:37, Nicolas George wrote:
> Le tridi 3 vendémiaire, an CCXXIV, James Darnley a écrit :
>> I don't know what to say here.  I know the encodings needed for iconv
>> because I arrived at them by brute force.  I wrote a short Lua script to
>> iterate over a list of encodings supported by my iconv and arrived at
>> this answer.  The command line tool called iconv is too clever for this
>> because it returns an error when it can't convert.  As for ending in
>> GBK, it is what the script told me.
> 
> Could you share the script and enough input to run it and reproduce the
> results?

I can.  You should find it attached to this email.  I cleaned it up and
put two test cases of data into the file.  You will need Lua and the
Lua-iconv module.  If your package manager doesn't have it, see here:
https://ittner.github.io/lua-iconv/

To run it: lua <filename>

>> This feature would not work if there was a misinterpretation in the
>> middle.  As you say that would need A->B and C->D where B != C.  Perhaps
>> this is why my solution isn't perfect, because there should be an
>> assumption in the middle.
>>
>> I could rework my code to allow for assumptions in the middle.  My case
>> would then use "CP1252,UTF-8,UTF-8,GBK" as an argument.
> 
> I must say, I do not like your approach very much because it manipulates
> text encoding in the middle of the program. All strings inside the program
> should be in UTF-8.
> 
> I can propose this: add an option "metadata_text_encoding" to
> AVFormatContext. If it is set on a demuxer, the demuxing framework uses it
> to convert from it to UTF-8; and similarly, if it is set on a muxer, the
> muxing framework uses it to convert from UTF-8 to it.
>
> Then we can have a special syntax for it to specify bogus conversions.
> Possibly: -metadata_text_encoding "[CP1252>UTF-8]GBK" to specify that the
> text must first be converted from CP1252 to UTF-8 then considered to be GBK
> (and converted to UTF-8). (Well, I consider the feature evil, so I will
> probably not volunteer to implement it, but I will not oppose as long as it
> cannot be triggered too easily by an unsuspecting user.)
> 
> What do you think of it?

As for more special syntax, I'm not a fan of it.  Handling this in the
demuxer, somewhere, might be a better idea.

-------------- next part --------------
local iconv = require('iconv')

-- Map every name in `list` through iconv.canonicalize, drop duplicates and
-- return the distinct canonical names as a sorted array.
local function canonicalize_list(list)
    local seen = {}
    local names = {}

    for _, raw_name in ipairs(list) do
        local canonical = iconv.canonicalize(raw_name)
        -- Several aliases can share one canonical name; keep it only once.
        if not seen[canonical] then
            seen[canonical] = true
            names[#names + 1] = canonical
        end
    end

    table.sort(names)
    return names
end

-- Decode a hex dump into the raw byte string it represents.
--
-- Every pair of adjacent hexadecimal digits in `str` becomes one byte.
-- Characters that are not part of such a pair (spaces, line breaks, ...) are
-- skipped, so the dump may be wrapped and indented freely.
--
-- Returns the decoded bytes as a string ('' when `str` holds no hex pair).
local function hex_string_to_bytes(str)
    -- Collect the bytes in a table and join once at the end; appending with
    -- `..` inside the loop would copy the growing string on every iteration.
    local bytes = {}
    for pair in string.gmatch(str, '%x%x') do
        bytes[#bytes + 1] = string.char(tonumber(pair, 16))
    end
    return table.concat(bytes)
end

-- Brute-force search over every (a, b, c) triple taken from `encoding_list`.
--
-- For each triple, `mojibake` is passed through iconv.new(a, b) and the
-- result through iconv.new(b, c).  Every triple whose final output contains
-- `correct` is printed to stdout as "a,b,c = <output>".
--
-- NOTE(review): the lua-iconv documentation gives the signature as
-- iconv.new(to, from), so the first step converts from b to a and the second
-- from c to b -- confirm against the installed module.
--
-- Only the first return value of cd:iconv() is used; its error code is
-- discarded, so a conversion that reports an error is still checked with
-- whatever output it produced.  Pairs for which iconv.new() returns nil
-- (no descriptor available) are skipped instead of raising an error.
--
-- `correct` is searched for as a plain substring, not as a Lua pattern.
--
-- Moderately slow, ~15sec for 143 encodings (original author's measurement).
local function run(encoding_list, mojibake, correct)
    for _, a in ipairs(encoding_list) do
        for _, b in ipairs(encoding_list) do

            -- The first step does not depend on c: do it once per (a, b).
            local first_cd = iconv.new(a, b)
            local first_pass = first_cd and first_cd:iconv(mojibake)

            if first_pass then
                for _, c in ipairs(encoding_list) do
                    local second_cd = iconv.new(b, c)
                    if second_cd then
                        local str = second_cd:iconv(first_pass)
                        if str and string.find(str, correct, 1, true) then
                            io.stdout:write(string.format('%s,%s,%s = %s\n', a, b, c, str))
                        end
                    end
                end
            end

        end
    end
end

-- Search over every (a, b) pair from `encoding_list`, with UTF-8 fixed as the
-- encoding in the middle.
--
-- For each pair, `mojibake` is passed through iconv.new(a, 'UTF-8') and the
-- result through iconv.new('UTF-8', b).  Every pair whose final output
-- contains `correct` is printed to stdout as "a,UTF-8,b = <output>".
--
-- NOTE(review): the lua-iconv documentation gives the signature as
-- iconv.new(to, from), so the first step converts from UTF-8 to a and the
-- second from b to UTF-8 -- confirm against the installed module.
--
-- Only the first return value of cd:iconv() is used; its error code is
-- discarded.  Encodings for which iconv.new() returns nil are skipped
-- instead of raising an error.
--
-- `correct` is searched for as a plain substring, not as a Lua pattern.
--
-- Very fast, ~0.1sec for 143 encodings (original author's measurement).
local function run_assume_middle_utf8(encoding_list, mojibake, correct)
    for _, a in ipairs(encoding_list) do

        -- The first step does not depend on b: do it once per a.
        local first_cd = iconv.new(a, 'UTF-8')
        local first_pass = first_cd and first_cd:iconv(mojibake)

        if first_pass then
            for _, b in ipairs(encoding_list) do
                local second_cd = iconv.new('UTF-8', b)
                if second_cd then
                    local str = second_cd:iconv(first_pass)
                    if str and string.find(str, correct, 1, true) then
                        io.stdout:write(string.format('%s,UTF-8,%s = %s\n', a, b, str))
                    end
                end
            end
        end

    end
end

-- Brute-force search over every (a, b, c, d) quadruple from `encoding_list`,
-- i.e. two independent conversions with no shared encoding in the middle.
--
-- For each quadruple, `mojibake` is passed through iconv.new(a, b) and the
-- result through iconv.new(c, d).  Every quadruple whose final output
-- contains `correct` is printed to stdout as "a,b_c,d = <output>".
--
-- NOTE(review): the lua-iconv documentation gives the signature as
-- iconv.new(to, from), so the first step converts from b to a and the second
-- from d to c -- confirm against the installed module.
--
-- Only the first return value of cd:iconv() is used; its error code is
-- discarded.  Pairs for which iconv.new() returns nil are skipped instead of
-- raising an error.
--
-- `correct` is searched for as a plain substring, not as a Lua pattern.
--
-- Very slow, many minutes for 143 encodings (original author's measurement).
local function run_assume_middle_random(encoding_list, mojibake, correct)
    for _, a in ipairs(encoding_list) do
        for _, b in ipairs(encoding_list) do

            -- The first step does not depend on c or d: do it once per (a, b).
            local first_cd = iconv.new(a, b)
            local first_pass = first_cd and first_cd:iconv(mojibake)

            if first_pass then
                for _, c in ipairs(encoding_list) do
                    for _, d in ipairs(encoding_list) do
                        local second_cd = iconv.new(c, d)
                        if second_cd then
                            local str = second_cd:iconv(first_pass)
                            if str and string.find(str, correct, 1, true) then
                                io.stdout:write(string.format('%s,%s_%s,%s = %s\n', a, b, c, d, str))
                            end
                        end
                    end
                end
            end

        end
    end
end

-- Main program

-- Build the list of encodings the search functions iterate over.
--
-- NOTE(review): `force_internal_list` is hard-wired to true, so the built-in
-- list below is always used, the notice is always printed, and the branch
-- that asks the iconv module for its own list is never taken.
local encoding_list
local force_internal_list = true

if force_internal_list or not iconv.list or not iconv.canonicalize then
    io.stdout:write(
        'The iconv module does not support the list or canonicalize functions so '
        .. 'this tool will use an internal list of character encodings.\n')

    -- Same entries, same order as before; only regrouped by family.
    encoding_list = {
        'ARMSCII-8', 'ATARIST',
        'BIG5', 'BIG5-2003', 'BIG5-HKSCS',
        'BIG5-HKSCS:1999', 'BIG5-HKSCS:2001', 'BIG5-HKSCS:2004',
        'C99',
        'CP1046', 'CP1124', 'CP1125', 'CP1129', 'CP1131', 'CP1133',
        'CP1161', 'CP1162', 'CP1163',
        'CP1250', 'CP1251', 'CP1252', 'CP1253', 'CP1254', 'CP1255',
        'CP1256', 'CP1257', 'CP1258',
        'CP437', 'CP737', 'CP775',
        'CP850', 'CP852', 'CP853', 'CP855', 'CP856', 'CP857', 'CP858',
        'CP860', 'CP861', 'CP862', 'CP863', 'CP864', 'CP865', 'CP866',
        'CP869', 'CP874',
        'CP922', 'CP932', 'CP936', 'CP943', 'CP949', 'CP950',
        'DEC-HANYU', 'DEC-KANJI',
        'EUC-CN', 'EUC-JISX0213', 'EUC-JP', 'EUC-KR', 'EUC-TW',
        'GB18030', 'GBK', 'GB_1988-80', 'GB_2312-80',
        'GEORGIAN-ACADEMY', 'GEORGIAN-PS',
        'HP-ROMAN8', 'HZ',
        'ISO-2022-CN', 'ISO-2022-CN-EXT',
        'ISO-2022-JP', 'ISO-2022-JP-1', 'ISO-2022-JP-2', 'ISO-2022-JP-3',
        'ISO-2022-KR',
        'ISO-8859-1', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-13',
        'ISO-8859-14', 'ISO-8859-15', 'ISO-8859-16',
        'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4', 'ISO-8859-5',
        'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9',
        'ISO-IR-165',
        'JAVA', 'JIS_C6220-1969-RO', 'JIS_X0201', 'JIS_X0208', 'JIS_X0212',
        'JOHAB',
        'KOI8-R', 'KOI8-RU', 'KOI8-T', 'KOI8-U', 'KSC_5601',
        'MACARABIC', 'MACCENTRALEUROPE', 'MACCROATIAN', 'MACCYRILLIC',
        'MACGREEK', 'MACHEBREW', 'MACICELAND', 'MACROMAN', 'MACROMANIA',
        'MACTHAI', 'MACTURKISH', 'MACUKRAINE',
        'MULELAO-1', 'NEXTSTEP', 'PT154', 'RISCOS-LATIN1', 'RK1048',
        'SHIFT_JIS', 'SHIFT_JISX0213',
        'TCVN', 'TDS565', 'TIS-620',
        'UCS-2', 'UCS-2-INTERNAL', 'UCS-2-SWAPPED', 'UCS-2BE', 'UCS-2LE',
        'UCS-4', 'UCS-4-INTERNAL', 'UCS-4-SWAPPED', 'UCS-4BE', 'UCS-4LE',
        'US-ASCII',
        'UTF-16', 'UTF-16BE', 'UTF-16LE',
        'UTF-32', 'UTF-32BE', 'UTF-32LE',
        'UTF-7', 'UTF-8',
        'VISCII',
    }
else
    encoding_list = canonicalize_list(iconv.list())
end

-- Built-in test cases, keyed by a short label.
-- Each value is a two-element array:
--   [1] the mojibake as a hex dump.  The main loop decodes it with
--       hex_string_to_bytes, which only picks out pairs of hex digits, so the
--       line breaks and indentation inside the long string are ignored;
--   [2] text that a successfully repaired string must contain.
local mojibake_correct_pairs = {
    bcc_14 = { [[
        C38AC2AFC2B4C2A8C393C2A2C380C389
        20617320C2B8C2A1C396C3B1C38AC2AE
        C38BC384C380C3893FC38EC3B7C2B4C3
        A5C2A4C381C2A4C38AC2A4C39F206173
        20C2BBC2A2C28FC398C387C3A5C392C3
        B43FC39F68C2BDC3BCC390C2A2C392C2
        BB20617320C390C2A1C2B4C2BBC38FC3
        89C38CC2ABC380C389]], '一' },
    bcc_15 = { [[
        C395C39BC3B3C392C2B8C2BBC383C380
        C397C39320617320C390C3A0C384C2BE
        C2A5C3ABC2A5C2ADC2A5C2A23FC38BC3
        89C592C3B9C393C389C3994620617320
        C2BEC2AEC389C38FC2BFE28094C5A0C2
        AA]], '姫'},
    }

-- Run the fast UTF-8-in-the-middle search for every built-in test case:
-- decode the hex dump, announce what is being searched for, then search.
-- pairs() is used, so the order in which the cases run is unspecified.
for _, case in pairs(mojibake_correct_pairs) do
    local hex_dump, expected = case[1], case[2]
    local garbled = hex_string_to_bytes(hex_dump)

    local banner = string.format(
        '\nTrying to find the codepages that will transform:\n\t%s\n'
        .. 'So that it contains:\n\t%s\n', garbled, expected)
    io.stdout:write(banner)

    run_assume_middle_utf8(encoding_list, garbled, expected)
end

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 603 bytes
Desc: OpenPGP digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20150925/9c153979/attachment.sig>