[FFmpeg-devel] [RFC][PATCH] ffmpeg: add option to transform metadata using iconv
James Darnley
james.darnley at gmail.com
Fri Sep 25 18:42:59 CEST 2015
On 2015-09-24 20:37, Nicolas George wrote:
> Le tridi 3 vendémiaire, an CCXXIV, James Darnley a écrit :
>> I don't know what to say here. I know the encodings needed for iconv
>> because I arrived at them by brute force. I wrote a short Lua script to
>> iterate over a list of encodings supported by my iconv and arrived at
>> this answer. The command line tool called iconv is too clever for this
>> because it returns an error when it can't convert. As for ending in
>> GBK, it is what the script told me.
>
> Could you share the script and enough input to run it and reproduce the
> results?
I can. You should find it attached to this email. I cleaned it up and
put two test cases of data into the file. You will need Lua and the
Lua-iconv module. If your package manager doesn't have that see here:
https://ittner.github.io/lua-iconv/
To run it: lua <filename>
>> This feature would not work if there was a misinterpretation in the
>> middle. As you say that would need A->B and C->D where B != C. Perhaps
>> this is why my solution isn't perfect, because there should be an
>> assumption in the middle.
>>
>> I could rework my code to allow for assumptions in the middle. My case
>> would then use "CP1252,UTF-8,UTF-8,GBK" as an argument.
>
> I must say, I do not like your approach very much because it manipulates
> text encoding in the middle of the program. All strings inside the program
> should be in UTF-8.
>
> I can propose this: add an option "metadata_text_encoding" to
> AVFormatContext. If it is set on a demuxer, the demuxing framework uses it
> to convert from it to UTF-8; and similarly, if it is set on a muxer, the
> muxing framework uses it to convert from UTF-8 to it.
>
> Then we can have a special syntax for it to specify bogus conversions.
> Possibly: -metadata_text_encoding "[CP1252>UTF-8]GBK" to specify that the
> text must first be converted from CP1252 to UTF-8 then considered to be GBK
> (and converted to UTF-8). (Well, I consider the feature evil, so I will
> probably not volunteer to implement it, but I will not oppose as long as it
> cannot be triggered too easily by an unsuspecting user.)
>
> What do you think of it?
As for more special syntax, I'm not a fan of it. Handling this in the
demuxer, somewhere, might be a better idea.
-------------- next part --------------
local iconv = require('iconv')
--- Build a sorted, duplicate-free list of canonical encoding names.
-- Every entry of `list` is passed through iconv.canonicalize(); entries that
-- canonicalize to the same value collapse into a single result.
-- @param list sequence of encoding names (traversed with ipairs)
-- @return a new sequence holding each distinct canonical name once, ordered
--         by table.sort's default comparison
local function canonicalize_list(list)
    local seen = {}
    local names = {}
    for _, name in ipairs(list) do
        local canonical = iconv.canonicalize(name)
        -- Record each canonical name only the first time it shows up.
        if not seen[canonical] then
            seen[canonical] = true
            names[#names + 1] = canonical
        end
    end
    table.sort(names)
    return names
end
--- Decode a string of hexadecimal digit pairs into raw bytes.
-- Each run of two consecutive hex digits becomes one byte. Any other
-- character (spaces, line breaks, ...) is skipped, and so is a hex digit that
-- has no partner directly after it.
-- @param str text containing hex digit pairs
-- @return string of decoded bytes (empty when no pair is found)
local function hex_string_to_bytes(str)
    -- Gather the bytes in a table and join once at the end; appending to a
    -- string inside the loop would copy the whole result on every iteration.
    local bytes = {}
    for pair in string.gmatch(str, '%x%x') do
        bytes[#bytes + 1] = string.char(tonumber(pair, 16))
    end
    return table.concat(bytes)
end
-- Moderately slow (author's note: ~15sec for 143 encodings).
--- Brute-force search over every (a, b, c) triple of encodings.
-- For each triple the mojibake is pushed through two conversions created as
-- iconv.new(a, b) and iconv.new(b, c), and the triple is printed when the
-- result contains `correct`.
--
-- NOTE: lua-iconv documents the constructor as iconv.new(to, from), so the
-- first stage converts FROM b TO a and the second FROM c TO b. The printed
-- "a,b,c" line lists the names in constructor-argument order.
--
-- Conversion errors reported by cd:iconv() are deliberately ignored: whatever
-- output was produced is still tested, because a partially converted string
-- can be enough to identify the right encodings.
-- @param encoding_list sequence of encoding names to try
-- @param mojibake the garbled byte string to repair
-- @param correct text the repaired string must contain (matched literally)
local function run(encoding_list, mojibake, correct)
    for _, a in ipairs(encoding_list) do
        for _, b in ipairs(encoding_list) do
            -- The first stage depends only on (a, b); do it once per pair
            -- instead of once per triple.
            local first = iconv.new(a, b)
            -- iconv.new yields nil when the pair is unsupported: skip it
            -- rather than crash on indexing nil.
            if first then
                local intermediate = first:iconv(mojibake)
                for _, c in ipairs(encoding_list) do
                    local second = iconv.new(b, c)
                    if second then
                        local str = second:iconv(intermediate)
                        -- Plain find: `correct` is literal text, not a pattern.
                        if string.find(str, correct, 1, true) then
                            io.stdout:write(string.format('%s,%s,%s = %s\n', a, b, c, str))
                        end
                    end
                end
            end
        end
    end
end
-- Very fast (author's note: ~0.1sec for 143 encodings).
--- Search every (a, b) pair of encodings with UTF-8 fixed as the pivot.
-- The mojibake goes through iconv.new(a, 'UTF-8') and then
-- iconv.new('UTF-8', b); the pair is printed when the result contains
-- `correct`.
--
-- NOTE: lua-iconv documents the constructor as iconv.new(to, from), so the
-- first stage converts FROM UTF-8 TO a (recovering the bytes that were
-- misread as encoding a) and the second converts FROM b TO UTF-8
-- (reinterpreting those bytes as encoding b).
--
-- Conversion errors reported by cd:iconv() are deliberately ignored; the
-- partial output is still tested.
-- @param encoding_list sequence of encoding names to try
-- @param mojibake the garbled byte string to repair
-- @param correct text the repaired string must contain (matched literally)
local function run_assume_middle_utf8(encoding_list, mojibake, correct)
    for _, a in ipairs(encoding_list) do
        -- The first stage depends only on a; run it once per outer iteration.
        local to_a = iconv.new(a, 'UTF-8')
        -- iconv.new yields nil for an unsupported conversion: skip it rather
        -- than crash on indexing nil.
        if to_a then
            local intermediate = to_a:iconv(mojibake)
            for _, b in ipairs(encoding_list) do
                local from_b = iconv.new('UTF-8', b)
                if from_b then
                    local str = from_b:iconv(intermediate)
                    -- Plain find: `correct` is literal text, not a pattern.
                    if string.find(str, correct, 1, true) then
                        io.stdout:write(string.format('%s,UTF-8,%s = %s\n', a, b, str))
                    end
                end
            end
        end
    end
end
-- Very slow (author's note: many minutes for 143 encodings).
--- Search every (a, b, c, d) quadruple of encodings, allowing the two
-- conversions to disagree about the intermediate encoding.
-- The mojibake goes through iconv.new(a, b) and then iconv.new(c, d); the
-- quadruple is printed when the result contains `correct`.
--
-- NOTE: lua-iconv documents the constructor as iconv.new(to, from), so the
-- first stage converts FROM b TO a and the second FROM d TO c. The printed
-- "a,b_c,d" line lists the names in constructor-argument order.
--
-- Conversion errors reported by cd:iconv() are deliberately ignored; the
-- partial output is still tested.
-- @param encoding_list sequence of encoding names to try
-- @param mojibake the garbled byte string to repair
-- @param correct text the repaired string must contain (matched literally)
local function run_assume_middle_random(encoding_list, mojibake, correct)
    for _, a in ipairs(encoding_list) do
        for _, b in ipairs(encoding_list) do
            -- The first stage depends only on (a, b); do it once per pair
            -- instead of once per quadruple.
            local first = iconv.new(a, b)
            -- iconv.new yields nil when the pair is unsupported: skip it
            -- rather than crash on indexing nil.
            if first then
                local intermediate = first:iconv(mojibake)
                for _, c in ipairs(encoding_list) do
                    for _, d in ipairs(encoding_list) do
                        local second = iconv.new(c, d)
                        if second then
                            local str = second:iconv(intermediate)
                            -- Plain find: `correct` is literal text, not a pattern.
                            if string.find(str, correct, 1, true) then
                                io.stdout:write(string.format('%s,%s_%s,%s = %s\n', a, b, c, d, str))
                            end
                        end
                    end
                end
            end
        end
    end
end
-- Main program
-- Choose the encodings to search. When the loaded iconv module provides both
-- `list` and `canonicalize`, ask it for the encodings it supports; otherwise
-- fall back to the built-in list below and say so.
-- (The condition previously began with `true or`, which forced the fallback
-- unconditionally and printed the "does not support" notice even when the
-- module did provide both functions.)
local encoding_list = {}
if not iconv.list or not iconv.canonicalize then
    io.stdout:write(
        'The iconv module does not support the list or canonicalize functions so '
        .. 'this tool will use an internal list of character encodings.\n')
    encoding_list = { "ARMSCII-8", "ATARIST", "BIG5", "BIG5-2003", "BIG5-HKSCS",
        "BIG5-HKSCS:1999", "BIG5-HKSCS:2001", "BIG5-HKSCS:2004", "C99", "CP1046",
        "CP1124", "CP1125", "CP1129", "CP1131", "CP1133", "CP1161", "CP1162",
        "CP1163", "CP1250", "CP1251", "CP1252", "CP1253", "CP1254", "CP1255",
        "CP1256", "CP1257", "CP1258", "CP437", "CP737", "CP775", "CP850", "CP852",
        "CP853", "CP855", "CP856", "CP857", "CP858", "CP860", "CP861", "CP862",
        "CP863", "CP864", "CP865", "CP866", "CP869", "CP874", "CP922", "CP932",
        "CP936", "CP943", "CP949", "CP950", "DEC-HANYU", "DEC-KANJI", "EUC-CN",
        "EUC-JISX0213", "EUC-JP", "EUC-KR", "EUC-TW", "GB18030", "GBK",
        "GB_1988-80", "GB_2312-80", "GEORGIAN-ACADEMY", "GEORGIAN-PS", "HP-ROMAN8",
        "HZ", "ISO-2022-CN", "ISO-2022-CN-EXT", "ISO-2022-JP", "ISO-2022-JP-1",
        "ISO-2022-JP-2", "ISO-2022-JP-3", "ISO-2022-KR", "ISO-8859-1",
        "ISO-8859-10", "ISO-8859-11", "ISO-8859-13", "ISO-8859-14", "ISO-8859-15",
        "ISO-8859-16", "ISO-8859-2", "ISO-8859-3", "ISO-8859-4", "ISO-8859-5",
        "ISO-8859-6", "ISO-8859-7", "ISO-8859-8", "ISO-8859-9", "ISO-IR-165",
        "JAVA", "JIS_C6220-1969-RO", "JIS_X0201", "JIS_X0208", "JIS_X0212", "JOHAB",
        "KOI8-R", "KOI8-RU", "KOI8-T", "KOI8-U", "KSC_5601", "MACARABIC",
        "MACCENTRALEUROPE", "MACCROATIAN", "MACCYRILLIC", "MACGREEK", "MACHEBREW",
        "MACICELAND", "MACROMAN", "MACROMANIA", "MACTHAI", "MACTURKISH",
        "MACUKRAINE", "MULELAO-1", "NEXTSTEP", "PT154", "RISCOS-LATIN1", "RK1048",
        "SHIFT_JIS", "SHIFT_JISX0213", "TCVN", "TDS565", "TIS-620", "UCS-2",
        "UCS-2-INTERNAL", "UCS-2-SWAPPED", "UCS-2BE", "UCS-2LE", "UCS-4",
        "UCS-4-INTERNAL", "UCS-4-SWAPPED", "UCS-4BE", "UCS-4LE", "US-ASCII",
        "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE", "UTF-7",
        "UTF-8", "VISCII" }
else
    encoding_list = canonicalize_list(iconv.list())
end
-- Test cases, keyed by a case name. Each value is a two-element table:
--   [1] the garbled text written as hex digit pairs; it is decoded with
--       hex_string_to_bytes before use, and that decoder skips non-hex
--       characters such as the line breaks inside the long strings;
--   [2] text that a correctly repaired result is expected to contain.
-- NOTE(review): the meaning of the key names (bcc_14, bcc_15) is not visible
-- here; the keys are never read by the driver loop.
local mojibake_correct_pairs = {
bcc_14 = { [[
C38AC2AFC2B4C2A8C393C2A2C380C389
20617320C2B8C2A1C396C3B1C38AC2AE
C38BC384C380C3893FC38EC3B7C2B4C3
A5C2A4C381C2A4C38AC2A4C39F206173
20C2BBC2A2C28FC398C387C3A5C392C3
B43FC39F68C2BDC3BCC390C2A2C392C2
BB20617320C390C2A1C2B4C2BBC38FC3
89C38CC2ABC380C389]], '一' },
bcc_15 = { [[
C395C39BC3B3C392C2B8C2BBC383C380
C397C39320617320C390C3A0C384C2BE
C2A5C3ABC2A5C2ADC2A5C2A23FC38BC3
89C592C3B9C393C389C3994620617320
C2BEC2AEC389C38FC2BFE28094C5A0C2
AA]], '姫'},
}
do
    -- Run the search for every test case. pairs() visits keys in an
    -- unspecified order, so the case names are collected and sorted first to
    -- make the output order reproducible from run to run.
    local case_names = {}
    for name in pairs(mojibake_correct_pairs) do
        case_names[#case_names + 1] = name
    end
    table.sort(case_names)

    for _, name in ipairs(case_names) do
        local tbl = mojibake_correct_pairs[name]
        -- tbl[1] is the hex-encoded mojibake, tbl[2] the text to look for.
        local mojibake = hex_string_to_bytes(tbl[1])
        local correct = tbl[2]
        io.stdout:write(string.format(
            '\nTrying to find the codepages that will transform:\n\t%s\n'
            .. 'So that it contains:\n\t%s\n', mojibake, correct))
        run_assume_middle_utf8(encoding_list, mojibake, correct)
    end
end
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 603 bytes
Desc: OpenPGP digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20150925/9c153979/attachment.sig>
More information about the ffmpeg-devel
mailing list