[FFmpeg-devel] [PATCH 1/2] lavu: add text_file API.
Nicolas George
nicolas.george at normalesup.org
Thu Aug 8 14:08:43 CEST 2013
TODO: version bump, APIChanges entry, !HAVE_ICONV path.
Signed-off-by: Nicolas George <nicolas.george at normalesup.org>
---
libavutil/Makefile | 1 +
libavutil/text_file.c | 262 +++++++++++++++++++++++++++++++++++++++++++++++++
libavutil/text_file.h | 181 ++++++++++++++++++++++++++++++++++
3 files changed, 444 insertions(+)
create mode 100644 libavutil/text_file.c
create mode 100644 libavutil/text_file.h
diff --git a/libavutil/Makefile b/libavutil/Makefile
index 21746f0..7d59a73 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -107,6 +107,7 @@ OBJS = adler32.o \
samplefmt.o \
sha.o \
sha512.o \
+ text_file.o \
time.o \
timecode.o \
tree.o \
diff --git a/libavutil/text_file.c b/libavutil/text_file.c
new file mode 100644
index 0000000..e5f8b78
--- /dev/null
+++ b/libavutil/text_file.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2013 Nicolas George
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avassert.h"
+#include "avstring.h"
+#include "bprint.h"
+#include "text_file.h"
+
+#include <iconv.h>
+
+#define COPY_FROM_USER(var) \
+ av_assert0(var ## _user->struct_size <= sizeof(var)); \
+ memcpy(&var, var ## _user, var ## _user->struct_size);
+#define COPY_TO_USER(var) \
+ memcpy(var ## _user, &var, var ## _user->struct_size);
+
+/*
+ * Known byte order marks and the iconv encoding name each one implies.
+ *
+ * The table is scanned in order and the first match wins, so a mark that is
+ * a prefix of another one must come after it (UCS-4LE before UTF-16LE).
+ * Entry 0 must stay the UTF-8 mark: text_file_process() uses
+ * byte_order_marks[0] to strip the BOM from the recoded text.
+ *
+ * The encoding name is plain char so it can be handed to APIs expecting
+ * const char * without a pointer-signedness mismatch.
+ */
+static const struct {
+    char encoding[9];       /* encoding name, NUL-terminated */
+    unsigned char bom[4];   /* raw BOM bytes (no terminator when 4 long) */
+    unsigned char len;      /* number of significant bytes in bom */
+} byte_order_marks[] = {
+    { "UTF-8",    "\xef\xbb\xbf",     3 },
+    { "UCS-4BE",  "\x00\x00\xfe\xff", 4 },
+    { "UCS-4LE",  "\xff\xfe\x00\x00", 4 },
+    { "UTF-16BE", "\xfe\xff",         2 },
+    { "UTF-16LE", "\xff\xfe",         2 },
+};
+
+/*
+ * Encodings tried, in this order, by guess_encoding() when the caller did
+ * not supply a list and no byte order mark was recognized.
+ * The list is terminated by NULL.
+ */
+static const char *const default_encodings[] = {
+ "UTF-8",
+ "US-ASCII",
+ "WINDOWS-1252",
+ "ISO-8859-1",
+ NULL
+};
+
+/**
+ * Try to recode tf->full_data from the given encoding to UTF-8.
+ *
+ * On success the old full_data buffer is freed and replaced by the recoded,
+ * 0-terminated buffer; full_data_size and encoding are updated.
+ * On failure tf is left untouched.
+ *
+ * @param tf        text file whose full_data / full_data_size are converted
+ * @param encoding  iconv name of the source encoding; the pointer itself is
+ *                  stored in tf->encoding, so it must outlive tf
+ * @return 0 on success, AVERROR(errno) if the converter cannot be opened,
+ *         AVERROR(ENOMEM) if the output cannot grow, AVERROR_INVALIDDATA if
+ *         the data is not valid in this encoding
+ */
+static int try_encoding(AVTextFile *tf, const char *encoding)
+{
+    iconv_t cd;
+    AVBPrint bp;
+    char *inbuf, *outbuf, *recoded;
+    size_t insize, outsize;
+    unsigned outsize_int;
+    int ret = 0;
+
+    if ((cd = iconv_open("UTF-8", encoding)) == (iconv_t)-1)
+        return AVERROR(errno);
+    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
+    inbuf  = (char *)tf->full_data;
+    insize = tf->full_data_size;
+    for (;;) {
+        /* once all input is consumed, one more call with a NULL input
+         * lets iconv emit anything it still holds (shift sequences) */
+        const int flushing = !insize;
+        const size_t insize_orig = insize;
+        size_t res;
+
+        av_bprint_get_buffer(&bp, 512, (unsigned char **)&outbuf, &outsize_int);
+        if (outsize_int <= 1) {
+            ret = AVERROR(ENOMEM);
+            break;
+        }
+        outsize_int--; /* keep one byte for the final terminator */
+        outsize = outsize_int;
+        res = flushing ? iconv(cd, NULL, NULL, &outbuf, &outsize)
+                       : iconv(cd, &inbuf, &insize, &outbuf, &outsize);
+        bp.len += outsize_int - outsize;
+        if (res == (size_t)-1) {
+            /* EILSEQ / EINVAL: the data does not fit this encoding */
+            if (errno != E2BIG) {
+                ret = AVERROR_INVALIDDATA;
+                break;
+            }
+            /* E2BIG with no progress at all: the buffer could not grow */
+            if (insize == insize_orig && outsize == outsize_int) {
+                ret = AVERROR(ENOMEM);
+                break;
+            }
+            continue;
+        }
+        if (flushing)
+            break;
+    }
+    iconv_close(cd);
+    if (ret < 0) {
+        av_bprint_finalize(&bp, NULL);
+        return ret;
+    }
+    av_assert1(!insize);
+    bp.str[bp.len] = 0;
+    if ((ret = av_bprint_finalize(&bp, &recoded)) < 0)
+        return ret;
+    av_free(tf->full_data);
+    tf->full_data      = (uint8_t *)recoded;
+    tf->full_data_size = bp.len;
+    tf->encoding       = encoding;
+    return 0;
+}
+
+/**
+ * Find an encoding that can decode tf->full_data and recode it to UTF-8.
+ *
+ * Candidates are, in order of preference: the caller-supplied
+ * tf->encodings list; otherwise the single encoding implied by a byte order
+ * mark at the start of the data; otherwise default_encodings.
+ * The first candidate accepted by try_encoding() is used.
+ *
+ * @return 0 on success; AVERROR(ENOMEM) if memory ran out while converting;
+ *         AVERROR_INVALIDDATA (with tf->error set) if no candidate fits
+ */
+static int guess_encoding(AVTextFile *tf)
+{
+    const char *bom_encoding[2] = { NULL, NULL };
+    const char *const *encodings;
+    int ret, i;
+
+    encodings = tf->encodings;
+    if (!encodings) {
+        for (i = 0; i < FF_ARRAY_ELEMS(byte_order_marks); i++) {
+            /* never compare more bytes than were actually read */
+            if (tf->full_data_size >= byte_order_marks[i].len &&
+                !memcmp(tf->full_data, byte_order_marks[i].bom,
+                        byte_order_marks[i].len)) {
+                bom_encoding[0] = (const char *)byte_order_marks[i].encoding;
+                encodings = bom_encoding;
+                break;
+            }
+        }
+        if (!encodings)
+            encodings = default_encodings;
+    }
+
+    for (i = 0; encodings[i]; i++) {
+        if ((ret = try_encoding(tf, encodings[i])) >= 0)
+            return ret;
+        /* running out of memory says nothing about the encoding; report it
+         * as such instead of "unable to guess" */
+        if (ret == AVERROR(ENOMEM))
+            return ret;
+    }
+
+    av_strlcpy(tf->error, "Unable to guess character encoding",
+               sizeof(tf->error));
+    return AVERROR_INVALIDDATA;
+}
+
+/**
+ * Drop every CR that is immediately followed by LF, compacting tf->text in
+ * place, then re-terminate the text and update tf->text_size.
+ * A CR not followed by LF is kept.
+ */
+static void remove_cr(AVTextFile *tf)
+{
+    uint8_t *const base = tf->text;
+    uint8_t *const stop = base + tf->text_size;
+    uint8_t *src = base, *dst = base;
+
+    while (src < stop) {
+        /* for the last byte, src[1] is the byte just past the text
+         * (expected to be the terminating 0) */
+        if (*src == '\r' && src[1] == '\n') {
+            src++;
+            continue;
+        }
+        *dst = *src;
+        dst++;
+        src++;
+    }
+    tf->text_size = dst - tf->text;
+    *dst = 0;
+}
+
+/**
+ * Split tf->text into lines, in place.
+ *
+ * Every LF is overwritten with 0 and tf->lines receives one pointer per
+ * line, followed by a terminating NULL pointer. A final LF does not start an
+ * extra empty line; if the text does not end with LF,
+ * AV_TEXT_FLAG_NO_EOL is set in tf->text_flags. Empty text yields zero
+ * lines (lines[0] == NULL).
+ *
+ * @return 0 on success, AVERROR(ENOMEM) if the lines array cannot be
+ *         allocated
+ */
+static int split_lines(AVTextFile *tf)
+{
+    size_t i, nb_lines = 0;
+    uint8_t *p, *end = tf->text + tf->text_size;
+
+    if (tf->text_size) {
+        nb_lines++;
+        /* the very last byte is excluded: a trailing LF ends the last
+         * line rather than starting a new one */
+        for (p = tf->text; p < end - 1; p++)
+            if (*p == '\n')
+                nb_lines++;
+    }
+    tf->lines = av_calloc(nb_lines + 1, sizeof(*tf->lines));
+    if (!tf->lines) {
+        tf->nb_lines = 0;
+        return AVERROR(ENOMEM);
+    }
+    p = tf->text;
+    /* with no lines at all, slot 0 must stay the terminating NULL */
+    if (nb_lines)
+        tf->lines[0] = (char *)p;
+    for (i = 1; i < nb_lines; i++) {
+        p = memchr(p, '\n', end - p);
+        av_assert1(p);
+        *p = 0;
+        tf->lines[i] = (char *)++p;
+    }
+    if (tf->text_size) {
+        if ((p = memchr(p, '\n', end - p))) {
+            av_assert1(p == end - 1);
+            *p = 0;
+        } else {
+            tf->text_flags |= AV_TEXT_FLAG_NO_EOL;
+        }
+    }
+    tf->nb_lines = nb_lines;
+    return 0;
+}
+
+/**
+ * Turn the raw bytes in tf->full_data into the final text.
+ *
+ * Steps: recode to UTF-8 (guess_encoding), point text at the data while
+ * skipping a leading UTF-8 byte order mark (recorded as
+ * AV_TEXT_FLAG_HAS_BOM), then apply the optional CR removal and line
+ * splitting requested in tf->flags.
+ *
+ * @return 0 on success, a negative error code otherwise
+ */
+static int text_file_process(AVTextFile *tf)
+{
+    int ret;
+
+    tf->text_flags = 0;
+    if ((ret = guess_encoding(tf)) < 0)
+        return ret;
+    tf->text      = tf->full_data;
+    tf->text_size = tf->full_data_size;
+
+    /* Entry 0 of the table is the UTF-8 mark. Check the length first so
+     * that short texts are never read past their end. */
+    if (tf->text_size >= byte_order_marks[0].len &&
+        !memcmp(tf->text, byte_order_marks[0].bom, byte_order_marks[0].len)) {
+        tf->text_size  -= byte_order_marks[0].len;
+        tf->text       += byte_order_marks[0].len;
+        tf->text_flags |= AV_TEXT_FLAG_HAS_BOM;
+    }
+
+    if ((tf->flags & AV_TEXT_FLAG_REMOVE_CR))
+        remove_cr(tf);
+    if ((tf->flags & AV_TEXT_FLAG_SPLIT_LINES))
+        if ((ret = split_lines(tf)) < 0)
+            return ret;
+
+    return 0;
+}
+
+/**
+ * Read the whole input through the callback into tf->full_data, then
+ * process it with text_file_process().
+ *
+ * The callback is called repeatedly until it returns a negative value;
+ * AVERROR_EOF ends the input normally, any other negative value is an
+ * error. NOTE(review): a callback that keeps returning 0 keeps the loop
+ * running; confirm whether 0 should be treated as end of file.
+ *
+ * @param tf        destination; full_data / full_data_size are overwritten
+ * @param callback  read function
+ * @param opaque    passed unchanged to callback
+ * @return 0 on success, a negative error code otherwise; on failure the
+ *         caller is responsible for freeing what tf holds
+ */
+static int text_file_try_read(AVTextFile *tf,
+                              AVTextFileRead callback, void *opaque)
+{
+    AVBPrint bp;
+    unsigned buf_size;
+    uint8_t *buf;
+    int ret;
+
+    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
+    while (1) {
+        int room;
+
+        av_bprint_get_buffer(&bp, 512, &buf, &buf_size);
+        if (buf_size <= 1) {
+            ret = AVERROR(ENOMEM);
+            break;
+        }
+        /* one byte is always kept back for the terminating 0 */
+        room = FFMIN(buf_size - 1, INT_MAX);
+        ret  = callback(opaque, buf, room);
+        if (ret < 0) {
+            if (ret == AVERROR_EOF)
+                ret = 0;
+            break;
+        }
+        /* a callback claiming more than it was offered has overrun buf */
+        if (ret > room) {
+            ret = AVERROR(EINVAL);
+            break;
+        }
+        bp.len += ret;
+    }
+
+    if (ret < 0) {
+        av_bprint_finalize(&bp, NULL);
+        return ret;
+    }
+    /* bp.len was advanced by hand, so the terminator must be too */
+    bp.str[bp.len] = 0;
+    if ((ret = av_bprint_finalize(&bp, (char **)&tf->full_data)) < 0)
+        return ret;
+    tf->full_data_size = bp.len;
+    return text_file_process(tf);
+}
+
+/**
+ * Read and process a text file, taking care of error reporting.
+ *
+ * Clears tf->error, runs text_file_try_read(), and on failure fills
+ * tf->error with the generic message for the error code (unless a more
+ * specific one was already written) and frees whatever tf holds.
+ *
+ * @return the value returned by text_file_try_read()
+ */
+static int text_file_read(AVTextFile *tf,
+                          AVTextFileRead callback, void *opaque)
+{
+    int err;
+
+    tf->error[0] = 0;
+    err = text_file_try_read(tf, callback, opaque);
+    if (err >= 0)
+        return err;
+
+    if (!tf->error[0])
+        av_strerror(err, tf->error, sizeof(tf->error));
+    av_text_file_free(tf);
+    return err;
+}
+
+/*
+ * Release the buffers owned by tf and reset the fields describing them.
+ * text points into full_data, so it is only cleared, never freed on its
+ * own. nb_lines is reset together with lines so that the pair stays
+ * consistent for callers that iterate by count.
+ */
+void av_text_file_free(AVTextFile *tf)
+{
+    tf->text = NULL;
+    av_freep(&tf->lines);
+    tf->nb_lines = 0;
+    av_freep(&tf->full_data);
+    tf->text_size = tf->full_data_size = 0;
+}
+
+/*
+ * Public entry point: read a text file through a callback.
+ *
+ * The work is done on a zero-initialized local copy of the structure; only
+ * the first tf_user->struct_size bytes are copied in and back out, so a
+ * caller using a smaller version of the structure is not written past its
+ * end. Aborts if the caller's structure is larger than the library's.
+ */
+int av_text_file_read_callback(AVTextFile *tf_user,
+                               AVTextFileRead callback, void *opaque)
+{
+    AVTextFile local = { 0 };
+    int status;
+
+    av_assert0(tf_user->struct_size <= sizeof(local));
+    memcpy(&local, tf_user, tf_user->struct_size);
+
+    status = text_file_read(&local, callback, opaque);
+
+    memcpy(tf_user, &local, tf_user->struct_size);
+    return status;
+}
diff --git a/libavutil/text_file.h b/libavutil/text_file.h
new file mode 100644
index 0000000..d1bfbd3
--- /dev/null
+++ b/libavutil/text_file.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2013 Nicolas George
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_TEXT_FILE_H
+#define AVUTIL_TEXT_FILE_H
+
+#include "common.h"
+
+/**
+ * Structure to help read text files.
+ * This API allows reading text files (or other ways of storing text) while
+ * handling the subtleties of character encodings, end-of-line separators,
+ * etc.
+ *
+ * The text returned through this structure is always recoded to UTF-8.
+ *
+ * The typical way of using this API is to declare an AVTextFile variable
+ * with the default initialization macro:
+ * AVTextFile tf = { AV_TEXT_FILE_DEFAULT };
+ * Then set fields to control various parts of the process and use it with
+ * the API functions.
+ */
+typedef struct AVTextFile {
+
+ /**
+ * Size of the structure; must be set to sizeof(AVTextFile) to ensure
+ * compatibility with later versions of the library.
+ * Only this many bytes are read from and written back to the structure.
+ */
+
+ size_t struct_size;
+
+ /**
+ * Text read from the file. Always terminated by an additional 0.
+ * Points inside full_data; must not be freed separately.
+ */
+ uint8_t *text;
+
+ /**
+ * Size of text, in bytes, not counting the additional terminating 0.
+ */
+ size_t text_size;
+
+ /**
+ * Full data buffer containing the text; must be freed with av_free()
+ * (or through av_text_file_free()) when no longer needed. Can be
+ * different from text due to details such as byte-order-marks.
+ */
+ uint8_t *full_data;
+
+ /**
+ * Size of full_data, in bytes, not counting the additional 0.
+ */
+ size_t full_data_size;
+
+ /**
+ * Detected encoding; will point to either a static string or an element
+ * of the encodings field.
+ */
+ const char *encoding;
+
+ /**
+ * List of encodings for autodetection, terminated by NULL.
+ * The first encoding in this list that can apply to the file is used.
+ * If NULL, the encoding is taken from a byte order mark at the start of
+ * the data if there is one, and from a built-in list otherwise.
+ */
+ const char *const *encodings;
+
+ /**
+ * Lines of the file; only relevant if AV_TEXT_FLAG_SPLIT_LINES is set.
+ * Terminated by an additional NULL pointer.
+ * The pointers reference the text buffer; the array itself is allocated.
+ */
+ char **lines;
+
+ /**
+ * Number of elements in the lines array, not counting the additional
+ * NULL.
+ */
+ size_t nb_lines;
+
+ /**
+ * Flags to control the processing of the file. See the AV_TEXT_FLAG_*
+ * processing constants below.
+ */
+ unsigned flags;
+
+ /**
+ * Flags describing features of the file hidden by the conversion. See
+ * the AV_TEXT_FLAG_* result constants below.
+ */
+ unsigned text_flags;
+
+ /**
+ * Error message. If something fails, this field will contain a
+ * human-readable, 0-terminated error message.
+ */
+ char error[128];
+
+} AVTextFile;
+
+/**
+ * Processing flags.
+ * The following constants apply to the AVTextFile.flags field only; their
+ * numeric values overlap with the result flags, which belong to a
+ * different field.
+ */
+enum {
+
+ /**
+ * Split the file into individual lines.
+ * The newline characters are replaced by 0 and AVTextFile.lines /
+ * AVTextFile.nb_lines are filled.
+ */
+ AV_TEXT_FLAG_SPLIT_LINES = 0x1,
+
+ /**
+ * Remove CR (\r) before LF (\n).
+ * In other words, convert DOS-style line breaks to Unix-style.
+ * A CR that is not followed by LF is kept.
+ */
+ AV_TEXT_FLAG_REMOVE_CR = 0x2,
+};
+
+/**
+ * Result flags.
+ * The following constants apply to the AVTextFile.text_flags field only;
+ * their numeric values overlap with the processing flags, which belong to
+ * a different field.
+ */
+enum {
+
+ /**
+ * The file had a byte order mark.
+ * The first character of the file was U+FEFF ZERO WIDTH NO-BREAK SPACE;
+ * it is not part of AVTextFile.text.
+ */
+ AV_TEXT_FLAG_HAS_BOM = 0x1,
+
+ /**
+ * The final line of the file was not terminated by a final LF (\n).
+ * Only relevant if lines were split.
+ */
+ AV_TEXT_FLAG_NO_EOL = 0x2,
+};
+
+/**
+ * Callback to read from a file.
+ * The callback is called repeatedly until it returns a negative value.
+ * @param opaque opaque value passed from the caller
+ * @param buf buffer to fill with the file data
+ * @param buf_size size of the buffer, in bytes; never write more than this
+ * @return the number of bytes read, AVERROR_EOF once the end of the input
+ *         has been reached, or another negative error code on failure
+ */
+typedef int (*AVTextFileRead)(void *opaque, unsigned char *buf, int buf_size);
+
+/**
+ * Read a text file from a callback.
+ * @param tf structure controlling the read and receiving the result;
+ *           struct_size must be set before the call
+ * @param callback function called repeatedly to obtain the data
+ * @param opaque value passed unchanged to callback
+ * @return 0 on success; a negative error code on failure, in which case
+ *         tf->error holds a message and the buffers have been freed
+ */
+int av_text_file_read_callback(AVTextFile *tf,
+ AVTextFileRead callback, void *opaque);
+
+/**
+ * Read a text file from the local file system (using stdio).
+ * NOTE(review): no definition of this function is visible in text_file.c
+ * of this patch; confirm it is provided before the header is installed.
+ * @param tf structure controlling the read and receiving the result
+ * @param filename path of the file to read
+ * @return presumably 0 on success or a negative error code, like
+ *         av_text_file_read_callback() -- to be confirmed
+ */
+int av_text_file_read_file(AVTextFile *tf, const char *filename);
+
+/**
+ * Free all memory allocated while reading the file.
+ * The lines array and the full_data buffer are freed and set to NULL; text
+ * is set to NULL and text_size / full_data_size are set to 0.
+ * @param tf structure previously filled by one of the read functions
+ */
+void av_text_file_free(AVTextFile *tf);
+
+/**
+ * Default initializer for AVTextFile, used as
+ *     AVTextFile tf = { AV_TEXT_FILE_DEFAULT };
+ * It sets the first field (struct_size); all other fields are zeroed by
+ * the aggregate initialization.
+ */
+#define AV_TEXT_FILE_DEFAULT sizeof(AVTextFile)
+
+#endif /* AVUTIL_TEXT_FILE_H */
--
1.7.10.4
More information about the ffmpeg-devel
mailing list