FFmpeg
vf_ocr.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Paul B Mahol
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <tesseract/capi.h>
22 
23 #include "libavutil/opt.h"
24 #include "avfilter.h"
25 #include "filters.h"
26 #include "video.h"
27 
/**
 * Private state of the "ocr" filter: the user-settable option strings and
 * the Tesseract handle that the filter callbacks operate on.
 */
28 typedef struct OCRContext {
29  const AVClass *class; /* AVClass pointer; presumably required as first member by the AVOption system -- confirm */
30 
31  char *datapath; /* "datapath" option; passed as the data path to TessBaseAPIInit3() */
32  char *language; /* "language" option; passed as the language to TessBaseAPIInit3() */
33  char *whitelist; /* "whitelist" option; applied as tessedit_char_whitelist */
34  char *blacklist; /* "blacklist" option; applied as tessedit_char_blacklist */
35 
36  TessBaseAPI *tess; /* Tesseract handle: created in init(), released in uninit() */
37 } OCRContext;
38 
/* Byte offset of an option field inside OCRContext, for the option table. */
39 #define OFFSET(x) offsetof(OCRContext, x)
/* Flags shared by every option below: video, filtering-time parameters. */
40 #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
41 
/*
 * User-visible options of the "ocr" filter. All of them are strings that are
 * read by init(); the table is terminated by a { NULL } sentinel entry.
 */
42 static const AVOption ocr_options[] = {
/* No default data path (NULL is handed to Tesseract as-is). */
43  { "datapath", "set datapath", OFFSET(datapath), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS },
/* Recognition language, "eng" unless overridden. */
44  { "language", "set language", OFFSET(language), AV_OPT_TYPE_STRING, {.str="eng"}, 0, 0, FLAGS },
/* Default whitelist: digits, ASCII letters, common punctuation and space. */
45  { "whitelist", "set character whitelist", OFFSET(whitelist), AV_OPT_TYPE_STRING, {.str="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.:;,-+_!?\"'[]{}()<>|/\\=*&%$#@!~ "}, 0, 0, FLAGS },
/* Default blacklist is the empty string. */
46  { "blacklist", "set character blacklist", OFFSET(blacklist), AV_OPT_TYPE_STRING, {.str=""}, 0, 0, FLAGS },
47  { NULL }
48 };
49 
51 {
52  OCRContext *s = ctx->priv;
53 
54  s->tess = TessBaseAPICreate();
55  if (TessBaseAPIInit3(s->tess, s->datapath, s->language) == -1) {
56  av_log(ctx, AV_LOG_ERROR, "failed to init tesseract\n");
57  return AVERROR(EINVAL);
58  }
59 
60  if (!TessBaseAPISetVariable(s->tess, "tessedit_char_whitelist", s->whitelist)) {
61  av_log(ctx, AV_LOG_ERROR, "failed to set whitelist\n");
62  return AVERROR(EINVAL);
63  }
64 
65  if (!TessBaseAPISetVariable(s->tess, "tessedit_char_blacklist", s->blacklist)) {
66  av_log(ctx, AV_LOG_ERROR, "failed to set blacklist\n");
67  return AVERROR(EINVAL);
68  }
69 
70  av_log(ctx, AV_LOG_DEBUG, "Tesseract version: %s\n", TessVersion());
71 
72  return 0;
73 }
74 
75 static const enum AVPixelFormat pix_fmts[] = {
85 };
86 
88 {
89  AVDictionary **metadata = &in->metadata;
90  AVFilterContext *ctx = inlink->dst;
91  AVFilterLink *outlink = ctx->outputs[0];
92  OCRContext *s = ctx->priv;
93  char *result;
94  int *confs;
95 
96  result = TessBaseAPIRect(s->tess, in->data[0], 1,
97  in->linesize[0], 0, 0, in->width, in->height);
98  confs = TessBaseAPIAllWordConfidences(s->tess);
99  av_dict_set(metadata, "lavfi.ocr.text", result, 0);
100  for (int i = 0; confs[i] != -1; i++) {
101  char number[256];
102 
103  snprintf(number, sizeof(number), "%d ", confs[i]);
104  av_dict_set(metadata, "lavfi.ocr.confidence", number, AV_DICT_APPEND);
105  }
106 
107  TessDeleteText(result);
108  TessDeleteIntArray(confs);
109 
110  return ff_filter_frame(outlink, in);
111 }
112 
114 {
115  OCRContext *s = ctx->priv;
116 
117  TessBaseAPIEnd(s->tess);
118  TessBaseAPIDelete(s->tess);
119 }
120 
122 
/*
 * Input pads of the "ocr" filter: a single video pad named "default" whose
 * incoming frames are handled by filter_frame().
 */
123 static const AVFilterPad ocr_inputs[] = {
124  {
125  .name = "default",
126  .type = AVMEDIA_TYPE_VIDEO,
127  .filter_frame = filter_frame, /* per-frame callback: runs OCR and tags metadata */
128  },
129 };
130 
132  .name = "ocr",
133  .description = NULL_IF_CONFIG_SMALL("Optical Character Recognition."),
134  .priv_size = sizeof(OCRContext),
135  .priv_class = &ocr_class,
136  .init = init,
137  .uninit = uninit,
142 };
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:71
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism to try again later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
opt.h
FILTER_PIXFMTS_ARRAY
#define FILTER_PIXFMTS_ARRAY(array)
Definition: filters.h:242
init
static av_cold int init(AVFilterContext *ctx)
Definition: vf_ocr.c:50
ff_filter_frame
int ff_filter_frame(AVFilterLink *link, AVFrame *frame)
Send a frame of data to the next filter.
Definition: avfilter.c:1061
inlink
The exact code depends on how similar the blocks are and how related they are to the and needs to apply these operations to the correct inlink or outlink if there are several Macros are available to factor that when no extra processing is inlink
Definition: filter_design.txt:212
OCRContext::tess
TessBaseAPI * tess
Definition: vf_ocr.c:36
uninit
static av_cold void uninit(AVFilterContext *ctx)
Definition: vf_ocr.c:113
FILTER_INPUTS
#define FILTER_INPUTS(array)
Definition: filters.h:262
AVFrame
This structure describes decoded (raw) audio or video data.
Definition: frame.h:403
AVFrame::width
int width
Definition: frame.h:475
AVOption
AVOption.
Definition: opt.h:429
AV_DICT_APPEND
#define AV_DICT_APPEND
If the entry already exists, append to it.
Definition: dict.h:82
AV_PIX_FMT_YUV440P
@ AV_PIX_FMT_YUV440P
planar YUV 4:4:0 (1 Cr & Cb sample per 1x2 Y samples)
Definition: pixfmt.h:106
AVDictionary
Definition: dict.c:34
AVFilter::name
const char * name
Filter name.
Definition: avfilter.h:205
video.h
FLAGS
#define FLAGS
Definition: vf_ocr.c:40
AVFrame::data
uint8_t * data[AV_NUM_DATA_POINTERS]
pointer to the picture/channel planes.
Definition: frame.h:424
AVFILTER_DEFINE_CLASS
AVFILTER_DEFINE_CLASS(ocr)
AVFilterPad
A filter pad used for either input or output.
Definition: filters.h:38
AV_PIX_FMT_YUVJ411P
@ AV_PIX_FMT_YUVJ411P
planar YUV 4:1:1, 12bpp, (1 Cr & Cb sample per 4x1 Y samples) full scale (JPEG), deprecated in favor ...
Definition: pixfmt.h:283
AV_LOG_ERROR
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:209
av_cold
#define av_cold
Definition: attributes.h:90
OCRContext::datapath
char * datapath
Definition: vf_ocr.c:31
ff_video_default_filterpad
const AVFilterPad ff_video_default_filterpad[1]
An AVFilterPad array whose only entry has name "default" and is of type AVMEDIA_TYPE_VIDEO.
Definition: video.c:37
AV_PIX_FMT_YUVJ422P
@ AV_PIX_FMT_YUVJ422P
planar YUV 4:2:2, 16bpp, full scale (JPEG), deprecated in favor of AV_PIX_FMT_YUV422P and setting col...
Definition: pixfmt.h:86
s
#define s(width, name)
Definition: cbs_vp9.c:198
AV_PIX_FMT_YUVA420P
@ AV_PIX_FMT_YUVA420P
planar YUV 4:2:0, 20bpp, (1 Cr & Cb sample per 2x2 Y & A samples)
Definition: pixfmt.h:108
filters.h
AV_LOG_DEBUG
#define AV_LOG_DEBUG
Stuff which is only useful for libav* developers.
Definition: log.h:230
ctx
AVFormatContext * ctx
Definition: movenc.c:49
AV_PIX_FMT_YUV420P
@ AV_PIX_FMT_YUV420P
planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
Definition: pixfmt.h:73
FILTER_OUTPUTS
#define FILTER_OUTPUTS(array)
Definition: filters.h:263
AV_PIX_FMT_YUVJ444P
@ AV_PIX_FMT_YUVJ444P
planar YUV 4:4:4, 24bpp, full scale (JPEG), deprecated in favor of AV_PIX_FMT_YUV444P and setting col...
Definition: pixfmt.h:87
OCRContext::blacklist
char * blacklist
Definition: vf_ocr.c:34
ocr_inputs
static const AVFilterPad ocr_inputs[]
Definition: vf_ocr.c:123
AVClass
Describe the class of an AVClass context structure.
Definition: log.h:75
result
and forward the result(frame or status change) to the corresponding input. If nothing is possible
NULL
#define NULL
Definition: coverity.c:32
AV_PIX_FMT_YUVJ420P
@ AV_PIX_FMT_YUVJ420P
planar YUV 4:2:0, 12bpp, full scale (JPEG), deprecated in favor of AV_PIX_FMT_YUV420P and setting col...
Definition: pixfmt.h:85
AV_PIX_FMT_GRAY8
@ AV_PIX_FMT_GRAY8
Y , 8bpp.
Definition: pixfmt.h:81
ff_vf_ocr
const AVFilter ff_vf_ocr
Definition: vf_ocr.c:131
NULL_IF_CONFIG_SMALL
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification.
Definition: internal.h:94
filter_frame
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
Definition: vf_ocr.c:87
AV_PIX_FMT_YUVA444P
@ AV_PIX_FMT_YUVA444P
planar YUV 4:4:4 32bpp, (1 Cr & Cb sample per 1x1 Y & A samples)
Definition: pixfmt.h:174
OCRContext
Definition: vf_ocr.c:28
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
AV_PIX_FMT_YUVJ440P
@ AV_PIX_FMT_YUVJ440P
planar YUV 4:4:0 full scale (JPEG), deprecated in favor of AV_PIX_FMT_YUV440P and setting color_range
Definition: pixfmt.h:107
AVFilterPad::name
const char * name
Pad name.
Definition: filters.h:44
pix_fmts
static enum AVPixelFormat pix_fmts[]
Definition: vf_ocr.c:75
AVFilter
Filter definition.
Definition: avfilter.h:201
language
Undefined Behavior In the C language
Definition: undefined.txt:3
OCRContext::whitelist
char * whitelist
Definition: vf_ocr.c:33
OFFSET
#define OFFSET(x)
Definition: vf_ocr.c:39
OCRContext::language
char * language
Definition: vf_ocr.c:32
AVFrame::height
int height
Definition: frame.h:475
AV_PIX_FMT_NONE
@ AV_PIX_FMT_NONE
Definition: pixfmt.h:72
avfilter.h
AVFrame::metadata
AVDictionary * metadata
metadata.
Definition: frame.h:725
AVFILTER_FLAG_METADATA_ONLY
#define AVFILTER_FLAG_METADATA_ONLY
The filter is a "metadata" filter - it does not modify the frame data in any way.
Definition: avfilter.h:168
ocr_options
static const AVOption ocr_options[]
Definition: vf_ocr.c:42
AV_PIX_FMT_YUV444P
@ AV_PIX_FMT_YUV444P
planar YUV 4:4:4, 24bpp, (1 Cr & Cb sample per 1x1 Y samples)
Definition: pixfmt.h:78
AVFilterContext
An instance of a filter.
Definition: avfilter.h:457
AVMEDIA_TYPE_VIDEO
@ AVMEDIA_TYPE_VIDEO
Definition: avutil.h:201
AV_PIX_FMT_YUV422P
@ AV_PIX_FMT_YUV422P
planar YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples)
Definition: pixfmt.h:77
av_dict_set
int av_dict_set(AVDictionary **pm, const char *key, const char *value, int flags)
Set the given entry in *pm, overwriting an existing entry.
Definition: dict.c:88
AV_PIX_FMT_YUV411P
@ AV_PIX_FMT_YUV411P
planar YUV 4:1:1, 12bpp, (1 Cr & Cb sample per 4x1 Y samples)
Definition: pixfmt.h:80
flags
#define flags(name, subs,...)
Definition: cbs_av1.c:482
AVFrame::linesize
int linesize[AV_NUM_DATA_POINTERS]
For video, a positive or negative value, which is typically indicating the size in bytes of each pict...
Definition: frame.h:448
AV_PIX_FMT_YUV410P
@ AV_PIX_FMT_YUV410P
planar YUV 4:1:0, 9bpp, (1 Cr & Cb sample per 4x4 Y samples)
Definition: pixfmt.h:79
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
AV_OPT_TYPE_STRING
@ AV_OPT_TYPE_STRING
Underlying C type is a uint8_t* that is either NULL or points to a C string allocated with the av_mal...
Definition: opt.h:276
snprintf
#define snprintf
Definition: snprintf.h:34
AV_PIX_FMT_YUVA422P
@ AV_PIX_FMT_YUVA422P
planar YUV 4:2:2 24bpp, (1 Cr & Cb sample per 2x1 Y & A samples)
Definition: pixfmt.h:173