00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00026 #include <flite/flite.h>
00027 #include "libavutil/audioconvert.h"
00028 #include "libavutil/file.h"
00029 #include "libavutil/opt.h"
00030 #include "avfilter.h"
00031 #include "audio.h"
00032 #include "formats.h"
00033 #include "internal.h"
00034
00035 typedef struct {
00036 const AVClass *class;
00037 char *voice_str;
00038 char *textfile;
00039 char *text;
00040 cst_wave *wave;
00041 int16_t *wave_samples;
00042 int wave_nb_samples;
00043 int list_voices;
00044 cst_voice *voice;
00045 struct voice_entry *voice_entry;
00046 int64_t pts;
00047 int frame_nb_samples;
00048 } FliteContext;
00049
00050 #define OFFSET(x) offsetof(FliteContext, x)
00051 #define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
00052
00053 static const AVOption flite_options[] = {
00054 { "list_voices", "list voices and exit", OFFSET(list_voices), AV_OPT_TYPE_INT, {.i64=0}, 0, 1, FLAGS },
00055 { "nb_samples", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.i64=512}, 0, INT_MAX, FLAGS },
00056 { "n", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.i64=512}, 0, INT_MAX, FLAGS },
00057 { "text", "set text to speak", OFFSET(text), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX, FLAGS },
00058 { "textfile", "set filename of the text to speak", OFFSET(textfile), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX, FLAGS },
00059 { "v", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX, FLAGS },
00060 { "voice", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX, FLAGS },
00061 { NULL }
00062 };
00063
00064 AVFILTER_DEFINE_CLASS(flite);
00065
/*
 * Non-zero once flite_init() has succeeded; init() consults it so that the
 * library is initialized only once. Static storage starts out as zero.
 * NOTE(review): the flag is tested and incremented without any locking.
 */
static volatile int flite_inited;
00067
00068
00069 #define DECLARE_REGISTER_VOICE_FN(name) \
00070 cst_voice *register_cmu_us_## name(const char *); \
00071 void unregister_cmu_us_## name(cst_voice *);
00072 DECLARE_REGISTER_VOICE_FN(awb);
00073 DECLARE_REGISTER_VOICE_FN(kal);
00074 DECLARE_REGISTER_VOICE_FN(kal16);
00075 DECLARE_REGISTER_VOICE_FN(rms);
00076 DECLARE_REGISTER_VOICE_FN(slt);
00077
00078 struct voice_entry {
00079 const char *name;
00080 cst_voice * (*register_fn)(const char *);
00081 void (*unregister_fn)(cst_voice *);
00082 cst_voice *voice;
00083 unsigned usage_count;
00084 } voice_entry;
00085
00086 #define MAKE_VOICE_STRUCTURE(voice_name) { \
00087 .name = #voice_name, \
00088 .register_fn = register_cmu_us_ ## voice_name, \
00089 .unregister_fn = unregister_cmu_us_ ## voice_name, \
00090 }
00091 static struct voice_entry voice_entries[] = {
00092 MAKE_VOICE_STRUCTURE(awb),
00093 MAKE_VOICE_STRUCTURE(kal),
00094 MAKE_VOICE_STRUCTURE(kal16),
00095 MAKE_VOICE_STRUCTURE(rms),
00096 MAKE_VOICE_STRUCTURE(slt),
00097 };
00098
00099 static void list_voices(void *log_ctx, const char *sep)
00100 {
00101 int i, n = FF_ARRAY_ELEMS(voice_entries);
00102 for (i = 0; i < n; i++)
00103 av_log(log_ctx, AV_LOG_INFO, "%s%s",
00104 voice_entries[i].name, i < (n-1) ? sep : "\n");
00105 }
00106
00107 static int select_voice(struct voice_entry **entry_ret, const char *voice_name, void *log_ctx)
00108 {
00109 int i;
00110
00111 for (i = 0; i < FF_ARRAY_ELEMS(voice_entries); i++) {
00112 struct voice_entry *entry = &voice_entries[i];
00113 if (!strcmp(entry->name, voice_name)) {
00114 if (!entry->voice)
00115 entry->voice = entry->register_fn(NULL);
00116 if (!entry->voice) {
00117 av_log(log_ctx, AV_LOG_ERROR,
00118 "Could not register voice '%s'\n", voice_name);
00119 return AVERROR_UNKNOWN;
00120 }
00121 entry->usage_count++;
00122 *entry_ret = entry;
00123 return 0;
00124 }
00125 }
00126
00127 av_log(log_ctx, AV_LOG_ERROR, "Could not find voice '%s'\n", voice_name);
00128 av_log(log_ctx, AV_LOG_INFO, "Choose between the voices: ");
00129 list_voices(log_ctx, ", ");
00130
00131 return AVERROR(EINVAL);
00132 }
00133
00134 static av_cold int init(AVFilterContext *ctx, const char *args)
00135 {
00136 FliteContext *flite = ctx->priv;
00137 int ret = 0;
00138
00139 flite->class = &flite_class;
00140 av_opt_set_defaults(flite);
00141
00142 if ((ret = av_set_options_string(flite, args, "=", ":")) < 0)
00143 return ret;
00144
00145 if (flite->list_voices) {
00146 list_voices(ctx, "\n");
00147 return AVERROR_EXIT;
00148 }
00149
00150 if (!flite_inited) {
00151 if (flite_init() < 0) {
00152 av_log(ctx, AV_LOG_ERROR, "flite initialization failed\n");
00153 return AVERROR_UNKNOWN;
00154 }
00155 flite_inited++;
00156 }
00157
00158 if ((ret = select_voice(&flite->voice_entry, flite->voice_str, ctx)) < 0)
00159 return ret;
00160 flite->voice = flite->voice_entry->voice;
00161
00162 if (flite->textfile && flite->text) {
00163 av_log(ctx, AV_LOG_ERROR,
00164 "Both text and textfile options set: only one must be specified\n");
00165 return AVERROR(EINVAL);
00166 }
00167
00168 if (flite->textfile) {
00169 uint8_t *textbuf;
00170 size_t textbuf_size;
00171
00172 if ((ret = av_file_map(flite->textfile, &textbuf, &textbuf_size, 0, ctx)) < 0) {
00173 av_log(ctx, AV_LOG_ERROR,
00174 "The text file '%s' could not be read: %s\n",
00175 flite->textfile, av_err2str(ret));
00176 return ret;
00177 }
00178
00179 if (!(flite->text = av_malloc(textbuf_size+1)))
00180 return AVERROR(ENOMEM);
00181 memcpy(flite->text, textbuf, textbuf_size);
00182 flite->text[textbuf_size] = 0;
00183 av_file_unmap(textbuf, textbuf_size);
00184 }
00185
00186 if (!flite->text) {
00187 av_log(ctx, AV_LOG_ERROR,
00188 "No speech text specified, specify the 'text' or 'textfile' option\n");
00189 return AVERROR(EINVAL);
00190 }
00191
00192
00193 flite->wave = flite_text_to_wave(flite->text, flite->voice);
00194 flite->wave_samples = flite->wave->samples;
00195 flite->wave_nb_samples = flite->wave->num_samples;
00196 return 0;
00197 }
00198
00199 static av_cold void uninit(AVFilterContext *ctx)
00200 {
00201 FliteContext *flite = ctx->priv;
00202
00203 av_opt_free(flite);
00204
00205 if (!--flite->voice_entry->usage_count)
00206 flite->voice_entry->unregister_fn(flite->voice);
00207 flite->voice = NULL;
00208 flite->voice_entry = NULL;
00209 delete_wave(flite->wave);
00210 flite->wave = NULL;
00211 }
00212
00213 static int query_formats(AVFilterContext *ctx)
00214 {
00215 FliteContext *flite = ctx->priv;
00216
00217 AVFilterChannelLayouts *chlayouts = NULL;
00218 int64_t chlayout = av_get_default_channel_layout(flite->wave->num_channels);
00219 AVFilterFormats *sample_formats = NULL;
00220 AVFilterFormats *sample_rates = NULL;
00221
00222 ff_add_channel_layout(&chlayouts, chlayout);
00223 ff_set_common_channel_layouts(ctx, chlayouts);
00224 ff_add_format(&sample_formats, AV_SAMPLE_FMT_S16);
00225 ff_set_common_formats(ctx, sample_formats);
00226 ff_add_format(&sample_rates, flite->wave->sample_rate);
00227 ff_set_common_samplerates (ctx, sample_rates);
00228
00229 return 0;
00230 }
00231
00232 static int config_props(AVFilterLink *outlink)
00233 {
00234 AVFilterContext *ctx = outlink->src;
00235 FliteContext *flite = ctx->priv;
00236
00237 outlink->sample_rate = flite->wave->sample_rate;
00238 outlink->time_base = (AVRational){1, flite->wave->sample_rate};
00239
00240 av_log(ctx, AV_LOG_VERBOSE, "voice:%s fmt:%s sample_rate:%d\n",
00241 flite->voice_str,
00242 av_get_sample_fmt_name(outlink->format), outlink->sample_rate);
00243 return 0;
00244 }
00245
00246 static int request_frame(AVFilterLink *outlink)
00247 {
00248 AVFilterBufferRef *samplesref;
00249 FliteContext *flite = outlink->src->priv;
00250 int nb_samples = FFMIN(flite->wave_nb_samples, flite->frame_nb_samples);
00251
00252 if (!nb_samples)
00253 return AVERROR_EOF;
00254
00255 samplesref = ff_get_audio_buffer(outlink, AV_PERM_WRITE, nb_samples);
00256 if (!samplesref)
00257 return AVERROR(ENOMEM);
00258
00259 memcpy(samplesref->data[0], flite->wave_samples,
00260 nb_samples * flite->wave->num_channels * 2);
00261 samplesref->pts = flite->pts;
00262 samplesref->pos = -1;
00263 samplesref->audio->sample_rate = flite->wave->sample_rate;
00264 flite->pts += nb_samples;
00265 flite->wave_samples += nb_samples * flite->wave->num_channels;
00266 flite->wave_nb_samples -= nb_samples;
00267
00268 return ff_filter_samples(outlink, samplesref);
00269 }
00270
00271 AVFilter avfilter_asrc_flite = {
00272 .name = "flite",
00273 .description = NULL_IF_CONFIG_SMALL("Synthesize voice from text using libflite."),
00274 .query_formats = query_formats,
00275 .init = init,
00276 .uninit = uninit,
00277 .priv_size = sizeof(FliteContext),
00278
00279 .inputs = (const AVFilterPad[]) {{ .name = NULL}},
00280
00281 .outputs = (const AVFilterPad[]) {
00282 {
00283 .name = "default",
00284 .type = AVMEDIA_TYPE_AUDIO,
00285 .config_props = config_props,
00286 .request_frame = request_frame,
00287 },
00288 { .name = NULL }
00289 },
00290
00291 .priv_class = &flite_class,
00292 };