ref: c5bd2add37725041c1924132a8a4fd67548fb975
dir: /src/synth/cst_ssml.c/
/*************************************************************************/
/*                                                                       */
/*                  Language Technologies Institute                      */
/*                     Carnegie Mellon University                        */
/*                        Copyright (c) 2001-2011                        */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
/*               Date:  June 2008                                        */
/*************************************************************************/
/*                                                                       */
/*  SSML support for flite ( http://www.w3.org/TR/speech-synthesis/ )    */
/*                                                                       */
/*  We don't use a full XML parser here for space and availability       */
/*  reasons, but this is adequate for SSML                               */
/*  This is based on some old SABLE support in flite that never got      */
/*  completed                                                            */
/*                                                                       */
/*  <ssml> </ssml>                                                       */
/*  <voice ...> </voice>                                                 */
/*      name or urls for voices                                          */
/*  <audio ...> </audio>                                                 */
/*  <!-- ... -->                                                         */
/*  <break .../>                                                         */
/*  <prosody ...> </prosody>  rate volume (no pitch yet)                 */
/*  <emphasis ...> </emphasis>                                           */
/*  <sub alias="World Wide Web Consortium">W3C</sub>                     */
/*  <phoneme ph="x x x"> </phoneme>                                      */
/*                                                                       */
/*  <...> ignore all others                                              */
/*                                                                       */
/*  Voice call backs (e.g. -pw and -ps) are not transferred when new     */
/*  voices are selected                                                  */
/*                                                                       */
/*************************************************************************/
/*                                                                       */
/*  Modifications to the original (marked as required by condition 2):   */
/*   - attribute parsing no longer discards the token following an       */
/*     attribute name when that token is not "=" (it used to swallow     */
/*     the closing ">" of comments and value-less attributes)            */
/*   - every parsed tag carries a "_type" feature (default "start")      */
/*   - a tag truncated by end of input is dropped instead of being       */
/*     passed on as a NULL attribute set                                 */
/*   - attribute values are only read when they are present; a prosody   */
/*     rate that is not positive is ignored                              */
/*   - a failed voice selection keeps the current voice                  */
/*   - the empty output wave file is created in one place only           */
/*                                                                       */
/*************************************************************************/

#include "flite.h"
#include "cst_tokenstream.h"

/* Single-character symbols used while reading ordinary text ...         */
static const char * const ssml_singlecharsymbols_general = "<>&/\";";
/* ... and while reading the attribute list inside a tag.                */
static const char * const ssml_singlecharsymbols_inattr = "=>;/\"";

#define SSML_DEBUG 0

/* Upper bound on the number of tokens collected into one utterance      */
/* before it is synthesized regardless of punctuation.                   */
enum { SSML_MAX_TOKENS_PER_UTT = 500 };

/* Read a double-quoted attribute value from TS, with backslash as the   */
/* escape character.  The returned string is whatever                    */
/* ts_get_quoted_token() hands back (not copied here).                   */
static const char *ts_get_quoted_remainder(cst_tokenstream *ts)
{
    return ts_get_quoted_token(ts,'"','\\');
}

/* Parse the attribute list of a tag, up to and including the closing    */
/* ">".  The tag name itself must already have been consumed.            */
/*                                                                       */
/* The result is a new feature set (owned by the caller, release with    */
/* delete_features) holding:                                             */
/*   _type          "start", or "startend" when a "/" was seen last      */
/*   _name0/_val0   name and value of the first attribute                */
/*   _name1/_val1   name and value of the most recent later attribute    */
/* A _valN entry exists only when the attribute was followed by "=".     */
/*                                                                       */
/* Returns NULL (after printing a message) when the input ends before    */
/* the closing ">" is found.  The general character classes are          */
/* restored on every exit path.                                          */
static cst_features *ssml_get_attributes(cst_tokenstream *ts)
{
    cst_features *a = new_features();
    const char *name, *val;
    const char *lookahead;
    const char *fnn, *vnn;
    int i = 0;
    int truncated = 0;

    set_charclasses(ts,
                    ts->p_whitespacesymbols,
                    ssml_singlecharsymbols_inattr,
                    ts->p_prepunctuationsymbols,
                    ts->p_postpunctuationsymbols);

    /* A tag without any attribute is still a start tag; callers look    */
    /* up "_type" unconditionally, so make sure it always exists.        */
    feat_set_string(a,"_type","start");

    name = ts_get(ts);
    while (!cst_streq(">",name))
    {
        /* Feature names are string literals so that the feature set     */
        /* never refers to the token buffer for its keys.                */
        if (i == 0)
        {
            fnn = "_name0";
            vnn = "_val0";
        }
        else
        {
            fnn = "_name1";
            vnn = "_val1";
        }

        lookahead = NULL;
        if (cst_streq(name,"/"))
            feat_set_string(a,"_type","startend");
        else
        {
            feat_set_string(a,"_type","start");
            feat_set_string(a,fnn,name);
            lookahead = ts_get(ts);
            if (cst_streq("=",lookahead))
            {
                val = ts_get_quoted_remainder(ts);
                feat_set_string(a,vnn,val);
                lookahead = NULL;       /* the "=" has been used up */
            }
        }

        /* A token that followed the name but was not "=" has not been   */
        /* interpreted yet: it may be the closing ">", a "/", or the     */
        /* next name, so it must be looked at rather than dropped.       */
        if (lookahead && cst_streq(">",lookahead))
            break;
        if (ts_eof(ts))
        {
            truncated = 1;
            break;
        }
        if (lookahead)
            name = lookahead;
        else
            name = ts_get(ts);
        i++;
    }

    set_charclasses(ts,
                    ts->p_whitespacesymbols,
                    ssml_singlecharsymbols_general,
                    ts->p_prepunctuationsymbols,
                    ts->p_postpunctuationsymbols);

    if (truncated)
    {
        fprintf(stderr,"ssml: unexpected EOF\n");
        delete_features(a);
        return NULL;
    }

    return a;
}

/* Apply one prosody attribute (slot NAME_FEAT/VAL_FEAT of ATTRIBUTES)   */
/* to WORD_FEATS.  "rate" sets local_duration_stretch to the reciprocal  */
/* of the value, "volume" sets local_gain to value/100.  Attributes      */
/* without a value, and rates that are not positive (which would give    */
/* a division by zero), are ignored.                                     */
static void ssml_apply_prosody_attr(cst_features *attributes,
                                    const char *name_feat,
                                    const char *val_feat,
                                    cst_features *word_feats)
{
    const char *attr = get_param_string(attributes,name_feat,"");
    float v;

    if (!feat_present(attributes,val_feat))
        return;

    if (cst_streq("rate",attr))
    {
        /* Note SSML doesn't do stretch, it does reciprocal of stretch */
        v = feat_float(attributes,val_feat);
        if (v > 0.0)
            feat_set_float(word_feats,"local_duration_stretch",1.0/v);
    }
    else if (cst_streq("volume",attr))
        feat_set_float(word_feats,"local_gain",
                       feat_float(attributes,val_feat)/100.0);
}

/* Act on a single SSML tag.                                             */
/*                                                                       */
/*   tag         upper-cased tag name                                    */
/*   attributes  as produced by ssml_get_attributes(), with "_type" set  */
/*               to "end" by the caller for closing tags                 */
/*   u           utterance currently being collected (may be NULL)       */
/*   word_feats  features copied onto every following token; start tags  */
/*               add to it and end tags remove from it                   */
/*   feats       stream-wide state: "current_voice" and "default_voice"  */
/*                                                                       */
/* Returns U when token collection can simply continue, or NULL when     */
/* the tag forces an utterance break (AUDIO and VOICE tags).  Unknown    */
/* tags are ignored.                                                     */
static cst_utterance *ssml_apply_tag(const char *tag,
                                     cst_features *attributes,
                                     cst_utterance *u,
                                     cst_features *word_feats,
                                     cst_features *feats)
{
    const char *type;
    const char *wavefilename;
    const char *vname;
    const char *ph;
    const char *alias;
    cst_voice *nvoice;
    cst_wave *wave;
    cst_item *t;
    cst_relation *r;
    float break_size;

#if SSML_DEBUG
    printf("SSML TAG %s\n",tag);
    cst_feat_print(stdout,attributes);
    printf("...\n");
#endif

    /* Looked up with a default so that a missing "_type" can never be   */
    /* an error here.                                                    */
    type = get_param_string(attributes,"_type","start");

    if (cst_streq("AUDIO",tag))
    {
        if (cst_streq("start",type) || cst_streq("startend",type))
        {
            /* The first attribute value is taken as the wave file name; */
            /* a tag without one plays nothing but still breaks.         */
            wavefilename = get_param_string(attributes,"_val0",NULL);
            if (wavefilename != NULL)
            {
                wave = new_wave();
                if (cst_wave_load_riff(wave,wavefilename) == CST_OK_FORMAT)
                {
                    /* Between <audio> and </audio> the tokens are       */
                    /* flagged with ssml_comment.                        */
                    if (cst_streq("start",type))
                        feat_set_string(word_feats,"ssml_comment","1");
                    feat_set(word_feats,"ssml_play_audio",wave_val(wave));
                }
                else
                    delete_wave(wave);
            }
            return NULL;  /* Cause eou */
        }
        else if (cst_streq("end",type))
        {
            feat_remove(word_feats,"ssml_comment");
            return NULL;  /* Cause eou */
        }
    }
    else if (cst_streq("BREAK",tag))
    {
        /* The break is attached to the last token collected so far;     */
        /* with no such token the tag has no effect.                     */
        if (u &&
            ((r = utt_relation(u,"Token")) != NULL) &&
            ((t = relation_tail(r)) != NULL))
        {
            item_set_string(t,"break","1");
            if (cst_streq("size",get_param_string(attributes,"_name0","")) &&
                feat_present(attributes,"_val0"))
            {
                break_size = feat_float(attributes,"_val0");
                item_set_float(t,"break_size",break_size);
            }
        }
    }
    else if (cst_streq("PROSODY",tag))
    {
        if (cst_streq("start",type))
        {
            ssml_apply_prosody_attr(attributes,"_name0","_val0",word_feats);
            ssml_apply_prosody_attr(attributes,"_name1","_val1",word_feats);
        }
        else if (cst_streq("end",type))
        {
            feat_remove(word_feats,"local_duration_stretch");
            feat_remove(word_feats,"local_gain");
        }
    }
    else if (cst_streq("PHONEME",tag))
    {
        if (cst_streq("start",type))
        {
            if (cst_streq("ph",get_param_string(attributes,"_name0","")))
            {
                ph = get_param_string(attributes,"_val0",NULL);
                if (ph != NULL)
                    feat_set_string(word_feats,"phones",ph);
            }
        }
        else if (cst_streq("end",type))
            feat_remove(word_feats,"phones");
    }
    else if (cst_streq("SUB",tag))
    {
        if (cst_streq("start",type))
        {
            if (cst_streq("alias",get_param_string(attributes,"_name0","")))
            {
                alias = get_param_string(attributes,"_val0",NULL);
                if (alias != NULL)
                    feat_set_string(word_feats,"ssml_alias",alias);
            }
        }
        else if (cst_streq("end",type))
            feat_remove(word_feats,"ssml_alias");
    }
    else if (cst_streq("VOICE",tag))
    {
        if (cst_streq("start",type))
        {
            vname = get_param_string(attributes,"_val0","");
            nvoice = flite_voice_select(vname);
            /* NOTE(review): flite_voice_select is assumed to return     */
            /* NULL when no voice can be found or loaded -- confirm.     */
            /* In that case keep speaking with the current voice.        */
            if (nvoice != NULL)
                feat_set(feats,"current_voice",userdata_val(nvoice));
            return NULL;  /* cause an utterance break */
        }
        else if (cst_streq("end",type))
        {
            /* Hmm we should really have a stack of these */
            nvoice = (cst_voice *)val_userdata(feat_val(feats,"default_voice"));
            feat_set(feats,"current_voice",userdata_val(nvoice));
            return NULL;
        }
    }

    /* do stuff */
    /* flag what to do mark or end */
    /*
       ph        set attributes silence all contained tokens
       break     add to previous token a break marker
       audio     silence all following tokens (utt break)
                 insert waveform
    */

    return u;
}

/* When OUTTYPE names a file (i.e. it is not "play", "none" or           */
/* "stream"), write an empty 16000 Hz wave to it so that the output of   */
/* each utterance can be appended incrementally.                         */
static void ssml_init_output_file(const char *outtype)
{
    cst_wave *w;

    if (cst_streq(outtype,"play") ||
        cst_streq(outtype,"none") ||
        cst_streq(outtype,"stream"))
        return;

    w = new_wave();
    cst_wave_resize(w,0,1);
    cst_wave_set_sample_rate(w,16000);
    cst_wave_save_riff(w,outtype);  /* an empty wave */
    delete_wave(w);
}

/* Read SSML from TS, synthesize it utterance by utterance and send the  */
/* result to OUTTYPE.  VOICE is both the initial and the default voice.  */
/* Returns the sum of the values returned by flite_process_output()      */
/* (presumably seconds of audio -- confirm against that function).       */
static float flite_ssml_to_speech_ts(cst_tokenstream *ts,
                                     cst_voice *voice,
                                     const char *outtype)
{
    /* This is a very ugly function, that might be better written with gotos */
    /* This just doesn't seem to be properly functions -- perhaps a proper */
    /* consumer/producer threaded model might be better here -- but its   */
    /* not clear.  There is so much have-to-be-done-now vs note-for-later */
    /* code, that the code is far from clear, and probably not right      */
    cst_features *ssml_feats, *ssml_word_feats;
    cst_features *attributes;
    const char *token = "";
    char *tag = NULL;
    cst_utterance *utt;
    cst_relation *tokrel;
    int num_tokens;
    cst_breakfunc breakfunc = default_utt_break;
    cst_uttfunc utt_user_callback = 0;
    float durs = 0.0;
    cst_item *t;
    cst_voice *current_voice;
    int ssml_eou = 0;
    const cst_wave *wave;

    ssml_feats = new_features();
    feat_set(ssml_feats,"current_voice",userdata_val(voice));
    feat_set(ssml_feats,"default_voice",userdata_val(voice));
    ssml_word_feats = new_features();
    set_charclasses(ts,
                    " \t\n\r",
                    ssml_singlecharsymbols_general,
                    get_param_string(voice->features,"text_prepunctuation",""),
                    get_param_string(voice->features,"text_postpunctuation","")
                    );

    if (feat_present(voice->features,"utt_break"))
        breakfunc = val_breakfunc(feat_val(voice->features,"utt_break"));

    if (feat_present(voice->features,"utt_user_callback"))
        utt_user_callback = val_uttfunc(feat_val(voice->features,"utt_user_callback"));

    /* If its a file to write to, create and save an empty wave file */
    /* as we are going to incrementally append to it                 */
    ssml_init_output_file(outtype);

    num_tokens = 0;
    utt = new_utterance();
    tokrel = utt_relation_create(utt, "Token");
    while (!ts_eof(ts) || num_tokens > 0)
    {
        current_voice =
            (cst_voice *)val_userdata(feat_val(ssml_feats,"current_voice"));

        /* After a tag-induced utterance break the token that followed   */
        /* the tag has already been read: a "<" still has to be          */
        /* processed as a tag, anything else was consumed in the         */
        /* previous pass and the next token is fetched.                  */
        if (ssml_eou == 0)
            token = ts_get(ts);
        else
        {
            if (!cst_streq("<",token))
                token = ts_get(ts);
            ssml_eou = 0;
        }

        while ((cst_streq("<",token)) && (ssml_eou == 0))
        {   /* A tag -- look ahead and process it to find out how to advance */
            tag = cst_upcase(ts_get(ts));
            if (cst_streq("/",tag)) /* an end tag */
            {
                cst_free(tag); tag = NULL;
                tag = cst_upcase(ts_get(ts));
                attributes = ssml_get_attributes(ts);
                if (attributes)
                    feat_set_string(attributes,"_type","end");
            }
            else
                attributes = ssml_get_attributes(ts);

            if (attributes == NULL)
            {
                /* The input ended inside the tag.  Drop the tag and     */
                /* use an empty token so that what has been collected    */
                /* is synthesized and the main loop stops at EOF.        */
                cst_free(tag); tag = NULL;
                token = "";
                break;
            }

            /* The closing ">" was consumed with the attributes; this    */
            /* is the first token after the tag.                         */
            token = ts_get(ts);
            if (ssml_apply_tag(tag,attributes,utt,ssml_word_feats,ssml_feats))
                ssml_eou = 0;
            else
                ssml_eou = 1;
            delete_features(attributes);
            cst_free(tag); tag = NULL;
        }

        if ((cst_strlen(token) == 0) ||
            (num_tokens > SSML_MAX_TOKENS_PER_UTT) || /* need an upper bound */
            (ssml_eou == 1) ||       /* ssml tag was utterance break */
            (relation_head(tokrel) &&
             breakfunc(ts,token,tokrel)))
        {
            /* An end of utt, so synthesize it */
            if (utt_user_callback)
                utt = (utt_user_callback)(utt);

            if (utt)
            {
                utt = flite_do_synth(utt,current_voice,utt_synth_tokens);
                /* NOTE(review): flite_do_synth is assumed to return     */
                /* NULL on failure -- confirm.  Nothing is left to       */
                /* release in that case.                                 */
                if (utt == NULL)
                    break;
                if (feat_present(utt->features,"Interrupted"))
                {
                    delete_utterance(utt); utt = NULL;
                    break;
                }
                durs += flite_process_output(utt,outtype,TRUE);
                delete_utterance(utt); utt = NULL;
            }
            else
                break;

            if (ts_eof(ts)) break;

            utt = new_utterance();
            tokrel = utt_relation_create(utt, "Token");
            num_tokens = 0;
        }

        if (feat_present(ssml_word_feats,"ssml_play_audio"))
        {
            wave = val_wave(feat_val(ssml_word_feats,"ssml_play_audio"));
            /* Should create an utterances with the waveform in it */
            /* Have to stream it if there is streaming */
            /* NOTE(review): the token read after the tag is not added   */
            /* to the new utterance on this path.                        */
            if (utt)
                delete_utterance(utt);
            utt = utt_synth_wave(copy_wave(wave),current_voice);
            if (utt_user_callback)
                utt = (utt_user_callback)(utt);
            if (utt)    /* the callback may hand back nothing */
            {
                durs += flite_process_output(utt,outtype,TRUE);
                delete_utterance(utt);
            }
            utt = NULL;

            utt = new_utterance();
            tokrel = utt_relation_create(utt, "Token");
            num_tokens = 0;

            feat_remove(ssml_word_feats,"ssml_play_audio");
        }
        else if (!cst_streq("<",token))
        {   /* wasn't an ssml tag */
            num_tokens++;

            t = relation_append(tokrel, NULL);
            item_set_string(t,"name",token);
            item_set_string(t,"whitespace",ts->whitespace);
            item_set_string(t,"prepunctuation",ts->prepunctuation);
            item_set_string(t,"punc",ts->postpunctuation);
            /* Mark it at the beginning of the token */
            item_set_int(t,"file_pos",
                         ts->file_pos-(1+ /* as we are already on the next char */
                                       cst_strlen(token)+
                                       cst_strlen(ts->prepunctuation)+
                                       cst_strlen(ts->postpunctuation)));
            item_set_int(t,"line_number",ts->line_number);
            /* Stamp the token with whatever the open tags have set */
            feat_copy_into(ssml_word_feats,item_feats(t));
        }
    }

    delete_utterance(utt);
    delete_features(ssml_feats);
    delete_features(ssml_word_feats);
    return durs;
}

/* Synthesize the SSML document in file FILENAME with VOICE and send     */
/* the result to OUTTYPE ("play", "none", "stream", or the name of a     */
/* wave file to write).  Reading starts at the voice's                   */
/* "file_start_position" feature when that is greater than zero.         */
/* Returns the value of flite_ssml_to_speech_ts(), or 1 when the file    */
/* cannot be opened (kept as is for existing callers).                   */
float flite_ssml_file_to_speech(const char *filename,
                                cst_voice *voice,
                                const char *outtype)
{
    cst_tokenstream *ts;
    int fp;
    float d;

    ts = ts_open(filename,
                 get_param_string(voice->features,"text_whitespace",NULL),
                 get_param_string(voice->features,"text_singlecharsymbols",NULL),
                 get_param_string(voice->features,"text_prepunctuation",NULL),
                 get_param_string(voice->features,"text_postpunctuation",NULL));
    if (ts == NULL)
    {
        cst_errmsg("failed to open file \"%s\" for ssml reading\n",
                   filename);
        return 1;
    }

    fp = get_param_int(voice->features,"file_start_position",0);
    if (fp > 0)
        ts_set_stream_pos(ts,fp);

    /* The empty output wave file, when one is needed, is created by     */
    /* flite_ssml_to_speech_ts().                                        */
    d = flite_ssml_to_speech_ts(ts,voice,outtype);

    ts_close(ts);

    return d;
}

/* Synthesize the SSML document held in the string TEXT with VOICE and   */
/* send the result to OUTTYPE ("play", "none", "stream", or the name of  */
/* a wave file to write).  Reading starts at the voice's                 */
/* "file_start_position" feature when that is greater than zero.         */
/* Returns the value of flite_ssml_to_speech_ts(), or 1 when no token    */
/* stream can be opened on the string (kept as is for existing           */
/* callers).                                                             */
float flite_ssml_text_to_speech(const char *text,
                                cst_voice *voice,
                                const char *outtype)
{
    cst_tokenstream *ts;
    int fp;
    float d;

    ts = ts_open_string(text,
                 get_param_string(voice->features,"text_whitespace",NULL),
                 get_param_string(voice->features,"text_singlecharsymbols",NULL),
                 get_param_string(voice->features,"text_prepunctuation",NULL),
                 get_param_string(voice->features,"text_postpunctuation",NULL));
    if (ts == NULL)
    {
        return 1;
    }

    fp = get_param_int(voice->features,"file_start_position",0);
    if (fp > 0)
        ts_set_stream_pos(ts,fp);

    /* The empty output wave file, when one is needed, is created by     */
    /* flite_ssml_to_speech_ts().                                        */
    d = flite_ssml_to_speech_ts(ts,voice,outtype);

    ts_close(ts);

    return d;
}