ref: c5bd2add37725041c1924132a8a4fd67548fb975
dir: /src/synth/cst_synth.c/
/*************************************************************************/
/*                                                                       */
/*                  Language Technologies Institute                      */
/*                     Carnegie Mellon University                        */
/*                        Copyright (c) 2000                             */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
/*               Date:  September 2000                                   */
/*************************************************************************/
/*                                                                       */
/*  General synthesis control                                            */
/*                                                                       */
/*************************************************************************/

#include "cst_hrg.h"
#include "cst_cart.h"
#include "cst_tokenstream.h"
#include "cst_utt_utils.h"
#include "cst_lexicon.h"
#include "cst_units.h"
#include "cst_synth.h"
#include "cst_phoneset.h"

CST_VAL_REGISTER_FUNCPTR(breakfunc,cst_breakfunc)

#ifndef SYNTH_MODULES_DEBUG
#define SYNTH_MODULES_DEBUG 0
#endif

/* DPRINTF(level, (fmt, args...)): debug trace, active only when          */
/* SYNTH_MODULES_DEBUG is greater than LEVEL.  Both forms expand to a      */
/* single statement so the macro is safe inside unbraced if/else bodies.   */
#if SYNTH_MODULES_DEBUG > 0
#define DPRINTF(l,x) \
    do { if (SYNTH_MODULES_DEBUG > (l)) cst_dbgmsg x; } while (0)
#else
#define DPRINTF(l,x) do { } while (0)
#endif

static cst_utterance *tokentosegs(cst_utterance *u);

/* Return non-zero when C is one of the characters in SET.                 */
/* Unlike a bare strchr() call this never matches the terminating NUL, so  */
/* testing the first character of an empty string yields "not in set".     */
static int char_in_set(const char *set, char c)
{
    return (c != '\0') && (cst_strchr(set, c) != NULL);
}

/* Remove a trailing stress digit from NAME, in place.                     */
/* Returns "1" or "0" (matching the digit removed), or NULL when NAME is   */
/* empty or does not end in '0'/'1'; NAME is unchanged in that case.       */
static const char *strip_stress_marker(char *name)
{
    int len = (int)cst_strlen(name);
    const char *stress = NULL;

    if (len == 0)
        return NULL;

    if (name[len-1] == '1')
        stress = "1";
    else if (name[len-1] == '0')
        stress = "0";

    if (stress)
        name[len-1] = '\0';

    return stress;
}

/* Each table below is an ordered, NULL-terminated list of synthesis       */
/* steps.  A step names a feature ("hookname") that may hold an override   */
/* function, plus a default function (possibly NULL) used otherwise; see   */
/* apply_synth_module().                                                   */

/* Full pipeline starting from raw input text. */
static const cst_synth_module synth_method_text[] = {
    { "tokenizer_func", default_tokenization },
    { "textanalysis_func", default_textanalysis },
    { "pos_tagger_func", default_pos_tagger },
    { "phrasing_func", default_phrasing },
    { "lexical_insertion_func", default_lexical_insertion },
    { "pause_insertion_func", default_pause_insertion },
    { "intonation_func", cart_intonation },
    { "postlex_func", NULL },
    { "duration_model_func", cart_duration },
    { "f0_model_func", NULL },
    { "wave_synth_func", NULL },
    { "post_synth_hook_func", NULL },
    { NULL, NULL }
};

/* Text through to segments only: stops after pause insertion. */
static const cst_synth_module synth_method_text2segs[] = {
    { "tokenizer_func", default_tokenization },
    { "textanalysis_func", default_textanalysis },
    { "pos_tagger_func", default_pos_tagger },
    { "phrasing_func", default_phrasing },
    { "lexical_insertion_func", default_lexical_insertion },
    { "pause_insertion_func", default_pause_insertion },
    { NULL, NULL }
};

/* Same as the text pipeline but without the tokenizer step. */
static const cst_synth_module synth_method_tokens[] = {
    { "textanalysis_func", default_textanalysis },
    { "pos_tagger_func", default_pos_tagger },
    { "phrasing_func", default_phrasing },
    { "lexical_insertion_func", default_lexical_insertion },
    { "pause_insertion_func", default_pause_insertion },
    { "intonation_func", cart_intonation },
    { "postlex_func", NULL },
    { "duration_model_func", cart_duration },
    { "f0_model_func", NULL },
    { "wave_synth_func", NULL },
    { "post_synth_hook_func", NULL },
    { NULL, NULL }
};

/* Input is a string of phone names: tokens become segments directly */
/* (tokentosegs) and a flat F0 contour is the default.               */
static const cst_synth_module synth_method_phones[] = {
    { "tokenizer_func", default_tokenization },
    { "textanalysis_func", tokentosegs },
    { "pos_tagger_func", default_pos_tagger },
    { "intonation_func", NULL },
    { "duration_model_func", cart_duration },
    { "f0_model_func", flat_prosody },
    { "wave_synth_func", NULL },
    { "post_synth_hook_func", NULL },
    { NULL, NULL }
};

cst_utterance *utt_synth_wave(cst_wave *w,cst_voice *v)
{
    /* Create an utterance with a wave in it as if we've synthesized it */
    /* Put it through streaming if that is required                     */
    cst_utterance *u;
    const cst_val *streaming_info_val;
    cst_audio_streaming_info *asi = NULL;

    u = new_utterance();
    utt_init(u,v);
    utt_set_wave(u,w);

    streaming_info_val = get_param_val(u->features,"streaming_info",NULL);
    if (streaming_info_val)
    {
        asi = val_audio_streaming_info(streaming_info_val);
        asi->utt = u;
    }

    if (!asi)
        return u;  /* no stream */

    /* Do streaming: hand the whole wave to the callback in one call */
    (*asi->asc)(w,0,w->num_samples,1,asi);

    return u;
}

cst_utterance *apply_synth_module(cst_utterance *u,
                                  const cst_synth_module *mod)
{
    /* Run one synthesis step.  A function stored in the utterance       */
    /* features under mod->hookname takes precedence over mod->defhook;  */
    /* with neither present the utterance is returned untouched.         */
    const cst_val *v;

    v = feat_val(u->features, mod->hookname);
    if (v)
        return (*val_uttfunc(v))(u);
    if (mod->defhook)
        return (*mod->defhook)(u);
    return u;
}

cst_utterance *apply_synth_method(cst_utterance *u,
                                  const cst_synth_module meth[])
{
    /* Run every step of METH in order (the table ends at the entry     */
    /* whose hookname is NULL).  Stops and returns NULL as soon as a    */
    /* step returns NULL.                                               */
    while (meth->hookname)
    {
        if ((u = apply_synth_module(u, meth)) == NULL)
            return NULL;
        ++meth;
    }

    return u;
}

cst_utterance *utt_init(cst_utterance *u, cst_voice *vox)
{
    /* Link the vox features into the utterance features so the voice  */
    /* features will be searched too (after the utt ones)              */
    feat_link_into(vox->features,u->features);
    feat_link_into(vox->ffunctions,u->ffunctions);

    /* Do the initialization function, if there is one */
    if (vox->utt_init)
        vox->utt_init(u, vox);

    return u;
}

/* Entry points: each runs one of the step tables defined above. */

cst_utterance *utt_synth(cst_utterance *u)
{
    return apply_synth_method(u, synth_method_text);
}

cst_utterance *utt_synth_tokens(cst_utterance *u)
{
    return apply_synth_method(u, synth_method_tokens);
}

cst_utterance *utt_synth_text2segs(cst_utterance *u)
{
    return apply_synth_method(u, synth_method_text2segs);
}

cst_utterance *utt_synth_phones(cst_utterance *u)
{
    return apply_synth_method(u, synth_method_phones);
}

cst_utterance *default_tokenization(cst_utterance *u)
{
    /* Split the utterance's input text into a new "Token" relation.    */
    /* The character classes come from the text_* parameters (NULL is   */
    /* passed to the tokenstream when a parameter is not set).          */
    const char *text,*token;
    cst_tokenstream *fd;
    cst_item *t;
    cst_relation *r;

    text = utt_input_text(u);
    r = utt_relation_create(u,"Token");
    fd = ts_open_string(text,
        get_param_string(u->features,"text_whitespace",NULL),
        get_param_string(u->features,"text_singlecharsymbols",NULL),
        get_param_string(u->features,"text_prepunctuation",NULL),
        get_param_string(u->features,"text_postpunctuation",NULL));

    while (!ts_eof(fd))
    {
        token = ts_get(fd);
        if (cst_strlen(token) > 0)   /* empty tokens are dropped */
        {
            t = relation_append(r,NULL);
            item_set_string(t,"name",token);
            item_set_string(t,"whitespace",fd->whitespace);
            item_set_string(t,"prepunctuation",fd->prepunctuation);
            item_set_string(t,"punc",fd->postpunctuation);
            item_set_int(t,"file_pos",fd->file_pos);
            item_set_int(t,"line_number",fd->line_number);
        }
    }

    ts_close(fd);

    return u;
}

cst_val *default_tokentowords(cst_item *i)
{
    /* Trivial token-to-words mapping: a one element list holding the  */
    /* token's name.  The caller owns (and deletes) the returned list.  */
    return cons_val(string_val(item_feat_string(i,"name")), NULL);
}

cst_utterance *default_textanalysis(cst_utterance *u)
{
    /* Expand every Token into one or more Words.  Each word becomes a  */
    /* daughter of its token and is appended to a new "Word" relation.  */
    /* The expansion uses the "tokentowords_func" feature when set,     */
    /* default_tokentowords() otherwise.                                */
    cst_item *t,*word;
    cst_relation *word_rel;
    cst_val *words;
    const cst_val *w;
    const cst_val *ttwv;

    word_rel = utt_relation_create(u,"Word");
    ttwv = feat_val(u->features, "tokentowords_func");

    for (t=relation_head(utt_relation(u,"Token")); t; t=item_next(t))
    {
        if (ttwv)
            words = (cst_val *)(*val_itemfunc(ttwv))(t);
        else
            words = default_tokentowords(t);

        for (w=words; w; w=val_cdr(w))
        {
            word = item_add_daughter(t,NULL);
            if (cst_val_consp(val_car(w)))
            {   /* Has extra features: (name . features) */
                item_set_string(word,"name",val_string(val_car(val_car(w))));
                feat_copy_into(val_features(val_cdr(val_car(w))),
                               item_feats(word));
            }
            else
                item_set_string(word,"name",val_string(val_car(w)));
            relation_append(word_rel,word);
        }
        delete_val(words);
    }

    return u;
}

cst_utterance *default_phrasing(cst_utterance *u)
{
    /* Group Words into a new "Phrase" relation.  With a               */
    /* "phrasing_cart" feature, a word for which the tree predicts     */
    /* "BB" ends the current phrase; without one, all words end up in  */
    /* a single phrase.                                                */
    cst_relation *r;
    cst_item *w, *p, *lp=NULL;
    const cst_val *v;
    cst_cart *phrasing_cart;

    r = utt_relation_create(u,"Phrase");
    if (feat_present(u->features,"phrasing_cart"))
        phrasing_cart = val_cart(feat_val(u->features,"phrasing_cart"));
    else
        phrasing_cart = NULL;

    for (p=NULL,w=relation_head(utt_relation(u,"Word")); w; w=item_next(w))
    {
        if (p == NULL)
        {   /* start a new phrase; lp remembers the most recent one */
            p = relation_append(r,NULL);
            lp = p;
            item_set_string(p,"name","B");
        }
        item_add_daughter(p,w);
        if (phrasing_cart)
        {
            v = cart_interpret(w,phrasing_cart);
            if (cst_streq(val_string(v),"BB"))
                p = NULL;
        }
    }

    /* The final phrase is renamed "BB" only when an earlier phrase exists */
    if (lp && item_prev(lp)) /* follow festival */
        item_set_string(lp,"name","BB");

    return u;
}

cst_utterance *default_pause_insertion(cst_utterance *u)
{
    /* Add initial silences and silence at each phrase break */
    const char *silence;
    const cst_item *w;
    cst_item *p, *s;

    silence = val_string(feat_val(u->features,"silence"));

    /* Insert initial silence */
    s = relation_head(utt_relation(u,"Segment"));
    if (s == NULL)
        s = relation_append(utt_relation(u,"Segment"),NULL);
    else
        s = item_prepend(s,NULL);
    item_set_string(s,"name",silence);

    for (p=relation_head(utt_relation(u,"Phrase")); p; p=item_next(p))
    {
        /* Walk back from the phrase's last word to the first word that */
        /* actually has a final segment, and put the silence after it.  */
        for (w = item_last_daughter(p); w; w=item_prev(w))
        {
            s = path_to_item(w,"R:SylStructure.daughtern.daughtern.R:Segment");
            if (s)
            {
                s = item_append(s,NULL);
                item_set_string(s,"name",silence);
                break;
            }
        }
    }

    return u;
}

cst_utterance *cart_intonation(cst_utterance *u)
{
    /* Predict "accent" and "endtone" for every Syllable from the       */
    /* int_cart_accents / int_cart_tones trees.  A prediction of "NONE" */
    /* leaves the corresponding feature unset.                          */
    cst_cart *accents, *tones;
    cst_item *s;
    const cst_val *v;

    if (feat_present(u->features,"no_intonation_accent_model"))
        return u;  /* not all languages have intonation models */

    accents = val_cart(feat_val(u->features,"int_cart_accents"));
    tones = val_cart(feat_val(u->features,"int_cart_tones"));

    for (s=relation_head(utt_relation(u,"Syllable")); s; s=item_next(s))
    {
        v = cart_interpret(s,accents);
        if (!cst_streq("NONE",val_string(v)))
            item_set_string(s,"accent",val_string(v));
        v = cart_interpret(s,tones);
        if (!cst_streq("NONE",val_string(v)))
            item_set_string(s,"endtone",val_string(v));
        DPRINTF(0,("word %s gpos %s stress %s ssyl_in %s ssyl_out %s accent %s endtone %s\n",
                   ffeature_string(s,"R:SylStructure.parent.name"),
                   ffeature_string(s,"R:SylStructure.parent.gpos"),
                   ffeature_string(s,"stress"),
                   ffeature_string(s,"ssyl_in"),
                   ffeature_string(s,"ssyl_out"),
                   ffeature_string(s,"accent"),
                   ffeature_string(s,"endtone")));
    }

    return u;
}

CST_VAL_REGISTER_TYPE_NODEL(dur_stats,dur_stats)

const dur_stat *phone_dur_stat(const dur_stats *ds,const char *ph)
{
    /* Linear search of the NULL-terminated table DS for phone PH.     */
    /* Falls back to the first entry when PH is not listed.            */
    int i;

    for (i=0; ds[i]; i++)
        if (cst_streq(ph,ds[i]->phone))
            return ds[i];

    return ds[0];
}

cst_utterance *cart_duration(cst_utterance *u)
{
    /* Set the "end" feature of every Segment.  The dur_cart tree      */
    /* yields a z-score that is mapped to a duration with the phone's  */
    /* mean/stddev, scaled by the stretch factors, and accumulated.    */
    cst_cart *dur_tree;
    cst_item *s;
    float zdur, dur_stretch, local_dur_stretch, dur;
    float end;
    dur_stats *ds;
    const dur_stat *stat;

    end = 0;

    if (feat_present(u->features,"no_segment_duration_model"))
        return u;  /* not all methods need segment durations */

    dur_tree = val_cart(feat_val(u->features,"dur_cart"));
    dur_stretch = get_param_float(u->features,"duration_stretch", 1.0);
    ds = val_dur_stats(feat_val(u->features,"dur_stats"));

    for (s=relation_head(utt_relation(u,"Segment")); s; s=item_next(s))
    {
        zdur = val_float(cart_interpret(s,dur_tree));
        stat = phone_dur_stat(ds,item_name(s));

        /* A non-zero token-level stretch multiplies the global one; */
        /* zero means "use the global stretch alone".                */
        local_dur_stretch =
            ffeature_float(s, "R:SylStructure.parent.parent."
                              "R:Token.parent.local_duration_stretch");
        if (local_dur_stretch)
            local_dur_stretch *= dur_stretch;
        else
            local_dur_stretch = dur_stretch;

        dur = local_dur_stretch * ((zdur*stat->stddev)+stat->mean);
        DPRINTF(0,("phone %s accent %s stress %s pdur %f stretch %f mean %f std %f dur %f\n",
                   item_name(s),
                   ffeature_string(s,"R:SylStructure.parent.accented"),
                   ffeature_string(s,"R:SylStructure.parent.stress"),
                   zdur, local_dur_stretch,
                   stat->mean, stat->stddev, dur));
        end += dur;
        item_set_float(s,"end",end);
    }

    return u;
}

cst_utterance *default_pos_tagger(cst_utterance *u)
{
    /* Set "pos" on every Word from the pos_tagger_cart tree; a no-op  */
    /* when that parameter is not set.                                 */
    cst_item *word;
    const cst_val *p;
    const cst_cart *tagger;

    p = get_param_val(u->features,"pos_tagger_cart",NULL);
    if (p == NULL)
        return u;
    tagger = val_cart(p);

    for (word=relation_head(utt_relation(u,"Word"));
         word; word=item_next(word))
    {
        p = cart_interpret(word,tagger);
        item_set_string(word,"pos",val_string(p));
    }

    return u;
}

cst_utterance *default_lexical_insertion(cst_utterance *u)
{
    /* Build the Syllable, SylStructure and Segment relations from the  */
    /* Word relation.  The phones for a word come, in priority order,   */
    /* from an explicit "phones" feature on its token, from the lexicon */
    /* addenda, or from a lexicon lookup.  A trailing '0'/'1' on a      */
    /* phone name is stripped and recorded as the syllable's stress.    */
    cst_item *word, *token;
    cst_relation *sylstructure,*seg,*syl;
    cst_lexicon *lex;
    const cst_val *lex_addenda;
    const cst_val *p, *wp;
    char *phone_name;
    const char *stress = "0";
    const char *marker;
    const char *pos;
    cst_val *phones;
    cst_item *ssword, *sssyl, *segitem, *sylitem, *seg_in_syl;
    const cst_val *vpn;
    int dp;   /* non-zero when PHONES was allocated here and must be freed */

    lex = val_lexicon(feat_val(u->features,"lexicon"));
    lex_addenda = lex->lex_addenda;   /* may be NULL */

    syl = utt_relation_create(u,"Syllable");
    sylstructure = utt_relation_create(u,"SylStructure");
    seg = utt_relation_create(u,"Segment");

    for (word=relation_head(utt_relation(u,"Word"));
         word; word=item_next(word))
    {
        ssword = relation_append(sylstructure,word);
        pos = ffeature_string(word,"pos");
        phones = NULL;
        wp = NULL;
        dp = 0;

        /* FIXME: need to make sure that textanalysis won't split
           tokens with explicit pronunciation (or that it will
           propagate such to words, then we can remove the path here) */
        token = item_parent(item_as(word, "Token"));
        if (item_feat_present(token, "phones"))
        {
            vpn = item_feat(token, "phones");
            if (cst_val_consp(vpn))
            {   /* for SAPI ?? */
                /* awb oct11: this seems wrong -- */
                /* not sure SAPI still (ever) works Oct11 */
                phones = (cst_val *) vpn;   /* borrowed, not freed here */
            }
            else
            {
                dp = 1;
                if (cst_streq(val_string(vpn),
                              ffeature_string(word,"p.R:Token.parent.phones")))
                    phones = NULL; /* Already given these phones */
                else
                    phones = val_readlist_string(val_string(vpn));
            }
        }
        else
        {
            wp = val_assoc_string(item_feat_string(word, "name"),lex_addenda);
            if (wp)
                phones = (cst_val *)val_cdr(val_cdr(wp));  /* borrowed */
            else
            {
                dp = 1;
                phones = lex_lookup(lex,item_feat_string(word,"name"),pos,
                                    u->features);
            }
        }

        for (sssyl=NULL,sylitem=NULL,p=phones; p; p=val_cdr(p))
        {
            if (sylitem == NULL)
            {   /* first phone of a new syllable */
                sylitem = relation_append(syl,NULL);
                sssyl = item_add_daughter(ssword,sylitem);
                stress = "0";
            }
            segitem = relation_append(seg,NULL);

            /* Work on a private copy so the stress digit can be cut off; */
            /* the helper is a no-op for an empty phone name.             */
            phone_name = cst_strdup(val_string(val_car(p)));
            marker = strip_stress_marker(phone_name);
            if (marker)
                stress = marker;
            item_set_string(segitem,"name",phone_name);
            seg_in_syl = item_add_daughter(sssyl,segitem);

            if ((lex->syl_boundary)(seg_in_syl,val_cdr(p)))
            {   /* syllable complete: record the last stress seen in it */
                sylitem = NULL;
                if (sssyl)
                    item_set_string(sssyl,"stress",stress);
            }
            cst_free(phone_name);
        }

        if (dp)
        {
            delete_val(phones);
            phones = NULL;
        }
    }

    return u;
}

/* Dummy F0 modelling for phones, copied directly from us_f0_model.c */
cst_utterance *flat_prosody(cst_utterance *u)
{
    /* F0 target model: two targets in a new "Target" relation, one at */
    /* position 0 with mean+stddev and one at the "end" of the last    */
    /* Segment with mean-stddev.                                       */
    cst_item *s,*t;
    cst_relation *targ_rel;
    float mean, stddev;

    targ_rel = utt_relation_create(u,"Target");
    mean = get_param_float(u->features,"target_f0_mean", 100.0);
    mean *= get_param_float(u->features,"f0_shift", 1.0);
    stddev = get_param_float(u->features,"target_f0_stddev", 12.0);

    t = relation_append(targ_rel,NULL);
    item_set_float(t,"pos",0.0);
    item_set_float(t,"f0",mean+stddev);

    /* NOTE(review): the tail segment is used unchecked; this assumes */
    /* the Segment relation is not empty -- confirm against callers.  */
    s = relation_tail(utt_relation(u,"Segment"));
    t = relation_append(targ_rel,NULL);
    item_set_float(t,"pos",item_feat_float(s,"end"));
    item_set_float(t,"f0",mean-stddev);

    return u;
}

static cst_utterance *tokentosegs(cst_utterance *u)
{
    /* Treat every Token as a phone name and build Segment, Syllable,  */
    /* Word and SylStructure relations from them.  All phones hang off */
    /* a single word named "phonestring".  A "-" token ends the        */
    /* current syllable; a trailing '0'/'1' on a phone sets the stress */
    /* of the current syllable.  Unknown phones are reported through   */
    /* cst_errmsg()/cst_error().                                       */
    cst_item *t;
    cst_relation *seg, *syl, *sylstructure, *word;
    cst_item *sylitem, *sylstructureitem, *worditem, *sssyl;
    cst_phoneset *ps;

    ps = val_phoneset(utt_feat_val(u, "phoneset"));
    /* Just copy tokens into the Segment relation */
    seg = utt_relation_create(u, "Segment");
    syl = utt_relation_create(u, "Syllable");
    word = utt_relation_create(u, "Word");
    sylstructure = utt_relation_create(u, "SylStructure");
    sssyl = sylitem = worditem = sylstructureitem = NULL;

    for (t = relation_head(utt_relation(u, "Token")); t; t = item_next(t))
    {
        const char *pname = item_feat_string(t, "name");
        char *name = cst_strdup(pname);   /* private copy, edited below */
        const char *marker;
        int unknown_phone = 0;

        if (worditem == NULL)
        {
            worditem = relation_append(word,NULL);
            item_set_string(worditem, "name", "phonestring");
            sylstructureitem = relation_append(sylstructure,worditem);
        }
        if (sylitem == NULL)
        {
            sylitem = relation_append(syl,NULL);
            sssyl = item_add_daughter(sylstructureitem,sylitem);
        }

        marker = strip_stress_marker(name);   /* safe for empty names */
        if (marker)
            item_set_string(sssyl,"stress",marker);

        if (cst_streq(name,"-"))
        {
            sylitem = NULL;  /* syllable break */
        }
        else if (phone_id(ps, name) == -1)
        {
            unknown_phone = 1;
        }
        else
        {
            /* Only real phones get a Segment item, so syllable-break */
            /* markers do not leave nameless segments behind.         */
            cst_item *segitem = relation_append(seg, NULL);
            item_add_daughter(sssyl,segitem);
            item_set_string(segitem, "name", name);
        }

        /* Release the copy before raising the error so it is not leaked */
        /* should cst_error() not return here.                           */
        cst_free(name);

        if (unknown_phone)
        {
            cst_errmsg("Phone `%s' not in phoneset\n", pname);
            cst_error();
        }
    }

    return u;
}

int default_utt_break(cst_tokenstream *ts,
                      const char *token,
                      cst_relation *tokens)
{
    /* This is the default utt break function, languages may override   */
    /* this.  This will be ok for some latin based languages.           */
    /* Returns TRUE when an utterance boundary falls between the last   */
    /* item of TOKENS and the upcoming TOKEN (whose preceding           */
    /* whitespace is ts->whitespace), FALSE otherwise.                  */
    /* The last item of TOKENS is used unchecked, so TOKENS is expected */
    /* to be non-empty.                                                 */
    static const char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    static const char lower[] = "abcdefghijklmnopqrstuvwxyz";
    const char *postpunct = item_feat_string(relation_tail(tokens), "punc");
    const char *ltoken = item_name(relation_tail(tokens));
    const int ltoken_len = (int)cst_strlen(ltoken);
    /* An empty TOKEN starts with neither a capital nor a lowercase letter */
    const int next_is_upper = char_in_set(upper, token[0]);
    const int next_is_lower = char_in_set(lower, token[0]);
    /* Last word looks like an abbreviation: ends in a capital, or is */
    /* shorter than four characters and starts with a capital.        */
    const int last_is_abbrev =
        (ltoken_len > 0) &&
        (char_in_set(upper, ltoken[ltoken_len-1]) ||
         ((ltoken_len < 4) && char_in_set(upper, ltoken[0])));

    if (cst_strchr(ts->whitespace,'\n') != cst_strrchr(ts->whitespace,'\n'))
        /* contains two new lines */
        return TRUE;
    /* Well, this is a little specific isn't it. */
    else if ((cst_streq(ltoken,"Yahoo") ||
              cst_streq(ltoken,"YAHOO") ||
              cst_streq(ltoken,"yahoo")) &&
             cst_strchr(postpunct,'!') &&
             next_is_lower)
        return FALSE;
    else if (cst_strchr(postpunct,':') ||
             cst_strchr(postpunct,'?') ||
             cst_strchr(postpunct,'!'))
        return TRUE;
    else if (cst_strchr(postpunct,'.') &&
             (cst_strlen(ts->whitespace) > 1) &&
             next_is_upper)
        return TRUE;
    else if (cst_strchr(postpunct,'.') &&
             /* next word starts with a capital */
             next_is_upper &&
             /* last word isn't an abbreviation */
             !last_is_abbrev)
        return TRUE;
    else
        return FALSE;
}