shithub: flite

ref: b66315e2c408ce553082f722b0a9587a5ab5460e
dir: /src/synth/cst_synth.c/

View raw version
/*************************************************************************/
/*                                                                       */
/*                  Language Technologies Institute                      */
/*                     Carnegie Mellon University                        */
/*                         Copyright (c) 2000                            */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
/*               Date:  September 2000                                   */
/*************************************************************************/
/*                                                                       */
/*  General synthesis control                                            */
/*                                                                       */
/*************************************************************************/

#include "cst_hrg.h"
#include "cst_cart.h"
#include "cst_tokenstream.h"
#include "cst_utt_utils.h"
#include "cst_lexicon.h"
#include "cst_units.h"
#include "cst_synth.h"
#include "cst_phoneset.h"

/* Make cst_breakfunc function pointers storable in a cst_val under the */
/* name "breakfunc".  NOTE(review): the accessors this generates are    */
/* determined by CST_VAL_REGISTER_FUNCPTR, which is defined outside     */
/* this file -- confirm their names there.                              */
CST_VAL_REGISTER_FUNCPTR(breakfunc,cst_breakfunc)

/* Debug tracing for the synthesis modules.                              */
/*                                                                       */
/* SYNTH_MODULES_DEBUG may be defined by the build; it defaults to 0     */
/* (tracing compiled out).  DPRINTF(l,x) prints when the debug level is  */
/* greater than l; x is a fully parenthesized cst_dbgmsg argument list,  */
/* e.g. DPRINTF(0,("value %d\n",n));                                     */
/*                                                                       */
/* Both forms expand to a single do { } while (0) statement so that a    */
/* DPRINTF used as the body of an unbraced if cannot capture a following */
/* else.                                                                 */
#ifndef SYNTH_MODULES_DEBUG
#define SYNTH_MODULES_DEBUG 0
#endif

#if SYNTH_MODULES_DEBUG > 0
#define DPRINTF(l,x) do { if (SYNTH_MODULES_DEBUG > (l)) cst_dbgmsg x; } while (0)
#else
#define DPRINTF(l,x) do { } while (0)
#endif

/* Forward declaration: tokentosegs() is referenced by the              */
/* synth_method_phones table below and defined later in this file.      */
static cst_utterance *tokentosegs(cst_utterance *u);

/* Module sequence run by utt_synth().                                   */
/* Each entry pairs a feature name with a default function.  For each    */
/* entry apply_synth_module() calls the function stored under that name  */
/* in the utterance features if there is one, otherwise the default; a   */
/* NULL default means the step does nothing unless such a feature is     */
/* set.  The table ends at the { NULL, NULL } entry.                     */
static const cst_synth_module synth_method_text[] = {
    { "tokenizer_func", default_tokenization },
    { "textanalysis_func", default_textanalysis },
    { "pos_tagger_func", default_pos_tagger },
    { "phrasing_func", default_phrasing },
    { "lexical_insertion_func", default_lexical_insertion },
    { "pause_insertion_func", default_pause_insertion },
    { "intonation_func", cart_intonation },
    { "postlex_func", NULL },
    { "duration_model_func", cart_duration },
    { "f0_model_func", NULL },
    { "wave_synth_func", NULL },
    { "post_synth_hook_func", NULL },
    { NULL, NULL }
};

/* Module sequence run by utt_synth_text2segs(): the same first six      */
/* steps as synth_method_text, stopping after pause insertion (no        */
/* intonation, duration, F0 or waveform steps).                          */
/* The table ends at the { NULL, NULL } entry.                           */
static const cst_synth_module synth_method_text2segs[] = {
    { "tokenizer_func", default_tokenization },
    { "textanalysis_func", default_textanalysis },
    { "pos_tagger_func", default_pos_tagger },
    { "phrasing_func", default_phrasing },
    { "lexical_insertion_func", default_lexical_insertion },
    { "pause_insertion_func", default_pause_insertion },
    { NULL, NULL }
};

/* Module sequence run by utt_synth_tokens(): identical to               */
/* synth_method_text except that there is no "tokenizer_func" step, so   */
/* the Token relation read by default_textanalysis() has to exist        */
/* before this method is applied.                                        */
/* The table ends at the { NULL, NULL } entry.                           */
static const cst_synth_module synth_method_tokens[] = {
    { "textanalysis_func", default_textanalysis },
    { "pos_tagger_func", default_pos_tagger },
    { "phrasing_func", default_phrasing },
    { "lexical_insertion_func", default_lexical_insertion },
    { "pause_insertion_func", default_pause_insertion },
    { "intonation_func", cart_intonation },
    { "postlex_func", NULL },
    { "duration_model_func", cart_duration },
    { "f0_model_func", NULL },
    { "wave_synth_func", NULL },
    { "post_synth_hook_func", NULL },
    { NULL, NULL }
};

/* Module sequence run by utt_synth_phones().                            */
/* The text-analysis step defaults to tokentosegs(), which turns each    */
/* token into a segment instead of expanding it into words, and the F0   */
/* step defaults to flat_prosody().  There are no phrasing, lexical      */
/* insertion or pause insertion steps in this method.                    */
/* The table ends at the { NULL, NULL } entry.                           */
static const cst_synth_module synth_method_phones[] = {
    { "tokenizer_func", default_tokenization },
    { "textanalysis_func", tokentosegs },
    { "pos_tagger_func", default_pos_tagger },
    { "intonation_func", NULL },
    { "duration_model_func", cart_duration },
    { "f0_model_func", flat_prosody },
    { "wave_synth_func", NULL },
    { "post_synth_hook_func", NULL },
    { NULL, NULL }
};

cst_utterance *utt_synth_wave(cst_wave *w,cst_voice *v)
{
    /* Wrap an existing waveform w in a new utterance initialized from   */
    /* voice v, as if the wave had just been synthesized.  If a          */
    /* "streaming_info" parameter is set, the whole wave is passed to    */
    /* its streaming callback in one call.  The callback's return value  */
    /* is not examined.  Returns the new utterance.                      */
    cst_utterance *utt = new_utterance();
    const cst_val *stream_val;
    cst_audio_streaming_info *stream;

    utt_init(utt,v);
    utt_set_wave(utt,w);

    stream_val = get_param_val(utt->features,"streaming_info",NULL);
    if (stream_val == NULL)
        return utt;  /* streaming is not required */

    stream = val_audio_streaming_info(stream_val);
    stream->utt = utt;

    /* One call covering samples 0 .. num_samples.  NOTE(review): the   */
    /* fourth argument (1) is presumably the "last chunk" flag --       */
    /* confirm against the streaming callback's declaration.            */
    (*stream->asc)(w,0,w->num_samples,1,stream);

    return utt;
}

cst_utterance *apply_synth_module(cst_utterance *u,
                                  const cst_synth_module *mod)
{
    /* Run one synthesis step on u.  A function stored in the utterance */
    /* features under mod->hookname takes precedence; otherwise the     */
    /* module's default hook is used; with neither, u is returned       */
    /* untouched.  The result is whatever the chosen function returns.  */
    const cst_val *override = feat_val(u->features, mod->hookname);

    if (override != NULL)
        return (*val_uttfunc(override))(u);

    return (mod->defhook != NULL) ? (*mod->defhook)(u) : u;
}

cst_utterance *apply_synth_method(cst_utterance *u,
                                  const cst_synth_module meth[])
{
    /* Apply each module of meth in order, up to the entry whose         */
    /* hookname is NULL.  The utterance returned by one step is the      */
    /* input of the next.  Stops and returns NULL as soon as a step      */
    /* returns NULL; otherwise returns the final utterance.              */
    const cst_synth_module *step;

    for (step = meth; step->hookname != NULL; step++)
    {
        u = apply_synth_module(u, step);
        if (u == NULL)
            return NULL;
    }

    return u;
}

cst_utterance *utt_init(cst_utterance *u, cst_voice *vox)
{
    /* Attach voice vox to utterance u and return u.                     */

    /* The voice's features and feature functions are linked into the   */
    /* utterance's own, so lookups on the utterance also search the     */
    /* voice (after the utterance's entries).                           */
    feat_link_into(vox->features,u->features);
    feat_link_into(vox->ffunctions,u->ffunctions);

    /* Voice-specific initialization is optional; its return value is   */
    /* not used.                                                        */
    if (vox->utt_init != NULL)
        (*vox->utt_init)(u, vox);

    return u;
}

cst_utterance *utt_synth(cst_utterance *u)
{
    /* Run the synth_method_text module sequence (starting at           */
    /* tokenization) on u.  Returns NULL if any step returns NULL.      */
    cst_utterance *result;

    result = apply_synth_method(u, synth_method_text);
    return result;
}

cst_utterance *utt_synth_tokens(cst_utterance *u)
{
    /* Run the synth_method_tokens module sequence on u; it has no      */
    /* tokenizer step.  Returns NULL if any step returns NULL.          */
    cst_utterance *result;

    result = apply_synth_method(u, synth_method_tokens);
    return result;
}

cst_utterance *utt_synth_text2segs(cst_utterance *u)
{
    /* Run the synth_method_text2segs module sequence on u, which ends  */
    /* after pause insertion.  Returns NULL if any step returns NULL.   */
    cst_utterance *result;

    result = apply_synth_method(u, synth_method_text2segs);
    return result;
}

cst_utterance *utt_synth_phones(cst_utterance *u)
{
    /* Run the synth_method_phones module sequence on u, in which       */
    /* tokens are turned directly into segments.  Returns NULL if any   */
    /* step returns NULL.                                               */
    cst_utterance *result;

    result = apply_synth_method(u, synth_method_phones);
    return result;
}

cst_utterance *default_tokenization(cst_utterance *u)
{
    /* Split the utterance's input text into items of a new "Token"     */
    /* relation.  The four character classes passed to the token        */
    /* stream come from the "text_*" parameters (NULL when unset).      */
    /* Every non-empty token is stored with its name, the whitespace    */
    /* and punctuation the stream reports for it, and its file          */
    /* position and line number.  Returns u.                            */
    const char *text;
    const char *ws_chars, *single_chars, *prepunc_chars, *postpunc_chars;
    const char *tok;
    cst_relation *token_rel;
    cst_tokenstream *ts;
    cst_item *item;

    text = utt_input_text(u);
    token_rel = utt_relation_create(u,"Token");

    ws_chars = get_param_string(u->features,"text_whitespace",NULL);
    single_chars = get_param_string(u->features,"text_singlecharsymbols",NULL);
    prepunc_chars = get_param_string(u->features,"text_prepunctuation",NULL);
    postpunc_chars = get_param_string(u->features,"text_postpunctuation",NULL);

    ts = ts_open_string(text,ws_chars,single_chars,prepunc_chars,postpunc_chars);

    while (!ts_eof(ts))
    {
        tok = ts_get(ts);
        if (!(cst_strlen(tok) > 0))
            continue;  /* empty tokens are not recorded */

        item = relation_append(token_rel,NULL);
        item_set_string(item,"name",tok);
        item_set_string(item,"whitespace",ts->whitespace);
        item_set_string(item,"prepunctuation",ts->prepunctuation);
        item_set_string(item,"punc",ts->postpunctuation);
        item_set_int(item,"file_pos",ts->file_pos);
        item_set_int(item,"line_number",ts->line_number);
    }

    ts_close(ts);

    return u;
}

cst_val *default_tokentowords(cst_item *i)
{
    /* Default token-to-words mapping: a new one-element list whose     */
    /* only member is a string val holding the token's "name" feature.  */
    /* default_textanalysis() deletes the list after using it.          */
    const char *token_name = item_feat_string(i,"name");
    cst_val *word = string_val(token_name);

    return cons_val(word, NULL);
}

cst_utterance *default_textanalysis(cst_utterance *u)
{
    /* Expand every item of the "Token" relation into words.  Each      */
    /* word becomes a daughter of its token and is appended to a new    */
    /* "Word" relation.  The expansion comes from the function stored   */
    /* in the "tokentowords_func" feature when one is set, otherwise    */
    /* from default_tokentowords().  Returns u.                         */
    cst_relation *words_rel;
    const cst_val *expander;
    cst_item *tok;

    words_rel = utt_relation_create(u,"Word");
    expander = feat_val(u->features, "tokentowords_func");

    for (tok = relation_head(utt_relation(u,"Token"));
         tok != NULL;
         tok = item_next(tok))
    {
        cst_val *expansion;
        const cst_val *cell;

        if (expander != NULL)
            expansion = (cst_val *)(*val_itemfunc(expander))(tok);
        else
            expansion = default_tokentowords(tok);

        for (cell = expansion; cell != NULL; cell = val_cdr(cell))
        {
            cst_item *new_word = item_add_daughter(tok,NULL);
            const cst_val *entry = val_car(cell);

            if (cst_val_consp(entry))
            {
                /* Entry is itself a list: its car is the word name,    */
                /* its cdr carries extra features copied onto the word. */
                item_set_string(new_word,"name",val_string(val_car(entry)));
                feat_copy_into(val_features(val_cdr(entry)),
                               item_feats(new_word));
            }
            else
            {
                item_set_string(new_word,"name",val_string(entry));
            }
            relation_append(words_rel,new_word);
        }

        /* The expansion list is owned here once it has been returned. */
        delete_val(expansion);
    }

    return u;
}

cst_utterance *default_phrasing(cst_utterance *u)
{
    /* Group the items of the "Word" relation into items of a new       */
    /* "Phrase" relation.  Each phrase is created with name "B" and     */
    /* receives consecutive words as daughters.  When a "phrasing_cart" */
    /* feature is present, a word for which the tree answers "BB"       */
    /* closes the current phrase.  Without the tree, all words end up   */
    /* in a single phrase.  Returns u.                                  */
    cst_relation *phrase_rel;
    cst_cart *break_tree = NULL;
    cst_item *word;
    cst_item *current = NULL;   /* phrase currently being filled        */
    cst_item *newest = NULL;    /* most recently created phrase         */

    phrase_rel = utt_relation_create(u,"Phrase");
    if (feat_present(u->features,"phrasing_cart"))
        break_tree = val_cart(feat_val(u->features,"phrasing_cart"));

    word = relation_head(utt_relation(u,"Word"));
    while (word != NULL)
    {
        if (current == NULL)
        {
            current = relation_append(phrase_rel,NULL);
            newest = current;
            item_set_string(current,"name","B");
        }
        item_add_daughter(current,word);

        if (break_tree != NULL &&
            cst_streq(val_string(cart_interpret(word,break_tree)),"BB"))
            current = NULL;  /* next word opens a new phrase */

        word = item_next(word);
    }

    /* follow festival: the final phrase is renamed "BB", but only when */
    /* it is not the only phrase                                        */
    if (newest != NULL && item_prev(newest) != NULL)
        item_set_string(newest,"name","BB");

    return u;
}

cst_utterance *default_pause_insertion(cst_utterance *u)
{
    /* Insert silence segments, named by the "silence" feature: one at  */
    /* the very start of the "Segment" relation, and one after the      */
    /* final segment of every phrase.  Returns u.                       */
    const char *sil_name;
    const cst_item *word;
    cst_item *phrase, *first_seg, *last_seg, *pause;

    sil_name = val_string(feat_val(u->features,"silence"));

    /* Initial silence: goes before the first segment, or becomes the   */
    /* only segment when the relation is empty.                         */
    first_seg = relation_head(utt_relation(u,"Segment"));
    if (first_seg != NULL)
        pause = item_prepend(first_seg,NULL);
    else
        pause = relation_append(utt_relation(u,"Segment"),NULL);
    item_set_string(pause,"name",sil_name);

    for (phrase = relation_head(utt_relation(u,"Phrase"));
         phrase != NULL;
         phrase = item_next(phrase))
    {
        /* Walk backwards from the phrase's last daughter until one     */
        /* yields a final segment, and put the pause after that         */
        /* segment.  Daughters that yield none are skipped.             */
        word = item_last_daughter(phrase);
        while (word != NULL)
        {
            last_seg = path_to_item(word,"R:SylStructure.daughtern.daughtern.R:Segment");
            if (last_seg != NULL)
            {
                pause = item_append(last_seg,NULL);
                item_set_string(pause,"name",sil_name);
                break;
            }
            word = item_prev(word);
        }
    }

    return u;
}

cst_utterance *cart_intonation(cst_utterance *u)
{
    /* Predict an accent and an end tone for every item of the          */
    /* "Syllable" relation using the trees in the "int_cart_accents"    */
    /* and "int_cart_tones" features.  A prediction of "NONE" leaves    */
    /* the corresponding syllable feature unset.  Does nothing when     */
    /* the "no_intonation_accent_model" feature is present.  Returns u. */
    cst_cart *accent_tree, *tone_tree;
    cst_item *syl;
    const char *label;

    /* not all languages have intonation models */
    if (feat_present(u->features,"no_intonation_accent_model"))
        return u;

    accent_tree = val_cart(feat_val(u->features,"int_cart_accents"));
    tone_tree = val_cart(feat_val(u->features,"int_cart_tones"));

    for (syl = relation_head(utt_relation(u,"Syllable"));
         syl != NULL;
         syl = item_next(syl))
    {
        label = val_string(cart_interpret(syl,accent_tree));
        if (!cst_streq("NONE",label))
            item_set_string(syl,"accent",label);

        label = val_string(cart_interpret(syl,tone_tree));
        if (!cst_streq("NONE",label))
            item_set_string(syl,"endtone",label);

        DPRINTF(0,("word %s gpos %s stress %s ssyl_in %s ssyl_out %s accent %s endtone %s\n",
                   ffeature_string(syl,"R:SylStructure.parent.name"),
                   ffeature_string(syl,"R:SylStructure.parent.gpos"),
                   ffeature_string(syl,"stress"),
                   ffeature_string(syl,"ssyl_in"),
                   ffeature_string(syl,"ssyl_out"),
                   ffeature_string(syl,"accent"),
                   ffeature_string(syl,"endtone")));
    }

    return u;
}

/* Make dur_stats storable in a cst_val; cart_duration() below reads    */
/* the table back with val_dur_stats().  NOTE(review): "NODEL"          */
/* presumably means the val does not free the table when it is          */
/* deleted -- confirm against the macro definition.                     */
CST_VAL_REGISTER_TYPE_NODEL(dur_stats,dur_stats)

const dur_stat *phone_dur_stat(const dur_stats *ds,const char *ph)
{
    /* Find the duration statistics for phone ph in ds, an array that   */
    /* ends at its first null entry.  A phone that is not listed gets   */
    /* the array's first entry instead (null if the array is empty).    */
    const dur_stats *entry;

    for (entry = ds; *entry; entry++)
    {
        if (cst_streq(ph,(*entry)->phone))
            return *entry;
    }

    return ds[0];
}

cst_utterance *cart_duration(cst_utterance *u)
{
    /* Assign an "end" time to every item of the "Segment" relation.    */
    /* The "dur_cart" tree predicts a z-score per segment, which is     */
    /* scaled by the phone's stddev and mean from "dur_stats" and then  */
    /* by the stretch factor.  "end" is the running sum of the          */
    /* resulting durations.  Does nothing when the                      */
    /* "no_segment_duration_model" feature is present.  Returns u.      */
    cst_cart *tree;
    dur_stats *stats;
    const dur_stat *ph_stat;
    cst_item *seg;
    float zscore, global_stretch, stretch, dur;
    float end = 0;

    /* not all methods need segment durations */
    if (feat_present(u->features,"no_segment_duration_model"))
        return u;

    tree = val_cart(feat_val(u->features,"dur_cart"));
    global_stretch = get_param_float(u->features,"duration_stretch", 1.0);
    stats = val_dur_stats(feat_val(u->features,"dur_stats"));

    for (seg = relation_head(utt_relation(u,"Segment"));
         seg != NULL;
         seg = item_next(seg))
    {
        zscore = val_float(cart_interpret(seg,tree));
        ph_stat = phone_dur_stat(stats,item_name(seg));

        /* A non-zero stretch on the segment's token multiplies the     */
        /* global stretch; zero means only the global one applies.      */
        stretch = ffeature_float(seg, "R:SylStructure.parent.parent."
                                 "R:Token.parent.local_duration_stretch");
        stretch = stretch ? (stretch * global_stretch) : global_stretch;

        dur = stretch * ((zscore*ph_stat->stddev)+ph_stat->mean);
        DPRINTF(0,("phone %s accent %s stress %s pdur %f stretch %f mean %f std %f dur %f\n",
                   item_name(seg),
                   ffeature_string(seg,"R:SylStructure.parent.accented"),
                   ffeature_string(seg,"R:SylStructure.parent.stress"),
                   zscore, stretch, ph_stat->mean,
                   ph_stat->stddev, dur));
        end += dur;
        item_set_float(seg,"end",end);
    }

    return u;
}

cst_utterance *default_pos_tagger(cst_utterance *u)
{
    /* Set the "pos" feature of every item of the "Word" relation to    */
    /* the value predicted by the tree in the "pos_tagger_cart"         */
    /* parameter.  Does nothing when that parameter is not set.         */
    /* Returns u.                                                       */
    const cst_val *tagger_val;
    const cst_val *tag;
    const cst_cart *tree;
    cst_item *w;

    tagger_val = get_param_val(u->features,"pos_tagger_cart",NULL);
    if (tagger_val == NULL)
        return u;
    tree = val_cart(tagger_val);

    w = relation_head(utt_relation(u,"Word"));
    while (w != NULL)
    {
        tag = cart_interpret(w,tree);
        item_set_string(w,"pos",val_string(tag));
        w = item_next(w);
    }

    return u;
}

cst_utterance *default_lexical_insertion(cst_utterance *u)
{
    /* Build the "Syllable", "SylStructure" and "Segment" relations     */
    /* from the "Word" relation.  Returns u.                            */
    /*                                                                  */
    /* The phones of each word come from the first of these sources     */
    /* that applies:                                                    */
    /*   1. a "phones" feature on the word's parent token,              */
    /*   2. the addenda of the lexicon in the "lexicon" feature,        */
    /*   3. lex_lookup() in that lexicon.                               */
    /* Phones are grouped into syllables with lex->syl_boundary().  A   */
    /* phone name ending in '1' or '0' has that digit removed and       */
    /* recorded as stress; when a syllable boundary is reached the      */
    /* most recently recorded value becomes the syllable's "stress".    */
    cst_item *word, *token;
    cst_relation *sylstructure,*seg,*syl;
    cst_lexicon *lex;
    const cst_val *lex_addenda;
    const cst_val *p, *wp = NULL;
    char *phone_name;
    int phone_len;
    const char *stress = "0";
    const char *pos;
    cst_val *phones;
    cst_item *ssword, *sssyl, *segitem, *sylitem, *seg_in_syl;
    const cst_val *vpn;
    int dp = 0;

    lex = val_lexicon(feat_val(u->features,"lexicon"));
    lex_addenda = lex->lex_addenda;   /* may be NULL */

    syl = utt_relation_create(u,"Syllable");
    sylstructure = utt_relation_create(u,"SylStructure");
    seg = utt_relation_create(u,"Segment");

    for (word=relation_head(utt_relation(u,"Word"));
         word; word=item_next(word))
    {
        ssword = relation_append(sylstructure,word);
        pos = ffeature_string(word,"pos");
        phones = NULL;
        wp = NULL;
        /* dp is set when the phone list is created here (by           */
        /* val_readlist_string() or lex_lookup()) and must be deleted  */
        /* after use; lists taken from the token feature or from the   */
        /* addenda belong to someone else.                             */
        dp = 0;

        /* FIXME: need to make sure that textanalysis won't split
           tokens with explicit pronunciation (or that it will
           propagate such to words, then we can remove the path here) */
        token = item_parent(item_as(word, "Token"));
        if (item_feat_present(token, "phones"))
        {
            vpn = item_feat(token, "phones");
            if (cst_val_consp(vpn))
            {   /* for SAPI ?? */
                /* awb oct11: this seems wrong -- */
                /* not sure SAPI still (ever) works Oct11 */
                phones = (cst_val *) vpn;
            }
            else
            {
                dp = 1;
                if (cst_streq(val_string(vpn),
                              ffeature_string(word,"p.R:Token.parent.phones")))
                    phones = NULL; /* Already given these phones */
                else
                    phones = val_readlist_string(val_string(vpn));
            }
        }
        else
        {
            wp = val_assoc_string(item_feat_string(word, "name"),lex_addenda);
            if (wp)
                phones = (cst_val *)val_cdr(val_cdr(wp));
            else
            {
                dp = 1;
                phones = lex_lookup(lex,item_feat_string(word,"name"),pos,
                                    u->features);
            }
        }

        for (sssyl=NULL,sylitem=NULL,p=phones; p; p=val_cdr(p))
        {
            if (sylitem == NULL)
            {   /* first phone of a new syllable */
                sylitem = relation_append(syl,NULL);
                sssyl = item_add_daughter(ssword,sylitem);
                stress = "0";
            }
            segitem = relation_append(seg,NULL);
            phone_name = cst_strdup(val_string(val_car(p)));

            /* Strip a trailing stress digit.  The length is checked   */
            /* first so that an empty phone string is never indexed    */
            /* at position -1.                                         */
            phone_len = (int)cst_strlen(phone_name);
            if (phone_len > 0)
            {
                if (phone_name[phone_len-1] == '1')
                {
                    stress = "1";
                    phone_name[phone_len-1] = '\0';
                }
                else if (phone_name[phone_len-1] == '0')
                {
                    stress = "0";
                    phone_name[phone_len-1] = '\0';
                }
            }
            item_set_string(segitem,"name",phone_name);
            seg_in_syl = item_add_daughter(sssyl,segitem);

            /* The lexicon decides, from this segment and the phones   */
            /* still to come, whether the syllable ends here.          */
            if ((lex->syl_boundary)(seg_in_syl,val_cdr(p)))
            {
                sylitem = NULL;
                if (sssyl)
                    item_set_string(sssyl,"stress",stress);
            }
            cst_free(phone_name);
        }
        if (dp)
        {
            delete_val(phones);
            phones = NULL;
        }
    }

    return u;
}

/* Dummy F0 modelling for phones, copied directly from us_f0_model.c */
cst_utterance *flat_prosody(cst_utterance *u)
{
    /* F0 target model: create a "Target" relation holding exactly two  */
    /* targets, one at position 0.0 with f0 = mean+stddev and one at    */
    /* the "end" of the last segment with f0 = mean-stddev.             */
    /* mean is "target_f0_mean" (default 100.0) multiplied by           */
    /* "f0_shift" (default 1.0); stddev is "target_f0_stddev" (default  */
    /* 12.0).  The last item of the "Segment" relation must already     */
    /* carry an "end" feature (cart_duration() sets one).  Returns u.   */
    cst_relation *targ_rel, *seg_rel;
    cst_item *last_seg, *t;
    float mean, stddev;

    targ_rel = utt_relation_create(u,"Target");
    mean = get_param_float(u->features,"target_f0_mean", 100.0);
    mean *= get_param_float(u->features,"f0_shift", 1.0);
    stddev = get_param_float(u->features,"target_f0_stddev", 12.0);

    /* Looked up once; only the tail of the relation is needed. */
    seg_rel = utt_relation(u,"Segment");

    t = relation_append(targ_rel,NULL);
    item_set_float(t,"pos",0.0);
    item_set_float(t,"f0",mean+stddev);

    last_seg = relation_tail(seg_rel);
    t = relation_append(targ_rel,NULL);
    item_set_float(t,"pos",item_feat_float(last_seg,"end"));
    item_set_float(t,"f0",mean-stddev);

    return u;
}

static cst_utterance *tokentosegs(cst_utterance *u)
{
    /* Text-analysis step for phone-string input: every token is taken  */
    /* to be a phone name.  Creates the "Segment", "Syllable", "Word"   */
    /* and "SylStructure" relations, with a single word named           */
    /* "phonestring" holding all syllables.  A token "-" starts a new   */
    /* syllable.  A trailing '1' or '0' on a token is removed and       */
    /* stored as the current syllable's "stress".  A name that is not   */
    /* in the utterance's "phoneset" is reported through cst_errmsg()   */
    /* and cst_error().  Returns u.                                     */
    /*                                                                  */
    /* NOTE(review): a Segment item is appended for every token before  */
    /* the token is examined, so a "-" token leaves an unnamed item in  */
    /* the Segment relation.  Kept as is; confirm whether intended.     */
    cst_item *t;
    cst_relation *seg, *syl, *sylstructure, *word;
    cst_item *sylitem, *sylstructureitem, *worditem, *sssyl;
    cst_phoneset *ps;

    ps = val_phoneset(utt_feat_val(u, "phoneset"));
    seg = utt_relation_create(u, "Segment");
    syl = utt_relation_create(u, "Syllable");
    word = utt_relation_create(u, "Word");
    sylstructure = utt_relation_create(u, "SylStructure");
    sssyl = sylitem = worditem = sylstructureitem = NULL;

    for (t = relation_head(utt_relation(u, "Token")); t; t = item_next(t))
    {
        cst_item *segitem = relation_append(seg, NULL);
        char const *pname = item_feat_string(t, "name");
        char *name = cst_strdup(pname);   /* writable copy for stripping */
        int len = (int)cst_strlen(name);

        if (worditem == NULL)
        {   /* the single word is created on the first token */
            worditem = relation_append(word,NULL);
            item_set_string(worditem, "name", "phonestring");
            sylstructureitem = relation_append(sylstructure,worditem);
        }
        if (sylitem == NULL)
        {   /* first token, or first token after a "-" */
            sylitem = relation_append(syl,NULL);
            sssyl = item_add_daughter(sylstructureitem,sylitem);
        }

        /* Strip a trailing stress digit; an empty name has no last    */
        /* character to look at.                                       */
        if (len > 0 && name[len-1] == '1')
        {
            item_set_string(sssyl,"stress","1");
            name[len-1] = '\0';
        }
        else if (len > 0 && name[len-1] == '0')
        {
            item_set_string(sssyl,"stress","0");
            name[len-1] = '\0';
        }

        if (cst_streq(name,"-"))
        {
            sylitem = NULL;  /* syllable break */
        }
        else if (phone_id(ps, name) == -1)
        {
            cst_errmsg("Phone `%s' not in phoneset\n", pname);
            /* cst_error() is expected not to return, so the copy is   */
            /* released first rather than leaked.  Should it return,   */
            /* move on to the next token without freeing again.        */
            cst_free(name);
            cst_error();
            continue;
        }
        else
        {
            item_add_daughter(sssyl,segitem);
            item_set_string(segitem, "name", name);
        }

        cst_free(name);
    }

    return u;
}

static const char utt_break_upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
static const char utt_break_lower[] = "abcdefghijklmnopqrstuvwxyz";

/* Non-zero when c is a real member of set.  A bare strchr() also       */
/* matches the terminating NUL of set, which would make the first       */
/* character of an empty string count as a letter.                      */
static int utt_break_in_set(const char *set, char c)
{
    return (c != '\0') && (strchr(set, c) != NULL);
}

int default_utt_break(cst_tokenstream *ts,
                      const char *token,
                      cst_relation *tokens)
{
    /* This is the default utt break function, languages may override   */
    /* this.  This will be ok for some latin based languages.           */
    /*                                                                  */
    /* ts     - token stream; its whitespace field is inspected         */
    /* token  - the token that would follow the break                   */
    /* tokens - tokens collected so far; its last item is examined      */
    /* Returns TRUE if an utterance should end before token, FALSE      */
    /* otherwise.  The rules are tried in the order written below.      */
    const char *postpunct = item_feat_string(relation_tail(tokens), "punc");
    const char *ltoken = item_name(relation_tail(tokens));
    int ltoken_len;
    char ltoken_last;

    /* whitespace contains two new lines */
    if (cst_strchr(ts->whitespace,'\n') != cst_strrchr(ts->whitespace,'\n'))
        return TRUE;

    /* Well, this is a little specific isn't it. */
    if (((cst_streq(ltoken,"Yahoo")) ||
         (cst_streq(ltoken,"YAHOO")) ||
         (cst_streq(ltoken,"yahoo"))) &&
        strchr(postpunct,'!') &&
        utt_break_in_set(utt_break_lower,token[0]))
        return FALSE;

    if (strchr(postpunct,':') ||
        strchr(postpunct,'?') ||
        strchr(postpunct,'!'))
        return TRUE;

    /* every remaining rule needs a full stop followed by a capital */
    if (!strchr(postpunct,'.') ||
        !utt_break_in_set(utt_break_upper,token[0]))
        return FALSE;

    if (cst_strlen(ts->whitespace) > 1)
        return TRUE;

    /* Last word isn't an abbreviation: it does not end in a capital,   */
    /* and is not a word shorter than four characters that starts with  */
    /* one.  The length is checked so that an empty last token is not   */
    /* indexed at position -1.                                          */
    ltoken_len = (int)cst_strlen(ltoken);
    ltoken_last = (ltoken_len > 0) ? ltoken[ltoken_len-1] : '\0';
    if (!(utt_break_in_set(utt_break_upper,ltoken_last) ||
          ((ltoken_len < 4) &&
           utt_break_in_set(utt_break_upper,ltoken[0]))))
        return TRUE;

    return FALSE;
}