shithub: flite

ref: b273a40041a0330bd1545316eed63959a6cc4bdd
dir: /src/wavesynth/cst_units.c/

View raw version
/*************************************************************************/
/*                                                                       */
/*                  Language Technologies Institute                      */
/*                     Carnegie Mellon University                        */
/*                         Copyright (c) 2001                            */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
/*               Date:  January 2001                                     */
/*************************************************************************/
/*                                                                       */
/*  General unit functions (diphones or clunit)                          */
/*                                                                       */
/*************************************************************************/

#include "cst_math.h"
#include "cst_hrg.h"
#include "cst_utt_utils.h"
#include "cst_wave.h"
#include "cst_track.h"
#include "cst_units.h"
#include "cst_sigpr.h"

static int nearest_pm(cst_sts_list *sts_list,int start,int end,float u_index);

cst_utterance *join_units(cst_utterance *utt)
{
    /* Make a waveform from the units.  The method is picked by the
       "join_type" feature (default "modified_lpc"); "none", or any
       value not listed below, leaves the utterance as it is. */
    const char *method =
        get_param_string(utt->features,"join_type", "modified_lpc");

    if (cst_streq(method,"none"))
        return utt;
#if 0
    if (cst_streq(method,"windowed_join"))
    {
        join_units_windowed(utt);
        return utt;
    }
#endif
    if (cst_streq(method,"simple_join"))
    {
        join_units_simple(utt);
    }
    else if (cst_streq(method,"modified_lpc"))
    {
        join_units_modified_lpc(utt);
    }

    return utt;
}

cst_utterance *join_units_simple(cst_utterance *utt)
{
    /* Join the units using their own pitchmarks (asis_to_pm), concatenate
       the residuals/LPC frames (concat_units) and resynthesize.

       Features read:
         "resynth_type"   - only "fixed" is accepted here (also the default);
                            anything else is reported through cst_errmsg()
                            followed by cst_error().
         "streaming_info" - optional; when present it is attached to the
                            lpcres and pointed back at this utterance
                            before resynthesis.

       The resulting wave is stored on the utterance and utt is returned. */
    cst_wave *w = 0;
    cst_lpcres *lpcres;
    const char *resynth_type;
    const cst_val *streaming_info_val;

    resynth_type = get_param_string(utt->features,"resynth_type", "fixed");

    asis_to_pm(utt);
    concat_units(utt);

    lpcres = val_lpcres(utt_feat_val(utt,"target_lpcres"));

    streaming_info_val=get_param_val(utt->features,"streaming_info",NULL);
    if (streaming_info_val)
    {
        lpcres->asi = val_audio_streaming_info(streaming_info_val);
        lpcres->asi->utt = utt;
    }

    if (cst_streq(resynth_type, "fixed"))
        w = lpc_resynth_fixedpoint(lpcres);
    else
    {
        cst_errmsg("unknown resynthesis type %s\n", resynth_type);
        cst_error(); /* Should not happen */
    }

    if (w == NULL)
    {
        /* Synthesis failed, probably because it was interrupted.  Handle
           it the same way join_units_modified_lpc() does, so a NULL wave
           is never stored on the utterance. */
        utt_set_feat_int(utt,"Interrupted",1);
        w = new_wave();
    }

    utt_set_wave(utt,w);

    return utt;
}

cst_utterance *join_units_modified_lpc(cst_utterance *utt)
{
    /* Join the units with pitchmarks derived from the F0 targets, then
       resynthesize with the engine named by "resynth_type" ("float" by
       default, or "fixed").  If no wave comes back the utterance is
       flagged "Interrupted" and given an empty wave. */
    const char *engine;
    const cst_val *asi_val;
    cst_lpcres *res;
    cst_wave *out = NULL;

    engine = get_param_string(utt->features,"resynth_type", "float");

    f0_targets_to_pm(utt);
    concat_units(utt);

    res = val_lpcres(utt_feat_val(utt,"target_lpcres"));

    /* Optional streaming hook: attach it and point it back at this utt */
    asi_val = get_param_val(utt->features,"streaming_info",NULL);
    if (asi_val != NULL)
    {
        res->asi = val_audio_streaming_info(asi_val);
        res->asi->utt = utt;
    }

    if (cst_streq(engine, "float"))
    {
        out = lpc_resynth(res);
    }
    else if (cst_streq(engine, "fixed"))
    {
        out = lpc_resynth_fixedpoint(res);
    }
    else
    {
        cst_errmsg("unknown resynthesis type %s\n", engine);
        cst_error(); /* Should not happen */
    }

    if (!out)
    {
        /* Synthesis Failed, probably because it was interrupted */
        utt_set_feat_int(utt,"Interrupted",1);
        out = new_wave();
    }

    utt_set_wave(utt,out);

    return utt;
}

cst_utterance *asis_to_pm(cst_utterance *utt)
{
    /* Build "target_lpcres" whose pitchmark times are simply the running
       sum of the frame sizes of every unit, i.e. the units' own PM
       structure copied unchanged.  Each Unit item also gets a
       "target_end" feature holding the cumulative size up to and
       including that unit. */
    cst_sts_list *sts = val_sts_list(utt_feat_val(utt,"sts_list"));
    cst_lpcres *lpcres = new_lpcres();
    cst_item *unit;
    int first, last, frame;
    int n_pms = 0;
    int n_samples = 0;

    /* Pass one: count pitchmarks and record where each unit ends */
    unit = relation_head(utt_relation(utt,"Unit"));
    while (unit)
    {
        first = item_feat_int(unit,"unit_start");
        last = item_feat_int(unit,"unit_end");
        n_samples += get_unit_size(sts,first,last);
        n_pms += last - first;
        item_set_int(unit,"target_end",n_samples);
        unit = item_next(unit);
    }
    lpcres_resize_frames(lpcres,n_pms);

    /* Pass two: write the cumulative end position of every frame */
    n_pms = 0;
    n_samples = 0;
    unit = relation_head(utt_relation(utt,"Unit"));
    while (unit)
    {
        first = item_feat_int(unit,"unit_start");
        last = item_feat_int(unit,"unit_end");
        for (frame = first; frame < last; frame++)
        {
            n_samples += get_frame_size(sts, frame);
            lpcres->times[n_pms] = n_samples;
            n_pms++;
        }
        unit = item_next(unit);
    }

    utt_set_feat(utt,"target_lpcres",lpcres_val(lpcres));
    return utt;
}

cst_utterance *f0_targets_to_pm(cst_utterance *utt)
{
    /* Turn the "Target" relation (items with "pos" and "f0" features)
       into pitchmark times stored in a new "target_lpcres".  F0 is
       interpolated linearly between consecutive targets and the clock
       is advanced by one period (1/F0) per pitchmark.  The walk is done
       twice with identical arithmetic: once to count, once to store. */
    cst_sts_list *sts_list;
    cst_lpcres *target_lpcres;
    cst_item *targ;
    float cur_pos, prev_pos, cur_f0, prev_f0, slope;
    double now;
    int n_pm;

    sts_list = val_sts_list(utt_feat_val(utt,"sts_list"));

    /* First pass to count how many pms will be required */
    prev_pos = 0;
    prev_f0 = 120; /* hmm */
    n_pm = 0;
    now = 0;
    targ = relation_head(utt_relation(utt,"Target"));
    while (targ)
    {
        cur_pos = item_feat_float(targ,"pos");
        cur_f0 = item_feat_float(targ,"f0");
        if (now != cur_pos)
        {
            slope = (cur_f0-prev_f0)/(cur_pos-prev_pos);
            while (now < cur_pos)
            {
                now += 1/(prev_f0 + ((now-prev_pos)*slope));
                n_pm++;
            }
        }
        /* this target becomes the left end of the next segment, even
           when nothing was emitted for it */
        prev_f0 = cur_f0;
        prev_pos = cur_pos;
        targ = item_next(targ);
    }

    target_lpcres = new_lpcres();
    lpcres_resize_frames(target_lpcres,n_pm);

    /* Second pass puts the values in */
    prev_pos = 0;
    prev_f0 = 120;
    n_pm = 0;
    now = 0;
    targ = relation_head(utt_relation(utt,"Target"));
    while (targ)
    {
        cur_pos = item_feat_float(targ,"pos");
        cur_f0 = item_feat_float(targ,"f0");
        if (now != cur_pos)
        {
            slope = (cur_f0-prev_f0)/(cur_pos-prev_pos);
            while (now < cur_pos)
            {
                now += 1/(prev_f0 + ((now-prev_pos)*slope));
                target_lpcres->times[n_pm] = sts_list->sample_rate * now;
                n_pm++;
            }
        }
        prev_f0 = cur_f0;
        prev_pos = cur_pos;
        targ = item_next(targ);
    }

    utt_set_feat(utt,"target_lpcres",lpcres_val(target_lpcres));
    return utt;
}

cst_utterance *concat_units(cst_utterance *utt)
{
    /* Fill in the utterance's "target_lpcres" from the selected units.

       For every target pitchmark (times[] must already be set, e.g. by
       asis_to_pm() or f0_targets_to_pm()) the nearest pitchmark of the
       unit covering that stretch is chosen; its LPC frame is referenced
       and its residual is added to the target residual buffer with the
       routine matching sts_list->codec ("ulaw" when codec is NULL).

       Reads the "sts_list" and "target_lpcres" utterance features, the
       optional "delayed_decoding" feature, and "unit_start", "unit_end"
       and "target_end" on each item of the "Unit" relation.  On return
       num_frames is cut down to the number of frames actually filled.
       Returns utt. */
    cst_lpcres *target_lpcres;
    cst_item *u;
    int pm_i;
    int  unit_size, unit_start, unit_end;
    int rpos, nearest_u_pm;
    int target_end, target_start;
    int total_samples;
    float m, u_index;
    cst_sts_list *sts_list;
    const char *residual_type;

    sts_list = val_sts_list(utt_feat_val(utt,"sts_list"));
    if (sts_list->codec == NULL)
        residual_type = "ulaw";
    else
        residual_type = sts_list->codec;
    target_lpcres = val_lpcres(utt_feat_val(utt,"target_lpcres"));

    target_lpcres->lpc_min = sts_list->coeff_min;
    target_lpcres->lpc_range = sts_list->coeff_range;
    target_lpcres->num_channels = sts_list->num_channels;
    target_lpcres->sample_rate = sts_list->sample_rate;

    /* The last pitchmark time is the total number of samples.  With no
       frames at all (empty utterance) there is no last entry, so do not
       index times[-1]; size the residual buffer to zero instead. */
    total_samples = 0;
    if (target_lpcres->num_frames > 0)
        total_samples = target_lpcres->times[target_lpcres->num_frames-1];
    lpcres_resize_samples(target_lpcres,total_samples);

    if (utt_feat_val(utt,"delayed_decoding"))
    {
        target_lpcres->delayed_decoding = 1;
        target_lpcres->packed_residuals =
            cst_alloc(const unsigned char *,target_lpcres->num_frames);
    }

    target_start = 0; rpos = 0; pm_i = 0; u_index = 0;
    for (u=relation_head(utt_relation(utt,"Unit")); u; u=item_next(u))
    {
        unit_start = item_feat_int(u,"unit_start");
        unit_end = item_feat_int(u,"unit_end");
        unit_size = get_unit_size(sts_list,unit_start,unit_end);
        target_end = item_feat_int(u,"target_end");

        /* m scales a distance in the target stretch to one in the unit */
        u_index = 0;
        m = (float)unit_size/(float)(target_end-target_start);

        /* Consume every target pitchmark that falls inside this unit */
        for ( ;
             (pm_i < target_lpcres->num_frames) &&
                 (target_lpcres->times[pm_i] <= target_end);
             pm_i++)
        {
            nearest_u_pm = nearest_pm(sts_list,unit_start,unit_end,u_index);
            /* Get LPC coefs (pointer) */
            target_lpcres->frames[pm_i] = get_sts_frame(sts_list, nearest_u_pm);
            /* Get residual (copy) */
            target_lpcres->sizes[pm_i] =
                target_lpcres->times[pm_i] -
                (pm_i > 0 ? target_lpcres->times[pm_i-1] : 0);
            if (cst_streq(residual_type,"pulse"))
                add_residual_pulse(target_lpcres->sizes[pm_i],
                                   &target_lpcres->residual[rpos],
                                   get_frame_size(sts_list, nearest_u_pm),
                                   get_sts_residual(sts_list, nearest_u_pm));
            else if (cst_streq(residual_type,"g721"))
                add_residual_g721(target_lpcres->sizes[pm_i],
                                  &target_lpcres->residual[rpos],
                                  get_frame_size(sts_list, nearest_u_pm),
                                  get_sts_residual(sts_list, nearest_u_pm));
            else if (cst_streq(residual_type,"g721vuv"))
            {
                if (target_lpcres->delayed_decoding)
                {
                    /* Only remember where the packed residual lives;
                       nothing is written to the residual buffer here */
                    target_lpcres->packed_residuals[pm_i] =
                        get_sts_residual(sts_list, nearest_u_pm);
                }
                else
                {
                    add_residual_g721vuv(target_lpcres->sizes[pm_i],
                                   &target_lpcres->residual[rpos],
                                   get_frame_size(sts_list, nearest_u_pm),
                                   get_sts_residual(sts_list, nearest_u_pm));
                }
            }
            else if (cst_streq(residual_type,"vuv"))
                add_residual_vuv(target_lpcres->sizes[pm_i],
                                 &target_lpcres->residual[rpos],
                                 get_frame_size(sts_list, nearest_u_pm),
                                 get_sts_residual(sts_list, nearest_u_pm));
            /* A "windowed" residual type (add_residual_windowed) is not
               offered: it requires a particular layout of residuals
               which probably isn't true */
            else /* default is "ulaw" */
                add_residual(target_lpcres->sizes[pm_i],
                             &target_lpcres->residual[rpos],
                             get_frame_size(sts_list, nearest_u_pm),
                             get_sts_residual(sts_list, nearest_u_pm));
            rpos+=target_lpcres->sizes[pm_i];
            u_index += (float)target_lpcres->sizes[pm_i]*m;
        }
        target_start = target_end;
    }
    target_lpcres->num_frames = pm_i;
    return utt;
}

static int nearest_pm(cst_sts_list *sts_list, int start,int end,float u_index)
{
    /* Find the pm in the unit (frames start..end-1) that is closest to
       u_index, where u_index is an offset measured from the beginning
       of the unit.  Frames are walked in order; the first frame whose
       leading edge is strictly nearer to u_index than its trailing edge
       wins.  When no frame qualifies the answer is end-1. */
    int frame = start;
    int lead = 0;       /* offset at which the current frame starts */
    int trail;          /* offset at which the current frame ends */
    double d_lead, d_trail;

    while (frame < end)
    {
        trail = lead + get_frame_size(sts_list, frame);
        d_lead = fabs((double)(u_index-(float)lead));
        d_trail = fabs((double)(u_index-(float)trail));
        if (d_lead < d_trail)
            return frame;
        lead = trail;
        frame++;
    }

    return end-1;
}

#if 0
/* Disabled (compiled out by the surrounding #if 0).
   Overlap-adds a windowed copy of the unit residual onto the target
   residual.  Both residuals are read as ulaw bytes, converted to linear
   values for the arithmetic and the sum is written back as ulaw.
   Note that (targ_size*2)+1 bytes of targ_residual are read and
   rewritten, and (unit_size*2)+1 bytes of unit_residual are read, i.e.
   more than one frame's worth on each side. */
void add_residual_windowed(int targ_size, 
			   unsigned char *targ_residual,
			   int unit_size, 
			   const unsigned char *unit_residual)
{
    /* Note this doesn't work unless the unit_residuals are consecutive */
#define DI_PI 3.14159265358979323846
    float *window, *unit, *residual;
    int i,j,k, offset, win_size;

    /* Symmetric window spanning two target frames plus a centre sample */
    win_size = (targ_size*2)+1;
    window = cst_alloc(float,win_size);
    window[targ_size+1] = 1.0;
    /* NOTE(review): k is declared int, so this quotient is truncated
       toward zero when stored -- confirm before re-enabling */
    k = DI_PI / (win_size - 1);
    /* Fill both halves at once with a 0.54/0.46 raised-cosine shape */
    for (i=0,j=win_size-1; i < targ_size+1; i++,j--)
	window[j] = window[i] = 0.54 - (0.46 * cos(k * i));

    /* Current target contents, as linear values */
    residual = cst_alloc(float,win_size);
    for (i=0; i<win_size; i++)
	residual[i] = cst_ulaw_to_short(targ_residual[i]);

    /* Unit contents, as linear values */
    unit = cst_alloc(float,(unit_size*2)+1);
    for (i=0; i<(unit_size*2)+1; i++)
	unit[i] = cst_ulaw_to_short(unit_residual[i]);

    /* Add the windowed unit, centred relative to the target */
    if (targ_size < unit_size)
	for (i=0; i < win_size; i++)
	    residual[i] += window[i] * unit[i+(unit_size-targ_size)/2];
    else
    {
	offset = (targ_size-unit_size)/2;
	for (i=offset; i < win_size-offset; i++)
	    residual[i] += window[i] * unit[i-offset];
    }

    /* Back to ulaw, in place */
    for (i=0; i < win_size; i++)
	targ_residual[i] = cst_short_to_ulaw((short)residual[i]);

    cst_free(window);
    cst_free(residual);
    cst_free(unit);

}
#endif

void add_residual(int targ_size, unsigned char *targ_residual,
		  int unit_size, const unsigned char *unit_residual)
{
    /* Copy a unit residual into a target frame, aligning the middles.
       A unit shorter than the target is dropped into the middle of the
       target (the bytes around it are left alone); otherwise the middle
       targ_size bytes of the unit fill the whole target. */
    int dst_skip = 0;   /* bytes left untouched at the front of the target */
    int src_skip = 0;   /* bytes ignored at the front of the unit */
    int count;

    if (unit_size < targ_size)
    {
        dst_skip = (targ_size-unit_size)/2;
        count = unit_size;
    }
    else
    {
        src_skip = (unit_size-targ_size)/2;
        count = targ_size;
    }

    memmove(targ_residual + dst_skip,
            unit_residual + src_skip,
            count*sizeof(unsigned char));
}

void add_residual_g721(int targ_size, unsigned char *targ_residual,
                       int uunit_size, const unsigned char *unit_residual)
{
    /* Residual is encoded with g721: decode it, skip the
       CST_G721_LEADIN bytes at the front of the decoded buffer, then
       place it in the target with the middles aligned (same policy as
       add_residual).  The decoded buffer is freed before returning. */
    unsigned char *decoded;
    unsigned char *dst;
    const unsigned char *src;
    int decoded_len;
    int count;

    decoded =
        cst_g721_decode(&decoded_len, (uunit_size+CST_G721_LEADIN+1)/2, unit_residual);

    if (uunit_size < targ_size)
    {
        dst = &targ_residual[((targ_size-uunit_size)/2)];
        src = &decoded[CST_G721_LEADIN];
        count = uunit_size;
    }
    else
    {
        dst = &targ_residual[0];
        src = &decoded[CST_G721_LEADIN+((uunit_size-targ_size)/2)];
        count = targ_size;
    }
    memmove(dst, src, count*sizeof(unsigned char));

    cst_free(decoded);
}

static double plus_or_minus_one()
{
    /* Randomly return 1 or -1: one rand() draw, 1.0 if it lands above
       the midpoint of the generator's range, -1.0 otherwise */
    /* not sure rand() is portable */
    return (rand() > RAND_MAX/2.0) ? 1.0 : -1.0;
}

static double rand_zero_to_one()
{
    /* Return number between 0.0 and 1.0: one rand() draw divided, in
       single precision, by RAND_MAX */
    const float span = (float)RAND_MAX;

    return rand()/span;
}

void add_residual_g721vuv(int targ_size, unsigned char *targ_residual,
                       int uunit_size, const unsigned char *unit_residual)
{
    /* Residual is either a g721 stream or a noise marker.
       When the first byte is 0, bytes 1..4 hold a little-endian integer
       used as the amplitude of random noise generated for the whole
       frame; otherwise the data is g721 and is decoded, with
       CST_G721_LEADIN bytes skipped at the front of the decoded buffer.
       The result is placed in the target with the middles aligned and
       the temporary buffer is freed. */
    unsigned char *samples;
    int skip;           /* bytes to ignore at the front of samples */
    int decoded_len;
    int amp_bits, k, n;
    float amp, noise;
    int dst_at, src_at, count;

    if (unit_residual[0] != 0)
    {
        samples =
            cst_g721_decode(&decoded_len, (uunit_size+CST_G721_LEADIN+1)/2, unit_residual);
        skip = CST_G721_LEADIN;
    }
    else
    {
        samples = cst_alloc(unsigned char,uunit_size);

        /* bytes 4,3,2,1 -> most to least significant */
        amp_bits = 0;
        for (k=4; k>=1; k--)
            amp_bits = (amp_bits << 8) + unit_residual[k];
        amp = ((float)amp_bits);

        for (n=0; n<uunit_size; n++)
        {
            noise = amp*2*rand_zero_to_one()*plus_or_minus_one();
            samples[n] = cst_short_to_ulaw((short)noise);
        }
        skip = 0;
    }

    if (uunit_size < targ_size)
    {
        dst_at = (targ_size-uunit_size)/2;
        src_at = skip;
        count = uunit_size;
    }
    else
    {
        dst_at = 0;
        src_at = skip+((uunit_size-targ_size)/2);
        count = targ_size;
    }
    memmove(&targ_residual[dst_at],
            &samples[src_at],
            count*sizeof(unsigned char));

    cst_free(samples);
}

void add_residual_vuv(int targ_size, unsigned char *targ_residual,
                      int uunit_size, const unsigned char *unit_residual)
{
    /* Residual is encoded with vuv.
       When the first byte is 0, bytes 1..4 hold a little-endian integer
       used as the amplitude of random noise generated for the whole
       frame; otherwise the bytes are used directly with no unpacking.
       Either way the result is placed in the target with the middles
       aligned. */
    const unsigned char *samples;
    unsigned char *noise_buf = NULL;
    int is_noise;
    int amp_bits, k, n;
    float amp, noise;
    int dst_at, src_at, count;

    is_noise = (unit_residual[0] == 0);
    if (is_noise)
    {
        noise_buf = cst_alloc(unsigned char,uunit_size);

        /* bytes 4,3,2,1 -> most to least significant */
        amp_bits = 0;
        for (k=4; k>=1; k--)
            amp_bits = (amp_bits << 8) + unit_residual[k];
        amp = ((float)amp_bits);

        for (n=0; n<uunit_size; n++)
        {
            noise = amp*2*rand_zero_to_one()*plus_or_minus_one();
            noise_buf[n] = cst_short_to_ulaw((short)noise);
        }
        samples = noise_buf;
    }
    else
    {
        /* Use the caller's data in place -- it is not ours to free */
        samples = unit_residual;
    }

    if (uunit_size < targ_size)
    {
        dst_at = (targ_size-uunit_size)/2;
        src_at = 0;
        count = uunit_size;
    }
    else
    {
        dst_at = 0;
        src_at = (uunit_size-targ_size)/2;
        count = targ_size;
    }
    memmove(&targ_residual[dst_at],
            &samples[src_at],
            count*sizeof(unsigned char));

    /* Only the generated noise buffer belongs to this function */
    if (is_noise)
        cst_free(noise_buf);
}

void add_residual_pulse(int targ_size, unsigned char *targ_residual,
			int unit_size, const unsigned char *unit_residual)
{
    /* Synthetic excitation for the "pulse" residual type.

       unit_residual isn't a pointer, it's a number: the power for the
       sts, hackily carried in the address and cast back to an int here.

       power > 7000 (voiced):  a three sample pulse (p/4, p/2, p/4 at
                               offsets -2, 0, +2) around
                               (targ_size-unit_size)/2.
       otherwise (unvoiced):   every target sample is set to
                               +/- (p / targ_size), sign chosen at random.

       Only targ_residual[0 .. targ_size-1] is ever written.  When the
       computed pulse centre falls outside the frame (unit at least as
       long as the target) it is pulled back to the nearest in-frame
       sample, and side taps that would land outside the frame are
       skipped.  A frame with targ_size <= 0 is left untouched. */
    static const int tap_offset[3] = { -2, 0, 2 };
    static const int tap_divisor[3] = { 4, 2, 4 };
    int p,i,m;
    int centre, k;

    /* Need voiced and unvoiced model */
    p = (int)unit_residual; /* I know the compiler will complain about this */

    if (targ_size <= 0)
        return;     /* nothing to fill; also keeps p / targ_size defined */

    if (p > 7000)  /* voiced */
    {
        centre = ((targ_size-unit_size)/2);
        if (centre < 0)
            centre = 0;
        else if (centre >= targ_size)
            centre = targ_size-1;

        for (k=0; k<3; k++)
        {
            i = centre + tap_offset[k];
            if ((i >= 0) && (i < targ_size))
                targ_residual[i] =
                    cst_short_to_ulaw((short)(p/tap_divisor[k]));
        }
    }
    else /* unvoiced */
    {
        m = p / targ_size;
        for (i=0; i<targ_size; i++)
            targ_residual[i] =
                cst_short_to_ulaw((short)(m*plus_or_minus_one()));
    }
}