shithub: flite

ref: 6e3aac3abfd17457d033d433abf820e9bbc8847f
dir: /src/cg/cst_mlsa.c/

View raw version
/*  ---------------------------------------------------------------  */
/*      The HMM-Based Speech Synthesis System (HTS): version 1.1b    */
/*                        HTS Working Group                          */
/*                                                                   */
/*                   Department of Computer Science                  */
/*                   Nagoya Institute of Technology                  */
/*                                and                                */
/*    Interdisciplinary Graduate School of Science and Engineering   */
/*                   Tokyo Institute of Technology                   */
/*                      Copyright (c) 2001-2003                      */
/*                        All Rights Reserved.                       */
/*                                                                   */
/*  Permission is hereby granted, free of charge, to use and         */
/*  distribute this software and its documentation without           */
/*  restriction, including without limitation the rights to use,     */
/*  copy, modify, merge, publish, distribute, sublicense, and/or     */
/*  sell copies of this work, and to permit persons to whom this     */
/*  work is furnished to do so, subject to the following conditions: */
/*                                                                   */
/*    1. The code must retain the above copyright notice, this list  */
/*       of conditions and the following disclaimer.                 */
/*                                                                   */
/*    2. Any modifications must be clearly marked as such.           */
/*                                                                   */    
/*  NAGOYA INSTITUTE OF TECHNOLOGY, TOKYO INSITITUTE OF TECHNOLOGY,  */
/*  HTS WORKING GROUP, AND THE CONTRIBUTORS TO THIS WORK DISCLAIM    */
/*  ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL       */
/*  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL NAGOYA INSTITUTE OF TECHNOLOGY, TOKYO INSITITUTE OF        */
/*  TECHNOLOGY, HTS WORKING GROUP, NOR THE CONTRIBUTORS BE LIABLE    */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY        */
/*  DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,  */
/*  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTUOUS   */
/*  ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR          */
/*  PERFORMANCE OF THIS SOFTWARE.                                    */
/*                                                                   */
/*  ---------------------------------------------------------------  */
/*   This is Zen's MLSA filter as ported by Toda to festvox vc       */
/*   and back ported into hts/festival so we can do MLSA filtering   */
/*   If I took more time I could probably make this use the same as  */
/*   as the other code in this directory -- awb@cs.cmu.edu 03JAN06   */
/*  ---------------------------------------------------------------  */
/*   and then ported into Flite (November 2007 awb@cs.cmu.edu)       */
/*     with some speed uptimizations                                 */

/*********************************************************************/
/*                                                                   */
/*  Mel-cepstral vocoder (pulse/noise excitation & MLSA filter)      */
/*                                    2003/12/26 by Heiga Zen        */
/*                                                                   */
/*  Extracted from HTS and slightly modified                         */
/*   by Tomoki Toda (tomoki@ics.nitech.ac.jp)                        */
/*  June 2004                                                        */
/*  Integrate as a Voice Conversion module                           */
/*                                                                   */
/*-------------------------------------------------------------------*/

#include "cst_alloc.h"
#include "cst_string.h"
#include "cst_math.h"
#include "cst_track.h"
#include "cst_wave.h"
#include "cst_audio.h"

#ifdef ANDROID
#define SPEED_HACK
#endif

#ifdef UNDER_CE
#define SPEED_HACK
/* This is one of those other things you shouldn't do, but it makes
   CG voices in flowm fast enough on my phone */
#define double float
#endif

#include "cst_vc.h"
#include "cst_cg.h"
#include "cst_mlsa.h"


static cst_wave *synthesis_body(const cst_track *params, 
                                const cst_track *str, 
                                double fs, double framem,
                                cst_cg_db *cg_db,
                                cst_audio_streaming_info *asi,
                                int mlsa_speed_param);

cst_wave *mlsa_resynthesis(const cst_track *params, 
                           const cst_track *str, 
                           cst_cg_db *cg_db,
                           cst_audio_streaming_info *asi,
                           int mlsa_speed_param)
{
    /* Resynthesizes a wave from given track */
    cst_wave *wave = 0;
    int sr = cg_db->sample_rate;
    double shift;

    if (params->num_frames > 1)
        shift = 1000.0*(params->times[1]-params->times[0]);
    else
        shift = 5.0;

    wave = synthesis_body(params,str,sr,shift,cg_db,asi,mlsa_speed_param);

    return wave;
}

static cst_wave *synthesis_body(const cst_track *params, /* f0 + mcep */
                                const cst_track *str,
                                double fs,	/* sampling frequency (Hz) */
                                double framem,	/* frame size */
                                cst_cg_db *cg_db,
                                cst_audio_streaming_info *asi,
                                int mlsa_speed_param)
{
    long t, pos;
    int framel, i;
    double f0;
    VocoderSetup vs;
    cst_wave *wave = 0;
    double *mcep;
    int stream_mark;
    int rc = CST_AUDIO_STREAM_CONT;
    int num_mcep;
    double ffs = fs;

    num_mcep = params->num_channels-1;
    if ((num_mcep > mlsa_speed_param) &&
        ((num_mcep - mlsa_speed_param) > 4))
        /* Basically ignore some of the higher coeffs */
        /* It'll sound worse, but it will be faster */
        num_mcep -= mlsa_speed_param;
    framel = (int)(0.5 + (framem * ffs / 1000.0)); /* 80 for 16KHz */
    init_vocoder(ffs, framel, num_mcep, &vs, cg_db);

    if (str != NULL)
        vs.gauss = MFALSE;

    /* synthesize waveforms by MLSA filter */
    wave = new_wave();
    cst_wave_resize(wave,params->num_frames * framel,1);
    wave->sample_rate = fs; 

    mcep = cst_alloc(double,num_mcep+1);

    for (t = 0, stream_mark = pos = 0; 
         (rc == CST_AUDIO_STREAM_CONT) && (t < params->num_frames);
         t++) 
    {
        f0 = (double)params->frames[t][0];
        for (i=1; i<num_mcep+1; i++)
            mcep[i-1] = params->frames[t][i];
        mcep[i-1] = 0;

        if (str)
            vocoder(f0, mcep, str->frames[t], num_mcep, cg_db, &vs, wave, &pos);
        else
            vocoder(f0, mcep, NULL, num_mcep, cg_db, &vs, wave, &pos);

        if (asi && (pos-stream_mark > asi->min_buffsize))
        {
            rc=(*asi->asc)(wave,stream_mark,pos-stream_mark,0,asi);
            stream_mark = pos;
        }
    }
    wave->num_samples = pos;

    if (asi && (rc == CST_AUDIO_STREAM_CONT))
    {   /* drain the last part of the waveform */
        (*asi->asc)(wave,stream_mark,pos-stream_mark,1,asi);
    }

    /* memory free */
    cst_free(mcep);
    free_vocoder(&vs);

    if (rc == CST_AUDIO_STREAM_STOP)
    {
        delete_wave(wave);
        return NULL;
    }
    else
        return wave;
}

static void init_vocoder(double fs, int framel, int m, 
                         VocoderSetup *vs, cst_cg_db *cg_db)
{
    /* initialize global parameter */
    vs->fprd = framel;
    vs->iprd = 1;
    vs->seed = 1;
#ifdef SPEED_HACK
    /* This makes it about 25% faster and sounds basically the same */
    vs->pd   = 4;
#else
    vs->pd   = 5;
#endif

    vs->next =1;
    vs->gauss = MTRUE;

    /* Pade' approximants */
    vs->pade[ 0]=1.0;
    vs->pade[ 1]=1.0; vs->pade[ 2]=0.0;
    vs->pade[ 3]=1.0; vs->pade[ 4]=0.0;      vs->pade[ 5]=0.0;
    vs->pade[ 6]=1.0; vs->pade[ 7]=0.0;      vs->pade[ 8]=0.0;      vs->pade[ 9]=0.0;
    vs->pade[10]=1.0; vs->pade[11]=0.4999273; vs->pade[12]=0.1067005; vs->pade[13]=0.01170221; vs->pade[14]=0.0005656279;
    vs->pade[15]=1.0; vs->pade[16]=0.4999391; vs->pade[17]=0.1107098; vs->pade[18]=0.01369984; vs->pade[19]=0.0009564853;
    vs->pade[20]=0.00003041721;

    vs->rate = fs;
    vs->c = cst_alloc(double,3 * (m + 1) + 3 * (vs->pd + 1) + vs->pd * (m + 2));
   
    vs->p1 = -1;
    vs->sw = 0;
    vs->x  = 0x55555555;
   
    /* for postfiltering */
    vs->mc = NULL;
    vs->o  = 0;
    vs->d  = NULL;
    vs->irleng= 64;
   
    /* for MIXED EXCITATION */
    vs->ME_order = cg_db->ME_order;
    vs->ME_num = cg_db->ME_num;
    vs->hpulse = cst_alloc(double,vs->ME_order);
    vs->hnoise = cst_alloc(double,vs->ME_order);
    vs->xpulsesig = cst_alloc(double,vs->ME_order);
    vs->xnoisesig = cst_alloc(double,vs->ME_order);
    vs->h = cg_db->me_h;

    return;
}

static double plus_or_minus_one()
{
    /* Randomly return 1 or -1 */
    /* not sure rand() is portable */
    if (rand() > RAND_MAX/2.0)
        return 1.0;
    else
        return -1.0;
}

static void vocoder(double p, double *mc, 
                    const float *str,
                    int m, cst_cg_db *cg_db,
                    VocoderSetup *vs, cst_wave *wav, long *pos)
{
    double inc, x, e1, e2;
    int i, j, k; 
    double xpulse, xnoise;
    double fxpulse, fxnoise;
    float gain=1.0;

    if (cg_db->gain != 0.0)
        gain = cg_db->gain;
   
    if (str != NULL)     /* MIXED-EXCITATION */
    {
        /* Copy in str's and build hpulse and hnoise for this frame */
        for (i=0; i<vs->ME_order; i++)
        {
            vs->hpulse[i] = vs->hnoise[i] = 0.0;
            for (j=0; j<vs->ME_num; j++)
            {
                vs->hpulse[i] += str[j] * vs->h[j][i];
                vs->hnoise[i] += (1 - str[j]) * vs->h[j][i];
            }
        }
    }

    if (p != 0.0)
	p = vs->rate / p;  /* f0 -> pitch */
   
    if (vs->p1 < 0) {
	if (vs->gauss & (vs->seed != 1)) 
	    vs->next = srnd((unsigned)vs->seed);
         
	vs->p1   = p;
	vs->pc   = vs->p1;
	vs->cc   = vs->c + m + 1;
	vs->cinc = vs->cc + m + 1;
	vs->d1   = vs->cinc + m + 1;

	mc2b(mc, vs->c, m, cg_db->mlsa_alpha);

	if (cg_db->mlsa_beta > 0.0 && m > 1) {
	    e1 = b2en(vs->c, m, cg_db->mlsa_alpha, vs);
	    vs->c[1] -= cg_db->mlsa_beta * cg_db->mlsa_alpha * mc[2];
	    for (k=2;k<=m;k++)
		vs->c[k] *= (1.0 + cg_db->mlsa_beta);
	    e2 = b2en(vs->c, m, cg_db->mlsa_alpha, vs);
	    vs->c[0] += log(e1/e2)/2;
	}

	return;
    }

    mc2b(mc, vs->cc, m, cg_db->mlsa_alpha); 

    if (cg_db->mlsa_beta>0.0 && m > 1) {
	e1 = b2en(vs->cc, m, cg_db->mlsa_alpha, vs);
	vs->cc[1] -= cg_db->mlsa_beta * cg_db->mlsa_alpha * mc[2];
	for (k = 2; k <= m; k++)
	    vs->cc[k] *= (1.0 + cg_db->mlsa_beta);
	e2 = b2en(vs->cc, m, cg_db->mlsa_alpha, vs);
	vs->cc[0] += log(e1 / e2) / 2.0;
    }

    for (k=0; k<=m; k++)
	vs->cinc[k] = (vs->cc[k] - vs->c[k]) *
	    (double)vs->iprd / (double)vs->fprd;

    if (vs->p1!=0.0 && p!=0.0) {
	inc = (p - vs->p1) * (double)vs->iprd / (double)vs->fprd;
    } else {
	inc = 0.0;
	vs->pc = p;
	vs->p1 = 0.0;
    }

    for (j = vs->fprd, i = (vs->iprd + 1) / 2; j--;) {
	if (vs->p1 == 0.0) {
	    if (vs->gauss)
		x = (double) nrandom(vs);
	    else
		x = plus_or_minus_one();
            if (str != NULL)             /* MIXED EXCITATION */
            {
                xnoise = x;
                xpulse = 0.0;
            }
	} else {
	    if ((vs->pc += 1.0) >= vs->p1) {
		x = sqrt (vs->p1);
		vs->pc = vs->pc - vs->p1;
	    } else 
                x = 0.0;

            if (str != NULL)  /* MIXED EXCITATION */
            {
                xpulse = x;
                xnoise = plus_or_minus_one();
            }
	}

        /* MIXED EXCITATION */
        /* The real work -- apply shaping filters to pulse and noise */
        if (str != NULL)
        {
            fxpulse = fxnoise = 0.0;
            for (k=vs->ME_order-1; k>0; k--)
            {
                fxpulse += vs->hpulse[k] * vs->xpulsesig[k];
                fxnoise += vs->hnoise[k] * vs->xnoisesig[k];

                vs->xpulsesig[k] = vs->xpulsesig[k-1];
                vs->xnoisesig[k] = vs->xnoisesig[k-1];
            }
            fxpulse += vs->hpulse[0] * xpulse;
            fxnoise += vs->hnoise[0] * xnoise;
            vs->xpulsesig[0] = xpulse;
            vs->xnoisesig[0] = xnoise;

            x = fxpulse + fxnoise; /* excitation is pulse plus noise */
        }

        if (cg_db->sample_rate == 8000)
            /* 8KHz voices are too quiet: this is probably not general */
            x *= exp(vs->c[0])*2.0;
        else
            x *= exp(vs->c[0])*gain;

	x = mlsadf(x, vs->c, m, cg_db->mlsa_alpha, vs->pd, vs->d1, vs);

        wav->samples[*pos] = (short)x;
	*pos += 1;

	if (!--i) {
	    vs->p1 += inc;
	    for (k = 0; k <= m; k++) vs->c[k] += vs->cinc[k];
	    i = vs->iprd;
	}
    }
   
    vs->p1 = p;
    memmove(vs->c,vs->cc,sizeof(double)*(m+1));
   
    return;
}

static double mlsadf(double x, double *b, int m, double a, int pd, double *d, VocoderSetup *vs)
{

   vs->ppade = &(vs->pade[pd*(pd+1)/2]);
    
   x = mlsadf1 (x, b, m, a, pd, d, vs);
   x = mlsadf2 (x, b, m, a, pd, &d[2*(pd+1)], vs);

   return(x);
}

static double mlsadf1(double x, double *b, int m, double a, int pd, double *d, VocoderSetup *vs)
{
   double v, out = 0.0, *pt, aa;
   register int i;

   aa = 1 - a*a;
   pt = &d[pd+1];

   for (i=pd; i>=1; i--) {
      d[i] = aa*pt[i-1] + a*d[i];
      pt[i] = d[i] * b[1];
      v = pt[i] * vs->ppade[i];
      x += (1 & i) ? v : -v;
      out += v;
   }

   pt[0] = x;
   out += x;

   return(out);
}

static double mlsadf2 (double x, double *b, int m, double a, int pd, double *d, VocoderSetup *vs)
{
  double v, out = 0.0, *pt;
  register int i;
    
   pt = &d[pd * (m+2)];

   for (i=pd; i>=1; i--) {
       pt[i] = mlsafir (pt[i-1], b, m, a, &d[(i-1)*(m+2)]);

       v = pt[i] * vs->ppade[i];

       x  += (1&i) ? v : -v;
       out += v;
   }
    
   pt[0] = x;
   out  += x;

   return(out);
}

static double mlsafir (double x, double *b, int m, double a, double *d)
{  
   double y = 0.0;
   double aa;
   register int i;

   aa = 1 - a*a;

   d[0] = x;
   d[1] = aa*d[0] + a*d[1];
   for (i=2; i<= m; i++) {
      d[i] = d[i] + a*(d[i+1]-d[i-1]);
      y += d[i]*b[i];
   }

   for (i=m+1; i>1; i--) 
      d[i] = d[i-1];

   return(y);
}

static double nrandom (VocoderSetup *vs)
{
   if (vs->sw == 0) {
      vs->sw = 1;
      do {
         vs->r1 = 2.0 * rnd(&vs->next) - 1.0;
         vs->r2 = 2.0 * rnd(&vs->next) - 1.0;
         vs->s  = vs->r1 * vs->r1 + vs->r2 * vs->r2;
      } while (vs->s > 1 || vs->s == 0);

      vs->s = sqrt (-2 * log(vs->s) / vs->s);
      
      return(vs->r1*vs->s);
   }
   else {
      vs->sw = 0;
      
      return (vs->r2*vs->s);
   }
}

static double rnd (unsigned long *next)
{
   double r;

   *next = *next * 1103515245L + 12345;
   r = (*next / 65536L) % 32768L;

   return(r/RANDMAX); 
}

static unsigned long srnd ( unsigned long seed )
{
   return(seed);
}

/* mc2b : transform mel-cepstrum to MLSA digital fillter coefficients */
static void mc2b (double *mc, double *b, int m, double a)
{
   b[m] = mc[m];
    
   for (m--; m>=0; m--)
      b[m] = mc[m] - a * b[m+1];
   
   return;
}


static double b2en (double *b, int m, double a, VocoderSetup *vs)
{
   double en;
   int k;
   
   if (vs->o<m) {
      if (vs->mc != NULL)
          cst_free(vs->mc);
    
      vs->mc = cst_alloc(double,(m + 1) + 2 * vs->irleng);
      vs->cep = vs->mc + m+1;
      vs->ir  = vs->cep + vs->irleng;
   }

   b2mc(b, vs->mc, m, a);
   freqt(vs->mc, m, vs->cep, vs->irleng-1, -a, vs);
   c2ir(vs->cep, vs->irleng, vs->ir, vs->irleng);
   en = 0.0;
   
   for (k=0;k<vs->irleng;k++)
      en += vs->ir[k] * vs->ir[k];

   return(en);
}


/* b2bc : transform MLSA digital filter coefficients to mel-cepstrum */
static void b2mc (double *b, double *mc, int m, double a)
{
  double d, o;
        
  d = mc[m] = b[m];
  for (m--; m>=0; m--) {
    o = b[m] + a * d;
    d = b[m];
    mc[m] = o;
  }
  
  return;
}

/* freqt : frequency transformation */
static void freqt (double *c1, int m1, double *c2, int m2, double a, VocoderSetup *vs)
{
   register int i, j;
   double b;
    
   if (vs->d==NULL) {
      vs->size = m2;
      vs->d    = cst_alloc(double,vs->size + vs->size + 2);
      vs->g    = vs->d+vs->size+1;
   }

   if (m2>vs->size) {
       cst_free(vs->d);
      vs->size = m2;
      vs->d    = cst_alloc(double,vs->size + vs->size + 2);
      vs->g    = vs->d+vs->size+1;
   }
    
   b = 1-a*a;
   for (i=0; i<m2+1; i++)
      vs->g[i] = 0.0;

   for (i=-m1; i<=0; i++) {
      if (0 <= m2)
         vs->g[0] = c1[-i]+a*(vs->d[0]=vs->g[0]);
      if (1 <= m2)
         vs->g[1] = b*vs->d[0]+a*(vs->d[1]=vs->g[1]);
      for (j=2; j<=m2; j++)
         vs->g[j] = vs->d[j-1]+a*((vs->d[j]=vs->g[j])-vs->g[j-1]);
   }

   memmove(c2,vs->g,sizeof(double)*(m2+1));
   
   return;
}

/* c2ir : The minimum phase impulse response is evaluated from the minimum phase cepstrum */
static void c2ir (double *c, int nc, double *h, int leng)
{
   register int n, k, upl;
   double  d;

   h[0] = exp(c[0]);
   for (n=1; n<leng; n++) {
      d = 0;
      upl = (n>=nc) ? nc-1 : n;
      for (k=1; k<=upl; k++)
         d += k*c[k]*h[n-k];
      h[n] = d/n;
   }
   
   return;
}

static void free_vocoder(VocoderSetup *vs)
{

    cst_free(vs->c);
    cst_free(vs->mc);
    cst_free(vs->d);
 
    vs->c = NULL;
    vs->mc = NULL;
    vs->d = NULL;
    vs->ppade = NULL;
    vs->cc = NULL;
    vs->cinc = NULL;
    vs->d1 = NULL;
    vs->g = NULL;
    vs->cep = NULL;
    vs->ir = NULL;

    cst_free(vs->hpulse);
    cst_free(vs->hnoise);
    cst_free(vs->xpulsesig);
    cst_free(vs->xnoisesig);

   
    return;
}