shithub: flite

ref: 4e2ee15fa101e23939f78f246754de558034fa40
dir: /src/lexicon/cst_lexicon.c/

View raw version
/*************************************************************************/
/*                                                                       */
/*                  Language Technologies Institute                      */
/*                     Carnegie Mellon University                        */
/*                        Copyright (c) 1999                             */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
/*               Date:  December 1999                                    */
/*************************************************************************/
/*                                                                       */
/*  Lexicon related functions                                            */
/*                                                                       */
/*************************************************************************/

#include "cst_features.h"
#include "cst_lexicon.h"
#include "cst_tokenstream.h"

CST_VAL_REGISTER_TYPE_NODEL(lexicon,cst_lexicon)

#define WP_SIZE 64

static int no_syl_boundaries(const cst_item *i, const cst_val *p);
static cst_val *lex_lookup_addenda(const char *wp,const cst_lexicon *l,
                                   int *found);

static int lex_match_entry(const char *a, const char *b);
static int lex_lookup_bsearch(const cst_lexicon *l,const char *word);
static int find_full_match(const cst_lexicon *l,
			   int i,const char *word);

cst_lexicon *new_lexicon()
{
    cst_lexicon *l = cst_alloc(cst_lexicon,1);
    l->syl_boundary = no_syl_boundaries;
    return l;
}

void delete_lexicon(cst_lexicon *lex)
{   /* But I doubt if this will ever be called, lexicons are mapped */
    /* This probably isn't complete */
    if (lex)
    {
	cst_free(lex->data);
	cst_free(lex);
    }
}

cst_val *cst_lex_load_addenda(const cst_lexicon *lex, const char *lexfile)
{   /* Load an addend from given file, check its phones wrt lex */
    cst_tokenstream *lf;
    const cst_string *line;
    cst_val *e = NULL;
    cst_val *na = NULL;
    int i;

    lf = ts_open(lexfile,"\n","","","");
    if (lf == NULL)
    {
	cst_errmsg("lex_add_addenda: cannot open lexicon file\n");
        return NULL;;
    }

    while (!ts_eof(lf))
    {
        line = ts_get(lf);
        if (line[0] == '#')
            continue;  /* a comment */
        for (i=0; line[i]; i++)
        {
            if (line[i] != ' ')
                break;
        }
        if (line[i])
        {
            e = cst_lex_make_entry(lex,line);
            if (e)
                na = cons_val(e,na);
        }
        else
            continue;  /* a blank line */
    }

    ts_close(lf);
    return val_reverse(na);
}

cst_val *cst_lex_make_entry(const cst_lexicon *lex, const cst_string *entry)
{   /* if replace then replace entry in addenda of lex with entry */
    /* else append entry to addenda of lex                        */
    cst_tokenstream *e;
    cst_val *phones = NULL;
    cst_val *ventry;
    const cst_string *w, *p;
    cst_string *word;
    cst_string *pos;
    int i;

    e = ts_open_string(entry,
                       cst_ts_default_whitespacesymbols,
                       "","","");

    w = ts_get(e);
    if (w[0] == '"') /* it was a quoted entry */
    {                   /* so reparse it */
        ts_close(e);
        e = ts_open_string(entry,
                           cst_ts_default_whitespacesymbols,
                           "","","");
        w = ts_get_quoted_token(e,'"','\\');
    }

    word = cst_strdup(w);
    p = ts_get(e);
    if (!cst_streq(":",p)) /* there is a real pos */
    {
        pos = cst_strdup(p);
        p = ts_get(e);
        if (!cst_streq(":",p)) /* there is a real pos */
        {
            cst_fprintf(stdout,"add_addenda: lex %s: expected \":\" in %s\n",
                        lex->name,
                        word);
            cst_free(word); cst_free(pos); ts_close(e);
            return NULL;
        }
    }
    else
        pos = cst_strdup("nil");

    while (!ts_eof(e))
    {
        p = ts_get(e);
        /* Check its a legal phone */
        for (i=0; lex->phone_table[i]; i++)
       {
            if (cst_streq(p,lex->phone_table[i]))
                break;
        }
        if (cst_streq("#",p)) /* comment to end of line */
            break;
        else if (cst_streq("",p)) /* trailing ws at eoln causes this */
            break;
        else if (lex->phone_table[i])
            /* Only add it if its a valid phone */
            phones = cons_val(string_val(p),phones);
        else
        {
            cst_fprintf(stdout,"add_addenda: lex: %s word %s phone %s not in lexicon phoneset\n",
                        lex->name,
                        word,
                        p);
        }
    }

    ventry = cons_val(string_val(word),cons_val(string_val(pos),
                                                val_reverse(phones)));
    cst_free(word); cst_free(pos); ts_close(e);
#if 0
    printf("entry: ");
    val_print(stdout,ventry);
    printf("\n");
#endif

    return ventry;
 }

#if 0
void lexicon_register(cst_lexicon *lex)
{
    /* Add given lexicon to list of known lexicons */
    cst_lexicon **old_lexs;
    int i;
    
    old_lexs = flite_lexicons;
    flite_num_lexicons++;
    flite_lexicons = cst_alloc(cst_lexicon *,flite_num_lexicons);
    for (i=0; i<flite_num_lexicons-1; i++)
	flite_lexicons[i] = old_lexs[i];
    flite_lexicons[i] = lex;
    cst_free(old_lexs);
}

cst_lexicon *lexicon_select(const char *name)
{
    int i;

    for (i=0; i < flite_num_lexicons; i++)
	if (cst_streq(name,flite_lexicons[i]->name))
	    return flite_lexicons[i];
    return NULL;
}
#endif

static int no_syl_boundaries(const cst_item *i, const cst_val *p)
{
    /* This is a default function that will normally be replaced */
    /* for each lexicon                                         */
    (void)i;
    (void)p;
    return FALSE;
}

int in_lex(const cst_lexicon *l, const char *word, const char *pos,
           const cst_features *feats)
{
    /* return TRUE is its in the lexicon */
    int r = FALSE, i;
    char *wp;

    wp = cst_alloc(char,cst_strlen(word)+2);
    cst_sprintf(wp,"%c%s",(pos ? pos[0] : '0'),word);

    for (i=0; l->addenda && l->addenda[i]; i++)
    {
	if (((wp[0] == '0') || (wp[0] == l->addenda[i][0][0])) &&
	    (cst_streq(wp+1,l->addenda[i][0]+1)))
	{
	    r = TRUE;
	    break;
	}
    }

    if (!r && (lex_lookup_bsearch(l,wp) >= 0))
	r = TRUE;

    cst_free(wp);
    return r;
}

cst_val *lex_lookup(const cst_lexicon *l, const char *word, const char *pos,
                    const cst_features *feats)
{
    int index;
    int p;
    const char *q;
    char *wp;
    cst_val *phones = 0;
    int found = FALSE;

    wp = cst_alloc(char,cst_strlen(word)+2);
    cst_sprintf(wp,"%c%s",(pos ? pos[0] : '0'),word);

    if (l->addenda)
	phones = lex_lookup_addenda(wp,l,&found);

    if (!found)
    {
	index = lex_lookup_bsearch(l,wp);

	if (index >= 0)
	{
	    if (l->phone_hufftable)
	    {
		for (p=index-2; l->data[p]; p--)
		    for (q=l->phone_hufftable[l->data[p]]; *q; q++)
			phones = cons_val(string_val(l->phone_table[(unsigned char)*q]),
				  phones);
	    }
	    else  /* no compression -- should we still support this ? */
	    {
		for (p=index-2; l->data[p]; p--)
		    phones = cons_val(string_val(l->phone_table[l->data[p]]),
				      phones);
	    }
	    phones = val_reverse(phones);
	}
	else if (l->lts_function)
	{
	    phones = (l->lts_function)(l,word,"",feats);
	}
	else if (l->lts_rule_set)
	{
	    phones = lts_apply(word,
			       "",  /* more features if we had them */
			       l->lts_rule_set);
	}
    }

    cst_free(wp);
    
    return phones;
}

static cst_val *lex_lookup_addenda(const char *wp,const cst_lexicon *l, 
				   int *found)
{
    /* For those other words */
    int i,j;
    cst_val *phones;
    
    phones = NULL;

    for (i=0; l->addenda[i]; i++)
    {
	if (((wp[0] == '0') || 
             (wp[0] == l->addenda[i][0][0]) || 
             (l->addenda[i][0][0] == '0')) &&
	    (cst_streq(wp+1,l->addenda[i][0]+1)))
	{
	    for (j=1; l->addenda[i][j]; j++)
		phones = cons_val(string_val(l->addenda[i][j]),phones);
	    *found = TRUE;
	    return val_reverse(phones);
	}
    }
    
    return NULL;
}

static int lex_uncompress_word(char *ucword,int max_size,
			       int p,const cst_lexicon *l)
{
    int i,j=0,length;
    unsigned char *cword;

    if (l->entry_hufftable == 0)
        /* can have "compressed" lexicons without compression */
	cst_sprintf(ucword,"%s",&l->data[p]);
    else
    {
	cword = &l->data[p];
	for (i=0,j=0; cword[i]; i++)
	{
	    length = cst_strlen(l->entry_hufftable[cword[i]]);
	    if (j+length+1<max_size)
	    {
		memmove(ucword+j,l->entry_hufftable[cword[i]],length);
		j += length;
	    }
	    else
		break;
	}
	ucword[j] = '\0';
    }

    return j;
}
static int lex_data_next_entry(const cst_lexicon *l,int p,int end)
{
    for (p++; p < end; p++)
	if (l->data[p-1] == 255)
	    return p;
    return end;
}
static int lex_data_prev_entry(const cst_lexicon *l,int p,int start)
{
    for (p--; p > start; p--)
	if (l->data[p-1] == 255)
	    return p;
    return start;
}
static int lex_data_closest_entry(const cst_lexicon *l,int p,int start,int end)
{
    int d;

    d=0;
    while ((p-d > start) && 
	   (p+d < end))
    {
	if (l->data[(p+d)-1] == 255)
        {
	    return p+d;
        }
	else if (l->data[(p-d)-1] == 255)
        {
	    return p-d;
        }
	d++;
    }
    return p-d;
}

static int lex_lookup_bsearch(const cst_lexicon *l, const char *word)
{
    int start,mid,end,c;
    /* needs to be longer than longest word in lexicon */
    char word_pos[WP_SIZE];

    start = 0;
    end = l->num_bytes;
    while (start < end) {
	mid = (start + end)/2;

	/* find previous entry start */
	mid = lex_data_closest_entry(l,mid,start,end);
	lex_uncompress_word(word_pos,WP_SIZE,mid,l);

	c = lex_match_entry(word_pos,word);

	if (c == 0)
        {
	    return find_full_match(l,mid,word);
        }
	else if (c > 0)
	    end = mid;
	else
        {
	    start = lex_data_next_entry(l,mid + 1,end);
        }

#if 0
	if (l->data[start-1] == 255)
	{
	    lex_uncompress_word(word_pos,WP_SIZE,start,l);
	    printf("start %s %d ",word_pos,start);
	}
	else
	    printf("start NULL %d ",start);
	if (l->data[mid-1] == 255)
	{
	    lex_uncompress_word(word_pos,WP_SIZE,mid,l);
	    printf("mid %s %d ",word_pos,mid);
	}
	else
	    printf("mid NULL %d ",mid);
	if (l->data[end-1] == 255)
	{
	    lex_uncompress_word(word_pos,WP_SIZE,end,l);
	    printf("end %s %d ",word_pos,end);
	}
	else
	    printf("end NULL %d ",end);
	printf("\n");
#endif

    }
    return -1;
}

static int find_full_match(const cst_lexicon *l,
			   int i,const char *word)

{
    /* found word, now look for actual match including pos */
    int w, match=i;
    /* needs to be longer than longest word in lexicon */
    char word_pos[WP_SIZE];

    for (w=i; w > 0; )
    {
	lex_uncompress_word(word_pos,WP_SIZE,w,l);
	if (!cst_streq(word+1,word_pos+1))
	    break;
	else if (cst_streq(word,word_pos))
	    return w;
	match = w;  /* if we can't find an exact match we'll take this one */
        /* go back to last entry */
	w = lex_data_prev_entry(l,w,0);
    }

    for (w=i; w < l->num_bytes;)
    {
	lex_uncompress_word(word_pos,WP_SIZE,w,l);
	if (!cst_streq(word+1,word_pos+1))
	    break;
	else if (cst_streq(word,word_pos))
	    return w;
        /* go to next entry */
	w = lex_data_next_entry(l,w,l->num_bytes);
    }

    return match;
}

static int lex_match_entry(const char *a, const char *b)
{
    int c;

    c = strcmp(a+1,b+1);

    return c;
}