ref: b66315e2c408ce553082f722b0a9587a5ab5460e
dir: /src/lexicon/cst_lexicon.c/
/*************************************************************************/
/*                                                                       */
/*                  Language Technologies Institute                      */
/*                     Carnegie Mellon University                        */
/*                        Copyright (c) 1999                             */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
/*               Date:  December 1999                                    */
/*************************************************************************/
/*                                                                       */
/*  Lexicon related functions                                            */
/*                                                                       */
/*************************************************************************/
#include "cst_features.h"
#include "cst_lexicon.h"
#include "cst_tokenstream.h"

CST_VAL_REGISTER_TYPE_NODEL(lexicon,cst_lexicon)

/* Size of the scratch buffers used to hold one uncompressed lexicon    */
/* key (pos character + word); needs to be longer than the longest      */
/* word in the lexicon.                                                  */
#define WP_SIZE 64

static int no_syl_boundaries(const cst_item *i, const cst_val *p);
static cst_val *lex_lookup_addenda(const char *wp,const cst_lexicon *l,
                                   int *found);

static int lex_match_entry(const char *a, const char *b);
static int lex_lookup_bsearch(const cst_lexicon *l,const char *word);
static int find_full_match(const cst_lexicon *l,
                           int i,const char *word);

/* Allocate a new lexicon whose syllable-boundary function is the       */
/* default no_syl_boundaries(); all other fields are left as returned   */
/* by cst_alloc().                                                       */
cst_lexicon *new_lexicon()
{
    cst_lexicon *l = cst_alloc(cst_lexicon,1);
    l->syl_boundary = no_syl_boundaries;
    return l;
}

/* Free the lexicon's data block and the lexicon structure itself.      */
/* A NULL lex is ignored.                                                */
void delete_lexicon(cst_lexicon *lex)
{   /* But I doubt if this will ever be called, lexicons are mapped */
    /* This probably isn't complete */
    if (lex)
    {
        cst_free(lex->data);
        cst_free(lex);
    }
}

/* Load addenda entries from lexfile, one entry per line.               */
/* Lines whose first character is '#' and lines containing only spaces  */
/* are skipped; every other line is parsed by cst_lex_make_entry(),     */
/* which also checks the phones against lex.  Entries that fail to      */
/* parse are dropped.                                                    */
/* Returns the parsed entries in file order, or NULL if the file cannot */
/* be opened.                                                            */
cst_val *cst_lex_load_addenda(const cst_lexicon *lex, const char *lexfile)
{
    cst_tokenstream *lf;
    const cst_string *line;
    cst_val *e = NULL;
    cst_val *na = NULL;
    int i;

    lf = ts_open(lexfile,"\n","","","");
    if (lf == NULL)
    {
        cst_errmsg("lex_add_addenda: cannot open lexicon file\n");
        return NULL;
    }

    while (!ts_eof(lf))
    {
        line = ts_get(lf);
        if (line[0] == '#')
            continue;           /* a comment */

        /* find the first non-space character, if any */
        for (i=0; line[i]; i++)
        {
            if (line[i] != ' ')
                break;
        }
        if (!line[i])
            continue;           /* a blank line */

        e = cst_lex_make_entry(lex,line);
        if (e)
            na = cons_val(e,na);
    }

    ts_close(lf);
    /* entries were consed on in reverse, so restore file order */
    return val_reverse(na);
}

/* Parse one textual lexicon entry of the form                          */
/*     WORD [POS] : PHONE PHONE ...                                      */
/* WORD may be double-quoted (with backslash escapes).  If POS is       */
/* omitted it defaults to "nil".  A "#" token ends the phone list.      */
/* Phones that are not in lex->phone_table are reported on stdout and   */
/* left out of the result.                                               */
/* Returns a list (word pos phone ...), or NULL if the ":" separator is */
/* not where it is expected.                                             */
cst_val *cst_lex_make_entry(const cst_lexicon *lex, const cst_string *entry)
{
    cst_tokenstream *e;
    cst_val *phones = NULL;
    cst_val *ventry;
    const cst_string *w, *p;
    cst_string *word;
    cst_string *pos;
    int i;

    e = ts_open_string(entry,
                       cst_ts_default_whitespacesymbols,
                       "","","");

    w = ts_get(e);
    if (w[0] == '"') /* it was a quoted entry */
    {   /* so reparse it */
        ts_close(e);
        e = ts_open_string(entry,
                           cst_ts_default_whitespacesymbols,
                           "","","");
        w = ts_get_quoted_token(e,'"','\\');
    }

    /* copy the word: w is only a pointer returned by the stream */
    word = cst_strdup(w);
    p = ts_get(e);
    if (!cst_streq(":",p)) /* there is a real pos */
    {
        pos = cst_strdup(p);
        p = ts_get(e);
        if (!cst_streq(":",p)) /* the separator must follow the pos */
        {
            cst_fprintf(stdout,"add_addenda: lex %s: expected \":\" in %s\n",
                        lex->name,
                        word);
            cst_free(word);
            cst_free(pos);
            ts_close(e);
            return NULL;
        }
    }
    else
        pos = cst_strdup("nil");

    while (!ts_eof(e))
    {
        p = ts_get(e);
        if (cst_streq("#",p)) /* comment to end of line */
            break;
        if (cst_streq("",p)) /* trailing ws at eoln causes this */
            break;

        /* Check its a legal phone */
        for (i=0; lex->phone_table[i]; i++)
        {
            if (cst_streq(p,lex->phone_table[i]))
                break;
        }
        if (lex->phone_table[i])
            /* Only add it if its a valid phone */
            phones = cons_val(string_val(p),phones);
        else
        {
            cst_fprintf(stdout,"add_addenda: lex: %s word %s phone %s not in lexicon phoneset\n",
                        lex->name,
                        word,
                        p);
        }
    }

    ventry = cons_val(string_val(word),cons_val(string_val(pos),
                                                val_reverse(phones)));
    cst_free(word);
    cst_free(pos);
    ts_close(e);
#if 0
    printf("entry: ");
    val_print(stdout,ventry);
    printf("\n");
#endif

    return ventry;
}

#if 0
void lexicon_register(cst_lexicon *lex)
{
    /* Add given lexicon to list of known lexicons */
    cst_lexicon **old_lexs;
    int i;

    old_lexs = flite_lexicons;
    flite_num_lexicons++;
    flite_lexicons = cst_alloc(cst_lexicon *,flite_num_lexicons);
    for (i=0; i<flite_num_lexicons-1; i++)
        flite_lexicons[i] = old_lexs[i];
    flite_lexicons[i] = lex;
    cst_free(old_lexs);
}

cst_lexicon *lexicon_select(const char *name)
{
    int i;

    for (i=0; i < flite_num_lexicons; i++)
        if (cst_streq(name,flite_lexicons[i]->name))
            return flite_lexicons[i];
    return NULL;
}
#endif

static int no_syl_boundaries(const cst_item *i, const cst_val *p)
{
    /* This is a default function that will normally be replaced */
    /* for each lexicon                                          */
    (void)i;
    (void)p;
    return FALSE;
}

/* Return TRUE if word is in the lexicon's addenda or its compiled      */
/* data, FALSE otherwise.  Only the first character of pos is used; a   */
/* NULL pos is treated as '0', which matches an addenda entry with any  */
/* pos character.  feats is not used.                                    */
int in_lex(const cst_lexicon *l, const char *word, const char *pos,
           const cst_features *feats)
{
    int r = FALSE, i;
    char *wp;

    (void)feats;

    /* lookup key: one pos character followed by the word */
    wp = cst_alloc(char,cst_strlen(word)+2);
    cst_sprintf(wp,"%c%s",(pos ? pos[0] : '0'),word);

    for (i=0; l->addenda && l->addenda[i]; i++)
    {
        if (((wp[0] == '0') || (wp[0] == l->addenda[i][0][0])) &&
            (cst_streq(wp+1,l->addenda[i][0]+1)))
        {
            r = TRUE;
            break;
        }
    }

    if (!r && (lex_lookup_bsearch(l,wp) >= 0))
        r = TRUE;

    cst_free(wp);
    return r;
}

/* Look up the pronunciation of word.  The sources are tried in order:  */
/* the addenda, the compiled lexicon data, the lexicon's lts_function,  */
/* and finally its lts_rule_set.  Only the first character of pos is    */
/* used; a NULL pos is treated as '0'.                                   */
/* Returns a newly built list of phone names; it is NULL if the word is */
/* not in the lexicon and no letter-to-sound fallback is configured.    */
cst_val *lex_lookup(const cst_lexicon *l, const char *word, const char *pos,
                    const cst_features *feats)
{
    int index;
    int p;
    const char *q;
    char *wp;
    cst_val *phones = NULL;
    int found = FALSE;

    /* lookup key: one pos character followed by the word */
    wp = cst_alloc(char,cst_strlen(word)+2);
    cst_sprintf(wp,"%c%s",(pos ? pos[0] : '0'),word);

    if (l->addenda)
        phones = lex_lookup_addenda(wp,l,&found);

    if (!found)
    {
        index = lex_lookup_bsearch(l,wp);

        if (index >= 0)
        {
            /* The phone bytes are read backwards from index-2 down to */
            /* a zero byte.                                            */
            if (l->phone_hufftable)
            {
                /* each data byte expands to a string of phone indices */
                for (p=index-2; l->data[p]; p--)
                    for (q=l->phone_hufftable[l->data[p]]; *q; q++)
                        phones = cons_val(string_val(l->phone_table[(unsigned char)*q]),
                                          phones);
            }
            else /* no compression -- should we still support this ? */
            {
                for (p=index-2; l->data[p]; p--)
                    phones = cons_val(string_val(l->phone_table[l->data[p]]),
                                      phones);
            }
            phones = val_reverse(phones);
        }
        else if (l->lts_function)
        {
            phones = (l->lts_function)(l,word,"",feats);
        }
        else if (l->lts_rule_set)
        {
            phones = lts_apply(word,
                               "", /* more features if we had them */
                               l->lts_rule_set);
        }
    }

    cst_free(wp);
    return phones;
}

/* Search the addenda for key wp (pos character followed by the word).  */
/* An entry matches when the words are equal and either pos character   */
/* is '0' or the two pos characters are equal.  On a match *found is    */
/* set to TRUE and the entry's phones are returned as a list; otherwise */
/* *found is left untouched and NULL is returned.                        */
static cst_val *lex_lookup_addenda(const char *wp,const cst_lexicon *l,
                                   int *found)
{
    /* For those other words */
    int i,j;
    cst_val *phones;

    phones = NULL;

    for (i=0; l->addenda[i]; i++)
    {
        if (((wp[0] == '0') ||
             (wp[0] == l->addenda[i][0][0]) ||
             (l->addenda[i][0][0] == '0')) &&
            (cst_streq(wp+1,l->addenda[i][0]+1)))
        {
            /* element 0 is the key, the phones start at element 1 */
            for (j=1; l->addenda[i][j]; j++)
                phones = cons_val(string_val(l->addenda[i][j]),phones);
            *found = TRUE;
            return val_reverse(phones);
        }
    }

    return NULL;
}

/* Copy the entry key stored at offset p of the lexicon data into       */
/* ucword, expanding it through entry_hufftable when the lexicon has    */
/* one.  At most max_size-1 characters are written and the result is    */
/* always NUL terminated; a key that does not fit is truncated (in the  */
/* compressed case at a table-string boundary).                          */
/* Returns the number of characters stored, excluding the terminator.   */
static int lex_uncompress_word(char *ucword,int max_size,
                               int p,const cst_lexicon *l)
{
    int i,j=0,length;
    const unsigned char *cword;

    if (max_size <= 0)
        return 0;               /* no room even for the terminator */

    cword = &l->data[p];
    if (l->entry_hufftable == 0)
    {
        /* can have "compressed" lexicons without compression */
        /* Bounded copy: an unchecked "%s" format here could run past */
        /* the end of the caller's fixed-size buffer.                  */
        length = cst_strlen((const char *)cword);
        if (length >= max_size)
            length = max_size - 1;
        memmove(ucword,cword,length);
        j = length;
    }
    else
    {
        for (i=0; cword[i]; i++)
        {
            length = cst_strlen(l->entry_hufftable[cword[i]]);
            if (j+length+1 >= max_size)
                break;          /* next piece would not fit */
            memmove(ucword+j,l->entry_hufftable[cword[i]],length);
            j += length;
        }
    }
    ucword[j] = '\0';

    return j;
}

/* Return the first offset after p (and below end) whose preceding byte */
/* is 255, i.e. the start of the next entry; returns end if none.       */
static int lex_data_next_entry(const cst_lexicon *l,int p,int end)
{
    for (p++; p < end; p++)
        if (l->data[p-1] == 255)
            return p;
    return end;
}

/* Return the last offset before p (and above start) whose preceding    */
/* byte is 255, i.e. the start of the previous entry; returns start if  */
/* none.                                                                 */
static int lex_data_prev_entry(const cst_lexicon *l,int p,int start)
{
    for (p--; p > start; p--)
        if (l->data[p-1] == 255)
            return p;
    return start;
}

/* Search outwards from p, trying p+d before p-d for d = 0,1,2,..., for */
/* an offset whose preceding byte is 255 (an entry start).  The search  */
/* stops as soon as p-d reaches start or p+d reaches end, in which case */
/* p-d is returned.                                                      */
static int lex_data_closest_entry(const cst_lexicon *l,int p,int start,int end)
{
    int d;

    d=0;
    while ((p-d > start) &&
           (p+d < end))
    {
        if (l->data[(p+d)-1] == 255)
        {
            return p+d;
        }
        else if (l->data[(p-d)-1] == 255)
        {
            return p-d;
        }
        d++;
    }
    return p-d;
}

/* Binary search the lexicon data for word, a key made of a pos         */
/* character followed by the word.  The search compares the word part   */
/* only; once an entry with the same word is found, find_full_match()   */
/* picks the entry with the best pos match.                              */
/* Returns the data offset of the matching entry, or -1 if the word is  */
/* not present.                                                          */
static int lex_lookup_bsearch(const cst_lexicon *l, const char *word)
{
    int start,mid,end,c;
    /* needs to be longer than longest word in lexicon */
    char word_pos[WP_SIZE];

    start = 0;
    end = l->num_bytes;
    while (start < end)
    {
        mid = (start + end)/2;

        /* snap the midpoint to the closest entry start */
        mid = lex_data_closest_entry(l,mid,start,end);
        lex_uncompress_word(word_pos,WP_SIZE,mid,l);

        c = lex_match_entry(word_pos,word);

        if (c == 0)
        {
            return find_full_match(l,mid,word);
        }
        else if (c > 0)
            end = mid;
        else
        {
            start = lex_data_next_entry(l,mid + 1,end);
        }

#if 0
        if (l->data[start-1] == 255)
        {
            lex_uncompress_word(word_pos,WP_SIZE,start,l);
            printf("start %s %d ",word_pos,start);
        }
        else
            printf("start NULL %d ",start);
        if (l->data[mid-1] == 255)
        {
            lex_uncompress_word(word_pos,WP_SIZE,mid,l);
            printf("mid %s %d ",word_pos,mid);
        }
        else
            printf("mid NULL %d ",mid);
        if (l->data[end-1] == 255)
        {
            lex_uncompress_word(word_pos,WP_SIZE,end,l);
            printf("end %s %d ",word_pos,end);
        }
        else
            printf("end NULL %d ",end);
        printf("\n");
#endif
    }
    return -1;
}

/* Entry i is known to have the same word as the key; scan the          */
/* neighbouring entries with that word, first backwards and then        */
/* forwards, for one whose pos character matches as well.                */
/* Returns the offset of an exact (pos + word) match if there is one,   */
/* otherwise the earliest entry with the same word seen during the      */
/* backward scan.                                                        */
static int find_full_match(const cst_lexicon *l,
                           int i,const char *word)
{
    /* found word, now look for actual match including pos */
    int w, match=i;
    /* needs to be longer than longest word in lexicon */
    char word_pos[WP_SIZE];

    for (w=i; w > 0; )
    {
        lex_uncompress_word(word_pos,WP_SIZE,w,l);
        if (!cst_streq(word+1,word_pos+1))
            break;              /* moved on to a different word */
        else if (cst_streq(word,word_pos))
            return w;
        match = w;  /* if we can't find an exact match we'll take this one */
        /* go back to last entry */
        w = lex_data_prev_entry(l,w,0);
    }

    for (w=i; w < l->num_bytes;)
    {
        lex_uncompress_word(word_pos,WP_SIZE,w,l);
        if (!cst_streq(word+1,word_pos+1))
            break;              /* moved on to a different word */
        else if (cst_streq(word,word_pos))
            return w;
        /* go to next entry */
        w = lex_data_next_entry(l,w,l->num_bytes);
    }

    return match;
}

/* Compare two lookup keys on their word part only: the leading pos     */
/* character of each is skipped.  Returns the strcmp() result.           */
static int lex_match_entry(const char *a, const char *b)
{
    int c;

    c = strcmp(a+1,b+1);

    return c;
}