ref: f136f4eb6b2d32aa04499aeee872d3d7586e925f
dir: /lang/usenglish/us_expand.c/
/*************************************************************************/ /* */ /* Language Technologies Institute */ /* Carnegie Mellon University */ /* Copyright (c) 2001 */ /* All Rights Reserved. */ /* */ /* Permission is hereby granted, free of charge, to use and distribute */ /* this software and its documentation without restriction, including */ /* without limitation the rights to use, copy, modify, merge, publish, */ /* distribute, sublicense, and/or sell copies of this work, and to */ /* permit persons to whom this work is furnished to do so, subject to */ /* the following conditions: */ /* 1. The code must retain the above copyright notice, this list of */ /* conditions and the following disclaimer. */ /* 2. Any modifications must be clearly marked as such. */ /* 3. Original authors' names are not deleted. */ /* 4. The authors' names are not used to endorse or promote products */ /* derived from this software without specific prior written */ /* permission. */ /* */ /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */ /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ /* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */ /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ /* THIS SOFTWARE. */ /* */ /*************************************************************************/ /* Author: Alan W Black (awb@cs.cmu.edu) */ /* Date: January 2001 */ /*************************************************************************/ /* */ /* English text expanders */ /* */ /* numbers, digits, ids (years), money */ /* */ /*************************************************************************/ #include <ctype.h> #include "us_text.h" static const char * const digit2num[] = { "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine" }; static const char * const digit2teen[] = { "ten", /* shouldn't get called */ "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen" }; static const char * const digit2enty[] = { "zero", /* shouldn't get called */ "ten", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety" }; static const char * const ord2num[] = { "zeroth", "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth" }; static const char * const ord2teen[] = { "tenth", /* shouldn't get called */ "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth" }; static const char * const ord2enty[] = { "zeroth", /* shouldn't get called */ "tenth", "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth", "eightieth", "ninetieth" }; cst_val *en_exp_number(const char *numstring) { /* Expand given token to list of words pronouncing it as a number */ int num_digits = cst_strlen(numstring); char part[4]; cst_val *p; int i; if (num_digits == 0) return NULL; else if (num_digits == 1) return en_exp_digits(numstring); else if (num_digits == 2) { if (numstring[0] == '0') { if (numstring[1] == '0') return 0; else return cons_val(string_val(digit2num[numstring[1]-'0']),0); } else if (numstring[1] == '0') return cons_val(string_val(digit2enty[numstring[0]-'0']),0); else if (numstring[0] == '1') return cons_val(string_val(digit2teen[numstring[1]-'0']),0); else return cons_val(string_val(digit2enty[numstring[0]-'0']), en_exp_digits(numstring+1)); } else if (num_digits == 3) { if (numstring[0] == '0') return en_exp_number(numstring+1); else return cons_val(string_val(digit2num[numstring[0]-'0']), cons_val(string_val("hundred"), en_exp_number(numstring+1))); } else if (num_digits < 7) { for (i=0; i < num_digits-3; i++) part[i] = numstring[i]; part[i]='\0'; p = en_exp_number(part); if (p == 0) /* no thousands */ return en_exp_number(numstring+i); else return val_append(p,cons_val(string_val("thousand"), en_exp_number(numstring+i))); } else if (num_digits < 10) { for (i=0; i < num_digits-6; i++) part[i] = numstring[i]; part[i]='\0'; p = en_exp_number(part); if (p == 0) /* no millions */ return en_exp_number(numstring+i); else return val_append(p,cons_val(string_val("million"), en_exp_number(numstring+i))); } else if (num_digits < 13) { /* If there are pedantic brits out there, tough!, 10^9 is a billion */ for (i=0; i < num_digits-9; i++) part[i] = numstring[i]; part[i]='\0'; p = en_exp_number(part); if (p == 0) /* no billions */ return en_exp_number(numstring+i); else return val_append(p,cons_val(string_val("billion"), en_exp_number(numstring+i))); } else /* Way too many digits here, to be a number */ { return en_exp_digits(numstring); } } cst_val *en_exp_ordinal(const char *rawnumstring) { /* return ordinal for digit string */ cst_val *card, *o; const cst_val *t; const char *l; const char *ord; char *numstring; int i,j; numstring = cst_strdup(rawnumstring); for (j=i=0; i < (signed int)cst_strlen(rawnumstring); i++) if (rawnumstring[i] != ',') { numstring[j] = rawnumstring[i]; j++; } numstring[j] = '\0'; card = val_reverse(en_exp_number(numstring)); if (!card) card = cons_val(string_val("zero"),0); cst_free(numstring); l = val_string(val_car(card)); ord = 0; for (i=0; i<10; i++) if (cst_streq(l,digit2num[i])) ord = ord2num[i]; if (!ord) for (i=0; i<10; i++) if (cst_streq(l,digit2teen[i])) ord = ord2teen[i]; if (!ord) for (i=0; i<10; i++) if (cst_streq(l,digit2enty[i])) ord = ord2enty[i]; if (cst_streq(l,"hundred")) ord = "hundredth"; if (cst_streq(l,"thousand")) ord = "thousandth"; if (cst_streq(l,"billion")) ord = "billtionth"; if (!ord) /* dunno, so don't convert anything */ return card; o = cons_val(string_val(ord),0); for (t=val_cdr(card); t; t=val_cdr(t)) o = cons_val(val_car(t),o); delete_val(card); return o; } cst_val *en_exp_id(const char *numstring) { /* Expand numstring as pairs as in years or ids */ char aaa[3]; if ((cst_strlen(numstring) == 4) && (numstring[2] == '0') && (numstring[3] == '0')) { if (numstring[1] == '0') return en_exp_number(numstring); /* 2000, 3000 */ else { aaa[0] = numstring[0]; aaa[1] = numstring[1]; aaa[2] = '\0'; return val_append(en_exp_number(aaa), cons_val(string_val("hundred"),0)); } } else if ((cst_strlen(numstring) == 3) && (numstring[0] != '0') && (numstring[1] == '0') && (numstring[2] == '0')) { return cons_val(string_val(digit2num[numstring[0]-'0']), cons_val(string_val("hundred"),0)); } else if ((cst_strlen(numstring) == 2) && (numstring[0] == '0') && (numstring[1] == '0')) return cons_val(string_val("zero"), cons_val(string_val("zero"),NULL)); else if ((cst_strlen(numstring) == 2) && (numstring[0] == '0')) return cons_val(string_val("oh"), en_exp_digits(&numstring[1])); else if (((cst_strlen(numstring) == 4) && (numstring[1] == '0') && (numstring[2] == '0')) || (cst_strlen(numstring) < 3)) return en_exp_number(numstring); else if (cst_strlen(numstring)%2 == 1) { return cons_val(string_val(digit2num[numstring[0]-'0']), en_exp_id(&numstring[1])); } else { aaa[0] = numstring[0]; aaa[1] = numstring[1]; aaa[2] = '\0'; return val_append(en_exp_number(aaa),en_exp_id(&numstring[2])); } } cst_val *en_exp_real(const char *numstring) { char *aaa; const char *p; cst_val *r; if (numstring && (numstring[0] == '-')) r = cons_val(string_val("minus"), en_exp_real(&numstring[1])); else if (numstring && (numstring[0] == '+')) r = cons_val(string_val("plus"), en_exp_real(&numstring[1])); else if (((p=strchr(numstring,'e')) != 0) || ((p=strchr(numstring,'E')) != 0)) { aaa = cst_strdup(numstring); aaa[cst_strlen(numstring)-cst_strlen(p)] = '\0'; r = val_append(en_exp_real(aaa), cons_val(string_val("e"), en_exp_real(p+1))); cst_free(aaa); } else if ((p=strchr(numstring,'.')) != 0) { aaa = cst_strdup(numstring); aaa[cst_strlen(numstring)-cst_strlen(p)] = '\0'; r = val_append(en_exp_number(aaa), cons_val(string_val("point"), en_exp_digits(p+1))); cst_free(aaa); } else r = en_exp_number(numstring); /* I don't think you can get here */ return r; } cst_val *en_exp_digits(const char *numstring) { /* Expand given token to list of words pronouncing it as digits */ cst_val *d = 0; const char *p; for (p=numstring; *p; p++) { if ((*p >= '0') && (*p <= '9')) d = cons_val(string_val(digit2num[*p-'0']),d); else d = cons_val(string_val("umpty"),d); } return val_reverse(d); } cst_val *en_exp_letters(const char *lets) { /* returns these as list of single char symbols */ char *aaa; cst_val *r; int i; aaa = cst_alloc(char,2); aaa[1] = '\0'; for (r=0,i=0; lets[i] != '\0'; i++) { aaa[0] = lets[i]; if (isupper((int)aaa[0])) aaa[0] = tolower((int)aaa[0]); if (strchr("0123456789",aaa[0])) r = cons_val(string_val(digit2num[aaa[0]-'0']),r); else if (cst_streq(aaa,"a")) r = cons_val(string_val("_a"),r); else r = cons_val(string_val(aaa),r); } cst_free(aaa); return val_reverse(r); } int en_exp_roman(const char *roman) { int val; const char *p; val = 0; for (p=roman; *p != 0; p++) { if (*p == 'X') val += 10; else if (*p == 'V') val += 5; else if (*p == 'I') { if (p[1] == 'V') { val += 4; p++; } else if (p[1] == 'X') { val += 9; p++; } else val += 1; } } return val; }