shithub: flite

ref: e4ff39a77cfa1c09e5a5486005f0b9ea5a50250e
dir: /lang/usenglish/us_expand.c/

View raw version
/*************************************************************************/
/*                                                                       */
/*                  Language Technologies Institute                      */
/*                     Carnegie Mellon University                        */
/*                         Copyright (c) 2001                            */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
/*               Date:  January 2001                                     */
/*************************************************************************/
/*                                                                       */
/*  English text expanders                                               */
/*                                                                       */
/*  numbers, digits, ids (years), money                                  */
/*                                                                       */
/*************************************************************************/

#include <ctype.h>
#include "us_text.h"

static const char * const digit2num[] = {
    "zero",
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine" };

static const char * const digit2teen[] = {
    "ten",  /* shouldn't get called */
    "eleven",
    "twelve",
    "thirteen",
    "fourteen",
    "fifteen",
    "sixteen",
    "seventeen",
    "eighteen",
    "nineteen" };

static const char * const digit2enty[] = {
    "zero",  /* shouldn't get called */
    "ten",
    "twenty",
    "thirty",
    "forty",
    "fifty",
    "sixty",
    "seventy",
    "eighty",
    "ninety" };

static const char * const ord2num[] = {
    "zeroth",
    "first",
    "second",
    "third",
    "fourth",
    "fifth",
    "sixth",
    "seventh",
    "eighth",
    "ninth" };

static const char * const ord2teen[] = {
    "tenth",  /* shouldn't get called */
    "eleventh",
    "twelfth",
    "thirteenth",
    "fourteenth",
    "fifteenth",
    "sixteenth",
    "seventeenth",
    "eighteenth",
    "nineteenth" };

static const char * const ord2enty[] = {
    "zeroth",  /* shouldn't get called */
    "tenth",
    "twentieth",
    "thirtieth",
    "fortieth",
    "fiftieth",
    "sixtieth",
    "seventieth",
    "eightieth",
    "ninetieth" };

cst_val *en_exp_number(const char *numstring)
{
    /* Expand given token to list of words pronouncing it as a number */
    int num_digits = cst_strlen(numstring);
    char part[4];
    cst_val *p;
    int i;

    if (num_digits == 0)
	return NULL;
    else if (num_digits == 1)
	return en_exp_digits(numstring);
    else if (num_digits == 2)
    {
	if (numstring[0] == '0')
	{
	    if (numstring[1] == '0')
		return 0;
            else 
		return cons_val(string_val(digit2num[numstring[1]-'0']),0);
	}
	else if (numstring[1] == '0')
	    return cons_val(string_val(digit2enty[numstring[0]-'0']),0);
	else if (numstring[0] == '1')
	    return cons_val(string_val(digit2teen[numstring[1]-'0']),0);
	else 
	    return cons_val(string_val(digit2enty[numstring[0]-'0']),
			    en_exp_digits(numstring+1));
    }
    else if (num_digits == 3)
    {
	if (numstring[0] == '0')
	    return en_exp_number(numstring+1);
	else
	    return cons_val(string_val(digit2num[numstring[0]-'0']),
				cons_val(string_val("hundred"),
					     en_exp_number(numstring+1)));
    }
    else if (num_digits < 7)
    {
	for (i=0; i < num_digits-3; i++)
	    part[i] = numstring[i];
	part[i]='\0';
	p = en_exp_number(part);
	if (p == 0)  /* no thousands */
	    return en_exp_number(numstring+i);
	else
	    return val_append(p,cons_val(string_val("thousand"),
					 en_exp_number(numstring+i)));
    }
    else if (num_digits < 10)
    {
	for (i=0; i < num_digits-6; i++)
	    part[i] = numstring[i];
	part[i]='\0';
	p = en_exp_number(part);
	if (p == 0)  /* no millions */
	    return en_exp_number(numstring+i);
	else
	    return val_append(p,cons_val(string_val("million"),
					 en_exp_number(numstring+i)));
    }
    else if (num_digits < 13)
    {   /* If there are pedantic brits out there, tough!, 10^9 is a billion */
	for (i=0; i < num_digits-9; i++)
	    part[i] = numstring[i];
	part[i]='\0';
	p = en_exp_number(part);
	if (p == 0)  /* no billions */
	    return en_exp_number(numstring+i);
	else
	    return val_append(p,cons_val(string_val("billion"),
					 en_exp_number(numstring+i)));
    }
    else  /* Way too many digits here, to be a number */
    {
	return en_exp_digits(numstring);
    }
}

cst_val *en_exp_ordinal(const char *rawnumstring)
{
    /* return ordinal for digit string */
    cst_val *card, *o;
    const cst_val *t;
    const char *l;
    const char *ord;
    char *numstring;
    int i,j;

    numstring = cst_strdup(rawnumstring);
    for (j=i=0; i < (signed int)cst_strlen(rawnumstring); i++)
	if (rawnumstring[i] != ',')
	{
	    numstring[j] = rawnumstring[i];
	    j++;
	}
    numstring[j] = '\0';
    card = val_reverse(en_exp_number(numstring));
    if (!card)
        card = cons_val(string_val("zero"),0);
    cst_free(numstring);

    l = val_string(val_car(card));
    ord = 0;
    for (i=0; i<10; i++)
	if (cst_streq(l,digit2num[i]))
	    ord = ord2num[i];
    if (!ord)
	for (i=0; i<10; i++)
	    if (cst_streq(l,digit2teen[i]))
		ord = ord2teen[i];
    if (!ord)
	for (i=0; i<10; i++)
	    if (cst_streq(l,digit2enty[i]))
		ord = ord2enty[i];
    if (cst_streq(l,"hundred"))
	ord = "hundredth";
    if (cst_streq(l,"thousand"))
	ord = "thousandth";
    if (cst_streq(l,"billion"))
	ord = "billtionth";
    if (!ord)  /* dunno, so don't convert anything */
	return card;
    o = cons_val(string_val(ord),0);
    for (t=val_cdr(card); t; t=val_cdr(t))
	o = cons_val(val_car(t),o);
    delete_val(card);
    return o;
}

cst_val *en_exp_id(const char *numstring)
{
    /* Expand numstring as pairs as in years or ids */
    char aaa[3];

    if ((cst_strlen(numstring) == 4) && 
	(numstring[2] == '0') &&
	(numstring[3] == '0'))
    {
	if (numstring[1] == '0')
	    return en_exp_number(numstring); /* 2000, 3000 */
	else
	{
	    aaa[0] = numstring[0];
	    aaa[1] = numstring[1];
	    aaa[2] = '\0';
	    return val_append(en_exp_number(aaa),
			      cons_val(string_val("hundred"),0));
	}
    }
    else if ((cst_strlen(numstring) == 3) && 
             (numstring[0] != '0') &&
             (numstring[1] == '0') && 
             (numstring[2] == '0'))
    {
        return cons_val(string_val(digit2num[numstring[0]-'0']),
                        cons_val(string_val("hundred"),0));
    }
    else if ((cst_strlen(numstring) == 2) && (numstring[0] == '0')
             && (numstring[1] == '0'))
	return cons_val(string_val("zero"),
                        cons_val(string_val("zero"),NULL));
    else if ((cst_strlen(numstring) == 2) && (numstring[0] == '0'))
	return cons_val(string_val("oh"),
			en_exp_digits(&numstring[1]));
    else if (((cst_strlen(numstring) == 4) && 
              (numstring[1] == '0') && (numstring[2] == '0')) ||
             (cst_strlen(numstring) < 3))
	return en_exp_number(numstring);
    else if (cst_strlen(numstring)%2 == 1)
    {
	return cons_val(string_val(digit2num[numstring[0]-'0']),
			en_exp_id(&numstring[1]));
    }
    else 
    {
	aaa[0] = numstring[0];
	aaa[1] = numstring[1];
	aaa[2] = '\0';
	return val_append(en_exp_number(aaa),en_exp_id(&numstring[2]));
    }
}

cst_val *en_exp_real(const char *numstring)
{
    char *aaa;
    const char *p;
    cst_val *r;

    if (numstring && (numstring[0] == '-'))
	r = cons_val(string_val("minus"),
		     en_exp_real(&numstring[1]));
    else if (numstring && (numstring[0] == '+'))
	r = cons_val(string_val("plus"),
		     en_exp_real(&numstring[1]));
    else if (((p=strchr(numstring,'e')) != 0) ||
	     ((p=strchr(numstring,'E')) != 0))
    {
	aaa = cst_strdup(numstring);
	aaa[cst_strlen(numstring)-cst_strlen(p)] = '\0';
	r = val_append(en_exp_real(aaa),
		       cons_val(string_val("e"),
				en_exp_real(p+1)));
	cst_free(aaa);
    }
    else if ((p=strchr(numstring,'.')) != 0)
    {
	aaa = cst_strdup(numstring);
	aaa[cst_strlen(numstring)-cst_strlen(p)] = '\0';
	r = val_append(en_exp_number(aaa),
		       cons_val(string_val("point"),
				en_exp_digits(p+1)));
	cst_free(aaa);
    }
    else
	r = en_exp_number(numstring);  /* I don't think you can get here */

    return r;
}

cst_val *en_exp_digits(const char *numstring)
{
    /* Expand given token to list of words pronouncing it as digits */
    cst_val *d = 0;
    const char *p;

    for (p=numstring; *p; p++)
    {
	if ((*p >= '0') && (*p <= '9'))
	    d = cons_val(string_val(digit2num[*p-'0']),d);
	else
	    d = cons_val(string_val("umpty"),d);
    }

    return val_reverse(d);
}

cst_val *en_exp_letters(const char *lets)
{
    /* returns these as list of single char symbols */
    char *aaa;
    cst_val *r;
    int i;

    aaa = cst_alloc(char,2);
    aaa[1] = '\0';
    for (r=0,i=0; lets[i] != '\0'; i++)
    {
	aaa[0] = lets[i];
	if (isupper((int)aaa[0])) 
	    aaa[0] = tolower((int)aaa[0]);
	if (strchr("0123456789",aaa[0]))
	    r = cons_val(string_val(digit2num[aaa[0]-'0']),r);
	else if (cst_streq(aaa,"a"))
	    r = cons_val(string_val("_a"),r);
	else
	    r = cons_val(string_val(aaa),r);
    }
    cst_free(aaa);

    return val_reverse(r);
}

int en_exp_roman(const char *roman)
{
    int val;
    const char *p;
    val = 0;

    for (p=roman; *p != 0; p++)
    {
	if (*p == 'X')
	    val += 10;
	else if (*p == 'V')
	    val += 5;
	else if (*p == 'I')
	{
	    if (p[1] == 'V')
	    {
		val += 4;
		p++;
	    }
	    else if (p[1] == 'X')
	    {
		val += 9;
		p++;
	    }
	    else 
		val += 1;
	}
    }
    return val;
}