shithub: flite

ref: 826ecd8f682bf7396ea9ca0520f589da626ef261
dir: /src/utils/cst_val.c/

View raw version
/*************************************************************************/
/*                                                                       */
/*                  Language Technologies Institute                      */
/*                     Carnegie Mellon University                        */
/*                        Copyright (c) 1999                             */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
/*               Date:  December 1999                                    */
/*************************************************************************/
/*                                                                       */
/*  Typed values                                                          */
/*                                                                       */
/*************************************************************************/
#include "cst_math.h"
#include "cst_file.h"
#include "cst_val.h"
#include "cst_string.h"
#include "cst_tokenstream.h"

static cst_val *new_val()
{
    return cst_alloc(struct cst_val_struct,1);
}

cst_val *int_val(int i)
{
    cst_val *v = new_val();
    CST_VAL_TYPE(v) = CST_VAL_TYPE_INT;
    CST_VAL_INT(v) = i;
    return v;
}
    
cst_val *float_val(float f)
{
    cst_val *v = new_val();
    CST_VAL_TYPE(v) = CST_VAL_TYPE_FLOAT;
    CST_VAL_FLOAT(v) = f;
    return v;
}

cst_val *string_val(const char *s)
{
    cst_val *v = new_val();
    CST_VAL_TYPE(v) = CST_VAL_TYPE_STRING;
    /* would be nice to note if this is a deletable string or not */
    CST_VAL_STRING_LVAL(v) = cst_strdup(s);
    return v;
}

cst_val *cons_val(const cst_val *a, const cst_val *b)
{
    cst_val *v = new_val();
    CST_VAL_CAR(v)=((!a || cst_val_consp(a)) ? 
		    (cst_val *)(void *)a:val_inc_refcount(a));
    CST_VAL_CDR(v)=((!b || cst_val_consp(b)) ? 
		    (cst_val *)(void *)b:val_inc_refcount(b));
    return v;
}

cst_val *val_new_typed(int type,void *vv)
{
    cst_val *v = new_val();
    CST_VAL_TYPE(v) = type;
    CST_VAL_VOID(v) = vv;
    return v;
}

void delete_val_list(cst_val *v)
{
    if (v)
    {
	if (cst_val_consp(v))
	{
	    delete_val_list(CST_VAL_CDR(v));
	    cst_free(v);
	}
	else
	    delete_val(v);
    }
}

void delete_val(cst_val *v)
{
    if (v)
    {
	if (cst_val_consp(v))
	{
	    delete_val(CST_VAL_CAR(v));
	    delete_val(CST_VAL_CDR(v));
	    cst_free(v);
	}
	else if (val_dec_refcount(v) == 0)
	{
	    if (CST_VAL_TYPE(v) == CST_VAL_TYPE_STRING)
		cst_free(CST_VAL_VOID(v));
	    else if (CST_VAL_TYPE(v) >= CST_VAL_TYPE_FIRST_FREE)
            {
                if (cst_val_defs[CST_VAL_TYPE(v)/2].delete_function)
                    (cst_val_defs[CST_VAL_TYPE(v)/2].delete_function)
                        (CST_VAL_VOID(v));
            }
            cst_free(v);
	}
    }
}

/* Accessor functions */
int val_int(const cst_val *v)
{
    if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_INT))
	return CST_VAL_INT(v);
    else if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_FLOAT))
	return (int)CST_VAL_FLOAT(v);
    else if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_STRING))
	return atoi(CST_VAL_STRING(v));
    else
    {
	cst_errmsg("VAL: tried to access int in %d typed val\n",
		   (v ? CST_VAL_TYPE(v) : -1));
	cst_error();
    }
    return 0;
}

float val_float(const cst_val *v)
{
    if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_INT))
	return (float)CST_VAL_INT(v);
    else if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_FLOAT))
	return CST_VAL_FLOAT(v);
    else if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_STRING))
	return cst_atof(CST_VAL_STRING(v));
    else
    {
	cst_errmsg("VAL: tried to access float in %d typed val\n",
		   (v ? CST_VAL_TYPE(v) : -1));
	cst_error();
    }
    return 0;
}

const char *val_string(const cst_val *v)
{
    if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_STRING))
	return CST_VAL_STRING(v);
    else
    {
	cst_errmsg("VAL: tried to access string in %d typed val\n",
		   (v ? CST_VAL_TYPE(v) : -1));
	cst_error();
    }
    return 0;
}

const cst_val *val_car(const cst_val *v)
{
    if (v && cst_val_consp(v))
	return CST_VAL_CAR(v);
    else
    {
	cst_errmsg("VAL: tried to access car in %d typed val\n",
		   (v ? CST_VAL_TYPE(v) : -1));
	cst_error();
    }
    return 0;
}

const cst_val *val_cdr(const cst_val *v)
{
    if (v && cst_val_consp(v))
	return CST_VAL_CDR(v);
    else
    {
	cst_errmsg("VAL: tried to access cdr in %d typed val\n",
		   (v ? CST_VAL_TYPE(v) : -1));
	cst_error();
    }
    return 0;
}

void *val_generic(const cst_val *v, int type, const char *stype)
{   /* a generic access function that checks the expected type */
    if (v && CST_VAL_TYPE(v) == type)
	return CST_VAL_VOID(v);
    else
    {
        cst_errmsg("VAL: tried to access %s in %d type val\n",
                       stype,
                       (v ? CST_VAL_TYPE(v) : -1));
        cst_error();
    }
    return NULL;
}

void *val_void(const cst_val *v)
{
    /* The scary, do anything function, this shouldn't be called by mortals */
    if ((v == NULL) ||
	(CST_VAL_TYPE(v) == CST_VAL_TYPE_CONS) ||
	(CST_VAL_TYPE(v) == CST_VAL_TYPE_INT) ||
	(CST_VAL_TYPE(v) == CST_VAL_TYPE_FLOAT))
    {
	cst_errmsg("VAL: tried to access void in %d typed val\n",
		   (v ? CST_VAL_TYPE(v) : -1));
	cst_error();
	return NULL;
    }
    else 
	return CST_VAL_VOID(v);
}

int cst_val_consp(const cst_val *v)
{
    /* To keep a val cell down to 8 bytes we identify non-cons cells  */
    /* with non-zero values in the least significant bit of the first */
    /* address in the cell (this is a standard technique used on Lisp */
    /* machines)                                                      */
#if 0
    void *t;
    int t1;

    /* Hmm this still isn't right (it can be) but this isn't it */
    t = CST_VAL_CAR(v);
    t1 = *(int *)&t;

    if ((t1&0x1) == 0)
	return TRUE;
    else
	return FALSE;
#endif
    const cst_val_atom *t;

    t = (const cst_val_atom *)v;
    if (t->type % 2 == 0)
      return TRUE;
    else
      return FALSE;
}

const cst_val *set_cdr(cst_val *v1, const cst_val *v2)
{
    /* destructive set cdr, be careful you have a pointer to current cdr */
    
    if (!cst_val_consp(v1))
    {
	cst_errmsg("VAL: tried to set cdr of non-consp cell\n");
	cst_error();
	return NULL;
    }
    else
    {
        if (CST_VAL_CDR(v1))
        {
            val_dec_refcount(CST_VAL_CDR(v1));
            val_inc_refcount(v1);
        }
	CST_VAL_CDR(v1) = (cst_val *)v2;
    }
    return v1;
}

const cst_val *set_car(cst_val *v1, const cst_val *v2)
{
    /* destructive set car, be careful you have a pointer to current car */
    
    if (!cst_val_consp(v1))
    {
	cst_errmsg("VAL: tried to set car of non-consp cell\n");
	cst_error();
	return NULL;
    }
    else
    {
	val_dec_refcount(CST_VAL_CAR(v1));
	val_inc_refcount(v1);
	CST_VAL_CAR(v1) = (cst_val *)v2;
    }
    return v1;
}

void val_print(cst_file fd,const cst_val *v)
{
    const cst_val *p;

    if (v == NULL)
	cst_fprintf(fd,"[null]");
    else if (CST_VAL_TYPE(v) == CST_VAL_TYPE_INT)
	cst_fprintf(fd,"%d",val_int(v));
    else if (CST_VAL_TYPE(v) == CST_VAL_TYPE_FLOAT)
	cst_fprintf(fd,"%f",val_float(v));
    else if (CST_VAL_TYPE(v) == CST_VAL_TYPE_STRING)
	cst_fprintf(fd,"%s",val_string(v));
    else if (cst_val_consp(v))
    {
	cst_fprintf(fd,"(");
	for (p=v; p; )
	{
	    val_print(fd,val_car(p));
	    p=val_cdr(p);
	    if (p)
		cst_fprintf(fd," ");
            if (p && !cst_val_consp(p))  /* dotted pairs for non-list */
            {                            
                cst_fprintf(fd,". ");
                val_print(fd,p);
                break;
            }
	}
	cst_fprintf(fd,")");
    }
    else 
	cst_fprintf(fd,"[Val %s 0x%p]",
		cst_val_defs[CST_VAL_TYPE(v)/2].name,CST_VAL_VOID(v));
}

cst_val *val_reverse(cst_val *l)
{   /* destructively reverse the list */
    cst_val *n,*np,*nl;
    for (nl=0,n=l; n; nl=n,n=np)
    {
	np=CST_VAL_CDR(n);
	CST_VAL_CDR(n) = nl;
    }
    return nl;
}

cst_val *val_append(cst_val *l1, cst_val *l2)
{
    /* Destructively add l2 to the end of l1 return l1 */
    cst_val *t;

    if (l1 == 0)
	return l2;
    else
    {
	for (t=l1; val_cdr(t); t=CST_VAL_CDR(t));
	CST_VAL_CDR(t) = l2;
	return l1;
    }
}

int val_length(const cst_val *l)
{
    const cst_val *n;
    int i;

    for (i=0,n=l; n; n=val_cdr(n))
	i++;

    return i;
}

int val_equal(const cst_val *v1, const cst_val *v2)
{
    if (v1 == v2)
	return TRUE;  /* its eq so its equal */
    else if (v1 == 0)
	return FALSE;
    else if (CST_VAL_TYPE(v1) == CST_VAL_TYPE(v2))
    {
	if (cst_val_consp(v1))
	    return ((val_equal(val_car(v1),val_car(v2))) &&
		    (val_equal(val_cdr(v1),val_cdr(v2))));
	else if (CST_VAL_TYPE(v1) == CST_VAL_TYPE_INT)
	    return (val_int(v1) == val_int(v2));
	else if (CST_VAL_TYPE(v1) == CST_VAL_TYPE_FLOAT)
	    return (val_float(v1) == val_float(v2));
	else if (CST_VAL_TYPE(v1) == CST_VAL_TYPE_STRING)
	    return (cst_streq(CST_VAL_STRING(v1),CST_VAL_STRING(v2)));
	else 
	    return CST_VAL_VOID(v1) == CST_VAL_VOID(v2);
    }
    else
	return FALSE;
}

int val_less(const cst_val *v1, const cst_val *v2)
{
    return val_float(v1) < val_float(v2);
}

int val_greater(const cst_val *v1,const cst_val *v2)
{
    return val_float(v1) > val_float(v2);
}

int val_member(const cst_val *v1,const cst_val *l)
{
    const cst_val *i;

    for (i=l; i; i=val_cdr(i))
    {
	if (val_equal(val_car(i),v1))
	    return TRUE;
    }
    return FALSE;
}

int val_member_string(const char *v1,const cst_val *l)
{
    const cst_val *i;

    for (i=l; i; i=val_cdr(i))
    {
	if (cst_streq(v1,val_string(val_car(i))))
	    return TRUE;
    }
    return FALSE;
}

cst_val *val_inc_refcount(const cst_val *b)
{
    cst_val *wb;

    /* Well I was lying, they're not really const, but this is the place */
    /* where breaking const is reasonable                              */
    wb = (cst_val *)(void *)b;

    if (CST_VAL_REFCOUNT(wb) == -1) 
	/* or is a cons cell in the text segment, how do I do that ? */
	return wb;
    else if (!cst_val_consp(wb)) /* we don't ref count cons cells */
	CST_VAL_REFCOUNT(wb) += 1;
    return wb;
}

int val_dec_refcount(const cst_val *b)
{
    cst_val *wb;

    wb = (cst_val *)(void *)b;

    if (CST_VAL_REFCOUNT(wb) == -1) 
	/* or is a cons cell in the text segment, how do I do that ? */
	return -1;
    else if (cst_val_consp(wb)) /* we don't ref count cons cells */
	return 0;
    else if (CST_VAL_REFCOUNT(wb) == 0)
    {
	/* Otherwise, trying to free a val outside an
           item/relation/etc has rather the opposite effect from what
           you might have intended... */
	return 0;
    }
    else
    {
	CST_VAL_REFCOUNT(wb) -= 1;
	return 	CST_VAL_REFCOUNT(wb);
    }
}

#ifdef _WIN32
__inline int utf8_sequence_length(char c0)
#else
int utf8_sequence_length(char c0)
#endif
{
    /* Get the expected length of UTF8 sequence given its most */
    /* significant byte */
    return (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1;
}

cst_val *cst_utf8_explode(const cst_string *utf8string)
{
    /* Return a list of utf8 characters as strings */
    cst_val *chars=NULL;
  
    const unsigned char *str = (const unsigned char*)utf8string;
    char utf8char[5];
    char c0;
    int charlength;

    while ((c0 = *str))
    {
        charlength = utf8_sequence_length(c0);
        cst_snprintf(utf8char, charlength + 1, "%s", str);
        chars = cons_val(string_val(utf8char),chars);
        str += charlength;
    }
    return val_reverse(chars);
}

static int utf8_ord(const char *utf8_seq) 
{
    unsigned int len;
    int ord;

    unsigned char c0, c1, c2, c3;  /* Potential bytes in the UTF8 symbol */
    c0 = utf8_seq[0];
    len = utf8_sequence_length(c0);

    /* Make sure the string sequence we received matches with the */
    /* expected length, and that the expected length is nonzero.  */
    if ( (len == 0) ||
         (len != strlen(utf8_seq))) {
        return -1;
    }

    if (len == 1) {
        /* ASCII sequence. */
        return c0;
    }

    c1 = utf8_seq[1];
    if (len == 2) {
        ord = ((c0 & 0x1F) << 6) | (c1 & 0x3F);
        if (ord < 0x80)
            return -1;
        return ord;
    }

    c2 = utf8_seq[2];
    if (len == 3) {
        if ((c2 & 0xC0) != 0x80)
            return -1;
        ord = ((c0 & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F);
        if (ord < 0x800 ||
            (ord >= 0xD800 && ord <= 0xDFFF))
            return -1;
        return ord;
    }

    c3 = utf8_seq[3];
    if (len == 4) {
        if ((c3 & 0xC0) != 0x80)
            return -1;
        ord =
            ((c0 & 0x7) << 18) | ((c1 & 0x3F) << 12) |
            ((c2 & 0x3F) << 6) | (c3 & 0x3F);
        if (ord < 0x10000 || ord > 0x10FFFF)
            return -1;
        return ord;
    }

    return -1;
}

cst_val *cst_utf8_ord(const cst_val *utf8_char) {
  const char *ch=(const char *)val_string(utf8_char);
  return int_val(utf8_ord(ch));
}
int cst_utf8_ord_string(const char *utf8_char)
{
    return utf8_ord(utf8_char);
}

static int utf8_chr(int ord, char* utf8char) 
{
    unsigned int utf8len;
    int i = 0;

    if (ord < 0x80) {
        utf8len = 1;
    } else if (ord < 0x800) {
        utf8len = 2;
    } else if (ord <= 0xFFFF) {
        utf8len = 3;
    } else if (ord <= 0x200000) {
        utf8len = 4;
    } else {
        /* Replace invalid character with FFFD */
        utf8len = 2;
        ord = 0xFFFD;
    }

    i = utf8len;  /* Index into utf8char */
    utf8char[i--] = 0;

    switch (utf8len) {
        /* These fallthrough deliberately */
    case 6:
        utf8char[i--] = (ord | 0x80) & 0xBF;
        ord >>= 6;
    case 5:
        utf8char[i--] = (ord | 0x80) & 0xBF;
        ord >>= 6;
    case 4:
        utf8char[i--] = (ord | 0x80) & 0xBF;
        ord >>= 6;
    case 3:
        utf8char[i--] = (ord | 0x80) & 0xBF;
        ord >>= 6;
    case 2:
        utf8char[i--] = (ord | 0x80) & 0xBF;
        ord >>= 6;
    case 1:
        switch (utf8len) {
        case 0:
        case 1:
            utf8char[i--] = ord;
            break;
        case 2:
            utf8char[i--] = ord | 0xC0;
            break;
        case 3:
            utf8char[i--] = ord | 0xE0;
            break;
        case 4:
            utf8char[i--] = ord | 0xF0;
        }
    }
    return utf8len;
}

cst_val *cst_utf8_chr(const cst_val *ord) {
  char ch[5];
  int utf8len;

  utf8len = utf8_chr(val_int(ord),ch);
  if (utf8len == 0) {
    return 0;
  }

  return string_val(ch);
}

int val_stringp(const cst_val *v)
{
    if (cst_val_consp(v))
        return FALSE;
    else if (CST_VAL_TYPE(v) == CST_VAL_TYPE_STRING)
        return TRUE;
    else
        return FALSE;
}

const cst_val *val_assoc_string(const char *v1,const cst_val *al)
{
    const cst_val *i;

    for (i=al; i; i=val_cdr(i))
    {
	if (cst_streq(v1,val_string(val_car(val_car(i)))))
	    return val_car(i);
    }
    return NULL;
}

cst_string *cst_implode(const cst_val *sl)
{
    const cst_val *v;
    int l=0;
    char *s;

    for (v=sl; v; v=val_cdr(v))
    {
        if (val_stringp(val_car(v)))
            l += cst_strlen(val_string(val_car(v)));
    }

    s = cst_alloc(cst_string,l+1);

    for (v=sl; v; v=val_cdr(v))
    {
        if (val_stringp(val_car(v)))
            cst_sprintf(s,"%s%s",s,val_string(val_car(v)));

    }

    return s;
}

cst_val *val_readlist_string(const char *str)
{   /* not fully general but a good start */
    cst_tokenstream *ts;
    cst_val *v = NULL;
    const char *p;

    ts = ts_open_string(str,
                        cst_ts_default_whitespacesymbols,
                        "",
                        "",
                        "");

    while (!ts_eof(ts))
    {
        p = ts_get(ts);
        v = cons_val(string_val(p),v);
    }

    ts_close(ts);

    return val_reverse(v);
}