shithub: flite

ref: 826ecd8f682bf7396ea9ca0520f589da626ef261
dir: /src/utils/cst_tokenstream.c/

View raw version
/*************************************************************************/
/*                                                                       */
/*                  Language Technologies Institute                      */
/*                     Carnegie Mellon University                        */
/*                        Copyright (c) 1999                             */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
/*               Date:  July 1999                                        */
/*************************************************************************/
/*                                                                       */
/*  Tokenizer for strings and files                                      */
/*                                                                       */
/*************************************************************************/
#include "cst_tokenstream.h"

const cst_string * const cst_ts_default_whitespacesymbols = " \t\n\r";
const cst_string * const cst_ts_default_singlecharsymbols = "(){}[]";
const cst_string * const cst_ts_default_prepunctuationsymbols = "\"'`({[";
const cst_string * const cst_ts_default_postpunctuationsymbols = "\"'`.,:;!?(){}[]";

#define TS_BUFFER_SIZE 256

static cst_string ts_getc(cst_tokenstream *ts);
static cst_string internal_ts_getc(cst_tokenstream *ts);

static void set_charclass_table(cst_tokenstream *ts)
{
    int i;
    memset(ts->charclass,0,256);  /* zero everything */
    
    for (i=0; ts->p_whitespacesymbols[i]; i++)
	ts->charclass[(unsigned char)ts->p_whitespacesymbols[i]] |= TS_CHARCLASS_WHITESPACE;
    for (i=0; ts->p_singlecharsymbols[i]; i++)
	ts->charclass[(unsigned char)ts->p_singlecharsymbols[i]] |= TS_CHARCLASS_SINGLECHAR;
    for (i=0; ts->p_prepunctuationsymbols[i]; i++)
	ts->charclass[(unsigned char)ts->p_prepunctuationsymbols[i]] |= TS_CHARCLASS_PREPUNCT;
    for (i=0; ts->p_postpunctuationsymbols[i]; i++)
	ts->charclass[(unsigned char)ts->p_postpunctuationsymbols[i]]|=TS_CHARCLASS_POSTPUNCT;
    return;
}

void set_charclasses(cst_tokenstream *ts,
		     const cst_string *whitespace,
		     const cst_string *singlecharsymbols,
		     const cst_string *prepunctuation,
		     const cst_string *postpunctuation)
{
    ts->p_whitespacesymbols = 
	(whitespace ? whitespace : cst_ts_default_whitespacesymbols);
    ts->p_singlecharsymbols = 
    (singlecharsymbols ? singlecharsymbols : cst_ts_default_singlecharsymbols);
    ts->p_prepunctuationsymbols = 
    (prepunctuation ? prepunctuation : cst_ts_default_prepunctuationsymbols);
    ts->p_postpunctuationsymbols =
   (postpunctuation ? postpunctuation : cst_ts_default_postpunctuationsymbols);

    set_charclass_table(ts);
    return;
}

static void extend_buffer(cst_string **buffer,int *buffer_max)
{
    int new_max;
    cst_string *new_buffer;

    new_max = (*buffer_max)+(*buffer_max)/5;
    new_buffer = cst_alloc(cst_string,new_max);
    memmove(new_buffer,*buffer,*buffer_max);
    cst_free(*buffer);
    *buffer = new_buffer;
    *buffer_max = new_max;
}			  

static cst_tokenstream *new_tokenstream(const cst_string *whitespace,
					const cst_string *singlechars,
					const cst_string *prepunct,
					const cst_string *postpunct)
{   /* Constructor function */
    cst_tokenstream *ts = cst_alloc(cst_tokenstream,1);
    ts->fd = NULL;
    ts->file_pos = 0;
    ts->line_number = 0;
    ts->eof_flag = 0;
    ts->string_buffer = NULL;
    ts->token_pos = 0;
    ts->whitespace = cst_alloc(cst_string,TS_BUFFER_SIZE);
    ts->ws_max = TS_BUFFER_SIZE;
    if (prepunct && prepunct[0])
    {
        ts->prepunctuation = cst_alloc(cst_string,TS_BUFFER_SIZE);
        ts->prep_max = TS_BUFFER_SIZE;
    }
    ts->token = cst_alloc(cst_string,TS_BUFFER_SIZE);
    ts->token_max = TS_BUFFER_SIZE;
    if (postpunct && postpunct[0])
    {
        ts->postpunctuation = cst_alloc(cst_string,TS_BUFFER_SIZE);
        ts->postp_max = TS_BUFFER_SIZE;
    }

    set_charclasses(ts,whitespace,singlechars,prepunct,postpunct);
    ts->current_char = 0;

    return ts;
}

void delete_tokenstream(cst_tokenstream *ts)
{
    cst_free(ts->whitespace);
    cst_free(ts->token);
    if (ts->tags) delete_features(ts->tags);
    if (ts->prepunctuation) cst_free(ts->prepunctuation);
    if (ts->postpunctuation) cst_free(ts->postpunctuation);
    cst_free(ts);
}

cst_tokenstream *ts_open(const char *filename,
			 const cst_string *whitespace,
			 const cst_string *singlechars,
			 const cst_string *prepunct,
			 const cst_string *postpunct)
{
    cst_tokenstream *ts = new_tokenstream(whitespace,
					  singlechars,
					  prepunct,
					  postpunct);

#ifndef UNDER_CE
    if (cst_streq("-",filename))
	ts->fd = stdin;
    else
#endif
	ts->fd = cst_fopen(filename,CST_OPEN_READ|CST_OPEN_BINARY);
    ts_getc(ts);

    if (ts->fd == NULL)
    {
	delete_tokenstream(ts);
	return NULL;
    }
    else
	return ts;
}

cst_tokenstream *ts_open_string(const cst_string *string,
				const cst_string *whitespace,
				const cst_string *singlechars,
				const cst_string *prepunct,
				const cst_string *postpunct)
{
    cst_tokenstream *ts = new_tokenstream(whitespace,
					  singlechars,
					  prepunct,
					  postpunct);

    ts->string_buffer = cst_strdup(string);
    ts_getc(ts);

    return ts;
}

cst_tokenstream *ts_open_generic(const char *filename,
                                 const cst_string *whitespacesymbols,
                                 const cst_string *singlecharsymbols,
                                 const cst_string *prepunctsymbols,
                                 const cst_string *postpunctsymbols,
                                 void *streamtype_data,
                                 int (*open)(cst_tokenstream *ts,
                                             const char *filename),
                                 void (*close)(cst_tokenstream *ts),
                                 int (*eof)(cst_tokenstream *ts),
                                 int (*seek)(cst_tokenstream *ts, int pos),
                                 int (*tell)(cst_tokenstream *ts),
                                 int (*size)(cst_tokenstream *ts),
                                 int (*getc)(cst_tokenstream *ts))
{   /* Its a generic token stream where user has specified the low level */
    /* file/stream access functions                                      */
    cst_tokenstream *ts = new_tokenstream(whitespacesymbols,
					  singlecharsymbols,
					  prepunctsymbols,
					  postpunctsymbols);

    ts->streamtype_data = streamtype_data;
    ts->open = open;
    ts->close = close;
    ts->eof = eof;
    ts->seek = seek;
    ts->tell = tell;
    ts->size = size;
    ts->getc = getc;

    if ((ts->open)(ts,filename) != 0)
    {
        (ts->getc)(ts);
        return ts;
    }
    else
    {
	delete_tokenstream(ts);
	return NULL;
    }
}

void ts_close(cst_tokenstream *ts)
{
    if (ts->fd != NULL)
    {
#ifndef UNDER_CE
	if (ts->fd != stdin)
#endif
	    cst_fclose(ts->fd);
	ts->fd = NULL; /* just in case close gets called twice */
    }
    if (ts->string_buffer != NULL)
    {
        cst_free(ts->string_buffer);
	ts->string_buffer = NULL;
    }
    if (ts->open)
        (ts->close)(ts);
    delete_tokenstream(ts);
}

static void get_token_sub_part(cst_tokenstream *ts,
			       int charclass,
			       cst_string **buffer,
			       int *buffer_max)
{
    int p;

    for (p=0; ((!ts_eof(ts)) &&
               (ts_charclass(ts->current_char,charclass,ts)) &&
	       (!ts_charclass(ts->current_char,
			      TS_CHARCLASS_SINGLECHAR,ts))); p++)
    {
	if (p+1 >= *buffer_max) extend_buffer(buffer,buffer_max);
	(*buffer)[p] = ts->current_char;
	ts_getc(ts);
    }
    (*buffer)[p] = '\0';
}

 #ifdef _WIN32
 __inline int ts_utf8_sequence_length(char c0)
 #else
 int ts_utf8_sequence_length(char c0)
 #endif
 {
    /* Get the expected length of UTF8 sequence given its most */
    /* significant byte */
     return (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1;
 }

/* Can't afford dynamically generate this char class so have separated func */
/* so do the core token part -- this goes while not givenlass (while the    */
/* above function oes while is givenclass */
static void get_token_sub_part_2(cst_tokenstream *ts,
				 int endclass1,
				 cst_string **buffer,
				 int *buffer_max)
{
    int p;

    for (p=0; ((!ts_eof(ts)) &&
               (!ts_charclass(ts->current_char,endclass1,ts)) &&
	       (!ts_charclass(ts->current_char,
			      TS_CHARCLASS_SINGLECHAR,ts)));
         p++)
    {
	if (p+1 >= *buffer_max) extend_buffer(buffer,buffer_max);
	(*buffer)[p] = ts->current_char;
        /* If someone sets tags we end the token */
        /* This can't happen in standard tokenstreams, but can in user */
        /* defined ones */
        if (ts->tags) break;  

        /* In the special utf8 char by char mode we end at end of a utf8 char */
        if ((ts->utf8_explode_mode) &&
            (p == ts_utf8_sequence_length((*buffer)[0])))
            break;

	ts_getc(ts);
    }
    (*buffer)[p] = '\0';
}

static void get_token_postpunctuation(cst_tokenstream *ts)
{
    int p,t;

    t = cst_strlen(ts->token);
    for (p=t;
	 (p > 0) && 
	     ((ts->token[p] == '\0') ||
	      (ts_charclass(ts->token[p],TS_CHARCLASS_POSTPUNCT,ts)));
	 p--);

    if (t != p)
    {
	if (t-p >= ts->postp_max) 
	    extend_buffer(&ts->postpunctuation,&ts->postp_max);
	/* Copy postpunctuation from token */
	memmove(ts->postpunctuation,&ts->token[p+1],(t-p));
	/* truncate token at postpunctuation */
	ts->token[p+1] = '\0';
    }
}

int ts_eof(cst_tokenstream *ts)
{
    if (ts->eof_flag)
	return TRUE;
    else
	return FALSE;
}

int ts_set_stream_pos(cst_tokenstream *ts, int pos)
{
    /* Note this doesn't preserve line_pos */
    int new_pos, l;

    if (ts->fd)
    {
        new_pos = (int)cst_fseek(ts->fd,(long)pos,CST_SEEK_ABSOLUTE);
        if (new_pos == pos)
            ts->eof_flag = FALSE;
    }
    else if (ts->string_buffer)
    {
        l = cst_strlen(ts->string_buffer);
        if (pos > l)
            new_pos = l;
        else if (pos < 0)
            new_pos = 0;
        else
            new_pos = pos;
        ts->eof_flag = FALSE;
    }
    else if (ts->open)
    {
        new_pos = (ts->seek)(ts,pos);
        if (new_pos == pos)
            ts->eof_flag = FALSE;
    }
    else
        new_pos = pos;  /* not sure it can get here */
    ts->file_pos = new_pos;
    ts->current_char = ' ';  /* To be safe (but this is wrong) */

    return ts->file_pos;
}

int ts_get_stream_pos(cst_tokenstream *ts)
{
    if (ts->open)
        return (ts->tell)(ts);
    else
        return ts->file_pos;
}

int ts_get_stream_size(cst_tokenstream *ts)
{
    int current_pos, end_pos;
    if (ts->fd)
    {
        current_pos = ts->file_pos;
        end_pos = (int)cst_fseek(ts->fd,(long)0,CST_SEEK_ENDREL);
        cst_fseek(ts->fd,(long)current_pos,CST_SEEK_ABSOLUTE);
        return end_pos;
    } else if (ts->string_buffer)
        return cst_strlen(ts->string_buffer);
    else if (ts->open)
        return (ts->size)(ts);
    else
        return 0;
}

cst_string private_ts_getc(cst_tokenstream *ts)
{
    return internal_ts_getc(ts);
}

static cst_string ts_getc(cst_tokenstream *ts)
{
    if (ts->open)
        ts->current_char = (ts->getc)(ts);
    else
        ts->current_char = internal_ts_getc(ts);
    return ts->current_char;
}

static cst_string internal_ts_getc(cst_tokenstream *ts)
{
    if (ts->fd)
    {
	ts->current_char = cst_fgetc(ts->fd);
        if (ts->current_char == -1)
	    ts->eof_flag = TRUE;
    }
    else if (ts->string_buffer)
    {
	if (ts->string_buffer[ts->file_pos] == '\0')
        {
	    ts->eof_flag = TRUE;
	    ts->current_char = '\0';
        }
	else
	    ts->current_char = ts->string_buffer[ts->file_pos];
    }
    
    if (!ts_eof(ts))
	ts->file_pos++;
    if (ts->current_char == '\n')
	ts->line_number++;
    return ts->current_char;
}

const cst_string *ts_get_quoted_token(cst_tokenstream *ts,
					 char quote,
					 char escape)
{
    /* for reading the next quoted token that starts with quote and
       ends with quote, quote may appear only if preceded by escape */
    int p;

    /* Hmm can't change quotes within a ts */
    ts->charclass[(unsigned int)quote] |= TS_CHARCLASS_QUOTE;
    ts->charclass[(unsigned int)escape] |= TS_CHARCLASS_QUOTE;

    /* skipping whitespace */
    get_token_sub_part(ts,TS_CHARCLASS_WHITESPACE,
		       &ts->whitespace,
		       &ts->ws_max);
    ts->token_pos = ts->file_pos - 1;

    if (ts->current_char == quote)
    {   /* go until quote */
	ts_getc(ts);
        for (p=0; ((!ts_eof(ts)) &&
                   (ts->current_char != quote));
             p++)
        {
            if (p >= ts->token_max) 
                extend_buffer(&ts->token,&ts->token_max);
            ts->token[p] = ts->current_char;
            ts_getc(ts);
            if (ts->current_char == escape)
            {
                ts_get(ts);
                if (p >= ts->token_max) 
                    extend_buffer(&ts->token,&ts->token_max);
                ts->token[p] = ts->current_char;
                ts_get(ts);
            }
        }
        ts->token[p] = '\0';
	ts_getc(ts);
    }
    else /* its not quoted, like to be careful dont you */
    {    /* treat is as standard token                  */
	/* Get prepunctuation */
        extend_buffer(&ts->prepunctuation,&ts->prep_max);
	get_token_sub_part(ts,TS_CHARCLASS_PREPUNCT,
			   &ts->prepunctuation,
			   &ts->prep_max);
	/* Get the symbol itself */
	if (ts_charclass(ts->current_char,TS_CHARCLASS_SINGLECHAR,ts))
	{
	    if (2 >= ts->token_max) extend_buffer(&ts->token,&ts->token_max);
	    ts->token[0] = ts->current_char;
	    ts->token[1] = '\0';
	    ts_getc(ts);
	}
	else
	    get_token_sub_part_2(ts,
				 TS_CHARCLASS_WHITESPACE,    /* end class1 */
				 &ts->token,
				 &ts->token_max);
	/* This'll have token *plus* post punctuation in ts->token */
	/* Get postpunctuation */
	get_token_postpunctuation(ts);
    }

    return ts->token;
}

const cst_string *ts_get(cst_tokenstream *ts)
{
    /* Get next token */

    if (ts->tags)
    {  /* Someone didn't delete them before -- so we delete them now */
        delete_features(ts->tags);
        ts->tags = NULL;
    }

    /* Skip whitespace */
    get_token_sub_part(ts,
		       TS_CHARCLASS_WHITESPACE,
		       &ts->whitespace,
		       &ts->ws_max);

    /* quoted strings currently ignored */
    ts->token_pos = ts->file_pos - 1;
	
    /* Get prepunctuation */
    if (!ts_eof(ts) &&
        ts_charclass(ts->current_char,TS_CHARCLASS_PREPUNCT,ts))
	get_token_sub_part(ts,
			   TS_CHARCLASS_PREPUNCT,
			   &ts->prepunctuation,
			   &ts->prep_max);
    else if (ts->prepunctuation)
	ts->prepunctuation[0] = '\0';
    /* Get the symbol itself */
    if (!ts_eof(ts) &&
        ts_charclass(ts->current_char,TS_CHARCLASS_SINGLECHAR,ts))
    {
	if (2 >= ts->token_max) extend_buffer(&ts->token,&ts->token_max);
	ts->token[0] = ts->current_char;
	ts->token[1] = '\0';
	ts_getc(ts);
    }
    else
	get_token_sub_part_2(ts,
			     TS_CHARCLASS_WHITESPACE,       /* end class1 */
			     &ts->token,
			     &ts->token_max);
    /* This'll have token *plus* post punctuation in ts->token */
    /* Get postpunctuation */
    if (ts->postpunctuation)
	ts->postpunctuation[0] = '\0';
    if (ts->p_postpunctuationsymbols[0])
        get_token_postpunctuation(ts);

    return ts->token;
}

int ts_read(void *buff, int size, int num, cst_tokenstream *ts)
{
    /* people should complain about the speed here */
    /* people will complain about EOF as end of file */
    int i,j,p;
    cst_string *cbuff;

    cbuff = (cst_string *)buff;

    for (p=i=0; i < num; i++)
	for (j=0; j < size; j++,p++)
	    cbuff[p] = ts_getc(ts);

    return i;
}