shithub: femtolisp

ref: 31edb5251c01f81e23b50ca129431e4ed164e7af
dir: /read.c/

View raw version
enum {
	TOK_NONE, TOK_OPEN, TOK_CLOSE, TOK_DOT, TOK_QUOTE, TOK_SYM, TOK_NUM,
	TOK_BQ, TOK_COMMA, TOK_COMMAAT, TOK_COMMADOT,
	TOK_SHARPDOT, TOK_LABEL, TOK_BACKREF, TOK_SHARPQUOTE, TOK_SHARPOPEN,
	TOK_OPENB, TOK_CLOSEB, TOK_SHARPSYM, TOK_GENSYM, TOK_DOUBLEQUOTE
};

#if defined(__plan9__)
static int errno;
#define VLONG_MAX ~(1LL<<63)
#define VLONG_MIN (1LL<<63)
#define UVLONG_MAX (1LL<<63)
static mpint *mp_vlong_min, *mp_vlong_max, *mp_uvlong_max;
#endif

static int64_t
strtoll_mp(char *nptr, char **rptr, int base, mpint **mp)
{
	int64_t x;
	mpint *m;

	*mp = nil;
	errno = 0;
	x = strtoll(nptr, rptr, base);
#if defined(__plan9__)
	if((x != VLONG_MAX && x != VLONG_MIN) || *rptr == nptr)
		return x;
	mpint *c;
	m = strtomp(nptr, rptr, base, nil);
	if(x == VLONG_MAX){
		if(mp_vlong_max == nil)
			mp_vlong_max = vtomp(VLONG_MAX, nil);
		c = mp_vlong_max;
	}else{
		if(mp_vlong_min == nil)
			mp_vlong_min = vtomp(VLONG_MIN, nil);
		c = mp_vlong_min;
	}
	if(mpcmp(c, m) == 0){
		mpfree(m);
		m = nil;
	}
#else
	m = nil;
	if(errno == ERANGE && (x == LLONG_MAX || x == LLONG_MIN))
		m = strtomp(nptr, rptr, base, nil);
#endif
	*mp = m;
	return x;
}

static uint64_t
strtoull_mp(char *nptr, char **rptr, int base, mpint **mp)
{
	uint64_t x;
	mpint *m;

	*mp = nil;
	errno = 0;
	x = strtoull(nptr, rptr, base);
#if defined(__plan9__)
	if(x != UVLONG_MAX || *rptr == nptr)
		return x;
	m = strtomp(nptr, rptr, base, nil);
	if(mp_uvlong_max == nil)
		mp_uvlong_max = uvtomp(UVLONG_MAX, nil);
	if(mpcmp(mp_uvlong_max, m) == 0){
		mpfree(m);
		m = nil;
	}
#else
	m = nil;
	if(errno == ERANGE && x == ULLONG_MAX)
		m = strtomp(nptr, rptr, base, nil);
#endif
	*mp = m;
	return x;
}

#define F value2c(ios_t*, readstate->source)

// defines which characters are ordinary symbol characters.
// exceptions are '.', which is an ordinary symbol character
// unless it's the only character in the symbol, and '#', which is
// an ordinary symbol character unless it's the first character.
static inline int
symchar(char c)
{
	static char *special = "()[]'\";`,\\| \a\b\f\n\r\t\v";
	return !strchr(special, c);
}

int
isnumtok_base(char *tok, value_t *pval, int base)
{
	char *end;
	int64_t i64;
	uint64_t ui64;
	double d;
	mpint *mp = nil;
	if(*tok == '\0')
		return 0;
	if(!((tok[0] == '0' && tok[1] == 'x') || (base >= 15)) && strpbrk(tok, ".eEpP")){
		d = strtod(tok, &end);
		if(*end == '\0'){
			if(pval)
				*pval = mk_double(d);
			return 1;
		}
		// floats can end in f or f0
		if(end > tok && end[0] == 'f' &&
			(end[1] == '\0' ||
			 (end[1] == '0' && end[2] == '\0'))){
			if(pval)
				*pval = mk_float((float)d);
			return 1;
		}
	}

	if(tok[0] == '+'){
		if(!strcmp(tok, "+NaN") || !strcasecmp(tok, "+nan.0")){
			if(pval)
				*pval = mk_double(D_PNAN);
			return 1;
		}
		if(!strcmp(tok, "+Inf") || !strcasecmp(tok, "+inf.0")){
			if(pval)
				*pval = mk_double(D_PINF);
			return 1;
		}
	}else if(tok[0] == '-'){
		if(!strcmp(tok, "-NaN") || !strcasecmp(tok, "-nan.0")){
			if(pval)
				*pval = mk_double(D_NNAN);
			return 1;
		}
		if(!strcmp(tok, "-Inf") || !strcasecmp(tok, "-inf.0")){
			if(pval)
				*pval = mk_double(D_NINF);
			return 1;
		}
		i64 = strtoll_mp(tok, &end, base, &mp);
		if(pval)
			*pval = mp == nil ? return_from_int64(i64) : mk_mpint(mp);
		return *end == '\0';
	}
	ui64 = strtoull_mp(tok, &end, base, &mp);
	if(pval)
		*pval = mp == nil ? return_from_uint64(ui64) : mk_mpint(mp);
	return *end == '\0';
}

static int
isnumtok(char *tok, value_t *pval)
{
	return isnumtok_base(tok, pval, 0);
}

static int
read_numtok(char *tok, value_t *pval, int base)
{
	return isnumtok_base(tok, pval, base);
}

static uint32_t toktype = TOK_NONE;
static value_t tokval;
static char buf[256];

static char
nextchar(void)
{
	int ch;
	char c;
	ios_t *f = F;

	do{
		if(f->bpos < f->size){
			ch = f->buf[f->bpos++];
		}else{
			ch = ios_getc(f);
			if(ch == IOS_EOF)
				return 0;
		}
		c = (char)ch;
		if(c == ';'){
			// single-line comment
			do{
				ch = ios_getc(f);
				if(ch == IOS_EOF)
					return 0;
			}while((char)ch != '\n');
			c = (char)ch;
		}
	}while(c == ' ' || isspace(c));
	return c;
}

static void
take(void)
{
	toktype = TOK_NONE;
}

static void
accumchar(char c, int *pi)
{
	buf[(*pi)++] = c;
	if(*pi >= (int)(sizeof(buf)-1))
		lerrorf(ParseError, "token too long");
}

// return: 1 if escaped (forced to be symbol)
static int
read_token(char c, int digits)
{
	int i = 0, ch, escaped = 0, issym = 0, nc = 0;

	while(1){
		if(nc != 0){
			if(nc != 1)
				ios_getc(F);
			ch = ios_peekc(F);
			if(ch == IOS_EOF)
				goto terminate;
			c = (char)ch;
		}
		if(c == '|'){
			issym = 1;
			escaped = !escaped;
		}else if(c == '\\'){
			issym = 1;
			ios_getc(F);
			ch = ios_peekc(F);
			if(ch == IOS_EOF)
				goto terminate;
			accumchar((char)ch, &i);
		}else if(!escaped && !(symchar(c) && (!digits || isdigit(c)))){
			break;
		}else{
			accumchar(c, &i);
		}
		nc++;
	}
	if(nc == 0)
		ios_skip(F, -1);
 terminate:
	buf[i++] = '\0';
	return issym;
}

static value_t do_read_sexpr(value_t label);

static uint32_t
peek(void)
{
	char c, *end;
	fixnum_t x;
	int ch, base;

	if(toktype != TOK_NONE)
		return toktype;
	c = nextchar();
	if(ios_eof(F))
		return TOK_NONE;
	if(c == '(')
		toktype = TOK_OPEN;
	else if(c == ')')
		toktype = TOK_CLOSE;
	else if(c == '[')
		toktype = TOK_OPENB;
	else if(c == ']')
		toktype = TOK_CLOSEB;
	else if(c == '\'')
		toktype = TOK_QUOTE;
	else if(c == '`')
		toktype = TOK_BQ;
	else if(c == '"')
		toktype = TOK_DOUBLEQUOTE;
	else if(c == '#'){
		ch = ios_getc(F); c = (char)ch;
		if(ch == IOS_EOF)
			lerrorf(ParseError, "invalid read macro");
		if(c == '.')
			toktype = TOK_SHARPDOT;
		else if(c == '\'')
			toktype = TOK_SHARPQUOTE;
		else if(c == '\\'){
			uint32_t cval;
			if(ios_getutf8(F, &cval) == IOS_EOF)
				lerrorf(ParseError, "end of input in character constant");
			if(cval == (uint32_t)'u' || cval == (uint32_t)'U' || cval == (uint32_t)'x'){
				read_token('u', 0);
				if(buf[1] != '\0'){ // not a solitary 'u','U','x'
					if(!read_numtok(&buf[1], &tokval, 16))
						lerrorf(ParseError, "invalid hex character constant");
					cval = numval(tokval);
				}
			}else if(cval >= 'a' && cval <= 'z'){
				read_token((char)cval, 0);
				tokval = symbol(buf);
				if(buf[1] == '\0') USED(cval); /* one character */
				else if(tokval == nulsym)       cval = 0x00;
				else if(tokval == alarmsym)     cval = 0x07;
				else if(tokval == backspacesym) cval = 0x08;
				else if(tokval == tabsym)       cval = 0x09;
				else if(tokval == linefeedsym)  cval = 0x0A;
				else if(tokval == newlinesym)   cval = 0x0A;
				else if(tokval == vtabsym)      cval = 0x0B;
				else if(tokval == pagesym)      cval = 0x0C;
				else if(tokval == returnsym)    cval = 0x0D;
				else if(tokval == escsym)       cval = 0x1B;
				else if(tokval == spacesym)     cval = 0x20;
				else if(tokval == deletesym)    cval = 0x7F;
				else
					lerrorf(ParseError, "unknown character #\\%s", buf);
			}
			toktype = TOK_NUM;
			tokval = mk_wchar(cval);
		}else if(c == '('){
			toktype = TOK_SHARPOPEN;
		}else if(c == '<'){
			lerrorf(ParseError, "unreadable object");
		}else if(isdigit(c)){
			read_token(c, 1);
			c = (char)ios_getc(F);
			if(c == '#')
				toktype = TOK_BACKREF;
			else if(c == '=')
				toktype = TOK_LABEL;
			else
				lerrorf(ParseError, "invalid label");
			x = strtoll(buf, &end, 10);
			if(*end != '\0')
				lerrorf(ParseError, "invalid label");
			tokval = fixnum(x);
		}else if(c == '!'){
			// #! single line comment for shbang script support
			do{
				ch = ios_getc(F);
			}while(ch != IOS_EOF && (char)ch != '\n');
			return peek();
		}else if(c == '|'){
			// multiline comment
			int commentlevel = 1;
			while(1){
				ch = ios_getc(F);
			hashpipe_gotc:
				if(ch == IOS_EOF)
					lerrorf(ParseError, "eof within comment");
				if((char)ch == '|'){
					ch = ios_getc(F);
					if((char)ch == '#'){
						commentlevel--;
						if(commentlevel == 0)
							break;
						else
							continue;
					}
					goto hashpipe_gotc;
				}else if((char)ch == '#'){
					ch = ios_getc(F);
					if((char)ch == '|')
						commentlevel++;
					else
						goto hashpipe_gotc;
				}
			}
			// this was whitespace, so keep peeking
			return peek();
		}else if(c == ';'){
			// datum comment
			(void)do_read_sexpr(UNBOUND); // skip
			return peek();
		}else if(c == ':'){
			// gensym
			ch = ios_getc(F);
			if((char)ch == 'g')
				ch = ios_getc(F);
			read_token((char)ch, 0);
			x = strtol(buf, &end, 10);
			if(*end != '\0' || buf[0] == '\0')
				lerrorf(ParseError, "invalid gensym label");
			toktype = TOK_GENSYM;
			tokval = fixnum(x);
		}else if(symchar(c)){
			read_token(ch, 0);

			if(((c == 'b' && (base = 2)) ||
			    (c == 'o' && (base = 8)) ||
			    (c == 'd' && (base = 10)) ||
			    (c == 'x' && (base = 16))) && (isdigit_base(buf[1], base) || buf[1] == '-')){
				if(!read_numtok(&buf[1], &tokval, base))
					lerrorf(ParseError, "invalid base %d constant", base);
				return (toktype = TOK_NUM);
			}

			toktype = TOK_SHARPSYM;
			tokval = symbol(buf);
		}else{
			lerrorf(ParseError, "unknown read macro");
		}
	}else if(c == ','){
		toktype = TOK_COMMA;
		ch = ios_peekc(F);
		if(ch == IOS_EOF)
			return toktype;
		if((char)ch == '@')
			toktype = TOK_COMMAAT;
		else if((char)ch == '.')
			toktype = TOK_COMMADOT;
		else
			return toktype;
		ios_getc(F);
	}else if(c == '{' || c == '}'){
		lerrorf(ParseError, "invalid character %c", c);
	}else{
		if(!read_token(c, 0)){
			if(buf[0] == '.' && buf[1] == '\0')
				return (toktype = TOK_DOT);
			if(read_numtok(buf, &tokval, 0))
				return (toktype = TOK_NUM);
		}
		toktype = TOK_SYM;
		tokval = symbol(buf);
	}
	return toktype;
}

// NOTE: this is NOT an efficient operation. it is only used by the
// reader, and requires at least 1 and up to 3 garbage collections!
static value_t
vector_grow(value_t v)
{
	size_t i, s = vector_size(v);
	size_t d = vector_grow_amt(s);
	PUSH(v);
	assert(s+d > s);
	value_t newv = alloc_vector(s+d, 1);
	v = Stack[SP-1];
	for(i = 0; i < s; i++)
		vector_elt(newv, i) = vector_elt(v, i);
	// use gc to rewrite references from the old vector to the new
	Stack[SP-1] = newv;
	if(s > 0){
		((size_t*)ptr(v))[0] |= 0x1;
		vector_elt(v, 0) = newv;
		gc(0);
	}
	return POP();
}

static value_t
read_vector(value_t label, uint32_t closer)
{
	value_t v = the_empty_vector, elt;
	uint32_t i = 0;
	PUSH(v);
	if(label != UNBOUND)
		ptrhash_put(&readstate->backrefs, (void*)label, (void*)v);
	while(peek() != closer){
		if(ios_eof(F))
			lerrorf(ParseError, "unexpected end of input");
		if(i >= vector_size(v)){
			v = Stack[SP-1] = vector_grow(v);
			if(label != UNBOUND)
				ptrhash_put(&readstate->backrefs, (void*)label, (void*)v);
		}
		elt = do_read_sexpr(UNBOUND);
		v = Stack[SP-1];
		assert(i < vector_size(v));
		vector_elt(v, i) = elt;
		i++;
	}
	take();
	if(i > 0)
		vector_setsize(v, i);
	return POP();
}

static value_t
read_string(void)
{
	char *buf, *temp;
	char eseq[10];
	size_t i = 0, j, sz = 64, ndig;
	int c;
	value_t s;
	uint32_t wc = 0;

	buf = malloc(sz);
	while(1){
		if(i >= sz-4){ // -4: leaves room for longest utf8 sequence
			sz *= 2;
			temp = realloc(buf, sz);
			if(temp == nil){
				free(buf);
				lerrorf(ParseError, "out of memory reading string");
			}
			buf = temp;
		}
		c = ios_getc(F);
		if(c == IOS_EOF){
			free(buf);
			lerrorf(ParseError, "unexpected end of input in string");
		}
		if(c == '"')
			break;
		else if(c == '\\'){
			c = ios_getc(F);
			if(c == IOS_EOF){
				free(buf);
				lerrorf(ParseError, "end of input in escape sequence");
			}
			j = 0;
			if(octal_digit(c)){
				while(1){
					eseq[j++] = c;
					c = ios_peekc(F);
					if(c == IOS_EOF || !octal_digit(c) || j >= 3)
						break;
					ios_getc(F);
				}
				eseq[j] = '\0';
				wc = strtol(eseq, nil, 8);
				// \DDD and \xXX read bytes, not characters
				buf[i++] = ((char)wc);
			}else if((c == 'x' && (ndig = 2)) || (c == 'u' && (ndig = 4)) || (c == 'U' && (ndig = 8))){
				while(1){
					c = ios_peekc(F);
					if(c == IOS_EOF || !hex_digit(c) || j >= ndig)
						break;
					eseq[j++] = c;
					ios_getc(F);
				}
				eseq[j] = '\0';
				if(j)
					wc = strtol(eseq, nil, 16);
				if(!j || wc > 0x10ffff){
					free(buf);
					lerrorf(ParseError, "invalid escape sequence");
				}
				if(ndig == 2)
					buf[i++] = ((char)wc);
				else
					i += u8_wc_toutf8(&buf[i], wc);
			}else{
				char esc = read_escape_control_char((char)c);
				if(esc == (char)c && !strchr("\\'\"`", esc)){
					free(buf);
					lerrorf(ParseError, "invalid escape sequence: \\%c", (char)c);
				}
				buf[i++] = esc;
			}
		}else{
			buf[i++] = c;
		}
	}
	s = cvalue_string(i);
	memmove(cvalue_data(s), buf, i);
	free(buf);
	return s;
}

// build a list of conses. this is complicated by the fact that all conses
// can move whenever a new cons is allocated. we have to refer to every cons
// through a handle to a relocatable pointer (i.e. a pointer on the stack).
static void
read_list(value_t *pval, value_t label, uint32_t closer)
{
	value_t c, *pc;
	uint32_t t;

	PUSH(NIL);
	pc = &Stack[SP-1];  // to keep track of current cons cell
	t = peek();
	while(t != closer){
		if(ios_eof(F))
			lerrorf(ParseError, "unexpected end of input");
		c = mk_cons(); car_(c) = cdr_(c) = NIL;
		if(iscons(*pc))
			cdr_(*pc) = c;
		else{
			*pval = c;
			if(label != UNBOUND)
				ptrhash_put(&readstate->backrefs, (void*)label, (void*)c);
		}
		*pc = c;
		c = do_read_sexpr(UNBOUND); // must be on separate lines due to
		car_(*pc) = c;			  // undefined evaluation order

		t = peek();
		if(t == TOK_DOT){
			take();
			c = do_read_sexpr(UNBOUND);
			cdr_(*pc) = c;
			t = peek();
			if(ios_eof(F))
				lerrorf(ParseError, "unexpected end of input");
			if(t != closer){
				take();
				lerrorf(ParseError, "expected '%c'", closer == TOK_CLOSEB ? ']' : ')');
			}
		}
	}
	take();
	c = POP();
	USED(c);
}

// label is the backreference we'd like to fix up with this read
static value_t
do_read_sexpr(value_t label)
{
	value_t v, sym, oldtokval, *head;
	value_t *pv;
	uint32_t t;
	char c;

	t = peek();
	take();
	switch(t){
	case TOK_CLOSE:
		lerrorf(ParseError, "unexpected ')'");
	case TOK_CLOSEB:
		lerrorf(ParseError, "unexpected ']'");
	case TOK_DOT:
		lerrorf(ParseError, "unexpected '.'");
	case TOK_SYM:
	case TOK_NUM:
		return tokval;
	case TOK_COMMA:
		head = &COMMA; goto listwith;
	case TOK_COMMAAT:
		head = &COMMAAT; goto listwith;
	case TOK_COMMADOT:
		head = &COMMADOT; goto listwith;
	case TOK_BQ:
		head = &BACKQUOTE; goto listwith;
	case TOK_QUOTE:
		head = &QUOTE;
	listwith:
		v = cons_reserve(2);
		car_(v) = *head;
		cdr_(v) = tagptr(((cons_t*)ptr(v))+1, TAG_CONS);
		car_(cdr_(v)) = cdr_(cdr_(v)) = NIL;
		PUSH(v);
		if(label != UNBOUND)
			ptrhash_put(&readstate->backrefs, (void*)label, (void*)v);
		v = do_read_sexpr(UNBOUND);
		car_(cdr_(Stack[SP-1])) = v;
		return POP();
	case TOK_SHARPQUOTE:
		// femtoLisp doesn't need symbol-function, so #' does nothing
		return do_read_sexpr(label);
	case TOK_OPEN:
		PUSH(NIL);
		read_list(&Stack[SP-1], label, TOK_CLOSE);
		return POP();
	case TOK_OPENB:
		PUSH(NIL);
		read_list(&Stack[SP-1], label, TOK_CLOSEB);
		return POP();
	case TOK_SHARPSYM:
		sym = tokval;
		if(sym == tsym || sym == Tsym)
			return FL_T;
		if(sym == fsym || sym == Fsym)
			return FL_F;
		// constructor notation
		c = nextchar();
		if(c != '('){
			take();
			lerrorf(ParseError, "expected argument list for %s", symbol_name(tokval));
		}
		PUSH(NIL);
		read_list(&Stack[SP-1], UNBOUND, TOK_CLOSE);
		if(sym == vu8sym){
			sym = arraysym;
			Stack[SP-1] = fl_cons(uint8sym, Stack[SP-1]);
		}else if(sym == fnsym){
			sym = FUNCTION;
		}
		v = symbol_value(sym);
		if(v == UNBOUND)
			unbound_error(sym);
		return fl_apply(v, POP());
	case TOK_SHARPOPEN:
		return read_vector(label, TOK_CLOSE);
	case TOK_SHARPDOT:
		// eval-when-read
		// evaluated expressions can refer to existing backreferences, but they
		// cannot see pending labels. in other words:
		// (... #2=#.#0# ... )	OK
		// (... #2=#.(#2#) ... )  DO NOT WANT
		sym = do_read_sexpr(UNBOUND);
		if(issymbol(sym)){
			v = symbol_value(sym);
			if(v == UNBOUND)
				unbound_error(sym);
			return v;
		}
		return fl_toplevel_eval(sym);
	case TOK_LABEL:
		// create backreference label
		if(ptrhash_has(&readstate->backrefs, (void*)tokval))
			lerrorf(ParseError, "label %"PRIdPTR" redefined", numval(tokval));
		oldtokval = tokval;
		v = do_read_sexpr(tokval);
		ptrhash_put(&readstate->backrefs, (void*)oldtokval, (void*)v);
		return v;
	case TOK_BACKREF:
		// look up backreference
		v = (value_t)ptrhash_get(&readstate->backrefs, (void*)tokval);
		if(v == (value_t)HT_NOTFOUND)
			lerrorf(ParseError, "undefined label %"PRIdPTR, numval(tokval));
		return v;
	case TOK_GENSYM:
		pv = (value_t*)ptrhash_bp(&readstate->gensyms, (void*)tokval);
		if(*pv == (value_t)HT_NOTFOUND)
			*pv = gensym();
		return *pv;
	case TOK_DOUBLEQUOTE:
		return read_string();
	}
	return FL_UNSPECIFIED;
}

value_t
fl_read_sexpr(value_t f)
{
	value_t v;
	fl_readstate_t state;
	state.prev = readstate;
	htable_new(&state.backrefs, 8);
	htable_new(&state.gensyms, 8);
	state.source = f;
	readstate = &state;
	assert(toktype == TOK_NONE);
	fl_gc_handle(&tokval);

	v = do_read_sexpr(UNBOUND);

	fl_free_gc_handles(1);
	readstate = state.prev;
	free_readstate(&state);
	return v;
}