ref: ec2a902acc1c05ed0a95c26249bbda4032c668e7
dir: /read.c/
#include "flisp.h" #include "cvalues.h" #include "read.h" enum { TOK_NONE, TOK_OPEN, TOK_CLOSE, TOK_DOT, TOK_QUOTE, TOK_SYM, TOK_NUM, TOK_BQ, TOK_COMMA, TOK_COMMAAT, TOK_COMMADOT, TOK_SHARPDOT, TOK_LABEL, TOK_BACKREF, TOK_SHARPQUOTE, TOK_SHARPOPEN, TOK_OPENB, TOK_CLOSEB, TOK_SHARPSYM, TOK_GENSYM, TOK_DOUBLEQUOTE, TOK_OPENC, TOK_CLOSEC, }; typedef struct Rctx Rctx; struct Rctx { uint32_t toktype; value_t tokval; char buf[256]; }; static value_t do_read_sexpr(Rctx *ctx, value_t label); #define RS value2c(ios_t*, FL(readstate)->source) bool fl_read_numtok(const char *tok, value_t *pval, int base) { char *end; int64_t i64; double d; if(*tok == '\0') return false; if(!((tok[0] == '0' && tok[1] == 'x') || (base >= 15)) && strpbrk(tok, ".eEpP")){ d = strtod(tok, &end); if(*end == '\0'){ if(pval) *pval = mk_double(d); return true; } // floats can end in f or f0 if(end > tok && end[0] == 'f' && (end[1] == '\0' || (end[1] == '0' && end[2] == '\0'))){ if(pval) *pval = mk_float((float)d); return true; } } if(tok[0] == '+'){ if(!strcmp(tok, "+NaN") || !strcasecmp(tok, "+nan.0")){ if(pval) *pval = mk_double(D_PNAN); return true; } if(!strcmp(tok, "+Inf") || !strcasecmp(tok, "+inf.0")){ if(pval) *pval = mk_double(D_PINF); return true; } }else if(tok[0] == '-'){ if(!strcmp(tok, "-NaN") || !strcasecmp(tok, "-nan.0")){ if(pval) *pval = mk_double(D_NNAN); return true; } if(!strcmp(tok, "-Inf") || !strcasecmp(tok, "-inf.0")){ if(pval) *pval = mk_double(D_NINF); return true; } } i64 = strtoll(tok, &end, base); if(*end != '\0') return false; if(pval != nil) *pval = fits_fixnum(i64) ? fixnum(i64) : mk_mpint(strtomp(tok, &end, base, nil)); return true; } static char nextchar(void) { int ch; char c; ios_t *f = RS; do{ ch = ios_getc(RS); if(ch == IOS_EOF) return 0; c = (char)ch; if(c == ';'){ // single-line comment do{ ch = ios_getc(f); if(ch == IOS_EOF) return 0; }while((char)ch != '\n'); c = (char)ch; } }while(c == ' ' || isspace(c)); return c; } static void take(Rctx *ctx) { ctx->toktype = TOK_NONE; } static _Noreturn void parse_error(const char *format, ...) { char msgbuf[512]; va_list args; int n; n = snprintf(msgbuf, sizeof(msgbuf), "%s:%"PRIu64":%"PRIu64": ", RS->filename, (uint64_t)RS->lineno, (uint64_t)RS->colno); if(n >= (int)sizeof(msgbuf)) n = 0; va_start(args, format); vsnprintf(msgbuf+n, sizeof(msgbuf)-n, format, args); value_t msg = string_from_cstr(msgbuf); va_end(args); fl_raise(fl_list2(FL(ParseError), msg)); } static void accumchar(Rctx *ctx, char c, int *pi) { ctx->buf[(*pi)++] = c; if(*pi >= (int)(sizeof(ctx->buf)-1)) parse_error("token too long"); } // return: 1 if escaped (forced to be symbol) static bool read_token(Rctx *ctx, char c, bool digits) { int i = 0, ch, nc = 0; bool escaped = false, issym = false; while(1){ if(nc != 0){ if(nc != 1) ios_getc(RS); ch = ios_peekc(RS); if(ch == IOS_EOF) goto terminate; c = (char)ch; } if(c == '|'){ issym = true; escaped = !escaped; }else if(c == '\\'){ issym = true; ios_getc(RS); ch = ios_peekc(RS); if(ch == IOS_EOF) goto terminate; accumchar(ctx, (char)ch, &i); }else if(!escaped && !(symchar(c) && (!digits || isdigit(c)))){ break; }else{ accumchar(ctx, c, &i); } nc++; } if(nc == 0) ios_skip(RS, -1); terminate: ctx->buf[i++] = '\0'; return issym; } static int isdigit_base(char c, int base) { if(base < 11) return c >= '0' && c < '0'+base; return (c >= '0' && c <= '9') || (c >= 'a' && c < 'a'+base-10) || (c >= 'A' && c < 'A'+base-10); } static uint32_t peek(Rctx *ctx) { char c, *end; fixnum_t x; int ch, base; if(ctx->toktype != TOK_NONE) return ctx->toktype; c = nextchar(); if(ios_eof(RS)) return TOK_NONE; if(c == '(') ctx->toktype = TOK_OPEN; else if(c == ')') ctx->toktype = TOK_CLOSE; else if(c == '[') ctx->toktype = TOK_OPENB; else if(c == ']') ctx->toktype = TOK_CLOSEB; else if(c == '{') ctx->toktype = TOK_OPENC; else if(c == '}') ctx->toktype = TOK_CLOSEC; else if(c == '\'') ctx->toktype = TOK_QUOTE; else if(c == '`') ctx->toktype = TOK_BQ; else if(c == '"') ctx->toktype = TOK_DOUBLEQUOTE; else if(c == '#'){ ch = ios_getc(RS); c = (char)ch; if(ch == IOS_EOF) parse_error("invalid read macro"); if(c == '.') ctx->toktype = TOK_SHARPDOT; else if(c == '\'') ctx->toktype = TOK_SHARPQUOTE; else if(c == '\\'){ Rune cval; if(ios_getutf8(RS, &cval) == IOS_EOF) parse_error("end of input in character constant"); if(cval == 'u' || cval == 'U' || cval == 'x'){ read_token(ctx, 'u', 0); if(ctx->buf[1] != '\0'){ // not a solitary 'u','U','x' if(!fl_read_numtok(&ctx->buf[1], &ctx->tokval, 16)) parse_error("invalid hex character constant"); cval = numval(ctx->tokval); } }else if(cval >= 'a' && cval <= 'z'){ read_token(ctx, (char)cval, 0); ctx->tokval = symbol(ctx->buf); if(ctx->buf[1] == '\0') USED(cval); /* one character */ else if(ctx->tokval == FL(nulsym)) cval = 0x00; else if(ctx->tokval == FL(alarmsym)) cval = 0x07; else if(ctx->tokval == FL(backspacesym)) cval = 0x08; else if(ctx->tokval == FL(tabsym)) cval = 0x09; else if(ctx->tokval == FL(linefeedsym)) cval = 0x0A; else if(ctx->tokval == FL(newlinesym)) cval = 0x0A; else if(ctx->tokval == FL(vtabsym)) cval = 0x0B; else if(ctx->tokval == FL(pagesym)) cval = 0x0C; else if(ctx->tokval == FL(returnsym)) cval = 0x0D; else if(ctx->tokval == FL(escsym)) cval = 0x1B; else if(ctx->tokval == FL(spacesym)) cval = 0x20; else if(ctx->tokval == FL(deletesym)) cval = 0x7F; else parse_error("unknown character #\\%s", ctx->buf); } ctx->toktype = TOK_NUM; ctx->tokval = mk_rune(cval); }else if(c == '('){ ctx->toktype = TOK_SHARPOPEN; }else if(c == '<'){ parse_error("unreadable object"); }else if(isdigit(c)){ read_token(ctx, c, 1); c = (char)ios_getc(RS); if(c == '#') ctx->toktype = TOK_BACKREF; else if(c == '=') ctx->toktype = TOK_LABEL; else parse_error("invalid label"); x = strtoll(ctx->buf, &end, 10); if(*end != '\0') parse_error("invalid label"); ctx->tokval = fixnum(x); }else if(c == '!'){ // #! single line comment for shbang script support do{ ch = ios_getc(RS); }while(ch != IOS_EOF && (char)ch != '\n'); return peek(ctx); }else if(c == '|'){ // multiline comment int commentlevel = 1; while(1){ ch = ios_getc(RS); hashpipe_gotc: if(ch == IOS_EOF) parse_error("eof within comment"); if((char)ch == '|'){ ch = ios_getc(RS); if((char)ch == '#'){ commentlevel--; if(commentlevel == 0) break; else continue; } goto hashpipe_gotc; }else if((char)ch == '#'){ ch = ios_getc(RS); if((char)ch == '|') commentlevel++; else goto hashpipe_gotc; } } // this was whitespace, so keep peeking return peek(ctx); }else if(c == ';'){ // datum comment (void)do_read_sexpr(ctx, UNBOUND); // skip return peek(ctx); }else if(c == ':'){ // gensym ch = ios_getc(RS); if((char)ch == 'g') ch = ios_getc(RS); read_token(ctx, (char)ch, 0); x = strtol(ctx->buf, &end, 10); if(*end != '\0' || ctx->buf[0] == '\0') parse_error("invalid gensym label"); ctx->toktype = TOK_GENSYM; ctx->tokval = fixnum(x); }else if(symchar(c)){ read_token(ctx, ch, 0); if(((c == 'b' && (base = 2)) || (c == 'o' && (base = 8)) || (c == 'd' && (base = 10)) || (c == 'x' && (base = 16))) && (isdigit_base(ctx->buf[1], base) || ctx->buf[1] == '-')){ if(!fl_read_numtok(&ctx->buf[1], &ctx->tokval, base)) parse_error("invalid base %d constant", base); return (ctx->toktype = TOK_NUM); } ctx->toktype = TOK_SHARPSYM; ctx->tokval = symbol(ctx->buf); }else{ parse_error("unknown read macro"); } }else if(c == ','){ ctx->toktype = TOK_COMMA; ch = ios_peekc(RS); if(ch == IOS_EOF) return ctx->toktype; if((char)ch == '@') ctx->toktype = TOK_COMMAAT; else if((char)ch == '.') ctx->toktype = TOK_COMMADOT; else return ctx->toktype; ios_getc(RS); }else{ if(!read_token(ctx, c, 0)){ if(ctx->buf[0] == '.' && ctx->buf[1] == '\0') return (ctx->toktype = TOK_DOT); if(fl_read_numtok(ctx->buf, &ctx->tokval, 0)) return (ctx->toktype = TOK_NUM); } ctx->toktype = TOK_SYM; ctx->tokval = symbol(strcmp(ctx->buf, "lambda") == 0 ? "λ" : ctx->buf); } return ctx->toktype; } // NOTE: this is NOT an efficient operation. it is only used by the // reader, and requires at least 1 and up to 3 garbage collections! static value_t vector_grow(value_t v, bool rewrite_refs) { size_t i, s = vector_size(v); size_t d = vector_grow_amt(s); PUSH(v); assert(s+d > s); value_t newv = alloc_vector(s+d, 1); v = FL(stack)[FL(sp)-1]; for(i = 0; i < s; i++) vector_elt(newv, i) = vector_elt(v, i); // use gc to rewrite references from the old vector to the new FL(stack)[FL(sp)-1] = newv; if(s > 0 && rewrite_refs){ ((size_t*)ptr(v))[0] |= 0x1; vector_elt(v, 0) = newv; gc(0); } return POP(); } static value_t read_vector(Rctx *ctx, value_t label, uint32_t closer) { value_t v = FL(the_empty_vector), elt; uint32_t i = 0; PUSH(v); if(label != UNBOUND) ptrhash_put(&FL(readstate)->backrefs, (void*)label, (void*)v); while(peek(ctx) != closer){ if(ios_eof(RS)) parse_error("unexpected end of input"); v = FL(stack)[FL(sp)-1]; // reload after possible alloc in peek() if(i >= vector_size(v)){ v = FL(stack)[FL(sp)-1] = vector_grow(v, label != UNBOUND); if(label != UNBOUND) ptrhash_put(&FL(readstate)->backrefs, (void*)label, (void*)v); } elt = do_read_sexpr(ctx, UNBOUND); v = FL(stack)[FL(sp)-1]; assert(i < vector_size(v)); vector_elt(v, i) = elt; i++; } take(ctx); if(i > 0) vector_setsize(v, i); return POP(); } static value_t read_string(void) { char *buf, *temp; char eseq[10]; size_t i = 0, j, sz = 64, ndig; int c; value_t s; Rune r = 0; buf = MEM_ALLOC(sz); while(1){ if(i >= sz-UTFmax){ // -UTFmax: leaves room for longest utf8 sequence sz *= 2; temp = MEM_REALLOC(buf, sz); if(temp == nil){ MEM_FREE(buf); parse_error("out of memory reading string"); } buf = temp; } c = ios_getc(RS); if(c == IOS_EOF){ MEM_FREE(buf); parse_error("unexpected end of input in string"); } if(c == '"') break; else if(c == '\\'){ c = ios_getc(RS); if(c == IOS_EOF){ MEM_FREE(buf); parse_error("end of input in escape sequence"); } j = 0; if(octal_digit(c)){ while(1){ eseq[j++] = c; c = ios_peekc(RS); if(c == IOS_EOF || !octal_digit(c) || j >= 3) break; ios_getc(RS); } eseq[j] = '\0'; r = strtol(eseq, nil, 8); // \DDD and \xXX read bytes, not characters buf[i++] = (char)r; }else if((c == 'x' && (ndig = 2)) || (c == 'u' && (ndig = 4)) || (c == 'U' && (ndig = 8))){ while(1){ c = ios_peekc(RS); if(c == IOS_EOF || !hex_digit(c) || j >= ndig) break; eseq[j++] = c; ios_getc(RS); } eseq[j] = '\0'; if(j) r = strtol(eseq, nil, 16); if(!j || r > Runemax){ MEM_FREE(buf); parse_error("invalid escape sequence"); } if(ndig == 2) buf[i++] = (char)r; else i += runetochar(&buf[i], &r); }else if(c == '\n'){ /* do nothing */ }else{ char esc = read_escape_control_char((char)c); if(esc == (char)c && !strchr("\\'\"`", esc)){ MEM_FREE(buf); parse_error("invalid escape sequence: \\%c", (char)c); } buf[i++] = esc; } }else{ buf[i++] = c; } } s = cvalue_string(i); memcpy(cvalue_data(s), buf, i); MEM_FREE(buf); return s; } // build a list of conses. this is complicated by the fact that all conses // can move whenever a new cons is allocated. we have to refer to every cons // through a handle to a relocatable pointer (i.e. a pointer on the stack). static void read_list(Rctx *ctx, value_t *pval, value_t label, uint32_t closer) { value_t c, *pc; uint32_t t; uint64_t lineno0, colno0; lineno0 = RS->lineno; colno0 = RS->colno - 1; PUSH(FL(Nil)); pc = &FL(stack)[FL(sp)-1]; // to keep track of current cons cell t = peek(ctx); while(t != closer){ if(ios_eof(RS)) parse_error("unexpected end of input: %"PRIu64":%"PRIu64" not closed", lineno0, colno0); c = mk_cons(); car_(c) = cdr_(c) = FL(Nil); if(iscons(*pc)) cdr_(*pc) = c; else{ *pval = c; if(label != UNBOUND) ptrhash_put(&FL(readstate)->backrefs, (void*)label, (void*)c); } *pc = c; c = do_read_sexpr(ctx, UNBOUND); // must be on separate lines due to car_(*pc) = c; // undefined evaluation order t = peek(ctx); if(t == TOK_DOT){ take(ctx); c = do_read_sexpr(ctx, UNBOUND); cdr_(*pc) = c; t = peek(ctx); if(ios_eof(RS)) parse_error("unexpected end of input"); if(t != closer){ take(ctx); parse_error( "expected '%c'", closer == TOK_CLOSEB ? ']' : (closer == TOK_CLOSEC ? '}' : ')') ); } } } take(ctx); c = POP(); USED(c); } // label is the backreference we'd like to fix up with this read static value_t do_read_sexpr(Rctx *ctx, value_t label) { value_t v, sym, oldtokval, *head; value_t *pv; uint32_t t; char c; t = peek(ctx); take(ctx); switch(t){ case TOK_CLOSE: parse_error("unexpected ')'"); case TOK_CLOSEB: parse_error("unexpected ']'"); case TOK_CLOSEC: parse_error("unexpected '}'"); case TOK_DOT: parse_error("unexpected '.'"); case TOK_SYM: case TOK_NUM: return ctx->tokval; case TOK_COMMA: head = &FL(comma); goto listwith; case TOK_COMMAAT: head = &FL(commaat); goto listwith; case TOK_COMMADOT: head = &FL(commadot); goto listwith; case TOK_BQ: head = &FL(backquote); goto listwith; case TOK_QUOTE: head = &FL(quote); listwith: v = cons_reserve(2); car_(v) = *head; cdr_(v) = tagptr(((cons_t*)ptr(v))+1, TAG_CONS); car_(cdr_(v)) = cdr_(cdr_(v)) = FL(Nil); PUSH(v); if(label != UNBOUND) ptrhash_put(&FL(readstate)->backrefs, (void*)label, (void*)v); v = do_read_sexpr(ctx, UNBOUND); car_(cdr_(FL(stack)[FL(sp)-1])) = v; return POP(); case TOK_SHARPQUOTE: // femtoLisp doesn't need symbol-function, so #' does nothing return do_read_sexpr(ctx, label); case TOK_OPEN: PUSH(FL(Nil)); read_list(ctx, &FL(stack)[FL(sp)-1], label, TOK_CLOSE); return POP(); case TOK_OPENB: PUSH(FL(Nil)); read_list(ctx, &FL(stack)[FL(sp)-1], label, TOK_CLOSEB); return POP(); case TOK_OPENC: PUSH(FL(Nil)); read_list(ctx, &FL(stack)[FL(sp)-1], label, TOK_CLOSEC); return POP(); case TOK_SHARPSYM: sym = ctx->tokval; if(sym == FL(tsym) || sym == FL(Tsym)) return FL(t); if(sym == FL(fsym) || sym == FL(Fsym)) return FL(f); // constructor notation c = nextchar(); if(c != '('){ take(ctx); parse_error("expected argument list for %s", symbol_name(ctx->tokval)); } PUSH(FL(Nil)); read_list(ctx, &FL(stack)[FL(sp)-1], UNBOUND, TOK_CLOSE); if(sym == FL(vu8sym)){ sym = FL(arraysym); FL(stack)[FL(sp)-1] = fl_cons(FL(uint8sym), FL(stack)[FL(sp)-1]); }else if(sym == FL(fnsym)){ sym = FL(function); } v = symbol_value(sym); if(v == UNBOUND) unbound_error(sym); return fl_apply(v, POP()); case TOK_SHARPOPEN: return read_vector(ctx, label, TOK_CLOSE); case TOK_SHARPDOT: // eval-when-read // evaluated expressions can refer to existing backreferences, but they // cannot see pending labels. in other words: // (... #2=#.#0# ... ) OK // (... #2=#.(#2#) ... ) DO NOT WANT sym = do_read_sexpr(ctx, UNBOUND); if(issymbol(sym)){ v = symbol_value(sym); if(v == UNBOUND) unbound_error(sym); return v; } return fl_toplevel_eval(sym); case TOK_LABEL: // create backreference label if(ptrhash_has(&FL(readstate)->backrefs, (void*)ctx->tokval)) parse_error("label %"PRIdPTR" redefined", numval(ctx->tokval)); oldtokval = ctx->tokval; v = do_read_sexpr(ctx, ctx->tokval); ptrhash_put(&FL(readstate)->backrefs, (void*)oldtokval, (void*)v); return v; case TOK_BACKREF: // look up backreference v = (value_t)ptrhash_get(&FL(readstate)->backrefs, (void*)ctx->tokval); if(v == (value_t)HT_NOTFOUND) parse_error("undefined label %"PRIdPTR, numval(ctx->tokval)); return v; case TOK_GENSYM: pv = (value_t*)ptrhash_bp(&FL(readstate)->gensyms, (void*)ctx->tokval); if(*pv == (value_t)HT_NOTFOUND) *pv = gensym(); return *pv; case TOK_DOUBLEQUOTE: return read_string(); } return FL(unspecified); } value_t fl_read_sexpr(value_t f) { fl_readstate_t state; state.prev = FL(readstate); htable_new(&state.backrefs, 8); htable_new(&state.gensyms, 8); state.source = f; FL(readstate) = &state; Rctx ctx; ctx.toktype = TOK_NONE; fl_gc_handle(&ctx.tokval); value_t v = do_read_sexpr(&ctx, UNBOUND); fl_free_gc_handles(1); FL(readstate) = state.prev; free_readstate(&state); return v; }