shithub: femtolisp

ref: 92980154dee84747ca798bafe6e05d87519ab665
dir: /read.c/

View raw version
enum {
    TOK_NONE, TOK_OPEN, TOK_CLOSE, TOK_DOT, TOK_QUOTE, TOK_SYM, TOK_NUM,
    TOK_BQ, TOK_COMMA, TOK_COMMAAT, TOK_COMMADOT,
    TOK_SHARPDOT, TOK_LABEL, TOK_BACKREF, TOK_SHARPQUOTE, TOK_SHARPOPEN,
    TOK_OPENB, TOK_CLOSEB, TOK_SHARPSYM, TOK_GENSYM, TOK_DOUBLEQUOTE
};

#if defined(__plan9__)
static int errno;
#define VLONG_MAX ~(1LL<<63)
#define VLONG_MIN (1LL<<63)
#define UVLONG_MAX (1LL<<63)
static mpint *mp_vlong_min, *mp_vlong_max, *mp_uvlong_max;
#endif

static int64_t strtoll_mp(char *nptr, char **rptr, int base, mpint **mp)
{
    int64_t x;
    mpint *m;

    *mp = nil;
    errno = 0;
    x = strtoll(nptr, rptr, base);
#if defined(__plan9__)
    if((x != VLONG_MAX && x != VLONG_MIN) || *rptr == nptr)
        return x;
    mpint *c;
    m = strtomp(nptr, rptr, base, nil);
    if(x == VLONG_MAX){
        if(mp_vlong_max == nil) mp_vlong_max = vtomp(VLONG_MAX, nil);
        c = mp_vlong_max;
    }else{
        if(mp_vlong_min == nil) mp_vlong_min = vtomp(VLONG_MIN, nil);
        c = mp_vlong_min;
    }
    if (mpcmp(c, m) == 0) {
        mpfree(m);
        m = nil;
    }
#else
    m = nil;
    if (errno == ERANGE && (x == LLONG_MAX || x == LLONG_MIN))
        m = strtomp(nptr, rptr, base, nil);
#endif
    *mp = m;
    return x;
}

static uint64_t strtoull_mp(char *nptr, char **rptr, int base, mpint **mp)
{
    uint64_t x;
    mpint *m;

    *mp = nil;
    errno = 0;
    x = strtoull(nptr, rptr, base);
#if defined(__plan9__)
    if(x != UVLONG_MAX || *rptr == nptr)
        return x;
    m = strtomp(nptr, rptr, base, nil);
    if(mp_uvlong_max == nil)
        mp_uvlong_max = uvtomp(UVLONG_MAX, nil);
    if(mpcmp(mp_uvlong_max, m) == 0){
        mpfree(m);
        m = nil;
    }
#else
    m = nil;
    if (errno == ERANGE && x == ULLONG_MAX)
        m = strtomp(nptr, rptr, base, nil);
#endif
    *mp = m;
    return x;
}

#define F value2c(ios_t*,readstate->source)

// defines which characters are ordinary symbol characters.
// exceptions are '.', which is an ordinary symbol character
// unless it's the only character in the symbol, and '#', which is
// an ordinary symbol character unless it's the first character.
static inline int symchar(char c)
{
    static char *special = "()[]'\";`,\\| \a\b\f\n\r\t\v";
    return !strchr(special, c);
}

int isnumtok_base(char *tok, value_t *pval, int base)
{
    char *end;
    int64_t i64;
    uint64_t ui64;
    double d;
    mpint *mp = nil;
    if (*tok == '\0')
        return 0;
    if (!((tok[0]=='0' && tok[1]=='x') || (base >= 15)) &&
        strpbrk(tok, ".eEpP")) {
        d = strtod(tok, &end);
        if (*end == '\0') {
            if (pval) *pval = mk_double(d);
            return 1;
        }
        // floats can end in f or f0
        if (end > tok && end[0] == 'f' &&
            (end[1] == '\0' ||
             (end[1] == '0' && end[2] == '\0'))) {
            if (pval) *pval = mk_float((float)d);
            return 1;
        }
    }

    if (tok[0] == '+') {
        if (!strcmp(tok,"+NaN") || !strcasecmp(tok,"+nan.0")) {
            if (pval) *pval = mk_double(D_PNAN);
            return 1;
        }
        if (!strcmp(tok,"+Inf") || !strcasecmp(tok,"+inf.0")) {
            if (pval) *pval = mk_double(D_PINF);
            return 1;
        }
    }
    else if (tok[0] == '-') {
        if (!strcmp(tok,"-NaN") || !strcasecmp(tok,"-nan.0")) {
            if (pval) *pval = mk_double(D_NNAN);
            return 1;
        }
        if (!strcmp(tok,"-Inf") || !strcasecmp(tok,"-inf.0")) {
            if (pval) *pval = mk_double(D_NINF);
            return 1;
        }
        i64 = strtoll_mp(tok, &end, base, &mp);
        if (pval)
            *pval = mp == nil ? return_from_int64(i64) : mk_mpint(mp);
        return (*end == '\0');
    }
    ui64 = strtoull_mp(tok, &end, base, &mp);
    if (pval)
        *pval = mp == nil ? return_from_uint64(ui64) : mk_mpint(mp);
    return (*end == '\0');
}

static int isnumtok(char *tok, value_t *pval)
{
    return isnumtok_base(tok, pval, 0);
}

static int read_numtok(char *tok, value_t *pval, int base)
{
    return isnumtok_base(tok, pval, base);
}

static uint32_t toktype = TOK_NONE;
static value_t tokval;
static char buf[256];

static char nextchar(void)
{
    int ch;
    char c;
    ios_t *f = F;

    do {
        if (f->bpos < f->size) {
            ch = f->buf[f->bpos++];
        }
        else {
            ch = ios_getc(f);
            if (ch == IOS_EOF)
                return 0;
        }
        c = (char)ch;
        if (c == ';') {
            // single-line comment
            do {
                ch = ios_getc(f);
                if (ch == IOS_EOF)
                    return 0;
            } while ((char)ch != '\n');
            c = (char)ch;
        }
    } while (c==' ' || isspace(c));
    return c;
}

static void take(void)
{
    toktype = TOK_NONE;
}

static void accumchar(char c, int *pi)
{
    buf[(*pi)++] = c;
    if (*pi >= (int)(sizeof(buf)-1))
        lerrorf(ParseError, "token too long");
}

// return: 1 if escaped (forced to be symbol)
static int read_token(char c, int digits)
{
    int i=0, ch, escaped=0, issym=0, nc=0;

    while (1) {
        if (nc != 0) {
            if (nc != 1)
                ios_getc(F);
            ch = ios_peekc(F);
            if (ch == IOS_EOF)
                goto terminate;
            c = (char)ch;
        }
        if (c == '|') {
            issym = 1;
            escaped = !escaped;
        }
        else if (c == '\\') {
            issym = 1;
            ios_getc(F);
            ch = ios_peekc(F);
            if (ch == IOS_EOF)
                goto terminate;
            accumchar((char)ch, &i);
        }
        else if (!escaped && !(symchar(c) && (!digits || isdigit(c)))) {
            break;
        }
        else {
            accumchar(c, &i);
        }
        nc++;
    }
    if (nc == 0)
        ios_skip(F, -1);
 terminate:
    buf[i++] = '\0';
    return issym;
}

static value_t do_read_sexpr(value_t label);

static uint32_t peek(void)
{
    char c, *end;
    fixnum_t x;
    int ch, base;

    if (toktype != TOK_NONE)
        return toktype;
    c = nextchar();
    if (ios_eof(F)) return TOK_NONE;
    if (c == '(') {
        toktype = TOK_OPEN;
    }
    else if (c == ')') {
        toktype = TOK_CLOSE;
    }
    else if (c == '[') {
        toktype = TOK_OPENB;
    }
    else if (c == ']') {
        toktype = TOK_CLOSEB;
    }
    else if (c == '\'') {
        toktype = TOK_QUOTE;
    }
    else if (c == '`') {
        toktype = TOK_BQ;
    }
    else if (c == '"') {
        toktype = TOK_DOUBLEQUOTE;
    }
    else if (c == '#') {
        ch = ios_getc(F); c = (char)ch;
        if (ch == IOS_EOF)
            lerrorf(ParseError, "invalid read macro");
        if (c == '.') {
            toktype = TOK_SHARPDOT;
        }
        else if (c == '\'') {
            toktype = TOK_SHARPQUOTE;
        }
        else if (c == '\\') {
            uint32_t cval;
            if (ios_getutf8(F, &cval) == IOS_EOF)
                lerrorf(ParseError, "end of input in character constant");
            if (cval == (uint32_t)'u' || cval == (uint32_t)'U' ||
                cval == (uint32_t)'x') {
                read_token('u', 0);
                if (buf[1] != '\0') {  // not a solitary 'u','U','x'
                    if (!read_numtok(&buf[1], &tokval, 16))
                        lerrorf(ParseError,
                               "read: invalid hex character constant");
                    cval = numval(tokval);
                }
            }
            else if (cval >= 'a' && cval <= 'z') {
                read_token((char)cval, 0);
                tokval = symbol(buf);
                if (buf[1] == '\0') { USED(cval); } /* one character */
                else if (tokval == nulsym)        cval = 0x00;
                else if (tokval == alarmsym)      cval = 0x07;
                else if (tokval == backspacesym)  cval = 0x08;
                else if (tokval == tabsym)        cval = 0x09;
                else if (tokval == linefeedsym)   cval = 0x0A;
                else if (tokval == newlinesym)    cval = 0x0A;
                else if (tokval == vtabsym)       cval = 0x0B;
                else if (tokval == pagesym)       cval = 0x0C;
                else if (tokval == returnsym)     cval = 0x0D;
                else if (tokval == escsym)        cval = 0x1B;
                else if (tokval == spacesym)      cval = 0x20;
                else if (tokval == deletesym)     cval = 0x7F;
                else
                    lerrorf(ParseError, "unknown character #\\%s", buf);
            }
            toktype = TOK_NUM;
            tokval = mk_wchar(cval);
        }
        else if (c == '(') {
            toktype = TOK_SHARPOPEN;
        }
        else if (c == '<') {
            lerrorf(ParseError, "unreadable object");
        }
        else if (isdigit(c)) {
            read_token(c, 1);
            c = (char)ios_getc(F);
            if (c == '#')
                toktype = TOK_BACKREF;
            else if (c == '=')
                toktype = TOK_LABEL;
            else
                lerrorf(ParseError, "invalid label");
            x = strtoll(buf, &end, 10);
            if (*end != '\0')
                lerrorf(ParseError, "invalid label");
            tokval = fixnum(x);
        }
        else if (c == '!') {
            // #! single line comment for shbang script support
            do {
                ch = ios_getc(F);
            } while (ch != IOS_EOF && (char)ch != '\n');
            return peek();
        }
        else if (c == '|') {
            // multiline comment
            int commentlevel=1;
            while (1) {
                ch = ios_getc(F);
            hashpipe_gotc:
                if (ch == IOS_EOF)
                    lerrorf(ParseError, "eof within comment");
                if ((char)ch == '|') {
                    ch = ios_getc(F);
                    if ((char)ch == '#') {
                        commentlevel--;
                        if (commentlevel == 0)
                            break;
                        else
                            continue;
                    }
                    goto hashpipe_gotc;
                }
                else if ((char)ch == '#') {
                    ch = ios_getc(F);
                    if ((char)ch == '|')
                        commentlevel++;
                    else
                        goto hashpipe_gotc;
                }
            }
            // this was whitespace, so keep peeking
            return peek();
        }
        else if (c == ';') {
            // datum comment
            (void)do_read_sexpr(UNBOUND); // skip
            return peek();
        }
        else if (c == ':') {
            // gensym
            ch = ios_getc(F);
            if ((char)ch == 'g')
                ch = ios_getc(F);
            read_token((char)ch, 0);
            x = strtol(buf, &end, 10);
            if (*end != '\0' || buf[0] == '\0')
                lerrorf(ParseError, "invalid gensym label");
            toktype = TOK_GENSYM;
            tokval = fixnum(x);
        }
        else if (symchar(c)) {
            read_token(ch, 0);

            if (((c == 'b' && (base= 2)) ||
                 (c == 'o' && (base= 8)) ||
                 (c == 'd' && (base=10)) ||
                 (c == 'x' && (base=16))) &&
                (isdigit_base(buf[1],base) ||
                 buf[1]=='-')) {
                if (!read_numtok(&buf[1], &tokval, base))
                    lerrorf(ParseError, "invalid base %d constant", base);
                return (toktype=TOK_NUM);
            }

            toktype = TOK_SHARPSYM;
            tokval = symbol(buf);
        }
        else {
            lerrorf(ParseError, "unknown read macro");
        }
    }
    else if (c == ',') {
        toktype = TOK_COMMA;
        ch = ios_peekc(F);
        if (ch == IOS_EOF)
            return toktype;
        if ((char)ch == '@')
            toktype = TOK_COMMAAT;
        else if ((char)ch == '.')
            toktype = TOK_COMMADOT;
        else
            return toktype;
        ios_getc(F);
    }
    else if (c == '{' || c == '}') {
        lerrorf(ParseError, "invalid character %c", c);
    }
    else {
        if (!read_token(c, 0)) {
            if (buf[0]=='.' && buf[1]=='\0') {
                return (toktype=TOK_DOT);
            }
            else {
                if (read_numtok(buf, &tokval, 0))
                    return (toktype=TOK_NUM);
            }
        }
        toktype = TOK_SYM;
        tokval = symbol(buf);
    }
    return toktype;
}

// NOTE: this is NOT an efficient operation. it is only used by the
// reader, and requires at least 1 and up to 3 garbage collections!
static value_t vector_grow(value_t v)
{
    size_t i, s = vector_size(v);
    size_t d = vector_grow_amt(s);
    PUSH(v);
    assert(s+d > s);
    value_t newv = alloc_vector(s+d, 1);
    v = Stack[SP-1];
    for(i=0; i < s; i++)
        vector_elt(newv, i) = vector_elt(v, i);
    // use gc to rewrite references from the old vector to the new
    Stack[SP-1] = newv;
    if (s > 0) {
        ((size_t*)ptr(v))[0] |= 0x1;
        vector_elt(v, 0) = newv;
        gc(0);
    }
    return POP();
}

static value_t read_vector(value_t label, uint32_t closer)
{
    value_t v=the_empty_vector, elt;
    uint32_t i=0;
    PUSH(v);
    if (label != UNBOUND)
        ptrhash_put(&readstate->backrefs, (void*)label, (void*)v);
    while (peek() != closer) {
        if (ios_eof(F))
            lerrorf(ParseError, "unexpected end of input");
        if (i >= vector_size(v)) {
            v = Stack[SP-1] = vector_grow(v);
            if (label != UNBOUND)
                ptrhash_put(&readstate->backrefs, (void*)label, (void*)v);
        }
        elt = do_read_sexpr(UNBOUND);
        v = Stack[SP-1];
        assert(i < vector_size(v));
        vector_elt(v,i) = elt;
        i++;
    }
    take();
    if (i > 0)
        vector_setsize(v, i);
    return POP();
}

static value_t read_string(void)
{
    char *buf, *temp;
    char eseq[10];
    size_t i=0, j, sz = 64, ndig;
    int c;
    value_t s;
    uint32_t wc=0;

    buf = malloc(sz);
    while (1) {
        if (i >= sz-4) {  // -4: leaves room for longest utf8 sequence
            sz *= 2;
            temp = realloc(buf, sz);
            if (temp == nil) {
                free(buf);
                lerrorf(ParseError, "out of memory reading string");
            }
            buf = temp;
        }
        c = ios_getc(F);
        if (c == IOS_EOF) {
            free(buf);
            lerrorf(ParseError, "unexpected end of input in string");
        }
        if (c == '"')
            break;
        else if (c == '\\') {
            c = ios_getc(F);
            if (c == IOS_EOF) {
                free(buf);
                lerrorf(ParseError, "end of input in escape sequence");
            }
            j = 0;
            if (octal_digit(c)) {
                while (1) {
                    eseq[j++] = c;
                    c = ios_peekc(F);
                    if (c == IOS_EOF || !octal_digit(c) || j >= 3)
                        break;
                    ios_getc(F);
                }
                eseq[j] = '\0';
                wc = strtol(eseq, nil, 8);
                // \DDD and \xXX read bytes, not characters
                buf[i++] = ((char)wc);
            }
            else if ((c=='x' && (ndig=2)) ||
                     (c=='u' && (ndig=4)) ||
                     (c=='U' && (ndig=8))) {
                while (1) {
                    c = ios_peekc(F);
                    if (c == IOS_EOF || !hex_digit(c) || j >= ndig)
                        break;
                    eseq[j++] = c;
                    ios_getc(F);
                }
                eseq[j] = '\0';
                if (j) wc = strtol(eseq, nil, 16);
                if (!j || wc > 0x10ffff) {
                    free(buf);
                    lerrorf(ParseError, "invalid escape sequence");
                }
                if (ndig == 2)
                    buf[i++] = ((char)wc);
                else
                    i += u8_wc_toutf8(&buf[i], wc);
            }
            else {
                char esc = read_escape_control_char((char)c);
                if (esc == (char)c && !strchr("\\'\"`", esc)) {
                    free(buf);
                    lerrorf(ParseError, "invalid escape sequence: \\%c", (char)c);
                }
                buf[i++] = esc;
            }
        }
        else {
            buf[i++] = c;
        }
    }
    s = cvalue_string(i);
    memmove(cvalue_data(s), buf, i);
    free(buf);
    return s;
}

// build a list of conses. this is complicated by the fact that all conses
// can move whenever a new cons is allocated. we have to refer to every cons
// through a handle to a relocatable pointer (i.e. a pointer on the stack).
static void read_list(value_t *pval, value_t label, uint32_t closer)
{
    value_t c, *pc;
    uint32_t t;

    PUSH(NIL);
    pc = &Stack[SP-1];  // to keep track of current cons cell
    t = peek();
    while (t != closer) {
        if (ios_eof(F))
            lerrorf(ParseError, "unexpected end of input");
        c = mk_cons(); car_(c) = cdr_(c) = NIL;
        if (iscons(*pc)) {
            cdr_(*pc) = c;
        }
        else {
            *pval = c;
            if (label != UNBOUND)
                ptrhash_put(&readstate->backrefs, (void*)label, (void*)c);
        }
        *pc = c;
        c = do_read_sexpr(UNBOUND); // must be on separate lines due to
        car_(*pc) = c;              // undefined evaluation order

        t = peek();
        if (t == TOK_DOT) {
            take();
            c = do_read_sexpr(UNBOUND);
            cdr_(*pc) = c;
            t = peek();
            if (ios_eof(F))
                lerrorf(ParseError, "unexpected end of input");
            if (t != closer) {
                take();
                lerrorf(ParseError, "expected '%c'", closer==TOK_CLOSEB ? ']' : ')');
            }
        }
    }
    take();
    c = POP();
    USED(c);
}

// label is the backreference we'd like to fix up with this read
static value_t do_read_sexpr(value_t label)
{
    value_t v, sym, oldtokval, *head;
    value_t *pv;
    uint32_t t;
    char c;

    t = peek();
    take();
    switch (t) {
    case TOK_CLOSE:
        lerrorf(ParseError, "unexpected ')'");
    case TOK_CLOSEB:
        lerrorf(ParseError, "unexpected ']'");
    case TOK_DOT:
        lerrorf(ParseError, "unexpected '.'");
    case TOK_SYM:
    case TOK_NUM:
        return tokval;
    case TOK_COMMA:
        head = &COMMA; goto listwith;
    case TOK_COMMAAT:
        head = &COMMAAT; goto listwith;
    case TOK_COMMADOT:
        head = &COMMADOT; goto listwith;
    case TOK_BQ:
        head = &BACKQUOTE; goto listwith;
    case TOK_QUOTE:
        head = &QUOTE;
    listwith:
        v = cons_reserve(2);
        car_(v) = *head;
        cdr_(v) = tagptr(((cons_t*)ptr(v))+1, TAG_CONS);
        car_(cdr_(v)) = cdr_(cdr_(v)) = NIL;
        PUSH(v);
        if (label != UNBOUND)
            ptrhash_put(&readstate->backrefs, (void*)label, (void*)v);
        v = do_read_sexpr(UNBOUND);
        car_(cdr_(Stack[SP-1])) = v;
        return POP();
    case TOK_SHARPQUOTE:
        // femtoLisp doesn't need symbol-function, so #' does nothing
        return do_read_sexpr(label);
    case TOK_OPEN:
        PUSH(NIL);
        read_list(&Stack[SP-1], label, TOK_CLOSE);
        return POP();
    case TOK_OPENB:
        PUSH(NIL);
        read_list(&Stack[SP-1], label, TOK_CLOSEB);
        return POP();
    case TOK_SHARPSYM:
        sym = tokval;
        if (sym == tsym || sym == Tsym)
            return FL_T;
        else if (sym == fsym || sym == Fsym)
            return FL_F;
        // constructor notation
        c = nextchar();
        if (c != '(') {
            take();
            lerrorf(ParseError, "expected argument list for %s",
                    symbol_name(tokval));
        }
        PUSH(NIL);
        read_list(&Stack[SP-1], UNBOUND, TOK_CLOSE);
        if (sym == vu8sym) {
            sym = arraysym;
            Stack[SP-1] = fl_cons(uint8sym, Stack[SP-1]);
        }
        else if (sym == fnsym) {
            sym = FUNCTION;
        }
        v = symbol_value(sym);
        if (v == UNBOUND)
            unbound_error(sym);
        return fl_apply(v, POP());
    case TOK_SHARPOPEN:
        return read_vector(label, TOK_CLOSE);
    case TOK_SHARPDOT:
        // eval-when-read
        // evaluated expressions can refer to existing backreferences, but they
        // cannot see pending labels. in other words:
        // (... #2=#.#0# ... )    OK
        // (... #2=#.(#2#) ... )  DO NOT WANT
        sym = do_read_sexpr(UNBOUND);
        if (issymbol(sym)) {
            v = symbol_value(sym);
            if (v == UNBOUND)
                unbound_error(sym);
            return v;
        }
        return fl_toplevel_eval(sym);
    case TOK_LABEL:
        // create backreference label
        if (ptrhash_has(&readstate->backrefs, (void*)tokval))
            lerrorf(ParseError, "label %"PRIdPTR" redefined", numval(tokval));
        oldtokval = tokval;
        v = do_read_sexpr(tokval);
        ptrhash_put(&readstate->backrefs, (void*)oldtokval, (void*)v);
        return v;
    case TOK_BACKREF:
        // look up backreference
        v = (value_t)ptrhash_get(&readstate->backrefs, (void*)tokval);
        if (v == (value_t)HT_NOTFOUND)
            lerrorf(ParseError, "undefined label %"PRIdPTR, numval(tokval));
        return v;
    case TOK_GENSYM:
        pv = (value_t*)ptrhash_bp(&readstate->gensyms, (void*)tokval);
        if (*pv == (value_t)HT_NOTFOUND)
            *pv = gensym();
        return *pv;
    case TOK_DOUBLEQUOTE:
        return read_string();
    }
    return FL_UNSPECIFIED;
}

value_t fl_read_sexpr(value_t f)
{
    value_t v;
    fl_readstate_t state;
    state.prev = readstate;
    htable_new(&state.backrefs, 8);
    htable_new(&state.gensyms, 8);
    state.source = f;
    readstate = &state;
    assert(toktype == TOK_NONE);
    fl_gc_handle(&tokval);

    v = do_read_sexpr(UNBOUND);

    fl_free_gc_handles(1);
    readstate = state.prev;
    free_readstate(&state);
    return v;
}