ref: 950ee74350cdb6b7ff170234fd9341f1b094d171
parent: 993d1af900057e362e08ea738e1f6f6b409c749e
	author: Ori Bernstein <ori@eigenstate.org>
	date: Wed Dec 18 11:42:55 EST 2013
	
Support utf8 character values.
--- a/parse/gram.y
+++ b/parse/gram.y
@@ -626,7 +626,7 @@
 littok  : Tstrlit       {$$ = mkstr($1->line, $1->str);}         | Tintlit       {$$ = mkint($1->line, $1->intval);}-        | Tchrlit       {$$ = mkchar($1->line, *$1->str);} /* FIXME: expand escapes, unicode  */+        | Tchrlit       {$$ = mkchar($1->line, $1->chrval);}         | Tfloatlit     {$$ = mkfloat($1->line, $1->fltval);}         | Tboollit      {$$ = mkbool($1->line, !strcmp($1->str, "true"));};
--- a/parse/parse.h
+++ b/parse/parse.h
@@ -86,6 +86,7 @@
/* values parsed out */
vlong intval;
double fltval;
+ uint32_t chrval;
};
 struct Stab {--- a/parse/tok.c
+++ b/parse/tok.c
@@ -208,7 +208,7 @@
 {size_t i;
char c;
-
+
i = 0;
     for (c = peek(); i < sz && identchar(c); c = peek()) {next();
@@ -332,36 +332,60 @@
return t;
}
+static uint32_t readutf(char c, char **buf, size_t *buflen, size_t *sz) {+ size_t i, len;
+ uint32_t val;
+
+ if ((c & 0x80) == 0)
+ len = 1;
+ else if ((c & 0xe0) == 0xc0)
+ len = 2;
+ else if ((c & 0xf0) == 0xe0)
+ len = 3;
+ else if ((c & 0xf8) == 0xf0)
+ len = 4;
+
+ val = c & ((1 << (8 - len)) - 1);
+ append(buf, buflen, sz, c);
+    for (i = 1; i < len; i++) {+ c = next();
+ if ((c & 0xc0) != 0x80)
+ fatal(line, "Invalid utf8 codepoint in character literal");
+ val = (val << 6) | (c & 0x3f);
+ append(buf, buflen, sz, c);
+ }
+ return val;
+}
+
static Tok *charlit()
 {Tok *t;
int c;
+ uint32_t val;
size_t len, sz;
char *buf;
+
assert(next() == '\'');
buf = NULL;
len = 0;
sz = 0;
-    while (1) {- c = next();
- /* we don't unescape here, but on output */
- if (c == '\'')
- break;
- else if (c == End)
- fatal(line, "Unexpected EOF within char lit");
- else if (c == '\n')
- fatal(line, "Newlines not allowed in char lit");
- else if (c == '\\')
- decode(&buf, &len, &sz);
- else
- append(&buf, &len, &sz, c);
-
- };
+ c = next();
+ if (c == End)
+ fatal(line, "Unexpected EOF within char lit");
+ else if (c == '\n')
+ fatal(line, "Newlines not allowed in char lit");
+ else if (c == '\\')
+ decode(&buf, &len, &sz);
+ else
+ val = readutf(c, &buf, &len, &sz);
append(&buf, &len, &sz, '\0');
+ if (next() != '\'')
+ fatal(line, "Character constant with multiple characters");
t = mktok(Tchrlit);
+ t->chrval = val;
t->str = buf;
return t;
}
--
⑨