ref: a4e846ba5305fe5834266780267f2b16c650bc15
parent: cb273846e4702b096feb761d26c3cff82f2f22df
author: Tor Andersson <tor@ccxvii.net>
date: Fri Dec 27 18:54:45 EST 2013
Prepare lexer for hooking up to generated parser. Use single character tokens as-is. Make token enums private in a separate header file, and give them better names.
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
-SRCS := $(wildcard js*.c)
-HDRS := $(wildcard js*.h)
+SRCS := js-state.c js-load.c js-lex.c js-parse.c
+HDRS := js.h js-parse.h
OBJS := $(SRCS:%.c=build/%.o)
CFLAGS = -Wall -g
--- a/js-lex.c
+++ b/js-lex.c
@@ -1,5 +1,17 @@
#include "js.h"
+#include "js-parse.h"
+static int syntaxerror(js_State *J, const char *fmt, ...) /* report a printf-style syntax error on stderr; always returns TK_ERROR */
+{
+ va_list ap;
+ fprintf(stderr, "syntax error: line %d: ", J->yyline); /* prefix the message with the current line number */
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ fprintf(stderr, "\n");
+ return TK_ERROR; /* so callers can return the result directly as the token */
+}
+
#define nelem(a) (sizeof (a) / sizeof (a)[0])
static const char *keywords[] = {
@@ -35,40 +47,20 @@
return -1;
}
-static inline js_Token findkeyword(js_State *J, const char *s)
+static inline int findkeyword(js_State *J, const char *s) /* classify word s: keyword token, TK_ERROR for a reserved word, else TK_IDENTIFIER */
{
int i = findword(s, keywords, nelem(keywords));
if (i >= 0)
- return JS_BREAK + i;
+ return TK_BREAK + i; /* first keyword + i; NOTE(review): assumes keywords[] is ordered like the enum starting at TK_BREAK -- confirm */
if (findword(s, futurewords, nelem(futurewords)) >= 0)
- return js_syntaxerror(J, "'%s' is a future reserved word", s);
+ return syntaxerror(J, "'%s' is a future reserved word", s);
if (J->strict && findword(s, strictfuturewords, nelem(strictfuturewords)) >= 0)
- return js_syntaxerror(J, "'%s' is a strict mode future reserved word", s);
+ return syntaxerror(J, "'%s' is a strict mode future reserved word", s);
- return JS_IDENTIFIER;
+ return TK_IDENTIFIER; /* not found in any word list: ordinary identifier */
}
-const char *tokenstrings[] = {
- "(error)", "(eof)", "(identifier)", "null", "true", "false",
- "(number)", "(string)", "(regexp)", "(newline)",
- "{", "}", "(", ")", "[", "]", ".", ";", ",",
- "<", ">", "<=", ">=", "==", "!=", "===", "!==",
- "+", "-", "*", "%", "++", "--", "<<", ">>", ">>>", "&", "|",
- "^", "!", "~", "&&", "||", "?", ":",
- "=", "+=", "-=", "*=", "%=", "<<=", ">>=", ">>>=", "&=", "|=", "^=",
- "/", "/=",
- "break", "case", "catch", "continue", "debugger", "default", "delete",
- "do", "else", "finally", "for", "function", "if", "in", "instanceof",
- "new", "return", "switch", "this", "throw", "try", "typeof", "var",
- "void", "while", "with",
-};
-
-const char *js_tokentostring(js_Token t)
-{
- return tokenstrings[t];
-}
-
#define GET() (*(*sp)++)
#define UNGET() ((*sp)--)
#define PEEK() (**sp)
@@ -76,17 +68,6 @@
#define NEXTPEEK() (NEXT(), PEEK())
#define LOOK(x) (PEEK() == x ? (NEXT(), 1) : 0)
-js_Token js_syntaxerror(js_State *J, const char *fmt, ...)
-{
- va_list ap;
- fprintf(stderr, "syntax error: line %d: ", J->yyline);
- va_start(ap, fmt);
- vfprintf(stderr, fmt, ap);
- va_end(ap);
- fprintf(stderr, "\n");
- return JS_ERROR;
-}
-
static void textinit(js_State *J)
{
if (!J->yytext) {
@@ -112,12 +93,12 @@
static inline int iswhite(int c)
{
- return c == 0x9 || c == 0xb || c == 0xc || c == 0x20 || c == 0xa0;
+ return c == 0x9 || c == 0xB || c == 0xC || c == 0x20 || c == 0xA0;
}
static inline int isnewline(c)
{
- return c == 0xa || c == 0xd || c == 0x2028 || c == 0x2029;
+ return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
}
static inline int isidentifierstart(int c)
@@ -145,9 +126,9 @@
if (c >= '0' && c <= '9')
return c - '0';
if (c >= 'a' && c <= 'f')
- return c - 'a' + 0xa;
+ return c - 'a' + 0xA;
if (c >= 'A' && c <= 'F')
- return c - 'A' + 0xa;
+ return c - 'A' + 0xA;
return 0;
}
@@ -222,7 +203,7 @@
return 0;
}
-static inline js_Token lexnumber(js_State *J, const char **sp)
+static inline int lexnumber(js_State *J, const char **sp)
{
double n;
@@ -229,13 +210,13 @@
if ((*sp)[0] == '0' && ((*sp)[1] == 'x' || (*sp)[1] == 'X')) {
*sp += 2;
if (!ishex(PEEK()))
- return js_syntaxerror(J, "0x not followed by hexademical digit");
+ return syntaxerror(J, "0x not followed by hexademical digit");
J->yynumber = lexhex(sp);
- return JS_NUMBER;
+ return TK_NUMBER;
}
if ((*sp)[0] == '0' && isdec((*sp)[1]))
- return js_syntaxerror(J, "number with leading zero");
+ return syntaxerror(J, "number with leading zero");
n = lexinteger(sp);
if (LOOK('.'))
@@ -243,10 +224,10 @@
n *= pow(10, lexexponent(sp));
if (isidentifierstart(PEEK()))
- return js_syntaxerror(J, "number with letter suffix");
+ return syntaxerror(J, "number with letter suffix");
J->yynumber = n;
- return JS_NUMBER;
+ return TK_NUMBER;
}
static inline int lexescape(const char **sp)
@@ -279,7 +260,7 @@
}
}
-static inline js_Token lexstring(js_State *J, const char **sp, int q)
+static inline int lexstring(js_State *J, const char **sp, int q)
{
int c = GET();
@@ -287,7 +268,7 @@
while (c != q) {
if (c == 0 || isnewline(c))
- return js_syntaxerror(J, "string not terminated");
+ return syntaxerror(J, "string not terminated");
if (c == '\\')
c = lexescape(sp);
@@ -299,23 +280,23 @@
textend(J);
- return JS_STRING;
+ return TK_STRING;
}
/* the ugliest language wart ever... */
-static int isregexpcontext(js_Token last)
+static int isregexpcontext(int last)
{
switch (last)
{
- case JS_IDENTIFIER:
- case JS_NULL:
- case JS_TRUE:
- case JS_FALSE:
- case JS_THIS:
- case JS_NUMBER:
- case JS_STRING:
- case JS_RSQUARE:
- case JS_RPAREN:
+ case ']':
+ case ')':
+ case TK_IDENTIFIER:
+ case TK_NUMBER:
+ case TK_STRING:
+ case TK_FALSE:
+ case TK_NULL:
+ case TK_THIS:
+ case TK_TRUE:
return 0;
default:
return 1;
@@ -322,7 +303,7 @@
}
}
-static js_Token lexregexp(js_State *J, const char **sp)
+static int lexregexp(js_State *J, const char **sp)
{
int c;
@@ -332,12 +313,12 @@
c = GET();
while (c != '/') {
if (c == 0 || isnewline(c)) {
- return js_syntaxerror(J, "regular expression not terminated");
+ return syntaxerror(J, "regular expression not terminated");
} else if (c == '\\') {
textpush(J, c);
c = GET();
if (c == 0 || isnewline(c))
- return js_syntaxerror(J, "regular expression not terminated");
+ return syntaxerror(J, "regular expression not terminated");
textpush(J, c);
c = GET();
} else {
@@ -356,20 +337,21 @@
if (c == 'g') J->yyflags.g ++;
else if (c == 'i') J->yyflags.i ++;
else if (c == 'm') J->yyflags.m ++;
- else return js_syntaxerror(J, "illegal flag in regular expression: %c", c);
+ else return syntaxerror(J, "illegal flag in regular expression: %c", c);
c = NEXTPEEK();
}
if (J->yyflags.g > 1 || J->yyflags.i > 1 || J->yyflags.m > 1)
- return js_syntaxerror(J, "duplicated flag in regular expression");
+ return syntaxerror(J, "duplicated flag in regular expression");
- return JS_REGEXP;
+ return TK_REGEXP;
}
-static js_Token js_leximp(js_State *J, const char **sp)
+static int lex(js_State *J, const char **sp)
{
- int c = GET();
- while (c) {
+ while (1) {
+ int c = GET();
+
while (iswhite(c))
c = GET();
@@ -378,21 +360,23 @@
if (c == '\r' && PEEK() == '\n')
NEXT();
J->yyline++;
- return JS_NEWLINE;
+ return TK_NEWLINE;
}
if (c == '/') {
if (LOOK('/')) {
lexlinecomment(sp);
+ continue;
} else if (LOOK('*')) {
if (lexcomment(sp))
- return js_syntaxerror(J, "multi-line comment not terminated");
+ return syntaxerror(J, "multi-line comment not terminated");
+ continue;
} else if (isregexpcontext(J->lasttoken)) {
return lexregexp(J, sp);
} else if (LOOK('=')) {
- return JS_SLASH_EQ;
+ return TK_DIV_ASS;
} else {
- return JS_SLASH;
+ return '/';
}
}
@@ -411,137 +395,141 @@
return findkeyword(J, J->yytext);
}
- if (c == '.') {
- if (isdec(PEEK())) {
- UNGET();
- return lexnumber(J, sp);
- }
- return JS_PERIOD;
- }
-
if (c >= '0' && c <= '9') {
UNGET();
return lexnumber(J, sp);
}
- if (c == '\'' || c == '"')
+ switch (c) {
+ case '(':
+ case ')':
+ case ',':
+ case ':':
+ case ';':
+ case '?':
+ case '[':
+ case ']':
+ case '{':
+ case '}':
+ case '~':
+ return c;
+
+ case '\'':
return lexstring(J, sp, c);
+ case '"':
+ return lexstring(J, sp, c);
- switch (c) {
- case '{': return JS_LCURLY;
- case '}': return JS_RCURLY;
- case '(': return JS_LPAREN;
- case ')': return JS_RPAREN;
- case '[': return JS_LSQUARE;
- case ']': return JS_RSQUARE;
- case '.': return JS_PERIOD;
- case ';': return JS_SEMICOLON;
- case ',': return JS_COMMA;
+ case '.':
+ if (isdec(PEEK())) {
+ UNGET();
+ return lexnumber(J, sp);
+ }
+ return '.';
case '<':
if (LOOK('<')) {
if (LOOK('='))
- return JS_LT_LT_EQ;
- return JS_LT_LT;
+ return TK_SHL_ASS;
+ return TK_SHL;
}
if (LOOK('='))
- return JS_LT_EQ;
- return JS_LT;
+ return TK_LE;
+ return '<';
case '>':
if (LOOK('>')) {
if (LOOK('>')) {
if (LOOK('='))
- return JS_GT_GT_GT_EQ;
- return JS_GT_GT_GT;
+ return TK_USHR_ASS;
+ return TK_USHR;
}
if (LOOK('='))
- return JS_GT_GT_EQ;
- return JS_GT_GT;
+ return TK_SHR_ASS;
+ return TK_SHR;
}
if (LOOK('='))
- return JS_GT_EQ;
- return JS_GT;
+ return TK_GE;
+ return '>';
case '=':
if (LOOK('=')) {
if (LOOK('='))
- return JS_EQ_EQ_EQ;
- return JS_EQ_EQ;
+ return TK_EQ3;
+ return TK_EQ;
}
- return JS_EQ;
+ return '=';
case '!':
if (LOOK('=')) {
if (LOOK('='))
- return JS_EXCL_EQ_EQ;
- return JS_EXCL_EQ;
+ return TK_NE3;
+ return TK_NE;
}
- return JS_EXCL;
+ return '!';
case '+':
if (LOOK('+'))
- return JS_PLUS_PLUS;
+ return TK_INC;
if (LOOK('='))
- return JS_PLUS_EQ;
- return JS_PLUS;
+ return TK_ADD_ASS;
+ return '+';
case '-':
if (LOOK('-'))
- return JS_MINUS_MINUS;
+ return TK_DEC;
if (LOOK('='))
- return JS_MINUS_EQ;
- return JS_MINUS;
+ return TK_SUB_ASS;
+ return '-';
case '*':
if (LOOK('='))
- return JS_STAR_EQ;
- return JS_STAR;
+ return TK_MUL_ASS;
+ return '*';
case '%':
if (LOOK('='))
- return JS_PERCENT_EQ;
- return JS_PERCENT;
+ return TK_MOD_ASS;
+ return '%';
case '&':
if (LOOK('&'))
- return JS_AND_AND;
+ return TK_AND;
if (LOOK('='))
- return JS_AND_EQ;
- return JS_AND;
+ return TK_AND_ASS;
+ return '&';
case '|':
if (LOOK('|'))
- return JS_BAR_BAR;
+ return TK_OR;
if (LOOK('='))
- return JS_BAR_EQ;
- return JS_BAR;
+ return TK_OR_ASS;
+ return '|';
case '^':
if (LOOK('='))
- return JS_HAT_EQ;
- return JS_HAT;
+ return TK_XOR_ASS;
+ return '^';
- case '~': return JS_TILDE;
- case '?': return JS_QUESTION;
- case ':': return JS_COLON;
+ case 0:
+ return 0; /* EOF */
}
- c = GET();
+ if (c >= 0x20 && c <= 0x7E)
+ return syntaxerror(J, "unexpected character: '%c'", c);
+ return syntaxerror(J, "unexpected character: \\u%04X", c);
}
-
- return JS_EOF;
}
-js_Token js_lex(js_State *J, const char **sp)
+void jsP_initlex(js_State *J, const char *source) /* reset the lexer state in J to read from the start of source */
{
- js_Token t = js_leximp(J, sp);
- J->lasttoken = t;
- return t;
+ J->yysource = source; /* read position; jsP_lex passes its address to lex() */
+ J->yyline = 1; /* line numbers start at 1 */
+ J->lasttoken = 0; /* 0 is not excluded by isregexpcontext(), so a leading slash is lexed as a regexp */
}
-void js_initlex(js_State *J)
+int jsP_lex(js_State *J) /* read the next token from J->yysource and return its token code */
{
- J->yyline = 1;
- J->lasttoken = JS_ERROR;
+ int t = lex(J, &J->yysource); /* lex() moves the read position through the pointer */
+ J->lasttoken = t; /* remembered so the next call can tell division from a regexp literal */
+ return t;
}
--- a/js-load.c
+++ b/js-load.c
@@ -1,25 +1,13 @@
#include "js.h"
+#include "js-parse.h"
int js_loadstring(js_State *J, const char *source)
{
- js_Token t;
+ int t; /* status returned by jsP_parse */
- js_initlex(J);
-
- do {
- t = js_lex(J, &source);
-
- if (t == JS_NUMBER)
- printf("%g\n", J->yynumber);
- else if (t == JS_IDENTIFIER)
- printf("id:%s\n", J->yytext);
- else if (t == JS_STRING)
- printf("'%s'\n", J->yytext);
- else if (t == JS_REGEXP)
- printf("/%s/\n", J->yytext);
- else
- printf("%s\n", js_tokentostring(t));
- } while (t != JS_EOF && t != JS_ERROR);
+ jsP_initlex(J, source); /* point the lexer at the start of source */
+ t = jsP_parse(J); /* 0 when the input was consumed, 1 when the lexer reported an error */
+ printf("parse result = %d\n", t); /* NOTE(review): status is only printed; the function still returns 0 regardless -- confirm intended */
return 0;
}
--- /dev/null
+++ b/js-parse.c
@@ -1,0 +1,13 @@
+#include "js.h"
+#include "js-parse.h"
+
+int jsP_parse(js_State *J) /* for now only drains the lexer: returns 1 on a lexical error, 0 at end of input */
+{
+ int t;
+ do {
+ t = jsP_lex(J);
+ if (t == TK_ERROR)
+ return 1; /* stop at the first error token */
+ } while (t); /* lex() returns 0 at end of input */
+ return 0;
+}
--- /dev/null
+++ b/js-parse.h
@@ -1,0 +1,70 @@
+#ifndef js_parse_h
+#define js_parse_h
+
+enum {
+ TK_ERROR = 257, /* named tokens start above the 8-bit character range; single-character tokens are returned as their own character code. Presumably 257 matches yacc-style token numbering -- confirm */
+ TK_NEWLINE,
+ TK_IDENTIFIER,
+ TK_NUMBER,
+ TK_STRING,
+ TK_REGEXP,
+
+ /* multi-character punctuators */
+ TK_LE,
+ TK_GE,
+ TK_EQ,
+ TK_NE,
+ TK_EQ3,
+ TK_NE3,
+ TK_SHL,
+ TK_SHR,
+ TK_USHR,
+ TK_AND,
+ TK_OR,
+ TK_ADD_ASS,
+ TK_SUB_ASS,
+ TK_MUL_ASS,
+ TK_DIV_ASS,
+ TK_MOD_ASS,
+ TK_SHL_ASS,
+ TK_SHR_ASS,
+ TK_USHR_ASS,
+ TK_AND_ASS,
+ TK_OR_ASS,
+ TK_XOR_ASS,
+ TK_INC,
+ TK_DEC,
+
+ /* keywords; findkeyword() in js-lex.c returns TK_BREAK + index into keywords[], so this order must match that table */
+ TK_BREAK,
+ TK_CASE,
+ TK_CATCH,
+ TK_CONTINUE,
+ TK_DEBUGGER,
+ TK_DEFAULT,
+ TK_DELETE,
+ TK_DO,
+ TK_ELSE,
+ TK_FALSE,
+ TK_FINALLY,
+ TK_FOR,
+ TK_FUNCTION,
+ TK_IF,
+ TK_IN,
+ TK_INSTANCEOF,
+ TK_NEW,
+ TK_NULL,
+ TK_RETURN,
+ TK_SWITCH,
+ TK_THIS,
+ TK_THROW,
+ TK_TRUE,
+ TK_TRY,
+ TK_TYPEOF,
+ TK_VAR,
+ TK_VOID,
+ TK_WHILE,
+ TK_WITH,
+};
+
+#endif
--- a/js.h
+++ b/js.h
@@ -22,115 +22,20 @@
/* private */
-typedef enum js_Token js_Token;
+void jsP_initlex(js_State *J, const char *source);
+int jsP_lex(js_State *J);
+int jsP_parse(js_State *J);
-enum js_Token
-{
- JS_ERROR,
- JS_EOF,
-
- JS_IDENTIFIER,
- JS_NULL,
- JS_TRUE,
- JS_FALSE,
- JS_NUMBER,
- JS_STRING,
- JS_REGEXP,
- JS_NEWLINE,
-
- /* punctuators */
- JS_LCURLY,
- JS_RCURLY,
- JS_LPAREN,
- JS_RPAREN,
- JS_LSQUARE,
- JS_RSQUARE,
- JS_PERIOD,
- JS_SEMICOLON,
- JS_COMMA,
- JS_LT,
- JS_GT,
- JS_LT_EQ,
- JS_GT_EQ,
- JS_EQ_EQ,
- JS_EXCL_EQ,
- JS_EQ_EQ_EQ,
- JS_EXCL_EQ_EQ,
- JS_PLUS,
- JS_MINUS,
- JS_STAR,
- JS_PERCENT,
- JS_PLUS_PLUS,
- JS_MINUS_MINUS,
- JS_LT_LT,
- JS_GT_GT,
- JS_GT_GT_GT,
- JS_AND,
- JS_BAR,
- JS_HAT,
- JS_EXCL,
- JS_TILDE,
- JS_AND_AND,
- JS_BAR_BAR,
- JS_QUESTION,
- JS_COLON,
- JS_EQ,
- JS_PLUS_EQ,
- JS_MINUS_EQ,
- JS_STAR_EQ,
- JS_PERCENT_EQ,
- JS_LT_LT_EQ,
- JS_GT_GT_EQ,
- JS_GT_GT_GT_EQ,
- JS_AND_EQ,
- JS_BAR_EQ,
- JS_HAT_EQ,
- JS_SLASH,
- JS_SLASH_EQ,
-
- /* keywords */
- JS_BREAK,
- JS_CASE,
- JS_CATCH,
- JS_CONTINUE,
- JS_DEBUGGER,
- JS_DEFAULT,
- JS_DELETE,
- JS_DO,
- JS_ELSE,
- JS_FINALLY,
- JS_FOR,
- JS_FUNCTION,
- JS_IF,
- JS_IN,
- JS_INSTANCEOF,
- JS_NEW,
- JS_RETURN,
- JS_SWITCH,
- JS_THIS,
- JS_THROW,
- JS_TRY,
- JS_TYPEOF,
- JS_VAR,
- JS_VOID,
- JS_WHILE,
- JS_WITH,
-};
-
struct js_State
{
+ const char *yysource; /* lexer read position; set by jsP_initlex and advanced by jsP_lex */
char *yytext;
size_t yylen, yycap;
double yynumber;
struct { int g, i, m; } yyflags;
int yyline;
- js_Token lasttoken;
+ int lasttoken; /* last token returned by jsP_lex: a character code or a TK_ value; used to decide whether a slash starts a regexp */
int strict;
};
-
-void js_initlex(js_State *J);
-js_Token js_lex(js_State *J, const char **sp);
-js_Token js_syntaxerror(js_State *J, const char *fmt, ...);
-const char *js_tokentostring(js_Token t);
#endif