shithub: libmujs

Download patch

ref: b60c1cca9b93d4b7975d4e06bc0a633b6c2cb67e
parent: 697eee89ad751841e3c60e116629989cb3dbea98
author: Tor Andersson <tor@ccxvii.net>
date: Tue Feb 25 13:24:17 EST 2014

Use explicit backtracking stack rather than recursion.

Lets us safely detect overflow instead of crashing.

--- a/regex.c
+++ b/regex.c
@@ -14,10 +14,13 @@
 #define nelem(a) (sizeof (a) / sizeof (a)[0])
 
 #define REPINF 255
+#define MAXTHREAD 1000
+#define MAXSUB 10
 
 typedef struct Reclass Reclass;
 typedef struct Renode Renode;
 typedef struct Reinst Reinst;
+typedef struct Rethread Rethread;
 
 struct Reclass {
 	Rune *end;
@@ -26,7 +29,7 @@
 
 struct Reprog {
 	Reinst *start, *end;
-	int icase, newline;
+	int flags;
 	unsigned int ncap;
 	Reclass cclass[16];
 };
@@ -34,12 +37,11 @@
 struct cstate {
 	Reprog *prog;
 	Renode *pstart, *pend;
-	int flags;
 
 	const char *source;
 	unsigned int ncclass;
 	unsigned int ncap;
-	int nref[10];
+	int nref[MAXSUB];
 
 	int lookahead;
 	Rune yychar;
@@ -448,7 +450,7 @@
 		return newnode(g, P_ANY);
 	if (accept(g, '(')) {
 		atom = newnode(g, P_PAR);
-		if (++g->ncap == 10)
+		if (++g->ncap == MAXSUB)
 			die(g, "too many captures");
 		atom->n = g->ncap;
 		g->nref[atom->n] = 0;
@@ -538,7 +540,7 @@
 
 enum {
 	I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
-	I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
+	I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
 	I_BOL, I_EOL, I_WORD, I_NWORD,
 	I_LPAR, I_RPAR
 };
@@ -550,7 +552,6 @@
 	Reclass *cc;
 	Reinst *x;
 	Reinst *y;
-	const char *p;
 };
 
 static unsigned int count(Renode *node)
@@ -577,11 +578,10 @@
 {
 	Reinst *inst = prog->end++;
 	inst->opcode = opcode;
-	inst->cc = NULL;
-	inst->c = 0;
 	inst->n = 0;
+	inst->c = 0;
+	inst->cc = NULL;
 	inst->x = inst->y = NULL;
-	inst->p = 0;
 	return inst;
 }
 
@@ -684,7 +684,7 @@
 		break;
 	case P_CHAR:
 		inst = emit(prog, I_CHAR);
-		inst->c = prog->icase ? canon(node->c) : node->c;
+		inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
 		break;
 	case P_CCLASS:
 		inst = emit(prog, I_CCLASS);
@@ -750,6 +750,7 @@
 		case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
 		case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
 		case I_ANY: puts("any"); break;
+		case I_ANYNL: puts("anynl"); break;
 		case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
 		case I_CCLASS: puts("cclass"); break;
 		case I_NCCLASS: puts("ncclass"); break;
@@ -769,6 +770,7 @@
 {
 	struct cstate g;
 	Renode *node;
+	Reinst *split, *jump;
 	int i;
 
 	g.prog = malloc(sizeof (Reprog));
@@ -784,11 +786,10 @@
 	g.source = pattern;
 	g.ncclass = 0;
 	g.ncap = 0;
-	for (i = 0; i < 10; ++i)
+	for (i = 0; i < MAXSUB; ++i)
 		g.nref[i] = 0;
 
-	g.prog->icase = cflags & REG_ICASE;
-	g.prog->newline = cflags & REG_NEWLINE;
+	g.prog->flags = cflags;
 
 	next(&g);
 	node = parsealt(&g);
@@ -798,7 +799,14 @@
 		die(&g, "syntax error");
 
 	g.prog->ncap = g.ncap;
-	g.prog->start = g.prog->end = malloc((count(node) + 3) * sizeof (Reinst));
+	g.prog->start = g.prog->end = malloc((count(node) + 6) * sizeof (Reinst));
+
+	split = emit(g.prog, I_SPLIT);
+	split->x = split + 3;
+	split->y = split + 1;
+	emit(g.prog, I_ANYNL);
+	jump = emit(g.prog, I_JUMP);
+	jump->x = split;
 	emit(g.prog, I_LPAR);
 	compile(g.prog, node);
 	emit(g.prog, I_RPAR);
@@ -826,13 +834,6 @@
 
 /* Match */
 
-struct estate {
-	int icase, newline, notbol;
-	int nla;
-	const char *bol;
-	Resub *m;
-};
-
 static int isnewline(int c)
 {
 	return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
@@ -881,176 +882,182 @@
 	return 0;
 }
 
-static int match(struct estate *g, Reinst *pc, const char *s)
+struct Rethread {
+	Reinst *pc;
+	const char *sp;
+	Resub sub[MAXSUB];
+};
+
+static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
 {
-	const char *p;
+	t->pc = pc;
+	t->sp = sp;
+	memcpy(t->sub, sub, sizeof t->sub);
+}
+
+static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
+{
+	Rethread ready[MAXTHREAD];
+	Resub scrap[MAXSUB];
+	Resub sub[MAXSUB];
 	Rune c;
-	unsigned int n;
-	int v;
+	unsigned int nready;
+	int i;
 
-	for (;;) {
-		switch (pc->opcode) {
-		case I_END:
-			return 1;
-		case I_JUMP:
-			pc = pc->x;
-			continue;
-		case I_SPLIT:
-#if 0
-			if (match(g, pc->x, s))
+	/* queue initial thread */
+	spawn(ready + 0, pc, sp, out);
+	nready = 1;
+
+	/* run threads in stack order */
+	while (nready > 0) {
+		--nready;
+		pc = ready[nready].pc;
+		sp = ready[nready].sp;
+		memcpy(sub, ready[nready].sub, sizeof sub);
+		for (;;) {
+			switch (pc->opcode) {
+			case I_END:
+				for (i = 0; i < MAXSUB; ++i) {
+					out[i].sp = sub[i].sp;
+					out[i].ep = sub[i].ep;
+				}
 				return 1;
-			pc = pc->y;
-			continue;
-#endif
-			if (pc->p == s)
-				return 0;
-			p = pc->p;
-			pc->p = s;
-			v = match(g, pc->x, s);
-			pc->p = p;
-			if (v)
-				return 1;
-			pc = pc->y;
-			continue;
-		case I_PLA:
-			if (!match(g, pc->x, s))
-				return 0;
-			pc = pc->y;
-			continue;
-		case I_NLA:
-			++g->nla;
-			v = match(g, pc->x, s);
-			--g->nla;
-			if (v)
-				return 0;
-			pc = pc->y;
-			continue;
-		case I_ANY:
-			s += chartorune(&c, s);
-			if (c == 0)
-				return 0;
-			if (isnewline(c))
-				return 0;
-			break;
-		case I_CHAR:
-			s += chartorune(&c, s);
-			if (c == 0)
-				return 0;
-			if (g->icase)
-				c = canon(c);
-			if (c != pc->c)
-				return 0;
-			break;
-		case I_CCLASS:
-			s += chartorune(&c, s);
-			if (c == 0)
-				return 0;
-			if (g->icase) {
-				if (!incclasscanon(pc->cc, canon(c)))
+			case I_JUMP:
+				pc = pc->x;
+				continue;
+			case I_SPLIT:
+				if (nready >= MAXTHREAD) {
+					fprintf(stderr, "regexec: backtrack overflow!\n");
 					return 0;
-			} else {
-				if (!incclass(pc->cc, c))
-					return 0;
-			}
-			break;
-		case I_NCCLASS:
-			s += chartorune(&c, s);
-			if (c == 0)
-				return 0;
-			if (g->icase) {
-				if (incclasscanon(pc->cc, canon(c)))
-					return 0;
-			} else {
-				if (incclass(pc->cc, c))
-					return 0;
-			}
-			break;
-		case I_REF:
-			n = g->m[pc->n].ep - g->m[pc->n].sp;
-			if (g->icase) {
-				if (strncmpcanon(s, g->m[pc->n].sp, n))
-					return 0;
-			} else {
-				if (strncmp(s, g->m[pc->n].sp, n))
-					return 0;
-			}
-			if (n > 0)
-				s += n;
-			break;
-		case I_BOL:
-			if (s == g->bol && !g->notbol)
+				}
+				spawn(&ready[nready++], pc->y, sp, sub);
+				pc = pc->x;
+				continue;
+
+			case I_PLA:
+				if (!match(pc->x, sp, bol, flags, sub))
+					goto dead;
+				pc = pc->y;
+				continue;
+			case I_NLA:
+				memcpy(scrap, sub, sizeof scrap);
+				if (match(pc->x, sp, bol, flags, scrap))
+					goto dead;
+				pc = pc->y;
+				continue;
+
+			case I_ANYNL:
+				sp += chartorune(&c, sp);
+				if (c == 0)
+					goto dead;
 				break;
-			if (g->newline)
-				if (s > g->bol && isnewline(s[-1]))
-					break;
-			return 0;
-		case I_EOL:
-			if (*s == 0)
+			case I_ANY:
+				sp += chartorune(&c, sp);
+				if (c == 0)
+					goto dead;
+				if (isnewline(c))
+					goto dead;
 				break;
-			if (g->newline)
-				if (isnewline(*s))
+			case I_CHAR:
+				sp += chartorune(&c, sp);
+				if (c == 0)
+					goto dead;
+				if (flags & REG_ICASE)
+					c = canon(c);
+				if (c != pc->c)
+					goto dead;
+				break;
+			case I_CCLASS:
+				sp += chartorune(&c, sp);
+				if (c == 0)
+					goto dead;
+				if (flags & REG_ICASE) {
+					if (!incclasscanon(pc->cc, canon(c)))
+						goto dead;
+				} else {
+					if (!incclass(pc->cc, c))
+						goto dead;
+				}
+				break;
+			case I_NCCLASS:
+				sp += chartorune(&c, sp);
+				if (c == 0)
+					goto dead;
+				if (flags & REG_ICASE) {
+					if (incclasscanon(pc->cc, canon(c)))
+						goto dead;
+				} else {
+					if (incclass(pc->cc, c))
+						goto dead;
+				}
+				break;
+			case I_REF:
+				i = sub[pc->n].ep - sub[pc->n].sp;
+				if (flags & REG_ICASE) {
+					if (strncmpcanon(sp, sub[pc->n].sp, i))
+						goto dead;
+				} else {
+					if (strncmp(sp, sub[pc->n].sp, i))
+						goto dead;
+				}
+				if (i > 0)
+					sp += i;
+				break;
+
+			case I_BOL:
+				if (sp == bol && !(flags & REG_NOTBOL))
 					break;
-			return 0;
-		case I_WORD:
-			v = s > g->bol && iswordchar(s[-1]);
-			v ^= iswordchar(s[0]);
-			if (v)
+				if (flags & REG_NEWLINE)
+					if (sp > bol && isnewline(sp[-1]))
+						break;
+				goto dead;
+			case I_EOL:
+				if (*sp == 0)
+					break;
+				if (flags & REG_NEWLINE)
+					if (isnewline(*sp))
+						break;
+				goto dead;
+			case I_WORD:
+				i = sp > bol && iswordchar(sp[-1]);
+				i ^= iswordchar(sp[0]);
+				if (i)
+					break;
+				goto dead;
+			case I_NWORD:
+				i = sp > bol && iswordchar(sp[-1]);
+				i ^= iswordchar(sp[0]);
+				if (!i)
+					break;
+				goto dead;
+
+			case I_LPAR:
+				sub[pc->n].sp = sp;
 				break;
-			return 0;
-		case I_NWORD:
-			v = s > g->bol && iswordchar(s[-1]);
-			v ^= iswordchar(s[0]);
-			if (!v)
+			case I_RPAR:
+				sub[pc->n].ep = sp;
 				break;
-			return 0;
-		case I_LPAR:
-			p = g->m[pc->n].sp;
-			g->m[pc->n].sp = s;
-			if (match(g, pc + 1, s)) {
-				if (g->nla) g->m[pc->n].sp = p;
-				return 1;
+			default:
+				goto dead;
 			}
-			g->m[pc->n].sp = p;
-			return 0;
-		case I_RPAR:
-			p = g->m[pc->n].ep;
-			g->m[pc->n].ep = s;
-			if (match(g, pc + 1, s)) {
-				if (g->nla) g->m[pc->n].ep = p;
-				return 1;
-			}
-			g->m[pc->n].ep = p;
-			return 0;
-		default:
-			return 0;
+			pc = pc + 1;
 		}
-		pc = pc + 1;
+dead: ;
 	}
+	return 0;
 }
 
-int regexec(Reprog *prog, const char *s, int n, Resub *m, int eflags)
+int regexec(Reprog *prog, const char *sp, int n, Resub *m, int eflags)
 {
-	struct estate g;
-	Resub gm[10];
-	Rune c;
+	Resub gm[MAXSUB];
 	unsigned int i;
 
-	g.icase = prog->icase;
-	g.newline = prog->newline;
-	g.notbol = eflags & REG_NOTBOL;
-	g.bol = s;
-	g.nla = 0;
-	g.m = m ? m : gm;
-	for (i = 0; i < 10; ++i)
-		g.m[i].sp = g.m[i].ep = i <= prog->ncap ? s : NULL;
+	m = m ? m : gm;
 
-	do {
-		if (match(&g, prog->start, s))
-			return 0;
-		s += chartorune(&c, s);
-	} while (c);
+	for (i = 0; i < MAXSUB; ++i)
+		m[i].sp = m[i].ep = i <= prog->ncap ? sp : NULL;
 
-	return 1;
+	return !match(prog->start, sp, sp, prog->flags | eflags, m);
 }
 
 #ifdef TEST
@@ -1059,7 +1066,7 @@
 	const char *error;
 	const char *s;
 	Reprog *p;
-	Resub m[10];
+	Resub m[MAXSUB];
 	int i;
 
 	if (argc > 1) {
@@ -1072,8 +1079,8 @@
 		if (argc > 2) {
 			s = argv[2];
 			printf("ncap = %d\n", p->ncap);
-			if (!regexec(p, s, 10, m, 0)) {
-				for (i = 0; i < 10; ++i)
+			if (!regexec(p, s, MAXSUB, m, 0)) {
+				for (i = 0; i < MAXSUB; ++i)
 					if (m[i].sp) {
 						int n = m[i].ep - m[i].sp;
 						printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m[i].sp - s), (int)(m[i].ep - s), n, n, m[i].sp);
--- a/regex.h
+++ b/regex.h
@@ -19,13 +19,10 @@
 enum {
 	/* regcomp flags */
 	REG_ICASE = 1,
-	REG_NEWLINE = 2
-};
+	REG_NEWLINE = 2,
 
-enum {
 	/* regexec flags */
-	REG_NOTBOL = 1,
-	REG_NOTEOL = 2
+	REG_NOTBOL = 4,
 };
 
 #endif