shithub: libmujs

Download patch

ref: bfe569921d63fdbb29fe06c8e19ac402e009b960
parent: 01d85a49949e513d82882239a028ca2ba0790b36
author: Tor Andersson <tor@ccxvii.net>
date: Wed Feb 26 15:22:53 EST 2014

Improve Resub API.

Hold the subexpression count and array of matches inside a struct.

--- a/jsregexp.c
+++ b/jsregexp.c
@@ -29,9 +29,9 @@
 
 void js_RegExp_prototype_exec(js_State *J, js_Regexp *re, const char *text)
 {
-	Resub m[REG_MAXSUB];
 	unsigned int i;
 	int opts;
+	Resub m;
 
 	opts = 0;
 	if (re->flags & JS_REGEXP_G) {
@@ -46,14 +46,14 @@
 		}
 	}
 
-	if (!js_regexec(re->prog, text, nelem(m), m, opts)) {
+	if (!js_regexec(re->prog, text, &m, opts)) {
 		js_newarray(J);
-		for (i = 0; i < nelem(m) && m[i].sp; ++i) {
-			js_pushlstring(J, m[i].sp, m[i].ep - m[i].sp);
+		for (i = 0; i < m.nsub; ++i) { /* one array element per subexpression slot reported in m.nsub */
+			js_pushlstring(J, m.sub[i].sp, m.sub[i].ep - m.sub[i].sp); /* NOTE(review): if js_regexec leaves sp/ep NULL for a group that took no part in the match, this passes NULL with length 0 - confirm js_pushlstring accepts that */
 			js_setindex(J, -2, i);
 		}
 		if (re->flags & JS_REGEXP_G)
-			re->last = re->last + (m[0].ep - text);
+			re->last = re->last + (m.sub[0].ep - text); /* advance by the offset of the end of sub[0] within text */
 		return;
 	}
 
@@ -67,8 +67,8 @@
 {
 	js_Regexp *re;
 	const char *text;
-	Resub m[REG_MAXSUB];
 	int opts;
+	Resub m;
 
 	re = js_toregexp(J, 0);
 	text = js_tostring(J, 1);
@@ -86,9 +86,9 @@
 		}
 	}
 
-	if (!js_regexec(re->prog, text, nelem(m), m, opts)) {
+	if (!js_regexec(re->prog, text, &m, opts)) {
 		if (re->flags & JS_REGEXP_G)
-			re->last = re->last + (m[0].ep - text);
+			re->last = re->last + (m.sub[0].ep - text);
 		js_pushboolean(J, 1);
 		return;
 	}
--- a/jsstring.c
+++ b/jsstring.c
@@ -307,10 +307,10 @@
 static void Sp_match(js_State *J, unsigned int argc)
 {
 	js_Regexp *re;
-	Resub m[REG_MAXSUB];
 	const char *text;
 	unsigned int len;
 	const char *a, *b, *c, *e;
+	Resub m;
 
 	text = js_tostring(J, 0);
 
@@ -335,11 +335,11 @@
 	a = text;
 	e = text + strlen(text);
 	while (a <= e) {
-		if (js_regexec(re->prog, a, nelem(m), m, a > text ? REG_NOTBOL : 0))
+		if (js_regexec(re->prog, a, &m, a > text ? REG_NOTBOL : 0))
 			break;
 
-		b = m[0].sp;
-		c = m[0].ep;
+		b = m.sub[0].sp;
+		c = m.sub[0].ep;
 
 		js_pushlstring(J, b, c - b);
 		js_setindex(J, -2, len++);
@@ -353,8 +353,8 @@
 static void Sp_search(js_State *J, unsigned int argc)
 {
 	js_Regexp *re;
-	Resub m[REG_MAXSUB];
 	const char *text;
+	Resub m;
 
 	text = js_tostring(J, 0);
 
@@ -367,8 +367,8 @@
 
 	re = js_toregexp(J, -1);
 
-	if (!js_regexec(re->prog, text, nelem(m), m, 0))
-		js_pushnumber(J, js_utfptrtoidx(text, m[0].sp));
+	if (!js_regexec(re->prog, text, &m, 0))
+		js_pushnumber(J, js_utfptrtoidx(text, m.sub[0].sp));
 	else
 		js_pushnumber(J, -1);
 }
@@ -376,15 +376,15 @@
 static void Sp_replace_regexp(js_State *J, unsigned int argc)
 {
 	js_Regexp *re;
-	Resub m[REG_MAXSUB];
 	const char *source, *s, *r;
 	js_Buffer *sb = NULL;
-	int n, x;
+	unsigned int n, x;
+	Resub m;
 
 	source = js_tostring(J, 0);
 	re = js_toregexp(J, 1);
 
-	if (js_regexec(re->prog, source, nelem(m), m, 0)) {
+	if (js_regexec(re->prog, source, &m, 0)) {
 		js_copy(J, 0);
 		return;
 	}
@@ -392,14 +392,14 @@
 	re->last = 0;
 
 loop:
-	s = m[0].sp;
-	n = m[0].ep - m[0].sp;
+	s = m.sub[0].sp; /* start of sub[0], used below as the position of the match in source */
+	n = m.sub[0].ep - m.sub[0].sp; /* length of sub[0] in bytes; 0 means an empty match */
 
 	if (js_iscallable(J, 2)) {
 		js_copy(J, 2);
 		js_pushglobal(J);
-		for (x = 0; m[x].sp; ++x) /* arg 0..x: substring and subexps that matched */
-			js_pushlstring(J, m[x].sp, m[x].ep - m[x].sp);
+		for (x = 0; x < m.nsub && m.sub[x].sp; ++x) /* arg 0..x: substring and subexps that matched; bounded by nsub so the scan cannot read past sub[] when every slot is set */
+			js_pushlstring(J, m.sub[x].sp, m.sub[x].ep - m.sub[x].sp);
 		js_pushnumber(J, s - source); /* arg x+2: offset within search string */
 		js_copy(J, 0); /* arg x+3: search string */
 		js_call(J, 2 + x);
@@ -425,8 +425,8 @@
 					if (r[1] >= '0' && r[1] <= '9')
 						x = x * 10 + *(++r) - '0';
 					// TODO: use prog->nsub somehow
-					if (x > 0 && x < REG_MAXSUB && m[x].sp) {
-						sb_putm(&sb, m[x].sp, m[x].ep);
+					if (x > 0 && x < m.nsub) {
+						sb_putm(&sb, m.sub[x].sp, m.sub[x].ep);
 					} else {
 						sb_putc(&sb, '$');
 						if (x > 10) {
@@ -450,7 +450,7 @@
 	}
 
 	if (re->flags & JS_REGEXP_G) {
-		source = m[0].ep;
+		source = m.sub[0].ep;
 		if (n == 0) {
 			if (*source)
 				sb_putc(&sb, *source++);
@@ -457,7 +457,7 @@
 			else
 				goto end;
 		}
-		if (!js_regexec(re->prog, source, nelem(m), m, REG_NOTBOL))
+		if (!js_regexec(re->prog, source, &m, REG_NOTBOL))
 			goto loop;
 	}
 
@@ -544,10 +544,10 @@
 static void Sp_split_regexp(js_State *J, unsigned int argc)
 {
 	js_Regexp *re;
-	Resub m[REG_MAXSUB];
 	const char *text;
 	unsigned int limit, len, k;
 	const char *p, *a, *b, *c, *e;
+	Resub m;
 
 	text = js_tostring(J, 0);
 	re = js_toregexp(J, 1);
@@ -560,7 +560,7 @@
 
 	/* splitting the empty string */
 	if (e == 0) {
-		if (js_regexec(re->prog, text, nelem(m), m, 0)) {
+		if (js_regexec(re->prog, text, &m, 0)) {
 			if (len == limit) return;
 			js_pushliteral(J, "");
 			js_setindex(J, -2, 0);
@@ -570,11 +570,11 @@
 
 	p = a = text;
 	while (a < e) {
-		if (js_regexec(re->prog, a, nelem(m), m, a > text ? REG_NOTBOL : 0))
+		if (js_regexec(re->prog, a, &m, a > text ? REG_NOTBOL : 0))
 			break; /* no match */
 
-		b = m[0].sp;
-		c = m[0].ep;
+		b = m.sub[0].sp;
+		c = m.sub[0].ep;
 
 		/* empty string at end of last match */
 		if (b == p) {
@@ -586,9 +586,9 @@
 		js_pushlstring(J, p, b - p);
 		js_setindex(J, -2, len++);
 
-		for (k = 1; k < nelem(m) && m[k].sp; ++k) {
+		for (k = 1; k < m.nsub; ++k) {
 			if (len == limit) return;
-			js_pushlstring(J, m[k].sp, m[k].ep - m[k].sp);
+			js_pushlstring(J, m.sub[k].sp, m.sub[k].ep - m.sub[k].sp);
 			js_setindex(J, -2, len++);
 		}
 
--- a/regex.c
+++ b/regex.c
@@ -30,7 +30,7 @@
 struct Reprog {
 	Reinst *start, *end;
 	int flags;
-	unsigned int ncap;
+	unsigned int nsub; /* subexpression slot count taken from the parser; counting starts at 1, so slot 0 is included */
 	Reclass cclass[16];
 };
 
@@ -40,8 +40,8 @@
 
 	const char *source;
 	unsigned int ncclass;
-	unsigned int ncap;
-	Renode *cap[MAXSUB];
+	unsigned int nsub;
+	Renode *sub[MAXSUB];
 
 	int lookahead;
 	Rune yychar;
@@ -77,7 +77,7 @@
 	L_NLA,		/* "(?!" negative lookahead */
 	L_WORD,		/* "\b" word boundary */
 	L_NWORD,	/* "\B" non-word boundary */
-	L_REF,		/* "\0" back-reference */
+	L_REF,		/* "\1" back-reference */
 	L_COUNT,	/* {M,N} */
 };
 
@@ -459,10 +459,10 @@
 	}
 	if (g->lookahead == L_REF) {
 		atom = newnode(g, P_REF);
-		if (g->yychar == 0 || g->yychar > g->ncap || !g->cap[g->yychar])
+		if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
 			die(g, "invalid back-reference");
 		atom->n = g->yychar;
-		atom->x = g->cap[g->yychar];
+		atom->x = g->sub[g->yychar];
 		next(g);
 		return atom;
 	}
@@ -470,12 +470,11 @@
 		return newnode(g, P_ANY);
 	if (accept(g, '(')) {
 		atom = newnode(g, P_PAR);
-		if (++g->ncap == MAXSUB)
+		if (g->nsub == MAXSUB)
 			die(g, "too many captures");
-		atom->n = g->ncap;
-		g->cap[atom->n] = NULL;
+		atom->n = g->nsub++;
 		atom->x = parsealt(g);
-		g->cap[atom->n] = atom;
+		g->sub[atom->n] = atom;
 		if (!accept(g, ')'))
 			die(g, "unmatched '('");
 		return atom;
@@ -805,9 +804,9 @@
 
 	g.source = pattern;
 	g.ncclass = 0;
-	g.ncap = 0;
+	g.nsub = 1;
 	for (i = 0; i < MAXSUB; ++i)
-		g.cap[i] = 0;
+		g.sub[i] = 0;
 
 	g.prog->flags = cflags;
 
@@ -818,7 +817,7 @@
 	if (g.lookahead != 0)
 		die(&g, "syntax error");
 
-	g.prog->ncap = g.ncap;
+	g.prog->nsub = g.nsub;
 	g.prog->start = g.prog->end = malloc((count(node) + 6) * sizeof (Reinst));
 
 	split = emit(g.prog, I_SPLIT);
@@ -905,7 +904,7 @@
 struct Rethread {
 	Reinst *pc;
 	const char *sp;
-	Resub sub[MAXSUB];
+	Resub sub; /* capture state saved with the thread; spawn() copies it in by value */
 };
 
 static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
@@ -912,14 +911,14 @@
 {
 	t->pc = pc;
 	t->sp = sp;
-	memcpy(t->sub, sub, sizeof t->sub);
+	memcpy(&t->sub, sub, sizeof t->sub);
 }
 
 static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
 {
 	Rethread ready[MAXTHREAD];
-	Resub scrap[MAXSUB];
-	Resub sub[MAXSUB];
+	Resub scratch;
+	Resub sub;
 	Rune c;
 	unsigned int nready;
 	int i;
@@ -933,13 +932,13 @@
 		--nready;
 		pc = ready[nready].pc;
 		sp = ready[nready].sp;
-		memcpy(sub, ready[nready].sub, sizeof sub);
+		memcpy(&sub, &ready[nready].sub, sizeof sub);
 		for (;;) {
 			switch (pc->opcode) {
 			case I_END:
 				for (i = 0; i < MAXSUB; ++i) {
-					out[i].sp = sub[i].sp;
-					out[i].ep = sub[i].ep;
+					out->sub[i].sp = sub.sub[i].sp;
+					out->sub[i].ep = sub.sub[i].ep;
 				}
 				return 1;
 			case I_JUMP:
@@ -950,18 +949,18 @@
 					fprintf(stderr, "regexec: backtrack overflow!\n");
 					return 0;
 				}
-				spawn(&ready[nready++], pc->y, sp, sub);
+				spawn(&ready[nready++], pc->y, sp, &sub);
 				pc = pc->x;
 				continue;
 
 			case I_PLA:
-				if (!match(pc->x, sp, bol, flags, sub))
+				if (!match(pc->x, sp, bol, flags, &sub))
 					goto dead;
 				pc = pc->y;
 				continue;
 			case I_NLA:
-				memcpy(scrap, sub, sizeof scrap);
-				if (match(pc->x, sp, bol, flags, scrap))
+				memcpy(&scratch, &sub, sizeof scratch);
+				if (match(pc->x, sp, bol, flags, &scratch))
 					goto dead;
 				pc = pc->y;
 				continue;
@@ -1012,12 +1011,12 @@
 				}
 				break;
 			case I_REF:
-				i = sub[pc->n].ep - sub[pc->n].sp;
+				i = sub.sub[pc->n].ep - sub.sub[pc->n].sp;
 				if (flags & REG_ICASE) {
-					if (strncmpcanon(sp, sub[pc->n].sp, i))
+					if (strncmpcanon(sp, sub.sub[pc->n].sp, i))
 						goto dead;
 				} else {
-					if (strncmp(sp, sub[pc->n].sp, i))
+					if (strncmp(sp, sub.sub[pc->n].sp, i))
 						goto dead;
 				}
 				if (i > 0)
@@ -1052,10 +1051,10 @@
 				goto dead;
 
 			case I_LPAR:
-				sub[pc->n].sp = sp;
+				sub.sub[pc->n].sp = sp;
 				break;
 			case I_RPAR:
-				sub[pc->n].ep = sp;
+				sub.sub[pc->n].ep = sp;
 				break;
 			default:
 				goto dead;
@@ -1067,17 +1066,19 @@
 	return 0;
 }
 
-int regexec(Reprog *prog, const char *sp, int n, Resub *m, int eflags)
+int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
 {
-	Resub gm[MAXSUB];
-	unsigned int i;
+	Resub scratch; /* stand-in result buffer for callers that pass sub == NULL */
+	int i;
 
-	m = m ? m : gm;
+	if (!sub)
+		sub = &scratch; /* caller does not want the captures; write them to the local buffer */
 
+	sub->nsub = prog->nsub; /* report the compiled program's subexpression count to the caller */
 	for (i = 0; i < MAXSUB; ++i)
-		m[i].sp = m[i].ep = i <= prog->ncap ? sp : NULL;
+		sub->sub[i].sp = sub->sub[i].ep = NULL; /* every span starts out unset */
 
-	return !match(prog->start, sp, sp, prog->flags | eflags, m);
+	return !match(prog->start, sp, sp, prog->flags | eflags, sub); /* match() returns 1 on success; inverted so that 0 means "matched" */
 }
 
 #ifdef TEST
@@ -1086,8 +1087,8 @@
 	const char *error;
 	const char *s;
 	Reprog *p;
-	Resub m[MAXSUB];
-	int i;
+	Resub m;
+	unsigned int i;
 
 	if (argc > 1) {
 		p = regcomp(argv[1], 0, &error);
@@ -1098,13 +1099,13 @@
 
 		if (argc > 2) {
 			s = argv[2];
-			printf("ncap = %d\n", p->ncap);
-			if (!regexec(p, s, MAXSUB, m, 0)) {
-				for (i = 0; i < MAXSUB; ++i)
-					if (m[i].sp) {
-						int n = m[i].ep - m[i].sp;
-						printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m[i].sp - s), (int)(m[i].ep - s), n, n, m[i].sp);
-					}
+			printf("nsub = %u\n", p->nsub); /* nsub is unsigned int, so print it with %u */
+			if (!regexec(p, s, &m, 0)) {
+				for (i = 0; i < m.nsub; ++i)
+					if (m.sub[i].sp) { /* regexec resets spans to NULL; skip slots that were never filled in */
+						int n = m.sub[i].ep - m.sub[i].sp;
+						printf("match %u: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
+					}
 			} else {
 				printf("no match\n");
 			}
--- a/regex.h
+++ b/regex.h
@@ -7,13 +7,9 @@
 
 typedef struct Reprog Reprog;
 typedef struct Resub Resub;
-struct Resub {
-	const char *sp;
-	const char *ep;
-};
 
 Reprog *regcomp(const char *pattern, int cflags, const char **errorp);
-int regexec(Reprog *prog, const char *string, int nmatch, Resub *pmatch, int eflags);
+int regexec(Reprog *prog, const char *string, Resub *sub, int eflags);
 void regfree(Reprog *prog);
 
 enum {
@@ -26,6 +22,14 @@
 
 	/* limits */
 	REG_MAXSUB = 16
+};
+
+struct Resub {
+	unsigned int nsub; /* subexpression count; regexec copies it from the compiled program */
+	struct {
+		const char *sp; /* start of the span; regexec resets it to NULL before matching */
+		const char *ep; /* end of the span; regexec resets it to NULL before matching */
+	} sub[REG_MAXSUB]; /* one start/end pair per subexpression slot */
 };
 
 #endif