ref: e0885b354c902a70245d43dd8d05c82dcef2f450
parent: a76d157bdaa9aec8de5ea5a362eed2f59139152d
author: Tor Andersson <tor.andersson@artifex.com>
date: Wed Jan 17 11:04:05 EST 2024
regex: Dynamically allocate character class buffer. Use a huge buffer for compilation, copy into exact size for program. Merge overlapping and adjoining character class ranges. This reduces the number of classes needed for badly constructed character classes like [ABCDEFGHIJKLMNOPQRSTUVWXYZ].
--- a/regexp.c
+++ b/regexp.c
@@ -24,7 +24,7 @@
#define REG_MAXSPAN 64
#endif
#ifndef REG_MAXCLASS
-#define REG_MAXCLASS 16
+#define REG_MAXCLASS 128
#endif
typedef struct Reclass Reclass;
@@ -39,9 +39,9 @@
struct Reprog {
Reinst *start, *end;
+ Reclass *cclass;
int flags;
int nsub;
- Reclass cclass[REG_MAXCLASS];
};
struct cstate {
@@ -60,6 +60,8 @@
const char *error;
jmp_buf kaboom;
+
+ Reclass cclass[REG_MAXCLASS];
};
static void die(struct cstate *g, const char *message)
@@ -207,20 +209,50 @@
static void newcclass(struct cstate *g)
{
- if (g->ncclass >= nelem(g->prog->cclass))
+ if (g->ncclass >= REG_MAXCLASS)
die(g, "too many character classes");
- g->yycc = g->prog->cclass + g->ncclass++;
+ g->yycc = g->cclass + g->ncclass++;
g->yycc->end = g->yycc->spans;
}
static void addrange(struct cstate *g, Rune a, Rune b)
{
+ Reclass *cc = g->yycc;
+ Rune *p;
+
if (a > b)
die(g, "invalid character class range");
- if (g->yycc->end + 2 >= g->yycc->spans + nelem(g->yycc->spans))
+
+ /* extend existing ranges if they overlap */
+ for (p = cc->spans; p < cc->end; p += 2) {
+ /* completely inside old range */
+ if (a >= p[0] && b <= p[1])
+ return;
+
+ /* completely swallows old range */
+ if (a < p[0] && b >= p[1]) {
+ p[0] = a;
+ p[1] = b;
+ return;
+ }
+
+ /* extend at start */
+ if (b >= p[0] - 1 && b <= p[1] && a < p[0]) {
+ p[0] = a;
+ return;
+ }
+
+ /* extend at end */
+ if (a >= p[0] && a <= p[1] + 1 && b > p[1]) {
+ p[1] = b;
+ return;
+ }
+ }
+
+ if (cc->end + 2 >= cc->spans + nelem(cc->spans))
die(g, "too many character class ranges");
- *g->yycc->end++ = a;
- *g->yycc->end++ = b;
+ *cc->end++ = a;
+ *cc->end++ = b;
}
static void addranges_d(struct cstate *g)
@@ -422,7 +454,7 @@
unsigned char type;
unsigned char ng, m, n;
Rune c;
- Reclass *cc;
+ int cc;
Renode *x;
Renode *y;
};
@@ -431,7 +463,7 @@
{
Renode *node = g->pend++;
node->type = type;
- node->cc = NULL;
+ node->cc = -1;
node->c = 0;
node->ng = 0;
node->m = 0;
@@ -493,13 +525,13 @@
}
if (g->lookahead == L_CCLASS) {
atom = newnode(g, P_CCLASS);
- atom->cc = g->yycc;
+ atom->cc = g->yycc - g->cclass;
next(g);
return atom;
}
if (g->lookahead == L_NCCLASS) {
atom = newnode(g, P_NCCLASS);
- atom->cc = g->yycc;
+ atom->cc = g->yycc - g->cclass;
next(g);
return atom;
}
@@ -761,11 +793,11 @@
break;
case P_CCLASS:
inst = emit(prog, I_CCLASS);
- inst->cc = node->cc;
+ inst->cc = prog->cclass + node->cc;
break;
case P_NCCLASS:
inst = emit(prog, I_NCCLASS);
- inst->cc = node->cc;
+ inst->cc = prog->cclass + node->cc;
break;
case P_REF:
inst = emit(prog, I_REF);
@@ -775,16 +807,17 @@
}
#ifdef TEST
-static void dumpnode(Renode *node)
+static void dumpnode(struct cstate *g, Renode *node)
{
Rune *p;
+ Reclass *cc;
if (!node) { printf("Empty"); return; }
switch (node->type) {
- case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
- case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
+ case P_CAT: printf("Cat("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break;
+ case P_ALT: printf("Alt("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break;
case P_REP:
printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
- dumpnode(node->x);
+ dumpnode(g, node->x);
printf(")");
break;
case P_BOL: printf("Bol"); break;
@@ -791,19 +824,21 @@
case P_EOL: printf("Eol"); break;
case P_WORD: printf("Word"); break;
case P_NWORD: printf("NotWord"); break;
- case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
- case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
- case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
+ case P_PAR: printf("Par(%d,", node->n); dumpnode(g, node->x); printf(")"); break;
+ case P_PLA: printf("PLA("); dumpnode(g, node->x); printf(")"); break;
+ case P_NLA: printf("NLA("); dumpnode(g, node->x); printf(")"); break;
case P_ANY: printf("Any"); break;
case P_CHAR: printf("Char(%c)", node->c); break;
case P_CCLASS:
printf("Class(");
- for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
+ cc = g->cclass + node->cc;
+ for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
printf(")");
break;
case P_NCCLASS:
printf("NotClass(");
- for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
+ cc = g->cclass + node->cc;
+ for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
printf(")");
break;
case P_REF: printf("Ref(%d)", node->n); break;
@@ -868,7 +903,11 @@
if (setjmp(g.kaboom)) {
if (errorp) *errorp = g.error;
alloc(ctx, g.pstart, 0);
- alloc(ctx, g.prog, 0);
+ if (g.prog) {
+ alloc(ctx, g.prog->cclass, 0);
+ alloc(ctx, g.prog->start, 0);
+ alloc(ctx, g.prog, 0);
+ }
return NULL;
}
@@ -875,6 +914,9 @@
g.prog = alloc(ctx, NULL, sizeof (Reprog));
if (!g.prog)
die(&g, "cannot allocate regular expression");
+ g.prog->start = NULL;
+ g.prog->cclass = NULL;
+
n = strlen(pattern) * 2;
if (n > REG_MAXPROG)
die(&g, "program too large");
@@ -900,7 +942,7 @@
die(&g, "syntax error");
#ifdef TEST
- dumpnode(node);
+ dumpnode(&g, node);
putchar('\n');
#endif
@@ -913,6 +955,15 @@
if (!g.prog->start)
die(&g, "cannot allocate regular expression instruction list");
+ if (g.ncclass > 0) {
+ g.prog->cclass = alloc(ctx, NULL, g.ncclass * sizeof (Reclass));
+ if (!g.prog->cclass)
+ die(&g, "cannot allocate regular expression character class list");
+ memcpy(g.prog->cclass, g.cclass, g.ncclass * sizeof (Reclass));
+ for (i = 0; i < g.ncclass; ++i)
+ g.prog->cclass[i].end = g.prog->cclass[i].spans + (g.cclass[i].end - g.cclass[i].spans);
+ }
+
split = emit(g.prog, I_SPLIT);
split->x = split + 3;
split->y = split + 1;
@@ -937,6 +988,8 @@
void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog)
{
if (prog) {
+ if (prog->cclass)
+ alloc(ctx, prog->cclass, 0);
alloc(ctx, prog->start, 0);
alloc(ctx, prog, 0);
}
--- a/regexp.h
+++ b/regexp.h
@@ -32,7 +32,7 @@
* code and the regexp.c compilation unit use the same value!
*/
#ifndef REG_MAXSUB
-#define REG_MAXSUB 10
+#define REG_MAXSUB 16
#endif
struct Resub {