ref: 72958419b4fb70531051e05e2ca56ecccc524b6b
dir: /lib/regex/interp.myr/
use std use "types" pkg regex = /* regex execution */ const exec : (re : regex#, str : byte[:] -> std.option(byte[:][:])) const search : (re : regex#, str : byte[:] -> std.option(byte[:][:])) /* regex execution returning indexes */ const iexec : (re : regex#, str : byte[:] -> std.option((std.size, std.size)[:])) const isearch : (re : regex#, str : byte[:] -> std.option((std.size, std.size)[:])) /* substitution */ const sub : (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:])) const sbsub : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool) const suball : (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:]) const sbsuball : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void) const matchfree : (pat : byte[:][:] -> void) ;; /* Ugly: for performance. std.option() should be used instead when unions get faster. */ const Zthr = (0 : rethread#) const Maxfree = 128 const matchfree = {m std.slfree(m) } const exec = {re, str -> getmatches(re, run(re, str, 0, true)) } const iexec = {re, str -> getidxmatches(re, run(re, str, 0, true)) } const search = {re, str var thr = Zthr for var i = 0; i < str.len; i++ thr = run(re, str[i:], 0, false) if thr != Zthr break ;; ;; -> getmatches(re, thr) } const isearch = {re, str var thr = Zthr for var i = 0; i < str.len; i++ thr = run(re, str[i:], 0, false) if thr != Zthr break ;; ;; -> getidxmatches(re, thr) } const sub = {re, str, subst var sb sb = std.mksb() if sbsub(sb, re, str, subst) -> `std.Some std.sbfin(sb) ;; -> `std.None } const sbsub = {sb, re, str, subst std.assert(re.nmatch == subst.len + 1, "substitution length does not match capture count") -> dosubst(sb, re, run(re, str, 0, true), str, subst) } const suball = {re, str, subst var sb sb = std.mksb() sbsuball(sb, re, str, subst) -> std.sbfin(sb) } const sbsuball = {sb, re, str, subst var thr, len, i std.assert(re.nmatch == subst.len + 1, "substitution length does not match capture count") i = 0 while i < str.len thr = run(re, str[i:], 0, false) if thr == Zthr std.sbputb(sb, str[i]) i++ else len = thr.mgroup[0][1] dosubst(sb, re, thr, str[i:len + i], subst) i += len ;; cleanup(re, thr) ;; } const dosubst = {sb, re, thr, str, subst var off if thr == Zthr -> false ;; off = 0 for var i = 1; i < re.nmatch; i++ if thr.mgroup[i][0] != -1 && thr.mgroup[i][1] != -1 std.sbputs(sb, str[off:thr.mgroup[i][0]]) std.sbputs(sb, subst[i - 1]) off = thr.mgroup[i][1] ;; ;; std.sbputs(sb, str[off:]) -> true } const cleanup = {re, result lfree(re.runq) lfree(re.expired) lfree(re.free) std.free(result) re.runq = Zthr re.expired = Zthr re.free = Zthr re.nfree = 0 re.nexttid = 0 re.nthr = 0 } const lfree = {thr for var next = thr; thr != Zthr; thr = next next = thr.next std.free(thr) ;; } const getmatches = {re, thr var ret, i if thr == Zthr -> `std.None ;; i = 0 ret = std.slalloc(re.nmatch) for [lo, hi] : thr.mgroup[:re.nmatch] if lo != -1 && hi != -1 ret[i] = re.str[lo : hi] ;; i++ ;; cleanup(re, thr) -> `std.Some ret } const getidxmatches = {re, thr var ret if thr == Zthr -> `std.None ;; ret = std.slalloc(re.nmatch) for var i = 0; i < re.nmatch; i++ ret[i] = (thr.mgroup[i][0], thr.mgroup[i][1]) ;; cleanup(re, thr) -> `std.Some ret } /* returns a matching thread, or Zthr if no threads matched */ const run = {re, str, idx, wholestr var bestmatch var consumed var states var thr var ip re.str = str re.strp = 0 bestmatch = Zthr states = std.mkbs() re.runq = mkthread(re, 0) if re.debug /* The last run could have left things here, since we need this info after the run */ for bs : re.traces std.bsfree(bs) ;; std.slfree(re.traces) re.traces = [][:] std.slpush(&re.traces, std.mkbs()) ;; for var i = 0; i < re.nmatch; i++ re.runq.mgroup[i][0] = -1 re.runq.mgroup[i][1] = -1 ;; while re.nthr > 0 while re.runq != Zthr if re.trace std.put("switch\n") ;; /* set up the next thread */ thr = re.runq re.runq = thr.next ip = thr.ip consumed = step(re, thr, -1) while !consumed consumed = step(re, thr, ip) ;; if std.bshas(states, thr.ip) die(re, thr) ;; if thr.dead thrfree(re, thr) elif thr.matched if bestmatch != Zthr thrfree(re, bestmatch) ;; if re.strp == re.str.len bestmatch = thr goto done elif !wholestr bestmatch = thr else thrfree(re, thr) ;; elif !thr.matched std.bsput(states, thr.ip) if re.expired == Zthr re.expired = thr ;; if re.expiredtail != Zthr re.expiredtail.next = thr ;; re.expiredtail = thr thr.next = Zthr ;; ;; std.bsclear(states) re.runq = re.expired re.expired = Zthr re.expiredtail = Zthr re.strp++ ;; :done std.bsfree(states) -> bestmatch } /* Steps forward one instruction. Returns true if a byte of input was consumed, false otherwise. */ const step = {re, thr, curip var str, nthr, inst str = re.str inst = re.code[thr.ip] if re.trace itrace(re, thr, re.prog[thr.ip]) ;; if re.debug std.bsput(re.traces[thr.tid], thr.ip) ;; match inst & 0xf /* Char matching. Consume exactly one byte from the string. */ | OpRange: var lo = (inst >> 4 : byte) var hi = (inst >> 16 : byte) if !within(re, str) || lo > str[re.strp] || hi < str[re.strp] die(re, thr) else thr.ip++ ;; | OpByte: var b = (inst >> 4 : byte) if !within(re, str) die(re, thr) elif b != str[re.strp] die(re, thr) else thr.ip++ ;; | OpFork: var lip = ((inst >> 4) & 0x3fffffff : std.size) var rip = ((inst >> 34) & 0x3fffffff : std.size) if rip != curip nthr = mkthread(re, rip) nthr.next = re.runq nthr.mgroup = thr.mgroup re.runq = nthr ;; if re.debug std.slpush(&re.traces, std.bsdup(re.traces[thr.tid])) ;; thr.ip = lip -> false /* Non-consuming. All of these return false, and expect step to be called again until exactly one byte is consumed from the string. */ | OpJmp: var ip = (inst >> 4 : std.size) thr.ip = ip -> false | OpMatch: var id = (inst >> 4 : std.size) re.lastthr = thr.tid finish(re, thr) -> true | OpLbra: var m = (inst >> 4 : std.size) thr.mgroup[m][0] = re.strp thr.ip++ -> false | OpRbra: var m = (inst >> 4 : std.size) thr.mgroup[m][1] = re.strp thr.ip++ -> false | OpBol: if re.strp == 0 || str[re.strp - 1] == ('\n' : byte) thr.ip++ -> false else die(re, thr) ;; | OpEol: if re.strp == str.len || str[re.strp] == ('\n' : byte) thr.ip++ -> false else die(re, thr) ;; /* check for word characters */ | OpBow: if iswordchar(str[re.strp:]) && (re.strp == 0 || !iswordchar(prevchar(str, re.strp))) thr.ip++ -> false else die(re, thr) ;; | OpEow: if re.strp == str.len && iswordchar(prevchar(str, re.strp)) thr.ip++ -> false elif re.strp > 0 && !iswordchar(str[re.strp:]) && iswordchar(prevchar(str, re.strp)) thr.ip++ -> false else die(re, thr) ;; | _: std.die("corrupt regex bytecode") ;; -> true } const die = {re, thr /* we can have die called on a thread multiple times, eg, if it has a bad range *and* end in a state that another thread is in. We should only decrement the number of threads for that once. */ if !thr.dead re.nthr-- ;; re.lastip = thr.ip re.lastthr = thr.tid thr.dead = true } const finish = {re, thr thr.matched = true re.nthr-- } const mkthread = {re, ip var thr : rethread# if re.free != Zthr thr = re.free re.free = thr.next re.nfree-- else thr = std.alloc() ;; thr.next = Zthr thr.ip = ip thr.tid = re.nexttid++ thr.dead = false thr.matched = false re.nthr++ -> thr } const thrfree = {re, thr if re.nfree >= Maxfree std.free(thr) else thr.next = re.free re.free = thr re.nfree++ ;; } const within = {re, str -> re.strp < str.len } const itrace = {re, thr, inst match inst | `Ibyte b: std.put("\t{}.{}:\tByte ({})\n", thr.tid, thr.ip, thr.tid, thr.ip, b) | `Irange (lo, hi): std.put("\t{}.{}:\tRange {}, {}\n", thr.tid, thr.ip, lo, hi) | `Ilbra m: std.put("\t{}.{}:\tLbra {}\n", thr.tid, thr.ip, m) | `Irbra m: std.put("\t{}.{}:\tRbra {}\n", thr.tid, thr.ip, m) /* anchors */ | `Ibol: std.put("\t{}.{}:\tBol\n", thr.tid, thr.ip) | `Ieol: std.put("\t{}.{}:\tEol\n", thr.tid, thr.ip) | `Ibow: std.put("\t{}.{}:\tBow\n", thr.tid, thr.ip) | `Ieow: std.put("\t{}.{}:\tEow\n", thr.tid, thr.ip) /* control flow */ | `Ifork (l, r): std.put("\t{}.{}:\tFork {}, {}\n", thr.tid, thr.ip, l, r) | `Ijmp ip: std.put("\t{}.{}:\tJmp {}\n", thr.tid, thr.ip, ip) | `Imatch m: std.put("\t{}.{}:\tMatch {}\n", thr.tid, thr.ip, m) ;; } /* must be called with i >= 1 */ const prevchar = {s, i std.assert(i != 0, "prevchar must be called with i >= 1\n") i-- while i != 0 && s[i] >= 0x80 i-- ;; -> s[i:] } const iswordchar = {s var c c = std.decode(s) -> std.isalpha(c) || std.isdigit(c) || c == '_' }