ref: 15392a525cae61a9fc06d40cd4846664512661b2
dir: /lib/regex/interp.myr/
use std
use "types"
pkg regex =
/* regex execution */
const exec : (re : regex#, str : byte[:] -> std.option(byte[:][:]))
const search : (re : regex#, str : byte[:] -> std.option(byte[:][:]))
/* regex execution returning indexes */
const iexec : (re : regex#, str : byte[:] -> std.option((std.size, std.size)[:]))
const isearch : (re : regex#, str : byte[:] -> std.option((std.size, std.size)[:]))
/* substitution */
const sub : (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:]))
const sbsub : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool)
const suball : (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:])
const sbsuball : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void)
const matchfree : (pat : byte[:][:] -> void)
;;
/* Ugly: for performance. std.option() should be used instead when unions get faster. */
const Zthr = (0 : rethread#)
const exec = {re, str
var thr, m
thr = run(re, str, 0, true)
m = getmatches(re, thr)
cleanup(re)
-> m
}
const iexec = {re, str
var thr, m
thr = run(re, str, 0, true)
m = getidxmatches(re, thr)
cleanup(re)
-> m
}
const search = {re, str
var thr
var m
m = `std.None
for var i = 0; i < str.len; i++
thr = run(re, str[i:], 0, false)
m = getmatches(re, thr)
match m
| `std.Some _: break
| `std.None: /* nothing */
;;
cleanup(re)
;;
-> m
}
const isearch = {re, str
var thr
var m
m = `std.None
for var i = 0; i < str.len; i++
thr = run(re, str[i:], 0, false)
m = getidxmatches(re, thr)
match m
| `std.Some _: break
| `std.None: /* nothing */
;;
cleanup(re)
;;
-> m
}
const sub = {re, str, subst
var sb
sb = std.mksb()
if !sbsub(sb, re, str, subst)
-> `std.None
else
-> `std.Some std.sbfin(sb)
;;
}
const sbsub = {sb, re, str, subst
var thr, m
/* we always have m[0] as the full match */
if re.nmatch != subst.len + 1
-> false
;;
thr = run(re, str, 0, true)
if thr == Zthr
m = false
else
m = dosubst(sb, re, thr, str, subst)
;;
cleanup(re)
-> m
}
const suball = {re, str, subst
var sb
sb = std.mksb()
sbsuball(sb, re, str, subst)
-> std.sbfin(sb)
}
const sbsuball = {sb, re, str, subst
var thr, len, s, i
/* we always have m[0] as the full match */
if re.nmatch != subst.len + 1
-> void
;;
i = 0
while i < str.len
thr = run(re, str[i:], 0, false)
if thr == Zthr
std.sbputb(sb, str[i])
i++
else
len = thr.mend[0]
s = str[i:len + i]
dosubst(sb, re, thr, s, subst)
i += len
;;
cleanup(re)
;;
}
const dosubst = {sb, re, thr, str, subst
var off
off = 0
for var i = 1; i < re.nmatch; i++
if thr.mstart[i] != -1 && thr.mend[i] != -1
std.sbputs(sb, str[off:thr.mstart[i]])
std.sbputs(sb, subst[i - 1])
off = thr.mend[i]
;;
;;
std.sbputs(sb, str[off:])
thrfree(re, thr)
-> true
}
const cleanup = {re
var thr, next
for thr = re.runq; thr != Zthr; thr = next
next = thr.next
thrfree(re, thr)
;;
for thr = re.expired; thr != Zthr; thr = next
next = thr.next
thrfree(re, thr)
;;
re.nexttid = 0
}
const matchfree = {m
std.slfree(m)
}
const getmatches = {re, thr
var ret
if thr == Zthr
-> `std.None
;;
ret = std.slalloc(re.nmatch)
for var i = 0; i < re.nmatch; i++
if thr.mstart[i] != -1 && thr.mend[i] != -1
ret[i] = re.str[thr.mstart[i]:thr.mend[i]]
else
ret[i] = [][:]
;;
;;
thrfree(re, thr)
-> `std.Some ret
}
const getidxmatches = {re, thr
var ret
if thr == Zthr
-> `std.None
;;
ret = std.slalloc(re.nmatch)
for var i = 0; i < re.nmatch; i++
if thr.mstart[i] != -1 && thr.mend[i] != -1
ret[i] = (thr.mstart[i], thr.mend[i])
else
ret[i] = (-1, -1)
;;
;;
thrfree(re, thr)
-> `std.Some ret
}
/* returns a matching thread, or Zthr if no threads matched */
const run = {re, str, idx, wholestr
var bestmatch
var consumed
var states
var thr
var ip
re.str = str
re.strp = 0
bestmatch = Zthr
states = std.mkbs()
re.runq = mkthread(re, 0)
if re.debug
/* The last run could have left things here, since we need this info after the run */
for bs in re.traces
std.bsfree(bs)
;;
std.slfree(re.traces)
re.traces = [][:]
std.slpush(&re.traces, std.mkbs())
;;
re.runq.mstart = std.slalloc(re.nmatch)
re.runq.mend = std.slalloc(re.nmatch)
for var i = 0; i < re.nmatch; i++
re.runq.mstart[i] = -1
re.runq.mend[i] = -1
;;
while re.nthr > 0
while re.runq != Zthr
/* set up the next thread */
thr = re.runq
re.runq = thr.next
trace(re, thr, "\nrunning tid={}, ip={}, s[{}]={}\n", \
thr.tid, thr.ip, re.strp, std.decode(re.str[re.strp:]))
ip = thr.ip
consumed = step(re, thr, -1)
while !consumed
consumed = step(re, thr, ip)
;;
if std.bshas(states, thr.ip)
die(re, thr, "there can be only one")
;;
if thr.dead
thrfree(re, thr)
elif thr.matched
trace(re, thr, "new bestmatch\n")
if bestmatch != Zthr
thrfree(re, bestmatch)
;;
if re.strp == re.str.len
bestmatch = thr
goto done
elif !wholestr
bestmatch = thr
;;
elif !thr.matched
std.bsput(states, thr.ip)
if re.expired == Zthr
re.expired = thr
;;
if re.expiredtail != Zthr
re.expiredtail.next = thr
;;
re.expiredtail = thr
thr.next = Zthr
;;
;;
std.bsclear(states)
trace(re, Zthr, "switch\n")
re.runq = re.expired
re.expired = Zthr
re.expiredtail = Zthr
re.strp++
;;
:done
std.bsfree(states)
-> bestmatch
}
/*
Steps forward one instruction. Returns true if a byte of input was
consumed, false otherwise.
*/
const step = {re, thr, curip
var str
var mstart
var mend
str = re.str
match re.prog[thr.ip]
/* Char matching. Consume exactly one byte from the string. */
| `Ibyte b:
trace(re, thr, "\t{}:\tByte {} ({})\n", thr.ip, b, (b : char))
if !within(re, str)
die(re, thr, "end of string")
elif b != str[re.strp]
die(re, thr, "not right char")
else
hit(re, thr)
thr.ip++
trace(re, thr, "\t\tmatched {} with {}\n", b, str[re.strp])
;;
| `Irange (start, end):
trace(re, thr, "\t{}:\tRange ({}, {}) /* {} - {} */\n", thr.ip, start, end, (start : char), (end : char))
if !within(re, str) || start > str[re.strp] || end < str[re.strp]
die(re, thr, "bad range")
else
hit(re, thr)
thr.ip++
;;
/*
Non-consuming. All of these return false, and expect step to be
called again until exactly one byte is consumed from the string.
*/
| `Ibol:
trace(re, thr, "\t{}:\tBol\n", thr.ip)
if re.strp == 0 || str[re.strp - 1] == ('\n' : byte)
hit(re, thr)
thr.ip++
-> false
else
die(re, thr, "not beginning of line")
;;
| `Ieol:
trace(re, thr, "\t{}:\tEol\n", thr.ip)
if re.strp == str.len || str[re.strp] == ('\n' : byte)
hit(re, thr)
thr.ip++
-> false
else
die(re, thr, "not end of line")
;;
/* check for word characters */
| `Ibow:
trace(re, thr, "\t{}:\tBow\n", thr.ip)
if iswordchar(str[re.strp:]) && (re.strp == 0 || !iswordchar(prevchar(str, re.strp)))
hit(re, thr)
thr.ip++
-> false
else
die(re, thr, "not beginning of word")
;;
| `Ieow:
trace(re, thr, "\t{}:\tEow\n", thr.ip)
if re.strp == str.len && iswordchar(prevchar(str, re.strp))
hit(re, thr)
thr.ip++
-> false
elif re.strp > 0 && !iswordchar(str[re.strp:]) && iswordchar(prevchar(str, re.strp))
hit(re, thr)
thr.ip++
-> false
else
die(re, thr, "not end of word")
;;
| `Ilbra m:
trace(re, thr, "\t{}:\tLbra {}\n", thr.ip, m)
trace(re, thr, "\t\tmatch start = {}\n", re.strp)
thr.mstart[m] = re.strp
hit(re, thr)
thr.ip++
-> false
| `Irbra m:
trace(re, thr, "\t{}:\tRbra {}\n", thr.ip, m)
thr.mend[m] = re.strp
hit(re, thr)
thr.ip++
-> false
| `Ifork (lip, rip):
trace(re, thr, "\t{}:\tFork ({}, {})\n", thr.ip, lip, rip)
mstart = std.sldup(thr.mstart)
mend = std.sldup(thr.mend)
hit(re, thr)
fork(re, thr, rip, curip, mstart, mend)
if re.debug
std.slpush(&re.traces, std.bsdup(re.traces[thr.tid]))
;;
thr.ip = lip
-> false
| `Ijmp ip:
trace(re, thr, "\t{}:\tJmp {}\n", thr.ip, ip)
hit(re, thr)
thr.ip = ip
-> false
| `Imatch id:
trace(re, thr, "\t{}:\tMatch\n", thr.ip)
re.lastthr = thr.tid
finish(re, thr)
-> true
;;
-> true
}
const fork = {re, thr, ip, curip, mstart, mend
var thr
if ip == curip /* loop detection */
-> void
;;
thr = mkthread(re, ip)
thr.next = re.runq
thr.mstart = mstart
thr.mend = mend
re.runq = thr
}
const die = {re, thr, msg
/*
we can have die called on a thread
multiple times, eg, if it has a bad
range *and* end in a state that another
thread is in. We should only decrement
the number of threads for that once.
*/
trace(re, thr, "\t\tdie {}: {}\n", thr.tid, msg)
if !thr.dead
re.nthr--
;;
re.lastip = thr.ip
re.lastthr = thr.tid
thr.dead = true
}
const finish = {re, thr
trace(re, thr, "finish {}\n", thr.tid)
thr.matched = true
re.nthr--
}
const mkthread = {re, ip
var thr : rethread#
thr = std.alloc()
thr.next = Zthr
thr.ip = ip
thr.tid = re.nexttid++
thr.dead = false
thr.matched = false
thr.mstart = [][:]
thr.mend = [][:]
re.nthr++
-> thr
}
const thrfree = {re, thr
trace(re, thr, "\t\tcleanup {}\n", thr.tid)
std.slfree(thr.mstart)
std.slfree(thr.mend)
std.free(thr)
}
const within = {re, str
-> re.strp < str.len
}
const trace : (re : regex#, thr : rethread#, msg : byte[:], args : ... -> void) = {re, thr, msg, args
var ap
if re.trace && thr != Zthr
ap = std.vastart(&args)
std.putv(msg, &ap)
std.put("\t{}\n", re.pat)
std.put("\t")
for var i = 0; i < re.pcidx[thr.ip] - 1; i++
std.put(" ")
;;
std.put("^\n")
;;
}
const hit = {re, thr
if re.debug
std.bsput(re.traces[thr.tid], thr.ip)
;;
}
/* must be called with i >= 1 */
const prevchar = {s, i
std.assert(i != 0, "prevchar must be called with i >= 1\n")
i--
while i != 0 && s[i] >= 0x80
i--
;;
-> s[i:]
}
const iswordchar = {s
var c
c = std.decode(s)
-> std.isalpha(c) || std.isdigit(c) || c == '_'
}