ref: 972b52f65248e3fb498069676c4f4707f4eb9457
dir: /compile.myr/
use std
use "types.use"
pkg regex =
const compile : (re : byte[:] -> std.error(regex#, status))
const dbgcompile : (re : byte[:] -> std.error(regex#, status))
const free : (re : regex# -> void)
;;
type tree = union
/* basic string building */
`Alt [tree#, tree#]
`Cat [tree#, tree#]
/* repetition */
`Star tree#
`Plus tree#
`Quest tree#
/* end matches */
`Byte byte
`Chr char
`Class [char, char]
/* meta */
`Cap tree#
`Bol /* beginning of line */
`Eol /* end of line */
;;
type parseresult = union
`Some tree#
`None
`Fail status
;;
/* Compiles a pattern into a regex */
const compile = {pat
-> regexcompile(std.zalloc(), pat)
}
/* Compiles a pattern into a debug regex. This can be verbose. */
const dbgcompile = {pat
var re
re = std.zalloc()
re.debug = true
-> regexcompile(re, pat)
}
/* compiles a pattern into an allocated regex */
const regexcompile = {re, pat
re.pat = pat
re.nmatch = 1 /* whole match */
match parse(re)
| `None: -> `std.Failure (`Earlystop)
| `Fail f: -> `std.Failure f
| `Some t:
dump(re, t, 0)
append(re, `Ilbra 0)
gen(re, t)
append(re, `Irbra 0)
append(re, `Imatch)
idump(re)
astfree(t)
-> `std.Success re
;;
-> `std.Failure (`Noimpl)
}
const free = {re
/* all the threads should be dead,
so we shouldn't have to free any*/
std.slfree(re.prog)
std.free(re)
}
/* generates bytecode from an AST */
const gen = {re, t
var m
match t#
|`Alt (a, b): genalt(re, a, b)
|`Cat (a, b): gen(re, a); gen(re, b)
/* repetition */
|`Star a: genstar(re, a)
|`Plus a: gen(re, a); genstar(re, a)
|`Quest a: genquest(re, a)
/* end matches */
|`Byte b: append(re, `Ibyte b)
|`Chr c: genchar(re, c)
|`Class (a, b): genrange(re, a, b)
/* meta */
|`Bol: append(re, `Ibol)
|`Eol: append(re, `Ibol)
|`Cap a:
m = re.nmatch++
append(re, `Ilbra m)
gen(re, a)
append(re, `Irbra m)
;;
-> re.proglen
}
/*
converts a codepoint range spanning multiple utf8 byte lenghts into a
set of utf8 ranges. Eg:
[0x00-0x2000] => [0x00-0x7F]|[0xC2-0xDF][0x80-0x8F]
*/
const genrange = {re, lo, hi
/* the transitions between different char lenghts for unicode
characters, needed so that we know how to generate the
different size categories */
var charrng = [
0,
0x80,
0x800,
0x10000,
0x200000,
-1
]
var lbuf : byte[4], hbuf : byte[4]
var lsz, hsz
var sz, end
var d
var i, j
lsz = std.charlen(lo)
hsz = std.charlen(hi)
charrng[lsz - 1] = lo
charrng[hsz] = hi
if lsz == 1 && hsz == 1
append(re, `Irange (lo castto(byte), hi castto(byte)))
else
for i = hsz; i > lsz; i--
if re.debug
std.put("range size = %z\n", i - 2)
;;
d = re.proglen + i - 1
append(re, `Ifork (re.proglen + 1, jmpdist(i) + d))
;;
end = re.proglen + jmpdist(hsz + 1);
for i = 0; i < hsz; i++
if re.debug
std.put("lo[%z] = %i\n", i, charrng[i] castto(int))
std.put("hi[%z] = %i\n", i, (charrng[i + 1] - 1) castto(int))
;;
sz = std.encode(lbuf[:], charrng[i])
std.encode(hbuf[:], charrng[i + 1] - 1)
for j = 0; j < sz; j++
append(re, `Irange (lbuf[j], hbuf[j]))
;;
append(re, `Ijmp (end))
;;
;;
-> re.proglen
}
/* calculates the forward jump distance for a utf8 character range */
const jmpdist = {n
var d
var i
d = n - 1
for i = n - 1; i > 0; i--
d += i
;;
-> d
}
/* generates an alternation */
const genalt = {re, l, r
var alt
var jmp
var l0
var l1
var l2
alt = re.proglen
l0 = append(re, `Ifork (-1, -1)) /* needs to be replaced */
gen(re, l)
jmp = re.proglen
l1 = append(re, `Ijmp -1) /* needs to be replaced */
l2 = gen(re, r)
re.prog[alt] = `Ifork(l0, l1)
re.prog[jmp] = `Ijmp l2
-> re.proglen
}
/* generates a repetition operator */
const genstar = {re, rep
var alt
var jmp
var l0
var l1
var l2
l0 = re.proglen
alt = re.proglen
l1 = append(re, `Ifork (-1, -1)) /* needs to be replaced */
jmp = gen(re, rep)
l2 = append(re, `Ijmp -1)
re.prog[alt] = `Ifork (l1, l2)
re.prog[jmp] = `Ijmp l0
-> re.proglen
}
/* generates a question mark operator */
const genquest = {re, q
var alt
var l0
var l1
alt = re.proglen
l0 = append(re, `Ifork (-1, -1)) /* needs to be replaced */
l1 = gen(re, q)
re.prog[alt] = `Ifork (l0, l1)
-> re.proglen
}
/* generates a single char match */
const genchar = {re, c
var b : byte[4]
var n
var i
n = std.encode(b[:], c)
for i = 0; i < n; i++
append(re, `Ibyte b[i])
;;
-> re.proglen
}
/* appends an instructon to an re program */
const append = {re, insn
if re.proglen == re.prog.len
re.prog = std.slgrow(re.prog, std.max(1, 2*re.proglen))
;;
re.prog[re.proglen] = insn
re.proglen++
-> re.proglen
}
/* instruction dump */
const idump = {re
var i
if !re.debug
->
;;
for i = 0; i < re.proglen; i++
std.put("%i:\t", i)
match re.prog[i]
/* Char matching. Consume exactly one byte from the string. */
| `Ibyte b: std.put("`Ibyte %b (%c)\n", b, b castto(char))
| `Irange (start, end): std.put("`Irange (%b,%b)\n", start, end)
/* capture groups */
| `Ilbra m: std.put("`Ilbra %z\n", m)
| `Irbra m: std.put("`Irbra %z\n", m)
/* anchors */
| `Ibol: std.put("`Ibol\n")
| `Ieol: std.put("`Ieol\n")
/* control flow */
| `Ifork (lip, rip): std.put("`Ifork (%z,%z)\n", lip, rip)
| `Ijmp ip: std.put("`Ijmp %z\n", ip)
| `Imatch: std.put("`Imatch\n")
;;
;;
}
/* AST dump */
const dump = {re, t, indent
var i
if !re.debug
->
;;
for i = 0; i < indent; i++
std.put(" ")
;;
match t#
| `Alt (a, b):
std.put("Alt\n")
dump(re, a, indent + 1)
dump(re, b, indent + 1)
| `Cat (a, b):
std.put("Cat\n")
dump(re, a, indent + 1)
dump(re, b, indent + 1)
/* repetition */
| `Star a:
std.put("Star\n")
dump(re, a, indent + 1)
| `Plus a:
std.put("Plus\n")
dump(re, a, indent + 1)
| `Quest a:
std.put("Quest\n")
dump(re, a, indent + 1)
| `Bol:
std.put("Bol\n")
| `Eol:
std.put("Eol\n")
/* end matches */
| `Byte b:
std.put("Byte %b\n", b)
| `Chr c:
std.put("Char %c\n", c)
| `Class (a, b):
std.put("Class (%c-%c)\n", a, b)
/* meta */
| `Cap a:
std.put("Cap\n")
dump(re, a, indent + 1)
;;
}
/* parses an expression */
const parse = {re
match altexpr(re)
| `Some t:
if re.pat.len == 0
-> `Some t
else
astfree(t)
-> `Fail (`Earlystop)
;;
| `None:
-> `None
;;
}
const altexpr = {re
var ret : tree#
match catexpr(re)
| `Some t:
ret = t
if matchc(re, '|')
match altexpr(re)
| `Some rhs:
ret = mk(`Alt (ret, rhs))
| `None:
astfree(ret)
-> `Fail (`Earlystop)
| `Fail f:
-> `Fail f
;;
;;
| other:
-> other
;;
-> `Some ret
}
const catexpr = {re
var ret
match repexpr(re)
| `Some t:
ret = t
match catexpr(re)
| `Some rhs:
ret = mk(`Cat (t, rhs))
| `Fail f: -> `Fail f
| `None: /* nothing */
;;
| other:
-> other
;;
-> `Some ret
}
const repexpr = {re
var ret
match baseexpr(re)
| `Some t:
if matchc(re, '*')
ret = mk(`Star t)
elif matchc(re, '+')
ret = mk(`Plus t)
elif matchc(re, '?')
ret = mk(`Quest t)
else
ret = t
;;
| other:
-> other
;;
-> `Some ret
}
const baseexpr = {re
var ret
if re.pat.len == 0
-> `None
;;
match peekc(re)
/* lower prec operators */
| '|': -> `None
| ')': -> `None
| '*': -> `Fail (`Badrep)
| '+': -> `Fail (`Badrep)
| '?': -> `Fail (`Badrep)
| '[': -> chrclass(re)
| '.': getc(re); ret = mk(`Class (0, std.Maxcharval))
| '^': getc(re); ret = mk(`Bol)
| '$': getc(re); ret = mk(`Eol)
| '(':
getc(re)
match altexpr(re)
| `Some s: ret = mk(`Cap s)
| `None: -> `Fail (`Emptyparen)
;;
if !matchc(re, ')')
astfree(ret)
-> `Fail (`Unbalanced)
;;
| c:
getc(re)
if c == '\\'
if re.pat.len == 0
-> `Fail (`Earlystop)
;;
c = getc(re)
;;
ret = mk(`Chr c)
;;
dump(re, ret, 0)
-> `Some ret
}
const chrclass = {re
var r
var t
matchc(re, '[')
t = rangematch(re)
while peekc(re) != ']'
r = rangematch(re)
t = mk(`Alt (t, r))
;;
if !matchc(re, ']')
astfree(t)
-> `Fail (`Earlystop)
else
-> `Some t
;;
}
const rangematch = {re
var lo
var hi
lo = getc(re)
if matchc(re, '-')
hi = getc(re)
-> mk(`Class (lo, hi))
else
-> mk(`Chr lo)
;;
}
const matchc = {re, c
var str
var chr
(chr, str) = std.striter(re.pat)
if chr != c
-> false
;;
re.pat = str
-> true
}
const getc = {re
var c
(c, re.pat) = std.striter(re.pat)
-> c
}
const peekc = {re
var c
var _
(c, _) = std.striter(re.pat)
-> c
}
const mk = {v
var t
t = std.alloc()
t# = v
-> t
}
const astfree = {t
match t#
| `Alt (a, b): astfree(a); astfree(b)
| `Cat (a, b): astfree(a); astfree(b)
/* repetition */
| `Star a: astfree(a)
| `Plus a: astfree(a)
| `Quest a: astfree(a)
/* end matches */
| `Byte b:
| `Chr c:
| `Class (a, b):
/* meta */
| `Cap a: astfree(a)
;;
std.free(t)
}