ref: a849073c6d1c7174ba648155bcca935a00faab0f
parent: 0c475a0c82f93c48c82e95bf52fb5a3fe59a96c1
author: Ori Bernstein <ori@eigenstate.org>
date: Tue May 24 17:43:01 EDT 2016
Add support for unicode escapes.
--- a/lib/regex/compile.myr
+++ b/lib/regex/compile.myr
@@ -634,41 +634,61 @@
idx = re.idx
match getc(re)
/* character classes */
- | 'd': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciidigit[:]), idx)
- | 'x': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciixdigit[:]), idx)
- | 's': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciispace[:]), idx)
- | 'w': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciiword[:]), idx)
- | 'h': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciiblank[:]), idx)
+ | 'd': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciidigit[:]), idx)
+ | 'x': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciixdigit[:]), idx)
+ | 's': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciispace[:]), idx)
+ | 'w': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciiword[:]), idx)
+ | 'h': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciiblank[:]), idx)
/* negated character classes */
- | 'W': ret = `Some mk(re, `Ranges negate(_ranges.tabasciiword[:]), idx)
- | 'S': ret = `Some mk(re, `Ranges negate(_ranges.tabasciispace[:]), idx)
- | 'D': ret = `Some mk(re, `Ranges negate(_ranges.tabasciidigit[:]), idx)
- | 'X': ret = `Some mk(re, `Ranges negate(_ranges.tabasciixdigit[:]), idx)
- | 'H': ret = `Some mk(re, `Ranges negate(_ranges.tabasciiblank[:]), idx)
+ | 'W': ret = `Some mk(re, `Ranges negate(_ranges.tabasciiword[:]), idx)
+ | 'S': ret = `Some mk(re, `Ranges negate(_ranges.tabasciispace[:]), idx)
+ | 'D': ret = `Some mk(re, `Ranges negate(_ranges.tabasciidigit[:]), idx)
+ | 'X': ret = `Some mk(re, `Ranges negate(_ranges.tabasciixdigit[:]), idx)
+ | 'H': ret = `Some mk(re, `Ranges negate(_ranges.tabasciiblank[:]), idx)
/* unicode character classes */
| 'p': ret = unicodeclass(re, false)
- | 'P': ret = unicodeclass(re, true)
+ | 'P': ret = unicodeclass(re, true)
/* operators that need an escape */
- | '<': ret = `Some mk(re, `Bow, idx)
- | '>': ret = `Some mk(re, `Eow, idx)
+ | '<': ret = `Some mk(re, `Bow, idx)
+ | '>': ret = `Some mk(re, `Eow, idx)
/* escaped metachars */
- | '^': ret = `Some mk(re, `Chr '^', idx)
- | '$': ret = `Some mk(re, `Chr '$', idx)
- | '.': ret = `Some mk(re, `Chr '.', idx)
- | '+': ret = `Some mk(re, `Chr '+', idx)
- | '?': ret = `Some mk(re, `Chr '?', idx)
- | '*': ret = `Some mk(re, `Chr '*', idx)
+ | '^': ret = `Some mk(re, `Chr '^', idx)
+ | '$': ret = `Some mk(re, `Chr '$', idx)
+ | '.': ret = `Some mk(re, `Chr '.', idx)
+ | '+': ret = `Some mk(re, `Chr '+', idx)
+ | '?': ret = `Some mk(re, `Chr '?', idx)
+ | '*': ret = `Some mk(re, `Chr '*', idx)
+
/* escaped nonprintable characters */
- | 'r': ret = `Some mk(re, `Chr '\r', idx)
- | 'n': ret = `Some mk(re, `Chr '\n', idx)
- | 'b': ret = `Some mk(re, `Chr '\b', idx)
- | chr: ret = `Fail `Badescape chr
+ | 'r': ret = `Some mk(re, `Chr '\r', idx)
+ | 'n': ret = `Some mk(re, `Chr '\n', idx)
+ | 'b': ret = `Some mk(re, `Chr '\b', idx)
+ | 'u': ret = unichar(re, idx)
+ | chr: ret = `Fail `Badescape chr
;;
-> ret
+}
+
+const unichar = {re, idx
+	var c
+	/* parse the `{hex}` tail of a \u{hex} escape into one `Chr node; an empty `{}` is rejected */
+	if !matchc(re, '{') || !std.isxdigit(peekc(re))
+		-> `Fail `Badescape 'u'
+	;;
+	/* NOTE(review): no bound on digit count or value here; confirm if out-of-range code points need rejecting */
+	c = 0
+	while std.isxdigit(peekc(re))
+		c *= 16
+		c += std.charval(getc(re), 16)
+	;;
+	if !matchc(re, '}')
+		-> `Fail `Badescape 'u'
+	;;
+	-> `Some mk(re, `Chr c, idx)
}
const unicodeclass = {re, neg