ref: 70b4fc9127a3e8863f128642d62d2fbcbf7121be
dir: /lib/std/utf.myr/
use "extremum"
use "chartype"
use "die"
use "types"

/*
 * UTF-8 helpers: measuring, encoding and decoding single characters,
 * stepping through byte strings one character or one grapheme at a
 * time, and measuring how many terminal cells a string occupies.
 */
pkg std =
	const Badchar		: char = -1
	const Maxcharlen	: size = 4
	const Maxcharval	: char = 0x10FFFF

	const charlen		: (chr : char -> size)
	const encode		: (buf : byte[:], chr : char -> size)
	const decode		: (buf : byte[:] -> char)
	const charstep		: (str : byte[:] -> (char, byte[:]))
	const graphemestep	: (str : byte[:] -> (byte[:], byte[:]))
	const strcellwidth	: (str : byte[:] -> size)
;;

/*
 * Returns the number of bytes used to encode `c`: 1 to 4 for values
 * below 0x200000. Values outside that range cannot be encoded; 1 is
 * returned for them rather than an error value.
 */
const charlen = {c
	if c < 0x80
		-> 1
	elif c < 0x800
		-> 2
	elif c < 0x10000
		-> 3
	elif c < 0x200000
		-> 4
	else
		-> 1 /* attempt to resync */
	;;
}

/*
 * Encodes `c` into the start of `buf`.
 *
 * Returns the number of bytes written, or -1 if `buf` is shorter
 * than charlen(c). Nothing is written in the failure case.
 */
const encode = {buf, c
	var len
	var mark

	len = charlen(c)
	/*
	 * NOTE(review): charlen() in this file never returns a negative
	 * value, so only the buffer length test can trigger here.
	 */
	if len < 0 || buf.len < len
		-> -1
	;;

	/*
	 * `mark` is the length prefix of the lead byte: the top `len`
	 * bits set (0xc0, 0xe0, 0xf0), or nothing for a 1 byte char.
	 */
	if (len == 1)
		mark = 0
	else
		mark = (((1 << (8 - len)) - 1) ^ 0xff : char)
	;;

	/* fill continuation bytes back to front, 6 payload bits each */
	for var i = len - 1; i > 0; i--
		buf[i] = (c & 0x3f | 0x80 : byte)
		c >>= 6
	;;

	/* whatever is left of `c` goes into the lead byte */
	buf[0] = (c | mark : byte)
	-> len
}

/*
 * Decodes the first character of `buf`, discarding the remainder.
 * Returns whatever charstep() returns as its character, which is
 * Badchar on empty or undecodable input.
 */
const decode = {buf
	var c
	var b

	(c, b) = charstep(buf)
	-> c
}

/*
 * Splits `str` into (grapheme, rest).
 *
 * A grapheme here is one character followed by every subsequent
 * character whose cellwidth() is not positive. '\r', '\n', '\t' and
 * undecodable bytes always end the current grapheme; when one of
 * them is the very first thing in `str`, it is returned alone as a
 * single byte.
 */
const graphemestep = {str
	var len = 0
	var rest = str
	var c
	var cn = 0

	while rest.len > 0
		(c, rest) = charstep(rest)
		cn = cellwidth(c)
		if (c == '\r' || c == '\n' || c == '\t')
			if len == 0
				-> (str[:1], str[1:])
			else
				-> (str[:len], str[len:])
			;;
		elif (cn > 0 || c == Badchar) && len > 0
			/* `c` starts the next grapheme; stop before it */
			-> (str[:len], str[len:])
		elif c == Badchar
			-> (str[:1], str[1:])
		else
			/*
			 * Use the number of bytes charstep() actually
			 * consumed. charlen(c) is the canonical length of
			 * the decoded value, which is shorter than the
			 * input for non-shortest-form sequences and would
			 * put the boundary in the middle of a sequence.
			 */
			len = str.len - rest.len
		;;
	;;
	-> (str[:len], str[len:])
}

/*
 * Decodes one character from the front of `str`, returning it along
 * with the remaining bytes.
 *
 * On empty input, returns (Badchar, str). On an invalid lead byte or
 * a truncated sequence, returns Badchar and skips exactly one byte.
 * Continuation bytes are not validated and non-shortest-form
 * sequences are accepted; only the low 6 bits of each trailing byte
 * are used.
 */
const charstep = {str
	var len
	var mask
	var chr
	var c
	var tmp

	if str.len == 0
		/* empty string: no resync needed */
		-> (Badchar, str)
	;;

	c = str[0]
	if c & 0x80 == 0		/* 0b0xxx_xxxx */
		len = 1
	elif c & 0xe0 == 0xc0	/* 0b110x_xxxx */
		len = 2
	elif c & 0xf0 == 0xe0	/* 0b1110_xxxx */
		len = 3
	elif c & 0xf8 == 0xf0	/* 0b1111_0xxx */
		len = 4
	else
		/* skip one char forward so we can try resyncing the character stream */
		-> (Badchar, str[1:])
	;;

	if len > str.len
		/* truncated sequence: again, we want to try to resync */
		-> (Badchar, str[1:])
	;;

	/* strip the length prefix from the lead byte */
	mask = (1 << (8 - len)) - 1
	chr = (c : uint32) & mask
	for var i = 1; i < len; i++
		tmp = (str[i] : uint32)
		chr = (chr << 6) | (tmp & 0x3f)
	;;

	-> ((chr : char), str[len:])
}

/*
 * Returns the number of terminal cells `str` is expected to occupy.
 *
 * Control characters (below 0x20, and 0x7f) count for nothing,
 * printable ASCII counts for one cell, and everything else is
 * measured with cellwidth().
 */
const strcellwidth = {str
	var s : byte[:] = str
	var c : char = Badchar
	var n : size = 0

	while s.len > 0
		(c, s) = charstep(s)
		/*
		 * NOTE(review): a Badchar decoded from the final bytes of
		 * the string leaves s.len == 0, so it skips this branch
		 * and is handled by the comparisons below instead.
		 * Confirm that this is intended.
		 */
		if s.len != 0 && c == Badchar
			/* Something will probably be printed as U+FFFD */
			n++
		elif c < 0x20
			/* Control characters take 0 cells */
		elif c < 0x7f
			/* Bog standard ASCII takes 1 cell */
			n++
		elif c == 0x7f
			/* DEL is like a control character */
		else
			/* It's not ASCII, so ask chartype what to do */
			n += (abs(cellwidth(c)) : size)
		;;
	;;

	-> n
}