ref: c634b3005ccd2ddfe903b1c1c7b7d61e2e8d6fb4
dir: /lib/std/utf.myr/
use "extremum"
use "chartype"
use "die"
use "types"
pkg std =
/* Value charstep and decode hand back when the input is empty, truncated, or starts with an unrecognised lead byte. */
const Badchar : char = -1
/* Longest encoding, in bytes, that charlen reports and encode writes. */
const Maxcharlen : size = 4
/* Largest Unicode code point. */
const Maxcharval : char = 0x10FFFF
/* Number of bytes needed to encode `chr`; 1 for values that are out of range. */
const charlen : (chr : char -> size)
/* Writes the encoding of `chr` at the start of `buf`; returns the byte count, or -1 if `buf` is too short. */
const encode : (buf : byte[:], chr : char -> size)
/* Returns the first character of `buf`, as produced by charstep. */
const decode : (buf : byte[:] -> char)
/* Returns the first character of `str` together with the bytes that follow it. */
const charstep : (str : byte[:] -> (char, byte[:]))
/* Splits `str` into (leading cluster, remainder); see graphemestep for the cluster rules. */
const graphemestep : (str : byte[:] -> (byte[:], byte[:]))
/* Total number of display cells counted for the characters of `str`. */
const strcellwidth : (str : byte[:] -> size)
;;
/*
  Reports how many bytes the UTF-8 encoding of `c` occupies.

  Values below 0x80 take one byte, below 0x800 two, below 0x10000 three
  and below 0x200000 four. Anything else is reported as a single byte so
  that a caller walking a stream still moves forward and can resync.
*/
const charlen = {c
	var n

	/* default covers both plain ASCII and the out-of-range resync case */
	n = 1
	if c >= 0x80 && c < 0x800
		n = 2
	elif c >= 0x800 && c < 0x10000
		n = 3
	elif c >= 0x10000 && c < 0x200000
		n = 4
	;;
	-> n
}
/*
  Writes the UTF-8 encoding of `c` to the start of `buf`.

  Returns the number of bytes written, or -1 (leaving `buf` untouched)
  when `buf` holds fewer than charlen(c) bytes.

  NOTE(review): charlen reports 1 for values of 0x200000 and above, so
  such a value is emitted as a single byte holding its low 8 bits rather
  than being rejected -- confirm whether callers depend on that.
*/
const encode = {buf, c
	var len
	var mark

	len = charlen(c)
	/* charlen as written never goes negative; the check is kept as a guard */
	if len < 0 || buf.len < len
		-> -1
	;;

	/*
	  Lead-byte marker: a one byte sequence carries no marker; an n byte
	  sequence has the top n bits of the lead byte set (0xc0, 0xe0, 0xf0).
	  Parenthesised the same way as the mask in charstep so the result
	  does not hinge on shift-vs-subtract precedence.
	*/
	if len == 1
		mark = 0
	else
		mark = (((1 << (8 - len)) - 1) ^ 0xff : char)
	;;

	/* fill continuation bytes back to front, six payload bits apiece */
	for var i = len - 1; i > 0; i--
		buf[i] = ((c & 0x3f) | 0x80 : byte)
		c >>= 6
	;;

	/* whatever is left of `c` fits under the marker in the lead byte */
	buf[0] = (c | mark : byte)
	-> len
}
/*
  Decodes the first character of `buf`, discarding the remainder that
  charstep also returns. Yields whatever charstep yields for that input,
  including Badchar.
*/
const decode = {buf
	match charstep(buf)
	| (chr, _):	-> chr
	;;
}
/*
  Splits `str` into its leading cluster and the remaining bytes.

  The first character always starts the cluster. Each following
  character is folded into it as long as cellwidth() reports a width
  that is not positive. '\r', '\n', '\t' and undecodable input (Badchar)
  never join a cluster: if one comes first it is returned by itself,
  otherwise the cluster ends just before it.

  The cluster length is measured from how far charstep actually advanced
  rather than from charlen() of the decoded value, so the split always
  falls on a boundary charstep produced, even when the input uses a
  non-shortest-form encoding.
*/
const graphemestep = {str
	var len = 0
	var rest = str
	var c
	var cn = 0
	var consumed
	var stop

	while rest.len > 0
		(c, rest) = charstep(rest)
		/* bytes of `str` covered up to and including this character */
		consumed = str.len - rest.len
		cn = cellwidth(c)
		stop = c == '\r' || c == '\n' || c == '\t' || c == Badchar

		if len > 0 && (stop || cn > 0)
			/* this character opens the next cluster; leave it behind */
			-> (str[:len], str[len:])
		elif stop
			/* a lone break character or bad byte is its own cluster */
			-> (str[:consumed], str[consumed:])
		;;
		len = consumed
	;;
	-> (str[:len], str[len:])
}
/*
  Decodes one character from the front of `str`.

  Returns the character and the bytes after it. On an empty string the
  result is (Badchar, str). If the lead byte is not a valid UTF-8 lead,
  the sequence runs past the end of `str`, or a continuation byte does
  not have the 0b10xx_xxxx form, the result is Badchar and the input
  advanced by exactly one byte, so the caller can resync on the next
  call.
*/
const charstep = {str
	var len
	var mask
	var chr
	var c
	var tmp

	if str.len == 0
		/* empty string: no resync needed */
		-> (Badchar, str)
	;;

	/* the high bits of the lead byte give the sequence length */
	c = str[0]
	len = 0
	if c & 0x80 == 0	/* 0b0xxx_xxxx */
		len = 1
	elif c & 0xe0 == 0xc0	/* 0b110x_xxxx */
		len = 2
	elif c & 0xf0 == 0xe0	/* 0b1110_xxxx */
		len = 3
	elif c & 0xf8 == 0xf0	/* 0b1111_0xxx */
		len = 4
	else
		/* skip one char forward so we can try
		   resyncing the character stream */
		-> (Badchar, str[1:])
	;;

	if len == 0 || len > str.len
		/* again, we want to try to resync */
		-> (Badchar, str[1:])
	;;

	/* keep only the payload bits of the lead byte */
	mask = (1 << (8 - len)) - 1
	chr = (c : uint32) & mask
	for var i = 1; i < len; i++
		tmp = (str[i] : uint32)
		if (tmp & 0xc0) != 0x80
			/* not a continuation byte: drop only the lead byte so
			   the byte at str[i] gets decoded in its own right */
			-> (Badchar, str[1:])
		;;
		chr = (chr << 6) | (tmp & 0x3f)
	;;

	-> ((chr : char), str[len:])
}
/*
  Counts the display cells taken up by `str`.

  Each character from charstep contributes:
    - Badchar: 1 cell (it will probably be shown as U+FFFD)
    - below 0x20, or exactly 0x7f (DEL): 0 cells
    - 0x20 up to but excluding 0x7f: 1 cell
    - anything else: abs(cellwidth(c)) cells
*/
const strcellwidth = {str
	var rest : byte[:] = str
	var ch : char = Badchar
	var total : size = 0

	while rest.len > 0
		(ch, rest) = charstep(rest)
		if ch == Badchar || (ch >= 0x20 && ch < 0x7f)
			/* replacement glyph or printable ASCII: one cell */
			total++
		elif ch > 0x7f
			/* not ASCII, so chartype decides */
			total += (abs(cellwidth(ch)) : size)
		;;
		/* control characters and DEL fall through and add nothing */
	;;
	-> total
}