ref: 595f94567824d5c4d1e4ef75ed825eccc6b1783c
dir: /lib/std/utf.myr/
use "die"
use "types"
pkg std =
/* sentinel char: strstep hands this back for empty input or a sequence it cannot decode */
const Badchar : char = -1
/* longest encoding, in bytes, that charlen reports for a single char */
const Maxcharlen : size = 4
/* highest Unicode code point, U+10FFFF */
const Maxcharval : char = 0x10FFFF
/* number of bytes needed to encode chr as UTF-8; -1 when chr is out of range */
const charlen : (chr : char -> size)
/* writes the UTF-8 form of chr into buf; yields the byte count, or -1 on failure */
const encode : (buf : byte[:], chr : char -> size)
/* decodes the char at the front of buf (via strstep), discarding the remainder */
const decode : (buf : byte[:] -> char)
/* decodes the char at the front of str and returns it with the rest of the slice */
const strstep : (str : byte[:] -> (char, byte[:]))
;;
/*
  Returns how many bytes the UTF-8 encoding of `c` occupies: 1, 2, 3 or 4.

  UTF-8 (RFC 3629) only covers code points up to Maxcharval (U+10FFFF), so
  anything above that is rejected with -1 instead of being reported as a
  4-byte sequence. Badchar lands in the reject branch as well, presuming
  `char` is unsigned so that -1 is the largest value -- confirm against
  the `char` definition.

  NOTE(review): the return type is `size`; if that is unsigned, the -1
  reaches callers as the maximum `size` value rather than a negative one.
*/
const charlen = {c
	if c < 0x80
		-> 1
	elif c < 0x800
		-> 2
	elif c < 0x10000
		-> 3
	elif c <= Maxcharval
		-> 4
	else
		-> -1
	;;
}
/*
  Writes the UTF-8 encoding of `c` into the leading bytes of `buf`.

  Returns the number of bytes written. Returns -1 without touching `buf`
  when charlen() rejects `c` or when `buf` is shorter than the encoding.
*/
const encode = {buf, c
	var n
	var lead
	var i

	n = charlen(c)
	if n < 0 || buf.len < n
		-> -1
	;;

	/*
	  Lead-byte marker: the top `n` bits set (0xc0, 0xe0, 0xf0) for a
	  multi-byte sequence, nothing at all for a single byte.
	*/
	if n == 1
		lead = 0
	else
		lead = (((1 << (8 - n)) - 1) ^ 0xff : char)
	;;

	/* fill the continuation bytes back to front, six payload bits each */
	i = n - 1
	while i > 0
		buf[i] = ((c & 0x3f) | 0x80 : byte)
		c >>= 6
		i--
	;;

	/* whatever is left of `c` shares the first byte with the marker */
	buf[0] = (c | lead : byte)
	-> n
}
/*
  Decodes the character at the front of `buf`.

  This is strstep() with the remaining slice thrown away, so the result
  is whatever strstep() yields for `buf`, including its Badchar cases.
*/
const decode = {buf
	var chr
	var rest

	(chr, rest) = strstep(buf)
	-> chr
}
/*
  Decodes one UTF-8 sequence from the front of `str`.

  Returns the decoded char together with the slice that follows it.
  Failure cases:
    - empty input: (Badchar, str), there is nothing to skip;
    - invalid lead byte, sequence running past the end of `str`, or a
      continuation byte that is not of the form 0b10xx_xxxx:
      (Badchar, str[1:]), so the caller can resynchronise one byte on.

  NOTE: overlong encodings and values above Maxcharval are not rejected
  here; only the byte-level shape of the sequence is checked.
*/
const strstep = {str
	var len
	var mask
	var chr
	var c
	var tmp

	if str.len == 0
		/* empty string: no resync needed */
		-> (Badchar, str)
	;;

	/* the lead byte determines the length of the whole sequence */
	c = str[0]
	if c & 0x80 == 0	/* 0b0xxx_xxxx */
		len = 1
	elif c & 0xe0 == 0xc0	/* 0b110x_xxxx */
		len = 2
	elif c & 0xf0 == 0xe0	/* 0b1110_xxxx */
		len = 3
	elif c & 0xf8 == 0xf0	/* 0b1111_0xxx */
		len = 4
	else
		/* skip one byte forward so we can try
		   resyncing the character stream */
		-> (Badchar, str[1:])
	;;

	if len > str.len
		/* truncated sequence: again, we want to try to resync */
		-> (Badchar, str[1:])
	;;

	/* keep only the payload bits of the lead byte */
	mask = (1 << (8 - len)) - 1
	chr = (c : uint32) & mask
	for var i = 1; i < len; i++
		/*
		  Every trailing byte must look like 0b10xx_xxxx. Anything else
		  means the sequence is malformed; drop only the lead byte so
		  that the offending byte gets decoded on its own next time.
		*/
		if str[i] & 0xc0 != 0x80
			-> (Badchar, str[1:])
		;;
		tmp = (str[i] : uint32)
		chr = (chr << 6) | (tmp & 0x3f)
	;;

	-> ((chr : char), str[len:])
}