ref: 04759ec9af6dcc78ea5873ceaf6db2e3b3920b22
parent: 2163aebcb85e8214869a2c026b3fc5bd9ddac22c
author: Jacob Moody <moody@posixcafe.org>
date: Sat Mar 25 21:02:20 EDT 2023
runecomp(2)
--- /dev/null
+++ b/lib/ucd/mkfile
@@ -1,0 +1,85 @@
+</$objtype/mkfile
+
+VERSION='15.0.0'
+URL='https://www.unicode.org/Public/'$VERSION'/ucd/'
+
+TXT=\
+ ArabicShaping.txt\
+ BidiBrackets.txt\
+ BidiMirroring.txt\
+ BidiTest.txt\
+ Blocks.txt\
+ CJKRadicals.txt\
+ CaseFolding.txt\
+ CompositionExclusions.txt\
+ DerivedAge.txt\
+ DerivedCoreProperties.txt\
+ DerivedNormalizationProps.txt\
+ EastAsianWidth.txt\
+ EmojiSources.txt\
+ EquivalentUnifiedIdeograph.txt\
+ HangulSyllableType.txt\
+ Index.txt\
+ IndicPositionalCategory.txt\
+ IndicSyllabicCategory.txt\
+ Jamo.txt\
+ LineBreak.txt\
+ NameAliases.txt\
+ NamedSequences.txt\
+ NamedSequencesProv.txt\
+ NamesList.txt\
+ NormalizationCorrections.txt\
+ NushuSources.txt\
+ PropList.txt\
+ PropertyAliases.txt\
+ PropertyValueAliases.txt\
+ ScriptExtensions.txt\
+ Scripts.txt\
+ SpecialCasing.txt\
+ StandardizedVariants.txt\
+ TangutSources.txt\
+ USourceData.txt\
+ UnicodeData.txt\
+ VerticalOrientation.txt\
+
+TEST=\
+ NormalizationTest.txt\
+ BidiCharacterTest.txt\
+
+PDF=\
+ USourceGlyphs.pdf\
+ USourceRSChart.pdf\
+
+AUX=\
+ WordBreakProperty.txt\
+ GraphemeBreakProperty.txt\
+
+ucd:V: UnicodeData.txt
+# D attribute: mk deletes the target when hget fails, so no truncated file is left looking current
+%.txt:D:
+	hget $URL^$target > $target >[2]/dev/null
+%.pdf:D:
+	hget $URL^$target > $target
+# emoji data is fetched from the emoji/ subdirectory
+emoji-data.txt:D:
+	hget $URL^emoji/^$target > $target
+# break properties and their tests are fetched from auxiliary/
+WordBreakProperty.txt:D:
+	hget $URL^'auxiliary/'^$target > $target
+
+GraphemeBreakProperty.txt:D:
+	hget $URL^'auxiliary/'^$target > $target
+
+WordBreakTest.txt:D:
+	hget $URL^'auxiliary/'^$target > $target
+
+GraphemeBreakTest.txt:D:
+	hget $URL^'auxiliary/'^$target > $target
+# virtual targets that fetch whole groups of files
+txt:V: $TXT
+
+pdf:V: $PDF
+
+test:V: $TEST
+
+all:V: $TXT $PDF $TEST
--- a/sys/include/libc.h
+++ b/sys/include/libc.h
@@ -77,6 +77,18 @@
extern long runestrlen(Rune*);
extern Rune* runestrstr(Rune*, Rune*);
+extern int runecomp(Rune*, Rune*, int); /* NFC: dst, src, max runes to write; returns runes required */
+extern int runedecomp(Rune*, Rune*, int); /* NFD, same arguments and return as runecomp */
+extern int utfcomp(char*, char*, int); /* NFC on UTF strings */
+extern int utfdecomp(char*, char*, int); /* NFD on UTF strings */
+extern char* fullutfnorm(char*,int); /* start of the next normalization context, or s if not enough input */
+extern Rune* fullrunenorm(Rune*,int); /* rune string version of fullutfnorm */
+
+extern Rune* runewbreak(Rune*); /* next word break opportunity, s if none */
+extern char* utfwbreak(char*);
+extern Rune* runegbreak(Rune*); /* next grapheme break opportunity, s if none */
+extern char* utfgbreak(char*);
+
extern Rune tolowerrune(Rune);
extern Rune totitlerune(Rune);
extern Rune toupperrune(Rune);
--- a/sys/man/2/isalpharune
+++ b/sys/man/2/isalpharune
@@ -48,7 +48,11 @@
.PP
The case-conversion routines return the character unchanged if it has no case.
.SH SOURCE
-.B /sys/src/libc/port/runetype.c
+.B /sys/src/libc/port/mkrunetype.c
+.br
+.B /sys/src/libc/port/runeistype.c
+.br
+.B /sys/src/libc/port/runetotype.c
.SH "SEE ALSO
.IR ctype (2) ,
.IR "The Unicode Standard" .
--- /dev/null
+++ b/sys/man/2/runecomp
@@ -1,0 +1,116 @@
+.TH RUNECOMP 2
+.SH NAME
+runecomp, runedecomp, fullrunenorm, runegbreak, runewbreak, utfcomp, utfdecomp, fullutfnorm, utfgbreak, utfwbreak \- multi-rune graphemes
+.SH SYNOPSIS
+.ta \w'\fLchar*xx'u
+.B #include <u.h>
+.br
+.B #include <libc.h>
+.PP
+.B
+int runecomp(Rune *dst, Rune *src, int max)
+.PP
+.B
+int runedecomp(Rune *dst, Rune *src, int max)
+.PP
+.B
+Rune* fullrunenorm(Rune *s, int n)
+.PP
+.B
+Rune* runegbreak(Rune *s)
+.PP
+.B
+Rune* runewbreak(Rune *s)
+.PP
+.B
+int utfcomp(char *dst, char *src, int max)
+.PP
+.B
+int utfdecomp(char *dst, char *src, int max)
+.PP
+.B
+char* fullutfnorm(char *s, int n)
+.PP
+.B
+char* utfgbreak(char *s)
+.PP
+.B
+char* utfwbreak(char *s)
+.SH DESCRIPTION
+These routines help in handling
+graphemes that may span multiple runes.
+.PP
+.IR Runecomp ,
+.IR runedecomp ,
+.IR utfcomp ,
+and
+.I utfdecomp
+perform Unicode® normalization on
+.IR src ,
+storing the result in
+.IR dst .
+No more than
+.I max
+elements will be written, and the resulting string
+will always be null terminated. The return value
+is always the total number of elements required to
+store the transformation. If this value is larger
+than the supplied
+.I max
+the caller can assume the result has been truncated.
+.I Runecomp
+and
+.I utfcomp
+perform NFC normalization while
+.I runedecomp
+and
+.I utfdecomp
+perform NFD normalization.
+.PP
+.I Fullrunenorm
+and
+.I fullutfnorm
+determine if enough elements are present in
+.I s
+to perform normalization. If enough are present,
+a pointer is returned to the first element that begins
+the next context. Otherwise
+.I s
+is returned. No more than
+.I n
+elements will be read. In order to find the boundary, the
+first element of the next context must be peeked.
+.PP
+.I Runegbreak
+and
+.I utfgbreak
+search
+.I s
+for the next grapheme break opportunity.
+If none is found before the end of the string,
+.I s
+is returned.
+.PP
+.I Runewbreak
+and
+.I utfwbreak
+search
+.I s
+for the next word break opportunity.
+If none is found before the end of the string,
+.I s
+is returned.
+.SH SOURCE
+.B /sys/src/libc/port/mkrunetype.c
+.br
+.B /sys/src/libc/port/runenorm.c
+.br
+.B /sys/src/libc/port/runebreak.c
+.SH SEE ALSO
+Unicode® Standard Annex #15
+.br
+Unicode® Standard Annex #29
+.br
+.IR rune (2),
+.IR utf (6),
+.IR tcs (1)
--- a/sys/src/libc/port/mkfile
+++ b/sys/src/libc/port/mkfile
@@ -62,6 +62,9 @@
rand.c\
readn.c\
rune.c\
+ runebreak.c\
+ runeistype.c\
+ runenorm.c\
runestrcat.c\
runestrchr.c\
runestrcmp.c\
@@ -74,7 +77,7 @@
runestrrchr.c\
runestrlen.c\
runestrstr.c\
- runetype.c\
+ runetotype.c\
sin.c\
sinh.c\
sqrt.c\
@@ -127,3 +130,26 @@
</sys/src/cmd/mksyslib
profile.$O: /sys/include/tos.h
+
+runenorm.$O: runenormdata runenorm.c
+runetotype.$O: runetotypedata runetotype.c
+runeistype.$O: runeistypedata runeistype.c
+runebreak.$O: runebreakdata runebreak.c
+
+UCD=\
+ /lib/ucd/WordBreakProperty.txt\
+ /lib/ucd/GraphemeBreakProperty.txt\
+ /lib/ucd/emoji-data.txt\
+ /lib/ucd/CompositionExclusions.txt\
+ /lib/ucd/UnicodeData.txt\
+
+/lib/ucd/%:
+ cd /lib/ucd && mk $stem
+
+runenormdata runetotypedata runeistypedata runebreakdata: mkrunetype.c $UCD
+ @{ # run the generator build in its own rc subshell
+ eval `{grep '^[A-Z]' /$cputype/mkfile} # take the lines starting with a capital letter from the $cputype mkfile
+ $CC $CFLAGS -o mkrunetype.$O mkrunetype.c
+ $LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O
+ $O.mkrunetype # creates the four data files in the current directory
+ }
--- /dev/null
+++ b/sys/src/libc/port/mkrunetype.c
@@ -1,0 +1,748 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+enum{
+ NRUNES = 1<<21 /* length of every per-rune table; code points at or above this are rejected */
+};
+
+typedef struct Param Param;
+typedef struct Lvl Lvl;
+struct Lvl{ /* one level of the three-level lookup table */
+ int bits; /* how many bits of the code point this level consumes */
+ int max; /* 1<<bits, filled in by derive */
+ int mask; /* max-1, filled in by derive */
+};
+struct Param{ /* how the 21 bits are split over the levels, see param */
+ Lvl idx1;
+ Lvl idx2;
+ Lvl data;
+
+ int round1max; /* NRUNES/data.max: the number of data chunks */
+};
+
+static void	/* fill in max and mask from bits */
+derive(Lvl *l)
+{
+	l->mask = (1 << l->bits) - 1;
+	l->max = l->mask + 1;
+}
+
+static void
+param(Param *p, int idx1, int idx2)
+{
+	/* a code point is split as idx1 | idx2 | data, 21 bits in all */
+	assert(idx1 + idx2 < 21);
+	p->idx1.bits = idx1;
+	derive(&p->idx1);
+	p->idx2.bits = idx2;
+	derive(&p->idx2);
+	p->data.bits = 21 - (idx1 + idx2);
+	derive(&p->data);
+	/* number of data-sized chunks needed to cover every rune */
+	p->round1max = NRUNES >> p->data.bits;
+}
+
+static int
+lkup(Param *p, int *idx1, int *idx2, int *data, int x)
+{
+	int hi, mid;	/* the two index fields of x; the low bits select within the chunk */
+
+	mid = (x >> p->data.bits) & p->idx2.mask;
+	hi = (x >> (p->data.bits + p->idx2.bits)) & p->idx1.mask;
+	return data[idx2[idx1[hi] + mid] + (x & p->data.mask)];
+}
+
+/* Print d[0..len) to fd as a static array of the narrowest fitting type; with fd < 0 only compute the size. Returns bytes used. */
+static int
+mkarrvar(int fd, char *name, int *d, int len)
+{
+	int i, sz;
+	int max, min;
+	char *t;
+
+	max = min = 0;
+	for(i = 0; i < len; i++){
+		if(d[i] > max)
+			max = d[i];
+		if(d[i] < min)
+			min = d[i];
+	}
+	if(min == 0){
+		if(max < 0xFF)
+			t = "uchar", sz = 1;
+		else if(max < 0xFFFF)
+			t = "ushort", sz = 2;
+		else
+			t = "uint", sz = 4;
+	} else {
+		/* a signed type must hold both ends of the range, not just max */
+		if(max < 1<<7 && min >= -(1<<7))
+			t = "char", sz = 1;
+		else if(max < 1<<15 && min >= -(1<<15))
+			t = "short", sz = 2;
+		else
+			t = "int", sz = 4;
+	}
+	if(fd < 0)
+		return sz * len;
+	fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len);
+	for(i = 0; i < len; i++){
+		fprint(fd, "%d,", d[i]);
+		if((i+1) % 16 == 0)
+			fprint(fd, "\n\t");
+	}
+	fprint(fd, "\n};\n");
+	return sz * len;
+}
+
+static int
+mkexceptarr(int fd, char *name, int *d, int n, int all)
+{
+	int k, *row;	/* d holds n triples: rune, then the two runes it decomposes to */
+	fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, all ? 3 : 2);
+	for(k = 0, row = d; k < n; k++, row += 3){
+		if(!all)
+			fprint(fd, "{0x%X, 0x%X},", row[1], row[2]);
+		else if(row[0] != 0)	/* markexclusions zeroes the rune of an excluded triple */
+			fprint(fd, "{0x%X, 0x%X, 0x%X},", row[0], row[1], row[2]);
+		if((k+1) % 8 == 0)
+			fprint(fd, "\n\t");
+	}
+	fprint(fd, "\n};\n");
+	return n * sizeof(Rune) * 2;
+}
+
+static int /* copy nidx chunks of src into data, overlapping each with the next where they agree; returns the words used */
+compact(int *data, int *idx, int nidx, int *src, int chunksize)
+{
+ int i, n, ndata, best;
+ int *dot, *lp, *rp;
+
+ dot = src;
+ ndata = 0;
+ idx[0] = 0;
+ for(i = 1; i <= nidx; i++){
+ rp = dot + chunksize; /* start of the following chunk */
+ lp = rp - 1; /* walks backwards over the tail of the current chunk */
+
+ for(best = 0, n = 0; i != nidx && n < chunksize; n++, lp--){ /* the last chunk has no successor to compare with */
+ if(memcmp(lp, rp, (n+1) * sizeof data[0]) == 0)
+ best = n+1; /* longest tail found so far that equals the head of the next chunk */
+ }
+ memmove(data + ndata, dot, (chunksize - best) * sizeof data[0]); /* the shared tail is emitted with the next chunk */
+ ndata += (chunksize - best);
+ idx[i] = idx[i - 1] + (chunksize - best); /* offset in data at which the next chunk begins */
+ dot = rp;
+ }
+ return ndata;
+}
+
+
+static int /* build the three-level table for map and write it to fd (fd < 0: measure only); returns the summed mkarrvar sizes */
+mklkup(int fd, char *label, int *map, Param *p)
+{
+ static int data[NRUNES];
+ static int idx2[NRUNES];
+ static int idx2dest[NRUNES];
+ static int idx1[NRUNES];
+ int i, nidx2, ndata;
+ int size;
+
+ ndata = compact(data, idx2, p->round1max, map, p->data.max); /* pack the values; idx2 receives each chunk's offset */
+ nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max); /* then pack those offsets the same way */
+
+ if(fd >= 0){ /* before writing, check that every rune still looks up to its original value */
+ for(i = 0; i < NRUNES; i++)
+ if(map[i] != lkup(p, idx1, idx2dest, data, i))
+ sysfatal("mismatch in %s at %d %d %d\n", label, i, map[i], lkup(p, idx1, idx2dest, data, i));
+ }
+
+ size = mkarrvar(fd, smprint("_%sdata", label), data, ndata); /* NOTE(review): the smprint results are never freed */
+ size += mkarrvar(fd, smprint("_%sidx2", label), idx2dest, nidx2);
+ size += mkarrvar(fd, smprint("_%sidx1", label), idx1, p->idx1.max);
+ if(fd >= 0){ /* accessor macros; <label>lkup(x) performs the same walk as lkup above */
+ fprint(fd, "\n");
+ fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label, p->data.bits, p->idx2.bits, p->idx1.mask);
+ fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label, p->data.bits, p->idx2.mask);
+ fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask);
+ fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] + %sindex2(x)] + %soffset(x)] )\n\n",
+ label, label, label, label, label, label, label);
+ }
+ return size;
+}
+
+static void /* try every idx1/idx2 split in the ranges below and leave p set to the one giving the smallest tables */
+mklkupmatrix(char *label, int *map, Param *p)
+{
+ int bestsize, size, bestx, besty;
+ int x, y;
+
+ bestsize = bestx = besty = -1;
+ for(x = 4; x <= 12; x++)
+ for(y=4; y <= (19 - x); y++){
+ param(p, x, y);
+ size = mklkup(-1, label, map, p); /* fd of -1: nothing is written, only the size is returned */
+ if(bestsize == -1 || size < bestsize){
+ bestx = x;
+ besty = y;
+ bestsize = size;
+ }
+ }
+
+ assert(bestsize != -1);
+ fprint(2, "label: %s best: %d %d (%d)\n", label, bestx, besty, bestsize); /* report the winner on standard error */
+ param(p, bestx, besty);
+}
+
+static int myismerged[NRUNES]; /* bit set of the is* classes per rune, filled at the end of main */
+static int mytoupper[NRUNES]; /* case mappings; main converts them to offsets from the rune */
+static int mytolower[NRUNES];
+static int mytotitle[NRUNES];
+static int mybreak[NRUNES]; /* word break class | grapheme break class, see markbreak */
+
+enum{ DSTART = 0xEEEE }; /* mydecomp high halves from here up are DSTART + index into mydespecial */
+static int mydecomp[NRUNES]; /* first<<16 | second rune of the decomposition, 0 for none */
+static int mydespecial[256*3]; /* {rune, first, second} for decompositions with a rune above 0xFFFF */
+static int nspecial;
+static int myccc[NRUNES]; /* value of the combining class field */
+
+typedef struct KV KV;
+struct KV{ /* recomposition hash entry */
+ uint key; /* first<<16 | second */
+ uint val; /* composed rune; zeroed by markexclusions */
+ ushort next; /* 1-based index of the next entry in the collision table, 0 for none */
+};
+
+static KV myrecomp[2000]; /* pairs collected from UnicodeData.txt in file order */
+static int nrecomp;
+
+static int recompext[256*3]; /* {rune, first, second} recompositions with a rune above 0xFFFF */
+static int nrecompext;
+
+static uint /* integer mixer for recomposition keys; _runerecomp in runenorm.c repeats these exact steps */
+hash(uint x)
+{
+ x ^= x >> 16;
+ x *= 0x21f0aaad;
+ x ^= x >> 15;
+ x *= 0xd35a2d97;
+ x ^= x >> 15;
+ return x;
+}
+
+/* write one hash slot as two words: the key, then val in the low and next in the high 16 bits */
+static void
+putkv(int fd, KV *p, int i)
+{
+	assert(p->val < 0xFFFF);
+	assert(p->next < 0xFFFF);
+	fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16));
+	if((i+1) % 8 == 0)
+		fprint(fd, "\n\t");
+}
+
+/*
+ * Emit the recomposition hash: nelem(vals) buckets followed by the
+ * collision entries; next is a 1-based index into the collision part.
+ */
+static void
+mkrecomp(int fd)
+{
+	static KV vals[512];	/* runenorm.c hard-codes this bucket count as x%512 */
+	static KV coll[1000];
+	KV *p;
+	int i, n, over;
+
+	memset(vals, 0, sizeof vals);
+	memset(coll, 0, sizeof coll);
+	over = 1;
+	for(i = 0; i < nrecomp; i++){
+		p = vals + (hash(myrecomp[i].key) % nelem(vals));
+		while(p->key != 0){	/* a zero key marks a free slot */
+			if(p->next == 0){
+				if(over > nelem(coll))
+					sysfatal("recomp collision table full");
+				p->next = over++;
+			}
+			p = coll + p->next - 1;
+		}
+		p->key = myrecomp[i].key;
+		p->val = myrecomp[i].val;
+	}
+	fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) + over-1) * (4+2+2));
+	fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t");
+	n = 0;
+	for(p = vals; p < vals + nelem(vals); p++)
+		putkv(fd, p, n++);
+	for(p = coll; p < coll + over - 1; p++)	/* empty when nothing collided */
+		putkv(fd, p, n++);
+	fprint(fd, "\n};\n");
+	fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals));
+}
+
+static void
+mktables(void)
+{
+ Param p;
+ int tofd, isfd, normfd, breakfd;
+ int size;
+
+ tofd = create("runetotypedata", OWRITE, 0664);
+ if(tofd < 0)
+ sysfatal("could not create runetotypedata: %r");
+ param(&p, 10, 7);
+ size = mklkup(tofd, "upper", mytoupper, &p);
+ fprint(2, "%s: %d\n", "upper", size);
+
+ size = mklkup(tofd, "lower", mytolower, &p);
+ fprint(2, "%s: %d\n", "lower", size);
+
+ size = mklkup(tofd, "title", mytotitle, &p);
+ fprint(2, "%s: %d\n", "title", size);
+ close(tofd);
+
+ isfd = create("runeistypedata", OWRITE, 0664);
+ if(isfd < 0)
+ sysfatal("could not create runeistypedata: %r");
+ param(&p, 11, 6);
+ size = mklkup(isfd, "merged", myismerged, &p);
+ fprint(2, "%s: %d\n", "merged", size);
+ fprint(isfd, "static\nenum {\n");
+ fprint(isfd, "\tL%s = %s,\n", "space", "1<<0");
+ fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1");
+ fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2");
+ fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3");
+ fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4");
+ fprint(isfd, "\tL%s = %s,\n", "title", "1<<5");
+ fprint(isfd, "};\n");
+ close(isfd);
+
+ normfd = create("runenormdata", OWRITE, 0664);
+ if(normfd < 0)
+ sysfatal("could not create runenormdata: %r");
+ param(&p, 10, 7);
+ size = mklkup(normfd, "decomp", mydecomp, &p);
+ fprint(2, "%s: %d\n", "decomp", size);
+
+ param(&p, 9, 7);
+ size = mklkup(normfd, "ccc", myccc, &p);
+ fprint(2, "%s: %d\n", "ccc", size);
+
+ mkexceptarr(normfd, "_decompexceptions", mydespecial, nspecial, 0);
+ mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1);
+ mkrecomp(normfd);
+ close(normfd);
+
+ param(&p, 10, 6);
+ breakfd = create("runebreakdata", OWRITE, 0644);
+ if(breakfd < 0)
+ sysfatal("could not create runebreakdata: %r");
+ size = mklkup(breakfd, "break", mybreak, &p);
+ fprint(2, "%s: %d\n", "break", size);
+}
+
+enum { /* positions of the ';'-separated fields in a UnicodeData.txt line */
+ FIELD_CODE,
+ FIELD_NAME,
+ FIELD_CATEGORY,
+ FIELD_COMBINING, /* parsed as a decimal number into myccc */
+ FIELD_BIDIR,
+ FIELD_DECOMP,
+ FIELD_DECIMAL_DIG,
+ FIELD_DIG,
+ FIELD_NUMERIC_VAL,
+ FIELD_MIRRORED,
+ FIELD_UNICODE_1_NAME,
+ FIELD_COMMENT,
+ FIELD_UPPER,
+ FIELD_LOWER,
+ FIELD_TITLE,
+ NFIELDS, /* getunicodeline insists on exactly this many fields */
+};
+
+static int
+getunicodeline(Biobuf *in, char **fields)
+{
+	char *line;
+	int nf;
+
+	line = Brdline(in, '\n');
+	if(line == nil)
+		return 0;	/* no line available */
+	line[Blinelen(in)-1] = '\0';	/* overwrite the newline delimiter */
+	nf = getfields(line, fields, NFIELDS + 1, 0, ";");
+	if(nf != NFIELDS)
+		sysfatal("bad number of fields");
+	return 1;
+}
+
+static int
+estrtoul(char *s, int base)
+{
+	char *end;
+	Rune r;
+
+	r = strtoul(s, &end, base);
+	if(end == s)	/* not a single digit was consumed */
+		sysfatal("bad code point hex string");
+	return r;
+}
+
+enum { /* runebreak.c carries a copy of these values */
+ OTHER,
+ Hebrew_Letter, Newline, Extend, Format, /* word break classes: low nibble of a mybreak entry */
+ Katakana, ALetter, MidLetter, MidNum,
+ MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+ PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40, /* grapheme break classes: high nibble */
+ L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+ EMOJIEX = 0xB0,
+};
+
+/* Fill mybreak from the UCD property files: word break class in the low nibble, grapheme break class in the high nibble. */
+static void
+markbreak(void)
+{
+	Biobuf *b;
+	char *p, *dot;
+	int i, s, e;
+	uchar v;
+
+	b = Bopen("/lib/ucd/WordBreakProperty.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load word breaks: %r");
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		if((dot = strstr(p, "..")) != nil){	/* "start..end ; Class" */
+			*dot = 0;
+			dot += 2;
+			s = estrtoul(p, 16);
+			e = estrtoul(dot, 16);
+		} else {	/* a single code point */
+			s = e = estrtoul(p, 16);
+			dot = p;
+		}
+		v = 0;	/* names are matched as substrings, so a longer name is tested before any name it contains */
+		if(strstr(dot, "ExtendNumLet") != nil)
+			v = ExtendNumLet;
+		else if(strstr(dot, "Hebrew_Letter") != nil)
+			v = Hebrew_Letter;
+		else if(strstr(dot, "Newline") != nil)
+			v = Newline;
+		else if(strstr(dot, "Extend") != nil)
+			v = Extend;
+		else if(strstr(dot, "Format") != nil)
+			v = Format;
+		else if(strstr(dot, "Katakana") != nil)
+			v = Katakana;
+		else if(strstr(dot, "ALetter") != nil)
+			v = ALetter;
+		else if(strstr(dot, "MidLetter") != nil)
+			v = MidLetter;
+		else if(strstr(dot, "MidNumLet") != nil)
+			v = MidNumLet;
+		else if(strstr(dot, "MidNum") != nil)
+			v = MidNum;
+		else if(strstr(dot, "Numeric") != nil)
+			v = Numeric;
+		else if(strstr(dot, "WSegSpace") != nil)
+			v = WSegSpace;
+		for(i = s; i <= e; i++)
+			mybreak[i] = v;
+	}
+	Bterm(b);
+	b = Bopen("/lib/ucd/GraphemeBreakProperty.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load Grapheme breaks: %r");
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		if((dot = strstr(p, "..")) != nil){
+			*dot = 0;
+			dot += 2;
+			s = estrtoul(p, 16);
+			e = estrtoul(dot, 16);
+		} else {
+			s = e = estrtoul(p, 16);
+			dot = p;
+		}
+		v = 0;	/* here the "; Name #" delimiters make each match exact */
+		if(strstr(dot, "; Prepend #") != nil)
+			v = PREPEND;
+		else if(strstr(dot, "; Control #") != nil)
+			v = CONTROL;
+		else if(strstr(dot, "; Extend #") != nil)
+			v = EXTEND;
+		else if(strstr(dot, "; Regional_Indicator #") != nil)
+			v = REGION;
+		else if(strstr(dot, "; SpacingMark #") != nil)
+			v = SPACEMK;
+		else if(strstr(dot, "; L #") != nil)
+			v = L;
+		else if(strstr(dot, "; V #") != nil)
+			v = V;
+		else if(strstr(dot, "; T #") != nil)
+			v = T;
+		else if(strstr(dot, "; LV #") != nil)
+			v = LV;
+		else if(strstr(dot, "; LVT #") != nil)
+			v = LVT;
+		for(i = s; i <= e; i++)
+			mybreak[i] |= v;	/* or the grapheme class in over the word class */
+	}
+	Bterm(b);
+
+	b = Bopen("/lib/ucd/emoji-data.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load emoji-data: %r");
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		if((dot = strstr(p, "..")) != nil){
+			*dot = 0;
+			dot += 2;
+			s = estrtoul(p, 16);
+			e = estrtoul(dot, 16);
+		} else {
+			s = e = estrtoul(p, 16);
+			dot = p;
+		}
+		v = 0;	/* only the Extended_Pictographic property is taken from this file */
+		if(strstr(dot, "; Extended_Pictographic") != nil)
+			v = EMOJIEX;
+		for(i = s; i <= e; i++)
+			mybreak[i] |= v;
+	}
+	Bterm(b);
+}
+
+static void
+markexclusions(void)
+{
+	Biobuf *b;
+	char *line;
+	int i, found;
+	uint x;
+
+	b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load composition exclusions: %r");
+
+	while((line = Brdline(b, '\n')) != nil){
+		line[Blinelen(b)-1] = 0;
+		if(line[0] == 0 || line[0] == '#')
+			continue;
+		/* each remaining line starts with the excluded rune in hex */
+		x = estrtoul(line, 16);
+		found = 0;
+		/* knock out the first 16-bit recomposition that yields x ... */
+		for(i = 0; i < nrecomp && !found; i++)
+			if(myrecomp[i].val == x){
+				myrecomp[i].val = 0;
+				found = 1;
+			}
+		/* ... and fall back to the wide list only if there was none */
+		for(i = 0; i < nrecompext && !found; i++)
+			if(recompext[i*3] == x){
+				recompext[i*3] = 0;
+				found = 1;
+			}
+	}
+	Bterm(b);
+}
+
+/* Read UnicodeData.txt, build every per-rune table, then write the generated data files. Arguments are unused. */
+void
+main(int, char**)
+{
+	static char myisspace[NRUNES];
+	static char myisalpha[NRUNES];
+	static char myisdigit[NRUNES];
+	static char myisupper[NRUNES];
+	static char myislower[NRUNES];
+	static char myistitle[NRUNES];
+	Biobuf *in;
+	char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
+	char *p, *d;
+	int i, code, last;
+	int decomp[2], *ip;
+	uchar b;
+
+	in = Bopen("/lib/ucd/UnicodeData.txt", OREAD);
+	if(in == nil)
+		sysfatal("can't open UnicodeData.txt: %r");
+	for(i = 0; i < NRUNES; i++){
+		mytoupper[i] = -1;	/* -1: no mapping seen yet */
+		mytolower[i] = -1;
+		mytotitle[i] = -1;
+		mydecomp[i] = 0;
+		myccc[i] = 0;
+		mybreak[i] = 0;
+	}
+
+	myisspace['\t'] = 1;
+	myisspace['\n'] = 1;
+	myisspace['\r'] = 1;
+	myisspace['\f'] = 1;
+	myisspace['\v'] = 1;
+	myisspace[0x85] = 1;	/* control char, "next line" */
+	myisspace[0xfeff] = 1;	/* zero-width non-break space */
+
+	last = -1;
+	nspecial = nrecomp = nrecompext = 0;
+	while(getunicodeline(in, fields)){
+		code = estrtoul(fields[FIELD_CODE], 16);
+		if (code >= NRUNES)
+			sysfatal("code-point value too big: %x", code);
+		if(code <= last)
+			sysfatal("bad code sequence: %x then %x", last, code);
+		last = code;
+
+		p = fields[FIELD_CATEGORY];
+		if(strstr(fields[FIELD_NAME], ", First>") != nil){
+			/* NOTE(review): fields still points into the Biobuf after this second read; confirm Brdline leaves it intact */
+			if(!getunicodeline(in, fields2))
+				sysfatal("range start at eof");
+			if (strstr(fields2[FIELD_NAME], ", Last>") == nil)
+				sysfatal("range start not followed by range end");
+			last = estrtoul(fields2[FIELD_CODE], 16);
+			if(last <= code)
+				sysfatal("range out of sequence: %x then %x", code, last);
+			if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
+				sysfatal("range with mismatched category");
+		}
+		d = fields[FIELD_DECOMP];
+		if(strlen(d) > 0 && strstr(d, "<") == nil){	/* entries containing '<' are skipped */
+			decomp[0] = estrtoul(d, 16);
+			d = strstr(d, " ");
+			if(d == nil){
+				/* singleton recompositions are verboden */
+				decomp[1] = 0;
+				if(decomp[0] > 0xFFFF){
+					ip = mydespecial + nspecial*3;
+					ip[0] = code;
+					ip[1] = decomp[0];
+					ip[2] = 0;
+					mydecomp[code] = (DSTART+nspecial)<<16;
+					nspecial++;
+				} else
+					mydecomp[code] = decomp[0]<<16;
+			} else {
+				d++;
+				decomp[1] = estrtoul(d, 16);
+				if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){
+					/* too wide to pack into 16+16 bits: use the side tables */
+					ip = mydespecial + nspecial*3;
+					ip[0] = code;
+					ip[1] = decomp[0];
+					ip[2] = decomp[1];
+					mydecomp[code] = (DSTART+nspecial)<<16;
+					nspecial++;
+					ip = recompext + nrecompext*3;
+					ip[0] = code;
+					ip[1] = decomp[0];
+					ip[2] = decomp[1];
+					nrecompext++;
+				} else {
+					mydecomp[code] = decomp[0]<<16 | decomp[1];
+					myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0};
+				}
+			}
+		}
+
+		for (; code <= last; code++){	/* one pass for a plain line, the whole span for a First/Last pair */
+			if(p[0] == 'L')
+				myisalpha[code] = 1;
+			if(p[0] == 'Z')
+				myisspace[code] = 1;
+
+			if(strcmp(p, "Lu") == 0)
+				myisupper[code] = 1;
+			if(strcmp(p, "Ll") == 0)
+				myislower[code] = 1;
+
+			if(strcmp(p, "Lt") == 0)
+				myistitle[code] = 1;
+
+			if(strcmp(p, "Nd") == 0)
+				myisdigit[code] = 1;
+
+			if(fields[FIELD_UPPER][0] != '\0')
+				mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16);
+
+			if(fields[FIELD_LOWER][0] != '\0')
+				mytolower[code] = estrtoul(fields[FIELD_LOWER], 16);
+
+			if(fields[FIELD_TITLE][0] != '\0')
+				mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16);
+
+			myccc[code] = estrtoul(fields[FIELD_COMBINING], 10);
+		}
+	}
+
+	Bterm(in);
+	markexclusions();
+
+	/*
+	 * according to standard, if totitle(x) is not defined in ucd
+	 * but toupper(x) is, then totitle is defined to be toupper(x)
+	 */
+	for(i = 0; i < NRUNES; i++){
+		if(mytotitle[i] == -1
+		&& mytoupper[i] != -1
+		&& !myistitle[i])
+			mytotitle[i] = mytoupper[i];
+	}
+
+	/*
+	 * A couple corrections:
+	 * is*(to*(x)) should be true.
+	 * restore undefined transformations.
+	 * store offset instead of value, makes them sparse.
+	 */
+	for(i = 0; i < NRUNES; i++){
+		if(mytoupper[i] != -1)
+			myisupper[mytoupper[i]] = 1;
+		else
+			mytoupper[i] = i;
+
+		if(mytolower[i] != -1)
+			myislower[mytolower[i]] = 1;
+		else
+			mytolower[i] = i;
+
+		if(mytotitle[i] != -1)
+			myistitle[mytotitle[i]] = 1;
+		else
+			mytotitle[i] = i;
+
+		mytoupper[i] = mytoupper[i] - i;
+		mytolower[i] = mytolower[i] - i;
+		mytotitle[i] = mytotitle[i] - i;
+	}
+
+	for(i = 0; i < NRUNES; i++){	/* bit positions match the L* enum written by mktables */
+		b = 0;
+		if(myisspace[i])
+			b |= 1<<0;
+		if(myisalpha[i])
+			b |= 1<<1;
+		if(myisdigit[i])
+			b |= 1<<2;
+		if(myisupper[i])
+			b |= 1<<3;
+		if(myislower[i])
+			b |= 1<<4;
+		if(myistitle[i])
+			b |= 1<<5;
+
+		myismerged[i] = b;
+	}
+
+	markbreak();
+	mktables();
+	exits(nil);
+}
--- /dev/null
+++ b/sys/src/libc/port/runebreak.c
@@ -1,0 +1,293 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runebreakdata"
+
+enum { /* class values as stored in the generated break table; mkrunetype.c carries the same list */
+ OTHER,
+ Hebrew_Letter, Newline, Extend, Format,
+ Katakana, ALetter, MidLetter, MidNum,
+ MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+ PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
+ L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+ EMOJIEX = 0xB0,
+
+ ZWJ = 0x200DU, /* these two are compared against runes, not classes */
+ LINETAB = 0xB,
+};
+
+#define IS(x, y) ((x&0xf) == y) /* test the word break class in the low nibble */
+#define ISG(x, y) ((x&0xf0) == y) /* test the grapheme break class in the high nibble */
+
+Rune* /* examines the first two runes of s: returns the position after the first if they may be split, a later position if they join, s if the string ends first */
+runegbreak(Rune *s)
+{
+ Rune l, r;
+ uchar lt, rt;
+ Rune *p;
+
+ p = s;
+ if((l = *p++) == 0)
+ return s;
+ if((r = *p) == 0)
+ return s;
+ lt = breaklkup(l);
+ rt = breaklkup(r);
+ if(l == '\r' && r == '\n') /* CR LF stays together */
+ goto Done;
+ if(ISG(lt, CONTROL) || l == '\r' || l == '\n') /* always break after a control or line end */
+ return p;
+ if(ISG(rt, CONTROL) || r == '\r' || r == '\n') /* and before one */
+ return p;
+ if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT))) /* the next three keep Hangul jamo sequences together */
+ goto Done;
+ if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
+ goto Done;
+ if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T))) /* NOTE(review): ISG(rt, T) is tested twice; the second test is redundant */
+ goto Done;
+ if(ISG(rt, SPACEMK) || ISG(lt, PREPEND)) /* no break before a spacing mark or after a prepend */
+ goto Done;
+ if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){ /* pictograph, any extends, ZWJ, pictograph */
+ while(ISG(rt, EXTEND)){ /* skip the run of extends */
+ p++;
+ if((r = *p) == 0)
+ return s;
+ rt = breaklkup(r);
+ }
+ if(r != ZWJ)
+ return p;
+ p++;
+ if((r = *p) == 0)
+ return s;
+ rt = breaklkup(r);
+ if(ISG(rt, EMOJIEX))
+ goto Done;
+ return p;
+ }
+ if(ISG(rt, EXTEND) || r == ZWJ) /* no break before an extend or ZWJ */
+ goto Done;
+ if(ISG(lt, REGION) && ISG(rt, REGION)) /* a pair of regional indicators */
+ goto Done;
+
+ return p;
+
+Done: /* the rune at p joins what precedes it: report the position after it, or s if it is the last */
+ if(p[1] == 0)
+ return s;
+ return p + 1;
+}
+
+char* /* UTF counterpart of runegbreak: same rules, runes decoded with chartorune */
+utfgbreak(char *s)
+{
+ Rune l, r;
+ uchar lt, rt;
+ char *p;
+
+ p = s;
+ p += chartorune(&l, p);
+ if(l == 0)
+ return s;
+ chartorune(&r, p); /* r is only peeked at: p stays on its first byte */
+ if(r == 0)
+ return s;
+ lt = breaklkup(l);
+ rt = breaklkup(r);
+ if(l == '\r' && r == '\n') /* CR LF stays together */
+ goto Done;
+ if(ISG(lt, CONTROL) || l == '\r' || l == '\n') /* always break after a control or line end */
+ return p;
+ if(ISG(rt, CONTROL) || r == '\r' || r == '\n') /* and before one */
+ return p;
+ if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT))) /* the next three keep Hangul jamo sequences together */
+ goto Done;
+ if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
+ goto Done;
+ if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T))) /* NOTE(review): ISG(rt, T) is tested twice; the second test is redundant */
+ goto Done;
+ if(ISG(rt, SPACEMK) || ISG(lt, PREPEND)) /* no break before a spacing mark or after a prepend */
+ goto Done;
+ if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){ /* pictograph, any extends, ZWJ, pictograph */
+ while(ISG(rt, EXTEND)){ /* skip the run of extends */
+ p += chartorune(&r, p);
+ chartorune(&r, p);
+ if(r == 0)
+ return s;
+ rt = breaklkup(r);
+ }
+ if(r != ZWJ)
+ return p;
+
+ p += chartorune(&r, p);
+ chartorune(&r, p);
+ if(r == 0)
+ return s;
+ rt = breaklkup(r);
+ if(ISG(rt, EMOJIEX))
+ goto Done;
+ return p;
+ }
+ if(ISG(rt, EXTEND) || r == ZWJ) /* no break before an extend or ZWJ */
+ goto Done;
+ if(ISG(lt, REGION) && ISG(rt, REGION)) /* a pair of regional indicators */
+ goto Done;
+
+ return p;
+
+Done: /* the rune at p joins what precedes it: step over it, or return s if it is the last */
+ p += chartorune(&r, p);
+ chartorune(&r, p);
+ if(r == 0)
+ return s;
+ return p;
+}
+
+#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter)) /* x is a class byte from breaklkup */
+#define MNLQ(x) (IS(x, MidNumLet) || x == '\'') /* NOTE(review): x == '\'' is only meaningful when x is a rune, not a class byte */
+
+/* Examine the first two runes of s: return the position after the first if a word break may fall between them, after the second if not, s if the string ends first. */
+Rune*
+runewbreak(Rune *s)
+{
+	Rune l, r;
+	uchar lt, rt;
+	Rune *p;
+
+	p = s;
+	if((l = *p++) == 0)
+		return s;
+	if((r = *p) == 0)
+		return s;
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	if(l == '\r' && r == '\n')
+		goto Done;
+	if(l == '\r' || l == '\n' || l == LINETAB)
+		return p;
+	if(r == '\r' || r == '\n' || r == LINETAB)	/* break before a line end as well as after one */
+		return p;
+	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
+		goto Done;
+	if(IS(rt, Format) || IS(rt, Extend))
+		goto Done;
+	if(AH(lt)){
+		if(AH(rt))
+			goto Done;
+		/* the apostrophe has no class of its own in the table, so it is matched as a rune */
+		if((IS(rt, MidLetter) || IS(rt, MidNumLet) || r == '\'') && p[1] != 0 && AH(breaklkup(p[1])))
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '\'')
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]), Hebrew_Letter))
+			goto Done;
+		if(IS(rt, Numeric))
+			goto Done;
+	}
+	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
+		goto Done;
+	if(IS(lt, Numeric) && (IS(rt, MidNum) || IS(rt, MidNumLet) || r == '\'') && p[1] != 0 && IS(breaklkup(p[1]), Numeric))
+		goto Done;
+	if(IS(lt, Katakana) && IS(rt, Katakana))
+		goto Done;
+	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
+	if(IS(rt, ExtendNumLet))
+		goto Done;
+	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
+		goto Done;
+	if(ISG(lt, REGION)){
+		if(ISG(rt, REGION))
+			goto Done;
+		if(r != ZWJ)
+			return p;
+		p++;
+		if((r = *p) == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, REGION))
+			goto Done;
+	}
+	return p;
+
+Done:	/* the rune at p joins what precedes it */
+	if(p[1] == 0)
+		return s;
+	return p + 1;
+}
+
+/* UTF counterpart of runewbreak: same rules and return convention, runes decoded with chartorune. */
+char*
+utfwbreak(char *s)
+{
+	Rune l, r, peek;
+	uchar lt, rt;
+	char *p;
+	int n;
+
+	p = s;
+	p += chartorune(&l, p);
+	if(l == 0)
+		return s;
+	n = chartorune(&r, p);
+	if(r == 0)
+		return s;
+	chartorune(&peek, p+n);	/* decoded only once r is known not to be the terminator */
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	if(l == '\r' && r == '\n')
+		goto Done;
+	if(l == '\r' || l == '\n' || l == LINETAB)
+		return p;
+	if(r == '\r' || r == '\n' || r == LINETAB)	/* break before a line end as well as after one */
+		return p;
+	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
+		goto Done;
+	if(IS(rt, Format) || IS(rt, Extend))
+		goto Done;
+	if(AH(lt)){
+		if(AH(rt))
+			goto Done;
+		if(IS(rt, MidLetter) || IS(rt, MidNumLet) || r == '\'')	/* the apostrophe is matched as a rune */
+		if(peek != 0 && AH(breaklkup(peek)))
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '\'')
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '"')
+		if(peek != 0 && IS(breaklkup(peek), Hebrew_Letter))
+			goto Done;
+
+		if(IS(rt, Numeric))
+			goto Done;
+	}
+	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
+		goto Done;
+	if(IS(lt, Numeric) && (IS(rt, MidNum) || IS(rt, MidNumLet) || r == '\'') && peek != 0 && IS(breaklkup(peek), Numeric))
+		goto Done;
+	if(IS(lt, Katakana) && IS(rt, Katakana))
+		goto Done;
+	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
+	if(IS(rt, ExtendNumLet))
+		goto Done;
+	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
+		goto Done;
+	if(ISG(lt, REGION)){
+		if(ISG(rt, REGION))
+			goto Done;
+		if(r != ZWJ)
+			return p;
+		p += chartorune(&r, p);
+		chartorune(&r, p);
+		if(r == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, REGION))
+			goto Done;
+	}
+
+	return p;
+
+Done:	/* the rune at p joins what precedes it: step over it, or return s if it is the last */
+	p += chartorune(&r, p);
+	chartorune(&r, p);
+	if(r == 0)
+		return s;
+	return p;
+}
--- /dev/null
+++ b/sys/src/libc/port/runeistype.c
@@ -1,0 +1,40 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runeistypedata"
+
+int	/* nonzero when the space bit is set for c */
+isspacerune(Rune c)
+{
+	return (mergedlkup(c) & Lspace) != 0;
+}
+
+int	/* nonzero when the alpha bit is set for c */
+isalpharune(Rune c)
+{
+	return (mergedlkup(c) & Lalpha) != 0;
+}
+
+int	/* nonzero when the digit bit is set for c */
+isdigitrune(Rune c)
+{
+	return (mergedlkup(c) & Ldigit) != 0;
+}
+
+int	/* nonzero when the upper bit is set for c */
+isupperrune(Rune c)
+{
+	return (mergedlkup(c) & Lupper) != 0;
+}
+
+int	/* nonzero when the lower bit is set for c */
+islowerrune(Rune c)
+{
+	return (mergedlkup(c) & Llower) != 0;
+}
+
+int	/* nonzero when the title bit is set for c */
+istitlerune(Rune c)
+{
+	return (mergedlkup(c) & Ltitle) != 0;
+}
--- /dev/null
+++ b/sys/src/libc/port/runenorm.c
@@ -1,0 +1,334 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runenormdata"
+
+//Unicode Standard: Section 3.12 Conjoining Jamo Behavior
+enum {
+ SBase = 0xAC00,	// first precomposed Hangul syllable
+ LBase = 0x1100,	// first leading-consonant jamo
+ VBase = 0x1161,	// first vowel jamo
+ TBase = 0x11A7,	// one below the first trailing-consonant jamo; offset 0 means "no trailing consonant"
+
+ LCount = 19,	// number of leading consonants
+ VCount = 21,	// number of vowels
+ TCount = 28,	// number of trailing consonants, counting the "none" slot
+ NCount = VCount * TCount,	// syllables per leading consonant
+ SCount = LCount * NCount,	// total precomposed syllables
+
+ LLast = LBase + LCount - 1,	// inclusive upper bounds of the ranges above
+ SLast = SBase + SCount - 1,
+ VLast = VBase + VCount - 1,
+ TLast = TBase + TCount - 1,
+};
+
+static void
+_runedecomp(Rune dst[2], Rune c)	/* one decomposition step of c, stored in dst[0] and dst[1] */
+{
+ uint x;
+
+ if(c >= SBase && c <= SLast){	/* Hangul syllables are split arithmetically, not through the table */
+ c -= SBase;
+ x = c % TCount;	/* trailing-consonant index, 0 when the syllable has none */
+ if(x){
+ dst[0] = SBase + ((c / TCount) * TCount);	/* LVT syllable -> LV syllable + T jamo */
+ dst[1] = TBase + x;
+ return;
+ }
+ dst[0] = LBase + (c / NCount);	/* LV syllable -> L jamo + V jamo */
+ dst[1] = VBase + ((c % NCount) / TCount);
+ return;
+ }
+ x = decomplkup(c);
+ if((x & 0xFFFF) != 0){	/* a nonzero low half means the entry packs a pair: high half first, low half second */
+ dst[0] = x>>16;
+ dst[1] = x & 0xFFFF;
+ return;
+ }
+ x >>= 16;
+ if(x >= 0xEEEE && x <0xF8FF){	/* values in this range are 0xEEEE-based indexes into _decompexceptions */
+ memmove(dst, _decompexceptions[x - 0xEEEE], sizeof(Rune)*2);
+ return;
+ }
+ dst[0] = x;	/* single-rune result; runenorm treats dst[0] == 0 as "no decomposition" */
+ dst[1] = 0;
+}
+
+static Rune
+_runerecomp(Rune r[2])	/* compose the pair r[0], r[1]; returns 0 when the pair does not compose */
+{
+ uint x, y, *p, next;
+
+ if(r[0] >= LBase && r[0] <= LLast){	/* L jamo + V jamo -> LV syllable */
+ if(r[1] < VBase || r[1] > VLast)
+ return 0;
+ x = (r[0] - LBase) * NCount + (r[1] - VBase) * TCount;
+ return SBase + x;
+ }
+ if(r[0] >= SBase && r[0] <= SLast && (r[0] - SBase) % TCount == 0){	/* LV syllable + T jamo -> LVT syllable */
+ if(r[1] > TBase && r[1] <= TLast)
+ return r[0] + (r[1] - TBase);
+ return 0;
+ }
+ if(r[0] > 0xFFFF || r[1] > 0xFFFF){	/* pair does not fit the packed 16+16 bit key: linear search instead */
+ for(x = 0; x < nelem(_recompexceptions); x++)
+ if(r[0] == _recompexceptions[x][1] && r[1] == _recompexceptions[x][2])
+ return _recompexceptions[x][0];
+ return 0;
+ }
+ y = x = r[0]<<16 | r[1];	/* y keeps the packed key, x is mixed into its hash below */
+ x ^= x >> 16;
+ x *= 0x21f0aaad;
+ x ^= x >> 15;
+ x *= 0xd35a2d97;
+ x ^= x >> 15;
+ p = _recompdata + (x%512)*2;	/* 512 buckets, each a {key, value} pair of words */
+ while(p[0] != y){
+ next = p[1]>>16;	/* high half: 1-based index of the next entry in _recompcoll, 0 ends the chain */
+ if(!next)
+ return 0;
+ p = _recompcoll + (next-1)*2;
+ }
+ return p[1] & 0xFFFF;	/* low half: the composed rune */
+}
+
+static void
+runecccsort(Rune *a, int len)
+{
+	Rune r;
+	int i, swapped;
+
+	/* Canonical ordering (UAX #15): bubble adjacent runes into ascending combining class; a starter (class 0) never moves left. */
+	do {
+		swapped = 0;
+		for(i = 0; i < len - 1; i++){
+			if(ccclkup(a[i+1]) > 0 && ccclkup(a[i]) > ccclkup(a[i+1])){
+				r = a[i];
+				a[i] = a[i+1];
+				a[i+1] = r;
+				swapped = 1;
+			}
+		}
+	} while(swapped);
+}
+
+char*
+fullutfnorm(char *s, int n)
+{
+	Rune r, peek;
+	char *p, *p2;
+	/* Returns s when the n bytes at s may not yet hold a whole normalization context, else a pointer further into s. */
+	p = s;
+	if(fullrune(p, n) == 0)
+		return s;
+
+	p += chartorune(&r, p);
+	n -= (p - s);
+	/* a Hangul L jamo or syllable is extended by any run of V and T jamo */
+	if((r >= LBase && r <= LLast) || (r >= SBase && r <= SLast)){
+		do {
+			if(fullrune(p, n) == 0)
+				return s;
+			p2 = p + chartorune(&peek, p);
+			n -= (p2 - p);
+			p = p2;
+		} while(n > 0 && ((peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast)));
+		if(n <= 0)
+			return s;
+		return p;
+	}
+	/* NOTE(review): p is returned past the rune that ended the scan, while fullrunenorm returns a pointer to it; confirm which is intended */
+	do {
+		if(fullrune(p, n) == 0)
+			return s;
+		p2 = p + chartorune(&peek, p);
+		n -= (p2 - p);
+		p = p2;
+		if(ccclkup(peek) == 0)
+			return p;
+	} while(n > 0);
+
+	return s;
+}
+
+Rune*
+fullrunenorm(Rune *r, int n)
+{
+	Rune *e, *p;
+	/* Returns r when the n runes at r may not yet hold a whole normalization context, else a pointer to the rune starting the next one. */
+	p = r;
+	e = p + n;
+	if(n <= 0)
+		return r;
+	if((*p >= LBase && *p <= LLast) || (*p >= SBase && *p <= SLast)){
+		p++;
+		/* the bound must guard both jamo ranges, otherwise *p is read past e */
+		while(p < e && ((*p >= VBase && *p <= VLast) || (*p > TBase && *p <= TLast)))
+			p++;
+		if(p >= e)
+			return r;
+		return p;
+	}
+
+	for(; p + 1 < e; p++)
+		if(ccclkup(p[1]) == 0)
+			return p + 1;
+	return r;
+}
+
+static int
+runenorm(Rune *dst, Rune *src, char *sdst, char *ssrc, int max, int compose)
+{
+	Rune c, r[2], _stack[32];
+	Rune *p, *stack, *sp, *tp;
+	char *strp, *strstop;
+	Rune *rp, *rrp;
+	Rune *stop;
+	Rune peek;
+	int w, w2, size;
+	int mode;
+	/* Normalize src into dst (Rune mode, src != nil) or ssrc into sdst (UTF-8 mode); compose != 0 recomposes. Returns the size of the full result even when max truncated what was stored. */
+	if(src){
+		mode = 1;
+		p = src;
+		stop = dst + (max - 1);
+		strp = "";
+		strstop = nil;
+	} else {
+		mode = 0;
+		p = L"";
+		stop = nil;
+		strp = ssrc;
+		strstop = sdst + (max - 1);
+	}
+	/* work buffer: the decomposition of the current rune grows down from the middle (sp), following runes grow up from it (tp) */
+	stack = _stack + nelem(_stack)/2;
+	size = 0;
+	w = w2 = 0;
+	while(*strp || *p){
+		if(mode)
+			c = *p;
+		else
+			w = chartorune(&c, strp);
+		/* decompose c repeatedly, pushing each second half so the pieces end up in order at sp */
+		sp = stack - 1;
+		tp = stack;
+		_runedecomp(r, c);
+		while(r[0] != 0){
+			c = r[0];
+			if(r[1] != 0){
+				*sp-- = r[1];
+				if(sp == _stack)
+					break;
+			}
+			_runedecomp(r, c);
+		}
+
+		*sp = c;
+		if(mode)
+			peek = p[1];
+		else
+			w2 = chartorune(&peek, strp+w);
+		/* a Hangul L jamo or syllable takes the following V and T jamo with it */
+		if((*sp >= LBase && *sp <= LLast) || (*sp >= SBase && *sp <= SLast)){
+			while(peek != 0 && ((peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast))){
+				*tp++ = peek;
+				if(mode){
+					p++;
+					peek = p[1];
+				} else {
+					strp += w;
+					w = w2;
+					w2 = chartorune(&peek, strp+w);
+				}
+				if(tp == _stack + nelem(_stack))
+					break;
+			}
+		}
+		while(tp < _stack + nelem(_stack) && peek != 0 && ccclkup(peek) != 0){	/* tp may already be at the end after the jamo loop; never store past _stack */
+			_runedecomp(r, peek);
+			if(r[1] != 0){
+				if(tp+1 >= _stack + nelem(_stack))
+					break;
+				*tp++ = r[0];
+				*tp++ = r[1];
+			} else if(r[0] != 0)
+				*tp++ = r[0];
+			else
+				*tp++ = peek;
+
+			if(mode){
+				p++;
+				peek = p[1];
+			} else {
+				strp += w;
+				w = w2;
+				w2 = chartorune(&peek, strp+w);
+			}
+			if(tp == _stack + nelem(_stack))
+				break;
+		}
+		runecccsort(sp, tp - sp);
+
+		if(compose && ccclkup(*sp) == 0){
+			for(rp = sp + 1; rp < tp; rp++){
+				r[0] = *sp;
+				r[1] = *rp;
+				c = _runerecomp(r);
+				if(c != 0){
+					*sp = c;	/* the consumed slot is closed by shifting the prefix right and advancing sp */
+					for(rrp = rp; rrp > sp; rrp--)
+						*rrp = rrp[-1];
+					sp++;
+				} else while(rp + 1 < tp && ccclkup(*rp) == ccclkup(*(rp+1)))
+					rp++;
+			}
+		}
+
+		for(; sp < tp; sp++){
+			if(mode){
+				if(dst < stop)
+					*dst++ = *sp;
+				size++;
+			} else {
+				w2 = runelen(*sp);
+				if(sdst+w2 < strstop)
+					sdst += runetochar(sdst, sp);
+				size += w2;
+			}
+		}
+		if(mode)
+			p++;
+		else
+			strp += w;
+	}
+	if(mode)
+		*dst = 0;
+	else
+		*sdst = 0;
+	return size;
+}
+
+int
+runecomp(Rune *dst, Rune *src, int max)	/* Rune strings, with recomposition; returns runenorm's size */
+{
+ return runenorm(dst, src, nil, nil, max, 1);
+}
+
+int
+runedecomp(Rune *dst, Rune *src, int max)	/* Rune strings, decomposition only; returns runenorm's size */
+{
+ return runenorm(dst, src, nil, nil, max, 0);
+}
+
+int
+utfcomp(char *dst, char *src, int max)	/* UTF-8 strings, with recomposition; returns runenorm's size in bytes */
+{
+ return runenorm(nil, nil, dst, src, max, 1);
+}
+
+int
+utfdecomp(char *dst, char *src, int max)	/* UTF-8 strings, decomposition only; returns runenorm's size in bytes */
+{
+ return runenorm(nil, nil, dst, src, max, 0);
+}
--- /dev/null
+++ b/sys/src/libc/port/runetotype.c
@@ -1,0 +1,22 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runetotypedata"
+
+Rune
+toupperrune(Rune c)	/* upperlkup(c) is an offset added to c; presumably 0 when c has no uppercase form */
+{
+ return c + upperlkup(c);
+}
+
+Rune
+tolowerrune(Rune c)	/* lowerlkup(c) is an offset added to c; presumably 0 when c has no lowercase form */
+{
+ return c + lowerlkup(c);
+}
+
+Rune
+totitlerune(Rune c)	/* titlelkup(c) is an offset added to c; presumably 0 when c has no titlecase form */
+{
+ return c + titlelkup(c);
+}
--- a/sys/src/libc/test/mkfile
+++ b/sys/src/libc/test/mkfile
@@ -3,6 +3,14 @@
TEST=\
date\
pow\
+ runebreak\
+ runenorm\
strchr\
</sys/src/cmd/mktest
+
+/lib/ucd/%:	# hand any missing data file to the mkfile in /lib/ucd
+ cd /lib/ucd && mk $stem
+
+runebreak.test: /lib/ucd/GraphemeBreakTest.txt /lib/ucd/WordBreakTest.txt
+runenorm.test: /lib/ucd/NormalizationTest.txt
--- /dev/null
+++ b/sys/src/libc/test/runebreak.c
@@ -1,0 +1,112 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+static int
+estrtoul(char *s)	/* parse s as a hexadecimal code point; fatal when s has no leading hex digits */
+{
+	Rune value;
+	char *end;
+
+	value = strtoul(s, &end, 16);
+	if(end == s)
+		sysfatal("bad code point hex string");
+	return value;
+}
+
+static Rune*
+check(Rune *r, Rune* (*fn)(Rune*), char* (*fn2)(char*))
+{
+	Rune *r2, *tmp;
+	char *p, *p2;
+	/* Run the Rune breaker fn on r and the UTF-8 breaker fn2 on the same text, report a disagreement, and return fn's result. */
+	p = smprint("%S", r);
+	r2 = fn(r);
+	p2 = fn2(p);
+	/* tmp holds the UTF-8 segment as runes; it must match r[0..r2-r) in length and content (memcmp counts bytes, not runes) */
+	tmp = runesmprint("%.*s", (int)(p2-p), p);
+	if(runestrlen(tmp) != r2-r || memcmp(r, tmp, (r2-r)*sizeof(Rune)) != 0)
+		print("utf mismatch\n");
+
+	free(p);
+	free(tmp);
+	return r2;
+}
+
+static void
+run(char *file, Rune* (*fn)(Rune*), char* (*fn2)(char*))
+{
+	/* Replay every line of a Unicode break test file against fn and fn2, printing each line where the breaks disagree. */
+	Biobuf *b;
+	char *p, *dot, *line;
+	char *pieces[16];
+	int i, j, n, nstack, nops;
+	Rune stack[nelem(pieces)+1], ops[nelem(pieces)+1];	/* +1 leaves room for the terminator after a full line */
+	Rune r, *rp, *rp2;
+
+	b = Bopen(file, OREAD);
+	if(b == nil)
+		sysfatal("could not open %s: %r", file);
+
+	for(;(p = Brdline(b, '\n')) != nil; free(line)){
+		p[Blinelen(b)-1] = 0;
+		line = strdup(p);	/* untouched copy for messages; getfields chops p up */
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		if((dot = strstr(p, "#")) != nil)
+			*dot = 0;
+		n = getfields(p, pieces, nelem(pieces), 0, " ");
+		nstack = nops = 0;
+		for(i = 0; i < n; i++){	/* split the line into code points (stack) and break marks (ops) */
+			chartorune(&r, pieces[i]);
+			if(r != L'÷' && r != L'×'){
+				r = estrtoul(pieces[i]);
+				stack[nstack++] = r;
+				stack[nstack] = 0;
+			} else {
+				ops[nops++] = r;
+				ops[nops] = 0;
+			}
+		}
+
+		rp = stack;
+		for(i = 1; i < nops-1;){	/* ops[0] and the last op are skipped; ops[i] sits between rp[0] and rp[1] */
+			rp2 = check(rp, fn, fn2);
+			switch(ops[i]){
+			case L'÷':
+				if(rp2 != rp+1){
+					print("break fail %X %X || %s\n", rp[0], rp[1], line);
+					goto Break;
+				}
+				rp++;
+				i++;
+				break;
+			case L'×':
+				if(rp2 - rp == 0){	/* breaker found no further break: every remaining mark must be × */
+					for(j = i; j < nops - 1; j++)
+						if(ops[j] != L'×')
+							print("skipped %d %d %s\n", i, nops, line);
+					goto Break;
+				}
+				for(; rp < (rp2-1); rp++, i++){
+					if(ops[i] != L'×')
+						print("skipped %d %d %s\n", i, nops, line);
+				}
+				rp = rp2;
+				i++;
+				break;
+			}
+		}
+Break:
+		;
+	}
+	Bterm(b);
+}
+
+void
+main(int, char**)	/* check the grapheme and word breakers against the Unicode test files */
+{
+	run("/lib/ucd/GraphemeBreakTest.txt", runegbreak, utfgbreak);
+	run("/lib/ucd/WordBreakTest.txt", runewbreak, utfwbreak);
+	exits(nil);
+}
--- /dev/null
+++ b/sys/src/libc/test/runenorm.c
@@ -1,0 +1,92 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+static int
+estrtoul(char *s)	/* parse s as a hexadecimal code point; fatal when s has no leading hex digits */
+{
+	Rune value;
+	char *end;
+
+	value = strtoul(s, &end, 16);
+	if(end == s)
+		sysfatal("bad code point hex string");
+	return value;
+}
+
+void
+main(int, char**)
+{
+	/* For each NormalizationTest.txt line, check the Rune and UTF-8 normalizers against the expected NFC (field 2) and NFD (field 3). */
+	Rune buffer1[64], buffer2[64];
+	char utfbuff1[128], utfbuff2[128];
+	char srctmp[128], tmp1[128], tmp2[128];
+	char *fields[10];
+	char *runes[32];
+	char *p;
+	int n, n2, i;
+	uint fail;
+	Biobuf *b;
+
+	b = Bopen("/lib/ucd/NormalizationTest.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not open /lib/ucd/NormalizationTest.txt: %r");
+
+	struct {
+		Rune src[nelem(runes)+1];	/* +1 leaves room for the terminator after a full set of fields */
+		Rune nfc[nelem(runes)+1];
+		Rune nfd[nelem(runes)+1];
+	} test;
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#' || p[0] == '@')
+			continue;
+		if(getfields(p, fields, 6 + 1, 0, ";") < 3)	/* fields[1] and fields[2] are read below */
+			sysfatal("malformed test line");
+		n = getfields(fields[0], runes, nelem(runes), 0, " ");
+		for(i = 0; i < n; i++)
+			test.src[i] = estrtoul(runes[i]);
+		test.src[i] = 0;
+
+		n = getfields(fields[1], runes, nelem(runes), 0, " ");
+		for(i = 0; i < n; i++)
+			test.nfc[i] = estrtoul(runes[i]);
+		test.nfc[i] = 0;
+
+		n = getfields(fields[2], runes, nelem(runes), 0, " ");
+		for(i = 0; i < n; i++)
+			test.nfd[i] = estrtoul(runes[i]);
+		test.nfd[i] = 0;
+
+		n = runecomp(buffer1, test.src, nelem(buffer1));
+		n2 = runedecomp(buffer2, test.src, nelem(buffer2));
+		fail = 0;
+
+		if(runestrcmp(buffer1, test.nfc) != 0)
+			fail |= 1<<0;
+		if(runestrcmp(buffer2, test.nfd) != 0)
+			fail |= 1<<1;
+		if(fail)
+			print("%d %d %S %S %S %S %S\n", fail, i, test.src, test.nfd, test.nfc, buffer2, buffer1);
+		assert(n == runestrlen(test.nfc));
+		assert(n2 == runestrlen(test.nfd));
+
+		snprint(srctmp, sizeof srctmp, "%S", test.src);
+		snprint(tmp1, sizeof tmp1, "%S", test.nfc);
+		snprint(tmp2, sizeof tmp2, "%S", test.nfd);
+
+		n = utfcomp(utfbuff1, srctmp, nelem(utfbuff1));
+		n2 = utfdecomp(utfbuff2, srctmp, nelem(utfbuff2));
+
+		if(strcmp(utfbuff1, tmp1) != 0)
+			fail |= 1<<2;
+		if(strcmp(utfbuff2, tmp2) != 0)
+			fail |= 1<<3;
+		if(fail)
+			print("%d %d %s %s %s %s %s\n", fail, i, srctmp, tmp2, tmp1, utfbuff2, utfbuff1);
+		assert(n == strlen(tmp1));
+		assert(n2 == strlen(tmp2));
+	}
+	Bterm(b);
+	exits(nil);
+}