ref: edca217bb99f7c32413c117239d12acdc223e811
parent: 7388792a124756a528666cb5c375ee919db9ca11
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Sat May 10 20:54:59 EDT 2014
tcs: handle surrogate pairs
--- a/sys/src/cmd/tcs/conv_big5.c
+++ b/sys/src/cmd/tcs/conv_big5.c
@@ -82,7 +82,7 @@
}
void
-big5_in(int fd, long *notused, struct convert *out)
+big5_in(int fd, long *, struct convert *out)
{
Rune ob[N];
Rune *r, *re;
@@ -90,7 +90,6 @@
int n, i;
long nin;
- USED(notused);
r = ob;
re = ob+N-3;
nin = 0;
@@ -114,7 +113,7 @@
}
void
-big5_out(Rune *base, int n, long *notused)
+big5_out(Rune *base, int n, long *)
{
char *p;
int i;
@@ -121,7 +120,6 @@
Rune r;
static int first = 1;
- USED(notused);
if(first){
first = 0;
for(i = 0; i < NRUNE; i++)
--- a/sys/src/cmd/tcs/conv_gb.c
+++ b/sys/src/cmd/tcs/conv_gb.c
@@ -60,7 +60,7 @@
}
void
-gb_in(int fd, long *notused, struct convert *out)
+gb_in(int fd, long *, struct convert *out)
{
Rune ob[N];
Rune *r, *re;
@@ -68,7 +68,6 @@
int n, i;
long nin;
- USED(notused);
r = ob;
re = ob+N-3;
nin = 0;
@@ -92,7 +91,7 @@
}
void
-gb_out(Rune *base, int n, long *notused)
+gb_out(Rune *base, int n, long *)
{
char *p;
int i;
@@ -99,7 +98,6 @@
Rune r;
static int first = 1;
- USED(notused);
if(first){
first = 0;
for(i = 0; i < NRUNE; i++)
--- a/sys/src/cmd/tcs/conv_gbk.c
+++ b/sys/src/cmd/tcs/conv_gbk.c
@@ -51,7 +51,7 @@
}
void
-gbk_in(int fd, long *notused, struct convert *out)
+gbk_in(int fd, long *, struct convert *out)
{
Rune ob[N];
Rune *r, *re;
@@ -59,7 +59,6 @@
int n, i;
long nin;
- USED(notused);
r = ob;
re = ob+N-3;
nin = 0;
@@ -84,7 +83,7 @@
void
-gbk_out(Rune *base, int n, long *notused)
+gbk_out(Rune *base, int n, long *)
{
char *p;
int i;
@@ -91,7 +90,6 @@
Rune r;
static int first = 1;
- USED(notused);
if(first){
first = 0;
for(i = 0; i < NRUNE; i++)
--- a/sys/src/cmd/tcs/conv_jis.c
+++ b/sys/src/cmd/tcs/conv_jis.c
@@ -367,30 +367,26 @@
}
void
-jis_in(int fd, long *notused, struct convert *out)
+jis_in(int fd, long *, struct convert *out) /* forwards fd and out to do_in with alljis; the unnamed long * argument is ignored */
{
- USED(notused);
do_in(fd, alljis, out);
}
void
-ujis_in(int fd, long *notused, struct convert *out)
+ujis_in(int fd, long *, struct convert *out) /* forwards fd and out to do_in with ujis; the unnamed long * argument is ignored */
{
- USED(notused);
do_in(fd, ujis, out);
}
void
-msjis_in(int fd, long *notused, struct convert *out)
+msjis_in(int fd, long *, struct convert *out) /* forwards fd and out to do_in with ms; the unnamed long * argument is ignored */
{
- USED(notused);
do_in(fd, ms, out);
}
void
-jisjis_in(int fd, long *notused, struct convert *out)
+jisjis_in(int fd, long *, struct convert *out) /* forwards fd and out to do_in with jis; the unnamed long * argument is ignored */
{
- USED(notused);
do_in(fd, jis, out);
}
@@ -417,7 +413,7 @@
/* jis-kanji, or ISO 2022-JP */
void
-jisjis_out(Rune *base, int n, long *notused)
+jisjis_out(Rune *base, int n, long *)
{
char *p;
int i;
@@ -424,7 +420,6 @@
Rune r;
static enum { ascii, japan646, jp2022 } state = ascii;
- USED(notused);
if(first)
tab_init();
nrunes += n;
@@ -462,13 +457,12 @@
/* ms-kanji, or Shift-JIS */
void
-msjis_out(Rune *base, int n, long *notused)
+msjis_out(Rune *base, int n, long *)
{
char *p;
int i, hi, lo;
Rune r;
- USED(notused);
if(first)
tab_init();
nrunes += n;
@@ -501,13 +495,12 @@
/* ujis, or EUC */
void
-ujis_out(Rune *base, int n, long *notused)
+ujis_out(Rune *base, int n, long *)
{
char *p;
int i;
Rune r;
- USED(notused);
if(first)
tab_init();
nrunes += n;
--- a/sys/src/cmd/tcs/conv_ksc.c
+++ b/sys/src/cmd/tcs/conv_ksc.c
@@ -81,7 +81,7 @@
}
void
-uksc_in(int fd, long *notused, struct convert *out)
+uksc_in(int fd, long *, struct convert *out)
{
Rune ob[N];
Rune *r, *re;
@@ -89,7 +89,6 @@
int n, i;
long nin;
- USED(notused);
r = ob;
re = ob+N-3;
nin = 0;
@@ -113,7 +112,7 @@
}
void
-uksc_out(Rune *base, int n, long *notused)
+uksc_out(Rune *base, int n, long *)
{
char *p;
int i;
@@ -121,7 +120,6 @@
long l;
static int first = 1;
- USED(notused);
if(first){
first = 0;
for(i = 0; i < NRUNE; i++)
--- a/sys/src/cmd/tcs/hdr.h
+++ b/sys/src/cmd/tcs/hdr.h
@@ -19,6 +19,7 @@
typedef void (*Infn)(int, long *, struct convert *);
typedef void (*Outfn)(Rune *, int, long *);
void outtable(Rune *, int, long *);
+int fixsurrogate(Rune *rp, Rune r2);
void utf_in(int, long *, struct convert *);
void utf_out(Rune *, int, long *);
@@ -41,6 +42,5 @@
#define EXIT(n,s) exits(s)
#else
#define EPR fprintf(stderr,
-#define USED(x) /* in plan 9, USED(x) tells the compiler to treat x as used */
#define EXIT(n,s) exit(n)
#endif
--- a/sys/src/cmd/tcs/html.c
+++ b/sys/src/cmd/tcs/html.c
@@ -2141,24 +2141,22 @@
}
void
-html_in(int fd, long *x, struct convert *out)
+html_in(int fd, long *, struct convert *out)
{
char buf[100], *p;
Biobuf b;
- Rune rbuf[N];
- Rune *r, *er;
+ Rune *r, *er, r2;
int c, s, i;
- USED(x);
-
html_init();
- r = rbuf;
- er = rbuf+N;
+ r = runes;
+ er = runes+N;
+ r2 = 0;
Binit(&b, fd, OREAD);
while((c = Bgetrune(&b)) != Beof){
if(r >= er){
- OUT(out, rbuf, r-rbuf);
- r = rbuf;
+ OUT(out, runes, r-runes);
+ r = runes;
}
if(c == '&'){
s = 0;
@@ -2185,7 +2183,7 @@
c = strtol(buf+3, &p, 16);
else
c = strtol(buf+2, &p, 10);
- if(*p || c >= NRUNE || c < 0)
+ if(*p || c < 0)
goto bad;
goto out;
}
@@ -2196,10 +2194,11 @@
for(p=buf; p<buf+i; ){
p += chartorune(r++, p);
if(r >= er){
- OUT(out, rbuf, r-rbuf);
- r = rbuf;
+ OUT(out, runes, r-runes);
+ r = runes;
}
}
+ r2 = 0;
continue;
out:
if((c & 0x7f) == c && strchr("<>&\"'", c)){
@@ -2207,12 +2206,18 @@
i = sprint(buf, "&%s", findbyrune(c));
goto bad;
}
+ }
+ *r = c;
+ if(fixsurrogate(r, r2)){
+ r2 = *r;
+ continue;
}
- *r++ = c;
+ r2 = 0;
+ r++;
}
- if(r > rbuf)
- OUT(out, rbuf, r-rbuf);
- OUT(out, rbuf, 0);
+ if(r > runes)
+ OUT(out, runes, r-runes);
+ OUT(out, runes, 0);
}
/*
@@ -2219,13 +2224,12 @@
* use biobuf because can use more than UTFmax bytes per rune
*/
void
-html_out(Rune *r, int n, long *x)
+html_out(Rune *r, int n, long *)
{
char *s;
Biobuf b;
Rune *er;
- USED(x);
html_init();
Binit(&b, 1, OWRITE);
er = r+n;
--- a/sys/src/cmd/tcs/tcs.c
+++ b/sys/src/cmd/tcs/tcs.c
@@ -73,7 +73,6 @@
break;
} ARGEND
- USED(argc);
if(verbose)
squawk = 1;
if(listem){
@@ -214,49 +213,63 @@
}
void
-unicode_in_be(int fd, long *notused, struct convert *out)
+unicode_in_be(int fd, long *, struct convert *out) /* read big-endian 16-bit units from fd, join surrogate pairs, hand the runes to out */
{
- int i, n;
- Rune buf[N], r;
- uchar *p;
+ uchar buf[2*N], *p, *e; /* raw input bytes, two per 16-bit unit, so at most N runes per read; p walks them up to e */
+ Rune *r, r2; /* r: write position in runes; r2: surrogate held from the previous unit, 0 if none */
+ int n;
- USED(notused);
- while((n = cread(fd, (char *)buf, 2*N, 2)) > 0){
- /* go backwards as sizeof(Rune) >= 2 */
- p = (uchar*)buf + n;
+ r2 = 0; /* set once, outside the read loop, so a pair split across two reads is still joined */
+ while((n = cread(fd, (char*)buf, 2*N, 2)) > 0){
ninput += n;
- n /= 2;
- for(i=n-1; i>=0; i--){
- r = *(--p);
- r |= *(--p) << 8;
- buf[i] = r;
+ p = buf;
+ e = buf + n;
+ r = runes; /* runes is declared outside this hunk; it is refilled from its start on every read */
+ while(p < e){ /* takes two bytes per pass; assumes n is even (cread's last argument is 2) - TODO confirm */
+ *r = *p++ << 8; /* first byte is the high-order half */
+ *r |= *p++;
+ if(fixsurrogate(r, r2)){ /* nonzero: *r is a surrogate with no partner yet */
+ r2 = *r; /* hold it; r is not advanced, so an unpaired surrogate is overwritten and never emitted */
+ continue;
+ }
+ r2 = 0; /* *r is final: an ordinary rune or a joined pair */
+ r++;
}
- OUT(out, buf, n);
+ if(r > runes){ /* skip the call when this read produced no complete rune */
+ OUT(out, runes, r-runes);
+ }
}
- OUT(out, buf, 0);
+ OUT(out, runes, 0); /* closing zero-length call once input is exhausted */
}
void
-unicode_in_le(int fd, long *notused, struct convert *out)
+unicode_in_le(int fd, long *, struct convert *out) /* read little-endian 16-bit units from fd, join surrogate pairs, hand the runes to out */
{
- int i, n;
- Rune buf[N], r;
- uchar *p;
+ uchar buf[2*N], *p, *e; /* raw input bytes, two per 16-bit unit, so at most N runes per read; p walks them up to e */
+ Rune *r, r2; /* r: write position in runes; r2: surrogate held from the previous unit, 0 if none */
+ int n;
- USED(notused);
- while((n = cread(fd, (char *)buf, 2*N, 2)) > 0){
- /* go backwards as sizeof(Rune) >= 2 */
- p = (uchar*)buf + n;
+ r2 = 0; /* set once, outside the read loop, so a pair split across two reads is still joined */
+ while((n = cread(fd, (char*)buf, 2*N, 2)) > 0){
ninput += n;
- n /= 2;
- for(i=n-1; i>=0; i--){
- r = *(--p) << 8;
- r |= *(--p);
- buf[i] = r;
+ p = buf;
+ e = buf + n;
+ r = runes; /* runes is declared outside this hunk; it is refilled from its start on every read */
+ while(p < e){ /* takes two bytes per pass; assumes n is even (cread's last argument is 2) - TODO confirm */
+ *r = *p++; /* first byte is the low-order half */
+ *r |= *p++ << 8;
+ if(fixsurrogate(r, r2)){ /* nonzero: *r is a surrogate with no partner yet */
+ r2 = *r; /* hold it; r is not advanced, so an unpaired surrogate is overwritten and never emitted */
+ continue;
+ }
+ r2 = 0; /* *r is final: an ordinary rune or a joined pair */
+ r++;
}
- OUT(out, buf, n);
+ if(r > runes){ /* skip the call when this read produced no complete rune */
+ OUT(out, runes, r-runes);
+ }
}
- OUT(out, buf, 0);
+ OUT(out, runes, 0); /* closing zero-length call once input is exhausted */
}
void
@@ -284,41 +297,57 @@
}
void
-unicode_out_be(Rune *base, int n, long *notused)
+unicode_out_be(Rune *base, int n, long *) /* write n runes to fd 1 as big-endian UTF-16, encoding in place over base */
{
int i;
uchar *p;
- Rune r;
+ unsigned long r; /* current rune value; must hold more than 16 bits for the surrogate arithmetic */
- USED(notused);
p = (uchar*)base;
for(i=0; i<n; i++){
r = base[i];
- *p++ = r>>8;
- *p++ = r;
+ if(r > 0xFFFF){ /* above 16 bits: emit a surrogate pair, 4 bytes; values above 0x10FFFF are not rejected here */
+ r -= 0x10000; /* offset whose upper 10 bits go to the high surrogate, lower 10 to the low one */
+ *p++ = ((r>>18)&3) + 0xD8; /* high surrogate, high byte: 0xD8 plus offset bits 19-18 */
+ *p++ = r>>10; /* high surrogate, low byte: offset bits 17-10 */
+ *p++ = ((r>>8)&3) + 0xDC; /* low surrogate, high byte: 0xDC plus offset bits 9-8 */
+ *p++ = r; /* low surrogate, low byte: offset bits 7-0 */
+ } else { /* NOTE(review): in-place output relies on at most sizeof(Rune) bytes per rune, i.e. Rune is 4 bytes wide if it can exceed 0xFFFF - confirm */
+ *p++ = r>>8; /* single 16-bit unit, high byte first */
+ *p++ = r;
+ }
}
nrunes += n;
- noutput += 2*n;
- write(1, (char *)base, 2*n);
+ n = p - (uchar*)base; /* n is reused as the byte count (2 or 4 per rune); nrunes above already took the rune count */
+ noutput += n;
+ write(1, (char *)base, n); /* return value is not checked */
}
void
-unicode_out_le(Rune *base, int n, long *notused)
+unicode_out_le(Rune *base, int n, long *) /* write n runes to fd 1 as little-endian UTF-16, encoding in place over base */
{
int i;
uchar *p;
- Rune r;
+ unsigned long r; /* current rune value; must hold more than 16 bits for the surrogate arithmetic */
- USED(notused);
p = (uchar*)base;
for(i=0; i<n; i++){
r = base[i];
- *p++ = r;
- *p++ = r>>8;
+ if(r > 0xFFFF){ /* above 16 bits: emit a surrogate pair, 4 bytes; values above 0x10FFFF are not rejected here */
+ r -= 0x10000; /* offset whose upper 10 bits go to the high surrogate, lower 10 to the low one */
+ *p++ = r>>10; /* high surrogate, low byte first: offset bits 17-10 */
+ *p++ = ((r>>18)&3) + 0xD8; /* high surrogate, high byte: 0xD8 plus offset bits 19-18 */
+ *p++ = r; /* low surrogate, low byte first: offset bits 7-0 */
+ *p++ = ((r>>8)&3) + 0xDC; /* low surrogate, high byte: 0xDC plus offset bits 9-8 */
+ } else { /* NOTE(review): in-place output relies on at most sizeof(Rune) bytes per rune, i.e. Rune is 4 bytes wide if it can exceed 0xFFFF - confirm */
+ *p++ = r; /* single 16-bit unit, low byte first */
+ *p++ = r>>8;
+ }
}
nrunes += n;
- noutput += 2*n;
- write(1, (char *)base, 2*n);
+ n = p - (uchar*)base; /* n is reused as the byte count (2 or 4 per rune); nrunes above already took the rune count */
+ noutput += n;
+ write(1, (char *)base, n); /* return value is not checked */
}
void
@@ -401,6 +430,29 @@
}
noutput += p-obuf;
write(1, obuf, p-obuf);
+}
+
+int
+fixsurrogate(Rune *rp, Rune r2) /* pair *rp with r2, the surrogate held from the previous unit (callers pass 0 for none); returns 1 if *rp must be held, 0 if *rp is usable */
+{
+ Rune r1; /* the unit just read */
+
+ r1 = *rp;
+ if(r1 >= 0xD800 && r1 <= 0xDBFF){ /* r1 is a high (leading) surrogate */
+ if(r2 >= 0xDC00 && r2 <= 0xDFFF){ /* held unit is a low surrogate: the pair arrived low-then-high */
+ *rp = 0x10000 + (((r1 - 0xD800)<<10) | (r2 - 0xDC00)); /* NOTE(review): joins a reversed pair, which UTF-16 treats as ill-formed - confirm this leniency is intended */
+ return 0;
+ }
+ return 1; /* no partner yet: caller holds r1 */
+ } else
+ if(r1 >= 0xDC00 && r1 <= 0xDFFF){ /* r1 is a low (trailing) surrogate */
+ if(r2 >= 0xD800 && r2 <= 0xDBFF){ /* held unit is a high surrogate: normal high-then-low order */
+ *rp = 0x10000 + (((r2 - 0xD800)<<10) | (r1 - 0xDC00)); /* high 10 bits from r2, low 10 bits from r1 */
+ return 0;
+ }
+ return 1; /* low surrogate without a preceding high one: caller holds r1 */
+ }
+ return 0; /* not a surrogate: *rp is left unchanged */
+}
long tabascii[256] =
--- a/sys/src/cmd/tcs/tune.c
+++ b/sys/src/cmd/tcs/tune.c
@@ -106,22 +106,20 @@
}
void
-tune_in(int fd, long *x, struct convert *out)
+tune_in(int fd, long *, struct convert *out)
{
Biobuf b;
- Rune rbuf[N];
Rune *r, *er, tr;
int c, i;
- USED(x);
- r = rbuf;
- er = rbuf+N-3;
+ r = runes;
+ er = runes+N-3;
Binit(&b, fd, OREAD);
while((c = Bgetrune(&b)) != Beof){
ninput += b.runesize;
if(r >= er){
- OUT(out, rbuf, r-rbuf);
- r = rbuf;
+ OUT(out, runes, r-runes);
+ r = runes;
}
if(c>=0xe210/**/ && c <= 0xe38c/**/ && (i = c%16) < nelem(t2)){
if(c >= 0xe380/**/){
@@ -172,13 +170,13 @@
break;
}
}
- if(r > rbuf)
- OUT(out, rbuf, r-rbuf);
- OUT(out, rbuf, 0);
+ if(r > runes)
+ OUT(out, runes, r-runes);
+ OUT(out, runes, 0);
}
void
-tune_out(Rune *r, int n, long *x)
+tune_out(Rune *r, int n, long *)
{
static int state = 0;
static Rune lastr;
@@ -186,7 +184,6 @@
char *p;
int i;
- USED(x);
nrunes += n;
er = r+n;
for(p = obuf; r < er; r++){
--- a/sys/src/cmd/tcs/utf.c
+++ b/sys/src/cmd/tcs/utf.c
@@ -27,13 +27,12 @@
int isochartorune(Rune *rune, char *str);
void
-utf_in(int fd, long *notused, struct convert *out)
+utf_in(int fd, long *, struct convert *out)
{
char buf[N];
int i, j, c, n, tot;
- ulong l;
+ unsigned long l;
- USED(notused);
tot = 0;
while((n = read(fd, buf+tot, N-tot)) >= 0){
tot += n;
@@ -65,12 +64,11 @@
}
void
-utf_out(Rune *base, int n, long *notused)
+utf_out(Rune *base, int n, long *)
{
char *p;
Rune *r;
- USED(notused);
nrunes += n;
for(r = base, p = obuf; n-- > 0; r++){
p += our_wctomb(p, *r);
@@ -80,12 +78,11 @@
}
void
-isoutf_in(int fd, long *notused, struct convert *out)
+isoutf_in(int fd, long *, struct convert *out)
{
char buf[N];
int i, j, c, n, tot;
- USED(notused);
tot = 0;
while((n = read(fd, buf+tot, N-tot)) >= 0){
tot += n;
@@ -117,12 +114,11 @@
}
void
-isoutf_out(Rune *base, int n, long *notused)
+isoutf_out(Rune *base, int n, long *)
{
char *p;
Rune *r;
- USED(notused);
nrunes += n;
for(r = base, p = obuf; n-- > 0; r++)
p += runetoisoutf(p, r);