shithub: riscv

Download patch

ref: edca217bb99f7c32413c117239d12acdc223e811
parent: 7388792a124756a528666cb5c375ee919db9ca11
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Sat May 10 20:54:59 EDT 2014

tcs: handle surrogate pairs

--- a/sys/src/cmd/tcs/conv_big5.c
+++ b/sys/src/cmd/tcs/conv_big5.c
@@ -82,7 +82,7 @@
 }
 
 void
-big5_in(int fd, long *notused, struct convert *out)
+big5_in(int fd, long *, struct convert *out)
 {
 	Rune ob[N];
 	Rune *r, *re;
@@ -90,7 +90,6 @@
 	int n, i;
 	long nin;
 
-	USED(notused);
 	r = ob;
 	re = ob+N-3;
 	nin = 0;
@@ -114,7 +113,7 @@
 }
 
 void
-big5_out(Rune *base, int n, long *notused)
+big5_out(Rune *base, int n, long *)
 {
 	char *p;
 	int i;
@@ -121,7 +120,6 @@
 	Rune r;
 	static int first = 1;
 
-	USED(notused);
 	if(first){
 		first = 0;
 		for(i = 0; i < NRUNE; i++)
--- a/sys/src/cmd/tcs/conv_gb.c
+++ b/sys/src/cmd/tcs/conv_gb.c
@@ -60,7 +60,7 @@
 }
 
 void
-gb_in(int fd, long *notused, struct convert *out)
+gb_in(int fd, long *, struct convert *out)
 {
 	Rune ob[N];
 	Rune *r, *re;
@@ -68,7 +68,6 @@
 	int n, i;
 	long nin;
 
-	USED(notused);
 	r = ob;
 	re = ob+N-3;
 	nin = 0;
@@ -92,7 +91,7 @@
 }
 
 void
-gb_out(Rune *base, int n, long *notused)
+gb_out(Rune *base, int n, long *)
 {
 	char *p;
 	int i;
@@ -99,7 +98,6 @@
 	Rune r;
 	static int first = 1;
 
-	USED(notused);
 	if(first){
 		first = 0;
 		for(i = 0; i < NRUNE; i++)
--- a/sys/src/cmd/tcs/conv_gbk.c
+++ b/sys/src/cmd/tcs/conv_gbk.c
@@ -51,7 +51,7 @@
 }
 
 void
-gbk_in(int fd, long *notused, struct convert *out)
+gbk_in(int fd, long *, struct convert *out)
 {
 	Rune ob[N];
 	Rune *r, *re;
@@ -59,7 +59,6 @@
 	int n, i;
 	long nin;
 
-	USED(notused);
 	r = ob;
 	re = ob+N-3;
 	nin = 0;
@@ -84,7 +83,7 @@
 
 
 void
-gbk_out(Rune *base, int n, long *notused)
+gbk_out(Rune *base, int n, long *)
 {
 	char *p;
 	int i;
@@ -91,7 +90,6 @@
 	Rune r;
 	static int first = 1;
 
-	USED(notused);
 	if(first){
 		first = 0;
 		for(i = 0; i < NRUNE; i++)
--- a/sys/src/cmd/tcs/conv_jis.c
+++ b/sys/src/cmd/tcs/conv_jis.c
@@ -367,30 +367,26 @@
 }
 
 void
-jis_in(int fd, long *notused, struct convert *out)
+jis_in(int fd, long *, struct convert *out)	/* calls do_in with alljis; the long * parameter is unused, hence unnamed */
 {
-	USED(notused);
 	do_in(fd, alljis, out);
 }
 
 void
-ujis_in(int fd, long *notused, struct convert *out)
+ujis_in(int fd, long *, struct convert *out)	/* calls do_in with ujis; the long * parameter is unused, hence unnamed */
 {
-	USED(notused);
 	do_in(fd, ujis, out);
 }
 
 void
-msjis_in(int fd, long *notused, struct convert *out)
+msjis_in(int fd, long *, struct convert *out)	/* calls do_in with ms; the long * parameter is unused, hence unnamed */
 {
-	USED(notused);
 	do_in(fd, ms, out);
 }
 
 void
-jisjis_in(int fd, long *notused, struct convert *out)
+jisjis_in(int fd, long *, struct convert *out)	/* calls do_in with jis; the long * parameter is unused, hence unnamed */
 {
-	USED(notused);
 	do_in(fd, jis, out);
 }
 
@@ -417,7 +413,7 @@
 
 /*	jis-kanji, or ISO 2022-JP	*/
 void
-jisjis_out(Rune *base, int n, long *notused)
+jisjis_out(Rune *base, int n, long *)
 {
 	char *p;
 	int i;
@@ -424,7 +420,6 @@
 	Rune r;
 	static enum { ascii, japan646, jp2022 } state = ascii;
 
-	USED(notused);
 	if(first)
 		tab_init();
 	nrunes += n;
@@ -462,13 +457,12 @@
 
 /*	ms-kanji, or Shift-JIS	*/
 void
-msjis_out(Rune *base, int n, long *notused)
+msjis_out(Rune *base, int n, long *)
 {
 	char *p;
 	int i, hi, lo;
 	Rune r;
 
-	USED(notused);
 	if(first)
 		tab_init();
 	nrunes += n;
@@ -501,13 +495,12 @@
 
 /*	ujis, or EUC	*/
 void
-ujis_out(Rune *base, int n, long *notused)
+ujis_out(Rune *base, int n, long *)
 {
 	char *p;
 	int i;
 	Rune r;
 
-	USED(notused);
 	if(first)
 		tab_init();
 	nrunes += n;
--- a/sys/src/cmd/tcs/conv_ksc.c
+++ b/sys/src/cmd/tcs/conv_ksc.c
@@ -81,7 +81,7 @@
 }
 
 void
-uksc_in(int fd, long *notused, struct convert *out)
+uksc_in(int fd, long *, struct convert *out)
 {
 	Rune ob[N];
 	Rune *r, *re;
@@ -89,7 +89,6 @@
 	int n, i;
 	long nin;
 
-	USED(notused);
 	r = ob;
 	re = ob+N-3;
 	nin = 0;
@@ -113,7 +112,7 @@
 }
 
 void
-uksc_out(Rune *base, int n, long *notused)
+uksc_out(Rune *base, int n, long *)
 {
 	char *p;
 	int i;
@@ -121,7 +120,6 @@
 	long l;
 	static int first = 1;
 
-	USED(notused);
 	if(first){
 		first = 0;
 		for(i = 0; i < NRUNE; i++)
--- a/sys/src/cmd/tcs/hdr.h
+++ b/sys/src/cmd/tcs/hdr.h
@@ -19,6 +19,7 @@
 typedef void (*Infn)(int, long *, struct convert *);
 typedef void (*Outfn)(Rune *, int, long *);
 void outtable(Rune *, int, long *);
+int fixsurrogate(Rune *rp, Rune r2);	/* joins UTF-16 surrogate halves *rp and r2 into *rp; returns nonzero while *rp is still an unpaired half */
 
 void utf_in(int, long *, struct convert *);
 void utf_out(Rune *, int, long *);
@@ -41,6 +42,5 @@
 #define	EXIT(n,s)	exits(s)
 #else
 #define	EPR		fprintf(stderr,
-#define	USED(x)		/* in plan 9, USED(x) tells the compiler to treat x as used */
 #define	EXIT(n,s)	exit(n)
 #endif
--- a/sys/src/cmd/tcs/html.c
+++ b/sys/src/cmd/tcs/html.c
@@ -2141,24 +2141,22 @@
 }
 
 void
-html_in(int fd, long *x, struct convert *out)
+html_in(int fd, long *, struct convert *out)
 {
 	char buf[100], *p;
 	Biobuf b;
-	Rune rbuf[N];
-	Rune *r, *er;
+	Rune *r, *er, r2;
 	int c, s, i;
 	
-	USED(x);
-	
 	html_init();
-	r = rbuf;
-	er = rbuf+N;
+	r = runes;
+	er = runes+N;
+	r2 = 0;
 	Binit(&b, fd, OREAD);
 	while((c = Bgetrune(&b)) != Beof){
 		if(r >= er){
-			OUT(out, rbuf, r-rbuf);
-			r = rbuf;
+			OUT(out, runes, r-runes);
+			r = runes;
 		}
 		if(c == '&'){
 			s = 0;
@@ -2185,7 +2183,7 @@
 						c = strtol(buf+3, &p, 16);
 					else
 						c = strtol(buf+2, &p, 10);
-					if(*p || c >= NRUNE || c < 0)
+					if(*p || c < 0)
 						goto bad;
 					goto out;
 				}
@@ -2196,10 +2194,11 @@
 			for(p=buf; p<buf+i; ){
 				p += chartorune(r++, p);
 				if(r >= er){
-					OUT(out, rbuf, r-rbuf);
-					r = rbuf;
+					OUT(out, runes, r-runes);
+					r = runes;
 				}
 			}
+			r2 = 0;
 			continue;
 		out:
 			if((c & 0x7f) == c && strchr("<>&\"'", c)){
@@ -2207,12 +2206,18 @@
 				i = sprint(buf, "&%s", findbyrune(c));
 				goto bad;
 			}
+		}	
+		*r = c;
+		if(fixsurrogate(r, r2)){
+			r2 = *r;
+			continue;
 		}
-		*r++ = c;
+		r2 = 0;
+		r++;
 	}
-	if(r > rbuf)
-		OUT(out, rbuf, r-rbuf);
-	OUT(out, rbuf, 0);
+	if(r > runes)
+		OUT(out, runes, r-runes);
+	OUT(out, runes, 0);
 }
 
 /*
@@ -2219,13 +2224,12 @@
  * use biobuf because can use more than UTFmax bytes per rune
  */
 void
-html_out(Rune *r, int n, long *x)
+html_out(Rune *r, int n, long *)
 {
 	char *s;
 	Biobuf b;
 	Rune *er;
 	
-	USED(x);
 	html_init();
 	Binit(&b, 1, OWRITE);
 	er = r+n;
--- a/sys/src/cmd/tcs/tcs.c
+++ b/sys/src/cmd/tcs/tcs.c
@@ -73,7 +73,6 @@
 		break;
 	} ARGEND
 
-	USED(argc);
 	if(verbose)
 		squawk = 1;
 	if(listem){
@@ -214,49 +213,63 @@
 }
 
 void
-unicode_in_be(int fd, long *notused, struct convert *out)
+unicode_in_be(int fd, long *, struct convert *out)	/* reads big-endian 16-bit units from fd, joins surrogate pairs, hands the runes to OUT */
 {
-	int i, n;
-	Rune buf[N], r;
-	uchar *p;
+	uchar buf[2*N], *p, *e;	/* raw input bytes; each rune candidate is built from two of them */
+	Rune *r, r2;	/* r: next slot to fill in runes; r2: surrogate half waiting for its partner, 0 if none */
+	int n;
 
-	USED(notused);
-	while((n = cread(fd, (char *)buf, 2*N, 2)) > 0){
-		/* go backwards as sizeof(Rune) >= 2 */
-		p = (uchar*)buf + n;
+	r2 = 0;
+	while((n = cread(fd, (char*)buf, 2*N, 2)) > 0){	/* NOTE(review): the loop below consumes bytes in pairs, so n is presumably always even — confirm against cread */
 		ninput += n;
-		n /= 2;
-		for(i=n-1; i>=0; i--){
-			r = *(--p);
-			r |= *(--p) << 8;
-			buf[i] = r;
+		p = buf;
+		e = buf + n;
+		r = runes;	/* runes is declared outside this function; filling restarts at its first slot on every read */
+		while(p < e){
+			*r = *p++ << 8;	/* big-endian: the first byte is the high-order byte */
+			*r |= *p++;
+			if(fixsurrogate(r, r2)){	/* nonzero: *r is a surrogate half with no partner yet */
+				r2 = *r;	/* keep it for the next unit; r is not advanced, so this slot is overwritten */
+				continue;
+			}
+			r2 = 0;	/* *r is an ordinary rune or a joined pair: nothing pending any more */
+			r++;
 		}
-		OUT(out, buf, n);
+		if(r > runes){	/* skip the call when this read produced no complete rune */
+			OUT(out, runes, r-runes);
+		}
 	}
-	OUT(out, buf, 0);
+	OUT(out, runes, 0);	/* final call with a count of 0; presumably marks end of input — confirm */
 }
 
 void
-unicode_in_le(int fd, long *notused, struct convert *out)
+unicode_in_le(int fd, long *, struct convert *out)	/* reads little-endian 16-bit units from fd, joins surrogate pairs, hands the runes to OUT */
 {
-	int i, n;
-	Rune buf[N], r;
-	uchar *p;
+	uchar buf[2*N], *p, *e;	/* raw input bytes; each rune candidate is built from two of them */
+	Rune *r, r2;	/* r: next slot to fill in runes; r2: surrogate half waiting for its partner, 0 if none */
+	int n;
 
-	USED(notused);
-	while((n = cread(fd, (char *)buf, 2*N, 2)) > 0){
-		/* go backwards as sizeof(Rune) >= 2 */
-		p = (uchar*)buf + n;
+	r2 = 0;
+	while((n = cread(fd, (char*)buf, 2*N, 2)) > 0){	/* NOTE(review): the loop below consumes bytes in pairs, so n is presumably always even — confirm against cread */
 		ninput += n;
-		n /= 2;
-		for(i=n-1; i>=0; i--){
-			r = *(--p) << 8;
-			r |= *(--p);
-			buf[i] = r;
+		p = buf;
+		e = buf + n;
+		r = runes;	/* runes is declared outside this function; filling restarts at its first slot on every read */
+		while(p < e){
+			*r = *p++;	/* little-endian: the first byte is the low-order byte */
+			*r |= *p++ << 8;
+			if(fixsurrogate(r, r2)){	/* nonzero: *r is a surrogate half with no partner yet */
+				r2 = *r;	/* keep it for the next unit; r is not advanced, so this slot is overwritten */
+				continue;
+			}
+			r2 = 0;	/* *r is an ordinary rune or a joined pair: nothing pending any more */
+			r++;
 		}
-		OUT(out, buf, n);
+		if(r > runes){	/* skip the call when this read produced no complete rune */
+			OUT(out, runes, r-runes);
+		}
 	}
-	OUT(out, buf, 0);
+	OUT(out, runes, 0);	/* final call with a count of 0; presumably marks end of input — confirm */
 }
 
 void
@@ -284,41 +297,57 @@
 }
 
 void
-unicode_out_be(Rune *base, int n, long *notused)
+unicode_out_be(Rune *base, int n, long *)	/* encodes n runes from base as big-endian 16-bit units and writes them to fd 1 */
 {
 	int i;
 	uchar *p;
-	Rune r;
+	unsigned long r;
 
-	USED(notused);
 	p = (uchar*)base;
 	for(i=0; i<n; i++){
 		r = base[i];
-		*p++ = r>>8;
-		*p++ = r;
+		if(r > 0xFFFF){	/* does not fit in one 16-bit unit: emit a surrogate pair */
+			r -= 0x10000;	/* NOTE(review): bytes are stored back into base itself; 4 bytes per rune assumes sizeof(Rune) >= 4 — confirm */
+			*p++ = ((r>>18)&3) + 0xD8;	/* high surrogate, upper byte */
+			*p++ = r>>10;	/* high surrogate, lower byte */
+			*p++ = ((r>>8)&3) + 0xDC;	/* low surrogate, upper byte */
+			*p++ = r;	/* low surrogate, lower byte */
+		} else {
+			*p++ = r>>8;
+			*p++ = r;
+		}
 	}
 	nrunes += n;
-	noutput += 2*n;
-	write(1, (char *)base, 2*n);
+	n = p - (uchar*)base;	/* n now counts bytes produced; no longer simply 2 per rune once pairs are emitted */
+	noutput += n;
+	write(1, (char *)base, n);
 }
 
 void
-unicode_out_le(Rune *base, int n, long *notused)
+unicode_out_le(Rune *base, int n, long *)	/* encodes n runes from base as little-endian 16-bit units and writes them to fd 1 */
 {
 	int i;
 	uchar *p;
-	Rune r;
+	unsigned long r;
 
-	USED(notused);
 	p = (uchar*)base;
 	for(i=0; i<n; i++){
 		r = base[i];
-		*p++ = r;
-		*p++ = r>>8;
+		if(r > 0xFFFF){	/* does not fit in one 16-bit unit: emit a surrogate pair */
+			r -= 0x10000;	/* NOTE(review): bytes are stored back into base itself; 4 bytes per rune assumes sizeof(Rune) >= 4 — confirm */
+			*p++ = r>>10;	/* high surrogate, lower byte */
+			*p++ = ((r>>18)&3) + 0xD8;	/* high surrogate, upper byte */
+			*p++ = r;	/* low surrogate, lower byte */
+			*p++ = ((r>>8)&3) + 0xDC;	/* low surrogate, upper byte */
+		} else {
+			*p++ = r;
+			*p++ = r>>8;
+		}
 	}
 	nrunes += n;
-	noutput += 2*n;
-	write(1, (char *)base, 2*n);
+	n = p - (uchar*)base;	/* n now counts bytes produced; no longer simply 2 per rune once pairs are emitted */
+	noutput += n;
+	write(1, (char *)base, n);
 }
 
 void
@@ -401,6 +430,29 @@
 	}
 	noutput += p-obuf;
 	write(1, obuf, p-obuf);
+}
+
+int
+fixsurrogate(Rune *rp, Rune r2)	/* rp: rune just read, rewritten in place when a pair completes; r2: half saved by the caller from the previous unit, 0 if none */
+{
+	Rune r1;
+
+	r1 = *rp;
+	if(r1 >= 0xD800 && r1 <= 0xDBFF){	/* *rp is a high (leading) surrogate */
+		if(r2 >= 0xDC00 && r2 <= 0xDFFF){	/* saved half is a low surrogate; note this accepts the pair in low-then-high order too */
+			*rp = 0x10000 + (((r1 - 0xD800)<<10) | (r2 - 0xDC00));	/* high half supplies the upper 10 bits, low half the lower 10 */
+			return 0;	/* pair joined: *rp is a complete code point */
+		}
+		return 1;	/* no matching partner saved: caller keeps *rp as the pending half */
+	} else
+	if(r1 >= 0xDC00 && r1 <= 0xDFFF){	/* *rp is a low (trailing) surrogate */
+		if(r2 >= 0xD800 && r2 <= 0xDBFF){	/* saved half is a high surrogate: the usual high-then-low order */
+			*rp = 0x10000 + (((r2 - 0xD800)<<10) | (r1 - 0xDC00));	/* high half supplies the upper 10 bits, low half the lower 10 */
+			return 0;	/* pair joined: *rp is a complete code point */
+		}
+		return 1;	/* no matching partner saved: caller keeps *rp as the pending half */
+	}
+	return 0;	/* not a surrogate: *rp is left untouched */
 }
 
 long tabascii[256] =
--- a/sys/src/cmd/tcs/tune.c
+++ b/sys/src/cmd/tcs/tune.c
@@ -106,22 +106,20 @@
 }
 
 void
-tune_in(int fd, long *x, struct convert *out)
+tune_in(int fd, long *, struct convert *out)
 {
 	Biobuf b;
-	Rune rbuf[N];
 	Rune *r, *er, tr;
 	int c, i;
 	
-	USED(x);
-	r = rbuf;
-	er = rbuf+N-3;
+	r = runes;
+	er = runes+N-3;
 	Binit(&b, fd, OREAD);
 	while((c = Bgetrune(&b)) != Beof){
 		ninput += b.runesize;
 		if(r >= er){
-			OUT(out, rbuf, r-rbuf);
-			r = rbuf;
+			OUT(out, runes, r-runes);
+			r = runes;
 		}
 		if(c>=0xe210/**/ && c <= 0xe38c/**/ && (i = c%16) < nelem(t2)){
 			if(c >= 0xe380/**/){
@@ -172,13 +170,13 @@
 				break;
 		}
 	}
-	if(r > rbuf)
-		OUT(out, rbuf, r-rbuf);
-	OUT(out, rbuf, 0);
+	if(r > runes)
+		OUT(out, runes, r-runes);
+	OUT(out, runes, 0);
 }
 
 void
-tune_out(Rune *r, int n, long *x)
+tune_out(Rune *r, int n, long *)
 {
 	static int state = 0;
 	static Rune lastr;
@@ -186,7 +184,6 @@
 	char *p;
 	int i;
 
-	USED(x);
 	nrunes += n;
 	er = r+n;
 	for(p = obuf; r < er; r++){
--- a/sys/src/cmd/tcs/utf.c
+++ b/sys/src/cmd/tcs/utf.c
@@ -27,13 +27,12 @@
 int isochartorune(Rune *rune, char *str);
 
 void
-utf_in(int fd, long *notused, struct convert *out)
+utf_in(int fd, long *, struct convert *out)
 {
 	char buf[N];
 	int i, j, c, n, tot;
-	ulong l;
+	unsigned long l;
 
-	USED(notused);
 	tot = 0;
 	while((n = read(fd, buf+tot, N-tot)) >= 0){
 		tot += n;
@@ -65,12 +64,11 @@
 }
 
 void
-utf_out(Rune *base, int n, long *notused)
+utf_out(Rune *base, int n, long *)
 {
 	char *p;
 	Rune *r;
 
-	USED(notused);
 	nrunes += n;
 	for(r = base, p = obuf; n-- > 0; r++){
 		p += our_wctomb(p, *r);
@@ -80,12 +78,11 @@
 }
 
 void
-isoutf_in(int fd, long *notused, struct convert *out)
+isoutf_in(int fd, long *, struct convert *out)
 {
 	char buf[N];
 	int i, j, c, n, tot;
 
-	USED(notused);
 	tot = 0;
 	while((n = read(fd, buf+tot, N-tot)) >= 0){
 		tot += n;
@@ -117,12 +114,11 @@
 }
 
 void
-isoutf_out(Rune *base, int n, long *notused)
+isoutf_out(Rune *base, int n, long *)
 {
 	char *p;
 	Rune *r;
 
-	USED(notused);
 	nrunes += n;
 	for(r = base, p = obuf; n-- > 0; r++)
 		p += runetoisoutf(p, r);