shithub: libmujs

--- a/docs/reference.html

+++ b/docs/reference.html

@@ -61,11 +61,15 @@

 Numbers are represented using double precision floating point values.

<p>

-Strings in the C interface are zero-terminated byte arrays in CESU-8 encoding.

-CESU-8 is a variant of UTF-8 which encodes supplementary unicode characters as

-surrogate pairs. This maintains compatibility with the UTF-16 nature of

-JavaScript, but requires attention when passing strings using supplementary

-unicode characters to and from the MuJS library.

+Strings in the C interface are zero-terminated byte arrays in WTF-8 encoding.

+This allows both arbitrary 16-bit values (as required by JavaScript) and also

+extended code points for the full 21-bit Unicode range.

+These extended characters will mostly work as expected in JavaScript.

+<p>

+If you have JavaScript code that expects to work with UTF-16 surrogate pairs,

+you will need to manually convert any extended characters to surrogate pairs

+and back when passing strings between C and JavaScript.

<p>

 The U+0000 character is encoded as the two-byte sequence <C0 80>, same as in

--- a/json.c

+++ b/json.c

@@ -180,10 +180,11 @@

 static void fmtstr(js_State *J, js_Buffer **sb, const char *s)

 	static const char *HEX = "0123456789ABCDEF";

+	int i, n;

 	Rune c;

 	js_putc(J, sb, '"');

 	while (*s) {

-		s += chartorune(&c, s);

+		n = chartorune(&c, s);

 		switch (c) {

 		case '"': js_puts(J, sb, "\\\""); break;

 		case '\\': js_puts(J, sb, "\\\\"); break;

@@ -193,16 +194,22 @@

 		case '\r': js_puts(J, sb, "\\r"); break;

 		case '\t': js_puts(J, sb, "\\t"); break;

 		default:

-			if (c < ' ' || c > 127) {

-				js_puts(J, sb, "\\u");

+			if (c < ' ') {

+				js_putc(J, sb, '\\');

+				js_putc(J, sb, 'u');

 				js_putc(J, sb, HEX[(c>>12)&15]);

 				js_putc(J, sb, HEX[(c>>8)&15]);

 				js_putc(J, sb, HEX[(c>>4)&15]);

 				js_putc(J, sb, HEX[c&15]);

+			} else if (c < 128) {

+				js_putc(J, sb, c);

 			} else {

-				js_putc(J, sb, c); break;

+				for (i = 0; i < n; ++i)

+					js_putc(J, sb, s[i]);

+			break;

+		s += n;

 	js_putc(J, sb, '"');

--- a/jsrepr.c

+++ b/jsrepr.c

@@ -19,10 +19,11 @@

 static void reprstr(js_State *J, js_Buffer **sb, const char *s)

 	static const char *HEX = "0123456789ABCDEF";

+	int i, n;

 	Rune c;

 	js_putc(J, sb, '"');

 	while (*s) {

-		s += chartorune(&c, s);

+		n = chartorune(&c, s);

 		switch (c) {

 		case '"': js_puts(J, sb, "\\\""); break;

 		case '\\': js_puts(J, sb, "\\\\"); break;

@@ -32,16 +33,27 @@

 		case '\r': js_puts(J, sb, "\\r"); break;

 		case '\t': js_puts(J, sb, "\\t"); break;

 		default:

-			if (c < ' ' || c > 127) {

-				js_puts(J, sb, "\\u");

+			if (c < ' ') {

+				js_putc(J, sb, '\\');

+				js_putc(J, sb, 'x');

+				js_putc(J, sb, HEX[(c>>4)&15]);

+				js_putc(J, sb, HEX[c&15]);

+			} else if (c < 128) {

+				js_putc(J, sb, c);

+			} else if (c < 0x10000) {

+				js_putc(J, sb, '\\');

+				js_putc(J, sb, 'u');

 				js_putc(J, sb, HEX[(c>>12)&15]);

 				js_putc(J, sb, HEX[(c>>8)&15]);

 				js_putc(J, sb, HEX[(c>>4)&15]);

 				js_putc(J, sb, HEX[c&15]);

 			} else {

-				js_putc(J, sb, c); break;

+				for (i = 0; i < n; ++i)

+					js_putc(J, sb, s[i]);

+			break;

+		s += n;

 	js_putc(J, sb, '"');

--- a/jsstring.c

+++ b/jsstring.c

@@ -310,7 +310,7 @@

 	for (i = 1; i < top; ++i) {

-		c = js_touint16(J, i);

+		c = js_touint32(J, i);

 		p += runetochar(p, &c);

 	*p = 0;

--- a/utf.c

+++ b/utf.c

@@ -25,6 +25,7 @@

 	Bit2	= 5,

 	Bit3	= 4,

 	Bit4	= 3,

+	Bit5	= 2,

 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */

 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */

@@ -31,21 +32,23 @@

 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */

 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */

 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */

+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */

-	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */

-	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */

-	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */

+	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */

+	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */

+	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */

+	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */

 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */

 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */

-	Bad	= Runeerror,

+	Bad	= Runeerror

};

int

 chartorune(Rune *rune, const char *str)

-	int c, c1, c2;

+	int c, c1, c2, c3;

 	int l;

 	/* overlong null character */

@@ -97,6 +100,25 @@

/*

+	 * four character sequence

+	 *	10000-10FFFF => T4 Tx Tx Tx

+	 */

+	if(UTFmax >= 4) {

+		c3 = *(uchar*)(str+3) ^ Tx;

+		if(c3 & Testx)

+			goto bad;

+		if(c < T5) {

+			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;

+			if(l <= Rune3)

+				goto bad;

+			if(l > Runemax)

+				goto bad;

+			*rune = l;

+			return 4;

+		}

+	}

+	/*

 	 * bad decoding

*/

 bad:

@@ -127,7 +149,7 @@

/*

 	 * two character sequence

-	 *	0080-07FF => T2 Tx

+	 *	00080-007FF => T2 Tx

*/

 	if(c <= Rune2) {

 		str[0] = T2 | (c >> 1*Bitx);

@@ -137,12 +159,26 @@

/*

 	 * three character sequence

-	 *	0800-FFFF => T3 Tx Tx

+	 *	00800-0FFFF => T3 Tx Tx

*/

-	str[0] = T3 |  (c >> 2*Bitx);

-	str[1] = Tx | ((c >> 1*Bitx) & Maskx);

-	str[2] = Tx |  (c & Maskx);

-	return 3;

+	if(c > Runemax)

+		c = Runeerror;

+	if(c <= Rune3) {

+		str[0] = T3 |  (c >> 2*Bitx);

+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);

+		str[2] = Tx |  (c & Maskx);

+		return 3;

+	}

+	/*

+	 * four character sequence

+	 *	010000-10FFFF => T4 Tx Tx Tx

+	 */

+	str[0] = T4 |  (c >> 3*Bitx);

+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);

+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);

+	str[3] = Tx |  (c & Maskx);

+	return 4;

int

--- a/utf.h

+++ b/utf.h

@@ -1,7 +1,7 @@

 #ifndef js_utf_h

 #define js_utf_h

-typedef unsigned short Rune;	/* 16 bits */

+typedef int Rune;	/* 32 bits */

 #define chartorune	jsU_chartorune

 #define runetochar	jsU_runetochar

@@ -19,10 +19,11 @@

 enum

-	UTFmax		= 3,		/* maximum bytes per rune */

+	UTFmax		= 4,		/* maximum bytes per rune */

 	Runesync	= 0x80,		/* cannot represent part of a UTF sequence (<) */

 	Runeself	= 0x80,		/* rune and UTF sequences are the same (<) */

 	Runeerror	= 0xFFFD,	/* decoding error in UTF */

+	Runemax		= 0x10FFFF,	/* maximum rune value */

};

 int	chartorune(Rune *rune, const char *str);