shithub: libmujs

Download patch

ref: d2697fd6435d0ffaae5c424e00ccf3a4b5a21f56
parent: a442a0418f06b673559c93a69e3167b4460d6d07
author: Tor Andersson <tor.andersson@artifex.com>
date: Tue Jan 23 09:33:04 EST 2024

Expose extended unicode characters as surrogate pairs in String methods.

Split extended characters (code points above U+FFFF) into surrogate pairs
for charCodeAt, string indexing, and the slice/substring functions.

Escape surrogate code points in JSON stringify.

--- a/jsi.h
+++ b/jsi.h
@@ -144,8 +144,8 @@
 js_Regexp *js_toregexp(js_State *J, int idx);
 int js_isarrayindex(js_State *J, const char *str, int *idx);
 int js_runeat(js_State *J, const char *s, int i);
+int js_utflen(const char *s);
 int js_utfptrtoidx(const char *s, const char *p);
-const char *js_utfidxtoptr(const char *s, int i);
 
 void js_dup(js_State *J);
 void js_dup2(js_State *J);
--- a/json.c
+++ b/json.c
@@ -185,7 +185,7 @@
 
 static void fmtstr(js_State *J, js_Buffer **sb, const char *s)
 {
-	static const char *HEX = "0123456789ABCDEF";
+	static const char *HEX = "0123456789abcdef";
 	int i, n;
 	Rune c;
 	js_putc(J, sb, '"');
@@ -200,7 +200,7 @@
 		case '\r': js_puts(J, sb, "\\r"); break;
 		case '\t': js_puts(J, sb, "\\t"); break;
 		default:
-			if (c < ' ') {
+			if (c < ' ' || (c >= 0xd800 && c <= 0xdfff)) {
 				js_putc(J, sb, '\\');
 				js_putc(J, sb, 'u');
 				js_putc(J, sb, HEX[(c>>12)&15]);
--- a/jsstring.c
+++ b/jsstring.c
@@ -20,31 +20,54 @@
 int js_runeat(js_State *J, const char *s, int i)
 {
 	Rune rune = EOF;
-	while (i-- >= 0) {
+	while (i >= 0) { /* i = code units still to consume; decode until it goes negative */
 		rune = *(unsigned char*)s;
 		if (rune < Runeself) {
 			if (rune == 0)
 				return EOF;
 			++s;
-		} else
+			--i; /* one-byte character: one code unit */
+		} else {
 			s += chartorune(&rune, s);
+			if (rune >= 0x10000)
+				i -= 2; /* extended character: counted as a surrogate pair (two code units) */
+			else
+				--i;
+		}
 	}
+	if (rune >= 0x10000) { /* the requested index falls on one half of a surrogate pair */
+		/* high surrogate: i went from 0 to -2, so the index addressed the first half */
+		if (i == -2)
+			return 0xd800 + ((rune - 0x10000) >> 10);
+		/* low surrogate: i went from 1 to -1, so the index addressed the second half */
+		else
+			return 0xdc00 + ((rune - 0x10000) & 0x3ff);
+	}
 	return rune;
 }
 
-const char *js_utfidxtoptr(const char *s, int i)
+int js_utflen(const char *s) /* length of s, counting each character >= 0x10000 as two units */
 {
+	int c;
+	int n;
 	Rune rune;
-	while (i-- > 0) {
-		rune = *(unsigned char*)s;
-		if (rune < Runeself) {
-			if (rune == 0)
-				return NULL;
-			++s;
-		} else
+
+	n = 0;
+	for(;;) {
+		c = *(unsigned char *)s;
+		if (c < Runeself) {
+			if (c == 0)
+				return n; /* NUL terminator reached: n is the final count */
+			s++;
+			n++;
+		} else {
 			s += chartorune(&rune, s);
+			if (rune >= 0x10000)
+				n += 2; /* same two-unit accounting that js_runeat uses for surrogate pairs */
+			else
+				n++;
+		}
 	}
-	return s;
 }
 
 int js_utfptrtoidx(const char *s, const char *p)
@@ -56,7 +79,10 @@
 			++s;
 		else
 			s += chartorune(&rune, s);
-		++i;
+		if (rune >= 0x10000)
+			i += 2;
+		else
+			i += 1;
 	}
 	return i;
 }
@@ -190,11 +216,67 @@
 	js_pushnumber(J, strcmp(a, b));
 }
 
+static void Sp_substring_imp(js_State *J, const char *s, int a, int n) /* push n units of s starting at unit a */
+{
+	Rune head_rune = 0, tail_rune = 0;
+	const char *head, *tail;
+	char *p;
+	int i, k, head_len, tail_len;
+
+	/* find start of substring: an extended character advances i by two, so i may end at a + 1 */
+	head = s;
+	for (i = 0; i < a; ++i) {
+		head += chartorune(&head_rune, head);
+		if (head_rune >= 0x10000)
+			++i;
+	}
+
+	/* find end of substring: k starts at i - a so an overshoot at the head is already counted */
+	tail = head;
+	for (k = i - a; k < n; ++k) {
+		tail += chartorune(&tail_rune, tail);
+		if (tail_rune >= 0x10000)
+			++k;
+	}
+
+	/* no surrogate pair splits: the result is exactly the bytes between head and tail */
+	if (i == a && k == n) {
+		js_pushlstring(J, head, tail - head);
+		return;
+	}
+
+	if (js_try(J)) { /* NOTE(review): p is still unassigned if js_malloc below throws - confirm */
+		js_free(J, p);
+		js_throw(J);
+	}
+
+	p = js_malloc(J, UTFmax + (tail - head)); /* copied bytes plus room for one encoded surrogate */
+
+	/* substring starts with low surrogate (head is just after the split character) */
+	if (i > a) {
+		head_rune = 0xdc00 + ((head_rune - 0x10000) & 0x3ff);
+		head_len = runetochar(p, &head_rune);
+		memcpy(p + head_len, head, tail - head);
+		js_pushlstring(J, p, head_len + (tail - head));
+	}
+
+	/* substring ends with high surrogate (tail is just after the split character) */
+	if (k > n) { /* NOTE(review): independent of the i > a branch; if both hold, two strings are pushed */
+		tail -= runelen(tail_rune); /* step back over the character being split */
+		memcpy(p, head, tail - head);
+		tail_rune = 0xd800 + ((tail_rune - 0x10000) >> 10);
+		tail_len = runetochar(p + (tail - head), &tail_rune);
+		js_pushlstring(J, p, (tail - head) + tail_len);
+	}
+
+	js_endtry(J);
+	js_free(J, p);
+}
+
 static void Sp_slice(js_State *J)
 {
 	const char *str = checkstring(J, 0);
-	const char *ss, *ee;
-	int len = utflen(str);
+	int len = js_utflen(str);
 	int s = js_tointeger(J, 1);
 	int e = js_isdefined(J, 2) ? js_tointeger(J, 2) : len;
 
@@ -204,22 +286,16 @@
 	s = s < 0 ? 0 : s > len ? len : s;
 	e = e < 0 ? 0 : e > len ? len : e;
 
-	if (s < e) {
-		ss = js_utfidxtoptr(str, s);
-		ee = js_utfidxtoptr(ss, e - s);
-	} else {
-		ss = js_utfidxtoptr(str, e);
-		ee = js_utfidxtoptr(ss, s - e);
-	}
-
-	js_pushlstring(J, ss, ee - ss);
+	if (s < e)
+		Sp_substring_imp(J, str, s, e - s);
+	else
+		Sp_substring_imp(J, str, e, s - e);
 }
 
 static void Sp_substring(js_State *J)
 {
 	const char *str = checkstring(J, 0);
-	const char *ss, *ee;
-	int len = utflen(str);
+	int len = js_utflen(str);
 	int s = js_tointeger(J, 1);
 	int e = js_isdefined(J, 2) ? js_tointeger(J, 2) : len;
 
@@ -226,15 +302,10 @@
 	s = s < 0 ? 0 : s > len ? len : s;
 	e = e < 0 ? 0 : e > len ? len : e;
 
-	if (s < e) {
-		ss = js_utfidxtoptr(str, s);
-		ee = js_utfidxtoptr(ss, e - s);
-	} else {
-		ss = js_utfidxtoptr(str, e);
-		ee = js_utfidxtoptr(ss, s - e);
-	}
-
-	js_pushlstring(J, ss, ee - ss);
+	if (s < e)
+		Sp_substring_imp(J, str, s, e - s);
+	else
+		Sp_substring_imp(J, str, e, s - e);
 }
 
 static void Sp_toLowerCase(js_State *J)
--- a/jsvalue.c
+++ b/jsvalue.c
@@ -388,7 +388,7 @@
 	} else {
 		obj->u.s.string = js_strdup(J, v);
 	}
-	obj->u.s.length = utflen(v);
+	obj->u.s.length = js_utflen(v);
 	return obj;
 }
 
--- a/utf.c
+++ b/utf.c
@@ -194,26 +194,6 @@
 	return runetochar(str, &rune);
 }
 
-int
-utflen(const char *s)
-{
-	int c;
-	int n;
-	Rune rune;
-
-	n = 0;
-	for(;;) {
-		c = *(uchar*)s;
-		if(c < Runeself) {
-			if(c == 0)
-				return n;
-			s++;
-		} else
-			s += chartorune(&rune, s);
-		n++;
-	}
-}
-
 static const Rune *
 ucd_bsearch(Rune c, const Rune *t, int n, int ne)
 {
--- a/utf.h
+++ b/utf.h
@@ -19,7 +19,6 @@
 #define chartorune	jsU_chartorune
 #define runetochar	jsU_runetochar
 #define runelen		jsU_runelen
-#define utflen		jsU_utflen
 
 #define isalpharune	jsU_isalpharune
 #define islowerrune	jsU_islowerrune
@@ -39,7 +38,6 @@
 int	chartorune(Rune *rune, const char *str);
 int	runetochar(char *str, const Rune *rune);
 int	runelen(int c);
-int	utflen(const char *s);
 
 int		isalpharune(Rune c);
 int		islowerrune(Rune c);