shithub: rgbds

Download patch

ref: 975f85260da45ee2e12b55204bfd7434e15960ac
parent: f29d768989cf23a2f77a3f6e018863c5e0b8ace0
author: dbrotz <43593771+dbrotz@users.noreply.github.com>
date: Sun Jun 2 12:10:34 EDT 2019

Use code points instead of bytes for STRSUB/STRLEN

--- a/src/asm/asmy.y
+++ b/src/asm/asmy.y
@@ -26,6 +26,8 @@
 #include "asm/rpn.h"
 #include "asm/symbol.h"
 
+#include "extern/utf8decoder.h"
+
 #include "common.h"
 #include "linkdefs.h"
 
@@ -431,6 +433,85 @@
 	pPCSymbol->nValue = unionStart[unionIndex];
 }
 
+static size_t strlenUTF8(const char *s) /* Count the UTF-8 code points (not bytes) in the NUL-terminated string s. */
+{
+	size_t len = 0; /* code points completed so far */
+	uint32_t state = 0; /* decoder state carried from byte to byte; 0 when no code point is in progress */
+	uint32_t codep = 0; /* output slot passed to decode(); its value is not read here */
+
+	while (*s) {
+		switch (decode(&state, &codep, (uint8_t)*s)) {
+		case 1: /* treated as: the decoder rejected this byte */
+			fatalerror("STRLEN: Invalid UTF-8 character");
+			break;
+		case 0: /* treated as: this byte completed one code point */
+			len++;
+			break;
+		}
+		s++;
+	}
+
+	/* A nonzero state here means the string ended in the middle of a multi-byte sequence. */
+	if (state != 0)
+		fatalerror("STRLEN: Invalid UTF-8 character");
+
+	return len;
+}
+
+static void strsubUTF8(char *dest, const char *src, uint32_t pos, uint32_t len) /* Copy up to len code points of src, starting at 1-based code point pos, into dest. */
+{
+	size_t srcIndex = 0; /* byte offset into src */
+	size_t destIndex = 0; /* byte offset into dest */
+	uint32_t state = 0; /* decoder state shared by both loops; 0 when no code point is in progress */
+	uint32_t codep = 0; /* output slot passed to decode(); its value is not read here */
+	uint32_t curPos = 1; /* 1-based index of the code point that begins at srcIndex */
+	uint32_t curLen = 0; /* code points fully copied so far */
+
+	if (pos < 1) { /* pos is unsigned, so only 0 lands here; it is clamped to 1 after warning */
+		warning("STRSUB: Position starts at 1");
+		pos = 1;
+	}
+
+	/* Skip whole code points until curPos reaches pos or src runs out. */
+	while (src[srcIndex] && curPos < pos) {
+		switch (decode(&state, &codep, (uint8_t)src[srcIndex])) {
+		case 1: /* treated as: the decoder rejected this byte */
+			fatalerror("STRSUB: Invalid UTF-8 character");
+			break;
+		case 0: /* treated as: one more code point skipped */
+			curPos++;
+			break;
+		}
+		srcIndex++;
+	}
+
+	if (!src[srcIndex]) /* also true when pos is exactly one past the last code point */
+		warning("STRSUB: Position %lu is past the end of the string",
+			(unsigned long)pos);
+
+	/* Copy byte by byte, counting a code point whenever decode() returns 0; at most MAXSTRLEN bytes are written before the terminator, so dest needs room for MAXSTRLEN + 1. */
+	while (src[srcIndex] && destIndex < MAXSTRLEN && curLen < len) {
+		switch (decode(&state, &codep, (uint8_t)src[srcIndex])) {
+		case 1: /* treated as: the decoder rejected this byte */
+			fatalerror("STRSUB: Invalid UTF-8 character");
+			break;
+		case 0: /* treated as: one more code point copied */
+			curLen++;
+			break;
+		}
+		dest[destIndex++] = src[srcIndex++];
+	}
+
+	if (curLen < len) /* the copy stopped early: src ended or MAXSTRLEN bytes were reached */
+		warning("STRSUB: Length too big: %lu", (unsigned long)len);
+
+	/* A nonzero state here means skipping or copying stopped in the middle of a multi-byte sequence. */
+	if (state != 0)
+		fatalerror("STRSUB: Invalid UTF-8 character");
+
+	dest[destIndex] = 0; /* NUL-terminate the result */
+}
+
 %}
 
 %union
@@ -1249,7 +1330,7 @@
 			else
 				rpn_Number(&$$, 0);
 		}
-		| T_OP_STRLEN '(' string ')'		{ rpn_Number(&$$, strlen($3)); }
+		| T_OP_STRLEN '(' string ')'		{ rpn_Number(&$$, strlenUTF8($3)); }
 		| '(' relocconst ')'			{ $$ = $2; }
 ;
 
@@ -1327,7 +1408,7 @@
 			else
 				constexpr_Number(&$$, 0);
 		}
-		| T_OP_STRLEN '(' string ')'		{ constexpr_Number(&$$, strlen($3)); }
+		| T_OP_STRLEN '(' string ')'		{ constexpr_Number(&$$, strlenUTF8($3)); }
 		| '(' const ')'				{ $$ = $2; }
 ;
 
@@ -1338,14 +1419,7 @@
 		}
 		| T_OP_STRSUB '(' string comma uconst comma uconst ')'
 		{
-			uint32_t len = $7;
-			if (len > MAXSTRLEN) {
-				warning("STRSUB: Length too big: %u", len);
-				len = MAXSTRLEN;
-			}
-
-			if (snprintf($$, len + 1, "%s", $3 + $5 - 1) > MAXSTRLEN)
-				warning("STRSUB: String too long '%s'", $$);
+			strsubUTF8($$, $3, $5, $7);
 		}
 		| T_OP_STRCAT '(' string comma string ')'
 		{
--- /dev/null
+++ b/test/asm/strlen.asm
@@ -1,0 +1,9 @@
+SECTION "sec", ROM0 ; STRLEN test: lengths are expected in code points, so the four-character katakana string counts as 4
+
+xstrlen: MACRO
+	PRINTV STRLEN(\1)
+	PRINTT "\n"
+ENDM
+
+	xstrlen "ABC"
+	xstrlen "カタカナ"
--- /dev/null
+++ b/test/asm/strlen.out
@@ -1,0 +1,2 @@
+$3
+$4
--- /dev/null
+++ b/test/asm/strsub.asm
@@ -1,0 +1,22 @@
+SECTION "sec", ROM0 ; STRSUB test: position (1-based) and length are in code points; covers ASCII, katakana, a combining mark, and out-of-range arguments
+
+xstrsub: MACRO
+	PRINTT STRSUB(\1, \2, \3)
+	PRINTT "\n"
+ENDM
+
+	xstrsub "ABC", 1, 1
+	xstrsub "ABC", 2, 1
+	xstrsub "ABC", 3, 1
+	xstrsub "ABC", 1, 2
+	xstrsub "ABC", 2, 2
+	xstrsub "ABC", 2, 32
+	xstrsub "ABC", 2, 300
+	xstrsub "ABC", 0, 300
+	xstrsub "ABC", 4, 0
+	xstrsub "ABC", 4, 1
+	xstrsub "カタカナ", 1, 2
+	xstrsub "カタカナ", 3, 2
+	xstrsub "カタカナ", 3, 10
+	xstrsub "g̈", 1, 1
+	xstrsub "g̈", 1, 2
--- /dev/null
+++ b/test/asm/strsub.out
@@ -1,0 +1,31 @@
+warning: strsub.asm(13) -> xstrsub(1):
+    STRSUB: Length too big: 32
+warning: strsub.asm(14) -> xstrsub(1):
+    STRSUB: Length too big: 300
+warning: strsub.asm(15) -> xstrsub(1):
+    STRSUB: Position starts at 1
+warning: strsub.asm(15) -> xstrsub(1):
+    STRSUB: Length too big: 300
+warning: strsub.asm(16) -> xstrsub(1):
+    STRSUB: Position 4 is past the end of the string
+warning: strsub.asm(17) -> xstrsub(1):
+    STRSUB: Position 4 is past the end of the string
+warning: strsub.asm(17) -> xstrsub(1):
+    STRSUB: Length too big: 1
+warning: strsub.asm(20) -> xstrsub(1):
+    STRSUB: Length too big: 10
+A
+B
+C
+AB
+BC
+BC
+BC
+ABC
+
+
+カタ
+カナ
+カナ
+g
+g̈