ref: 975f85260da45ee2e12b55204bfd7434e15960ac
parent: f29d768989cf23a2f77a3f6e018863c5e0b8ace0
author: dbrotz <43593771+dbrotz@users.noreply.github.com>
date: Sun Jun 2 12:10:34 EDT 2019
Use code points instead of bytes for STRSUB/STRLEN
--- a/src/asm/asmy.y
+++ b/src/asm/asmy.y
@@ -26,6 +26,8 @@
#include "asm/rpn.h"
#include "asm/symbol.h"
+#include "extern/utf8decoder.h"
+
#include "common.h"
#include "linkdefs.h"
@@ -431,6 +433,85 @@
pPCSymbol->nValue = unionStart[unionIndex];
}
+/* Count the code points in s, feeding every byte through decode(). */
+static size_t strlenUTF8(const char *s)
+{
+	uint32_t decState = 0;
+	uint32_t codePoint = 0;
+	size_t count = 0;
+	const char *cur;
+
+	for (cur = s; *cur; cur++) {
+		switch (decode(&decState, &codePoint, (uint8_t)*cur)) {
+		case 0: /* a code point was completed */
+			count++;
+			break;
+		case 1: /* byte rejected by the decoder */
+			fatalerror("STRLEN: Invalid UTF-8 character");
+			break;
+		}
+	}
+
+	/* Leftover decoder state: the string stopped mid-sequence. */
+	if (decState != 0)
+		fatalerror("STRLEN: Invalid UTF-8 character");
+	return count;
+}
+
+/* Copy up to len code points of src, from 1-based position pos, to dest. */
+static void strsubUTF8(char *dest, const char *src, uint32_t pos, uint32_t len)
+{
+	uint32_t decState = 0;
+	uint32_t codePoint = 0;
+	uint32_t pointIdx = 1;
+	uint32_t copied = 0;
+	size_t in = 0;
+	size_t out = 0;
+
+	if (pos < 1) {
+		warning("STRSUB: Position starts at 1");
+		pos = 1;
+	}
+
+	/* Skip whole code points until the requested start is reached. */
+	for (; src[in] && pointIdx < pos; in++) {
+		switch (decode(&decState, &codePoint, (uint8_t)src[in])) {
+		case 0:
+			pointIdx++;
+			break;
+		case 1:
+			fatalerror("STRSUB: Invalid UTF-8 character");
+			break;
+		}
+	}
+
+	if (src[in] == '\0')
+		warning("STRSUB: Position %lu is past the end of the string",
+			(unsigned long)pos);
+
+	/* Copy bytes until len code points are done or a limit is hit. */
+	for (; src[in] && out < MAXSTRLEN && copied < len; in++, out++) {
+		switch (decode(&decState, &codePoint, (uint8_t)src[in])) {
+		case 0:
+			copied++;
+			break;
+		case 1:
+			fatalerror("STRSUB: Invalid UTF-8 character");
+			break;
+		}
+		dest[out] = src[in];
+	}
+
+	if (copied < len)
+		warning("STRSUB: Length too big: %lu", (unsigned long)len);
+
+	/* A non-zero decoder state means a code point was left unfinished. */
+	if (decState != 0)
+		fatalerror("STRSUB: Invalid UTF-8 character");
+
+	dest[out] = 0;
+}
+
%}
%union
@@ -1249,7 +1330,7 @@
else
rpn_Number(&$$, 0);
}
- | T_OP_STRLEN '(' string ')' { rpn_Number(&$$, strlen($3)); }
+ | T_OP_STRLEN '(' string ')' { rpn_Number(&$$, strlenUTF8($3)); }
| '(' relocconst ')' { $$ = $2; }
;
@@ -1327,7 +1408,7 @@
else
constexpr_Number(&$$, 0);
}
- | T_OP_STRLEN '(' string ')' { constexpr_Number(&$$, strlen($3)); }
+ | T_OP_STRLEN '(' string ')' { constexpr_Number(&$$, strlenUTF8($3)); }
| '(' const ')' { $$ = $2; }
;
@@ -1338,14 +1419,7 @@
}
| T_OP_STRSUB '(' string comma uconst comma uconst ')'
{
- uint32_t len = $7;
- if (len > MAXSTRLEN) {
- warning("STRSUB: Length too big: %u", len);
- len = MAXSTRLEN;
- }
-
- if (snprintf($$, len + 1, "%s", $3 + $5 - 1) > MAXSTRLEN)
- warning("STRSUB: String too long '%s'", $$);
+ strsubUTF8($$, $3, $5, $7);
}
| T_OP_STRCAT '(' string comma string ')'
{
--- /dev/null
+++ b/test/asm/strlen.asm
@@ -1,0 +1,9 @@
+SECTION "sec", ROM0
+
+xstrlen: MACRO
+ PRINTV STRLEN(\1)
+ PRINTT "\n"
+ENDM
+
+ xstrlen "ABC"
+ xstrlen "カタカナ"
--- /dev/null
+++ b/test/asm/strlen.out
@@ -1,0 +1,2 @@
+$3
+$4
--- /dev/null
+++ b/test/asm/strsub.asm
@@ -1,0 +1,22 @@
+SECTION "sec", ROM0
+
+xstrsub: MACRO
+ PRINTT STRSUB(\1, \2, \3)
+ PRINTT "\n"
+ENDM
+
+ xstrsub "ABC", 1, 1
+ xstrsub "ABC", 2, 1
+ xstrsub "ABC", 3, 1
+ xstrsub "ABC", 1, 2
+ xstrsub "ABC", 2, 2
+ xstrsub "ABC", 2, 32
+ xstrsub "ABC", 2, 300
+ xstrsub "ABC", 0, 300
+ xstrsub "ABC", 4, 0
+ xstrsub "ABC", 4, 1
+ xstrsub "カタカナ", 1, 2
+ xstrsub "カタカナ", 3, 2
+ xstrsub "カタカナ", 3, 10
+ xstrsub "g̈", 1, 1
+ xstrsub "g̈", 1, 2
--- /dev/null
+++ b/test/asm/strsub.out
@@ -1,0 +1,31 @@
+warning: strsub.asm(13) -> xstrsub(1):
+ STRSUB: Length too big: 32
+warning: strsub.asm(14) -> xstrsub(1):
+ STRSUB: Length too big: 300
+warning: strsub.asm(15) -> xstrsub(1):
+ STRSUB: Position starts at 1
+warning: strsub.asm(15) -> xstrsub(1):
+ STRSUB: Length too big: 300
+warning: strsub.asm(16) -> xstrsub(1):
+ STRSUB: Position 4 is past the end of the string
+warning: strsub.asm(17) -> xstrsub(1):
+ STRSUB: Position 4 is past the end of the string
+warning: strsub.asm(17) -> xstrsub(1):
+ STRSUB: Length too big: 1
+warning: strsub.asm(20) -> xstrsub(1):
+ STRSUB: Length too big: 10
+A
+B
+C
+AB
+BC
+BC
+BC
+ABC
+
+
+カタ
+カナ
+カナ
+g
+g̈