shithub: rgbds

Download patch

ref: 2005ed1df9a6abfbb693889129af15f78569282e
parent: d43408f4f3dc7b8375e43ea5de77075726dcecfc
author: Rangi <remy.oukaour+rangi42@gmail.com>
date: Mon Mar 8 10:11:12 EST 2021

Implement CHARLEN and CHARSUB

Fixes #786

--- a/include/asm/charmap.h
+++ b/include/asm/charmap.h
@@ -18,5 +18,6 @@
 void charmap_Pop(void);
 void charmap_Add(char *mapping, uint8_t value);
 size_t charmap_Convert(char const *input, uint8_t *output);
+size_t charmap_ConvertNext(char const **input, uint8_t *output);
 
 #endif /* RGBDS_ASM_CHARMAP_H */
--- a/src/asm/charmap.c
+++ b/src/asm/charmap.c
@@ -57,7 +57,7 @@
 
 struct CharmapStackEntry *charmapStack;
 
-static struct Charmap *charmap_Get(const char *name)
+static struct Charmap *charmap_Get(char const *name)
 {
 	return hash_GetElement(charmaps, name);
 }
@@ -193,6 +193,19 @@
 
 size_t charmap_Convert(char const *input, uint8_t *output)
 {
+	size_t outputLen = 0;
+
+	for (size_t charLen = charmap_ConvertNext(&input, output); charLen;
+	     charLen = charmap_ConvertNext(&input, output)) {
+		output += charLen;
+		outputLen += charLen;
+	}
+
+	return outputLen;
+}
+
+size_t charmap_ConvertNext(char const **input, uint8_t *output)
+{
 	/*
 	 * The goal is to match the longest mapping possible.
 	 * For that, advance through the trie with each character read.
@@ -199,7 +212,6 @@
 	 * If that would lead to a dead end, rewind characters until the last match, and output.
 	 * If no match, read a UTF-8 codepoint and output that.
 	 */
-	size_t outputLen = 0;
 	struct Charmap const *charmap = *currentCharmap;
 	struct Charnode const *node = &charmap->nodes[0];
 	struct Charnode const *match = NULL;
@@ -207,10 +219,10 @@
 
 	for (;;) {
 		/* We still want NULs to reach the `else` path, to give a chance to rewind */
-		uint8_t c = *input - 1;
+		uint8_t c = **input - 1;
 
-		if (*input && node->next[c]) {
-			input++; /* Consume that char */
+		if (**input && node->next[c]) {
+			(*input)++; /* Consume that char */
 			rewindDistance++;
 
 			node = &charmap->nodes[node->next[c]];
@@ -220,31 +232,32 @@
 			}
 
 		} else {
-			input -= rewindDistance; /* Rewind */
+			*input -= rewindDistance; /* Rewind */
 			rewindDistance = 0;
 			node = &charmap->nodes[0];
 
 			if (match) { /* Arrived at a dead end with a match found */
-				*output++ = match->value;
-				outputLen++;
-				match = NULL; /* Reset match for next round */
+				if (output)
+					*output = match->value;
 
-			} else if (*input) { /* No match found */
-				size_t codepointLen = readUTF8Char(output, input);
+				return 1;
 
-				if (codepointLen == 0) {
+			} else if (**input) { /* No match found */
+				size_t codepointLen = readUTF8Char(output, *input);
+
+				if (codepointLen == 0)
 					error("Input string is not valid UTF-8!\n");
-					break;
-				}
-				input += codepointLen; /* OK because UTF-8 has no NUL in multi-byte chars */
-				output += codepointLen;
-				outputLen += codepointLen;
-			}
 
-			if (!*input)
-				break;
+				/* OK because UTF-8 has no NUL in multi-byte chars */
+				*input += codepointLen;
+
+				return codepointLen;
+
+			} else { /* End of input */
+				return 0;
+			}
 		}
 	}
 
-	return outputLen;
+	unreachable_();
 }
--- a/src/asm/lexer.c
+++ b/src/asm/lexer.c
@@ -210,6 +210,9 @@
 	{"STRRPL", T_OP_STRRPL},
 	{"STRFMT", T_OP_STRFMT},
 
+	{"CHARLEN", T_OP_CHARLEN},
+	{"CHARSUB", T_OP_CHARSUB},
+
 	{"INCLUDE", T_POP_INCLUDE},
 	{"PRINT", T_POP_PRINT},
 	{"PRINTLN", T_POP_PRINTLN},
@@ -589,7 +592,7 @@
 	uint16_t children[0x60 - ' '];
 	struct KeywordMapping const *keyword;
 /* Since the keyword structure is invariant, the min number of nodes is known at compile time */
-} keywordDict[351] = {0}; /* Make sure to keep this correct when adding keywords! */
+} keywordDict[357] = {0}; /* Make sure to keep this correct when adding keywords! */
 
 /* Convert a char into its index into the dict */
 static uint8_t dictIndex(char c)
--- a/src/asm/parser.y
+++ b/src/asm/parser.y
@@ -82,13 +82,12 @@
 	return NULL;
 }
 
-static size_t strlenUTF8(const char *s)
+static size_t strlenUTF8(char const *s)
 {
 	size_t len = 0;
 	uint32_t state = 0;
-	uint32_t codep = 0;
 
-	while (*s) {
+	for (uint32_t codep = 0; *s; s++) {
 		switch (decode(&state, &codep, *s)) {
 		case 1:
 			fatalerror("STRLEN: Invalid UTF-8 character\n");
@@ -97,7 +96,6 @@
 			len++;
 			break;
 		}
-		s++;
 	}
 
 	/* Check for partial code point. */
@@ -107,13 +105,12 @@
 	return len;
 }
 
-static void strsubUTF8(char *dest, size_t destLen, const char *src, uint32_t pos, uint32_t len)
+static void strsubUTF8(char *dest, size_t destLen, char const *src, uint32_t pos, uint32_t len)
 {
 	size_t srcIndex = 0;
 	size_t destIndex = 0;
 	uint32_t state = 0;
 	uint32_t codep = 0;
-	uint32_t curPos = 1;
 	uint32_t curLen = 0;
 
 	if (pos < 1) {
@@ -122,7 +119,7 @@
 	}
 
 	/* Advance to starting position in source string. */
-	while (src[srcIndex] && curPos < pos) {
+	for (uint32_t curPos = 1; src[srcIndex] && curPos < pos; srcIndex++) {
 		switch (decode(&state, &codep, src[srcIndex])) {
 		case 1:
 			fatalerror("STRSUB: Invalid UTF-8 character\n");
@@ -131,7 +128,6 @@
 			curPos++;
 			break;
 		}
-		srcIndex++;
 	}
 
 	if (!src[srcIndex] && len)
@@ -162,6 +158,42 @@
 	dest[destIndex] = '\0';
 }
 
+/* Counts how many times charmap_ConvertNext() can consume an entry from `s` */
+static size_t charlenUTF8(char const *s)
+{
+	size_t count = 0;
+
+	while (charmap_ConvertNext(&s, NULL))
+		count++;
+	return count;
+}
+
+/* Copies into `dest` the text of the charmap entry at 1-based position `pos` in `src` */
+static void charsubUTF8(char *dest, char const *src, uint32_t pos)
+{
+	uint32_t skipped = 0;
+
+	if (pos == 0) {
+		warning(WARNING_BUILTIN_ARG, "CHARSUB: Position starts at 1\n");
+		pos = 1;
+	}
+
+	/* Skip the entries before `pos`, stopping early once the converter returns 0 */
+	while (skipped < pos - 1 && charmap_ConvertNext(&src, NULL))
+		skipped++;
+
+	char const *entry = src;
+
+	if (charmap_ConvertNext(&src, NULL) == 0)
+		warning(WARNING_BUILTIN_ARG,
+			"CHARSUB: Position %lu is past the end of the string\n",
+			(unsigned long)pos);
+
+	/* `src` now sits just past the entry: copy its bytes, then terminate */
+	memcpy(dest, entry, src - entry);
+	dest[src - entry] = '\0';
+}
+
 static void strrpl(char *dest, size_t destLen, char const *src, char const *old, char const *new)
 {
 	size_t oldLen = strlen(old);
@@ -503,6 +535,9 @@
 %token	T_OP_STRRPL "STRRPL"
 %token	T_OP_STRFMT "STRFMT"
 
+%token	T_OP_CHARLEN "CHARLEN"
+%token	T_OP_CHARSUB "CHARSUB"
+
 %token	<tzSym> T_LABEL "label"
 %token	<tzSym> T_ID "identifier"
 %token	<tzSym> T_LOCAL_ID "local identifier"
@@ -1451,6 +1486,9 @@
 		| T_OP_STRLEN T_LPAREN string T_RPAREN {
 			rpn_Number(&$$, strlenUTF8($3));
 		}
+		| T_OP_CHARLEN T_LPAREN string T_RPAREN {
+			rpn_Number(&$$, charlenUTF8($3));
+		}
 		| T_LPAREN relocexpr T_RPAREN	{ $$ = $2; }
 ;
 
@@ -1487,6 +1525,9 @@
 string		: T_STRING
 		| T_OP_STRSUB T_LPAREN string T_COMMA uconst T_COMMA uconst T_RPAREN {
 			strsubUTF8($$, sizeof($$), $3, $5, $7);
+		}
+		| T_OP_CHARSUB T_LPAREN string T_COMMA uconst T_RPAREN {
+			charsubUTF8($$, $3, $5);
 		}
 		| T_OP_STRCAT T_LPAREN T_RPAREN {
 			$$[0] = '\0';
--- a/src/asm/rgbasm.5
+++ b/src/asm/rgbasm.5
@@ -394,11 +394,13 @@
 .It Fn STRCMP str1 str2 Ta Returns -1 if Ar str1 No is alphabetically lower than Ar str2 No , zero if they match, 1 if Ar str1 No is greater than Ar str2 .
 .It Fn STRIN str1 str2 Ta Returns the first position of Ar str2 No in Ar str1 No or zero if it's not present Pq first character is position 1 .
 .It Fn STRRIN str1 str2 Ta Returns the last position of Ar str2 No in Ar str1 No or zero if it's not present Pq first character is position 1 .
-.It Fn STRSUB str pos len Ta Returns a substring from Ar str No starting at Ar pos Po first character is position 1 Pc and Ar len No characters long.
+.It Fn STRSUB str pos len Ta Returns a substring from Ar str No starting at Ar pos No (first character is position 1) and Ar len No characters long.
 .It Fn STRUPR str Ta Returns Ar str No with all letters in uppercase.
 .It Fn STRLWR str Ta Returns Ar str No with all letters in lowercase.
 .It Fn STRRPL str old new Ta Returns Ar str No with each non-overlapping occurrence of the substring Ar old No replaced with Ar new .
+.It Fn CHARLEN str Ta Returns the number of charmap entries in Ar str No with the current charmap.
+.It Fn CHARSUB str pos Ta Returns the substring for the charmap entry at Ar pos No in Ar str No (first character is position 1) with the current charmap.
 .It Fn STRFMT fmt args... Ta Returns the string Ar fmt No with each
 .Ql %spec
 pattern replaced by interpolating the format
 .Ar spec
--- a/src/asm/util.c
+++ b/src/asm/util.c
@@ -67,7 +67,8 @@
 		if (decode(&state, &codep, src[i]) == 1)
 			return 0;
 
-		dest[i] = src[i];
+		if (dest)
+			dest[i] = src[i];
 		i++;
 
 		if (state == 0)
--- /dev/null
+++ b/test/asm/charlen-charsub.asm
@@ -1,0 +1,25 @@
+	charmap "<NULL>", $00
+	charmap "A", $10
+	charmap "B", $20
+	charmap "C", $30
+	charmap "Bold", $88 ; longer mapping that also starts with "B"
+
+SECTION "test", ROM0
+
+S EQUS "XBold<NULL>ABC" ; "X" has no mapping above; the rest is covered by the charmap
+
+	assert CHARLEN("{S}") == 6 ; expected entries: X, Bold, <NULL>, A, B, C
+	println CHARSUB("{S}", 2) ; expected to print "Bold" (first line of the .out file)
+	assert !STRCMP(CHARSUB("{S}", 2), "Bold")
+	assert CHARSUB("{S}", 2) == "Bold" && "Bold" == $88
+	assert CHARSUB("{S}", 1) == $58 ; ASCII "X"
+	db "{S}"
+
+	newcharmap ascii ; from here on the test expects one entry per character
+
+	assert CHARLEN("{S}") == 14 ; one entry for each of the 14 characters
+	println CHARSUB("{S}", 2) ; expected to print "B" (second line of the .out file)
+	assert !STRCMP(CHARSUB("{S}", 2), "B")
+	assert CHARSUB("{S}", 2) == "B" && "B" == $42 ; ASCII "B"
+	assert CHARSUB("{S}", 1) == $58 ; ASCII "X"
+	db "{S}"
--- /dev/null
+++ b/test/asm/charlen-charsub.out
@@ -1,0 +1,2 @@
+Bold
+B
binary files /dev/null b/test/asm/charlen-charsub.out.bin differ