shithub: rgbds

Download patch

ref: e7dc094e568bf05688fd729e9a266ee02d2a2607
parent: 3cd1d46a1bf66df5fafe04dde885cc6f3dfcdb28
author: jidoc01 <jidoc01@naver.com>
date: Mon Jun 24 20:58:47 EDT 2019

Improve charmap structure with trie

Charmap's previous structure was using brute-force comparison for
converting the strings in source files. It always compared given
strings to all of the strings in charmap, which was very costly
in huge projects.
For its improvement, I changed its structure into trie, which is
being used in many string-processing areas. It's now much faster
than before.

--- a/include/asm/charmap.h
+++ b/include/asm/charmap.h
@@ -13,14 +13,25 @@
 
 #define MAXCHARMAPS	512
 #define CHARMAPLENGTH	16
+#define MAXCHARNODES	(MAXCHARMAPS * CHARMAPLENGTH + 1)
 
+/*
+ * A node for trie structure.
+ */
+struct Charnode {
+	uint8_t code; /* the value in a key-value pair. */
+	uint8_t isCode; /* has one if it's a code node, not just a bridge node. */
+	struct Charnode *next[256]; /* each index representing the next possible character from its current state. */
+};
+
 struct Charmap {
-	int32_t count;
-	char input[MAXCHARMAPS][CHARMAPLENGTH + 1];
-	char output[MAXCHARMAPS];
+	int32_t charCount; /* user-side count. */
+	int32_t nodeCount; /* node-side count. */
+	struct Charnode nodes[MAXCHARNODES]; /* first node is reserved for the root node in charmap. */
 };
 
 int32_t readUTF8Char(char *destination, char *source);
+
 int32_t charmap_Add(char *input, uint8_t output);
 int32_t charmap_Convert(char **input);
 
--- a/src/asm/charmap.c
+++ b/src/asm/charmap.c
@@ -42,11 +42,10 @@
 int32_t charmap_Add(char *input, uint8_t output)
 {
 	int32_t i;
-	size_t input_length;
-	char temp1i[CHARMAPLENGTH + 1], temp2i[CHARMAPLENGTH + 1];
-	char temp1o = 0, temp2o = 0;
+	uint8_t v;
 
-	struct Charmap *charmap;
+	struct Charmap 	*charmap;
+	struct Charnode	*curr_node, *temp_node;
 
 	if (pCurrentSection) {
 		if (pCurrentSection->charmap) {
@@ -55,7 +54,6 @@
 			charmap = calloc(1, sizeof(struct Charmap));
 			if (charmap == NULL)
 				fatalerror("Not enough memory for charmap");
-
 			pCurrentSection->charmap = charmap;
 		}
 	} else {
@@ -62,84 +60,103 @@
 		charmap = &globalCharmap;
 	}
 
-	if (charmap->count > MAXCHARMAPS || strlen(input) > CHARMAPLENGTH)
+	if (charmap->charCount >= MAXCHARMAPS || strlen(input) > CHARMAPLENGTH)
 		return -1;
 
-	input_length = strlen(input);
-	if (input_length > 1) {
-		i = 0;
-		while (i < charmap->count + 1) {
-			if (input_length > strlen(charmap->input[i])) {
-				memcpy(temp1i, charmap->input[i],
-				       CHARMAPLENGTH + 1);
-				memcpy(charmap->input[i], input, input_length);
-				temp1o = charmap->output[i];
-				charmap->output[i] = output;
-				i++;
-				break;
-			}
-			i++;
+	curr_node = &charmap->nodes[0];
+
+	for (i = 0; (v = (uint8_t)input[i]); i++) {
+		if (curr_node->next[v]) {
+			curr_node = curr_node->next[v];
+		} else {
+			temp_node = &charmap->nodes[charmap->nodeCount + 1];
+
+			curr_node->next[v] = temp_node;
+			curr_node = temp_node;
+
+			++charmap->nodeCount;
 		}
-		while (i < charmap->count + 1) {
-			memcpy(temp2i, charmap->input[i], CHARMAPLENGTH + 1);
-			memcpy(charmap->input[i], temp1i, CHARMAPLENGTH + 1);
-			memcpy(temp1i, temp2i, CHARMAPLENGTH + 1);
-			temp2o = charmap->output[i];
-			charmap->output[i] = temp1o;
-			temp1o = temp2o;
-			i++;
-		}
-		memcpy(charmap->input[charmap->count + 1], temp1i,
-		       CHARMAPLENGTH + 1);
-		charmap->output[charmap->count + 1] = temp1o;
-	} else {
-		memcpy(charmap->input[charmap->count], input, input_length);
-		charmap->output[charmap->count] = output;
 	}
-	return ++charmap->count;
+
+	/* prevent duplicated keys by accepting only first key-value pair.  */
+	if (curr_node->isCode)
+		return charmap->charCount;
+
+	curr_node->code = output;
+	curr_node->isCode = 1;
+
+	return ++charmap->charCount;
 }
 
 int32_t charmap_Convert(char **input)
 {
-	struct Charmap *charmap;
+	struct Charmap 	*charmap;
+	struct Charnode	*charnode;
 
-	char outchar[CHARMAPLENGTH + 1];
-	char *buffer;
-	int32_t i, j, length;
+	char *output;
+	char outchar[8];
 
+	int32_t i, match, length;
+	uint8_t v, foundCode;
+
 	if (pCurrentSection && pCurrentSection->charmap)
 		charmap = pCurrentSection->charmap;
 	else
 		charmap = &globalCharmap;
 
-	buffer = malloc(strlen(*input));
-	if (buffer == NULL)
+	output = malloc(strlen(*input));
+	if (output == NULL)
 		fatalerror("Not enough memory for buffer");
 
 	length = 0;
+
 	while (**input) {
-		j = 0;
-		for (i = 0; i < charmap->count; i++) {
-			j = strlen(charmap->input[i]);
-			if (memcmp(*input, charmap->input[i], j) == 0) {
-				outchar[0] = charmap->output[i];
-				outchar[1] = 0;
+		charnode = &charmap->nodes[0];
+
+		/*
+		 * find the longest valid match which has been registered in charmap.
+		 * note that there could be either multiple matches or no match.
+		 * and it possibly takes the longest match between them,
+		 * which means that it ignores partial matches shorter than the longest one.
+		*/
+		for (i = match = 0; (v = (*input)[i]);) {
+			if (!charnode->next[v])
 				break;
+
+			charnode = charnode->next[v];
+			i++;
+
+			if (charnode->isCode) {
+				match = i;
+				foundCode = charnode->code;
 			}
-			j = 0;
 		}
 
-		if (!j)
-			j = readUTF8Char(outchar, *input);
+		if (match) {
+			output[length] = foundCode;
 
-		if (!outchar[0]) {
-			buffer[length++] = 0;
+			length += 1;
 		} else {
-			for (i = 0; outchar[i]; i++)
-				buffer[length++] = outchar[i];
+			/*
+			 * put a utf-8 character
+			 * if failed to find a match.
+			 */
+			match = readUTF8Char(outchar, *input);
+
+			if (match) {
+				memcpy(output + length, *input, match);
+			} else {
+				output[length] = 0;
+				match = 1;
+			}
+
+			length += match;
 		}
-		*input += j;
+
+		*input += match;
 	}
-	*input = buffer;
+
+	*input = output;
+
 	return length;
 }