ref: 8f287eeef94454eafe411a168f53bc5f606bab02
dir: /src/asm/lexer.c/
/* * This file is part of RGBDS. * * Copyright (c) 1997-2019, Carsten Sorensen and RGBDS contributors. * * SPDX-License-Identifier: MIT */ #include <assert.h> #include <stdio.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <strings.h> #include <ctype.h> #include "asm/asm.h" #include "asm/constexpr.h" #include "asm/fstack.h" #include "asm/lexer.h" #include "asm/main.h" #include "asm/rpn.h" #include "extern/err.h" #include "asmy.h" struct sLexString { char *tzName; uint32_t nToken; uint32_t nNameLength; struct sLexString *pNext; }; #define pLexBufferRealStart (pCurrentBuffer->pBufferRealStart) #define pLexBuffer (pCurrentBuffer->pBuffer) #define AtLineStart (pCurrentBuffer->oAtLineStart) #define SAFETYMARGIN 1024 #define BOM_SIZE 3 struct sLexFloat tLexFloat[32]; struct sLexString *tLexHash[LEXHASHSIZE]; YY_BUFFER_STATE pCurrentBuffer; uint32_t nLexMaxLength; // max length of all keywords and operators uint32_t tFloatingSecondChar[256]; uint32_t tFloatingFirstChar[256]; uint32_t tFloatingChars[256]; uint32_t nFloating; enum eLexerState lexerstate = LEX_STATE_NORMAL; /* UTF-8 byte order mark */ static const unsigned char bom[BOM_SIZE] = { 0xEF, 0xBB, 0xBF }; void upperstring(char *s) { while (*s) { *s = toupper(*s); s += 1; } } void lowerstring(char *s) { while (*s) { *s = tolower(*s); s += 1; } } void yyskipbytes(uint32_t count) { pLexBuffer += count; } void yyunputbytes(uint32_t count) { pLexBuffer -= count; } void yyunput(char c) { if (pLexBuffer <= pLexBufferRealStart) fatalerror("Buffer safety margin exceeded"); *(--pLexBuffer) = c; } void yyunputstr(char *s) { int32_t i, len; len = strlen(s); if (pLexBuffer - len < pLexBufferRealStart) fatalerror("Buffer safety margin exceeded"); for (i = len - 1; i >= 0; i--) *(--pLexBuffer) = s[i]; } void yy_switch_to_buffer(YY_BUFFER_STATE buf) { pCurrentBuffer = buf; } void yy_set_state(enum eLexerState i) { lexerstate = i; } void yy_delete_buffer(YY_BUFFER_STATE buf) { free(buf->pBufferStart - SAFETYMARGIN); free(buf); } /* * Maintains the following invariants: * 1. nBufferSize < capacity * 2. The buffer is terminated with 0 * 3. nBufferSize is the size without the terminator */ static void yy_buffer_append(YY_BUFFER_STATE buf, uint32_t capacity, char c) { assert(buf->pBufferStart[buf->nBufferSize] == 0); assert(buf->nBufferSize + 1 < capacity); buf->pBufferStart[buf->nBufferSize++] = c; buf->pBufferStart[buf->nBufferSize] = 0; } YY_BUFFER_STATE yy_scan_bytes(char *mem, uint32_t size) { YY_BUFFER_STATE pBuffer = malloc(sizeof(struct yy_buffer_state)); if (pBuffer == NULL) fatalerror("%s: Out of memory!", __func__); pBuffer->pBufferRealStart = malloc(size + 1 + SAFETYMARGIN); if (pBuffer->pBufferRealStart == NULL) fatalerror("%s: Out of memory for buffer!", __func__); pBuffer->pBufferStart = pBuffer->pBufferRealStart + SAFETYMARGIN; pBuffer->pBuffer = pBuffer->pBufferRealStart + SAFETYMARGIN; memcpy(pBuffer->pBuffer, mem, size); pBuffer->nBufferSize = size; pBuffer->oAtLineStart = 1; pBuffer->pBuffer[size] = 0; return pBuffer; } YY_BUFFER_STATE yy_create_buffer(FILE *f) { YY_BUFFER_STATE pBuffer = malloc(sizeof(struct yy_buffer_state)); if (pBuffer == NULL) fatalerror("%s: Out of memory!", __func__); size_t size = 0, capacity = -1; char *buf = NULL; /* * Check if we can get the file size without implementation-defined * behavior: * * From ftell(3p): * [On error], ftell() and ftello() shall return −1, and set errno to * indicate the error. * * The ftell() and ftello() functions shall fail if: [...] * ESPIPE The file descriptor underlying stream is associated with a * pipe, FIFO, or socket. * * From fseek(3p): * The behavior of fseek() on devices which are incapable of seeking * is implementation-defined. */ if (ftell(f) != -1) { fseek(f, 0, SEEK_END); capacity = ftell(f); rewind(f); } // If ftell errored or the block above wasn't executed if (capacity == -1) capacity = 4096; // Handle 0-byte files gracefully else if (capacity == 0) capacity = 1; while (!feof(f)) { if (buf == NULL || size >= capacity) { if (buf) capacity *= 2; /* Give extra room for 2 newlines and terminator */ buf = realloc(buf, capacity + SAFETYMARGIN + 3); if (buf == NULL) fatalerror("%s: Out of memory for buffer!", __func__); } char *bufpos = buf + SAFETYMARGIN + size; size_t read_count = fread(bufpos, 1, capacity - size, f); if (read_count == 0 && !feof(f)) fatalerror("%s: fread error", __func__); size += read_count; } pBuffer->pBufferRealStart = buf; pBuffer->pBufferStart = buf + SAFETYMARGIN; pBuffer->pBuffer = buf + SAFETYMARGIN; pBuffer->pBuffer[size] = 0; pBuffer->nBufferSize = size; /* This is added here to make the buffer scaling above easy to express, * while taking the newline space into account * for the `yy_buffer_append`s below. */ capacity += 3; /* Skip UTF-8 byte order mark. */ if (pBuffer->nBufferSize >= BOM_SIZE && !memcmp(pBuffer->pBuffer, bom, BOM_SIZE)) pBuffer->pBuffer += BOM_SIZE; /* Convert all line endings to LF and spaces */ char *mem = pBuffer->pBuffer; while (*mem) { if ((mem[0] == '\\') && (mem[1] == '\"' || mem[1] == '\\')) { mem += 2; } else { /* LF CR and CR LF */ if (((mem[0] == 10) && (mem[1] == 13)) || ((mem[0] == 13) && (mem[1] == 10))) { mem[0] = ' '; mem[1] = '\n'; mem += 2; /* LF and CR */ } else if ((mem[0] == 10) || (mem[0] == 13)) { mem[0] = '\n'; mem += 1; } else { mem += 1; } } } /* Remove comments */ mem = pBuffer->pBuffer; uint32_t instring = 0; while (*mem) { if (*mem == '\"') instring = 1 - instring; if ((mem[0] == '\\') && (mem[1] == '\"' || mem[1] == '\\')) { mem += 2; } else if (instring) { mem += 1; } else { /* Comments that start with ; anywhere in a line */ if (*mem == ';') { while (!((*mem == '\n') || (*mem == '\0'))) *mem++ = ' '; /* Comments that start with * at the start of a line */ } else if ((mem[0] == '\n') && (mem[1] == '*')) { mem += 1; while (!((*mem == '\n') || (*mem == '\0'))) *mem++ = ' '; } else { mem += 1; } } } /* Add newline if file doesn't end with one */ if (size == 0 || pBuffer->pBufferStart[size - 1] != '\n') yy_buffer_append(pBuffer, capacity, '\n'); /* Add newline if \ will eat the last newline */ if (pBuffer->nBufferSize >= 2) { size_t pos = pBuffer->nBufferSize - 2; /* Skip spaces */ while (pos > 0 && pBuffer->pBufferStart[pos] == ' ') pos--; if (pBuffer->pBufferStart[pos] == '\\') yy_buffer_append(pBuffer, capacity, '\n'); } pBuffer->oAtLineStart = 1; return pBuffer; } uint32_t lex_FloatAlloc(const struct sLexFloat *token) { tLexFloat[nFloating] = *token; return (1 << (nFloating++)); } /* * Make sure that only non-zero ASCII characters are used. Also, check if the * start is greater than the end of the range. */ void lex_CheckCharacterRange(uint16_t start, uint16_t end) { if (start > end || start < 1 || end > 127) { errx(1, "Invalid character range (start: %u, end: %u)", start, end); } } void lex_FloatDeleteRange(uint32_t id, uint16_t start, uint16_t end) { lex_CheckCharacterRange(start, end); while (start <= end) { tFloatingChars[start] &= ~id; start += 1; } } void lex_FloatAddRange(uint32_t id, uint16_t start, uint16_t end) { lex_CheckCharacterRange(start, end); while (start <= end) { tFloatingChars[start] |= id; start += 1; } } void lex_FloatDeleteFirstRange(uint32_t id, uint16_t start, uint16_t end) { lex_CheckCharacterRange(start, end); while (start <= end) { tFloatingFirstChar[start] &= ~id; start += 1; } } void lex_FloatAddFirstRange(uint32_t id, uint16_t start, uint16_t end) { lex_CheckCharacterRange(start, end); while (start <= end) { tFloatingFirstChar[start] |= id; start += 1; } } void lex_FloatDeleteSecondRange(uint32_t id, uint16_t start, uint16_t end) { lex_CheckCharacterRange(start, end); while (start <= end) { tFloatingSecondChar[start] &= ~id; start += 1; } } void lex_FloatAddSecondRange(uint32_t id, uint16_t start, uint16_t end) { lex_CheckCharacterRange(start, end); while (start <= end) { tFloatingSecondChar[start] |= id; start += 1; } } static struct sLexFloat *lexgetfloat(uint32_t nFloatMask) { if (nFloatMask == 0) fatalerror("Internal error in %s", __func__); int32_t i = 0; while ((nFloatMask & 1) == 0) { nFloatMask >>= 1; i++; } return &tLexFloat[i]; } static uint32_t lexcalchash(char *s) { uint32_t hash = 0; while (*s) hash = (hash * 283) ^ toupper(*s++); return hash % LEXHASHSIZE; } void lex_Init(void) { uint32_t i; for (i = 0; i < LEXHASHSIZE; i++) tLexHash[i] = NULL; for (i = 0; i < 256; i++) { tFloatingFirstChar[i] = 0; tFloatingSecondChar[i] = 0; tFloatingChars[i] = 0; } nLexMaxLength = 0; nFloating = 0; } void lex_AddStrings(const struct sLexInitString *lex) { while (lex->tzName) { struct sLexString **ppHash; uint32_t hash; ppHash = &tLexHash[hash = lexcalchash(lex->tzName)]; while (*ppHash) ppHash = &((*ppHash)->pNext); *ppHash = malloc(sizeof(struct sLexString)); if (*ppHash == NULL) fatalerror("Out of memory!"); (*ppHash)->tzName = (char *)strdup(lex->tzName); if ((*ppHash)->tzName == NULL) fatalerror("Out of memory!"); (*ppHash)->nNameLength = strlen(lex->tzName); (*ppHash)->nToken = lex->nToken; (*ppHash)->pNext = NULL; upperstring((*ppHash)->tzName); if ((*ppHash)->nNameLength > nLexMaxLength) nLexMaxLength = (*ppHash)->nNameLength; lex += 1; } } /* * Gets the "float" mask and "float" length. * "Float" refers to the token type of a token that is not a keyword. * The character classes floatingFirstChar, floatingSecondChar, and * floatingChars are defined separately for each token type. * It uses bit masks to match against a set of simple regular expressions * of the form /[floatingFirstChar]([floatingSecondChar][floatingChars]*)?/. * The token types with the longest match from the current position in the * buffer will have their bits set in the float mask. */ void yylex_GetFloatMaskAndFloatLen(uint32_t *pnFloatMask, uint32_t *pnFloatLen) { /* * Note that '\0' should always have a bit mask of 0 in the "floating" * tables, so it doesn't need to be checked for separately. */ char *s = pLexBuffer; uint32_t nOldFloatMask = 0; uint32_t nFloatMask = tFloatingFirstChar[(uint8_t)*s]; if (nFloatMask != 0) { s++; nOldFloatMask = nFloatMask; nFloatMask &= tFloatingSecondChar[(uint8_t)*s]; while (nFloatMask != 0) { s++; nOldFloatMask = nFloatMask; nFloatMask &= tFloatingChars[(uint8_t)*s]; } } *pnFloatMask = nOldFloatMask; *pnFloatLen = (uint32_t)(s - pLexBuffer); } /* * Gets the longest keyword/operator from the current position in the buffer. */ struct sLexString *yylex_GetLongestFixed(void) { struct sLexString *pLongestFixed = NULL; char *s = pLexBuffer; uint32_t hash = 0; uint32_t length = 0; while (length < nLexMaxLength && *s) { hash = (hash * 283) ^ toupper(*s); s++; length++; struct sLexString *lex = tLexHash[hash % LEXHASHSIZE]; while (lex) { if (lex->nNameLength == length && strncasecmp(pLexBuffer, lex->tzName, length) == 0) { pLongestFixed = lex; break; } lex = lex->pNext; } } return pLongestFixed; } size_t CopyMacroArg(char *dest, size_t maxLength, char c) { size_t i; char *s; int32_t argNum; switch (c) { case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': argNum = c - '0'; break; case '@': argNum = -1; break; default: return 0; } s = sym_FindMacroArg(argNum); if (s == NULL) fatalerror("Macro argument not defined"); for (i = 0; s[i] != 0; i++) { if (i >= maxLength) fatalerror("Macro argument too long to fit buffer"); dest[i] = s[i]; } return i; } static inline void yylex_StringWriteChar(char *s, size_t index, char c) { if (index >= MAXSTRLEN) fatalerror("String too long"); s[index] = c; } static inline void yylex_SymbolWriteChar(char *s, size_t index, char c) { if (index >= MAXSYMLEN) fatalerror("Symbol too long"); s[index] = c; } /* * Trims white space at the end of a string. * The index parameter is the index of the 0 at the end of the string. */ void yylex_TrimEnd(char *s, size_t index) { int32_t i = (int32_t)index - 1; while ((i >= 0) && (s[i] == ' ' || s[i] == '\t')) { s[i] = 0; i--; } } size_t yylex_ReadBracketedSymbol(char *dest, size_t index) { char sym[MAXSYMLEN + 1]; char ch; size_t i = 0; size_t length, maxLength; const char *mode = NULL; for (ch = *pLexBuffer; ch != '}' && ch != '"' && ch != '\n'; ch = *(++pLexBuffer)) { if (ch == '\\') { ch = *(++pLexBuffer); maxLength = MAXSYMLEN - i; length = CopyMacroArg(&sym[i], maxLength, ch); if (length != 0) i += length; else fatalerror("Illegal character escape '%c'", ch); } else if (ch == ':' && !mode) { /* Only grab 1st colon */ /* Use a whitelist of modes, which does prevent the * use of some features such as precision, * but also avoids a security flaw */ const char *acceptedModes = "bxXd"; /* Binary isn't natively supported, * so it's handled differently */ static const char * const formatSpecifiers[] = { "", "%x", "%X", "%d" }; /* Prevent reading out of bounds! */ const char *designatedMode; if (i != 1) fatalerror("Print types are exactly 1 character long"); designatedMode = strchr(acceptedModes, sym[i - 1]); if (!designatedMode) fatalerror("Illegal print type '%c'", sym[i - 1]); mode = formatSpecifiers[designatedMode - acceptedModes]; /* Begin writing the symbol again */ i = 0; } else { yylex_SymbolWriteChar(sym, i++, ch); } } /* Properly terminate the string */ yylex_SymbolWriteChar(sym, i, 0); /* It's assumed we're writing to a T_STRING */ maxLength = MAXSTRLEN - index; length = symvaluetostring(&dest[index], maxLength, sym, mode); if (*pLexBuffer == '}') pLexBuffer++; else fatalerror("Missing }"); return length; } static void yylex_ReadQuotedString(void) { size_t index = 0; size_t length, maxLength; while (*pLexBuffer != '"' && *pLexBuffer != '\n') { char ch = *pLexBuffer++; if (ch == '\\') { ch = *pLexBuffer++; switch (ch) { case 'n': ch = '\n'; break; case 't': ch = '\t'; break; case '\\': ch = '\\'; break; case '"': ch = '"'; break; case ',': ch = ','; break; case '{': ch = '{'; break; case '}': ch = '}'; break; default: maxLength = MAXSTRLEN - index; length = CopyMacroArg(&yylval.tzString[index], maxLength, ch); if (length != 0) index += length; else fatalerror("Illegal character escape '%c'", ch); ch = 0; break; } } else if (ch == '{') { // Get bracketed symbol within string. index += yylex_ReadBracketedSymbol(yylval.tzString, index); ch = 0; } if (ch) yylex_StringWriteChar(yylval.tzString, index++, ch); } yylex_StringWriteChar(yylval.tzString, index, 0); if (*pLexBuffer == '"') pLexBuffer++; else fatalerror("Unterminated string"); } static uint32_t yylex_NORMAL(void) { struct sLexString *pLongestFixed = NULL; uint32_t nFloatMask, nFloatLen; uint32_t linestart = AtLineStart; AtLineStart = 0; scanagain: while (*pLexBuffer == ' ' || *pLexBuffer == '\t') { linestart = 0; pLexBuffer++; } if (*pLexBuffer == 0) { // Reached the end of a file, macro, or rept. if (yywrap() == 0) { linestart = AtLineStart; AtLineStart = 0; goto scanagain; } } /* Check for line continuation character */ if (*pLexBuffer == '\\') { /* * Look for line continuation character after a series of * spaces. This is also useful for files that use Windows line * endings: "\r\n" is replaced by " \n" before the lexer has the * opportunity to see it. */ if (pLexBuffer[1] == ' ') { pLexBuffer += 2; while (1) { if (*pLexBuffer == ' ') { pLexBuffer++; } else if (*pLexBuffer == '\n') { pLexBuffer++; nLineNo += 1; goto scanagain; } else { errx(1, "Expected a new line after the continuation character."); } } } /* Line continuation character */ if (pLexBuffer[1] == '\n') { pLexBuffer += 2; nLineNo += 1; goto scanagain; } /* * If there isn't a newline character or a space, ignore the * character '\'. It will eventually be handled by other * functions like PutMacroArg(). */ } /* * Try to match an identifier, macro argument (e.g. \1), * or numeric literal. */ yylex_GetFloatMaskAndFloatLen(&nFloatMask, &nFloatLen); /* Try to match a keyword or operator. */ pLongestFixed = yylex_GetLongestFixed(); if (nFloatLen == 0 && pLongestFixed == NULL) { /* * No keyword, identifier, operator, or numerical literal * matches. */ if (*pLexBuffer == '"') { pLexBuffer++; yylex_ReadQuotedString(); return T_STRING; } else if (*pLexBuffer == '{') { pLexBuffer++; size_t len = yylex_ReadBracketedSymbol(yylval.tzString, 0); yylval.tzString[len] = 0; return T_STRING; } /* * It's not a keyword, operator, identifier, macro argument, * numeric literal, string, or bracketed symbol, so just return * the ASCII character. */ unsigned char ch = *pLexBuffer++; if (ch == '\n') AtLineStart = 1; /* * Check for invalid unprintable characters. * They may not be readily apparent in a text editor, * so this is useful for identifying encoding problems. */ if (ch != 0 && ch != '\n' && !(ch >= 0x20 && ch <= 0x7E)) fatalerror("Found garbage character: 0x%02X", ch); return ch; } if (pLongestFixed == NULL || nFloatLen > pLongestFixed->nNameLength) { /* * Longest match was an identifier, macro argument, or numeric * literal. */ struct sLexFloat *token = lexgetfloat(nFloatMask); if (token->Callback) { int32_t done = token->Callback(pLexBuffer, nFloatLen); if (!done) goto scanagain; } if (token->nToken == T_ID && linestart) return T_LABEL; else return token->nToken; } /* Longest match was a keyword or operator. */ pLexBuffer += pLongestFixed->nNameLength; yylval.nConstValue = pLongestFixed->nToken; return pLongestFixed->nToken; } static uint32_t yylex_MACROARGS(void) { size_t index = 0; size_t length, maxLength; while ((*pLexBuffer == ' ') || (*pLexBuffer == '\t')) pLexBuffer++; while ((*pLexBuffer != ',') && (*pLexBuffer != '\n')) { char ch = *pLexBuffer++; if (ch == '\\') { ch = *pLexBuffer++; switch (ch) { case 'n': ch = '\n'; break; case 't': ch = '\t'; break; case '\\': ch = '\\'; break; case '"': ch = '\"'; break; case ',': ch = ','; break; case '{': ch = '{'; break; case '}': ch = '}'; break; case ' ': /* * Look for line continuation character after a * series of spaces. This is also useful for * files that use Windows line endings: "\r\n" * is replaced by " \n" before the lexer has the * opportunity to see it. */ while (1) { if (*pLexBuffer == ' ') { pLexBuffer++; } else if (*pLexBuffer == '\n') { pLexBuffer++; nLineNo += 1; ch = 0; break; } else { errx(1, "Expected a new line after the continuation character."); } } break; case '\n': /* Line continuation character */ nLineNo += 1; ch = 0; break; default: maxLength = MAXSTRLEN - index; length = CopyMacroArg(&yylval.tzString[index], maxLength, ch); if (length != 0) index += length; else fatalerror("Illegal character escape '%c'", ch); ch = 0; break; } } else if (ch == '{') { index += yylex_ReadBracketedSymbol(yylval.tzString, index); ch = 0; } if (ch) yylex_StringWriteChar(yylval.tzString, index++, ch); } if (index) { yylex_StringWriteChar(yylval.tzString, index, 0); /* trim trailing white space at the end of the line */ if (*pLexBuffer == '\n') yylex_TrimEnd(yylval.tzString, index); return T_STRING; } else if (*pLexBuffer == '\n') { pLexBuffer++; AtLineStart = 1; return '\n'; } else if (*pLexBuffer == ',') { pLexBuffer++; return ','; } fatalerror("Internal error in %s", __func__); } int yylex(void) { switch (lexerstate) { case LEX_STATE_NORMAL: return yylex_NORMAL(); case LEX_STATE_MACROARGS: return yylex_MACROARGS(); default: fatalerror("%s: Internal error.", __func__); } }