1101 lines
30 KiB
C
1101 lines
30 KiB
C
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
|
|
#include "tokenizer.h"
|
|
|
|
typedef struct {
|
|
const char* keyword;
|
|
TokenizerTag tag;
|
|
} KeywordMap;
|
|
|
|
/*
 * Returns a printable name for `tag`.
 *
 * The per-tag cases are generated by TOKENIZER_FOREACH_TAG_ENUM applied to
 * TOKENIZER_GENERATE_CASE (both defined outside this file). Any value that
 * is not handled by a generated case yields "UNKNOWN".
 */
const char* tokenizerGetTagString(TokenizerTag tag)
{
    switch (tag) {
        TOKENIZER_FOREACH_TAG_ENUM(TOKENIZER_GENERATE_CASE)
    default:
        break;
    }

    return "UNKNOWN";
}
|
|
|
|
const KeywordMap keywords[] = {
|
|
{ "addrspace", TOKEN_KEYWORD_ADDRSPACE },
|
|
{ "align", TOKEN_KEYWORD_ALIGN },
|
|
{ "allowzero", TOKEN_KEYWORD_ALLOWZERO },
|
|
{ "and", TOKEN_KEYWORD_AND },
|
|
{ "anyframe", TOKEN_KEYWORD_ANYFRAME },
|
|
{ "anytype", TOKEN_KEYWORD_ANYTYPE },
|
|
{ "asm", TOKEN_KEYWORD_ASM },
|
|
{ "async", TOKEN_KEYWORD_ASYNC },
|
|
{ "await", TOKEN_KEYWORD_AWAIT },
|
|
{ "break", TOKEN_KEYWORD_BREAK },
|
|
{ "callconv", TOKEN_KEYWORD_CALLCONV },
|
|
{ "catch", TOKEN_KEYWORD_CATCH },
|
|
{ "comptime", TOKEN_KEYWORD_COMPTIME },
|
|
{ "const", TOKEN_KEYWORD_CONST },
|
|
{ "continue", TOKEN_KEYWORD_CONTINUE },
|
|
{ "defer", TOKEN_KEYWORD_DEFER },
|
|
{ "else", TOKEN_KEYWORD_ELSE },
|
|
{ "enum", TOKEN_KEYWORD_ENUM },
|
|
{ "errdefer", TOKEN_KEYWORD_ERRDEFER },
|
|
{ "error", TOKEN_KEYWORD_ERROR },
|
|
{ "export", TOKEN_KEYWORD_EXPORT },
|
|
{ "extern", TOKEN_KEYWORD_EXTERN },
|
|
{ "fn", TOKEN_KEYWORD_FN },
|
|
{ "for", TOKEN_KEYWORD_FOR },
|
|
{ "if", TOKEN_KEYWORD_IF },
|
|
{ "inline", TOKEN_KEYWORD_INLINE },
|
|
{ "linksection", TOKEN_KEYWORD_LINKSECTION },
|
|
{ "noalias", TOKEN_KEYWORD_NOALIAS },
|
|
{ "noinline", TOKEN_KEYWORD_NOINLINE },
|
|
{ "nosuspend", TOKEN_KEYWORD_NOSUSPEND },
|
|
{ "opaque", TOKEN_KEYWORD_OPAQUE },
|
|
{ "or", TOKEN_KEYWORD_OR },
|
|
{ "orelse", TOKEN_KEYWORD_ORELSE },
|
|
{ "packed", TOKEN_KEYWORD_PACKED },
|
|
{ "pub", TOKEN_KEYWORD_PUB },
|
|
{ "resume", TOKEN_KEYWORD_RESUME },
|
|
{ "return", TOKEN_KEYWORD_RETURN },
|
|
{ "struct", TOKEN_KEYWORD_STRUCT },
|
|
{ "suspend", TOKEN_KEYWORD_SUSPEND },
|
|
{ "switch", TOKEN_KEYWORD_SWITCH },
|
|
{ "test", TOKEN_KEYWORD_TEST },
|
|
{ "threadlocal", TOKEN_KEYWORD_THREADLOCAL },
|
|
{ "try", TOKEN_KEYWORD_TRY },
|
|
{ "union", TOKEN_KEYWORD_UNION },
|
|
{ "unreachable", TOKEN_KEYWORD_UNREACHABLE },
|
|
{ "usingnamespace", TOKEN_KEYWORD_USINGNAMESPACE },
|
|
{ "var", TOKEN_KEYWORD_VAR },
|
|
{ "volatile", TOKEN_KEYWORD_VOLATILE },
|
|
{ "while", TOKEN_KEYWORD_WHILE }
|
|
};
|
|
|
|
// TODO binary search
|
|
static TokenizerTag getKeyword(const char* bytes, const uint32_t len) {
|
|
for (unsigned long i = 0; i < sizeof(keywords) / sizeof(KeywordMap); i++) {
|
|
size_t klen = strlen(keywords[i].keyword);
|
|
size_t minlen = klen < len ? klen : len;
|
|
int cmp = strncmp(bytes, keywords[i].keyword, minlen);
|
|
if (cmp == 0) {
|
|
if (len == klen) {
|
|
return keywords[i].tag;
|
|
} else {
|
|
return TOKEN_INVALID;
|
|
}
|
|
} else if (cmp < 0) {
|
|
return TOKEN_INVALID;
|
|
}
|
|
}
|
|
return TOKEN_INVALID;
|
|
}
|
|
|
|
Tokenizer tokenizerInit(const char* buffer, const uint32_t len) {
|
|
return (Tokenizer) {
|
|
.buffer = buffer,
|
|
.buffer_len = len,
|
|
.index = (len >= 3 && memcmp(buffer, "\xEF\xBB\xBF", 3) == 0) ? 3 : 0,
|
|
};
|
|
}
|
|
|
|
/*
 * Scans and returns the next token, advancing `self->index` past it.
 *
 * The scanner is a state machine driven by `goto state`: each state looks at
 * the byte at `self->index` and either finishes the token (`break` out of
 * the outer switch) or selects the next state and jumps back to the top.
 *
 * Result:
 *   - `tag`       the recognised token kind, or TOKEN_INVALID for malformed
 *                 input (an invalid token extends up to the next '\n' or the
 *                 end of input).
 *   - `loc.start` offset of the token's first byte. Whitespace and ordinary
 *                 line comments are skipped and move `start` forward.
 *   - `loc.end`   offset one past the token's last byte.
 *   - At end of input a TOKEN_EOF token with start == end == index is
 *     returned.
 *
 * End of input is detected by reading a NUL byte at
 * `self->index == self->buffer_len`, so `self->buffer[self->buffer_len]`
 * must be readable and hold 0. A NUL anywhere else is treated as invalid.
 */
TokenizerToken tokenizerNext(Tokenizer* self)
{
    TokenizerToken result = (TokenizerToken) {
        .tag = TOKEN_INVALID,
        .loc = {
            /*
             * The token begins wherever the previous one ended. Starting
             * from 0 here would give every token that is not preceded by
             * whitespace a bogus start offset and would feed the wrong
             * slice to the keyword lookup in the identifier state.
             */
            .start = self->index,
        },
    };

    TokenizerState state = TOKENIZER_STATE_START;

state:
    switch (state) {
    case TOKENIZER_STATE_START:
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index == self->buffer_len) {
                return (TokenizerToken) {
                    .tag = TOKEN_EOF,
                    .loc = {
                        .start = self->index,
                        .end = self->index,
                    }
                };
            } else {
                /* NUL byte in the middle of the input. */
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
        case ' ':
        case '\n':
        case '\t':
        case '\r':
            /* Skip whitespace; the token starts after it. */
            self->index++;
            result.loc.start = self->index;
            goto state;
        case '"':
            result.tag = TOKEN_STRING_LITERAL;
            state = TOKENIZER_STATE_STRING_LITERAL;
            goto state;
        case '\'':
            result.tag = TOKEN_CHAR_LITERAL;
            state = TOKENIZER_STATE_CHAR_LITERAL;
            goto state;
        case 'a' ... 'z':
        case 'A' ... 'Z':
        case '_':
            result.tag = TOKEN_IDENTIFIER;
            state = TOKENIZER_STATE_IDENTIFIER;
            goto state;
        case '@':
            state = TOKENIZER_STATE_SAW_AT_SIGN;
            goto state;
        case '=':
            state = TOKENIZER_STATE_EQUAL;
            goto state;
        case '!':
            state = TOKENIZER_STATE_BANG;
            goto state;
        case '|':
            state = TOKENIZER_STATE_PIPE;
            goto state;
        case '(':
            result.tag = TOKEN_L_PAREN;
            self->index++;
            break;
        case ')':
            result.tag = TOKEN_R_PAREN;
            self->index++;
            break;
        case '[':
            result.tag = TOKEN_L_BRACKET;
            self->index++;
            break;
        case ']':
            result.tag = TOKEN_R_BRACKET;
            self->index++;
            break;
        case ';':
            result.tag = TOKEN_SEMICOLON;
            self->index++;
            break;
        case ',':
            result.tag = TOKEN_COMMA;
            self->index++;
            break;
        case '?':
            result.tag = TOKEN_QUESTION_MARK;
            self->index++;
            break;
        case ':':
            result.tag = TOKEN_COLON;
            self->index++;
            break;
        case '%':
            state = TOKENIZER_STATE_PERCENT;
            goto state;
        case '*':
            state = TOKENIZER_STATE_ASTERISK;
            goto state;
        case '+':
            state = TOKENIZER_STATE_PLUS;
            goto state;
        case '<':
            state = TOKENIZER_STATE_ANGLE_BRACKET_LEFT;
            goto state;
        case '>':
            state = TOKENIZER_STATE_ANGLE_BRACKET_RIGHT;
            goto state;
        case '^':
            state = TOKENIZER_STATE_CARET;
            goto state;
        case '\\':
            result.tag = TOKEN_MULTILINE_STRING_LITERAL_LINE;
            state = TOKENIZER_STATE_BACKSLASH;
            goto state;
        case '{':
            result.tag = TOKEN_L_BRACE;
            self->index++;
            break;
        case '}':
            result.tag = TOKEN_R_BRACE;
            self->index++;
            break;
        case '~':
            result.tag = TOKEN_TILDE;
            self->index++;
            break;
        case '.':
            state = TOKENIZER_STATE_PERIOD;
            goto state;
        case '-':
            state = TOKENIZER_STATE_MINUS;
            goto state;
        case '/':
            state = TOKENIZER_STATE_SLASH;
            goto state;
        case '&':
            state = TOKENIZER_STATE_AMPERSAND;
            goto state;
        case '0' ... '9':
            result.tag = TOKEN_NUMBER_LITERAL;
            self->index++;
            state = TOKENIZER_STATE_INT;
            goto state;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        }
        break;

    /* Seen '\r' at the end of a line comment: only '\n' may follow. */
    case TOKENIZER_STATE_EXPECT_NEWLINE:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index == self->buffer_len) {
                result.tag = TOKEN_INVALID;
            } else {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case '\n':
            self->index++;
            result.loc.start = self->index;
            state = TOKENIZER_STATE_START;
            goto state;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        }
        break;

    /* Consume bytes up to (not including) '\n' or end of input. */
    case TOKENIZER_STATE_INVALID:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index == self->buffer_len) {
                result.tag = TOKEN_INVALID;
            } else {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        }
        break;

    case TOKENIZER_STATE_SAW_AT_SIGN:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        case '"':
            /* @"..." is an identifier written with string syntax. */
            result.tag = TOKEN_IDENTIFIER;
            state = TOKENIZER_STATE_STRING_LITERAL;
            goto state;
        case 'a' ... 'z':
        case 'A' ... 'Z':
        case '_':
            result.tag = TOKEN_BUILTIN;
            state = TOKENIZER_STATE_BUILTIN;
            goto state;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        }
        break;

    case TOKENIZER_STATE_AMPERSAND:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_AMPERSAND_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_AMPERSAND;
            break;
        }
        break;

    case TOKENIZER_STATE_ASTERISK:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_ASTERISK_EQUAL;
            self->index++;
            break;
        case '*':
            result.tag = TOKEN_ASTERISK_ASTERISK;
            self->index++;
            break;
        case '%':
            state = TOKENIZER_STATE_ASTERISK_PERCENT;
            goto state;
        case '|':
            state = TOKENIZER_STATE_ASTERISK_PIPE;
            goto state;
        default:
            result.tag = TOKEN_ASTERISK;
            break;
        }
        break;

    case TOKENIZER_STATE_ASTERISK_PERCENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_ASTERISK_PERCENT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ASTERISK_PERCENT;
            break;
        }
        break;

    case TOKENIZER_STATE_ASTERISK_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_ASTERISK_PIPE_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ASTERISK_PIPE;
            break;
        }
        break;

    case TOKENIZER_STATE_PERCENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_PERCENT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_PERCENT;
            break;
        }
        break;

    case TOKENIZER_STATE_PLUS:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_PLUS_EQUAL;
            self->index++;
            break;
        case '+':
            result.tag = TOKEN_PLUS_PLUS;
            self->index++;
            break;
        case '%':
            state = TOKENIZER_STATE_PLUS_PERCENT;
            goto state;
        case '|':
            state = TOKENIZER_STATE_PLUS_PIPE;
            goto state;
        default:
            result.tag = TOKEN_PLUS;
            break;
        }
        break;

    case TOKENIZER_STATE_PLUS_PERCENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_PLUS_PERCENT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_PLUS_PERCENT;
            break;
        }
        break;

    case TOKENIZER_STATE_PLUS_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_PLUS_PIPE_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_PLUS_PIPE;
            break;
        }
        break;

    case TOKENIZER_STATE_CARET:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_CARET_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_CARET;
            break;
        }
        break;

    case TOKENIZER_STATE_IDENTIFIER:
        self->index++;
        switch (self->buffer[self->index]) {
        case 'a' ... 'z':
        case 'A' ... 'Z':
        case '_':
        case '0' ... '9':
            state = TOKENIZER_STATE_IDENTIFIER;
            goto state;
        default: {
            /* Identifier complete: promote it to a keyword tag if it is one. */
            const char* start = self->buffer + result.loc.start;
            uint32_t len = self->index - result.loc.start;
            TokenizerTag tag = getKeyword(start, len);
            if (tag != TOKEN_INVALID) {
                result.tag = tag;
            }
            break;
        }
        }
        break;

    case TOKENIZER_STATE_BUILTIN:
        self->index++;
        switch (self->buffer[self->index]) {
        case 'a' ... 'z':
        case 'A' ... 'Z':
        case '_':
        case '0' ... '9':
            state = TOKENIZER_STATE_BUILTIN;
            goto state;
        default:
            /* Any other byte ends the builtin name. */
            break;
        }
        break;

    /* A single '\' is only valid as the first half of the "\\" prefix. */
    case TOKENIZER_STATE_BACKSLASH:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            result.tag = TOKEN_INVALID;
            break;
        case '\\':
            state = TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE;
            goto state;
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        }
        break;

    case TOKENIZER_STATE_STRING_LITERAL:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                /* Unterminated string at end of input. */
                result.tag = TOKEN_INVALID;
            }
            break;
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        case '\\':
            state = TOKENIZER_STATE_STRING_LITERAL_BACKSLASH;
            goto state;
        case '"':
            /* Closing quote is part of the token. */
            self->index++;
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x1f:
        case 0x7f:
            /* Control characters are not allowed inside the literal. */
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_STRING_LITERAL;
            goto state;
        }
        break;

    case TOKENIZER_STATE_STRING_LITERAL_BACKSLASH:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        default:
            /* The escaped byte is accepted as-is. */
            state = TOKENIZER_STATE_STRING_LITERAL;
            goto state;
        }
        break;

    case TOKENIZER_STATE_CHAR_LITERAL:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                /* Unterminated character literal at end of input. */
                result.tag = TOKEN_INVALID;
            }
            break;
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        case '\\':
            state = TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH;
            goto state;
        case '\'':
            /* Closing quote is part of the token. */
            self->index++;
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_CHAR_LITERAL;
            goto state;
        }
        break;

    case TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                result.tag = TOKEN_INVALID;
            }
            break;
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_CHAR_LITERAL;
            goto state;
        }
        break;

    /* Body of a "\\" line: runs to the end of the line, newline excluded. */
    case TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case '\n':
            break;
        case '\r':
            /* '\r' is only accepted as part of "\r\n". */
            if (self->buffer[self->index + 1] != '\n') {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE;
            goto state;
        }
        break;

    case TOKENIZER_STATE_BANG:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_BANG_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_BANG;
            break;
        }
        break;

    case TOKENIZER_STATE_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_PIPE_EQUAL;
            self->index++;
            break;
        case '|':
            result.tag = TOKEN_PIPE_PIPE;
            self->index++;
            break;
        default:
            result.tag = TOKEN_PIPE;
            break;
        }
        break;

    case TOKENIZER_STATE_EQUAL:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_EQUAL_EQUAL;
            self->index++;
            break;
        case '>':
            result.tag = TOKEN_EQUAL_ANGLE_BRACKET_RIGHT;
            self->index++;
            break;
        default:
            result.tag = TOKEN_EQUAL;
            break;
        }
        break;

    case TOKENIZER_STATE_MINUS:
        self->index++;
        switch (self->buffer[self->index]) {
        case '>':
            result.tag = TOKEN_ARROW;
            self->index++;
            break;
        case '=':
            result.tag = TOKEN_MINUS_EQUAL;
            self->index++;
            break;
        case '%':
            state = TOKENIZER_STATE_MINUS_PERCENT;
            goto state;
        case '|':
            state = TOKENIZER_STATE_MINUS_PIPE;
            goto state;
        default:
            result.tag = TOKEN_MINUS;
            break;
        }
        break;

    case TOKENIZER_STATE_MINUS_PERCENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_MINUS_PERCENT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_MINUS_PERCENT;
            break;
        }
        break;

    case TOKENIZER_STATE_MINUS_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_MINUS_PIPE_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_MINUS_PIPE;
            break;
        }
        break;

    case TOKENIZER_STATE_ANGLE_BRACKET_LEFT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '<':
            state = TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT;
            goto state;
        case '=':
            result.tag = TOKEN_ANGLE_BRACKET_LEFT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ANGLE_BRACKET_LEFT;
            break;
        }
        break;

    case TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL;
            self->index++;
            break;
        case '|':
            state = TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE;
            goto state;
        default:
            result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT;
            break;
        }
        break;

    case TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE;
            break;
        }
        break;

    case TOKENIZER_STATE_ANGLE_BRACKET_RIGHT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '>':
            state = TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT;
            goto state;
        case '=':
            result.tag = TOKEN_ANGLE_BRACKET_RIGHT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ANGLE_BRACKET_RIGHT;
            break;
        }
        break;

    case TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT;
            break;
        }
        break;

    case TOKENIZER_STATE_PERIOD:
        self->index++;
        switch (self->buffer[self->index]) {
        case '.':
            state = TOKENIZER_STATE_PERIOD_2;
            goto state;
        case '*':
            state = TOKENIZER_STATE_PERIOD_ASTERISK;
            goto state;
        default:
            result.tag = TOKEN_PERIOD;
            break;
        }
        break;

    case TOKENIZER_STATE_PERIOD_2:
        self->index++;
        switch (self->buffer[self->index]) {
        case '.':
            result.tag = TOKEN_ELLIPSIS3;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ELLIPSIS2;
            break;
        }
        break;

    case TOKENIZER_STATE_PERIOD_ASTERISK:
        self->index++;
        switch (self->buffer[self->index]) {
        case '*':
            /* ".**" is flagged; the second '*' is left unconsumed. */
            result.tag = TOKEN_INVALID_PERIODASTERISKS;
            break;
        default:
            result.tag = TOKEN_PERIOD_ASTERISK;
            break;
        }
        break;

    case TOKENIZER_STATE_SLASH:
        self->index++;
        switch (self->buffer[self->index]) {
        case '/':
            state = TOKENIZER_STATE_LINE_COMMENT_START;
            goto state;
        case '=':
            result.tag = TOKEN_SLASH_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_SLASH;
            break;
        }
        break;

    /* Just after "//": decide between comment, "//!" and "///". */
    case TOKENIZER_STATE_LINE_COMMENT_START:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                return (TokenizerToken) {
                    .tag = TOKEN_EOF,
                    .loc = {
                        .start = self->index,
                        .end = self->index,
                    }
                };
            }
            break;
        case '!':
            result.tag = TOKEN_CONTAINER_DOC_COMMENT;
            state = TOKENIZER_STATE_DOC_COMMENT;
            goto state;
        case '\n':
            /* Empty comment: skip it and restart after the newline. */
            self->index++;
            result.loc.start = self->index;
            state = TOKENIZER_STATE_START;
            goto state;
        case '/':
            state = TOKENIZER_STATE_DOC_COMMENT_START;
            goto state;
        case '\r':
            state = TOKENIZER_STATE_EXPECT_NEWLINE;
            goto state;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_LINE_COMMENT;
            goto state;
        }
        break;

    /* Just after "///": a fourth '/' turns it back into a plain comment. */
    case TOKENIZER_STATE_DOC_COMMENT_START:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
        case '\n':
            result.tag = TOKEN_DOC_COMMENT;
            break;
        case '\r':
            if (self->buffer[self->index + 1] == '\n') {
                result.tag = TOKEN_DOC_COMMENT;
            } else {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case '/':
            state = TOKENIZER_STATE_LINE_COMMENT;
            goto state;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            result.tag = TOKEN_DOC_COMMENT;
            state = TOKENIZER_STATE_DOC_COMMENT;
            goto state;
        }
        break;

    /* Ordinary comment: produces no token, scanning resumes on the next line. */
    case TOKENIZER_STATE_LINE_COMMENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                return (TokenizerToken) {
                    .tag = TOKEN_EOF,
                    .loc = {
                        .start = self->index,
                        .end = self->index,
                    }
                };
            }
            break;
        case '\n':
            self->index++;
            result.loc.start = self->index;
            state = TOKENIZER_STATE_START;
            goto state;
        case '\r':
            state = TOKENIZER_STATE_EXPECT_NEWLINE;
            goto state;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_LINE_COMMENT;
            goto state;
        }
        break;

    /* Doc comment body: the token ends before the line terminator. */
    case TOKENIZER_STATE_DOC_COMMENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
        case '\n':
            break;
        case '\r':
            if (self->buffer[self->index + 1] != '\n') {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_DOC_COMMENT;
            goto state;
        }
        break;

    /*
     * Number states. Unlike most states these inspect the current byte
     * without advancing first; the caller has already stepped past the
     * previous digit. Letters other than e/E/p/P are consumed as part of
     * the literal here without being checked against any radix.
     */
    case TOKENIZER_STATE_INT:
        switch (self->buffer[self->index]) {
        case '.':
            state = TOKENIZER_STATE_INT_PERIOD;
            goto state;
        case '_':
        case 'a' ... 'd':
        case 'f' ... 'o':
        case 'q' ... 'z':
        case 'A' ... 'D':
        case 'F' ... 'O':
        case 'Q' ... 'Z':
        case '0' ... '9':
            self->index++;
            state = TOKENIZER_STATE_INT;
            goto state;
        case 'e':
        case 'E':
        case 'p':
        case 'P':
            state = TOKENIZER_STATE_INT_EXPONENT;
            goto state;
        default:
            break;
        }
        break;

    case TOKENIZER_STATE_INT_EXPONENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '-':
        case '+':
            /* A sign after the exponent marker continues in the float state. */
            self->index++;
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        default:
            state = TOKENIZER_STATE_INT;
            goto state;
        }
        break;

    case TOKENIZER_STATE_INT_PERIOD:
        self->index++;
        switch (self->buffer[self->index]) {
        case '_':
        case 'a' ... 'd':
        case 'f' ... 'o':
        case 'q' ... 'z':
        case 'A' ... 'D':
        case 'F' ... 'O':
        case 'Q' ... 'Z':
        case '0' ... '9':
            self->index++;
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        case 'e':
        case 'E':
        case 'p':
        case 'P':
            state = TOKENIZER_STATE_FLOAT_EXPONENT;
            goto state;
        default:
            /* The '.' is not part of the number; give it back. */
            self->index--;
            break;
        }
        break;

    case TOKENIZER_STATE_FLOAT:
        switch (self->buffer[self->index]) {
        case '_':
        case 'a' ... 'd':
        case 'f' ... 'o':
        case 'q' ... 'z':
        case 'A' ... 'D':
        case 'F' ... 'O':
        case 'Q' ... 'Z':
        case '0' ... '9':
            self->index++;
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        case 'e':
        case 'E':
        case 'p':
        case 'P':
            state = TOKENIZER_STATE_FLOAT_EXPONENT;
            goto state;
        default:
            break;
        }
        break;

    case TOKENIZER_STATE_FLOAT_EXPONENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '-':
        case '+':
            self->index++;
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        default:
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        }
        break;
    }

    result.loc.end = self->index;

    return result;
}
|