/* zig0/tokenizer.c — Zig tokenizer (viewer metadata: 1101 lines, 30 KiB, C) */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "tokenizer.h"
// One entry of the keyword lookup table: a keyword's spelling paired with
// the token tag to emit when an identifier matches it exactly.
typedef struct {
const char* keyword; // NUL-terminated keyword spelling
TokenizerTag tag;    // tag returned by getKeyword() on an exact match
} KeywordMap;
// Returns a human-readable name for `tag`.
// The case labels are generated by the X-macro tag list from tokenizer.h;
// any value outside the generated set maps to "UNKNOWN".
// NOTE(review): exact expansion of TOKENIZER_GENERATE_CASE lives in
// tokenizer.h — presumably `case TOKEN_X: return "TOKEN_X";` — confirm there.
const char* tokenizerGetTagString(TokenizerTag tag) {
switch (tag) {
TOKENIZER_FOREACH_TAG_ENUM(TOKENIZER_GENERATE_CASE)
default:
return "UNKNOWN";
}
}
// Zig keyword table.
// INVARIANT: entries MUST stay in ascending strcmp() order — getKeyword()
// relies on the ordering to stop scanning early once the identifier sorts
// before the current entry.
// NOTE(review): not declared `static`; verify no other translation unit
// links against this symbol before narrowing its linkage.
const KeywordMap keywords[] = {
{ "addrspace", TOKEN_KEYWORD_ADDRSPACE },
{ "align", TOKEN_KEYWORD_ALIGN },
{ "allowzero", TOKEN_KEYWORD_ALLOWZERO },
{ "and", TOKEN_KEYWORD_AND },
{ "anyframe", TOKEN_KEYWORD_ANYFRAME },
{ "anytype", TOKEN_KEYWORD_ANYTYPE },
{ "asm", TOKEN_KEYWORD_ASM },
{ "async", TOKEN_KEYWORD_ASYNC },
{ "await", TOKEN_KEYWORD_AWAIT },
{ "break", TOKEN_KEYWORD_BREAK },
{ "callconv", TOKEN_KEYWORD_CALLCONV },
{ "catch", TOKEN_KEYWORD_CATCH },
{ "comptime", TOKEN_KEYWORD_COMPTIME },
{ "const", TOKEN_KEYWORD_CONST },
{ "continue", TOKEN_KEYWORD_CONTINUE },
{ "defer", TOKEN_KEYWORD_DEFER },
{ "else", TOKEN_KEYWORD_ELSE },
{ "enum", TOKEN_KEYWORD_ENUM },
{ "errdefer", TOKEN_KEYWORD_ERRDEFER },
{ "error", TOKEN_KEYWORD_ERROR },
{ "export", TOKEN_KEYWORD_EXPORT },
{ "extern", TOKEN_KEYWORD_EXTERN },
{ "fn", TOKEN_KEYWORD_FN },
{ "for", TOKEN_KEYWORD_FOR },
{ "if", TOKEN_KEYWORD_IF },
{ "inline", TOKEN_KEYWORD_INLINE },
{ "linksection", TOKEN_KEYWORD_LINKSECTION },
{ "noalias", TOKEN_KEYWORD_NOALIAS },
{ "noinline", TOKEN_KEYWORD_NOINLINE },
{ "nosuspend", TOKEN_KEYWORD_NOSUSPEND },
{ "opaque", TOKEN_KEYWORD_OPAQUE },
{ "or", TOKEN_KEYWORD_OR },
{ "orelse", TOKEN_KEYWORD_ORELSE },
{ "packed", TOKEN_KEYWORD_PACKED },
{ "pub", TOKEN_KEYWORD_PUB },
{ "resume", TOKEN_KEYWORD_RESUME },
{ "return", TOKEN_KEYWORD_RETURN },
{ "struct", TOKEN_KEYWORD_STRUCT },
{ "suspend", TOKEN_KEYWORD_SUSPEND },
{ "switch", TOKEN_KEYWORD_SWITCH },
{ "test", TOKEN_KEYWORD_TEST },
{ "threadlocal", TOKEN_KEYWORD_THREADLOCAL },
{ "try", TOKEN_KEYWORD_TRY },
{ "union", TOKEN_KEYWORD_UNION },
{ "unreachable", TOKEN_KEYWORD_UNREACHABLE },
{ "usingnamespace", TOKEN_KEYWORD_USINGNAMESPACE },
{ "var", TOKEN_KEYWORD_VAR },
{ "volatile", TOKEN_KEYWORD_VOLATILE },
{ "while", TOKEN_KEYWORD_WHILE }
};
// TODO binary search
static TokenizerTag getKeyword(const char* bytes, const uint32_t len) {
for (unsigned long i = 0; i < sizeof(keywords) / sizeof(KeywordMap); i++) {
size_t klen = strlen(keywords[i].keyword);
size_t minlen = klen < len ? klen : len;
int cmp = strncmp(bytes, keywords[i].keyword, minlen);
if (cmp == 0) {
if (len == klen) {
return keywords[i].tag;
} else {
return TOKEN_INVALID;
}
} else if (cmp < 0) {
return TOKEN_INVALID;
}
}
return TOKEN_INVALID;
}
Tokenizer tokenizerInit(const char* buffer, const uint32_t len) {
return (Tokenizer) {
.buffer = buffer,
.buffer_len = len,
.index = (len >= 3 && memcmp(buffer, "\xEF\xBB\xBF", 3) == 0) ? 3 : 0,
};
}
// Scans and returns the next token starting at self->index.
// The returned token's loc is a [start, end) byte range into self->buffer.
// A 0 byte is treated as end-of-input only when index == buffer_len
// (the buffer is presumably NUL-terminated at buffer_len — confirm in
// callers); an embedded NUL makes the token invalid.
//
// Implemented as a DFA: `state` selects a switch arm, and `goto state`
// re-dispatches, so each arm is a single state transition. Arms that
// `break` out of the switch fall through to set loc.end and return.
TokenizerToken tokenizerNext(Tokenizer* self) {
TokenizerToken result = (TokenizerToken) {
.tag = TOKEN_INVALID,
.loc = {
.start = 0,
},
};
TokenizerState state = TOKENIZER_STATE_START;
state:
switch (state) {
// Dispatch on the first significant byte of the next token.
case TOKENIZER_STATE_START:
switch (self->buffer[self->index]) {
case 0:
// Real EOF only at buffer_len; an embedded NUL is invalid input.
if (self->index == self->buffer_len) {
return (TokenizerToken) {
.tag = TOKEN_EOF,
.loc = {
.start = self->index,
.end = self->index,
}
};
} else {
state = TOKENIZER_STATE_INVALID;
goto state;
}
// Skip whitespace and restart the token after it.
case ' ':
case '\n':
case '\t':
case '\r':
self->index++;
result.loc.start = self->index;
goto state;
case '"':
result.tag = TOKEN_STRING_LITERAL;
state = TOKENIZER_STATE_STRING_LITERAL;
goto state;
case '\'':
result.tag = TOKEN_CHAR_LITERAL;
state = TOKENIZER_STATE_CHAR_LITERAL;
goto state;
case 'a' ... 'z':
case 'A' ... 'Z':
case '_':
result.tag = TOKEN_IDENTIFIER;
state = TOKENIZER_STATE_IDENTIFIER;
goto state;
case '@':
state = TOKENIZER_STATE_SAW_AT_SIGN;
goto state;
case '=':
state = TOKENIZER_STATE_EQUAL;
goto state;
case '!':
state = TOKENIZER_STATE_BANG;
goto state;
case '|':
state = TOKENIZER_STATE_PIPE;
goto state;
// Single-byte tokens are emitted immediately.
case '(':
result.tag = TOKEN_L_PAREN;
self->index++;
break;
case ')':
result.tag = TOKEN_R_PAREN;
self->index++;
break;
case '[':
result.tag = TOKEN_L_BRACKET;
self->index++;
break;
case ']':
result.tag = TOKEN_R_BRACKET;
self->index++;
break;
case ';':
result.tag = TOKEN_SEMICOLON;
self->index++;
break;
case ',':
result.tag = TOKEN_COMMA;
self->index++;
break;
case '?':
result.tag = TOKEN_QUESTION_MARK;
self->index++;
break;
case ':':
result.tag = TOKEN_COLON;
self->index++;
break;
case '%':
state = TOKENIZER_STATE_PERCENT;
goto state;
case '*':
state = TOKENIZER_STATE_ASTERISK;
goto state;
case '+':
state = TOKENIZER_STATE_PLUS;
goto state;
case '<':
state = TOKENIZER_STATE_ANGLE_BRACKET_LEFT;
goto state;
case '>':
state = TOKENIZER_STATE_ANGLE_BRACKET_RIGHT;
goto state;
case '^':
state = TOKENIZER_STATE_CARET;
goto state;
case '\\':
// Backslash starts a multiline string line (\\...); tag is set
// optimistically and downgraded to TOKEN_INVALID if malformed.
result.tag = TOKEN_MULTILINE_STRING_LITERAL_LINE;
state = TOKENIZER_STATE_BACKSLASH;
goto state;
case '{':
result.tag = TOKEN_L_BRACE;
self->index++;
break;
case '}':
result.tag = TOKEN_R_BRACE;
self->index++;
break;
case '~':
result.tag = TOKEN_TILDE;
self->index++;
break;
case '.':
state = TOKENIZER_STATE_PERIOD;
goto state;
case '-':
state = TOKENIZER_STATE_MINUS;
goto state;
case '/':
state = TOKENIZER_STATE_SLASH;
goto state;
case '&':
state = TOKENIZER_STATE_AMPERSAND;
goto state;
case '0' ... '9':
result.tag = TOKEN_NUMBER_LITERAL;
self->index++;
state = TOKENIZER_STATE_INT;
goto state;
default:
state = TOKENIZER_STATE_INVALID;
goto state;
};
break;
// A '\r' was seen where a line must end; it must be followed by '\n',
// otherwise the surrounding token becomes invalid.
case TOKENIZER_STATE_EXPECT_NEWLINE:
self->index++;
switch (self->buffer[self->index]) {
case 0:
if (self->index == self->buffer_len) {
result.tag = TOKEN_INVALID;
} else {
state = TOKENIZER_STATE_INVALID;
goto state;
}
break;
case '\n':
// CRLF accepted: restart tokenizing on the next line.
self->index++;
result.loc.start = self->index;
state = TOKENIZER_STATE_START;
goto state;
default:
state = TOKENIZER_STATE_INVALID;
goto state;
}
break;
// Swallow bytes up to newline/EOF so one TOKEN_INVALID covers the junk.
case TOKENIZER_STATE_INVALID:
self->index++;
switch (self->buffer[self->index]) {
case 0:
if (self->index == self->buffer_len) {
result.tag = TOKEN_INVALID;
} else {
state = TOKENIZER_STATE_INVALID;
goto state;
}
break;
case '\n':
result.tag = TOKEN_INVALID;
break;
default:
state = TOKENIZER_STATE_INVALID;
goto state;
}
break;
// '@' starts either an @"quoted identifier" or a @builtin name.
case TOKENIZER_STATE_SAW_AT_SIGN:
self->index++;
switch (self->buffer[self->index]) {
case 0:
case '\n':
result.tag = TOKEN_INVALID;
break;
case '"':
// @"..." — a quoted identifier, scanned like a string literal.
result.tag = TOKEN_IDENTIFIER;
state = TOKENIZER_STATE_STRING_LITERAL;
goto state;
case 'a' ... 'z':
case 'A' ... 'Z':
case '_':
result.tag = TOKEN_BUILTIN;
state = TOKENIZER_STATE_BUILTIN;
goto state;
default:
state = TOKENIZER_STATE_INVALID;
goto state;
}
break;
case TOKENIZER_STATE_AMPERSAND:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_AMPERSAND_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_AMPERSAND;
break;
}
break;
case TOKENIZER_STATE_ASTERISK:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_ASTERISK_EQUAL;
self->index++;
break;
case '*':
result.tag = TOKEN_ASTERISK_ASTERISK;
self->index++;
break;
case '%':
state = TOKENIZER_STATE_ASTERISK_PERCENT;
goto state;
case '|':
state = TOKENIZER_STATE_ASTERISK_PIPE;
goto state;
default:
result.tag = TOKEN_ASTERISK;
break;
}
break;
case TOKENIZER_STATE_ASTERISK_PERCENT:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_ASTERISK_PERCENT_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_ASTERISK_PERCENT;
break;
}
break;
case TOKENIZER_STATE_ASTERISK_PIPE:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_ASTERISK_PIPE_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_ASTERISK_PIPE;
break;
}
break;
case TOKENIZER_STATE_PERCENT:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_PERCENT_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_PERCENT;
break;
}
break;
case TOKENIZER_STATE_PLUS:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_PLUS_EQUAL;
self->index++;
break;
case '+':
result.tag = TOKEN_PLUS_PLUS;
self->index++;
break;
case '%':
state = TOKENIZER_STATE_PLUS_PERCENT;
goto state;
case '|':
state = TOKENIZER_STATE_PLUS_PIPE;
goto state;
default:
result.tag = TOKEN_PLUS;
break;
}
break;
case TOKENIZER_STATE_PLUS_PERCENT:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_PLUS_PERCENT_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_PLUS_PERCENT;
break;
}
break;
case TOKENIZER_STATE_PLUS_PIPE:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_PLUS_PIPE_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_PLUS_PIPE;
break;
}
break;
case TOKENIZER_STATE_CARET:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_CARET_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_CARET;
break;
}
break;
case TOKENIZER_STATE_IDENTIFIER:
self->index++;
switch (self->buffer[self->index]) {
case 'a' ... 'z':
case 'A' ... 'Z':
case '_':
case '0' ... '9':
state = TOKENIZER_STATE_IDENTIFIER;
goto state;
default:; // Once we're at C23, this semicolon can be removed.
// End of the identifier run: check whether it is a keyword.
const char* start = self->buffer + result.loc.start;
uint32_t len = self->index - result.loc.start;
TokenizerTag tag = getKeyword(start, len);
if (tag != TOKEN_INVALID)
result.tag = tag;
}
break;
// @builtin name: consume the identifier characters after '@'.
case TOKENIZER_STATE_BUILTIN:
self->index++;
switch (self->buffer[self->index]) {
case 'a' ... 'z':
case 'A' ... 'Z':
case '_':
case '0' ... '9':
state = TOKENIZER_STATE_BUILTIN;
goto state;
break;
}
break;
// A single '\' was seen; only "\\" (multiline string line) is legal.
case TOKENIZER_STATE_BACKSLASH:
self->index++;
switch (self->buffer[self->index]) {
case 0:
result.tag = TOKEN_INVALID;
break;
case '\\':
state = TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE;
goto state;
case '\n':
result.tag = TOKEN_INVALID;
break;
default:
state = TOKENIZER_STATE_INVALID;
goto state;
}
break;
case TOKENIZER_STATE_STRING_LITERAL:
self->index++;
switch (self->buffer[self->index]) {
case 0:
// Unterminated literal: invalid at true EOF, or embedded NUL.
if (self->index != self->buffer_len) {
state = TOKENIZER_STATE_INVALID;
goto state;
} else {
result.tag = TOKEN_INVALID;
}
break;
case '\n':
// String literals may not span lines.
result.tag = TOKEN_INVALID;
break;
case '\\':
state = TOKENIZER_STATE_STRING_LITERAL_BACKSLASH;
goto state;
case '"':
// Closing quote: consume it and finish the token.
self->index++;
break;
case 0x01 ... 0x09:
case 0x0b ... 0x1f:
case 0x7f:
// Raw control characters are not allowed inside string literals.
state = TOKENIZER_STATE_INVALID;
goto state;
default:
state = TOKENIZER_STATE_STRING_LITERAL;
goto state;
}
break;
// Inside a string literal after '\': accept any escaped byte except
// NUL/newline (escape validity is presumably checked by a later pass).
case TOKENIZER_STATE_STRING_LITERAL_BACKSLASH:
self->index++;
switch (self->buffer[self->index]) {
case 0:
case '\n':
result.tag = TOKEN_INVALID;
break;
default:
state = TOKENIZER_STATE_STRING_LITERAL;
goto state;
}
break;
case TOKENIZER_STATE_CHAR_LITERAL:
self->index++;
switch (self->buffer[self->index]) {
case 0:
if (self->index != self->buffer_len) {
state = TOKENIZER_STATE_INVALID;
goto state;
} else {
result.tag = TOKEN_INVALID;
}
break;
case '\n':
result.tag = TOKEN_INVALID;
break;
case '\\':
state = TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH;
goto state;
case '\'':
// Closing quote: consume it and finish the token.
self->index++;
break;
case 0x01 ... 0x09:
case 0x0b ... 0x1f:
case 0x7f:
state = TOKENIZER_STATE_INVALID;
goto state;
default:
state = TOKENIZER_STATE_CHAR_LITERAL;
goto state;
}
break;
case TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH:
self->index++;
switch (self->buffer[self->index]) {
case 0:
if (self->index != self->buffer_len) {
state = TOKENIZER_STATE_INVALID;
goto state;
} else {
result.tag = TOKEN_INVALID;
}
break;
case '\n':
result.tag = TOKEN_INVALID;
break;
case 0x01 ... 0x09:
case 0x0b ... 0x1f:
case 0x7f:
state = TOKENIZER_STATE_INVALID;
goto state;
default:
state = TOKENIZER_STATE_CHAR_LITERAL;
goto state;
}
break;
// One \\-prefixed line of a multiline string; the token runs to the
// end of the line (the '\n'/'\r' itself is not consumed here).
case TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE:
self->index++;
switch (self->buffer[self->index]) {
case 0:
if (self->index != self->buffer_len) {
state = TOKENIZER_STATE_INVALID;
goto state;
}
break;
case '\n':
break;
case '\r':
// Bare '\r' without a following '\n' is invalid.
if (self->buffer[self->index + 1] != '\n') {
state = TOKENIZER_STATE_INVALID;
goto state;
}
break;
case 0x01 ... 0x09:
case 0x0b ... 0x0c:
case 0x0e ... 0x1f:
case 0x7f:
state = TOKENIZER_STATE_INVALID;
goto state;
default:
state = TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE;
goto state;
}
break;
case TOKENIZER_STATE_BANG:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_BANG_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_BANG;
break;
}
break;
case TOKENIZER_STATE_PIPE:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_PIPE_EQUAL;
self->index++;
break;
case '|':
result.tag = TOKEN_PIPE_PIPE;
self->index++;
break;
default:
result.tag = TOKEN_PIPE;
break;
}
break;
case TOKENIZER_STATE_EQUAL:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_EQUAL_EQUAL;
self->index++;
break;
case '>':
result.tag = TOKEN_EQUAL_ANGLE_BRACKET_RIGHT;
self->index++;
break;
default:
result.tag = TOKEN_EQUAL;
break;
}
break;
case TOKENIZER_STATE_MINUS:
self->index++;
switch (self->buffer[self->index]) {
case '>':
result.tag = TOKEN_ARROW;
self->index++;
break;
case '=':
result.tag = TOKEN_MINUS_EQUAL;
self->index++;
break;
case '%':
state = TOKENIZER_STATE_MINUS_PERCENT;
goto state;
case '|':
state = TOKENIZER_STATE_MINUS_PIPE;
goto state;
default:
result.tag = TOKEN_MINUS;
break;
}
break;
case TOKENIZER_STATE_MINUS_PERCENT:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_MINUS_PERCENT_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_MINUS_PERCENT;
break;
}
break;
case TOKENIZER_STATE_MINUS_PIPE:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_MINUS_PIPE_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_MINUS_PIPE;
break;
}
break;
case TOKENIZER_STATE_ANGLE_BRACKET_LEFT:
self->index++;
switch (self->buffer[self->index]) {
case '<':
state = TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT;
goto state;
case '=':
result.tag = TOKEN_ANGLE_BRACKET_LEFT_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_ANGLE_BRACKET_LEFT;
break;
}
break;
case TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL;
self->index++;
break;
case '|':
state = TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE;
goto state;
default:
result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT;
break;
}
break;
case TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE;
break;
}
break;
case TOKENIZER_STATE_ANGLE_BRACKET_RIGHT:
self->index++;
switch (self->buffer[self->index]) {
case '>':
state = TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT;
goto state;
case '=':
result.tag = TOKEN_ANGLE_BRACKET_RIGHT_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_ANGLE_BRACKET_RIGHT;
break;
}
break;
case TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT:
self->index++;
switch (self->buffer[self->index]) {
case '=':
result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT;
break;
}
break;
case TOKENIZER_STATE_PERIOD:
self->index++;
switch (self->buffer[self->index]) {
case '.':
state = TOKENIZER_STATE_PERIOD_2;
goto state;
case '*':
state = TOKENIZER_STATE_PERIOD_ASTERISK;
goto state;
default:
result.tag = TOKEN_PERIOD;
break;
}
break;
case TOKENIZER_STATE_PERIOD_2:
self->index++;
switch (self->buffer[self->index]) {
case '.':
result.tag = TOKEN_ELLIPSIS3;
self->index++;
break;
default:
result.tag = TOKEN_ELLIPSIS2;
break;
}
break;
case TOKENIZER_STATE_PERIOD_ASTERISK:
self->index++;
switch (self->buffer[self->index]) {
case '*':
// ".**" is a dedicated invalid token; the second '*' is left
// unconsumed for the error reporter.
result.tag = TOKEN_INVALID_PERIODASTERISKS;
break;
default:
result.tag = TOKEN_PERIOD_ASTERISK;
break;
}
break;
case TOKENIZER_STATE_SLASH:
self->index++;
switch (self->buffer[self->index]) {
case '/':
state = TOKENIZER_STATE_LINE_COMMENT_START;
goto state;
case '=':
result.tag = TOKEN_SLASH_EQUAL;
self->index++;
break;
default:
result.tag = TOKEN_SLASH;
break;
}
break;
// Just consumed "//": decide between plain comment, doc comment (///)
// and container doc comment (//!).
case TOKENIZER_STATE_LINE_COMMENT_START:
self->index++;
switch (self->buffer[self->index]) {
case 0:
if (self->index != self->buffer_len) {
state = TOKENIZER_STATE_INVALID;
goto state;
} else {
return (TokenizerToken) {
.tag = TOKEN_EOF,
.loc = {
.start = self->index,
.end = self->index,
}
};
}
break;
case '!':
result.tag = TOKEN_CONTAINER_DOC_COMMENT;
state = TOKENIZER_STATE_DOC_COMMENT;
goto state;
case '\n':
// Empty plain comment: skip it and resume tokenizing.
self->index++;
result.loc.start = self->index;
state = TOKENIZER_STATE_START;
goto state;
case '/':
state = TOKENIZER_STATE_DOC_COMMENT_START;
goto state;
case '\r':
state = TOKENIZER_STATE_EXPECT_NEWLINE;
goto state;
case 0x01 ... 0x09:
case 0x0b ... 0x0c:
case 0x0e ... 0x1f:
case 0x7f:
state = TOKENIZER_STATE_INVALID;
goto state;
default:
state = TOKENIZER_STATE_LINE_COMMENT;
goto state;
}
break;
// Just consumed "///": "////" (and longer) degrades to a plain comment.
case TOKENIZER_STATE_DOC_COMMENT_START:
self->index++;
switch (self->buffer[self->index]) {
case 0:
case '\n':
result.tag = TOKEN_DOC_COMMENT;
break;
case '\r':
if (self->buffer[self->index + 1] == '\n') {
result.tag = TOKEN_DOC_COMMENT;
} else {
state = TOKENIZER_STATE_INVALID;
goto state;
}
break;
case '/':
state = TOKENIZER_STATE_LINE_COMMENT;
goto state;
case 0x01 ... 0x09:
case 0x0b ... 0x0c:
case 0x0e ... 0x1f:
case 0x7f:
state = TOKENIZER_STATE_INVALID;
goto state;
default:
result.tag = TOKEN_DOC_COMMENT;
state = TOKENIZER_STATE_DOC_COMMENT;
goto state;
}
break;
// Plain comments produce no token: skip to end of line, then restart.
case TOKENIZER_STATE_LINE_COMMENT:
self->index++;
switch (self->buffer[self->index]) {
case 0:
if (self->index != self->buffer_len) {
state = TOKENIZER_STATE_INVALID;
goto state;
} else {
return (TokenizerToken) {
.tag = TOKEN_EOF,
.loc = {
.start = self->index,
.end = self->index,
}
};
}
break;
case '\n':
self->index++;
result.loc.start = self->index;
state = TOKENIZER_STATE_START;
goto state;
case '\r':
state = TOKENIZER_STATE_EXPECT_NEWLINE;
goto state;
case 0x01 ... 0x09:
case 0x0b ... 0x0c:
case 0x0e ... 0x1f:
case 0x7f:
state = TOKENIZER_STATE_INVALID;
goto state;
default:
state = TOKENIZER_STATE_LINE_COMMENT;
goto state;
}
break;
// Doc comments ARE tokens: consume to end of line, keep the set tag.
case TOKENIZER_STATE_DOC_COMMENT:
self->index++;
switch (self->buffer[self->index]) {
case 0:
case '\n':
break;
case '\r':
if (self->buffer[self->index + 1] != '\n') {
state = TOKENIZER_STATE_INVALID;
goto state;
}
break;
case 0x01 ... 0x09:
case 0x0b ... 0x0c:
case 0x0e ... 0x1f:
case 0x7f:
state = TOKENIZER_STATE_INVALID;
goto state;
default:
state = TOKENIZER_STATE_DOC_COMMENT;
goto state;
}
break;
// Number literal scanning is deliberately permissive: any run of
// letters/digits/underscores is consumed and presumably validated by a
// later pass. Note: index already points past the accepted character
// on entry, so this state inspects BEFORE advancing.
case TOKENIZER_STATE_INT:
switch (self->buffer[self->index]) {
case '.':
state = TOKENIZER_STATE_INT_PERIOD;
goto state;
case '_':
case 'a' ... 'd':
case 'f' ... 'o':
case 'q' ... 'z':
case 'A' ... 'D':
case 'F' ... 'O':
case 'Q' ... 'Z':
case '0' ... '9':
self->index++;
state = TOKENIZER_STATE_INT;
goto state;
case 'e':
case 'E':
case 'p':
case 'P':
// Possible exponent marker (e/E for decimal, p/P for hex floats).
state = TOKENIZER_STATE_INT_EXPONENT;
goto state;
default:
break;
}
break;
case TOKENIZER_STATE_INT_EXPONENT:
self->index++;
switch (self->buffer[self->index]) {
case '-':
case '+':
// Signed exponent makes this a float literal.
self->index++;
state = TOKENIZER_STATE_FLOAT;
goto state;
default:
state = TOKENIZER_STATE_INT;
goto state;
}
break;
case TOKENIZER_STATE_INT_PERIOD:
self->index++;
switch (self->buffer[self->index]) {
case '_':
case 'a' ... 'd':
case 'f' ... 'o':
case 'q' ... 'z':
case 'A' ... 'D':
case 'F' ... 'O':
case 'Q' ... 'Z':
case '0' ... '9':
self->index++;
state = TOKENIZER_STATE_FLOAT;
goto state;
case 'e':
case 'E':
case 'p':
case 'P':
state = TOKENIZER_STATE_FLOAT_EXPONENT;
goto state;
default:
// Not a float continuation (e.g. "1.method"): back up so the
// '.' is re-tokenized as its own token.
self->index--;
break;
}
break;
// Like TOKENIZER_STATE_INT, this state inspects before advancing.
case TOKENIZER_STATE_FLOAT:
switch (self->buffer[self->index]) {
case '_':
case 'a' ... 'd':
case 'f' ... 'o':
case 'q' ... 'z':
case 'A' ... 'D':
case 'F' ... 'O':
case 'Q' ... 'Z':
case '0' ... '9':
self->index++;
state = TOKENIZER_STATE_FLOAT;
goto state;
case 'e':
case 'E':
case 'p':
case 'P':
state = TOKENIZER_STATE_FLOAT_EXPONENT;
goto state;
default:
break;
}
break;
case TOKENIZER_STATE_FLOAT_EXPONENT:
self->index++;
switch (self->buffer[self->index]) {
case '-':
case '+':
self->index++;
state = TOKENIZER_STATE_FLOAT;
goto state;
default:
state = TOKENIZER_STATE_FLOAT;
goto state;
}
break;
}
// All `break` exits converge here: close the token's byte range.
result.loc.end = self->index;
return result;
}