// Zig-language tokenizer implemented in C as a goto-driven state machine.
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
|
|
#include "tokenizer.h"
|
|
|
|
// One entry of the keyword lookup table: maps a keyword's spelling to the
// token tag emitted when an identifier exactly matches it.
typedef struct {
    const char* keyword; // NUL-terminated keyword spelling
    tokenizer_tag tag;   // tag returned by get_keyword() on an exact match
} keyword_map;
|
|
|
|
// Keyword table. MUST stay sorted lexicographically by `keyword`:
// get_keyword() relies on the ordering to stop scanning early.
const keyword_map keywords[] = {
    { "addrspace", TOKENIZER_TAG_KEYWORD_ADDRSPACE },
    { "align", TOKENIZER_TAG_KEYWORD_ALIGN },
    { "allowzero", TOKENIZER_TAG_KEYWORD_ALLOWZERO },
    { "and", TOKENIZER_TAG_KEYWORD_AND },
    { "anyframe", TOKENIZER_TAG_KEYWORD_ANYFRAME },
    { "anytype", TOKENIZER_TAG_KEYWORD_ANYTYPE },
    { "asm", TOKENIZER_TAG_KEYWORD_ASM },
    { "async", TOKENIZER_TAG_KEYWORD_ASYNC },
    { "await", TOKENIZER_TAG_KEYWORD_AWAIT },
    { "break", TOKENIZER_TAG_KEYWORD_BREAK },
    { "callconv", TOKENIZER_TAG_KEYWORD_CALLCONV },
    { "catch", TOKENIZER_TAG_KEYWORD_CATCH },
    { "comptime", TOKENIZER_TAG_KEYWORD_COMPTIME },
    { "const", TOKENIZER_TAG_KEYWORD_CONST },
    { "continue", TOKENIZER_TAG_KEYWORD_CONTINUE },
    { "defer", TOKENIZER_TAG_KEYWORD_DEFER },
    { "else", TOKENIZER_TAG_KEYWORD_ELSE },
    { "enum", TOKENIZER_TAG_KEYWORD_ENUM },
    { "errdefer", TOKENIZER_TAG_KEYWORD_ERRDEFER },
    { "error", TOKENIZER_TAG_KEYWORD_ERROR },
    { "export", TOKENIZER_TAG_KEYWORD_EXPORT },
    { "extern", TOKENIZER_TAG_KEYWORD_EXTERN },
    { "fn", TOKENIZER_TAG_KEYWORD_FN },
    { "for", TOKENIZER_TAG_KEYWORD_FOR },
    { "if", TOKENIZER_TAG_KEYWORD_IF },
    { "inline", TOKENIZER_TAG_KEYWORD_INLINE },
    { "linksection", TOKENIZER_TAG_KEYWORD_LINKSECTION },
    { "noalias", TOKENIZER_TAG_KEYWORD_NOALIAS },
    { "noinline", TOKENIZER_TAG_KEYWORD_NOINLINE },
    { "nosuspend", TOKENIZER_TAG_KEYWORD_NOSUSPEND },
    { "opaque", TOKENIZER_TAG_KEYWORD_OPAQUE },
    { "or", TOKENIZER_TAG_KEYWORD_OR },
    { "orelse", TOKENIZER_TAG_KEYWORD_ORELSE },
    { "packed", TOKENIZER_TAG_KEYWORD_PACKED },
    { "pub", TOKENIZER_TAG_KEYWORD_PUB },
    { "resume", TOKENIZER_TAG_KEYWORD_RESUME },
    { "return", TOKENIZER_TAG_KEYWORD_RETURN },
    { "struct", TOKENIZER_TAG_KEYWORD_STRUCT },
    { "suspend", TOKENIZER_TAG_KEYWORD_SUSPEND },
    { "switch", TOKENIZER_TAG_KEYWORD_SWITCH },
    { "test", TOKENIZER_TAG_KEYWORD_TEST },
    { "threadlocal", TOKENIZER_TAG_KEYWORD_THREADLOCAL },
    { "try", TOKENIZER_TAG_KEYWORD_TRY },
    { "union", TOKENIZER_TAG_KEYWORD_UNION },
    { "unreachable", TOKENIZER_TAG_KEYWORD_UNREACHABLE },
    { "usingnamespace", TOKENIZER_TAG_KEYWORD_USINGNAMESPACE },
    { "var", TOKENIZER_TAG_KEYWORD_VAR },
    { "volatile", TOKENIZER_TAG_KEYWORD_VOLATILE },
    { "while", TOKENIZER_TAG_KEYWORD_WHILE }
};
|
|
|
|
// TODO binary search
|
|
static tokenizer_tag get_keyword(const char* bytes, uint32_t len)
|
|
{
|
|
for (unsigned long i = 0; i < sizeof(keywords) / sizeof(keyword_map); i++) {
|
|
size_t klen = strlen(keywords[i].keyword);
|
|
size_t minlen = klen < len ? klen : len;
|
|
int cmp = strncmp(bytes, keywords[i].keyword, minlen);
|
|
if (cmp == 0) {
|
|
if (len == klen) {
|
|
return keywords[i].tag;
|
|
} else {
|
|
return TOKENIZER_TAG_INVALID;
|
|
}
|
|
} else if (cmp < 0) {
|
|
return TOKENIZER_TAG_INVALID;
|
|
}
|
|
continue;
|
|
}
|
|
return TOKENIZER_TAG_INVALID;
|
|
}
|
|
|
|
tokenizer tokenizer_init(const char* buffer, uint32_t len)
|
|
{
|
|
return (tokenizer) {
|
|
.buffer = buffer,
|
|
.buffer_len = len,
|
|
.index = (len >= 3 && memcmp(buffer, "\xEF\xBB\xBF", 3) == 0) ? 3 : 0,
|
|
};
|
|
}
|
|
|
|
// Scans and returns the next token starting at self->index.
//
// Implementation: a state machine driven by `goto state`, mirroring a
// labelled-switch tokenizer. Each case inspects self->buffer[self->index];
// most non-start states advance the index first because the character that
// selected the state has already been inspected but not consumed. The
// buffer is expected to carry a 0 sentinel at buffer[buffer_len]: a 0 byte
// at exactly buffer_len means end of input, while a 0 byte anywhere earlier
// is an embedded NUL and is tokenized as TOKENIZER_TAG_INVALID.
//
// On exit, result.loc covers [start, end) in the buffer and self->index is
// left at the first byte after the token, so repeated calls walk the input.
tokenizer_token tokenizer_next(tokenizer* self)
{
    tokenizer_token result = (tokenizer_token) {
        .tag = TOKENIZER_TAG_INVALID,
        .loc = {
            .start = 0,
        },
    };

    tokenizer_state state = TOKENIZER_STATE_START;

state:
    switch (state) {
    // Dispatch on the first character of the token; whitespace is skipped
    // here by bumping both index and loc.start.
    case TOKENIZER_STATE_START:
        switch (self->buffer[self->index]) {
        case 0:
            // 0 at buffer_len is EOF; an embedded NUL is an invalid token.
            if (self->index == self->buffer_len) {
                return (tokenizer_token) {
                    .tag = TOKENIZER_TAG_EOF,
                    .loc = {
                        .start = self->index,
                        .end = self->index,
                    }
                };
            } else {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
        case ' ':
        case '\n':
        case '\t':
        case '\r':
            self->index++;
            result.loc.start = self->index;
            goto state;
        case '"':
            result.tag = TOKENIZER_TAG_STRING_LITERAL;
            state = TOKENIZER_STATE_STRING_LITERAL;
            goto state;
        case '\'':
            result.tag = TOKENIZER_TAG_CHAR_LITERAL;
            state = TOKENIZER_STATE_CHAR_LITERAL;
            goto state;
        case 'a' ... 'z':
        case 'A' ... 'Z':
        case '_':
            result.tag = TOKENIZER_TAG_IDENTIFIER;
            state = TOKENIZER_STATE_IDENTIFIER;
            goto state;
        case '@':
            // Could begin @builtin or @"quoted identifier".
            state = TOKENIZER_STATE_SAW_AT_SIGN;
            goto state;
        case '=':
            state = TOKENIZER_STATE_EQUAL;
            goto state;
        case '!':
            state = TOKENIZER_STATE_BANG;
            goto state;
        case '|':
            state = TOKENIZER_STATE_PIPE;
            goto state;
        case '(':
            result.tag = TOKENIZER_TAG_L_PAREN;
            self->index++;
            break;
        case ')':
            result.tag = TOKENIZER_TAG_R_PAREN;
            self->index++;
            break;
        case '[':
            result.tag = TOKENIZER_TAG_L_BRACKET;
            self->index++;
            break;
        case ']':
            result.tag = TOKENIZER_TAG_R_BRACKET;
            self->index++;
            break;
        case ';':
            result.tag = TOKENIZER_TAG_SEMICOLON;
            self->index++;
            break;
        case ',':
            result.tag = TOKENIZER_TAG_COMMA;
            self->index++;
            break;
        case '?':
            result.tag = TOKENIZER_TAG_QUESTION_MARK;
            self->index++;
            break;
        case ':':
            result.tag = TOKENIZER_TAG_COLON;
            self->index++;
            break;
        case '%':
            state = TOKENIZER_STATE_PERCENT;
            goto state;
        case '*':
            state = TOKENIZER_STATE_ASTERISK;
            goto state;
        case '+':
            state = TOKENIZER_STATE_PLUS;
            goto state;
        case '<':
            state = TOKENIZER_STATE_ANGLE_BRACKET_LEFT;
            goto state;
        case '>':
            state = TOKENIZER_STATE_ANGLE_BRACKET_RIGHT;
            goto state;
        case '^':
            state = TOKENIZER_STATE_CARET;
            goto state;
        case '\\':
            // Backslash begins a multiline string line: \\...
            result.tag = TOKENIZER_TAG_MULTILINE_STRING_LITERAL_LINE;
            state = TOKENIZER_STATE_BACKSLASH;
            goto state;
        case '{':
            result.tag = TOKENIZER_TAG_L_BRACE;
            self->index++;
            break;
        case '}':
            result.tag = TOKENIZER_TAG_R_BRACE;
            self->index++;
            break;
        case '~':
            result.tag = TOKENIZER_TAG_TILDE;
            self->index++;
            break;
        case '.':
            state = TOKENIZER_STATE_PERIOD;
            goto state;
        case '-':
            state = TOKENIZER_STATE_MINUS;
            goto state;
        case '/':
            state = TOKENIZER_STATE_SLASH;
            goto state;
        case '&':
            state = TOKENIZER_STATE_AMPERSAND;
            goto state;
        case '0' ... '9':
            result.tag = TOKENIZER_TAG_NUMBER_LITERAL;
            self->index++;
            state = TOKENIZER_STATE_INT;
            goto state;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        };
        break;

    // After a lone '\r' (in a comment), only '\n' keeps the input valid.
    case TOKENIZER_STATE_EXPECT_NEWLINE:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index == self->buffer_len) {
                result.tag = TOKENIZER_TAG_INVALID;
            } else {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case '\n':
            // Valid CRLF: restart tokenizing on the next line.
            self->index++;
            result.loc.start = self->index;
            state = TOKENIZER_STATE_START;
            goto state;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        }
        break;

    // Consume bytes until end of line / end of input so the whole bad run
    // is reported as one INVALID token.
    case TOKENIZER_STATE_INVALID:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index == self->buffer_len) {
                result.tag = TOKENIZER_TAG_INVALID;
            } else {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case '\n':
            result.tag = TOKENIZER_TAG_INVALID;
            break;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        }
        break;

    // '@' seen: @"..." is a quoted identifier, @name is a builtin call.
    case TOKENIZER_STATE_SAW_AT_SIGN:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
        case '\n':
            result.tag = TOKENIZER_TAG_INVALID;
            break;
        case '"':
            result.tag = TOKENIZER_TAG_IDENTIFIER;
            state = TOKENIZER_STATE_STRING_LITERAL;
            goto state;
        case 'a' ... 'z':
        case 'A' ... 'Z':
        case '_':
            result.tag = TOKENIZER_TAG_BUILTIN;
            state = TOKENIZER_STATE_BUILTIN;
            goto state;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        }
        break;

    // '&' or '&='
    case TOKENIZER_STATE_AMPERSAND:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_AMPERSAND_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_AMPERSAND;
            break;
        }
        break;

    // '*', '*=', '**', '*%'..., '*|'...
    case TOKENIZER_STATE_ASTERISK:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_ASTERISK_EQUAL;
            self->index++;
            break;
        case '*':
            result.tag = TOKENIZER_TAG_ASTERISK_ASTERISK;
            self->index++;
            break;
        case '%':
            state = TOKENIZER_STATE_ASTERISK_PERCENT;
            goto state;
        case '|':
            state = TOKENIZER_STATE_ASTERISK_PIPE;
            goto state;
        default:
            result.tag = TOKENIZER_TAG_ASTERISK;
            break;
        }
        break;

    // '*%' or '*%='
    case TOKENIZER_STATE_ASTERISK_PERCENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_ASTERISK_PERCENT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_ASTERISK_PERCENT;
            break;
        }
        break;

    // '*|' or '*|='
    case TOKENIZER_STATE_ASTERISK_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_ASTERISK_PIPE_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_ASTERISK_PIPE;
            break;
        }
        break;

    // '%' or '%='
    case TOKENIZER_STATE_PERCENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_PERCENT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_PERCENT;
            break;
        }
        break;

    // '+', '+=', '++', '+%'..., '+|'...
    case TOKENIZER_STATE_PLUS:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_PLUS_EQUAL;
            self->index++;
            break;
        case '+':
            result.tag = TOKENIZER_TAG_PLUS_PLUS;
            self->index++;
            break;
        case '%':
            state = TOKENIZER_STATE_PLUS_PERCENT;
            goto state;
        case '|':
            state = TOKENIZER_STATE_PLUS_PIPE;
            goto state;
        default:
            result.tag = TOKENIZER_TAG_PLUS;
            break;
        }
        break;

    // '+%' or '+%='
    case TOKENIZER_STATE_PLUS_PERCENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_PLUS_PERCENT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_PLUS_PERCENT;
            break;
        }
        break;

    // '+|' or '+|='
    case TOKENIZER_STATE_PLUS_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_PLUS_PIPE_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_PLUS_PIPE;
            break;
        }
        break;

    // '^' or '^='
    case TOKENIZER_STATE_CARET:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_CARET_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_CARET;
            break;
        }
        break;

    // Consume identifier characters, then reclassify as a keyword if the
    // full lexeme matches an entry in the keywords table.
    case TOKENIZER_STATE_IDENTIFIER:
        self->index++;
        switch (self->buffer[self->index]) {
        case 'a' ... 'z':
        case 'A' ... 'Z':
        case '_':
        case '0' ... '9':
            state = TOKENIZER_STATE_IDENTIFIER;
            goto state;
        default:; // Once we're at C23, this semicolon can be removed.
            const char* start = self->buffer + result.loc.start;
            uint32_t len = self->index - result.loc.start;
            tokenizer_tag tag = get_keyword(start, len);
            if (tag != TOKENIZER_TAG_INVALID) {
                result.tag = tag;
            }
        }
        break;

    // Consume the name of an @builtin; any non-identifier byte ends it
    // (no default case needed — falling out of the switch ends the token).
    case TOKENIZER_STATE_BUILTIN:
        self->index++;
        switch (self->buffer[self->index]) {
        case 'a' ... 'z':
        case 'A' ... 'Z':
        case '_':
        case '0' ... '9':
            state = TOKENIZER_STATE_BUILTIN;
            goto state;
            break;
        }
        break;

    // '\' seen: only "\\" (multiline string line) is valid.
    case TOKENIZER_STATE_BACKSLASH:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            result.tag = TOKENIZER_TAG_INVALID;
            break;
        case '\\':
            state = TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE;
            goto state;
        case '\n':
            result.tag = TOKENIZER_TAG_INVALID;
            break;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        }
        break;

    // Inside "..."; control characters and bare newlines are rejected.
    case TOKENIZER_STATE_STRING_LITERAL:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                // Unterminated string at end of input.
                result.tag = TOKENIZER_TAG_INVALID;
            }
            break;
        case '\n':
            result.tag = TOKENIZER_TAG_INVALID;
            break;
        case '\\':
            state = TOKENIZER_STATE_STRING_LITERAL_BACKSLASH;
            goto state;
        case '"':
            self->index++;
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_STRING_LITERAL;
            goto state;
        }
        break;

    // After '\' inside a string: any byte except NUL/newline is accepted
    // as the escaped character (escape validity is not checked here).
    case TOKENIZER_STATE_STRING_LITERAL_BACKSLASH:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
        case '\n':
            result.tag = TOKENIZER_TAG_INVALID;
            break;
        default:
            state = TOKENIZER_STATE_STRING_LITERAL;
            goto state;
        }
        break;

    // Inside '...'; mirrors the string-literal state.
    case TOKENIZER_STATE_CHAR_LITERAL:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                result.tag = TOKENIZER_TAG_INVALID;
            }
            break;
        case '\n':
            result.tag = TOKENIZER_TAG_INVALID;
            break;
        case '\\':
            state = TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH;
            goto state;
        case '\'':
            self->index++;
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_CHAR_LITERAL;
            goto state;
        }
        break;

    // After '\' inside a char literal.
    case TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                result.tag = TOKENIZER_TAG_INVALID;
            }
            break;
        case '\n':
            result.tag = TOKENIZER_TAG_INVALID;
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_CHAR_LITERAL;
            goto state;
        }
        break;

    // One line of a \\-style multiline string; the token ends at the
    // newline (which is not consumed here) or at end of input.
    case TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case '\n':
            break;
        case '\r':
            // A bare '\r' not followed by '\n' is invalid.
            if (self->buffer[self->index + 1] != '\n') {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE;
            goto state;
        }
        break;

    // '!' or '!='
    case TOKENIZER_STATE_BANG:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_BANG_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_BANG;
            break;
        }
        break;

    // '|', '|=' or '||'
    case TOKENIZER_STATE_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_PIPE_EQUAL;
            self->index++;
            break;
        case '|':
            result.tag = TOKENIZER_TAG_PIPE_PIPE;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_PIPE;
            break;
        }
        break;

    // '=', '==' or '=>'
    case TOKENIZER_STATE_EQUAL:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_EQUAL_EQUAL;
            self->index++;
            break;
        case '>':
            result.tag = TOKENIZER_TAG_EQUAL_ANGLE_BRACKET_RIGHT;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_EQUAL;
            break;
        }
        break;

    // '-', '->', '-=', '-%'..., '-|'...
    case TOKENIZER_STATE_MINUS:
        self->index++;
        switch (self->buffer[self->index]) {
        case '>':
            result.tag = TOKENIZER_TAG_ARROW;
            self->index++;
            break;
        case '=':
            result.tag = TOKENIZER_TAG_MINUS_EQUAL;
            self->index++;
            break;
        case '%':
            state = TOKENIZER_STATE_MINUS_PERCENT;
            goto state;
        case '|':
            state = TOKENIZER_STATE_MINUS_PIPE;
            goto state;
        default:
            result.tag = TOKENIZER_TAG_MINUS;
            break;
        }
        break;

    // '-%' or '-%='
    case TOKENIZER_STATE_MINUS_PERCENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_MINUS_PERCENT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_MINUS_PERCENT;
            break;
        }
        break;

    // '-|' or '-|='
    case TOKENIZER_STATE_MINUS_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_MINUS_PIPE_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_MINUS_PIPE;
            break;
        }
        break;

    // '<', '<=', '<<'...
    case TOKENIZER_STATE_ANGLE_BRACKET_LEFT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '<':
            state = TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT;
            goto state;
        case '=':
            result.tag = TOKENIZER_TAG_ANGLE_BRACKET_LEFT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_ANGLE_BRACKET_LEFT;
            break;
        }
        break;

    // '<<', '<<=', '<<|'...
    case TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL;
            self->index++;
            break;
        case '|':
            state = TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE;
            goto state;
        default:
            result.tag = TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT;
            break;
        }
        break;

    // '<<|' or '<<|='
    case TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE;
            break;
        }
        break;

    // '>', '>=', '>>'...
    case TOKENIZER_STATE_ANGLE_BRACKET_RIGHT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '>':
            state = TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT;
            goto state;
        case '=':
            result.tag = TOKENIZER_TAG_ANGLE_BRACKET_RIGHT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_ANGLE_BRACKET_RIGHT;
            break;
        }
        break;

    // '>>' or '>>='
    case TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT;
            break;
        }
        break;

    // '.', '..'..., '.*'...
    case TOKENIZER_STATE_PERIOD:
        self->index++;
        switch (self->buffer[self->index]) {
        case '.':
            state = TOKENIZER_STATE_PERIOD_2;
            goto state;
        case '*':
            state = TOKENIZER_STATE_PERIOD_ASTERISK;
            goto state;
        default:
            result.tag = TOKENIZER_TAG_PERIOD;
            break;
        }
        break;

    // '..' or '...'
    case TOKENIZER_STATE_PERIOD_2:
        self->index++;
        switch (self->buffer[self->index]) {
        case '.':
            result.tag = TOKENIZER_TAG_ELLIPSIS3;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_ELLIPSIS2;
            break;
        }
        break;

    // '.*'; '.**' is explicitly flagged as invalid.
    case TOKENIZER_STATE_PERIOD_ASTERISK:
        self->index++;
        switch (self->buffer[self->index]) {
        case '*':
            result.tag = TOKENIZER_TAG_INVALID_PERIODASTERISKS;
            break;
        default:
            result.tag = TOKENIZER_TAG_PERIOD_ASTERISK;
            break;
        }
        break;

    // '/', '/=' or the start of a '//' comment.
    case TOKENIZER_STATE_SLASH:
        self->index++;
        switch (self->buffer[self->index]) {
        case '/':
            state = TOKENIZER_STATE_LINE_COMMENT_START;
            goto state;
        case '=':
            result.tag = TOKENIZER_TAG_SLASH_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKENIZER_TAG_SLASH;
            break;
        }
        break;

    // Just after "//": "//!" is a container doc comment, "///" may be a
    // doc comment; a plain line comment produces no token (restarts).
    case TOKENIZER_STATE_LINE_COMMENT_START:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                return (tokenizer_token) {
                    .tag = TOKENIZER_TAG_EOF,
                    .loc = {
                        .start = self->index,
                        .end = self->index }
                };
            }
            break;
        case '!':
            result.tag = TOKENIZER_TAG_CONTAINER_DOC_COMMENT;
            state = TOKENIZER_STATE_DOC_COMMENT;
            goto state;
        case '\n':
            // Empty line comment: skip it and restart tokenizing.
            self->index++;
            result.loc.start = self->index;
            state = TOKENIZER_STATE_START;
            goto state;
        case '/':
            state = TOKENIZER_STATE_DOC_COMMENT_START;
            goto state;
        case '\r':
            state = TOKENIZER_STATE_EXPECT_NEWLINE;
            goto state;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_LINE_COMMENT;
            goto state;
        }
        break;

    // Just after "///": "////" and longer is a normal comment; otherwise
    // a doc comment token.
    case TOKENIZER_STATE_DOC_COMMENT_START:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
        case '\n':
            result.tag = TOKENIZER_TAG_DOC_COMMENT;
            break;
        case '\r':
            if (self->buffer[self->index + 1] == '\n') {
                result.tag = TOKENIZER_TAG_DOC_COMMENT;
            } else {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case '/':
            state = TOKENIZER_STATE_LINE_COMMENT;
            goto state;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            result.tag = TOKENIZER_TAG_DOC_COMMENT;
            state = TOKENIZER_STATE_DOC_COMMENT;
            goto state;
        }
        break;

    // Skip a plain line comment to end of line, then restart; comments do
    // not yield tokens.
    case TOKENIZER_STATE_LINE_COMMENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                return (tokenizer_token) {
                    .tag = TOKENIZER_TAG_EOF,
                    .loc = {
                        .start = self->index,
                        .end = self->index }
                };
            }
            break;
        case '\n':
            self->index++;
            result.loc.start = self->index;
            state = TOKENIZER_STATE_START;
            goto state;
        case '\r':
            state = TOKENIZER_STATE_EXPECT_NEWLINE;
            goto state;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_LINE_COMMENT;
            goto state;
        }
        break;

    // Body of a doc / container-doc comment; token ends at newline or EOF.
    case TOKENIZER_STATE_DOC_COMMENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
        case '\n':
            break;
        case '\r':
            if (self->buffer[self->index + 1] != '\n') {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_DOC_COMMENT;
            goto state;
        }
        break;

    // Number literal, integer part. No index++ at entry: START (or the
    // exponent state) already advanced past the character that got us
    // here. Letters are consumed liberally (hex digits, radix prefixes,
    // malformed suffixes) and validated later by the parser.
    case TOKENIZER_STATE_INT:
        switch (self->buffer[self->index]) {
        case '.':
            state = TOKENIZER_STATE_INT_PERIOD;
            goto state;
        case '_':
        case 'a' ... 'd':
        case 'f' ... 'o':
        case 'q' ... 'z':
        case 'A' ... 'D':
        case 'F' ... 'O':
        case 'Q' ... 'Z':
        case '0' ... '9':
            self->index++;
            state = TOKENIZER_STATE_INT;
            goto state;
        case 'e':
        case 'E':
        case 'p':
        case 'P':
            // Possible exponent marker (decimal e/E, hex p/P).
            state = TOKENIZER_STATE_INT_EXPONENT;
            goto state;
        default:
            break;
        }
        break;

    // e/E/p/P seen inside an integer: a sign makes it a float exponent,
    // otherwise treat the letter as an ordinary (hex) digit.
    case TOKENIZER_STATE_INT_EXPONENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '-':
        case '+':
            self->index++;
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        default:
            state = TOKENIZER_STATE_INT;
            goto state;
        }
        break;

    // '.' seen inside an integer: digits make it a float; otherwise the
    // '.' is put back (index--) so it tokenizes separately (e.g. "1.foo").
    case TOKENIZER_STATE_INT_PERIOD:
        self->index++;
        switch (self->buffer[self->index]) {
        case '_':
        case 'a' ... 'd':
        case 'f' ... 'o':
        case 'q' ... 'z':
        case 'A' ... 'D':
        case 'F' ... 'O':
        case 'Q' ... 'Z':
        case '0' ... '9':
            self->index++;
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        case 'e':
        case 'E':
        case 'p':
        case 'P':
            state = TOKENIZER_STATE_FLOAT_EXPONENT;
            goto state;
        default:
            // Un-consume the '.' so it is tokenized on the next call.
            self->index--;
            break;
        }
        break;

    // Fractional part of a float; like INT, no index++ at entry.
    case TOKENIZER_STATE_FLOAT:
        switch (self->buffer[self->index]) {
        case '_':
        case 'a' ... 'd':
        case 'f' ... 'o':
        case 'q' ... 'z':
        case 'A' ... 'D':
        case 'F' ... 'O':
        case 'Q' ... 'Z':
        case '0' ... '9':
            self->index++;
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        case 'e':
        case 'E':
        case 'p':
        case 'P':
            state = TOKENIZER_STATE_FLOAT_EXPONENT;
            goto state;
        default:
            break;
        }
        break;

    // e/E/p/P seen inside a float: optionally consume one sign.
    case TOKENIZER_STATE_FLOAT_EXPONENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '-':
        case '+':
            self->index++;
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        default:
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        }
        break;
    }

    result.loc.end = self->index;

    return result;
}
|