From 6863e34fbcca71d4fc1c72dbf81b317c6afaebda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?=
Date: Fri, 13 Dec 2024 09:39:02 +0200
Subject: [PATCH] Tokenizer

---
 .gitignore         |    2 +
 build.zig          |   47 ++
 main.c             |   39 ++
 t/hello.zig        |    3 +
 test_all.zig       |    3 +
 tokenizer.c        | 1097 ++++++++++++++++++++++++++++++++++++++++++++
 tokenizer.h        |  196 ++++++++
 tokenizer_test.zig |  769 +++++++++++++++++++++++++++++++
 zig1.c             |   53 +++
 9 files changed, 2209 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 build.zig
 create mode 100644 main.c
 create mode 100644 t/hello.zig
 create mode 100644 test_all.zig
 create mode 100644 tokenizer.c
 create mode 100644 tokenizer.h
 create mode 100644 tokenizer_test.zig
 create mode 100644 zig1.c

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..94d7938
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/.zig-cache/
+*.o
diff --git a/build.zig b/build.zig
new file mode 100644
index 0000000..8e36628
--- /dev/null
+++ b/build.zig
@@ -0,0 +1,47 @@
+const std = @import("std");
+
+pub fn build(b: *std.Build) void {
+    const target = b.standardTargetOptions(.{});
+    const optimize = b.standardOptimizeOption(.{});
+
+    const lib = b.addStaticLibrary(.{
+        .name = "tokenizer",
+        .optimize = optimize,
+        .target = target,
+    });
+    lib.addCSourceFile(.{
+        .file = b.path("tokenizer.c"),
+        .flags = &[_][]const u8{
+            "-std=c11",
+            "-Wall",
+            "-Wvla",
+            "-Wextra",
+            "-Werror",
+            "-Wshadow",
+            "-Wswitch",
+            "-Walloca",
+            "-Wformat=2",
+            "-fno-common",
+            "-Wconversion",
+            "-Wswitch-enum",
+            "-Wuninitialized",
+            "-Wdouble-promotion",
+            "-fstack-protector-all",
+            "-Wimplicit-fallthrough",
+            //"-D_FORTIFY_SOURCE=2", // consider when optimization flags are enabled
+        },
+    });
+    lib.addIncludePath(b.path("."));
+    lib.linkLibC();
+
+    const test_step = b.step("test", "Run unit tests");
+
+    const test_exe = b.addTest(.{
+        .root_source_file = b.path("test_all.zig"),
+        .optimize = optimize,
+    });
+    test_exe.linkLibrary(lib);
+    test_exe.addIncludePath(b.path("."));
+
+    test_step.dependOn(&b.addRunArtifact(test_exe).step);
+}
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..7509364
--- /dev/null
+++ b/main.c
@@ -0,0 +1,39 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+int zig1_run(char* program, char** msg);
+int zig1_run_file(char* fname, char** msg);
+
+static void usage(char* argv0)
+{
+    fprintf(stderr, "Usage: %s program.zig\n", argv0);
+}
+
+int main(int argc, char** argv)
+{
+    if (argc != 2) {
+        usage(argv[0]);
+        return 1;
+    }
+
+    char* msg;
+    switch (zig1_run_file(argv[1], &msg)) {
+    case 0:
+        return 0;
+    case 1:
+        fprintf(stderr, "panic: %s\n", msg);
+        free(msg);
+        return 0;
+    case 2:
+        fprintf(stderr, "interpreter error: %s\n", msg);
+        free(msg);
+        return 1;
+    case 3:
+        return 1;
+    }
+    return 1;
+}
diff --git a/t/hello.zig b/t/hello.zig
new file mode 100644
index 0000000..c994c88
--- /dev/null
+++ b/t/hello.zig
@@ -0,0 +1,3 @@
+pub fn main() void {
+    @panic("hello");
+}
diff --git a/test_all.zig b/test_all.zig
new file mode 100644
index 0000000..2ca72aa
--- /dev/null
+++ b/test_all.zig
@@ -0,0 +1,3 @@
+test "zig1 test suite" {
+    _ = @import("tokenizer_test.zig");
+}
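Editor's note: tokenizer.c below is a hand port of Zig's tokenizer (lib/std/zig/tokenizer.zig at the commit named in its header comment). Zig's labeled `switch` with `continue :state` becomes a `state:` label plus `goto state` in C. The fragment below is not part of the patch; it is a minimal sketch of that dispatch pattern with hypothetical names, so the shape of tokenizer_next() is easier to follow:

    // Toy two-state scanner using the same label+goto dispatch as
    // tokenizer.c: jumping back to the label re-enters the switch,
    // which stands in for Zig's `continue :state`.
    typedef enum { TOY_STATE_START, TOY_STATE_WORD } toy_state;

    // Returns the index one past the first space-delimited word.
    static int toy_scan_word(const char* s)
    {
        int i = 0;
        toy_state state = TOY_STATE_START;
    state:
        switch (state) {
        case TOY_STATE_START:
            if (s[i] == ' ') {
                i++;
                goto state; // stay in START, like `continue :state .start`
            }
            state = TOY_STATE_WORD;
            goto state;
        case TOY_STATE_WORD:
            if (s[i] != 0 && s[i] != ' ') {
                i++;
                goto state;
            }
            break; // end of word: fall out of the dispatch
        }
        return i;
    }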
diff --git a/tokenizer.c b/tokenizer.c
new file mode 100644
index 0000000..90af352
--- /dev/null
+++ b/tokenizer.c
@@ -0,0 +1,1097 @@
+// tokenizer for zig d48611ba67c7871cb348f28a01b89d8771170dd8
+
+#include <stdint.h>
+#include <string.h>
+
+#include "tokenizer.h"
+
+typedef struct {
+    const char* keyword;
+    token_tag tag;
+} keyword_map;
+
+const keyword_map keywords[] = {
+    { "addrspace", TOKEN_TAG_KEYWORD_ADDRSPACE },
+    { "align", TOKEN_TAG_KEYWORD_ALIGN },
+    { "allowzero", TOKEN_TAG_KEYWORD_ALLOWZERO },
+    { "and", TOKEN_TAG_KEYWORD_AND },
+    { "anyframe", TOKEN_TAG_KEYWORD_ANYFRAME },
+    { "anytype", TOKEN_TAG_KEYWORD_ANYTYPE },
+    { "asm", TOKEN_TAG_KEYWORD_ASM },
+    { "async", TOKEN_TAG_KEYWORD_ASYNC },
+    { "await", TOKEN_TAG_KEYWORD_AWAIT },
+    { "break", TOKEN_TAG_KEYWORD_BREAK },
+    { "callconv", TOKEN_TAG_KEYWORD_CALLCONV },
+    { "catch", TOKEN_TAG_KEYWORD_CATCH },
+    { "comptime", TOKEN_TAG_KEYWORD_COMPTIME },
+    { "const", TOKEN_TAG_KEYWORD_CONST },
+    { "continue", TOKEN_TAG_KEYWORD_CONTINUE },
+    { "defer", TOKEN_TAG_KEYWORD_DEFER },
+    { "else", TOKEN_TAG_KEYWORD_ELSE },
+    { "enum", TOKEN_TAG_KEYWORD_ENUM },
+    { "errdefer", TOKEN_TAG_KEYWORD_ERRDEFER },
+    { "error", TOKEN_TAG_KEYWORD_ERROR },
+    { "export", TOKEN_TAG_KEYWORD_EXPORT },
+    { "extern", TOKEN_TAG_KEYWORD_EXTERN },
+    { "fn", TOKEN_TAG_KEYWORD_FN },
+    { "for", TOKEN_TAG_KEYWORD_FOR },
+    { "if", TOKEN_TAG_KEYWORD_IF },
+    { "inline", TOKEN_TAG_KEYWORD_INLINE },
+    { "linksection", TOKEN_TAG_KEYWORD_LINKSECTION },
+    { "noalias", TOKEN_TAG_KEYWORD_NOALIAS },
+    { "noinline", TOKEN_TAG_KEYWORD_NOINLINE },
+    { "nosuspend", TOKEN_TAG_KEYWORD_NOSUSPEND },
+    { "opaque", TOKEN_TAG_KEYWORD_OPAQUE },
+    { "or", TOKEN_TAG_KEYWORD_OR },
+    { "orelse", TOKEN_TAG_KEYWORD_ORELSE },
+    { "packed", TOKEN_TAG_KEYWORD_PACKED },
+    { "pub", TOKEN_TAG_KEYWORD_PUB },
+    { "resume", TOKEN_TAG_KEYWORD_RESUME },
+    { "return", TOKEN_TAG_KEYWORD_RETURN },
+    { "struct", TOKEN_TAG_KEYWORD_STRUCT },
+    { "suspend", TOKEN_TAG_KEYWORD_SUSPEND },
+    { "switch", TOKEN_TAG_KEYWORD_SWITCH },
+    { "test", TOKEN_TAG_KEYWORD_TEST },
+    { "threadlocal", TOKEN_TAG_KEYWORD_THREADLOCAL },
+    { "try", TOKEN_TAG_KEYWORD_TRY },
+    { "union", TOKEN_TAG_KEYWORD_UNION },
+    { "unreachable", TOKEN_TAG_KEYWORD_UNREACHABLE },
+    { "usingnamespace", TOKEN_TAG_KEYWORD_USINGNAMESPACE },
+    { "var", TOKEN_TAG_KEYWORD_VAR },
+    { "volatile", TOKEN_TAG_KEYWORD_VOLATILE },
+    { "while", TOKEN_TAG_KEYWORD_WHILE }
+};
+
+// TODO binary search
+static token_tag get_keyword(const char* bytes, uint32_t len)
+{
+    for (unsigned long i = 0; i < sizeof(keywords) / sizeof(keyword_map); i++) {
+        size_t klen = strlen(keywords[i].keyword);
+        size_t minlen = klen < len ? klen : len;
+        int cmp = strncmp(bytes, keywords[i].keyword, minlen);
+        if (cmp == 0) {
+            if (len == klen) {
+                return keywords[i].tag;
+            } else if (len < klen) {
+                // bytes is a proper prefix of this keyword; the table is
+                // sorted, so no later entry can match either.
+                return TOKEN_TAG_INVALID;
+            }
+            // len > klen: this keyword is a proper prefix of bytes
+            // ("or" vs "orelse"), so keep scanning.
+        } else if (cmp < 0) {
+            return TOKEN_TAG_INVALID;
+        }
+    }
+    return TOKEN_TAG_INVALID;
+}
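Editor's note on the "TODO binary search" above: the keywords table is sorted, so a textbook bisection works once string length is folded into the comparison. A sketch, not part of the patch, with hypothetical helper names:

    // Orders (bytes, len) against a NUL-terminated keyword the same way
    // strcmp would: equal prefixes tie-break on length.
    static int keyword_cmp(const char* bytes, uint32_t len, const char* keyword)
    {
        size_t klen = strlen(keyword);
        size_t minlen = klen < len ? klen : len;
        int cmp = strncmp(bytes, keyword, minlen);
        if (cmp != 0)
            return cmp;
        return (len > klen) - (len < klen);
    }

    static token_tag get_keyword_bsearch(const char* bytes, uint32_t len)
    {
        size_t lo = 0, hi = sizeof(keywords) / sizeof(keyword_map);
        while (lo < hi) {
            size_t mid = lo + (hi - lo) / 2;
            int cmp = keyword_cmp(bytes, len, keywords[mid].keyword);
            if (cmp == 0)
                return keywords[mid].tag;
            if (cmp < 0)
                hi = mid;
            else
                lo = mid + 1;
        }
        return TOKEN_TAG_INVALID;
    }

Because the tie-break makes "or" order before "orelse", the bisection needs no special casing for keywords that are prefixes of other keywords.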
+tokenizer tokenizer_init(const char* buffer, uint32_t len)
+{
+    return (tokenizer) {
+        .buffer = buffer,
+        .buffer_len = len,
+        .index = (len >= 3 && memcmp(buffer, "\xEF\xBB\xBF", 3) == 0) ? 3 : 0,
+    };
+}
+
+token tokenizer_next(tokenizer* self)
+{
+    token result = (token) {
+        .tag = TOKEN_TAG_INVALID,
+        .loc = {
+            .start = self->index,
+        },
+    };
+
+    token_state state = TOKEN_STATE_START;
+
+state:
+    switch (state) {
+    case TOKEN_STATE_START:
+        switch (self->buffer[self->index]) {
+        case 0:
+            if (self->index == self->buffer_len) {
+                return (token) {
+                    .tag = TOKEN_TAG_EOF,
+                    .loc = {
+                        .start = self->index,
+                        .end = self->index,
+                    }
+                };
+            } else {
+                state = TOKEN_STATE_INVALID;
+                goto state;
+            }
+        case ' ':
+        case '\n':
+        case '\t':
+        case '\r':
+            self->index++;
+            result.loc.start = self->index;
+            goto state;
+        case '"':
+            result.tag = TOKEN_TAG_STRING_LITERAL;
+            state = TOKEN_STATE_STRING_LITERAL;
+            goto state;
+        case '\'':
+            result.tag = TOKEN_TAG_CHAR_LITERAL;
+            state = TOKEN_STATE_CHAR_LITERAL;
+            goto state;
+        case 'a' ... 'z':
+        case 'A' ... 'Z':
+        case '_':
+            result.tag = TOKEN_TAG_IDENTIFIER;
+            state = TOKEN_STATE_IDENTIFIER;
+            goto state;
+        case '@':
+            state = TOKEN_STATE_SAW_AT_SIGN;
+            goto state;
+        case '=':
+            state = TOKEN_STATE_EQUAL;
+            goto state;
+        case '!':
+            state = TOKEN_STATE_BANG;
+            goto state;
+        case '|':
+            state = TOKEN_STATE_PIPE;
+            goto state;
+        case '(':
+            result.tag = TOKEN_TAG_L_PAREN;
+            self->index++;
+            break;
+        case ')':
+            result.tag = TOKEN_TAG_R_PAREN;
+            self->index++;
+            break;
+        case '[':
+            result.tag = TOKEN_TAG_L_BRACKET;
+            self->index++;
+            break;
+        case ']':
+            result.tag = TOKEN_TAG_R_BRACKET;
+            self->index++;
+            break;
+        case ';':
+            result.tag = TOKEN_TAG_SEMICOLON;
+            self->index++;
+            break;
+        case ',':
+            result.tag = TOKEN_TAG_COMMA;
+            self->index++;
+            break;
+        case '?':
+            result.tag = TOKEN_TAG_QUESTION_MARK;
+            self->index++;
+            break;
+        case ':':
+            result.tag = TOKEN_TAG_COLON;
+            self->index++;
+            break;
+        case '%':
+            state = TOKEN_STATE_PERCENT;
+            goto state;
+        case '*':
+            state = TOKEN_STATE_ASTERISK;
+            goto state;
+        case '+':
+            state = TOKEN_STATE_PLUS;
+            goto state;
+        case '<':
+            state = TOKEN_STATE_ANGLE_BRACKET_LEFT;
+            goto state;
+        case '>':
+            state = TOKEN_STATE_ANGLE_BRACKET_RIGHT;
+            goto state;
+        case '^':
+            state = TOKEN_STATE_CARET;
+            goto state;
+        case '\\':
+            result.tag = TOKEN_TAG_MULTILINE_STRING_LITERAL_LINE;
+            state = TOKEN_STATE_BACKSLASH;
+            goto state;
+        case '{':
+            result.tag = TOKEN_TAG_L_BRACE;
+            self->index++;
+            break;
+        case '}':
+            result.tag = TOKEN_TAG_R_BRACE;
+            self->index++;
+            break;
+        case '~':
+            result.tag = TOKEN_TAG_TILDE;
+            self->index++;
+            break;
+        case '.':
+            state = TOKEN_STATE_PERIOD;
+            goto state;
+        case '-':
+            state = TOKEN_STATE_MINUS;
+            goto state;
+        case '/':
+            state = TOKEN_STATE_SLASH;
+            goto state;
+        case '&':
+            state = TOKEN_STATE_AMPERSAND;
+            goto state;
+        case '0' ... 
'9': + result.tag = TOKEN_TAG_NUMBER_LITERAL; + self->index++; + state = TOKEN_STATE_INT; + goto state; + default: + state = TOKEN_STATE_INVALID; + goto state; + }; + break; + + case TOKEN_STATE_EXPECT_NEWLINE: + self->index++; + switch (self->buffer[self->index]) { + case 0: + if (self->index == self->buffer_len) { + result.tag = TOKEN_TAG_INVALID; + } else { + state = TOKEN_STATE_INVALID; + goto state; + } + break; + case '\n': + self->index++; + result.loc.start = self->index; + state = TOKEN_STATE_START; + goto state; + default: + state = TOKEN_STATE_INVALID; + goto state; + } + break; + + case TOKEN_STATE_INVALID: + self->index++; + switch (self->buffer[self->index]) { + case 0: + if (self->index == self->buffer_len) { + result.tag = TOKEN_TAG_INVALID; + } else { + state = TOKEN_STATE_INVALID; + goto state; + } + break; + case '\n': + result.tag = TOKEN_TAG_INVALID; + break; + default: + state = TOKEN_STATE_INVALID; + goto state; + } + break; + + case TOKEN_STATE_SAW_AT_SIGN: + self->index++; + switch (self->buffer[self->index]) { + case 0: + case '\n': + result.tag = TOKEN_TAG_INVALID; + break; + case '"': + result.tag = TOKEN_TAG_IDENTIFIER; + state = TOKEN_STATE_STRING_LITERAL; + goto state; + case 'a' ... 'z': + case 'A' ... 'Z': + case '_': + result.tag = TOKEN_TAG_BUILTIN; + state = TOKEN_STATE_BUILTIN; + goto state; + default: + state = TOKEN_STATE_INVALID; + goto state; + } + break; + + case TOKEN_STATE_AMPERSAND: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_AMPERSAND_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_AMPERSAND; + break; + } + break; + + case TOKEN_STATE_ASTERISK: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_ASTERISK_EQUAL; + self->index++; + break; + case '*': + result.tag = TOKEN_TAG_ASTERISK_ASTERISK; + self->index++; + break; + case '%': + state = TOKEN_STATE_ASTERISK_PERCENT; + goto state; + case '|': + state = TOKEN_STATE_ASTERISK_PIPE; + goto state; + default: + result.tag = TOKEN_TAG_ASTERISK; + break; + } + break; + + case TOKEN_STATE_ASTERISK_PERCENT: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_ASTERISK_PERCENT_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_ASTERISK_PERCENT; + break; + } + break; + + case TOKEN_STATE_ASTERISK_PIPE: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_ASTERISK_PIPE_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_ASTERISK_PIPE; + break; + } + break; + + case TOKEN_STATE_PERCENT: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_PERCENT_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_PERCENT; + break; + } + break; + + case TOKEN_STATE_PLUS: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_PLUS_EQUAL; + self->index++; + break; + case '+': + result.tag = TOKEN_TAG_PLUS_PLUS; + self->index++; + break; + case '%': + state = TOKEN_STATE_PLUS_PERCENT; + goto state; + case '|': + state = TOKEN_STATE_PLUS_PIPE; + goto state; + default: + result.tag = TOKEN_TAG_PLUS; + break; + } + break; + + case TOKEN_STATE_PLUS_PERCENT: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_PLUS_PERCENT_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_PLUS_PERCENT; + break; + } + break; + + case TOKEN_STATE_PLUS_PIPE: + self->index++; + 
switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_PLUS_PIPE_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_PLUS_PIPE; + break; + } + break; + + case TOKEN_STATE_CARET: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_CARET_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_CARET; + break; + } + break; + + case TOKEN_STATE_IDENTIFIER: + self->index++; + switch (self->buffer[self->index]) { + case 'a' ... 'z': + case 'A' ... 'Z': + case '_': + case '0' ... '9': + state = TOKEN_STATE_IDENTIFIER; + goto state; + default:; // Once we're at C23, this semicolon can be removed. + const char* start = self->buffer + result.loc.start; + uint32_t len = self->index - result.loc.start; + token_tag tag = get_keyword(start, len); + if (tag != TOKEN_TAG_INVALID) { + result.tag = tag; + } + } + break; + + case TOKEN_STATE_BUILTIN: + self->index++; + switch (self->buffer[self->index]) { + case 'a' ... 'z': + case 'A' ... 'Z': + case '_': + case '0' ... '9': + state = TOKEN_STATE_BUILTIN; + goto state; + break; + } + break; + + case TOKEN_STATE_BACKSLASH: + self->index++; + switch (self->buffer[self->index]) { + case 0: + result.tag = TOKEN_TAG_INVALID; + break; + case '\\': + state = TOKEN_STATE_MULTILINE_STRING_LITERAL_LINE; + goto state; + case '\n': + result.tag = TOKEN_TAG_INVALID; + break; + default: + state = TOKEN_STATE_INVALID; + goto state; + } + break; + + case TOKEN_STATE_STRING_LITERAL: + self->index++; + switch (self->buffer[self->index]) { + case 0: + if (self->index != self->buffer_len) { + state = TOKEN_STATE_INVALID; + goto state; + } else { + result.tag = TOKEN_TAG_INVALID; + } + break; + case '\n': + result.tag = TOKEN_TAG_INVALID; + break; + case '\\': + state = TOKEN_STATE_STRING_LITERAL_BACKSLASH; + goto state; + case '"': + self->index++; + break; + case 0x01 ... 0x09: + case 0x0b ... 0x1f: + case 0x7f: + state = TOKEN_STATE_INVALID; + goto state; + default: + state = TOKEN_STATE_STRING_LITERAL; + goto state; + } + break; + + case TOKEN_STATE_STRING_LITERAL_BACKSLASH: + self->index++; + switch (self->buffer[self->index]) { + case 0: + case '\n': + result.tag = TOKEN_TAG_INVALID; + break; + default: + state = TOKEN_STATE_STRING_LITERAL; + goto state; + } + break; + + case TOKEN_STATE_CHAR_LITERAL: + self->index++; + switch (self->buffer[self->index]) { + case 0: + if (self->index != self->buffer_len) { + state = TOKEN_STATE_INVALID; + goto state; + } else { + result.tag = TOKEN_TAG_INVALID; + } + break; + case '\n': + result.tag = TOKEN_TAG_INVALID; + break; + case '\\': + state = TOKEN_STATE_CHAR_LITERAL_BACKSLASH; + goto state; + case '\'': + self->index++; + break; + case 0x01 ... 0x09: + case 0x0b ... 0x1f: + case 0x7f: + state = TOKEN_STATE_INVALID; + goto state; + default: + state = TOKEN_STATE_CHAR_LITERAL; + goto state; + } + break; + + case TOKEN_STATE_CHAR_LITERAL_BACKSLASH: + self->index++; + switch (self->buffer[self->index]) { + case 0: + if (self->index != self->buffer_len) { + state = TOKEN_STATE_INVALID; + goto state; + } else { + result.tag = TOKEN_TAG_INVALID; + } + break; + case '\n': + result.tag = TOKEN_TAG_INVALID; + break; + case 0x01 ... 0x09: + case 0x0b ... 
0x1f: + case 0x7f: + state = TOKEN_STATE_INVALID; + goto state; + default: + state = TOKEN_STATE_CHAR_LITERAL; + goto state; + } + break; + + case TOKEN_STATE_MULTILINE_STRING_LITERAL_LINE: + self->index++; + switch (self->buffer[self->index]) { + case 0: + if (self->index != self->buffer_len) { + state = TOKEN_STATE_INVALID; + goto state; + } + break; + case '\n': + break; + case '\r': + if (self->buffer[self->index + 1] != '\n') { + state = TOKEN_STATE_INVALID; + goto state; + } + break; + case 0x01 ... 0x09: + case 0x0b ... 0x0c: + case 0x0e ... 0x1f: + case 0x7f: + state = TOKEN_STATE_INVALID; + goto state; + default: + state = TOKEN_STATE_MULTILINE_STRING_LITERAL_LINE; + goto state; + } + break; + + case TOKEN_STATE_BANG: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_BANG_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_BANG; + break; + } + break; + + case TOKEN_STATE_PIPE: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_PIPE_EQUAL; + self->index++; + break; + case '|': + result.tag = TOKEN_TAG_PIPE_PIPE; + self->index++; + break; + default: + result.tag = TOKEN_TAG_PIPE; + break; + } + break; + + case TOKEN_STATE_EQUAL: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_EQUAL_EQUAL; + self->index++; + break; + case '>': + result.tag = TOKEN_TAG_EQUAL_ANGLE_BRACKET_RIGHT; + self->index++; + break; + default: + result.tag = TOKEN_TAG_EQUAL; + break; + } + break; + + case TOKEN_STATE_MINUS: + self->index++; + switch (self->buffer[self->index]) { + case '>': + result.tag = TOKEN_TAG_ARROW; + self->index++; + break; + case '=': + result.tag = TOKEN_TAG_MINUS_EQUAL; + self->index++; + break; + case '%': + state = TOKEN_STATE_MINUS_PERCENT; + goto state; + case '|': + state = TOKEN_STATE_MINUS_PIPE; + goto state; + default: + result.tag = TOKEN_TAG_MINUS; + break; + } + break; + + case TOKEN_STATE_MINUS_PERCENT: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_MINUS_PERCENT_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_MINUS_PERCENT; + break; + } + break; + + case TOKEN_STATE_MINUS_PIPE: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_MINUS_PIPE_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_MINUS_PIPE; + break; + } + break; + + case TOKEN_STATE_ANGLE_BRACKET_LEFT: + self->index++; + switch (self->buffer[self->index]) { + case '<': + state = TOKEN_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT; + goto state; + case '=': + result.tag = TOKEN_TAG_ANGLE_BRACKET_LEFT_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_ANGLE_BRACKET_LEFT; + break; + } + break; + + case TOKEN_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL; + self->index++; + break; + case '|': + state = TOKEN_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE; + goto state; + default: + result.tag = TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT; + break; + } + break; + + case TOKEN_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE; + break; + } + break; + + case TOKEN_STATE_ANGLE_BRACKET_RIGHT: + 
self->index++; + switch (self->buffer[self->index]) { + case '>': + state = TOKEN_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT; + goto state; + case '=': + result.tag = TOKEN_TAG_ANGLE_BRACKET_RIGHT_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_ANGLE_BRACKET_RIGHT; + break; + } + break; + + case TOKEN_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT: + self->index++; + switch (self->buffer[self->index]) { + case '=': + result.tag = TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT; + break; + } + break; + + case TOKEN_STATE_PERIOD: + self->index++; + switch (self->buffer[self->index]) { + case '.': + state = TOKEN_STATE_PERIOD_2; + goto state; + case '*': + state = TOKEN_STATE_PERIOD_ASTERISK; + goto state; + default: + result.tag = TOKEN_TAG_PERIOD; + break; + } + break; + + case TOKEN_STATE_PERIOD_2: + self->index++; + switch (self->buffer[self->index]) { + case '.': + result.tag = TOKEN_TAG_ELLIPSIS3; + self->index++; + break; + default: + result.tag = TOKEN_TAG_ELLIPSIS2; + break; + } + break; + + case TOKEN_STATE_PERIOD_ASTERISK: + self->index++; + switch (self->buffer[self->index]) { + case '*': + result.tag = TOKEN_TAG_INVALID_PERIODASTERISKS; + break; + default: + result.tag = TOKEN_TAG_PERIOD_ASTERISK; + break; + } + break; + + case TOKEN_STATE_SLASH: + self->index++; + switch (self->buffer[self->index]) { + case '/': + state = TOKEN_STATE_LINE_COMMENT_START; + goto state; + case '=': + result.tag = TOKEN_TAG_SLASH_EQUAL; + self->index++; + break; + default: + result.tag = TOKEN_TAG_SLASH; + break; + } + break; + + case TOKEN_STATE_LINE_COMMENT_START: + self->index++; + switch (self->buffer[self->index]) { + case 0: + if (self->index != self->buffer_len) { + state = TOKEN_STATE_INVALID; + goto state; + } else { + return (token) { + .tag = TOKEN_TAG_EOF, + .loc = { + .start = self->index, + .end = self->index } + }; + } + break; + case '!': + result.tag = TOKEN_TAG_CONTAINER_DOC_COMMENT; + state = TOKEN_STATE_DOC_COMMENT; + goto state; + case '\n': + self->index++; + result.loc.start = self->index; + state = TOKEN_STATE_START; + goto state; + case '/': + state = TOKEN_STATE_DOC_COMMENT_START; + goto state; + case '\r': + state = TOKEN_STATE_EXPECT_NEWLINE; + goto state; + case 0x01 ... 0x09: + case 0x0b ... 0x0c: + case 0x0e ... 0x1f: + case 0x7f: + state = TOKEN_STATE_INVALID; + goto state; + default: + state = TOKEN_STATE_LINE_COMMENT; + goto state; + } + break; + + case TOKEN_STATE_DOC_COMMENT_START: + self->index++; + switch (self->buffer[self->index]) { + case 0: + case '\n': + result.tag = TOKEN_TAG_DOC_COMMENT; + break; + case '\r': + if (self->buffer[self->index + 1] == '\n') { + result.tag = TOKEN_TAG_DOC_COMMENT; + } else { + state = TOKEN_STATE_INVALID; + goto state; + } + break; + case '/': + state = TOKEN_STATE_LINE_COMMENT; + goto state; + case 0x01 ... 0x09: + case 0x0b ... 0x0c: + case 0x0e ... 
0x1f: + case 0x7f: + state = TOKEN_STATE_INVALID; + goto state; + default: + result.tag = TOKEN_TAG_DOC_COMMENT; + state = TOKEN_STATE_DOC_COMMENT; + goto state; + } + break; + + case TOKEN_STATE_LINE_COMMENT: + self->index++; + switch (self->buffer[self->index]) { + case 0: + if (self->index != self->buffer_len) { + state = TOKEN_STATE_INVALID; + goto state; + } else { + return (token) { + .tag = TOKEN_TAG_EOF, + .loc = { + .start = self->index, + .end = self->index } + }; + } + break; + case '\n': + self->index++; + result.loc.start = self->index; + state = TOKEN_STATE_START; + goto state; + case '\r': + state = TOKEN_STATE_EXPECT_NEWLINE; + goto state; + case 0x01 ... 0x09: + case 0x0b ... 0x0c: + case 0x0e ... 0x1f: + case 0x7f: + state = TOKEN_STATE_INVALID; + goto state; + default: + state = TOKEN_STATE_LINE_COMMENT; + goto state; + } + break; + + case TOKEN_STATE_DOC_COMMENT: + self->index++; + switch (self->buffer[self->index]) { + case 0: + case '\n': + break; + case '\r': + if (self->buffer[self->index + 1] != '\n') { + state = TOKEN_STATE_INVALID; + goto state; + } + break; + case 0x01 ... 0x09: + case 0x0b ... 0x0c: + case 0x0e ... 0x1f: + case 0x7f: + state = TOKEN_STATE_INVALID; + goto state; + default: + state = TOKEN_STATE_DOC_COMMENT; + goto state; + } + break; + + case TOKEN_STATE_INT: + switch (self->buffer[self->index]) { + case '.': + state = TOKEN_STATE_INT_PERIOD; + goto state; + case '_': + case 'a' ... 'd': + case 'f' ... 'o': + case 'q' ... 'z': + case 'A' ... 'D': + case 'F' ... 'O': + case 'Q' ... 'Z': + case '0' ... '9': + self->index++; + state = TOKEN_STATE_INT; + goto state; + case 'e': + case 'E': + case 'p': + case 'P': + state = TOKEN_STATE_INT_EXPONENT; + goto state; + default: + break; + } + break; + + case TOKEN_STATE_INT_EXPONENT: + self->index++; + switch (self->buffer[self->index]) { + case '-': + case '+': + self->index++; + state = TOKEN_STATE_FLOAT; + goto state; + default: + state = TOKEN_STATE_INT; + goto state; + } + break; + + case TOKEN_STATE_INT_PERIOD: + self->index++; + switch (self->buffer[self->index]) { + case '_': + case 'a' ... 'd': + case 'f' ... 'o': + case 'q' ... 'z': + case 'A' ... 'D': + case 'F' ... 'O': + case 'Q' ... 'Z': + case '0' ... '9': + self->index++; + state = TOKEN_STATE_FLOAT; + goto state; + case 'e': + case 'E': + case 'p': + case 'P': + state = TOKEN_STATE_FLOAT_EXPONENT; + goto state; + default: + self->index--; + break; + } + break; + + case TOKEN_STATE_FLOAT: + switch (self->buffer[self->index]) { + case '_': + case 'a' ... 'd': + case 'f' ... 'o': + case 'q' ... 'z': + case 'A' ... 'D': + case 'F' ... 'O': + case 'Q' ... 'Z': + case '0' ... 
'9':
+            self->index++;
+            state = TOKEN_STATE_FLOAT;
+            goto state;
+        case 'e':
+        case 'E':
+        case 'p':
+        case 'P':
+            state = TOKEN_STATE_FLOAT_EXPONENT;
+            goto state;
+        default:
+            break;
+        }
+        break;
+
+    case TOKEN_STATE_FLOAT_EXPONENT:
+        self->index++;
+        switch (self->buffer[self->index]) {
+        case '-':
+        case '+':
+            self->index++;
+            state = TOKEN_STATE_FLOAT;
+            goto state;
+        default:
+            state = TOKEN_STATE_FLOAT;
+            goto state;
+        }
+        break;
+    }
+
+    result.loc.end = self->index;
+
+    return result;
+}
diff --git a/tokenizer.h b/tokenizer.h
new file mode 100644
index 0000000..81cc196
--- /dev/null
+++ b/tokenizer.h
@@ -0,0 +1,196 @@
+#ifndef ZIG1_TOKENIZER_H_
+#define ZIG1_TOKENIZER_H_
+
+#include <stdint.h>
+
+typedef enum {
+    TOKEN_TAG_INVALID,
+    TOKEN_TAG_INVALID_PERIODASTERISKS,
+    TOKEN_TAG_IDENTIFIER,
+    TOKEN_TAG_STRING_LITERAL,
+    TOKEN_TAG_MULTILINE_STRING_LITERAL_LINE,
+    TOKEN_TAG_CHAR_LITERAL,
+    TOKEN_TAG_EOF,
+    TOKEN_TAG_BUILTIN,
+    TOKEN_TAG_BANG,
+    TOKEN_TAG_PIPE,
+    TOKEN_TAG_PIPE_PIPE,
+    TOKEN_TAG_PIPE_EQUAL,
+    TOKEN_TAG_EQUAL,
+    TOKEN_TAG_EQUAL_EQUAL,
+    TOKEN_TAG_EQUAL_ANGLE_BRACKET_RIGHT,
+    TOKEN_TAG_BANG_EQUAL,
+    TOKEN_TAG_L_PAREN,
+    TOKEN_TAG_R_PAREN,
+    TOKEN_TAG_SEMICOLON,
+    TOKEN_TAG_PERCENT,
+    TOKEN_TAG_PERCENT_EQUAL,
+    TOKEN_TAG_L_BRACE,
+    TOKEN_TAG_R_BRACE,
+    TOKEN_TAG_L_BRACKET,
+    TOKEN_TAG_R_BRACKET,
+    TOKEN_TAG_PERIOD,
+    TOKEN_TAG_PERIOD_ASTERISK,
+    TOKEN_TAG_ELLIPSIS2,
+    TOKEN_TAG_ELLIPSIS3,
+    TOKEN_TAG_CARET,
+    TOKEN_TAG_CARET_EQUAL,
+    TOKEN_TAG_PLUS,
+    TOKEN_TAG_PLUS_PLUS,
+    TOKEN_TAG_PLUS_EQUAL,
+    TOKEN_TAG_PLUS_PERCENT,
+    TOKEN_TAG_PLUS_PERCENT_EQUAL,
+    TOKEN_TAG_PLUS_PIPE,
+    TOKEN_TAG_PLUS_PIPE_EQUAL,
+    TOKEN_TAG_MINUS,
+    TOKEN_TAG_MINUS_EQUAL,
+    TOKEN_TAG_MINUS_PERCENT,
+    TOKEN_TAG_MINUS_PERCENT_EQUAL,
+    TOKEN_TAG_MINUS_PIPE,
+    TOKEN_TAG_MINUS_PIPE_EQUAL,
+    TOKEN_TAG_ASTERISK,
+    TOKEN_TAG_ASTERISK_EQUAL,
+    TOKEN_TAG_ASTERISK_ASTERISK,
+    TOKEN_TAG_ASTERISK_PERCENT,
+    TOKEN_TAG_ASTERISK_PERCENT_EQUAL,
+    TOKEN_TAG_ASTERISK_PIPE,
+    TOKEN_TAG_ASTERISK_PIPE_EQUAL,
+    TOKEN_TAG_ARROW,
+    TOKEN_TAG_COLON,
+    TOKEN_TAG_SLASH,
+    TOKEN_TAG_SLASH_EQUAL,
+    TOKEN_TAG_COMMA,
+    TOKEN_TAG_AMPERSAND,
+    TOKEN_TAG_AMPERSAND_EQUAL,
+    TOKEN_TAG_QUESTION_MARK,
+    TOKEN_TAG_ANGLE_BRACKET_LEFT,
+    TOKEN_TAG_ANGLE_BRACKET_LEFT_EQUAL,
+    TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT,
+    TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL,
+    TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE,
+    TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL,
+    TOKEN_TAG_ANGLE_BRACKET_RIGHT,
+    TOKEN_TAG_ANGLE_BRACKET_RIGHT_EQUAL,
+    TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT,
+    TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL,
+    TOKEN_TAG_TILDE,
+    TOKEN_TAG_NUMBER_LITERAL,
+    TOKEN_TAG_DOC_COMMENT,
+    TOKEN_TAG_CONTAINER_DOC_COMMENT,
+    TOKEN_TAG_KEYWORD_ADDRSPACE,
+    TOKEN_TAG_KEYWORD_ALIGN,
+    TOKEN_TAG_KEYWORD_ALLOWZERO,
+    TOKEN_TAG_KEYWORD_AND,
+    TOKEN_TAG_KEYWORD_ANYFRAME,
+    TOKEN_TAG_KEYWORD_ANYTYPE,
+    TOKEN_TAG_KEYWORD_ASM,
+    TOKEN_TAG_KEYWORD_ASYNC,
+    TOKEN_TAG_KEYWORD_AWAIT,
+    TOKEN_TAG_KEYWORD_BREAK,
+    TOKEN_TAG_KEYWORD_CALLCONV,
+    TOKEN_TAG_KEYWORD_CATCH,
+    TOKEN_TAG_KEYWORD_COMPTIME,
+    TOKEN_TAG_KEYWORD_CONST,
+    TOKEN_TAG_KEYWORD_CONTINUE,
+    TOKEN_TAG_KEYWORD_DEFER,
+    TOKEN_TAG_KEYWORD_ELSE,
+    TOKEN_TAG_KEYWORD_ENUM,
+    TOKEN_TAG_KEYWORD_ERRDEFER,
+    TOKEN_TAG_KEYWORD_ERROR,
+    TOKEN_TAG_KEYWORD_EXPORT,
+    TOKEN_TAG_KEYWORD_EXTERN,
+    TOKEN_TAG_KEYWORD_FN,
+    TOKEN_TAG_KEYWORD_FOR,
+    TOKEN_TAG_KEYWORD_IF,
+    TOKEN_TAG_KEYWORD_INLINE,
+    TOKEN_TAG_KEYWORD_NOALIAS,
+    TOKEN_TAG_KEYWORD_NOINLINE,
+    
TOKEN_TAG_KEYWORD_NOSUSPEND, + TOKEN_TAG_KEYWORD_OPAQUE, + TOKEN_TAG_KEYWORD_OR, + TOKEN_TAG_KEYWORD_ORELSE, + TOKEN_TAG_KEYWORD_PACKED, + TOKEN_TAG_KEYWORD_PUB, + TOKEN_TAG_KEYWORD_RESUME, + TOKEN_TAG_KEYWORD_RETURN, + TOKEN_TAG_KEYWORD_LINKSECTION, + TOKEN_TAG_KEYWORD_STRUCT, + TOKEN_TAG_KEYWORD_SUSPEND, + TOKEN_TAG_KEYWORD_SWITCH, + TOKEN_TAG_KEYWORD_TEST, + TOKEN_TAG_KEYWORD_THREADLOCAL, + TOKEN_TAG_KEYWORD_TRY, + TOKEN_TAG_KEYWORD_UNION, + TOKEN_TAG_KEYWORD_UNREACHABLE, + TOKEN_TAG_KEYWORD_USINGNAMESPACE, + TOKEN_TAG_KEYWORD_VAR, + TOKEN_TAG_KEYWORD_VOLATILE, + TOKEN_TAG_KEYWORD_WHILE, +} token_tag; + +typedef enum { + TOKEN_STATE_START, + TOKEN_STATE_EXPECT_NEWLINE, + TOKEN_STATE_IDENTIFIER, + TOKEN_STATE_BUILTIN, + TOKEN_STATE_STRING_LITERAL, + TOKEN_STATE_STRING_LITERAL_BACKSLASH, + TOKEN_STATE_MULTILINE_STRING_LITERAL_LINE, + TOKEN_STATE_CHAR_LITERAL, + TOKEN_STATE_CHAR_LITERAL_BACKSLASH, + TOKEN_STATE_BACKSLASH, + TOKEN_STATE_EQUAL, + TOKEN_STATE_BANG, + TOKEN_STATE_PIPE, + TOKEN_STATE_MINUS, + TOKEN_STATE_MINUS_PERCENT, + TOKEN_STATE_MINUS_PIPE, + TOKEN_STATE_ASTERISK, + TOKEN_STATE_ASTERISK_PERCENT, + TOKEN_STATE_ASTERISK_PIPE, + TOKEN_STATE_SLASH, + TOKEN_STATE_LINE_COMMENT_START, + TOKEN_STATE_LINE_COMMENT, + TOKEN_STATE_DOC_COMMENT_START, + TOKEN_STATE_DOC_COMMENT, + TOKEN_STATE_INT, + TOKEN_STATE_INT_EXPONENT, + TOKEN_STATE_INT_PERIOD, + TOKEN_STATE_FLOAT, + TOKEN_STATE_FLOAT_EXPONENT, + TOKEN_STATE_AMPERSAND, + TOKEN_STATE_CARET, + TOKEN_STATE_PERCENT, + TOKEN_STATE_PLUS, + TOKEN_STATE_PLUS_PERCENT, + TOKEN_STATE_PLUS_PIPE, + TOKEN_STATE_ANGLE_BRACKET_LEFT, + TOKEN_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT, + TOKEN_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE, + TOKEN_STATE_ANGLE_BRACKET_RIGHT, + TOKEN_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT, + TOKEN_STATE_PERIOD, + TOKEN_STATE_PERIOD_2, + TOKEN_STATE_PERIOD_ASTERISK, + TOKEN_STATE_SAW_AT_SIGN, + TOKEN_STATE_INVALID, +} token_state; + +typedef struct { + token_tag tag; + struct { + uint32_t start, end; + } loc; +} token; + +typedef struct { + const char* buffer; + uint32_t buffer_len; + uint32_t index; +} tokenizer; + +tokenizer tokenizer_init(const char* buffer, uint32_t len); +token tokenizer_next(tokenizer* self); + +#endif diff --git a/tokenizer_test.zig b/tokenizer_test.zig new file mode 100644 index 0000000..c7847ac --- /dev/null +++ b/tokenizer_test.zig @@ -0,0 +1,769 @@ +const std = @import("std"); +const testing = std.testing; + +const Token = std.zig.Token; +const Tokenizer = std.zig.Tokenizer; + +const c = @cImport({ + @cInclude("tokenizer.h"); +}); + +fn zigToken(token: c_uint) Token.Tag { + return switch (token) { + c.TOKEN_TAG_INVALID => .invalid, + c.TOKEN_TAG_INVALID_PERIODASTERISKS => .invalid_periodasterisks, + c.TOKEN_TAG_IDENTIFIER => .identifier, + c.TOKEN_TAG_STRING_LITERAL => .string_literal, + c.TOKEN_TAG_MULTILINE_STRING_LITERAL_LINE => .multiline_string_literal_line, + c.TOKEN_TAG_CHAR_LITERAL => .char_literal, + c.TOKEN_TAG_EOF => .eof, + c.TOKEN_TAG_BUILTIN => .builtin, + c.TOKEN_TAG_BANG => .bang, + c.TOKEN_TAG_PIPE => .pipe, + c.TOKEN_TAG_PIPE_PIPE => .pipe_pipe, + c.TOKEN_TAG_PIPE_EQUAL => .pipe_equal, + c.TOKEN_TAG_EQUAL => .equal, + c.TOKEN_TAG_EQUAL_EQUAL => .equal_equal, + c.TOKEN_TAG_EQUAL_ANGLE_BRACKET_RIGHT => .equal_angle_bracket_right, + c.TOKEN_TAG_BANG_EQUAL => .bang_equal, + c.TOKEN_TAG_L_PAREN => .l_paren, + c.TOKEN_TAG_R_PAREN => .r_paren, + c.TOKEN_TAG_SEMICOLON => .semicolon, + c.TOKEN_TAG_PERCENT => .percent, + c.TOKEN_TAG_PERCENT_EQUAL => .percent_equal, + c.TOKEN_TAG_L_BRACE => 
.l_brace, + c.TOKEN_TAG_R_BRACE => .r_brace, + c.TOKEN_TAG_L_BRACKET => .l_bracket, + c.TOKEN_TAG_R_BRACKET => .r_bracket, + c.TOKEN_TAG_PERIOD => .period, + c.TOKEN_TAG_PERIOD_ASTERISK => .period_asterisk, + c.TOKEN_TAG_ELLIPSIS2 => .ellipsis2, + c.TOKEN_TAG_ELLIPSIS3 => .ellipsis3, + c.TOKEN_TAG_CARET => .caret, + c.TOKEN_TAG_CARET_EQUAL => .caret_equal, + c.TOKEN_TAG_PLUS => .plus, + c.TOKEN_TAG_PLUS_PLUS => .plus_plus, + c.TOKEN_TAG_PLUS_EQUAL => .plus_equal, + c.TOKEN_TAG_PLUS_PERCENT => .plus_percent, + c.TOKEN_TAG_PLUS_PERCENT_EQUAL => .plus_percent_equal, + c.TOKEN_TAG_PLUS_PIPE => .plus_pipe, + c.TOKEN_TAG_PLUS_PIPE_EQUAL => .plus_pipe_equal, + c.TOKEN_TAG_MINUS => .minus, + c.TOKEN_TAG_MINUS_EQUAL => .minus_equal, + c.TOKEN_TAG_MINUS_PERCENT => .minus_percent, + c.TOKEN_TAG_MINUS_PERCENT_EQUAL => .minus_percent_equal, + c.TOKEN_TAG_MINUS_PIPE => .minus_pipe, + c.TOKEN_TAG_MINUS_PIPE_EQUAL => .minus_pipe_equal, + c.TOKEN_TAG_ASTERISK => .asterisk, + c.TOKEN_TAG_ASTERISK_EQUAL => .asterisk_equal, + c.TOKEN_TAG_ASTERISK_ASTERISK => .asterisk_asterisk, + c.TOKEN_TAG_ASTERISK_PERCENT => .asterisk_percent, + c.TOKEN_TAG_ASTERISK_PERCENT_EQUAL => .asterisk_percent_equal, + c.TOKEN_TAG_ASTERISK_PIPE => .asterisk_pipe, + c.TOKEN_TAG_ASTERISK_PIPE_EQUAL => .asterisk_pipe_equal, + c.TOKEN_TAG_ARROW => .arrow, + c.TOKEN_TAG_COLON => .colon, + c.TOKEN_TAG_SLASH => .slash, + c.TOKEN_TAG_SLASH_EQUAL => .slash_equal, + c.TOKEN_TAG_COMMA => .comma, + c.TOKEN_TAG_AMPERSAND => .ampersand, + c.TOKEN_TAG_AMPERSAND_EQUAL => .ampersand_equal, + c.TOKEN_TAG_QUESTION_MARK => .question_mark, + c.TOKEN_TAG_ANGLE_BRACKET_LEFT => .angle_bracket_left, + c.TOKEN_TAG_ANGLE_BRACKET_LEFT_EQUAL => .angle_bracket_left_equal, + c.TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT => .angle_bracket_angle_bracket_left, + c.TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL => .angle_bracket_angle_bracket_left_equal, + c.TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE => .angle_bracket_angle_bracket_left_pipe, + c.TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL => .angle_bracket_angle_bracket_left_pipe_equal, + c.TOKEN_TAG_ANGLE_BRACKET_RIGHT => .angle_bracket_right, + c.TOKEN_TAG_ANGLE_BRACKET_RIGHT_EQUAL => .angle_bracket_right_equal, + c.TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT => .angle_bracket_angle_bracket_right, + c.TOKEN_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL => .angle_bracket_angle_bracket_right_equal, + c.TOKEN_TAG_TILDE => .tilde, + c.TOKEN_TAG_NUMBER_LITERAL => .number_literal, + c.TOKEN_TAG_DOC_COMMENT => .doc_comment, + c.TOKEN_TAG_CONTAINER_DOC_COMMENT => .container_doc_comment, + c.TOKEN_TAG_KEYWORD_ADDRSPACE => .keyword_addrspace, + c.TOKEN_TAG_KEYWORD_ALIGN => .keyword_align, + c.TOKEN_TAG_KEYWORD_ALLOWZERO => .keyword_allowzero, + c.TOKEN_TAG_KEYWORD_AND => .keyword_and, + c.TOKEN_TAG_KEYWORD_ANYFRAME => .keyword_anyframe, + c.TOKEN_TAG_KEYWORD_ANYTYPE => .keyword_anytype, + c.TOKEN_TAG_KEYWORD_ASM => .keyword_asm, + c.TOKEN_TAG_KEYWORD_ASYNC => .keyword_async, + c.TOKEN_TAG_KEYWORD_AWAIT => .keyword_await, + c.TOKEN_TAG_KEYWORD_BREAK => .keyword_break, + c.TOKEN_TAG_KEYWORD_CALLCONV => .keyword_callconv, + c.TOKEN_TAG_KEYWORD_CATCH => .keyword_catch, + c.TOKEN_TAG_KEYWORD_COMPTIME => .keyword_comptime, + c.TOKEN_TAG_KEYWORD_CONST => .keyword_const, + c.TOKEN_TAG_KEYWORD_CONTINUE => .keyword_continue, + c.TOKEN_TAG_KEYWORD_DEFER => .keyword_defer, + c.TOKEN_TAG_KEYWORD_ELSE => .keyword_else, + c.TOKEN_TAG_KEYWORD_ENUM => .keyword_enum, + c.TOKEN_TAG_KEYWORD_ERRDEFER => .keyword_errdefer, + 
c.TOKEN_TAG_KEYWORD_ERROR => .keyword_error,
+        c.TOKEN_TAG_KEYWORD_EXPORT => .keyword_export,
+        c.TOKEN_TAG_KEYWORD_EXTERN => .keyword_extern,
+        c.TOKEN_TAG_KEYWORD_FN => .keyword_fn,
+        c.TOKEN_TAG_KEYWORD_FOR => .keyword_for,
+        c.TOKEN_TAG_KEYWORD_IF => .keyword_if,
+        c.TOKEN_TAG_KEYWORD_INLINE => .keyword_inline,
+        c.TOKEN_TAG_KEYWORD_NOALIAS => .keyword_noalias,
+        c.TOKEN_TAG_KEYWORD_NOINLINE => .keyword_noinline,
+        c.TOKEN_TAG_KEYWORD_NOSUSPEND => .keyword_nosuspend,
+        c.TOKEN_TAG_KEYWORD_OPAQUE => .keyword_opaque,
+        c.TOKEN_TAG_KEYWORD_OR => .keyword_or,
+        c.TOKEN_TAG_KEYWORD_ORELSE => .keyword_orelse,
+        c.TOKEN_TAG_KEYWORD_PACKED => .keyword_packed,
+        c.TOKEN_TAG_KEYWORD_PUB => .keyword_pub,
+        c.TOKEN_TAG_KEYWORD_RESUME => .keyword_resume,
+        c.TOKEN_TAG_KEYWORD_RETURN => .keyword_return,
+        c.TOKEN_TAG_KEYWORD_LINKSECTION => .keyword_linksection,
+        c.TOKEN_TAG_KEYWORD_STRUCT => .keyword_struct,
+        c.TOKEN_TAG_KEYWORD_SUSPEND => .keyword_suspend,
+        c.TOKEN_TAG_KEYWORD_SWITCH => .keyword_switch,
+        c.TOKEN_TAG_KEYWORD_TEST => .keyword_test,
+        c.TOKEN_TAG_KEYWORD_THREADLOCAL => .keyword_threadlocal,
+        c.TOKEN_TAG_KEYWORD_TRY => .keyword_try,
+        c.TOKEN_TAG_KEYWORD_UNION => .keyword_union,
+        c.TOKEN_TAG_KEYWORD_UNREACHABLE => .keyword_unreachable,
+        c.TOKEN_TAG_KEYWORD_USINGNAMESPACE => .keyword_usingnamespace,
+        c.TOKEN_TAG_KEYWORD_VAR => .keyword_var,
+        c.TOKEN_TAG_KEYWORD_VOLATILE => .keyword_volatile,
+        c.TOKEN_TAG_KEYWORD_WHILE => .keyword_while,
+        else => unreachable, // every C tag has a Zig counterpart
+    };
+}
+
+// Copy-pasted from lib/std/zig/tokenizer.zig
+fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
+    // uncomment when the Zig source and compiler are in sync (e.g. with 0.14)
+    //var tokenizer = Tokenizer.init(source);
+    //for (expected_token_tags) |expected_token_tag| {
+    //    const token = tokenizer.next();
+    //    try std.testing.expectEqual(expected_token_tag, token.tag);
+    //}
+    //// Last token should always be eof, even when the last token was invalid,
+    //// in which case the tokenizer is in an invalid state, which can only be
+    //// recovered by opinionated means outside the scope of this implementation.
+ //const last_token = tokenizer.next(); + //try std.testing.expectEqual(Token.Tag.eof, last_token.tag); + //try std.testing.expectEqual(source.len, last_token.loc.start); + //try std.testing.expectEqual(source.len, last_token.loc.end); + + // Do the C thing + var ctokenizer = c.tokenizer_init(source.ptr, @intCast(source.len)); + for (expected_token_tags) |expected_token_tag| { + const token = c.tokenizer_next(&ctokenizer); + try std.testing.expectEqual(expected_token_tag, zigToken(token.tag)); + } + const last_token = c.tokenizer_next(&ctokenizer); + try std.testing.expectEqual(Token.Tag.eof, zigToken(last_token.tag)); + try std.testing.expectEqual(source.len, last_token.loc.start); + try std.testing.expectEqual(source.len, last_token.loc.end); +} + +test "keywords" { + try testTokenize("test const else", &.{ .keyword_test, .keyword_const, .keyword_else }); +} + +test "line comment followed by top-level comptime" { + try testTokenize( + \\// line comment + \\comptime {} + \\ + , &.{ + .keyword_comptime, + .l_brace, + .r_brace, + }); +} + +test "unknown length pointer and then c pointer" { + try testTokenize( + \\[*]u8 + \\[*c]u8 + , &.{ + .l_bracket, + .asterisk, + .r_bracket, + .identifier, + .l_bracket, + .asterisk, + .identifier, + .r_bracket, + .identifier, + }); +} + +test "code point literal with hex escape" { + try testTokenize( + \\'\x1b' + , &.{.char_literal}); + try testTokenize( + \\'\x1' + , &.{.char_literal}); +} + +test "newline in char literal" { + try testTokenize( + \\' + \\' + , &.{ .invalid, .invalid }); +} + +test "newline in string literal" { + try testTokenize( + \\" + \\" + , &.{ .invalid, .invalid }); +} + +test "code point literal with unicode escapes" { + // Valid unicode escapes + try testTokenize( + \\'\u{3}' + , &.{.char_literal}); + try testTokenize( + \\'\u{01}' + , &.{.char_literal}); + try testTokenize( + \\'\u{2a}' + , &.{.char_literal}); + try testTokenize( + \\'\u{3f9}' + , &.{.char_literal}); + try testTokenize( + \\'\u{6E09aBc1523}' + , &.{.char_literal}); + try testTokenize( + \\"\u{440}" + , &.{.string_literal}); + + // Invalid unicode escapes + try testTokenize( + \\'\u' + , &.{.char_literal}); + try testTokenize( + \\'\u{{' + , &.{.char_literal}); + try testTokenize( + \\'\u{}' + , &.{.char_literal}); + try testTokenize( + \\'\u{s}' + , &.{.char_literal}); + try testTokenize( + \\'\u{2z}' + , &.{.char_literal}); + try testTokenize( + \\'\u{4a' + , &.{.char_literal}); + + // Test old-style unicode literals + try testTokenize( + \\'\u0333' + , &.{.char_literal}); + try testTokenize( + \\'\U0333' + , &.{.char_literal}); +} + +test "code point literal with unicode code point" { + try testTokenize( + \\'💩' + , &.{.char_literal}); +} + +test "float literal e exponent" { + try testTokenize("a = 4.94065645841246544177e-324;\n", &.{ + .identifier, + .equal, + .number_literal, + .semicolon, + }); +} + +test "float literal p exponent" { + try testTokenize("a = 0x1.a827999fcef32p+1022;\n", &.{ + .identifier, + .equal, + .number_literal, + .semicolon, + }); +} + +test "chars" { + try testTokenize("'c'", &.{.char_literal}); +} + +test "invalid token characters" { + try testTokenize("#", &.{.invalid}); + try testTokenize("`", &.{.invalid}); + try testTokenize("'c", &.{.invalid}); + try testTokenize("'", &.{.invalid}); + try testTokenize("''", &.{.char_literal}); + try testTokenize("'\n'", &.{ .invalid, .invalid }); +} + +test "invalid literal/comment characters" { + try testTokenize("\"\x00\"", &.{.invalid}); + try testTokenize("`\x00`", &.{.invalid}); + try 
testTokenize("//\x00", &.{.invalid}); + try testTokenize("//\x1f", &.{.invalid}); + try testTokenize("//\x7f", &.{.invalid}); +} + +test "utf8" { + try testTokenize("//\xc2\x80", &.{}); + try testTokenize("//\xf4\x8f\xbf\xbf", &.{}); +} + +test "invalid utf8" { + try testTokenize("//\x80", &.{}); + try testTokenize("//\xbf", &.{}); + try testTokenize("//\xf8", &.{}); + try testTokenize("//\xff", &.{}); + try testTokenize("//\xc2\xc0", &.{}); + try testTokenize("//\xe0", &.{}); + try testTokenize("//\xf0", &.{}); + try testTokenize("//\xf0\x90\x80\xc0", &.{}); +} + +test "illegal unicode codepoints" { + // unicode newline characters.U+0085, U+2028, U+2029 + try testTokenize("//\xc2\x84", &.{}); + try testTokenize("//\xc2\x85", &.{}); + try testTokenize("//\xc2\x86", &.{}); + try testTokenize("//\xe2\x80\xa7", &.{}); + try testTokenize("//\xe2\x80\xa8", &.{}); + try testTokenize("//\xe2\x80\xa9", &.{}); + try testTokenize("//\xe2\x80\xaa", &.{}); +} + +test "string identifier and builtin fns" { + try testTokenize( + \\const @"if" = @import("std"); + , &.{ + .keyword_const, + .identifier, + .equal, + .builtin, + .l_paren, + .string_literal, + .r_paren, + .semicolon, + }); +} + +test "pipe and then invalid" { + try testTokenize("||=", &.{ + .pipe_pipe, + .equal, + }); +} + +test "line comment and doc comment" { + try testTokenize("//", &.{}); + try testTokenize("// a / b", &.{}); + try testTokenize("// /", &.{}); + try testTokenize("/// a", &.{.doc_comment}); + try testTokenize("///", &.{.doc_comment}); + try testTokenize("////", &.{}); + try testTokenize("//!", &.{.container_doc_comment}); + try testTokenize("//!!", &.{.container_doc_comment}); +} + +test "line comment followed by identifier" { + try testTokenize( + \\ Unexpected, + \\ // another + \\ Another, + , &.{ + .identifier, + .comma, + .identifier, + .comma, + }); +} + +test "UTF-8 BOM is recognized and skipped" { + try testTokenize("\xEF\xBB\xBFa;\n", &.{ + .identifier, + .semicolon, + }); +} + +test "correctly parse pointer assignment" { + try testTokenize("b.*=3;\n", &.{ + .identifier, + .period_asterisk, + .equal, + .number_literal, + .semicolon, + }); +} + +test "correctly parse pointer dereference followed by asterisk" { + try testTokenize("\"b\".* ** 10", &.{ + .string_literal, + .period_asterisk, + .asterisk_asterisk, + .number_literal, + }); + + try testTokenize("(\"b\".*)** 10", &.{ + .l_paren, + .string_literal, + .period_asterisk, + .r_paren, + .asterisk_asterisk, + .number_literal, + }); + + try testTokenize("\"b\".*** 10", &.{ + .string_literal, + .invalid_periodasterisks, + .asterisk_asterisk, + .number_literal, + }); +} + +test "range literals" { + try testTokenize("0...9", &.{ .number_literal, .ellipsis3, .number_literal }); + try testTokenize("'0'...'9'", &.{ .char_literal, .ellipsis3, .char_literal }); + try testTokenize("0x00...0x09", &.{ .number_literal, .ellipsis3, .number_literal }); + try testTokenize("0b00...0b11", &.{ .number_literal, .ellipsis3, .number_literal }); + try testTokenize("0o00...0o11", &.{ .number_literal, .ellipsis3, .number_literal }); +} + +test "number literals decimal" { + try testTokenize("0", &.{.number_literal}); + try testTokenize("1", &.{.number_literal}); + try testTokenize("2", &.{.number_literal}); + try testTokenize("3", &.{.number_literal}); + try testTokenize("4", &.{.number_literal}); + try testTokenize("5", &.{.number_literal}); + try testTokenize("6", &.{.number_literal}); + try testTokenize("7", &.{.number_literal}); + try testTokenize("8", &.{.number_literal}); + try 
testTokenize("9", &.{.number_literal}); + try testTokenize("1..", &.{ .number_literal, .ellipsis2 }); + try testTokenize("0a", &.{.number_literal}); + try testTokenize("9b", &.{.number_literal}); + try testTokenize("1z", &.{.number_literal}); + try testTokenize("1z_1", &.{.number_literal}); + try testTokenize("9z3", &.{.number_literal}); + + try testTokenize("0_0", &.{.number_literal}); + try testTokenize("0001", &.{.number_literal}); + try testTokenize("01234567890", &.{.number_literal}); + try testTokenize("012_345_6789_0", &.{.number_literal}); + try testTokenize("0_1_2_3_4_5_6_7_8_9_0", &.{.number_literal}); + + try testTokenize("00_", &.{.number_literal}); + try testTokenize("0_0_", &.{.number_literal}); + try testTokenize("0__0", &.{.number_literal}); + try testTokenize("0_0f", &.{.number_literal}); + try testTokenize("0_0_f", &.{.number_literal}); + try testTokenize("0_0_f_00", &.{.number_literal}); + try testTokenize("1_,", &.{ .number_literal, .comma }); + + try testTokenize("0.0", &.{.number_literal}); + try testTokenize("1.0", &.{.number_literal}); + try testTokenize("10.0", &.{.number_literal}); + try testTokenize("0e0", &.{.number_literal}); + try testTokenize("1e0", &.{.number_literal}); + try testTokenize("1e100", &.{.number_literal}); + try testTokenize("1.0e100", &.{.number_literal}); + try testTokenize("1.0e+100", &.{.number_literal}); + try testTokenize("1.0e-100", &.{.number_literal}); + try testTokenize("1_0_0_0.0_0_0_0_0_1e1_0_0_0", &.{.number_literal}); + + try testTokenize("1.", &.{ .number_literal, .period }); + try testTokenize("1e", &.{.number_literal}); + try testTokenize("1.e100", &.{.number_literal}); + try testTokenize("1.0e1f0", &.{.number_literal}); + try testTokenize("1.0p100", &.{.number_literal}); + try testTokenize("1.0p-100", &.{.number_literal}); + try testTokenize("1.0p1f0", &.{.number_literal}); + try testTokenize("1.0_,", &.{ .number_literal, .comma }); + try testTokenize("1_.0", &.{.number_literal}); + try testTokenize("1._", &.{.number_literal}); + try testTokenize("1.a", &.{.number_literal}); + try testTokenize("1.z", &.{.number_literal}); + try testTokenize("1._0", &.{.number_literal}); + try testTokenize("1.+", &.{ .number_literal, .period, .plus }); + try testTokenize("1._+", &.{ .number_literal, .plus }); + try testTokenize("1._e", &.{.number_literal}); + try testTokenize("1.0e", &.{.number_literal}); + try testTokenize("1.0e,", &.{ .number_literal, .comma }); + try testTokenize("1.0e_", &.{.number_literal}); + try testTokenize("1.0e+_", &.{.number_literal}); + try testTokenize("1.0e-_", &.{.number_literal}); + try testTokenize("1.0e0_+", &.{ .number_literal, .plus }); +} + +test "number literals binary" { + try testTokenize("0b0", &.{.number_literal}); + try testTokenize("0b1", &.{.number_literal}); + try testTokenize("0b2", &.{.number_literal}); + try testTokenize("0b3", &.{.number_literal}); + try testTokenize("0b4", &.{.number_literal}); + try testTokenize("0b5", &.{.number_literal}); + try testTokenize("0b6", &.{.number_literal}); + try testTokenize("0b7", &.{.number_literal}); + try testTokenize("0b8", &.{.number_literal}); + try testTokenize("0b9", &.{.number_literal}); + try testTokenize("0ba", &.{.number_literal}); + try testTokenize("0bb", &.{.number_literal}); + try testTokenize("0bc", &.{.number_literal}); + try testTokenize("0bd", &.{.number_literal}); + try testTokenize("0be", &.{.number_literal}); + try testTokenize("0bf", &.{.number_literal}); + try testTokenize("0bz", &.{.number_literal}); + + try testTokenize("0b0000_0000", 
&.{.number_literal}); + try testTokenize("0b1111_1111", &.{.number_literal}); + try testTokenize("0b10_10_10_10", &.{.number_literal}); + try testTokenize("0b0_1_0_1_0_1_0_1", &.{.number_literal}); + try testTokenize("0b1.", &.{ .number_literal, .period }); + try testTokenize("0b1.0", &.{.number_literal}); + + try testTokenize("0B0", &.{.number_literal}); + try testTokenize("0b_", &.{.number_literal}); + try testTokenize("0b_0", &.{.number_literal}); + try testTokenize("0b1_", &.{.number_literal}); + try testTokenize("0b0__1", &.{.number_literal}); + try testTokenize("0b0_1_", &.{.number_literal}); + try testTokenize("0b1e", &.{.number_literal}); + try testTokenize("0b1p", &.{.number_literal}); + try testTokenize("0b1e0", &.{.number_literal}); + try testTokenize("0b1p0", &.{.number_literal}); + try testTokenize("0b1_,", &.{ .number_literal, .comma }); +} + +test "number literals octal" { + try testTokenize("0o0", &.{.number_literal}); + try testTokenize("0o1", &.{.number_literal}); + try testTokenize("0o2", &.{.number_literal}); + try testTokenize("0o3", &.{.number_literal}); + try testTokenize("0o4", &.{.number_literal}); + try testTokenize("0o5", &.{.number_literal}); + try testTokenize("0o6", &.{.number_literal}); + try testTokenize("0o7", &.{.number_literal}); + try testTokenize("0o8", &.{.number_literal}); + try testTokenize("0o9", &.{.number_literal}); + try testTokenize("0oa", &.{.number_literal}); + try testTokenize("0ob", &.{.number_literal}); + try testTokenize("0oc", &.{.number_literal}); + try testTokenize("0od", &.{.number_literal}); + try testTokenize("0oe", &.{.number_literal}); + try testTokenize("0of", &.{.number_literal}); + try testTokenize("0oz", &.{.number_literal}); + + try testTokenize("0o01234567", &.{.number_literal}); + try testTokenize("0o0123_4567", &.{.number_literal}); + try testTokenize("0o01_23_45_67", &.{.number_literal}); + try testTokenize("0o0_1_2_3_4_5_6_7", &.{.number_literal}); + try testTokenize("0o7.", &.{ .number_literal, .period }); + try testTokenize("0o7.0", &.{.number_literal}); + + try testTokenize("0O0", &.{.number_literal}); + try testTokenize("0o_", &.{.number_literal}); + try testTokenize("0o_0", &.{.number_literal}); + try testTokenize("0o1_", &.{.number_literal}); + try testTokenize("0o0__1", &.{.number_literal}); + try testTokenize("0o0_1_", &.{.number_literal}); + try testTokenize("0o1e", &.{.number_literal}); + try testTokenize("0o1p", &.{.number_literal}); + try testTokenize("0o1e0", &.{.number_literal}); + try testTokenize("0o1p0", &.{.number_literal}); + try testTokenize("0o_,", &.{ .number_literal, .comma }); +} + +test "number literals hexadecimal" { + try testTokenize("0x0", &.{.number_literal}); + try testTokenize("0x1", &.{.number_literal}); + try testTokenize("0x2", &.{.number_literal}); + try testTokenize("0x3", &.{.number_literal}); + try testTokenize("0x4", &.{.number_literal}); + try testTokenize("0x5", &.{.number_literal}); + try testTokenize("0x6", &.{.number_literal}); + try testTokenize("0x7", &.{.number_literal}); + try testTokenize("0x8", &.{.number_literal}); + try testTokenize("0x9", &.{.number_literal}); + try testTokenize("0xa", &.{.number_literal}); + try testTokenize("0xb", &.{.number_literal}); + try testTokenize("0xc", &.{.number_literal}); + try testTokenize("0xd", &.{.number_literal}); + try testTokenize("0xe", &.{.number_literal}); + try testTokenize("0xf", &.{.number_literal}); + try testTokenize("0xA", &.{.number_literal}); + try testTokenize("0xB", &.{.number_literal}); + try testTokenize("0xC", 
&.{.number_literal}); + try testTokenize("0xD", &.{.number_literal}); + try testTokenize("0xE", &.{.number_literal}); + try testTokenize("0xF", &.{.number_literal}); + try testTokenize("0x0z", &.{.number_literal}); + try testTokenize("0xz", &.{.number_literal}); + + try testTokenize("0x0123456789ABCDEF", &.{.number_literal}); + try testTokenize("0x0123_4567_89AB_CDEF", &.{.number_literal}); + try testTokenize("0x01_23_45_67_89AB_CDE_F", &.{.number_literal}); + try testTokenize("0x0_1_2_3_4_5_6_7_8_9_A_B_C_D_E_F", &.{.number_literal}); + + try testTokenize("0X0", &.{.number_literal}); + try testTokenize("0x_", &.{.number_literal}); + try testTokenize("0x_1", &.{.number_literal}); + try testTokenize("0x1_", &.{.number_literal}); + try testTokenize("0x0__1", &.{.number_literal}); + try testTokenize("0x0_1_", &.{.number_literal}); + try testTokenize("0x_,", &.{ .number_literal, .comma }); + + try testTokenize("0x1.0", &.{.number_literal}); + try testTokenize("0xF.0", &.{.number_literal}); + try testTokenize("0xF.F", &.{.number_literal}); + try testTokenize("0xF.Fp0", &.{.number_literal}); + try testTokenize("0xF.FP0", &.{.number_literal}); + try testTokenize("0x1p0", &.{.number_literal}); + try testTokenize("0xfp0", &.{.number_literal}); + try testTokenize("0x1.0+0xF.0", &.{ .number_literal, .plus, .number_literal }); + + try testTokenize("0x1.", &.{ .number_literal, .period }); + try testTokenize("0xF.", &.{ .number_literal, .period }); + try testTokenize("0x1.+0xF.", &.{ .number_literal, .period, .plus, .number_literal, .period }); + try testTokenize("0xff.p10", &.{.number_literal}); + + try testTokenize("0x0123456.789ABCDEF", &.{.number_literal}); + try testTokenize("0x0_123_456.789_ABC_DEF", &.{.number_literal}); + try testTokenize("0x0_1_2_3_4_5_6.7_8_9_A_B_C_D_E_F", &.{.number_literal}); + try testTokenize("0x0p0", &.{.number_literal}); + try testTokenize("0x0.0p0", &.{.number_literal}); + try testTokenize("0xff.ffp10", &.{.number_literal}); + try testTokenize("0xff.ffP10", &.{.number_literal}); + try testTokenize("0xffp10", &.{.number_literal}); + try testTokenize("0xff_ff.ff_ffp1_0_0_0", &.{.number_literal}); + try testTokenize("0xf_f_f_f.f_f_f_fp+1_000", &.{.number_literal}); + try testTokenize("0xf_f_f_f.f_f_f_fp-1_00_0", &.{.number_literal}); + + try testTokenize("0x1e", &.{.number_literal}); + try testTokenize("0x1e0", &.{.number_literal}); + try testTokenize("0x1p", &.{.number_literal}); + try testTokenize("0xfp0z1", &.{.number_literal}); + try testTokenize("0xff.ffpff", &.{.number_literal}); + try testTokenize("0x0.p", &.{.number_literal}); + try testTokenize("0x0.z", &.{.number_literal}); + try testTokenize("0x0._", &.{.number_literal}); + try testTokenize("0x0_.0", &.{.number_literal}); + try testTokenize("0x0_.0.0", &.{ .number_literal, .period, .number_literal }); + try testTokenize("0x0._0", &.{.number_literal}); + try testTokenize("0x0.0_", &.{.number_literal}); + try testTokenize("0x0_p0", &.{.number_literal}); + try testTokenize("0x0_.p0", &.{.number_literal}); + try testTokenize("0x0._p0", &.{.number_literal}); + try testTokenize("0x0.0_p0", &.{.number_literal}); + try testTokenize("0x0._0p0", &.{.number_literal}); + try testTokenize("0x0.0p_0", &.{.number_literal}); + try testTokenize("0x0.0p+_0", &.{.number_literal}); + try testTokenize("0x0.0p-_0", &.{.number_literal}); + try testTokenize("0x0.0p0_", &.{.number_literal}); +} + +test "multi line string literal with only 1 backslash" { + try testTokenize("x \\\n;", &.{ .identifier, .invalid, .semicolon }); +} + +test 
"invalid builtin identifiers" { + try testTokenize("@()", &.{.invalid}); + try testTokenize("@0()", &.{.invalid}); +} + +test "invalid token with unfinished escape right before eof" { + try testTokenize("\"\\", &.{.invalid}); + try testTokenize("'\\", &.{.invalid}); + try testTokenize("'\\u", &.{.invalid}); +} + +test "saturating operators" { + try testTokenize("<<", &.{.angle_bracket_angle_bracket_left}); + try testTokenize("<<|", &.{.angle_bracket_angle_bracket_left_pipe}); + try testTokenize("<<|=", &.{.angle_bracket_angle_bracket_left_pipe_equal}); + + try testTokenize("*", &.{.asterisk}); + try testTokenize("*|", &.{.asterisk_pipe}); + try testTokenize("*|=", &.{.asterisk_pipe_equal}); + + try testTokenize("+", &.{.plus}); + try testTokenize("+|", &.{.plus_pipe}); + try testTokenize("+|=", &.{.plus_pipe_equal}); + + try testTokenize("-", &.{.minus}); + try testTokenize("-|", &.{.minus_pipe}); + try testTokenize("-|=", &.{.minus_pipe_equal}); +} + +test "null byte before eof" { + try testTokenize("123 \x00 456", &.{ .number_literal, .invalid }); + try testTokenize("//\x00", &.{.invalid}); + try testTokenize("\\\\\x00", &.{.invalid}); + try testTokenize("\x00", &.{.invalid}); + try testTokenize("// NUL\x00\n", &.{.invalid}); + try testTokenize("///\x00\n", &.{ .doc_comment, .invalid }); + try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid }); +} + +test "invalid tabs and carriage returns" { + // "Inside Line Comments and Documentation Comments, Any TAB is rejected by + // the grammar since it is ambiguous how it should be rendered." + // https://github.com/ziglang/zig-spec/issues/38 + try testTokenize("//\t", &.{.invalid}); + try testTokenize("// \t", &.{.invalid}); + try testTokenize("///\t", &.{.invalid}); + try testTokenize("/// \t", &.{.invalid}); + try testTokenize("//!\t", &.{.invalid}); + try testTokenize("//! \t", &.{.invalid}); + + // "Inside Line Comments and Documentation Comments, CR directly preceding + // NL is unambiguously part of the newline sequence. It is accepted by the + // grammar and removed by zig fmt, leaving only NL. CR anywhere else is + // rejected by the grammar." + // https://github.com/ziglang/zig-spec/issues/38 + try testTokenize("//\r", &.{.invalid}); + try testTokenize("// \r", &.{.invalid}); + try testTokenize("///\r", &.{.invalid}); + try testTokenize("/// \r", &.{.invalid}); + try testTokenize("//\r ", &.{.invalid}); + try testTokenize("// \r ", &.{.invalid}); + try testTokenize("///\r ", &.{.invalid}); + try testTokenize("/// \r ", &.{.invalid}); + try testTokenize("//\r\n", &.{}); + try testTokenize("// \r\n", &.{}); + try testTokenize("///\r\n", &.{.doc_comment}); + try testTokenize("/// \r\n", &.{.doc_comment}); + try testTokenize("//!\r", &.{.invalid}); + try testTokenize("//! \r", &.{.invalid}); + try testTokenize("//!\r ", &.{.invalid}); + try testTokenize("//! \r ", &.{.invalid}); + try testTokenize("//!\r\n", &.{.container_doc_comment}); + try testTokenize("//! \r\n", &.{.container_doc_comment}); + + // The control characters TAB and CR are rejected by the grammar inside multi-line string literals, + // except if CR is directly before NL. 
+    // https://github.com/ziglang/zig-spec/issues/38
+    try testTokenize("\\\\\r", &.{.invalid});
+    try testTokenize("\\\\\r ", &.{.invalid});
+    try testTokenize("\\\\ \r", &.{.invalid});
+    try testTokenize("\\\\\t", &.{.invalid});
+    try testTokenize("\\\\\t ", &.{.invalid});
+    try testTokenize("\\\\ \t", &.{.invalid});
+    try testTokenize("\\\\\r\n", &.{.multiline_string_literal_line});
+
+    // "TAB used as whitespace is...accepted by the grammar. CR used as
+    // whitespace, whether directly preceding NL or stray, is...accepted by the
+    // grammar."
+    // https://github.com/ziglang/zig-spec/issues/38
+    try testTokenize("\tpub\tswitch\t", &.{ .keyword_pub, .keyword_switch });
+    try testTokenize("\rpub\rswitch\r", &.{ .keyword_pub, .keyword_switch });
+}
diff --git a/zig1.c b/zig1.c
new file mode 100644
index 0000000..614ba16
--- /dev/null
+++ b/zig1.c
@@ -0,0 +1,53 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+// API:
+// - code = 0: program successfully terminated.
+// - code = 1: panicked, panic message in msg. Caller should free msg.
+// - code = 2: interpreter error, error in msg. Caller should free msg.
+int zig1_run(char* program, char** msg) { return 0; }
+
+// API: same as zig1_run, plus:
+// - code = 3: abnormal error, expect something in stderr.
+int zig1_run_file(char* fname, char** msg)
+{
+    FILE* f = fopen(fname, "r");
+    if (f == NULL) {
+        perror("fopen");
+        return 3;
+    }
+    fseek(f, 0, SEEK_END);
+    long fsize = ftell(f);
+    if (fsize == -1) {
+        perror("ftell");
+        fclose(f);
+        return 3;
+    }
+    fseek(f, 0, SEEK_SET);
+
+    char* program = malloc((size_t)fsize + 1);
+    if (program == NULL) {
+        perror("malloc");
+        fclose(f);
+        return 3;
+    }
+
+    size_t bytes_read = fread(program, 1, (size_t)fsize, f);
+    if (bytes_read < (size_t)fsize) {
+        if (ferror(f)) {
+            perror("fread");
+        } else {
+            fprintf(stderr, "Unexpected end of file\n");
+        }
+        free(program);
+        fclose(f);
+        return 3;
+    }
+    fclose(f);
+    program[fsize] = 0;
+
+    int code = zig1_run(program, msg);
+    free(program);
+    return code;
+}
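
Editor's note: the patch only exercises the C tokenizer through the Zig test harness; zig1_run is still a stub. For reference, a minimal stand-alone driver for the tokenizer API might look like the sketch below (illustrative only, not part of the patch; the file name is hypothetical). Note that tokenizer_next() relies on a NUL sentinel at buffer[len], mirroring the [:0]const u8 sentinel the Zig original requires.

    // tokdump.c -- hypothetical stand-alone driver for tokenizer.c.
    // Build roughly as: cc -std=c11 tokdump.c tokenizer.c -o tokdump
    #include <stdio.h>
    #include <string.h>

    #include "tokenizer.h"

    int main(void)
    {
        // String literals come with the required NUL byte at buffer[len].
        const char* src = "const x = 42;";
        tokenizer t = tokenizer_init(src, (uint32_t)strlen(src));
        for (;;) {
            token tok = tokenizer_next(&t);
            if (tok.tag == TOKEN_TAG_EOF)
                break;
            // Print the numeric tag and the source slice it covers.
            printf("tag=%d \"%.*s\"\n", (int)tok.tag,
                (int)(tok.loc.end - tok.loc.start), src + tok.loc.start);
        }
        return 0;
    }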