making parser

This commit is contained in:
2024-12-20 00:00:51 +02:00
parent 69e90b6b9f
commit 228b215259
7 changed files with 405 additions and 196 deletions

42
ast.c
View File

@@ -4,24 +4,26 @@
#include <stdlib.h> #include <stdlib.h>
#include "ast.h" #include "ast.h"
#include "parse.h" #include "parser.h"
#define N 1024
ast ast_parse(const char* source, const uint32_t len, int* err) ast ast_parse(const char* source, const uint32_t len, int* err)
{ {
uint32_t estimated_token_count = len / 8; uint32_t estimated_token_count = len / 8;
tokenizer_tag* token_tags = NULL; tokenizerTag* token_tags = NULL;
ast_index* token_starts = NULL; astIndex* token_starts = NULL;
ast_node_tag* nodes_tags = NULL; astNodeTag* nodes_tags = NULL;
ast_token_index* main_tokens = NULL; astTokenIndex* main_tokens = NULL;
ast_data* nodes_datas = NULL; astData* nodes_datas = NULL;
ast_node_index* extra_data_arr = NULL; astNodeIndex* extra_data_arr = NULL;
ast_node_index* scratch_arr = NULL; astNodeIndex* scratch_arr = NULL;
if (!(token_tags = calloc(estimated_token_count, sizeof(tokenizer_tag)))) if (!(token_tags = calloc(estimated_token_count, sizeof(tokenizerTag))))
goto err; goto err;
if (!(token_starts = calloc(estimated_token_count, sizeof(ast_index)))) if (!(token_starts = calloc(estimated_token_count, sizeof(astIndex))))
goto err; goto err;
tokenizer tok = tokenizer_init(source, len); tokenizer tok = tokenizer_init(source, len);
@@ -31,26 +33,26 @@ ast ast_parse(const char* source, const uint32_t len, int* err)
fprintf(stderr, "too many tokens, bump estimated_token_count\n"); fprintf(stderr, "too many tokens, bump estimated_token_count\n");
goto err; goto err;
} }
tokenizer_token token = tokenizer_next(&tok); tokenizerToken token = tokenizer_next(&tok);
token_tags[tokens_len] = token.tag; token_tags[tokens_len] = token.tag;
token_starts[tokens_len] = token.loc.start; token_starts[tokens_len] = token.loc.start;
} }
uint32_t estimated_node_count = (tokens_len + 2) / 2; uint32_t estimated_node_count = (tokens_len + 2) / 2;
if (!(nodes_tags = calloc(estimated_node_count, sizeof(ast_node_tag)))) if (!(nodes_tags = calloc(estimated_node_count, sizeof(astNodeTag))))
goto err; goto err;
if (!(main_tokens = calloc(estimated_node_count, sizeof(ast_token_index)))) if (!(main_tokens = calloc(estimated_node_count, sizeof(astTokenIndex))))
goto err; goto err;
if (!(nodes_datas = calloc(estimated_node_count, sizeof(ast_data)))) if (!(nodes_datas = calloc(estimated_node_count, sizeof(astData))))
goto err; goto err;
if (!(extra_data_arr = calloc(16, sizeof(ast_token_index)))) if (!(extra_data_arr = calloc(N, sizeof(astNodeIndex))))
goto err; goto err;
if (!(scratch_arr = calloc(16, sizeof(ast_token_index)))) if (!(scratch_arr = calloc(N, sizeof(astNodeIndex))))
goto err; goto err;
parser p = (parser) { parser p = (parser) {
@@ -60,20 +62,20 @@ ast ast_parse(const char* source, const uint32_t len, int* err)
.token_starts = token_starts, .token_starts = token_starts,
.tokens_len = tokens_len, .tokens_len = tokens_len,
.tok_i = 0, .tok_i = 0,
.nodes = (ast_node_list) { .nodes = (astNodeList) {
.len = 0, .len = 0,
.cap = estimated_node_count, .cap = estimated_node_count,
.tags = nodes_tags, .tags = nodes_tags,
.main_tokens = main_tokens, .main_tokens = main_tokens,
.datas = nodes_datas, .datas = nodes_datas,
}, },
.extra_data = (parser_node_index_slice) { .len = 0, .cap = 16, .arr = extra_data_arr }, .extra_data = (parserNodeIndexSlice) { .len = 0, .cap = N, .arr = extra_data_arr },
.scratch = (parser_node_index_slice) { .len = 0, .cap = 16, .arr = scratch_arr }, .scratch = (parserNodeIndexSlice) { .len = 0, .cap = N, .arr = scratch_arr },
}; };
free(scratch_arr); free(scratch_arr);
// TODO work parse_root(&p);
return (ast) { return (ast) {
.source = source, .source = source,

28
ast.h
View File

@@ -479,23 +479,23 @@ typedef enum {
AST_NODE_TAG_ERROR_VALUE, AST_NODE_TAG_ERROR_VALUE,
/// `lhs!rhs`. main_token is the `!`. /// `lhs!rhs`. main_token is the `!`.
AST_NODE_TAG_ERROR_UNION, AST_NODE_TAG_ERROR_UNION,
} ast_node_tag; } astNodeTag;
typedef uint32_t ast_token_index; typedef int32_t astTokenIndex;
typedef uint32_t ast_node_index; typedef uint32_t astNodeIndex;
typedef uint32_t ast_index; typedef uint32_t astIndex;
typedef struct { typedef struct {
ast_index lhs, rhs; astIndex lhs, rhs;
} ast_data; } astData;
typedef struct { typedef struct {
uint32_t len; uint32_t len;
uint32_t cap; uint32_t cap;
ast_node_tag* tags; astNodeTag* tags;
ast_token_index* main_tokens; astTokenIndex* main_tokens;
ast_data* datas; astData* datas;
} ast_node_list; } astNodeList;
typedef struct { typedef struct {
const char* source; const char* source;
@@ -503,13 +503,13 @@ typedef struct {
struct { struct {
uint32_t len; uint32_t len;
tokenizer_tag* tags; tokenizerTag* tags;
ast_index* starts; astIndex* starts;
} tokens; } tokens;
ast_node_list nodes; astNodeList nodes;
ast_node_index* extra_data; astNodeIndex* extra_data;
uint32_t extra_data_len; uint32_t extra_data_len;
} ast; } ast;

View File

@@ -1,6 +1,11 @@
const std = @import("std"); const std = @import("std");
const c_lib_files = &[_][]const u8{ "tokenizer.c", "ast.c", "zig1.c" }; const c_lib_files = &[_][]const u8{
"tokenizer.c",
"ast.c",
"zig1.c",
"parser.c",
};
const all_c_files = c_lib_files ++ &[_][]const u8{"main.c"}; const all_c_files = c_lib_files ++ &[_][]const u8{"main.c"};
const cflags = &[_][]const u8{ const cflags = &[_][]const u8{
"-std=c11", "-std=c11",
@@ -19,6 +24,7 @@ const cflags = &[_][]const u8{
"-Wdouble-promotion", "-Wdouble-promotion",
"-fstack-protector-all", "-fstack-protector-all",
"-Wimplicit-fallthrough", "-Wimplicit-fallthrough",
"-Wno-unused-function", // TODO remove once refactoring is done
//"-D_FORTIFY_SOURCE=2", // consider when optimization flags are enabled //"-D_FORTIFY_SOURCE=2", // consider when optimization flags are enabled
}; };
@@ -37,24 +43,21 @@ pub fn build(b: *std.Build) !void {
if (std.mem.eql(u8, cc, "zig")) if (std.mem.eql(u8, cc, "zig"))
lib.addCSourceFiles(.{ .files = c_lib_files, .flags = cflags }) lib.addCSourceFiles(.{ .files = c_lib_files, .flags = cflags })
else for (c_lib_files) |cfile| { else for (c_lib_files) |cfile| {
const objfile = try std.fmt.allocPrint(
b.allocator,
"{s}.o",
.{cfile[0 .. cfile.len - 2]},
);
const cc1 = b.addSystemCommand(&.{cc}); const cc1 = b.addSystemCommand(&.{cc});
cc1.addArgs(cflags); cc1.addArgs(cflags ++ .{"-g"});
cc1.addArg("-g"); cc1.addArg(switch (optimize) {
cc1.addArgs(switch (optimize) { .Debug => "-O0",
.Debug => &.{"-O0"}, .ReleaseFast, .ReleaseSafe => "-O3", // ubsan?
.ReleaseFast, .ReleaseSafe => &.{"-O3"}, // TODO ubsan? .ReleaseSmall => "-Os",
.ReleaseSmall => &.{"-Os"},
}); });
cc1.addArg("-c"); cc1.addArg("-c");
cc1.addFileArg(b.path(cfile)); cc1.addFileArg(b.path(cfile));
cc1.addArg("-o"); cc1.addArg("-o");
const obj = cc1.addOutputFileArg(objfile); lib.addObjectFile(cc1.addOutputFileArg(try std.fmt.allocPrint(
lib.addObjectFile(obj); b.allocator,
"{s}.o",
.{cfile[0 .. cfile.len - 2]},
)));
} }
lib.linkLibC(); lib.linkLibC();
@@ -91,11 +94,13 @@ pub fn build(b: *std.Build) !void {
const cppcheck = b.addSystemCommand(&.{"cppcheck"}); const cppcheck = b.addSystemCommand(&.{"cppcheck"});
cppcheck.addArgs(&.{ cppcheck.addArgs(&.{
"--quiet",
"--error-exitcode=1",
"--enable=all", "--enable=all",
"--suppress=missingIncludeSystem", "--suppress=missingIncludeSystem",
"--suppress=checkersReport", "--suppress=checkersReport",
"--quiet",
"--suppress=unusedFunction", // TODO remove after plumbing is done "--suppress=unusedFunction", // TODO remove after plumbing is done
"--suppress=unusedStructMember", // TODO remove after plumbing is done
}); });
for (all_c_files) |cfile| cppcheck.addFileArg(b.path(cfile)); for (all_c_files) |cfile| cppcheck.addFileArg(b.path(cfile));
lint_step.dependOn(&cppcheck.step); lint_step.dependOn(&cppcheck.step);

190
parser.c Normal file
View File

@@ -0,0 +1,190 @@
#include <stdio.h>
#include "parser.h"
// Aggregate result of parsing a container's member list.
typedef struct {
    uint32_t len;          // number of members parsed
    astNodeIndex lhs, rhs; // presumably a node-index range into extra_data -- TODO confirm once consumed by a caller
    bool trailing;         // NOTE(review): looks like "last member had a trailing comma" -- verify when implemented
} members;
// Tracks whether a container field has been seen while scanning members.
// Currently unused in this file (plumbing for parse_container_members).
// NOTE(review): snake_case name is inconsistent with the camelCase typedefs
// introduced in this commit (tokenizerTag, astNodeIndex, ...) -- consider
// renaming while it still has no users.
typedef struct {
    enum {
        FIELD_STATE_NONE,
        FIELD_STATE_SEEN,
        FIELD_STATE_END // sets "end"
    } tag;
    union {
        uint32_t end; // valid only when tag == FIELD_STATE_END
    } payload;
} field_state;
// Advances the parser past the current token and returns the NEW index.
// NOTE(review): this returns the post-increment value, i.e. the index of the
// token AFTER the one just consumed. If the intent (as in the Zig parser this
// mirrors) is to return the index of the consumed token, this should be
// `p->tok_i++` -- confirm against future call sites; the only current caller
// (eat_token's users) just compares the result to -1, so either works today.
static astTokenIndex next_token(parser* p)
{
    return ++p->tok_i;
}
// If the current token matches `tag`, consume it and return next_token's
// result; otherwise consume nothing and return -1.
// NOTE(review): the -1 sentinel relies on astTokenIndex being signed (this
// commit changes it to int32_t); if the typedef ever reverts to uint32_t the
// sentinel silently becomes UINT32_MAX and `!= -1` comparisons get fragile.
static astTokenIndex eat_token(parser* p, tokenizerTag tag)
{
    return (p->token_tags[p->tok_i] == tag) ? next_token(p) : -1;
}
// Parses the member list of a container (the root document or a
// struct/union/enum body). Work in progress: only `pub` is currently
// consumed; every other token prints "not implemented" and aborts.
// Scratch space borrowed from the parser is restored before returning.
// NOTE(review): TOKENIZER_TAG_EOF also lands on the error path, so even a
// fully-consumed input reports an error -- acceptable only while WIP.
static members parse_container_members(parser* p)
{
    const uint32_t scratch_top = p->scratch.len;
    // Fix: `(members) {}` is a C23/GNU empty initializer; `{ 0 }` is the
    // strict-C11 spelling (the build uses -std=c11). Same zero-init.
    members res = (members) { 0 };
    // ast_token_index last_field;
    // Skip leading container doc comments (`//!`).
    while (eat_token(p, TOKENIZER_TAG_CONTAINER_DOC_COMMENT) != -1)
        ;
    // bool trailing = false;
    while (1) {
        // SKIP eat doc comments
        switch (p->token_tags[p->tok_i]) {
        case TOKENIZER_TAG_INVALID:
        case TOKENIZER_TAG_INVALID_PERIODASTERISKS:
        case TOKENIZER_TAG_IDENTIFIER:
        case TOKENIZER_TAG_STRING_LITERAL:
        case TOKENIZER_TAG_MULTILINE_STRING_LITERAL_LINE:
        case TOKENIZER_TAG_CHAR_LITERAL:
        case TOKENIZER_TAG_EOF:
        case TOKENIZER_TAG_BUILTIN:
        case TOKENIZER_TAG_BANG:
        case TOKENIZER_TAG_PIPE:
        case TOKENIZER_TAG_PIPE_PIPE:
        case TOKENIZER_TAG_PIPE_EQUAL:
        case TOKENIZER_TAG_EQUAL:
        case TOKENIZER_TAG_EQUAL_EQUAL:
        case TOKENIZER_TAG_EQUAL_ANGLE_BRACKET_RIGHT:
        case TOKENIZER_TAG_BANG_EQUAL:
        case TOKENIZER_TAG_L_PAREN:
        case TOKENIZER_TAG_R_PAREN:
        case TOKENIZER_TAG_SEMICOLON:
        case TOKENIZER_TAG_PERCENT:
        case TOKENIZER_TAG_PERCENT_EQUAL:
        case TOKENIZER_TAG_L_BRACE:
        case TOKENIZER_TAG_R_BRACE:
        case TOKENIZER_TAG_L_BRACKET:
        case TOKENIZER_TAG_R_BRACKET:
        case TOKENIZER_TAG_PERIOD:
        case TOKENIZER_TAG_PERIOD_ASTERISK:
        case TOKENIZER_TAG_ELLIPSIS2:
        case TOKENIZER_TAG_ELLIPSIS3:
        case TOKENIZER_TAG_CARET:
        case TOKENIZER_TAG_CARET_EQUAL:
        case TOKENIZER_TAG_PLUS:
        case TOKENIZER_TAG_PLUS_PLUS:
        case TOKENIZER_TAG_PLUS_EQUAL:
        case TOKENIZER_TAG_PLUS_PERCENT:
        case TOKENIZER_TAG_PLUS_PERCENT_EQUAL:
        case TOKENIZER_TAG_PLUS_PIPE:
        case TOKENIZER_TAG_PLUS_PIPE_EQUAL:
        case TOKENIZER_TAG_MINUS:
        case TOKENIZER_TAG_MINUS_EQUAL:
        case TOKENIZER_TAG_MINUS_PERCENT:
        case TOKENIZER_TAG_MINUS_PERCENT_EQUAL:
        case TOKENIZER_TAG_MINUS_PIPE:
        case TOKENIZER_TAG_MINUS_PIPE_EQUAL:
        case TOKENIZER_TAG_ASTERISK:
        case TOKENIZER_TAG_ASTERISK_EQUAL:
        case TOKENIZER_TAG_ASTERISK_ASTERISK:
        case TOKENIZER_TAG_ASTERISK_PERCENT:
        case TOKENIZER_TAG_ASTERISK_PERCENT_EQUAL:
        case TOKENIZER_TAG_ASTERISK_PIPE:
        case TOKENIZER_TAG_ASTERISK_PIPE_EQUAL:
        case TOKENIZER_TAG_ARROW:
        case TOKENIZER_TAG_COLON:
        case TOKENIZER_TAG_SLASH:
        case TOKENIZER_TAG_SLASH_EQUAL:
        case TOKENIZER_TAG_COMMA:
        case TOKENIZER_TAG_AMPERSAND:
        case TOKENIZER_TAG_AMPERSAND_EQUAL:
        case TOKENIZER_TAG_QUESTION_MARK:
        case TOKENIZER_TAG_ANGLE_BRACKET_LEFT:
        case TOKENIZER_TAG_ANGLE_BRACKET_LEFT_EQUAL:
        case TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT:
        case TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL:
        case TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE:
        case TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL:
        case TOKENIZER_TAG_ANGLE_BRACKET_RIGHT:
        case TOKENIZER_TAG_ANGLE_BRACKET_RIGHT_EQUAL:
        case TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT:
        case TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL:
        case TOKENIZER_TAG_TILDE:
        case TOKENIZER_TAG_NUMBER_LITERAL:
        case TOKENIZER_TAG_DOC_COMMENT:
        case TOKENIZER_TAG_CONTAINER_DOC_COMMENT:
        case TOKENIZER_TAG_KEYWORD_ADDRSPACE:
        case TOKENIZER_TAG_KEYWORD_ALIGN:
        case TOKENIZER_TAG_KEYWORD_ALLOWZERO:
        case TOKENIZER_TAG_KEYWORD_AND:
        case TOKENIZER_TAG_KEYWORD_ANYFRAME:
        case TOKENIZER_TAG_KEYWORD_ANYTYPE:
        case TOKENIZER_TAG_KEYWORD_ASM:
        case TOKENIZER_TAG_KEYWORD_ASYNC:
        case TOKENIZER_TAG_KEYWORD_AWAIT:
        case TOKENIZER_TAG_KEYWORD_BREAK:
        case TOKENIZER_TAG_KEYWORD_CALLCONV:
        case TOKENIZER_TAG_KEYWORD_CATCH:
        case TOKENIZER_TAG_KEYWORD_COMPTIME:
        case TOKENIZER_TAG_KEYWORD_CONST:
        case TOKENIZER_TAG_KEYWORD_CONTINUE:
        case TOKENIZER_TAG_KEYWORD_DEFER:
        case TOKENIZER_TAG_KEYWORD_ELSE:
        case TOKENIZER_TAG_KEYWORD_ENUM:
        case TOKENIZER_TAG_KEYWORD_ERRDEFER:
        case TOKENIZER_TAG_KEYWORD_ERROR:
        case TOKENIZER_TAG_KEYWORD_EXPORT:
        case TOKENIZER_TAG_KEYWORD_EXTERN:
        case TOKENIZER_TAG_KEYWORD_FN:
        case TOKENIZER_TAG_KEYWORD_FOR:
        case TOKENIZER_TAG_KEYWORD_IF:
        case TOKENIZER_TAG_KEYWORD_INLINE:
        case TOKENIZER_TAG_KEYWORD_NOALIAS:
        case TOKENIZER_TAG_KEYWORD_NOINLINE:
        case TOKENIZER_TAG_KEYWORD_NOSUSPEND:
        case TOKENIZER_TAG_KEYWORD_OPAQUE:
        case TOKENIZER_TAG_KEYWORD_OR:
        case TOKENIZER_TAG_KEYWORD_ORELSE:
        case TOKENIZER_TAG_KEYWORD_PACKED:
        case TOKENIZER_TAG_KEYWORD_RESUME:
        case TOKENIZER_TAG_KEYWORD_RETURN:
        case TOKENIZER_TAG_KEYWORD_LINKSECTION:
        case TOKENIZER_TAG_KEYWORD_STRUCT:
        case TOKENIZER_TAG_KEYWORD_SUSPEND:
        case TOKENIZER_TAG_KEYWORD_SWITCH:
        case TOKENIZER_TAG_KEYWORD_TEST:
        case TOKENIZER_TAG_KEYWORD_THREADLOCAL:
        case TOKENIZER_TAG_KEYWORD_TRY:
        case TOKENIZER_TAG_KEYWORD_UNION:
        case TOKENIZER_TAG_KEYWORD_UNREACHABLE:
        case TOKENIZER_TAG_KEYWORD_USINGNAMESPACE:
        case TOKENIZER_TAG_KEYWORD_VAR:
        case TOKENIZER_TAG_KEYWORD_VOLATILE:
        case TOKENIZER_TAG_KEYWORD_WHILE:
        default:; // default guards out-of-range values; `;` needed before a declaration pre-C23
            const char* str = tokenizerTagString[p->token_tags[p->tok_i]];
            fprintf(stderr, "keyword %s not implemented\n", str);
            goto cleanup;
        case TOKENIZER_TAG_KEYWORD_PUB:
            p->tok_i++;
            break;
            // TODO do work
        }
    }
cleanup:
    // Release anything this call pushed onto the parser's shared scratch.
    p->scratch.len = scratch_top;
    return res;
}
// Appends the root AST node (index 0) and will eventually parse the
// top-level container members into it. Returns 0 on success.
int parse_root(parser* p)
{
    // Fix: reserve the node index ONCE. The original incremented nodes.len
    // between the two stores (`tags[p->nodes.len++]` then
    // `main_tokens[p->nodes.len]`), writing tags[0] but main_tokens[1] --
    // the root's main token landed at the wrong index.
    const uint32_t root = p->nodes.len++;
    p->nodes.tags[root] = AST_NODE_TAG_ROOT;
    p->nodes.main_tokens[root] = 0;
    // members root_members = parse_container_members(p);
    return 0;
}

View File

@@ -9,22 +9,24 @@
typedef struct { typedef struct {
uint32_t len; uint32_t len;
uint32_t cap; uint32_t cap;
ast_node_index* arr; astNodeIndex* arr;
} parser_node_index_slice; } parserNodeIndexSlice;
typedef struct { typedef struct {
const char* source; const char* source;
const uint32_t source_len; const uint32_t source_len;
tokenizer_tag* token_tags; tokenizerTag* token_tags;
ast_index* token_starts; astIndex* token_starts;
uint32_t tokens_len; uint32_t tokens_len;
ast_token_index tok_i; astTokenIndex tok_i;
ast_node_list nodes; astNodeList nodes;
parser_node_index_slice extra_data; parserNodeIndexSlice extra_data;
parser_node_index_slice scratch; parserNodeIndexSlice scratch;
} parser; } parser;
int parse_root(parser*);
#endif #endif

View File

@@ -7,10 +7,10 @@
typedef struct { typedef struct {
const char* keyword; const char* keyword;
tokenizer_tag tag; tokenizerTag tag;
} keyword_map; } keywordMap;
const keyword_map keywords[] = { const keywordMap keywords[] = {
{ "addrspace", TOKENIZER_TAG_KEYWORD_ADDRSPACE }, { "addrspace", TOKENIZER_TAG_KEYWORD_ADDRSPACE },
{ "align", TOKENIZER_TAG_KEYWORD_ALIGN }, { "align", TOKENIZER_TAG_KEYWORD_ALIGN },
{ "allowzero", TOKENIZER_TAG_KEYWORD_ALLOWZERO }, { "allowzero", TOKENIZER_TAG_KEYWORD_ALLOWZERO },
@@ -63,9 +63,9 @@ const keyword_map keywords[] = {
}; };
// TODO binary search // TODO binary search
static tokenizer_tag get_keyword(const char* bytes, const uint32_t len) static tokenizerTag get_keyword(const char* bytes, const uint32_t len)
{ {
for (unsigned long i = 0; i < sizeof(keywords) / sizeof(keyword_map); i++) { for (unsigned long i = 0; i < sizeof(keywords) / sizeof(keywordMap); i++) {
size_t klen = strlen(keywords[i].keyword); size_t klen = strlen(keywords[i].keyword);
size_t minlen = klen < len ? klen : len; size_t minlen = klen < len ? klen : len;
int cmp = strncmp(bytes, keywords[i].keyword, minlen); int cmp = strncmp(bytes, keywords[i].keyword, minlen);
@@ -91,16 +91,16 @@ tokenizer tokenizer_init(const char* buffer, const uint32_t len)
}; };
} }
tokenizer_token tokenizer_next(tokenizer* self) tokenizerToken tokenizer_next(tokenizer* self)
{ {
tokenizer_token result = (tokenizer_token) { tokenizerToken result = (tokenizerToken) {
.tag = TOKENIZER_TAG_INVALID, .tag = TOKENIZER_TAG_INVALID,
.loc = { .loc = {
.start = 0, .start = 0,
}, },
}; };
tokenizer_state state = TOKENIZER_STATE_START; tokenizerState state = TOKENIZER_STATE_START;
state: state:
switch (state) { switch (state) {
@@ -108,7 +108,7 @@ state:
switch (self->buffer[self->index]) { switch (self->buffer[self->index]) {
case 0: case 0:
if (self->index == self->buffer_len) { if (self->index == self->buffer_len) {
return (tokenizer_token) { return (tokenizerToken) {
.tag = TOKENIZER_TAG_EOF, .tag = TOKENIZER_TAG_EOF,
.loc = { .loc = {
.start = self->index, .start = self->index,
@@ -455,7 +455,7 @@ state:
default:; // Once we're at C23, this semicolon can be removed. default:; // Once we're at C23, this semicolon can be removed.
const char* start = self->buffer + result.loc.start; const char* start = self->buffer + result.loc.start;
uint32_t len = self->index - result.loc.start; uint32_t len = self->index - result.loc.start;
tokenizer_tag tag = get_keyword(start, len); tokenizerTag tag = get_keyword(start, len);
if (tag != TOKENIZER_TAG_INVALID) { if (tag != TOKENIZER_TAG_INVALID) {
result.tag = tag; result.tag = tag;
} }
@@ -856,7 +856,7 @@ state:
state = TOKENIZER_STATE_INVALID; state = TOKENIZER_STATE_INVALID;
goto state; goto state;
} else { } else {
return (tokenizer_token) { return (tokenizerToken) {
.tag = TOKENIZER_TAG_EOF, .tag = TOKENIZER_TAG_EOF,
.loc = { .loc = {
.start = self->index, .start = self->index,
@@ -930,7 +930,7 @@ state:
state = TOKENIZER_STATE_INVALID; state = TOKENIZER_STATE_INVALID;
goto state; goto state;
} else { } else {
return (tokenizer_token) { return (tokenizerToken) {
.tag = TOKENIZER_TAG_EOF, .tag = TOKENIZER_TAG_EOF,
.loc = { .loc = {
.start = self->index, .start = self->index,

View File

@@ -4,130 +4,140 @@
#include <stdbool.h> #include <stdbool.h>
#include <stdint.h> #include <stdint.h>
#define FOREACH_TOKENIZER_TAG_ENUM(TAG) \
TAG(TOKENIZER_TAG_INVALID) \
TAG(TOKENIZER_TAG_INVALID_PERIODASTERISKS) \
TAG(TOKENIZER_TAG_IDENTIFIER) \
TAG(TOKENIZER_TAG_STRING_LITERAL) \
TAG(TOKENIZER_TAG_MULTILINE_STRING_LITERAL_LINE) \
TAG(TOKENIZER_TAG_CHAR_LITERAL) \
TAG(TOKENIZER_TAG_EOF) \
TAG(TOKENIZER_TAG_BUILTIN) \
TAG(TOKENIZER_TAG_BANG) \
TAG(TOKENIZER_TAG_PIPE) \
TAG(TOKENIZER_TAG_PIPE_PIPE) \
TAG(TOKENIZER_TAG_PIPE_EQUAL) \
TAG(TOKENIZER_TAG_EQUAL) \
TAG(TOKENIZER_TAG_EQUAL_EQUAL) \
TAG(TOKENIZER_TAG_EQUAL_ANGLE_BRACKET_RIGHT) \
TAG(TOKENIZER_TAG_BANG_EQUAL) \
TAG(TOKENIZER_TAG_L_PAREN) \
TAG(TOKENIZER_TAG_R_PAREN) \
TAG(TOKENIZER_TAG_SEMICOLON) \
TAG(TOKENIZER_TAG_PERCENT) \
TAG(TOKENIZER_TAG_PERCENT_EQUAL) \
TAG(TOKENIZER_TAG_L_BRACE) \
TAG(TOKENIZER_TAG_R_BRACE) \
TAG(TOKENIZER_TAG_L_BRACKET) \
TAG(TOKENIZER_TAG_R_BRACKET) \
TAG(TOKENIZER_TAG_PERIOD) \
TAG(TOKENIZER_TAG_PERIOD_ASTERISK) \
TAG(TOKENIZER_TAG_ELLIPSIS2) \
TAG(TOKENIZER_TAG_ELLIPSIS3) \
TAG(TOKENIZER_TAG_CARET) \
TAG(TOKENIZER_TAG_CARET_EQUAL) \
TAG(TOKENIZER_TAG_PLUS) \
TAG(TOKENIZER_TAG_PLUS_PLUS) \
TAG(TOKENIZER_TAG_PLUS_EQUAL) \
TAG(TOKENIZER_TAG_PLUS_PERCENT) \
TAG(TOKENIZER_TAG_PLUS_PERCENT_EQUAL) \
TAG(TOKENIZER_TAG_PLUS_PIPE) \
TAG(TOKENIZER_TAG_PLUS_PIPE_EQUAL) \
TAG(TOKENIZER_TAG_MINUS) \
TAG(TOKENIZER_TAG_MINUS_EQUAL) \
TAG(TOKENIZER_TAG_MINUS_PERCENT) \
TAG(TOKENIZER_TAG_MINUS_PERCENT_EQUAL) \
TAG(TOKENIZER_TAG_MINUS_PIPE) \
TAG(TOKENIZER_TAG_MINUS_PIPE_EQUAL) \
TAG(TOKENIZER_TAG_ASTERISK) \
TAG(TOKENIZER_TAG_ASTERISK_EQUAL) \
TAG(TOKENIZER_TAG_ASTERISK_ASTERISK) \
TAG(TOKENIZER_TAG_ASTERISK_PERCENT) \
TAG(TOKENIZER_TAG_ASTERISK_PERCENT_EQUAL) \
TAG(TOKENIZER_TAG_ASTERISK_PIPE) \
TAG(TOKENIZER_TAG_ASTERISK_PIPE_EQUAL) \
TAG(TOKENIZER_TAG_ARROW) \
TAG(TOKENIZER_TAG_COLON) \
TAG(TOKENIZER_TAG_SLASH) \
TAG(TOKENIZER_TAG_SLASH_EQUAL) \
TAG(TOKENIZER_TAG_COMMA) \
TAG(TOKENIZER_TAG_AMPERSAND) \
TAG(TOKENIZER_TAG_AMPERSAND_EQUAL) \
TAG(TOKENIZER_TAG_QUESTION_MARK) \
TAG(TOKENIZER_TAG_ANGLE_BRACKET_LEFT) \
TAG(TOKENIZER_TAG_ANGLE_BRACKET_LEFT_EQUAL) \
TAG(TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT) \
TAG(TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL) \
TAG(TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE) \
TAG(TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL) \
TAG(TOKENIZER_TAG_ANGLE_BRACKET_RIGHT) \
TAG(TOKENIZER_TAG_ANGLE_BRACKET_RIGHT_EQUAL) \
TAG(TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT) \
TAG(TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL) \
TAG(TOKENIZER_TAG_TILDE) \
TAG(TOKENIZER_TAG_NUMBER_LITERAL) \
TAG(TOKENIZER_TAG_DOC_COMMENT) \
TAG(TOKENIZER_TAG_CONTAINER_DOC_COMMENT) \
TAG(TOKENIZER_TAG_KEYWORD_ADDRSPACE) \
TAG(TOKENIZER_TAG_KEYWORD_ALIGN) \
TAG(TOKENIZER_TAG_KEYWORD_ALLOWZERO) \
TAG(TOKENIZER_TAG_KEYWORD_AND) \
TAG(TOKENIZER_TAG_KEYWORD_ANYFRAME) \
TAG(TOKENIZER_TAG_KEYWORD_ANYTYPE) \
TAG(TOKENIZER_TAG_KEYWORD_ASM) \
TAG(TOKENIZER_TAG_KEYWORD_ASYNC) \
TAG(TOKENIZER_TAG_KEYWORD_AWAIT) \
TAG(TOKENIZER_TAG_KEYWORD_BREAK) \
TAG(TOKENIZER_TAG_KEYWORD_CALLCONV) \
TAG(TOKENIZER_TAG_KEYWORD_CATCH) \
TAG(TOKENIZER_TAG_KEYWORD_COMPTIME) \
TAG(TOKENIZER_TAG_KEYWORD_CONST) \
TAG(TOKENIZER_TAG_KEYWORD_CONTINUE) \
TAG(TOKENIZER_TAG_KEYWORD_DEFER) \
TAG(TOKENIZER_TAG_KEYWORD_ELSE) \
TAG(TOKENIZER_TAG_KEYWORD_ENUM) \
TAG(TOKENIZER_TAG_KEYWORD_ERRDEFER) \
TAG(TOKENIZER_TAG_KEYWORD_ERROR) \
TAG(TOKENIZER_TAG_KEYWORD_EXPORT) \
TAG(TOKENIZER_TAG_KEYWORD_EXTERN) \
TAG(TOKENIZER_TAG_KEYWORD_FN) \
TAG(TOKENIZER_TAG_KEYWORD_FOR) \
TAG(TOKENIZER_TAG_KEYWORD_IF) \
TAG(TOKENIZER_TAG_KEYWORD_INLINE) \
TAG(TOKENIZER_TAG_KEYWORD_NOALIAS) \
TAG(TOKENIZER_TAG_KEYWORD_NOINLINE) \
TAG(TOKENIZER_TAG_KEYWORD_NOSUSPEND) \
TAG(TOKENIZER_TAG_KEYWORD_OPAQUE) \
TAG(TOKENIZER_TAG_KEYWORD_OR) \
TAG(TOKENIZER_TAG_KEYWORD_ORELSE) \
TAG(TOKENIZER_TAG_KEYWORD_PACKED) \
TAG(TOKENIZER_TAG_KEYWORD_PUB) \
TAG(TOKENIZER_TAG_KEYWORD_RESUME) \
TAG(TOKENIZER_TAG_KEYWORD_RETURN) \
TAG(TOKENIZER_TAG_KEYWORD_LINKSECTION) \
TAG(TOKENIZER_TAG_KEYWORD_STRUCT) \
TAG(TOKENIZER_TAG_KEYWORD_SUSPEND) \
TAG(TOKENIZER_TAG_KEYWORD_SWITCH) \
TAG(TOKENIZER_TAG_KEYWORD_TEST) \
TAG(TOKENIZER_TAG_KEYWORD_THREADLOCAL) \
TAG(TOKENIZER_TAG_KEYWORD_TRY) \
TAG(TOKENIZER_TAG_KEYWORD_UNION) \
TAG(TOKENIZER_TAG_KEYWORD_UNREACHABLE) \
TAG(TOKENIZER_TAG_KEYWORD_USINGNAMESPACE) \
TAG(TOKENIZER_TAG_KEYWORD_VAR) \
TAG(TOKENIZER_TAG_KEYWORD_VOLATILE) \
TAG(TOKENIZER_TAG_KEYWORD_WHILE) \
#define GENERATE_ENUM(ENUM) ENUM,
#define GENERATE_STRING(STRING) #STRING,
typedef enum { typedef enum {
TOKENIZER_TAG_INVALID, FOREACH_TOKENIZER_TAG_ENUM(GENERATE_ENUM)
TOKENIZER_TAG_INVALID_PERIODASTERISKS, } tokenizerTag;
TOKENIZER_TAG_IDENTIFIER,
TOKENIZER_TAG_STRING_LITERAL, static const char *tokenizerTagString[] = {
TOKENIZER_TAG_MULTILINE_STRING_LITERAL_LINE, FOREACH_TOKENIZER_TAG_ENUM(GENERATE_STRING)
TOKENIZER_TAG_CHAR_LITERAL, };
TOKENIZER_TAG_EOF,
TOKENIZER_TAG_BUILTIN,
TOKENIZER_TAG_BANG,
TOKENIZER_TAG_PIPE,
TOKENIZER_TAG_PIPE_PIPE,
TOKENIZER_TAG_PIPE_EQUAL,
TOKENIZER_TAG_EQUAL,
TOKENIZER_TAG_EQUAL_EQUAL,
TOKENIZER_TAG_EQUAL_ANGLE_BRACKET_RIGHT,
TOKENIZER_TAG_BANG_EQUAL,
TOKENIZER_TAG_L_PAREN,
TOKENIZER_TAG_R_PAREN,
TOKENIZER_TAG_SEMICOLON,
TOKENIZER_TAG_PERCENT,
TOKENIZER_TAG_PERCENT_EQUAL,
TOKENIZER_TAG_L_BRACE,
TOKENIZER_TAG_R_BRACE,
TOKENIZER_TAG_L_BRACKET,
TOKENIZER_TAG_R_BRACKET,
TOKENIZER_TAG_PERIOD,
TOKENIZER_TAG_PERIOD_ASTERISK,
TOKENIZER_TAG_ELLIPSIS2,
TOKENIZER_TAG_ELLIPSIS3,
TOKENIZER_TAG_CARET,
TOKENIZER_TAG_CARET_EQUAL,
TOKENIZER_TAG_PLUS,
TOKENIZER_TAG_PLUS_PLUS,
TOKENIZER_TAG_PLUS_EQUAL,
TOKENIZER_TAG_PLUS_PERCENT,
TOKENIZER_TAG_PLUS_PERCENT_EQUAL,
TOKENIZER_TAG_PLUS_PIPE,
TOKENIZER_TAG_PLUS_PIPE_EQUAL,
TOKENIZER_TAG_MINUS,
TOKENIZER_TAG_MINUS_EQUAL,
TOKENIZER_TAG_MINUS_PERCENT,
TOKENIZER_TAG_MINUS_PERCENT_EQUAL,
TOKENIZER_TAG_MINUS_PIPE,
TOKENIZER_TAG_MINUS_PIPE_EQUAL,
TOKENIZER_TAG_ASTERISK,
TOKENIZER_TAG_ASTERISK_EQUAL,
TOKENIZER_TAG_ASTERISK_ASTERISK,
TOKENIZER_TAG_ASTERISK_PERCENT,
TOKENIZER_TAG_ASTERISK_PERCENT_EQUAL,
TOKENIZER_TAG_ASTERISK_PIPE,
TOKENIZER_TAG_ASTERISK_PIPE_EQUAL,
TOKENIZER_TAG_ARROW,
TOKENIZER_TAG_COLON,
TOKENIZER_TAG_SLASH,
TOKENIZER_TAG_SLASH_EQUAL,
TOKENIZER_TAG_COMMA,
TOKENIZER_TAG_AMPERSAND,
TOKENIZER_TAG_AMPERSAND_EQUAL,
TOKENIZER_TAG_QUESTION_MARK,
TOKENIZER_TAG_ANGLE_BRACKET_LEFT,
TOKENIZER_TAG_ANGLE_BRACKET_LEFT_EQUAL,
TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT,
TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL,
TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE,
TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL,
TOKENIZER_TAG_ANGLE_BRACKET_RIGHT,
TOKENIZER_TAG_ANGLE_BRACKET_RIGHT_EQUAL,
TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT,
TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL,
TOKENIZER_TAG_TILDE,
TOKENIZER_TAG_NUMBER_LITERAL,
TOKENIZER_TAG_DOC_COMMENT,
TOKENIZER_TAG_CONTAINER_DOC_COMMENT,
TOKENIZER_TAG_KEYWORD_ADDRSPACE,
TOKENIZER_TAG_KEYWORD_ALIGN,
TOKENIZER_TAG_KEYWORD_ALLOWZERO,
TOKENIZER_TAG_KEYWORD_AND,
TOKENIZER_TAG_KEYWORD_ANYFRAME,
TOKENIZER_TAG_KEYWORD_ANYTYPE,
TOKENIZER_TAG_KEYWORD_ASM,
TOKENIZER_TAG_KEYWORD_ASYNC,
TOKENIZER_TAG_KEYWORD_AWAIT,
TOKENIZER_TAG_KEYWORD_BREAK,
TOKENIZER_TAG_KEYWORD_CALLCONV,
TOKENIZER_TAG_KEYWORD_CATCH,
TOKENIZER_TAG_KEYWORD_COMPTIME,
TOKENIZER_TAG_KEYWORD_CONST,
TOKENIZER_TAG_KEYWORD_CONTINUE,
TOKENIZER_TAG_KEYWORD_DEFER,
TOKENIZER_TAG_KEYWORD_ELSE,
TOKENIZER_TAG_KEYWORD_ENUM,
TOKENIZER_TAG_KEYWORD_ERRDEFER,
TOKENIZER_TAG_KEYWORD_ERROR,
TOKENIZER_TAG_KEYWORD_EXPORT,
TOKENIZER_TAG_KEYWORD_EXTERN,
TOKENIZER_TAG_KEYWORD_FN,
TOKENIZER_TAG_KEYWORD_FOR,
TOKENIZER_TAG_KEYWORD_IF,
TOKENIZER_TAG_KEYWORD_INLINE,
TOKENIZER_TAG_KEYWORD_NOALIAS,
TOKENIZER_TAG_KEYWORD_NOINLINE,
TOKENIZER_TAG_KEYWORD_NOSUSPEND,
TOKENIZER_TAG_KEYWORD_OPAQUE,
TOKENIZER_TAG_KEYWORD_OR,
TOKENIZER_TAG_KEYWORD_ORELSE,
TOKENIZER_TAG_KEYWORD_PACKED,
TOKENIZER_TAG_KEYWORD_PUB,
TOKENIZER_TAG_KEYWORD_RESUME,
TOKENIZER_TAG_KEYWORD_RETURN,
TOKENIZER_TAG_KEYWORD_LINKSECTION,
TOKENIZER_TAG_KEYWORD_STRUCT,
TOKENIZER_TAG_KEYWORD_SUSPEND,
TOKENIZER_TAG_KEYWORD_SWITCH,
TOKENIZER_TAG_KEYWORD_TEST,
TOKENIZER_TAG_KEYWORD_THREADLOCAL,
TOKENIZER_TAG_KEYWORD_TRY,
TOKENIZER_TAG_KEYWORD_UNION,
TOKENIZER_TAG_KEYWORD_UNREACHABLE,
TOKENIZER_TAG_KEYWORD_USINGNAMESPACE,
TOKENIZER_TAG_KEYWORD_VAR,
TOKENIZER_TAG_KEYWORD_VOLATILE,
TOKENIZER_TAG_KEYWORD_WHILE,
} tokenizer_tag;
typedef enum { typedef enum {
TOKENIZER_STATE_START, TOKENIZER_STATE_START,
@@ -175,14 +185,14 @@ typedef enum {
TOKENIZER_STATE_PERIOD_ASTERISK, TOKENIZER_STATE_PERIOD_ASTERISK,
TOKENIZER_STATE_SAW_AT_SIGN, TOKENIZER_STATE_SAW_AT_SIGN,
TOKENIZER_STATE_INVALID, TOKENIZER_STATE_INVALID,
} tokenizer_state; } tokenizerState;
typedef struct { typedef struct {
tokenizer_tag tag; tokenizerTag tag;
struct { struct {
uint32_t start, end; uint32_t start, end;
} loc; } loc;
} tokenizer_token; } tokenizerToken;
typedef struct { typedef struct {
const char* buffer; const char* buffer;
@@ -191,6 +201,6 @@ typedef struct {
} tokenizer; } tokenizer;
tokenizer tokenizer_init(const char* buffer, uint32_t len); tokenizer tokenizer_init(const char* buffer, uint32_t len);
tokenizer_token tokenizer_next(tokenizer* self); tokenizerToken tokenizer_next(tokenizer* self);
#endif #endif