zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

commit 1fb308ceeea0259ad021d67945ea5adc10960a85 (tree)
parent 3919afcad26d2359efe52f98cd4f2f0573527369
Author: Andrew Kelley <superjoe30@gmail.com>
Date:   Fri,  9 Feb 2018 13:08:02 -0500

self hosted compiler: move tokenization and parsing to std lib

Diffstat:
M CMakeLists.txt | 4 ++++
M build.zig | 4 ----
D src-self-hosted/ast.zig | 271 -------------------------------------------------------------------------------
M src-self-hosted/main.zig | 5 -----
M src-self-hosted/module.zig | 6 +++---
D src-self-hosted/parser.zig | 1160 -------------------------------------------------------------------------------
D src-self-hosted/tokenizer.zig | 659 -------------------------------------------------------------------------------
M std/index.zig | 2 ++
A std/zig/ast.zig | 271 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A std/zig/index.zig | 11 +++++++++++
A std/zig/parser.zig | 1160 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A std/zig/tokenizer.zig | 659 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
12 files changed, 2110 insertions(+), 2102 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -477,6 +477,10 @@ set(ZIG_STD_FILES "special/panic.zig" "special/test_runner.zig" "unicode.zig" + "zig/ast.zig" + "zig/index.zig" + "zig/parser.zig" + "zig/tokenizer.zig" ) set(ZIG_C_HEADER_FILES diff --git a/build.zig b/build.zig @@ -108,10 +108,6 @@ pub fn build(b: &Builder) !void { "std/special/compiler_rt/index.zig", "compiler-rt", "Run the compiler_rt tests", with_lldb)); - test_step.dependOn(tests.addPkgTests(b, test_filter, - "src-self-hosted/main.zig", "fmt", "Run the fmt tests", - with_lldb)); - test_step.dependOn(tests.addCompareOutputTests(b, test_filter)); test_step.dependOn(tests.addBuildExampleTests(b, test_filter)); test_step.dependOn(tests.addCompileErrorTests(b, test_filter)); diff --git a/src-self-hosted/ast.zig b/src-self-hosted/ast.zig @@ -1,271 +0,0 @@ -const std = @import("std"); -const assert = std.debug.assert; -const ArrayList = std.ArrayList; -const Token = @import("tokenizer.zig").Token; -const mem = std.mem; - -pub const Node = struct { - id: Id, - - pub const Id = enum { - Root, - VarDecl, - Identifier, - FnProto, - ParamDecl, - Block, - InfixOp, - PrefixOp, - IntegerLiteral, - FloatLiteral, - }; - - pub fn iterate(base: &Node, index: usize) ?&Node { - return switch (base.id) { - Id.Root => @fieldParentPtr(NodeRoot, "base", base).iterate(index), - Id.VarDecl => @fieldParentPtr(NodeVarDecl, "base", base).iterate(index), - Id.Identifier => @fieldParentPtr(NodeIdentifier, "base", base).iterate(index), - Id.FnProto => @fieldParentPtr(NodeFnProto, "base", base).iterate(index), - Id.ParamDecl => @fieldParentPtr(NodeParamDecl, "base", base).iterate(index), - Id.Block => @fieldParentPtr(NodeBlock, "base", base).iterate(index), - Id.InfixOp => @fieldParentPtr(NodeInfixOp, "base", base).iterate(index), - Id.PrefixOp => @fieldParentPtr(NodePrefixOp, "base", base).iterate(index), - Id.IntegerLiteral => @fieldParentPtr(NodeIntegerLiteral, "base", base).iterate(index), - Id.FloatLiteral => 
@fieldParentPtr(NodeFloatLiteral, "base", base).iterate(index), - }; - } - - pub fn destroy(base: &Node, allocator: &mem.Allocator) void { - return switch (base.id) { - Id.Root => allocator.destroy(@fieldParentPtr(NodeRoot, "base", base)), - Id.VarDecl => allocator.destroy(@fieldParentPtr(NodeVarDecl, "base", base)), - Id.Identifier => allocator.destroy(@fieldParentPtr(NodeIdentifier, "base", base)), - Id.FnProto => allocator.destroy(@fieldParentPtr(NodeFnProto, "base", base)), - Id.ParamDecl => allocator.destroy(@fieldParentPtr(NodeParamDecl, "base", base)), - Id.Block => allocator.destroy(@fieldParentPtr(NodeBlock, "base", base)), - Id.InfixOp => allocator.destroy(@fieldParentPtr(NodeInfixOp, "base", base)), - Id.PrefixOp => allocator.destroy(@fieldParentPtr(NodePrefixOp, "base", base)), - Id.IntegerLiteral => allocator.destroy(@fieldParentPtr(NodeIntegerLiteral, "base", base)), - Id.FloatLiteral => allocator.destroy(@fieldParentPtr(NodeFloatLiteral, "base", base)), - }; - } -}; - -pub const NodeRoot = struct { - base: Node, - decls: ArrayList(&Node), - - pub fn iterate(self: &NodeRoot, index: usize) ?&Node { - if (index < self.decls.len) { - return self.decls.items[self.decls.len - index - 1]; - } - return null; - } -}; - -pub const NodeVarDecl = struct { - base: Node, - visib_token: ?Token, - name_token: Token, - eq_token: Token, - mut_token: Token, - comptime_token: ?Token, - extern_token: ?Token, - lib_name: ?&Node, - type_node: ?&Node, - align_node: ?&Node, - init_node: ?&Node, - - pub fn iterate(self: &NodeVarDecl, index: usize) ?&Node { - var i = index; - - if (self.type_node) |type_node| { - if (i < 1) return type_node; - i -= 1; - } - - if (self.align_node) |align_node| { - if (i < 1) return align_node; - i -= 1; - } - - if (self.init_node) |init_node| { - if (i < 1) return init_node; - i -= 1; - } - - return null; - } -}; - -pub const NodeIdentifier = struct { - base: Node, - name_token: Token, - - pub fn iterate(self: &NodeIdentifier, index: usize) 
?&Node { - return null; - } -}; - -pub const NodeFnProto = struct { - base: Node, - visib_token: ?Token, - fn_token: Token, - name_token: ?Token, - params: ArrayList(&Node), - return_type: &Node, - var_args_token: ?Token, - extern_token: ?Token, - inline_token: ?Token, - cc_token: ?Token, - body_node: ?&Node, - lib_name: ?&Node, // populated if this is an extern declaration - align_expr: ?&Node, // populated if align(A) is present - - pub fn iterate(self: &NodeFnProto, index: usize) ?&Node { - var i = index; - - if (self.body_node) |body_node| { - if (i < 1) return body_node; - i -= 1; - } - - if (i < 1) return self.return_type; - i -= 1; - - if (self.align_expr) |align_expr| { - if (i < 1) return align_expr; - i -= 1; - } - - if (i < self.params.len) return self.params.items[self.params.len - i - 1]; - i -= self.params.len; - - if (self.lib_name) |lib_name| { - if (i < 1) return lib_name; - i -= 1; - } - - return null; - } -}; - -pub const NodeParamDecl = struct { - base: Node, - comptime_token: ?Token, - noalias_token: ?Token, - name_token: ?Token, - type_node: &Node, - var_args_token: ?Token, - - pub fn iterate(self: &NodeParamDecl, index: usize) ?&Node { - var i = index; - - if (i < 1) return self.type_node; - i -= 1; - - return null; - } -}; - -pub const NodeBlock = struct { - base: Node, - begin_token: Token, - end_token: Token, - statements: ArrayList(&Node), - - pub fn iterate(self: &NodeBlock, index: usize) ?&Node { - var i = index; - - if (i < self.statements.len) return self.statements.items[i]; - i -= self.statements.len; - - return null; - } -}; - -pub const NodeInfixOp = struct { - base: Node, - op_token: Token, - lhs: &Node, - op: InfixOp, - rhs: &Node, - - const InfixOp = enum { - EqualEqual, - BangEqual, - }; - - pub fn iterate(self: &NodeInfixOp, index: usize) ?&Node { - var i = index; - - if (i < 1) return self.lhs; - i -= 1; - - switch (self.op) { - InfixOp.EqualEqual => {}, - InfixOp.BangEqual => {}, - } - - if (i < 1) return self.rhs; - i -= 
1; - - return null; - } -}; - -pub const NodePrefixOp = struct { - base: Node, - op_token: Token, - op: PrefixOp, - rhs: &Node, - - const PrefixOp = union(enum) { - Return, - AddrOf: AddrOfInfo, - }; - const AddrOfInfo = struct { - align_expr: ?&Node, - bit_offset_start_token: ?Token, - bit_offset_end_token: ?Token, - const_token: ?Token, - volatile_token: ?Token, - }; - - pub fn iterate(self: &NodePrefixOp, index: usize) ?&Node { - var i = index; - - switch (self.op) { - PrefixOp.Return => {}, - PrefixOp.AddrOf => |addr_of_info| { - if (addr_of_info.align_expr) |align_expr| { - if (i < 1) return align_expr; - i -= 1; - } - }, - } - - if (i < 1) return self.rhs; - i -= 1; - - return null; - } -}; - -pub const NodeIntegerLiteral = struct { - base: Node, - token: Token, - - pub fn iterate(self: &NodeIntegerLiteral, index: usize) ?&Node { - return null; - } -}; - -pub const NodeFloatLiteral = struct { - base: Node, - token: Token, - - pub fn iterate(self: &NodeFloatLiteral, index: usize) ?&Node { - return null; - } -}; diff --git a/src-self-hosted/main.zig b/src-self-hosted/main.zig @@ -622,8 +622,3 @@ fn findZigLibDir(allocator: &mem.Allocator) ![]u8 { return error.FileNotFound; } - -test "import tests" { - _ = @import("tokenizer.zig"); - _ = @import("parser.zig"); -} diff --git a/src-self-hosted/module.zig b/src-self-hosted/module.zig @@ -8,9 +8,9 @@ const c = @import("c.zig"); const builtin = @import("builtin"); const Target = @import("target.zig").Target; const warn = std.debug.warn; -const Tokenizer = @import("tokenizer.zig").Tokenizer; -const Token = @import("tokenizer.zig").Token; -const Parser = @import("parser.zig").Parser; +const Tokenizer = std.zig.Tokenizer; +const Token = std.zig.Token; +const Parser = std.zig.Parser; const ArrayList = std.ArrayList; pub const Module = struct { diff --git a/src-self-hosted/parser.zig b/src-self-hosted/parser.zig @@ -1,1160 +0,0 @@ -const std = @import("std"); -const assert = std.debug.assert; -const ArrayList = 
std.ArrayList; -const mem = std.mem; -const ast = @import("ast.zig"); -const Tokenizer = @import("tokenizer.zig").Tokenizer; -const Token = @import("tokenizer.zig").Token; -const builtin = @import("builtin"); -const io = std.io; - -// TODO when we make parse errors into error types instead of printing directly, -// get rid of this -const warn = std.debug.warn; - -pub const Parser = struct { - allocator: &mem.Allocator, - tokenizer: &Tokenizer, - put_back_tokens: [2]Token, - put_back_count: usize, - source_file_name: []const u8, - - pub const Tree = struct { - root_node: &ast.NodeRoot, - - pub fn deinit(self: &const Tree) void { - // TODO free the whole arena - } - }; - - // This memory contents are used only during a function call. It's used to repurpose memory; - // we reuse the same bytes for the stack data structure used by parsing, tree rendering, and - // source rendering. - const utility_bytes_align = @alignOf( union { a: RenderAstFrame, b: State, c: RenderState } ); - utility_bytes: []align(utility_bytes_align) u8, - - /// `allocator` should be an arena allocator. Parser never calls free on anything. After you're - /// done with a Parser, free the arena. After the arena is freed, no member functions of Parser - /// may be called. 
- pub fn init(tokenizer: &Tokenizer, allocator: &mem.Allocator, source_file_name: []const u8) Parser { - return Parser { - .allocator = allocator, - .tokenizer = tokenizer, - .put_back_tokens = undefined, - .put_back_count = 0, - .source_file_name = source_file_name, - .utility_bytes = []align(utility_bytes_align) u8{}, - }; - } - - pub fn deinit(self: &Parser) void { - self.allocator.free(self.utility_bytes); - } - - const TopLevelDeclCtx = struct { - visib_token: ?Token, - extern_token: ?Token, - }; - - const DestPtr = union(enum) { - Field: &&ast.Node, - NullableField: &?&ast.Node, - List: &ArrayList(&ast.Node), - - pub fn store(self: &const DestPtr, value: &ast.Node) !void { - switch (*self) { - DestPtr.Field => |ptr| *ptr = value, - DestPtr.NullableField => |ptr| *ptr = value, - DestPtr.List => |list| try list.append(value), - } - } - }; - - const State = union(enum) { - TopLevel, - TopLevelExtern: ?Token, - TopLevelDecl: TopLevelDeclCtx, - Expression: DestPtr, - ExpectOperand, - Operand: &ast.Node, - AfterOperand, - InfixOp: &ast.NodeInfixOp, - PrefixOp: &ast.NodePrefixOp, - AddrOfModifiers: &ast.NodePrefixOp.AddrOfInfo, - TypeExpr: DestPtr, - VarDecl: &ast.NodeVarDecl, - VarDeclAlign: &ast.NodeVarDecl, - VarDeclEq: &ast.NodeVarDecl, - ExpectToken: @TagType(Token.Id), - FnProto: &ast.NodeFnProto, - FnProtoAlign: &ast.NodeFnProto, - ParamDecl: &ast.NodeFnProto, - ParamDeclComma, - FnDef: &ast.NodeFnProto, - Block: &ast.NodeBlock, - Statement: &ast.NodeBlock, - }; - - /// Returns an AST tree, allocated with the parser's allocator. - /// Result should be freed with `freeAst` when done. 
- pub fn parse(self: &Parser) !Tree { - var stack = self.initUtilityArrayList(State); - defer self.deinitUtilityArrayList(stack); - - const root_node = try self.createRoot(); - // TODO errdefer arena free root node - - try stack.append(State.TopLevel); - - while (true) { - //{ - // const token = self.getNextToken(); - // warn("{} ", @tagName(token.id)); - // self.putBackToken(token); - // var i: usize = stack.len; - // while (i != 0) { - // i -= 1; - // warn("{} ", @tagName(stack.items[i])); - // } - // warn("\n"); - //} - - // This gives us 1 free append that can't fail - const state = stack.pop(); - - switch (state) { - State.TopLevel => { - const token = self.getNextToken(); - switch (token.id) { - Token.Id.Keyword_pub, Token.Id.Keyword_export => { - stack.append(State { .TopLevelExtern = token }) catch unreachable; - continue; - }, - Token.Id.Eof => return Tree {.root_node = root_node}, - else => { - self.putBackToken(token); - // TODO shouldn't need this cast - stack.append(State { .TopLevelExtern = null }) catch unreachable; - continue; - }, - } - }, - State.TopLevelExtern => |visib_token| { - const token = self.getNextToken(); - if (token.id == Token.Id.Keyword_extern) { - stack.append(State { - .TopLevelDecl = TopLevelDeclCtx { - .visib_token = visib_token, - .extern_token = token, - }, - }) catch unreachable; - continue; - } - self.putBackToken(token); - stack.append(State { - .TopLevelDecl = TopLevelDeclCtx { - .visib_token = visib_token, - .extern_token = null, - }, - }) catch unreachable; - continue; - }, - State.TopLevelDecl => |ctx| { - const token = self.getNextToken(); - switch (token.id) { - Token.Id.Keyword_var, Token.Id.Keyword_const => { - stack.append(State.TopLevel) catch unreachable; - // TODO shouldn't need these casts - const var_decl_node = try self.createAttachVarDecl(&root_node.decls, ctx.visib_token, - token, (?Token)(null), ctx.extern_token); - try stack.append(State { .VarDecl = var_decl_node }); - continue; - }, - Token.Id.Keyword_fn 
=> { - stack.append(State.TopLevel) catch unreachable; - // TODO shouldn't need these casts - const fn_proto = try self.createAttachFnProto(&root_node.decls, token, - ctx.extern_token, (?Token)(null), (?Token)(null), (?Token)(null)); - try stack.append(State { .FnDef = fn_proto }); - try stack.append(State { .FnProto = fn_proto }); - continue; - }, - Token.Id.StringLiteral => { - @panic("TODO extern with string literal"); - }, - Token.Id.Keyword_nakedcc, Token.Id.Keyword_stdcallcc => { - stack.append(State.TopLevel) catch unreachable; - const fn_token = try self.eatToken(Token.Id.Keyword_fn); - // TODO shouldn't need this cast - const fn_proto = try self.createAttachFnProto(&root_node.decls, fn_token, - ctx.extern_token, (?Token)(token), (?Token)(null), (?Token)(null)); - try stack.append(State { .FnDef = fn_proto }); - try stack.append(State { .FnProto = fn_proto }); - continue; - }, - else => return self.parseError(token, "expected variable declaration or function, found {}", @tagName(token.id)), - } - }, - State.VarDecl => |var_decl| { - var_decl.name_token = try self.eatToken(Token.Id.Identifier); - stack.append(State { .VarDeclAlign = var_decl }) catch unreachable; - - const next_token = self.getNextToken(); - if (next_token.id == Token.Id.Colon) { - try stack.append(State { .TypeExpr = DestPtr {.NullableField = &var_decl.type_node} }); - continue; - } - - self.putBackToken(next_token); - continue; - }, - State.VarDeclAlign => |var_decl| { - stack.append(State { .VarDeclEq = var_decl }) catch unreachable; - - const next_token = self.getNextToken(); - if (next_token.id == Token.Id.Keyword_align) { - _ = try self.eatToken(Token.Id.LParen); - try stack.append(State { .ExpectToken = Token.Id.RParen }); - try stack.append(State { .Expression = DestPtr{.NullableField = &var_decl.align_node} }); - continue; - } - - self.putBackToken(next_token); - continue; - }, - State.VarDeclEq => |var_decl| { - const token = self.getNextToken(); - if (token.id == Token.Id.Equal) { 
- var_decl.eq_token = token; - stack.append(State { .ExpectToken = Token.Id.Semicolon }) catch unreachable; - try stack.append(State { - .Expression = DestPtr {.NullableField = &var_decl.init_node}, - }); - continue; - } - if (token.id == Token.Id.Semicolon) { - continue; - } - return self.parseError(token, "expected '=' or ';', found {}", @tagName(token.id)); - }, - State.ExpectToken => |token_id| { - _ = try self.eatToken(token_id); - continue; - }, - - State.Expression => |dest_ptr| { - // save the dest_ptr for later - stack.append(state) catch unreachable; - try stack.append(State.ExpectOperand); - continue; - }, - State.ExpectOperand => { - // we'll either get an operand (like 1 or x), - // or a prefix operator (like ~ or return). - const token = self.getNextToken(); - switch (token.id) { - Token.Id.Keyword_return => { - try stack.append(State { .PrefixOp = try self.createPrefixOp(token, - ast.NodePrefixOp.PrefixOp.Return) }); - try stack.append(State.ExpectOperand); - continue; - }, - Token.Id.Ampersand => { - const prefix_op = try self.createPrefixOp(token, ast.NodePrefixOp.PrefixOp{ - .AddrOf = ast.NodePrefixOp.AddrOfInfo { - .align_expr = null, - .bit_offset_start_token = null, - .bit_offset_end_token = null, - .const_token = null, - .volatile_token = null, - } - }); - try stack.append(State { .PrefixOp = prefix_op }); - try stack.append(State.ExpectOperand); - try stack.append(State { .AddrOfModifiers = &prefix_op.op.AddrOf }); - continue; - }, - Token.Id.Identifier => { - try stack.append(State { - .Operand = &(try self.createIdentifier(token)).base - }); - try stack.append(State.AfterOperand); - continue; - }, - Token.Id.IntegerLiteral => { - try stack.append(State { - .Operand = &(try self.createIntegerLiteral(token)).base - }); - try stack.append(State.AfterOperand); - continue; - }, - Token.Id.FloatLiteral => { - try stack.append(State { - .Operand = &(try self.createFloatLiteral(token)).base - }); - try stack.append(State.AfterOperand); - continue; 
- }, - else => return self.parseError(token, "expected primary expression, found {}", @tagName(token.id)), - } - }, - - State.AfterOperand => { - // we'll either get an infix operator (like != or ^), - // or a postfix operator (like () or {}), - // otherwise this expression is done (like on a ; or else). - var token = self.getNextToken(); - switch (token.id) { - Token.Id.EqualEqual => { - try stack.append(State { - .InfixOp = try self.createInfixOp(token, ast.NodeInfixOp.InfixOp.EqualEqual) - }); - try stack.append(State.ExpectOperand); - continue; - }, - Token.Id.BangEqual => { - try stack.append(State { - .InfixOp = try self.createInfixOp(token, ast.NodeInfixOp.InfixOp.BangEqual) - }); - try stack.append(State.ExpectOperand); - continue; - }, - else => { - // no postfix/infix operator after this operand. - self.putBackToken(token); - // reduce the stack - var expression: &ast.Node = stack.pop().Operand; - while (true) { - switch (stack.pop()) { - State.Expression => |dest_ptr| { - // we're done - try dest_ptr.store(expression); - break; - }, - State.InfixOp => |infix_op| { - infix_op.rhs = expression; - infix_op.lhs = stack.pop().Operand; - expression = &infix_op.base; - continue; - }, - State.PrefixOp => |prefix_op| { - prefix_op.rhs = expression; - expression = &prefix_op.base; - continue; - }, - else => unreachable, - } - } - continue; - }, - } - }, - - State.AddrOfModifiers => |addr_of_info| { - var token = self.getNextToken(); - switch (token.id) { - Token.Id.Keyword_align => { - stack.append(state) catch unreachable; - if (addr_of_info.align_expr != null) return self.parseError(token, "multiple align qualifiers"); - _ = try self.eatToken(Token.Id.LParen); - try stack.append(State { .ExpectToken = Token.Id.RParen }); - try stack.append(State { .Expression = DestPtr{.NullableField = &addr_of_info.align_expr} }); - continue; - }, - Token.Id.Keyword_const => { - stack.append(state) catch unreachable; - if (addr_of_info.const_token != null) return 
self.parseError(token, "duplicate qualifier: const"); - addr_of_info.const_token = token; - continue; - }, - Token.Id.Keyword_volatile => { - stack.append(state) catch unreachable; - if (addr_of_info.volatile_token != null) return self.parseError(token, "duplicate qualifier: volatile"); - addr_of_info.volatile_token = token; - continue; - }, - else => { - self.putBackToken(token); - continue; - }, - } - }, - - State.TypeExpr => |dest_ptr| { - const token = self.getNextToken(); - if (token.id == Token.Id.Keyword_var) { - @panic("TODO param with type var"); - } - self.putBackToken(token); - - stack.append(State { .Expression = dest_ptr }) catch unreachable; - continue; - }, - - State.FnProto => |fn_proto| { - stack.append(State { .FnProtoAlign = fn_proto }) catch unreachable; - try stack.append(State { .ParamDecl = fn_proto }); - try stack.append(State { .ExpectToken = Token.Id.LParen }); - - const next_token = self.getNextToken(); - if (next_token.id == Token.Id.Identifier) { - fn_proto.name_token = next_token; - continue; - } - self.putBackToken(next_token); - continue; - }, - - State.FnProtoAlign => |fn_proto| { - const token = self.getNextToken(); - if (token.id == Token.Id.Keyword_align) { - @panic("TODO fn proto align"); - } - self.putBackToken(token); - stack.append(State { - .TypeExpr = DestPtr {.Field = &fn_proto.return_type}, - }) catch unreachable; - continue; - }, - - State.ParamDecl => |fn_proto| { - var token = self.getNextToken(); - if (token.id == Token.Id.RParen) { - continue; - } - const param_decl = try self.createAttachParamDecl(&fn_proto.params); - if (token.id == Token.Id.Keyword_comptime) { - param_decl.comptime_token = token; - token = self.getNextToken(); - } else if (token.id == Token.Id.Keyword_noalias) { - param_decl.noalias_token = token; - token = self.getNextToken(); - } - if (token.id == Token.Id.Identifier) { - const next_token = self.getNextToken(); - if (next_token.id == Token.Id.Colon) { - param_decl.name_token = token; - token = 
self.getNextToken(); - } else { - self.putBackToken(next_token); - } - } - if (token.id == Token.Id.Ellipsis3) { - param_decl.var_args_token = token; - stack.append(State { .ExpectToken = Token.Id.RParen }) catch unreachable; - continue; - } else { - self.putBackToken(token); - } - - stack.append(State { .ParamDecl = fn_proto }) catch unreachable; - try stack.append(State.ParamDeclComma); - try stack.append(State { - .TypeExpr = DestPtr {.Field = &param_decl.type_node} - }); - continue; - }, - - State.ParamDeclComma => { - const token = self.getNextToken(); - switch (token.id) { - Token.Id.RParen => { - _ = stack.pop(); // pop off the ParamDecl - continue; - }, - Token.Id.Comma => continue, - else => return self.parseError(token, "expected ',' or ')', found {}", @tagName(token.id)), - } - }, - - State.FnDef => |fn_proto| { - const token = self.getNextToken(); - switch(token.id) { - Token.Id.LBrace => { - const block = try self.createBlock(token); - fn_proto.body_node = &block.base; - stack.append(State { .Block = block }) catch unreachable; - continue; - }, - Token.Id.Semicolon => continue, - else => return self.parseError(token, "expected ';' or '{{', found {}", @tagName(token.id)), - } - }, - - State.Block => |block| { - const token = self.getNextToken(); - switch (token.id) { - Token.Id.RBrace => { - block.end_token = token; - continue; - }, - else => { - self.putBackToken(token); - stack.append(State { .Block = block }) catch unreachable; - try stack.append(State { .Statement = block }); - continue; - }, - } - }, - - State.Statement => |block| { - { - // Look for comptime var, comptime const - const comptime_token = self.getNextToken(); - if (comptime_token.id == Token.Id.Keyword_comptime) { - const mut_token = self.getNextToken(); - if (mut_token.id == Token.Id.Keyword_var or mut_token.id == Token.Id.Keyword_const) { - // TODO shouldn't need these casts - const var_decl = try self.createAttachVarDecl(&block.statements, (?Token)(null), - mut_token, 
(?Token)(comptime_token), (?Token)(null)); - try stack.append(State { .VarDecl = var_decl }); - continue; - } - self.putBackToken(mut_token); - } - self.putBackToken(comptime_token); - } - { - // Look for const, var - const mut_token = self.getNextToken(); - if (mut_token.id == Token.Id.Keyword_var or mut_token.id == Token.Id.Keyword_const) { - // TODO shouldn't need these casts - const var_decl = try self.createAttachVarDecl(&block.statements, (?Token)(null), - mut_token, (?Token)(null), (?Token)(null)); - try stack.append(State { .VarDecl = var_decl }); - continue; - } - self.putBackToken(mut_token); - } - - stack.append(State { .ExpectToken = Token.Id.Semicolon }) catch unreachable; - try stack.append(State { .Expression = DestPtr{.List = &block.statements} }); - continue; - }, - - // These are data, not control flow. - State.InfixOp => unreachable, - State.PrefixOp => unreachable, - State.Operand => unreachable, - } - @import("std").debug.panic("{}", @tagName(state)); - //unreachable; - } - } - - fn createRoot(self: &Parser) !&ast.NodeRoot { - const node = try self.allocator.create(ast.NodeRoot); - - *node = ast.NodeRoot { - .base = ast.Node {.id = ast.Node.Id.Root}, - .decls = ArrayList(&ast.Node).init(self.allocator), - }; - return node; - } - - fn createVarDecl(self: &Parser, visib_token: &const ?Token, mut_token: &const Token, comptime_token: &const ?Token, - extern_token: &const ?Token) !&ast.NodeVarDecl - { - const node = try self.allocator.create(ast.NodeVarDecl); - - *node = ast.NodeVarDecl { - .base = ast.Node {.id = ast.Node.Id.VarDecl}, - .visib_token = *visib_token, - .mut_token = *mut_token, - .comptime_token = *comptime_token, - .extern_token = *extern_token, - .type_node = null, - .align_node = null, - .init_node = null, - .lib_name = null, - // initialized later - .name_token = undefined, - .eq_token = undefined, - }; - return node; - } - - fn createFnProto(self: &Parser, fn_token: &const Token, extern_token: &const ?Token, - cc_token: &const 
?Token, visib_token: &const ?Token, inline_token: &const ?Token) !&ast.NodeFnProto - { - const node = try self.allocator.create(ast.NodeFnProto); - - *node = ast.NodeFnProto { - .base = ast.Node {.id = ast.Node.Id.FnProto}, - .visib_token = *visib_token, - .name_token = null, - .fn_token = *fn_token, - .params = ArrayList(&ast.Node).init(self.allocator), - .return_type = undefined, - .var_args_token = null, - .extern_token = *extern_token, - .inline_token = *inline_token, - .cc_token = *cc_token, - .body_node = null, - .lib_name = null, - .align_expr = null, - }; - return node; - } - - fn createParamDecl(self: &Parser) !&ast.NodeParamDecl { - const node = try self.allocator.create(ast.NodeParamDecl); - - *node = ast.NodeParamDecl { - .base = ast.Node {.id = ast.Node.Id.ParamDecl}, - .comptime_token = null, - .noalias_token = null, - .name_token = null, - .type_node = undefined, - .var_args_token = null, - }; - return node; - } - - fn createBlock(self: &Parser, begin_token: &const Token) !&ast.NodeBlock { - const node = try self.allocator.create(ast.NodeBlock); - - *node = ast.NodeBlock { - .base = ast.Node {.id = ast.Node.Id.Block}, - .begin_token = *begin_token, - .end_token = undefined, - .statements = ArrayList(&ast.Node).init(self.allocator), - }; - return node; - } - - fn createInfixOp(self: &Parser, op_token: &const Token, op: &const ast.NodeInfixOp.InfixOp) !&ast.NodeInfixOp { - const node = try self.allocator.create(ast.NodeInfixOp); - - *node = ast.NodeInfixOp { - .base = ast.Node {.id = ast.Node.Id.InfixOp}, - .op_token = *op_token, - .lhs = undefined, - .op = *op, - .rhs = undefined, - }; - return node; - } - - fn createPrefixOp(self: &Parser, op_token: &const Token, op: &const ast.NodePrefixOp.PrefixOp) !&ast.NodePrefixOp { - const node = try self.allocator.create(ast.NodePrefixOp); - - *node = ast.NodePrefixOp { - .base = ast.Node {.id = ast.Node.Id.PrefixOp}, - .op_token = *op_token, - .op = *op, - .rhs = undefined, - }; - return node; - } - - fn 
createIdentifier(self: &Parser, name_token: &const Token) !&ast.NodeIdentifier { - const node = try self.allocator.create(ast.NodeIdentifier); - - *node = ast.NodeIdentifier { - .base = ast.Node {.id = ast.Node.Id.Identifier}, - .name_token = *name_token, - }; - return node; - } - - fn createIntegerLiteral(self: &Parser, token: &const Token) !&ast.NodeIntegerLiteral { - const node = try self.allocator.create(ast.NodeIntegerLiteral); - - *node = ast.NodeIntegerLiteral { - .base = ast.Node {.id = ast.Node.Id.IntegerLiteral}, - .token = *token, - }; - return node; - } - - fn createFloatLiteral(self: &Parser, token: &const Token) !&ast.NodeFloatLiteral { - const node = try self.allocator.create(ast.NodeFloatLiteral); - - *node = ast.NodeFloatLiteral { - .base = ast.Node {.id = ast.Node.Id.FloatLiteral}, - .token = *token, - }; - return node; - } - - fn createAttachIdentifier(self: &Parser, dest_ptr: &const DestPtr, name_token: &const Token) !&ast.NodeIdentifier { - const node = try self.createIdentifier(name_token); - try dest_ptr.store(&node.base); - return node; - } - - fn createAttachParamDecl(self: &Parser, list: &ArrayList(&ast.Node)) !&ast.NodeParamDecl { - const node = try self.createParamDecl(); - try list.append(&node.base); - return node; - } - - fn createAttachFnProto(self: &Parser, list: &ArrayList(&ast.Node), fn_token: &const Token, - extern_token: &const ?Token, cc_token: &const ?Token, visib_token: &const ?Token, - inline_token: &const ?Token) !&ast.NodeFnProto - { - const node = try self.createFnProto(fn_token, extern_token, cc_token, visib_token, inline_token); - try list.append(&node.base); - return node; - } - - fn createAttachVarDecl(self: &Parser, list: &ArrayList(&ast.Node), visib_token: &const ?Token, - mut_token: &const Token, comptime_token: &const ?Token, extern_token: &const ?Token) !&ast.NodeVarDecl - { - const node = try self.createVarDecl(visib_token, mut_token, comptime_token, extern_token); - try list.append(&node.base); - return node; - 
} - - fn parseError(self: &Parser, token: &const Token, comptime fmt: []const u8, args: ...) error { - const loc = self.tokenizer.getTokenLocation(token); - warn("{}:{}:{}: error: " ++ fmt ++ "\n", self.source_file_name, loc.line + 1, loc.column + 1, args); - warn("{}\n", self.tokenizer.buffer[loc.line_start..loc.line_end]); - { - var i: usize = 0; - while (i < loc.column) : (i += 1) { - warn(" "); - } - } - { - const caret_count = token.end - token.start; - var i: usize = 0; - while (i < caret_count) : (i += 1) { - warn("~"); - } - } - warn("\n"); - return error.ParseError; - } - - fn expectToken(self: &Parser, token: &const Token, id: @TagType(Token.Id)) !void { - if (token.id != id) { - return self.parseError(token, "expected {}, found {}", @tagName(id), @tagName(token.id)); - } - } - - fn eatToken(self: &Parser, id: @TagType(Token.Id)) !Token { - const token = self.getNextToken(); - try self.expectToken(token, id); - return token; - } - - fn putBackToken(self: &Parser, token: &const Token) void { - self.put_back_tokens[self.put_back_count] = *token; - self.put_back_count += 1; - } - - fn getNextToken(self: &Parser) Token { - if (self.put_back_count != 0) { - const put_back_index = self.put_back_count - 1; - const put_back_token = self.put_back_tokens[put_back_index]; - self.put_back_count = put_back_index; - return put_back_token; - } else { - return self.tokenizer.next(); - } - } - - const RenderAstFrame = struct { - node: &ast.Node, - indent: usize, - }; - - pub fn renderAst(self: &Parser, stream: var, root_node: &ast.NodeRoot) !void { - var stack = self.initUtilityArrayList(RenderAstFrame); - defer self.deinitUtilityArrayList(stack); - - try stack.append(RenderAstFrame { - .node = &root_node.base, - .indent = 0, - }); - - while (stack.popOrNull()) |frame| { - { - var i: usize = 0; - while (i < frame.indent) : (i += 1) { - try stream.print(" "); - } - } - try stream.print("{}\n", @tagName(frame.node.id)); - var child_i: usize = 0; - while 
(frame.node.iterate(child_i)) |child| : (child_i += 1) { - try stack.append(RenderAstFrame { - .node = child, - .indent = frame.indent + 2, - }); - } - } - } - - const RenderState = union(enum) { - TopLevelDecl: &ast.Node, - FnProtoRParen: &ast.NodeFnProto, - ParamDecl: &ast.Node, - Text: []const u8, - Expression: &ast.Node, - VarDecl: &ast.NodeVarDecl, - Statement: &ast.Node, - PrintIndent, - Indent: usize, - }; - - pub fn renderSource(self: &Parser, stream: var, root_node: &ast.NodeRoot) !void { - var stack = self.initUtilityArrayList(RenderState); - defer self.deinitUtilityArrayList(stack); - - { - var i = root_node.decls.len; - while (i != 0) { - i -= 1; - const decl = root_node.decls.items[i]; - try stack.append(RenderState {.TopLevelDecl = decl}); - } - } - - const indent_delta = 4; - var indent: usize = 0; - while (stack.popOrNull()) |state| { - switch (state) { - RenderState.TopLevelDecl => |decl| { - switch (decl.id) { - ast.Node.Id.FnProto => { - const fn_proto = @fieldParentPtr(ast.NodeFnProto, "base", decl); - if (fn_proto.visib_token) |visib_token| { - switch (visib_token.id) { - Token.Id.Keyword_pub => try stream.print("pub "), - Token.Id.Keyword_export => try stream.print("export "), - else => unreachable, - } - } - if (fn_proto.extern_token) |extern_token| { - try stream.print("{} ", self.tokenizer.getTokenSlice(extern_token)); - } - try stream.print("fn"); - - if (fn_proto.name_token) |name_token| { - try stream.print(" {}", self.tokenizer.getTokenSlice(name_token)); - } - - try stream.print("("); - - try stack.append(RenderState { .Text = "\n" }); - if (fn_proto.body_node == null) { - try stack.append(RenderState { .Text = ";" }); - } - - try stack.append(RenderState { .FnProtoRParen = fn_proto}); - var i = fn_proto.params.len; - while (i != 0) { - i -= 1; - const param_decl_node = fn_proto.params.items[i]; - try stack.append(RenderState { .ParamDecl = param_decl_node}); - if (i != 0) { - try stack.append(RenderState { .Text = ", " }); - } - } - 
}, - ast.Node.Id.VarDecl => { - const var_decl = @fieldParentPtr(ast.NodeVarDecl, "base", decl); - try stack.append(RenderState { .Text = "\n"}); - try stack.append(RenderState { .VarDecl = var_decl}); - - }, - else => unreachable, - } - }, - - RenderState.VarDecl => |var_decl| { - if (var_decl.visib_token) |visib_token| { - try stream.print("{} ", self.tokenizer.getTokenSlice(visib_token)); - } - if (var_decl.extern_token) |extern_token| { - try stream.print("{} ", self.tokenizer.getTokenSlice(extern_token)); - if (var_decl.lib_name != null) { - @panic("TODO"); - } - } - if (var_decl.comptime_token) |comptime_token| { - try stream.print("{} ", self.tokenizer.getTokenSlice(comptime_token)); - } - try stream.print("{} ", self.tokenizer.getTokenSlice(var_decl.mut_token)); - try stream.print("{}", self.tokenizer.getTokenSlice(var_decl.name_token)); - - try stack.append(RenderState { .Text = ";" }); - if (var_decl.init_node) |init_node| { - try stack.append(RenderState { .Expression = init_node }); - try stack.append(RenderState { .Text = " = " }); - } - if (var_decl.align_node) |align_node| { - try stack.append(RenderState { .Text = ")" }); - try stack.append(RenderState { .Expression = align_node }); - try stack.append(RenderState { .Text = " align(" }); - } - if (var_decl.type_node) |type_node| { - try stream.print(": "); - try stack.append(RenderState { .Expression = type_node }); - } - }, - - RenderState.ParamDecl => |base| { - const param_decl = @fieldParentPtr(ast.NodeParamDecl, "base", base); - if (param_decl.comptime_token) |comptime_token| { - try stream.print("{} ", self.tokenizer.getTokenSlice(comptime_token)); - } - if (param_decl.noalias_token) |noalias_token| { - try stream.print("{} ", self.tokenizer.getTokenSlice(noalias_token)); - } - if (param_decl.name_token) |name_token| { - try stream.print("{}: ", self.tokenizer.getTokenSlice(name_token)); - } - if (param_decl.var_args_token) |var_args_token| { - try stream.print("{}", 
self.tokenizer.getTokenSlice(var_args_token)); - } else { - try stack.append(RenderState { .Expression = param_decl.type_node}); - } - }, - RenderState.Text => |bytes| { - try stream.write(bytes); - }, - RenderState.Expression => |base| switch (base.id) { - ast.Node.Id.Identifier => { - const identifier = @fieldParentPtr(ast.NodeIdentifier, "base", base); - try stream.print("{}", self.tokenizer.getTokenSlice(identifier.name_token)); - }, - ast.Node.Id.Block => { - const block = @fieldParentPtr(ast.NodeBlock, "base", base); - try stream.write("{"); - try stack.append(RenderState { .Text = "}"}); - try stack.append(RenderState.PrintIndent); - try stack.append(RenderState { .Indent = indent}); - try stack.append(RenderState { .Text = "\n"}); - var i = block.statements.len; - while (i != 0) { - i -= 1; - const statement_node = block.statements.items[i]; - try stack.append(RenderState { .Statement = statement_node}); - try stack.append(RenderState.PrintIndent); - try stack.append(RenderState { .Indent = indent + indent_delta}); - try stack.append(RenderState { .Text = "\n" }); - } - }, - ast.Node.Id.InfixOp => { - const prefix_op_node = @fieldParentPtr(ast.NodeInfixOp, "base", base); - try stack.append(RenderState { .Expression = prefix_op_node.rhs }); - switch (prefix_op_node.op) { - ast.NodeInfixOp.InfixOp.EqualEqual => { - try stack.append(RenderState { .Text = " == "}); - }, - ast.NodeInfixOp.InfixOp.BangEqual => { - try stack.append(RenderState { .Text = " != "}); - }, - else => unreachable, - } - try stack.append(RenderState { .Expression = prefix_op_node.lhs }); - }, - ast.Node.Id.PrefixOp => { - const prefix_op_node = @fieldParentPtr(ast.NodePrefixOp, "base", base); - try stack.append(RenderState { .Expression = prefix_op_node.rhs }); - switch (prefix_op_node.op) { - ast.NodePrefixOp.PrefixOp.Return => { - try stream.write("return "); - }, - ast.NodePrefixOp.PrefixOp.AddrOf => |addr_of_info| { - try stream.write("&"); - if (addr_of_info.volatile_token != null) { 
- try stack.append(RenderState { .Text = "volatile "}); - } - if (addr_of_info.const_token != null) { - try stack.append(RenderState { .Text = "const "}); - } - if (addr_of_info.align_expr) |align_expr| { - try stream.print("align("); - try stack.append(RenderState { .Text = ") "}); - try stack.append(RenderState { .Expression = align_expr}); - } - }, - else => unreachable, - } - }, - ast.Node.Id.IntegerLiteral => { - const integer_literal = @fieldParentPtr(ast.NodeIntegerLiteral, "base", base); - try stream.print("{}", self.tokenizer.getTokenSlice(integer_literal.token)); - }, - ast.Node.Id.FloatLiteral => { - const float_literal = @fieldParentPtr(ast.NodeFloatLiteral, "base", base); - try stream.print("{}", self.tokenizer.getTokenSlice(float_literal.token)); - }, - else => unreachable, - }, - RenderState.FnProtoRParen => |fn_proto| { - try stream.print(")"); - if (fn_proto.align_expr != null) { - @panic("TODO"); - } - try stream.print(" "); - if (fn_proto.body_node) |body_node| { - try stack.append(RenderState { .Expression = body_node}); - try stack.append(RenderState { .Text = " "}); - } - try stack.append(RenderState { .Expression = fn_proto.return_type}); - }, - RenderState.Statement => |base| { - switch (base.id) { - ast.Node.Id.VarDecl => { - const var_decl = @fieldParentPtr(ast.NodeVarDecl, "base", base); - try stack.append(RenderState { .VarDecl = var_decl}); - }, - else => { - try stack.append(RenderState { .Text = ";"}); - try stack.append(RenderState { .Expression = base}); - }, - } - }, - RenderState.Indent => |new_indent| indent = new_indent, - RenderState.PrintIndent => try stream.writeByteNTimes(' ', indent), - } - } - } - - fn initUtilityArrayList(self: &Parser, comptime T: type) ArrayList(T) { - const new_byte_count = self.utility_bytes.len - self.utility_bytes.len % @sizeOf(T); - self.utility_bytes = self.allocator.alignedShrink(u8, utility_bytes_align, self.utility_bytes, new_byte_count); - const typed_slice = ([]T)(self.utility_bytes); - 
return ArrayList(T) { - .allocator = self.allocator, - .items = typed_slice, - .len = 0, - }; - } - - fn deinitUtilityArrayList(self: &Parser, list: var) void { - self.utility_bytes = ([]align(utility_bytes_align) u8)(list.items); - } - -}; - -var fixed_buffer_mem: [100 * 1024]u8 = undefined; - -fn testParse(source: []const u8, allocator: &mem.Allocator) ![]u8 { - var padded_source: [0x100]u8 = undefined; - std.mem.copy(u8, padded_source[0..source.len], source); - padded_source[source.len + 0] = '\n'; - padded_source[source.len + 1] = '\n'; - padded_source[source.len + 2] = '\n'; - - var tokenizer = Tokenizer.init(padded_source[0..source.len + 3]); - var parser = Parser.init(&tokenizer, allocator, "(memory buffer)"); - defer parser.deinit(); - - const tree = try parser.parse(); - defer tree.deinit(); - - var buffer = try std.Buffer.initSize(allocator, 0); - var buffer_out_stream = io.BufferOutStream.init(&buffer); - try parser.renderSource(&buffer_out_stream.stream, tree.root_node); - return buffer.toOwnedSlice(); -} - -// TODO test for memory leaks -// TODO test for valid frees -fn testCanonical(source: []const u8) !void { - const needed_alloc_count = x: { - // Try it once with unlimited memory, make sure it works - var fixed_allocator = mem.FixedBufferAllocator.init(fixed_buffer_mem[0..]); - var failing_allocator = std.debug.FailingAllocator.init(&fixed_allocator.allocator, @maxValue(usize)); - const result_source = try testParse(source, &failing_allocator.allocator); - if (!mem.eql(u8, result_source, source)) { - warn("\n====== expected this output: =========\n"); - warn("{}", source); - warn("\n======== instead found this: =========\n"); - warn("{}", result_source); - warn("\n======================================\n"); - return error.TestFailed; - } - failing_allocator.allocator.free(result_source); - break :x failing_allocator.index; - }; - - var fail_index: usize = 0; - while (fail_index < needed_alloc_count) : (fail_index += 1) { - var fixed_allocator = 
mem.FixedBufferAllocator.init(fixed_buffer_mem[0..]); - var failing_allocator = std.debug.FailingAllocator.init(&fixed_allocator.allocator, fail_index); - if (testParse(source, &failing_allocator.allocator)) |_| { - return error.NondeterministicMemoryUsage; - } else |err| { - assert(err == error.OutOfMemory); - // TODO make this pass - //if (failing_allocator.allocated_bytes != failing_allocator.freed_bytes) { - // warn("\nfail_index: {}/{}\nallocated bytes: {}\nfreed bytes: {}\nallocations: {}\ndeallocations: {}\n", - // fail_index, needed_alloc_count, - // failing_allocator.allocated_bytes, failing_allocator.freed_bytes, - // failing_allocator.index, failing_allocator.deallocations); - // return error.MemoryLeakDetected; - //} - } - } -} - -test "zig fmt" { - try testCanonical( - \\extern fn puts(s: &const u8) c_int; - \\ - ); - - try testCanonical( - \\const a = b; - \\pub const a = b; - \\var a = b; - \\pub var a = b; - \\const a: i32 = b; - \\pub const a: i32 = b; - \\var a: i32 = b; - \\pub var a: i32 = b; - \\ - ); - - try testCanonical( - \\extern var foo: c_int; - \\ - ); - - try testCanonical( - \\var foo: c_int align(1); - \\ - ); - - try testCanonical( - \\fn main(argc: c_int, argv: &&u8) c_int { - \\ const a = b; - \\} - \\ - ); - - try testCanonical( - \\fn foo(argc: c_int, argv: &&u8) c_int { - \\ return 0; - \\} - \\ - ); - - try testCanonical( - \\extern fn f1(s: &align(&u8) u8) c_int; - \\ - ); - - try testCanonical( - \\extern fn f1(s: &&align(1) &const &volatile u8) c_int; - \\extern fn f2(s: &align(1) const &align(1) volatile &const volatile u8) c_int; - \\extern fn f3(s: &align(1) const volatile u8) c_int; - \\ - ); - - try testCanonical( - \\fn f1(a: bool, b: bool) bool { - \\ a != b; - \\ return a == b; - \\} - \\ - ); -} diff --git a/src-self-hosted/tokenizer.zig b/src-self-hosted/tokenizer.zig @@ -1,659 +0,0 @@ -const std = @import("std"); -const mem = std.mem; - -pub const Token = struct { - id: Id, - start: usize, - end: usize, - - const 
KeywordId = struct { - bytes: []const u8, - id: Id, - }; - - const keywords = []KeywordId { - KeywordId{.bytes="align", .id = Id.Keyword_align}, - KeywordId{.bytes="and", .id = Id.Keyword_and}, - KeywordId{.bytes="asm", .id = Id.Keyword_asm}, - KeywordId{.bytes="break", .id = Id.Keyword_break}, - KeywordId{.bytes="comptime", .id = Id.Keyword_comptime}, - KeywordId{.bytes="const", .id = Id.Keyword_const}, - KeywordId{.bytes="continue", .id = Id.Keyword_continue}, - KeywordId{.bytes="defer", .id = Id.Keyword_defer}, - KeywordId{.bytes="else", .id = Id.Keyword_else}, - KeywordId{.bytes="enum", .id = Id.Keyword_enum}, - KeywordId{.bytes="error", .id = Id.Keyword_error}, - KeywordId{.bytes="export", .id = Id.Keyword_export}, - KeywordId{.bytes="extern", .id = Id.Keyword_extern}, - KeywordId{.bytes="false", .id = Id.Keyword_false}, - KeywordId{.bytes="fn", .id = Id.Keyword_fn}, - KeywordId{.bytes="for", .id = Id.Keyword_for}, - KeywordId{.bytes="goto", .id = Id.Keyword_goto}, - KeywordId{.bytes="if", .id = Id.Keyword_if}, - KeywordId{.bytes="inline", .id = Id.Keyword_inline}, - KeywordId{.bytes="nakedcc", .id = Id.Keyword_nakedcc}, - KeywordId{.bytes="noalias", .id = Id.Keyword_noalias}, - KeywordId{.bytes="null", .id = Id.Keyword_null}, - KeywordId{.bytes="or", .id = Id.Keyword_or}, - KeywordId{.bytes="packed", .id = Id.Keyword_packed}, - KeywordId{.bytes="pub", .id = Id.Keyword_pub}, - KeywordId{.bytes="return", .id = Id.Keyword_return}, - KeywordId{.bytes="stdcallcc", .id = Id.Keyword_stdcallcc}, - KeywordId{.bytes="struct", .id = Id.Keyword_struct}, - KeywordId{.bytes="switch", .id = Id.Keyword_switch}, - KeywordId{.bytes="test", .id = Id.Keyword_test}, - KeywordId{.bytes="this", .id = Id.Keyword_this}, - KeywordId{.bytes="true", .id = Id.Keyword_true}, - KeywordId{.bytes="undefined", .id = Id.Keyword_undefined}, - KeywordId{.bytes="union", .id = Id.Keyword_union}, - KeywordId{.bytes="unreachable", .id = Id.Keyword_unreachable}, - KeywordId{.bytes="use", .id = 
Id.Keyword_use}, - KeywordId{.bytes="var", .id = Id.Keyword_var}, - KeywordId{.bytes="volatile", .id = Id.Keyword_volatile}, - KeywordId{.bytes="while", .id = Id.Keyword_while}, - }; - - fn getKeyword(bytes: []const u8) ?Id { - for (keywords) |kw| { - if (mem.eql(u8, kw.bytes, bytes)) { - return kw.id; - } - } - return null; - } - - const StrLitKind = enum {Normal, C}; - - pub const Id = union(enum) { - Invalid, - Identifier, - StringLiteral: StrLitKind, - Eof, - Builtin, - Bang, - Equal, - EqualEqual, - BangEqual, - LParen, - RParen, - Semicolon, - Percent, - LBrace, - RBrace, - Period, - Ellipsis2, - Ellipsis3, - Minus, - Arrow, - Colon, - Slash, - Comma, - Ampersand, - AmpersandEqual, - IntegerLiteral, - FloatLiteral, - Keyword_align, - Keyword_and, - Keyword_asm, - Keyword_break, - Keyword_comptime, - Keyword_const, - Keyword_continue, - Keyword_defer, - Keyword_else, - Keyword_enum, - Keyword_error, - Keyword_export, - Keyword_extern, - Keyword_false, - Keyword_fn, - Keyword_for, - Keyword_goto, - Keyword_if, - Keyword_inline, - Keyword_nakedcc, - Keyword_noalias, - Keyword_null, - Keyword_or, - Keyword_packed, - Keyword_pub, - Keyword_return, - Keyword_stdcallcc, - Keyword_struct, - Keyword_switch, - Keyword_test, - Keyword_this, - Keyword_true, - Keyword_undefined, - Keyword_union, - Keyword_unreachable, - Keyword_use, - Keyword_var, - Keyword_volatile, - Keyword_while, - }; -}; - -pub const Tokenizer = struct { - buffer: []const u8, - index: usize, - pending_invalid_token: ?Token, - - pub const Location = struct { - line: usize, - column: usize, - line_start: usize, - line_end: usize, - }; - - pub fn getTokenLocation(self: &Tokenizer, token: &const Token) Location { - var loc = Location { - .line = 0, - .column = 0, - .line_start = 0, - .line_end = 0, - }; - for (self.buffer) |c, i| { - if (i == token.start) { - loc.line_end = i; - while (loc.line_end < self.buffer.len and self.buffer[loc.line_end] != '\n') : (loc.line_end += 1) {} - return loc; - } - if (c 
== '\n') { - loc.line += 1; - loc.column = 0; - loc.line_start = i + 1; - } else { - loc.column += 1; - } - } - return loc; - } - - /// For debugging purposes - pub fn dump(self: &Tokenizer, token: &const Token) void { - std.debug.warn("{} \"{}\"\n", @tagName(token.id), self.buffer[token.start..token.end]); - } - - /// buffer must end with "\n\n\n". This is so that attempting to decode - /// a the 3 trailing bytes of a 4-byte utf8 sequence is never a buffer overflow. - pub fn init(buffer: []const u8) Tokenizer { - std.debug.assert(buffer[buffer.len - 1] == '\n'); - std.debug.assert(buffer[buffer.len - 2] == '\n'); - std.debug.assert(buffer[buffer.len - 3] == '\n'); - return Tokenizer { - .buffer = buffer, - .index = 0, - .pending_invalid_token = null, - }; - } - - const State = enum { - Start, - Identifier, - Builtin, - C, - StringLiteral, - StringLiteralBackslash, - Equal, - Bang, - Minus, - Slash, - LineComment, - Zero, - IntegerLiteral, - IntegerLiteralWithRadix, - NumberDot, - FloatFraction, - FloatExponentUnsigned, - FloatExponentNumber, - Ampersand, - Period, - Period2, - }; - - pub fn next(self: &Tokenizer) Token { - if (self.pending_invalid_token) |token| { - self.pending_invalid_token = null; - return token; - } - var state = State.Start; - var result = Token { - .id = Token.Id.Eof, - .start = self.index, - .end = undefined, - }; - while (self.index < self.buffer.len) : (self.index += 1) { - const c = self.buffer[self.index]; - switch (state) { - State.Start => switch (c) { - ' ', '\n' => { - result.start = self.index + 1; - }, - 'c' => { - state = State.C; - result.id = Token.Id.Identifier; - }, - '"' => { - state = State.StringLiteral; - result.id = Token.Id { .StringLiteral = Token.StrLitKind.Normal }; - }, - 'a'...'b', 'd'...'z', 'A'...'Z', '_' => { - state = State.Identifier; - result.id = Token.Id.Identifier; - }, - '@' => { - state = State.Builtin; - result.id = Token.Id.Builtin; - }, - '=' => { - state = State.Equal; - }, - '!' 
=> { - state = State.Bang; - }, - '(' => { - result.id = Token.Id.LParen; - self.index += 1; - break; - }, - ')' => { - result.id = Token.Id.RParen; - self.index += 1; - break; - }, - ';' => { - result.id = Token.Id.Semicolon; - self.index += 1; - break; - }, - ',' => { - result.id = Token.Id.Comma; - self.index += 1; - break; - }, - ':' => { - result.id = Token.Id.Colon; - self.index += 1; - break; - }, - '%' => { - result.id = Token.Id.Percent; - self.index += 1; - break; - }, - '{' => { - result.id = Token.Id.LBrace; - self.index += 1; - break; - }, - '}' => { - result.id = Token.Id.RBrace; - self.index += 1; - break; - }, - '.' => { - state = State.Period; - }, - '-' => { - state = State.Minus; - }, - '/' => { - state = State.Slash; - }, - '&' => { - state = State.Ampersand; - }, - '0' => { - state = State.Zero; - result.id = Token.Id.IntegerLiteral; - }, - '1'...'9' => { - state = State.IntegerLiteral; - result.id = Token.Id.IntegerLiteral; - }, - else => { - result.id = Token.Id.Invalid; - self.index += 1; - break; - }, - }, - State.Ampersand => switch (c) { - '=' => { - result.id = Token.Id.AmpersandEqual; - self.index += 1; - break; - }, - else => { - result.id = Token.Id.Ampersand; - break; - }, - }, - State.Identifier => switch (c) { - 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, - else => { - if (Token.getKeyword(self.buffer[result.start..self.index])) |id| { - result.id = id; - } - break; - }, - }, - State.Builtin => switch (c) { - 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, - else => break, - }, - State.C => switch (c) { - '\\' => @panic("TODO"), - '"' => { - state = State.StringLiteral; - result.id = Token.Id { .StringLiteral = Token.StrLitKind.C }; - }, - 'a'...'z', 'A'...'Z', '_', '0'...'9' => { - state = State.Identifier; - }, - else => break, - }, - State.StringLiteral => switch (c) { - '\\' => { - state = State.StringLiteralBackslash; - }, - '"' => { - self.index += 1; - break; - }, - '\n' => break, // Look for this error later. 
- else => self.checkLiteralCharacter(), - }, - - State.StringLiteralBackslash => switch (c) { - '\n' => break, // Look for this error later. - else => { - state = State.StringLiteral; - }, - }, - - State.Bang => switch (c) { - '=' => { - result.id = Token.Id.BangEqual; - self.index += 1; - break; - }, - else => { - result.id = Token.Id.Bang; - break; - }, - }, - - State.Equal => switch (c) { - '=' => { - result.id = Token.Id.EqualEqual; - self.index += 1; - break; - }, - else => { - result.id = Token.Id.Equal; - break; - }, - }, - - State.Minus => switch (c) { - '>' => { - result.id = Token.Id.Arrow; - self.index += 1; - break; - }, - else => { - result.id = Token.Id.Minus; - break; - }, - }, - - State.Period => switch (c) { - '.' => { - state = State.Period2; - }, - else => { - result.id = Token.Id.Period; - break; - }, - }, - - State.Period2 => switch (c) { - '.' => { - result.id = Token.Id.Ellipsis3; - self.index += 1; - break; - }, - else => { - result.id = Token.Id.Ellipsis2; - break; - }, - }, - - State.Slash => switch (c) { - '/' => { - result.id = undefined; - state = State.LineComment; - }, - else => { - result.id = Token.Id.Slash; - break; - }, - }, - State.LineComment => switch (c) { - '\n' => { - state = State.Start; - result = Token { - .id = Token.Id.Eof, - .start = self.index + 1, - .end = undefined, - }; - }, - else => self.checkLiteralCharacter(), - }, - State.Zero => switch (c) { - 'b', 'o', 'x' => { - state = State.IntegerLiteralWithRadix; - }, - else => { - // reinterpret as a normal number - self.index -= 1; - state = State.IntegerLiteral; - }, - }, - State.IntegerLiteral => switch (c) { - '.' => { - state = State.NumberDot; - }, - 'p', 'P', 'e', 'E' => { - state = State.FloatExponentUnsigned; - }, - '0'...'9' => {}, - else => break, - }, - State.IntegerLiteralWithRadix => switch (c) { - '.' 
=> { - state = State.NumberDot; - }, - 'p', 'P' => { - state = State.FloatExponentUnsigned; - }, - '0'...'9', 'a'...'f', 'A'...'F' => {}, - else => break, - }, - State.NumberDot => switch (c) { - '.' => { - self.index -= 1; - state = State.Start; - break; - }, - else => { - self.index -= 1; - result.id = Token.Id.FloatLiteral; - state = State.FloatFraction; - }, - }, - State.FloatFraction => switch (c) { - 'p', 'P' => { - state = State.FloatExponentUnsigned; - }, - '0'...'9', 'a'...'f', 'A'...'F' => {}, - else => break, - }, - State.FloatExponentUnsigned => switch (c) { - '+', '-' => { - state = State.FloatExponentNumber; - }, - else => { - // reinterpret as a normal exponent number - self.index -= 1; - state = State.FloatExponentNumber; - } - }, - State.FloatExponentNumber => switch (c) { - '0'...'9', 'a'...'f', 'A'...'F' => {}, - else => break, - }, - } - } - result.end = self.index; - - if (result.id == Token.Id.Eof) { - if (self.pending_invalid_token) |token| { - self.pending_invalid_token = null; - return token; - } - } - - return result; - } - - pub fn getTokenSlice(self: &const Tokenizer, token: &const Token) []const u8 { - return self.buffer[token.start..token.end]; - } - - fn checkLiteralCharacter(self: &Tokenizer) void { - if (self.pending_invalid_token != null) return; - const invalid_length = self.getInvalidCharacterLength(); - if (invalid_length == 0) return; - self.pending_invalid_token = Token { - .id = Token.Id.Invalid, - .start = self.index, - .end = self.index + invalid_length, - }; - } - - fn getInvalidCharacterLength(self: &Tokenizer) u3 { - const c0 = self.buffer[self.index]; - if (c0 < 0x80) { - if (c0 < 0x20 or c0 == 0x7f) { - // ascii control codes are never allowed - // (note that \n was checked before we got here) - return 1; - } - // looks fine to me. - return 0; - } else { - // check utf8-encoded character. 
- const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1; - // the last 3 bytes in the buffer are guaranteed to be '\n', - // which means we don't need to do any bounds checking here. - const bytes = self.buffer[self.index..self.index + length]; - switch (length) { - 2 => { - const value = std.unicode.utf8Decode2(bytes) catch return length; - if (value == 0x85) return length; // U+0085 (NEL) - }, - 3 => { - const value = std.unicode.utf8Decode3(bytes) catch return length; - if (value == 0x2028) return length; // U+2028 (LS) - if (value == 0x2029) return length; // U+2029 (PS) - }, - 4 => { - _ = std.unicode.utf8Decode4(bytes) catch return length; - }, - else => unreachable, - } - self.index += length - 1; - return 0; - } - } -}; - - - -test "tokenizer" { - testTokenize("test", []Token.Id { - Token.Id.Keyword_test, - }); -} - -test "tokenizer - invalid token characters" { - testTokenize("#", []Token.Id{Token.Id.Invalid}); - testTokenize("`", []Token.Id{Token.Id.Invalid}); -} - -test "tokenizer - invalid literal/comment characters" { - testTokenize("\"\x00\"", []Token.Id { - Token.Id { .StringLiteral = Token.StrLitKind.Normal }, - Token.Id.Invalid, - }); - testTokenize("//\x00", []Token.Id { - Token.Id.Invalid, - }); - testTokenize("//\x1f", []Token.Id { - Token.Id.Invalid, - }); - testTokenize("//\x7f", []Token.Id { - Token.Id.Invalid, - }); -} - -test "tokenizer - utf8" { - testTokenize("//\xc2\x80", []Token.Id{}); - testTokenize("//\xf4\x8f\xbf\xbf", []Token.Id{}); -} - -test "tokenizer - invalid utf8" { - testTokenize("//\x80", []Token.Id{Token.Id.Invalid}); - testTokenize("//\xbf", []Token.Id{Token.Id.Invalid}); - testTokenize("//\xf8", []Token.Id{Token.Id.Invalid}); - testTokenize("//\xff", []Token.Id{Token.Id.Invalid}); - testTokenize("//\xc2\xc0", []Token.Id{Token.Id.Invalid}); - testTokenize("//\xe0", []Token.Id{Token.Id.Invalid}); - testTokenize("//\xf0", []Token.Id{Token.Id.Invalid}); - testTokenize("//\xf0\x90\x80\xc0", 
[]Token.Id{Token.Id.Invalid}); -} - -test "tokenizer - illegal unicode codepoints" { - // unicode newline characters.U+0085, U+2028, U+2029 - testTokenize("//\xc2\x84", []Token.Id{}); - testTokenize("//\xc2\x85", []Token.Id{Token.Id.Invalid}); - testTokenize("//\xc2\x86", []Token.Id{}); - testTokenize("//\xe2\x80\xa7", []Token.Id{}); - testTokenize("//\xe2\x80\xa8", []Token.Id{Token.Id.Invalid}); - testTokenize("//\xe2\x80\xa9", []Token.Id{Token.Id.Invalid}); - testTokenize("//\xe2\x80\xaa", []Token.Id{}); -} - -fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void { - // (test authors, just make this bigger if you need it) - var padded_source: [0x100]u8 = undefined; - std.mem.copy(u8, padded_source[0..source.len], source); - padded_source[source.len + 0] = '\n'; - padded_source[source.len + 1] = '\n'; - padded_source[source.len + 2] = '\n'; - - var tokenizer = Tokenizer.init(padded_source[0..source.len + 3]); - for (expected_tokens) |expected_token_id| { - const token = tokenizer.next(); - std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id)); - switch (expected_token_id) { - Token.Id.StringLiteral => |expected_kind| { - std.debug.assert(expected_kind == switch (token.id) { Token.Id.StringLiteral => |kind| kind, else => unreachable }); - }, - else => {}, - } - } - std.debug.assert(tokenizer.next().id == Token.Id.Eof); -} diff --git a/std/index.zig b/std/index.zig @@ -28,6 +28,7 @@ pub const os = @import("os/index.zig"); pub const rand = @import("rand.zig"); pub const sort = @import("sort.zig"); pub const unicode = @import("unicode.zig"); +pub const zig = @import("zig/index.zig"); test "std" { // run tests from these @@ -58,4 +59,5 @@ test "std" { _ = @import("rand.zig"); _ = @import("sort.zig"); _ = @import("unicode.zig"); + _ = @import("zig/index.zig"); } diff --git a/std/zig/ast.zig b/std/zig/ast.zig @@ -0,0 +1,271 @@ +const std = @import("../index.zig"); +const assert = std.debug.assert; +const ArrayList = 
std.ArrayList; +const Token = std.zig.Token; +const mem = std.mem; + +pub const Node = struct { + id: Id, + + pub const Id = enum { + Root, + VarDecl, + Identifier, + FnProto, + ParamDecl, + Block, + InfixOp, + PrefixOp, + IntegerLiteral, + FloatLiteral, + }; + + pub fn iterate(base: &Node, index: usize) ?&Node { + return switch (base.id) { + Id.Root => @fieldParentPtr(NodeRoot, "base", base).iterate(index), + Id.VarDecl => @fieldParentPtr(NodeVarDecl, "base", base).iterate(index), + Id.Identifier => @fieldParentPtr(NodeIdentifier, "base", base).iterate(index), + Id.FnProto => @fieldParentPtr(NodeFnProto, "base", base).iterate(index), + Id.ParamDecl => @fieldParentPtr(NodeParamDecl, "base", base).iterate(index), + Id.Block => @fieldParentPtr(NodeBlock, "base", base).iterate(index), + Id.InfixOp => @fieldParentPtr(NodeInfixOp, "base", base).iterate(index), + Id.PrefixOp => @fieldParentPtr(NodePrefixOp, "base", base).iterate(index), + Id.IntegerLiteral => @fieldParentPtr(NodeIntegerLiteral, "base", base).iterate(index), + Id.FloatLiteral => @fieldParentPtr(NodeFloatLiteral, "base", base).iterate(index), + }; + } + + pub fn destroy(base: &Node, allocator: &mem.Allocator) void { + return switch (base.id) { + Id.Root => allocator.destroy(@fieldParentPtr(NodeRoot, "base", base)), + Id.VarDecl => allocator.destroy(@fieldParentPtr(NodeVarDecl, "base", base)), + Id.Identifier => allocator.destroy(@fieldParentPtr(NodeIdentifier, "base", base)), + Id.FnProto => allocator.destroy(@fieldParentPtr(NodeFnProto, "base", base)), + Id.ParamDecl => allocator.destroy(@fieldParentPtr(NodeParamDecl, "base", base)), + Id.Block => allocator.destroy(@fieldParentPtr(NodeBlock, "base", base)), + Id.InfixOp => allocator.destroy(@fieldParentPtr(NodeInfixOp, "base", base)), + Id.PrefixOp => allocator.destroy(@fieldParentPtr(NodePrefixOp, "base", base)), + Id.IntegerLiteral => allocator.destroy(@fieldParentPtr(NodeIntegerLiteral, "base", base)), + Id.FloatLiteral => 
allocator.destroy(@fieldParentPtr(NodeFloatLiteral, "base", base)), + }; + } +}; + +pub const NodeRoot = struct { + base: Node, + decls: ArrayList(&Node), + + pub fn iterate(self: &NodeRoot, index: usize) ?&Node { + if (index < self.decls.len) { + return self.decls.items[self.decls.len - index - 1]; + } + return null; + } +}; + +pub const NodeVarDecl = struct { + base: Node, + visib_token: ?Token, + name_token: Token, + eq_token: Token, + mut_token: Token, + comptime_token: ?Token, + extern_token: ?Token, + lib_name: ?&Node, + type_node: ?&Node, + align_node: ?&Node, + init_node: ?&Node, + + pub fn iterate(self: &NodeVarDecl, index: usize) ?&Node { + var i = index; + + if (self.type_node) |type_node| { + if (i < 1) return type_node; + i -= 1; + } + + if (self.align_node) |align_node| { + if (i < 1) return align_node; + i -= 1; + } + + if (self.init_node) |init_node| { + if (i < 1) return init_node; + i -= 1; + } + + return null; + } +}; + +pub const NodeIdentifier = struct { + base: Node, + name_token: Token, + + pub fn iterate(self: &NodeIdentifier, index: usize) ?&Node { + return null; + } +}; + +pub const NodeFnProto = struct { + base: Node, + visib_token: ?Token, + fn_token: Token, + name_token: ?Token, + params: ArrayList(&Node), + return_type: &Node, + var_args_token: ?Token, + extern_token: ?Token, + inline_token: ?Token, + cc_token: ?Token, + body_node: ?&Node, + lib_name: ?&Node, // populated if this is an extern declaration + align_expr: ?&Node, // populated if align(A) is present + + pub fn iterate(self: &NodeFnProto, index: usize) ?&Node { + var i = index; + + if (self.body_node) |body_node| { + if (i < 1) return body_node; + i -= 1; + } + + if (i < 1) return self.return_type; + i -= 1; + + if (self.align_expr) |align_expr| { + if (i < 1) return align_expr; + i -= 1; + } + + if (i < self.params.len) return self.params.items[self.params.len - i - 1]; + i -= self.params.len; + + if (self.lib_name) |lib_name| { + if (i < 1) return lib_name; + i -= 1; + } + 
+ return null; + } +}; + +pub const NodeParamDecl = struct { + base: Node, + comptime_token: ?Token, + noalias_token: ?Token, + name_token: ?Token, + type_node: &Node, + var_args_token: ?Token, + + pub fn iterate(self: &NodeParamDecl, index: usize) ?&Node { + var i = index; + + if (i < 1) return self.type_node; + i -= 1; + + return null; + } +}; + +pub const NodeBlock = struct { + base: Node, + begin_token: Token, + end_token: Token, + statements: ArrayList(&Node), + + pub fn iterate(self: &NodeBlock, index: usize) ?&Node { + var i = index; + + if (i < self.statements.len) return self.statements.items[i]; + i -= self.statements.len; + + return null; + } +}; + +pub const NodeInfixOp = struct { + base: Node, + op_token: Token, + lhs: &Node, + op: InfixOp, + rhs: &Node, + + const InfixOp = enum { + EqualEqual, + BangEqual, + }; + + pub fn iterate(self: &NodeInfixOp, index: usize) ?&Node { + var i = index; + + if (i < 1) return self.lhs; + i -= 1; + + switch (self.op) { + InfixOp.EqualEqual => {}, + InfixOp.BangEqual => {}, + } + + if (i < 1) return self.rhs; + i -= 1; + + return null; + } +}; + +pub const NodePrefixOp = struct { + base: Node, + op_token: Token, + op: PrefixOp, + rhs: &Node, + + const PrefixOp = union(enum) { + Return, + AddrOf: AddrOfInfo, + }; + const AddrOfInfo = struct { + align_expr: ?&Node, + bit_offset_start_token: ?Token, + bit_offset_end_token: ?Token, + const_token: ?Token, + volatile_token: ?Token, + }; + + pub fn iterate(self: &NodePrefixOp, index: usize) ?&Node { + var i = index; + + switch (self.op) { + PrefixOp.Return => {}, + PrefixOp.AddrOf => |addr_of_info| { + if (addr_of_info.align_expr) |align_expr| { + if (i < 1) return align_expr; + i -= 1; + } + }, + } + + if (i < 1) return self.rhs; + i -= 1; + + return null; + } +}; + +pub const NodeIntegerLiteral = struct { + base: Node, + token: Token, + + pub fn iterate(self: &NodeIntegerLiteral, index: usize) ?&Node { + return null; + } +}; + +pub const NodeFloatLiteral = struct { + base: 
Node, + token: Token, + + pub fn iterate(self: &NodeFloatLiteral, index: usize) ?&Node { + return null; + } +}; diff --git a/std/zig/index.zig b/std/zig/index.zig @@ -0,0 +1,11 @@ +const tokenizer = @import("tokenizer.zig"); +pub const Token = tokenizer.Token; +pub const Tokenizer = tokenizer.Tokenizer; +pub const Parser = @import("parser.zig").Parser; +pub const ast = @import("ast.zig"); + +test "std.zig tests" { + _ = @import("tokenizer.zig"); + _ = @import("parser.zig"); + _ = @import("ast.zig"); +} diff --git a/std/zig/parser.zig b/std/zig/parser.zig @@ -0,0 +1,1160 @@ +const std = @import("../index.zig"); +const assert = std.debug.assert; +const ArrayList = std.ArrayList; +const mem = std.mem; +const ast = std.zig.ast; +const Tokenizer = std.zig.Tokenizer; +const Token = std.zig.Token; +const builtin = @import("builtin"); +const io = std.io; + +// TODO when we make parse errors into error types instead of printing directly, +// get rid of this +const warn = std.debug.warn; + +pub const Parser = struct { + allocator: &mem.Allocator, + tokenizer: &Tokenizer, + put_back_tokens: [2]Token, + put_back_count: usize, + source_file_name: []const u8, + + pub const Tree = struct { + root_node: &ast.NodeRoot, + + pub fn deinit(self: &const Tree) void { + // TODO free the whole arena + } + }; + + // This memory contents are used only during a function call. It's used to repurpose memory; + // we reuse the same bytes for the stack data structure used by parsing, tree rendering, and + // source rendering. + const utility_bytes_align = @alignOf( union { a: RenderAstFrame, b: State, c: RenderState } ); + utility_bytes: []align(utility_bytes_align) u8, + + /// `allocator` should be an arena allocator. Parser never calls free on anything. After you're + /// done with a Parser, free the arena. After the arena is freed, no member functions of Parser + /// may be called. 
+    pub fn init(tokenizer: &Tokenizer, allocator: &mem.Allocator, source_file_name: []const u8) Parser { // only stores its arguments; no allocation happens here
+        return Parser {
+            .allocator = allocator,
+            .tokenizer = tokenizer,
+            .put_back_tokens = undefined, // a slot is written by putBackToken before getNextToken reads it
+            .put_back_count = 0, // no tokens put back yet
+            .source_file_name = source_file_name,
+            .utility_bytes = []align(utility_bytes_align) u8{}, // empty to start; lent out by initUtilityArrayList and handed back by deinitUtilityArrayList
+        };
+    }
+
+    pub fn deinit(self: &Parser) void { // frees the utility scratch buffer through the parser's allocator
+        self.allocator.free(self.utility_bytes);
+    }
+
+    const TopLevelDeclCtx = struct { // modifier tokens consumed ahead of a top-level declaration
+        visib_token: ?Token, // the `pub` or `export` token, when one was seen
+        extern_token: ?Token, // the `extern` token, when one was seen
+    };
+
+    const DestPtr = union(enum) { // where a finished node gets attached
+        Field: &&ast.Node, // pointer to a required node slot
+        NullableField: &?&ast.Node, // pointer to an optional node slot
+        List: &ArrayList(&ast.Node), // list the node is appended to
+
+        pub fn store(self: &const DestPtr, value: &ast.Node) !void { // attaches `value` to the destination
+            switch (*self) {
+                DestPtr.Field => |ptr| *ptr = value,
+                DestPtr.NullableField => |ptr| *ptr = value,
+                DestPtr.List => |list| try list.append(value), // the only arm that can return an error
+            }
+        }
+    };
+
+    const State = union(enum) { // one entry of the explicit stack that drives `parse`
+        TopLevel,
+        TopLevelExtern: ?Token, // payload: the `pub`/`export` token already consumed, if any
+        TopLevelDecl: TopLevelDeclCtx,
+        Expression: DestPtr, // stays on the stack until the expression is reduced, then receives the result
+        ExpectOperand, // next token must be an operand or a prefix operator
+        Operand: &ast.Node, // data entry: popped while reducing, never dispatched
+        AfterOperand, // an infix operator continues the expression; anything else triggers the reduce
+        InfixOp: &ast.NodeInfixOp, // data entry: popped while reducing, never dispatched
+        PrefixOp: &ast.NodePrefixOp, // data entry: popped while reducing, never dispatched
+        AddrOfModifiers: &ast.NodePrefixOp.AddrOfInfo, // collects align/const/volatile qualifiers after `&`
+        TypeExpr: DestPtr,
+        VarDecl: &ast.NodeVarDecl,
+        VarDeclAlign: &ast.NodeVarDecl,
+        VarDeclEq: &ast.NodeVarDecl,
+        ExpectToken: @TagType(Token.Id), // consume one token; parse error unless it has this id
+        FnProto: &ast.NodeFnProto,
+        FnProtoAlign: &ast.NodeFnProto,
+        ParamDecl: &ast.NodeFnProto,
+        ParamDeclComma, // after a parameter: `,` continues the list, `)` ends it
+        FnDef: &ast.NodeFnProto, // `{` opens a body block, `;` ends a bare prototype
+        Block: &ast.NodeBlock,
+        Statement: &ast.NodeBlock,
+    };
+
+    /// Returns the parsed `Tree`; its nodes are allocated with the parser's allocator. Call `Tree.deinit` on the result when done.
+ pub fn parse(self: &Parser) !Tree { + var stack = self.initUtilityArrayList(State); + defer self.deinitUtilityArrayList(stack); + + const root_node = try self.createRoot(); + // TODO errdefer arena free root node + + try stack.append(State.TopLevel); + + while (true) { + //{ + // const token = self.getNextToken(); + // warn("{} ", @tagName(token.id)); + // self.putBackToken(token); + // var i: usize = stack.len; + // while (i != 0) { + // i -= 1; + // warn("{} ", @tagName(stack.items[i])); + // } + // warn("\n"); + //} + + // This gives us 1 free append that can't fail + const state = stack.pop(); + + switch (state) { + State.TopLevel => { + const token = self.getNextToken(); + switch (token.id) { + Token.Id.Keyword_pub, Token.Id.Keyword_export => { + stack.append(State { .TopLevelExtern = token }) catch unreachable; + continue; + }, + Token.Id.Eof => return Tree {.root_node = root_node}, + else => { + self.putBackToken(token); + // TODO shouldn't need this cast + stack.append(State { .TopLevelExtern = null }) catch unreachable; + continue; + }, + } + }, + State.TopLevelExtern => |visib_token| { + const token = self.getNextToken(); + if (token.id == Token.Id.Keyword_extern) { + stack.append(State { + .TopLevelDecl = TopLevelDeclCtx { + .visib_token = visib_token, + .extern_token = token, + }, + }) catch unreachable; + continue; + } + self.putBackToken(token); + stack.append(State { + .TopLevelDecl = TopLevelDeclCtx { + .visib_token = visib_token, + .extern_token = null, + }, + }) catch unreachable; + continue; + }, + State.TopLevelDecl => |ctx| { + const token = self.getNextToken(); + switch (token.id) { + Token.Id.Keyword_var, Token.Id.Keyword_const => { + stack.append(State.TopLevel) catch unreachable; + // TODO shouldn't need these casts + const var_decl_node = try self.createAttachVarDecl(&root_node.decls, ctx.visib_token, + token, (?Token)(null), ctx.extern_token); + try stack.append(State { .VarDecl = var_decl_node }); + continue; + }, + Token.Id.Keyword_fn 
=> { + stack.append(State.TopLevel) catch unreachable; + // TODO shouldn't need these casts + const fn_proto = try self.createAttachFnProto(&root_node.decls, token, + ctx.extern_token, (?Token)(null), (?Token)(null), (?Token)(null)); + try stack.append(State { .FnDef = fn_proto }); + try stack.append(State { .FnProto = fn_proto }); + continue; + }, + Token.Id.StringLiteral => { + @panic("TODO extern with string literal"); + }, + Token.Id.Keyword_nakedcc, Token.Id.Keyword_stdcallcc => { + stack.append(State.TopLevel) catch unreachable; + const fn_token = try self.eatToken(Token.Id.Keyword_fn); + // TODO shouldn't need this cast + const fn_proto = try self.createAttachFnProto(&root_node.decls, fn_token, + ctx.extern_token, (?Token)(token), (?Token)(null), (?Token)(null)); + try stack.append(State { .FnDef = fn_proto }); + try stack.append(State { .FnProto = fn_proto }); + continue; + }, + else => return self.parseError(token, "expected variable declaration or function, found {}", @tagName(token.id)), + } + }, + State.VarDecl => |var_decl| { + var_decl.name_token = try self.eatToken(Token.Id.Identifier); + stack.append(State { .VarDeclAlign = var_decl }) catch unreachable; + + const next_token = self.getNextToken(); + if (next_token.id == Token.Id.Colon) { + try stack.append(State { .TypeExpr = DestPtr {.NullableField = &var_decl.type_node} }); + continue; + } + + self.putBackToken(next_token); + continue; + }, + State.VarDeclAlign => |var_decl| { + stack.append(State { .VarDeclEq = var_decl }) catch unreachable; + + const next_token = self.getNextToken(); + if (next_token.id == Token.Id.Keyword_align) { + _ = try self.eatToken(Token.Id.LParen); + try stack.append(State { .ExpectToken = Token.Id.RParen }); + try stack.append(State { .Expression = DestPtr{.NullableField = &var_decl.align_node} }); + continue; + } + + self.putBackToken(next_token); + continue; + }, + State.VarDeclEq => |var_decl| { + const token = self.getNextToken(); + if (token.id == Token.Id.Equal) { 
+ var_decl.eq_token = token; + stack.append(State { .ExpectToken = Token.Id.Semicolon }) catch unreachable; + try stack.append(State { + .Expression = DestPtr {.NullableField = &var_decl.init_node}, + }); + continue; + } + if (token.id == Token.Id.Semicolon) { + continue; + } + return self.parseError(token, "expected '=' or ';', found {}", @tagName(token.id)); + }, + State.ExpectToken => |token_id| { + _ = try self.eatToken(token_id); + continue; + }, + + State.Expression => |dest_ptr| { + // save the dest_ptr for later + stack.append(state) catch unreachable; + try stack.append(State.ExpectOperand); + continue; + }, + State.ExpectOperand => { + // we'll either get an operand (like 1 or x), + // or a prefix operator (like ~ or return). + const token = self.getNextToken(); + switch (token.id) { + Token.Id.Keyword_return => { + try stack.append(State { .PrefixOp = try self.createPrefixOp(token, + ast.NodePrefixOp.PrefixOp.Return) }); + try stack.append(State.ExpectOperand); + continue; + }, + Token.Id.Ampersand => { + const prefix_op = try self.createPrefixOp(token, ast.NodePrefixOp.PrefixOp{ + .AddrOf = ast.NodePrefixOp.AddrOfInfo { + .align_expr = null, + .bit_offset_start_token = null, + .bit_offset_end_token = null, + .const_token = null, + .volatile_token = null, + } + }); + try stack.append(State { .PrefixOp = prefix_op }); + try stack.append(State.ExpectOperand); + try stack.append(State { .AddrOfModifiers = &prefix_op.op.AddrOf }); + continue; + }, + Token.Id.Identifier => { + try stack.append(State { + .Operand = &(try self.createIdentifier(token)).base + }); + try stack.append(State.AfterOperand); + continue; + }, + Token.Id.IntegerLiteral => { + try stack.append(State { + .Operand = &(try self.createIntegerLiteral(token)).base + }); + try stack.append(State.AfterOperand); + continue; + }, + Token.Id.FloatLiteral => { + try stack.append(State { + .Operand = &(try self.createFloatLiteral(token)).base + }); + try stack.append(State.AfterOperand); + continue; 
+ }, + else => return self.parseError(token, "expected primary expression, found {}", @tagName(token.id)), + } + }, + + State.AfterOperand => { + // we'll either get an infix operator (like != or ^), + // or a postfix operator (like () or {}), + // otherwise this expression is done (like on a ; or else). + var token = self.getNextToken(); + switch (token.id) { + Token.Id.EqualEqual => { + try stack.append(State { + .InfixOp = try self.createInfixOp(token, ast.NodeInfixOp.InfixOp.EqualEqual) + }); + try stack.append(State.ExpectOperand); + continue; + }, + Token.Id.BangEqual => { + try stack.append(State { + .InfixOp = try self.createInfixOp(token, ast.NodeInfixOp.InfixOp.BangEqual) + }); + try stack.append(State.ExpectOperand); + continue; + }, + else => { + // no postfix/infix operator after this operand. + self.putBackToken(token); + // reduce the stack + var expression: &ast.Node = stack.pop().Operand; + while (true) { + switch (stack.pop()) { + State.Expression => |dest_ptr| { + // we're done + try dest_ptr.store(expression); + break; + }, + State.InfixOp => |infix_op| { + infix_op.rhs = expression; + infix_op.lhs = stack.pop().Operand; + expression = &infix_op.base; + continue; + }, + State.PrefixOp => |prefix_op| { + prefix_op.rhs = expression; + expression = &prefix_op.base; + continue; + }, + else => unreachable, + } + } + continue; + }, + } + }, + + State.AddrOfModifiers => |addr_of_info| { + var token = self.getNextToken(); + switch (token.id) { + Token.Id.Keyword_align => { + stack.append(state) catch unreachable; + if (addr_of_info.align_expr != null) return self.parseError(token, "multiple align qualifiers"); + _ = try self.eatToken(Token.Id.LParen); + try stack.append(State { .ExpectToken = Token.Id.RParen }); + try stack.append(State { .Expression = DestPtr{.NullableField = &addr_of_info.align_expr} }); + continue; + }, + Token.Id.Keyword_const => { + stack.append(state) catch unreachable; + if (addr_of_info.const_token != null) return 
self.parseError(token, "duplicate qualifier: const"); + addr_of_info.const_token = token; + continue; + }, + Token.Id.Keyword_volatile => { + stack.append(state) catch unreachable; + if (addr_of_info.volatile_token != null) return self.parseError(token, "duplicate qualifier: volatile"); + addr_of_info.volatile_token = token; + continue; + }, + else => { + self.putBackToken(token); + continue; + }, + } + }, + + State.TypeExpr => |dest_ptr| { + const token = self.getNextToken(); + if (token.id == Token.Id.Keyword_var) { + @panic("TODO param with type var"); + } + self.putBackToken(token); + + stack.append(State { .Expression = dest_ptr }) catch unreachable; + continue; + }, + + State.FnProto => |fn_proto| { + stack.append(State { .FnProtoAlign = fn_proto }) catch unreachable; + try stack.append(State { .ParamDecl = fn_proto }); + try stack.append(State { .ExpectToken = Token.Id.LParen }); + + const next_token = self.getNextToken(); + if (next_token.id == Token.Id.Identifier) { + fn_proto.name_token = next_token; + continue; + } + self.putBackToken(next_token); + continue; + }, + + State.FnProtoAlign => |fn_proto| { + const token = self.getNextToken(); + if (token.id == Token.Id.Keyword_align) { + @panic("TODO fn proto align"); + } + self.putBackToken(token); + stack.append(State { + .TypeExpr = DestPtr {.Field = &fn_proto.return_type}, + }) catch unreachable; + continue; + }, + + State.ParamDecl => |fn_proto| { + var token = self.getNextToken(); + if (token.id == Token.Id.RParen) { + continue; + } + const param_decl = try self.createAttachParamDecl(&fn_proto.params); + if (token.id == Token.Id.Keyword_comptime) { + param_decl.comptime_token = token; + token = self.getNextToken(); + } else if (token.id == Token.Id.Keyword_noalias) { + param_decl.noalias_token = token; + token = self.getNextToken(); + } + if (token.id == Token.Id.Identifier) { + const next_token = self.getNextToken(); + if (next_token.id == Token.Id.Colon) { + param_decl.name_token = token; + token = 
self.getNextToken(); + } else { + self.putBackToken(next_token); + } + } + if (token.id == Token.Id.Ellipsis3) { + param_decl.var_args_token = token; + stack.append(State { .ExpectToken = Token.Id.RParen }) catch unreachable; + continue; + } else { + self.putBackToken(token); + } + + stack.append(State { .ParamDecl = fn_proto }) catch unreachable; + try stack.append(State.ParamDeclComma); + try stack.append(State { + .TypeExpr = DestPtr {.Field = &param_decl.type_node} + }); + continue; + }, + + State.ParamDeclComma => { + const token = self.getNextToken(); + switch (token.id) { + Token.Id.RParen => { + _ = stack.pop(); // pop off the ParamDecl + continue; + }, + Token.Id.Comma => continue, + else => return self.parseError(token, "expected ',' or ')', found {}", @tagName(token.id)), + } + }, + + State.FnDef => |fn_proto| { + const token = self.getNextToken(); + switch(token.id) { + Token.Id.LBrace => { + const block = try self.createBlock(token); + fn_proto.body_node = &block.base; + stack.append(State { .Block = block }) catch unreachable; + continue; + }, + Token.Id.Semicolon => continue, + else => return self.parseError(token, "expected ';' or '{{', found {}", @tagName(token.id)), + } + }, + + State.Block => |block| { + const token = self.getNextToken(); + switch (token.id) { + Token.Id.RBrace => { + block.end_token = token; + continue; + }, + else => { + self.putBackToken(token); + stack.append(State { .Block = block }) catch unreachable; + try stack.append(State { .Statement = block }); + continue; + }, + } + }, + + State.Statement => |block| { + { + // Look for comptime var, comptime const + const comptime_token = self.getNextToken(); + if (comptime_token.id == Token.Id.Keyword_comptime) { + const mut_token = self.getNextToken(); + if (mut_token.id == Token.Id.Keyword_var or mut_token.id == Token.Id.Keyword_const) { + // TODO shouldn't need these casts + const var_decl = try self.createAttachVarDecl(&block.statements, (?Token)(null), + mut_token, 
(?Token)(comptime_token), (?Token)(null)); + try stack.append(State { .VarDecl = var_decl }); + continue; + } + self.putBackToken(mut_token); + } + self.putBackToken(comptime_token); + } + { + // Look for const, var + const mut_token = self.getNextToken(); + if (mut_token.id == Token.Id.Keyword_var or mut_token.id == Token.Id.Keyword_const) { + // TODO shouldn't need these casts + const var_decl = try self.createAttachVarDecl(&block.statements, (?Token)(null), + mut_token, (?Token)(null), (?Token)(null)); + try stack.append(State { .VarDecl = var_decl }); + continue; + } + self.putBackToken(mut_token); + } + + stack.append(State { .ExpectToken = Token.Id.Semicolon }) catch unreachable; + try stack.append(State { .Expression = DestPtr{.List = &block.statements} }); + continue; + }, + + // These are data, not control flow. + State.InfixOp => unreachable, + State.PrefixOp => unreachable, + State.Operand => unreachable, + } + @import("std").debug.panic("{}", @tagName(state)); + //unreachable; + } + } + + fn createRoot(self: &Parser) !&ast.NodeRoot { + const node = try self.allocator.create(ast.NodeRoot); + + *node = ast.NodeRoot { + .base = ast.Node {.id = ast.Node.Id.Root}, + .decls = ArrayList(&ast.Node).init(self.allocator), + }; + return node; + } + + fn createVarDecl(self: &Parser, visib_token: &const ?Token, mut_token: &const Token, comptime_token: &const ?Token, + extern_token: &const ?Token) !&ast.NodeVarDecl + { + const node = try self.allocator.create(ast.NodeVarDecl); + + *node = ast.NodeVarDecl { + .base = ast.Node {.id = ast.Node.Id.VarDecl}, + .visib_token = *visib_token, + .mut_token = *mut_token, + .comptime_token = *comptime_token, + .extern_token = *extern_token, + .type_node = null, + .align_node = null, + .init_node = null, + .lib_name = null, + // initialized later + .name_token = undefined, + .eq_token = undefined, + }; + return node; + } + + fn createFnProto(self: &Parser, fn_token: &const Token, extern_token: &const ?Token, + cc_token: &const 
?Token, visib_token: &const ?Token, inline_token: &const ?Token) !&ast.NodeFnProto + { + const node = try self.allocator.create(ast.NodeFnProto); + + *node = ast.NodeFnProto { + .base = ast.Node {.id = ast.Node.Id.FnProto}, + .visib_token = *visib_token, + .name_token = null, + .fn_token = *fn_token, + .params = ArrayList(&ast.Node).init(self.allocator), + .return_type = undefined, + .var_args_token = null, + .extern_token = *extern_token, + .inline_token = *inline_token, + .cc_token = *cc_token, + .body_node = null, + .lib_name = null, + .align_expr = null, + }; + return node; + } + + fn createParamDecl(self: &Parser) !&ast.NodeParamDecl { + const node = try self.allocator.create(ast.NodeParamDecl); + + *node = ast.NodeParamDecl { + .base = ast.Node {.id = ast.Node.Id.ParamDecl}, + .comptime_token = null, + .noalias_token = null, + .name_token = null, + .type_node = undefined, + .var_args_token = null, + }; + return node; + } + + fn createBlock(self: &Parser, begin_token: &const Token) !&ast.NodeBlock { + const node = try self.allocator.create(ast.NodeBlock); + + *node = ast.NodeBlock { + .base = ast.Node {.id = ast.Node.Id.Block}, + .begin_token = *begin_token, + .end_token = undefined, + .statements = ArrayList(&ast.Node).init(self.allocator), + }; + return node; + } + + fn createInfixOp(self: &Parser, op_token: &const Token, op: &const ast.NodeInfixOp.InfixOp) !&ast.NodeInfixOp { + const node = try self.allocator.create(ast.NodeInfixOp); + + *node = ast.NodeInfixOp { + .base = ast.Node {.id = ast.Node.Id.InfixOp}, + .op_token = *op_token, + .lhs = undefined, + .op = *op, + .rhs = undefined, + }; + return node; + } + + fn createPrefixOp(self: &Parser, op_token: &const Token, op: &const ast.NodePrefixOp.PrefixOp) !&ast.NodePrefixOp { + const node = try self.allocator.create(ast.NodePrefixOp); + + *node = ast.NodePrefixOp { + .base = ast.Node {.id = ast.Node.Id.PrefixOp}, + .op_token = *op_token, + .op = *op, + .rhs = undefined, + }; + return node; + } + + fn 
createIdentifier(self: &Parser, name_token: &const Token) !&ast.NodeIdentifier { + const node = try self.allocator.create(ast.NodeIdentifier); + + *node = ast.NodeIdentifier { + .base = ast.Node {.id = ast.Node.Id.Identifier}, + .name_token = *name_token, + }; + return node; + } + + fn createIntegerLiteral(self: &Parser, token: &const Token) !&ast.NodeIntegerLiteral { + const node = try self.allocator.create(ast.NodeIntegerLiteral); + + *node = ast.NodeIntegerLiteral { + .base = ast.Node {.id = ast.Node.Id.IntegerLiteral}, + .token = *token, + }; + return node; + } + + fn createFloatLiteral(self: &Parser, token: &const Token) !&ast.NodeFloatLiteral { + const node = try self.allocator.create(ast.NodeFloatLiteral); + + *node = ast.NodeFloatLiteral { + .base = ast.Node {.id = ast.Node.Id.FloatLiteral}, + .token = *token, + }; + return node; + } + + fn createAttachIdentifier(self: &Parser, dest_ptr: &const DestPtr, name_token: &const Token) !&ast.NodeIdentifier { + const node = try self.createIdentifier(name_token); + try dest_ptr.store(&node.base); + return node; + } + + fn createAttachParamDecl(self: &Parser, list: &ArrayList(&ast.Node)) !&ast.NodeParamDecl { + const node = try self.createParamDecl(); + try list.append(&node.base); + return node; + } + + fn createAttachFnProto(self: &Parser, list: &ArrayList(&ast.Node), fn_token: &const Token, + extern_token: &const ?Token, cc_token: &const ?Token, visib_token: &const ?Token, + inline_token: &const ?Token) !&ast.NodeFnProto + { + const node = try self.createFnProto(fn_token, extern_token, cc_token, visib_token, inline_token); + try list.append(&node.base); + return node; + } + + fn createAttachVarDecl(self: &Parser, list: &ArrayList(&ast.Node), visib_token: &const ?Token, + mut_token: &const Token, comptime_token: &const ?Token, extern_token: &const ?Token) !&ast.NodeVarDecl + { + const node = try self.createVarDecl(visib_token, mut_token, comptime_token, extern_token); + try list.append(&node.base); + return node; + 
} + + fn parseError(self: &Parser, token: &const Token, comptime fmt: []const u8, args: ...) error { + const loc = self.tokenizer.getTokenLocation(token); + warn("{}:{}:{}: error: " ++ fmt ++ "\n", self.source_file_name, loc.line + 1, loc.column + 1, args); + warn("{}\n", self.tokenizer.buffer[loc.line_start..loc.line_end]); + { + var i: usize = 0; + while (i < loc.column) : (i += 1) { + warn(" "); + } + } + { + const caret_count = token.end - token.start; + var i: usize = 0; + while (i < caret_count) : (i += 1) { + warn("~"); + } + } + warn("\n"); + return error.ParseError; + } + + fn expectToken(self: &Parser, token: &const Token, id: @TagType(Token.Id)) !void { + if (token.id != id) { + return self.parseError(token, "expected {}, found {}", @tagName(id), @tagName(token.id)); + } + } + + fn eatToken(self: &Parser, id: @TagType(Token.Id)) !Token { + const token = self.getNextToken(); + try self.expectToken(token, id); + return token; + } + + fn putBackToken(self: &Parser, token: &const Token) void { + self.put_back_tokens[self.put_back_count] = *token; + self.put_back_count += 1; + } + + fn getNextToken(self: &Parser) Token { + if (self.put_back_count != 0) { + const put_back_index = self.put_back_count - 1; + const put_back_token = self.put_back_tokens[put_back_index]; + self.put_back_count = put_back_index; + return put_back_token; + } else { + return self.tokenizer.next(); + } + } + + const RenderAstFrame = struct { + node: &ast.Node, + indent: usize, + }; + + pub fn renderAst(self: &Parser, stream: var, root_node: &ast.NodeRoot) !void { + var stack = self.initUtilityArrayList(RenderAstFrame); + defer self.deinitUtilityArrayList(stack); + + try stack.append(RenderAstFrame { + .node = &root_node.base, + .indent = 0, + }); + + while (stack.popOrNull()) |frame| { + { + var i: usize = 0; + while (i < frame.indent) : (i += 1) { + try stream.print(" "); + } + } + try stream.print("{}\n", @tagName(frame.node.id)); + var child_i: usize = 0; + while 
(frame.node.iterate(child_i)) |child| : (child_i += 1) { + try stack.append(RenderAstFrame { + .node = child, + .indent = frame.indent + 2, + }); + } + } + } + + const RenderState = union(enum) { + TopLevelDecl: &ast.Node, + FnProtoRParen: &ast.NodeFnProto, + ParamDecl: &ast.Node, + Text: []const u8, + Expression: &ast.Node, + VarDecl: &ast.NodeVarDecl, + Statement: &ast.Node, + PrintIndent, + Indent: usize, + }; + + pub fn renderSource(self: &Parser, stream: var, root_node: &ast.NodeRoot) !void { + var stack = self.initUtilityArrayList(RenderState); + defer self.deinitUtilityArrayList(stack); + + { + var i = root_node.decls.len; + while (i != 0) { + i -= 1; + const decl = root_node.decls.items[i]; + try stack.append(RenderState {.TopLevelDecl = decl}); + } + } + + const indent_delta = 4; + var indent: usize = 0; + while (stack.popOrNull()) |state| { + switch (state) { + RenderState.TopLevelDecl => |decl| { + switch (decl.id) { + ast.Node.Id.FnProto => { + const fn_proto = @fieldParentPtr(ast.NodeFnProto, "base", decl); + if (fn_proto.visib_token) |visib_token| { + switch (visib_token.id) { + Token.Id.Keyword_pub => try stream.print("pub "), + Token.Id.Keyword_export => try stream.print("export "), + else => unreachable, + } + } + if (fn_proto.extern_token) |extern_token| { + try stream.print("{} ", self.tokenizer.getTokenSlice(extern_token)); + } + try stream.print("fn"); + + if (fn_proto.name_token) |name_token| { + try stream.print(" {}", self.tokenizer.getTokenSlice(name_token)); + } + + try stream.print("("); + + try stack.append(RenderState { .Text = "\n" }); + if (fn_proto.body_node == null) { + try stack.append(RenderState { .Text = ";" }); + } + + try stack.append(RenderState { .FnProtoRParen = fn_proto}); + var i = fn_proto.params.len; + while (i != 0) { + i -= 1; + const param_decl_node = fn_proto.params.items[i]; + try stack.append(RenderState { .ParamDecl = param_decl_node}); + if (i != 0) { + try stack.append(RenderState { .Text = ", " }); + } + } + 
}, + ast.Node.Id.VarDecl => { + const var_decl = @fieldParentPtr(ast.NodeVarDecl, "base", decl); + try stack.append(RenderState { .Text = "\n"}); + try stack.append(RenderState { .VarDecl = var_decl}); + + }, + else => unreachable, + } + }, + + RenderState.VarDecl => |var_decl| { + if (var_decl.visib_token) |visib_token| { + try stream.print("{} ", self.tokenizer.getTokenSlice(visib_token)); + } + if (var_decl.extern_token) |extern_token| { + try stream.print("{} ", self.tokenizer.getTokenSlice(extern_token)); + if (var_decl.lib_name != null) { + @panic("TODO"); + } + } + if (var_decl.comptime_token) |comptime_token| { + try stream.print("{} ", self.tokenizer.getTokenSlice(comptime_token)); + } + try stream.print("{} ", self.tokenizer.getTokenSlice(var_decl.mut_token)); + try stream.print("{}", self.tokenizer.getTokenSlice(var_decl.name_token)); + + try stack.append(RenderState { .Text = ";" }); + if (var_decl.init_node) |init_node| { + try stack.append(RenderState { .Expression = init_node }); + try stack.append(RenderState { .Text = " = " }); + } + if (var_decl.align_node) |align_node| { + try stack.append(RenderState { .Text = ")" }); + try stack.append(RenderState { .Expression = align_node }); + try stack.append(RenderState { .Text = " align(" }); + } + if (var_decl.type_node) |type_node| { + try stream.print(": "); + try stack.append(RenderState { .Expression = type_node }); + } + }, + + RenderState.ParamDecl => |base| { + const param_decl = @fieldParentPtr(ast.NodeParamDecl, "base", base); + if (param_decl.comptime_token) |comptime_token| { + try stream.print("{} ", self.tokenizer.getTokenSlice(comptime_token)); + } + if (param_decl.noalias_token) |noalias_token| { + try stream.print("{} ", self.tokenizer.getTokenSlice(noalias_token)); + } + if (param_decl.name_token) |name_token| { + try stream.print("{}: ", self.tokenizer.getTokenSlice(name_token)); + } + if (param_decl.var_args_token) |var_args_token| { + try stream.print("{}", 
self.tokenizer.getTokenSlice(var_args_token)); + } else { + try stack.append(RenderState { .Expression = param_decl.type_node}); + } + }, + RenderState.Text => |bytes| { + try stream.write(bytes); + }, + RenderState.Expression => |base| switch (base.id) { + ast.Node.Id.Identifier => { + const identifier = @fieldParentPtr(ast.NodeIdentifier, "base", base); + try stream.print("{}", self.tokenizer.getTokenSlice(identifier.name_token)); + }, + ast.Node.Id.Block => { + const block = @fieldParentPtr(ast.NodeBlock, "base", base); + try stream.write("{"); + try stack.append(RenderState { .Text = "}"}); + try stack.append(RenderState.PrintIndent); + try stack.append(RenderState { .Indent = indent}); + try stack.append(RenderState { .Text = "\n"}); + var i = block.statements.len; + while (i != 0) { + i -= 1; + const statement_node = block.statements.items[i]; + try stack.append(RenderState { .Statement = statement_node}); + try stack.append(RenderState.PrintIndent); + try stack.append(RenderState { .Indent = indent + indent_delta}); + try stack.append(RenderState { .Text = "\n" }); + } + }, + ast.Node.Id.InfixOp => { + const prefix_op_node = @fieldParentPtr(ast.NodeInfixOp, "base", base); + try stack.append(RenderState { .Expression = prefix_op_node.rhs }); + switch (prefix_op_node.op) { + ast.NodeInfixOp.InfixOp.EqualEqual => { + try stack.append(RenderState { .Text = " == "}); + }, + ast.NodeInfixOp.InfixOp.BangEqual => { + try stack.append(RenderState { .Text = " != "}); + }, + else => unreachable, + } + try stack.append(RenderState { .Expression = prefix_op_node.lhs }); + }, + ast.Node.Id.PrefixOp => { + const prefix_op_node = @fieldParentPtr(ast.NodePrefixOp, "base", base); + try stack.append(RenderState { .Expression = prefix_op_node.rhs }); + switch (prefix_op_node.op) { + ast.NodePrefixOp.PrefixOp.Return => { + try stream.write("return "); + }, + ast.NodePrefixOp.PrefixOp.AddrOf => |addr_of_info| { + try stream.write("&"); + if (addr_of_info.volatile_token != null) { 
+ try stack.append(RenderState { .Text = "volatile "}); + } + if (addr_of_info.const_token != null) { + try stack.append(RenderState { .Text = "const "}); + } + if (addr_of_info.align_expr) |align_expr| { + try stream.print("align("); + try stack.append(RenderState { .Text = ") "}); + try stack.append(RenderState { .Expression = align_expr}); + } + }, + else => unreachable, + } + }, + ast.Node.Id.IntegerLiteral => { + const integer_literal = @fieldParentPtr(ast.NodeIntegerLiteral, "base", base); + try stream.print("{}", self.tokenizer.getTokenSlice(integer_literal.token)); + }, + ast.Node.Id.FloatLiteral => { + const float_literal = @fieldParentPtr(ast.NodeFloatLiteral, "base", base); + try stream.print("{}", self.tokenizer.getTokenSlice(float_literal.token)); + }, + else => unreachable, + }, + RenderState.FnProtoRParen => |fn_proto| { + try stream.print(")"); + if (fn_proto.align_expr != null) { + @panic("TODO"); + } + try stream.print(" "); + if (fn_proto.body_node) |body_node| { + try stack.append(RenderState { .Expression = body_node}); + try stack.append(RenderState { .Text = " "}); + } + try stack.append(RenderState { .Expression = fn_proto.return_type}); + }, + RenderState.Statement => |base| { + switch (base.id) { + ast.Node.Id.VarDecl => { + const var_decl = @fieldParentPtr(ast.NodeVarDecl, "base", base); + try stack.append(RenderState { .VarDecl = var_decl}); + }, + else => { + try stack.append(RenderState { .Text = ";"}); + try stack.append(RenderState { .Expression = base}); + }, + } + }, + RenderState.Indent => |new_indent| indent = new_indent, + RenderState.PrintIndent => try stream.writeByteNTimes(' ', indent), + } + } + } + + fn initUtilityArrayList(self: &Parser, comptime T: type) ArrayList(T) { + const new_byte_count = self.utility_bytes.len - self.utility_bytes.len % @sizeOf(T); + self.utility_bytes = self.allocator.alignedShrink(u8, utility_bytes_align, self.utility_bytes, new_byte_count); + const typed_slice = ([]T)(self.utility_bytes); + 
return ArrayList(T) { + .allocator = self.allocator, + .items = typed_slice, + .len = 0, + }; + } + + fn deinitUtilityArrayList(self: &Parser, list: var) void { + self.utility_bytes = ([]align(utility_bytes_align) u8)(list.items); + } + +}; + +var fixed_buffer_mem: [100 * 1024]u8 = undefined; + +fn testParse(source: []const u8, allocator: &mem.Allocator) ![]u8 { + var padded_source: [0x100]u8 = undefined; + std.mem.copy(u8, padded_source[0..source.len], source); + padded_source[source.len + 0] = '\n'; + padded_source[source.len + 1] = '\n'; + padded_source[source.len + 2] = '\n'; + + var tokenizer = Tokenizer.init(padded_source[0..source.len + 3]); + var parser = Parser.init(&tokenizer, allocator, "(memory buffer)"); + defer parser.deinit(); + + const tree = try parser.parse(); + defer tree.deinit(); + + var buffer = try std.Buffer.initSize(allocator, 0); + var buffer_out_stream = io.BufferOutStream.init(&buffer); + try parser.renderSource(&buffer_out_stream.stream, tree.root_node); + return buffer.toOwnedSlice(); +} + +// TODO test for memory leaks +// TODO test for valid frees +fn testCanonical(source: []const u8) !void { + const needed_alloc_count = x: { + // Try it once with unlimited memory, make sure it works + var fixed_allocator = mem.FixedBufferAllocator.init(fixed_buffer_mem[0..]); + var failing_allocator = std.debug.FailingAllocator.init(&fixed_allocator.allocator, @maxValue(usize)); + const result_source = try testParse(source, &failing_allocator.allocator); + if (!mem.eql(u8, result_source, source)) { + warn("\n====== expected this output: =========\n"); + warn("{}", source); + warn("\n======== instead found this: =========\n"); + warn("{}", result_source); + warn("\n======================================\n"); + return error.TestFailed; + } + failing_allocator.allocator.free(result_source); + break :x failing_allocator.index; + }; + + var fail_index: usize = 0; + while (fail_index < needed_alloc_count) : (fail_index += 1) { + var fixed_allocator = 
mem.FixedBufferAllocator.init(fixed_buffer_mem[0..]); + var failing_allocator = std.debug.FailingAllocator.init(&fixed_allocator.allocator, fail_index); + if (testParse(source, &failing_allocator.allocator)) |_| { + return error.NondeterministicMemoryUsage; + } else |err| { + assert(err == error.OutOfMemory); + // TODO make this pass + //if (failing_allocator.allocated_bytes != failing_allocator.freed_bytes) { + // warn("\nfail_index: {}/{}\nallocated bytes: {}\nfreed bytes: {}\nallocations: {}\ndeallocations: {}\n", + // fail_index, needed_alloc_count, + // failing_allocator.allocated_bytes, failing_allocator.freed_bytes, + // failing_allocator.index, failing_allocator.deallocations); + // return error.MemoryLeakDetected; + //} + } + } +} + +test "zig fmt" { + try testCanonical( + \\extern fn puts(s: &const u8) c_int; + \\ + ); + + try testCanonical( + \\const a = b; + \\pub const a = b; + \\var a = b; + \\pub var a = b; + \\const a: i32 = b; + \\pub const a: i32 = b; + \\var a: i32 = b; + \\pub var a: i32 = b; + \\ + ); + + try testCanonical( + \\extern var foo: c_int; + \\ + ); + + try testCanonical( + \\var foo: c_int align(1); + \\ + ); + + try testCanonical( + \\fn main(argc: c_int, argv: &&u8) c_int { + \\ const a = b; + \\} + \\ + ); + + try testCanonical( + \\fn foo(argc: c_int, argv: &&u8) c_int { + \\ return 0; + \\} + \\ + ); + + try testCanonical( + \\extern fn f1(s: &align(&u8) u8) c_int; + \\ + ); + + try testCanonical( + \\extern fn f1(s: &&align(1) &const &volatile u8) c_int; + \\extern fn f2(s: &align(1) const &align(1) volatile &const volatile u8) c_int; + \\extern fn f3(s: &align(1) const volatile u8) c_int; + \\ + ); + + try testCanonical( + \\fn f1(a: bool, b: bool) bool { + \\ a != b; + \\ return a == b; + \\} + \\ + ); +} diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig @@ -0,0 +1,659 @@ +const std = @import("../index.zig"); +const mem = std.mem; + +pub const Token = struct { + id: Id, + start: usize, + end: usize, + + const 
KeywordId = struct { + bytes: []const u8, + id: Id, + }; + + const keywords = []KeywordId { + KeywordId{.bytes="align", .id = Id.Keyword_align}, + KeywordId{.bytes="and", .id = Id.Keyword_and}, + KeywordId{.bytes="asm", .id = Id.Keyword_asm}, + KeywordId{.bytes="break", .id = Id.Keyword_break}, + KeywordId{.bytes="comptime", .id = Id.Keyword_comptime}, + KeywordId{.bytes="const", .id = Id.Keyword_const}, + KeywordId{.bytes="continue", .id = Id.Keyword_continue}, + KeywordId{.bytes="defer", .id = Id.Keyword_defer}, + KeywordId{.bytes="else", .id = Id.Keyword_else}, + KeywordId{.bytes="enum", .id = Id.Keyword_enum}, + KeywordId{.bytes="error", .id = Id.Keyword_error}, + KeywordId{.bytes="export", .id = Id.Keyword_export}, + KeywordId{.bytes="extern", .id = Id.Keyword_extern}, + KeywordId{.bytes="false", .id = Id.Keyword_false}, + KeywordId{.bytes="fn", .id = Id.Keyword_fn}, + KeywordId{.bytes="for", .id = Id.Keyword_for}, + KeywordId{.bytes="goto", .id = Id.Keyword_goto}, + KeywordId{.bytes="if", .id = Id.Keyword_if}, + KeywordId{.bytes="inline", .id = Id.Keyword_inline}, + KeywordId{.bytes="nakedcc", .id = Id.Keyword_nakedcc}, + KeywordId{.bytes="noalias", .id = Id.Keyword_noalias}, + KeywordId{.bytes="null", .id = Id.Keyword_null}, + KeywordId{.bytes="or", .id = Id.Keyword_or}, + KeywordId{.bytes="packed", .id = Id.Keyword_packed}, + KeywordId{.bytes="pub", .id = Id.Keyword_pub}, + KeywordId{.bytes="return", .id = Id.Keyword_return}, + KeywordId{.bytes="stdcallcc", .id = Id.Keyword_stdcallcc}, + KeywordId{.bytes="struct", .id = Id.Keyword_struct}, + KeywordId{.bytes="switch", .id = Id.Keyword_switch}, + KeywordId{.bytes="test", .id = Id.Keyword_test}, + KeywordId{.bytes="this", .id = Id.Keyword_this}, + KeywordId{.bytes="true", .id = Id.Keyword_true}, + KeywordId{.bytes="undefined", .id = Id.Keyword_undefined}, + KeywordId{.bytes="union", .id = Id.Keyword_union}, + KeywordId{.bytes="unreachable", .id = Id.Keyword_unreachable}, + KeywordId{.bytes="use", .id = 
Id.Keyword_use}, + KeywordId{.bytes="var", .id = Id.Keyword_var}, + KeywordId{.bytes="volatile", .id = Id.Keyword_volatile}, + KeywordId{.bytes="while", .id = Id.Keyword_while}, + }; + + fn getKeyword(bytes: []const u8) ?Id { + for (keywords) |kw| { + if (mem.eql(u8, kw.bytes, bytes)) { + return kw.id; + } + } + return null; + } + + const StrLitKind = enum {Normal, C}; + + pub const Id = union(enum) { + Invalid, + Identifier, + StringLiteral: StrLitKind, + Eof, + Builtin, + Bang, + Equal, + EqualEqual, + BangEqual, + LParen, + RParen, + Semicolon, + Percent, + LBrace, + RBrace, + Period, + Ellipsis2, + Ellipsis3, + Minus, + Arrow, + Colon, + Slash, + Comma, + Ampersand, + AmpersandEqual, + IntegerLiteral, + FloatLiteral, + Keyword_align, + Keyword_and, + Keyword_asm, + Keyword_break, + Keyword_comptime, + Keyword_const, + Keyword_continue, + Keyword_defer, + Keyword_else, + Keyword_enum, + Keyword_error, + Keyword_export, + Keyword_extern, + Keyword_false, + Keyword_fn, + Keyword_for, + Keyword_goto, + Keyword_if, + Keyword_inline, + Keyword_nakedcc, + Keyword_noalias, + Keyword_null, + Keyword_or, + Keyword_packed, + Keyword_pub, + Keyword_return, + Keyword_stdcallcc, + Keyword_struct, + Keyword_switch, + Keyword_test, + Keyword_this, + Keyword_true, + Keyword_undefined, + Keyword_union, + Keyword_unreachable, + Keyword_use, + Keyword_var, + Keyword_volatile, + Keyword_while, + }; +}; + +pub const Tokenizer = struct { + buffer: []const u8, + index: usize, + pending_invalid_token: ?Token, + + pub const Location = struct { + line: usize, + column: usize, + line_start: usize, + line_end: usize, + }; + + pub fn getTokenLocation(self: &Tokenizer, token: &const Token) Location { + var loc = Location { + .line = 0, + .column = 0, + .line_start = 0, + .line_end = 0, + }; + for (self.buffer) |c, i| { + if (i == token.start) { + loc.line_end = i; + while (loc.line_end < self.buffer.len and self.buffer[loc.line_end] != '\n') : (loc.line_end += 1) {} + return loc; + } + if (c 
== '\n') { + loc.line += 1; + loc.column = 0; + loc.line_start = i + 1; + } else { + loc.column += 1; + } + } + return loc; + } + + /// For debugging purposes + pub fn dump(self: &Tokenizer, token: &const Token) void { + std.debug.warn("{} \"{}\"\n", @tagName(token.id), self.buffer[token.start..token.end]); + } + + /// buffer must end with "\n\n\n". This is so that attempting to decode + /// a the 3 trailing bytes of a 4-byte utf8 sequence is never a buffer overflow. + pub fn init(buffer: []const u8) Tokenizer { + std.debug.assert(buffer[buffer.len - 1] == '\n'); + std.debug.assert(buffer[buffer.len - 2] == '\n'); + std.debug.assert(buffer[buffer.len - 3] == '\n'); + return Tokenizer { + .buffer = buffer, + .index = 0, + .pending_invalid_token = null, + }; + } + + const State = enum { + Start, + Identifier, + Builtin, + C, + StringLiteral, + StringLiteralBackslash, + Equal, + Bang, + Minus, + Slash, + LineComment, + Zero, + IntegerLiteral, + IntegerLiteralWithRadix, + NumberDot, + FloatFraction, + FloatExponentUnsigned, + FloatExponentNumber, + Ampersand, + Period, + Period2, + }; + + pub fn next(self: &Tokenizer) Token { + if (self.pending_invalid_token) |token| { + self.pending_invalid_token = null; + return token; + } + var state = State.Start; + var result = Token { + .id = Token.Id.Eof, + .start = self.index, + .end = undefined, + }; + while (self.index < self.buffer.len) : (self.index += 1) { + const c = self.buffer[self.index]; + switch (state) { + State.Start => switch (c) { + ' ', '\n' => { + result.start = self.index + 1; + }, + 'c' => { + state = State.C; + result.id = Token.Id.Identifier; + }, + '"' => { + state = State.StringLiteral; + result.id = Token.Id { .StringLiteral = Token.StrLitKind.Normal }; + }, + 'a'...'b', 'd'...'z', 'A'...'Z', '_' => { + state = State.Identifier; + result.id = Token.Id.Identifier; + }, + '@' => { + state = State.Builtin; + result.id = Token.Id.Builtin; + }, + '=' => { + state = State.Equal; + }, + '!' 
=> { + state = State.Bang; + }, + '(' => { + result.id = Token.Id.LParen; + self.index += 1; + break; + }, + ')' => { + result.id = Token.Id.RParen; + self.index += 1; + break; + }, + ';' => { + result.id = Token.Id.Semicolon; + self.index += 1; + break; + }, + ',' => { + result.id = Token.Id.Comma; + self.index += 1; + break; + }, + ':' => { + result.id = Token.Id.Colon; + self.index += 1; + break; + }, + '%' => { + result.id = Token.Id.Percent; + self.index += 1; + break; + }, + '{' => { + result.id = Token.Id.LBrace; + self.index += 1; + break; + }, + '}' => { + result.id = Token.Id.RBrace; + self.index += 1; + break; + }, + '.' => { + state = State.Period; + }, + '-' => { + state = State.Minus; + }, + '/' => { + state = State.Slash; + }, + '&' => { + state = State.Ampersand; + }, + '0' => { + state = State.Zero; + result.id = Token.Id.IntegerLiteral; + }, + '1'...'9' => { + state = State.IntegerLiteral; + result.id = Token.Id.IntegerLiteral; + }, + else => { + result.id = Token.Id.Invalid; + self.index += 1; + break; + }, + }, + State.Ampersand => switch (c) { + '=' => { + result.id = Token.Id.AmpersandEqual; + self.index += 1; + break; + }, + else => { + result.id = Token.Id.Ampersand; + break; + }, + }, + State.Identifier => switch (c) { + 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, + else => { + if (Token.getKeyword(self.buffer[result.start..self.index])) |id| { + result.id = id; + } + break; + }, + }, + State.Builtin => switch (c) { + 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, + else => break, + }, + State.C => switch (c) { + '\\' => @panic("TODO"), + '"' => { + state = State.StringLiteral; + result.id = Token.Id { .StringLiteral = Token.StrLitKind.C }; + }, + 'a'...'z', 'A'...'Z', '_', '0'...'9' => { + state = State.Identifier; + }, + else => break, + }, + State.StringLiteral => switch (c) { + '\\' => { + state = State.StringLiteralBackslash; + }, + '"' => { + self.index += 1; + break; + }, + '\n' => break, // Look for this error later. 
+ else => self.checkLiteralCharacter(), + }, + + State.StringLiteralBackslash => switch (c) { + '\n' => break, // Look for this error later. + else => { + state = State.StringLiteral; + }, + }, + + State.Bang => switch (c) { + '=' => { + result.id = Token.Id.BangEqual; + self.index += 1; + break; + }, + else => { + result.id = Token.Id.Bang; + break; + }, + }, + + State.Equal => switch (c) { + '=' => { + result.id = Token.Id.EqualEqual; + self.index += 1; + break; + }, + else => { + result.id = Token.Id.Equal; + break; + }, + }, + + State.Minus => switch (c) { + '>' => { + result.id = Token.Id.Arrow; + self.index += 1; + break; + }, + else => { + result.id = Token.Id.Minus; + break; + }, + }, + + State.Period => switch (c) { + '.' => { + state = State.Period2; + }, + else => { + result.id = Token.Id.Period; + break; + }, + }, + + State.Period2 => switch (c) { + '.' => { + result.id = Token.Id.Ellipsis3; + self.index += 1; + break; + }, + else => { + result.id = Token.Id.Ellipsis2; + break; + }, + }, + + State.Slash => switch (c) { + '/' => { + result.id = undefined; + state = State.LineComment; + }, + else => { + result.id = Token.Id.Slash; + break; + }, + }, + State.LineComment => switch (c) { + '\n' => { + state = State.Start; + result = Token { + .id = Token.Id.Eof, + .start = self.index + 1, + .end = undefined, + }; + }, + else => self.checkLiteralCharacter(), + }, + State.Zero => switch (c) { + 'b', 'o', 'x' => { + state = State.IntegerLiteralWithRadix; + }, + else => { + // reinterpret as a normal number + self.index -= 1; + state = State.IntegerLiteral; + }, + }, + State.IntegerLiteral => switch (c) { + '.' => { + state = State.NumberDot; + }, + 'p', 'P', 'e', 'E' => { + state = State.FloatExponentUnsigned; + }, + '0'...'9' => {}, + else => break, + }, + State.IntegerLiteralWithRadix => switch (c) { + '.' 
=> { + state = State.NumberDot; + }, + 'p', 'P' => { + state = State.FloatExponentUnsigned; + }, + '0'...'9', 'a'...'f', 'A'...'F' => {}, + else => break, + }, + State.NumberDot => switch (c) { + '.' => { + self.index -= 1; + state = State.Start; + break; + }, + else => { + self.index -= 1; + result.id = Token.Id.FloatLiteral; + state = State.FloatFraction; + }, + }, + State.FloatFraction => switch (c) { + 'p', 'P' => { + state = State.FloatExponentUnsigned; + }, + '0'...'9', 'a'...'f', 'A'...'F' => {}, + else => break, + }, + State.FloatExponentUnsigned => switch (c) { + '+', '-' => { + state = State.FloatExponentNumber; + }, + else => { + // reinterpret as a normal exponent number + self.index -= 1; + state = State.FloatExponentNumber; + } + }, + State.FloatExponentNumber => switch (c) { + '0'...'9', 'a'...'f', 'A'...'F' => {}, + else => break, + }, + } + } + result.end = self.index; + + if (result.id == Token.Id.Eof) { + if (self.pending_invalid_token) |token| { + self.pending_invalid_token = null; + return token; + } + } + + return result; + } + + pub fn getTokenSlice(self: &const Tokenizer, token: &const Token) []const u8 { + return self.buffer[token.start..token.end]; + } + + fn checkLiteralCharacter(self: &Tokenizer) void { + if (self.pending_invalid_token != null) return; + const invalid_length = self.getInvalidCharacterLength(); + if (invalid_length == 0) return; + self.pending_invalid_token = Token { + .id = Token.Id.Invalid, + .start = self.index, + .end = self.index + invalid_length, + }; + } + + fn getInvalidCharacterLength(self: &Tokenizer) u3 { + const c0 = self.buffer[self.index]; + if (c0 < 0x80) { + if (c0 < 0x20 or c0 == 0x7f) { + // ascii control codes are never allowed + // (note that \n was checked before we got here) + return 1; + } + // looks fine to me. + return 0; + } else { + // check utf8-encoded character. 
+ const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1; + // the last 3 bytes in the buffer are guaranteed to be '\n', + // which means we don't need to do any bounds checking here. + const bytes = self.buffer[self.index..self.index + length]; + switch (length) { + 2 => { + const value = std.unicode.utf8Decode2(bytes) catch return length; + if (value == 0x85) return length; // U+0085 (NEL) + }, + 3 => { + const value = std.unicode.utf8Decode3(bytes) catch return length; + if (value == 0x2028) return length; // U+2028 (LS) + if (value == 0x2029) return length; // U+2029 (PS) + }, + 4 => { + _ = std.unicode.utf8Decode4(bytes) catch return length; + }, + else => unreachable, + } + self.index += length - 1; + return 0; + } + } +}; + + + +test "tokenizer" { + testTokenize("test", []Token.Id { + Token.Id.Keyword_test, + }); +} + +test "tokenizer - invalid token characters" { + testTokenize("#", []Token.Id{Token.Id.Invalid}); + testTokenize("`", []Token.Id{Token.Id.Invalid}); +} + +test "tokenizer - invalid literal/comment characters" { + testTokenize("\"\x00\"", []Token.Id { + Token.Id { .StringLiteral = Token.StrLitKind.Normal }, + Token.Id.Invalid, + }); + testTokenize("//\x00", []Token.Id { + Token.Id.Invalid, + }); + testTokenize("//\x1f", []Token.Id { + Token.Id.Invalid, + }); + testTokenize("//\x7f", []Token.Id { + Token.Id.Invalid, + }); +} + +test "tokenizer - utf8" { + testTokenize("//\xc2\x80", []Token.Id{}); + testTokenize("//\xf4\x8f\xbf\xbf", []Token.Id{}); +} + +test "tokenizer - invalid utf8" { + testTokenize("//\x80", []Token.Id{Token.Id.Invalid}); + testTokenize("//\xbf", []Token.Id{Token.Id.Invalid}); + testTokenize("//\xf8", []Token.Id{Token.Id.Invalid}); + testTokenize("//\xff", []Token.Id{Token.Id.Invalid}); + testTokenize("//\xc2\xc0", []Token.Id{Token.Id.Invalid}); + testTokenize("//\xe0", []Token.Id{Token.Id.Invalid}); + testTokenize("//\xf0", []Token.Id{Token.Id.Invalid}); + testTokenize("//\xf0\x90\x80\xc0", 
[]Token.Id{Token.Id.Invalid}); +} + +test "tokenizer - illegal unicode codepoints" { + // unicode newline characters.U+0085, U+2028, U+2029 + testTokenize("//\xc2\x84", []Token.Id{}); + testTokenize("//\xc2\x85", []Token.Id{Token.Id.Invalid}); + testTokenize("//\xc2\x86", []Token.Id{}); + testTokenize("//\xe2\x80\xa7", []Token.Id{}); + testTokenize("//\xe2\x80\xa8", []Token.Id{Token.Id.Invalid}); + testTokenize("//\xe2\x80\xa9", []Token.Id{Token.Id.Invalid}); + testTokenize("//\xe2\x80\xaa", []Token.Id{}); +} + +fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void { + // (test authors, just make this bigger if you need it) + var padded_source: [0x100]u8 = undefined; + std.mem.copy(u8, padded_source[0..source.len], source); + padded_source[source.len + 0] = '\n'; + padded_source[source.len + 1] = '\n'; + padded_source[source.len + 2] = '\n'; + + var tokenizer = Tokenizer.init(padded_source[0..source.len + 3]); + for (expected_tokens) |expected_token_id| { + const token = tokenizer.next(); + std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id)); + switch (expected_token_id) { + Token.Id.StringLiteral => |expected_kind| { + std.debug.assert(expected_kind == switch (token.id) { Token.Id.StringLiteral => |kind| kind, else => unreachable }); + }, + else => {}, + } + } + std.debug.assert(tokenizer.next().id == Token.Id.Eof); +}