@@ -1,5 +1,4 @@
 const std = @import("../std.zig");
-const mem = std.mem;

 pub const Token = struct {
     tag: Tag,
@@ -350,7 +349,7 @@ pub const Tokenizer = struct {

     pub fn init(buffer: [:0]const u8) Tokenizer {
         // Skip the UTF-8 BOM if present
-        const src_start = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else @as(usize, 0);
+        const src_start: usize = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0;
         return Tokenizer{
             .buffer = buffer,
             .index = src_start,
@@ -1433,8 +1432,8 @@ pub const Tokenizer = struct {

     fn getInvalidCharacterLength(self: *Tokenizer) u3 {
         const c0 = self.buffer[self.index];
-        if (c0 < 0x80) {
-            if (c0 < 0x20 or c0 == 0x7f) {
+        if (std.ascii.isASCII(c0)) {
+            if (std.ascii.isCntrl(c0)) {
                 // ascii control codes are never allowed
                 // (note that \n was checked before we got here)
                 return 1;
@@ -1469,8 +1468,8 @@ pub const Tokenizer = struct {
     }
 };

-test "tokenizer" {
-    try testTokenize("test", &.{.keyword_test});
+test "keywords" {
+    try testTokenize("test const else", &.{ .keyword_test, .keyword_const, .keyword_else });
 }

 test "line comment followed by top-level comptime" {
@@ -1485,7 +1484,7 @@ test "line comment followed by top-level comptime" {
     });
 }

-test "tokenizer - unknown length pointer and then c pointer" {
+test "unknown length pointer and then c pointer" {
     try testTokenize(
         \\[*]u8
         \\[*c]u8
@@ -1502,7 +1501,7 @@ test "tokenizer - unknown length pointer and then c pointer" {
     });
 }

-test "tokenizer - code point literal with hex escape" {
+test "code point literal with hex escape" {
     try testTokenize(
         \\'\x1b'
     , &.{.char_literal});
@@ -1511,21 +1510,21 @@ test "tokenizer - code point literal with hex escape" {
     , &.{ .invalid, .invalid });
 }

-test "tokenizer - newline in char literal" {
+test "newline in char literal" {
     try testTokenize(
         \\'
         \\'
     , &.{ .invalid, .invalid });
 }

-test "tokenizer - newline in string literal" {
+test "newline in string literal" {
     try testTokenize(
         \\"
         \\"
     , &.{ .invalid, .string_literal });
 }

-test "tokenizer - code point literal with unicode escapes" {
+test "code point literal with unicode escapes" {
     // Valid unicode escapes
     try testTokenize(
         \\'\u{3}'
@@ -1575,13 +1574,13 @@ test "tokenizer - code point literal with unicode escapes" {
     , &.{ .invalid, .integer_literal, .invalid });
 }

-test "tokenizer - code point literal with unicode code point" {
+test "code point literal with unicode code point" {
     try testTokenize(
         \\'💩'
     , &.{.char_literal});
 }

-test "tokenizer - float literal e exponent" {
+test "float literal e exponent" {
     try testTokenize("a = 4.94065645841246544177e-324;\n", &.{
         .identifier,
         .equal,
@@ -1590,7 +1589,7 @@ test "tokenizer - float literal e exponent" {
     });
 }

-test "tokenizer - float literal p exponent" {
+test "float literal p exponent" {
     try testTokenize("a = 0x1.a827999fcef32p+1022;\n", &.{
         .identifier,
         .equal,
@@ -1599,11 +1598,11 @@ test "tokenizer - float literal p exponent" {
     });
 }

-test "tokenizer - chars" {
+test "chars" {
     try testTokenize("'c'", &.{.char_literal});
 }

-test "tokenizer - invalid token characters" {
+test "invalid token characters" {
     try testTokenize("#", &.{.invalid});
     try testTokenize("`", &.{.invalid});
     try testTokenize("'c", &.{.invalid});
@@ -1611,7 +1610,7 @@ test "tokenizer - invalid token characters" {
     try testTokenize("''", &.{ .invalid, .invalid });
 }

-test "tokenizer - invalid literal/comment characters" {
+test "invalid literal/comment characters" {
     try testTokenize("\"\x00\"", &.{
         .string_literal,
         .invalid,
@@ -1627,12 +1626,12 @@ test "tokenizer - invalid literal/comment characters" {
     });
 }

-test "tokenizer - utf8" {
+test "utf8" {
     try testTokenize("//\xc2\x80", &.{});
     try testTokenize("//\xf4\x8f\xbf\xbf", &.{});
 }

-test "tokenizer - invalid utf8" {
+test "invalid utf8" {
     try testTokenize("//\x80", &.{
         .invalid,
     });
@@ -1659,7 +1658,7 @@ test "tokenizer - invalid utf8" {
     });
 }

-test "tokenizer - illegal unicode codepoints" {
+test "illegal unicode codepoints" {
     // unicode newline characters.U+0085, U+2028, U+2029
     try testTokenize("//\xc2\x84", &.{});
     try testTokenize("//\xc2\x85", &.{
@@ -1676,7 +1675,7 @@ test "tokenizer - illegal unicode codepoints" {
     try testTokenize("//\xe2\x80\xaa", &.{});
 }

-test "tokenizer - string identifier and builtin fns" {
+test "string identifier and builtin fns" {
     try testTokenize(
         \\const @"if" = @import("std");
     , &.{
@@ -1691,7 +1690,7 @@ test "tokenizer - string identifier and builtin fns" {
     });
 }

-test "tokenizer - multiline string literal with literal tab" {
+test "multiline string literal with literal tab" {
     try testTokenize(
         \\\\foo	bar
     , &.{
@@ -1699,7 +1698,7 @@ test "tokenizer - multiline string literal with literal tab" {
     });
 }

-test "tokenizer - comments with literal tab" {
+test "comments with literal tab" {
     try testTokenize(
         \\//foo	bar
         \\//!foo	bar
@@ -1715,14 +1714,14 @@ test "tokenizer - comments with literal tab" {
     });
 }

-test "tokenizer - pipe and then invalid" {
+test "pipe and then invalid" {
     try testTokenize("||=", &.{
         .pipe_pipe,
         .equal,
     });
 }

-test "tokenizer - line comment and doc comment" {
+test "line comment and doc comment" {
     try testTokenize("//", &.{});
     try testTokenize("// a / b", &.{});
     try testTokenize("// /", &.{});
@@ -1733,7 +1732,7 @@ test "tokenizer - line comment and doc comment" {
     try testTokenize("//!!", &.{.container_doc_comment});
 }

-test "tokenizer - line comment followed by identifier" {
+test "line comment followed by identifier" {
     try testTokenize(
         \\ Unexpected,
         \\ // another
@@ -1746,7 +1745,7 @@ test "tokenizer - line comment followed by identifier" {
     });
 }

-test "tokenizer - UTF-8 BOM is recognized and skipped" {
+test "UTF-8 BOM is recognized and skipped" {
     try testTokenize("\xEF\xBB\xBFa;\n", &.{
         .identifier,
         .semicolon,
@@ -1788,7 +1787,7 @@ test "correctly parse pointer dereference followed by asterisk" {
     });
 }

-test "tokenizer - range literals" {
+test "range literals" {
     try testTokenize("0...9", &.{ .integer_literal, .ellipsis3, .integer_literal });
     try testTokenize("'0'...'9'", &.{ .char_literal, .ellipsis3, .char_literal });
     try testTokenize("0x00...0x09", &.{ .integer_literal, .ellipsis3, .integer_literal });
@@ -1796,7 +1795,7 @@ test "tokenizer - range literals" {
     try testTokenize("0o00...0o11", &.{ .integer_literal, .ellipsis3, .integer_literal });
 }

-test "tokenizer - number literals decimal" {
+test "number literals decimal" {
     try testTokenize("0", &.{.integer_literal});
     try testTokenize("1", &.{.integer_literal});
     try testTokenize("2", &.{.integer_literal});
@@ -1863,7 +1862,7 @@ test "tokenizer - number literals decimal" {
     try testTokenize("1.0e0_+", &.{ .invalid, .plus });
 }

-test "tokenizer - number literals binary" {
+test "number literals binary" {
     try testTokenize("0b0", &.{.integer_literal});
     try testTokenize("0b1", &.{.integer_literal});
     try testTokenize("0b2", &.{ .invalid, .integer_literal });
@@ -1902,7 +1901,7 @@ test "tokenizer - number literals binary" {
     try testTokenize("0b1_,", &.{ .invalid, .comma });
 }

-test "tokenizer - number literals octal" {
+test "number literals octal" {
     try testTokenize("0o0", &.{.integer_literal});
     try testTokenize("0o1", &.{.integer_literal});
     try testTokenize("0o2", &.{.integer_literal});
@@ -1941,7 +1940,7 @@ test "tokenizer - number literals octal" {
     try testTokenize("0o_,", &.{ .invalid, .identifier, .comma });
 }

-test "tokenizer - number literals hexadecimal" {
+test "number literals hexadecimal" {
     try testTokenize("0x0", &.{.integer_literal});
     try testTokenize("0x1", &.{.integer_literal});
     try testTokenize("0x2", &.{.integer_literal});
@@ -2029,22 +2028,22 @@ test "tokenizer - number literals hexadecimal" {
     try testTokenize("0x0.0p0_", &.{ .invalid, .eof });
 }

-test "tokenizer - multi line string literal with only 1 backslash" {
+test "multi line string literal with only 1 backslash" {
     try testTokenize("x \\\n;", &.{ .identifier, .invalid, .semicolon });
 }

-test "tokenizer - invalid builtin identifiers" {
+test "invalid builtin identifiers" {
     try testTokenize("@()", &.{ .invalid, .l_paren, .r_paren });
     try testTokenize("@0()", &.{ .invalid, .integer_literal, .l_paren, .r_paren });
 }

-test "tokenizer - invalid token with unfinished escape right before eof" {
+test "invalid token with unfinished escape right before eof" {
     try testTokenize("\"\\", &.{.invalid});
     try testTokenize("'\\", &.{.invalid});
     try testTokenize("'\\u", &.{.invalid});
 }

-test "tokenizer - saturating" {
+test "saturating operators" {
     try testTokenize("<<", &.{.angle_bracket_angle_bracket_left});
     try testTokenize("<<|", &.{.angle_bracket_angle_bracket_left_pipe});
     try testTokenize("<<|=", &.{.angle_bracket_angle_bracket_left_pipe_equal});
@@ -2062,17 +2061,14 @@ test "tokenizer - saturating" {
     try testTokenize("-|=", &.{.minus_pipe_equal});
 }

-fn testTokenize(source: [:0]const u8, expected_tokens: []const Token.Tag) !void {
+fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
     var tokenizer = Tokenizer.init(source);
-    for (expected_tokens) |expected_token_id| {
+    for (expected_token_tags) |expected_token_tag| {
         const token = tokenizer.next();
-        if (token.tag != expected_token_id) {
-            std.debug.panic("expected {s}, found {s}\n", .{
-                @tagName(expected_token_id), @tagName(token.tag),
-            });
-        }
+        try std.testing.expectEqual(expected_token_tag, token.tag);
     }
     const last_token = tokenizer.next();
     try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
     try std.testing.expectEqual(source.len, last_token.loc.start);
     try std.testing.expectEqual(source.len, last_token.loc.end);
 }