zig0

my attempts at zig bootstrapping in C
Log | Files | Refs | README | LICENSE

tokenizer_test.zig (29725B) - Raw


      1 const std = @import("std");
      2 const testing = std.testing;
      3 
      4 const Token = std.zig.Token;
      5 const Tokenizer = std.zig.Tokenizer;
      6 
      7 const c = @cImport({
      8     @cInclude("tokenizer.h");
      9 });
     10 
     11 pub fn zigToken(token: c_uint) Token.Tag {
     12     return switch (token) {
     13         c.TOKEN_INVALID => .invalid,
     14         c.TOKEN_INVALID_PERIODASTERISKS => .invalid_periodasterisks,
     15         c.TOKEN_IDENTIFIER => .identifier,
     16         c.TOKEN_STRING_LITERAL => .string_literal,
     17         c.TOKEN_MULTILINE_STRING_LITERAL_LINE => .multiline_string_literal_line,
     18         c.TOKEN_CHAR_LITERAL => .char_literal,
     19         c.TOKEN_EOF => .eof,
     20         c.TOKEN_BUILTIN => .builtin,
     21         c.TOKEN_BANG => .bang,
     22         c.TOKEN_PIPE => .pipe,
     23         c.TOKEN_PIPE_PIPE => .pipe_pipe,
     24         c.TOKEN_PIPE_EQUAL => .pipe_equal,
     25         c.TOKEN_EQUAL => .equal,
     26         c.TOKEN_EQUAL_EQUAL => .equal_equal,
     27         c.TOKEN_EQUAL_ANGLE_BRACKET_RIGHT => .equal_angle_bracket_right,
     28         c.TOKEN_BANG_EQUAL => .bang_equal,
     29         c.TOKEN_L_PAREN => .l_paren,
     30         c.TOKEN_R_PAREN => .r_paren,
     31         c.TOKEN_SEMICOLON => .semicolon,
     32         c.TOKEN_PERCENT => .percent,
     33         c.TOKEN_PERCENT_EQUAL => .percent_equal,
     34         c.TOKEN_L_BRACE => .l_brace,
     35         c.TOKEN_R_BRACE => .r_brace,
     36         c.TOKEN_L_BRACKET => .l_bracket,
     37         c.TOKEN_R_BRACKET => .r_bracket,
     38         c.TOKEN_PERIOD => .period,
     39         c.TOKEN_PERIOD_ASTERISK => .period_asterisk,
     40         c.TOKEN_ELLIPSIS2 => .ellipsis2,
     41         c.TOKEN_ELLIPSIS3 => .ellipsis3,
     42         c.TOKEN_CARET => .caret,
     43         c.TOKEN_CARET_EQUAL => .caret_equal,
     44         c.TOKEN_PLUS => .plus,
     45         c.TOKEN_PLUS_PLUS => .plus_plus,
     46         c.TOKEN_PLUS_EQUAL => .plus_equal,
     47         c.TOKEN_PLUS_PERCENT => .plus_percent,
     48         c.TOKEN_PLUS_PERCENT_EQUAL => .plus_percent_equal,
     49         c.TOKEN_PLUS_PIPE => .plus_pipe,
     50         c.TOKEN_PLUS_PIPE_EQUAL => .plus_pipe_equal,
     51         c.TOKEN_MINUS => .minus,
     52         c.TOKEN_MINUS_EQUAL => .minus_equal,
     53         c.TOKEN_MINUS_PERCENT => .minus_percent,
     54         c.TOKEN_MINUS_PERCENT_EQUAL => .minus_percent_equal,
     55         c.TOKEN_MINUS_PIPE => .minus_pipe,
     56         c.TOKEN_MINUS_PIPE_EQUAL => .minus_pipe_equal,
     57         c.TOKEN_ASTERISK => .asterisk,
     58         c.TOKEN_ASTERISK_EQUAL => .asterisk_equal,
     59         c.TOKEN_ASTERISK_ASTERISK => .asterisk_asterisk,
     60         c.TOKEN_ASTERISK_PERCENT => .asterisk_percent,
     61         c.TOKEN_ASTERISK_PERCENT_EQUAL => .asterisk_percent_equal,
     62         c.TOKEN_ASTERISK_PIPE => .asterisk_pipe,
     63         c.TOKEN_ASTERISK_PIPE_EQUAL => .asterisk_pipe_equal,
     64         c.TOKEN_ARROW => .arrow,
     65         c.TOKEN_COLON => .colon,
     66         c.TOKEN_SLASH => .slash,
     67         c.TOKEN_SLASH_EQUAL => .slash_equal,
     68         c.TOKEN_COMMA => .comma,
     69         c.TOKEN_AMPERSAND => .ampersand,
     70         c.TOKEN_AMPERSAND_EQUAL => .ampersand_equal,
     71         c.TOKEN_QUESTION_MARK => .question_mark,
     72         c.TOKEN_ANGLE_BRACKET_LEFT => .angle_bracket_left,
     73         c.TOKEN_ANGLE_BRACKET_LEFT_EQUAL => .angle_bracket_left_equal,
     74         c.TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT => .angle_bracket_angle_bracket_left,
     75         c.TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL => .angle_bracket_angle_bracket_left_equal,
     76         c.TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE => .angle_bracket_angle_bracket_left_pipe,
     77         c.TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL => .angle_bracket_angle_bracket_left_pipe_equal,
     78         c.TOKEN_ANGLE_BRACKET_RIGHT => .angle_bracket_right,
     79         c.TOKEN_ANGLE_BRACKET_RIGHT_EQUAL => .angle_bracket_right_equal,
     80         c.TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT => .angle_bracket_angle_bracket_right,
     81         c.TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL => .angle_bracket_angle_bracket_right_equal,
     82         c.TOKEN_TILDE => .tilde,
     83         c.TOKEN_NUMBER_LITERAL => .number_literal,
     84         c.TOKEN_DOC_COMMENT => .doc_comment,
     85         c.TOKEN_CONTAINER_DOC_COMMENT => .container_doc_comment,
     86         c.TOKEN_KEYWORD_ADDRSPACE => .keyword_addrspace,
     87         c.TOKEN_KEYWORD_ALIGN => .keyword_align,
     88         c.TOKEN_KEYWORD_ALLOWZERO => .keyword_allowzero,
     89         c.TOKEN_KEYWORD_AND => .keyword_and,
     90         c.TOKEN_KEYWORD_ANYFRAME => .keyword_anyframe,
     91         c.TOKEN_KEYWORD_ANYTYPE => .keyword_anytype,
     92         c.TOKEN_KEYWORD_ASM => .keyword_asm,
     93         c.TOKEN_KEYWORD_BREAK => .keyword_break,
     94         c.TOKEN_KEYWORD_CALLCONV => .keyword_callconv,
     95         c.TOKEN_KEYWORD_CATCH => .keyword_catch,
     96         c.TOKEN_KEYWORD_COMPTIME => .keyword_comptime,
     97         c.TOKEN_KEYWORD_CONST => .keyword_const,
     98         c.TOKEN_KEYWORD_CONTINUE => .keyword_continue,
     99         c.TOKEN_KEYWORD_DEFER => .keyword_defer,
    100         c.TOKEN_KEYWORD_ELSE => .keyword_else,
    101         c.TOKEN_KEYWORD_ENUM => .keyword_enum,
    102         c.TOKEN_KEYWORD_ERRDEFER => .keyword_errdefer,
    103         c.TOKEN_KEYWORD_ERROR => .keyword_error,
    104         c.TOKEN_KEYWORD_EXPORT => .keyword_export,
    105         c.TOKEN_KEYWORD_EXTERN => .keyword_extern,
    106         c.TOKEN_KEYWORD_FN => .keyword_fn,
    107         c.TOKEN_KEYWORD_FOR => .keyword_for,
    108         c.TOKEN_KEYWORD_IF => .keyword_if,
    109         c.TOKEN_KEYWORD_INLINE => .keyword_inline,
    110         c.TOKEN_KEYWORD_NOALIAS => .keyword_noalias,
    111         c.TOKEN_KEYWORD_NOINLINE => .keyword_noinline,
    112         c.TOKEN_KEYWORD_NOSUSPEND => .keyword_nosuspend,
    113         c.TOKEN_KEYWORD_OPAQUE => .keyword_opaque,
    114         c.TOKEN_KEYWORD_OR => .keyword_or,
    115         c.TOKEN_KEYWORD_ORELSE => .keyword_orelse,
    116         c.TOKEN_KEYWORD_PACKED => .keyword_packed,
    117         c.TOKEN_KEYWORD_PUB => .keyword_pub,
    118         c.TOKEN_KEYWORD_RESUME => .keyword_resume,
    119         c.TOKEN_KEYWORD_RETURN => .keyword_return,
    120         c.TOKEN_KEYWORD_LINKSECTION => .keyword_linksection,
    121         c.TOKEN_KEYWORD_STRUCT => .keyword_struct,
    122         c.TOKEN_KEYWORD_SUSPEND => .keyword_suspend,
    123         c.TOKEN_KEYWORD_SWITCH => .keyword_switch,
    124         c.TOKEN_KEYWORD_TEST => .keyword_test,
    125         c.TOKEN_KEYWORD_THREADLOCAL => .keyword_threadlocal,
    126         c.TOKEN_KEYWORD_TRY => .keyword_try,
    127         c.TOKEN_KEYWORD_UNION => .keyword_union,
    128         c.TOKEN_KEYWORD_UNREACHABLE => .keyword_unreachable,
    129         c.TOKEN_KEYWORD_VAR => .keyword_var,
    130         c.TOKEN_KEYWORD_VOLATILE => .keyword_volatile,
    131         c.TOKEN_KEYWORD_WHILE => .keyword_while,
    132         else => undefined,
    133     };
    134 }
    135 
    136 // Copy-pasted from lib/std/zig/tokenizer.zig
    137 fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    138     // Do the C thing
    139     {
    140         var ctokenizer = c.tokenizerInit(source.ptr, @intCast(source.len));
    141         for (expected_token_tags) |expected_token_tag| {
    142             const token = c.tokenizerNext(&ctokenizer);
    143             try std.testing.expectEqual(expected_token_tag, zigToken(token.tag));
    144         }
    145         const last_token = c.tokenizerNext(&ctokenizer);
    146         try std.testing.expectEqual(Token.Tag.eof, zigToken(last_token.tag));
    147     }
    148 
    149     {
    150         var tokenizer = Tokenizer.init(source);
    151         for (expected_token_tags) |expected_token_tag| {
    152             const token = tokenizer.next();
    153             try std.testing.expectEqual(expected_token_tag, token.tag);
    154         }
    155         // Last token should always be eof, even when the last token was invalid,
    156         // in which case the tokenizer is in an invalid state, which can only be
    157         // recovered by opinionated means outside the scope of this implementation.
    158         const last_token = tokenizer.next();
    159         try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
    160         try std.testing.expectEqual(source.len, last_token.loc.start);
    161         try std.testing.expectEqual(source.len, last_token.loc.end);
    162     }
    163 }
    164 
    165 test "keywords" {
    166     try testTokenize("test const else", &.{ .keyword_test, .keyword_const, .keyword_else });
    167 }
    168 
    169 test "line comment followed by top-level comptime" {
    170     try testTokenize(
    171         \\// line comment
    172         \\comptime {}
    173         \\
    174     , &.{
    175         .keyword_comptime,
    176         .l_brace,
    177         .r_brace,
    178     });
    179 }
    180 
    181 test "unknown length pointer and then c pointer" {
    182     try testTokenize(
    183         \\[*]u8
    184         \\[*c]u8
    185     , &.{
    186         .l_bracket,
    187         .asterisk,
    188         .r_bracket,
    189         .identifier,
    190         .l_bracket,
    191         .asterisk,
    192         .identifier,
    193         .r_bracket,
    194         .identifier,
    195     });
    196 }
    197 
    198 test "code point literal with hex escape" {
    199     try testTokenize(
    200         \\'\x1b'
    201     , &.{.char_literal});
    202     try testTokenize(
    203         \\'\x1'
    204     , &.{.char_literal});
    205 }
    206 
    207 test "newline in char literal" {
    208     try testTokenize(
    209         \\'
    210         \\'
    211     , &.{ .invalid, .invalid });
    212 }
    213 
    214 test "newline in string literal" {
    215     try testTokenize(
    216         \\"
    217         \\"
    218     , &.{ .invalid, .invalid });
    219 }
    220 
    221 test "code point literal with unicode escapes" {
    222     // Valid unicode escapes
    223     try testTokenize(
    224         \\'\u{3}'
    225     , &.{.char_literal});
    226     try testTokenize(
    227         \\'\u{01}'
    228     , &.{.char_literal});
    229     try testTokenize(
    230         \\'\u{2a}'
    231     , &.{.char_literal});
    232     try testTokenize(
    233         \\'\u{3f9}'
    234     , &.{.char_literal});
    235     try testTokenize(
    236         \\'\u{6E09aBc1523}'
    237     , &.{.char_literal});
    238     try testTokenize(
    239         \\"\u{440}"
    240     , &.{.string_literal});
    241 
    242     // Invalid unicode escapes
    243     try testTokenize(
    244         \\'\u'
    245     , &.{.char_literal});
    246     try testTokenize(
    247         \\'\u{{'
    248     , &.{.char_literal});
    249     try testTokenize(
    250         \\'\u{}'
    251     , &.{.char_literal});
    252     try testTokenize(
    253         \\'\u{s}'
    254     , &.{.char_literal});
    255     try testTokenize(
    256         \\'\u{2z}'
    257     , &.{.char_literal});
    258     try testTokenize(
    259         \\'\u{4a'
    260     , &.{.char_literal});
    261 
    262     // Test old-style unicode literals
    263     try testTokenize(
    264         \\'\u0333'
    265     , &.{.char_literal});
    266     try testTokenize(
    267         \\'\U0333'
    268     , &.{.char_literal});
    269 }
    270 
    271 test "code point literal with unicode code point" {
    272     try testTokenize(
    273         \\'💩'
    274     , &.{.char_literal});
    275 }
    276 
    277 test "float literal e exponent" {
    278     try testTokenize("a = 4.94065645841246544177e-324;\n", &.{
    279         .identifier,
    280         .equal,
    281         .number_literal,
    282         .semicolon,
    283     });
    284 }
    285 
    286 test "float literal p exponent" {
    287     try testTokenize("a = 0x1.a827999fcef32p+1022;\n", &.{
    288         .identifier,
    289         .equal,
    290         .number_literal,
    291         .semicolon,
    292     });
    293 }
    294 
    295 test "chars" {
    296     try testTokenize("'c'", &.{.char_literal});
    297 }
    298 
    299 test "invalid token characters" {
    300     try testTokenize("#", &.{.invalid});
    301     try testTokenize("`", &.{.invalid});
    302     try testTokenize("'c", &.{.invalid});
    303     try testTokenize("'", &.{.invalid});
    304     try testTokenize("''", &.{.char_literal});
    305     try testTokenize("'\n'", &.{ .invalid, .invalid });
    306 }
    307 
    308 test "invalid literal/comment characters" {
    309     try testTokenize("\"\x00\"", &.{.invalid});
    310     try testTokenize("`\x00`", &.{.invalid});
    311     try testTokenize("//\x00", &.{.invalid});
    312     try testTokenize("//\x1f", &.{.invalid});
    313     try testTokenize("//\x7f", &.{.invalid});
    314 }
    315 
    316 test "utf8" {
    317     try testTokenize("//\xc2\x80", &.{});
    318     try testTokenize("//\xf4\x8f\xbf\xbf", &.{});
    319 }
    320 
    321 test "invalid utf8" {
    322     try testTokenize("//\x80", &.{});
    323     try testTokenize("//\xbf", &.{});
    324     try testTokenize("//\xf8", &.{});
    325     try testTokenize("//\xff", &.{});
    326     try testTokenize("//\xc2\xc0", &.{});
    327     try testTokenize("//\xe0", &.{});
    328     try testTokenize("//\xf0", &.{});
    329     try testTokenize("//\xf0\x90\x80\xc0", &.{});
    330 }
    331 
    332 test "illegal unicode codepoints" {
    333     // unicode newline characters.U+0085, U+2028, U+2029
    334     try testTokenize("//\xc2\x84", &.{});
    335     try testTokenize("//\xc2\x85", &.{});
    336     try testTokenize("//\xc2\x86", &.{});
    337     try testTokenize("//\xe2\x80\xa7", &.{});
    338     try testTokenize("//\xe2\x80\xa8", &.{});
    339     try testTokenize("//\xe2\x80\xa9", &.{});
    340     try testTokenize("//\xe2\x80\xaa", &.{});
    341 }
    342 
    343 test "string identifier and builtin fns" {
    344     try testTokenize(
    345         \\const @"if" = @import("std");
    346     , &.{
    347         .keyword_const,
    348         .identifier,
    349         .equal,
    350         .builtin,
    351         .l_paren,
    352         .string_literal,
    353         .r_paren,
    354         .semicolon,
    355     });
    356 }
    357 
    358 test "pipe and then invalid" {
    359     try testTokenize("||=", &.{
    360         .pipe_pipe,
    361         .equal,
    362     });
    363 }
    364 
    365 test "line comment and doc comment" {
    366     try testTokenize("//", &.{});
    367     try testTokenize("// a / b", &.{});
    368     try testTokenize("// /", &.{});
    369     try testTokenize("/// a", &.{.doc_comment});
    370     try testTokenize("///", &.{.doc_comment});
    371     try testTokenize("////", &.{});
    372     try testTokenize("//!", &.{.container_doc_comment});
    373     try testTokenize("//!!", &.{.container_doc_comment});
    374 }
    375 
    376 test "line comment followed by identifier" {
    377     try testTokenize(
    378         \\    Unexpected,
    379         \\    // another
    380         \\    Another,
    381     , &.{
    382         .identifier,
    383         .comma,
    384         .identifier,
    385         .comma,
    386     });
    387 }
    388 
    389 test "UTF-8 BOM is recognized and skipped" {
    390     try testTokenize("\xEF\xBB\xBFa;\n", &.{
    391         .identifier,
    392         .semicolon,
    393     });
    394 }
    395 
    396 test "correctly parse pointer assignment" {
    397     try testTokenize("b.*=3;\n", &.{
    398         .identifier,
    399         .period_asterisk,
    400         .equal,
    401         .number_literal,
    402         .semicolon,
    403     });
    404 }
    405 
    406 test "correctly parse pointer dereference followed by asterisk" {
    407     try testTokenize("\"b\".* ** 10", &.{
    408         .string_literal,
    409         .period_asterisk,
    410         .asterisk_asterisk,
    411         .number_literal,
    412     });
    413 
    414     try testTokenize("(\"b\".*)** 10", &.{
    415         .l_paren,
    416         .string_literal,
    417         .period_asterisk,
    418         .r_paren,
    419         .asterisk_asterisk,
    420         .number_literal,
    421     });
    422 
    423     try testTokenize("\"b\".*** 10", &.{
    424         .string_literal,
    425         .invalid_periodasterisks,
    426         .asterisk_asterisk,
    427         .number_literal,
    428     });
    429 }
    430 
    431 test "range literals" {
    432     try testTokenize("0...9", &.{ .number_literal, .ellipsis3, .number_literal });
    433     try testTokenize("'0'...'9'", &.{ .char_literal, .ellipsis3, .char_literal });
    434     try testTokenize("0x00...0x09", &.{ .number_literal, .ellipsis3, .number_literal });
    435     try testTokenize("0b00...0b11", &.{ .number_literal, .ellipsis3, .number_literal });
    436     try testTokenize("0o00...0o11", &.{ .number_literal, .ellipsis3, .number_literal });
    437 }
    438 
    439 test "number literals decimal" {
    440     try testTokenize("0", &.{.number_literal});
    441     try testTokenize("1", &.{.number_literal});
    442     try testTokenize("2", &.{.number_literal});
    443     try testTokenize("3", &.{.number_literal});
    444     try testTokenize("4", &.{.number_literal});
    445     try testTokenize("5", &.{.number_literal});
    446     try testTokenize("6", &.{.number_literal});
    447     try testTokenize("7", &.{.number_literal});
    448     try testTokenize("8", &.{.number_literal});
    449     try testTokenize("9", &.{.number_literal});
    450     try testTokenize("1..", &.{ .number_literal, .ellipsis2 });
    451     try testTokenize("0a", &.{.number_literal});
    452     try testTokenize("9b", &.{.number_literal});
    453     try testTokenize("1z", &.{.number_literal});
    454     try testTokenize("1z_1", &.{.number_literal});
    455     try testTokenize("9z3", &.{.number_literal});
    456 
    457     try testTokenize("0_0", &.{.number_literal});
    458     try testTokenize("0001", &.{.number_literal});
    459     try testTokenize("01234567890", &.{.number_literal});
    460     try testTokenize("012_345_6789_0", &.{.number_literal});
    461     try testTokenize("0_1_2_3_4_5_6_7_8_9_0", &.{.number_literal});
    462 
    463     try testTokenize("00_", &.{.number_literal});
    464     try testTokenize("0_0_", &.{.number_literal});
    465     try testTokenize("0__0", &.{.number_literal});
    466     try testTokenize("0_0f", &.{.number_literal});
    467     try testTokenize("0_0_f", &.{.number_literal});
    468     try testTokenize("0_0_f_00", &.{.number_literal});
    469     try testTokenize("1_,", &.{ .number_literal, .comma });
    470 
    471     try testTokenize("0.0", &.{.number_literal});
    472     try testTokenize("1.0", &.{.number_literal});
    473     try testTokenize("10.0", &.{.number_literal});
    474     try testTokenize("0e0", &.{.number_literal});
    475     try testTokenize("1e0", &.{.number_literal});
    476     try testTokenize("1e100", &.{.number_literal});
    477     try testTokenize("1.0e100", &.{.number_literal});
    478     try testTokenize("1.0e+100", &.{.number_literal});
    479     try testTokenize("1.0e-100", &.{.number_literal});
    480     try testTokenize("1_0_0_0.0_0_0_0_0_1e1_0_0_0", &.{.number_literal});
    481 
    482     try testTokenize("1.", &.{ .number_literal, .period });
    483     try testTokenize("1e", &.{.number_literal});
    484     try testTokenize("1.e100", &.{.number_literal});
    485     try testTokenize("1.0e1f0", &.{.number_literal});
    486     try testTokenize("1.0p100", &.{.number_literal});
    487     try testTokenize("1.0p-100", &.{.number_literal});
    488     try testTokenize("1.0p1f0", &.{.number_literal});
    489     try testTokenize("1.0_,", &.{ .number_literal, .comma });
    490     try testTokenize("1_.0", &.{.number_literal});
    491     try testTokenize("1._", &.{.number_literal});
    492     try testTokenize("1.a", &.{.number_literal});
    493     try testTokenize("1.z", &.{.number_literal});
    494     try testTokenize("1._0", &.{.number_literal});
    495     try testTokenize("1.+", &.{ .number_literal, .period, .plus });
    496     try testTokenize("1._+", &.{ .number_literal, .plus });
    497     try testTokenize("1._e", &.{.number_literal});
    498     try testTokenize("1.0e", &.{.number_literal});
    499     try testTokenize("1.0e,", &.{ .number_literal, .comma });
    500     try testTokenize("1.0e_", &.{.number_literal});
    501     try testTokenize("1.0e+_", &.{.number_literal});
    502     try testTokenize("1.0e-_", &.{.number_literal});
    503     try testTokenize("1.0e0_+", &.{ .number_literal, .plus });
    504 }
    505 
    506 test "number literals binary" {
    507     try testTokenize("0b0", &.{.number_literal});
    508     try testTokenize("0b1", &.{.number_literal});
    509     try testTokenize("0b2", &.{.number_literal});
    510     try testTokenize("0b3", &.{.number_literal});
    511     try testTokenize("0b4", &.{.number_literal});
    512     try testTokenize("0b5", &.{.number_literal});
    513     try testTokenize("0b6", &.{.number_literal});
    514     try testTokenize("0b7", &.{.number_literal});
    515     try testTokenize("0b8", &.{.number_literal});
    516     try testTokenize("0b9", &.{.number_literal});
    517     try testTokenize("0ba", &.{.number_literal});
    518     try testTokenize("0bb", &.{.number_literal});
    519     try testTokenize("0bc", &.{.number_literal});
    520     try testTokenize("0bd", &.{.number_literal});
    521     try testTokenize("0be", &.{.number_literal});
    522     try testTokenize("0bf", &.{.number_literal});
    523     try testTokenize("0bz", &.{.number_literal});
    524 
    525     try testTokenize("0b0000_0000", &.{.number_literal});
    526     try testTokenize("0b1111_1111", &.{.number_literal});
    527     try testTokenize("0b10_10_10_10", &.{.number_literal});
    528     try testTokenize("0b0_1_0_1_0_1_0_1", &.{.number_literal});
    529     try testTokenize("0b1.", &.{ .number_literal, .period });
    530     try testTokenize("0b1.0", &.{.number_literal});
    531 
    532     try testTokenize("0B0", &.{.number_literal});
    533     try testTokenize("0b_", &.{.number_literal});
    534     try testTokenize("0b_0", &.{.number_literal});
    535     try testTokenize("0b1_", &.{.number_literal});
    536     try testTokenize("0b0__1", &.{.number_literal});
    537     try testTokenize("0b0_1_", &.{.number_literal});
    538     try testTokenize("0b1e", &.{.number_literal});
    539     try testTokenize("0b1p", &.{.number_literal});
    540     try testTokenize("0b1e0", &.{.number_literal});
    541     try testTokenize("0b1p0", &.{.number_literal});
    542     try testTokenize("0b1_,", &.{ .number_literal, .comma });
    543 }
    544 
    545 test "number literals octal" {
    546     try testTokenize("0o0", &.{.number_literal});
    547     try testTokenize("0o1", &.{.number_literal});
    548     try testTokenize("0o2", &.{.number_literal});
    549     try testTokenize("0o3", &.{.number_literal});
    550     try testTokenize("0o4", &.{.number_literal});
    551     try testTokenize("0o5", &.{.number_literal});
    552     try testTokenize("0o6", &.{.number_literal});
    553     try testTokenize("0o7", &.{.number_literal});
    554     try testTokenize("0o8", &.{.number_literal});
    555     try testTokenize("0o9", &.{.number_literal});
    556     try testTokenize("0oa", &.{.number_literal});
    557     try testTokenize("0ob", &.{.number_literal});
    558     try testTokenize("0oc", &.{.number_literal});
    559     try testTokenize("0od", &.{.number_literal});
    560     try testTokenize("0oe", &.{.number_literal});
    561     try testTokenize("0of", &.{.number_literal});
    562     try testTokenize("0oz", &.{.number_literal});
    563 
    564     try testTokenize("0o01234567", &.{.number_literal});
    565     try testTokenize("0o0123_4567", &.{.number_literal});
    566     try testTokenize("0o01_23_45_67", &.{.number_literal});
    567     try testTokenize("0o0_1_2_3_4_5_6_7", &.{.number_literal});
    568     try testTokenize("0o7.", &.{ .number_literal, .period });
    569     try testTokenize("0o7.0", &.{.number_literal});
    570 
    571     try testTokenize("0O0", &.{.number_literal});
    572     try testTokenize("0o_", &.{.number_literal});
    573     try testTokenize("0o_0", &.{.number_literal});
    574     try testTokenize("0o1_", &.{.number_literal});
    575     try testTokenize("0o0__1", &.{.number_literal});
    576     try testTokenize("0o0_1_", &.{.number_literal});
    577     try testTokenize("0o1e", &.{.number_literal});
    578     try testTokenize("0o1p", &.{.number_literal});
    579     try testTokenize("0o1e0", &.{.number_literal});
    580     try testTokenize("0o1p0", &.{.number_literal});
    581     try testTokenize("0o_,", &.{ .number_literal, .comma });
    582 }
    583 
    584 test "number literals hexadecimal" {
    585     try testTokenize("0x0", &.{.number_literal});
    586     try testTokenize("0x1", &.{.number_literal});
    587     try testTokenize("0x2", &.{.number_literal});
    588     try testTokenize("0x3", &.{.number_literal});
    589     try testTokenize("0x4", &.{.number_literal});
    590     try testTokenize("0x5", &.{.number_literal});
    591     try testTokenize("0x6", &.{.number_literal});
    592     try testTokenize("0x7", &.{.number_literal});
    593     try testTokenize("0x8", &.{.number_literal});
    594     try testTokenize("0x9", &.{.number_literal});
    595     try testTokenize("0xa", &.{.number_literal});
    596     try testTokenize("0xb", &.{.number_literal});
    597     try testTokenize("0xc", &.{.number_literal});
    598     try testTokenize("0xd", &.{.number_literal});
    599     try testTokenize("0xe", &.{.number_literal});
    600     try testTokenize("0xf", &.{.number_literal});
    601     try testTokenize("0xA", &.{.number_literal});
    602     try testTokenize("0xB", &.{.number_literal});
    603     try testTokenize("0xC", &.{.number_literal});
    604     try testTokenize("0xD", &.{.number_literal});
    605     try testTokenize("0xE", &.{.number_literal});
    606     try testTokenize("0xF", &.{.number_literal});
    607     try testTokenize("0x0z", &.{.number_literal});
    608     try testTokenize("0xz", &.{.number_literal});
    609 
    610     try testTokenize("0x0123456789ABCDEF", &.{.number_literal});
    611     try testTokenize("0x0123_4567_89AB_CDEF", &.{.number_literal});
    612     try testTokenize("0x01_23_45_67_89AB_CDE_F", &.{.number_literal});
    613     try testTokenize("0x0_1_2_3_4_5_6_7_8_9_A_B_C_D_E_F", &.{.number_literal});
    614 
    615     try testTokenize("0X0", &.{.number_literal});
    616     try testTokenize("0x_", &.{.number_literal});
    617     try testTokenize("0x_1", &.{.number_literal});
    618     try testTokenize("0x1_", &.{.number_literal});
    619     try testTokenize("0x0__1", &.{.number_literal});
    620     try testTokenize("0x0_1_", &.{.number_literal});
    621     try testTokenize("0x_,", &.{ .number_literal, .comma });
    622 
    623     try testTokenize("0x1.0", &.{.number_literal});
    624     try testTokenize("0xF.0", &.{.number_literal});
    625     try testTokenize("0xF.F", &.{.number_literal});
    626     try testTokenize("0xF.Fp0", &.{.number_literal});
    627     try testTokenize("0xF.FP0", &.{.number_literal});
    628     try testTokenize("0x1p0", &.{.number_literal});
    629     try testTokenize("0xfp0", &.{.number_literal});
    630     try testTokenize("0x1.0+0xF.0", &.{ .number_literal, .plus, .number_literal });
    631 
    632     try testTokenize("0x1.", &.{ .number_literal, .period });
    633     try testTokenize("0xF.", &.{ .number_literal, .period });
    634     try testTokenize("0x1.+0xF.", &.{ .number_literal, .period, .plus, .number_literal, .period });
    635     try testTokenize("0xff.p10", &.{.number_literal});
    636 
    637     try testTokenize("0x0123456.789ABCDEF", &.{.number_literal});
    638     try testTokenize("0x0_123_456.789_ABC_DEF", &.{.number_literal});
    639     try testTokenize("0x0_1_2_3_4_5_6.7_8_9_A_B_C_D_E_F", &.{.number_literal});
    640     try testTokenize("0x0p0", &.{.number_literal});
    641     try testTokenize("0x0.0p0", &.{.number_literal});
    642     try testTokenize("0xff.ffp10", &.{.number_literal});
    643     try testTokenize("0xff.ffP10", &.{.number_literal});
    644     try testTokenize("0xffp10", &.{.number_literal});
    645     try testTokenize("0xff_ff.ff_ffp1_0_0_0", &.{.number_literal});
    646     try testTokenize("0xf_f_f_f.f_f_f_fp+1_000", &.{.number_literal});
    647     try testTokenize("0xf_f_f_f.f_f_f_fp-1_00_0", &.{.number_literal});
    648 
    649     try testTokenize("0x1e", &.{.number_literal});
    650     try testTokenize("0x1e0", &.{.number_literal});
    651     try testTokenize("0x1p", &.{.number_literal});
    652     try testTokenize("0xfp0z1", &.{.number_literal});
    653     try testTokenize("0xff.ffpff", &.{.number_literal});
    654     try testTokenize("0x0.p", &.{.number_literal});
    655     try testTokenize("0x0.z", &.{.number_literal});
    656     try testTokenize("0x0._", &.{.number_literal});
    657     try testTokenize("0x0_.0", &.{.number_literal});
    658     try testTokenize("0x0_.0.0", &.{ .number_literal, .period, .number_literal });
    659     try testTokenize("0x0._0", &.{.number_literal});
    660     try testTokenize("0x0.0_", &.{.number_literal});
    661     try testTokenize("0x0_p0", &.{.number_literal});
    662     try testTokenize("0x0_.p0", &.{.number_literal});
    663     try testTokenize("0x0._p0", &.{.number_literal});
    664     try testTokenize("0x0.0_p0", &.{.number_literal});
    665     try testTokenize("0x0._0p0", &.{.number_literal});
    666     try testTokenize("0x0.0p_0", &.{.number_literal});
    667     try testTokenize("0x0.0p+_0", &.{.number_literal});
    668     try testTokenize("0x0.0p-_0", &.{.number_literal});
    669     try testTokenize("0x0.0p0_", &.{.number_literal});
    670 }
    671 
    672 test "multi line string literal with only 1 backslash" {
    673     try testTokenize("x \\\n;", &.{ .identifier, .invalid, .semicolon });
    674 }
    675 
    676 test "invalid builtin identifiers" {
    677     try testTokenize("@()", &.{.invalid});
    678     try testTokenize("@0()", &.{.invalid});
    679 }
    680 
    681 test "invalid token with unfinished escape right before eof" {
    682     try testTokenize("\"\\", &.{.invalid});
    683     try testTokenize("'\\", &.{.invalid});
    684     try testTokenize("'\\u", &.{.invalid});
    685 }
    686 
    687 test "saturating operators" {
    688     try testTokenize("<<", &.{.angle_bracket_angle_bracket_left});
    689     try testTokenize("<<|", &.{.angle_bracket_angle_bracket_left_pipe});
    690     try testTokenize("<<|=", &.{.angle_bracket_angle_bracket_left_pipe_equal});
    691 
    692     try testTokenize("*", &.{.asterisk});
    693     try testTokenize("*|", &.{.asterisk_pipe});
    694     try testTokenize("*|=", &.{.asterisk_pipe_equal});
    695 
    696     try testTokenize("+", &.{.plus});
    697     try testTokenize("+|", &.{.plus_pipe});
    698     try testTokenize("+|=", &.{.plus_pipe_equal});
    699 
    700     try testTokenize("-", &.{.minus});
    701     try testTokenize("-|", &.{.minus_pipe});
    702     try testTokenize("-|=", &.{.minus_pipe_equal});
    703 }
    704 
    705 test "null byte before eof" {
    706     try testTokenize("123 \x00 456", &.{ .number_literal, .invalid });
    707     try testTokenize("//\x00", &.{.invalid});
    708     try testTokenize("\\\\\x00", &.{.invalid});
    709     try testTokenize("\x00", &.{.invalid});
    710     try testTokenize("// NUL\x00\n", &.{.invalid});
    711     try testTokenize("///\x00\n", &.{ .doc_comment, .invalid });
    712     try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid });
    713 }
    714 
    715 test "invalid tabs and carriage returns" {
    716     // "Inside Line Comments and Documentation Comments, Any TAB is rejected by
    717     // the grammar since it is ambiguous how it should be rendered."
    718     // https://github.com/ziglang/zig-spec/issues/38
    719     try testTokenize("//\t", &.{.invalid});
    720     try testTokenize("// \t", &.{.invalid});
    721     try testTokenize("///\t", &.{.invalid});
    722     try testTokenize("/// \t", &.{.invalid});
    723     try testTokenize("//!\t", &.{.invalid});
    724     try testTokenize("//! \t", &.{.invalid});
    725 
    726     // "Inside Line Comments and Documentation Comments, CR directly preceding
    727     // NL is unambiguously part of the newline sequence. It is accepted by the
    728     // grammar and removed by zig fmt, leaving only NL. CR anywhere else is
    729     // rejected by the grammar."
    730     // https://github.com/ziglang/zig-spec/issues/38
    731     try testTokenize("//\r", &.{.invalid});
    732     try testTokenize("// \r", &.{.invalid});
    733     try testTokenize("///\r", &.{.invalid});
    734     try testTokenize("/// \r", &.{.invalid});
    735     try testTokenize("//\r ", &.{.invalid});
    736     try testTokenize("// \r ", &.{.invalid});
    737     try testTokenize("///\r ", &.{.invalid});
    738     try testTokenize("/// \r ", &.{.invalid});
    739     try testTokenize("//\r\n", &.{});
    740     try testTokenize("// \r\n", &.{});
    741     try testTokenize("///\r\n", &.{.doc_comment});
    742     try testTokenize("/// \r\n", &.{.doc_comment});
    743     try testTokenize("//!\r", &.{.invalid});
    744     try testTokenize("//! \r", &.{.invalid});
    745     try testTokenize("//!\r ", &.{.invalid});
    746     try testTokenize("//! \r ", &.{.invalid});
    747     try testTokenize("//!\r\n", &.{.container_doc_comment});
    748     try testTokenize("//! \r\n", &.{.container_doc_comment});
    749 
    750     // The control characters TAB and CR are rejected by the grammar inside multi-line string literals,
    751     // except if CR is directly before NL.
    752     // https://github.com/ziglang/zig-spec/issues/38
    753     try testTokenize("\\\\\r", &.{.invalid});
    754     try testTokenize("\\\\\r ", &.{.invalid});
    755     try testTokenize("\\\\ \r", &.{.invalid});
    756     try testTokenize("\\\\\t", &.{.invalid});
    757     try testTokenize("\\\\\t ", &.{.invalid});
    758     try testTokenize("\\\\ \t", &.{.invalid});
    759     try testTokenize("\\\\\r\n", &.{.multiline_string_literal_line});
    760 
    761     // "TAB used as whitespace is...accepted by the grammar. CR used as
    762     // whitespace, whether directly preceding NL or stray, is...accepted by the
    763     // grammar."
    764     // https://github.com/ziglang/zig-spec/issues/38
    765     try testTokenize("\tpub\tswitch\t", &.{ .keyword_pub, .keyword_switch });
    766     try testTokenize("\rpub\rswitch\r", &.{ .keyword_pub, .keyword_switch });
    767 }