tokenizer_test.zig (29725B) - Raw
//! Differential tests for the C tokenizer declared in `tokenizer.h`.
//!
//! Every test source is tokenized twice: once by the C implementation
//! (reached through `@cImport`) and once by `std.zig.Tokenizer`. Both must
//! produce the same expected sequence of token tags, followed by `eof`.

const std = @import("std");
const testing = std.testing;

const Token = std.zig.Token;
const Tokenizer = std.zig.Tokenizer;

const c = @cImport({
    @cInclude("tokenizer.h");
});

/// Translates a tag produced by the C tokenizer (`c.TOKEN_*`) into the
/// corresponding `std.zig.Token.Tag`.
///
/// Panics with the raw tag value when `token` has no mapping. Returning
/// `undefined` here would feed an undefined enum value into `expectEqual`,
/// turning a missing mapping into a confusing failure instead of a clear one.
pub fn zigToken(token: c_uint) Token.Tag {
    return switch (token) {
        c.TOKEN_INVALID => .invalid,
        c.TOKEN_INVALID_PERIODASTERISKS => .invalid_periodasterisks,
        c.TOKEN_IDENTIFIER => .identifier,
        c.TOKEN_STRING_LITERAL => .string_literal,
        c.TOKEN_MULTILINE_STRING_LITERAL_LINE => .multiline_string_literal_line,
        c.TOKEN_CHAR_LITERAL => .char_literal,
        c.TOKEN_EOF => .eof,
        c.TOKEN_BUILTIN => .builtin,
        c.TOKEN_BANG => .bang,
        c.TOKEN_PIPE => .pipe,
        c.TOKEN_PIPE_PIPE => .pipe_pipe,
        c.TOKEN_PIPE_EQUAL => .pipe_equal,
        c.TOKEN_EQUAL => .equal,
        c.TOKEN_EQUAL_EQUAL => .equal_equal,
        c.TOKEN_EQUAL_ANGLE_BRACKET_RIGHT => .equal_angle_bracket_right,
        c.TOKEN_BANG_EQUAL => .bang_equal,
        c.TOKEN_L_PAREN => .l_paren,
        c.TOKEN_R_PAREN => .r_paren,
        c.TOKEN_SEMICOLON => .semicolon,
        c.TOKEN_PERCENT => .percent,
        c.TOKEN_PERCENT_EQUAL => .percent_equal,
        c.TOKEN_L_BRACE => .l_brace,
        c.TOKEN_R_BRACE => .r_brace,
        c.TOKEN_L_BRACKET => .l_bracket,
        c.TOKEN_R_BRACKET => .r_bracket,
        c.TOKEN_PERIOD => .period,
        c.TOKEN_PERIOD_ASTERISK => .period_asterisk,
        c.TOKEN_ELLIPSIS2 => .ellipsis2,
        c.TOKEN_ELLIPSIS3 => .ellipsis3,
        c.TOKEN_CARET => .caret,
        c.TOKEN_CARET_EQUAL => .caret_equal,
        c.TOKEN_PLUS => .plus,
        c.TOKEN_PLUS_PLUS => .plus_plus,
        c.TOKEN_PLUS_EQUAL => .plus_equal,
        c.TOKEN_PLUS_PERCENT => .plus_percent,
        c.TOKEN_PLUS_PERCENT_EQUAL => .plus_percent_equal,
        c.TOKEN_PLUS_PIPE => .plus_pipe,
        c.TOKEN_PLUS_PIPE_EQUAL => .plus_pipe_equal,
        c.TOKEN_MINUS => .minus,
        c.TOKEN_MINUS_EQUAL => .minus_equal,
        c.TOKEN_MINUS_PERCENT => .minus_percent,
        c.TOKEN_MINUS_PERCENT_EQUAL => .minus_percent_equal,
        c.TOKEN_MINUS_PIPE => .minus_pipe,
        c.TOKEN_MINUS_PIPE_EQUAL => .minus_pipe_equal,
        c.TOKEN_ASTERISK => .asterisk,
        c.TOKEN_ASTERISK_EQUAL => .asterisk_equal,
        c.TOKEN_ASTERISK_ASTERISK => .asterisk_asterisk,
        c.TOKEN_ASTERISK_PERCENT => .asterisk_percent,
        c.TOKEN_ASTERISK_PERCENT_EQUAL => .asterisk_percent_equal,
        c.TOKEN_ASTERISK_PIPE => .asterisk_pipe,
        c.TOKEN_ASTERISK_PIPE_EQUAL => .asterisk_pipe_equal,
        c.TOKEN_ARROW => .arrow,
        c.TOKEN_COLON => .colon,
        c.TOKEN_SLASH => .slash,
        c.TOKEN_SLASH_EQUAL => .slash_equal,
        c.TOKEN_COMMA => .comma,
        c.TOKEN_AMPERSAND => .ampersand,
        c.TOKEN_AMPERSAND_EQUAL => .ampersand_equal,
        c.TOKEN_QUESTION_MARK => .question_mark,
        c.TOKEN_ANGLE_BRACKET_LEFT => .angle_bracket_left,
        c.TOKEN_ANGLE_BRACKET_LEFT_EQUAL => .angle_bracket_left_equal,
        c.TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT => .angle_bracket_angle_bracket_left,
        c.TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL => .angle_bracket_angle_bracket_left_equal,
        c.TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE => .angle_bracket_angle_bracket_left_pipe,
        c.TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL => .angle_bracket_angle_bracket_left_pipe_equal,
        c.TOKEN_ANGLE_BRACKET_RIGHT => .angle_bracket_right,
        c.TOKEN_ANGLE_BRACKET_RIGHT_EQUAL => .angle_bracket_right_equal,
        c.TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT => .angle_bracket_angle_bracket_right,
        c.TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL => .angle_bracket_angle_bracket_right_equal,
        c.TOKEN_TILDE => .tilde,
        c.TOKEN_NUMBER_LITERAL => .number_literal,
        c.TOKEN_DOC_COMMENT => .doc_comment,
        c.TOKEN_CONTAINER_DOC_COMMENT => .container_doc_comment,
        c.TOKEN_KEYWORD_ADDRSPACE => .keyword_addrspace,
        c.TOKEN_KEYWORD_ALIGN => .keyword_align,
        c.TOKEN_KEYWORD_ALLOWZERO => .keyword_allowzero,
        c.TOKEN_KEYWORD_AND => .keyword_and,
        c.TOKEN_KEYWORD_ANYFRAME => .keyword_anyframe,
        c.TOKEN_KEYWORD_ANYTYPE => .keyword_anytype,
        c.TOKEN_KEYWORD_ASM => .keyword_asm,
        c.TOKEN_KEYWORD_BREAK => .keyword_break,
        c.TOKEN_KEYWORD_CALLCONV => .keyword_callconv,
        c.TOKEN_KEYWORD_CATCH => .keyword_catch,
        c.TOKEN_KEYWORD_COMPTIME => .keyword_comptime,
        c.TOKEN_KEYWORD_CONST => .keyword_const,
        c.TOKEN_KEYWORD_CONTINUE => .keyword_continue,
        c.TOKEN_KEYWORD_DEFER => .keyword_defer,
        c.TOKEN_KEYWORD_ELSE => .keyword_else,
        c.TOKEN_KEYWORD_ENUM => .keyword_enum,
        c.TOKEN_KEYWORD_ERRDEFER => .keyword_errdefer,
        c.TOKEN_KEYWORD_ERROR => .keyword_error,
        c.TOKEN_KEYWORD_EXPORT => .keyword_export,
        c.TOKEN_KEYWORD_EXTERN => .keyword_extern,
        c.TOKEN_KEYWORD_FN => .keyword_fn,
        c.TOKEN_KEYWORD_FOR => .keyword_for,
        c.TOKEN_KEYWORD_IF => .keyword_if,
        c.TOKEN_KEYWORD_INLINE => .keyword_inline,
        c.TOKEN_KEYWORD_NOALIAS => .keyword_noalias,
        c.TOKEN_KEYWORD_NOINLINE => .keyword_noinline,
        c.TOKEN_KEYWORD_NOSUSPEND => .keyword_nosuspend,
        c.TOKEN_KEYWORD_OPAQUE => .keyword_opaque,
        c.TOKEN_KEYWORD_OR => .keyword_or,
        c.TOKEN_KEYWORD_ORELSE => .keyword_orelse,
        c.TOKEN_KEYWORD_PACKED => .keyword_packed,
        c.TOKEN_KEYWORD_PUB => .keyword_pub,
        c.TOKEN_KEYWORD_RESUME => .keyword_resume,
        c.TOKEN_KEYWORD_RETURN => .keyword_return,
        c.TOKEN_KEYWORD_LINKSECTION => .keyword_linksection,
        c.TOKEN_KEYWORD_STRUCT => .keyword_struct,
        c.TOKEN_KEYWORD_SUSPEND => .keyword_suspend,
        c.TOKEN_KEYWORD_SWITCH => .keyword_switch,
        c.TOKEN_KEYWORD_TEST => .keyword_test,
        c.TOKEN_KEYWORD_THREADLOCAL => .keyword_threadlocal,
        c.TOKEN_KEYWORD_TRY => .keyword_try,
        c.TOKEN_KEYWORD_UNION => .keyword_union,
        c.TOKEN_KEYWORD_UNREACHABLE => .keyword_unreachable,
        c.TOKEN_KEYWORD_VAR => .keyword_var,
        c.TOKEN_KEYWORD_VOLATILE => .keyword_volatile,
        c.TOKEN_KEYWORD_WHILE => .keyword_while,
        // Fail loudly on a tag with no mapping rather than handing an
        // undefined value to the comparison in the caller.
        else => std.debug.panic("zigToken: unmapped C token tag {d}", .{token}),
    };
}

/// Tokenizes `source` with the C tokenizer and then with `std.zig.Tokenizer`,
/// checking that each yields exactly `expected_token_tags` followed by `eof`.
///
/// For the Zig tokenizer the trailing `eof` token must additionally start and
/// end at `source.len`.
///
/// Copy-pasted from lib/std/zig/tokenizer.zig, with the C pass added.
fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    // Do the C thing
    {
        var ctokenizer = c.tokenizerInit(source.ptr, @intCast(source.len));
        for (expected_token_tags) |expected_token_tag| {
            const token = c.tokenizerNext(&ctokenizer);
            try testing.expectEqual(expected_token_tag, zigToken(token.tag));
        }
        const last_token = c.tokenizerNext(&ctokenizer);
        try testing.expectEqual(Token.Tag.eof, zigToken(last_token.tag));
    }

    {
        var tokenizer = Tokenizer.init(source);
        for (expected_token_tags) |expected_token_tag| {
            const token = tokenizer.next();
            try testing.expectEqual(expected_token_tag, token.tag);
        }
        // Last token should always be eof, even when the last token was invalid,
        // in which case the tokenizer is in an invalid state, which can only be
        // recovered by opinionated means outside the scope of this implementation.
        const last_token = tokenizer.next();
        try testing.expectEqual(Token.Tag.eof, last_token.tag);
        try testing.expectEqual(source.len, last_token.loc.start);
        try testing.expectEqual(source.len, last_token.loc.end);
    }
}

test "keywords" {
    try testTokenize("test const else", &.{ .keyword_test, .keyword_const, .keyword_else });
}

test "line comment followed by top-level comptime" {
    try testTokenize(
        \\// line comment
        \\comptime {}
        \\
    , &.{
        .keyword_comptime,
        .l_brace,
        .r_brace,
    });
}

test "unknown length pointer and then c pointer" {
    try testTokenize(
        \\[*]u8
        \\[*c]u8
    , &.{
        .l_bracket,
        .asterisk,
        .r_bracket,
        .identifier,
        .l_bracket,
        .asterisk,
        .identifier,
        .r_bracket,
        .identifier,
    });
}

test "code point literal with hex escape" {
    try testTokenize(
        \\'\x1b'
    , &.{.char_literal});
    try testTokenize(
        \\'\x1'
    , &.{.char_literal});
}

test "newline in char literal" {
    try testTokenize(
        \\'
        \\'
    , &.{ .invalid, .invalid });
}

test "newline in string literal" {
    try testTokenize(
        \\"
        \\"
    , &.{ .invalid, .invalid });
}

test "code point literal with unicode escapes" {
    // Valid unicode escapes
    try testTokenize(
        \\'\u{3}'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{01}'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{2a}'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{3f9}'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{6E09aBc1523}'
    , &.{.char_literal});
    try testTokenize(
        \\"\u{440}"
    , &.{.string_literal});

    // Invalid unicode escapes
    try testTokenize(
        \\'\u'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{{'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{}'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{s}'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{2z}'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{4a'
    , &.{.char_literal});

    // Test old-style unicode literals
    try testTokenize(
        \\'\u0333'
    , &.{.char_literal});
    try testTokenize(
        \\'\U0333'
    , &.{.char_literal});
}

test "code point literal with unicode code point" {
    try testTokenize(
        \\'💩'
    , &.{.char_literal});
}

test "float literal e exponent" {
    try testTokenize("a = 4.94065645841246544177e-324;\n", &.{
        .identifier,
        .equal,
        .number_literal,
        .semicolon,
    });
}

test "float literal p exponent" {
    try testTokenize("a = 0x1.a827999fcef32p+1022;\n", &.{
        .identifier,
        .equal,
        .number_literal,
        .semicolon,
    });
}

test "chars" {
    try testTokenize("'c'", &.{.char_literal});
}

test "invalid token characters" {
    try testTokenize("#", &.{.invalid});
    try testTokenize("`", &.{.invalid});
    try testTokenize("'c", &.{.invalid});
    try testTokenize("'", &.{.invalid});
    try testTokenize("''", &.{.char_literal});
    try testTokenize("'\n'", &.{ .invalid, .invalid });
}

test "invalid literal/comment characters" {
    try testTokenize("\"\x00\"", &.{.invalid});
    try testTokenize("`\x00`", &.{.invalid});
    try testTokenize("//\x00", &.{.invalid});
    try testTokenize("//\x1f", &.{.invalid});
    try testTokenize("//\x7f", &.{.invalid});
}

test "utf8" {
    try testTokenize("//\xc2\x80", &.{});
    try testTokenize("//\xf4\x8f\xbf\xbf", &.{});
}

test "invalid utf8" {
    try testTokenize("//\x80", &.{});
    try testTokenize("//\xbf", &.{});
    try testTokenize("//\xf8", &.{});
    try testTokenize("//\xff", &.{});
    try testTokenize("//\xc2\xc0", &.{});
    try testTokenize("//\xe0", &.{});
    try testTokenize("//\xf0", &.{});
    try testTokenize("//\xf0\x90\x80\xc0", &.{});
}

test "illegal unicode codepoints" {
    // Unicode newline characters: U+0085, U+2028, U+2029
    try testTokenize("//\xc2\x84", &.{});
    try testTokenize("//\xc2\x85", &.{});
    try testTokenize("//\xc2\x86", &.{});
    try testTokenize("//\xe2\x80\xa7", &.{});
    try testTokenize("//\xe2\x80\xa8", &.{});
    try testTokenize("//\xe2\x80\xa9", &.{});
    try testTokenize("//\xe2\x80\xaa", &.{});
}

test "string identifier and builtin fns" {
    try testTokenize(
        \\const @"if" = @import("std");
    , &.{
        .keyword_const,
        .identifier,
        .equal,
        .builtin,
        .l_paren,
        .string_literal,
        .r_paren,
        .semicolon,
    });
}

test "pipe and then invalid" {
    try testTokenize("||=", &.{
        .pipe_pipe,
        .equal,
    });
}

test "line comment and doc comment" {
    try testTokenize("//", &.{});
    try testTokenize("// a / b", &.{});
    try testTokenize("// /", &.{});
    try testTokenize("/// a", &.{.doc_comment});
    try testTokenize("///", &.{.doc_comment});
    try testTokenize("////", &.{});
    try testTokenize("//!", &.{.container_doc_comment});
    try testTokenize("//!!", &.{.container_doc_comment});
}

test "line comment followed by identifier" {
    try testTokenize(
        \\ Unexpected,
        \\ // another
        \\ Another,
    , &.{
        .identifier,
        .comma,
        .identifier,
        .comma,
    });
}

test "UTF-8 BOM is recognized and skipped" {
    try testTokenize("\xEF\xBB\xBFa;\n", &.{
        .identifier,
        .semicolon,
    });
}

test "correctly parse pointer assignment" {
    try testTokenize("b.*=3;\n", &.{
        .identifier,
        .period_asterisk,
        .equal,
        .number_literal,
        .semicolon,
    });
}

test "correctly parse pointer dereference followed by asterisk" {
    try testTokenize("\"b\".* ** 10", &.{
        .string_literal,
        .period_asterisk,
        .asterisk_asterisk,
        .number_literal,
    });

    try testTokenize("(\"b\".*)** 10", &.{
        .l_paren,
        .string_literal,
        .period_asterisk,
        .r_paren,
        .asterisk_asterisk,
        .number_literal,
    });

    try testTokenize("\"b\".*** 10", &.{
        .string_literal,
        .invalid_periodasterisks,
        .asterisk_asterisk,
        .number_literal,
    });
}

test "range literals" {
    try testTokenize("0...9", &.{ .number_literal, .ellipsis3, .number_literal });
    try testTokenize("'0'...'9'", &.{ .char_literal, .ellipsis3, .char_literal });
    try testTokenize("0x00...0x09", &.{ .number_literal, .ellipsis3, .number_literal });
    try testTokenize("0b00...0b11", &.{ .number_literal, .ellipsis3, .number_literal });
    try testTokenize("0o00...0o11", &.{ .number_literal, .ellipsis3, .number_literal });
}

test "number literals decimal" {
    try testTokenize("0", &.{.number_literal});
    try testTokenize("1", &.{.number_literal});
    try testTokenize("2", &.{.number_literal});
    try testTokenize("3", &.{.number_literal});
    try testTokenize("4", &.{.number_literal});
    try testTokenize("5", &.{.number_literal});
    try testTokenize("6", &.{.number_literal});
    try testTokenize("7", &.{.number_literal});
    try testTokenize("8", &.{.number_literal});
    try testTokenize("9", &.{.number_literal});
    try testTokenize("1..", &.{ .number_literal, .ellipsis2 });
    try testTokenize("0a", &.{.number_literal});
    try testTokenize("9b", &.{.number_literal});
    try testTokenize("1z", &.{.number_literal});
    try testTokenize("1z_1", &.{.number_literal});
    try testTokenize("9z3", &.{.number_literal});

    try testTokenize("0_0", &.{.number_literal});
    try testTokenize("0001", &.{.number_literal});
    try testTokenize("01234567890", &.{.number_literal});
    try testTokenize("012_345_6789_0", &.{.number_literal});
    try testTokenize("0_1_2_3_4_5_6_7_8_9_0", &.{.number_literal});

    try testTokenize("00_", &.{.number_literal});
    try testTokenize("0_0_", &.{.number_literal});
    try testTokenize("0__0", &.{.number_literal});
    try testTokenize("0_0f", &.{.number_literal});
    try testTokenize("0_0_f", &.{.number_literal});
    try testTokenize("0_0_f_00", &.{.number_literal});
    try testTokenize("1_,", &.{ .number_literal, .comma });

    try testTokenize("0.0", &.{.number_literal});
    try testTokenize("1.0", &.{.number_literal});
    try testTokenize("10.0", &.{.number_literal});
    try testTokenize("0e0", &.{.number_literal});
    try testTokenize("1e0", &.{.number_literal});
    try testTokenize("1e100", &.{.number_literal});
    try testTokenize("1.0e100", &.{.number_literal});
    try testTokenize("1.0e+100", &.{.number_literal});
    try testTokenize("1.0e-100", &.{.number_literal});
    try testTokenize("1_0_0_0.0_0_0_0_0_1e1_0_0_0", &.{.number_literal});

    try testTokenize("1.", &.{ .number_literal, .period });
    try testTokenize("1e", &.{.number_literal});
    try testTokenize("1.e100", &.{.number_literal});
    try testTokenize("1.0e1f0", &.{.number_literal});
    try testTokenize("1.0p100", &.{.number_literal});
    try testTokenize("1.0p-100", &.{.number_literal});
    try testTokenize("1.0p1f0", &.{.number_literal});
    try testTokenize("1.0_,", &.{ .number_literal, .comma });
    try testTokenize("1_.0", &.{.number_literal});
    try testTokenize("1._", &.{.number_literal});
    try testTokenize("1.a", &.{.number_literal});
    try testTokenize("1.z", &.{.number_literal});
    try testTokenize("1._0", &.{.number_literal});
    try testTokenize("1.+", &.{ .number_literal, .period, .plus });
    try testTokenize("1._+", &.{ .number_literal, .plus });
    try testTokenize("1._e", &.{.number_literal});
    try testTokenize("1.0e", &.{.number_literal});
    try testTokenize("1.0e,", &.{ .number_literal, .comma });
    try testTokenize("1.0e_", &.{.number_literal});
    try testTokenize("1.0e+_", &.{.number_literal});
    try testTokenize("1.0e-_", &.{.number_literal});
    try testTokenize("1.0e0_+", &.{ .number_literal, .plus });
}

test "number literals binary" {
    try testTokenize("0b0", &.{.number_literal});
    try testTokenize("0b1", &.{.number_literal});
    try testTokenize("0b2", &.{.number_literal});
    try testTokenize("0b3", &.{.number_literal});
    try testTokenize("0b4", &.{.number_literal});
    try testTokenize("0b5", &.{.number_literal});
    try testTokenize("0b6", &.{.number_literal});
    try testTokenize("0b7", &.{.number_literal});
    try testTokenize("0b8", &.{.number_literal});
    try testTokenize("0b9", &.{.number_literal});
    try testTokenize("0ba", &.{.number_literal});
    try testTokenize("0bb", &.{.number_literal});
    try testTokenize("0bc", &.{.number_literal});
    try testTokenize("0bd", &.{.number_literal});
    try testTokenize("0be", &.{.number_literal});
    try testTokenize("0bf", &.{.number_literal});
    try testTokenize("0bz", &.{.number_literal});

    try testTokenize("0b0000_0000", &.{.number_literal});
    try testTokenize("0b1111_1111", &.{.number_literal});
    try testTokenize("0b10_10_10_10", &.{.number_literal});
    try testTokenize("0b0_1_0_1_0_1_0_1", &.{.number_literal});
    try testTokenize("0b1.", &.{ .number_literal, .period });
    try testTokenize("0b1.0", &.{.number_literal});

    try testTokenize("0B0", &.{.number_literal});
    try testTokenize("0b_", &.{.number_literal});
    try testTokenize("0b_0", &.{.number_literal});
    try testTokenize("0b1_", &.{.number_literal});
    try testTokenize("0b0__1", &.{.number_literal});
    try testTokenize("0b0_1_", &.{.number_literal});
    try testTokenize("0b1e", &.{.number_literal});
    try testTokenize("0b1p", &.{.number_literal});
    try testTokenize("0b1e0", &.{.number_literal});
    try testTokenize("0b1p0", &.{.number_literal});
    try testTokenize("0b1_,", &.{ .number_literal, .comma });
}

test "number literals octal" {
    try testTokenize("0o0", &.{.number_literal});
    try testTokenize("0o1", &.{.number_literal});
    try testTokenize("0o2", &.{.number_literal});
    try testTokenize("0o3", &.{.number_literal});
    try testTokenize("0o4", &.{.number_literal});
    try testTokenize("0o5", &.{.number_literal});
    try testTokenize("0o6", &.{.number_literal});
    try testTokenize("0o7", &.{.number_literal});
    try testTokenize("0o8", &.{.number_literal});
    try testTokenize("0o9", &.{.number_literal});
    try testTokenize("0oa", &.{.number_literal});
    try testTokenize("0ob", &.{.number_literal});
    try testTokenize("0oc", &.{.number_literal});
    try testTokenize("0od", &.{.number_literal});
    try testTokenize("0oe", &.{.number_literal});
    try testTokenize("0of", &.{.number_literal});
    try testTokenize("0oz", &.{.number_literal});

    try testTokenize("0o01234567", &.{.number_literal});
    try testTokenize("0o0123_4567", &.{.number_literal});
    try testTokenize("0o01_23_45_67", &.{.number_literal});
    try testTokenize("0o0_1_2_3_4_5_6_7", &.{.number_literal});
    try testTokenize("0o7.", &.{ .number_literal, .period });
    try testTokenize("0o7.0", &.{.number_literal});

    try testTokenize("0O0", &.{.number_literal});
    try testTokenize("0o_", &.{.number_literal});
    try testTokenize("0o_0", &.{.number_literal});
    try testTokenize("0o1_", &.{.number_literal});
    try testTokenize("0o0__1", &.{.number_literal});
    try testTokenize("0o0_1_", &.{.number_literal});
    try testTokenize("0o1e", &.{.number_literal});
    try testTokenize("0o1p", &.{.number_literal});
    try testTokenize("0o1e0", &.{.number_literal});
    try testTokenize("0o1p0", &.{.number_literal});
    try testTokenize("0o_,", &.{ .number_literal, .comma });
}

test "number literals hexadecimal" {
    try testTokenize("0x0", &.{.number_literal});
    try testTokenize("0x1", &.{.number_literal});
    try testTokenize("0x2", &.{.number_literal});
    try testTokenize("0x3", &.{.number_literal});
    try testTokenize("0x4", &.{.number_literal});
    try testTokenize("0x5", &.{.number_literal});
    try testTokenize("0x6", &.{.number_literal});
    try testTokenize("0x7", &.{.number_literal});
    try testTokenize("0x8", &.{.number_literal});
    try testTokenize("0x9", &.{.number_literal});
    try testTokenize("0xa", &.{.number_literal});
    try testTokenize("0xb", &.{.number_literal});
    try testTokenize("0xc", &.{.number_literal});
    try testTokenize("0xd", &.{.number_literal});
    try testTokenize("0xe", &.{.number_literal});
    try testTokenize("0xf", &.{.number_literal});
    try testTokenize("0xA", &.{.number_literal});
    try testTokenize("0xB", &.{.number_literal});
    try testTokenize("0xC", &.{.number_literal});
    try testTokenize("0xD", &.{.number_literal});
    try testTokenize("0xE", &.{.number_literal});
    try testTokenize("0xF", &.{.number_literal});
    try testTokenize("0x0z", &.{.number_literal});
    try testTokenize("0xz", &.{.number_literal});

    try testTokenize("0x0123456789ABCDEF", &.{.number_literal});
    try testTokenize("0x0123_4567_89AB_CDEF", &.{.number_literal});
    try testTokenize("0x01_23_45_67_89AB_CDE_F", &.{.number_literal});
    try testTokenize("0x0_1_2_3_4_5_6_7_8_9_A_B_C_D_E_F", &.{.number_literal});

    try testTokenize("0X0", &.{.number_literal});
    try testTokenize("0x_", &.{.number_literal});
    try testTokenize("0x_1", &.{.number_literal});
    try testTokenize("0x1_", &.{.number_literal});
    try testTokenize("0x0__1", &.{.number_literal});
    try testTokenize("0x0_1_", &.{.number_literal});
    try testTokenize("0x_,", &.{ .number_literal, .comma });

    try testTokenize("0x1.0", &.{.number_literal});
    try testTokenize("0xF.0", &.{.number_literal});
    try testTokenize("0xF.F", &.{.number_literal});
    try testTokenize("0xF.Fp0", &.{.number_literal});
    try testTokenize("0xF.FP0", &.{.number_literal});
    try testTokenize("0x1p0", &.{.number_literal});
    try testTokenize("0xfp0", &.{.number_literal});
    try testTokenize("0x1.0+0xF.0", &.{ .number_literal, .plus, .number_literal });

    try testTokenize("0x1.", &.{ .number_literal, .period });
    try testTokenize("0xF.", &.{ .number_literal, .period });
    try testTokenize("0x1.+0xF.", &.{ .number_literal, .period, .plus, .number_literal, .period });
    try testTokenize("0xff.p10", &.{.number_literal});

    try testTokenize("0x0123456.789ABCDEF", &.{.number_literal});
    try testTokenize("0x0_123_456.789_ABC_DEF", &.{.number_literal});
    try testTokenize("0x0_1_2_3_4_5_6.7_8_9_A_B_C_D_E_F", &.{.number_literal});
    try testTokenize("0x0p0", &.{.number_literal});
    try testTokenize("0x0.0p0", &.{.number_literal});
    try testTokenize("0xff.ffp10", &.{.number_literal});
    try testTokenize("0xff.ffP10", &.{.number_literal});
    try testTokenize("0xffp10", &.{.number_literal});
    try testTokenize("0xff_ff.ff_ffp1_0_0_0", &.{.number_literal});
    try testTokenize("0xf_f_f_f.f_f_f_fp+1_000", &.{.number_literal});
    try testTokenize("0xf_f_f_f.f_f_f_fp-1_00_0", &.{.number_literal});

    try testTokenize("0x1e", &.{.number_literal});
    try testTokenize("0x1e0", &.{.number_literal});
    try testTokenize("0x1p", &.{.number_literal});
    try testTokenize("0xfp0z1", &.{.number_literal});
    try testTokenize("0xff.ffpff", &.{.number_literal});
    try testTokenize("0x0.p", &.{.number_literal});
    try testTokenize("0x0.z", &.{.number_literal});
    try testTokenize("0x0._", &.{.number_literal});
    try testTokenize("0x0_.0", &.{.number_literal});
    try testTokenize("0x0_.0.0", &.{ .number_literal, .period, .number_literal });
    try testTokenize("0x0._0", &.{.number_literal});
    try testTokenize("0x0.0_", &.{.number_literal});
    try testTokenize("0x0_p0", &.{.number_literal});
    try testTokenize("0x0_.p0", &.{.number_literal});
    try testTokenize("0x0._p0", &.{.number_literal});
    try testTokenize("0x0.0_p0", &.{.number_literal});
    try testTokenize("0x0._0p0", &.{.number_literal});
    try testTokenize("0x0.0p_0", &.{.number_literal});
    try testTokenize("0x0.0p+_0", &.{.number_literal});
    try testTokenize("0x0.0p-_0", &.{.number_literal});
    try testTokenize("0x0.0p0_", &.{.number_literal});
}

test "multi line string literal with only 1 backslash" {
    try testTokenize("x \\\n;", &.{ .identifier, .invalid, .semicolon });
}

test "invalid builtin identifiers" {
    try testTokenize("@()", &.{.invalid});
    try testTokenize("@0()", &.{.invalid});
}

test "invalid token with unfinished escape right before eof" {
    try testTokenize("\"\\", &.{.invalid});
    try testTokenize("'\\", &.{.invalid});
    try testTokenize("'\\u", &.{.invalid});
}

test "saturating operators" {
    try testTokenize("<<", &.{.angle_bracket_angle_bracket_left});
    try testTokenize("<<|", &.{.angle_bracket_angle_bracket_left_pipe});
    try testTokenize("<<|=", &.{.angle_bracket_angle_bracket_left_pipe_equal});

    try testTokenize("*", &.{.asterisk});
    try testTokenize("*|", &.{.asterisk_pipe});
    try testTokenize("*|=", &.{.asterisk_pipe_equal});

    try testTokenize("+", &.{.plus});
    try testTokenize("+|", &.{.plus_pipe});
    try testTokenize("+|=", &.{.plus_pipe_equal});

    try testTokenize("-", &.{.minus});
    try testTokenize("-|", &.{.minus_pipe});
    try testTokenize("-|=", &.{.minus_pipe_equal});
}

test "null byte before eof" {
    try testTokenize("123 \x00 456", &.{ .number_literal, .invalid });
    try testTokenize("//\x00", &.{.invalid});
    try testTokenize("\\\\\x00", &.{.invalid});
    try testTokenize("\x00", &.{.invalid});
    try testTokenize("// NUL\x00\n", &.{.invalid});
    try testTokenize("///\x00\n", &.{ .doc_comment, .invalid });
    try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid });
}

test "invalid tabs and carriage returns" {
    // "Inside Line Comments and Documentation Comments, Any TAB is rejected by
    // the grammar since it is ambiguous how it should be rendered."
    // https://github.com/ziglang/zig-spec/issues/38
    try testTokenize("//\t", &.{.invalid});
    try testTokenize("// \t", &.{.invalid});
    try testTokenize("///\t", &.{.invalid});
    try testTokenize("/// \t", &.{.invalid});
    try testTokenize("//!\t", &.{.invalid});
    try testTokenize("//! \t", &.{.invalid});

    // "Inside Line Comments and Documentation Comments, CR directly preceding
    // NL is unambiguously part of the newline sequence. It is accepted by the
    // grammar and removed by zig fmt, leaving only NL. CR anywhere else is
    // rejected by the grammar."
    // https://github.com/ziglang/zig-spec/issues/38
    try testTokenize("//\r", &.{.invalid});
    try testTokenize("// \r", &.{.invalid});
    try testTokenize("///\r", &.{.invalid});
    try testTokenize("/// \r", &.{.invalid});
    try testTokenize("//\r ", &.{.invalid});
    try testTokenize("// \r ", &.{.invalid});
    try testTokenize("///\r ", &.{.invalid});
    try testTokenize("/// \r ", &.{.invalid});
    try testTokenize("//\r\n", &.{});
    try testTokenize("// \r\n", &.{});
    try testTokenize("///\r\n", &.{.doc_comment});
    try testTokenize("/// \r\n", &.{.doc_comment});
    try testTokenize("//!\r", &.{.invalid});
    try testTokenize("//! \r", &.{.invalid});
    try testTokenize("//!\r ", &.{.invalid});
    try testTokenize("//! \r ", &.{.invalid});
    try testTokenize("//!\r\n", &.{.container_doc_comment});
    try testTokenize("//! \r\n", &.{.container_doc_comment});

    // The control characters TAB and CR are rejected by the grammar inside multi-line string literals,
    // except if CR is directly before NL.
    // https://github.com/ziglang/zig-spec/issues/38
    try testTokenize("\\\\\r", &.{.invalid});
    try testTokenize("\\\\\r ", &.{.invalid});
    try testTokenize("\\\\ \r", &.{.invalid});
    try testTokenize("\\\\\t", &.{.invalid});
    try testTokenize("\\\\\t ", &.{.invalid});
    try testTokenize("\\\\ \t", &.{.invalid});
    try testTokenize("\\\\\r\n", &.{.multiline_string_literal_line});

    // "TAB used as whitespace is...accepted by the grammar. CR used as
    // whitespace, whether directly preceding NL or stray, is...accepted by the
    // grammar."
    // https://github.com/ziglang/zig-spec/issues/38
    try testTokenize("\tpub\tswitch\t", &.{ .keyword_pub, .keyword_switch });
    try testTokenize("\rpub\rswitch\r", &.{ .keyword_pub, .keyword_switch });
}