blob c8f33dbf (71354B) - Raw
// SPDX-License-Identifier: MIT
// Copyright (c) 2015-2020 Zig Contributors
// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
// The MIT license requires this copyright notice to be included in all copies
// and substantial portions of the software.
const std = @import("../std.zig");
const mem = std.mem;

/// A single lexical token: its kind plus the byte range it occupies in the
/// buffer handed to `Tokenizer.init`.
pub const Token = struct {
    id: Id,
    loc: Loc,

    /// Byte offsets into the tokenizer's buffer. `end` is exclusive: the
    /// tokenizer stores its current index (one past the last consumed byte).
    pub const Loc = struct {
        start: usize,
        end: usize,
    };

    /// Compile-time map from keyword spelling to token id.
    /// Note that "noasync" is still accepted and maps to `Keyword_nosuspend`.
    pub const keywords = std.ComptimeStringMap(Id, .{
        .{ "align", .Keyword_align },
        .{ "allowzero", .Keyword_allowzero },
        .{ "and", .Keyword_and },
        .{ "anyframe", .Keyword_anyframe },
        .{ "anytype", .Keyword_anytype },
        .{ "asm", .Keyword_asm },
        .{ "async", .Keyword_async },
        .{ "await", .Keyword_await },
        .{ "break", .Keyword_break },
        .{ "callconv", .Keyword_callconv },
        .{ "catch", .Keyword_catch },
        .{ "comptime", .Keyword_comptime },
        .{ "const", .Keyword_const },
        .{ "continue", .Keyword_continue },
        .{ "defer", .Keyword_defer },
        .{ "else", .Keyword_else },
        .{ "enum", .Keyword_enum },
        .{ "errdefer", .Keyword_errdefer },
        .{ "error", .Keyword_error },
        .{ "export", .Keyword_export },
        .{ "extern", .Keyword_extern },
        .{ "false", .Keyword_false },
        .{ "fn", .Keyword_fn },
        .{ "for", .Keyword_for },
        .{ "if", .Keyword_if },
        .{ "inline", .Keyword_inline },
        .{ "noalias", .Keyword_noalias },
        .{ "noasync", .Keyword_nosuspend }, // TODO: remove this
        .{ "noinline", .Keyword_noinline },
        .{ "nosuspend", .Keyword_nosuspend },
        .{ "null", .Keyword_null },
        .{ "opaque", .Keyword_opaque },
        .{ "or", .Keyword_or },
        .{ "orelse", .Keyword_orelse },
        .{ "packed", .Keyword_packed },
        .{ "pub", .Keyword_pub },
        .{ "resume", .Keyword_resume },
        .{ "return", .Keyword_return },
        .{ "linksection", .Keyword_linksection },
        .{ "struct", .Keyword_struct },
        .{ "suspend", .Keyword_suspend },
        .{ "switch", .Keyword_switch },
        .{ "test", .Keyword_test },
        .{ "threadlocal", .Keyword_threadlocal },
        .{ "true", .Keyword_true },
        .{ "try", .Keyword_try },
        .{ "undefined", .Keyword_undefined },
        .{ "union", .Keyword_union },
        .{ "unreachable", .Keyword_unreachable },
        .{ "usingnamespace", .Keyword_usingnamespace },
        .{ "var", .Keyword_var },
        .{ "volatile", .Keyword_volatile },
        .{ "while", .Keyword_while },
    });

    /// Returns the keyword id for `bytes`, or null when `bytes` is not a keyword.
    pub fn getKeyword(bytes: []const u8) ?Id {
        return keywords.get(bytes);
    }

    /// Every kind of token the tokenizer can produce.
    pub const Id = enum {
        Invalid,
        Invalid_ampersands,
        Identifier,
        StringLiteral,
        MultilineStringLiteralLine,
        CharLiteral,
        Eof,
        Builtin,
        Bang,
        Pipe,
        PipePipe,
        PipeEqual,
        Equal,
        EqualEqual,
        EqualAngleBracketRight,
        BangEqual,
        LParen,
        RParen,
        Semicolon,
        Percent,
        PercentEqual,
        LBrace,
        RBrace,
        LBracket,
        RBracket,
        Period,
        PeriodAsterisk,
        Ellipsis2,
        Ellipsis3,
        Caret,
        CaretEqual,
        Plus,
        PlusPlus,
        PlusEqual,
        PlusPercent,
        PlusPercentEqual,
        Minus,
        MinusEqual,
        MinusPercent,
        MinusPercentEqual,
        Asterisk,
        AsteriskEqual,
        AsteriskAsterisk,
        AsteriskPercent,
        AsteriskPercentEqual,
        Arrow,
        Colon,
        Slash,
        SlashEqual,
        Comma,
        Ampersand,
        AmpersandEqual,
        QuestionMark,
        AngleBracketLeft,
        AngleBracketLeftEqual,
        AngleBracketAngleBracketLeft,
        AngleBracketAngleBracketLeftEqual,
        AngleBracketRight,
        AngleBracketRightEqual,
        AngleBracketAngleBracketRight,
        AngleBracketAngleBracketRightEqual,
        Tilde,
        IntegerLiteral,
        FloatLiteral,
        LineComment,
        DocComment,
        ContainerDocComment,
        ShebangLine,
        Keyword_align,
        Keyword_allowzero,
        Keyword_and,
        Keyword_anyframe,
        Keyword_anytype,
        Keyword_asm,
        Keyword_async,
        Keyword_await,
        Keyword_break,
        Keyword_callconv,
        Keyword_catch,
        Keyword_comptime,
        Keyword_const,
        Keyword_continue,
        Keyword_defer,
        Keyword_else,
        Keyword_enum,
        Keyword_errdefer,
        Keyword_error,
        Keyword_export,
        Keyword_extern,
        Keyword_false,
        Keyword_fn,
        Keyword_for,
        Keyword_if,
        Keyword_inline,
        Keyword_noalias,
        Keyword_noinline,
        Keyword_nosuspend,
        Keyword_null,
        Keyword_opaque,
        Keyword_or,
        Keyword_orelse,
        Keyword_packed,
        Keyword_pub,
        Keyword_resume,
        Keyword_return,
        Keyword_linksection,
        Keyword_struct,
        Keyword_suspend,
        Keyword_switch,
        Keyword_test,
        Keyword_threadlocal,
        Keyword_true,
        Keyword_try,
        Keyword_undefined,
        Keyword_union,
        Keyword_unreachable,
        Keyword_usingnamespace,
        Keyword_var,
        Keyword_volatile,
        Keyword_while,

        /// Returns a human-readable spelling for `id`: the literal source
        /// text for operators, punctuation and keywords, and the tag name
        /// for token kinds that have no fixed spelling.
        pub fn symbol(id: Id) []const u8 {
            return switch (id) {
                .Invalid => "Invalid",
                .Invalid_ampersands => "&&",
                .Identifier => "Identifier",
                .StringLiteral => "StringLiteral",
                .MultilineStringLiteralLine => "MultilineStringLiteralLine",
                .CharLiteral => "CharLiteral",
                .Eof => "Eof",
                .Builtin => "Builtin",
                .IntegerLiteral => "IntegerLiteral",
                .FloatLiteral => "FloatLiteral",
                .LineComment => "LineComment",
                .DocComment => "DocComment",
                .ContainerDocComment => "ContainerDocComment",
                .ShebangLine => "ShebangLine",

                .Bang => "!",
                .Pipe => "|",
                .PipePipe => "||",
                .PipeEqual => "|=",
                .Equal => "=",
                .EqualEqual => "==",
                .EqualAngleBracketRight => "=>",
                .BangEqual => "!=",
                .LParen => "(",
                .RParen => ")",
                .Semicolon => ";",
                .Percent => "%",
                .PercentEqual => "%=",
                .LBrace => "{",
                .RBrace => "}",
                .LBracket => "[",
                .RBracket => "]",
                .Period => ".",
                .PeriodAsterisk => ".*",
                .Ellipsis2 => "..",
                .Ellipsis3 => "...",
                .Caret => "^",
                .CaretEqual => "^=",
                .Plus => "+",
                .PlusPlus => "++",
                .PlusEqual => "+=",
                .PlusPercent => "+%",
                .PlusPercentEqual => "+%=",
                .Minus => "-",
                .MinusEqual => "-=",
                .MinusPercent => "-%",
                .MinusPercentEqual => "-%=",
                .Asterisk => "*",
                .AsteriskEqual => "*=",
                .AsteriskAsterisk => "**",
                .AsteriskPercent => "*%",
                .AsteriskPercentEqual => "*%=",
                .Arrow => "->",
                .Colon => ":",
                .Slash => "/",
                .SlashEqual => "/=",
                .Comma => ",",
                .Ampersand => "&",
                .AmpersandEqual => "&=",
                .QuestionMark => "?",
                .AngleBracketLeft => "<",
                .AngleBracketLeftEqual => "<=",
                .AngleBracketAngleBracketLeft => "<<",
                .AngleBracketAngleBracketLeftEqual => "<<=",
                .AngleBracketRight => ">",
                .AngleBracketRightEqual => ">=",
                .AngleBracketAngleBracketRight => ">>",
                .AngleBracketAngleBracketRightEqual => ">>=",
                .Tilde => "~",
                .Keyword_align => "align",
                .Keyword_allowzero => "allowzero",
                .Keyword_and => "and",
                .Keyword_anyframe => "anyframe",
                .Keyword_anytype => "anytype",
                .Keyword_asm => "asm",
                .Keyword_async => "async",
                .Keyword_await => "await",
                .Keyword_break => "break",
                .Keyword_callconv => "callconv",
                .Keyword_catch => "catch",
                .Keyword_comptime => "comptime",
                .Keyword_const => "const",
                .Keyword_continue => "continue",
                .Keyword_defer => "defer",
                .Keyword_else => "else",
                .Keyword_enum => "enum",
                .Keyword_errdefer => "errdefer",
                .Keyword_error => "error",
                .Keyword_export => "export",
                .Keyword_extern => "extern",
                .Keyword_false => "false",
                .Keyword_fn => "fn",
                .Keyword_for => "for",
                .Keyword_if => "if",
                .Keyword_inline => "inline",
                .Keyword_noalias => "noalias",
                .Keyword_noinline => "noinline",
                .Keyword_nosuspend => "nosuspend",
                .Keyword_null => "null",
                .Keyword_opaque => "opaque",
                .Keyword_or => "or",
                .Keyword_orelse => "orelse",
                .Keyword_packed => "packed",
                .Keyword_pub => "pub",
                .Keyword_resume => "resume",
                .Keyword_return => "return",
                .Keyword_linksection => "linksection",
                .Keyword_struct => "struct",
                .Keyword_suspend => "suspend",
                .Keyword_switch => "switch",
                .Keyword_test => "test",
                .Keyword_threadlocal => "threadlocal",
                .Keyword_true => "true",
                .Keyword_try => "try",
                .Keyword_undefined => "undefined",
                .Keyword_union => "union",
                .Keyword_unreachable => "unreachable",
                .Keyword_usingnamespace => "usingnamespace",
                .Keyword_var => "var",
                .Keyword_volatile => "volatile",
                .Keyword_while => "while",
            };
        }
    };
};

/// Byte-oriented state machine that splits `buffer` into `Token`s.
/// Call `next` repeatedly; a token with id `.Eof` marks the end of input.
pub const Tokenizer = struct {
    buffer: []const u8,
    /// Offset of the next byte of `buffer` to examine.
    index: usize,
    /// An `.Invalid` token found inside a literal or comment. It is held back
    /// so the enclosing token is returned first, then handed out by the
    /// following call to `next`.
    pending_invalid_token: ?Token,

    /// For debugging purposes: prints the token's tag name and source text.
    pub fn dump(self: *Tokenizer, token: *const Token) void {
        // The byte range lives in `token.loc`; `Token` has no `start`/`end` fields.
        std.debug.warn("{} \"{}\"\n", .{ @tagName(token.id), self.buffer[token.loc.start..token.loc.end] });
    }

    /// Creates a tokenizer positioned at the start of `buffer`, just past a
    /// leading UTF-8 byte order mark when one is present.
    pub fn init(buffer: []const u8) Tokenizer {
        // Skip the UTF-8 BOM if present
        const src_start = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else @as(usize, 0);
        return Tokenizer{
            .buffer = buffer,
            .index = src_start,
            .pending_invalid_token = null,
        };
    }

    /// States of the scanner in `next`. The `*_no_underscore` states are
    /// entered right after a base prefix, sign or `_` and require a digit next.
    const State = enum {
        start,
        identifier,
        builtin,
        string_literal,
        string_literal_backslash,
        multiline_string_literal_line,
        char_literal,
        char_literal_backslash,
        char_literal_hex_escape,
        char_literal_unicode_escape_saw_u,
        char_literal_unicode_escape,
        char_literal_unicode_invalid,
        char_literal_unicode,
        char_literal_end,
        backslash,
        equal,
        bang,
        pipe,
        minus,
        minus_percent,
        asterisk,
        asterisk_percent,
        slash,
        line_comment_start,
        line_comment,
        doc_comment_start,
        doc_comment,
        container_doc_comment,
        zero,
        int_literal_dec,
        int_literal_dec_no_underscore,
        int_literal_bin,
        int_literal_bin_no_underscore,
        int_literal_oct,
        int_literal_oct_no_underscore,
        int_literal_hex,
        int_literal_hex_no_underscore,
        num_dot_dec,
        num_dot_hex,
        float_fraction_dec,
        float_fraction_dec_no_underscore,
        float_fraction_hex,
        float_fraction_hex_no_underscore,
        float_exponent_unsigned,
        float_exponent_num,
        float_exponent_num_no_underscore,
        ampersand,
        caret,
        percent,
        plus,
        plus_percent,
        angle_bracket_left,
        angle_bracket_angle_bracket_left,
        angle_bracket_right,
        angle_bracket_angle_bracket_right,
        period,
        period_2,
        saw_at_sign,
    };

    /// True for ASCII letters, digits and `_`.
    fn isIdentifierChar(char: u8) bool {
        return std.ascii.isAlNum(char) or char == '_';
    }

    /// Scans and returns the next token.
    ///
    /// A pending invalid token recorded by a previous call is returned first.
    /// Otherwise bytes are consumed from `self.index` until a token is
    /// complete or the buffer ends. Leading spaces, tabs, `\n` and `\r` are
    /// skipped. When nothing but whitespace remains, the result has id `.Eof`.
    pub fn next(self: *Tokenizer) Token {
        if (self.pending_invalid_token) |token| {
            self.pending_invalid_token = null;
            return token;
        }
        var state: State = .start;
        var result = Token{
            .id = .Eof,
            .loc = .{
                .start = self.index,
                .end = undefined,
            },
        };
        // Both counters are assigned on entry to the states that read them.
        var seen_escape_digits: usize = undefined;
        var remaining_code_units: usize = undefined;
        while (self.index < self.buffer.len) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
                .start => switch (c) {
                    ' ', '\n', '\t', '\r' => {
                        result.loc.start = self.index + 1;
                    },
                    '"' => {
                        state = .string_literal;
                        result.id = .StringLiteral;
                    },
                    '\'' => {
                        state = .char_literal;
                    },
                    'a'...'z', 'A'...'Z', '_' => {
                        state = .identifier;
                        result.id = .Identifier;
                    },
                    '@' => {
                        state = .saw_at_sign;
                    },
                    '=' => {
                        state = .equal;
                    },
                    '!' => {
                        state = .bang;
                    },
                    '|' => {
                        state = .pipe;
                    },
                    '(' => {
                        result.id = .LParen;
                        self.index += 1;
                        break;
                    },
                    ')' => {
                        result.id = .RParen;
                        self.index += 1;
                        break;
                    },
                    '[' => {
                        result.id = .LBracket;
                        self.index += 1;
                        break;
                    },
                    ']' => {
                        result.id = .RBracket;
                        self.index += 1;
                        break;
                    },
                    ';' => {
                        result.id = .Semicolon;
                        self.index += 1;
                        break;
                    },
                    ',' => {
                        result.id = .Comma;
                        self.index += 1;
                        break;
                    },
                    '?' => {
                        result.id = .QuestionMark;
                        self.index += 1;
                        break;
                    },
                    ':' => {
                        result.id = .Colon;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = .percent;
                    },
                    '*' => {
                        state = .asterisk;
                    },
                    '+' => {
                        state = .plus;
                    },
                    '<' => {
                        state = .angle_bracket_left;
                    },
                    '>' => {
                        state = .angle_bracket_right;
                    },
                    '^' => {
                        state = .caret;
                    },
                    '\\' => {
                        state = .backslash;
                        result.id = .MultilineStringLiteralLine;
                    },
                    '{' => {
                        result.id = .LBrace;
                        self.index += 1;
                        break;
                    },
                    '}' => {
                        result.id = .RBrace;
                        self.index += 1;
                        break;
                    },
                    '~' => {
                        result.id = .Tilde;
                        self.index += 1;
                        break;
                    },
                    '.' => {
                        state = .period;
                    },
                    '-' => {
                        state = .minus;
                    },
                    '/' => {
                        state = .slash;
                    },
                    '&' => {
                        state = .ampersand;
                    },
                    '0' => {
                        state = .zero;
                        result.id = .IntegerLiteral;
                    },
                    '1'...'9' => {
                        state = .int_literal_dec;
                        result.id = .IntegerLiteral;
                    },
                    else => {
                        result.id = .Invalid;
                        self.index += 1;
                        break;
                    },
                },

                .saw_at_sign => switch (c) {
                    '"' => {
                        result.id = .Identifier;
                        state = .string_literal;
                    },
                    else => {
                        // reinterpret as a builtin
                        self.index -= 1;
                        state = .builtin;
                        result.id = .Builtin;
                    },
                },

                .ampersand => switch (c) {
                    '&' => {
                        result.id = .Invalid_ampersands;
                        self.index += 1;
                        break;
                    },
                    '=' => {
                        result.id = .AmpersandEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .Ampersand;
                        break;
                    },
                },

                .asterisk => switch (c) {
                    '=' => {
                        result.id = .AsteriskEqual;
                        self.index += 1;
                        break;
                    },
                    '*' => {
                        result.id = .AsteriskAsterisk;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = .asterisk_percent;
                    },
                    else => {
                        result.id = .Asterisk;
                        break;
                    },
                },

                .asterisk_percent => switch (c) {
                    '=' => {
                        result.id = .AsteriskPercentEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .AsteriskPercent;
                        break;
                    },
                },

                .percent => switch (c) {
                    '=' => {
                        result.id = .PercentEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .Percent;
                        break;
                    },
                },

                .plus => switch (c) {
                    '=' => {
                        result.id = .PlusEqual;
                        self.index += 1;
                        break;
                    },
                    '+' => {
                        result.id = .PlusPlus;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = .plus_percent;
                    },
                    else => {
                        result.id = .Plus;
                        break;
                    },
                },

                .plus_percent => switch (c) {
                    '=' => {
                        result.id = .PlusPercentEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .PlusPercent;
                        break;
                    },
                },

                .caret => switch (c) {
                    '=' => {
                        result.id = .CaretEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .Caret;
                        break;
                    },
                },

                .identifier => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                    else => {
                        if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |id| {
                            result.id = id;
                        }
                        break;
                    },
                },
                .builtin => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                    else => break,
                },
                .backslash => switch (c) {
                    '\\' => {
                        state = .multiline_string_literal_line;
                    },
                    else => break,
                },
                .string_literal => switch (c) {
                    '\\' => {
                        state = .string_literal_backslash;
                    },
                    '"' => {
                        self.index += 1;
                        break;
                    },
                    '\n', '\r' => break, // Look for this error later.
                    else => self.checkLiteralCharacter(),
                },

                .string_literal_backslash => switch (c) {
                    '\n', '\r' => break, // Look for this error later.
                    else => {
                        state = .string_literal;
                    },
                },

                .char_literal => switch (c) {
                    '\\' => {
                        state = .char_literal_backslash;
                    },
                    '\'', 0x80...0xbf, 0xf8...0xff => {
                        result.id = .Invalid;
                        break;
                    },
                    0xc0...0xdf => { // 110xxxxx
                        remaining_code_units = 1;
                        state = .char_literal_unicode;
                    },
                    0xe0...0xef => { // 1110xxxx
                        remaining_code_units = 2;
                        state = .char_literal_unicode;
                    },
                    0xf0...0xf7 => { // 11110xxx
                        remaining_code_units = 3;
                        state = .char_literal_unicode;
                    },
                    else => {
                        state = .char_literal_end;
                    },
                },

                .char_literal_backslash => switch (c) {
                    '\n' => {
                        result.id = .Invalid;
                        break;
                    },
                    'x' => {
                        state = .char_literal_hex_escape;
                        seen_escape_digits = 0;
                    },
                    'u' => {
                        state = .char_literal_unicode_escape_saw_u;
                    },
                    else => {
                        state = .char_literal_end;
                    },
                },

                .char_literal_hex_escape => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        seen_escape_digits += 1;
                        if (seen_escape_digits == 2) {
                            state = .char_literal_end;
                        }
                    },
                    else => {
                        result.id = .Invalid;
                        break;
                    },
                },

                .char_literal_unicode_escape_saw_u => switch (c) {
                    '{' => {
                        state = .char_literal_unicode_escape;
                        seen_escape_digits = 0;
                    },
                    else => {
                        result.id = .Invalid;
                        state = .char_literal_unicode_invalid;
                    },
                },

                .char_literal_unicode_escape => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        seen_escape_digits += 1;
                    },
                    '}' => {
                        if (seen_escape_digits == 0) {
                            result.id = .Invalid;
                            state = .char_literal_unicode_invalid;
                        } else {
                            state = .char_literal_end;
                        }
                    },
                    else => {
                        result.id = .Invalid;
                        state = .char_literal_unicode_invalid;
                    },
                },

                .char_literal_unicode_invalid => switch (c) {
                    // Keep consuming characters until an obvious stopping point.
                    // This consolidates e.g. `u{0ab1Q}` into a single invalid token
                    // instead of creating the tokens `u{0ab1`, `Q`, `}`
                    '0'...'9', 'a'...'z', 'A'...'Z', '}' => {},
                    else => break,
                },

                .char_literal_end => switch (c) {
                    '\'' => {
                        result.id = .CharLiteral;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .Invalid;
                        break;
                    },
                },

                .char_literal_unicode => switch (c) {
                    0x80...0xbf => {
                        remaining_code_units -= 1;
                        if (remaining_code_units == 0) {
                            state = .char_literal_end;
                        }
                    },
                    else => {
                        result.id = .Invalid;
                        break;
                    },
                },

                .multiline_string_literal_line => switch (c) {
                    '\n' => {
                        self.index += 1;
                        break;
                    },
                    '\t' => {},
                    else => self.checkLiteralCharacter(),
                },

                .bang => switch (c) {
                    '=' => {
                        result.id = .BangEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .Bang;
                        break;
                    },
                },

                .pipe => switch (c) {
                    '=' => {
                        result.id = .PipeEqual;
                        self.index += 1;
                        break;
                    },
                    '|' => {
                        result.id = .PipePipe;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .Pipe;
                        break;
                    },
                },

                .equal => switch (c) {
                    '=' => {
                        result.id = .EqualEqual;
                        self.index += 1;
                        break;
                    },
                    '>' => {
                        result.id = .EqualAngleBracketRight;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .Equal;
                        break;
                    },
                },

                .minus => switch (c) {
                    '>' => {
                        result.id = .Arrow;
                        self.index += 1;
                        break;
                    },
                    '=' => {
                        result.id = .MinusEqual;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = .minus_percent;
                    },
                    else => {
                        result.id = .Minus;
                        break;
                    },
                },

                .minus_percent => switch (c) {
                    '=' => {
                        result.id = .MinusPercentEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .MinusPercent;
                        break;
                    },
                },

                .angle_bracket_left => switch (c) {
                    '<' => {
                        state = .angle_bracket_angle_bracket_left;
                    },
                    '=' => {
                        result.id = .AngleBracketLeftEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .AngleBracketLeft;
                        break;
                    },
                },

                .angle_bracket_angle_bracket_left => switch (c) {
                    '=' => {
                        result.id = .AngleBracketAngleBracketLeftEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .AngleBracketAngleBracketLeft;
                        break;
                    },
                },

                .angle_bracket_right => switch (c) {
                    '>' => {
                        state = .angle_bracket_angle_bracket_right;
                    },
                    '=' => {
                        result.id = .AngleBracketRightEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .AngleBracketRight;
                        break;
                    },
                },

                .angle_bracket_angle_bracket_right => switch (c) {
                    '=' => {
                        result.id = .AngleBracketAngleBracketRightEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .AngleBracketAngleBracketRight;
                        break;
                    },
                },

                .period => switch (c) {
                    '.' => {
                        state = .period_2;
                    },
                    '*' => {
                        result.id = .PeriodAsterisk;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .Period;
                        break;
                    },
                },

                .period_2 => switch (c) {
                    '.' => {
                        result.id = .Ellipsis3;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .Ellipsis2;
                        break;
                    },
                },

                .slash => switch (c) {
                    '/' => {
                        state = .line_comment_start;
                        result.id = .LineComment;
                    },
                    '=' => {
                        result.id = .SlashEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = .Slash;
                        break;
                    },
                },
                .line_comment_start => switch (c) {
                    '/' => {
                        state = .doc_comment_start;
                    },
                    '!' => {
                        result.id = .ContainerDocComment;
                        state = .container_doc_comment;
                    },
                    '\n' => break,
                    '\t', '\r' => state = .line_comment,
                    else => {
                        state = .line_comment;
                        self.checkLiteralCharacter();
                    },
                },
                .doc_comment_start => switch (c) {
                    '/' => {
                        state = .line_comment;
                    },
                    '\n' => {
                        result.id = .DocComment;
                        break;
                    },
                    '\t', '\r' => {
                        state = .doc_comment;
                        result.id = .DocComment;
                    },
                    else => {
                        state = .doc_comment;
                        result.id = .DocComment;
                        self.checkLiteralCharacter();
                    },
                },
                .line_comment, .doc_comment, .container_doc_comment => switch (c) {
                    '\n' => break,
                    '\t', '\r' => {},
                    else => self.checkLiteralCharacter(),
                },
                .zero => switch (c) {
                    'b' => {
                        state = .int_literal_bin_no_underscore;
                    },
                    'o' => {
                        state = .int_literal_oct_no_underscore;
                    },
                    'x' => {
                        state = .int_literal_hex_no_underscore;
                    },
                    '0'...'9', '_', '.', 'e', 'E' => {
                        // reinterpret as a decimal number
                        self.index -= 1;
                        state = .int_literal_dec;
                    },
                    else => {
                        if (isIdentifierChar(c)) {
                            result.id = .Invalid;
                        }
                        break;
                    },
                },
                .int_literal_bin_no_underscore => switch (c) {
                    '0'...'1' => {
                        state = .int_literal_bin;
                    },
                    else => {
                        result.id = .Invalid;
                        break;
                    },
                },
                .int_literal_bin => switch (c) {
                    '_' => {
                        state = .int_literal_bin_no_underscore;
                    },
                    '0'...'1' => {},
                    else => {
                        if (isIdentifierChar(c)) {
                            result.id = .Invalid;
                        }
                        break;
                    },
                },
                .int_literal_oct_no_underscore => switch (c) {
                    '0'...'7' => {
                        state = .int_literal_oct;
                    },
                    else => {
                        result.id = .Invalid;
                        break;
                    },
                },
                .int_literal_oct => switch (c) {
                    '_' => {
                        state = .int_literal_oct_no_underscore;
                    },
                    '0'...'7' => {},
                    else => {
                        if (isIdentifierChar(c)) {
                            result.id = .Invalid;
                        }
                        break;
                    },
                },
                .int_literal_dec_no_underscore => switch (c) {
                    '0'...'9' => {
                        state = .int_literal_dec;
                    },
                    else => {
                        result.id = .Invalid;
                        break;
                    },
                },
                .int_literal_dec => switch (c) {
                    '_' => {
                        state = .int_literal_dec_no_underscore;
                    },
                    '.' => {
                        state = .num_dot_dec;
                        result.id = .FloatLiteral;
                    },
                    'e', 'E' => {
                        state = .float_exponent_unsigned;
                        result.id = .FloatLiteral;
                    },
                    '0'...'9' => {},
                    else => {
                        if (isIdentifierChar(c)) {
                            result.id = .Invalid;
                        }
                        break;
                    },
                },
                .int_literal_hex_no_underscore => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        state = .int_literal_hex;
                    },
                    else => {
                        result.id = .Invalid;
                        break;
                    },
                },
                .int_literal_hex => switch (c) {
                    '_' => {
                        state = .int_literal_hex_no_underscore;
                    },
                    '.' => {
                        state = .num_dot_hex;
                        result.id = .FloatLiteral;
                    },
                    'p', 'P' => {
                        state = .float_exponent_unsigned;
                        result.id = .FloatLiteral;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => {
                        if (isIdentifierChar(c)) {
                            result.id = .Invalid;
                        }
                        break;
                    },
                },
                .num_dot_dec => switch (c) {
                    '.' => {
                        // `1..` is an integer followed by `..`: step back so
                        // the first period is tokenized again on the next call.
                        result.id = .IntegerLiteral;
                        self.index -= 1;
                        state = .start;
                        break;
                    },
                    'e', 'E' => {
                        state = .float_exponent_unsigned;
                    },
                    '0'...'9' => {
                        state = .float_fraction_dec;
                    },
                    else => {
                        if (isIdentifierChar(c)) {
                            result.id = .Invalid;
                        }
                        break;
                    },
                },
                .num_dot_hex => switch (c) {
                    '.' => {
                        // Same back-off as the decimal case above.
                        result.id = .IntegerLiteral;
                        self.index -= 1;
                        state = .start;
                        break;
                    },
                    'p', 'P' => {
                        state = .float_exponent_unsigned;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        result.id = .FloatLiteral;
                        state = .float_fraction_hex;
                    },
                    else => {
                        if (isIdentifierChar(c)) {
                            result.id = .Invalid;
                        }
                        break;
                    },
                },
                .float_fraction_dec_no_underscore => switch (c) {
                    '0'...'9' => {
                        state = .float_fraction_dec;
                    },
                    else => {
                        result.id = .Invalid;
                        break;
                    },
                },
                .float_fraction_dec => switch (c) {
                    '_' => {
                        state = .float_fraction_dec_no_underscore;
                    },
                    'e', 'E' => {
                        state = .float_exponent_unsigned;
                    },
                    '0'...'9' => {},
                    else => {
                        if (isIdentifierChar(c)) {
                            result.id = .Invalid;
                        }
                        break;
                    },
                },
                .float_fraction_hex_no_underscore => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        state = .float_fraction_hex;
                    },
                    else => {
                        result.id = .Invalid;
                        break;
                    },
                },
                .float_fraction_hex => switch (c) {
                    '_' => {
                        state = .float_fraction_hex_no_underscore;
                    },
                    'p', 'P' => {
                        state = .float_exponent_unsigned;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => {
                        if (isIdentifierChar(c)) {
                            result.id = .Invalid;
                        }
                        break;
                    },
                },
                .float_exponent_unsigned => switch (c) {
                    '+', '-' => {
                        state = .float_exponent_num_no_underscore;
                    },
                    else => {
                        // reinterpret as a normal exponent number
                        self.index -= 1;
                        state = .float_exponent_num_no_underscore;
                    },
                },
                .float_exponent_num_no_underscore => switch (c) {
                    '0'...'9' => {
                        state = .float_exponent_num;
                    },
                    else => {
                        result.id = .Invalid;
                        break;
                    },
                },
                .float_exponent_num => switch (c) {
                    '_' => {
                        state = .float_exponent_num_no_underscore;
                    },
                    '0'...'9' => {},
                    else => {
                        if (isIdentifierChar(c)) {
                            result.id = .Invalid;
                        }
                        break;
                    },
                },
            }
        } else if (self.index == self.buffer.len) {
            // The buffer ended without a `break`: finish whatever token was
            // in progress according to the state the scanner stopped in.
            switch (state) {
                .start,
                .int_literal_dec,
                .int_literal_bin,
                .int_literal_oct,
                .int_literal_hex,
                .num_dot_dec,
                .num_dot_hex,
                .float_fraction_dec,
                .float_fraction_hex,
                .float_exponent_num,
                .string_literal, // find this error later
                .multiline_string_literal_line,
                .builtin,
                => {},

                .identifier => {
                    if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |id| {
                        result.id = id;
                    }
                },
                .line_comment, .line_comment_start => {
                    result.id = .LineComment;
                },
                .doc_comment, .doc_comment_start => {
                    result.id = .DocComment;
                },
                .container_doc_comment => {
                    result.id = .ContainerDocComment;
                },

                .int_literal_dec_no_underscore,
                .int_literal_bin_no_underscore,
                .int_literal_oct_no_underscore,
                .int_literal_hex_no_underscore,
                .float_fraction_dec_no_underscore,
                .float_fraction_hex_no_underscore,
                .float_exponent_num_no_underscore,
                .float_exponent_unsigned,
                .saw_at_sign,
                .backslash,
                .char_literal,
                .char_literal_backslash,
                .char_literal_hex_escape,
                .char_literal_unicode_escape_saw_u,
                .char_literal_unicode_escape,
                .char_literal_unicode_invalid,
                .char_literal_end,
                .char_literal_unicode,
                .string_literal_backslash,
                => {
                    result.id = .Invalid;
                },

                .equal => {
                    result.id = .Equal;
                },
                .bang => {
                    result.id = .Bang;
                },
                .minus => {
                    result.id = .Minus;
                },
                .slash => {
                    result.id = .Slash;
                },
                .zero => {
                    result.id = .IntegerLiteral;
                },
                .ampersand => {
                    result.id = .Ampersand;
                },
                .period => {
                    result.id = .Period;
                },
                .period_2 => {
                    result.id = .Ellipsis2;
                },
                .pipe => {
                    result.id = .Pipe;
                },
                .angle_bracket_angle_bracket_right => {
                    result.id = .AngleBracketAngleBracketRight;
                },
                .angle_bracket_right => {
                    result.id = .AngleBracketRight;
                },
                .angle_bracket_angle_bracket_left => {
                    result.id = .AngleBracketAngleBracketLeft;
                },
                .angle_bracket_left => {
                    result.id = .AngleBracketLeft;
                },
                .plus_percent => {
                    result.id = .PlusPercent;
                },
                .plus => {
                    result.id = .Plus;
                },
                .percent => {
                    result.id = .Percent;
                },
                .caret => {
                    result.id = .Caret;
                },
                .asterisk_percent => {
                    result.id = .AsteriskPercent;
                },
                .asterisk => {
                    result.id = .Asterisk;
                },
                .minus_percent => {
                    result.id = .MinusPercent;
                },
            }
        }

        // An invalid token recorded during this scan takes precedence over Eof.
        if (result.id == .Eof) {
            if (self.pending_invalid_token) |token| {
                self.pending_invalid_token = null;
                return token;
            }
        }

        result.loc.end = self.index;
        return result;
    }

    /// Validates the character at `self.index` inside a literal or comment.
    /// If it is invalid and no invalid token is already pending, records a
    /// pending `.Invalid` token covering it. Valid multi-byte sequences are
    /// skipped over by `getInvalidCharacterLength`.
    fn checkLiteralCharacter(self: *Tokenizer) void {
        if (self.pending_invalid_token != null) return;
        const invalid_length = self.getInvalidCharacterLength();
        if (invalid_length == 0) return;
        self.pending_invalid_token = .{
            .id = .Invalid,
            .loc = .{
                .start = self.index,
                .end = self.index + invalid_length,
            },
        };
    }

    /// Returns the byte length of the invalid character starting at
    /// `self.index`, or 0 when the character is acceptable.
    ///
    /// Side effect: for a valid multi-byte UTF-8 sequence, `self.index` is
    /// advanced to the sequence's last byte so the caller's loop increment
    /// steps past the whole character.
    fn getInvalidCharacterLength(self: *Tokenizer) u3 {
        const c0 = self.buffer[self.index];
        if (c0 < 0x80) {
            if (c0 < 0x20 or c0 == 0x7f) {
                // ascii control codes are never allowed
                // (note that \n was checked before we got here)
                return 1;
            }
            // looks fine to me.
            return 0;
        } else {
            // check utf8-encoded character.
            const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
            if (self.index + length > self.buffer.len) {
                // Sequence is cut off by the end of the buffer.
                return @intCast(u3, self.buffer.len - self.index);
            }
            const bytes = self.buffer[self.index .. self.index + length];
            switch (length) {
                2 => {
                    const value = std.unicode.utf8Decode2(bytes) catch return length;
                    if (value == 0x85) return length; // U+0085 (NEL)
                },
                3 => {
                    const value = std.unicode.utf8Decode3(bytes) catch return length;
                    if (value == 0x2028) return length; // U+2028 (LS)
                    if (value == 0x2029) return length; // U+2029 (PS)
                },
                4 => {
                    _ = std.unicode.utf8Decode4(bytes) catch return length;
                },
                else => unreachable,
            }
            self.index += length - 1;
            return 0;
        }
    }
};

test "tokenizer" {
    testTokenize("test", &[_]Token.Id{.Keyword_test});
}

test "tokenizer - unknown length pointer and then c pointer" {
    testTokenize(
        \\[*]u8
        \\[*c]u8
    , &[_]Token.Id{
        .LBracket,
        .Asterisk,
        .RBracket,
        .Identifier,
        .LBracket,
        .Asterisk,
        .Identifier,
        .RBracket,
        .Identifier,
    });
}

test "tokenizer - char literal with hex escape" {
    testTokenize(
        \\'\x1b'
    , &[_]Token.Id{.CharLiteral});
    testTokenize(
        \\'\x1'
    , &[_]Token.Id{ .Invalid, .Invalid });
}

test "tokenizer - char literal with unicode escapes" {
    // Valid unicode escapes
    testTokenize(
        \\'\u{3}'
    , &[_]Token.Id{.CharLiteral});
    testTokenize(
        \\'\u{01}'
    , &[_]Token.Id{.CharLiteral});
    testTokenize(
        \\'\u{2a}'
    , &[_]Token.Id{.CharLiteral});
    testTokenize(
        \\'\u{3f9}'
    , &[_]Token.Id{.CharLiteral});
    testTokenize(
        \\'\u{6E09aBc1523}'
    , &[_]Token.Id{.CharLiteral});
    testTokenize(
        \\"\u{440}"
    , &[_]Token.Id{.StringLiteral});

    // Invalid unicode escapes
    testTokenize(
        \\'\u'
    , &[_]Token.Id{.Invalid});
    testTokenize(
        \\'\u{{'
    , &[_]Token.Id{ .Invalid, .Invalid });
    testTokenize(
        \\'\u{}'
    , &[_]Token.Id{ .Invalid, .Invalid });
    testTokenize(
        \\'\u{s}'
    , &[_]Token.Id{ .Invalid, .Invalid });
    testTokenize(
        \\'\u{2z}'
    , &[_]Token.Id{ .Invalid, .Invalid });
    testTokenize(
        \\'\u{4a'
    , &[_]Token.Id{.Invalid});

    // Test old-style unicode literals
    testTokenize(
        \\'\u0333'
    , &[_]Token.Id{ .Invalid, .Invalid });
    testTokenize(
        \\'\U0333'
    , &[_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
}

test "tokenizer - char literal with unicode code point" {
    testTokenize(
        \\'💩'
    , &[_]Token.Id{.CharLiteral});
}

test "tokenizer - float literal e exponent" {
    testTokenize("a = 4.94065645841246544177e-324;\n", &[_]Token.Id{
        .Identifier,
        .Equal,
        .FloatLiteral,
        .Semicolon,
    });
}

test "tokenizer - float literal p exponent" {
    testTokenize("a = 0x1.a827999fcef32p+1022;\n", &[_]Token.Id{
        .Identifier,
        .Equal,
        .FloatLiteral,
        .Semicolon,
    });
}

test "tokenizer - chars" {
    testTokenize("'c'", &[_]Token.Id{.CharLiteral});
}

test "tokenizer - invalid token characters" {
    testTokenize("#", &[_]Token.Id{.Invalid});
    testTokenize("`", &[_]Token.Id{.Invalid});
    testTokenize("'c", &[_]Token.Id{.Invalid});
    testTokenize("'", &[_]Token.Id{.Invalid});
    testTokenize("''", &[_]Token.Id{ .Invalid, .Invalid });
}

test "tokenizer - invalid literal/comment characters" {
    testTokenize("\"\x00\"", &[_]Token.Id{
        .StringLiteral,
        .Invalid,
    });
    testTokenize("//\x00", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
    testTokenize("//\x1f", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
    testTokenize("//\x7f", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
}

test "tokenizer - utf8" {
    testTokenize("//\xc2\x80", &[_]Token.Id{.LineComment});
    testTokenize("//\xf4\x8f\xbf\xbf", &[_]Token.Id{.LineComment});
}

test "tokenizer - invalid utf8" {
    testTokenize("//\x80", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
    testTokenize("//\xbf", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
    testTokenize("//\xf8", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
    testTokenize("//\xff", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
    testTokenize("//\xc2\xc0", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
    testTokenize("//\xe0", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
    testTokenize("//\xf0", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
    testTokenize("//\xf0\x90\x80\xc0", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
}

test "tokenizer - illegal unicode codepoints" {
    // unicode newline characters: U+0085, U+2028, U+2029
    testTokenize("//\xc2\x84", &[_]Token.Id{.LineComment});
    testTokenize("//\xc2\x85", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
    testTokenize("//\xc2\x86", &[_]Token.Id{.LineComment});
    testTokenize("//\xe2\x80\xa7", &[_]Token.Id{.LineComment});
    testTokenize("//\xe2\x80\xa8", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
    testTokenize("//\xe2\x80\xa9", &[_]Token.Id{
        .LineComment,
        .Invalid,
    });
    testTokenize("//\xe2\x80\xaa", &[_]Token.Id{.LineComment});
}

test "tokenizer - string identifier and builtin fns" {
    testTokenize(
        \\const @"if" = @import("std");
    , &[_]Token.Id{
        .Keyword_const,
        .Identifier,
        .Equal,
        .Builtin,
        .LParen,
        .StringLiteral,
        .RParen,
        .Semicolon,
    });
}

test "tokenizer - multiline string literal with literal tab" {
1691 testTokenize( 1692 \\\\foo bar 1693 , &[_]Token.Id{ 1694 .MultilineStringLiteralLine, 1695 }); 1696 } 1697 1698 test "tokenizer - comments with literal tab" { 1699 testTokenize( 1700 \\//foo bar 1701 \\//!foo bar 1702 \\///foo bar 1703 \\// foo 1704 \\/// foo 1705 \\/// /foo 1706 , &[_]Token.Id{ 1707 .LineComment, 1708 .ContainerDocComment, 1709 .DocComment, 1710 .LineComment, 1711 .DocComment, 1712 .DocComment, 1713 }); 1714 } 1715 1716 test "tokenizer - pipe and then invalid" { 1717 testTokenize("||=", &[_]Token.Id{ 1718 .PipePipe, 1719 .Equal, 1720 }); 1721 } 1722 1723 test "tokenizer - line comment and doc comment" { 1724 testTokenize("//", &[_]Token.Id{.LineComment}); 1725 testTokenize("// a / b", &[_]Token.Id{.LineComment}); 1726 testTokenize("// /", &[_]Token.Id{.LineComment}); 1727 testTokenize("/// a", &[_]Token.Id{.DocComment}); 1728 testTokenize("///", &[_]Token.Id{.DocComment}); 1729 testTokenize("////", &[_]Token.Id{.LineComment}); 1730 testTokenize("//!", &[_]Token.Id{.ContainerDocComment}); 1731 testTokenize("//!!", &[_]Token.Id{.ContainerDocComment}); 1732 } 1733 1734 test "tokenizer - line comment followed by identifier" { 1735 testTokenize( 1736 \\ Unexpected, 1737 \\ // another 1738 \\ Another, 1739 , &[_]Token.Id{ 1740 .Identifier, 1741 .Comma, 1742 .LineComment, 1743 .Identifier, 1744 .Comma, 1745 }); 1746 } 1747 1748 test "tokenizer - UTF-8 BOM is recognized and skipped" { 1749 testTokenize("\xEF\xBB\xBFa;\n", &[_]Token.Id{ 1750 .Identifier, 1751 .Semicolon, 1752 }); 1753 } 1754 1755 test "correctly parse pointer assignment" { 1756 testTokenize("b.*=3;\n", &[_]Token.Id{ 1757 .Identifier, 1758 .PeriodAsterisk, 1759 .Equal, 1760 .IntegerLiteral, 1761 .Semicolon, 1762 }); 1763 } 1764 1765 test "tokenizer - range literals" { 1766 testTokenize("0...9", &[_]Token.Id{ .IntegerLiteral, .Ellipsis3, .IntegerLiteral }); 1767 testTokenize("'0'...'9'", &[_]Token.Id{ .CharLiteral, .Ellipsis3, .CharLiteral }); 1768 testTokenize("0x00...0x09", 
&[_]Token.Id{ .IntegerLiteral, .Ellipsis3, .IntegerLiteral }); 1769 testTokenize("0b00...0b11", &[_]Token.Id{ .IntegerLiteral, .Ellipsis3, .IntegerLiteral }); 1770 testTokenize("0o00...0o11", &[_]Token.Id{ .IntegerLiteral, .Ellipsis3, .IntegerLiteral }); 1771 } 1772 1773 test "tokenizer - number literals decimal" { 1774 testTokenize("0", &[_]Token.Id{.IntegerLiteral}); 1775 testTokenize("1", &[_]Token.Id{.IntegerLiteral}); 1776 testTokenize("2", &[_]Token.Id{.IntegerLiteral}); 1777 testTokenize("3", &[_]Token.Id{.IntegerLiteral}); 1778 testTokenize("4", &[_]Token.Id{.IntegerLiteral}); 1779 testTokenize("5", &[_]Token.Id{.IntegerLiteral}); 1780 testTokenize("6", &[_]Token.Id{.IntegerLiteral}); 1781 testTokenize("7", &[_]Token.Id{.IntegerLiteral}); 1782 testTokenize("8", &[_]Token.Id{.IntegerLiteral}); 1783 testTokenize("9", &[_]Token.Id{.IntegerLiteral}); 1784 testTokenize("1..", &[_]Token.Id{ .IntegerLiteral, .Ellipsis2 }); 1785 testTokenize("0a", &[_]Token.Id{ .Invalid, .Identifier }); 1786 testTokenize("9b", &[_]Token.Id{ .Invalid, .Identifier }); 1787 testTokenize("1z", &[_]Token.Id{ .Invalid, .Identifier }); 1788 testTokenize("1z_1", &[_]Token.Id{ .Invalid, .Identifier }); 1789 testTokenize("9z3", &[_]Token.Id{ .Invalid, .Identifier }); 1790 1791 testTokenize("0_0", &[_]Token.Id{.IntegerLiteral}); 1792 testTokenize("0001", &[_]Token.Id{.IntegerLiteral}); 1793 testTokenize("01234567890", &[_]Token.Id{.IntegerLiteral}); 1794 testTokenize("012_345_6789_0", &[_]Token.Id{.IntegerLiteral}); 1795 testTokenize("0_1_2_3_4_5_6_7_8_9_0", &[_]Token.Id{.IntegerLiteral}); 1796 1797 testTokenize("00_", &[_]Token.Id{.Invalid}); 1798 testTokenize("0_0_", &[_]Token.Id{.Invalid}); 1799 testTokenize("0__0", &[_]Token.Id{ .Invalid, .Identifier }); 1800 testTokenize("0_0f", &[_]Token.Id{ .Invalid, .Identifier }); 1801 testTokenize("0_0_f", &[_]Token.Id{ .Invalid, .Identifier }); 1802 testTokenize("0_0_f_00", &[_]Token.Id{ .Invalid, .Identifier }); 1803 testTokenize("1_,", 
&[_]Token.Id{ .Invalid, .Comma }); 1804 1805 testTokenize("1.", &[_]Token.Id{.FloatLiteral}); 1806 testTokenize("0.0", &[_]Token.Id{.FloatLiteral}); 1807 testTokenize("1.0", &[_]Token.Id{.FloatLiteral}); 1808 testTokenize("10.0", &[_]Token.Id{.FloatLiteral}); 1809 testTokenize("0e0", &[_]Token.Id{.FloatLiteral}); 1810 testTokenize("1e0", &[_]Token.Id{.FloatLiteral}); 1811 testTokenize("1e100", &[_]Token.Id{.FloatLiteral}); 1812 testTokenize("1.e100", &[_]Token.Id{.FloatLiteral}); 1813 testTokenize("1.0e100", &[_]Token.Id{.FloatLiteral}); 1814 testTokenize("1.0e+100", &[_]Token.Id{.FloatLiteral}); 1815 testTokenize("1.0e-100", &[_]Token.Id{.FloatLiteral}); 1816 testTokenize("1_0_0_0.0_0_0_0_0_1e1_0_0_0", &[_]Token.Id{.FloatLiteral}); 1817 testTokenize("1.+", &[_]Token.Id{ .FloatLiteral, .Plus }); 1818 1819 testTokenize("1e", &[_]Token.Id{.Invalid}); 1820 testTokenize("1.0e1f0", &[_]Token.Id{ .Invalid, .Identifier }); 1821 testTokenize("1.0p100", &[_]Token.Id{ .Invalid, .Identifier }); 1822 testTokenize("1.0p-100", &[_]Token.Id{ .Invalid, .Identifier, .Minus, .IntegerLiteral }); 1823 testTokenize("1.0p1f0", &[_]Token.Id{ .Invalid, .Identifier }); 1824 testTokenize("1.0_,", &[_]Token.Id{ .Invalid, .Comma }); 1825 testTokenize("1_.0", &[_]Token.Id{ .Invalid, .Period, .IntegerLiteral }); 1826 testTokenize("1._", &[_]Token.Id{ .Invalid, .Identifier }); 1827 testTokenize("1.a", &[_]Token.Id{ .Invalid, .Identifier }); 1828 testTokenize("1.z", &[_]Token.Id{ .Invalid, .Identifier }); 1829 testTokenize("1._0", &[_]Token.Id{ .Invalid, .Identifier }); 1830 testTokenize("1._+", &[_]Token.Id{ .Invalid, .Identifier, .Plus }); 1831 testTokenize("1._e", &[_]Token.Id{ .Invalid, .Identifier }); 1832 testTokenize("1.0e", &[_]Token.Id{.Invalid}); 1833 testTokenize("1.0e,", &[_]Token.Id{ .Invalid, .Comma }); 1834 testTokenize("1.0e_", &[_]Token.Id{ .Invalid, .Identifier }); 1835 testTokenize("1.0e+_", &[_]Token.Id{ .Invalid, .Identifier }); 1836 testTokenize("1.0e-_", &[_]Token.Id{ 
.Invalid, .Identifier }); 1837 testTokenize("1.0e0_+", &[_]Token.Id{ .Invalid, .Plus }); 1838 } 1839 1840 test "tokenizer - number literals binary" { 1841 testTokenize("0b0", &[_]Token.Id{.IntegerLiteral}); 1842 testTokenize("0b1", &[_]Token.Id{.IntegerLiteral}); 1843 testTokenize("0b2", &[_]Token.Id{ .Invalid, .IntegerLiteral }); 1844 testTokenize("0b3", &[_]Token.Id{ .Invalid, .IntegerLiteral }); 1845 testTokenize("0b4", &[_]Token.Id{ .Invalid, .IntegerLiteral }); 1846 testTokenize("0b5", &[_]Token.Id{ .Invalid, .IntegerLiteral }); 1847 testTokenize("0b6", &[_]Token.Id{ .Invalid, .IntegerLiteral }); 1848 testTokenize("0b7", &[_]Token.Id{ .Invalid, .IntegerLiteral }); 1849 testTokenize("0b8", &[_]Token.Id{ .Invalid, .IntegerLiteral }); 1850 testTokenize("0b9", &[_]Token.Id{ .Invalid, .IntegerLiteral }); 1851 testTokenize("0ba", &[_]Token.Id{ .Invalid, .Identifier }); 1852 testTokenize("0bb", &[_]Token.Id{ .Invalid, .Identifier }); 1853 testTokenize("0bc", &[_]Token.Id{ .Invalid, .Identifier }); 1854 testTokenize("0bd", &[_]Token.Id{ .Invalid, .Identifier }); 1855 testTokenize("0be", &[_]Token.Id{ .Invalid, .Identifier }); 1856 testTokenize("0bf", &[_]Token.Id{ .Invalid, .Identifier }); 1857 testTokenize("0bz", &[_]Token.Id{ .Invalid, .Identifier }); 1858 1859 testTokenize("0b0000_0000", &[_]Token.Id{.IntegerLiteral}); 1860 testTokenize("0b1111_1111", &[_]Token.Id{.IntegerLiteral}); 1861 testTokenize("0b10_10_10_10", &[_]Token.Id{.IntegerLiteral}); 1862 testTokenize("0b0_1_0_1_0_1_0_1", &[_]Token.Id{.IntegerLiteral}); 1863 testTokenize("0b1.", &[_]Token.Id{ .IntegerLiteral, .Period }); 1864 testTokenize("0b1.0", &[_]Token.Id{ .IntegerLiteral, .Period, .IntegerLiteral }); 1865 1866 testTokenize("0B0", &[_]Token.Id{ .Invalid, .Identifier }); 1867 testTokenize("0b_", &[_]Token.Id{ .Invalid, .Identifier }); 1868 testTokenize("0b_0", &[_]Token.Id{ .Invalid, .Identifier }); 1869 testTokenize("0b1_", &[_]Token.Id{.Invalid}); 1870 testTokenize("0b0__1", &[_]Token.Id{ 
.Invalid, .Identifier }); 1871 testTokenize("0b0_1_", &[_]Token.Id{.Invalid}); 1872 testTokenize("0b1e", &[_]Token.Id{ .Invalid, .Identifier }); 1873 testTokenize("0b1p", &[_]Token.Id{ .Invalid, .Identifier }); 1874 testTokenize("0b1e0", &[_]Token.Id{ .Invalid, .Identifier }); 1875 testTokenize("0b1p0", &[_]Token.Id{ .Invalid, .Identifier }); 1876 testTokenize("0b1_,", &[_]Token.Id{ .Invalid, .Comma }); 1877 } 1878 1879 test "tokenizer - number literals octal" { 1880 testTokenize("0o0", &[_]Token.Id{.IntegerLiteral}); 1881 testTokenize("0o1", &[_]Token.Id{.IntegerLiteral}); 1882 testTokenize("0o2", &[_]Token.Id{.IntegerLiteral}); 1883 testTokenize("0o3", &[_]Token.Id{.IntegerLiteral}); 1884 testTokenize("0o4", &[_]Token.Id{.IntegerLiteral}); 1885 testTokenize("0o5", &[_]Token.Id{.IntegerLiteral}); 1886 testTokenize("0o6", &[_]Token.Id{.IntegerLiteral}); 1887 testTokenize("0o7", &[_]Token.Id{.IntegerLiteral}); 1888 testTokenize("0o8", &[_]Token.Id{ .Invalid, .IntegerLiteral }); 1889 testTokenize("0o9", &[_]Token.Id{ .Invalid, .IntegerLiteral }); 1890 testTokenize("0oa", &[_]Token.Id{ .Invalid, .Identifier }); 1891 testTokenize("0ob", &[_]Token.Id{ .Invalid, .Identifier }); 1892 testTokenize("0oc", &[_]Token.Id{ .Invalid, .Identifier }); 1893 testTokenize("0od", &[_]Token.Id{ .Invalid, .Identifier }); 1894 testTokenize("0oe", &[_]Token.Id{ .Invalid, .Identifier }); 1895 testTokenize("0of", &[_]Token.Id{ .Invalid, .Identifier }); 1896 testTokenize("0oz", &[_]Token.Id{ .Invalid, .Identifier }); 1897 1898 testTokenize("0o01234567", &[_]Token.Id{.IntegerLiteral}); 1899 testTokenize("0o0123_4567", &[_]Token.Id{.IntegerLiteral}); 1900 testTokenize("0o01_23_45_67", &[_]Token.Id{.IntegerLiteral}); 1901 testTokenize("0o0_1_2_3_4_5_6_7", &[_]Token.Id{.IntegerLiteral}); 1902 testTokenize("0o7.", &[_]Token.Id{ .IntegerLiteral, .Period }); 1903 testTokenize("0o7.0", &[_]Token.Id{ .IntegerLiteral, .Period, .IntegerLiteral }); 1904 1905 testTokenize("0O0", &[_]Token.Id{ .Invalid, 
.Identifier }); 1906 testTokenize("0o_", &[_]Token.Id{ .Invalid, .Identifier }); 1907 testTokenize("0o_0", &[_]Token.Id{ .Invalid, .Identifier }); 1908 testTokenize("0o1_", &[_]Token.Id{.Invalid}); 1909 testTokenize("0o0__1", &[_]Token.Id{ .Invalid, .Identifier }); 1910 testTokenize("0o0_1_", &[_]Token.Id{.Invalid}); 1911 testTokenize("0o1e", &[_]Token.Id{ .Invalid, .Identifier }); 1912 testTokenize("0o1p", &[_]Token.Id{ .Invalid, .Identifier }); 1913 testTokenize("0o1e0", &[_]Token.Id{ .Invalid, .Identifier }); 1914 testTokenize("0o1p0", &[_]Token.Id{ .Invalid, .Identifier }); 1915 testTokenize("0o_,", &[_]Token.Id{ .Invalid, .Identifier, .Comma }); 1916 } 1917 1918 test "tokenizer - number literals hexadeciaml" { 1919 testTokenize("0x0", &[_]Token.Id{.IntegerLiteral}); 1920 testTokenize("0x1", &[_]Token.Id{.IntegerLiteral}); 1921 testTokenize("0x2", &[_]Token.Id{.IntegerLiteral}); 1922 testTokenize("0x3", &[_]Token.Id{.IntegerLiteral}); 1923 testTokenize("0x4", &[_]Token.Id{.IntegerLiteral}); 1924 testTokenize("0x5", &[_]Token.Id{.IntegerLiteral}); 1925 testTokenize("0x6", &[_]Token.Id{.IntegerLiteral}); 1926 testTokenize("0x7", &[_]Token.Id{.IntegerLiteral}); 1927 testTokenize("0x8", &[_]Token.Id{.IntegerLiteral}); 1928 testTokenize("0x9", &[_]Token.Id{.IntegerLiteral}); 1929 testTokenize("0xa", &[_]Token.Id{.IntegerLiteral}); 1930 testTokenize("0xb", &[_]Token.Id{.IntegerLiteral}); 1931 testTokenize("0xc", &[_]Token.Id{.IntegerLiteral}); 1932 testTokenize("0xd", &[_]Token.Id{.IntegerLiteral}); 1933 testTokenize("0xe", &[_]Token.Id{.IntegerLiteral}); 1934 testTokenize("0xf", &[_]Token.Id{.IntegerLiteral}); 1935 testTokenize("0xA", &[_]Token.Id{.IntegerLiteral}); 1936 testTokenize("0xB", &[_]Token.Id{.IntegerLiteral}); 1937 testTokenize("0xC", &[_]Token.Id{.IntegerLiteral}); 1938 testTokenize("0xD", &[_]Token.Id{.IntegerLiteral}); 1939 testTokenize("0xE", &[_]Token.Id{.IntegerLiteral}); 1940 testTokenize("0xF", &[_]Token.Id{.IntegerLiteral}); 1941 
testTokenize("0x0z", &[_]Token.Id{ .Invalid, .Identifier }); 1942 testTokenize("0xz", &[_]Token.Id{ .Invalid, .Identifier }); 1943 1944 testTokenize("0x0123456789ABCDEF", &[_]Token.Id{.IntegerLiteral}); 1945 testTokenize("0x0123_4567_89AB_CDEF", &[_]Token.Id{.IntegerLiteral}); 1946 testTokenize("0x01_23_45_67_89AB_CDE_F", &[_]Token.Id{.IntegerLiteral}); 1947 testTokenize("0x0_1_2_3_4_5_6_7_8_9_A_B_C_D_E_F", &[_]Token.Id{.IntegerLiteral}); 1948 1949 testTokenize("0X0", &[_]Token.Id{ .Invalid, .Identifier }); 1950 testTokenize("0x_", &[_]Token.Id{ .Invalid, .Identifier }); 1951 testTokenize("0x_1", &[_]Token.Id{ .Invalid, .Identifier }); 1952 testTokenize("0x1_", &[_]Token.Id{.Invalid}); 1953 testTokenize("0x0__1", &[_]Token.Id{ .Invalid, .Identifier }); 1954 testTokenize("0x0_1_", &[_]Token.Id{.Invalid}); 1955 testTokenize("0x_,", &[_]Token.Id{ .Invalid, .Identifier, .Comma }); 1956 1957 testTokenize("0x1.", &[_]Token.Id{.FloatLiteral}); 1958 testTokenize("0x1.0", &[_]Token.Id{.FloatLiteral}); 1959 testTokenize("0xF.", &[_]Token.Id{.FloatLiteral}); 1960 testTokenize("0xF.0", &[_]Token.Id{.FloatLiteral}); 1961 testTokenize("0xF.F", &[_]Token.Id{.FloatLiteral}); 1962 testTokenize("0xF.Fp0", &[_]Token.Id{.FloatLiteral}); 1963 testTokenize("0xF.FP0", &[_]Token.Id{.FloatLiteral}); 1964 testTokenize("0x1p0", &[_]Token.Id{.FloatLiteral}); 1965 testTokenize("0xfp0", &[_]Token.Id{.FloatLiteral}); 1966 testTokenize("0x1.+0xF.", &[_]Token.Id{ .FloatLiteral, .Plus, .FloatLiteral }); 1967 1968 testTokenize("0x0123456.789ABCDEF", &[_]Token.Id{.FloatLiteral}); 1969 testTokenize("0x0_123_456.789_ABC_DEF", &[_]Token.Id{.FloatLiteral}); 1970 testTokenize("0x0_1_2_3_4_5_6.7_8_9_A_B_C_D_E_F", &[_]Token.Id{.FloatLiteral}); 1971 testTokenize("0x0p0", &[_]Token.Id{.FloatLiteral}); 1972 testTokenize("0x0.0p0", &[_]Token.Id{.FloatLiteral}); 1973 testTokenize("0xff.ffp10", &[_]Token.Id{.FloatLiteral}); 1974 testTokenize("0xff.ffP10", &[_]Token.Id{.FloatLiteral}); 1975 
testTokenize("0xff.p10", &[_]Token.Id{.FloatLiteral}); 1976 testTokenize("0xffp10", &[_]Token.Id{.FloatLiteral}); 1977 testTokenize("0xff_ff.ff_ffp1_0_0_0", &[_]Token.Id{.FloatLiteral}); 1978 testTokenize("0xf_f_f_f.f_f_f_fp+1_000", &[_]Token.Id{.FloatLiteral}); 1979 testTokenize("0xf_f_f_f.f_f_f_fp-1_00_0", &[_]Token.Id{.FloatLiteral}); 1980 1981 testTokenize("0x1e", &[_]Token.Id{.IntegerLiteral}); 1982 testTokenize("0x1e0", &[_]Token.Id{.IntegerLiteral}); 1983 testTokenize("0x1p", &[_]Token.Id{.Invalid}); 1984 testTokenize("0xfp0z1", &[_]Token.Id{ .Invalid, .Identifier }); 1985 testTokenize("0xff.ffpff", &[_]Token.Id{ .Invalid, .Identifier }); 1986 testTokenize("0x0.p", &[_]Token.Id{.Invalid}); 1987 testTokenize("0x0.z", &[_]Token.Id{ .Invalid, .Identifier }); 1988 testTokenize("0x0._", &[_]Token.Id{ .Invalid, .Identifier }); 1989 testTokenize("0x0_.0", &[_]Token.Id{ .Invalid, .Period, .IntegerLiteral }); 1990 testTokenize("0x0_.0.0", &[_]Token.Id{ .Invalid, .Period, .FloatLiteral }); 1991 testTokenize("0x0._0", &[_]Token.Id{ .Invalid, .Identifier }); 1992 testTokenize("0x0.0_", &[_]Token.Id{.Invalid}); 1993 testTokenize("0x0_p0", &[_]Token.Id{ .Invalid, .Identifier }); 1994 testTokenize("0x0_.p0", &[_]Token.Id{ .Invalid, .Period, .Identifier }); 1995 testTokenize("0x0._p0", &[_]Token.Id{ .Invalid, .Identifier }); 1996 testTokenize("0x0.0_p0", &[_]Token.Id{ .Invalid, .Identifier }); 1997 testTokenize("0x0._0p0", &[_]Token.Id{ .Invalid, .Identifier }); 1998 testTokenize("0x0.0p_0", &[_]Token.Id{ .Invalid, .Identifier }); 1999 testTokenize("0x0.0p+_0", &[_]Token.Id{ .Invalid, .Identifier }); 2000 testTokenize("0x0.0p-_0", &[_]Token.Id{ .Invalid, .Identifier }); 2001 testTokenize("0x0.0p0_", &[_]Token.Id{ .Invalid, .Eof }); 2002 } 2003 2004 fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void { 2005 var tokenizer = Tokenizer.init(source); 2006 for (expected_tokens) |expected_token_id| { 2007 const token = tokenizer.next(); 2008 if (token.id != 
expected_token_id) { 2009 std.debug.panic("expected {}, found {}\n", .{ @tagName(expected_token_id), @tagName(token.id) }); 2010 } 2011 } 2012 const last_token = tokenizer.next(); 2013 std.testing.expect(last_token.id == .Eof); 2014 }