blob e71babe4 (45110B) - Raw
const std = @import("../index.zig");
const mem = std.mem;

/// A single lexical token: its kind plus the half-open byte range
/// `buffer[start..end]` of the source text it covers.
pub const Token = struct {
    id: Id,
    /// Byte offset of the first byte of the token.
    start: usize,
    /// Byte offset one past the last byte of the token.
    end: usize,

    /// Associates the spelling of a keyword with its token id.
    pub const Keyword = struct {
        bytes: []const u8,
        id: Id,
    };

    /// Table of every keyword recognized by the tokenizer.
    pub const keywords = []Keyword{
        Keyword{ .bytes = "align", .id = Id.Keyword_align },
        Keyword{ .bytes = "and", .id = Id.Keyword_and },
        Keyword{ .bytes = "anyerror", .id = Id.Keyword_anyerror },
        Keyword{ .bytes = "asm", .id = Id.Keyword_asm },
        Keyword{ .bytes = "async", .id = Id.Keyword_async },
        Keyword{ .bytes = "await", .id = Id.Keyword_await },
        Keyword{ .bytes = "break", .id = Id.Keyword_break },
        Keyword{ .bytes = "catch", .id = Id.Keyword_catch },
        Keyword{ .bytes = "cancel", .id = Id.Keyword_cancel },
        Keyword{ .bytes = "comptime", .id = Id.Keyword_comptime },
        Keyword{ .bytes = "const", .id = Id.Keyword_const },
        Keyword{ .bytes = "continue", .id = Id.Keyword_continue },
        Keyword{ .bytes = "defer", .id = Id.Keyword_defer },
        Keyword{ .bytes = "else", .id = Id.Keyword_else },
        Keyword{ .bytes = "enum", .id = Id.Keyword_enum },
        Keyword{ .bytes = "errdefer", .id = Id.Keyword_errdefer },
        Keyword{ .bytes = "error", .id = Id.Keyword_error },
        Keyword{ .bytes = "export", .id = Id.Keyword_export },
        Keyword{ .bytes = "extern", .id = Id.Keyword_extern },
        Keyword{ .bytes = "false", .id = Id.Keyword_false },
        Keyword{ .bytes = "fn", .id = Id.Keyword_fn },
        Keyword{ .bytes = "for", .id = Id.Keyword_for },
        Keyword{ .bytes = "if", .id = Id.Keyword_if },
        Keyword{ .bytes = "inline", .id = Id.Keyword_inline },
        Keyword{ .bytes = "nakedcc", .id = Id.Keyword_nakedcc },
        Keyword{ .bytes = "noalias", .id = Id.Keyword_noalias },
        Keyword{ .bytes = "null", .id = Id.Keyword_null },
        Keyword{ .bytes = "or", .id = Id.Keyword_or },
        Keyword{ .bytes = "orelse", .id = Id.Keyword_orelse },
        Keyword{ .bytes = "packed", .id = Id.Keyword_packed },
        Keyword{ .bytes = "promise", .id = Id.Keyword_promise },
        Keyword{ .bytes = "pub", .id = Id.Keyword_pub },
        Keyword{ .bytes = "resume", .id = Id.Keyword_resume },
        Keyword{ .bytes = "return", .id = Id.Keyword_return },
        Keyword{ .bytes = "linksection", .id = Id.Keyword_linksection },
        Keyword{ .bytes = "stdcallcc", .id = Id.Keyword_stdcallcc },
        Keyword{ .bytes = "struct", .id = Id.Keyword_struct },
        Keyword{ .bytes = "suspend", .id = Id.Keyword_suspend },
        Keyword{ .bytes = "switch", .id = Id.Keyword_switch },
        Keyword{ .bytes = "test", .id = Id.Keyword_test },
        Keyword{ .bytes = "this", .id = Id.Keyword_this },
        Keyword{ .bytes = "true", .id = Id.Keyword_true },
        Keyword{ .bytes = "try", .id = Id.Keyword_try },
        Keyword{ .bytes = "undefined", .id = Id.Keyword_undefined },
        Keyword{ .bytes = "union", .id = Id.Keyword_union },
        Keyword{ .bytes = "unreachable", .id = Id.Keyword_unreachable },
        Keyword{ .bytes = "use", .id = Id.Keyword_use },
        Keyword{ .bytes = "var", .id = Id.Keyword_var },
        Keyword{ .bytes = "volatile", .id = Id.Keyword_volatile },
        Keyword{ .bytes = "while", .id = Id.Keyword_while },
    };

    /// Returns the keyword id whose spelling is exactly `bytes`, or null when
    /// `bytes` is not a keyword. Linear scan over `keywords`.
    // TODO perfect hash at comptime
    fn getKeyword(bytes: []const u8) ?Id {
        for (keywords) |kw| {
            if (mem.eql(u8, kw.bytes, bytes)) {
                return kw.id;
            }
        }
        return null;
    }

    /// Every kind of token the tokenizer can produce.
    pub const Id = enum {
        Invalid,
        Identifier,
        StringLiteral,
        MultilineStringLiteralLine,
        CharLiteral,
        Eof,
        Builtin,
        Bang,
        Pipe,
        PipePipe,
        PipeEqual,
        Equal,
        EqualEqual,
        EqualAngleBracketRight,
        BangEqual,
        LParen,
        RParen,
        Semicolon,
        Percent,
        PercentEqual,
        LBrace,
        RBrace,
        LBracket,
        RBracket,
        Period,
        Ellipsis2,
        Ellipsis3,
        Caret,
        CaretEqual,
        Plus,
        PlusPlus,
        PlusEqual,
        PlusPercent,
        PlusPercentEqual,
        Minus,
        MinusEqual,
        MinusPercent,
        MinusPercentEqual,
        Asterisk,
        AsteriskEqual,
        AsteriskAsterisk,
        AsteriskPercent,
        AsteriskPercentEqual,
        Arrow,
        Colon,
        Slash,
        SlashEqual,
        Comma,
        Ampersand,
        AmpersandEqual,
        QuestionMark,
        AngleBracketLeft,
        AngleBracketLeftEqual,
        AngleBracketAngleBracketLeft,
        AngleBracketAngleBracketLeftEqual,
        AngleBracketRight,
        AngleBracketRightEqual,
        AngleBracketAngleBracketRight,
        AngleBracketAngleBracketRightEqual,
        Tilde,
        IntegerLiteral,
        FloatLiteral,
        LineComment,
        DocComment,
        BracketStarBracket,
        ShebangLine,
        Keyword_align,
        Keyword_and,
        Keyword_anyerror,
        Keyword_asm,
        Keyword_async,
        Keyword_await,
        Keyword_break,
        Keyword_cancel,
        Keyword_catch,
        Keyword_comptime,
        Keyword_const,
        Keyword_continue,
        Keyword_defer,
        Keyword_else,
        Keyword_enum,
        Keyword_errdefer,
        Keyword_error,
        Keyword_export,
        Keyword_extern,
        Keyword_false,
        Keyword_fn,
        Keyword_for,
        Keyword_if,
        Keyword_inline,
        Keyword_nakedcc,
        Keyword_noalias,
        Keyword_null,
        Keyword_or,
        Keyword_orelse,
        Keyword_packed,
        Keyword_promise,
        Keyword_pub,
        Keyword_resume,
        Keyword_return,
        Keyword_linksection,
        Keyword_stdcallcc,
        Keyword_struct,
        Keyword_suspend,
        Keyword_switch,
        Keyword_test,
        Keyword_this,
        Keyword_true,
        Keyword_try,
        Keyword_undefined,
        Keyword_union,
        Keyword_unreachable,
        Keyword_use,
        Keyword_var,
        Keyword_volatile,
        Keyword_while,
    };
};

/// Splits a source buffer into `Token`s, one per call to `next`.
pub const Tokenizer = struct {
    buffer: []const u8,
    /// Byte offset of the next unread byte in `buffer`.
    index: usize,
    /// A token queued to be returned by a later call to `next`: either the
    /// shebang line found by `init`, or the first invalid character seen
    /// inside a string literal or comment.
    pending_invalid_token: ?Token,

    /// For debugging purposes
    pub fn dump(self: *Tokenizer, token: *const Token) void {
        std.debug.warn("{} \"{}\"\n", @tagName(token.id), self.buffer[token.start..token.end]);
    }

    /// Creates a tokenizer over `buffer`. When the buffer starts with "#!",
    /// the first line (including its newline, if any) is queued as a
    /// `ShebangLine` token and tokenizing resumes right after it.
    pub fn init(buffer: []const u8) Tokenizer {
        if (mem.startsWith(u8, buffer, "#!")) {
            const src_start = if (mem.indexOfScalar(u8, buffer, '\n')) |i| i + 1 else buffer.len;
            return Tokenizer{
                .buffer = buffer,
                .index = src_start,
                .pending_invalid_token = Token{
                    .id = Token.Id.ShebangLine,
                    .start = 0,
                    .end = src_start,
                },
            };
        } else {
            return Tokenizer{
                .buffer = buffer,
                .index = 0,
                .pending_invalid_token = null,
            };
        }
    }

    /// States of the hand-written state machine driven by `next`.
    const State = enum {
        Start,
        Identifier,
        Builtin,
        C,
        StringLiteral,
        StringLiteralBackslash,
        MultilineStringLiteralLine,
        CharLiteral,
        CharLiteralBackslash,
        CharLiteralEscape1,
        CharLiteralEscape2,
        CharLiteralEnd,
        Backslash,
        Equal,
        Bang,
        Pipe,
        Minus,
        MinusPercent,
        Asterisk,
        AsteriskPercent,
        Slash,
        LineCommentStart,
        LineComment,
        DocCommentStart,
        DocComment,
        Zero,
        IntegerLiteral,
        IntegerLiteralWithRadix,
        IntegerLiteralWithRadixHex,
        NumberDot,
        NumberDotHex,
        FloatFraction,
        FloatFractionHex,
        FloatExponentUnsigned,
        FloatExponentUnsignedHex,
        FloatExponentNumber,
        FloatExponentNumberHex,
        Ampersand,
        Caret,
        Percent,
        Plus,
        PlusPercent,
        AngleBracketLeft,
        AngleBracketAngleBracketLeft,
        AngleBracketRight,
        AngleBracketAngleBracketRight,
        Period,
        Period2,
        SawAtSign,
        LBracket,
        LBracketStar,
    };

    /// Returns the next token. A queued `pending_invalid_token` is returned
    /// (and cleared) before any new input is consumed. Once the buffer is
    /// exhausted an `Eof` token is returned.
    ///
    /// Inside the loop, a prong that finishes a token *including* the current
    /// byte does `self.index += 1; break;` because `break` skips the loop's
    /// own increment; a prong that finishes a token *before* the current byte
    /// just breaks.
    pub fn next(self: *Tokenizer) Token {
        if (self.pending_invalid_token) |token| {
            self.pending_invalid_token = null;
            return token;
        }
        var state = State.Start;
        var result = Token{
            .id = Token.Id.Eof,
            .start = self.index,
            .end = undefined,
        };
        while (self.index < self.buffer.len) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
                State.Start => switch (c) {
                    // Skip whitespace by sliding the token start forward.
                    ' ' => {
                        result.start = self.index + 1;
                    },
                    '\n' => {
                        result.start = self.index + 1;
                    },
                    // 'c' may begin an identifier, a c"string" or a c\\multiline string.
                    'c' => {
                        state = State.C;
                        result.id = Token.Id.Identifier;
                    },
                    '"' => {
                        state = State.StringLiteral;
                        result.id = Token.Id.StringLiteral;
                    },
                    '\'' => {
                        state = State.CharLiteral;
                    },
                    'a'...'b', 'd'...'z', 'A'...'Z', '_' => {
                        state = State.Identifier;
                        result.id = Token.Id.Identifier;
                    },
                    '@' => {
                        state = State.SawAtSign;
                    },
                    '=' => {
                        state = State.Equal;
                    },
                    '!' => {
                        state = State.Bang;
                    },
                    '|' => {
                        state = State.Pipe;
                    },
                    '(' => {
                        result.id = Token.Id.LParen;
                        self.index += 1;
                        break;
                    },
                    ')' => {
                        result.id = Token.Id.RParen;
                        self.index += 1;
                        break;
                    },
                    '[' => {
                        state = State.LBracket;
                    },
                    ']' => {
                        result.id = Token.Id.RBracket;
                        self.index += 1;
                        break;
                    },
                    ';' => {
                        result.id = Token.Id.Semicolon;
                        self.index += 1;
                        break;
                    },
                    ',' => {
                        result.id = Token.Id.Comma;
                        self.index += 1;
                        break;
                    },
                    '?' => {
                        result.id = Token.Id.QuestionMark;
                        self.index += 1;
                        break;
                    },
                    ':' => {
                        result.id = Token.Id.Colon;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = State.Percent;
                    },
                    '*' => {
                        state = State.Asterisk;
                    },
                    '+' => {
                        state = State.Plus;
                    },
                    '<' => {
                        state = State.AngleBracketLeft;
                    },
                    '>' => {
                        state = State.AngleBracketRight;
                    },
                    '^' => {
                        state = State.Caret;
                    },
                    '\\' => {
                        state = State.Backslash;
                        result.id = Token.Id.MultilineStringLiteralLine;
                    },
                    '{' => {
                        result.id = Token.Id.LBrace;
                        self.index += 1;
                        break;
                    },
                    '}' => {
                        result.id = Token.Id.RBrace;
                        self.index += 1;
                        break;
                    },
                    '~' => {
                        result.id = Token.Id.Tilde;
                        self.index += 1;
                        break;
                    },
                    '.' => {
                        state = State.Period;
                    },
                    '-' => {
                        state = State.Minus;
                    },
                    '/' => {
                        state = State.Slash;
                    },
                    '&' => {
                        state = State.Ampersand;
                    },
                    '0' => {
                        state = State.Zero;
                        result.id = Token.Id.IntegerLiteral;
                    },
                    '1'...'9' => {
                        state = State.IntegerLiteral;
                        result.id = Token.Id.IntegerLiteral;
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        self.index += 1;
                        break;
                    },
                },

                State.SawAtSign => switch (c) {
                    // @"..." is an identifier spelled as a string.
                    '"' => {
                        result.id = Token.Id.Identifier;
                        state = State.StringLiteral;
                    },
                    else => {
                        // reinterpret as a builtin
                        self.index -= 1;
                        state = State.Builtin;
                        result.id = Token.Id.Builtin;
                    },
                },

                State.LBracket => switch (c) {
                    '*' => {
                        state = State.LBracketStar;
                    },
                    else => {
                        result.id = Token.Id.LBracket;
                        break;
                    },
                },

                State.LBracketStar => switch (c) {
                    ']' => {
                        result.id = Token.Id.BracketStarBracket;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
                },

                State.Ampersand => switch (c) {
                    '=' => {
                        result.id = Token.Id.AmpersandEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Ampersand;
                        break;
                    },
                },

                State.Asterisk => switch (c) {
                    '=' => {
                        result.id = Token.Id.AsteriskEqual;
                        self.index += 1;
                        break;
                    },
                    '*' => {
                        result.id = Token.Id.AsteriskAsterisk;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = State.AsteriskPercent;
                    },
                    else => {
                        result.id = Token.Id.Asterisk;
                        break;
                    },
                },

                State.AsteriskPercent => switch (c) {
                    '=' => {
                        result.id = Token.Id.AsteriskPercentEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.AsteriskPercent;
                        break;
                    },
                },

                State.Percent => switch (c) {
                    '=' => {
                        result.id = Token.Id.PercentEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Percent;
                        break;
                    },
                },

                State.Plus => switch (c) {
                    '=' => {
                        result.id = Token.Id.PlusEqual;
                        self.index += 1;
                        break;
                    },
                    '+' => {
                        result.id = Token.Id.PlusPlus;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = State.PlusPercent;
                    },
                    else => {
                        result.id = Token.Id.Plus;
                        break;
                    },
                },

                State.PlusPercent => switch (c) {
                    '=' => {
                        result.id = Token.Id.PlusPercentEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.PlusPercent;
                        break;
                    },
                },

                State.Caret => switch (c) {
                    '=' => {
                        result.id = Token.Id.CaretEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Caret;
                        break;
                    },
                },

                State.Identifier => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                    else => {
                        if (Token.getKeyword(self.buffer[result.start..self.index])) |id| {
                            result.id = id;
                        }
                        break;
                    },
                },
                State.Builtin => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                    else => break,
                },
                State.Backslash => switch (c) {
                    '\\' => {
                        state = State.MultilineStringLiteralLine;
                    },
                    else => break,
                },
                State.C => switch (c) {
                    '\\' => {
                        state = State.Backslash;
                        result.id = Token.Id.MultilineStringLiteralLine;
                    },
                    '"' => {
                        state = State.StringLiteral;
                        result.id = Token.Id.StringLiteral;
                    },
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {
                        state = State.Identifier;
                    },
                    else => break,
                },
                State.StringLiteral => switch (c) {
                    '\\' => {
                        state = State.StringLiteralBackslash;
                    },
                    '"' => {
                        self.index += 1;
                        break;
                    },
                    '\n' => break, // Look for this error later.
                    else => self.checkLiteralCharacter(),
                },

                State.StringLiteralBackslash => switch (c) {
                    '\n' => break, // Look for this error later.
                    else => {
                        state = State.StringLiteral;
                    },
                },

                State.CharLiteral => switch (c) {
                    '\\' => {
                        state = State.CharLiteralBackslash;
                    },
                    // '' is an empty character literal.
                    '\'' => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
                    else => {
                        // ascii control codes are not allowed in a char literal
                        if (c < 0x20 or c == 0x7f) {
                            result.id = Token.Id.Invalid;
                            break;
                        }

                        state = State.CharLiteralEnd;
                    },
                },

                State.CharLiteralBackslash => switch (c) {
                    '\n' => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
                    'x' => {
                        state = State.CharLiteralEscape1;
                    },
                    else => {
                        state = State.CharLiteralEnd;
                    },
                },

                // The two digits of a \xNN escape must be hexadecimal, so
                // only a-f / A-F are accepted as letters.
                State.CharLiteralEscape1 => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        state = State.CharLiteralEscape2;
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
                },

                State.CharLiteralEscape2 => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        state = State.CharLiteralEnd;
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
                },

                State.CharLiteralEnd => switch (c) {
                    '\'' => {
                        result.id = Token.Id.CharLiteral;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
                },

                State.MultilineStringLiteralLine => switch (c) {
                    // The terminating newline is part of the token.
                    '\n' => {
                        self.index += 1;
                        break;
                    },
                    else => self.checkLiteralCharacter(),
                },

                State.Bang => switch (c) {
                    '=' => {
                        result.id = Token.Id.BangEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Bang;
                        break;
                    },
                },

                State.Pipe => switch (c) {
                    '=' => {
                        result.id = Token.Id.PipeEqual;
                        self.index += 1;
                        break;
                    },
                    '|' => {
                        result.id = Token.Id.PipePipe;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Pipe;
                        break;
                    },
                },

                State.Equal => switch (c) {
                    '=' => {
                        result.id = Token.Id.EqualEqual;
                        self.index += 1;
                        break;
                    },
                    '>' => {
                        result.id = Token.Id.EqualAngleBracketRight;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Equal;
                        break;
                    },
                },

                State.Minus => switch (c) {
                    '>' => {
                        result.id = Token.Id.Arrow;
                        self.index += 1;
                        break;
                    },
                    '=' => {
                        result.id = Token.Id.MinusEqual;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = State.MinusPercent;
                    },
                    else => {
                        result.id = Token.Id.Minus;
                        break;
                    },
                },

                State.MinusPercent => switch (c) {
                    '=' => {
                        result.id = Token.Id.MinusPercentEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.MinusPercent;
                        break;
                    },
                },

                State.AngleBracketLeft => switch (c) {
                    '<' => {
                        state = State.AngleBracketAngleBracketLeft;
                    },
                    '=' => {
                        result.id = Token.Id.AngleBracketLeftEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.AngleBracketLeft;
                        break;
                    },
                },

                State.AngleBracketAngleBracketLeft => switch (c) {
                    '=' => {
                        result.id = Token.Id.AngleBracketAngleBracketLeftEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.AngleBracketAngleBracketLeft;
                        break;
                    },
                },

                State.AngleBracketRight => switch (c) {
                    '>' => {
                        state = State.AngleBracketAngleBracketRight;
                    },
                    '=' => {
                        result.id = Token.Id.AngleBracketRightEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.AngleBracketRight;
                        break;
                    },
                },

                State.AngleBracketAngleBracketRight => switch (c) {
                    '=' => {
                        result.id = Token.Id.AngleBracketAngleBracketRightEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.AngleBracketAngleBracketRight;
                        break;
                    },
                },

                State.Period => switch (c) {
                    '.' => {
                        state = State.Period2;
                    },
                    else => {
                        result.id = Token.Id.Period;
                        break;
                    },
                },

                State.Period2 => switch (c) {
                    '.' => {
                        result.id = Token.Id.Ellipsis3;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Ellipsis2;
                        break;
                    },
                },

                State.Slash => switch (c) {
                    '/' => {
                        state = State.LineCommentStart;
                        result.id = Token.Id.LineComment;
                    },
                    '=' => {
                        result.id = Token.Id.SlashEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Slash;
                        break;
                    },
                },
                State.LineCommentStart => switch (c) {
                    '/' => {
                        state = State.DocCommentStart;
                    },
                    '\n' => break,
                    else => {
                        state = State.LineComment;
                        self.checkLiteralCharacter();
                    },
                },
                State.DocCommentStart => switch (c) {
                    // Four or more slashes is an ordinary line comment.
                    '/' => {
                        state = State.LineComment;
                    },
                    '\n' => {
                        result.id = Token.Id.DocComment;
                        break;
                    },
                    else => {
                        state = State.DocComment;
                        result.id = Token.Id.DocComment;
                        self.checkLiteralCharacter();
                    },
                },
                State.LineComment, State.DocComment => switch (c) {
                    '\n' => break,
                    else => self.checkLiteralCharacter(),
                },
                State.Zero => switch (c) {
                    'b', 'o' => {
                        state = State.IntegerLiteralWithRadix;
                    },
                    'x' => {
                        state = State.IntegerLiteralWithRadixHex;
                    },
                    else => {
                        // reinterpret as a normal number
                        self.index -= 1;
                        state = State.IntegerLiteral;
                    },
                },
                State.IntegerLiteral => switch (c) {
                    '.' => {
                        state = State.NumberDot;
                    },
                    'p', 'P', 'e', 'E' => {
                        // A number with an exponent is a float even when it
                        // has no fractional part (e.g. 1e5).
                        result.id = Token.Id.FloatLiteral;
                        state = State.FloatExponentUnsigned;
                    },
                    '0'...'9' => {},
                    else => break,
                },
                State.IntegerLiteralWithRadix => switch (c) {
                    '.' => {
                        state = State.NumberDot;
                    },
                    '0'...'9' => {},
                    else => break,
                },
                State.IntegerLiteralWithRadixHex => switch (c) {
                    '.' => {
                        state = State.NumberDotHex;
                    },
                    'p', 'P' => {
                        // Same as above for hex floats such as 0x1p4.
                        result.id = Token.Id.FloatLiteral;
                        state = State.FloatExponentUnsignedHex;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => break,
                },
                State.NumberDot => switch (c) {
                    // "1..2": the dot belongs to a following `..`/`...`
                    // token, so end the integer just before it.
                    '.' => {
                        self.index -= 1;
                        state = State.Start;
                        break;
                    },
                    else => {
                        // Re-examine this byte as the start of the fraction.
                        self.index -= 1;
                        result.id = Token.Id.FloatLiteral;
                        state = State.FloatFraction;
                    },
                },
                State.NumberDotHex => switch (c) {
                    '.' => {
                        self.index -= 1;
                        state = State.Start;
                        break;
                    },
                    else => {
                        self.index -= 1;
                        result.id = Token.Id.FloatLiteral;
                        state = State.FloatFractionHex;
                    },
                },
                State.FloatFraction => switch (c) {
                    'e', 'E' => {
                        state = State.FloatExponentUnsigned;
                    },
                    '0'...'9' => {},
                    else => break,
                },
                State.FloatFractionHex => switch (c) {
                    'p', 'P' => {
                        state = State.FloatExponentUnsignedHex;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => break,
                },
                State.FloatExponentUnsigned => switch (c) {
                    '+', '-' => {
                        state = State.FloatExponentNumber;
                    },
                    else => {
                        // reinterpret as a normal exponent number
                        self.index -= 1;
                        state = State.FloatExponentNumber;
                    },
                },
                State.FloatExponentUnsignedHex => switch (c) {
                    '+', '-' => {
                        state = State.FloatExponentNumberHex;
                    },
                    else => {
                        // reinterpret as a normal exponent number
                        self.index -= 1;
                        state = State.FloatExponentNumberHex;
                    },
                },
                State.FloatExponentNumber => switch (c) {
                    '0'...'9' => {},
                    else => break,
                },
                State.FloatExponentNumberHex => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => break,
                },
            }
        } else if (self.index == self.buffer.len) {
            // The buffer ended without a `break`: decide what the state we
            // stopped in means.
            switch (state) {
                State.Start,
                State.C,
                State.IntegerLiteral,
                State.IntegerLiteralWithRadix,
                State.IntegerLiteralWithRadixHex,
                State.FloatFraction,
                State.FloatFractionHex,
                State.FloatExponentNumber,
                State.FloatExponentNumberHex,
                State.StringLiteral, // find this error later
                State.MultilineStringLiteralLine,
                State.Builtin,
                => {},

                State.Identifier => {
                    if (Token.getKeyword(self.buffer[result.start..self.index])) |id| {
                        result.id = id;
                    }
                },
                State.LineCommentStart, State.LineComment => {
                    result.id = Token.Id.LineComment;
                },
                State.DocComment, State.DocCommentStart => {
                    result.id = Token.Id.DocComment;
                },

                State.NumberDot,
                State.NumberDotHex,
                State.FloatExponentUnsigned,
                State.FloatExponentUnsignedHex,
                State.SawAtSign,
                State.Backslash,
                State.CharLiteral,
                State.CharLiteralBackslash,
                State.CharLiteralEscape1,
                State.CharLiteralEscape2,
                State.CharLiteralEnd,
                State.StringLiteralBackslash,
                State.LBracketStar,
                => {
                    result.id = Token.Id.Invalid;
                },

                State.Equal => {
                    result.id = Token.Id.Equal;
                },
                State.Bang => {
                    result.id = Token.Id.Bang;
                },
                State.Minus => {
                    result.id = Token.Id.Minus;
                },
                State.Slash => {
                    result.id = Token.Id.Slash;
                },
                State.LBracket => {
                    result.id = Token.Id.LBracket;
                },
                State.Zero => {
                    result.id = Token.Id.IntegerLiteral;
                },
                State.Ampersand => {
                    result.id = Token.Id.Ampersand;
                },
                State.Period => {
                    result.id = Token.Id.Period;
                },
                State.Period2 => {
                    result.id = Token.Id.Ellipsis2;
                },
                State.Pipe => {
                    result.id = Token.Id.Pipe;
                },
                State.AngleBracketAngleBracketRight => {
                    result.id = Token.Id.AngleBracketAngleBracketRight;
                },
                State.AngleBracketRight => {
                    result.id = Token.Id.AngleBracketRight;
                },
                State.AngleBracketAngleBracketLeft => {
                    result.id = Token.Id.AngleBracketAngleBracketLeft;
                },
                State.AngleBracketLeft => {
                    result.id = Token.Id.AngleBracketLeft;
                },
                State.PlusPercent => {
                    result.id = Token.Id.PlusPercent;
                },
                State.Plus => {
                    result.id = Token.Id.Plus;
                },
                State.Percent => {
                    result.id = Token.Id.Percent;
                },
                State.Caret => {
                    result.id = Token.Id.Caret;
                },
                State.AsteriskPercent => {
                    result.id = Token.Id.AsteriskPercent;
                },
                State.Asterisk => {
                    result.id = Token.Id.Asterisk;
                },
                State.MinusPercent => {
                    result.id = Token.Id.MinusPercent;
                },
            }
        }

        // Never let Eof hide a queued invalid token: report it first.
        if (result.id == Token.Id.Eof) {
            if (self.pending_invalid_token) |token| {
                self.pending_invalid_token = null;
                return token;
            }
        }

        result.end = self.index;
        return result;
    }

    /// Validates the character at `self.index` inside a string literal or
    /// comment. The first invalid character found is queued in
    /// `pending_invalid_token`; while one is queued nothing further is
    /// checked. May advance `self.index` (see `getInvalidCharacterLength`).
    fn checkLiteralCharacter(self: *Tokenizer) void {
        if (self.pending_invalid_token != null) return;
        const invalid_length = self.getInvalidCharacterLength();
        if (invalid_length == 0) return;
        self.pending_invalid_token = Token{
            .id = Token.Id.Invalid,
            .start = self.index,
            .end = self.index + invalid_length,
        };
    }

    /// Returns 0 when the character starting at `self.index` is acceptable,
    /// otherwise the number of bytes of the offending sequence.
    ///
    /// Side effect: for a valid multi-byte UTF-8 sequence `self.index` is
    /// advanced to the last byte of that sequence, so that the `+= 1` of the
    /// loop in `next` steps past the whole character.
    fn getInvalidCharacterLength(self: *Tokenizer) u3 {
        const c0 = self.buffer[self.index];
        if (c0 < 0x80) {
            if (c0 < 0x20 or c0 == 0x7f) {
                // ascii control codes are never allowed
                // (note that \n was checked before we got here)
                return 1;
            }
            // looks fine to me.
            return 0;
        } else {
            // check utf8-encoded character.
            const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
            if (self.index + length > self.buffer.len) {
                // Truncated sequence: everything up to the end is invalid.
                return @intCast(u3, self.buffer.len - self.index);
            }
            const bytes = self.buffer[self.index .. self.index + length];
            switch (length) {
                2 => {
                    const value = std.unicode.utf8Decode2(bytes) catch return length;
                    if (value == 0x85) return length; // U+0085 (NEL)
                },
                3 => {
                    const value = std.unicode.utf8Decode3(bytes) catch return length;
                    if (value == 0x2028) return length; // U+2028 (LS)
                    if (value == 0x2029) return length; // U+2029 (PS)
                },
                4 => {
                    _ = std.unicode.utf8Decode4(bytes) catch return length;
                },
                else => unreachable,
            }
            self.index += length - 1;
            return 0;
        }
    }
};

test "tokenizer" {
    testTokenize("test", []Token.Id{Token.Id.Keyword_test});
}

test "tokenizer - unknown length pointer" {
    testTokenize(
        \\[*]u8
    , []Token.Id{
        Token.Id.BracketStarBracket,
        Token.Id.Identifier,
    });
}

test "tokenizer - char literal with hex escape" {
    testTokenize(
        \\'\x1b'
    , []Token.Id{Token.Id.CharLiteral});
}

test "tokenizer - char literal with invalid hex escape" {
    // 'z' is not a hex digit: the literal is cut short and reported invalid,
    // then the remaining "zz" and the dangling quote are tokenized.
    testTokenize("'\\xzz'", []Token.Id{
        Token.Id.Invalid,
        Token.Id.Identifier,
        Token.Id.Invalid,
    });
}

test "tokenizer - float literal e exponent" {
    testTokenize("a = 4.94065645841246544177e-324;\n", []Token.Id{
        Token.Id.Identifier,
        Token.Id.Equal,
        Token.Id.FloatLiteral,
        Token.Id.Semicolon,
    });
}

test "tokenizer - float literal p exponent" {
    testTokenize("a = 0x1.a827999fcef32p+1022;\n", []Token.Id{
        Token.Id.Identifier,
        Token.Id.Equal,
        Token.Id.FloatLiteral,
        Token.Id.Semicolon,
    });
}

test "tokenizer - float literal exponent without fraction" {
    testTokenize("1e5", []Token.Id{Token.Id.FloatLiteral});
    testTokenize("0x1p4", []Token.Id{Token.Id.FloatLiteral});
}

test "tokenizer - chars" {
    testTokenize("'c'", []Token.Id{Token.Id.CharLiteral});
}

test "tokenizer - invalid token characters" {
    testTokenize("#", []Token.Id{Token.Id.Invalid});
    testTokenize("`", []Token.Id{Token.Id.Invalid});
    testTokenize("'c", []Token.Id{Token.Id.Invalid});
    testTokenize("'", []Token.Id{Token.Id.Invalid});
    testTokenize("''", []Token.Id{ Token.Id.Invalid, Token.Id.Invalid });
}

test "tokenizer - invalid literal/comment characters" {
    testTokenize("\"\x00\"", []Token.Id{
        Token.Id.StringLiteral,
        Token.Id.Invalid,
    });
    testTokenize("//\x00", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
    testTokenize("//\x1f", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
    testTokenize("//\x7f", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
}

test "tokenizer - utf8" {
    testTokenize("//\xc2\x80", []Token.Id{Token.Id.LineComment});
    testTokenize("//\xf4\x8f\xbf\xbf", []Token.Id{Token.Id.LineComment});
}

test "tokenizer - invalid utf8" {
    testTokenize("//\x80", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
    testTokenize("//\xbf", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
    testTokenize("//\xf8", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
    testTokenize("//\xff", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
    testTokenize("//\xc2\xc0", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
    testTokenize("//\xe0", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
    testTokenize("//\xf0", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
    testTokenize("//\xf0\x90\x80\xc0", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
}

test "tokenizer - illegal unicode codepoints" {
    // unicode newline characters: U+0085, U+2028, U+2029
    testTokenize("//\xc2\x84", []Token.Id{Token.Id.LineComment});
    testTokenize("//\xc2\x85", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
    testTokenize("//\xc2\x86", []Token.Id{Token.Id.LineComment});
    testTokenize("//\xe2\x80\xa7", []Token.Id{Token.Id.LineComment});
    testTokenize("//\xe2\x80\xa8", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
    testTokenize("//\xe2\x80\xa9", []Token.Id{
        Token.Id.LineComment,
        Token.Id.Invalid,
    });
    testTokenize("//\xe2\x80\xaa", []Token.Id{Token.Id.LineComment});
}

test "tokenizer - string identifier and builtin fns" {
    testTokenize(
        \\const @"if" = @import("std");
    , []Token.Id{
        Token.Id.Keyword_const,
        Token.Id.Identifier,
        Token.Id.Equal,
        Token.Id.Builtin,
        Token.Id.LParen,
        Token.Id.StringLiteral,
        Token.Id.RParen,
        Token.Id.Semicolon,
    });
}

test "tokenizer - pipe and then invalid" {
    testTokenize("||=", []Token.Id{
        Token.Id.PipePipe,
        Token.Id.Equal,
    });
}

test "tokenizer - line comment and doc comment" {
    testTokenize("//", []Token.Id{Token.Id.LineComment});
    testTokenize("// a / b", []Token.Id{Token.Id.LineComment});
    testTokenize("// /", []Token.Id{Token.Id.LineComment});
    testTokenize("/// a", []Token.Id{Token.Id.DocComment});
    testTokenize("///", []Token.Id{Token.Id.DocComment});
    testTokenize("////", []Token.Id{Token.Id.LineComment});
}

test "tokenizer - line comment followed by identifier" {
    testTokenize(
        \\    Unexpected,
        \\    // another
        \\    Another,
    , []Token.Id{
        Token.Id.Identifier,
        Token.Id.Comma,
        Token.Id.LineComment,
        Token.Id.Identifier,
        Token.Id.Comma,
    });
}

/// Tokenizes `source` and panics unless the produced token ids are exactly
/// `expected_tokens` followed by `Eof`.
fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void {
    var tokenizer = Tokenizer.init(source);
    for (expected_tokens) |expected_token_id| {
        const token = tokenizer.next();
        if (token.id != expected_token_id) {
            std.debug.panic("expected {}, found {}\n", @tagName(expected_token_id), @tagName(token.id));
        }
    }
    const last_token = tokenizer.next();
    std.testing.expect(last_token.id == Token.Id.Eof);
}