// tokenizer.c
#include "common.h"

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#include "tokenizer.h"

// One entry of the keyword table: the keyword's spelling and the token tag
// that an identifier with exactly that spelling is given.
typedef struct {
    const char* keyword;
    TokenizerTag tag;
} KeywordMap;

// Returns a string for `tag`. The case labels are produced by expanding
// TOKENIZER_FOREACH_TAG_ENUM(TOKENIZER_GENERATE_CASE); both macros are defined
// outside this file (presumably in tokenizer.h -- confirm there). Any value
// not covered by the generated cases yields "UNKNOWN".
const char* tokenizerGetTagString(TokenizerTag tag) {
    switch (tag) {
        TOKENIZER_FOREACH_TAG_ENUM(TOKENIZER_GENERATE_CASE)
    default:
        return "UNKNOWN";
    }
}

// Keyword table consulted by getKeyword(). The entries are listed in
// ascending byte order; getKeyword() stops searching early based on that
// order, so new entries must be inserted in sorted position.
// clang-format off
const KeywordMap keywords[] = {
    { "addrspace", TOKEN_KEYWORD_ADDRSPACE },
    { "align", TOKEN_KEYWORD_ALIGN },
    { "allowzero", TOKEN_KEYWORD_ALLOWZERO },
    { "and", TOKEN_KEYWORD_AND },
    { "anyframe", TOKEN_KEYWORD_ANYFRAME },
    { "anytype", TOKEN_KEYWORD_ANYTYPE },
    { "asm", TOKEN_KEYWORD_ASM },
    { "break", TOKEN_KEYWORD_BREAK },
    { "callconv", TOKEN_KEYWORD_CALLCONV },
    { "catch", TOKEN_KEYWORD_CATCH },
    { "comptime", TOKEN_KEYWORD_COMPTIME },
    { "const", TOKEN_KEYWORD_CONST },
    { "continue", TOKEN_KEYWORD_CONTINUE },
    { "defer", TOKEN_KEYWORD_DEFER },
    { "else", TOKEN_KEYWORD_ELSE },
    { "enum", TOKEN_KEYWORD_ENUM },
    { "errdefer", TOKEN_KEYWORD_ERRDEFER },
    { "error", TOKEN_KEYWORD_ERROR },
    { "export", TOKEN_KEYWORD_EXPORT },
    { "extern", TOKEN_KEYWORD_EXTERN },
    { "fn", TOKEN_KEYWORD_FN },
    { "for", TOKEN_KEYWORD_FOR },
    { "if", TOKEN_KEYWORD_IF },
    { "inline", TOKEN_KEYWORD_INLINE },
    { "linksection", TOKEN_KEYWORD_LINKSECTION },
    { "noalias", TOKEN_KEYWORD_NOALIAS },
    { "noinline", TOKEN_KEYWORD_NOINLINE },
    { "nosuspend", TOKEN_KEYWORD_NOSUSPEND },
    { "opaque", TOKEN_KEYWORD_OPAQUE },
    { "or", TOKEN_KEYWORD_OR },
    { "orelse", TOKEN_KEYWORD_ORELSE },
    { "packed", TOKEN_KEYWORD_PACKED },
    { "pub", TOKEN_KEYWORD_PUB },
    { "resume", TOKEN_KEYWORD_RESUME },
    { "return", TOKEN_KEYWORD_RETURN },
    { "struct", TOKEN_KEYWORD_STRUCT },
    { "suspend", TOKEN_KEYWORD_SUSPEND },
    { "switch", TOKEN_KEYWORD_SWITCH },
    { "test", TOKEN_KEYWORD_TEST },
    { "threadlocal", TOKEN_KEYWORD_THREADLOCAL },
    { "try", TOKEN_KEYWORD_TRY },
    { "union", TOKEN_KEYWORD_UNION },
    { "unreachable", TOKEN_KEYWORD_UNREACHABLE },
    { "var", TOKEN_KEYWORD_VAR },
    { "volatile", TOKEN_KEYWORD_VOLATILE },
    { "while", TOKEN_KEYWORD_WHILE }
};
// clang-format on

// Looks up the `len` bytes starting at `bytes` in the keyword table.
//
// Returns the matching keyword's tag, or TOKEN_INVALID when the text is not a
// keyword. `bytes` does not have to be NUL-terminated: every comparison is
// bounded by the shorter of `len` and the candidate keyword's length.
//
// The scan is linear but exits early, which is only correct because the table
// is sorted in ascending byte order.
// TODO binary search
static TokenizerTag getKeyword(const char* bytes, const uint32_t len) {
    for (unsigned long i = 0; i < sizeof(keywords) / sizeof(KeywordMap); i++) {
        size_t klen = strlen(keywords[i].keyword);
        size_t minlen = klen < len ? klen : len;
        // Compare only the common-length prefix of the input and the keyword.
        int cmp = strncmp(bytes, keywords[i].keyword, minlen);
        if (cmp == 0) {
            if (len == klen) {
                return keywords[i].tag;
            } else if (len < klen) {
                // The input is a proper prefix of this keyword, so it sorts
                // before this entry and every later one: no match possible.
                return TOKEN_INVALID;
            }
            // len > klen: input is longer than keyword (e.g., "orelse" vs
            // "or"), continue searching.
        } else if (cmp < 0) {
            // The input sorts before this keyword, hence before all the rest.
            return TOKEN_INVALID;
        }
    }
    return TOKEN_INVALID;
}

// Builds a Tokenizer over `buffer`, recording `len` as buffer_len.
//
// Scanning starts at index 3 when the buffer begins with the bytes EF BB BF
// (the UTF-8 byte order mark), otherwise at index 0.
//
// tokenizerNext() reads buffer[buffer_len] and only reports end of input when
// that byte is 0, so the caller is expected to pass a buffer with a readable
// 0 byte at position `len`.
Tokenizer tokenizerInit(const char* buffer, const uint32_t len) {
    return (Tokenizer) {
        .buffer = buffer,
        .buffer_len = len,
        .index = (len >= 3 && memcmp(buffer, "\xEF\xBB\xBF", 3) == 0) ? 3 : 0,
    };
}

// Scans and returns the next token, starting at self->index.
//
// The scanner is a state machine driven by `goto state`: each state inspects
// self->buffer[self->index], then either re-enters the switch in another state
// or breaks out with result.tag decided. Whichever path breaks out of the
// outer switch, result.loc.end is set to the final self->index, so a token
// covers the bytes [loc.start, loc.end).
//
// Behaviour visible in this function:
//   - Spaces, tabs, '\r' and '\n' before a token are skipped, as are ordinary
//     `//` line comments.
//   - TOKEN_EOF (with start == end == index) is returned when a 0 byte is
//     found exactly at self->buffer_len.
//   - Malformed input goes through TOKENIZER_STATE_INVALID, which consumes
//     bytes up to (not including) the next '\n' or up to the end of the
//     buffer and yields TOKEN_INVALID.
//   - Number literals are only delimited here, not validated: any run of
//     digits, letters and '_' (plus '.', and a sign after e/E/p/P) is tagged
//     TOKEN_NUMBER_LITERAL.
//
// The `case a ... b:` ranges used throughout are a GNU C extension.
TokenizerToken tokenizerNext(Tokenizer* self) {
    // Until a state decides otherwise the token is invalid and begins at the
    // current position.
    TokenizerToken result = (TokenizerToken) {
        .tag = TOKEN_INVALID,
        .loc = {
            .start = self->index,
        },
    };

    TokenizerState state = TOKENIZER_STATE_START;

state:
    switch (state) {
    // Dispatch on the first byte of the token. The byte is NOT consumed here
    // for multi-byte tokens: the state that is entered advances the index
    // itself. The single-character tokens and the first digit of a number are
    // the exceptions and are consumed in place.
    case TOKENIZER_STATE_START:
        switch (self->buffer[self->index]) {
        case 0:
            // A 0 byte means end of input only when it sits exactly at
            // buffer_len; anywhere else it is treated as an invalid byte.
            if (self->index == self->buffer_len) {
                return (TokenizerToken) { .tag = TOKEN_EOF,
                    .loc = {
                        .start = self->index,
                        .end = self->index,
                    } };
            } else {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
        case ' ':
        case '\n':
        case '\t':
        case '\r':
            // Skip whitespace and restart the token at the following byte
            // (the state is still START).
            self->index++;
            result.loc.start = self->index;
            goto state;
        case '"':
            result.tag = TOKEN_STRING_LITERAL;
            state = TOKENIZER_STATE_STRING_LITERAL;
            goto state;
        case '\'':
            result.tag = TOKEN_CHAR_LITERAL;
            state = TOKENIZER_STATE_CHAR_LITERAL;
            goto state;
        case 'a' ... 'z':
        case 'A' ... 'Z':
        case '_':
            result.tag = TOKEN_IDENTIFIER;
            state = TOKENIZER_STATE_IDENTIFIER;
            goto state;
        case '@':
            state = TOKENIZER_STATE_SAW_AT_SIGN;
            goto state;
        case '=':
            state = TOKENIZER_STATE_EQUAL;
            goto state;
        case '!':
            state = TOKENIZER_STATE_BANG;
            goto state;
        case '|':
            state = TOKENIZER_STATE_PIPE;
            goto state;
        case '(':
            result.tag = TOKEN_L_PAREN;
            self->index++;
            break;
        case ')':
            result.tag = TOKEN_R_PAREN;
            self->index++;
            break;
        case '[':
            result.tag = TOKEN_L_BRACKET;
            self->index++;
            break;
        case ']':
            result.tag = TOKEN_R_BRACKET;
            self->index++;
            break;
        case ';':
            result.tag = TOKEN_SEMICOLON;
            self->index++;
            break;
        case ',':
            result.tag = TOKEN_COMMA;
            self->index++;
            break;
        case '?':
            result.tag = TOKEN_QUESTION_MARK;
            self->index++;
            break;
        case ':':
            result.tag = TOKEN_COLON;
            self->index++;
            break;
        case '%':
            state = TOKENIZER_STATE_PERCENT;
            goto state;
        case '*':
            state = TOKENIZER_STATE_ASTERISK;
            goto state;
        case '+':
            state = TOKENIZER_STATE_PLUS;
            goto state;
        case '<':
            state = TOKENIZER_STATE_ANGLE_BRACKET_LEFT;
            goto state;
        case '>':
            state = TOKENIZER_STATE_ANGLE_BRACKET_RIGHT;
            goto state;
        case '^':
            state = TOKENIZER_STATE_CARET;
            goto state;
        case '\\':
            result.tag = TOKEN_MULTILINE_STRING_LITERAL_LINE;
            state = TOKENIZER_STATE_BACKSLASH;
            goto state;
        case '{':
            result.tag = TOKEN_L_BRACE;
            self->index++;
            break;
        case '}':
            result.tag = TOKEN_R_BRACE;
            self->index++;
            break;
        case '~':
            result.tag = TOKEN_TILDE;
            self->index++;
            break;
        case '.':
            state = TOKENIZER_STATE_PERIOD;
            goto state;
        case '-':
            state = TOKENIZER_STATE_MINUS;
            goto state;
        case '/':
            state = TOKENIZER_STATE_SLASH;
            goto state;
        case '&':
            state = TOKENIZER_STATE_AMPERSAND;
            goto state;
        case '0' ... '9':
            // The first digit is consumed here because the INT state looks at
            // the current byte without advancing first.
            result.tag = TOKEN_NUMBER_LITERAL;
            self->index++;
            state = TOKENIZER_STATE_INT;
            goto state;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        };
        break;

    // Entered on a '\r' inside a line comment: the next byte must be '\n',
    // after which scanning restarts at START; anything else is invalid.
    case TOKENIZER_STATE_EXPECT_NEWLINE:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index == self->buffer_len) {
                result.tag = TOKEN_INVALID;
            } else {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case '\n':
            self->index++;
            result.loc.start = self->index;
            state = TOKENIZER_STATE_START;
            goto state;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        }
        break;

    // Error recovery: keep consuming bytes until a '\n' (left unconsumed) or
    // the end of the buffer, then emit TOKEN_INVALID for the whole span.
    case TOKENIZER_STATE_INVALID:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index == self->buffer_len) {
                result.tag = TOKEN_INVALID;
            } else {
                // A 0 byte inside the buffer is just another invalid byte.
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        }
        break;

    // After '@': a '"' starts a quoted identifier (scanned by the string
    // literal state but tagged TOKEN_IDENTIFIER); a letter or '_' starts a
    // builtin name.
    case TOKENIZER_STATE_SAW_AT_SIGN:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        case '"':
            result.tag = TOKEN_IDENTIFIER;
            state = TOKENIZER_STATE_STRING_LITERAL;
            goto state;
        case 'a' ... 'z':
        case 'A' ... 'Z':
        case '_':
            result.tag = TOKEN_BUILTIN;
            state = TOKENIZER_STATE_BUILTIN;
            goto state;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        }
        break;

    // Operator states. Each one first steps past the byte that selected it,
    // then looks at the following byte to pick the longest operator; that
    // byte is consumed only when it belongs to the operator.

    // '&' or '&='
    case TOKENIZER_STATE_AMPERSAND:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_AMPERSAND_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_AMPERSAND;
            break;
        }
        break;

    // '*', '*=', '**', or the start of '*%...' / '*|...'
    case TOKENIZER_STATE_ASTERISK:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_ASTERISK_EQUAL;
            self->index++;
            break;
        case '*':
            result.tag = TOKEN_ASTERISK_ASTERISK;
            self->index++;
            break;
        case '%':
            state = TOKENIZER_STATE_ASTERISK_PERCENT;
            goto state;
        case '|':
            state = TOKENIZER_STATE_ASTERISK_PIPE;
            goto state;
        default:
            result.tag = TOKEN_ASTERISK;
            break;
        }
        break;

    // '*%' or '*%='
    case TOKENIZER_STATE_ASTERISK_PERCENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_ASTERISK_PERCENT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ASTERISK_PERCENT;
            break;
        }
        break;

    // '*|' or '*|='
    case TOKENIZER_STATE_ASTERISK_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_ASTERISK_PIPE_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ASTERISK_PIPE;
            break;
        }
        break;

    // '%' or '%='
    case TOKENIZER_STATE_PERCENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_PERCENT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_PERCENT;
            break;
        }
        break;

    // '+', '+=', '++', or the start of '+%...' / '+|...'
    case TOKENIZER_STATE_PLUS:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_PLUS_EQUAL;
            self->index++;
            break;
        case '+':
            result.tag = TOKEN_PLUS_PLUS;
            self->index++;
            break;
        case '%':
            state = TOKENIZER_STATE_PLUS_PERCENT;
            goto state;
        case '|':
            state = TOKENIZER_STATE_PLUS_PIPE;
            goto state;
        default:
            result.tag = TOKEN_PLUS;
            break;
        }
        break;

    // '+%' or '+%='
    case TOKENIZER_STATE_PLUS_PERCENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_PLUS_PERCENT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_PLUS_PERCENT;
            break;
        }
        break;

    // '+|' or '+|='
    case TOKENIZER_STATE_PLUS_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_PLUS_PIPE_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_PLUS_PIPE;
            break;
        }
        break;

    // '^' or '^='
    case TOKENIZER_STATE_CARET:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_CARET_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_CARET;
            break;
        }
        break;

    // Consume [A-Za-z0-9_]* after the first identifier byte. When the run
    // ends, the text is looked up in the keyword table; the tag stays
    // TOKEN_IDENTIFIER unless a keyword matches.
    case TOKENIZER_STATE_IDENTIFIER:
        self->index++;
        switch (self->buffer[self->index]) {
        case 'a' ... 'z':
        case 'A' ... 'Z':
        case '_':
        case '0' ... '9':
            state = TOKENIZER_STATE_IDENTIFIER;
            goto state;
        default:; // Once we're at C23, this semicolon can be removed.
            const char* start = self->buffer + result.loc.start;
            uint32_t len = self->index - result.loc.start;
            TokenizerTag tag = getKeyword(start, len);
            if (tag != TOKEN_INVALID)
                result.tag = tag;
        }
        break;

    // Consume [A-Za-z0-9_]* of a builtin name. The switch has no default:
    // any other byte ends the token and control falls out to the outer break
    // with the tag (TOKEN_BUILTIN) already set by SAW_AT_SIGN.
    case TOKENIZER_STATE_BUILTIN:
        self->index++;
        switch (self->buffer[self->index]) {
        case 'a' ... 'z':
        case 'A' ... 'Z':
        case '_':
        case '0' ... '9':
            state = TOKENIZER_STATE_BUILTIN;
            goto state;
            break;
        }
        break;

    // After a single '\': only a second '\' is accepted, which starts a
    // multiline string literal line.
    case TOKENIZER_STATE_BACKSLASH:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            result.tag = TOKEN_INVALID;
            break;
        case '\\':
            state = TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE;
            goto state;
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        default:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        }
        break;

    // Body of a "..." literal (also used for @"..." identifiers, in which
    // case the tag was already set to TOKEN_IDENTIFIER). The closing '"' is
    // consumed and the tag set on entry is kept. A newline or end of buffer
    // before the closing quote makes the token invalid, as do the control
    // bytes 0x01-0x09, 0x0b-0x1f and 0x7f.
    case TOKENIZER_STATE_STRING_LITERAL:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                result.tag = TOKEN_INVALID;
            }
            break;
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        case '\\':
            state = TOKENIZER_STATE_STRING_LITERAL_BACKSLASH;
            goto state;
        case '"':
            self->index++;
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_STRING_LITERAL;
            goto state;
        }
        break;

    // Byte following a '\' inside a string literal: whatever it is (other
    // than 0 or '\n') is skipped without interpretation, so an escaped '"'
    // does not terminate the literal.
    case TOKENIZER_STATE_STRING_LITERAL_BACKSLASH:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        default:
            state = TOKENIZER_STATE_STRING_LITERAL;
            goto state;
        }
        break;

    // Body of a '...' literal; mirrors the string literal state with '\'' as
    // the terminator. The number of characters between the quotes is not
    // checked here.
    case TOKENIZER_STATE_CHAR_LITERAL:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                result.tag = TOKEN_INVALID;
            }
            break;
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        case '\\':
            state = TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH;
            goto state;
        case '\'':
            self->index++;
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_CHAR_LITERAL;
            goto state;
        }
        break;

    // Byte following a '\' inside a char literal. Unlike the string variant
    // this state also rejects control bytes and distinguishes an embedded 0
    // byte from end of buffer.
    case TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                result.tag = TOKEN_INVALID;
            }
            break;
        case '\n':
            result.tag = TOKEN_INVALID;
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_CHAR_LITERAL;
            goto state;
        }
        break;

    // Rest of a `\\` line. The token ends at the '\n' (not consumed), at a
    // '\r' that is immediately followed by '\n' (neither consumed), or at the
    // end of the buffer. The tag was set to
    // TOKEN_MULTILINE_STRING_LITERAL_LINE in START.
    case TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case '\n':
            break;
        case '\r':
            // A lone '\r' (not part of "\r\n") is invalid.
            if (self->buffer[self->index + 1] != '\n') {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE;
            goto state;
        }
        break;

    // '!' or '!='
    case TOKENIZER_STATE_BANG:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_BANG_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_BANG;
            break;
        }
        break;

    // '|', '|=' or '||'
    case TOKENIZER_STATE_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_PIPE_EQUAL;
            self->index++;
            break;
        case '|':
            result.tag = TOKEN_PIPE_PIPE;
            self->index++;
            break;
        default:
            result.tag = TOKEN_PIPE;
            break;
        }
        break;

    // '=', '==' or '=>'
    case TOKENIZER_STATE_EQUAL:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_EQUAL_EQUAL;
            self->index++;
            break;
        case '>':
            result.tag = TOKEN_EQUAL_ANGLE_BRACKET_RIGHT;
            self->index++;
            break;
        default:
            result.tag = TOKEN_EQUAL;
            break;
        }
        break;

    // '-', '->', '-=', or the start of '-%...' / '-|...'
    case TOKENIZER_STATE_MINUS:
        self->index++;
        switch (self->buffer[self->index]) {
        case '>':
            result.tag = TOKEN_ARROW;
            self->index++;
            break;
        case '=':
            result.tag = TOKEN_MINUS_EQUAL;
            self->index++;
            break;
        case '%':
            state = TOKENIZER_STATE_MINUS_PERCENT;
            goto state;
        case '|':
            state = TOKENIZER_STATE_MINUS_PIPE;
            goto state;
        default:
            result.tag = TOKEN_MINUS;
            break;
        }
        break;

    // '-%' or '-%='
    case TOKENIZER_STATE_MINUS_PERCENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_MINUS_PERCENT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_MINUS_PERCENT;
            break;
        }
        break;

    // '-|' or '-|='
    case TOKENIZER_STATE_MINUS_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_MINUS_PIPE_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_MINUS_PIPE;
            break;
        }
        break;

    // '<', '<=', or the start of '<<...'
    case TOKENIZER_STATE_ANGLE_BRACKET_LEFT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '<':
            state = TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT;
            goto state;
        case '=':
            result.tag = TOKEN_ANGLE_BRACKET_LEFT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ANGLE_BRACKET_LEFT;
            break;
        }
        break;

    // '<<', '<<=', or the start of '<<|...'
    case TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL;
            self->index++;
            break;
        case '|':
            state = TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE;
            goto state;
        default:
            result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT;
            break;
        }
        break;

    // '<<|' or '<<|='
    case TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE;
            break;
        }
        break;

    // '>', '>=', or the start of '>>...'
    case TOKENIZER_STATE_ANGLE_BRACKET_RIGHT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '>':
            state = TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT;
            goto state;
        case '=':
            result.tag = TOKEN_ANGLE_BRACKET_RIGHT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ANGLE_BRACKET_RIGHT;
            break;
        }
        break;

    // '>>' or '>>='
    case TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '=':
            result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT;
            break;
        }
        break;

    // '.', or the start of '..' / '.*'
    case TOKENIZER_STATE_PERIOD:
        self->index++;
        switch (self->buffer[self->index]) {
        case '.':
            state = TOKENIZER_STATE_PERIOD_2;
            goto state;
        case '*':
            state = TOKENIZER_STATE_PERIOD_ASTERISK;
            goto state;
        default:
            result.tag = TOKEN_PERIOD;
            break;
        }
        break;

    // '..' or '...'
    case TOKENIZER_STATE_PERIOD_2:
        self->index++;
        switch (self->buffer[self->index]) {
        case '.':
            result.tag = TOKEN_ELLIPSIS3;
            self->index++;
            break;
        default:
            result.tag = TOKEN_ELLIPSIS2;
            break;
        }
        break;

    // '.*'. When another '*' follows, the token (still covering only ".*",
    // the second '*' is not consumed) is tagged
    // TOKEN_INVALID_PERIODASTERISKS instead.
    case TOKENIZER_STATE_PERIOD_ASTERISK:
        self->index++;
        switch (self->buffer[self->index]) {
        case '*':
            result.tag = TOKEN_INVALID_PERIODASTERISKS;
            break;
        default:
            result.tag = TOKEN_PERIOD_ASTERISK;
            break;
        }
        break;

    // '/', '/=', or the start of a '//' comment
    case TOKENIZER_STATE_SLASH:
        self->index++;
        switch (self->buffer[self->index]) {
        case '/':
            state = TOKENIZER_STATE_LINE_COMMENT_START;
            goto state;
        case '=':
            result.tag = TOKEN_SLASH_EQUAL;
            self->index++;
            break;
        default:
            result.tag = TOKEN_SLASH;
            break;
        }
        break;

    // First byte after "//". '!' makes it a container doc comment, a third
    // '/' may make it a doc comment, anything else is an ordinary comment
    // that produces no token.
    case TOKENIZER_STATE_LINE_COMMENT_START:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                // The comment runs to the end of the buffer: report EOF.
                return (TokenizerToken) { .tag = TOKEN_EOF,
                    .loc = {
                        .start = self->index,
                        .end = self->index,
                    } };
            }
            break;
        case '!':
            result.tag = TOKEN_CONTAINER_DOC_COMMENT;
            state = TOKENIZER_STATE_DOC_COMMENT;
            goto state;
        case '\n':
            // Empty comment: discard it and start over on the next line.
            self->index++;
            result.loc.start = self->index;
            state = TOKENIZER_STATE_START;
            goto state;
        case '/':
            state = TOKENIZER_STATE_DOC_COMMENT_START;
            goto state;
        case '\r':
            state = TOKENIZER_STATE_EXPECT_NEWLINE;
            goto state;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_LINE_COMMENT;
            goto state;
        }
        break;

    // First byte after "///". A fourth '/' demotes it to an ordinary line
    // comment; otherwise the token is a TOKEN_DOC_COMMENT (possibly empty).
    case TOKENIZER_STATE_DOC_COMMENT_START:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
        case '\n':
            result.tag = TOKEN_DOC_COMMENT;
            break;
        case '\r':
            if (self->buffer[self->index + 1] == '\n') {
                result.tag = TOKEN_DOC_COMMENT;
            } else {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case '/':
            state = TOKENIZER_STATE_LINE_COMMENT;
            goto state;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            result.tag = TOKEN_DOC_COMMENT;
            state = TOKENIZER_STATE_DOC_COMMENT;
            goto state;
        }
        break;

    // Inside an ordinary line comment: skip to the end of the line and
    // restart at START, so the comment itself never becomes a token.
    case TOKENIZER_STATE_LINE_COMMENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
            if (self->index != self->buffer_len) {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            } else {
                return (TokenizerToken) { .tag = TOKEN_EOF,
                    .loc = {
                        .start = self->index,
                        .end = self->index,
                    } };
            }
            break;
        case '\n':
            self->index++;
            result.loc.start = self->index;
            state = TOKENIZER_STATE_START;
            goto state;
        case '\r':
            state = TOKENIZER_STATE_EXPECT_NEWLINE;
            goto state;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_LINE_COMMENT;
            goto state;
        }
        break;

    // Inside a doc comment or container doc comment (tag already set): the
    // token ends at a 0 byte, at '\n', or at a '\r' followed by '\n'; the
    // terminator is not consumed.
    case TOKENIZER_STATE_DOC_COMMENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case 0:
        case '\n':
            break;
        case '\r':
            if (self->buffer[self->index + 1] != '\n') {
                state = TOKENIZER_STATE_INVALID;
                goto state;
            }
            break;
        case 0x01 ... 0x09:
        case 0x0b ... 0x0c:
        case 0x0e ... 0x1f:
        case 0x7f:
            state = TOKENIZER_STATE_INVALID;
            goto state;
        default:
            state = TOKENIZER_STATE_DOC_COMMENT;
            goto state;
        }
        break;

    // Number literal states. INT and FLOAT look at the CURRENT byte (the
    // previous state already advanced), whereas INT_EXPONENT, INT_PERIOD and
    // FLOAT_EXPONENT first step past the byte that selected them. The letters
    // e/E/p/P are singled out because a '+' or '-' may follow them.
    case TOKENIZER_STATE_INT:
        switch (self->buffer[self->index]) {
        case '.':
            state = TOKENIZER_STATE_INT_PERIOD;
            goto state;
        case '_':
        case 'a' ... 'd':
        case 'f' ... 'o':
        case 'q' ... 'z':
        case 'A' ... 'D':
        case 'F' ... 'O':
        case 'Q' ... 'Z':
        case '0' ... '9':
            self->index++;
            state = TOKENIZER_STATE_INT;
            goto state;
        case 'e':
        case 'E':
        case 'p':
        case 'P':
            state = TOKENIZER_STATE_INT_EXPONENT;
            goto state;
        default:
            break;
        }
        break;

    // After e/E/p/P in the integer part: a sign is consumed and scanning
    // continues in FLOAT; otherwise scanning continues in INT.
    case TOKENIZER_STATE_INT_EXPONENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '-':
        case '+':
            self->index++;
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        default:
            state = TOKENIZER_STATE_INT;
            goto state;
        }
        break;

    // After a '.' following the integer part.
    case TOKENIZER_STATE_INT_PERIOD:
        self->index++;
        switch (self->buffer[self->index]) {
        case '_':
        case 'a' ... 'd':
        case 'f' ... 'o':
        case 'q' ... 'z':
        case 'A' ... 'D':
        case 'F' ... 'O':
        case 'Q' ... 'Z':
        case '0' ... '9':
            self->index++;
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        case 'e':
        case 'E':
        case 'p':
        case 'P':
            state = TOKENIZER_STATE_FLOAT_EXPONENT;
            goto state;
        default:
            // The '.' is not part of the number (for example it begins
            // ".."): step back so the literal ends before it and the '.' is
            // tokenized on the next call.
            self->index--;
            break;
        }
        break;

    // Fractional / post-exponent part of a number.
    case TOKENIZER_STATE_FLOAT:
        switch (self->buffer[self->index]) {
        case '_':
        case 'a' ... 'd':
        case 'f' ... 'o':
        case 'q' ... 'z':
        case 'A' ... 'D':
        case 'F' ... 'O':
        case 'Q' ... 'Z':
        case '0' ... '9':
            self->index++;
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        case 'e':
        case 'E':
        case 'p':
        case 'P':
            state = TOKENIZER_STATE_FLOAT_EXPONENT;
            goto state;
        default:
            break;
        }
        break;

    // After e/E/p/P in the fractional part: an optional sign is consumed,
    // then scanning continues in FLOAT either way.
    case TOKENIZER_STATE_FLOAT_EXPONENT:
        self->index++;
        switch (self->buffer[self->index]) {
        case '-':
        case '+':
            self->index++;
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        default:
            state = TOKENIZER_STATE_FLOAT;
            goto state;
        }
        break;
    }

    // Every path that breaks out of the state machine lands here; the token
    // ends at the current position.
    result.loc.end = self->index;

    return result;
}