Tokenizer.zig (70343B) - Raw
const std = @import("std");
const assert = std.debug.assert;
const Compilation = @import("Compilation.zig");
const Source = @import("Source.zig");
const LangOpts = @import("LangOpts.zig");

pub const Token = struct {
    id: Id,
    source: Source.Id,
    start: u32 = 0,
    end: u32 = 0,
    line: u32 = 0,

    /// Kind tag of a token.
    /// The declaration order fixes the `u8` value of every tag, so new tags
    /// must not be inserted casually between existing ones.
    pub const Id = enum(u8) {
        invalid,
        nl,
        whitespace,
        eof,
        /// Identifier made up only of basic character set characters.
        identifier,
        /// Identifier containing at least one extended character.
        extended_identifier,

        // String literals, one tag per prefix.
        string_literal,
        string_literal_utf_16,
        string_literal_utf_8,
        string_literal_utf_32,
        string_literal_wide,

        /// Any string literal with an embedded newline or EOF.
        /// Always a parser error; by default just a warning from the preprocessor.
        unterminated_string_literal,

        // <foobar>; only generated by the preprocessor.
        macro_string,

        // Character literals, one tag per prefix.
        char_literal,
        char_literal_utf_8,
        char_literal_utf_16,
        char_literal_utf_32,
        char_literal_wide,

        /// Any character literal with nothing inside the quotes.
        /// Always a parser error; by default just a warning from the preprocessor.
        empty_char_literal,

        /// Any character literal with an embedded newline or EOF.
        /// Always a parser error; by default just a warning from the preprocessor.
        unterminated_char_literal,

        /// `/* */` style comment without a closing `*/` before EOF.
        unterminated_comment,

        /// Integer literal tokens generated by the preprocessor.
        one,
        zero,

        // Punctuators.
        bang,
        bang_equal,
        pipe,
        pipe_pipe,
        pipe_equal,
        equal,
        equal_equal,
        l_paren,
        r_paren,
        l_brace,
        r_brace,
        l_bracket,
        r_bracket,
        period,
        ellipsis,
        caret,
        caret_equal,
        plus,
        plus_plus,
        plus_equal,
        minus,
        minus_minus,
        minus_equal,
        asterisk,
        asterisk_equal,
        percent,
        percent_equal,
        arrow,
        colon,
        colon_colon,
        semicolon,
        slash,
        slash_equal,
        comma,
        ampersand,
        ampersand_ampersand,
        ampersand_equal,
        question_mark,
        angle_bracket_left,
        angle_bracket_left_equal,
        angle_bracket_angle_bracket_left,
        angle_bracket_angle_bracket_left_equal,
        angle_bracket_right,
        angle_bracket_right_equal,
        angle_bracket_angle_bracket_right,
        angle_bracket_angle_bracket_right_equal,
        tilde,
        hash,
        hash_hash,

        /// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
        /// NOTE(review): `Token` as declared in this file has an `end` field and no `loc`;
        /// confirm which field this comment refers to.
        macro_param,
        /// Special token signalling that the argument must be substituted without
        /// expansion (e.g. in concatenation).
        macro_param_no_expand,
        /// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
        /// NOTE(review): same `loc.end` question as for `macro_param`.
        stringify_param,
        /// Same as `stringify_param`, but for var args.
        stringify_va_args,
        /// Special macro whitespace, always equal to a single space.
        macro_ws,
        /// Special token for implementing __has_attribute
        macro_param_has_attribute,
        /// Special token for implementing __has_c_attribute
        macro_param_has_c_attribute,
        /// Special token for implementing __has_declspec_attribute
        macro_param_has_declspec_attribute,
        /// Special token for implementing __has_warning
        macro_param_has_warning,
        /// Special token for implementing __has_feature
        macro_param_has_feature,
        /// Special token for implementing __has_extension
        macro_param_has_extension,
        /// Special token for implementing __has_builtin
        macro_param_has_builtin,
        /// Special token for implementing __has_include
        macro_param_has_include,
        /// Special token for implementing __has_include_next
        macro_param_has_include_next,
        /// Special token for implementing __has_embed
        macro_param_has_embed,
        /// Special token for implementing __is_identifier
        macro_param_is_identifier,
        /// Special token for implementing __FILE__
        macro_file,
        /// Special token for implementing __LINE__
        macro_line,
        /// Special token for implementing __COUNTER__
        macro_counter,
        /// Special token for implementing _Pragma
        macro_param_pragma_operator,

        /// Special identifier for implementing __func__
        macro_func,
        /// Special identifier for implementing __FUNCTION__
        macro_function,
        /// Special identifier for implementing __PRETTY_FUNCTION__
        macro_pretty_func,

        keyword_auto,
        keyword_auto_type,
        keyword_break,
        keyword_case,
        keyword_char,
        keyword_const,
        keyword_continue,
        keyword_default,
        keyword_do,
        keyword_double,
        keyword_else,
        keyword_enum,
        keyword_extern,
        keyword_float,
        keyword_for,
        keyword_goto,
        keyword_if,
        keyword_int,
        keyword_long,
        keyword_register,
        keyword_return,
        keyword_short,
        keyword_signed,
        keyword_signed1,
        keyword_signed2,
        keyword_sizeof,
        keyword_static,
        keyword_struct,
        keyword_switch,
        keyword_typedef,
        keyword_typeof1,
        keyword_typeof2,
        keyword_union,
        keyword_unsigned,
        keyword_void,
        keyword_volatile,
        keyword_while,

        // ISO C99
        keyword_bool,
        keyword_complex,
        keyword_imaginary,
        keyword_inline,
        keyword_restrict,

        // ISO C11
        keyword_alignas,
        keyword_alignof,
        keyword_atomic,
        keyword_generic,
        keyword_noreturn,
        keyword_static_assert,
        keyword_thread_local,

        // ISO C23
        keyword_bit_int,
        keyword_c23_alignas,
        keyword_c23_alignof,
        keyword_c23_bool,
        keyword_c23_static_assert,
        keyword_c23_thread_local,
        keyword_constexpr,
        keyword_true,
        keyword_false,
        keyword_nullptr,
        keyword_typeof_unqual,

        // Preprocessor directives
        keyword_include,
        keyword_include_next,
        keyword_embed,
        keyword_define,
        keyword_defined,
        keyword_undef,
        keyword_ifdef,
        keyword_ifndef,
        keyword_elif,
        keyword_elifdef,
        keyword_elifndef,
        keyword_endif,
        keyword_error,
        keyword_warning,
        keyword_pragma,
        keyword_line,
        keyword_va_args,
        keyword_va_opt,

        // gcc keywords
        keyword_const1,
        keyword_const2,
        keyword_inline1,
        keyword_inline2,
        keyword_volatile1,
        keyword_volatile2,
        keyword_restrict1,
        keyword_restrict2,
        keyword_alignof1,
        keyword_alignof2,
        keyword_typeof,
        keyword_attribute1,
        keyword_attribute2,
        keyword_extension,
        keyword_asm,
        keyword_asm1,
        keyword_asm2,
        /// _Float128
        keyword_float128_1,
        /// __float128
        keyword_float128_2,
        keyword_int128,
        keyword_imag1,
        keyword_imag2,
        keyword_real1,
        keyword_real2,
        keyword_float16,

        // clang keywords
        keyword_fp16,

        // ms keywords
        keyword_declspec,
        keyword_int64,
        keyword_int64_2,
        keyword_int32,
        keyword_int32_2,
        keyword_int16,
        keyword_int16_2,
        keyword_int8,
        keyword_int8_2,
        keyword_stdcall,
        keyword_stdcall2,
        keyword_thiscall,
        keyword_thiscall2,
        keyword_vectorcall,
        keyword_vectorcall2,

        // builtins that require special parsing
        builtin_choose_expr,
        builtin_va_arg,
        builtin_offsetof,
        builtin_bitoffsetof,
        builtin_types_compatible_p,

        /// Generated by the #embed directive.
        /// Decimal value with no prefix or suffix.
        embed_byte,

        /// Preprocessor number:
        /// an optional period, followed by a digit 0-9, followed by any number of letters,
        /// digits, underscores, periods, and exponents (e+, e-, E+, E-, p+, p-, P+, P-).
        pp_num,

        /// Preprocessor placemarker token,
        /// generated if `##` is used with a zero-token argument and
        /// removed after substitution, so the parser should never see it.
        /// See C99 6.10.3.3.2
        placemarker,

        /// Virtual linemarker token output from preprocessor to indicate start of a new include
        include_start,

        /// Virtual linemarker token output from preprocessor to indicate resuming a file after
        /// completion of the preceding #include
        include_resume,

        /// A comment token if asked to preserve comments.
        comment,

        /// Return true if token is identifier or keyword.
327 pub fn isMacroIdentifier(id: Id) bool { 328 switch (id) { 329 .keyword_include, 330 .keyword_include_next, 331 .keyword_embed, 332 .keyword_define, 333 .keyword_defined, 334 .keyword_undef, 335 .keyword_ifdef, 336 .keyword_ifndef, 337 .keyword_elif, 338 .keyword_elifdef, 339 .keyword_elifndef, 340 .keyword_endif, 341 .keyword_error, 342 .keyword_warning, 343 .keyword_pragma, 344 .keyword_line, 345 .keyword_va_args, 346 .keyword_va_opt, 347 .macro_func, 348 .macro_function, 349 .macro_pretty_func, 350 .keyword_auto, 351 .keyword_auto_type, 352 .keyword_break, 353 .keyword_case, 354 .keyword_char, 355 .keyword_const, 356 .keyword_continue, 357 .keyword_default, 358 .keyword_do, 359 .keyword_double, 360 .keyword_else, 361 .keyword_enum, 362 .keyword_extern, 363 .keyword_float, 364 .keyword_for, 365 .keyword_goto, 366 .keyword_if, 367 .keyword_int, 368 .keyword_long, 369 .keyword_register, 370 .keyword_return, 371 .keyword_short, 372 .keyword_signed, 373 .keyword_signed1, 374 .keyword_signed2, 375 .keyword_sizeof, 376 .keyword_static, 377 .keyword_struct, 378 .keyword_switch, 379 .keyword_typedef, 380 .keyword_union, 381 .keyword_unsigned, 382 .keyword_void, 383 .keyword_volatile, 384 .keyword_while, 385 .keyword_bool, 386 .keyword_complex, 387 .keyword_imaginary, 388 .keyword_inline, 389 .keyword_restrict, 390 .keyword_alignas, 391 .keyword_alignof, 392 .keyword_atomic, 393 .keyword_generic, 394 .keyword_noreturn, 395 .keyword_static_assert, 396 .keyword_thread_local, 397 .identifier, 398 .extended_identifier, 399 .keyword_typeof, 400 .keyword_typeof1, 401 .keyword_typeof2, 402 .keyword_const1, 403 .keyword_const2, 404 .keyword_inline1, 405 .keyword_inline2, 406 .keyword_volatile1, 407 .keyword_volatile2, 408 .keyword_restrict1, 409 .keyword_restrict2, 410 .keyword_alignof1, 411 .keyword_alignof2, 412 .builtin_choose_expr, 413 .builtin_va_arg, 414 .builtin_offsetof, 415 .builtin_bitoffsetof, 416 .builtin_types_compatible_p, 417 .keyword_attribute1, 418 
.keyword_attribute2, 419 .keyword_extension, 420 .keyword_asm, 421 .keyword_asm1, 422 .keyword_asm2, 423 .keyword_float128_1, 424 .keyword_float128_2, 425 .keyword_int128, 426 .keyword_imag1, 427 .keyword_imag2, 428 .keyword_real1, 429 .keyword_real2, 430 .keyword_float16, 431 .keyword_fp16, 432 .keyword_declspec, 433 .keyword_int64, 434 .keyword_int64_2, 435 .keyword_int32, 436 .keyword_int32_2, 437 .keyword_int16, 438 .keyword_int16_2, 439 .keyword_int8, 440 .keyword_int8_2, 441 .keyword_stdcall, 442 .keyword_stdcall2, 443 .keyword_thiscall, 444 .keyword_thiscall2, 445 .keyword_vectorcall, 446 .keyword_vectorcall2, 447 .keyword_bit_int, 448 .keyword_c23_alignas, 449 .keyword_c23_alignof, 450 .keyword_c23_bool, 451 .keyword_c23_static_assert, 452 .keyword_c23_thread_local, 453 .keyword_constexpr, 454 .keyword_true, 455 .keyword_false, 456 .keyword_nullptr, 457 .keyword_typeof_unqual, 458 => return true, 459 else => return false, 460 } 461 } 462 463 /// Turn macro keywords into identifiers. 
464 /// `keyword_defined` is special since it should only turn into an identifier if 465 /// we are *not* in an #if or #elif expression 466 pub fn simplifyMacroKeywordExtra(id: *Id, defined_to_identifier: bool) void { 467 switch (id.*) { 468 .keyword_include, 469 .keyword_include_next, 470 .keyword_embed, 471 .keyword_define, 472 .keyword_undef, 473 .keyword_ifdef, 474 .keyword_ifndef, 475 .keyword_elif, 476 .keyword_elifdef, 477 .keyword_elifndef, 478 .keyword_endif, 479 .keyword_error, 480 .keyword_warning, 481 .keyword_pragma, 482 .keyword_line, 483 .keyword_va_args, 484 .keyword_va_opt, 485 => id.* = .identifier, 486 .keyword_defined => if (defined_to_identifier) { 487 id.* = .identifier; 488 }, 489 else => {}, 490 } 491 } 492 493 pub fn simplifyMacroKeyword(id: *Id) void { 494 simplifyMacroKeywordExtra(id, false); 495 } 496 497 pub fn lexeme(id: Id) ?[]const u8 { 498 return switch (id) { 499 .include_start, 500 .include_resume, 501 => unreachable, 502 503 .unterminated_comment, 504 .invalid, 505 .identifier, 506 .extended_identifier, 507 .string_literal, 508 .string_literal_utf_16, 509 .string_literal_utf_8, 510 .string_literal_utf_32, 511 .string_literal_wide, 512 .unterminated_string_literal, 513 .unterminated_char_literal, 514 .empty_char_literal, 515 .char_literal, 516 .char_literal_utf_8, 517 .char_literal_utf_16, 518 .char_literal_utf_32, 519 .char_literal_wide, 520 .macro_string, 521 .whitespace, 522 .pp_num, 523 .embed_byte, 524 .comment, 525 => null, 526 527 .zero => "0", 528 .one => "1", 529 530 .nl, 531 .eof, 532 .macro_param, 533 .macro_param_no_expand, 534 .stringify_param, 535 .stringify_va_args, 536 .macro_param_has_attribute, 537 .macro_param_has_c_attribute, 538 .macro_param_has_declspec_attribute, 539 .macro_param_has_warning, 540 .macro_param_has_feature, 541 .macro_param_has_extension, 542 .macro_param_has_builtin, 543 .macro_param_has_include, 544 .macro_param_has_include_next, 545 .macro_param_has_embed, 546 .macro_param_is_identifier, 
547 .macro_file, 548 .macro_line, 549 .macro_counter, 550 .macro_param_pragma_operator, 551 .placemarker, 552 => "", 553 .macro_ws => " ", 554 555 .macro_func => "__func__", 556 .macro_function => "__FUNCTION__", 557 .macro_pretty_func => "__PRETTY_FUNCTION__", 558 559 .bang => "!", 560 .bang_equal => "!=", 561 .pipe => "|", 562 .pipe_pipe => "||", 563 .pipe_equal => "|=", 564 .equal => "=", 565 .equal_equal => "==", 566 .l_paren => "(", 567 .r_paren => ")", 568 .l_brace => "{", 569 .r_brace => "}", 570 .l_bracket => "[", 571 .r_bracket => "]", 572 .period => ".", 573 .ellipsis => "...", 574 .caret => "^", 575 .caret_equal => "^=", 576 .plus => "+", 577 .plus_plus => "++", 578 .plus_equal => "+=", 579 .minus => "-", 580 .minus_minus => "--", 581 .minus_equal => "-=", 582 .asterisk => "*", 583 .asterisk_equal => "*=", 584 .percent => "%", 585 .percent_equal => "%=", 586 .arrow => "->", 587 .colon => ":", 588 .colon_colon => "::", 589 .semicolon => ";", 590 .slash => "/", 591 .slash_equal => "/=", 592 .comma => ",", 593 .ampersand => "&", 594 .ampersand_ampersand => "&&", 595 .ampersand_equal => "&=", 596 .question_mark => "?", 597 .angle_bracket_left => "<", 598 .angle_bracket_left_equal => "<=", 599 .angle_bracket_angle_bracket_left => "<<", 600 .angle_bracket_angle_bracket_left_equal => "<<=", 601 .angle_bracket_right => ">", 602 .angle_bracket_right_equal => ">=", 603 .angle_bracket_angle_bracket_right => ">>", 604 .angle_bracket_angle_bracket_right_equal => ">>=", 605 .tilde => "~", 606 .hash => "#", 607 .hash_hash => "##", 608 609 .keyword_auto => "auto", 610 .keyword_auto_type => "__auto_type", 611 .keyword_break => "break", 612 .keyword_case => "case", 613 .keyword_char => "char", 614 .keyword_const => "const", 615 .keyword_continue => "continue", 616 .keyword_default => "default", 617 .keyword_do => "do", 618 .keyword_double => "double", 619 .keyword_else => "else", 620 .keyword_enum => "enum", 621 .keyword_extern => "extern", 622 .keyword_float => "float", 
623 .keyword_for => "for", 624 .keyword_goto => "goto", 625 .keyword_if => "if", 626 .keyword_int => "int", 627 .keyword_long => "long", 628 .keyword_register => "register", 629 .keyword_return => "return", 630 .keyword_short => "short", 631 .keyword_signed => "signed", 632 .keyword_signed1 => "__signed", 633 .keyword_signed2 => "__signed__", 634 .keyword_sizeof => "sizeof", 635 .keyword_static => "static", 636 .keyword_struct => "struct", 637 .keyword_switch => "switch", 638 .keyword_typedef => "typedef", 639 .keyword_typeof => "typeof", 640 .keyword_union => "union", 641 .keyword_unsigned => "unsigned", 642 .keyword_void => "void", 643 .keyword_volatile => "volatile", 644 .keyword_while => "while", 645 .keyword_bool => "_Bool", 646 .keyword_complex => "_Complex", 647 .keyword_imaginary => "_Imaginary", 648 .keyword_inline => "inline", 649 .keyword_restrict => "restrict", 650 .keyword_alignas => "_Alignas", 651 .keyword_alignof => "_Alignof", 652 .keyword_atomic => "_Atomic", 653 .keyword_generic => "_Generic", 654 .keyword_noreturn => "_Noreturn", 655 .keyword_static_assert => "_Static_assert", 656 .keyword_thread_local => "_Thread_local", 657 .keyword_bit_int => "_BitInt", 658 .keyword_c23_alignas => "alignas", 659 .keyword_c23_alignof => "alignof", 660 .keyword_c23_bool => "bool", 661 .keyword_c23_static_assert => "static_assert", 662 .keyword_c23_thread_local => "thread_local", 663 .keyword_constexpr => "constexpr", 664 .keyword_true => "true", 665 .keyword_false => "false", 666 .keyword_nullptr => "nullptr", 667 .keyword_typeof_unqual => "typeof_unqual", 668 .keyword_include => "include", 669 .keyword_include_next => "include_next", 670 .keyword_embed => "embed", 671 .keyword_define => "define", 672 .keyword_defined => "defined", 673 .keyword_undef => "undef", 674 .keyword_ifdef => "ifdef", 675 .keyword_ifndef => "ifndef", 676 .keyword_elif => "elif", 677 .keyword_elifdef => "elifdef", 678 .keyword_elifndef => "elifndef", 679 .keyword_endif => "endif", 680 
.keyword_error => "error", 681 .keyword_warning => "warning", 682 .keyword_pragma => "pragma", 683 .keyword_line => "line", 684 .keyword_va_args => "__VA_ARGS__", 685 .keyword_va_opt => "__VA_OPT__", 686 .keyword_const1 => "__const", 687 .keyword_const2 => "__const__", 688 .keyword_inline1 => "__inline", 689 .keyword_inline2 => "__inline__", 690 .keyword_volatile1 => "__volatile", 691 .keyword_volatile2 => "__volatile__", 692 .keyword_restrict1 => "__restrict", 693 .keyword_restrict2 => "__restrict__", 694 .keyword_alignof1 => "__alignof", 695 .keyword_alignof2 => "__alignof__", 696 .keyword_typeof1 => "__typeof", 697 .keyword_typeof2 => "__typeof__", 698 .builtin_choose_expr => "__builtin_choose_expr", 699 .builtin_va_arg => "__builtin_va_arg", 700 .builtin_offsetof => "__builtin_offsetof", 701 .builtin_bitoffsetof => "__builtin_bitoffsetof", 702 .builtin_types_compatible_p => "__builtin_types_compatible_p", 703 .keyword_attribute1 => "__attribute", 704 .keyword_attribute2 => "__attribute__", 705 .keyword_extension => "__extension__", 706 .keyword_asm => "asm", 707 .keyword_asm1 => "__asm", 708 .keyword_asm2 => "__asm__", 709 .keyword_float128_1 => "_Float128", 710 .keyword_float128_2 => "__float128", 711 .keyword_int128 => "__int128", 712 .keyword_imag1 => "__imag", 713 .keyword_imag2 => "__imag__", 714 .keyword_real1 => "__real", 715 .keyword_real2 => "__real__", 716 .keyword_float16 => "_Float16", 717 .keyword_fp16 => "__fp16", 718 .keyword_declspec => "__declspec", 719 .keyword_int64 => "__int64", 720 .keyword_int64_2 => "_int64", 721 .keyword_int32 => "__int32", 722 .keyword_int32_2 => "_int32", 723 .keyword_int16 => "__int16", 724 .keyword_int16_2 => "_int16", 725 .keyword_int8 => "__int8", 726 .keyword_int8_2 => "_int8", 727 .keyword_stdcall => "__stdcall", 728 .keyword_stdcall2 => "_stdcall", 729 .keyword_thiscall => "__thiscall", 730 .keyword_thiscall2 => "_thiscall", 731 .keyword_vectorcall => "__vectorcall", 732 .keyword_vectorcall2 => "_vectorcall", 
733 }; 734 } 735 736 pub fn symbol(id: Id) []const u8 { 737 return switch (id) { 738 .macro_string => unreachable, 739 .invalid => "invalid bytes", 740 .identifier, 741 .extended_identifier, 742 .macro_func, 743 .macro_function, 744 .macro_pretty_func, 745 .builtin_choose_expr, 746 .builtin_va_arg, 747 .builtin_offsetof, 748 .builtin_bitoffsetof, 749 .builtin_types_compatible_p, 750 => "an identifier", 751 .string_literal, 752 .string_literal_utf_16, 753 .string_literal_utf_8, 754 .string_literal_utf_32, 755 .string_literal_wide, 756 .unterminated_string_literal, 757 => "a string literal", 758 .char_literal, 759 .char_literal_utf_8, 760 .char_literal_utf_16, 761 .char_literal_utf_32, 762 .char_literal_wide, 763 .unterminated_char_literal, 764 .empty_char_literal, 765 => "a character literal", 766 .pp_num, .embed_byte => "A number", 767 else => id.lexeme().?, 768 }; 769 } 770 771 /// tokens that can start an expression parsed by Preprocessor.expr 772 /// Note that eof, r_paren, and string literals cannot actually start a 773 /// preprocessor expression, but we include them here so that a nicer 774 /// error message can be generated by the parser. 
775 pub fn validPreprocessorExprStart(id: Id) bool { 776 return switch (id) { 777 .eof, 778 .r_paren, 779 .string_literal, 780 .string_literal_utf_16, 781 .string_literal_utf_8, 782 .string_literal_utf_32, 783 .string_literal_wide, 784 785 .char_literal, 786 .char_literal_utf_8, 787 .char_literal_utf_16, 788 .char_literal_utf_32, 789 .char_literal_wide, 790 .l_paren, 791 .plus, 792 .minus, 793 .tilde, 794 .bang, 795 .identifier, 796 .extended_identifier, 797 .keyword_defined, 798 .one, 799 .zero, 800 .pp_num, 801 .keyword_true, 802 .keyword_false, 803 => true, 804 else => false, 805 }; 806 } 807 808 pub fn allowsDigraphs(id: Id, langopts: LangOpts) bool { 809 return switch (id) { 810 .l_bracket, 811 .r_bracket, 812 .l_brace, 813 .r_brace, 814 .hash, 815 .hash_hash, 816 => langopts.hasDigraphs(), 817 else => false, 818 }; 819 } 820 821 pub fn canOpenGCCAsmStmt(id: Id) bool { 822 return switch (id) { 823 .keyword_volatile, .keyword_volatile1, .keyword_volatile2, .keyword_inline, .keyword_inline1, .keyword_inline2, .keyword_goto, .l_paren => true, 824 else => false, 825 }; 826 } 827 828 pub fn isStringLiteral(id: Id) bool { 829 return switch (id) { 830 .string_literal, .string_literal_utf_16, .string_literal_utf_8, .string_literal_utf_32, .string_literal_wide => true, 831 else => false, 832 }; 833 } 834 }; 835 836 /// double underscore and underscore + capital letter identifiers 837 /// belong to the implementation namespace, so we always convert them 838 /// to keywords. 
839 pub fn getTokenId(langopts: LangOpts, str: []const u8) Token.Id { 840 const kw = all_kws.get(str) orelse return .identifier; 841 const standard = langopts.standard; 842 return switch (kw) { 843 .keyword_inline => if (standard.isGNU() or standard.atLeast(.c99)) kw else .identifier, 844 .keyword_restrict => if (standard.atLeast(.c99)) kw else .identifier, 845 .keyword_typeof => if (standard.isGNU() or standard.atLeast(.c23)) kw else .identifier, 846 .keyword_asm => if (standard.isGNU()) kw else .identifier, 847 .keyword_declspec => if (langopts.declspec_attrs) kw else .identifier, 848 849 .keyword_c23_alignas, 850 .keyword_c23_alignof, 851 .keyword_c23_bool, 852 .keyword_c23_static_assert, 853 .keyword_c23_thread_local, 854 .keyword_constexpr, 855 .keyword_true, 856 .keyword_false, 857 .keyword_nullptr, 858 .keyword_typeof_unqual, 859 .keyword_elifdef, 860 .keyword_elifndef, 861 => if (standard.atLeast(.c23)) kw else .identifier, 862 863 .keyword_int64, 864 .keyword_int64_2, 865 .keyword_int32, 866 .keyword_int32_2, 867 .keyword_int16, 868 .keyword_int16_2, 869 .keyword_int8, 870 .keyword_int8_2, 871 .keyword_stdcall2, 872 .keyword_thiscall2, 873 .keyword_vectorcall2, 874 => if (langopts.ms_extensions) kw else .identifier, 875 else => kw, 876 }; 877 } 878 879 const all_kws = std.StaticStringMap(Id).initComptime(.{ 880 .{ "auto", .keyword_auto }, 881 .{ "break", .keyword_break }, 882 .{ "case", .keyword_case }, 883 .{ "char", .keyword_char }, 884 .{ "const", .keyword_const }, 885 .{ "continue", .keyword_continue }, 886 .{ "default", .keyword_default }, 887 .{ "do", .keyword_do }, 888 .{ "double", .keyword_double }, 889 .{ "else", .keyword_else }, 890 .{ "enum", .keyword_enum }, 891 .{ "extern", .keyword_extern }, 892 .{ "float", .keyword_float }, 893 .{ "for", .keyword_for }, 894 .{ "goto", .keyword_goto }, 895 .{ "if", .keyword_if }, 896 .{ "int", .keyword_int }, 897 .{ "long", .keyword_long }, 898 .{ "register", .keyword_register }, 899 .{ "return", 
.keyword_return }, 900 .{ "short", .keyword_short }, 901 .{ "signed", .keyword_signed }, 902 .{ "__signed", .keyword_signed1 }, 903 .{ "__signed__", .keyword_signed2 }, 904 .{ "sizeof", .keyword_sizeof }, 905 .{ "static", .keyword_static }, 906 .{ "struct", .keyword_struct }, 907 .{ "switch", .keyword_switch }, 908 .{ "typedef", .keyword_typedef }, 909 .{ "union", .keyword_union }, 910 .{ "unsigned", .keyword_unsigned }, 911 .{ "void", .keyword_void }, 912 .{ "volatile", .keyword_volatile }, 913 .{ "while", .keyword_while }, 914 .{ "__typeof__", .keyword_typeof2 }, 915 .{ "__typeof", .keyword_typeof1 }, 916 917 // ISO C99 918 .{ "_Bool", .keyword_bool }, 919 .{ "_Complex", .keyword_complex }, 920 .{ "_Imaginary", .keyword_imaginary }, 921 .{ "inline", .keyword_inline }, 922 .{ "restrict", .keyword_restrict }, 923 924 // ISO C11 925 .{ "_Alignas", .keyword_alignas }, 926 .{ "_Alignof", .keyword_alignof }, 927 .{ "_Atomic", .keyword_atomic }, 928 .{ "_Generic", .keyword_generic }, 929 .{ "_Noreturn", .keyword_noreturn }, 930 .{ "_Static_assert", .keyword_static_assert }, 931 .{ "_Thread_local", .keyword_thread_local }, 932 933 // ISO C23 934 .{ "_BitInt", .keyword_bit_int }, 935 .{ "alignas", .keyword_c23_alignas }, 936 .{ "alignof", .keyword_c23_alignof }, 937 .{ "bool", .keyword_c23_bool }, 938 .{ "static_assert", .keyword_c23_static_assert }, 939 .{ "thread_local", .keyword_c23_thread_local }, 940 .{ "constexpr", .keyword_constexpr }, 941 .{ "true", .keyword_true }, 942 .{ "false", .keyword_false }, 943 .{ "nullptr", .keyword_nullptr }, 944 .{ "typeof_unqual", .keyword_typeof_unqual }, 945 946 // Preprocessor directives 947 .{ "include", .keyword_include }, 948 .{ "include_next", .keyword_include_next }, 949 .{ "embed", .keyword_embed }, 950 .{ "define", .keyword_define }, 951 .{ "defined", .keyword_defined }, 952 .{ "undef", .keyword_undef }, 953 .{ "ifdef", .keyword_ifdef }, 954 .{ "ifndef", .keyword_ifndef }, 955 .{ "elif", .keyword_elif }, 956 .{ "elifdef", 
.keyword_elifdef }, 957 .{ "elifndef", .keyword_elifndef }, 958 .{ "endif", .keyword_endif }, 959 .{ "error", .keyword_error }, 960 .{ "warning", .keyword_warning }, 961 .{ "pragma", .keyword_pragma }, 962 .{ "line", .keyword_line }, 963 .{ "__VA_ARGS__", .keyword_va_args }, 964 .{ "__VA_OPT__", .keyword_va_opt }, 965 .{ "__func__", .macro_func }, 966 .{ "__FUNCTION__", .macro_function }, 967 .{ "__PRETTY_FUNCTION__", .macro_pretty_func }, 968 969 // gcc keywords 970 .{ "__auto_type", .keyword_auto_type }, 971 .{ "__const", .keyword_const1 }, 972 .{ "__const__", .keyword_const2 }, 973 .{ "__inline", .keyword_inline1 }, 974 .{ "__inline__", .keyword_inline2 }, 975 .{ "__volatile", .keyword_volatile1 }, 976 .{ "__volatile__", .keyword_volatile2 }, 977 .{ "__restrict", .keyword_restrict1 }, 978 .{ "__restrict__", .keyword_restrict2 }, 979 .{ "__alignof", .keyword_alignof1 }, 980 .{ "__alignof__", .keyword_alignof2 }, 981 .{ "typeof", .keyword_typeof }, 982 .{ "__attribute", .keyword_attribute1 }, 983 .{ "__attribute__", .keyword_attribute2 }, 984 .{ "__extension__", .keyword_extension }, 985 .{ "asm", .keyword_asm }, 986 .{ "__asm", .keyword_asm1 }, 987 .{ "__asm__", .keyword_asm2 }, 988 .{ "_Float128", .keyword_float128_1 }, 989 .{ "__float128", .keyword_float128_2 }, 990 .{ "__int128", .keyword_int128 }, 991 .{ "__imag", .keyword_imag1 }, 992 .{ "__imag__", .keyword_imag2 }, 993 .{ "__real", .keyword_real1 }, 994 .{ "__real__", .keyword_real2 }, 995 .{ "_Float16", .keyword_float16 }, 996 997 // clang keywords 998 .{ "__fp16", .keyword_fp16 }, 999 1000 // ms keywords 1001 .{ "__declspec", .keyword_declspec }, 1002 .{ "__int64", .keyword_int64 }, 1003 .{ "_int64", .keyword_int64_2 }, 1004 .{ "__int32", .keyword_int32 }, 1005 .{ "_int32", .keyword_int32_2 }, 1006 .{ "__int16", .keyword_int16 }, 1007 .{ "_int16", .keyword_int16_2 }, 1008 .{ "__int8", .keyword_int8 }, 1009 .{ "_int8", .keyword_int8_2 }, 1010 .{ "__stdcall", .keyword_stdcall }, 1011 .{ "_stdcall", 
.keyword_stdcall2 }, 1012 .{ "__thiscall", .keyword_thiscall }, 1013 .{ "_thiscall", .keyword_thiscall2 }, 1014 .{ "__vectorcall", .keyword_vectorcall }, 1015 .{ "_vectorcall", .keyword_vectorcall2 }, 1016 1017 // builtins that require special parsing 1018 .{ "__builtin_choose_expr", .builtin_choose_expr }, 1019 .{ "__builtin_va_arg", .builtin_va_arg }, 1020 .{ "__builtin_offsetof", .builtin_offsetof }, 1021 .{ "__builtin_bitoffsetof", .builtin_bitoffsetof }, 1022 .{ "__builtin_types_compatible_p", .builtin_types_compatible_p }, 1023 }); 1024 }; 1025 1026 const Tokenizer = @This(); 1027 1028 buf: []const u8, 1029 index: u32 = 0, 1030 source: Source.Id, 1031 langopts: LangOpts, 1032 line: u32 = 1, 1033 1034 pub fn next(self: *Tokenizer) Token { 1035 var state: enum { 1036 start, 1037 whitespace, 1038 u, 1039 u8, 1040 U, 1041 L, 1042 string_literal, 1043 char_literal_start, 1044 char_literal, 1045 char_escape_sequence, 1046 string_escape_sequence, 1047 identifier, 1048 extended_identifier, 1049 equal, 1050 bang, 1051 pipe, 1052 colon, 1053 percent, 1054 asterisk, 1055 plus, 1056 angle_bracket_left, 1057 angle_bracket_angle_bracket_left, 1058 angle_bracket_right, 1059 angle_bracket_angle_bracket_right, 1060 caret, 1061 period, 1062 period2, 1063 minus, 1064 slash, 1065 ampersand, 1066 hash, 1067 hash_digraph, 1068 hash_hash_digraph_partial, 1069 line_comment, 1070 multi_line_comment, 1071 multi_line_comment_asterisk, 1072 multi_line_comment_done, 1073 pp_num, 1074 pp_num_exponent, 1075 pp_num_digit_separator, 1076 } = .start; 1077 1078 var start = self.index; 1079 var id: Token.Id = .eof; 1080 1081 while (self.index < self.buf.len) : (self.index += 1) { 1082 const c = self.buf[self.index]; 1083 switch (state) { 1084 .start => switch (c) { 1085 '\n' => { 1086 id = .nl; 1087 self.index += 1; 1088 self.line += 1; 1089 break; 1090 }, 1091 '"' => { 1092 id = .string_literal; 1093 state = .string_literal; 1094 }, 1095 '\'' => { 1096 id = .char_literal; 1097 state = 
.char_literal_start; 1098 }, 1099 'u' => state = .u, 1100 'U' => state = .U, 1101 'L' => state = .L, 1102 'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier, 1103 '=' => state = .equal, 1104 '!' => state = .bang, 1105 '|' => state = .pipe, 1106 '(' => { 1107 id = .l_paren; 1108 self.index += 1; 1109 break; 1110 }, 1111 ')' => { 1112 id = .r_paren; 1113 self.index += 1; 1114 break; 1115 }, 1116 '[' => { 1117 id = .l_bracket; 1118 self.index += 1; 1119 break; 1120 }, 1121 ']' => { 1122 id = .r_bracket; 1123 self.index += 1; 1124 break; 1125 }, 1126 ';' => { 1127 id = .semicolon; 1128 self.index += 1; 1129 break; 1130 }, 1131 ',' => { 1132 id = .comma; 1133 self.index += 1; 1134 break; 1135 }, 1136 '?' => { 1137 id = .question_mark; 1138 self.index += 1; 1139 break; 1140 }, 1141 ':' => state = .colon, 1142 '%' => state = .percent, 1143 '*' => state = .asterisk, 1144 '+' => state = .plus, 1145 '<' => state = .angle_bracket_left, 1146 '>' => state = .angle_bracket_right, 1147 '^' => state = .caret, 1148 '{' => { 1149 id = .l_brace; 1150 self.index += 1; 1151 break; 1152 }, 1153 '}' => { 1154 id = .r_brace; 1155 self.index += 1; 1156 break; 1157 }, 1158 '~' => { 1159 id = .tilde; 1160 self.index += 1; 1161 break; 1162 }, 1163 '.' 
=> state = .period, 1164 '-' => state = .minus, 1165 '/' => state = .slash, 1166 '&' => state = .ampersand, 1167 '#' => state = .hash, 1168 '0'...'9' => state = .pp_num, 1169 '\t', '\x0B', '\x0C', ' ' => state = .whitespace, 1170 '$' => if (self.langopts.dollars_in_identifiers) { 1171 state = .extended_identifier; 1172 } else { 1173 id = .invalid; 1174 self.index += 1; 1175 break; 1176 }, 1177 0x1A => if (self.langopts.ms_extensions) { 1178 id = .eof; 1179 break; 1180 } else { 1181 id = .invalid; 1182 self.index += 1; 1183 break; 1184 }, 1185 0x80...0xFF => state = .extended_identifier, 1186 else => { 1187 id = .invalid; 1188 self.index += 1; 1189 break; 1190 }, 1191 }, 1192 .whitespace => switch (c) { 1193 '\t', '\x0B', '\x0C', ' ' => {}, 1194 else => { 1195 id = .whitespace; 1196 break; 1197 }, 1198 }, 1199 .u => switch (c) { 1200 '8' => { 1201 state = .u8; 1202 }, 1203 '\'' => { 1204 id = .char_literal_utf_16; 1205 state = .char_literal_start; 1206 }, 1207 '\"' => { 1208 id = .string_literal_utf_16; 1209 state = .string_literal; 1210 }, 1211 else => { 1212 self.index -= 1; 1213 state = .identifier; 1214 }, 1215 }, 1216 .u8 => switch (c) { 1217 '\"' => { 1218 id = .string_literal_utf_8; 1219 state = .string_literal; 1220 }, 1221 '\'' => { 1222 id = .char_literal_utf_8; 1223 state = .char_literal_start; 1224 }, 1225 else => { 1226 self.index -= 1; 1227 state = .identifier; 1228 }, 1229 }, 1230 .U => switch (c) { 1231 '\'' => { 1232 id = .char_literal_utf_32; 1233 state = .char_literal_start; 1234 }, 1235 '\"' => { 1236 id = .string_literal_utf_32; 1237 state = .string_literal; 1238 }, 1239 else => { 1240 self.index -= 1; 1241 state = .identifier; 1242 }, 1243 }, 1244 .L => switch (c) { 1245 '\'' => { 1246 id = .char_literal_wide; 1247 state = .char_literal_start; 1248 }, 1249 '\"' => { 1250 id = .string_literal_wide; 1251 state = .string_literal; 1252 }, 1253 else => { 1254 self.index -= 1; 1255 state = .identifier; 1256 }, 1257 }, 1258 .string_literal => switch 
(c) { 1259 '\\' => { 1260 state = .string_escape_sequence; 1261 }, 1262 '"' => { 1263 self.index += 1; 1264 break; 1265 }, 1266 '\n' => { 1267 id = .unterminated_string_literal; 1268 break; 1269 }, 1270 '\r' => unreachable, 1271 else => {}, 1272 }, 1273 .char_literal_start => switch (c) { 1274 '\\' => { 1275 state = .char_escape_sequence; 1276 }, 1277 '\'' => { 1278 id = .empty_char_literal; 1279 self.index += 1; 1280 break; 1281 }, 1282 '\n' => { 1283 id = .unterminated_char_literal; 1284 break; 1285 }, 1286 else => { 1287 state = .char_literal; 1288 }, 1289 }, 1290 .char_literal => switch (c) { 1291 '\\' => { 1292 state = .char_escape_sequence; 1293 }, 1294 '\'' => { 1295 self.index += 1; 1296 break; 1297 }, 1298 '\n' => { 1299 id = .unterminated_char_literal; 1300 break; 1301 }, 1302 else => {}, 1303 }, 1304 .char_escape_sequence => switch (c) { 1305 '\r', '\n' => { 1306 id = .unterminated_char_literal; 1307 break; 1308 }, 1309 else => state = .char_literal, 1310 }, 1311 .string_escape_sequence => switch (c) { 1312 '\r', '\n' => { 1313 id = .unterminated_string_literal; 1314 break; 1315 }, 1316 else => state = .string_literal, 1317 }, 1318 .identifier, .extended_identifier => switch (c) { 1319 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, 1320 '$' => if (self.langopts.dollars_in_identifiers) { 1321 state = .extended_identifier; 1322 } else { 1323 id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier; 1324 break; 1325 }, 1326 0x80...0xFF => state = .extended_identifier, 1327 else => { 1328 id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier; 1329 break; 1330 }, 1331 }, 1332 .equal => switch (c) { 1333 '=' => { 1334 id = .equal_equal; 1335 self.index += 1; 1336 break; 1337 }, 1338 else => { 1339 id = .equal; 1340 break; 1341 }, 1342 }, 1343 .bang => switch (c) { 1344 '=' => { 1345 id = .bang_equal; 1346 self.index += 1; 1347 break; 1348 
}, 1349 else => { 1350 id = .bang; 1351 break; 1352 }, 1353 }, 1354 .pipe => switch (c) { 1355 '=' => { 1356 id = .pipe_equal; 1357 self.index += 1; 1358 break; 1359 }, 1360 '|' => { 1361 id = .pipe_pipe; 1362 self.index += 1; 1363 break; 1364 }, 1365 else => { 1366 id = .pipe; 1367 break; 1368 }, 1369 }, 1370 .colon => switch (c) { 1371 '>' => { 1372 if (self.langopts.hasDigraphs()) { 1373 id = .r_bracket; 1374 self.index += 1; 1375 } else { 1376 id = .colon; 1377 } 1378 break; 1379 }, 1380 ':' => { 1381 if (self.langopts.standard.atLeast(.c23)) { 1382 id = .colon_colon; 1383 self.index += 1; 1384 break; 1385 } else { 1386 id = .colon; 1387 break; 1388 } 1389 }, 1390 else => { 1391 id = .colon; 1392 break; 1393 }, 1394 }, 1395 .percent => switch (c) { 1396 '=' => { 1397 id = .percent_equal; 1398 self.index += 1; 1399 break; 1400 }, 1401 '>' => { 1402 if (self.langopts.hasDigraphs()) { 1403 id = .r_brace; 1404 self.index += 1; 1405 } else { 1406 id = .percent; 1407 } 1408 break; 1409 }, 1410 ':' => { 1411 if (self.langopts.hasDigraphs()) { 1412 state = .hash_digraph; 1413 } else { 1414 id = .percent; 1415 break; 1416 } 1417 }, 1418 else => { 1419 id = .percent; 1420 break; 1421 }, 1422 }, 1423 .asterisk => switch (c) { 1424 '=' => { 1425 id = .asterisk_equal; 1426 self.index += 1; 1427 break; 1428 }, 1429 else => { 1430 id = .asterisk; 1431 break; 1432 }, 1433 }, 1434 .plus => switch (c) { 1435 '=' => { 1436 id = .plus_equal; 1437 self.index += 1; 1438 break; 1439 }, 1440 '+' => { 1441 id = .plus_plus; 1442 self.index += 1; 1443 break; 1444 }, 1445 else => { 1446 id = .plus; 1447 break; 1448 }, 1449 }, 1450 .angle_bracket_left => switch (c) { 1451 '<' => state = .angle_bracket_angle_bracket_left, 1452 '=' => { 1453 id = .angle_bracket_left_equal; 1454 self.index += 1; 1455 break; 1456 }, 1457 ':' => { 1458 if (self.langopts.hasDigraphs()) { 1459 id = .l_bracket; 1460 self.index += 1; 1461 } else { 1462 id = .angle_bracket_left; 1463 } 1464 break; 1465 }, 1466 '%' 
=> { 1467 if (self.langopts.hasDigraphs()) { 1468 id = .l_brace; 1469 self.index += 1; 1470 } else { 1471 id = .angle_bracket_left; 1472 } 1473 break; 1474 }, 1475 else => { 1476 id = .angle_bracket_left; 1477 break; 1478 }, 1479 }, 1480 .angle_bracket_angle_bracket_left => switch (c) { 1481 '=' => { 1482 id = .angle_bracket_angle_bracket_left_equal; 1483 self.index += 1; 1484 break; 1485 }, 1486 else => { 1487 id = .angle_bracket_angle_bracket_left; 1488 break; 1489 }, 1490 }, 1491 .angle_bracket_right => switch (c) { 1492 '>' => state = .angle_bracket_angle_bracket_right, 1493 '=' => { 1494 id = .angle_bracket_right_equal; 1495 self.index += 1; 1496 break; 1497 }, 1498 else => { 1499 id = .angle_bracket_right; 1500 break; 1501 }, 1502 }, 1503 .angle_bracket_angle_bracket_right => switch (c) { 1504 '=' => { 1505 id = .angle_bracket_angle_bracket_right_equal; 1506 self.index += 1; 1507 break; 1508 }, 1509 else => { 1510 id = .angle_bracket_angle_bracket_right; 1511 break; 1512 }, 1513 }, 1514 .caret => switch (c) { 1515 '=' => { 1516 id = .caret_equal; 1517 self.index += 1; 1518 break; 1519 }, 1520 else => { 1521 id = .caret; 1522 break; 1523 }, 1524 }, 1525 .period => switch (c) { 1526 '.' => state = .period2, 1527 '0'...'9' => state = .pp_num, 1528 else => { 1529 id = .period; 1530 break; 1531 }, 1532 }, 1533 .period2 => switch (c) { 1534 '.' 
=> { 1535 id = .ellipsis; 1536 self.index += 1; 1537 break; 1538 }, 1539 else => { 1540 id = .period; 1541 self.index -= 1; 1542 break; 1543 }, 1544 }, 1545 .minus => switch (c) { 1546 '>' => { 1547 id = .arrow; 1548 self.index += 1; 1549 break; 1550 }, 1551 '=' => { 1552 id = .minus_equal; 1553 self.index += 1; 1554 break; 1555 }, 1556 '-' => { 1557 id = .minus_minus; 1558 self.index += 1; 1559 break; 1560 }, 1561 else => { 1562 id = .minus; 1563 break; 1564 }, 1565 }, 1566 .ampersand => switch (c) { 1567 '&' => { 1568 id = .ampersand_ampersand; 1569 self.index += 1; 1570 break; 1571 }, 1572 '=' => { 1573 id = .ampersand_equal; 1574 self.index += 1; 1575 break; 1576 }, 1577 else => { 1578 id = .ampersand; 1579 break; 1580 }, 1581 }, 1582 .hash => switch (c) { 1583 '#' => { 1584 id = .hash_hash; 1585 self.index += 1; 1586 break; 1587 }, 1588 else => { 1589 id = .hash; 1590 break; 1591 }, 1592 }, 1593 .hash_digraph => switch (c) { 1594 '%' => state = .hash_hash_digraph_partial, 1595 else => { 1596 id = .hash; 1597 break; 1598 }, 1599 }, 1600 .hash_hash_digraph_partial => switch (c) { 1601 ':' => { 1602 id = .hash_hash; 1603 self.index += 1; 1604 break; 1605 }, 1606 else => { 1607 id = .hash; 1608 self.index -= 1; // re-tokenize the percent 1609 break; 1610 }, 1611 }, 1612 .slash => switch (c) { 1613 '/' => state = .line_comment, 1614 '*' => state = .multi_line_comment, 1615 '=' => { 1616 id = .slash_equal; 1617 self.index += 1; 1618 break; 1619 }, 1620 else => { 1621 id = .slash; 1622 break; 1623 }, 1624 }, 1625 .line_comment => switch (c) { 1626 '\n' => { 1627 if (self.langopts.preserve_comments) { 1628 id = .comment; 1629 break; 1630 } 1631 self.index -= 1; 1632 state = .start; 1633 }, 1634 else => {}, 1635 }, 1636 .multi_line_comment => switch (c) { 1637 '*' => state = .multi_line_comment_asterisk, 1638 '\n' => self.line += 1, 1639 else => {}, 1640 }, 1641 .multi_line_comment_asterisk => switch (c) { 1642 '/' => { 1643 if (self.langopts.preserve_comments) { 1644 
self.index += 1; 1645 id = .comment; 1646 break; 1647 } 1648 state = .multi_line_comment_done; 1649 }, 1650 '\n' => { 1651 self.line += 1; 1652 state = .multi_line_comment; 1653 }, 1654 '*' => {}, 1655 else => state = .multi_line_comment, 1656 }, 1657 .multi_line_comment_done => switch (c) { 1658 '\n' => { 1659 start = self.index; 1660 id = .nl; 1661 self.index += 1; 1662 self.line += 1; 1663 break; 1664 }, 1665 '\r' => unreachable, 1666 '\t', '\x0B', '\x0C', ' ' => { 1667 start = self.index; 1668 state = .whitespace; 1669 }, 1670 else => { 1671 id = .whitespace; 1672 break; 1673 }, 1674 }, 1675 .pp_num => switch (c) { 1676 'a'...'d', 1677 'A'...'D', 1678 'f'...'o', 1679 'F'...'O', 1680 'q'...'z', 1681 'Q'...'Z', 1682 '0'...'9', 1683 '_', 1684 '.', 1685 => {}, 1686 'e', 'E', 'p', 'P' => state = .pp_num_exponent, 1687 '\'' => if (self.langopts.standard.atLeast(.c23)) { 1688 state = .pp_num_digit_separator; 1689 } else { 1690 id = .pp_num; 1691 break; 1692 }, 1693 else => { 1694 id = .pp_num; 1695 break; 1696 }, 1697 }, 1698 .pp_num_digit_separator => switch (c) { 1699 'a'...'d', 1700 'A'...'D', 1701 'f'...'o', 1702 'F'...'O', 1703 'q'...'z', 1704 'Q'...'Z', 1705 '0'...'9', 1706 '_', 1707 => state = .pp_num, 1708 else => { 1709 self.index -= 1; 1710 id = .pp_num; 1711 break; 1712 }, 1713 }, 1714 .pp_num_exponent => switch (c) { 1715 'a'...'o', 1716 'q'...'z', 1717 'A'...'O', 1718 'Q'...'Z', 1719 '0'...'9', 1720 '_', 1721 '.', 1722 '+', 1723 '-', 1724 => state = .pp_num, 1725 'p', 'P' => {}, 1726 else => { 1727 id = .pp_num; 1728 break; 1729 }, 1730 }, 1731 } 1732 } else if (self.index == self.buf.len) { 1733 switch (state) { 1734 .start, .line_comment => {}, 1735 .u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.langopts, self.buf[start..self.index]), 1736 .extended_identifier => id = .extended_identifier, 1737 1738 .period2 => { 1739 self.index -= 1; 1740 id = .period; 1741 }, 1742 1743 .multi_line_comment, 1744 .multi_line_comment_asterisk, 1745 => id = 
.unterminated_comment, 1746 1747 .char_escape_sequence, .char_literal, .char_literal_start => id = .unterminated_char_literal, 1748 .string_escape_sequence, .string_literal => id = .unterminated_string_literal, 1749 1750 .whitespace => id = .whitespace, 1751 .multi_line_comment_done => id = .whitespace, 1752 1753 .equal => id = .equal, 1754 .bang => id = .bang, 1755 .minus => id = .minus, 1756 .slash => id = .slash, 1757 .ampersand => id = .ampersand, 1758 .hash => id = .hash, 1759 .period => id = .period, 1760 .pipe => id = .pipe, 1761 .angle_bracket_angle_bracket_right => id = .angle_bracket_angle_bracket_right, 1762 .angle_bracket_right => id = .angle_bracket_right, 1763 .angle_bracket_angle_bracket_left => id = .angle_bracket_angle_bracket_left, 1764 .angle_bracket_left => id = .angle_bracket_left, 1765 .plus => id = .plus, 1766 .colon => id = .colon, 1767 .percent => id = .percent, 1768 .caret => id = .caret, 1769 .asterisk => id = .asterisk, 1770 .hash_digraph => id = .hash, 1771 .hash_hash_digraph_partial => { 1772 id = .hash; 1773 self.index -= 1; // re-tokenize the percent 1774 }, 1775 .pp_num, .pp_num_exponent, .pp_num_digit_separator => id = .pp_num, 1776 } 1777 } 1778 1779 return .{ 1780 .id = id, 1781 .start = start, 1782 .end = self.index, 1783 .line = self.line, 1784 .source = self.source, 1785 }; 1786 } 1787 1788 pub fn nextNoWS(self: *Tokenizer) Token { 1789 var tok = self.next(); 1790 while (tok.id == .whitespace or tok.id == .comment) tok = self.next(); 1791 return tok; 1792 } 1793 1794 pub fn nextNoWSComments(self: *Tokenizer) Token { 1795 var tok = self.next(); 1796 while (tok.id == .whitespace) tok = self.next(); 1797 return tok; 1798 } 1799 1800 /// Try to tokenize a '::' even if not supported by the current language standard. 
/// Behaves like `nextNoWS`, except that a `:` token immediately followed by a
/// second `:` is merged into a single `colon_colon` token. Both characters are
/// consumed and the returned token's `end` covers the whole `::`.
pub fn colonColon(self: *Tokenizer) Token {
    var tok = self.nextNoWS();
    if (tok.id == .colon and self.index < self.buf.len and self.buf[self.index] == ':') {
        self.index += 1;
        tok.id = .colon_colon;
        // Keep the token span in sync with the extra ':' that was just consumed.
        tok.end = self.index;
    }
    return tok;
}

test "colonColon" {
    var comp = Compilation.init(std.testing.allocator, std.fs.cwd());
    defer comp.deinit();

    const source = try comp.addSourceFromBuffer("path", "a::b");
    var tokenizer: Tokenizer = .{
        .buf = source.buf,
        .source = source.id,
        .langopts = comp.langopts,
    };

    try std.testing.expectEqual(Token.Id.identifier, tokenizer.colonColon().id);

    // The merged token must span both ':' characters.
    const tok = tokenizer.colonColon();
    try std.testing.expectEqual(Token.Id.colon_colon, tok.id);
    try std.testing.expectEqual(@as(u32, 2), tok.end - tok.start);

    try std.testing.expectEqual(Token.Id.identifier, tokenizer.colonColon().id);
    try std.testing.expectEqual(Token.Id.eof, tokenizer.colonColon().id);
}

test "operators" {
    try expectTokens(
        \\ ! != | || |= = ==
        \\ ( ) { } [ ] . .. ...
        \\ ^ ^= + ++ += - -- -=
        \\ * *= % %= -> : ; / /=
        \\ , & && &= ? < <= <<
        \\ <<= > >= >> >>= ~ # ##
        \\
    , &.{
        .bang,
        .bang_equal,
        .pipe,
        .pipe_pipe,
        .pipe_equal,
        .equal,
        .equal_equal,
        .nl,
        .l_paren,
        .r_paren,
        .l_brace,
        .r_brace,
        .l_bracket,
        .r_bracket,
        .period,
        .period,
        .period,
        .ellipsis,
        .nl,
        .caret,
        .caret_equal,
        .plus,
        .plus_plus,
        .plus_equal,
        .minus,
        .minus_minus,
        .minus_equal,
        .nl,
        .asterisk,
        .asterisk_equal,
        .percent,
        .percent_equal,
        .arrow,
        .colon,
        .semicolon,
        .slash,
        .slash_equal,
        .nl,
        .comma,
        .ampersand,
        .ampersand_ampersand,
        .ampersand_equal,
        .question_mark,
        .angle_bracket_left,
        .angle_bracket_left_equal,
        .angle_bracket_angle_bracket_left,
        .nl,
        .angle_bracket_angle_bracket_left_equal,
        .angle_bracket_right,
        .angle_bracket_right_equal,
        .angle_bracket_angle_bracket_right,
        .angle_bracket_angle_bracket_right_equal,
        .tilde,
        .hash,
        .hash_hash,
        .nl,
    });
}

test "keywords" {
    try expectTokens(
        \\auto __auto_type break case char const continue default do
        \\double else enum extern float for goto if int
        \\long register return short signed sizeof static
        \\struct switch typedef union unsigned void volatile
        \\while _Bool _Complex _Imaginary inline restrict _Alignas
        \\_Alignof _Atomic _Generic _Noreturn _Static_assert _Thread_local
        \\__attribute __attribute__
        \\
    , &.{
        .keyword_auto,
        .keyword_auto_type,
        .keyword_break,
        .keyword_case,
        .keyword_char,
        .keyword_const,
        .keyword_continue,
        .keyword_default,
        .keyword_do,
        .nl,
        .keyword_double,
        .keyword_else,
        .keyword_enum,
        .keyword_extern,
        .keyword_float,
        .keyword_for,
        .keyword_goto,
        .keyword_if,
        .keyword_int,
        .nl,
        .keyword_long,
        .keyword_register,
        .keyword_return,
        .keyword_short,
        .keyword_signed,
        .keyword_sizeof,
        .keyword_static,
        .nl,
        .keyword_struct,
        .keyword_switch,
        .keyword_typedef,
        .keyword_union,
        .keyword_unsigned,
        .keyword_void,
        .keyword_volatile,
        .nl,
        .keyword_while,
        .keyword_bool,
        .keyword_complex,
        .keyword_imaginary,
        .keyword_inline,
        .keyword_restrict,
        .keyword_alignas,
        .nl,
        .keyword_alignof,
        .keyword_atomic,
        .keyword_generic,
        .keyword_noreturn,
        .keyword_static_assert,
        .keyword_thread_local,
        .nl,
        .keyword_attribute1,
        .keyword_attribute2,
        .nl,
    });
}

test "preprocessor keywords" {
    try expectTokens(
        \\#include
        \\#include_next
        \\#embed
        \\#define
        \\#ifdef
        \\#ifndef
        \\#error
        \\#pragma
        \\
    , &.{
        .hash,
        .keyword_include,
        .nl,
        .hash,
        .keyword_include_next,
        .nl,
        .hash,
        .keyword_embed,
        .nl,
        .hash,
        .keyword_define,
        .nl,
        .hash,
        .keyword_ifdef,
        .nl,
        .hash,
        .keyword_ifndef,
        .nl,
        .hash,
        .keyword_error,
        .nl,
        .hash,
        .keyword_pragma,
        .nl,
    });
}

test "line continuation" {
    // A backslash directly before the newline joins the two physical lines,
    // so no `.nl` token is expected at those positions.
    try expectTokens(
        \\#define foo \
        \\ bar
        \\"foo\
        \\ bar"
        \\#define "foo"
        \\ "bar"
        \\#define "foo" \
        \\ "bar"
    , &.{
        .hash,
        .keyword_define,
        .identifier,
        .identifier,
        .nl,
        .string_literal,
        .nl,
        .hash,
        .keyword_define,
        .string_literal,
        .nl,
        .string_literal,
        .nl,
        .hash,
        .keyword_define,
        .string_literal,
        .string_literal,
    });
}

test "string prefix" {
    try expectTokens(
        \\"foo"
        \\u"foo"
        \\u8"foo"
        \\U"foo"
        \\L"foo"
        \\'foo'
        \\u8'A'
        \\u'foo'
        \\U'foo'
        \\L'foo'
        \\
    , &.{
        .string_literal,
        .nl,
        .string_literal_utf_16,
        .nl,
        .string_literal_utf_8,
        .nl,
        .string_literal_utf_32,
        .nl,
        .string_literal_wide,
        .nl,
        .char_literal,
        .nl,
        .char_literal_utf_8,
        .nl,
        .char_literal_utf_16,
        .nl,
        .char_literal_utf_32,
        .nl,
        .char_literal_wide,
        .nl,
    });
}

test "num suffixes" {
    // Every numeric spelling, whatever its suffix, is a single `pp_num` at this stage.
    try expectTokens(
        \\ 1.0f 1.0L 1.0 .0 1. 0x1p0f 0X1p0
        \\ 0l 0lu 0ll 0llu 0
        \\ 1u 1ul 1ull 1
        \\ 1.0i 1.0I
        \\ 1.0if 1.0If 1.0fi 1.0fI
        \\ 1.0il 1.0Il 1.0li 1.0lI
        \\
    , &.{
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
    });
}

test "comments" {
    try expectTokens(
        \\//foo
        \\#foo
    , &.{
        .nl,
        .hash,
        .identifier,
    });
}

test "extended identifiers" {
    try expectTokens("𝓪𝓻𝓸𝓬𝓬", &.{.extended_identifier});
    try expectTokens("u𝓪𝓻𝓸𝓬𝓬", &.{.extended_identifier});
    try expectTokens("u8𝓪𝓻𝓸𝓬𝓬", &.{.extended_identifier});
    try expectTokens("U𝓪𝓻𝓸𝓬𝓬", &.{.extended_identifier});
    try expectTokens("L𝓪𝓻𝓸𝓬𝓬", &.{.extended_identifier});
    try expectTokens("1™", &.{ .pp_num, .extended_identifier });
    try expectTokens("1.™", &.{ .pp_num, .extended_identifier });
    try expectTokens("..™", &.{ .period, .period, .extended_identifier });
    try expectTokens("0™", &.{ .pp_num, .extended_identifier });
    try expectTokens("0b\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("0b0\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("01\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("010\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("0x\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("0x0\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("\"\\0\u{E0000}\"", &.{.string_literal});
    try expectTokens("\"\\x\u{E0000}\"", &.{.string_literal});
    try expectTokens("\"\\u\u{E0000}\"", &.{.string_literal});
    try expectTokens("1e\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("1e1\u{E0000}", &.{ .pp_num, .extended_identifier });
}

test "digraphs" {
    try expectTokens("%:<::><%%>%:%:", &.{ .hash, .l_bracket, .r_bracket, .l_brace, .r_brace, .hash_hash });
    try expectTokens("\"%:<::><%%>%:%:\"", &.{.string_literal});
    try expectTokens("%:%42 %:%", &.{ .hash, .percent, .pp_num, .hash, .percent });
}

test "C23 keywords" {
    try expectTokensExtra("true false alignas alignof bool static_assert thread_local nullptr typeof_unqual", &.{
        .keyword_true,
        .keyword_false,
        .keyword_c23_alignas,
        .keyword_c23_alignof,
        .keyword_c23_bool,
        .keyword_c23_static_assert,
        .keyword_c23_thread_local,
        .keyword_nullptr,
        .keyword_typeof_unqual,
    }, .c23);
}

test "Tokenizer fuzz test" {
    var comp = Compilation.init(std.testing.allocator, std.fs.cwd());
    defer comp.deinit();

    const input_bytes = std.testing.fuzzInput(.{});
    if (input_bytes.len == 0) return;

    const source = try comp.addSourceFromBuffer("fuzz.c", input_bytes);

    var tokenizer: Tokenizer = .{
        .buf = source.buf,
        .source = source.id,
        .langopts = comp.langopts,
    };
    while (true) {
        const prev_index = tokenizer.index;
        const tok = tokenizer.next();
        if (tok.id == .eof) break;
        try std.testing.expect(prev_index < tokenizer.index); // ensure that the tokenizer always makes progress
    }
}

/// Tokenizes `contents` and checks that the ids of its non-whitespace tokens
/// match `expected_tokens` one for one, followed by `.eof`.
/// When `standard` is non-null it overrides the compilation's language standard
/// before tokenizing.
/// On the first mismatch both ids are printed and `error.TokensDoNotEqual` is returned.
fn expectTokensExtra(contents: []const u8, expected_tokens: []const Token.Id, standard: ?LangOpts.Standard) !void {
    var comp = Compilation.init(std.testing.allocator, std.fs.cwd());
    defer comp.deinit();
    if (standard) |provided| comp.langopts.standard = provided;

    const source = try comp.addSourceFromBuffer("path", contents);
    var tokenizer: Tokenizer = .{
        .buf = source.buf,
        .source = source.id,
        .langopts = comp.langopts,
    };

    for (expected_tokens) |expected_token_id| {
        // Whitespace tokens are not part of the expectation list.
        const token = tokenizer.nextNoWSComments();
        if (token.id != expected_token_id) {
            std.debug.print("expected {s}, found {s}\n", .{ @tagName(expected_token_id), @tagName(token.id) });
            return error.TokensDoNotEqual;
        }
    }

    // Trailing whitespace is skipped here as well, so only real leftover tokens fail.
    try std.testing.expectEqual(Token.Id.eof, tokenizer.nextNoWSComments().id);
}

/// Same as `expectTokensExtra`, using the compilation's default language standard.
fn expectTokens(contents: []const u8, expected_tokens: []const Token.Id) !void {
    return expectTokensExtra(contents, expected_tokens, null);
}