lex.zig (49189B) - Raw
//! Expects to be run after the C preprocessor and after `removeComments`.
//! This means that the lexer assumes that:
//! - Splices ('\' at the end of a line) have been handled/collapsed.
//! - Preprocessor directives and macros have been expanded (any remaining should be skipped with the exception of `#pragma code_page`).
//! - All comments have been removed.

const std = @import("std");
const ErrorDetails = @import("errors.zig").ErrorDetails;
const columnWidth = @import("literals.zig").columnWidth;
const code_pages = @import("code_pages.zig");
const SupportedCodePage = code_pages.SupportedCodePage;
const SourceMappings = @import("source_mapping.zig").SourceMappings;
const isNonAsciiDigit = @import("utils.zig").isNonAsciiDigit;

const dumpTokensDuringTests = false;

pub const default_max_string_literal_codepoints = 4097;

/// A lexed token: an `Id` plus the byte range `[start, end)` of the lexed
/// buffer that it covers (see `slice`).
pub const Token = struct {
    id: Id,
    /// Byte offset of the first byte of the token.
    start: usize,
    /// Byte offset one past the last byte of the token.
    end: usize,
    line_number: usize,

    pub const Id = enum {
        literal,
        number,
        quoted_ascii_string,
        quoted_wide_string,
        operator,
        begin,
        end,
        comma,
        open_paren,
        close_paren,
        /// This Id is only used for errors, the Lexer will never return one
        /// of these from a `next` call.
        preprocessor_command,
        invalid,
        eof,

        /// Returns a human-readable name for use in error messages.
        /// Must not be called with `.invalid`.
        pub fn nameForErrorDisplay(self: Id) []const u8 {
            return switch (self) {
                .literal => "<literal>",
                .number => "<number>",
                .quoted_ascii_string => "<quoted ascii string>",
                .quoted_wide_string => "<quoted wide string>",
                .operator => "<operator>",
                .begin => "<'{' or BEGIN>",
                .end => "<'}' or END>",
                .comma => ",",
                .open_paren => "(",
                .close_paren => ")",
                .preprocessor_command => "<preprocessor command>",
                .invalid => unreachable,
                .eof => "<eof>",
            };
        }
    };

    /// Returns the bytes of `buffer` covered by this token.
    pub fn slice(self: Token, buffer: []const u8) []const u8 {
        return buffer[self.start..self.end];
    }

    /// Returns 0-based column
    ///
    /// The column is the sum of `columnWidth` over every byte between the
    /// start of the line and the start of the token. If `maybe_line_start`
    /// is null, the line start is found with `getLineStartForColumnCalc`.
    pub fn calculateColumn(token: Token, source: []const u8, tab_columns: usize, maybe_line_start: ?usize) usize {
        const line_start = maybe_line_start orelse token.getLineStartForColumnCalc(source);

        var i: usize = line_start;
        var column: usize = 0;
        while (i < token.start) : (i += 1) {
            column += columnWidth(column, source[i], tab_columns);
        }
        return column;
    }

    // TODO: More testing is needed to determine if this can be merged with getLineStartForErrorDisplay
    // (the TODO in currentIndexFormsLineEndingPair should be taken into account as well)
    /// Returns the index just past the closest '\n' that precedes the token
    /// (clamped to `source.len - 1`), or 0 if no '\n' precedes the token.
    pub fn getLineStartForColumnCalc(token: Token, source: []const u8) usize {
        // Walk backwards, starting with the byte before the token.
        var index = token.start;
        while (index > 0) : (index -= 1) {
            if (source[index - 1] == '\n') return @min(source.len - 1, index);
        }
        return 0;
    }

    /// Like `getLineStartForColumnCalc`, but '\r' also counts as a line ending.
    pub fn getLineStartForErrorDisplay(token: Token, source: []const u8) usize {
        // Walk backwards, starting with the byte before the token.
        var index = token.start;
        while (index > 0) : (index -= 1) {
            const prev = source[index - 1];
            if (prev == '\r' or prev == '\n') return @min(source.len - 1, index);
        }
        return 0;
    }

    /// Returns the slice of `source` from the line start up to (but not
    /// including) the next '\r' or '\n', or the end of `source`.
    /// If `maybe_line_start` is null, the line start is found with
    /// `getLineStartForErrorDisplay`.
    pub fn getLineForErrorDisplay(token: Token, source: []const u8, maybe_line_start: ?usize) []const u8 {
        const line_start = maybe_line_start orelse token.getLineStartForErrorDisplay(source);

        var line_end = line_start;
        while (line_end < source.len and source[line_end] != '\r' and source[line_end] != '\n') : (line_end += 1) {}
        return source[line_start..line_end];
    }

    pub fn isStringLiteral(token: Token) bool {
        return token.id == .quoted_ascii_string or token.id == .quoted_wide_string;
    }
};

/// Tracks the current line number of `buffer`, counting `\r\n` and `\n\r`
/// pairs as a single line ending.
pub const LineHandler = struct {
    line_number: usize = 1,
    buffer: []const u8,
    /// Index of the most recent line ending that incremented `line_number`,
    /// or null if there is none or it was already consumed as the first
    /// half of a line ending pair.
    last_line_ending_index: ?usize = null,

    /// Like incrementLineNumber but checks that the current char is a line ending first.
    /// Returns the new line number if it was incremented, null otherwise.
    pub fn maybeIncrementLineNumber(self: *LineHandler, cur_index: usize) ?usize {
        const c = self.buffer[cur_index];
        if (c == '\r' or c == '\n') {
            return self.incrementLineNumber(cur_index);
        }
        return null;
    }

    /// Increments line_number appropriately (handling line ending pairs)
    /// and returns the new line number if it was incremented, or null otherwise.
    pub fn incrementLineNumber(self: *LineHandler, cur_index: usize) ?usize {
        if (self.currentIndexFormsLineEndingPair(cur_index)) {
            // Second half of a pair: already counted, and it cannot pair again.
            self.last_line_ending_index = null;
            return null;
        } else {
            self.line_number += 1;
            self.last_line_ending_index = cur_index;
            return self.line_number;
        }
    }

    /// \r\n and \n\r pairs are treated as a single line ending (but not \r\r \n\n)
    /// expects cur_index and last_line_ending_index (if non-null) to contain line endings
    ///
    /// TODO: This is not really how the Win32 RC compiler handles line endings. Instead, it
    ///       seems to drop all carriage returns during preprocessing and then replace all
    ///       remaining line endings with well-formed CRLF pairs (e.g. `<CR>a<CR>b<LF>c` becomes `ab<CR><LF>c`).
    ///       Handling this the same as the Win32 RC compiler would need control over the preprocessor,
    ///       since Clang converts unpaired <CR> into unpaired <LF>.
    pub fn currentIndexFormsLineEndingPair(self: *const LineHandler, cur_index: usize) bool {
        const last_index = self.last_line_ending_index orelse return false;

        // must immediately precede the current index, we know cur_index must
        // be >= 1 since last_line_ending_index is non-null (so if the subtraction
        // overflows it is a bug at the callsite of this function).
        if (last_index != cur_index - 1) return false;

        const cur_line_ending = self.buffer[cur_index];
        const last_line_ending = self.buffer[last_index];

        // sanity check
        std.debug.assert(cur_line_ending == '\r' or cur_line_ending == '\n');
        std.debug.assert(last_line_ending == '\r' or last_line_ending == '\n');

        // can't be \n\n or \r\r
        if (last_line_ending == cur_line_ending) return false;

        return true;
    }
};

/// All errors that the lexer can return. When one is returned from a `Lexer`
/// method, `Lexer.error_context_token` describes where it happened.
pub const LexError = error{
    UnfinishedStringLiteral,
    StringLiteralTooLong,
    InvalidNumberWithExponent,
    InvalidDigitCharacterInNumberLiteral,
    IllegalByte,
    IllegalByteOutsideStringLiterals,
    IllegalCodepointOutsideStringLiterals,
    IllegalByteOrderMark,
    IllegalPrivateUseCharacter,
    FoundCStyleEscapedQuote,
    CodePagePragmaMissingLeftParen,
    CodePagePragmaMissingRightParen,
    /// Can be caught and ignored
    CodePagePragmaInvalidCodePage,
    CodePagePragmaNotInteger,
    CodePagePragmaOverflow,
    CodePagePragmaUnsupportedCodePage,
    /// Can be caught and ignored
    CodePagePragmaInIncludedFile,
};

pub const Lexer = struct {
    const Self = @This();

    buffer: []const u8,
    /// Byte offset into `buffer` of the next codepoint to lex.
    index: usize,
    line_handler: LineHandler,
    at_start_of_line: bool = true,
    /// Set whenever a `LexError` is returned; consumed by `getErrorDetails`.
    error_context_token: ?Token = null,
    current_code_page: SupportedCodePage,
    default_code_page: SupportedCodePage,
    source_mappings: ?*SourceMappings,
    max_string_literal_codepoints: u15,
    /// Needed to determine whether or not the output code page should
    /// be set in the parser.
    seen_pragma_code_pages: u2 = 0,
    last_pragma_code_page_token: ?Token = null,

    pub const Error = LexError;

    pub const LexerOptions = struct {
        default_code_page: SupportedCodePage = .windows1252,
        source_mappings: ?*SourceMappings = null,
        max_string_literal_codepoints: u15 = default_max_string_literal_codepoints,
    };

    /// Creates a lexer positioned at the start of `buffer`. The current code
    /// page starts out as `options.default_code_page`.
    pub fn init(buffer: []const u8, options: LexerOptions) Self {
        return Self{
            .buffer = buffer,
            .index = 0,
            .current_code_page = options.default_code_page,
            .default_code_page = options.default_code_page,
            .source_mappings = options.source_mappings,
            .max_string_literal_codepoints = options.max_string_literal_codepoints,
            .line_handler = .{ .buffer = buffer },
        };
    }

    /// Prints the token's id, line number, and hex-escaped contents via `std.debug.print`.
    pub fn dump(self: *Self, token: *const Token) void {
        std.debug.print("{s}:{d}: {f}\n", .{
            @tagName(token.id), token.line_number, std.ascii.hexEscape(token.slice(self.buffer), .lower),
        });
    }

    pub const LexMethod = enum {
        whitespace_delimiter_only,
        normal,
        normal_expect_operator,
    };

    /// Lexes and returns the next token using the comptime-selected `method`.
    pub fn next(self: *Self, comptime method: LexMethod) LexError!Token {
        switch (method) {
            .whitespace_delimiter_only => return self.nextWhitespaceDelimeterOnly(),
            .normal => return self.nextNormal(),
            .normal_expect_operator => return self.nextNormalWithContext(.expect_operator),
        }
    }

    const StateWhitespaceDelimiterOnly = enum {
        start,
        literal,
        preprocessor,
        semicolon,
    };

    /// Lexes the next token, splitting only on whitespace and line endings.
    /// The returned token is always either `.literal` or `.eof`.
    ///
    /// Lines that start with `#` are handed to `evaluatePreprocessorCommand`
    /// and then skipped; a `;` causes the rest of its line to be skipped.
    pub fn nextWhitespaceDelimeterOnly(self: *Self) LexError!Token {
        const start_index = self.index;
        var result = Token{
            .id = .eof,
            .start = start_index,
            .end = undefined,
            .line_number = self.line_handler.line_number,
        };
        var state = StateWhitespaceDelimiterOnly.start;

        while (self.current_code_page.codepointAt(self.index, self.buffer)) |codepoint| : (self.index += codepoint.byte_len) {
            const c = codepoint.value;
            try self.checkForIllegalCodepoint(codepoint, false);
            switch (state) {
                .start => switch (c) {
                    '\r', '\n' => {
                        result.start = self.index + 1;
                        result.line_number = self.incrementLineNumber();
                    },
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
                        result.start = self.index + 1;
                    },
                    // NBSP only counts as whitespace at the start of a line (but
                    // can be intermixed with other whitespace). Who knows why.
                    // TODO: This should either be removed, or it should also include
                    //       the codepoints listed in disjoint_code_page.zig
                    '\xA0' => if (self.at_start_of_line) {
                        result.start = self.index + codepoint.byte_len;
                    } else {
                        state = .literal;
                        self.at_start_of_line = false;
                    },
                    '#' => {
                        if (self.at_start_of_line) {
                            state = .preprocessor;
                        } else {
                            state = .literal;
                        }
                        self.at_start_of_line = false;
                    },
                    ';' => {
                        state = .semicolon;
                        self.at_start_of_line = false;
                    },
                    else => {
                        state = .literal;
                        self.at_start_of_line = false;
                    },
                },
                .literal => switch (c) {
                    '\r', '\n', ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
                        result.id = .literal;
                        break;
                    },
                    else => {},
                },
                .preprocessor => switch (c) {
                    '\r', '\n' => {
                        try self.evaluatePreprocessorCommand(result.start, self.index);
                        result.start = self.index + 1;
                        state = .start;
                        result.line_number = self.incrementLineNumber();
                    },
                    else => {},
                },
                .semicolon => switch (c) {
                    '\r', '\n' => {
                        result.start = self.index + 1;
                        state = .start;
                        result.line_number = self.incrementLineNumber();
                    },
                    else => {},
                },
            }
        } else { // got EOF
            switch (state) {
                .start => {},
                .semicolon => {
                    // Skip past everything up to the EOF
                    result.start = self.index;
                },
                .literal => {
                    result.id = .literal;
                },
                .preprocessor => {
                    try self.evaluatePreprocessorCommand(result.start, self.index);
                    result.start = self.index;
                },
            }
        }

        result.end = self.index;

        // EOF tokens must have their start index match the end index
        std.debug.assert(result.id != .eof or result.start == result.end);

        return result;
    }

    const StateNormal = enum {
        start,
        literal_or_quoted_wide_string,
        quoted_ascii_string,
        quoted_wide_string,
        quoted_ascii_string_escape,
        quoted_wide_string_escape,
        quoted_ascii_string_maybe_end,
        quoted_wide_string_maybe_end,
        literal,
        number_literal,
        preprocessor,
        semicolon,
        // end
        e,
        en,
        // begin
        b,
        be,
        beg,
        begi,
    };

    /// TODO: A not-terrible name
    pub fn nextNormal(self: *Self) LexError!Token {
        return self.nextNormalWithContext(.any);
    }

    /// Lexes the next token using the full set of token kinds.
    ///
    /// When `context` is `.expect_operator`, a `-` at the start of a token is
    /// returned as an `.operator` token; otherwise it begins a number literal.
    pub fn nextNormalWithContext(self: *Self, context: enum { expect_operator, any }) LexError!Token {
        const start_index = self.index;
        var result = Token{
            .id = .eof,
            .start = start_index,
            .end = undefined,
            .line_number = self.line_handler.line_number,
        };
        var state = StateNormal.start;

        // Note: The Windows RC compiler uses a non-standard method of computing
        //       length for its 'string literal too long' errors; it isn't easily
        //       explained or intuitive (it's sort-of pre-parsed byte length but with
        //       a few exceptions/edge cases).
        //
        //       It also behaves strangely with non-ASCII codepoints, e.g. even though the default
        //       limit is 4097, you can only have 4094 € codepoints (1 UTF-16 code unit each),
        //       and 2048 𐐷 codepoints (2 UTF-16 code units each).
        //
        // TODO: Understand this more, bring it more in line with how the Win32 limits work.
        //       Alternatively, do something that makes more sense but may be more permissive.
        var string_literal_length: usize = 0;
        // Keeping track of the string literal column prevents pathological edge cases when
        // there are tons of tab stop characters within a string literal.
        var string_literal_column: usize = 0;
        var string_literal_collapsing_whitespace: bool = false;
        var still_could_have_exponent: bool = true;
        var exponent_index: ?usize = null;
        while (self.current_code_page.codepointAt(self.index, self.buffer)) |codepoint| : (self.index += codepoint.byte_len) {
            const c = codepoint.value;
            const in_string_literal = switch (state) {
                .quoted_ascii_string,
                .quoted_wide_string,
                .quoted_ascii_string_escape,
                .quoted_wide_string_escape,
                .quoted_ascii_string_maybe_end,
                .quoted_wide_string_maybe_end,
                =>
                // If the current line is not the same line as the start of the string literal,
                // then we want to treat the current codepoint as 'not in a string literal'
                // for the purposes of detecting illegal codepoints. This means that we will
                // error on illegal-outside-string-literal characters that are outside string
                // literals from the perspective of a C preprocessor, but that may be
                // inside string literals from the perspective of the RC lexer. For example,
                // "hello
                // @"
                // will be treated as a single string literal by the RC lexer but the Win32
                // preprocessor will consider this an unclosed string literal followed by
                // the character @ and ", and will therefore error since the Win32 RC preprocessor
                // errors on the @ character outside string literals.
                //
                // By doing this here, we can effectively emulate the Win32 RC preprocessor behavior
                // at lex-time, and avoid the need for a separate step that checks for this edge-case
                // specifically.
                result.line_number == self.line_handler.line_number,
                else => false,
            };
            try self.checkForIllegalCodepoint(codepoint, in_string_literal);
            switch (state) {
                .start => switch (c) {
                    '\r', '\n' => {
                        result.start = self.index + 1;
                        result.line_number = self.incrementLineNumber();
                    },
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
                        result.start = self.index + 1;
                    },
                    // NBSP only counts as whitespace at the start of a line (but
                    // can be intermixed with other whitespace). Who knows why.
                    '\xA0' => if (self.at_start_of_line) {
                        result.start = self.index + codepoint.byte_len;
                    } else {
                        state = .literal;
                        self.at_start_of_line = false;
                    },
                    'L', 'l' => {
                        state = .literal_or_quoted_wide_string;
                        self.at_start_of_line = false;
                    },
                    'E', 'e' => {
                        state = .e;
                        self.at_start_of_line = false;
                    },
                    'B', 'b' => {
                        state = .b;
                        self.at_start_of_line = false;
                    },
                    '"' => {
                        state = .quoted_ascii_string;
                        self.at_start_of_line = false;
                        string_literal_collapsing_whitespace = false;
                        string_literal_length = 0;

                        // Zero-width token at the opening quote, only used to
                        // compute the column that the string literal starts at.
                        const dummy_token = Token{
                            .start = self.index,
                            .end = self.index,
                            .line_number = self.line_handler.line_number,
                            .id = .invalid,
                        };
                        string_literal_column = dummy_token.calculateColumn(self.buffer, 8, null);
                    },
                    '+', '&', '|' => {
                        self.index += 1;
                        result.id = .operator;
                        self.at_start_of_line = false;
                        break;
                    },
                    '-' => {
                        if (context == .expect_operator) {
                            self.index += 1;
                            result.id = .operator;
                            self.at_start_of_line = false;
                            break;
                        } else {
                            state = .number_literal;
                            still_could_have_exponent = true;
                            exponent_index = null;
                            self.at_start_of_line = false;
                        }
                    },
                    '0'...'9', '~' => {
                        state = .number_literal;
                        still_could_have_exponent = true;
                        exponent_index = null;
                        self.at_start_of_line = false;
                    },
                    '#' => {
                        if (self.at_start_of_line) {
                            state = .preprocessor;
                        } else {
                            state = .literal;
                        }
                        self.at_start_of_line = false;
                    },
                    ';' => {
                        state = .semicolon;
                        self.at_start_of_line = false;
                    },
                    '{', '}' => {
                        self.index += 1;
                        result.id = if (c == '{') .begin else .end;
                        self.at_start_of_line = false;
                        break;
                    },
                    '(', ')' => {
                        self.index += 1;
                        result.id = if (c == '(') .open_paren else .close_paren;
                        self.at_start_of_line = false;
                        break;
                    },
                    ',' => {
                        self.index += 1;
                        result.id = .comma;
                        self.at_start_of_line = false;
                        break;
                    },
                    else => {
                        if (isNonAsciiDigit(c)) {
                            self.error_context_token = .{
                                .id = .number,
                                .start = result.start,
                                // The offending codepoint may be more than one byte
                                // long, so span all of it.
                                .end = self.index + codepoint.byte_len,
                                .line_number = self.line_handler.line_number,
                            };
                            return error.InvalidDigitCharacterInNumberLiteral;
                        }
                        state = .literal;
                        self.at_start_of_line = false;
                    },
                },
                .preprocessor => switch (c) {
                    '\r', '\n' => {
                        try self.evaluatePreprocessorCommand(result.start, self.index);
                        result.start = self.index + 1;
                        state = .start;
                        result.line_number = self.incrementLineNumber();
                    },
                    else => {},
                },
                // Semi-colon acts as a line-terminator--everything is skipped until
                // the next line.
                .semicolon => switch (c) {
                    '\r', '\n' => {
                        result.start = self.index + 1;
                        state = .start;
                        result.line_number = self.incrementLineNumber();
                    },
                    else => {},
                },
                .number_literal => switch (c) {
                    // zig fmt: off
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
                    '\r', '\n', '"', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
                    '\'', ';', '=',
                    => {
                    // zig fmt: on
                        result.id = .number;
                        break;
                    },
                    '0'...'9' => {
                        if (exponent_index) |exp_i| {
                            if (self.index - 1 == exp_i) {
                                // Note: This being an error is a quirk of the preprocessor used by
                                //       the Win32 RC compiler.
                                self.error_context_token = .{
                                    .id = .number,
                                    .start = result.start,
                                    .end = self.index + 1,
                                    .line_number = self.line_handler.line_number,
                                };
                                return error.InvalidNumberWithExponent;
                            }
                        }
                    },
                    'e', 'E' => {
                        if (still_could_have_exponent) {
                            exponent_index = self.index;
                            still_could_have_exponent = false;
                        }
                    },
                    else => {
                        if (isNonAsciiDigit(c)) {
                            self.error_context_token = .{
                                .id = .number,
                                .start = result.start,
                                // The offending codepoint may be more than one byte
                                // long, so span all of it.
                                .end = self.index + codepoint.byte_len,
                                .line_number = self.line_handler.line_number,
                            };
                            return error.InvalidDigitCharacterInNumberLiteral;
                        }
                        still_could_have_exponent = false;
                    },
                },
                .literal_or_quoted_wide_string => switch (c) {
                    // zig fmt: off
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
                    '\r', '\n', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
                    '\'', ';', '=',
                    // zig fmt: on
                    => {
                        result.id = .literal;
                        break;
                    },
                    '"' => {
                        state = .quoted_wide_string;
                        string_literal_collapsing_whitespace = false;
                        string_literal_length = 0;

                        // Zero-width token at the opening quote, only used to
                        // compute the column that the string literal starts at.
                        const dummy_token = Token{
                            .start = self.index,
                            .end = self.index,
                            .line_number = self.line_handler.line_number,
                            .id = .invalid,
                        };
                        string_literal_column = dummy_token.calculateColumn(self.buffer, 8, null);
                    },
                    else => {
                        state = .literal;
                    },
                },
                .literal => switch (c) {
                    // zig fmt: off
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
                    '\r', '\n', '"', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
                    '\'', ';', '=',
                    => {
                    // zig fmt: on
                        result.id = .literal;
                        break;
                    },
                    else => {},
                },
                // In the keyword states below, a mismatch steps the index back by one
                // so that (after the loop's increment) the current codepoint is
                // re-examined in the `.literal` state.
                .e => switch (c) {
                    'N', 'n' => {
                        state = .en;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .en => switch (c) {
                    'D', 'd' => {
                        result.id = .end;
                        self.index += 1;
                        break;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .b => switch (c) {
                    'E', 'e' => {
                        state = .be;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .be => switch (c) {
                    'G', 'g' => {
                        state = .beg;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .beg => switch (c) {
                    'I', 'i' => {
                        state = .begi;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .begi => switch (c) {
                    'N', 'n' => {
                        result.id = .begin;
                        self.index += 1;
                        break;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .quoted_ascii_string, .quoted_wide_string => switch (c) {
                    '"' => {
                        string_literal_column += 1;
                        state = if (state == .quoted_ascii_string) .quoted_ascii_string_maybe_end else .quoted_wide_string_maybe_end;
                    },
                    '\\' => {
                        string_literal_length += 1;
                        string_literal_column += 1;
                        state = if (state == .quoted_ascii_string) .quoted_ascii_string_escape else .quoted_wide_string_escape;
                    },
                    '\r' => {
                        string_literal_column = 0;
                        // \r doesn't count towards string literal length

                        // Increment line number but don't affect the result token's line number
                        _ = self.incrementLineNumber();
                    },
                    '\n' => {
                        string_literal_column = 0;
                        // first \n expands to <space><\n>
                        if (!string_literal_collapsing_whitespace) {
                            string_literal_length += 2;
                            string_literal_collapsing_whitespace = true;
                        }
                        // the rest are collapsed into the <space><\n>

                        // Increment line number but don't affect the result token's line number
                        _ = self.incrementLineNumber();
                    },
                    // only \t, space, Vertical Tab, and Form Feed count as whitespace when collapsing
                    '\t', ' ', '\x0b', '\x0c' => {
                        if (!string_literal_collapsing_whitespace) {
                            // Literal tab characters are counted as the number of space characters
                            // needed to reach the next 8-column tab stop.
                            const width = columnWidth(string_literal_column, @intCast(c), 8);
                            string_literal_length += width;
                            string_literal_column += width;
                        }
                    },
                    else => {
                        string_literal_collapsing_whitespace = false;
                        string_literal_length += 1;
                        string_literal_column += 1;
                    },
                },
                .quoted_ascii_string_escape, .quoted_wide_string_escape => switch (c) {
                    '"' => {
                        // The error token spans the backslash and the quote.
                        self.error_context_token = .{
                            .id = .invalid,
                            .start = self.index - 1,
                            .end = self.index + 1,
                            .line_number = self.line_handler.line_number,
                        };
                        return error.FoundCStyleEscapedQuote;
                    },
                    else => {
                        string_literal_length += 1;
                        string_literal_column += 1;
                        state = if (state == .quoted_ascii_string_escape) .quoted_ascii_string else .quoted_wide_string;
                    },
                },
                .quoted_ascii_string_maybe_end, .quoted_wide_string_maybe_end => switch (c) {
                    '"' => {
                        state = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
                        // Escaped quotes count as 1 char for string literal length checks.
                        // Since we did not increment on the first " (because it could have been
                        // the end of the quoted string), we increment here
                        string_literal_length += 1;
                        string_literal_column += 1;
                    },
                    else => {
                        result.id = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
                        break;
                    },
                },
            }
        } else { // got EOF
            switch (state) {
                .start => {},
                .semicolon => {
                    // Skip past everything up to the EOF
                    result.start = self.index;
                },
                .literal_or_quoted_wide_string, .literal, .e, .en, .b, .be, .beg, .begi => {
                    result.id = .literal;
                },
                .preprocessor => {
                    try self.evaluatePreprocessorCommand(result.start, self.index);
                    result.start = self.index;
                },
                .number_literal => {
                    result.id = .number;
                },
                .quoted_ascii_string_maybe_end, .quoted_wide_string_maybe_end => {
                    result.id = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
                },
                .quoted_ascii_string,
                .quoted_wide_string,
                .quoted_ascii_string_escape,
                .quoted_wide_string_escape,
                => {
                    self.error_context_token = .{
                        .id = .eof,
                        .start = self.index,
                        .end = self.index,
                        .line_number = self.line_handler.line_number,
                    };
                    return LexError.UnfinishedStringLiteral;
                },
            }
        }

        result.end = self.index;

        if (result.id == .quoted_ascii_string or result.id == .quoted_wide_string) {
            if (string_literal_length > self.max_string_literal_codepoints) {
                self.error_context_token = result;
                return LexError.StringLiteralTooLong;
            }
        }

        // EOF tokens must have their start index match the end index
        std.debug.assert(result.id != .eof or result.start == result.end);

        return result;
    }

    /// Increments line_number appropriately (handling line ending pairs)
    /// and returns the new line number.
    fn incrementLineNumber(self: *Self) usize {
        _ = self.line_handler.incrementLineNumber(self.index);
        self.at_start_of_line = true;
        return self.line_handler.line_number;
    }

    /// Returns an error if `codepoint` is not allowed in the current context
    /// (`in_string_literal` relaxes some of the checks). On error,
    /// `error_context_token` is set to an `.invalid` token that spans the
    /// offending codepoint.
    fn checkForIllegalCodepoint(self: *Self, codepoint: code_pages.Codepoint, in_string_literal: bool) LexError!void {
        const err = switch (codepoint.value) {
            // 0x00 = NUL
            // 0x1A = Substitute (treated as EOF)
            // NOTE: 0x1A gets treated as EOF by the clang preprocessor so after a .rc file
            //       is run through the clang preprocessor it will no longer have 0x1A characters in it.
            // 0x7F = DEL (treated as a context-specific terminator by the Windows RC compiler)
            0x00, 0x1A, 0x7F => error.IllegalByte,
            // 0x01...0x03 result in strange 'macro definition too big' errors when used outside of string literals
            // 0x04 is valid but behaves strangely (sort of acts as a 'skip the next character' instruction)
            0x01...0x04 => if (!in_string_literal) error.IllegalByteOutsideStringLiterals else return,
            // @ and ` both result in error RC2018: unknown character '0x60' (and subsequently
            // fatal error RC1116: RC terminating after preprocessor errors) if they are ever used
            // outside of string literals. Not exactly sure why this would be the case, though.
            // TODO: Make sure there aren't any exceptions
            '@', '`' => if (!in_string_literal) error.IllegalByteOutsideStringLiterals else return,
            // The Byte Order Mark is mostly skipped over by the Windows RC compiler, but
            // there are edge cases where it leads to cryptic 'compiler limit : macro definition too big'
            // errors (e.g. a BOM within a number literal). By making this illegal we avoid having to
            // deal with a lot of edge cases and remove the potential footgun of the bytes of a BOM
            // being 'missing' when included in a string literal (the Windows RC compiler acts as
            // if the codepoint was never part of the string literal).
            '\u{FEFF}' => error.IllegalByteOrderMark,
            // Similar deal with this private use codepoint, it gets skipped/ignored by the
            // RC compiler (but without the cryptic errors). Silently dropping bytes still seems like
            // enough of a footgun with no real use-cases that it's still worth erroring instead of
            // emulating the RC compiler's behavior, though.
            '\u{E000}' => error.IllegalPrivateUseCharacter,
            // These codepoints lead to strange errors when used outside of string literals,
            // and miscompilations when used within string literals. We avoid the miscompilation
            // within string literals and emit a warning, but outside of string literals it makes
            // more sense to just disallow these codepoints.
            0x900, 0xA00, 0xA0D, 0x2000, 0xD00, 0xFFFE, 0xFFFF => if (!in_string_literal) error.IllegalCodepointOutsideStringLiterals else return,
            else => return,
        };
        self.error_context_token = .{
            .id = .invalid,
            .start = self.index,
            .end = self.index + codepoint.byte_len,
            .line_number = self.line_handler.line_number,
        };
        return err;
    }

    /// Interprets `buffer[start..end]` as a preprocessor command. Anything
    /// that `parsePragmaCodePage` reports as not being a `#pragma code_page`
    /// is ignored; a valid code page pragma updates `current_code_page`
    /// (using `default_code_page` when the pragma resolves to null).
    ///
    /// On any returned error, `error_context_token` is set to a
    /// `.preprocessor_command` token spanning the command.
    fn evaluatePreprocessorCommand(self: *Self, start: usize, end: usize) !void {
        const token = Token{
            .id = .preprocessor_command,
            .start = start,
            .end = end,
            .line_number = self.line_handler.line_number,
        };
        errdefer self.error_context_token = token;
        const full_command = self.buffer[start..end];

        const code_page = (parsePragmaCodePage(full_command) catch |err| switch (err) {
            error.NotPragma, error.NotCodePagePragma => return,
            else => |e| return e,
        }) orelse self.default_code_page;

        // https://learn.microsoft.com/en-us/windows/win32/menurc/pragma-directives
        // > This pragma is not supported in an included resource file (.rc)
        //
        // Even though the Win32 behavior is to just ignore such directives silently,
        // this is an error in the lexer to allow for emitting warnings/errors when
        // such directives are found if that's wanted. The intention is for the lexer
        // to still be able to work correctly after this error is returned.
        if (self.source_mappings) |source_mappings| {
            if (!source_mappings.isRootFile(token.line_number)) {
                return error.CodePagePragmaInIncludedFile;
            }
        }

        // Saturating add: the counter is a u2 and must not overflow.
        self.seen_pragma_code_pages +|= 1;
        self.last_pragma_code_page_token = token;
        self.current_code_page = code_page;
    }

    /// Converts `lex_err` into an `ErrorDetails`, using `error_context_token`
    /// as the error location. `error_context_token` must be non-null.
    pub fn getErrorDetails(self: Self, lex_err: LexError) ErrorDetails {
        const err = switch (lex_err) {
            error.UnfinishedStringLiteral => ErrorDetails.Error.unfinished_string_literal,
            error.StringLiteralTooLong => return .{
                .err = .string_literal_too_long,
                .code_page = self.current_code_page,
                .token = self.error_context_token.?,
                .extra = .{ .number = self.max_string_literal_codepoints },
            },
            error.InvalidNumberWithExponent => ErrorDetails.Error.invalid_number_with_exponent,
            error.InvalidDigitCharacterInNumberLiteral => ErrorDetails.Error.invalid_digit_character_in_number_literal,
            error.IllegalByte => ErrorDetails.Error.illegal_byte,
            error.IllegalByteOutsideStringLiterals => ErrorDetails.Error.illegal_byte_outside_string_literals,
            error.IllegalCodepointOutsideStringLiterals => ErrorDetails.Error.illegal_codepoint_outside_string_literals,
            error.IllegalByteOrderMark => ErrorDetails.Error.illegal_byte_order_mark,
            error.IllegalPrivateUseCharacter => ErrorDetails.Error.illegal_private_use_character,
            error.FoundCStyleEscapedQuote => ErrorDetails.Error.found_c_style_escaped_quote,
            error.CodePagePragmaMissingLeftParen => ErrorDetails.Error.code_page_pragma_missing_left_paren,
            error.CodePagePragmaMissingRightParen => ErrorDetails.Error.code_page_pragma_missing_right_paren,
            error.CodePagePragmaInvalidCodePage => ErrorDetails.Error.code_page_pragma_invalid_code_page,
            error.CodePagePragmaNotInteger => ErrorDetails.Error.code_page_pragma_not_integer,
            error.CodePagePragmaOverflow => ErrorDetails.Error.code_page_pragma_overflow,
            error.CodePagePragmaUnsupportedCodePage => ErrorDetails.Error.code_page_pragma_unsupported_code_page,
            error.CodePagePragmaInIncludedFile => ErrorDetails.Error.code_page_pragma_in_included_file,
        };
        return .{
            .err = err,
            .code_page = self.current_code_page,
            .token = self.error_context_token.?,
        };
    }
};

/// Parses `str` as an unsigned base-10 integer. An empty `str` yields 0.
/// Fails if `str` contains a non-digit character or the value does not fit
/// in a u32.
fn parseCodePageNum(str: []const u8) !u32 {
    var x: u32 = 0;
    for (str) |c| {
        const digit = try std.fmt.charToDigit(c, 10);
        if (x != 0) x = try std.math.mul(u32, x, 10);
        x = try std.math.add(u32, x, digit);
    }
    return x;
}

/// Returns `null` when the code_page is set to DEFAULT
pub fn parsePragmaCodePage(full_command: []const u8) !?SupportedCodePage {
    var command = full_command;

    // Anything besides exactly this is ignored by the Windows RC implementation
    const expected_directive = "#pragma";
    if (!std.mem.startsWith(u8, command, expected_directive)) return error.NotPragma;
    command = command[expected_directive.len..];

    if (command.len == 0 or !std.ascii.isWhitespace(command[0])) return error.NotCodePagePragma;
    while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
        command = command[1..];
    }

    // Note: CoDe_PaGeZ is also treated as "code_page" by the Windows RC implementation,
    //       and it will error with 'Missing left parenthesis in code_page #pragma'
    const expected_extension = "code_page";
    if (!std.ascii.startsWithIgnoreCase(command, expected_extension)) return error.NotCodePagePragma;
    command = command[expected_extension.len..];

    while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
        command = command[1..];
    }

    if (command.len == 0 or command[0] != '(') {
        return error.CodePagePragmaMissingLeftParen;
    }
    command = command[1..];

    while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
        command = command[1..];
    }

    var num_str: []u8 = command[0..0];
    while (command.len > 0 and (command[0] != ')' and !std.ascii.isWhitespace(command[0]))) {
        command = command[1..];
        num_str.len += 1;
    }

    if (num_str.len == 0) {
        return error.CodePagePragmaNotInteger;
    }

    while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
        command = command[1..];
    }

    if (command.len == 0 or command[0] != ')') {
        return error.CodePagePragmaMissingRightParen;
    }

    const code_page: ?SupportedCodePage = code_page: {
        if (std.ascii.eqlIgnoreCase("DEFAULT", num_str)) {
            break :code_page null;
        }

        // The Win32 compiler behaves fairly strangely around maxInt(u32):
        // - If the overflowed u32 wraps and becomes a known code page ID, then
        //   it will error/warn with "Codepage not valid: ignored" (depending on /w)
        // - If the overflowed u32 wraps and does not become a known code page ID,
        //   then it will error with 'constant too big' and 'Codepage not integer'
        //
        // Instead of that, we just have a separate error specifically for overflow.
        const num = parseCodePageNum(num_str) catch |err| switch (err) {
            error.InvalidCharacter => return error.CodePagePragmaNotInteger,
            error.Overflow => return error.CodePagePragmaOverflow,
        };

        // Anything that starts with 0 but does not resolve to 0 is treated as invalid, e.g. 01252
        if (num_str[0] == '0' and num != 0) {
            return error.CodePagePragmaInvalidCodePage;
        }
        // Anything that resolves to 0 is treated as 'not an integer' by the Win32 implementation.
        else if (num == 0) {
            return error.CodePagePragmaNotInteger;
        }
        // Anything above u16 max is not going to be found since our CodePage enum is backed by a u16.
1055 if (num > std.math.maxInt(u16)) { 1056 return error.CodePagePragmaInvalidCodePage; 1057 } 1058 1059 break :code_page code_pages.getByIdentifierEnsureSupported(@intCast(num)) catch |err| switch (err) { 1060 error.InvalidCodePage => return error.CodePagePragmaInvalidCodePage, 1061 error.UnsupportedCodePage => return error.CodePagePragmaUnsupportedCodePage, 1062 }; 1063 }; 1064 1065 return code_page; 1066 } 1067 1068 fn testLexNormal(source: []const u8, expected_tokens: []const Token.Id) !void { 1069 var lexer = Lexer.init(source, .{}); 1070 if (dumpTokensDuringTests) std.debug.print("\n----------------------\n{s}\n----------------------\n", .{lexer.buffer}); 1071 for (expected_tokens) |expected_token_id| { 1072 const token = try lexer.nextNormal(); 1073 if (dumpTokensDuringTests) lexer.dump(&token); 1074 try std.testing.expectEqual(expected_token_id, token.id); 1075 } 1076 const last_token = try lexer.nextNormal(); 1077 try std.testing.expectEqual(Token.Id.eof, last_token.id); 1078 } 1079 1080 fn expectLexError(expected: LexError, actual: anytype) !void { 1081 try std.testing.expectError(expected, actual); 1082 if (dumpTokensDuringTests) std.debug.print("{!}\n", .{actual}); 1083 } 1084 1085 test "normal: numbers" { 1086 try testLexNormal("1", &.{.number}); 1087 try testLexNormal("-1", &.{.number}); 1088 try testLexNormal("- 1", &.{ .number, .number }); 1089 try testLexNormal("-a", &.{.number}); 1090 } 1091 1092 test "normal: string literals" { 1093 try testLexNormal("\"\"", &.{.quoted_ascii_string}); 1094 // "" is an escaped " 1095 try testLexNormal("\" \"\" \"", &.{.quoted_ascii_string}); 1096 } 1097 1098 test "superscript chars and code pages" { 1099 const firstToken = struct { 1100 pub fn firstToken(source: []const u8, default_code_page: SupportedCodePage, comptime lex_method: Lexer.LexMethod) LexError!Token { 1101 var lexer = Lexer.init(source, .{ .default_code_page = default_code_page }); 1102 return lexer.next(lex_method); 1103 } 1104 }.firstToken; 1105 
const utf8_source = "²"; 1106 const windows1252_source = "\xB2"; 1107 1108 const windows1252_encoded_as_windows1252 = firstToken(windows1252_source, .windows1252, .normal); 1109 try std.testing.expectError(error.InvalidDigitCharacterInNumberLiteral, windows1252_encoded_as_windows1252); 1110 1111 const utf8_encoded_as_windows1252 = try firstToken(utf8_source, .windows1252, .normal); 1112 try std.testing.expectEqual(Token{ 1113 .id = .literal, 1114 .start = 0, 1115 .end = 2, 1116 .line_number = 1, 1117 }, utf8_encoded_as_windows1252); 1118 1119 const utf8_encoded_as_utf8 = firstToken(utf8_source, .utf8, .normal); 1120 try std.testing.expectError(error.InvalidDigitCharacterInNumberLiteral, utf8_encoded_as_utf8); 1121 1122 const windows1252_encoded_as_utf8 = try firstToken(windows1252_source, .utf8, .normal); 1123 try std.testing.expectEqual(Token{ 1124 .id = .literal, 1125 .start = 0, 1126 .end = 1, 1127 .line_number = 1, 1128 }, windows1252_encoded_as_utf8); 1129 }