lex.zig - zig - fork of https://codeberg.org/ziglang/zig

lex.zig (49189B) - Raw
      1 //! Expects to be run after the C preprocessor and after `removeComments`.
      2 //! This means that the lexer assumes that:
      3 //! - Splices ('\' at the end of a line) have been handled/collapsed.
      4 //! - Preprocessor directives and macros have been expanded (any remaining should be skipped with the exception of `#pragma code_page`).
      5 //! - All comments have been removed.
      6 
      7 const std = @import("std");
      8 const ErrorDetails = @import("errors.zig").ErrorDetails;
      9 const columnWidth = @import("literals.zig").columnWidth;
     10 const code_pages = @import("code_pages.zig");
     11 const SupportedCodePage = code_pages.SupportedCodePage;
     12 const SourceMappings = @import("source_mapping.zig").SourceMappings;
     13 const isNonAsciiDigit = @import("utils.zig").isNonAsciiDigit;
     14 
     15 const dumpTokensDuringTests = false;
     16 
     17 pub const default_max_string_literal_codepoints = 4097;
     18 
     19 pub const Token = struct {
     20     id: Id,
     21     start: usize,
     22     end: usize,
     23     line_number: usize,
     24 
     25     pub const Id = enum {
     26         literal,
     27         number,
     28         quoted_ascii_string,
     29         quoted_wide_string,
     30         operator,
     31         begin,
     32         end,
     33         comma,
     34         open_paren,
     35         close_paren,
     36         /// This Id is only used for errors, the Lexer will never return one
     37         /// of these from a `next` call.
     38         preprocessor_command,
     39         invalid,
     40         eof,
     41 
     42         pub fn nameForErrorDisplay(self: Id) []const u8 {
     43             return switch (self) {
     44                 .literal => "<literal>",
     45                 .number => "<number>",
     46                 .quoted_ascii_string => "<quoted ascii string>",
     47                 .quoted_wide_string => "<quoted wide string>",
     48                 .operator => "<operator>",
     49                 .begin => "<'{' or BEGIN>",
     50                 .end => "<'}' or END>",
     51                 .comma => ",",
     52                 .open_paren => "(",
     53                 .close_paren => ")",
     54                 .preprocessor_command => "<preprocessor command>",
     55                 .invalid => unreachable,
     56                 .eof => "<eof>",
     57             };
     58         }
     59     };
     60 
     61     pub fn slice(self: Token, buffer: []const u8) []const u8 {
     62         return buffer[self.start..self.end];
     63     }
     64 
     65     /// Returns 0-based column
     66     pub fn calculateColumn(token: Token, source: []const u8, tab_columns: usize, maybe_line_start: ?usize) usize {
     67         const line_start = maybe_line_start orelse token.getLineStartForColumnCalc(source);
     68 
     69         var i: usize = line_start;
     70         var column: usize = 0;
     71         while (i < token.start) : (i += 1) {
     72             column += columnWidth(column, source[i], tab_columns);
     73         }
     74         return column;
     75     }
     76 
     77     // TODO: More testing is needed to determine if this can be merged with getLineStartForErrorDisplay
     78     //       (the TODO in currentIndexFormsLineEndingPair should be taken into account as well)
     79     pub fn getLineStartForColumnCalc(token: Token, source: []const u8) usize {
     80         const line_start = line_start: {
     81             if (token.start != 0) {
     82                 // start checking at the byte before the token
     83                 var index = token.start - 1;
     84                 while (true) {
     85                     if (source[index] == '\n') break :line_start @min(source.len - 1, index + 1);
     86                     if (index != 0) index -= 1 else break;
     87                 }
     88             }
     89             break :line_start 0;
     90         };
     91         return line_start;
     92     }
     93 
     94     pub fn getLineStartForErrorDisplay(token: Token, source: []const u8) usize {
     95         const line_start = line_start: {
     96             if (token.start != 0) {
     97                 // start checking at the byte before the token
     98                 var index = token.start - 1;
     99                 while (true) {
    100                     if (source[index] == '\r' or source[index] == '\n') break :line_start @min(source.len - 1, index + 1);
    101                     if (index != 0) index -= 1 else break;
    102                 }
    103             }
    104             break :line_start 0;
    105         };
    106         return line_start;
    107     }
    108 
    109     pub fn getLineForErrorDisplay(token: Token, source: []const u8, maybe_line_start: ?usize) []const u8 {
    110         const line_start = maybe_line_start orelse token.getLineStartForErrorDisplay(source);
    111 
    112         var line_end = line_start;
    113         while (line_end < source.len and source[line_end] != '\r' and source[line_end] != '\n') : (line_end += 1) {}
    114         return source[line_start..line_end];
    115     }
    116 
    117     pub fn isStringLiteral(token: Token) bool {
    118         return token.id == .quoted_ascii_string or token.id == .quoted_wide_string;
    119     }
    120 };
    121 
    122 pub const LineHandler = struct {
    123     line_number: usize = 1,
    124     buffer: []const u8,
    125     last_line_ending_index: ?usize = null,
    126 
    127     /// Like incrementLineNumber but checks that the current char is a line ending first.
    128     /// Returns the new line number if it was incremented, null otherwise.
    129     pub fn maybeIncrementLineNumber(self: *LineHandler, cur_index: usize) ?usize {
    130         const c = self.buffer[cur_index];
    131         if (c == '\r' or c == '\n') {
    132             return self.incrementLineNumber(cur_index);
    133         }
    134         return null;
    135     }
    136 
    137     /// Increments line_number appropriately (handling line ending pairs)
    138     /// and returns the new line number if it was incremented, or null otherwise.
    139     pub fn incrementLineNumber(self: *LineHandler, cur_index: usize) ?usize {
    140         if (self.currentIndexFormsLineEndingPair(cur_index)) {
    141             self.last_line_ending_index = null;
    142             return null;
    143         } else {
    144             self.line_number += 1;
    145             self.last_line_ending_index = cur_index;
    146             return self.line_number;
    147         }
    148     }
    149 
    150     /// \r\n and \n\r pairs are treated as a single line ending (but not \r\r \n\n)
    151     /// expects self.index and last_line_ending_index (if non-null) to contain line endings
    152     ///
    153     /// TODO: This is not really how the Win32 RC compiler handles line endings. Instead, it
    154     ///       seems to drop all carriage returns during preprocessing and then replace all
    155     ///       remaining line endings with well-formed CRLF pairs (e.g. `<CR>a<CR>b<LF>c` becomes `ab<CR><LF>c`).
    156     ///       Handling this the same as the Win32 RC compiler would need control over the preprocessor,
    157     ///       since Clang converts unpaired <CR> into unpaired <LF>.
    158     pub fn currentIndexFormsLineEndingPair(self: *const LineHandler, cur_index: usize) bool {
    159         if (self.last_line_ending_index == null) return false;
    160 
    161         // must immediately precede the current index, we know cur_index must
    162         // be >= 1 since last_line_ending_index is non-null (so if the subtraction
    163         // overflows it is a bug at the callsite of this function).
    164         if (self.last_line_ending_index.? != cur_index - 1) return false;
    165 
    166         const cur_line_ending = self.buffer[cur_index];
    167         const last_line_ending = self.buffer[self.last_line_ending_index.?];
    168 
    169         // sanity check
    170         std.debug.assert(cur_line_ending == '\r' or cur_line_ending == '\n');
    171         std.debug.assert(last_line_ending == '\r' or last_line_ending == '\n');
    172 
    173         // can't be \n\n or \r\r
    174         if (last_line_ending == cur_line_ending) return false;
    175 
    176         return true;
    177     }
    178 };
    179 
    180 pub const LexError = error{
    181     UnfinishedStringLiteral,
    182     StringLiteralTooLong,
    183     InvalidNumberWithExponent,
    184     InvalidDigitCharacterInNumberLiteral,
    185     IllegalByte,
    186     IllegalByteOutsideStringLiterals,
    187     IllegalCodepointOutsideStringLiterals,
    188     IllegalByteOrderMark,
    189     IllegalPrivateUseCharacter,
    190     FoundCStyleEscapedQuote,
    191     CodePagePragmaMissingLeftParen,
    192     CodePagePragmaMissingRightParen,
    193     /// Can be caught and ignored
    194     CodePagePragmaInvalidCodePage,
    195     CodePagePragmaNotInteger,
    196     CodePagePragmaOverflow,
    197     CodePagePragmaUnsupportedCodePage,
    198     /// Can be caught and ignored
    199     CodePagePragmaInIncludedFile,
    200 };
    201 
    202 pub const Lexer = struct {
    203     const Self = @This();
    204 
    205     buffer: []const u8,
    206     index: usize,
    207     line_handler: LineHandler,
    208     at_start_of_line: bool = true,
    209     error_context_token: ?Token = null,
    210     current_code_page: SupportedCodePage,
    211     default_code_page: SupportedCodePage,
    212     source_mappings: ?*SourceMappings,
    213     max_string_literal_codepoints: u15,
    214     /// Needed to determine whether or not the output code page should
    215     /// be set in the parser.
    216     seen_pragma_code_pages: u2 = 0,
    217     last_pragma_code_page_token: ?Token = null,
    218 
    219     pub const Error = LexError;
    220 
    /// Optional settings for `init`; every field has a default.
    pub const LexerOptions = struct {
        /// Used as both the initial `current_code_page` and the `default_code_page`.
        default_code_page: SupportedCodePage = .windows1252,
        source_mappings: ?*SourceMappings = null,
        max_string_literal_codepoints: u15 = default_max_string_literal_codepoints,
    };

    /// Creates a lexer positioned at the start of `buffer`. The buffer is not
    /// copied; both the lexer and its line handler keep a slice of it.
    pub fn init(buffer: []const u8, options: LexerOptions) Self {
        const code_page = options.default_code_page;
        return .{
            .buffer = buffer,
            .index = 0,
            .line_handler = LineHandler{ .buffer = buffer },
            .current_code_page = code_page,
            .default_code_page = code_page,
            .source_mappings = options.source_mappings,
            .max_string_literal_codepoints = options.max_string_literal_codepoints,
        };
    }
    238 
    /// Debug helper: prints `<id>:<line number>: <hex-escaped token text>` followed
    /// by a newline using `std.debug.print`.
    pub fn dump(self: *Self, token: *const Token) void {
        const token_text = token.slice(self.buffer);
        std.debug.print("{s}:{d}: {f}\n", .{
            @tagName(token.id),
            token.line_number,
            std.ascii.hexEscape(token_text, .lower),
        });
    }
    244 
    /// Selects which lexing routine `next` dispatches to.
    pub const LexMethod = enum {
        /// Dispatches to `nextWhitespaceDelimeterOnly`.
        whitespace_delimiter_only,
        /// Dispatches to `nextNormal`.
        normal,
        /// Dispatches to `nextNormalWithContext(.expect_operator)`, in which a
        /// leading '-' is lexed as an operator rather than the start of a number.
        normal_expect_operator,
    };

    /// Lexes and returns the next token using the routine chosen by `method`.
    /// `method` is comptime-known, so the dispatch is resolved at compile time.
    pub fn next(self: *Self, comptime method: LexMethod) LexError!Token {
        return switch (method) {
            .whitespace_delimiter_only => self.nextWhitespaceDelimeterOnly(),
            .normal => self.nextNormal(),
            .normal_expect_operator => self.nextNormalWithContext(.expect_operator),
        };
    }
    258 
    /// States of the `nextWhitespaceDelimeterOnly` state machine.
    const StateWhitespaceDelimiterOnly = enum {
        /// Before the token: skipping whitespace and line endings.
        start,
        /// Inside a literal; it ends at the next whitespace or line ending.
        literal,
        /// Inside a line that began with '#'; consumed up to the line ending.
        preprocessor,
        /// After a ';': everything up to the line ending is skipped.
        semicolon,
    };

    /// Lexes the next token using only whitespace and line endings as delimiters.
    ///
    /// Lines that start with '#' are passed to `evaluatePreprocessorCommand` and
    /// then skipped; everything from a ';' to the end of its line is skipped.
    /// Returns a `.literal` token, or an `.eof` token (with `start == end`) when
    /// the buffer is exhausted. Errors from `checkForIllegalCodepoint` and
    /// `evaluatePreprocessorCommand` are propagated.
    pub fn nextWhitespaceDelimeterOnly(self: *Self) LexError!Token {
        var token: Token = .{
            .id = .eof,
            .start = self.index,
            .end = undefined,
            .line_number = self.line_handler.line_number,
        };
        var state: StateWhitespaceDelimiterOnly = .start;

        while (self.current_code_page.codepointAt(self.index, self.buffer)) |codepoint| : (self.index += codepoint.byte_len) {
            try self.checkForIllegalCodepoint(codepoint, false);

            switch (state) {
                .start => switch (codepoint.value) {
                    '\r', '\n' => {
                        token.start = self.index + 1;
                        token.line_number = self.incrementLineNumber();
                    },
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
                        token.start = self.index + 1;
                    },
                    // NBSP only counts as whitespace at the start of a line (but
                    // can be intermixed with other whitespace). Who knows why.
                    // TODO: This should either be removed, or it should also include
                    //       the codepoints listed in disjoint_code_page.zig
                    '\xA0' => {
                        if (self.at_start_of_line) {
                            token.start = self.index + codepoint.byte_len;
                        } else {
                            // `at_start_of_line` is already false on this path.
                            state = .literal;
                        }
                    },
                    '#' => {
                        // '#' only introduces a preprocessor line at the start of a line.
                        state = if (self.at_start_of_line) .preprocessor else .literal;
                        self.at_start_of_line = false;
                    },
                    ';' => {
                        state = .semicolon;
                        self.at_start_of_line = false;
                    },
                    else => {
                        state = .literal;
                        self.at_start_of_line = false;
                    },
                },
                .literal => switch (codepoint.value) {
                    '\r', '\n', ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
                        // The delimiter is left unconsumed; `break` skips the loop's
                        // continue expression.
                        token.id = .literal;
                        break;
                    },
                    else => {},
                },
                // Both states consume the rest of the line; only a preprocessor
                // line is evaluated before being discarded.
                .preprocessor, .semicolon => switch (codepoint.value) {
                    '\r', '\n' => {
                        if (state == .preprocessor) {
                            try self.evaluatePreprocessorCommand(token.start, self.index);
                        }
                        token.start = self.index + 1;
                        state = .start;
                        token.line_number = self.incrementLineNumber();
                    },
                    else => {},
                },
            }
        } else {
            // Reached EOF without breaking out of the loop.
            switch (state) {
                .start => {},
                .literal => token.id = .literal,
                .preprocessor, .semicolon => {
                    if (state == .preprocessor) {
                        try self.evaluatePreprocessorCommand(token.start, self.index);
                    }
                    // Skip past everything up to the EOF
                    token.start = self.index;
                },
            }
        }

        token.end = self.index;

        // EOF tokens must have their start index match the end index
        std.debug.assert(token.id != .eof or token.start == token.end);

        return token;
    }
    364 
    /// States of the `nextNormalWithContext` state machine.
    const StateNormal = enum {
        /// Before the token: skipping whitespace and line endings.
        start,
        /// Saw a leading 'L'/'l': a following '"' starts a wide string, anything
        /// else makes it a literal.
        literal_or_quoted_wide_string,
        quoted_ascii_string,
        quoted_wide_string,
        quoted_ascii_string_escape,
        quoted_wide_string_escape,
        quoted_ascii_string_maybe_end,
        quoted_wide_string_maybe_end,
        literal,
        number_literal,
        /// Inside a line that began with '#'.
        preprocessor,
        /// After a ';': the rest of the line is skipped.
        semicolon,
        // Prefixes of the END keyword matched so far (case-insensitive).
        e,
        en,
        // Prefixes of the BEGIN keyword matched so far.
        b,
        be,
        beg,
        begi,
    };

    /// TODO: A not-terrible name
    ///
    /// Lexes the next token with no special context; shorthand for
    /// `nextNormalWithContext(.any)`.
    pub fn nextNormal(self: *Self) LexError!Token {
        const token = try self.nextNormalWithContext(.any);
        return token;
    }
    392 
    393     pub fn nextNormalWithContext(self: *Self, context: enum { expect_operator, any }) LexError!Token {
    394         const start_index = self.index;
    395         var result = Token{
    396             .id = .eof,
    397             .start = start_index,
    398             .end = undefined,
    399             .line_number = self.line_handler.line_number,
    400         };
    401         var state = StateNormal.start;
    402 
    403         // Note: The Windows RC compiler uses a non-standard method of computing
    404         //       length for its 'string literal too long' errors; it isn't easily
    405         //       explained or intuitive (it's sort-of pre-parsed byte length but with
    406         //       a few of exceptions/edge cases).
    407         //
    408         // It also behaves strangely with non-ASCII codepoints, e.g. even though the default
    409         // limit is 4097, you can only have 4094 € codepoints (1 UTF-16 code unit each),
    410         // and 2048 𐐷 codepoints (2 UTF-16 code units each).
    411         //
    412         // TODO: Understand this more, bring it more in line with how the Win32 limits work.
    413         //       Alternatively, do something that makes more sense but may be more permissive.
    414         var string_literal_length: usize = 0;
    415         // Keeping track of the string literal column prevents pathological edge cases when
    416         // there are tons of tab stop characters within a string literal.
    417         var string_literal_column: usize = 0;
    418         var string_literal_collapsing_whitespace: bool = false;
    419         var still_could_have_exponent: bool = true;
    420         var exponent_index: ?usize = null;
    421         while (self.current_code_page.codepointAt(self.index, self.buffer)) |codepoint| : (self.index += codepoint.byte_len) {
    422             const c = codepoint.value;
    423             const in_string_literal = switch (state) {
    424                 .quoted_ascii_string,
    425                 .quoted_wide_string,
    426                 .quoted_ascii_string_escape,
    427                 .quoted_wide_string_escape,
    428                 .quoted_ascii_string_maybe_end,
    429                 .quoted_wide_string_maybe_end,
    430                 =>
    431                 // If the current line is not the same line as the start of the string literal,
    432                 // then we want to treat the current codepoint as 'not in a string literal'
    433                 // for the purposes of detecting illegal codepoints. This means that we will
    434                 // error on illegal-outside-string-literal characters that are outside string
    435                 // literals from the perspective of a C preprocessor, but that may be
    436                 // inside string literals from the perspective of the RC lexer. For example,
    437                 // "hello
    438                 // @"
    439                 // will be treated as a single string literal by the RC lexer but the Win32
    440                 // preprocessor will consider this an unclosed string literal followed by
    441                 // the character @ and ", and will therefore error since the Win32 RC preprocessor
    442                 // errors on the @ character outside string literals.
    443                 //
    444                 // By doing this here, we can effectively emulate the Win32 RC preprocessor behavior
    445                 // at lex-time, and avoid the need for a separate step that checks for this edge-case
    446                 // specifically.
    447                 result.line_number == self.line_handler.line_number,
    448                 else => false,
    449             };
    450             try self.checkForIllegalCodepoint(codepoint, in_string_literal);
    451             switch (state) {
    452                 .start => switch (c) {
    453                     '\r', '\n' => {
    454                         result.start = self.index + 1;
    455                         result.line_number = self.incrementLineNumber();
    456                     },
    457                     ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
    458                         result.start = self.index + 1;
    459                     },
    460                     // NBSP only counts as whitespace at the start of a line (but
    461                     // can be intermixed with other whitespace). Who knows why.
    462                     '\xA0' => if (self.at_start_of_line) {
    463                         result.start = self.index + codepoint.byte_len;
    464                     } else {
    465                         state = .literal;
    466                         self.at_start_of_line = false;
    467                     },
    468                     'L', 'l' => {
    469                         state = .literal_or_quoted_wide_string;
    470                         self.at_start_of_line = false;
    471                     },
    472                     'E', 'e' => {
    473                         state = .e;
    474                         self.at_start_of_line = false;
    475                     },
    476                     'B', 'b' => {
    477                         state = .b;
    478                         self.at_start_of_line = false;
    479                     },
    480                     '"' => {
    481                         state = .quoted_ascii_string;
    482                         self.at_start_of_line = false;
    483                         string_literal_collapsing_whitespace = false;
    484                         string_literal_length = 0;
    485 
    486                         var dummy_token = Token{
    487                             .start = self.index,
    488                             .end = self.index,
    489                             .line_number = self.line_handler.line_number,
    490                             .id = .invalid,
    491                         };
    492                         string_literal_column = dummy_token.calculateColumn(self.buffer, 8, null);
    493                     },
    494                     '+', '&', '|' => {
    495                         self.index += 1;
    496                         result.id = .operator;
    497                         self.at_start_of_line = false;
    498                         break;
    499                     },
    500                     '-' => {
    501                         if (context == .expect_operator) {
    502                             self.index += 1;
    503                             result.id = .operator;
    504                             self.at_start_of_line = false;
    505                             break;
    506                         } else {
    507                             state = .number_literal;
    508                             still_could_have_exponent = true;
    509                             exponent_index = null;
    510                             self.at_start_of_line = false;
    511                         }
    512                     },
    513                     '0'...'9', '~' => {
    514                         state = .number_literal;
    515                         still_could_have_exponent = true;
    516                         exponent_index = null;
    517                         self.at_start_of_line = false;
    518                     },
    519                     '#' => {
    520                         if (self.at_start_of_line) {
    521                             state = .preprocessor;
    522                         } else {
    523                             state = .literal;
    524                         }
    525                         self.at_start_of_line = false;
    526                     },
    527                     ';' => {
    528                         state = .semicolon;
    529                         self.at_start_of_line = false;
    530                     },
    531                     '{', '}' => {
    532                         self.index += 1;
    533                         result.id = if (c == '{') .begin else .end;
    534                         self.at_start_of_line = false;
    535                         break;
    536                     },
    537                     '(', ')' => {
    538                         self.index += 1;
    539                         result.id = if (c == '(') .open_paren else .close_paren;
    540                         self.at_start_of_line = false;
    541                         break;
    542                     },
    543                     ',' => {
    544                         self.index += 1;
    545                         result.id = .comma;
    546                         self.at_start_of_line = false;
    547                         break;
    548                     },
    549                     else => {
    550                         if (isNonAsciiDigit(c)) {
    551                             self.error_context_token = .{
    552                                 .id = .number,
    553                                 .start = result.start,
    554                                 .end = self.index + 1,
    555                                 .line_number = self.line_handler.line_number,
    556                             };
    557                             return error.InvalidDigitCharacterInNumberLiteral;
    558                         }
    559                         state = .literal;
    560                         self.at_start_of_line = false;
    561                     },
    562                 },
    563                 .preprocessor => switch (c) {
    564                     '\r', '\n' => {
    565                         try self.evaluatePreprocessorCommand(result.start, self.index);
    566                         result.start = self.index + 1;
    567                         state = .start;
    568                         result.line_number = self.incrementLineNumber();
    569                     },
    570                     else => {},
    571                 },
    572                 // Semi-colon acts as a line-terminator--everything is skipped until
    573                 // the next line.
    574                 .semicolon => switch (c) {
    575                     '\r', '\n' => {
    576                         result.start = self.index + 1;
    577                         state = .start;
    578                         result.line_number = self.incrementLineNumber();
    579                     },
    580                     else => {},
    581                 },
    582                 .number_literal => switch (c) {
    583                     // zig fmt: off
    584                     ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
    585                     '\r', '\n', '"', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
    586                     '\'', ';', '=',
    587                     => {
    588                     // zig fmt: on
    589                         result.id = .number;
    590                         break;
    591                     },
    592                     '0'...'9' => {
    593                         if (exponent_index) |exp_i| {
    594                             if (self.index - 1 == exp_i) {
    595                                 // Note: This being an error is a quirk of the preprocessor used by
    596                                 //       the Win32 RC compiler.
    597                                 self.error_context_token = .{
    598                                     .id = .number,
    599                                     .start = result.start,
    600                                     .end = self.index + 1,
    601                                     .line_number = self.line_handler.line_number,
    602                                 };
    603                                 return error.InvalidNumberWithExponent;
    604                             }
    605                         }
    606                     },
    607                     'e', 'E' => {
    608                         if (still_could_have_exponent) {
    609                             exponent_index = self.index;
    610                             still_could_have_exponent = false;
    611                         }
    612                     },
    613                     else => {
    614                         if (isNonAsciiDigit(c)) {
    615                             self.error_context_token = .{
    616                                 .id = .number,
    617                                 .start = result.start,
    618                                 .end = self.index + 1,
    619                                 .line_number = self.line_handler.line_number,
    620                             };
    621                             return error.InvalidDigitCharacterInNumberLiteral;
    622                         }
    623                         still_could_have_exponent = false;
    624                     },
    625                 },
    626                 .literal_or_quoted_wide_string => switch (c) {
    627                     // zig fmt: off
    628                     ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
    629                     '\r', '\n', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
    630                     '\'', ';', '=',
    631                     // zig fmt: on
    632                     => {
    633                         result.id = .literal;
    634                         break;
    635                     },
    636                     '"' => {
    637                         state = .quoted_wide_string;
    638                         string_literal_collapsing_whitespace = false;
    639                         string_literal_length = 0;
    640 
    641                         var dummy_token = Token{
    642                             .start = self.index,
    643                             .end = self.index,
    644                             .line_number = self.line_handler.line_number,
    645                             .id = .invalid,
    646                         };
    647                         string_literal_column = dummy_token.calculateColumn(self.buffer, 8, null);
    648                     },
    649                     else => {
    650                         state = .literal;
    651                     },
    652                 },
    653                 .literal => switch (c) {
    654                     // zig fmt: off
    655                     ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
    656                     '\r', '\n', '"', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
    657                     '\'', ';', '=',
    658                     => {
    659                     // zig fmt: on
    660                         result.id = .literal;
    661                         break;
    662                     },
    663                     else => {},
    664                 },
    665                 .e => switch (c) {
    666                     'N', 'n' => {
    667                         state = .en;
    668                     },
    669                     else => {
    670                         state = .literal;
    671                         self.index -= 1;
    672                     },
    673                 },
    674                 .en => switch (c) {
    675                     'D', 'd' => {
    676                         result.id = .end;
    677                         self.index += 1;
    678                         break;
    679                     },
    680                     else => {
    681                         state = .literal;
    682                         self.index -= 1;
    683                     },
    684                 },
    685                 .b => switch (c) {
    686                     'E', 'e' => {
    687                         state = .be;
    688                     },
    689                     else => {
    690                         state = .literal;
    691                         self.index -= 1;
    692                     },
    693                 },
    694                 .be => switch (c) {
    695                     'G', 'g' => {
    696                         state = .beg;
    697                     },
    698                     else => {
    699                         state = .literal;
    700                         self.index -= 1;
    701                     },
    702                 },
    703                 .beg => switch (c) {
    704                     'I', 'i' => {
    705                         state = .begi;
    706                     },
    707                     else => {
    708                         state = .literal;
    709                         self.index -= 1;
    710                     },
    711                 },
    712                 .begi => switch (c) {
    713                     'N', 'n' => {
    714                         result.id = .begin;
    715                         self.index += 1;
    716                         break;
    717                     },
    718                     else => {
    719                         state = .literal;
    720                         self.index -= 1;
    721                     },
    722                 },
    723                 .quoted_ascii_string, .quoted_wide_string => switch (c) {
    724                     '"' => {
    725                         string_literal_column += 1;
    726                         state = if (state == .quoted_ascii_string) .quoted_ascii_string_maybe_end else .quoted_wide_string_maybe_end;
    727                     },
    728                     '\\' => {
    729                         string_literal_length += 1;
    730                         string_literal_column += 1;
    731                         state = if (state == .quoted_ascii_string) .quoted_ascii_string_escape else .quoted_wide_string_escape;
    732                     },
    733                     '\r' => {
    734                         string_literal_column = 0;
    735                         // \r doesn't count towards string literal length
    736 
    737                         // Increment line number but don't affect the result token's line number
    738                         _ = self.incrementLineNumber();
    739                     },
    740                     '\n' => {
    741                         string_literal_column = 0;
    742                         // first \n expands to <space><\n>
    743                         if (!string_literal_collapsing_whitespace) {
    744                             string_literal_length += 2;
    745                             string_literal_collapsing_whitespace = true;
    746                         }
    747                         // the rest are collapsed into the <space><\n>
    748 
    749                         // Increment line number but don't affect the result token's line number
    750                         _ = self.incrementLineNumber();
    751                     },
    752                     // only \t, space, Vertical Tab, and Form Feed count as whitespace when collapsing
    753                     '\t', ' ', '\x0b', '\x0c' => {
    754                         if (!string_literal_collapsing_whitespace) {
    755                             // Literal tab characters are counted as the number of space characters
    756                             // needed to reach the next 8-column tab stop.
    757                             const width = columnWidth(string_literal_column, @intCast(c), 8);
    758                             string_literal_length += width;
    759                             string_literal_column += width;
    760                         }
    761                     },
    762                     else => {
    763                         string_literal_collapsing_whitespace = false;
    764                         string_literal_length += 1;
    765                         string_literal_column += 1;
    766                     },
    767                 },
    768                 .quoted_ascii_string_escape, .quoted_wide_string_escape => switch (c) {
    769                     '"' => {
    770                         self.error_context_token = .{
    771                             .id = .invalid,
    772                             .start = self.index - 1,
    773                             .end = self.index + 1,
    774                             .line_number = self.line_handler.line_number,
    775                         };
    776                         return error.FoundCStyleEscapedQuote;
    777                     },
    778                     else => {
    779                         string_literal_length += 1;
    780                         string_literal_column += 1;
    781                         state = if (state == .quoted_ascii_string_escape) .quoted_ascii_string else .quoted_wide_string;
    782                     },
    783                 },
    784                 .quoted_ascii_string_maybe_end, .quoted_wide_string_maybe_end => switch (c) {
    785                     '"' => {
    786                         state = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
    787                         // Escaped quotes count as 1 char for string literal length checks.
    788                         // Since we did not increment on the first " (because it could have been
    789                         // the end of the quoted string), we increment here
    790                         string_literal_length += 1;
    791                         string_literal_column += 1;
    792                     },
    793                     else => {
    794                         result.id = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
    795                         break;
    796                     },
    797                 },
    798             }
    799         } else { // got EOF
    800             switch (state) {
    801                 .start => {},
    802                 .semicolon => {
    803                     // Skip past everything up to the EOF
    804                     result.start = self.index;
    805                 },
    806                 .literal_or_quoted_wide_string, .literal, .e, .en, .b, .be, .beg, .begi => {
    807                     result.id = .literal;
    808                 },
    809                 .preprocessor => {
    810                     try self.evaluatePreprocessorCommand(result.start, self.index);
    811                     result.start = self.index;
    812                 },
    813                 .number_literal => {
    814                     result.id = .number;
    815                 },
    816                 .quoted_ascii_string_maybe_end, .quoted_wide_string_maybe_end => {
    817                     result.id = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
    818                 },
    819                 .quoted_ascii_string,
    820                 .quoted_wide_string,
    821                 .quoted_ascii_string_escape,
    822                 .quoted_wide_string_escape,
    823                 => {
    824                     self.error_context_token = .{
    825                         .id = .eof,
    826                         .start = self.index,
    827                         .end = self.index,
    828                         .line_number = self.line_handler.line_number,
    829                     };
    830                     return LexError.UnfinishedStringLiteral;
    831                 },
    832             }
    833         }
    834 
    835         result.end = self.index;
    836 
    837         if (result.id == .quoted_ascii_string or result.id == .quoted_wide_string) {
    838             if (string_literal_length > self.max_string_literal_codepoints) {
    839                 self.error_context_token = result;
    840                 return LexError.StringLiteralTooLong;
    841             }
    842         }
    843 
    844         // EOF tokens must have their start index match the end index
    845         std.debug.assert(result.id != .eof or result.start == result.end);
    846 
    847         return result;
    848     }
    849 
    850     /// Increments line_number appropriately (handling line ending pairs)
    851     /// and returns the new line number.
    852     fn incrementLineNumber(self: *Self) usize {
    853         _ = self.line_handler.incrementLineNumber(self.index);
    854         self.at_start_of_line = true;
    855         return self.line_handler.line_number;
    856     }
    857 
    858     fn checkForIllegalCodepoint(self: *Self, codepoint: code_pages.Codepoint, in_string_literal: bool) LexError!void {
    859         const err = switch (codepoint.value) {
    860             // 0x00 = NUL
    861             // 0x1A = Substitute (treated as EOF)
    862             // NOTE: 0x1A gets treated as EOF by the clang preprocessor so after a .rc file
    863             //       is run through the clang preprocessor it will no longer have 0x1A characters in it.
    864             // 0x7F = DEL (treated as a context-specific terminator by the Windows RC compiler)
    865             0x00, 0x1A, 0x7F => error.IllegalByte,
    866             // 0x01...0x03 result in strange 'macro definition too big' errors when used outside of string literals
    867             // 0x04 is valid but behaves strangely (sort of acts as a 'skip the next character' instruction)
    868             0x01...0x04 => if (!in_string_literal) error.IllegalByteOutsideStringLiterals else return,
    869             // @ and ` both result in error RC2018: unknown character '0x60' (and subsequently
    870             // fatal error RC1116: RC terminating after preprocessor errors) if they are ever used
    871             // outside of string literals. Not exactly sure why this would be the case, though.
    872             // TODO: Make sure there aren't any exceptions
    873             '@', '`' => if (!in_string_literal) error.IllegalByteOutsideStringLiterals else return,
    874             // The Byte Order Mark is mostly skipped over by the Windows RC compiler, but
    875             // there are edge cases where it leads to cryptic 'compiler limit : macro definition too big'
    876             // errors (e.g. a BOM within a number literal). By making this illegal we avoid having to
    877             // deal with a lot of edge cases and remove the potential footgun of the bytes of a BOM
    878             // being 'missing' when included in a string literal (the Windows RC compiler acts as
    879             // if the codepoint was never part of the string literal).
    880             '\u{FEFF}' => error.IllegalByteOrderMark,
    881             // Similar deal with this private use codepoint, it gets skipped/ignored by the
    882             // RC compiler (but without the cryptic errors). Silently dropping bytes still seems like
    883             // enough of a footgun with no real use-cases that it's still worth erroring instead of
    884             // emulating the RC compiler's behavior, though.
    885             '\u{E000}' => error.IllegalPrivateUseCharacter,
    886             // These codepoints lead to strange errors when used outside of string literals,
    887             // and miscompilations when used within string literals. We avoid the miscompilation
    888             // within string literals and emit a warning, but outside of string literals it makes
    889             // more sense to just disallow these codepoints.
    890             0x900, 0xA00, 0xA0D, 0x2000, 0xD00, 0xFFFE, 0xFFFF => if (!in_string_literal) error.IllegalCodepointOutsideStringLiterals else return,
    891             else => return,
    892         };
    893         self.error_context_token = .{
    894             .id = .invalid,
    895             .start = self.index,
    896             .end = self.index + codepoint.byte_len,
    897             .line_number = self.line_handler.line_number,
    898         };
    899         return err;
    900     }
    901 
    902     fn evaluatePreprocessorCommand(self: *Self, start: usize, end: usize) !void {
    903         const token = Token{
    904             .id = .preprocessor_command,
    905             .start = start,
    906             .end = end,
    907             .line_number = self.line_handler.line_number,
    908         };
    909         errdefer self.error_context_token = token;
    910         const full_command = self.buffer[start..end];
    911 
    912         const code_page = (parsePragmaCodePage(full_command) catch |err| switch (err) {
    913             error.NotPragma, error.NotCodePagePragma => return,
    914             else => |e| return e,
    915         }) orelse self.default_code_page;
    916 
    917         // https://learn.microsoft.com/en-us/windows/win32/menurc/pragma-directives
    918         // > This pragma is not supported in an included resource file (.rc)
    919         //
    920         // Even though the Win32 behavior is to just ignore such directives silently,
    921         // this is an error in the lexer to allow for emitting warnings/errors when
    922         // such directives are found if that's wanted. The intention is for the lexer
    923         // to still be able to work correctly after this error is returned.
    924         if (self.source_mappings) |source_mappings| {
    925             if (!source_mappings.isRootFile(token.line_number)) {
    926                 return error.CodePagePragmaInIncludedFile;
    927             }
    928         }
    929 
    930         self.seen_pragma_code_pages +|= 1;
    931         self.last_pragma_code_page_token = token;
    932         self.current_code_page = code_page;
    933     }
    934 
    935     pub fn getErrorDetails(self: Self, lex_err: LexError) ErrorDetails {
    936         const err = switch (lex_err) {
    937             error.UnfinishedStringLiteral => ErrorDetails.Error.unfinished_string_literal,
    938             error.StringLiteralTooLong => return .{
    939                 .err = .string_literal_too_long,
    940                 .code_page = self.current_code_page,
    941                 .token = self.error_context_token.?,
    942                 .extra = .{ .number = self.max_string_literal_codepoints },
    943             },
    944             error.InvalidNumberWithExponent => ErrorDetails.Error.invalid_number_with_exponent,
    945             error.InvalidDigitCharacterInNumberLiteral => ErrorDetails.Error.invalid_digit_character_in_number_literal,
    946             error.IllegalByte => ErrorDetails.Error.illegal_byte,
    947             error.IllegalByteOutsideStringLiterals => ErrorDetails.Error.illegal_byte_outside_string_literals,
    948             error.IllegalCodepointOutsideStringLiterals => ErrorDetails.Error.illegal_codepoint_outside_string_literals,
    949             error.IllegalByteOrderMark => ErrorDetails.Error.illegal_byte_order_mark,
    950             error.IllegalPrivateUseCharacter => ErrorDetails.Error.illegal_private_use_character,
    951             error.FoundCStyleEscapedQuote => ErrorDetails.Error.found_c_style_escaped_quote,
    952             error.CodePagePragmaMissingLeftParen => ErrorDetails.Error.code_page_pragma_missing_left_paren,
    953             error.CodePagePragmaMissingRightParen => ErrorDetails.Error.code_page_pragma_missing_right_paren,
    954             error.CodePagePragmaInvalidCodePage => ErrorDetails.Error.code_page_pragma_invalid_code_page,
    955             error.CodePagePragmaNotInteger => ErrorDetails.Error.code_page_pragma_not_integer,
    956             error.CodePagePragmaOverflow => ErrorDetails.Error.code_page_pragma_overflow,
    957             error.CodePagePragmaUnsupportedCodePage => ErrorDetails.Error.code_page_pragma_unsupported_code_page,
    958             error.CodePagePragmaInIncludedFile => ErrorDetails.Error.code_page_pragma_in_included_file,
    959         };
    960         return .{
    961             .err = err,
    962             .code_page = self.current_code_page,
    963             .token = self.error_context_token.?,
    964         };
    965     }
    966 };
    967 
    968 fn parseCodePageNum(str: []const u8) !u32 {
    969     var x: u32 = 0;
    970     for (str) |c| {
    971         const digit = try std.fmt.charToDigit(c, 10);
    972         if (x != 0) x = try std.math.mul(u32, x, 10);
    973         x = try std.math.add(u32, x, digit);
    974     }
    975     return x;
    976 }
    977 
    978 /// Returns `null` when the code_page is set to DEFAULT
    979 pub fn parsePragmaCodePage(full_command: []const u8) !?SupportedCodePage {
    980     var command = full_command;
    981 
    982     // Anything besides exactly this is ignored by the Windows RC implementation
    983     const expected_directive = "#pragma";
    984     if (!std.mem.startsWith(u8, command, expected_directive)) return error.NotPragma;
    985     command = command[expected_directive.len..];
    986 
    987     if (command.len == 0 or !std.ascii.isWhitespace(command[0])) return error.NotCodePagePragma;
    988     while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
    989         command = command[1..];
    990     }
    991 
    992     // Note: CoDe_PaGeZ is also treated as "code_page" by the Windows RC implementation,
    993     //       and it will error with 'Missing left parenthesis in code_page #pragma'
    994     const expected_extension = "code_page";
    995     if (!std.ascii.startsWithIgnoreCase(command, expected_extension)) return error.NotCodePagePragma;
    996     command = command[expected_extension.len..];
    997 
    998     while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
    999         command = command[1..];
   1000     }
   1001 
   1002     if (command.len == 0 or command[0] != '(') {
   1003         return error.CodePagePragmaMissingLeftParen;
   1004     }
   1005     command = command[1..];
   1006 
   1007     while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
   1008         command = command[1..];
   1009     }
   1010 
   1011     var num_str: []u8 = command[0..0];
   1012     while (command.len > 0 and (command[0] != ')' and !std.ascii.isWhitespace(command[0]))) {
   1013         command = command[1..];
   1014         num_str.len += 1;
   1015     }
   1016 
   1017     if (num_str.len == 0) {
   1018         return error.CodePagePragmaNotInteger;
   1019     }
   1020 
   1021     while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
   1022         command = command[1..];
   1023     }
   1024 
   1025     if (command.len == 0 or command[0] != ')') {
   1026         return error.CodePagePragmaMissingRightParen;
   1027     }
   1028 
   1029     const code_page: ?SupportedCodePage = code_page: {
   1030         if (std.ascii.eqlIgnoreCase("DEFAULT", num_str)) {
   1031             break :code_page null;
   1032         }
   1033 
   1034         // The Win32 compiler behaves fairly strangely around maxInt(u32):
   1035         // - If the overflowed u32 wraps and becomes a known code page ID, then
   1036         //   it will error/warn with "Codepage not valid:  ignored" (depending on /w)
   1037         // - If the overflowed u32 wraps and does not become a known code page ID,
   1038         //   then it will error with 'constant too big' and 'Codepage not integer'
   1039         //
   1040         // Instead of that, we just have a separate error specifically for overflow.
   1041         const num = parseCodePageNum(num_str) catch |err| switch (err) {
   1042             error.InvalidCharacter => return error.CodePagePragmaNotInteger,
   1043             error.Overflow => return error.CodePagePragmaOverflow,
   1044         };
   1045 
   1046         // Anything that starts with 0 but does not resolve to 0 is treated as invalid, e.g. 01252
   1047         if (num_str[0] == '0' and num != 0) {
   1048             return error.CodePagePragmaInvalidCodePage;
   1049         }
   1050         // Anything that resolves to 0 is treated as 'not an integer' by the Win32 implementation.
   1051         else if (num == 0) {
   1052             return error.CodePagePragmaNotInteger;
   1053         }
   1054         // Anything above u16 max is not going to be found since our CodePage enum is backed by a u16.
   1055         if (num > std.math.maxInt(u16)) {
   1056             return error.CodePagePragmaInvalidCodePage;
   1057         }
   1058 
   1059         break :code_page code_pages.getByIdentifierEnsureSupported(@intCast(num)) catch |err| switch (err) {
   1060             error.InvalidCodePage => return error.CodePagePragmaInvalidCodePage,
   1061             error.UnsupportedCodePage => return error.CodePagePragmaUnsupportedCodePage,
   1062         };
   1063     };
   1064 
   1065     return code_page;
   1066 }
   1067 
   1068 fn testLexNormal(source: []const u8, expected_tokens: []const Token.Id) !void {
   1069     var lexer = Lexer.init(source, .{});
   1070     if (dumpTokensDuringTests) std.debug.print("\n----------------------\n{s}\n----------------------\n", .{lexer.buffer});
   1071     for (expected_tokens) |expected_token_id| {
   1072         const token = try lexer.nextNormal();
   1073         if (dumpTokensDuringTests) lexer.dump(&token);
   1074         try std.testing.expectEqual(expected_token_id, token.id);
   1075     }
   1076     const last_token = try lexer.nextNormal();
   1077     try std.testing.expectEqual(Token.Id.eof, last_token.id);
   1078 }
   1079 
   1080 fn expectLexError(expected: LexError, actual: anytype) !void {
   1081     try std.testing.expectError(expected, actual);
   1082     if (dumpTokensDuringTests) std.debug.print("{!}\n", .{actual});
   1083 }
   1084 
   1085 test "normal: numbers" {
   1086     try testLexNormal("1", &.{.number});
   1087     try testLexNormal("-1", &.{.number});
   1088     try testLexNormal("- 1", &.{ .number, .number });
   1089     try testLexNormal("-a", &.{.number});
   1090 }
   1091 
   1092 test "normal: string literals" {
   1093     try testLexNormal("\"\"", &.{.quoted_ascii_string});
   1094     // "" is an escaped "
   1095     try testLexNormal("\" \"\" \"", &.{.quoted_ascii_string});
   1096 }
   1097 
   1098 test "superscript chars and code pages" {
   1099     const firstToken = struct {
   1100         pub fn firstToken(source: []const u8, default_code_page: SupportedCodePage, comptime lex_method: Lexer.LexMethod) LexError!Token {
   1101             var lexer = Lexer.init(source, .{ .default_code_page = default_code_page });
   1102             return lexer.next(lex_method);
   1103         }
   1104     }.firstToken;
   1105     const utf8_source = "²";
   1106     const windows1252_source = "\xB2";
   1107 
   1108     const windows1252_encoded_as_windows1252 = firstToken(windows1252_source, .windows1252, .normal);
   1109     try std.testing.expectError(error.InvalidDigitCharacterInNumberLiteral, windows1252_encoded_as_windows1252);
   1110 
   1111     const utf8_encoded_as_windows1252 = try firstToken(utf8_source, .windows1252, .normal);
   1112     try std.testing.expectEqual(Token{
   1113         .id = .literal,
   1114         .start = 0,
   1115         .end = 2,
   1116         .line_number = 1,
   1117     }, utf8_encoded_as_windows1252);
   1118 
   1119     const utf8_encoded_as_utf8 = firstToken(utf8_source, .utf8, .normal);
   1120     try std.testing.expectError(error.InvalidDigitCharacterInNumberLiteral, utf8_encoded_as_utf8);
   1121 
   1122     const windows1252_encoded_as_utf8 = try firstToken(windows1252_source, .utf8, .normal);
   1123     try std.testing.expectEqual(Token{
   1124         .id = .literal,
   1125         .start = 0,
   1126         .end = 1,
   1127         .line_number = 1,
   1128     }, windows1252_encoded_as_utf8);
   1129 }
	zig fork of https://codeberg.org/ziglang/zig
	Log \| Files \| Refs \| README \| LICENSE