zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

comments.zig (12784B) - Raw


      1 //! Expects to run after a C preprocessor step that preserves comments.
      2 //!
      3 //! `rc` has a peculiar quirk where something like `blah/**/blah` will be
      4 //! transformed into `blahblah` during parsing. However, `clang -E` will
      5 //! transform it into `blah blah`, so in order to match `rc`, we need
      6 //! to remove comments ourselves after the preprocessor runs.
      7 //! Note: Multiline comments that actually span more than one line do
      8 //!       get translated to a space character by `rc`.
      9 //!
     10 //! Removing comments before lexing also allows the lexer to not have to
     11 //! deal with comments which would complicate its implementation (this is something
     12 //! of a tradeoff, as removing comments in a separate pass means that we'll
     13 //! need to iterate the source twice instead of once, but having to deal with
     14 //! comments when lexing would be a pain).
     15 
     16 const std = @import("std");
     17 const Allocator = std.mem.Allocator;
     18 const UncheckedSliceWriter = @import("utils.zig").UncheckedSliceWriter;
     19 const SourceMappings = @import("source_mapping.zig").SourceMappings;
     20 const LineHandler = @import("lex.zig").LineHandler;
     21 const formsLineEndingPair = @import("source_mapping.zig").formsLineEndingPair;
     22 
     23 /// `buf` must be at least as long as `source`
     24 /// In-place transformation is supported (i.e. `source` and `buf` can be the same slice)
     25 pub fn removeComments(source: []const u8, buf: []u8, source_mappings: ?*SourceMappings) ![]u8 {
     26     std.debug.assert(buf.len >= source.len);
     27     var result = UncheckedSliceWriter{ .slice = buf };
     28     const State = enum {
     29         start,
     30         forward_slash,
     31         line_comment,
     32         multiline_comment,
     33         multiline_comment_end,
     34         single_quoted,
     35         single_quoted_escape,
     36         double_quoted,
     37         double_quoted_escape,
     38     };
     39     var state: State = .start;
     40     var index: usize = 0;
     41     var pending_start: ?usize = null;
     42     var line_handler = LineHandler{ .buffer = source };
     43     while (index < source.len) : (index += 1) {
     44         const c = source[index];
     45         // TODO: Disallow \x1A, \x00, \x7F in comments. At least \x1A and \x00 can definitely
     46         //       cause errors or parsing weirdness in the Win32 RC compiler. These are disallowed
     47         //       in the lexer, but comments are stripped before getting to the lexer.
     48         switch (state) {
     49             .start => switch (c) {
     50                 '/' => {
     51                     state = .forward_slash;
     52                     pending_start = index;
     53                 },
     54                 '\r', '\n' => {
     55                     _ = line_handler.incrementLineNumber(index);
     56                     result.write(c);
     57                 },
     58                 else => {
     59                     switch (c) {
     60                         '"' => state = .double_quoted,
     61                         '\'' => state = .single_quoted,
     62                         else => {},
     63                     }
     64                     result.write(c);
     65                 },
     66             },
     67             .forward_slash => switch (c) {
     68                 '/' => state = .line_comment,
     69                 '*' => {
     70                     state = .multiline_comment;
     71                 },
     72                 else => {
     73                     _ = line_handler.maybeIncrementLineNumber(index);
     74                     result.writeSlice(source[pending_start.? .. index + 1]);
     75                     pending_start = null;
     76                     state = .start;
     77                 },
     78             },
     79             .line_comment => switch (c) {
     80                 '\r', '\n' => {
     81                     _ = line_handler.incrementLineNumber(index);
     82                     result.write(c);
     83                     state = .start;
     84                 },
     85                 else => {},
     86             },
     87             .multiline_comment => switch (c) {
     88                 '\r' => try handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings),
     89                 '\n' => {
     90                     _ = line_handler.incrementLineNumber(index);
     91                     result.write(c);
     92                 },
     93                 '*' => state = .multiline_comment_end,
     94                 else => {},
     95             },
     96             .multiline_comment_end => switch (c) {
     97                 '\r' => {
     98                     try handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings);
     99                     // We only want to treat this as a newline if it's part of a CRLF pair. If it's
    100                     // not, then we still want to stay in .multiline_comment_end, so that e.g. `*<\r>/` still
    101                     // functions as a `*/` comment ending. Kinda crazy, but that's how the Win32 implementation works.
    102                     if (formsLineEndingPair(source, '\r', index + 1)) {
    103                         state = .multiline_comment;
    104                     }
    105                 },
    106                 '\n' => {
    107                     _ = line_handler.incrementLineNumber(index);
    108                     result.write(c);
    109                     state = .multiline_comment;
    110                 },
    111                 '/' => {
    112                     state = .start;
    113                 },
    114                 else => {
    115                     state = .multiline_comment;
    116                 },
    117             },
    118             .single_quoted => switch (c) {
    119                 '\r', '\n' => {
    120                     _ = line_handler.incrementLineNumber(index);
    121                     state = .start;
    122                     result.write(c);
    123                 },
    124                 '\\' => {
    125                     state = .single_quoted_escape;
    126                     result.write(c);
    127                 },
    128                 '\'' => {
    129                     state = .start;
    130                     result.write(c);
    131                 },
    132                 else => {
    133                     result.write(c);
    134                 },
    135             },
    136             .single_quoted_escape => switch (c) {
    137                 '\r', '\n' => {
    138                     _ = line_handler.incrementLineNumber(index);
    139                     state = .start;
    140                     result.write(c);
    141                 },
    142                 else => {
    143                     state = .single_quoted;
    144                     result.write(c);
    145                 },
    146             },
    147             .double_quoted => switch (c) {
    148                 '\r', '\n' => {
    149                     _ = line_handler.incrementLineNumber(index);
    150                     state = .start;
    151                     result.write(c);
    152                 },
    153                 '\\' => {
    154                     state = .double_quoted_escape;
    155                     result.write(c);
    156                 },
    157                 '"' => {
    158                     state = .start;
    159                     result.write(c);
    160                 },
    161                 else => {
    162                     result.write(c);
    163                 },
    164             },
    165             .double_quoted_escape => switch (c) {
    166                 '\r', '\n' => {
    167                     _ = line_handler.incrementLineNumber(index);
    168                     state = .start;
    169                     result.write(c);
    170                 },
    171                 else => {
    172                     state = .double_quoted;
    173                     result.write(c);
    174                 },
    175             },
    176         }
    177     } else {
    178         switch (state) {
    179             .start,
    180             .line_comment,
    181             .multiline_comment,
    182             .multiline_comment_end,
    183             .single_quoted,
    184             .single_quoted_escape,
    185             .double_quoted,
    186             .double_quoted_escape,
    187             => {},
    188             .forward_slash => {
    189                 result.writeSlice(source[pending_start.?..index]);
    190             },
    191         }
    192     }
    193     return result.getWritten();
    194 }
    195 
    196 inline fn handleMultilineCarriageReturn(
    197     source: []const u8,
    198     line_handler: *LineHandler,
    199     index: usize,
    200     result: *UncheckedSliceWriter,
    201     source_mappings: ?*SourceMappings,
    202 ) !void {
    203     // This is a dumb way to go about this, but basically we want to determine
    204     // if this is part of a distinct CRLF or LFCR pair. This function call will detect
    205     // LFCR pairs correctly since the function we're in will only be called on CR,
    206     // but will not detect CRLF pairs since it only looks at the line ending before the
    207     // CR. So, we do a second (forward) check if the first fails to detect CRLF that is
    208     // not part of another pair.
    209     const is_lfcr_pair = line_handler.currentIndexFormsLineEndingPair(index);
    210     const is_crlf_pair = !is_lfcr_pair and formsLineEndingPair(source, '\r', index + 1);
    211     // Note: Bare \r within a multiline comment should *not* be treated as a line ending for the
    212     // purposes of removing comments, but *should* be treated as a line ending for the
    213     // purposes of line counting/source mapping
    214     _ = line_handler.incrementLineNumber(index);
    215     // So only write the \r if it's part of a CRLF/LFCR pair
    216     if (is_lfcr_pair or is_crlf_pair) {
    217         result.write('\r');
    218     }
    219     // And otherwise, we want to collapse the source mapping so that we can still know which
    220     // line came from where.
    221     else {
    222         // Because the line gets collapsed, we need to decrement line number so that
    223         // the next collapse acts on the first of the collapsed line numbers
    224         line_handler.line_number -= 1;
    225         if (source_mappings) |mappings| {
    226             try mappings.collapse(line_handler.line_number, 1);
    227         }
    228     }
    229 }
    230 
    231 pub fn removeCommentsAlloc(allocator: Allocator, source: []const u8, source_mappings: ?*SourceMappings) ![]u8 {
    232     const buf = try allocator.alloc(u8, source.len);
    233     errdefer allocator.free(buf);
    234     const result = try removeComments(source, buf, source_mappings);
    235     return allocator.realloc(buf, result.len);
    236 }
    237 
    238 fn testRemoveComments(expected: []const u8, source: []const u8) !void {
    239     const result = try removeCommentsAlloc(std.testing.allocator, source, null);
    240     defer std.testing.allocator.free(result);
    241 
    242     try std.testing.expectEqualStrings(expected, result);
    243 }
    244 
    245 test "basic" {
    246     try testRemoveComments("", "// comment");
    247     try testRemoveComments("", "/* comment */");
    248 }
    249 
    250 test "mixed" {
    251     try testRemoveComments("hello", "hello// comment");
    252     try testRemoveComments("hello", "hel/* comment */lo");
    253 }
    254 
    255 test "within a string" {
    256     // escaped " is \"
    257     try testRemoveComments(
    258         \\blah"//som\"/*ething*/"BLAH
    259     ,
    260         \\blah"//som\"/*ething*/"BLAH
    261     );
    262 }
    263 
    264 test "line comments retain newlines" {
    265     try testRemoveComments(
    266         \\
    267         \\
    268         \\
    269     ,
    270         \\// comment
    271         \\// comment
    272         \\// comment
    273     );
    274 
    275     try testRemoveComments("\r\n", "//comment\r\n");
    276 }
    277 
    278 test "unfinished multiline comment" {
    279     try testRemoveComments(
    280         \\unfinished
    281         \\
    282     ,
    283         \\unfinished/*
    284         \\
    285     );
    286 }
    287 
    288 test "crazy" {
    289     try testRemoveComments(
    290         \\blah"/*som*/\""BLAH
    291     ,
    292         \\blah"/*som*/\""/*ething*/BLAH
    293     );
    294 
    295     try testRemoveComments(
    296         \\blah"/*som*/"BLAH RCDATA "BEGIN END
    297         \\
    298         \\
    299         \\hello
    300         \\"
    301     ,
    302         \\blah"/*som*/"/*ething*/BLAH RCDATA "BEGIN END
    303         \\// comment
    304         \\//"blah blah" RCDATA {}
    305         \\hello
    306         \\"
    307     );
    308 }
    309 
    310 test "multiline comment with newlines" {
    311     // bare \r is not treated as a newline
    312     try testRemoveComments("blahblah", "blah/*some\rthing*/blah");
    313 
    314     try testRemoveComments(
    315         \\blah
    316         \\blah
    317     ,
    318         \\blah/*some
    319         \\thing*/blah
    320     );
    321     try testRemoveComments(
    322         "blah\r\nblah",
    323         "blah/*some\r\nthing*/blah",
    324     );
    325 
    326     // handle *<not /> correctly
    327     try testRemoveComments(
    328         \\blah
    329         \\
    330         \\
    331     ,
    332         \\blah/*some
    333         \\thing*
    334         \\/bl*ah*/
    335     );
    336 }
    337 
    338 test "comments appended to a line" {
    339     try testRemoveComments(
    340         \\blah 
    341         \\blah
    342     ,
    343         \\blah // line comment
    344         \\blah
    345     );
    346     try testRemoveComments(
    347         "blah \r\nblah",
    348         "blah // line comment\r\nblah",
    349     );
    350 }
    351 
    352 test "forward slash only" {
    353     try testRemoveComments(
    354         \\  /
    355         \\/
    356     ,
    357         \\  /
    358         \\/
    359     );
    360 }
    361 
    362 test "remove comments with mappings" {
    363     const allocator = std.testing.allocator;
    364     var mut_source = "blah/*\rcommented line*\r/blah".*;
    365     var mappings = SourceMappings{};
    366     _ = try mappings.files.put(allocator, "test.rc");
    367     try mappings.set(1, 1, 0);
    368     try mappings.set(2, 2, 0);
    369     try mappings.set(3, 3, 0);
    370     defer mappings.deinit(allocator);
    371 
    372     const result = try removeComments(&mut_source, &mut_source, &mappings);
    373 
    374     try std.testing.expectEqualStrings("blahblah", result);
    375     try std.testing.expectEqual(@as(usize, 1), mappings.end_line);
    376     try std.testing.expectEqual(@as(usize, 3), mappings.getCorrespondingSpan(1).?.end_line);
    377 }
    378 
    379 test "in place" {
    380     var mut_source = "blah /* comment */ blah".*;
    381     const result = try removeComments(&mut_source, &mut_source, null);
    382     try std.testing.expectEqualStrings("blah  blah", result);
    383 }