comments.zig (12784B) - Raw
//! Expects to run after a C preprocessor step that preserves comments.
//!
//! `rc` has a peculiar quirk where something like `blah/**/blah` will be
//! transformed into `blahblah` during parsing. However, `clang -E` will
//! transform it into `blah blah`, so in order to match `rc`, we need
//! to remove comments ourselves after the preprocessor runs.
//! Note: Multiline comments that actually span more than one line do
//! get translated to a space character by `rc`.
//!
//! Removing comments before lexing also allows the lexer to not have to
//! deal with comments which would complicate its implementation (this is something
//! of a tradeoff, as removing comments in a separate pass means that we'll
//! need to iterate the source twice instead of once, but having to deal with
//! comments when lexing would be a pain).

const std = @import("std");
const Allocator = std.mem.Allocator;
const UncheckedSliceWriter = @import("utils.zig").UncheckedSliceWriter;
const SourceMappings = @import("source_mapping.zig").SourceMappings;
const LineHandler = @import("lex.zig").LineHandler;
const formsLineEndingPair = @import("source_mapping.zig").formsLineEndingPair;

/// Strips `//` line comments and `/* */` block comments from `source`, writing
/// the remaining bytes to `buf` and returning the written portion of `buf`.
///
/// `buf` must be at least as long as `source`.
/// In-place transformation is supported (i.e. `source` and `buf` can be the same slice).
///
/// - Comment markers inside single- or double-quoted strings are copied through
///   untouched. A string ends at its closing quote or at the first `\r`/`\n`;
///   a backslash makes the following non-newline byte part of the string.
/// - `\r` and `\n` outside of comments, and the line ending that terminates a
///   line comment, are always written. Inside a block comment `\n` is always
///   written, while `\r` is only written when it is part of a CRLF/LFCR pair.
/// - When `source_mappings` is non-null, it is updated (via `collapse`) whenever
///   a bare `\r` inside a block comment is dropped.
///
/// The only source of errors is `SourceMappings.collapse`.
pub fn removeComments(source: []const u8, buf: []u8, source_mappings: ?*SourceMappings) ![]u8 {
    std.debug.assert(buf.len >= source.len);
    var result = UncheckedSliceWriter{ .slice = buf };
    const State = enum {
        /// Outside of any comment or string.
        start,
        /// Saw a `/` that has not been written yet; it may start a comment.
        forward_slash,
        /// Inside a `//` comment, until the next `\r` or `\n`.
        line_comment,
        /// Inside a `/* */` comment.
        multiline_comment,
        /// Inside a `/* */` comment, directly after a `*`.
        multiline_comment_end,
        single_quoted,
        /// Directly after a `\` inside a single-quoted string.
        single_quoted_escape,
        double_quoted,
        /// Directly after a `\` inside a double-quoted string.
        double_quoted_escape,
    };
    var state: State = .start;
    var index: usize = 0;
    // Index of the `/` that put us into `.forward_slash`; it is only written
    // once we know that it does not start a comment.
    var pending_start: ?usize = null;
    var line_handler = LineHandler{ .buffer = source };
    while (index < source.len) : (index += 1) {
        const c = source[index];
        // TODO: Disallow \x1A, \x00, \x7F in comments. At least \x1A and \x00 can definitely
        //       cause errors or parsing weirdness in the Win32 RC compiler. These are disallowed
        //       in the lexer, but comments are stripped before getting to the lexer.
        switch (state) {
            .start => switch (c) {
                '/' => {
                    state = .forward_slash;
                    pending_start = index;
                },
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    result.write(c);
                },
                else => {
                    switch (c) {
                        '"' => state = .double_quoted,
                        '\'' => state = .single_quoted,
                        else => {},
                    }
                    result.write(c);
                },
            },
            .forward_slash => switch (c) {
                '/' => state = .line_comment,
                '*' => {
                    state = .multiline_comment;
                },
                else => {
                    // Not a comment after all: emit the held-back `/` together
                    // with the current byte.
                    _ = line_handler.maybeIncrementLineNumber(index);
                    result.writeSlice(source[pending_start.? .. index + 1]);
                    pending_start = null;
                    state = .start;
                },
            },
            .line_comment => switch (c) {
                '\r', '\n' => {
                    // The line ending itself is not part of the comment.
                    _ = line_handler.incrementLineNumber(index);
                    result.write(c);
                    state = .start;
                },
                else => {},
            },
            .multiline_comment => switch (c) {
                '\r' => try handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings),
                '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    result.write(c);
                },
                '*' => state = .multiline_comment_end,
                else => {},
            },
            .multiline_comment_end => switch (c) {
                '\r' => {
                    try handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings);
                    // We only want to treat this as a newline if it's part of a CRLF pair. If it's
                    // not, then we still want to stay in .multiline_comment_end, so that e.g. `*<\r>/` still
                    // functions as a `*/` comment ending. Kinda crazy, but that's how the Win32 implementation works.
                    if (formsLineEndingPair(source, '\r', index + 1)) {
                        state = .multiline_comment;
                    }
                },
                '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    result.write(c);
                    state = .multiline_comment;
                },
                '/' => {
                    state = .start;
                },
                else => {
                    state = .multiline_comment;
                },
            },
            .single_quoted => switch (c) {
                '\r', '\n' => {
                    // A line ending always terminates the string.
                    _ = line_handler.incrementLineNumber(index);
                    state = .start;
                    result.write(c);
                },
                '\\' => {
                    state = .single_quoted_escape;
                    result.write(c);
                },
                '\'' => {
                    state = .start;
                    result.write(c);
                },
                else => {
                    result.write(c);
                },
            },
            .single_quoted_escape => switch (c) {
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    state = .start;
                    result.write(c);
                },
                else => {
                    state = .single_quoted;
                    result.write(c);
                },
            },
            .double_quoted => switch (c) {
                '\r', '\n' => {
                    // A line ending always terminates the string.
                    _ = line_handler.incrementLineNumber(index);
                    state = .start;
                    result.write(c);
                },
                '\\' => {
                    state = .double_quoted_escape;
                    result.write(c);
                },
                '"' => {
                    state = .start;
                    result.write(c);
                },
                else => {
                    result.write(c);
                },
            },
            .double_quoted_escape => switch (c) {
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    state = .start;
                    result.write(c);
                },
                else => {
                    state = .double_quoted;
                    result.write(c);
                },
            },
        }
    } else {
        // End of input: the switch is kept exhaustive on purpose so that a new
        // state forces a decision about what (if anything) must be flushed.
        switch (state) {
            .start,
            .line_comment,
            .multiline_comment,
            .multiline_comment_end,
            .single_quoted,
            .single_quoted_escape,
            .double_quoted,
            .double_quoted_escape,
            => {},
            .forward_slash => {
                // A trailing `/` was held back and still needs to be written.
                result.writeSlice(source[pending_start.?..index]);
            },
        }
    }
    return result.getWritten();
}

/// Handles a `\r` found inside a block comment (used by both the
/// `.multiline_comment` and `.multiline_comment_end` states of `removeComments`).
///
/// The line number is always advanced first. The `\r` is only written to
/// `result` when it forms a CRLF or LFCR pair; for a bare `\r` the line number
/// is rolled back by one and, if `source_mappings` is non-null, the mapping for
/// that line is collapsed. Errors come from `SourceMappings.collapse` only.
inline fn handleMultilineCarriageReturn(
    source: []const u8,
    line_handler: *LineHandler,
    index: usize,
    result: *UncheckedSliceWriter,
    source_mappings: ?*SourceMappings,
) !void {
    // This is a dumb way to go about this, but basically we want to determine
    // if this is part of a distinct CRLF or LFCR pair. This function call will detect
    // LFCR pairs correctly since the function we're in will only be called on CR,
    // but will not detect CRLF pairs since it only looks at the line ending before the
    // CR. So, we do a second (forward) check if the first fails to detect CRLF that is
    // not part of another pair.
    const is_lfcr_pair = line_handler.currentIndexFormsLineEndingPair(index);
    const is_crlf_pair = !is_lfcr_pair and formsLineEndingPair(source, '\r', index + 1);
    // Note: Bare \r within a multiline comment should *not* be treated as a line ending for the
    // purposes of removing comments, but *should* be treated as a line ending for the
    // purposes of line counting/source mapping
    _ = line_handler.incrementLineNumber(index);
    // So only write the \r if it's part of a CRLF/LFCR pair
    if (is_lfcr_pair or is_crlf_pair) {
        result.write('\r');
    }
    // And otherwise, we want to collapse the source mapping so that we can still know which
    // line came from where.
    else {
        // Because the line gets collapsed, we need to decrement line number so that
        // the next collapse acts on the first of the collapsed line numbers
        line_handler.line_number -= 1;
        if (source_mappings) |mappings| {
            try mappings.collapse(line_handler.line_number, 1);
        }
    }
}

/// Allocating wrapper around `removeComments`.
///
/// Allocates a scratch buffer of `source.len` bytes, strips the comments into it
/// and then reallocs it to the length of the result. The caller owns the
/// returned slice and must free it with `allocator`. If stripping or the final
/// realloc fails, the scratch buffer is freed before the error is returned.
pub fn removeCommentsAlloc(allocator: Allocator, source: []const u8, source_mappings: ?*SourceMappings) ![]u8 {
    const buf = try allocator.alloc(u8, source.len);
    errdefer allocator.free(buf);
    const result = try removeComments(source, buf, source_mappings);
    return allocator.realloc(buf, result.len);
}

/// Test helper: strips comments from `source` (without source mappings) and
/// expects the result to equal `expected`.
fn testRemoveComments(expected: []const u8, source: []const u8) !void {
    const result = try removeCommentsAlloc(std.testing.allocator, source, null);
    defer std.testing.allocator.free(result);

    try std.testing.expectEqualStrings(expected, result);
}

test "basic" {
    try testRemoveComments("", "// comment");
    try testRemoveComments("", "/* comment */");
}

test "mixed" {
    try testRemoveComments("hello", "hello// comment");
    try testRemoveComments("hello", "hel/* comment */lo");
}

test "within a string" {
    // escaped " is \"
    try testRemoveComments(
        \\blah"//som\"/*ething*/"BLAH
    ,
        \\blah"//som\"/*ething*/"BLAH
    );
}

test "line comments retain newlines" {
    try testRemoveComments(
        \\
        \\
        \\
    ,
        \\// comment
        \\// comment
        \\// comment
    );

    try testRemoveComments("\r\n", "//comment\r\n");
}

test "unfinished multiline comment" {
    try testRemoveComments(
        \\unfinished
        \\
    ,
        \\unfinished/*
        \\
    );
}

test "crazy" {
    try testRemoveComments(
        \\blah"/*som*/\""BLAH
    ,
        \\blah"/*som*/\""/*ething*/BLAH
    );

    try testRemoveComments(
        \\blah"/*som*/"BLAH RCDATA "BEGIN END
        \\
        \\
        \\hello
        \\"
    ,
        \\blah"/*som*/"/*ething*/BLAH RCDATA "BEGIN END
        \\// comment
        \\//"blah blah" RCDATA {}
        \\hello
        \\"
    );
}

test "multiline comment with newlines" {
    // bare \r is not treated as a newline
    try testRemoveComments("blahblah", "blah/*some\rthing*/blah");

    try testRemoveComments(
        \\blah
        \\blah
    ,
        \\blah/*some
        \\thing*/blah
    );
    try testRemoveComments(
        "blah\r\nblah",
        "blah/*some\r\nthing*/blah",
    );

    // handle *<not /> correctly
    try testRemoveComments(
        \\blah
        \\
        \\
    ,
        \\blah/*some
        \\thing*
        \\/bl*ah*/
    );
}

test "comments appended to a line" {
    // The space in front of the comment is kept. Regular string literals are
    // used here so that this significant trailing space stays visible and
    // cannot be stripped by an editor.
    try testRemoveComments(
        "blah \nblah",
        "blah // line comment\nblah",
    );
    try testRemoveComments(
        "blah \r\nblah",
        "blah // line comment\r\nblah",
    );
}

test "forward slash only" {
    try testRemoveComments(
        \\ /
        \\/
    ,
        \\ /
        \\/
    );
}

test "remove comments with mappings" {
    const allocator = std.testing.allocator;
    var mut_source = "blah/*\rcommented line*\r/blah".*;
    var mappings = SourceMappings{};
    // Register cleanup before the first fallible call so that a failure in
    // any of the setup steps below cannot leak what `mappings` has allocated.
    defer mappings.deinit(allocator);
    _ = try mappings.files.put(allocator, "test.rc");
    try mappings.set(1, 1, 0);
    try mappings.set(2, 2, 0);
    try mappings.set(3, 3, 0);

    const result = try removeComments(&mut_source, &mut_source, &mappings);

    try std.testing.expectEqualStrings("blahblah", result);
    try std.testing.expectEqual(@as(usize, 1), mappings.end_line);
    try std.testing.expectEqual(@as(usize, 3), mappings.getCorrespondingSpan(1).?.end_line);
}

test "in place" {
    var mut_source = "blah /* comment */ blah".*;
    const result = try removeComments(&mut_source, &mut_source, null);
    try std.testing.expectEqualStrings("blah  blah", result);
}