// literals.zig (49670B) - Raw
const std = @import("std");
const code_pages = @import("code_pages.zig");
const SupportedCodePage = code_pages.SupportedCodePage;
const windows1252 = @import("windows1252.zig");
const ErrorDetails = @import("errors.zig").ErrorDetails;
const DiagnosticsContext = @import("errors.zig").DiagnosticsContext;
const Token = @import("lex.zig").Token;

/// rc is maximally liberal in terms of what it accepts as a number literal
/// for data values. As long as it starts with a number or - or ~, that's good enough.
///
/// Returns false for an empty string; otherwise only the first byte is inspected.
pub fn isValidNumberDataLiteral(str: []const u8) bool {
    if (str.len == 0) return false;
    return switch (str[0]) {
        '~', '-', '0'...'9' => true,
        else => false,
    };
}

/// A slice of source bytes together with the code page used to decode
/// codepoints out of it.
pub const SourceBytes = struct {
    slice: []const u8,
    code_page: SupportedCodePage,
};

/// The two kinds of quoted string literal: `""` (ascii) and `L""` (wide).
pub const StringType = enum { ascii, wide };

/// Valid escapes:
///  "" -> "
///  \a, \A => 0x08 (not 0x07 like in C)
///  \n => 0x0A
///  \r => 0x0D
///  \t, \T => 0x09
///  \\ => \
///  \nnn => byte with numeric value given by nnn interpreted as octal
///          (wraps on overflow, number of digits can be 1-3 for ASCII strings
///          and 1-7 for wide strings)
///  \xhh => byte with numeric value given by hh interpreted as hex
///          (number of digits can be 0-2 for ASCII strings and 0-4 for
///          wide strings)
///  \<\r+> => \
///  \<[\r\n\t ]+> => <nothing>
///
/// Special cases:
///  <\t> => 1-8 spaces, dependent on columns in the source rc file itself
///  <\r> => <nothing>
///  <\n+><\w+?\n?> => <space><\n>
///
/// Special, especially weird case:
///  \"" => "
/// NOTE: This leads to footguns because the preprocessor can start parsing things
///       out-of-sync with the RC compiler, expanding macros within string literals, etc.
///       This parse function handles this case the same as the Windows RC compiler, but
///       \" within a string literal is treated as an error by the lexer, so the relevant
///       branches should never actually be hit during this function.
pub const IterativeStringParser = struct {
    /// The contents of the string literal, with the surrounding quotes
    /// (and the L prefix, if any) already removed by `init`.
    source: []const u8,
    /// Code page used to decode codepoints out of `source`.
    code_page: SupportedCodePage,
    /// The type of the string inferred by the prefix (L"" or "")
    /// This is what matters for things like the maximum digits in an
    /// escape sequence, whether or not invalid escape sequences are skipped, etc.
    declared_string_type: StringType,
    /// A codepoint that must be returned by the next call before any more
    /// of `source` is consumed (used for the `\n` of a collapsed <space><\n>).
    pending_codepoint: ?u21 = null,
    /// Number of spaces still owed to the caller for a tab that is being
    /// expanded to spaces.
    num_pending_spaces: u8 = 0,
    /// Byte offset of the next unconsumed byte of `source`.
    index: usize = 0,
    /// Current column, used only to compute how many spaces a tab expands to.
    column: usize = 0,
    /// When non-null, warnings are appended here while parsing.
    diagnostics: ?DiagnosticsContext = null,
    /// Set once the tab-to-spaces warning has been emitted for this string.
    seen_tab: bool = false,

    const State = enum {
        normal,
        quote,
        newline,
        escaped,
        escaped_cr,
        escaped_newlines,
        escaped_octal,
        escaped_hex,
    };

    /// Creates a parser for the quoted string literal in `bytes.slice`.
    /// The slice must include the surrounding quotes (and the `L`/`l` prefix
    /// for wide strings); callers are expected to have validated that already.
    pub fn init(bytes: SourceBytes, options: StringParseOptions) IterativeStringParser {
        const declared_string_type: StringType = switch (bytes.slice[0]) {
            'L', 'l' => .wide,
            else => .ascii,
        };
        var source = bytes.slice[1 .. bytes.slice.len - 1]; // remove ""
        var column = options.start_column + 1; // for the removed "
        if (declared_string_type == .wide) {
            source = source[1..]; // remove L
            column += 1; // for the removed L
        }
        return .{
            .source = source,
            .code_page = bytes.code_page,
            .declared_string_type = declared_string_type,
            .column = column,
            .diagnostics = options.diagnostics,
        };
    }

    pub const ParsedCodepoint = struct {
        codepoint: u21,
        /// Note: If this is true, `codepoint` will have an effective maximum value
        /// of 0xFFFF, as `codepoint` is calculated using wrapping arithmetic on a u16.
        /// If the value needs to be truncated to a smaller integer (e.g. for ASCII string
        /// literals), then that must be done by the caller.
        from_escaped_integer: bool = false,
        /// Denotes that the codepoint is:
        /// - Escaped (has a \ in front of it), and
        /// - Has a value >= U+10000, meaning it would be encoded as a surrogate
        ///   pair in UTF-16, and
        /// - Is part of a wide string literal
        ///
        /// Normally in wide string literals, invalid escapes are omitted
        /// during parsing (the codepoints are not returned at all during
        /// the `next` call), but this is a special case in which the
        /// escape only applies to the high surrogate pair of the codepoint.
        ///
        /// TODO: Maybe just return the low surrogate codepoint by itself in this case.
        escaped_surrogate_pair: bool = false,
    };

    /// Returns the next parsed codepoint, or null once the string is exhausted.
    ///
    /// When `diagnostics` is set, this additionally appends `rc_would_miscompile_*`
    /// warnings for a fixed set of codepoint values. Codepoints produced by an
    /// escaped integer are exempt from those warnings.
    pub fn next(self: *IterativeStringParser) std.mem.Allocator.Error!?ParsedCodepoint {
        const result = try self.nextUnchecked();
        if (self.diagnostics != null and result != null and !result.?.from_escaped_integer) {
            switch (result.?.codepoint) {
                0x0900, 0x0A00, 0x0A0D, 0x2000, 0x0D00 => {
                    const err: ErrorDetails.Error = if (result.?.codepoint == 0xD00)
                        .rc_would_miscompile_codepoint_skip
                    else
                        .rc_would_miscompile_codepoint_whitespace;
                    try self.diagnostics.?.diagnostics.append(ErrorDetails{
                        .err = err,
                        .type = .warning,
                        .code_page = self.code_page,
                        .token = self.diagnostics.?.token,
                        .extra = .{ .number = result.?.codepoint },
                    });
                },
                0xFFFE, 0xFFFF => {
                    try self.diagnostics.?.diagnostics.append(ErrorDetails{
                        .err = .rc_would_miscompile_codepoint_bom,
                        .type = .warning,
                        .code_page = self.code_page,
                        .token = self.diagnostics.?.token,
                        .extra = .{ .number = result.?.codepoint },
                    });
                    try self.diagnostics.?.diagnostics.append(ErrorDetails{
                        .err = .rc_would_miscompile_codepoint_bom,
                        .type = .note,
                        .code_page = self.code_page,
                        .token = self.diagnostics.?.token,
                        .print_source_line = false,
                        .extra = .{ .number = result.?.codepoint },
                    });
                },
                else => {},
            }
        }
        return result;
    }

    /// Appends the `tab_converted_to_spaces` warning and its accompanying note.
    /// Does nothing when there is no diagnostics context, or when the warning
    /// has already been emitted for this string (it is only reported once).
    fn warnTabConvertedToSpaces(self: *IterativeStringParser) std.mem.Allocator.Error!void {
        if (self.diagnostics == null or self.seen_tab) return;
        try self.diagnostics.?.diagnostics.append(ErrorDetails{
            .err = .tab_converted_to_spaces,
            .type = .warning,
            .code_page = self.code_page,
            .token = self.diagnostics.?.token,
        });
        try self.diagnostics.?.diagnostics.append(ErrorDetails{
            .err = .tab_converted_to_spaces,
            .type = .note,
            .code_page = self.code_page,
            .token = self.diagnostics.?.token,
            .print_source_line = false,
        });
        self.seen_tab = true;
    }

    /// Same as `next`, but without the `rc_would_miscompile_*` checks on the
    /// returned codepoint. Returns null once the string is exhausted.
    pub fn nextUnchecked(self: *IterativeStringParser) std.mem.Allocator.Error!?ParsedCodepoint {
        if (self.num_pending_spaces > 0) {
            // Ensure that we don't get into this predicament so we can ensure that
            // the order of processing any pending stuff doesn't matter
            std.debug.assert(self.pending_codepoint == null);
            self.num_pending_spaces -= 1;
            return .{ .codepoint = ' ' };
        }
        if (self.pending_codepoint) |pending_codepoint| {
            self.pending_codepoint = null;
            return .{ .codepoint = pending_codepoint };
        }
        if (self.index >= self.source.len) return null;

        var state: State = .normal;
        var string_escape_n: u16 = 0;
        var string_escape_i: u8 = 0;
        const max_octal_escape_digits: u8 = switch (self.declared_string_type) {
            .ascii => 3,
            .wide => 7,
        };
        const max_hex_escape_digits: u8 = switch (self.declared_string_type) {
            .ascii => 2,
            .wide => 4,
        };

        // `backtrack` is assigned at the top of every iteration before it is read
        // by either the continue expression or the deferred column update.
        var backtrack: bool = undefined;
        while (self.code_page.codepointAt(self.index, self.source)) |codepoint| : ({
            if (!backtrack) self.index += codepoint.byte_len;
        }) {
            backtrack = false;
            const c = codepoint.value;
            // Runs on every exit from this iteration (including `return`), so the
            // column stays in sync with whatever was actually consumed.
            defer {
                if (!backtrack) {
                    if (c == '\t') {
                        self.column += columnsUntilTabStop(self.column, 8);
                    } else {
                        self.column += codepoint.byte_len;
                    }
                }
            }
            switch (state) {
                .normal => switch (c) {
                    '\\' => state = .escaped,
                    '"' => state = .quote,
                    '\r' => {},
                    '\n' => state = .newline,
                    '\t' => {
                        // Only warn about a tab getting converted to spaces once per string
                        try self.warnTabConvertedToSpaces();
                        const cols = columnsUntilTabStop(self.column, 8);
                        self.num_pending_spaces = @intCast(cols - 1);
                        self.index += codepoint.byte_len;
                        return .{ .codepoint = ' ' };
                    },
                    else => {
                        self.index += codepoint.byte_len;
                        return .{ .codepoint = c };
                    },
                },
                .quote => switch (c) {
                    '"' => {
                        // "" => "
                        self.index += codepoint.byte_len;
                        return .{ .codepoint = '"' };
                    },
                    else => unreachable, // this is a bug in the lexer
                },
                .newline => switch (c) {
                    '\r', ' ', '\t', '\n', '\x0b', '\x0c', '\xa0' => {},
                    else => {
                        // we intentionally avoid incrementing self.index
                        // to handle the current char in the next call,
                        // and we set backtrack so column count is handled correctly
                        backtrack = true;

                        // <space><newline>
                        self.pending_codepoint = '\n';
                        return .{ .codepoint = ' ' };
                    },
                },
                .escaped => switch (c) {
                    '\r' => state = .escaped_cr,
                    '\n' => state = .escaped_newlines,
                    '0'...'7' => {
                        string_escape_n = std.fmt.charToDigit(@intCast(c), 8) catch unreachable;
                        string_escape_i = 1;
                        state = .escaped_octal;
                    },
                    'x', 'X' => {
                        string_escape_n = 0;
                        string_escape_i = 0;
                        state = .escaped_hex;
                    },
                    else => {
                        switch (c) {
                            'a', 'A' => {
                                self.index += codepoint.byte_len;
                                // might be a bug in RC, but matches its behavior
                                return .{ .codepoint = '\x08' };
                            },
                            'n' => {
                                self.index += codepoint.byte_len;
                                return .{ .codepoint = '\n' };
                            },
                            'r' => {
                                self.index += codepoint.byte_len;
                                return .{ .codepoint = '\r' };
                            },
                            't', 'T' => {
                                self.index += codepoint.byte_len;
                                return .{ .codepoint = '\t' };
                            },
                            '\\' => {
                                self.index += codepoint.byte_len;
                                return .{ .codepoint = '\\' };
                            },
                            '"' => {
                                // \" is a special case that doesn't get the \ included,
                                backtrack = true;
                            },
                            else => switch (self.declared_string_type) {
                                .wide => {
                                    // All invalid escape sequences are skipped in wide strings,
                                    // but there is a special case around \<tab> where the \
                                    // is skipped but the tab character is processed.
                                    // It's actually a bit weirder than that, though, since
                                    // the preprocessor is the one that does the <tab> -> spaces
                                    // conversion, so it goes something like this:
                                    //
                                    // Before preprocessing: L"\<tab>"
                                    // After preprocessing:  L"\     "
                                    //
                                    // So the parser only sees an escaped space character followed
                                    // by some other number of spaces >= 0.
                                    //
                                    // However, our preprocessor keeps tab characters intact, so we emulate
                                    // the above behavior by skipping the \ and then outputting one less
                                    // space than normal for the <tab> character.
                                    if (c == '\t') {
                                        // Only warn about a tab getting converted to spaces once per string
                                        try self.warnTabConvertedToSpaces();

                                        const cols = columnsUntilTabStop(self.column, 8);
                                        // If the tab character would only be converted to a single space,
                                        // then we can just skip both the \ and the <tab> and move on.
                                        if (cols > 1) {
                                            self.num_pending_spaces = @intCast(cols - 2);
                                            self.index += codepoint.byte_len;
                                            return .{ .codepoint = ' ' };
                                        }
                                    }
                                    // There's a second special case when the codepoint would be encoded
                                    // as a surrogate pair in UTF-16, as the escape 'applies' to the
                                    // high surrogate pair only in this instance. This is a side-effect
                                    // of the Win32 RC compiler preprocessor outputting UTF-16 and the
                                    // compiler itself seemingly working on code units instead of code points
                                    // in this particular instance.
                                    //
                                    // We emulate this behavior by emitting the codepoint, but with a marker
                                    // that indicates that it needs to be handled specially.
                                    if (c >= 0x10000 and c != code_pages.Codepoint.invalid) {
                                        self.index += codepoint.byte_len;
                                        return .{ .codepoint = c, .escaped_surrogate_pair = true };
                                    }
                                },
                                .ascii => {
                                    // we intentionally avoid incrementing self.index
                                    // to handle the current char in the next call,
                                    // and we set backtrack so column count is handled correctly
                                    backtrack = true;
                                    return .{ .codepoint = '\\' };
                                },
                            },
                        }
                        state = .normal;
                    },
                },
                .escaped_cr => switch (c) {
                    '\r' => {},
                    '\n' => state = .escaped_newlines,
                    else => {
                        // we intentionally avoid incrementing self.index
                        // to handle the current char in the next call,
                        // and we set backtrack so column count is handled correctly
                        backtrack = true;
                        return .{ .codepoint = '\\' };
                    },
                },
                .escaped_newlines => switch (c) {
                    '\r', '\n', '\t', ' ', '\x0b', '\x0c', '\xa0' => {},
                    else => {
                        // backtrack so that we handle the current char properly
                        backtrack = true;
                        state = .normal;
                    },
                },
                .escaped_octal => switch (c) {
                    '0'...'7' => {
                        // Note: We use wrapping arithmetic on a u16 here since there's been no observed
                        // string parsing scenario where an escaped integer with a value >= the u16
                        // max is interpreted as anything but the truncated u16 value.
                        string_escape_n *%= 8;
                        string_escape_n +%= std.fmt.charToDigit(@intCast(c), 8) catch unreachable;
                        string_escape_i += 1;
                        if (string_escape_i == max_octal_escape_digits) {
                            self.index += codepoint.byte_len;
                            return .{ .codepoint = string_escape_n, .from_escaped_integer = true };
                        }
                    },
                    else => {
                        // we intentionally avoid incrementing self.index
                        // to handle the current char in the next call,
                        // and we set backtrack so column count is handled correctly
                        backtrack = true;

                        // write out whatever byte we have parsed so far
                        return .{ .codepoint = string_escape_n, .from_escaped_integer = true };
                    },
                },
                .escaped_hex => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        // At most 4 hex digits are accumulated (see max_hex_escape_digits),
                        // so this cannot exceed the u16 range.
                        string_escape_n *= 16;
                        string_escape_n += std.fmt.charToDigit(@intCast(c), 16) catch unreachable;
                        string_escape_i += 1;
                        if (string_escape_i == max_hex_escape_digits) {
                            self.index += codepoint.byte_len;
                            return .{ .codepoint = string_escape_n, .from_escaped_integer = true };
                        }
                    },
                    else => {
                        // we intentionally avoid incrementing self.index
                        // to handle the current char in the next call,
                        // and we set backtrack so column count is handled correctly
                        backtrack = true;

                        // write out whatever byte we have parsed so far
                        // (even with 0 actual digits, \x alone parses to 0)
                        const escaped_value = string_escape_n;
                        return .{ .codepoint = escaped_value, .from_escaped_integer = true };
                    },
                },
            }
        }

        // The source ran out while in the middle of something; flush what is pending.
        switch (state) {
            .normal, .escaped_newlines => {},
            .newline => {
                // <space><newline>
                self.pending_codepoint = '\n';
                return .{ .codepoint = ' ' };
            },
            .escaped, .escaped_cr => return .{ .codepoint = '\\' },
            .escaped_octal, .escaped_hex => {
                return .{ .codepoint = string_escape_n, .from_escaped_integer = true };
            },
            .quote => unreachable, // this is a bug in the lexer
        }

        return null;
    }
};

/// Options shared by the string parsing functions in this file.
pub const StringParseOptions = struct {
    /// Column of the first byte of the literal; used for tab expansion.
    start_column: usize = 0,
    /// When non-null, warnings encountered during parsing are appended here.
    diagnostics: ?DiagnosticsContext = null,
    /// Code page that the parsed result is encoded for.
    output_code_page: SupportedCodePage,
};

/// Parses the quoted string literal in `bytes` into a newly allocated buffer:
/// `[]u8` for `.ascii`, or a 0-terminated `[:0]u16` of little-endian code units
/// for `.wide`. The caller owns the returned memory.
///
/// For `.wide`, the literal itself must be declared wide (L""); parsing an
/// ASCII-declared literal into a wide string is done by `parseQuotedStringAsWideString`.
pub fn parseQuotedString(
    comptime literal_type: StringType,
    allocator: std.mem.Allocator,
    bytes: SourceBytes,
    options: StringParseOptions,
) !(switch (literal_type) {
    .ascii => []u8,
    .wide => [:0]u16,
}) {
    const T = if (literal_type == .ascii) u8 else u16;
    std.debug.assert(bytes.slice.len >= 2); // must at least have 2 double quote chars

    var buf = try std.array_list.Managed(T).initCapacity(allocator, bytes.slice.len);
    errdefer buf.deinit();

    var iterative_parser = IterativeStringParser.init(bytes, options);

    while (try iterative_parser.next()) |parsed| {
        const c = parsed.codepoint;
        switch (literal_type) {
            .ascii => switch (options.output_code_page) {
                .windows1252 => {
                    if (parsed.from_escaped_integer) {
                        try buf.append(@truncate(c));
                    } else if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
                        try buf.append(best_fit);
                    } else if (c < 0x10000 or c == code_pages.Codepoint.invalid) {
                        try buf.append('?');
                    } else {
                        // Codepoints that would need a UTF-16 surrogate pair become two ?s
                        try buf.appendSlice("??");
                    }
                },
                .utf8 => {
                    var codepoint_to_encode = c;
                    if (parsed.from_escaped_integer) {
                        codepoint_to_encode = @as(T, @truncate(c));
                    }
                    const escaped_integer_outside_ascii_range = parsed.from_escaped_integer and codepoint_to_encode > 0x7F;
                    if (escaped_integer_outside_ascii_range or c == code_pages.Codepoint.invalid) {
                        codepoint_to_encode = '�';
                    }
                    var utf8_buf: [4]u8 = undefined;
                    const utf8_len = std.unicode.utf8Encode(codepoint_to_encode, &utf8_buf) catch unreachable;
                    try buf.appendSlice(utf8_buf[0..utf8_len]);
                },
            },
            .wide => {
                // Parsing any string type as a wide string is handled separately, see parseQuotedStringAsWideString
                std.debug.assert(iterative_parser.declared_string_type == .wide);
                if (parsed.from_escaped_integer) {
                    try buf.append(std.mem.nativeToLittle(u16, @truncate(c)));
                } else if (c == code_pages.Codepoint.invalid) {
                    try buf.append(std.mem.nativeToLittle(u16, '�'));
                } else if (c < 0x10000) {
                    const short: u16 = @intCast(c);
                    try buf.append(std.mem.nativeToLittle(u16, short));
                } else {
                    // An escaped surrogate pair only emits its low surrogate
                    // (see ParsedCodepoint.escaped_surrogate_pair).
                    if (!parsed.escaped_surrogate_pair) {
                        const high = @as(u16, @intCast((c - 0x10000) >> 10)) + 0xD800;
                        try buf.append(std.mem.nativeToLittle(u16, high));
                    }
                    const low = @as(u16, @intCast(c & 0x3FF)) + 0xDC00;
                    try buf.append(std.mem.nativeToLittle(u16, low));
                }
            },
        }
    }

    if (literal_type == .wide) {
        return buf.toOwnedSliceSentinel(0);
    } else {
        return buf.toOwnedSlice();
    }
}

/// Parses a `""` string literal into bytes. The caller owns the returned slice.
pub fn parseQuotedAsciiString(allocator: std.mem.Allocator, bytes: SourceBytes, options: StringParseOptions) ![]u8 {
    std.debug.assert(bytes.slice.len >= 2); // ""
    return parseQuotedString(.ascii, allocator, bytes, options);
}

/// Parses an `L""` string literal into 0-terminated little-endian UTF-16 code units.
/// The caller owns the returned slice.
pub fn parseQuotedWideString(allocator: std.mem.Allocator, bytes: SourceBytes, options: StringParseOptions) ![:0]u16 {
    std.debug.assert(bytes.slice.len >= 3); // L""
    return parseQuotedString(.wide, allocator, bytes, options);
}

/// Parses any string type into a wide string.
/// If the string is declared as a wide string (L""), then it is handled normally.
/// Otherwise, things are fairly normal with the exception of escaped integers.
/// Escaped integers are handled by:
/// - Truncating the escape to a u8
/// - Reinterpreting the u8 as a byte from the *output* code page
/// - Outputting the codepoint that corresponds to the interpreted byte, or � if no such
///   interpretation is possible
/// For example, if the code page is UTF-8, then while \x80 is a valid start byte, it's
/// interpreted as a single byte, so it ends up being seen as invalid and � is outputted.
/// If the code page is Windows-1252, then \x80 is interpreted to be € which has the
/// codepoint U+20AC, so the UTF-16 encoding of U+20AC is outputted.
///
/// The caller owns the returned slice.
pub fn parseQuotedStringAsWideString(allocator: std.mem.Allocator, bytes: SourceBytes, options: StringParseOptions) ![:0]u16 {
    std.debug.assert(bytes.slice.len >= 2); // ""

    if (bytes.slice[0] == 'l' or bytes.slice[0] == 'L') {
        return parseQuotedWideString(allocator, bytes, options);
    }

    // Note: We're only handling the case of parsing an ASCII string into a wide string from here on out.
    // TODO: The logic below is similar to that in AcceleratorKeyCodepointTranslator, might be worth merging the two

    var buf = try std.array_list.Managed(u16).initCapacity(allocator, bytes.slice.len);
    errdefer buf.deinit();

    var iterative_parser = IterativeStringParser.init(bytes, options);

    while (try iterative_parser.next()) |parsed| {
        const c = parsed.codepoint;
        if (parsed.from_escaped_integer) {
            std.debug.assert(c != code_pages.Codepoint.invalid);
            const byte_to_interpret: u8 = @truncate(c);
            const code_unit_to_encode: u16 = switch (options.output_code_page) {
                .windows1252 => windows1252.toCodepoint(byte_to_interpret),
                .utf8 => if (byte_to_interpret > 0x7F) '�' else byte_to_interpret,
            };
            try buf.append(std.mem.nativeToLittle(u16, code_unit_to_encode));
        } else if (c == code_pages.Codepoint.invalid) {
            try buf.append(std.mem.nativeToLittle(u16, '�'));
        } else if (c < 0x10000) {
            const short: u16 = @intCast(c);
            try buf.append(std.mem.nativeToLittle(u16, short));
        } else {
            // An escaped surrogate pair only emits its low surrogate.
            if (!parsed.escaped_surrogate_pair) {
                const high = @as(u16, @intCast((c - 0x10000) >> 10)) + 0xD800;
                try buf.append(std.mem.nativeToLittle(u16, high));
            }
            const low = @as(u16, @intCast(c & 0x3FF)) + 0xDC00;
            try buf.append(std.mem.nativeToLittle(u16, low));
        }
    }

    return buf.toOwnedSliceSentinel(0);
}

test "parse quoted ascii string" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    try std.testing.expectEqualSlices(u8, "hello", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"hello"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // hex with 0 digits
    try std.testing.expectEqualSlices(u8, "\x00", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\x"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // hex max of 2 digits
    try std.testing.expectEqualSlices(u8, "\xFFf", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\XfFf"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // octal with invalid octal digit
    try std.testing.expectEqualSlices(u8, "\x019", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\19"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // escaped quotes
    try std.testing.expectEqualSlices(u8, " \" ", try parseQuotedAsciiString(arena, .{
        .slice =
        \\" "" "
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // backslash right before escaped quotes
    try std.testing.expectEqualSlices(u8, "\"", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\"""
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // octal overflow
    try std.testing.expectEqualSlices(u8, "\x01", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\401"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // escapes
    try std.testing.expectEqualSlices(u8, "\x08\n\r\t\\", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\a\n\r\t\\"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // uppercase escapes
    try std.testing.expectEqualSlices(u8, "\x08\\N\\R\t\\", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\A\N\R\T\\"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // backslash on its own
    try std.testing.expectEqualSlices(u8, "\\", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // unrecognized escapes
    try std.testing.expectEqualSlices(u8, "\\b", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\b"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // escaped carriage returns
    try std.testing.expectEqualSlices(u8, "\\", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\\\r\r\r\r\r\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // escaped newlines
    try std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\\\n\n\n\n\n\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // escaped CRLF pairs
    try std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\\\r\n\r\n\r\n\r\n\r\n\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // escaped newlines with other whitespace
    try std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\\\n \t\r\n \r\t\n \t\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // literal tab characters get converted to spaces (dependent on source file columns)
    // (the tab is at column 1, so it expands to 7 spaces)
    try std.testing.expectEqualSlices(u8, "       ", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\t\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // (the tab is at column 4, so it expands to 4 spaces)
    try std.testing.expectEqualSlices(u8, "abc    ", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"abc\t\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // (the tab is at column 8, so it expands to a full 8 spaces)
    try std.testing.expectEqualSlices(u8, "abcdefg        ", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"abcdefg\t\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // (the \ is kept and the tab at column 2 expands to 6 spaces)
    try std.testing.expectEqualSlices(u8, "\\      ", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\\\t\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // literal CR's get dropped
    try std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\r\r\r\r\r\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // contiguous newlines and whitespace get collapsed to <space><newline>
    try std.testing.expectEqualSlices(u8, " \n", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\n\r\r \r\n \t \"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
}

test "parse quoted ascii string with utf8 code page" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    try std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
    // Codepoints that don't have a Windows-1252 representation get converted to ?
    try std.testing.expectEqualSlices(u8, "?????????", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"кириллица\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
    // Codepoints that have a best fit mapping get converted accordingly,
    // these are box drawing codepoints
    try std.testing.expectEqualSlices(u8, "\x2b\x2d\x2b", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"┌─┐\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
    // Invalid UTF-8 gets converted to ? depending on well-formedness
    try std.testing.expectEqualSlices(u8, "????", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\xf0\xf0\x80\x80\x80\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
    // Codepoints that would require a UTF-16 surrogate pair get converted to ??
    try std.testing.expectEqualSlices(u8, "??", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\xF2\xAF\xBA\xB4\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));

    // Output code page changes how invalid UTF-8 gets converted, since it
    // now encodes the result as UTF-8 so it can write replacement characters.
    try std.testing.expectEqualSlices(u8, "����", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\xf0\xf0\x80\x80\x80\"", .code_page = .utf8 },
        .{ .output_code_page = .utf8 },
    ));
    try std.testing.expectEqualSlices(u8, "\xF2\xAF\xBA\xB4", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\xF2\xAF\xBA\xB4\"", .code_page = .utf8 },
        .{ .output_code_page = .utf8 },
    ));

    // This used to cause integer overflow when reconsuming the 4-byte long codepoint
    // after the escaped CRLF pair.
    try std.testing.expectEqualSlices(u8, "\u{10348}", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\\\r\n\u{10348}\"", .code_page = .utf8 },
        .{ .output_code_page = .utf8 },
    ));
}

test "parse quoted string with different input/output code pages" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    try std.testing.expectEqualSlices(u8, "€���\x60\x7F", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\x80\\x8a\\600\\612\\540\\577\"", .code_page = .windows1252 },
        .{ .output_code_page = .utf8 },
    ));
}

test "parse quoted wide string" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("hello"), try parseQuotedWideString(arena, .{
        .slice =
        \\L"hello"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // hex with 0 digits
    try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{0x0}, try parseQuotedWideString(arena, .{
        .slice =
        \\L"\x"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // hex max of 4 digits
    try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{ std.mem.nativeToLittle(u16, 0xFFFF), std.mem.nativeToLittle(u16, 'f') }, try parseQuotedWideString(arena, .{
        .slice =
        \\L"\XfFfFf"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // octal max of 7 digits
    try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{ std.mem.nativeToLittle(u16, 0x9493), std.mem.nativeToLittle(u16, '3'), std.mem.nativeToLittle(u16, '3') }, try parseQuotedWideString(arena, .{
        .slice =
        \\L"\111222333"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // octal overflow
    try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{std.mem.nativeToLittle(u16, 0xFF01)}, try parseQuotedWideString(arena, .{
        .slice =
        \\L"\777401"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // literal tab characters get converted to spaces (dependent on source file columns)
    // (the L prefix shifts the tab to column 9, so it expands to 7 spaces)
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("abcdefg       "), try parseQuotedWideString(
        arena,
        .{ .slice = "L\"abcdefg\t\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // Windows-1252 conversion
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("ðð€€€"), try parseQuotedWideString(
        arena,
        .{ .slice = "L\"\xf0\xf0\x80\x80\x80\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // Invalid escape sequences are skipped
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral(""), try parseQuotedWideString(
        arena,
        .{ .slice = "L\"\\H\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
}

test "parse quoted wide string with utf8 code page" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{}, try parseQuotedWideString(
        arena,
        .{ .slice = "L\"\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("кириллица"), try parseQuotedWideString(
        arena,
        .{ .slice = "L\"кириллица\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
    // Invalid UTF-8 gets converted to � depending on well-formedness
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("����"), try parseQuotedWideString(
        arena,
        .{ .slice = "L\"\xf0\xf0\x80\x80\x80\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
}

test "parse quoted ascii string as wide string" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("кириллица"), try parseQuotedStringAsWideString(
        arena,
        .{ .slice = "\"кириллица\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
    // Whether or not invalid escapes are skipped is still determined by the L prefix
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("\\H"), try parseQuotedStringAsWideString(
        arena,
        .{ .slice = "\"\\H\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral(""), try parseQuotedStringAsWideString(
        arena,
        .{ .slice = "L\"\\H\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // Maximum escape sequence value is also determined by the L prefix
    try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{ std.mem.nativeToLittle(u16, 0x12), std.mem.nativeToLittle(u16, '3'), std.mem.nativeToLittle(u16, '4') }, try parseQuotedStringAsWideString(
        arena,
        .{ .slice = "\"\\x1234\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{std.mem.nativeToLittle(u16, 0x1234)}, try parseQuotedStringAsWideString(
        arena,
        .{ .slice = "L\"\\x1234\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
}

/// Returns the number of columns from `column` to the next tab stop, where tab
/// stops occur every `tab_columns` columns. The result is always in the range
/// 1 to `tab_columns` inclusive. `tab_columns` must be nonzero.
pub fn columnsUntilTabStop(column: usize, tab_columns: usize) usize {
    // 0 => 8, 1 => 7, 2 => 6, 3 => 5, 4 => 4
    // 5 => 3, 6 => 2, 7 => 1, 8 => 8
    return tab_columns - (column % tab_columns);
}

/// Returns how many columns the byte `c` occupies when it appears at `cur_column`:
/// a tab runs to the next tab stop, any other byte counts as one column.
pub fn columnWidth(cur_column: usize, c: u8, tab_columns: usize) usize {
    return switch (c) {
        '\t' => columnsUntilTabStop(cur_column, tab_columns),
        else => 1,
    };
}

/// A parsed number literal: its 32-bit value and whether it is marked as long.
pub const Number = struct {
    value: u32,
    is_long: bool = false,

    /// Returns the low 16 bits of the value.
    pub fn asWord(self: Number) u16 {
        return @truncate(self.value);
    }

    /// Applies the binary operator `operator_char` (one of `-`, `+`, `|`, `&`)
    /// to the two values. Addition and subtraction wrap on overflow. The result
    /// is long if either operand is long.
    pub fn evaluateOperator(lhs: Number, operator_char: u8, rhs: Number) Number {
        const result = switch (operator_char) {
            '-' => lhs.value -% rhs.value,
            '+' => lhs.value +% rhs.value,
            '|' => lhs.value | rhs.value,
            '&' => lhs.value & rhs.value,
            else => unreachable, // invalid operator, this would be a lexer/parser bug
        };
        return .{
            .value = result,
            .is_long = lhs.is_long or rhs.is_long,
        };
    }
};

/// Assumes that number literals normally rejected by RC's
/// preprocessor are similarly rejected before being parsed.
///
/// Relevant RC preprocessor errors:
///  RC2021: expected exponent value, not '<digit>'
///   example that is rejected: 1e1
///   example that is accepted: 1ea
///   (this function will parse the two examples above the same)
///
/// Parsing rules implemented here:
///  - An optional leading `-` (wrapping negation) or `~` (bitwise complement)
///    is applied to the parsed value once all digits have been consumed.
///  - `0x`/`0X` selects hexadecimal and `0o` (lowercase only) selects octal,
///    but only when at least one more byte follows the radix prefix;
///    otherwise the literal is parsed as decimal.
///  - Digits are accumulated with wrapping u32 arithmetic, so literals of any
///    length are accepted.
///  - `L`/`l` marks the number as long and ends parsing; any other character
///    that is not a digit in the current radix ends parsing without an error.
///
/// `bytes.slice` must not be empty (asserted).
pub fn parseNumberLiteral(bytes: SourceBytes) Number {
    std.debug.assert(bytes.slice.len > 0);

    const Prefix = enum { none, minus, complement };
    const prefix: Prefix = switch (bytes.slice[0]) {
        '-' => .minus,
        '~' => .complement,
        else => .none,
    };
    // Skip the sign/complement character, if there was one
    var buf = if (prefix == .none) bytes.slice else bytes.slice[1..];

    var radix: u8 = 10;
    if (buf.len > 2 and buf[0] == '0') {
        switch (buf[1]) {
            'o' => { // octal radix prefix is case-sensitive
                radix = 8;
                buf = buf[2..];
            },
            'x', 'X' => {
                radix = 16;
                buf = buf[2..];
            },
            else => {},
        }
    }

    var result = Number{ .value = 0, .is_long = false };
    var i: usize = 0;
    while (bytes.code_page.codepointAt(i, buf)) |codepoint| : (i += codepoint.byte_len) {
        const c = codepoint.value;
        if (c == 'L' or c == 'l') {
            // Anything after the long suffix is ignored
            result.is_long = true;
            break;
        }
        const digit = switch (c) {
            // On invalid digit for the radix, just stop parsing but don't fail
            0x00...0x7F => std.fmt.charToDigit(@intCast(c), radix) catch break,
            else => break,
        };

        // Wrapping multiply/add; multiplying a zero accumulator is a no-op, so
        // no special case is needed for the first digit.
        result.value *%= radix;
        result.value +%= digit;
    }

    switch (prefix) {
        .none => {},
        .minus => result.value = 0 -% result.value,
        .complement => result.value = ~result.value,
    }

    return result;
}

test "parse number literal" {
    try std.testing.expectEqual(Number{ .value = 0, .is_long = false }, parseNumberLiteral(.{ .slice = "0", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 1, .is_long = false }, parseNumberLiteral(.{ .slice = "1", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 1, .is_long = true }, parseNumberLiteral(.{ .slice = "1L", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 1, .is_long = true }, parseNumberLiteral(.{ .slice = "1l", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 1, .is_long = false }, parseNumberLiteral(.{ .slice = "1garbageL", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 4294967295, .is_long = false }, parseNumberLiteral(.{ .slice = "4294967295", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 0, .is_long = false }, parseNumberLiteral(.{ .slice = "4294967296", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 1, .is_long = true }, parseNumberLiteral(.{ .slice = "4294967297L", .code_page = .windows1252 }));

    // can handle any length of number, wraps on overflow appropriately
    const big_overflow = parseNumberLiteral(.{ .slice = "1000000000000000000000000000000000000000000000000000000000000000000000000000000090000000001", .code_page = .windows1252 });
    try std.testing.expectEqual(Number{ .value = 4100654081, .is_long = false }, big_overflow);
    try std.testing.expectEqual(@as(u16, 1025), big_overflow.asWord());

    // hex digits, the radix prefix, and the long suffix are all case-insensitive
    try std.testing.expectEqual(Number{ .value = 0x20, .is_long = false }, parseNumberLiteral(.{ .slice = "0x20", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 0x2A, .is_long = true }, parseNumberLiteral(.{ .slice = "0x2AL", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 0x2A, .is_long = true }, parseNumberLiteral(.{ .slice = "0x2aL", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 0x2A, .is_long = true }, parseNumberLiteral(.{ .slice = "0X2al", .code_page = .windows1252 }));

    try std.testing.expectEqual(Number{ .value = 0o20, .is_long = false }, parseNumberLiteral(.{ .slice = "0o20", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 0o20, .is_long = true }, parseNumberLiteral(.{ .slice = "0o20L", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 0o2, .is_long = false }, parseNumberLiteral(.{ .slice = "0o29", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 0, .is_long = false }, parseNumberLiteral(.{ .slice = "0O29", .code_page = .windows1252 }));

    try std.testing.expectEqual(Number{ .value = 0xFFFFFFFF, .is_long = false }, parseNumberLiteral(.{ .slice = "-1", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 0xFFFFFFFE, .is_long = false }, parseNumberLiteral(.{ .slice = "~1", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 0xFFFFFFFF, .is_long = true }, parseNumberLiteral(.{ .slice = "-4294967297L", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 0xFFFFFFFE, .is_long = true }, parseNumberLiteral(.{ .slice = "~4294967297L", .code_page = .windows1252 }));
    try std.testing.expectEqual(Number{ .value = 0xFFFFFFFD, .is_long = false }, parseNumberLiteral(.{ .slice = "-0X3", .code_page = .windows1252 }));

    // anything after L is ignored
    try std.testing.expectEqual(Number{ .value = 0x2A, .is_long = true }, parseNumberLiteral(.{ .slice = "0x2aL5", .code_page = .windows1252 }));
}