Parser.zig (61116B) - Raw
1 //! A Markdown parser producing `Document`s. 2 //! 3 //! The parser operates at two levels: at the outer level, the parser accepts 4 //! the content of an input document line by line and begins building the _block 5 //! structure_ of the document. This creates a stack of currently open blocks. 6 //! 7 //! When the parser detects the end of a block, it closes the block, popping it 8 //! from the open block stack and completing any additional parsing of the 9 //! block's content. For blocks which contain parseable inline content, this 10 //! invokes the inner level of the parser, handling the _inline structure_ of 11 //! the block. 12 //! 13 //! Inline parsing scans through the collected inline content of a block. When 14 //! it encounters a character that could indicate the beginning of an inline, it 15 //! either handles the inline right away (if possible) or adds it to a pending 16 //! inlines stack. When an inline is completed, it is added to a list of 17 //! completed inlines, which (along with any surrounding text nodes) will become 18 //! the children of the parent inline or the block whose inline content is being 19 //! parsed. 20 21 const std = @import("std"); 22 const mem = std.mem; 23 const assert = std.debug.assert; 24 const isWhitespace = std.ascii.isWhitespace; 25 const Allocator = mem.Allocator; 26 const expectEqual = std.testing.expectEqual; 27 const Document = @import("Document.zig"); 28 const Node = Document.Node; 29 const ExtraIndex = Document.ExtraIndex; 30 const ExtraData = Document.ExtraData; 31 const StringIndex = Document.StringIndex; 32 const ArrayList = std.ArrayListUnmanaged; 33 34 nodes: Node.List = .{}, 35 extra: ArrayList(u32) = .empty, 36 scratch_extra: ArrayList(u32) = .empty, 37 string_bytes: ArrayList(u8) = .empty, 38 scratch_string: ArrayList(u8) = .empty, 39 pending_blocks: ArrayList(Block) = .empty, 40 allocator: Allocator, 41 42 const Parser = @This(); 43 44 /// An arbitrary limit on the maximum number of columns in a table so that 45 /// table-related metadata maintained by the parser does not require dynamic 46 /// memory allocation. 47 const max_table_columns = 128; 48 49 /// A block element which is still receiving children. 50 const Block = struct { 51 tag: Tag, 52 data: Data, 53 extra_start: usize, 54 string_start: usize, 55 56 const Tag = enum { 57 /// Data is `list`. 58 list, 59 /// Data is `list_item`. 60 list_item, 61 /// Data is `table`. 62 table, 63 /// Data is `none`. 64 table_row, 65 /// Data is `heading`. 66 heading, 67 /// Data is `code_block`. 68 code_block, 69 /// Data is `none`. 70 blockquote, 71 /// Data is `none`. 72 paragraph, 73 /// Data is `none`. 74 thematic_break, 75 }; 76 77 const Data = union { 78 none: void, 79 list: struct { 80 marker: ListMarker, 81 /// Between 0 and 999,999,999, inclusive. 82 start: u30, 83 tight: bool, 84 last_line_blank: bool = false, 85 }, 86 list_item: struct { 87 continuation_indent: usize, 88 }, 89 table: struct { 90 column_alignments_buffer: [max_table_columns]Node.TableCellAlignment, 91 column_alignments_len: usize, 92 }, 93 heading: struct { 94 /// Between 1 and 6, inclusive. 95 level: u3, 96 }, 97 code_block: struct { 98 tag: StringIndex, 99 fence_len: usize, 100 indent: usize, 101 }, 102 103 const ListMarker = enum { 104 @"-", 105 @"*", 106 @"+", 107 number_dot, 108 number_paren, 109 }; 110 }; 111 112 const ContentType = enum { 113 blocks, 114 inlines, 115 raw_inlines, 116 nothing, 117 }; 118 119 fn canAccept(b: Block) ContentType { 120 return switch (b.tag) { 121 .list, 122 .list_item, 123 .table, 124 .blockquote, 125 => .blocks, 126 127 .heading, 128 .paragraph, 129 => .inlines, 130 131 .code_block, 132 => .raw_inlines, 133 134 .table_row, 135 .thematic_break, 136 => .nothing, 137 }; 138 } 139 140 /// Attempts to continue `b` using the contents of `line`. If successful, 141 /// returns the remaining portion of `line` to be considered part of `b` 142 /// (e.g. for a blockquote, this would be everything except the leading 143 /// `>`). If unsuccessful, returns null. 144 fn match(b: Block, line: []const u8) ?[]const u8 { 145 const unindented = mem.trimStart(u8, line, " \t"); 146 const indent = line.len - unindented.len; 147 return switch (b.tag) { 148 .list => line, 149 .list_item => if (indent >= b.data.list_item.continuation_indent) 150 line[b.data.list_item.continuation_indent..] 151 else if (unindented.len == 0) 152 // Blank lines should not close list items, since there may be 153 // more indented contents to follow after the blank line. 154 "" 155 else 156 null, 157 .table => if (unindented.len > 0) line else null, 158 .table_row => null, 159 .heading => null, 160 .code_block => code_block: { 161 const trimmed = mem.trimEnd(u8, unindented, " \t"); 162 if (mem.indexOfNone(u8, trimmed, "`") != null or trimmed.len != b.data.code_block.fence_len) { 163 const effective_indent = @min(indent, b.data.code_block.indent); 164 break :code_block line[effective_indent..]; 165 } else { 166 break :code_block null; 167 } 168 }, 169 .blockquote => if (mem.startsWith(u8, unindented, ">")) 170 unindented[1..] 171 else 172 null, 173 .paragraph => if (unindented.len > 0) line else null, 174 .thematic_break => null, 175 }; 176 } 177 }; 178 179 pub fn init(allocator: Allocator) Allocator.Error!Parser { 180 var p: Parser = .{ .allocator = allocator }; 181 try p.nodes.append(allocator, .{ 182 .tag = .root, 183 .data = undefined, 184 }); 185 try p.string_bytes.append(allocator, 0); 186 return p; 187 } 188 189 pub fn deinit(p: *Parser) void { 190 p.nodes.deinit(p.allocator); 191 p.extra.deinit(p.allocator); 192 p.scratch_extra.deinit(p.allocator); 193 p.string_bytes.deinit(p.allocator); 194 p.scratch_string.deinit(p.allocator); 195 p.pending_blocks.deinit(p.allocator); 196 p.* = undefined; 197 } 198 199 /// Accepts a single line of content. `line` should not have a trailing line 200 /// ending character. 201 pub fn feedLine(p: *Parser, line: []const u8) Allocator.Error!void { 202 var rest_line = line; 203 const first_unmatched = for (p.pending_blocks.items, 0..) |b, i| { 204 if (b.match(rest_line)) |rest| { 205 rest_line = rest; 206 } else { 207 break i; 208 } 209 } else p.pending_blocks.items.len; 210 211 const in_code_block = p.pending_blocks.items.len > 0 and 212 p.pending_blocks.getLast().tag == .code_block; 213 const code_block_end = in_code_block and 214 first_unmatched + 1 == p.pending_blocks.items.len; 215 // New blocks cannot be started if we are actively inside a code block or 216 // are just closing one (to avoid interpreting the closing ``` as a new code 217 // block start). 218 var maybe_block_start = if (!in_code_block or first_unmatched + 2 <= p.pending_blocks.items.len) 219 try p.startBlock(rest_line) 220 else 221 null; 222 223 // This is a lazy continuation line if there are no new blocks to open and 224 // the last open block is a paragraph. 225 if (maybe_block_start == null and 226 !isBlank(rest_line) and 227 p.pending_blocks.items.len > 0 and 228 p.pending_blocks.getLast().tag == .paragraph) 229 { 230 try p.addScratchStringLine(mem.trimStart(u8, rest_line, " \t")); 231 return; 232 } 233 234 // If a new block needs to be started, any paragraph needs to be closed, 235 // even though this isn't detected as part of the closing condition for 236 // paragraphs. 237 if (maybe_block_start != null and 238 p.pending_blocks.items.len > 0 and 239 p.pending_blocks.getLast().tag == .paragraph) 240 { 241 try p.closeLastBlock(); 242 } 243 244 while (p.pending_blocks.items.len > first_unmatched) { 245 try p.closeLastBlock(); 246 } 247 248 while (maybe_block_start) |block_start| : (maybe_block_start = try p.startBlock(rest_line)) { 249 try p.appendBlockStart(block_start); 250 // There may be more blocks to start within the same line. 251 rest_line = block_start.rest; 252 // Headings may only contain inline content. 253 if (block_start.tag == .heading) break; 254 // An opening code fence does not contain any additional block or inline 255 // content to process. 256 if (block_start.tag == .code_block) return; 257 } 258 259 // Do not append the end of a code block (```) as textual content. 260 if (code_block_end) return; 261 262 const can_accept = if (p.pending_blocks.getLastOrNull()) |last_pending_block| 263 last_pending_block.canAccept() 264 else 265 .blocks; 266 const rest_line_trimmed = mem.trimStart(u8, rest_line, " \t"); 267 switch (can_accept) { 268 .blocks => { 269 // If we're inside a list item and the rest of the line is blank, it 270 // means that any subsequent child of the list item (or subsequent 271 // item in the list) will cause the containing list to be considered 272 // loose. However, we can't immediately declare that the list is 273 // loose, since we might just be looking at a blank line after the 274 // end of the last item in the list. The final determination will be 275 // made when appending the next child of the list or list item. 276 const maybe_containing_list_index = if (p.pending_blocks.items.len > 0 and p.pending_blocks.getLast().tag == .list_item) 277 p.pending_blocks.items.len - 2 278 else 279 null; 280 281 if (rest_line_trimmed.len > 0) { 282 try p.appendBlockStart(.{ 283 .tag = .paragraph, 284 .data = .{ .none = {} }, 285 .rest = undefined, 286 }); 287 try p.addScratchStringLine(rest_line_trimmed); 288 } 289 290 if (maybe_containing_list_index) |containing_list_index| { 291 p.pending_blocks.items[containing_list_index].data.list.last_line_blank = rest_line_trimmed.len == 0; 292 } 293 }, 294 .inlines => try p.addScratchStringLine(rest_line_trimmed), 295 .raw_inlines => try p.addScratchStringLine(rest_line), 296 .nothing => {}, 297 } 298 } 299 300 /// Completes processing of the input and returns the parsed document. 301 pub fn endInput(p: *Parser) Allocator.Error!Document { 302 while (p.pending_blocks.items.len > 0) { 303 try p.closeLastBlock(); 304 } 305 // There should be no inline content pending after closing the last open 306 // block. 307 assert(p.scratch_string.items.len == 0); 308 309 const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items)); 310 p.nodes.items(.data)[0] = .{ .container = .{ .children = children } }; 311 p.scratch_string.items.len = 0; 312 p.scratch_extra.items.len = 0; 313 314 var nodes = p.nodes.toOwnedSlice(); 315 errdefer nodes.deinit(p.allocator); 316 const extra = try p.extra.toOwnedSlice(p.allocator); 317 errdefer p.allocator.free(extra); 318 const string_bytes = try p.string_bytes.toOwnedSlice(p.allocator); 319 errdefer p.allocator.free(string_bytes); 320 321 return .{ 322 .nodes = nodes, 323 .extra = extra, 324 .string_bytes = string_bytes, 325 }; 326 } 327 328 /// Data describing the start of a new block element. 329 const BlockStart = struct { 330 tag: Tag, 331 data: Data, 332 rest: []const u8, 333 334 const Tag = enum { 335 /// Data is `list_item`. 336 list_item, 337 /// Data is `table_row`. 338 table_row, 339 /// Data is `heading`. 340 heading, 341 /// Data is `code_block`. 342 code_block, 343 /// Data is `none`. 344 blockquote, 345 /// Data is `none`. 346 paragraph, 347 /// Data is `none`. 348 thematic_break, 349 }; 350 351 const Data = union { 352 none: void, 353 list_item: struct { 354 marker: Block.Data.ListMarker, 355 number: u30, 356 continuation_indent: usize, 357 }, 358 table_row: struct { 359 cells_buffer: [max_table_columns][]const u8, 360 cells_len: usize, 361 }, 362 heading: struct { 363 /// Between 1 and 6, inclusive. 364 level: u3, 365 }, 366 code_block: struct { 367 tag: StringIndex, 368 fence_len: usize, 369 indent: usize, 370 }, 371 }; 372 }; 373 374 fn appendBlockStart(p: *Parser, block_start: BlockStart) !void { 375 if (p.pending_blocks.getLastOrNull()) |last_pending_block| { 376 // Close the last block if it is a list and the new block is not a list item 377 // or not of the same marker type. 378 const should_close_list = last_pending_block.tag == .list and 379 (block_start.tag != .list_item or 380 block_start.data.list_item.marker != last_pending_block.data.list.marker); 381 // The last block should also be closed if the new block is not a table 382 // row, which is the only allowed child of a table. 383 const should_close_table = last_pending_block.tag == .table and 384 block_start.tag != .table_row; 385 if (should_close_list or should_close_table) { 386 try p.closeLastBlock(); 387 } 388 } 389 390 if (p.pending_blocks.getLastOrNull()) |last_pending_block| { 391 // If the last block is a list or list item, check for tightness based 392 // on the last line. 393 const maybe_containing_list = switch (last_pending_block.tag) { 394 .list => &p.pending_blocks.items[p.pending_blocks.items.len - 1], 395 .list_item => &p.pending_blocks.items[p.pending_blocks.items.len - 2], 396 else => null, 397 }; 398 if (maybe_containing_list) |containing_list| { 399 if (containing_list.data.list.last_line_blank) { 400 containing_list.data.list.tight = false; 401 } 402 } 403 } 404 405 // Start a new list if the new block is a list item and there is no 406 // containing list yet. 407 if (block_start.tag == .list_item and 408 (p.pending_blocks.items.len == 0 or p.pending_blocks.getLast().tag != .list)) 409 { 410 try p.pending_blocks.append(p.allocator, .{ 411 .tag = .list, 412 .data = .{ .list = .{ 413 .marker = block_start.data.list_item.marker, 414 .start = block_start.data.list_item.number, 415 .tight = true, 416 } }, 417 .string_start = p.scratch_string.items.len, 418 .extra_start = p.scratch_extra.items.len, 419 }); 420 } 421 422 if (block_start.tag == .table_row) { 423 // Likewise, table rows start a table implicitly. 424 if (p.pending_blocks.items.len == 0 or p.pending_blocks.getLast().tag != .table) { 425 try p.pending_blocks.append(p.allocator, .{ 426 .tag = .table, 427 .data = .{ .table = .{ 428 .column_alignments_buffer = undefined, 429 .column_alignments_len = 0, 430 } }, 431 .string_start = p.scratch_string.items.len, 432 .extra_start = p.scratch_extra.items.len, 433 }); 434 } 435 436 const current_row = p.scratch_extra.items.len - p.pending_blocks.getLast().extra_start; 437 if (current_row <= 1) { 438 var buffer: [max_table_columns]Node.TableCellAlignment = undefined; 439 const table_row = &block_start.data.table_row; 440 if (parseTableHeaderDelimiter(table_row.cells_buffer[0..table_row.cells_len], &buffer)) |alignments| { 441 const table = &p.pending_blocks.items[p.pending_blocks.items.len - 1].data.table; 442 @memcpy(table.column_alignments_buffer[0..alignments.len], alignments); 443 table.column_alignments_len = alignments.len; 444 if (current_row == 1) { 445 // We need to go back and mark the header row and its column 446 // alignments. 447 const datas = p.nodes.items(.data); 448 const header_data = datas[p.scratch_extra.getLast()]; 449 for (p.extraChildren(header_data.container.children), 0..) |header_cell, i| { 450 const alignment = if (i < alignments.len) alignments[i] else .unset; 451 const cell_data = &datas[@intFromEnum(header_cell)].table_cell; 452 cell_data.info.alignment = alignment; 453 cell_data.info.header = true; 454 } 455 } 456 return; 457 } 458 } 459 } 460 461 const tag: Block.Tag, const data: Block.Data = switch (block_start.tag) { 462 .list_item => .{ .list_item, .{ .list_item = .{ 463 .continuation_indent = block_start.data.list_item.continuation_indent, 464 } } }, 465 .table_row => .{ .table_row, .{ .none = {} } }, 466 .heading => .{ .heading, .{ .heading = .{ 467 .level = block_start.data.heading.level, 468 } } }, 469 .code_block => .{ .code_block, .{ .code_block = .{ 470 .tag = block_start.data.code_block.tag, 471 .fence_len = block_start.data.code_block.fence_len, 472 .indent = block_start.data.code_block.indent, 473 } } }, 474 .blockquote => .{ .blockquote, .{ .none = {} } }, 475 .paragraph => .{ .paragraph, .{ .none = {} } }, 476 .thematic_break => .{ .thematic_break, .{ .none = {} } }, 477 }; 478 479 try p.pending_blocks.append(p.allocator, .{ 480 .tag = tag, 481 .data = data, 482 .string_start = p.scratch_string.items.len, 483 .extra_start = p.scratch_extra.items.len, 484 }); 485 486 if (tag == .table_row) { 487 // Table rows are unique, since we already have all the children 488 // available in the BlockStart. We can immediately parse and append 489 // these children now. 490 const containing_table = p.pending_blocks.items[p.pending_blocks.items.len - 2]; 491 const table = &containing_table.data.table; 492 const column_alignments = table.column_alignments_buffer[0..table.column_alignments_len]; 493 const table_row = &block_start.data.table_row; 494 for (table_row.cells_buffer[0..table_row.cells_len], 0..) |cell_content, i| { 495 const cell_children = try p.parseInlines(cell_content); 496 const alignment = if (i < column_alignments.len) column_alignments[i] else .unset; 497 const cell = try p.addNode(.{ 498 .tag = .table_cell, 499 .data = .{ .table_cell = .{ 500 .info = .{ 501 .alignment = alignment, 502 .header = false, 503 }, 504 .children = cell_children, 505 } }, 506 }); 507 try p.addScratchExtraNode(cell); 508 } 509 } 510 } 511 512 fn startBlock(p: *Parser, line: []const u8) !?BlockStart { 513 const unindented = mem.trimStart(u8, line, " \t"); 514 const indent = line.len - unindented.len; 515 if (isThematicBreak(line)) { 516 // Thematic breaks take precedence over list items. 517 return .{ 518 .tag = .thematic_break, 519 .data = .{ .none = {} }, 520 .rest = "", 521 }; 522 } else if (startListItem(unindented)) |list_item| { 523 return .{ 524 .tag = .list_item, 525 .data = .{ .list_item = .{ 526 .marker = list_item.marker, 527 .number = list_item.number, 528 .continuation_indent = indent + list_item.marker_len, 529 } }, 530 .rest = list_item.rest, 531 }; 532 } else if (startTableRow(unindented)) |table_row| { 533 return .{ 534 .tag = .table_row, 535 .data = .{ .table_row = .{ 536 .cells_buffer = table_row.cells_buffer, 537 .cells_len = table_row.cells_len, 538 } }, 539 .rest = "", 540 }; 541 } else if (startHeading(unindented)) |heading| { 542 return .{ 543 .tag = .heading, 544 .data = .{ .heading = .{ 545 .level = heading.level, 546 } }, 547 .rest = heading.rest, 548 }; 549 } else if (try p.startCodeBlock(unindented)) |code_block| { 550 return .{ 551 .tag = .code_block, 552 .data = .{ .code_block = .{ 553 .tag = code_block.tag, 554 .fence_len = code_block.fence_len, 555 .indent = indent, 556 } }, 557 .rest = "", 558 }; 559 } else if (startBlockquote(unindented)) |rest| { 560 return .{ 561 .tag = .blockquote, 562 .data = .{ .none = {} }, 563 .rest = rest, 564 }; 565 } else { 566 return null; 567 } 568 } 569 570 const ListItemStart = struct { 571 marker: Block.Data.ListMarker, 572 number: u30, 573 marker_len: usize, 574 rest: []const u8, 575 }; 576 577 fn startListItem(unindented_line: []const u8) ?ListItemStart { 578 if (mem.startsWith(u8, unindented_line, "- ")) { 579 return .{ 580 .marker = .@"-", 581 .number = undefined, 582 .marker_len = 2, 583 .rest = unindented_line[2..], 584 }; 585 } else if (mem.startsWith(u8, unindented_line, "* ")) { 586 return .{ 587 .marker = .@"*", 588 .number = undefined, 589 .marker_len = 2, 590 .rest = unindented_line[2..], 591 }; 592 } else if (mem.startsWith(u8, unindented_line, "+ ")) { 593 return .{ 594 .marker = .@"+", 595 .number = undefined, 596 .marker_len = 2, 597 .rest = unindented_line[2..], 598 }; 599 } 600 601 const number_end = mem.indexOfNone(u8, unindented_line, "0123456789") orelse return null; 602 const after_number = unindented_line[number_end..]; 603 const marker: Block.Data.ListMarker = if (mem.startsWith(u8, after_number, ". ")) 604 .number_dot 605 else if (mem.startsWith(u8, after_number, ") ")) 606 .number_paren 607 else 608 return null; 609 const number = std.fmt.parseInt(u30, unindented_line[0..number_end], 10) catch return null; 610 if (number > 999_999_999) return null; 611 return .{ 612 .marker = marker, 613 .number = number, 614 .marker_len = number_end + 2, 615 .rest = after_number[2..], 616 }; 617 } 618 619 const TableRowStart = struct { 620 cells_buffer: [max_table_columns][]const u8, 621 cells_len: usize, 622 }; 623 624 fn startTableRow(unindented_line: []const u8) ?TableRowStart { 625 if (unindented_line.len < 2 or 626 !mem.startsWith(u8, unindented_line, "|") or 627 mem.endsWith(u8, unindented_line, "\\|") or 628 !mem.endsWith(u8, unindented_line, "|")) return null; 629 630 var cells_buffer: [max_table_columns][]const u8 = undefined; 631 var cells: ArrayList([]const u8) = .initBuffer(&cells_buffer); 632 const table_row_content = unindented_line[1 .. unindented_line.len - 1]; 633 var cell_start: usize = 0; 634 var i: usize = 0; 635 while (i < table_row_content.len) : (i += 1) { 636 switch (table_row_content[i]) { 637 '\\' => i += 1, 638 '|' => { 639 cells.appendBounded(table_row_content[cell_start..i]) catch return null; 640 cell_start = i + 1; 641 }, 642 '`' => { 643 // Ignoring pipes in code spans allows table cells to contain 644 // code using ||, for example. 645 const open_start = i; 646 i = mem.indexOfNonePos(u8, table_row_content, i, "`") orelse return null; 647 const open_len = i - open_start; 648 while (mem.indexOfScalarPos(u8, table_row_content, i, '`')) |close_start| { 649 i = mem.indexOfNonePos(u8, table_row_content, close_start, "`") orelse return null; 650 const close_len = i - close_start; 651 if (close_len == open_len) break; 652 } else return null; 653 }, 654 else => {}, 655 } 656 } 657 cells.appendBounded(table_row_content[cell_start..]) catch return null; 658 659 return .{ .cells_buffer = cells_buffer, .cells_len = cells.items.len }; 660 } 661 662 fn parseTableHeaderDelimiter( 663 row_cells: []const []const u8, 664 buffer: []Node.TableCellAlignment, 665 ) ?[]Node.TableCellAlignment { 666 var alignments: ArrayList(Node.TableCellAlignment) = .initBuffer(buffer); 667 for (row_cells) |content| { 668 const alignment = parseTableHeaderDelimiterCell(content) orelse return null; 669 alignments.appendAssumeCapacity(alignment); 670 } 671 return alignments.items; 672 } 673 674 fn parseTableHeaderDelimiterCell(content: []const u8) ?Node.TableCellAlignment { 675 var state: enum { 676 before_rule, 677 after_left_anchor, 678 in_rule, 679 after_right_anchor, 680 after_rule, 681 } = .before_rule; 682 var left_anchor = false; 683 var right_anchor = false; 684 for (content) |c| { 685 switch (state) { 686 .before_rule => switch (c) { 687 ' ' => {}, 688 ':' => { 689 left_anchor = true; 690 state = .after_left_anchor; 691 }, 692 '-' => state = .in_rule, 693 else => return null, 694 }, 695 .after_left_anchor => switch (c) { 696 '-' => state = .in_rule, 697 else => return null, 698 }, 699 .in_rule => switch (c) { 700 '-' => {}, 701 ':' => { 702 right_anchor = true; 703 state = .after_right_anchor; 704 }, 705 ' ' => state = .after_rule, 706 else => return null, 707 }, 708 .after_right_anchor => switch (c) { 709 ' ' => state = .after_rule, 710 else => return null, 711 }, 712 .after_rule => switch (c) { 713 ' ' => {}, 714 else => return null, 715 }, 716 } 717 } 718 719 switch (state) { 720 .before_rule, 721 .after_left_anchor, 722 => return null, 723 724 .in_rule, 725 .after_right_anchor, 726 .after_rule, 727 => {}, 728 } 729 730 return if (left_anchor and right_anchor) 731 .center 732 else if (left_anchor) 733 .left 734 else if (right_anchor) 735 .right 736 else 737 .unset; 738 } 739 740 test parseTableHeaderDelimiterCell { 741 try expectEqual(null, parseTableHeaderDelimiterCell("")); 742 try expectEqual(null, parseTableHeaderDelimiterCell(" ")); 743 try expectEqual(.unset, parseTableHeaderDelimiterCell("-")); 744 try expectEqual(.unset, parseTableHeaderDelimiterCell(" - ")); 745 try expectEqual(.unset, parseTableHeaderDelimiterCell("----")); 746 try expectEqual(.unset, parseTableHeaderDelimiterCell(" ---- ")); 747 try expectEqual(null, parseTableHeaderDelimiterCell(":")); 748 try expectEqual(null, parseTableHeaderDelimiterCell("::")); 749 try expectEqual(.left, parseTableHeaderDelimiterCell(":-")); 750 try expectEqual(.left, parseTableHeaderDelimiterCell(" :----")); 751 try expectEqual(.center, parseTableHeaderDelimiterCell(":-:")); 752 try expectEqual(.center, parseTableHeaderDelimiterCell(":----:")); 753 try expectEqual(.center, parseTableHeaderDelimiterCell(" :----: ")); 754 try expectEqual(.right, parseTableHeaderDelimiterCell("-:")); 755 try expectEqual(.right, parseTableHeaderDelimiterCell("----:")); 756 try expectEqual(.right, parseTableHeaderDelimiterCell(" ----: ")); 757 } 758 759 const HeadingStart = struct { 760 level: u3, 761 rest: []const u8, 762 }; 763 764 fn startHeading(unindented_line: []const u8) ?HeadingStart { 765 var level: u3 = 0; 766 return for (unindented_line, 0..) |c, i| { 767 switch (c) { 768 '#' => { 769 if (level == 6) break null; 770 level += 1; 771 }, 772 ' ' => { 773 // We must have seen at least one # by this point, since 774 // unindented_line has no leading spaces. 775 assert(level > 0); 776 break .{ 777 .level = level, 778 .rest = unindented_line[i + 1 ..], 779 }; 780 }, 781 else => break null, 782 } 783 } else null; 784 } 785 786 const CodeBlockStart = struct { 787 tag: StringIndex, 788 fence_len: usize, 789 }; 790 791 fn startCodeBlock(p: *Parser, unindented_line: []const u8) !?CodeBlockStart { 792 var fence_len: usize = 0; 793 const tag_bytes = for (unindented_line, 0..) |c, i| { 794 switch (c) { 795 '`' => fence_len += 1, 796 else => break unindented_line[i..], 797 } 798 } else ""; 799 // Code block tags may not contain backticks, since that would create 800 // potential confusion with inline code spans. 801 if (fence_len < 3 or mem.indexOfScalar(u8, tag_bytes, '`') != null) return null; 802 return .{ 803 .tag = try p.addString(mem.trim(u8, tag_bytes, " ")), 804 .fence_len = fence_len, 805 }; 806 } 807 808 fn startBlockquote(unindented_line: []const u8) ?[]const u8 { 809 return if (mem.startsWith(u8, unindented_line, ">")) 810 unindented_line[1..] 811 else 812 null; 813 } 814 815 fn isThematicBreak(line: []const u8) bool { 816 var char: ?u8 = null; 817 var count: usize = 0; 818 for (line) |c| { 819 switch (c) { 820 ' ' => {}, 821 '-', '_', '*' => { 822 if (char != null and c != char.?) return false; 823 char = c; 824 count += 1; 825 }, 826 else => return false, 827 } 828 } 829 return count >= 3; 830 } 831 832 fn closeLastBlock(p: *Parser) !void { 833 const b = p.pending_blocks.pop().?; 834 const node = switch (b.tag) { 835 .list => list: { 836 assert(b.string_start == p.scratch_string.items.len); 837 838 // Although tightness is parsed as a property of the list, it is 839 // stored at the list item level to make it possible to render each 840 // node without any context from its parents. 841 const list_items = p.scratch_extra.items[b.extra_start..]; 842 const node_datas = p.nodes.items(.data); 843 if (!b.data.list.tight) { 844 for (list_items) |list_item| { 845 node_datas[list_item].list_item.tight = false; 846 } 847 } 848 849 const children = try p.addExtraChildren(@ptrCast(list_items)); 850 break :list try p.addNode(.{ 851 .tag = .list, 852 .data = .{ .list = .{ 853 .start = switch (b.data.list.marker) { 854 .number_dot, .number_paren => @enumFromInt(b.data.list.start), 855 .@"-", .@"*", .@"+" => .unordered, 856 }, 857 .children = children, 858 } }, 859 }); 860 }, 861 .list_item => list_item: { 862 assert(b.string_start == p.scratch_string.items.len); 863 const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..])); 864 break :list_item try p.addNode(.{ 865 .tag = .list_item, 866 .data = .{ .list_item = .{ 867 .tight = true, 868 .children = children, 869 } }, 870 }); 871 }, 872 .table => table: { 873 assert(b.string_start == p.scratch_string.items.len); 874 const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..])); 875 break :table try p.addNode(.{ 876 .tag = .table, 877 .data = .{ .container = .{ 878 .children = children, 879 } }, 880 }); 881 }, 882 .table_row => table_row: { 883 assert(b.string_start == p.scratch_string.items.len); 884 const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..])); 885 break :table_row try p.addNode(.{ 886 .tag = .table_row, 887 .data = .{ .container = .{ 888 .children = children, 889 } }, 890 }); 891 }, 892 .heading => heading: { 893 const children = try p.parseInlines(p.scratch_string.items[b.string_start..]); 894 break :heading try p.addNode(.{ 895 .tag = .heading, 896 .data = .{ .heading = .{ 897 .level = b.data.heading.level, 898 .children = children, 899 } }, 900 }); 901 }, 902 .code_block => code_block: { 903 const content = try p.addString(p.scratch_string.items[b.string_start..]); 904 break :code_block try p.addNode(.{ 905 .tag = .code_block, 906 .data = .{ .code_block = .{ 907 .tag = b.data.code_block.tag, 908 .content = content, 909 } }, 910 }); 911 }, 912 .blockquote => blockquote: { 913 assert(b.string_start == p.scratch_string.items.len); 914 const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..])); 915 break :blockquote try p.addNode(.{ 916 .tag = .blockquote, 917 .data = .{ .container = .{ 918 .children = children, 919 } }, 920 }); 921 }, 922 .paragraph => paragraph: { 923 const children = try p.parseInlines(p.scratch_string.items[b.string_start..]); 924 break :paragraph try p.addNode(.{ 925 .tag = .paragraph, 926 .data = .{ .container = .{ 927 .children = children, 928 } }, 929 }); 930 }, 931 .thematic_break => try p.addNode(.{ 932 .tag = .thematic_break, 933 .data = .{ .none = {} }, 934 }), 935 }; 936 p.scratch_string.items.len = b.string_start; 937 p.scratch_extra.items.len = b.extra_start; 938 try p.addScratchExtraNode(node); 939 } 940 941 const InlineParser = struct { 942 parent: *Parser, 943 content: []const u8, 944 pos: usize = 0, 945 pending_inlines: ArrayList(PendingInline) = .empty, 946 completed_inlines: ArrayList(CompletedInline) = .empty, 947 948 const PendingInline = struct { 949 tag: Tag, 950 data: Data, 951 start: usize, 952 953 const Tag = enum { 954 /// Data is `emphasis`. 955 emphasis, 956 /// Data is `none`. 957 link, 958 /// Data is `none`. 959 image, 960 }; 961 962 const Data = union { 963 none: void, 964 emphasis: struct { 965 underscore: bool, 966 run_len: usize, 967 }, 968 }; 969 }; 970 971 const CompletedInline = struct { 972 node: Node.Index, 973 start: usize, 974 len: usize, 975 }; 976 977 fn deinit(ip: *InlineParser) void { 978 ip.pending_inlines.deinit(ip.parent.allocator); 979 ip.completed_inlines.deinit(ip.parent.allocator); 980 } 981 982 /// Parses all of `ip.content`, returning the children of the node 983 /// containing the inline content. 984 fn parse(ip: *InlineParser) Allocator.Error!ExtraIndex { 985 while (ip.pos < ip.content.len) : (ip.pos += 1) { 986 switch (ip.content[ip.pos]) { 987 '\\' => ip.pos += 1, 988 '[' => try ip.pending_inlines.append(ip.parent.allocator, .{ 989 .tag = .link, 990 .data = .{ .none = {} }, 991 .start = ip.pos, 992 }), 993 '!' => if (ip.pos + 1 < ip.content.len and ip.content[ip.pos + 1] == '[') { 994 try ip.pending_inlines.append(ip.parent.allocator, .{ 995 .tag = .image, 996 .data = .{ .none = {} }, 997 .start = ip.pos, 998 }); 999 ip.pos += 1; 1000 }, 1001 ']' => try ip.parseLink(), 1002 '<' => try ip.parseAutolink(), 1003 '*', '_' => try ip.parseEmphasis(), 1004 '`' => try ip.parseCodeSpan(), 1005 'h' => if (ip.pos == 0 or isPreTextAutolink(ip.content[ip.pos - 1])) { 1006 try ip.parseTextAutolink(); 1007 }, 1008 else => {}, 1009 } 1010 } 1011 1012 const children = try ip.encodeChildren(0, ip.content.len); 1013 // There may be pending inlines after parsing (e.g. unclosed emphasis 1014 // runs), but there must not be any completed inlines, since those 1015 // should all be part of `children`. 1016 assert(ip.completed_inlines.items.len == 0); 1017 return children; 1018 } 1019 1020 /// Parses a link, starting at the `]` at the end of the link text. `ip.pos` 1021 /// is left at the closing `)` of the link target or at the closing `]` if 1022 /// there is none. 1023 fn parseLink(ip: *InlineParser) !void { 1024 var i = ip.pending_inlines.items.len; 1025 while (i > 0) { 1026 i -= 1; 1027 if (ip.pending_inlines.items[i].tag == .link or 1028 ip.pending_inlines.items[i].tag == .image) break; 1029 } else return; 1030 const opener = ip.pending_inlines.items[i]; 1031 ip.pending_inlines.shrinkRetainingCapacity(i); 1032 const text_start = switch (opener.tag) { 1033 .link => opener.start + 1, 1034 .image => opener.start + 2, 1035 else => unreachable, 1036 }; 1037 1038 if (ip.pos + 1 >= ip.content.len or ip.content[ip.pos + 1] != '(') return; 1039 const text_end = ip.pos; 1040 1041 const target_start = text_end + 2; 1042 var target_end = target_start; 1043 var nesting_level: usize = 1; 1044 while (target_end < ip.content.len) : (target_end += 1) { 1045 switch (ip.content[target_end]) { 1046 '\\' => target_end += 1, 1047 '(' => nesting_level += 1, 1048 ')' => { 1049 if (nesting_level == 1) break; 1050 nesting_level -= 1; 1051 }, 1052 else => {}, 1053 } 1054 } else return; 1055 ip.pos = target_end; 1056 1057 const children = try ip.encodeChildren(text_start, text_end); 1058 const target = try ip.encodeLinkTarget(target_start, target_end); 1059 1060 const link = try ip.parent.addNode(.{ 1061 .tag = switch (opener.tag) { 1062 .link => .link, 1063 .image => .image, 1064 else => unreachable, 1065 }, 1066 .data = .{ .link = .{ 1067 .target = target, 1068 .children = children, 1069 } }, 1070 }); 1071 try ip.completed_inlines.append(ip.parent.allocator, .{ 1072 .node = link, 1073 .start = opener.start, 1074 .len = ip.pos - opener.start + 1, 1075 }); 1076 } 1077 1078 fn encodeLinkTarget(ip: *InlineParser, start: usize, end: usize) !StringIndex { 1079 // For efficiency, we can encode directly into string_bytes rather than 1080 // creating a temporary string and then encoding it, since this process 1081 // is entirely linear. 1082 const string_top = ip.parent.string_bytes.items.len; 1083 errdefer ip.parent.string_bytes.shrinkRetainingCapacity(string_top); 1084 1085 var text_iter: TextIterator = .{ .content = ip.content[start..end] }; 1086 while (text_iter.next()) |content| { 1087 switch (content) { 1088 .char => |c| try ip.parent.string_bytes.append(ip.parent.allocator, c), 1089 .text => |s| try ip.parent.string_bytes.appendSlice(ip.parent.allocator, s), 1090 .line_break => try ip.parent.string_bytes.appendSlice(ip.parent.allocator, "\\\n"), 1091 } 1092 } 1093 try ip.parent.string_bytes.append(ip.parent.allocator, 0); 1094 return @enumFromInt(string_top); 1095 } 1096 1097 /// Parses an autolink, starting at the opening `<`. `ip.pos` is left at the 1098 /// closing `>`, or remains unchanged at the opening `<` if there is none. 1099 fn parseAutolink(ip: *InlineParser) !void { 1100 const start = ip.pos; 1101 ip.pos += 1; 1102 var state: enum { 1103 start, 1104 scheme, 1105 target, 1106 } = .start; 1107 while (ip.pos < ip.content.len) : (ip.pos += 1) { 1108 switch (state) { 1109 .start => switch (ip.content[ip.pos]) { 1110 'A'...'Z', 'a'...'z' => state = .scheme, 1111 else => break, 1112 }, 1113 .scheme => switch (ip.content[ip.pos]) { 1114 'A'...'Z', 'a'...'z', '0'...'9', '+', '.', '-' => {}, 1115 ':' => state = .target, 1116 else => break, 1117 }, 1118 .target => switch (ip.content[ip.pos]) { 1119 '<', ' ', '\t', '\n' => break, // Not allowed in autolinks 1120 '>' => { 1121 // Backslash escapes are not recognized in autolink targets. 1122 const target = try ip.parent.addString(ip.content[start + 1 .. ip.pos]); 1123 const node = try ip.parent.addNode(.{ 1124 .tag = .autolink, 1125 .data = .{ .text = .{ 1126 .content = target, 1127 } }, 1128 }); 1129 try ip.completed_inlines.append(ip.parent.allocator, .{ 1130 .node = node, 1131 .start = start, 1132 .len = ip.pos - start + 1, 1133 }); 1134 return; 1135 }, 1136 else => {}, 1137 }, 1138 } 1139 } 1140 ip.pos = start; 1141 } 1142 1143 /// Parses a plain text autolink (not delimited by `<>`), starting at the 1144 /// first character in the link (an `h`). `ip.pos` is left at the last 1145 /// character of the link, or remains unchanged if there is no valid link. 1146 fn parseTextAutolink(ip: *InlineParser) !void { 1147 const start = ip.pos; 1148 var state: union(enum) { 1149 /// Inside `http`. Contains the rest of the text to be matched. 1150 http: []const u8, 1151 after_http, 1152 after_https, 1153 /// Inside `://`. Contains the rest of the text to be matched. 1154 authority: []const u8, 1155 /// Inside link content. 1156 content: struct { 1157 start: usize, 1158 paren_nesting: usize, 1159 }, 1160 } = .{ .http = "http" }; 1161 1162 while (ip.pos < ip.content.len) : (ip.pos += 1) { 1163 switch (state) { 1164 .http => |rest| { 1165 if (ip.content[ip.pos] != rest[0]) break; 1166 if (rest.len > 1) { 1167 state = .{ .http = rest[1..] }; 1168 } else { 1169 state = .after_http; 1170 } 1171 }, 1172 .after_http => switch (ip.content[ip.pos]) { 1173 's' => state = .after_https, 1174 ':' => state = .{ .authority = "//" }, 1175 else => break, 1176 }, 1177 .after_https => switch (ip.content[ip.pos]) { 1178 ':' => state = .{ .authority = "//" }, 1179 else => break, 1180 }, 1181 .authority => |rest| { 1182 if (ip.content[ip.pos] != rest[0]) break; 1183 if (rest.len > 1) { 1184 state = .{ .authority = rest[1..] }; 1185 } else { 1186 state = .{ .content = .{ 1187 .start = ip.pos + 1, 1188 .paren_nesting = 0, 1189 } }; 1190 } 1191 }, 1192 .content => |*content| switch (ip.content[ip.pos]) { 1193 ' ', '\t', '\n' => break, 1194 '(' => content.paren_nesting += 1, 1195 ')' => if (content.paren_nesting == 0) { 1196 break; 1197 } else { 1198 content.paren_nesting -= 1; 1199 }, 1200 else => {}, 1201 }, 1202 } 1203 } 1204 1205 switch (state) { 1206 .http, .after_http, .after_https, .authority => { 1207 ip.pos = start; 1208 }, 1209 .content => |content| { 1210 while (ip.pos > content.start and isPostTextAutolink(ip.content[ip.pos - 1])) { 1211 ip.pos -= 1; 1212 } 1213 if (ip.pos == content.start) { 1214 ip.pos = start; 1215 return; 1216 } 1217 1218 const target = try ip.parent.addString(ip.content[start..ip.pos]); 1219 const node = try ip.parent.addNode(.{ 1220 .tag = .autolink, 1221 .data = .{ .text = .{ 1222 .content = target, 1223 } }, 1224 }); 1225 try ip.completed_inlines.append(ip.parent.allocator, .{ 1226 .node = node, 1227 .start = start, 1228 .len = ip.pos - start, 1229 }); 1230 ip.pos -= 1; 1231 }, 1232 } 1233 } 1234 1235 /// Returns whether `c` may appear before a text autolink is recognized. 1236 fn isPreTextAutolink(c: u8) bool { 1237 return switch (c) { 1238 ' ', '\t', '\n', '*', '_', '(' => true, 1239 else => false, 1240 }; 1241 } 1242 1243 /// Returns whether `c` is punctuation that may appear after a text autolink 1244 /// and not be considered part of it. 1245 fn isPostTextAutolink(c: u8) bool { 1246 return switch (c) { 1247 '?', '!', '.', ',', ':', '*', '_' => true, 1248 else => false, 1249 }; 1250 } 1251 1252 /// Parses emphasis, starting at the beginning of a run of `*` or `_` 1253 /// characters. `ip.pos` is left at the last character in the run after 1254 /// parsing. 1255 fn parseEmphasis(ip: *InlineParser) !void { 1256 const char = ip.content[ip.pos]; 1257 var start = ip.pos; 1258 while (ip.pos + 1 < ip.content.len and ip.content[ip.pos + 1] == char) { 1259 ip.pos += 1; 1260 } 1261 var len = ip.pos - start + 1; 1262 const underscore = char == '_'; 1263 const space_before = start == 0 or isWhitespace(ip.content[start - 1]); 1264 const space_after = start + len == ip.content.len or isWhitespace(ip.content[start + len]); 1265 const punct_before = start == 0 or isPunctuation(ip.content[start - 1]); 1266 const punct_after = start + len == ip.content.len or isPunctuation(ip.content[start + len]); 1267 // The rules for when emphasis may be closed or opened are stricter for 1268 // underscores to avoid inappropriately interpreting snake_case words as 1269 // containing emphasis markers. 1270 const can_open = if (underscore) 1271 !space_after and (space_before or punct_before) 1272 else 1273 !space_after; 1274 const can_close = if (underscore) 1275 !space_before and (space_after or punct_after) 1276 else 1277 !space_before; 1278 1279 if (can_close and ip.pending_inlines.items.len > 0) { 1280 var i = ip.pending_inlines.items.len; 1281 while (i > 0 and len > 0) { 1282 i -= 1; 1283 const opener = &ip.pending_inlines.items[i]; 1284 if (opener.tag != .emphasis or 1285 opener.data.emphasis.underscore != underscore) continue; 1286 1287 const close_len = @min(opener.data.emphasis.run_len, len); 1288 const opener_end = opener.start + opener.data.emphasis.run_len; 1289 1290 const emphasis = try ip.encodeEmphasis(opener_end, start, close_len); 1291 const emphasis_start = opener_end - close_len; 1292 const emphasis_len = start - emphasis_start + close_len; 1293 try ip.completed_inlines.append(ip.parent.allocator, .{ 1294 .node = emphasis, 1295 .start = emphasis_start, 1296 .len = emphasis_len, 1297 }); 1298 1299 // There may still be other openers further down in the 1300 // stack to close, or part of this run might serve as an 1301 // opener itself. 1302 start += close_len; 1303 len -= close_len; 1304 1305 // Remove any pending inlines above this on the stack, since 1306 // closing this emphasis will prevent them from being closed. 1307 // Additionally, if this opener is completely consumed by 1308 // being closed, it can be removed. 1309 opener.data.emphasis.run_len -= close_len; 1310 if (opener.data.emphasis.run_len == 0) { 1311 ip.pending_inlines.shrinkRetainingCapacity(i); 1312 } else { 1313 ip.pending_inlines.shrinkRetainingCapacity(i + 1); 1314 } 1315 } 1316 } 1317 1318 if (can_open and len > 0) { 1319 try ip.pending_inlines.append(ip.parent.allocator, .{ 1320 .tag = .emphasis, 1321 .data = .{ .emphasis = .{ 1322 .underscore = underscore, 1323 .run_len = len, 1324 } }, 1325 .start = start, 1326 }); 1327 } 1328 } 1329 1330 /// Encodes emphasis specified by a run of `run_len` emphasis characters, 1331 /// with `start..end` being the range of content contained within the 1332 /// emphasis. 1333 fn encodeEmphasis(ip: *InlineParser, start: usize, end: usize, run_len: usize) !Node.Index { 1334 const children = try ip.encodeChildren(start, end); 1335 var inner = switch (run_len % 3) { 1336 1 => try ip.parent.addNode(.{ 1337 .tag = .emphasis, 1338 .data = .{ .container = .{ 1339 .children = children, 1340 } }, 1341 }), 1342 2 => try ip.parent.addNode(.{ 1343 .tag = .strong, 1344 .data = .{ .container = .{ 1345 .children = children, 1346 } }, 1347 }), 1348 0 => strong_emphasis: { 1349 const strong = try ip.parent.addNode(.{ 1350 .tag = .strong, 1351 .data = .{ .container = .{ 1352 .children = children, 1353 } }, 1354 }); 1355 break :strong_emphasis try ip.parent.addNode(.{ 1356 .tag = .emphasis, 1357 .data = .{ .container = .{ 1358 .children = try ip.parent.addExtraChildren(&.{strong}), 1359 } }, 1360 }); 1361 }, 1362 else => unreachable, 1363 }; 1364 1365 var run_left = run_len; 1366 while (run_left > 3) : (run_left -= 3) { 1367 const strong = try ip.parent.addNode(.{ 1368 .tag = .strong, 1369 .data = .{ .container = .{ 1370 .children = try ip.parent.addExtraChildren(&.{inner}), 1371 } }, 1372 }); 1373 inner = try ip.parent.addNode(.{ 1374 .tag = .emphasis, 1375 .data = .{ .container = .{ 1376 .children = try ip.parent.addExtraChildren(&.{strong}), 1377 } }, 1378 }); 1379 } 1380 1381 return inner; 1382 } 1383 1384 /// Parses a code span, starting at the beginning of the opening backtick 1385 /// run. `ip.pos` is left at the last character in the closing run after 1386 /// parsing. 1387 fn parseCodeSpan(ip: *InlineParser) !void { 1388 const opener_start = ip.pos; 1389 ip.pos = mem.indexOfNonePos(u8, ip.content, ip.pos, "`") orelse ip.content.len; 1390 const opener_len = ip.pos - opener_start; 1391 1392 const start = ip.pos; 1393 const end = while (mem.indexOfScalarPos(u8, ip.content, ip.pos, '`')) |closer_start| { 1394 ip.pos = mem.indexOfNonePos(u8, ip.content, closer_start, "`") orelse ip.content.len; 1395 const closer_len = ip.pos - closer_start; 1396 1397 if (closer_len == opener_len) break closer_start; 1398 } else unterminated: { 1399 ip.pos = ip.content.len; 1400 break :unterminated ip.content.len; 1401 }; 1402 1403 var content = if (start < ip.content.len) 1404 ip.content[start..end] 1405 else 1406 ""; 1407 // This single space removal rule allows code spans to be written which 1408 // start or end with backticks. 1409 if (mem.startsWith(u8, content, " `")) content = content[1..]; 1410 if (mem.endsWith(u8, content, "` ")) content = content[0 .. content.len - 1]; 1411 1412 const text = try ip.parent.addNode(.{ 1413 .tag = .code_span, 1414 .data = .{ .text = .{ 1415 .content = try ip.parent.addString(content), 1416 } }, 1417 }); 1418 try ip.completed_inlines.append(ip.parent.allocator, .{ 1419 .node = text, 1420 .start = opener_start, 1421 .len = ip.pos - opener_start, 1422 }); 1423 // Ensure ip.pos is pointing at the last character of the 1424 // closer, not after it. 1425 ip.pos -= 1; 1426 } 1427 1428 /// Encodes children parsed in the content range `start..end`. The children 1429 /// will be text nodes and any completed inlines within the range. 1430 fn encodeChildren(ip: *InlineParser, start: usize, end: usize) !ExtraIndex { 1431 const scratch_extra_top = ip.parent.scratch_extra.items.len; 1432 defer ip.parent.scratch_extra.shrinkRetainingCapacity(scratch_extra_top); 1433 1434 var child_index = ip.completed_inlines.items.len; 1435 while (child_index > 0 and ip.completed_inlines.items[child_index - 1].start >= start) { 1436 child_index -= 1; 1437 } 1438 const start_child_index = child_index; 1439 1440 var pos = start; 1441 while (child_index < ip.completed_inlines.items.len) : (child_index += 1) { 1442 const child_inline = ip.completed_inlines.items[child_index]; 1443 // Completed inlines must be strictly nested within the encodable 1444 // content. 1445 assert(child_inline.start >= pos and child_inline.start + child_inline.len <= end); 1446 1447 if (child_inline.start > pos) { 1448 try ip.encodeTextNode(pos, child_inline.start); 1449 } 1450 try ip.parent.addScratchExtraNode(child_inline.node); 1451 1452 pos = child_inline.start + child_inline.len; 1453 } 1454 ip.completed_inlines.shrinkRetainingCapacity(start_child_index); 1455 1456 if (pos < end) { 1457 try ip.encodeTextNode(pos, end); 1458 } 1459 1460 const children = ip.parent.scratch_extra.items[scratch_extra_top..]; 1461 return try ip.parent.addExtraChildren(@ptrCast(children)); 1462 } 1463 1464 /// Encodes textual content `ip.content[start..end]` to `scratch_extra`. The 1465 /// encoded content may include both `text` and `line_break` nodes. 1466 fn encodeTextNode(ip: *InlineParser, start: usize, end: usize) !void { 1467 // For efficiency, we can encode directly into string_bytes rather than 1468 // creating a temporary string and then encoding it, since this process 1469 // is entirely linear. 1470 const string_top = ip.parent.string_bytes.items.len; 1471 errdefer ip.parent.string_bytes.shrinkRetainingCapacity(string_top); 1472 1473 var string_start = string_top; 1474 var text_iter: TextIterator = .{ .content = ip.content[start..end] }; 1475 while (text_iter.next()) |content| { 1476 switch (content) { 1477 .char => |c| try ip.parent.string_bytes.append(ip.parent.allocator, c), 1478 .text => |s| try ip.parent.string_bytes.appendSlice(ip.parent.allocator, s), 1479 .line_break => { 1480 if (ip.parent.string_bytes.items.len > string_start) { 1481 try ip.parent.string_bytes.append(ip.parent.allocator, 0); 1482 try ip.parent.addScratchExtraNode(try ip.parent.addNode(.{ 1483 .tag = .text, 1484 .data = .{ .text = .{ 1485 .content = @enumFromInt(string_start), 1486 } }, 1487 })); 1488 string_start = ip.parent.string_bytes.items.len; 1489 } 1490 try ip.parent.addScratchExtraNode(try ip.parent.addNode(.{ 1491 .tag = .line_break, 1492 .data = .{ .none = {} }, 1493 })); 1494 }, 1495 } 1496 } 1497 if (ip.parent.string_bytes.items.len > string_start) { 1498 try ip.parent.string_bytes.append(ip.parent.allocator, 0); 1499 try ip.parent.addScratchExtraNode(try ip.parent.addNode(.{ 1500 .tag = .text, 1501 .data = .{ .text = .{ 1502 .content = @enumFromInt(string_start), 1503 } }, 1504 })); 1505 } 1506 } 1507 1508 /// An iterator over parts of textual content, handling unescaping of 1509 /// escaped characters and line breaks. 1510 const TextIterator = struct { 1511 content: []const u8, 1512 pos: usize = 0, 1513 1514 const Content = union(enum) { 1515 char: u8, 1516 text: []const u8, 1517 line_break, 1518 }; 1519 1520 const replacement = "\u{FFFD}"; 1521 1522 fn next(iter: *TextIterator) ?Content { 1523 if (iter.pos >= iter.content.len) return null; 1524 if (iter.content[iter.pos] == '\\') { 1525 iter.pos += 1; 1526 if (iter.pos == iter.content.len) { 1527 return .{ .char = '\\' }; 1528 } else if (iter.content[iter.pos] == '\n') { 1529 iter.pos += 1; 1530 return .line_break; 1531 } else if (isPunctuation(iter.content[iter.pos])) { 1532 const c = iter.content[iter.pos]; 1533 iter.pos += 1; 1534 return .{ .char = c }; 1535 } else { 1536 return .{ .char = '\\' }; 1537 } 1538 } 1539 return iter.nextCodepoint(); 1540 } 1541 1542 fn nextCodepoint(iter: *TextIterator) ?Content { 1543 switch (iter.content[iter.pos]) { 1544 0 => { 1545 iter.pos += 1; 1546 return .{ .text = replacement }; 1547 }, 1548 1...127 => |c| { 1549 iter.pos += 1; 1550 return .{ .char = c }; 1551 }, 1552 else => |b| { 1553 const cp_len = std.unicode.utf8ByteSequenceLength(b) catch { 1554 iter.pos += 1; 1555 return .{ .text = replacement }; 1556 }; 1557 const is_valid = iter.pos + cp_len <= iter.content.len and 1558 std.unicode.utf8ValidateSlice(iter.content[iter.pos..][0..cp_len]); 1559 const cp_encoded = if (is_valid) 1560 iter.content[iter.pos..][0..cp_len] 1561 else 1562 replacement; 1563 iter.pos += cp_len; 1564 return .{ .text = cp_encoded }; 1565 }, 1566 } 1567 } 1568 }; 1569 }; 1570 1571 fn parseInlines(p: *Parser, content: []const u8) !ExtraIndex { 1572 var ip: InlineParser = .{ 1573 .parent = p, 1574 .content = mem.trim(u8, content, " \t\n"), 1575 }; 1576 defer ip.deinit(); 1577 return try ip.parse(); 1578 } 1579 1580 pub fn extraData(p: Parser, comptime T: type, index: ExtraIndex) ExtraData(T) { 1581 const fields = @typeInfo(T).@"struct".fields; 1582 var i: usize = @intFromEnum(index); 1583 var result: T = undefined; 1584 inline for (fields) |field| { 1585 @field(result, field.name) = switch (field.type) { 1586 u32 => p.extra.items[i], 1587 else => @compileError("bad field type"), 1588 }; 1589 i += 1; 1590 } 1591 return .{ .data = result, .end = i }; 1592 } 1593 1594 pub fn extraChildren(p: Parser, index: ExtraIndex) []const Node.Index { 1595 const children = p.extraData(Node.Children, index); 1596 return @ptrCast(p.extra.items[children.end..][0..children.data.len]); 1597 } 1598 1599 fn addNode(p: *Parser, node: Node) !Node.Index { 1600 const index: Node.Index = @enumFromInt(@as(u32, @intCast(p.nodes.len))); 1601 try p.nodes.append(p.allocator, node); 1602 return index; 1603 } 1604 1605 fn addString(p: *Parser, s: []const u8) !StringIndex { 1606 if (s.len == 0) return .empty; 1607 1608 const index: StringIndex = @enumFromInt(@as(u32, @intCast(p.string_bytes.items.len))); 1609 try p.string_bytes.ensureUnusedCapacity(p.allocator, s.len + 1); 1610 p.string_bytes.appendSliceAssumeCapacity(s); 1611 p.string_bytes.appendAssumeCapacity(0); 1612 return index; 1613 } 1614 1615 fn addExtraChildren(p: *Parser, nodes: []const Node.Index) !ExtraIndex { 1616 const index: ExtraIndex = @enumFromInt(@as(u32, @intCast(p.extra.items.len))); 1617 try p.extra.ensureUnusedCapacity(p.allocator, nodes.len + 1); 1618 p.extra.appendAssumeCapacity(@intCast(nodes.len)); 1619 p.extra.appendSliceAssumeCapacity(@ptrCast(nodes)); 1620 return index; 1621 } 1622 1623 fn addScratchExtraNode(p: *Parser, node: Node.Index) !void { 1624 try p.scratch_extra.append(p.allocator, @intFromEnum(node)); 1625 } 1626 1627 fn addScratchStringLine(p: *Parser, line: []const u8) !void { 1628 try p.scratch_string.ensureUnusedCapacity(p.allocator, line.len + 1); 1629 p.scratch_string.appendSliceAssumeCapacity(line); 1630 p.scratch_string.appendAssumeCapacity('\n'); 1631 } 1632 1633 fn isBlank(line: []const u8) bool { 1634 return mem.indexOfNone(u8, line, " \t") == null; 1635 } 1636 1637 fn isPunctuation(c: u8) bool { 1638 return switch (c) { 1639 '!', 1640 '"', 1641 '#', 1642 '$', 1643 '%', 1644 '&', 1645 '\'', 1646 '(', 1647 ')', 1648 '*', 1649 '+', 1650 ',', 1651 '-', 1652 '.', 1653 '/', 1654 ':', 1655 ';', 1656 '<', 1657 '=', 1658 '>', 1659 '?', 1660 '@', 1661 '[', 1662 '\\', 1663 ']', 1664 '^', 1665 '_', 1666 '`', 1667 '{', 1668 '|', 1669 '}', 1670 '~', 1671 => true, 1672 else => false, 1673 }; 1674 }