Parser.zig - zig - fork of https://codeberg.org/ziglang/zig

Parser.zig (61116B) - Raw
      1 //! A Markdown parser producing `Document`s.
      2 //!
      3 //! The parser operates at two levels: at the outer level, the parser accepts
      4 //! the content of an input document line by line and begins building the _block
      5 //! structure_ of the document. This creates a stack of currently open blocks.
      6 //!
      7 //! When the parser detects the end of a block, it closes the block, popping it
      8 //! from the open block stack and completing any additional parsing of the
      9 //! block's content. For blocks which contain parseable inline content, this
     10 //! invokes the inner level of the parser, handling the _inline structure_ of
     11 //! the block.
     12 //!
     13 //! Inline parsing scans through the collected inline content of a block. When
     14 //! it encounters a character that could indicate the beginning of an inline, it
     15 //! either handles the inline right away (if possible) or adds it to a pending
     16 //! inlines stack. When an inline is completed, it is added to a list of
     17 //! completed inlines, which (along with any surrounding text nodes) will become
     18 //! the children of the parent inline or the block whose inline content is being
     19 //! parsed.
     20 
     21 const std = @import("std");
     22 const mem = std.mem;
     23 const assert = std.debug.assert;
     24 const isWhitespace = std.ascii.isWhitespace;
     25 const Allocator = mem.Allocator;
     26 const expectEqual = std.testing.expectEqual;
     27 const Document = @import("Document.zig");
     28 const Node = Document.Node;
     29 const ExtraIndex = Document.ExtraIndex;
     30 const ExtraData = Document.ExtraData;
     31 const StringIndex = Document.StringIndex;
     32 const ArrayList = std.ArrayListUnmanaged;
     33 
     34 nodes: Node.List = .{},
     35 extra: ArrayList(u32) = .empty,
     36 scratch_extra: ArrayList(u32) = .empty,
     37 string_bytes: ArrayList(u8) = .empty,
     38 scratch_string: ArrayList(u8) = .empty,
     39 pending_blocks: ArrayList(Block) = .empty,
     40 allocator: Allocator,
     41 
     42 const Parser = @This();
     43 
     44 /// An arbitrary limit on the maximum number of columns in a table so that
     45 /// table-related metadata maintained by the parser does not require dynamic
     46 /// memory allocation.
     47 const max_table_columns = 128;
     48 
     49 /// A block element which is still receiving children.
     50 const Block = struct {
     51     tag: Tag,
     52     data: Data,
     53     extra_start: usize,
     54     string_start: usize,
     55 
     56     const Tag = enum {
     57         /// Data is `list`.
     58         list,
     59         /// Data is `list_item`.
     60         list_item,
     61         /// Data is `table`.
     62         table,
     63         /// Data is `none`.
     64         table_row,
     65         /// Data is `heading`.
     66         heading,
     67         /// Data is `code_block`.
     68         code_block,
     69         /// Data is `none`.
     70         blockquote,
     71         /// Data is `none`.
     72         paragraph,
     73         /// Data is `none`.
     74         thematic_break,
     75     };
     76 
     77     const Data = union {
     78         none: void,
     79         list: struct {
     80             marker: ListMarker,
     81             /// Between 0 and 999,999,999, inclusive.
     82             start: u30,
     83             tight: bool,
     84             last_line_blank: bool = false,
     85         },
     86         list_item: struct {
     87             continuation_indent: usize,
     88         },
     89         table: struct {
     90             column_alignments_buffer: [max_table_columns]Node.TableCellAlignment,
     91             column_alignments_len: usize,
     92         },
     93         heading: struct {
     94             /// Between 1 and 6, inclusive.
     95             level: u3,
     96         },
     97         code_block: struct {
     98             tag: StringIndex,
     99             fence_len: usize,
    100             indent: usize,
    101         },
    102 
    103         const ListMarker = enum {
    104             @"-",
    105             @"*",
    106             @"+",
    107             number_dot,
    108             number_paren,
    109         };
    110     };
    111 
    112     const ContentType = enum {
    113         blocks,
    114         inlines,
    115         raw_inlines,
    116         nothing,
    117     };
    118 
    119     fn canAccept(b: Block) ContentType {
    120         return switch (b.tag) {
    121             .list,
    122             .list_item,
    123             .table,
    124             .blockquote,
    125             => .blocks,
    126 
    127             .heading,
    128             .paragraph,
    129             => .inlines,
    130 
    131             .code_block,
    132             => .raw_inlines,
    133 
    134             .table_row,
    135             .thematic_break,
    136             => .nothing,
    137         };
    138     }
    139 
    140     /// Attempts to continue `b` using the contents of `line`. If successful,
    141     /// returns the remaining portion of `line` to be considered part of `b`
    142     /// (e.g. for a blockquote, this would be everything except the leading
    143     /// `>`). If unsuccessful, returns null.
    144     fn match(b: Block, line: []const u8) ?[]const u8 {
    145         const unindented = mem.trimStart(u8, line, " \t");
    146         const indent = line.len - unindented.len;
    147         return switch (b.tag) {
    148             .list => line,
    149             .list_item => if (indent >= b.data.list_item.continuation_indent)
    150                 line[b.data.list_item.continuation_indent..]
    151             else if (unindented.len == 0)
    152                 // Blank lines should not close list items, since there may be
    153                 // more indented contents to follow after the blank line.
    154                 ""
    155             else
    156                 null,
    157             .table => if (unindented.len > 0) line else null,
    158             .table_row => null,
    159             .heading => null,
    160             .code_block => code_block: {
    161                 const trimmed = mem.trimEnd(u8, unindented, " \t");
    162                 if (mem.indexOfNone(u8, trimmed, "`") != null or trimmed.len != b.data.code_block.fence_len) {
    163                     const effective_indent = @min(indent, b.data.code_block.indent);
    164                     break :code_block line[effective_indent..];
    165                 } else {
    166                     break :code_block null;
    167                 }
    168             },
    169             .blockquote => if (mem.startsWith(u8, unindented, ">"))
    170                 unindented[1..]
    171             else
    172                 null,
    173             .paragraph => if (unindented.len > 0) line else null,
    174             .thematic_break => null,
    175         };
    176     }
    177 };
    178 
    179 pub fn init(allocator: Allocator) Allocator.Error!Parser {
    180     var p: Parser = .{ .allocator = allocator };
    181     try p.nodes.append(allocator, .{
    182         .tag = .root,
    183         .data = undefined,
    184     });
    185     try p.string_bytes.append(allocator, 0);
    186     return p;
    187 }
    188 
    189 pub fn deinit(p: *Parser) void {
    190     p.nodes.deinit(p.allocator);
    191     p.extra.deinit(p.allocator);
    192     p.scratch_extra.deinit(p.allocator);
    193     p.string_bytes.deinit(p.allocator);
    194     p.scratch_string.deinit(p.allocator);
    195     p.pending_blocks.deinit(p.allocator);
    196     p.* = undefined;
    197 }
    198 
    199 /// Accepts a single line of content. `line` should not have a trailing line
    200 /// ending character.
    201 pub fn feedLine(p: *Parser, line: []const u8) Allocator.Error!void {
    202     var rest_line = line;
    203     const first_unmatched = for (p.pending_blocks.items, 0..) |b, i| {
    204         if (b.match(rest_line)) |rest| {
    205             rest_line = rest;
    206         } else {
    207             break i;
    208         }
    209     } else p.pending_blocks.items.len;
    210 
    211     const in_code_block = p.pending_blocks.items.len > 0 and
    212         p.pending_blocks.getLast().tag == .code_block;
    213     const code_block_end = in_code_block and
    214         first_unmatched + 1 == p.pending_blocks.items.len;
    215     // New blocks cannot be started if we are actively inside a code block or
    216     // are just closing one (to avoid interpreting the closing ``` as a new code
    217     // block start).
    218     var maybe_block_start = if (!in_code_block or first_unmatched + 2 <= p.pending_blocks.items.len)
    219         try p.startBlock(rest_line)
    220     else
    221         null;
    222 
    223     // This is a lazy continuation line if there are no new blocks to open and
    224     // the last open block is a paragraph.
    225     if (maybe_block_start == null and
    226         !isBlank(rest_line) and
    227         p.pending_blocks.items.len > 0 and
    228         p.pending_blocks.getLast().tag == .paragraph)
    229     {
    230         try p.addScratchStringLine(mem.trimStart(u8, rest_line, " \t"));
    231         return;
    232     }
    233 
    234     // If a new block needs to be started, any paragraph needs to be closed,
    235     // even though this isn't detected as part of the closing condition for
    236     // paragraphs.
    237     if (maybe_block_start != null and
    238         p.pending_blocks.items.len > 0 and
    239         p.pending_blocks.getLast().tag == .paragraph)
    240     {
    241         try p.closeLastBlock();
    242     }
    243 
    244     while (p.pending_blocks.items.len > first_unmatched) {
    245         try p.closeLastBlock();
    246     }
    247 
    248     while (maybe_block_start) |block_start| : (maybe_block_start = try p.startBlock(rest_line)) {
    249         try p.appendBlockStart(block_start);
    250         // There may be more blocks to start within the same line.
    251         rest_line = block_start.rest;
    252         // Headings may only contain inline content.
    253         if (block_start.tag == .heading) break;
    254         // An opening code fence does not contain any additional block or inline
    255         // content to process.
    256         if (block_start.tag == .code_block) return;
    257     }
    258 
    259     // Do not append the end of a code block (```) as textual content.
    260     if (code_block_end) return;
    261 
    262     const can_accept = if (p.pending_blocks.getLastOrNull()) |last_pending_block|
    263         last_pending_block.canAccept()
    264     else
    265         .blocks;
    266     const rest_line_trimmed = mem.trimStart(u8, rest_line, " \t");
    267     switch (can_accept) {
    268         .blocks => {
    269             // If we're inside a list item and the rest of the line is blank, it
    270             // means that any subsequent child of the list item (or subsequent
    271             // item in the list) will cause the containing list to be considered
    272             // loose. However, we can't immediately declare that the list is
    273             // loose, since we might just be looking at a blank line after the
    274             // end of the last item in the list. The final determination will be
    275             // made when appending the next child of the list or list item.
    276             const maybe_containing_list_index = if (p.pending_blocks.items.len > 0 and p.pending_blocks.getLast().tag == .list_item)
    277                 p.pending_blocks.items.len - 2
    278             else
    279                 null;
    280 
    281             if (rest_line_trimmed.len > 0) {
    282                 try p.appendBlockStart(.{
    283                     .tag = .paragraph,
    284                     .data = .{ .none = {} },
    285                     .rest = undefined,
    286                 });
    287                 try p.addScratchStringLine(rest_line_trimmed);
    288             }
    289 
    290             if (maybe_containing_list_index) |containing_list_index| {
    291                 p.pending_blocks.items[containing_list_index].data.list.last_line_blank = rest_line_trimmed.len == 0;
    292             }
    293         },
    294         .inlines => try p.addScratchStringLine(rest_line_trimmed),
    295         .raw_inlines => try p.addScratchStringLine(rest_line),
    296         .nothing => {},
    297     }
    298 }
    299 
    300 /// Completes processing of the input and returns the parsed document.
    301 pub fn endInput(p: *Parser) Allocator.Error!Document {
    302     while (p.pending_blocks.items.len > 0) {
    303         try p.closeLastBlock();
    304     }
    305     // There should be no inline content pending after closing the last open
    306     // block.
    307     assert(p.scratch_string.items.len == 0);
    308 
    309     const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items));
    310     p.nodes.items(.data)[0] = .{ .container = .{ .children = children } };
    311     p.scratch_string.items.len = 0;
    312     p.scratch_extra.items.len = 0;
    313 
    314     var nodes = p.nodes.toOwnedSlice();
    315     errdefer nodes.deinit(p.allocator);
    316     const extra = try p.extra.toOwnedSlice(p.allocator);
    317     errdefer p.allocator.free(extra);
    318     const string_bytes = try p.string_bytes.toOwnedSlice(p.allocator);
    319     errdefer p.allocator.free(string_bytes);
    320 
    321     return .{
    322         .nodes = nodes,
    323         .extra = extra,
    324         .string_bytes = string_bytes,
    325     };
    326 }
    327 
    328 /// Data describing the start of a new block element.
    329 const BlockStart = struct {
    330     tag: Tag,
    331     data: Data,
    332     rest: []const u8,
    333 
    334     const Tag = enum {
    335         /// Data is `list_item`.
    336         list_item,
    337         /// Data is `table_row`.
    338         table_row,
    339         /// Data is `heading`.
    340         heading,
    341         /// Data is `code_block`.
    342         code_block,
    343         /// Data is `none`.
    344         blockquote,
    345         /// Data is `none`.
    346         paragraph,
    347         /// Data is `none`.
    348         thematic_break,
    349     };
    350 
    351     const Data = union {
    352         none: void,
    353         list_item: struct {
    354             marker: Block.Data.ListMarker,
    355             number: u30,
    356             continuation_indent: usize,
    357         },
    358         table_row: struct {
    359             cells_buffer: [max_table_columns][]const u8,
    360             cells_len: usize,
    361         },
    362         heading: struct {
    363             /// Between 1 and 6, inclusive.
    364             level: u3,
    365         },
    366         code_block: struct {
    367             tag: StringIndex,
    368             fence_len: usize,
    369             indent: usize,
    370         },
    371     };
    372 };
    373 
    374 fn appendBlockStart(p: *Parser, block_start: BlockStart) !void {
    375     if (p.pending_blocks.getLastOrNull()) |last_pending_block| {
    376         // Close the last block if it is a list and the new block is not a list item
    377         // or not of the same marker type.
    378         const should_close_list = last_pending_block.tag == .list and
    379             (block_start.tag != .list_item or
    380                 block_start.data.list_item.marker != last_pending_block.data.list.marker);
    381         // The last block should also be closed if the new block is not a table
    382         // row, which is the only allowed child of a table.
    383         const should_close_table = last_pending_block.tag == .table and
    384             block_start.tag != .table_row;
    385         if (should_close_list or should_close_table) {
    386             try p.closeLastBlock();
    387         }
    388     }
    389 
    390     if (p.pending_blocks.getLastOrNull()) |last_pending_block| {
    391         // If the last block is a list or list item, check for tightness based
    392         // on the last line.
    393         const maybe_containing_list = switch (last_pending_block.tag) {
    394             .list => &p.pending_blocks.items[p.pending_blocks.items.len - 1],
    395             .list_item => &p.pending_blocks.items[p.pending_blocks.items.len - 2],
    396             else => null,
    397         };
    398         if (maybe_containing_list) |containing_list| {
    399             if (containing_list.data.list.last_line_blank) {
    400                 containing_list.data.list.tight = false;
    401             }
    402         }
    403     }
    404 
    405     // Start a new list if the new block is a list item and there is no
    406     // containing list yet.
    407     if (block_start.tag == .list_item and
    408         (p.pending_blocks.items.len == 0 or p.pending_blocks.getLast().tag != .list))
    409     {
    410         try p.pending_blocks.append(p.allocator, .{
    411             .tag = .list,
    412             .data = .{ .list = .{
    413                 .marker = block_start.data.list_item.marker,
    414                 .start = block_start.data.list_item.number,
    415                 .tight = true,
    416             } },
    417             .string_start = p.scratch_string.items.len,
    418             .extra_start = p.scratch_extra.items.len,
    419         });
    420     }
    421 
    422     if (block_start.tag == .table_row) {
    423         // Likewise, table rows start a table implicitly.
    424         if (p.pending_blocks.items.len == 0 or p.pending_blocks.getLast().tag != .table) {
    425             try p.pending_blocks.append(p.allocator, .{
    426                 .tag = .table,
    427                 .data = .{ .table = .{
    428                     .column_alignments_buffer = undefined,
    429                     .column_alignments_len = 0,
    430                 } },
    431                 .string_start = p.scratch_string.items.len,
    432                 .extra_start = p.scratch_extra.items.len,
    433             });
    434         }
    435 
    436         const current_row = p.scratch_extra.items.len - p.pending_blocks.getLast().extra_start;
    437         if (current_row <= 1) {
    438             var buffer: [max_table_columns]Node.TableCellAlignment = undefined;
    439             const table_row = &block_start.data.table_row;
    440             if (parseTableHeaderDelimiter(table_row.cells_buffer[0..table_row.cells_len], &buffer)) |alignments| {
    441                 const table = &p.pending_blocks.items[p.pending_blocks.items.len - 1].data.table;
    442                 @memcpy(table.column_alignments_buffer[0..alignments.len], alignments);
    443                 table.column_alignments_len = alignments.len;
    444                 if (current_row == 1) {
    445                     // We need to go back and mark the header row and its column
    446                     // alignments.
    447                     const datas = p.nodes.items(.data);
    448                     const header_data = datas[p.scratch_extra.getLast()];
    449                     for (p.extraChildren(header_data.container.children), 0..) |header_cell, i| {
    450                         const alignment = if (i < alignments.len) alignments[i] else .unset;
    451                         const cell_data = &datas[@intFromEnum(header_cell)].table_cell;
    452                         cell_data.info.alignment = alignment;
    453                         cell_data.info.header = true;
    454                     }
    455                 }
    456                 return;
    457             }
    458         }
    459     }
    460 
    461     const tag: Block.Tag, const data: Block.Data = switch (block_start.tag) {
    462         .list_item => .{ .list_item, .{ .list_item = .{
    463             .continuation_indent = block_start.data.list_item.continuation_indent,
    464         } } },
    465         .table_row => .{ .table_row, .{ .none = {} } },
    466         .heading => .{ .heading, .{ .heading = .{
    467             .level = block_start.data.heading.level,
    468         } } },
    469         .code_block => .{ .code_block, .{ .code_block = .{
    470             .tag = block_start.data.code_block.tag,
    471             .fence_len = block_start.data.code_block.fence_len,
    472             .indent = block_start.data.code_block.indent,
    473         } } },
    474         .blockquote => .{ .blockquote, .{ .none = {} } },
    475         .paragraph => .{ .paragraph, .{ .none = {} } },
    476         .thematic_break => .{ .thematic_break, .{ .none = {} } },
    477     };
    478 
    479     try p.pending_blocks.append(p.allocator, .{
    480         .tag = tag,
    481         .data = data,
    482         .string_start = p.scratch_string.items.len,
    483         .extra_start = p.scratch_extra.items.len,
    484     });
    485 
    486     if (tag == .table_row) {
    487         // Table rows are unique, since we already have all the children
    488         // available in the BlockStart. We can immediately parse and append
    489         // these children now.
    490         const containing_table = p.pending_blocks.items[p.pending_blocks.items.len - 2];
    491         const table = &containing_table.data.table;
    492         const column_alignments = table.column_alignments_buffer[0..table.column_alignments_len];
    493         const table_row = &block_start.data.table_row;
    494         for (table_row.cells_buffer[0..table_row.cells_len], 0..) |cell_content, i| {
    495             const cell_children = try p.parseInlines(cell_content);
    496             const alignment = if (i < column_alignments.len) column_alignments[i] else .unset;
    497             const cell = try p.addNode(.{
    498                 .tag = .table_cell,
    499                 .data = .{ .table_cell = .{
    500                     .info = .{
    501                         .alignment = alignment,
    502                         .header = false,
    503                     },
    504                     .children = cell_children,
    505                 } },
    506             });
    507             try p.addScratchExtraNode(cell);
    508         }
    509     }
    510 }
    511 
    512 fn startBlock(p: *Parser, line: []const u8) !?BlockStart {
    513     const unindented = mem.trimStart(u8, line, " \t");
    514     const indent = line.len - unindented.len;
    515     if (isThematicBreak(line)) {
    516         // Thematic breaks take precedence over list items.
    517         return .{
    518             .tag = .thematic_break,
    519             .data = .{ .none = {} },
    520             .rest = "",
    521         };
    522     } else if (startListItem(unindented)) |list_item| {
    523         return .{
    524             .tag = .list_item,
    525             .data = .{ .list_item = .{
    526                 .marker = list_item.marker,
    527                 .number = list_item.number,
    528                 .continuation_indent = indent + list_item.marker_len,
    529             } },
    530             .rest = list_item.rest,
    531         };
    532     } else if (startTableRow(unindented)) |table_row| {
    533         return .{
    534             .tag = .table_row,
    535             .data = .{ .table_row = .{
    536                 .cells_buffer = table_row.cells_buffer,
    537                 .cells_len = table_row.cells_len,
    538             } },
    539             .rest = "",
    540         };
    541     } else if (startHeading(unindented)) |heading| {
    542         return .{
    543             .tag = .heading,
    544             .data = .{ .heading = .{
    545                 .level = heading.level,
    546             } },
    547             .rest = heading.rest,
    548         };
    549     } else if (try p.startCodeBlock(unindented)) |code_block| {
    550         return .{
    551             .tag = .code_block,
    552             .data = .{ .code_block = .{
    553                 .tag = code_block.tag,
    554                 .fence_len = code_block.fence_len,
    555                 .indent = indent,
    556             } },
    557             .rest = "",
    558         };
    559     } else if (startBlockquote(unindented)) |rest| {
    560         return .{
    561             .tag = .blockquote,
    562             .data = .{ .none = {} },
    563             .rest = rest,
    564         };
    565     } else {
    566         return null;
    567     }
    568 }
    569 
    570 const ListItemStart = struct {
    571     marker: Block.Data.ListMarker,
    572     number: u30,
    573     marker_len: usize,
    574     rest: []const u8,
    575 };
    576 
    577 fn startListItem(unindented_line: []const u8) ?ListItemStart {
    578     if (mem.startsWith(u8, unindented_line, "- ")) {
    579         return .{
    580             .marker = .@"-",
    581             .number = undefined,
    582             .marker_len = 2,
    583             .rest = unindented_line[2..],
    584         };
    585     } else if (mem.startsWith(u8, unindented_line, "* ")) {
    586         return .{
    587             .marker = .@"*",
    588             .number = undefined,
    589             .marker_len = 2,
    590             .rest = unindented_line[2..],
    591         };
    592     } else if (mem.startsWith(u8, unindented_line, "+ ")) {
    593         return .{
    594             .marker = .@"+",
    595             .number = undefined,
    596             .marker_len = 2,
    597             .rest = unindented_line[2..],
    598         };
    599     }
    600 
    601     const number_end = mem.indexOfNone(u8, unindented_line, "0123456789") orelse return null;
    602     const after_number = unindented_line[number_end..];
    603     const marker: Block.Data.ListMarker = if (mem.startsWith(u8, after_number, ". "))
    604         .number_dot
    605     else if (mem.startsWith(u8, after_number, ") "))
    606         .number_paren
    607     else
    608         return null;
    609     const number = std.fmt.parseInt(u30, unindented_line[0..number_end], 10) catch return null;
    610     if (number > 999_999_999) return null;
    611     return .{
    612         .marker = marker,
    613         .number = number,
    614         .marker_len = number_end + 2,
    615         .rest = after_number[2..],
    616     };
    617 }
    618 
    619 const TableRowStart = struct {
    620     cells_buffer: [max_table_columns][]const u8,
    621     cells_len: usize,
    622 };
    623 
    624 fn startTableRow(unindented_line: []const u8) ?TableRowStart {
    625     if (unindented_line.len < 2 or
    626         !mem.startsWith(u8, unindented_line, "|") or
    627         mem.endsWith(u8, unindented_line, "\\|") or
    628         !mem.endsWith(u8, unindented_line, "|")) return null;
    629 
    630     var cells_buffer: [max_table_columns][]const u8 = undefined;
    631     var cells: ArrayList([]const u8) = .initBuffer(&cells_buffer);
    632     const table_row_content = unindented_line[1 .. unindented_line.len - 1];
    633     var cell_start: usize = 0;
    634     var i: usize = 0;
    635     while (i < table_row_content.len) : (i += 1) {
    636         switch (table_row_content[i]) {
    637             '\\' => i += 1,
    638             '|' => {
    639                 cells.appendBounded(table_row_content[cell_start..i]) catch return null;
    640                 cell_start = i + 1;
    641             },
    642             '`' => {
    643                 // Ignoring pipes in code spans allows table cells to contain
    644                 // code using ||, for example.
    645                 const open_start = i;
    646                 i = mem.indexOfNonePos(u8, table_row_content, i, "`") orelse return null;
    647                 const open_len = i - open_start;
    648                 while (mem.indexOfScalarPos(u8, table_row_content, i, '`')) |close_start| {
    649                     i = mem.indexOfNonePos(u8, table_row_content, close_start, "`") orelse return null;
    650                     const close_len = i - close_start;
    651                     if (close_len == open_len) break;
    652                 } else return null;
    653             },
    654             else => {},
    655         }
    656     }
    657     cells.appendBounded(table_row_content[cell_start..]) catch return null;
    658 
    659     return .{ .cells_buffer = cells_buffer, .cells_len = cells.items.len };
    660 }
    661 
    662 fn parseTableHeaderDelimiter(
    663     row_cells: []const []const u8,
    664     buffer: []Node.TableCellAlignment,
    665 ) ?[]Node.TableCellAlignment {
    666     var alignments: ArrayList(Node.TableCellAlignment) = .initBuffer(buffer);
    667     for (row_cells) |content| {
    668         const alignment = parseTableHeaderDelimiterCell(content) orelse return null;
    669         alignments.appendAssumeCapacity(alignment);
    670     }
    671     return alignments.items;
    672 }
    673 
    674 fn parseTableHeaderDelimiterCell(content: []const u8) ?Node.TableCellAlignment {
    675     var state: enum {
    676         before_rule,
    677         after_left_anchor,
    678         in_rule,
    679         after_right_anchor,
    680         after_rule,
    681     } = .before_rule;
    682     var left_anchor = false;
    683     var right_anchor = false;
    684     for (content) |c| {
    685         switch (state) {
    686             .before_rule => switch (c) {
    687                 ' ' => {},
    688                 ':' => {
    689                     left_anchor = true;
    690                     state = .after_left_anchor;
    691                 },
    692                 '-' => state = .in_rule,
    693                 else => return null,
    694             },
    695             .after_left_anchor => switch (c) {
    696                 '-' => state = .in_rule,
    697                 else => return null,
    698             },
    699             .in_rule => switch (c) {
    700                 '-' => {},
    701                 ':' => {
    702                     right_anchor = true;
    703                     state = .after_right_anchor;
    704                 },
    705                 ' ' => state = .after_rule,
    706                 else => return null,
    707             },
    708             .after_right_anchor => switch (c) {
    709                 ' ' => state = .after_rule,
    710                 else => return null,
    711             },
    712             .after_rule => switch (c) {
    713                 ' ' => {},
    714                 else => return null,
    715             },
    716         }
    717     }
    718 
    719     switch (state) {
    720         .before_rule,
    721         .after_left_anchor,
    722         => return null,
    723 
    724         .in_rule,
    725         .after_right_anchor,
    726         .after_rule,
    727         => {},
    728     }
    729 
    730     return if (left_anchor and right_anchor)
    731         .center
    732     else if (left_anchor)
    733         .left
    734     else if (right_anchor)
    735         .right
    736     else
    737         .unset;
    738 }
    739 
    740 test parseTableHeaderDelimiterCell {
    741     try expectEqual(null, parseTableHeaderDelimiterCell(""));
    742     try expectEqual(null, parseTableHeaderDelimiterCell("   "));
    743     try expectEqual(.unset, parseTableHeaderDelimiterCell("-"));
    744     try expectEqual(.unset, parseTableHeaderDelimiterCell(" - "));
    745     try expectEqual(.unset, parseTableHeaderDelimiterCell("----"));
    746     try expectEqual(.unset, parseTableHeaderDelimiterCell(" ---- "));
    747     try expectEqual(null, parseTableHeaderDelimiterCell(":"));
    748     try expectEqual(null, parseTableHeaderDelimiterCell("::"));
    749     try expectEqual(.left, parseTableHeaderDelimiterCell(":-"));
    750     try expectEqual(.left, parseTableHeaderDelimiterCell(" :----"));
    751     try expectEqual(.center, parseTableHeaderDelimiterCell(":-:"));
    752     try expectEqual(.center, parseTableHeaderDelimiterCell(":----:"));
    753     try expectEqual(.center, parseTableHeaderDelimiterCell("   :----:   "));
    754     try expectEqual(.right, parseTableHeaderDelimiterCell("-:"));
    755     try expectEqual(.right, parseTableHeaderDelimiterCell("----:"));
    756     try expectEqual(.right, parseTableHeaderDelimiterCell("  ----:  "));
    757 }
    758 
    759 const HeadingStart = struct {
    760     level: u3,
    761     rest: []const u8,
    762 };
    763 
    764 fn startHeading(unindented_line: []const u8) ?HeadingStart {
    765     var level: u3 = 0;
    766     return for (unindented_line, 0..) |c, i| {
    767         switch (c) {
    768             '#' => {
    769                 if (level == 6) break null;
    770                 level += 1;
    771             },
    772             ' ' => {
    773                 // We must have seen at least one # by this point, since
    774                 // unindented_line has no leading spaces.
    775                 assert(level > 0);
    776                 break .{
    777                     .level = level,
    778                     .rest = unindented_line[i + 1 ..],
    779                 };
    780             },
    781             else => break null,
    782         }
    783     } else null;
    784 }
    785 
    786 const CodeBlockStart = struct {
    787     tag: StringIndex,
    788     fence_len: usize,
    789 };
    790 
    791 fn startCodeBlock(p: *Parser, unindented_line: []const u8) !?CodeBlockStart {
    792     var fence_len: usize = 0;
    793     const tag_bytes = for (unindented_line, 0..) |c, i| {
    794         switch (c) {
    795             '`' => fence_len += 1,
    796             else => break unindented_line[i..],
    797         }
    798     } else "";
    799     // Code block tags may not contain backticks, since that would create
    800     // potential confusion with inline code spans.
    801     if (fence_len < 3 or mem.indexOfScalar(u8, tag_bytes, '`') != null) return null;
    802     return .{
    803         .tag = try p.addString(mem.trim(u8, tag_bytes, " ")),
    804         .fence_len = fence_len,
    805     };
    806 }
    807 
    808 fn startBlockquote(unindented_line: []const u8) ?[]const u8 {
    809     return if (mem.startsWith(u8, unindented_line, ">"))
    810         unindented_line[1..]
    811     else
    812         null;
    813 }
    814 
    815 fn isThematicBreak(line: []const u8) bool {
    816     var char: ?u8 = null;
    817     var count: usize = 0;
    818     for (line) |c| {
    819         switch (c) {
    820             ' ' => {},
    821             '-', '_', '*' => {
    822                 if (char != null and c != char.?) return false;
    823                 char = c;
    824                 count += 1;
    825             },
    826             else => return false,
    827         }
    828     }
    829     return count >= 3;
    830 }
    831 
    832 fn closeLastBlock(p: *Parser) !void {
    833     const b = p.pending_blocks.pop().?;
    834     const node = switch (b.tag) {
    835         .list => list: {
    836             assert(b.string_start == p.scratch_string.items.len);
    837 
    838             // Although tightness is parsed as a property of the list, it is
    839             // stored at the list item level to make it possible to render each
    840             // node without any context from its parents.
    841             const list_items = p.scratch_extra.items[b.extra_start..];
    842             const node_datas = p.nodes.items(.data);
    843             if (!b.data.list.tight) {
    844                 for (list_items) |list_item| {
    845                     node_datas[list_item].list_item.tight = false;
    846                 }
    847             }
    848 
    849             const children = try p.addExtraChildren(@ptrCast(list_items));
    850             break :list try p.addNode(.{
    851                 .tag = .list,
    852                 .data = .{ .list = .{
    853                     .start = switch (b.data.list.marker) {
    854                         .number_dot, .number_paren => @enumFromInt(b.data.list.start),
    855                         .@"-", .@"*", .@"+" => .unordered,
    856                     },
    857                     .children = children,
    858                 } },
    859             });
    860         },
    861         .list_item => list_item: {
    862             assert(b.string_start == p.scratch_string.items.len);
    863             const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..]));
    864             break :list_item try p.addNode(.{
    865                 .tag = .list_item,
    866                 .data = .{ .list_item = .{
    867                     .tight = true,
    868                     .children = children,
    869                 } },
    870             });
    871         },
    872         .table => table: {
    873             assert(b.string_start == p.scratch_string.items.len);
    874             const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..]));
    875             break :table try p.addNode(.{
    876                 .tag = .table,
    877                 .data = .{ .container = .{
    878                     .children = children,
    879                 } },
    880             });
    881         },
    882         .table_row => table_row: {
    883             assert(b.string_start == p.scratch_string.items.len);
    884             const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..]));
    885             break :table_row try p.addNode(.{
    886                 .tag = .table_row,
    887                 .data = .{ .container = .{
    888                     .children = children,
    889                 } },
    890             });
    891         },
    892         .heading => heading: {
    893             const children = try p.parseInlines(p.scratch_string.items[b.string_start..]);
    894             break :heading try p.addNode(.{
    895                 .tag = .heading,
    896                 .data = .{ .heading = .{
    897                     .level = b.data.heading.level,
    898                     .children = children,
    899                 } },
    900             });
    901         },
    902         .code_block => code_block: {
    903             const content = try p.addString(p.scratch_string.items[b.string_start..]);
    904             break :code_block try p.addNode(.{
    905                 .tag = .code_block,
    906                 .data = .{ .code_block = .{
    907                     .tag = b.data.code_block.tag,
    908                     .content = content,
    909                 } },
    910             });
    911         },
    912         .blockquote => blockquote: {
    913             assert(b.string_start == p.scratch_string.items.len);
    914             const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..]));
    915             break :blockquote try p.addNode(.{
    916                 .tag = .blockquote,
    917                 .data = .{ .container = .{
    918                     .children = children,
    919                 } },
    920             });
    921         },
    922         .paragraph => paragraph: {
    923             const children = try p.parseInlines(p.scratch_string.items[b.string_start..]);
    924             break :paragraph try p.addNode(.{
    925                 .tag = .paragraph,
    926                 .data = .{ .container = .{
    927                     .children = children,
    928                 } },
    929             });
    930         },
    931         .thematic_break => try p.addNode(.{
    932             .tag = .thematic_break,
    933             .data = .{ .none = {} },
    934         }),
    935     };
    936     p.scratch_string.items.len = b.string_start;
    937     p.scratch_extra.items.len = b.extra_start;
    938     try p.addScratchExtraNode(node);
    939 }
    940 
    941 const InlineParser = struct {
    942     parent: *Parser,
    943     content: []const u8,
    944     pos: usize = 0,
    945     pending_inlines: ArrayList(PendingInline) = .empty,
    946     completed_inlines: ArrayList(CompletedInline) = .empty,
    947 
    948     const PendingInline = struct {
    949         tag: Tag,
    950         data: Data,
    951         start: usize,
    952 
    953         const Tag = enum {
    954             /// Data is `emphasis`.
    955             emphasis,
    956             /// Data is `none`.
    957             link,
    958             /// Data is `none`.
    959             image,
    960         };
    961 
    962         const Data = union {
    963             none: void,
    964             emphasis: struct {
    965                 underscore: bool,
    966                 run_len: usize,
    967             },
    968         };
    969     };
    970 
    971     const CompletedInline = struct {
    972         node: Node.Index,
    973         start: usize,
    974         len: usize,
    975     };
    976 
    977     fn deinit(ip: *InlineParser) void {
    978         ip.pending_inlines.deinit(ip.parent.allocator);
    979         ip.completed_inlines.deinit(ip.parent.allocator);
    980     }
    981 
    982     /// Parses all of `ip.content`, returning the children of the node
    983     /// containing the inline content.
    984     fn parse(ip: *InlineParser) Allocator.Error!ExtraIndex {
    985         while (ip.pos < ip.content.len) : (ip.pos += 1) {
    986             switch (ip.content[ip.pos]) {
    987                 '\\' => ip.pos += 1,
    988                 '[' => try ip.pending_inlines.append(ip.parent.allocator, .{
    989                     .tag = .link,
    990                     .data = .{ .none = {} },
    991                     .start = ip.pos,
    992                 }),
    993                 '!' => if (ip.pos + 1 < ip.content.len and ip.content[ip.pos + 1] == '[') {
    994                     try ip.pending_inlines.append(ip.parent.allocator, .{
    995                         .tag = .image,
    996                         .data = .{ .none = {} },
    997                         .start = ip.pos,
    998                     });
    999                     ip.pos += 1;
   1000                 },
   1001                 ']' => try ip.parseLink(),
   1002                 '<' => try ip.parseAutolink(),
   1003                 '*', '_' => try ip.parseEmphasis(),
   1004                 '`' => try ip.parseCodeSpan(),
   1005                 'h' => if (ip.pos == 0 or isPreTextAutolink(ip.content[ip.pos - 1])) {
   1006                     try ip.parseTextAutolink();
   1007                 },
   1008                 else => {},
   1009             }
   1010         }
   1011 
   1012         const children = try ip.encodeChildren(0, ip.content.len);
   1013         // There may be pending inlines after parsing (e.g. unclosed emphasis
   1014         // runs), but there must not be any completed inlines, since those
   1015         // should all be part of `children`.
   1016         assert(ip.completed_inlines.items.len == 0);
   1017         return children;
   1018     }
   1019 
   1020     /// Parses a link, starting at the `]` at the end of the link text. `ip.pos`
   1021     /// is left at the closing `)` of the link target or at the closing `]` if
   1022     /// there is none.
   1023     fn parseLink(ip: *InlineParser) !void {
   1024         var i = ip.pending_inlines.items.len;
   1025         while (i > 0) {
   1026             i -= 1;
   1027             if (ip.pending_inlines.items[i].tag == .link or
   1028                 ip.pending_inlines.items[i].tag == .image) break;
   1029         } else return;
   1030         const opener = ip.pending_inlines.items[i];
   1031         ip.pending_inlines.shrinkRetainingCapacity(i);
   1032         const text_start = switch (opener.tag) {
   1033             .link => opener.start + 1,
   1034             .image => opener.start + 2,
   1035             else => unreachable,
   1036         };
   1037 
   1038         if (ip.pos + 1 >= ip.content.len or ip.content[ip.pos + 1] != '(') return;
   1039         const text_end = ip.pos;
   1040 
   1041         const target_start = text_end + 2;
   1042         var target_end = target_start;
   1043         var nesting_level: usize = 1;
   1044         while (target_end < ip.content.len) : (target_end += 1) {
   1045             switch (ip.content[target_end]) {
   1046                 '\\' => target_end += 1,
   1047                 '(' => nesting_level += 1,
   1048                 ')' => {
   1049                     if (nesting_level == 1) break;
   1050                     nesting_level -= 1;
   1051                 },
   1052                 else => {},
   1053             }
   1054         } else return;
   1055         ip.pos = target_end;
   1056 
   1057         const children = try ip.encodeChildren(text_start, text_end);
   1058         const target = try ip.encodeLinkTarget(target_start, target_end);
   1059 
   1060         const link = try ip.parent.addNode(.{
   1061             .tag = switch (opener.tag) {
   1062                 .link => .link,
   1063                 .image => .image,
   1064                 else => unreachable,
   1065             },
   1066             .data = .{ .link = .{
   1067                 .target = target,
   1068                 .children = children,
   1069             } },
   1070         });
   1071         try ip.completed_inlines.append(ip.parent.allocator, .{
   1072             .node = link,
   1073             .start = opener.start,
   1074             .len = ip.pos - opener.start + 1,
   1075         });
   1076     }
   1077 
   1078     fn encodeLinkTarget(ip: *InlineParser, start: usize, end: usize) !StringIndex {
   1079         // For efficiency, we can encode directly into string_bytes rather than
   1080         // creating a temporary string and then encoding it, since this process
   1081         // is entirely linear.
   1082         const string_top = ip.parent.string_bytes.items.len;
   1083         errdefer ip.parent.string_bytes.shrinkRetainingCapacity(string_top);
   1084 
   1085         var text_iter: TextIterator = .{ .content = ip.content[start..end] };
   1086         while (text_iter.next()) |content| {
   1087             switch (content) {
   1088                 .char => |c| try ip.parent.string_bytes.append(ip.parent.allocator, c),
   1089                 .text => |s| try ip.parent.string_bytes.appendSlice(ip.parent.allocator, s),
   1090                 .line_break => try ip.parent.string_bytes.appendSlice(ip.parent.allocator, "\\\n"),
   1091             }
   1092         }
   1093         try ip.parent.string_bytes.append(ip.parent.allocator, 0);
   1094         return @enumFromInt(string_top);
   1095     }
   1096 
   1097     /// Parses an autolink, starting at the opening `<`. `ip.pos` is left at the
   1098     /// closing `>`, or remains unchanged at the opening `<` if there is none.
   1099     fn parseAutolink(ip: *InlineParser) !void {
   1100         const start = ip.pos;
   1101         ip.pos += 1;
   1102         var state: enum {
   1103             start,
   1104             scheme,
   1105             target,
   1106         } = .start;
   1107         while (ip.pos < ip.content.len) : (ip.pos += 1) {
   1108             switch (state) {
   1109                 .start => switch (ip.content[ip.pos]) {
   1110                     'A'...'Z', 'a'...'z' => state = .scheme,
   1111                     else => break,
   1112                 },
   1113                 .scheme => switch (ip.content[ip.pos]) {
   1114                     'A'...'Z', 'a'...'z', '0'...'9', '+', '.', '-' => {},
   1115                     ':' => state = .target,
   1116                     else => break,
   1117                 },
   1118                 .target => switch (ip.content[ip.pos]) {
   1119                     '<', ' ', '\t', '\n' => break, // Not allowed in autolinks
   1120                     '>' => {
   1121                         // Backslash escapes are not recognized in autolink targets.
   1122                         const target = try ip.parent.addString(ip.content[start + 1 .. ip.pos]);
   1123                         const node = try ip.parent.addNode(.{
   1124                             .tag = .autolink,
   1125                             .data = .{ .text = .{
   1126                                 .content = target,
   1127                             } },
   1128                         });
   1129                         try ip.completed_inlines.append(ip.parent.allocator, .{
   1130                             .node = node,
   1131                             .start = start,
   1132                             .len = ip.pos - start + 1,
   1133                         });
   1134                         return;
   1135                     },
   1136                     else => {},
   1137                 },
   1138             }
   1139         }
   1140         ip.pos = start;
   1141     }
   1142 
   1143     /// Parses a plain text autolink (not delimited by `<>`), starting at the
   1144     /// first character in the link (an `h`). `ip.pos` is left at the last
   1145     /// character of the link, or remains unchanged if there is no valid link.
   1146     fn parseTextAutolink(ip: *InlineParser) !void {
   1147         const start = ip.pos;
   1148         var state: union(enum) {
   1149             /// Inside `http`. Contains the rest of the text to be matched.
   1150             http: []const u8,
   1151             after_http,
   1152             after_https,
   1153             /// Inside `://`. Contains the rest of the text to be matched.
   1154             authority: []const u8,
   1155             /// Inside link content.
   1156             content: struct {
   1157                 start: usize,
   1158                 paren_nesting: usize,
   1159             },
   1160         } = .{ .http = "http" };
   1161 
   1162         while (ip.pos < ip.content.len) : (ip.pos += 1) {
   1163             switch (state) {
   1164                 .http => |rest| {
   1165                     if (ip.content[ip.pos] != rest[0]) break;
   1166                     if (rest.len > 1) {
   1167                         state = .{ .http = rest[1..] };
   1168                     } else {
   1169                         state = .after_http;
   1170                     }
   1171                 },
   1172                 .after_http => switch (ip.content[ip.pos]) {
   1173                     's' => state = .after_https,
   1174                     ':' => state = .{ .authority = "//" },
   1175                     else => break,
   1176                 },
   1177                 .after_https => switch (ip.content[ip.pos]) {
   1178                     ':' => state = .{ .authority = "//" },
   1179                     else => break,
   1180                 },
   1181                 .authority => |rest| {
   1182                     if (ip.content[ip.pos] != rest[0]) break;
   1183                     if (rest.len > 1) {
   1184                         state = .{ .authority = rest[1..] };
   1185                     } else {
   1186                         state = .{ .content = .{
   1187                             .start = ip.pos + 1,
   1188                             .paren_nesting = 0,
   1189                         } };
   1190                     }
   1191                 },
   1192                 .content => |*content| switch (ip.content[ip.pos]) {
   1193                     ' ', '\t', '\n' => break,
   1194                     '(' => content.paren_nesting += 1,
   1195                     ')' => if (content.paren_nesting == 0) {
   1196                         break;
   1197                     } else {
   1198                         content.paren_nesting -= 1;
   1199                     },
   1200                     else => {},
   1201                 },
   1202             }
   1203         }
   1204 
   1205         switch (state) {
   1206             .http, .after_http, .after_https, .authority => {
   1207                 ip.pos = start;
   1208             },
   1209             .content => |content| {
   1210                 while (ip.pos > content.start and isPostTextAutolink(ip.content[ip.pos - 1])) {
   1211                     ip.pos -= 1;
   1212                 }
   1213                 if (ip.pos == content.start) {
   1214                     ip.pos = start;
   1215                     return;
   1216                 }
   1217 
   1218                 const target = try ip.parent.addString(ip.content[start..ip.pos]);
   1219                 const node = try ip.parent.addNode(.{
   1220                     .tag = .autolink,
   1221                     .data = .{ .text = .{
   1222                         .content = target,
   1223                     } },
   1224                 });
   1225                 try ip.completed_inlines.append(ip.parent.allocator, .{
   1226                     .node = node,
   1227                     .start = start,
   1228                     .len = ip.pos - start,
   1229                 });
   1230                 ip.pos -= 1;
   1231             },
   1232         }
   1233     }
   1234 
   1235     /// Returns whether `c` may appear before a text autolink is recognized.
   1236     fn isPreTextAutolink(c: u8) bool {
   1237         return switch (c) {
   1238             ' ', '\t', '\n', '*', '_', '(' => true,
   1239             else => false,
   1240         };
   1241     }
   1242 
   1243     /// Returns whether `c` is punctuation that may appear after a text autolink
   1244     /// and not be considered part of it.
   1245     fn isPostTextAutolink(c: u8) bool {
   1246         return switch (c) {
   1247             '?', '!', '.', ',', ':', '*', '_' => true,
   1248             else => false,
   1249         };
   1250     }
   1251 
   1252     /// Parses emphasis, starting at the beginning of a run of `*` or `_`
   1253     /// characters. `ip.pos` is left at the last character in the run after
   1254     /// parsing.
   1255     fn parseEmphasis(ip: *InlineParser) !void {
   1256         const char = ip.content[ip.pos];
   1257         var start = ip.pos;
   1258         while (ip.pos + 1 < ip.content.len and ip.content[ip.pos + 1] == char) {
   1259             ip.pos += 1;
   1260         }
   1261         var len = ip.pos - start + 1;
   1262         const underscore = char == '_';
   1263         const space_before = start == 0 or isWhitespace(ip.content[start - 1]);
   1264         const space_after = start + len == ip.content.len or isWhitespace(ip.content[start + len]);
   1265         const punct_before = start == 0 or isPunctuation(ip.content[start - 1]);
   1266         const punct_after = start + len == ip.content.len or isPunctuation(ip.content[start + len]);
   1267         // The rules for when emphasis may be closed or opened are stricter for
   1268         // underscores to avoid inappropriately interpreting snake_case words as
   1269         // containing emphasis markers.
   1270         const can_open = if (underscore)
   1271             !space_after and (space_before or punct_before)
   1272         else
   1273             !space_after;
   1274         const can_close = if (underscore)
   1275             !space_before and (space_after or punct_after)
   1276         else
   1277             !space_before;
   1278 
   1279         if (can_close and ip.pending_inlines.items.len > 0) {
   1280             var i = ip.pending_inlines.items.len;
   1281             while (i > 0 and len > 0) {
   1282                 i -= 1;
   1283                 const opener = &ip.pending_inlines.items[i];
   1284                 if (opener.tag != .emphasis or
   1285                     opener.data.emphasis.underscore != underscore) continue;
   1286 
   1287                 const close_len = @min(opener.data.emphasis.run_len, len);
   1288                 const opener_end = opener.start + opener.data.emphasis.run_len;
   1289 
   1290                 const emphasis = try ip.encodeEmphasis(opener_end, start, close_len);
   1291                 const emphasis_start = opener_end - close_len;
   1292                 const emphasis_len = start - emphasis_start + close_len;
   1293                 try ip.completed_inlines.append(ip.parent.allocator, .{
   1294                     .node = emphasis,
   1295                     .start = emphasis_start,
   1296                     .len = emphasis_len,
   1297                 });
   1298 
   1299                 // There may still be other openers further down in the
   1300                 // stack to close, or part of this run might serve as an
   1301                 // opener itself.
   1302                 start += close_len;
   1303                 len -= close_len;
   1304 
   1305                 // Remove any pending inlines above this on the stack, since
   1306                 // closing this emphasis will prevent them from being closed.
   1307                 // Additionally, if this opener is completely consumed by
   1308                 // being closed, it can be removed.
   1309                 opener.data.emphasis.run_len -= close_len;
   1310                 if (opener.data.emphasis.run_len == 0) {
   1311                     ip.pending_inlines.shrinkRetainingCapacity(i);
   1312                 } else {
   1313                     ip.pending_inlines.shrinkRetainingCapacity(i + 1);
   1314                 }
   1315             }
   1316         }
   1317 
   1318         if (can_open and len > 0) {
   1319             try ip.pending_inlines.append(ip.parent.allocator, .{
   1320                 .tag = .emphasis,
   1321                 .data = .{ .emphasis = .{
   1322                     .underscore = underscore,
   1323                     .run_len = len,
   1324                 } },
   1325                 .start = start,
   1326             });
   1327         }
   1328     }
   1329 
   1330     /// Encodes emphasis specified by a run of `run_len` emphasis characters,
   1331     /// with `start..end` being the range of content contained within the
   1332     /// emphasis.
   1333     fn encodeEmphasis(ip: *InlineParser, start: usize, end: usize, run_len: usize) !Node.Index {
   1334         const children = try ip.encodeChildren(start, end);
   1335         var inner = switch (run_len % 3) {
   1336             1 => try ip.parent.addNode(.{
   1337                 .tag = .emphasis,
   1338                 .data = .{ .container = .{
   1339                     .children = children,
   1340                 } },
   1341             }),
   1342             2 => try ip.parent.addNode(.{
   1343                 .tag = .strong,
   1344                 .data = .{ .container = .{
   1345                     .children = children,
   1346                 } },
   1347             }),
   1348             0 => strong_emphasis: {
   1349                 const strong = try ip.parent.addNode(.{
   1350                     .tag = .strong,
   1351                     .data = .{ .container = .{
   1352                         .children = children,
   1353                     } },
   1354                 });
   1355                 break :strong_emphasis try ip.parent.addNode(.{
   1356                     .tag = .emphasis,
   1357                     .data = .{ .container = .{
   1358                         .children = try ip.parent.addExtraChildren(&.{strong}),
   1359                     } },
   1360                 });
   1361             },
   1362             else => unreachable,
   1363         };
   1364 
   1365         var run_left = run_len;
   1366         while (run_left > 3) : (run_left -= 3) {
   1367             const strong = try ip.parent.addNode(.{
   1368                 .tag = .strong,
   1369                 .data = .{ .container = .{
   1370                     .children = try ip.parent.addExtraChildren(&.{inner}),
   1371                 } },
   1372             });
   1373             inner = try ip.parent.addNode(.{
   1374                 .tag = .emphasis,
   1375                 .data = .{ .container = .{
   1376                     .children = try ip.parent.addExtraChildren(&.{strong}),
   1377                 } },
   1378             });
   1379         }
   1380 
   1381         return inner;
   1382     }
   1383 
   1384     /// Parses a code span, starting at the beginning of the opening backtick
   1385     /// run. `ip.pos` is left at the last character in the closing run after
   1386     /// parsing.
   1387     fn parseCodeSpan(ip: *InlineParser) !void {
   1388         const opener_start = ip.pos;
   1389         ip.pos = mem.indexOfNonePos(u8, ip.content, ip.pos, "`") orelse ip.content.len;
   1390         const opener_len = ip.pos - opener_start;
   1391 
   1392         const start = ip.pos;
   1393         const end = while (mem.indexOfScalarPos(u8, ip.content, ip.pos, '`')) |closer_start| {
   1394             ip.pos = mem.indexOfNonePos(u8, ip.content, closer_start, "`") orelse ip.content.len;
   1395             const closer_len = ip.pos - closer_start;
   1396 
   1397             if (closer_len == opener_len) break closer_start;
   1398         } else unterminated: {
   1399             ip.pos = ip.content.len;
   1400             break :unterminated ip.content.len;
   1401         };
   1402 
   1403         var content = if (start < ip.content.len)
   1404             ip.content[start..end]
   1405         else
   1406             "";
   1407         // This single space removal rule allows code spans to be written which
   1408         // start or end with backticks.
   1409         if (mem.startsWith(u8, content, " `")) content = content[1..];
   1410         if (mem.endsWith(u8, content, "` ")) content = content[0 .. content.len - 1];
   1411 
   1412         const text = try ip.parent.addNode(.{
   1413             .tag = .code_span,
   1414             .data = .{ .text = .{
   1415                 .content = try ip.parent.addString(content),
   1416             } },
   1417         });
   1418         try ip.completed_inlines.append(ip.parent.allocator, .{
   1419             .node = text,
   1420             .start = opener_start,
   1421             .len = ip.pos - opener_start,
   1422         });
   1423         // Ensure ip.pos is pointing at the last character of the
   1424         // closer, not after it.
   1425         ip.pos -= 1;
   1426     }
   1427 
   1428     /// Encodes children parsed in the content range `start..end`. The children
   1429     /// will be text nodes and any completed inlines within the range.
   1430     fn encodeChildren(ip: *InlineParser, start: usize, end: usize) !ExtraIndex {
   1431         const scratch_extra_top = ip.parent.scratch_extra.items.len;
   1432         defer ip.parent.scratch_extra.shrinkRetainingCapacity(scratch_extra_top);
   1433 
   1434         var child_index = ip.completed_inlines.items.len;
   1435         while (child_index > 0 and ip.completed_inlines.items[child_index - 1].start >= start) {
   1436             child_index -= 1;
   1437         }
   1438         const start_child_index = child_index;
   1439 
   1440         var pos = start;
   1441         while (child_index < ip.completed_inlines.items.len) : (child_index += 1) {
   1442             const child_inline = ip.completed_inlines.items[child_index];
   1443             // Completed inlines must be strictly nested within the encodable
   1444             // content.
   1445             assert(child_inline.start >= pos and child_inline.start + child_inline.len <= end);
   1446 
   1447             if (child_inline.start > pos) {
   1448                 try ip.encodeTextNode(pos, child_inline.start);
   1449             }
   1450             try ip.parent.addScratchExtraNode(child_inline.node);
   1451 
   1452             pos = child_inline.start + child_inline.len;
   1453         }
   1454         ip.completed_inlines.shrinkRetainingCapacity(start_child_index);
   1455 
   1456         if (pos < end) {
   1457             try ip.encodeTextNode(pos, end);
   1458         }
   1459 
   1460         const children = ip.parent.scratch_extra.items[scratch_extra_top..];
   1461         return try ip.parent.addExtraChildren(@ptrCast(children));
   1462     }
   1463 
   1464     /// Encodes textual content `ip.content[start..end]` to `scratch_extra`. The
   1465     /// encoded content may include both `text` and `line_break` nodes.
   1466     fn encodeTextNode(ip: *InlineParser, start: usize, end: usize) !void {
   1467         // For efficiency, we can encode directly into string_bytes rather than
   1468         // creating a temporary string and then encoding it, since this process
   1469         // is entirely linear.
   1470         const string_top = ip.parent.string_bytes.items.len;
   1471         errdefer ip.parent.string_bytes.shrinkRetainingCapacity(string_top);
   1472 
   1473         var string_start = string_top;
   1474         var text_iter: TextIterator = .{ .content = ip.content[start..end] };
   1475         while (text_iter.next()) |content| {
   1476             switch (content) {
   1477                 .char => |c| try ip.parent.string_bytes.append(ip.parent.allocator, c),
   1478                 .text => |s| try ip.parent.string_bytes.appendSlice(ip.parent.allocator, s),
   1479                 .line_break => {
   1480                     if (ip.parent.string_bytes.items.len > string_start) {
   1481                         try ip.parent.string_bytes.append(ip.parent.allocator, 0);
   1482                         try ip.parent.addScratchExtraNode(try ip.parent.addNode(.{
   1483                             .tag = .text,
   1484                             .data = .{ .text = .{
   1485                                 .content = @enumFromInt(string_start),
   1486                             } },
   1487                         }));
   1488                         string_start = ip.parent.string_bytes.items.len;
   1489                     }
   1490                     try ip.parent.addScratchExtraNode(try ip.parent.addNode(.{
   1491                         .tag = .line_break,
   1492                         .data = .{ .none = {} },
   1493                     }));
   1494                 },
   1495             }
   1496         }
   1497         if (ip.parent.string_bytes.items.len > string_start) {
   1498             try ip.parent.string_bytes.append(ip.parent.allocator, 0);
   1499             try ip.parent.addScratchExtraNode(try ip.parent.addNode(.{
   1500                 .tag = .text,
   1501                 .data = .{ .text = .{
   1502                     .content = @enumFromInt(string_start),
   1503                 } },
   1504             }));
   1505         }
   1506     }
   1507 
   1508     /// An iterator over parts of textual content, handling unescaping of
   1509     /// escaped characters and line breaks.
   1510     const TextIterator = struct {
   1511         content: []const u8,
   1512         pos: usize = 0,
   1513 
   1514         const Content = union(enum) {
   1515             char: u8,
   1516             text: []const u8,
   1517             line_break,
   1518         };
   1519 
   1520         const replacement = "\u{FFFD}";
   1521 
   1522         fn next(iter: *TextIterator) ?Content {
   1523             if (iter.pos >= iter.content.len) return null;
   1524             if (iter.content[iter.pos] == '\\') {
   1525                 iter.pos += 1;
   1526                 if (iter.pos == iter.content.len) {
   1527                     return .{ .char = '\\' };
   1528                 } else if (iter.content[iter.pos] == '\n') {
   1529                     iter.pos += 1;
   1530                     return .line_break;
   1531                 } else if (isPunctuation(iter.content[iter.pos])) {
   1532                     const c = iter.content[iter.pos];
   1533                     iter.pos += 1;
   1534                     return .{ .char = c };
   1535                 } else {
   1536                     return .{ .char = '\\' };
   1537                 }
   1538             }
   1539             return iter.nextCodepoint();
   1540         }
   1541 
   1542         fn nextCodepoint(iter: *TextIterator) ?Content {
   1543             switch (iter.content[iter.pos]) {
   1544                 0 => {
   1545                     iter.pos += 1;
   1546                     return .{ .text = replacement };
   1547                 },
   1548                 1...127 => |c| {
   1549                     iter.pos += 1;
   1550                     return .{ .char = c };
   1551                 },
   1552                 else => |b| {
   1553                     const cp_len = std.unicode.utf8ByteSequenceLength(b) catch {
   1554                         iter.pos += 1;
   1555                         return .{ .text = replacement };
   1556                     };
   1557                     const is_valid = iter.pos + cp_len <= iter.content.len and
   1558                         std.unicode.utf8ValidateSlice(iter.content[iter.pos..][0..cp_len]);
   1559                     const cp_encoded = if (is_valid)
   1560                         iter.content[iter.pos..][0..cp_len]
   1561                     else
   1562                         replacement;
   1563                     iter.pos += cp_len;
   1564                     return .{ .text = cp_encoded };
   1565                 },
   1566             }
   1567         }
   1568     };
   1569 };
   1570 
   1571 fn parseInlines(p: *Parser, content: []const u8) !ExtraIndex {
   1572     var ip: InlineParser = .{
   1573         .parent = p,
   1574         .content = mem.trim(u8, content, " \t\n"),
   1575     };
   1576     defer ip.deinit();
   1577     return try ip.parse();
   1578 }
   1579 
   1580 pub fn extraData(p: Parser, comptime T: type, index: ExtraIndex) ExtraData(T) {
   1581     const fields = @typeInfo(T).@"struct".fields;
   1582     var i: usize = @intFromEnum(index);
   1583     var result: T = undefined;
   1584     inline for (fields) |field| {
   1585         @field(result, field.name) = switch (field.type) {
   1586             u32 => p.extra.items[i],
   1587             else => @compileError("bad field type"),
   1588         };
   1589         i += 1;
   1590     }
   1591     return .{ .data = result, .end = i };
   1592 }
   1593 
   1594 pub fn extraChildren(p: Parser, index: ExtraIndex) []const Node.Index {
   1595     const children = p.extraData(Node.Children, index);
   1596     return @ptrCast(p.extra.items[children.end..][0..children.data.len]);
   1597 }
   1598 
   1599 fn addNode(p: *Parser, node: Node) !Node.Index {
   1600     const index: Node.Index = @enumFromInt(@as(u32, @intCast(p.nodes.len)));
   1601     try p.nodes.append(p.allocator, node);
   1602     return index;
   1603 }
   1604 
   1605 fn addString(p: *Parser, s: []const u8) !StringIndex {
   1606     if (s.len == 0) return .empty;
   1607 
   1608     const index: StringIndex = @enumFromInt(@as(u32, @intCast(p.string_bytes.items.len)));
   1609     try p.string_bytes.ensureUnusedCapacity(p.allocator, s.len + 1);
   1610     p.string_bytes.appendSliceAssumeCapacity(s);
   1611     p.string_bytes.appendAssumeCapacity(0);
   1612     return index;
   1613 }
   1614 
   1615 fn addExtraChildren(p: *Parser, nodes: []const Node.Index) !ExtraIndex {
   1616     const index: ExtraIndex = @enumFromInt(@as(u32, @intCast(p.extra.items.len)));
   1617     try p.extra.ensureUnusedCapacity(p.allocator, nodes.len + 1);
   1618     p.extra.appendAssumeCapacity(@intCast(nodes.len));
   1619     p.extra.appendSliceAssumeCapacity(@ptrCast(nodes));
   1620     return index;
   1621 }
   1622 
   1623 fn addScratchExtraNode(p: *Parser, node: Node.Index) !void {
   1624     try p.scratch_extra.append(p.allocator, @intFromEnum(node));
   1625 }
   1626 
   1627 fn addScratchStringLine(p: *Parser, line: []const u8) !void {
   1628     try p.scratch_string.ensureUnusedCapacity(p.allocator, line.len + 1);
   1629     p.scratch_string.appendSliceAssumeCapacity(line);
   1630     p.scratch_string.appendAssumeCapacity('\n');
   1631 }
   1632 
   1633 fn isBlank(line: []const u8) bool {
   1634     return mem.indexOfNone(u8, line, " \t") == null;
   1635 }
   1636 
   1637 fn isPunctuation(c: u8) bool {
   1638     return switch (c) {
   1639         '!',
   1640         '"',
   1641         '#',
   1642         '$',
   1643         '%',
   1644         '&',
   1645         '\'',
   1646         '(',
   1647         ')',
   1648         '*',
   1649         '+',
   1650         ',',
   1651         '-',
   1652         '.',
   1653         '/',
   1654         ':',
   1655         ';',
   1656         '<',
   1657         '=',
   1658         '>',
   1659         '?',
   1660         '@',
   1661         '[',
   1662         '\\',
   1663         ']',
   1664         '^',
   1665         '_',
   1666         '`',
   1667         '{',
   1668         '|',
   1669         '}',
   1670         '~',
   1671         => true,
   1672         else => false,
   1673     };
   1674 }
	zig fork of https://codeberg.org/ziglang/zig
	Log \| Files \| Refs \| README \| LICENSE