From ad34ed5a63ef912fba5232806a1adea6ea55181b Mon Sep 17 00:00:00 2001
From: Ian Johnson <ian@ianjohnson.dev>
Date: Fri, 22 Mar 2024 20:50:07 -0400
Subject: [PATCH] Autodoc: recognize Markdown links in plain text

This extension to the typical `<>` Markdown autolink syntax allows
HTTP(S) links to be recognized in normal text without being delimited by
`<>`. This is the most natural way to write links in text, so it makes
sense to support it and allow documentation comments to be written in a
more natural way.
---
 lib/docs/wasm/markdown.zig        |  25 +++++++
 lib/docs/wasm/markdown/Parser.zig | 112 ++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+)
diff --git a/lib/docs/wasm/markdown.zig b/lib/docs/wasm/markdown.zig
index 092906c46a..e0bf4bbaac 100644
--- a/lib/docs/wasm/markdown.zig
+++ b/lib/docs/wasm/markdown.zig
@@ -81,6 +81,11 @@
 //!   escapes). `target` is expected to be an absolute URI: an autolink will not
 //!   be recognized unless `target` starts with a URI scheme followed by a `:`.
 //!
+//!   For convenience, autolinks may also be recognized in plain text without
+//!   any `<>` delimiters. Such autolinks must start with `http://` or
+//!   `https://` followed by at least one other character. Punctuation trailing
+//!   the link (such as `.` or `,`) is not considered part of it.
+//!
 //! - **Image** - a link directly preceded by a `!`. The link text is
 //!   interpreted as the alt text of the image.
 //!
@@ -740,6 +745,26 @@ test "autolinks" {
     );
 }
 
+test "text autolinks" { // cases: scheme, trailing punctuation, parens, escapes
+    try testRender(
+        \\Text autolinks must start with http:// or https://.
+        \\This doesn't count: ftp://example.com.
+        \\Example: https://ziglang.org.
+        \\Here is an important link: **http://example.com**
+        \\(Links may be in parentheses: https://example.com/?q=(parens))
+        \\Escaping a link so it's plain text: https\://example.com
+        \\
+    ,
+        \\<p>Text autolinks must start with http:// or https://.
+        \\This doesn't count: ftp://example.com.
+        \\Example: <a href="https://ziglang.org">https://ziglang.org</a>.
+        \\Here is an important link: <strong><a href="http://example.com">http://example.com</a></strong>
+        \\(Links may be in parentheses: <a href="https://example.com/?q=(parens)">https://example.com/?q=(parens)</a>)
+        \\Escaping a link so it's plain text: https://example.com</p>
+        \\
+    );
+}
+
 test "images" {
     try testRender(
         \\![Alt text](https://example.com/image.png)
diff --git a/lib/docs/wasm/markdown/Parser.zig b/lib/docs/wasm/markdown/Parser.zig
index 5a52882e48..9b377dce34 100644
--- a/lib/docs/wasm/markdown/Parser.zig
+++ b/lib/docs/wasm/markdown/Parser.zig
@@ -988,6 +988,9 @@ const InlineParser = struct {
                 '<' => try ip.parseAutolink(),
                 '*', '_' => try ip.parseEmphasis(),
                 '`' => try ip.parseCodeSpan(),
+                'h' => if (ip.pos == 0 or isPreTextAutolink(ip.content[ip.pos - 1])) {
+                    try ip.parseTextAutolink();
+                },
                 else => {},
             }
         }
@@ -1123,6 +1126,115 @@ const InlineParser = struct {
         ip.pos = start;
     }
 
+    /// Tries to parse a bare `http://` or `https://` link, i.e. one that is not
+    /// delimited by `<>`. `ip.pos` must be at the leading `h`. On success,
+    /// `ip.pos` is left at the last character of the link; otherwise it is
+    /// restored to where it started.
+    fn parseTextAutolink(ip: *InlineParser) !void {
+        const link_start = ip.pos;
+        var state: union(enum) {
+            /// Matching `http`. Holds the characters that are still expected.
+            http: []const u8,
+            after_http,
+            after_https,
+            /// Matching `//` after the `:`. Holds the characters still expected.
+            authority: []const u8,
+            /// Scanning the link content after `://`.
+            content: struct {
+                start: usize,
+                paren_nesting: usize,
+            },
+        } = .{ .http = "http" };
+
+        while (ip.pos < ip.content.len) : (ip.pos += 1) {
+            const c = ip.content[ip.pos];
+            switch (state) {
+                .http => |remaining| {
+                    if (c != remaining[0]) break;
+                    if (remaining.len <= 1) {
+                        state = .after_http;
+                    } else {
+                        state = .{ .http = remaining[1..] };
+                    }
+                },
+                .after_http => switch (c) {
+                    's' => state = .after_https,
+                    ':' => state = .{ .authority = "//" },
+                    else => break,
+                },
+                .after_https => {
+                    if (c != ':') break;
+                    state = .{ .authority = "//" };
+                },
+                .authority => |remaining| {
+                    if (c != remaining[0]) break;
+                    if (remaining.len <= 1) {
+                        state = .{ .content = .{
+                            .start = ip.pos + 1,
+                            .paren_nesting = 0,
+                        } };
+                    } else {
+                        state = .{ .authority = remaining[1..] };
+                    }
+                },
+                .content => |*link| switch (c) {
+                    ' ', '\t', '\n' => break,
+                    '(' => link.paren_nesting += 1,
+                    ')' => {
+                        // An unmatched `)` ends the link.
+                        if (link.paren_nesting == 0) break;
+                        link.paren_nesting -= 1;
+                    },
+                    else => {},
+                },
+            }
+        }
+
+        switch (state) {
+            // The `http://` or `https://` prefix was never completed.
+            .http, .after_http, .after_https, .authority => ip.pos = link_start,
+            .content => |link| {
+                // Trailing punctuation is not part of the link.
+                while (ip.pos > link.start and isPostTextAutolink(ip.content[ip.pos - 1])) {
+                    ip.pos -= 1;
+                }
+                if (ip.pos == link.start) {
+                    ip.pos = link_start;
+                    return;
+                }
+
+                const target = try ip.parent.addString(ip.content[link_start..ip.pos]);
+                const node = try ip.parent.addNode(.{
+                    .tag = .autolink,
+                    .data = .{ .text = .{ .content = target } },
+                });
+                try ip.completed_inlines.append(ip.parent.allocator, .{
+                    .node = node,
+                    .start = link_start,
+                    .len = ip.pos - link_start,
+                });
+                ip.pos -= 1;
+            },
+        }
+    }
+
+    /// Reports whether a text autolink may start directly after the byte `c`.
+    fn isPreTextAutolink(c: u8) bool {
+        switch (c) {
+            ' ', '\t', '\n', '*', '_', '(' => return true,
+            else => return false,
+        }
+    }
+
+    /// Reports whether `c` is trailing punctuation that follows a text autolink
+    /// without being counted as part of it.
+    fn isPostTextAutolink(c: u8) bool {
+        switch (c) {
+            '?', '!', '.', ',', ':', '*', '_' => return true,
+            else => return false,
+        }
+    }
+
     /// Parses emphasis, starting at the beginning of a run of `*` or `_`
     /// characters. `ip.pos` is left at the last character in the run after
     /// parsing.