std.zig.tokenizer: simplify line-based tokens

Closes #21358
Closes #21360

This commit modifies the `multiline_string_literal_line`, `doc_comment`,
and `container_doc_comment` tokens to no longer include the line ending
as part of the token. This makes it easier to handle line endings (which
may be LF, CRLF, or in edge cases possibly nonexistent) consistently.
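
For illustration, a minimal sketch of the resulting behavior (not part of this commit; the test name and assertions are mine, assuming the `std.zig.Tokenizer` API of this era): with CRLF source, the `doc_comment` token now stops before the `\r\n`.

const std = @import("std");

test "doc comment token excludes the line ending" {
    // With the new behavior, the token ends before "\r\n",
    // so the token text is exactly the comment itself.
    var tokenizer = std.zig.Tokenizer.init("/// hello\r\nconst x = 1;");
    const token = tokenizer.next();
    try std.testing.expectEqual(std.zig.Token.Tag.doc_comment, token.tag);
    try std.testing.expectEqualStrings(
        "/// hello",
        tokenizer.buffer[token.loc.start..token.loc.end],
    );
}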

In the two issues linked above, Autodoc was already assuming this for
doc comments, and yielding incorrect results when handling files with
CRLF line endings (both in Markdown parsing and source rendering).

Applying the same simplification for multiline string literals also
brings `zig fmt` into conformance with
https://github.com/ziglang/zig-spec/issues/38 regarding formatting of
multiline strings with CRLF line endings: the spec says that `zig fmt`
should remove the CR from such line endings, but this was not previously
the case.
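
For illustration, a minimal sketch of the resulting `zig fmt` behavior (not part of this commit; it assumes the `Ast.parse`/`Ast.render` signatures of this era): the CRs are dropped when the multiline string is rendered.

const std = @import("std");

test "zig fmt drops CR from multiline string line endings" {
    const gpa = std.testing.allocator;
    // Source with CRLF line endings; the rendered output uses plain LF.
    var tree = try std.zig.Ast.parse(gpa, "const s =\r\n    \\\\one\r\n;\r\n", .zig);
    defer tree.deinit(gpa);
    const formatted = try tree.render(gpa);
    defer gpa.free(formatted);
    try std.testing.expectEqualStrings("const s =\n    \\\\one\n;\n", formatted);
}
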
Author: Ian Johnson
Date: 2024-09-09 22:23:45 -04:00
Committed by: Veikka Tuominen
Parent: d6d09f4ea7
Commit: 9007534551

4 changed files with 40 additions and 11 deletions

lib/std/zig/AstGen.zig

@@ -11721,16 +11721,14 @@ fn strLitNodeAsString(astgen: *AstGen, node: Ast.Node.Index) !IndexSlice {
     var tok_i = start;
     {
         const slice = tree.tokenSlice(tok_i);
-        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
-        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
+        const line_bytes = slice[2..];
         try string_bytes.appendSlice(gpa, line_bytes);
         tok_i += 1;
     }
     // Following lines: each line prepends a newline.
     while (tok_i <= end) : (tok_i += 1) {
         const slice = tree.tokenSlice(tok_i);
-        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
-        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
+        const line_bytes = slice[2..];
         try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len + 1);
         string_bytes.appendAssumeCapacity('\n');
         string_bytes.appendSliceAssumeCapacity(line_bytes);

lib/std/zig/parser_test.zig

@@ -3087,6 +3087,22 @@ test "zig fmt: multiline string" {
     );
 }
 
+test "zig fmt: multiline string with CRLF line endings" {
+    try testTransform("" ++
+        "const s =\r\n" ++
+        "    \\\\one\r\n" ++
+        "    \\\\two)\r\n" ++
+        "    \\\\three\r\n" ++
+        ";\r\n",
+        \\const s =
+        \\    \\one
+        \\    \\two)
+        \\    \\three
+        \\;
+        \\
+    );
+}
+
 test "zig fmt: values" {
     try testCanonical(
         \\test "values" {
@@ -4404,6 +4420,28 @@ test "zig fmt: invalid doc comments on comptime and test blocks" {
     });
 }
 
+test "zig fmt: comments with CRLF line endings" {
+    try testTransform("" ++
+        "//! Top-level doc comment\r\n" ++
+        "//! Continuing to another line\r\n" ++
+        "\r\n" ++
+        "/// Regular doc comment\r\n" ++
+        "const S = struct {\r\n" ++
+        "    // Regular comment\r\n" ++
+        "    // More content\r\n" ++
+        "};\r\n",
+        \\//! Top-level doc comment
+        \\//! Continuing to another line
+        \\
+        \\/// Regular doc comment
+        \\const S = struct {
+        \\    // Regular comment
+        \\    // More content
+        \\};
+        \\
+    );
+}
+
 test "zig fmt: else comptime expr" {
     try testCanonical(
         \\comptime {

lib/std/zig/render.zig

@@ -3170,9 +3170,6 @@ fn discardAllParams(r: *Render, fn_proto_node: Ast.Node.Index) Error!void {
 fn tokenSliceForRender(tree: Ast, token_index: Ast.TokenIndex) []const u8 {
     var ret = tree.tokenSlice(token_index);
     switch (tree.tokens.items(.tag)[token_index]) {
-        .multiline_string_literal_line => {
-            if (ret[ret.len - 1] == '\n') ret.len -= 1;
-        },
         .container_doc_comment, .doc_comment => {
             ret = mem.trimRight(u8, ret, &std.ascii.whitespace);
         },

lib/std/zig/tokenizer.zig

@@ -847,12 +847,10 @@ pub const Tokenizer = struct {
                         break;
                     },
                     '\n' => {
-                        self.index += 1;
                         break;
                     },
                     '\r' => {
                         if (self.buffer[self.index + 1] == '\n') {
-                            self.index += 2;
                             break;
                         } else {
                             state = .invalid;
@@ -1117,7 +1115,6 @@ pub const Tokenizer = struct {
                     },
                     '\r' => {
                         if (self.buffer[self.index + 1] == '\n') {
-                            self.index += 1;
                             result.tag = .doc_comment;
                             break;
                         } else {
@@ -1167,7 +1164,6 @@ pub const Tokenizer = struct {
                     },
                     '\r' => {
                         if (self.buffer[self.index + 1] == '\n') {
-                            self.index += 1;
                             break;
                         } else {
                             state = .invalid;