stage2 parser: UTF-8 encode \u{NNNNNN} escape sequences

The core of this change is to re-use the escape sequence parsing logic
for parsing both string and character literals.

The actual fix is that UTF-8 encoding was missing for string literals
with \u{...} escape sequences.
This commit is contained in:
Cody Tapscott
2022-03-01 20:51:01 -07:00
committed by Andrew Kelley
parent aa867c7dbe
commit 5c8a507e7a
4 changed files with 337 additions and 416 deletions

View File

@@ -14,6 +14,10 @@ pub const Ast = @import("zig/Ast.zig");
pub const system = @import("zig/system.zig");
pub const CrossTarget = @import("zig/CrossTarget.zig");
// Character literal parsing
pub const ParsedCharLiteral = string_literal.ParsedCharLiteral;
pub const parseCharLiteral = string_literal.parseCharLiteral;
// Files needed by translate-c.
pub const c_builtins = @import("zig/c_builtins.zig");
pub const c_translation = @import("zig/c_translation.zig");
@@ -185,205 +189,6 @@ pub fn binNameAlloc(allocator: std.mem.Allocator, options: BinNameOptions) error
}
}
/// Result of `parseCharLiteral`: either the decoded codepoint or a description
/// of why the literal is malformed. Every error payload is a byte index into
/// the slice that was passed to `parseCharLiteral`.
pub const ParsedCharLiteral = union(enum) {
    /// The literal is well formed; the payload is the value it denotes.
    success: u32,
    /// The character after backslash is not recognized.
    invalid_escape_character: usize,
    /// Expected hex digit at this index.
    expected_hex_digit: usize,
    /// Unicode escape sequence had no digits with rbrace at this index.
    empty_unicode_escape_sequence: usize,
    /// Expected hex digit or '}' at this index.
    expected_hex_digit_or_rbrace: usize,
    /// The unicode point is outside the range of Unicode codepoints.
    unicode_escape_overflow: usize,
    /// Expected '{' at this index.
    expected_lbrace: usize,
    /// Expected the terminating single quote at this index.
    expected_end: usize,
    /// The character at this index cannot be represented without an escape sequence.
    invalid_character: usize,
};

/// Parses a character literal, including its surrounding single quotes.
///
/// Only validates escape sequence characters.
/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between.
/// When the literal does not start with a backslash, the bytes between the
/// quotes are decoded as one UTF-8 codepoint without further validation.
///
/// For every escape form (`\n`, `\r`, `\\`, `\t`, `\'`, `\"`, `\xNN`,
/// `\u{N...}`) the closing quote must directly follow the escape and be the
/// last byte of `slice`; otherwise `expected_end` is returned with the index
/// at which the closing quote was required.
pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
    assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\'');

    if (slice[1] == 0) return .{ .invalid_character = 1 };
    if (slice[1] != '\\') {
        const codepoint = std.unicode.utf8Decode(slice[1 .. slice.len - 1]) catch unreachable;
        return .{ .success = codepoint };
    }

    // Index at which the terminating quote must sit once the escape has been
    // consumed. Simple escapes occupy indices 1..2, so the quote belongs at 3;
    // the `x` and `u` forms advance this as they consume digits.
    //
    // No explicit bounds checks are needed below: the final byte is known to
    // be "'", which is neither a hex digit nor a brace, so every scan stops
    // at or before the last index.
    var end: usize = 3;
    const value: u32 = switch (slice[2]) {
        'n' => '\n',
        'r' => '\r',
        '\\' => '\\',
        't' => '\t',
        '\'' => '\'',
        '"' => '"',
        'x' => hex: {
            // Exactly two hex digits.
            var byte: u32 = 0;
            while (end < 5) : (end += 1) {
                const digit = std.fmt.charToDigit(slice[end], 16) catch
                    return .{ .expected_hex_digit = end };
                byte = byte * 16 + digit;
            }
            break :hex byte;
        },
        'u' => unicode: {
            if (slice[end] != '{') return .{ .expected_lbrace = end };
            end += 1;
            if (slice[end] == '}') return .{ .empty_unicode_escape_sequence = end };

            var codepoint: u32 = 0;
            while (slice[end] != '}') : (end += 1) {
                const digit = std.fmt.charToDigit(slice[end], 16) catch
                    return .{ .expected_hex_digit_or_rbrace = end };
                // Checked after every digit, so the multiplication can never
                // overflow a u32 (the previous value is at most 0x10ffff).
                codepoint = codepoint * 16 + digit;
                if (codepoint > 0x10ffff) return .{ .unicode_escape_overflow = end };
            }
            end += 1; // step past '}'
            break :unicode codepoint;
        },
        else => return .{ .invalid_escape_character = 2 },
    };

    // The last byte is always "'", so requiring `end` to be the last index
    // checks both that a quote is present there and that nothing follows it.
    if (end + 1 != slice.len) return .{ .expected_end = end };
    return .{ .success = value };
}
test "parseCharLiteral" {
    // Table of (literal source, expected parse result) pairs, checked in order.
    const Case = struct {
        source: []const u8,
        expected: ParsedCharLiteral,
    };
    const cases = [_]Case{
        // Plain characters and well-formed escapes.
        .{ .source = "'a'", .expected = .{ .success = 'a' } },
        .{ .source = "'ä'", .expected = .{ .success = 'ä' } },
        .{ .source = "'\\x00'", .expected = .{ .success = 0 } },
        .{ .source = "'\\x4f'", .expected = .{ .success = 0x4f } },
        .{ .source = "'\\x4F'", .expected = .{ .success = 0x4f } },
        .{ .source = "'ぁ'", .expected = .{ .success = 0x3041 } },
        .{ .source = "'\\u{0}'", .expected = .{ .success = 0 } },
        .{ .source = "'\\u{3041}'", .expected = .{ .success = 0x3041 } },
        .{ .source = "'\\u{7f}'", .expected = .{ .success = 0x7f } },
        .{ .source = "'\\u{7FFF}'", .expected = .{ .success = 0x7fff } },
        // Malformed literals; the payload is the byte index of the problem.
        .{ .source = "'\\x0'", .expected = .{ .expected_hex_digit = 4 } },
        .{ .source = "'\\x000'", .expected = .{ .expected_end = 5 } },
        .{ .source = "'\\y'", .expected = .{ .invalid_escape_character = 2 } },
        .{ .source = "'\\u'", .expected = .{ .expected_lbrace = 3 } },
        .{ .source = "'\\uFFFF'", .expected = .{ .expected_lbrace = 3 } },
        .{ .source = "'\\u{}'", .expected = .{ .empty_unicode_escape_sequence = 4 } },
        .{ .source = "'\\u{FFFFFF}'", .expected = .{ .unicode_escape_overflow = 9 } },
        .{ .source = "'\\u{FFFF'", .expected = .{ .expected_hex_digit_or_rbrace = 8 } },
        .{ .source = "'\\u{FFFF}x'", .expected = .{ .expected_end = 9 } },
        .{ .source = "'\x00'", .expected = .{ .invalid_character = 1 } },
    };

    for (cases) |case| {
        try std.testing.expectEqual(case.expected, parseCharLiteral(case.source));
    }
}
test {
    // Reference every declaration of this file through std.testing.refAllDecls.
    const testing = @import("std").testing;
    testing.refAllDecls(@This());
}