stage2 parser: UTF-8 encode \u{NNNNNN} escape sequences
The core of this change is to re-use the escape sequence parsing logic
for parsing both string and character literals.
The actual fix is that UTF-8 encoding was missing for string literals
with \u{...} escape sequences.
This commit is contained in:
committed by
Andrew Kelley
parent
aa867c7dbe
commit
5c8a507e7a
203
lib/std/zig.zig
203
lib/std/zig.zig
@@ -14,6 +14,10 @@ pub const Ast = @import("zig/Ast.zig");
|
||||
pub const system = @import("zig/system.zig");
|
||||
pub const CrossTarget = @import("zig/CrossTarget.zig");
|
||||
|
||||
// Character literal parsing
|
||||
pub const ParsedCharLiteral = string_literal.ParsedCharLiteral;
|
||||
pub const parseCharLiteral = string_literal.parseCharLiteral;
|
||||
|
||||
// Files needed by translate-c.
|
||||
pub const c_builtins = @import("zig/c_builtins.zig");
|
||||
pub const c_translation = @import("zig/c_translation.zig");
|
||||
@@ -185,205 +189,6 @@ pub fn binNameAlloc(allocator: std.mem.Allocator, options: BinNameOptions) error
|
||||
}
|
||||
}
|
||||
|
||||
/// Outcome of `parseCharLiteral`.
///
/// `success` carries the decoded value of the literal. Every other tag is a
/// failure whose `usize` payload is a byte index into the slice that was
/// handed to `parseCharLiteral`, pointing at the offending position.
pub const ParsedCharLiteral = union(enum) {
    /// The literal was parsed; the payload is its value.
    success: u32,
    /// The byte following the backslash, at this index, is not a known escape.
    invalid_escape_character: usize,
    /// A hex digit was required at this index.
    expected_hex_digit: usize,
    /// A `\u{}` escape contained no digits; the index is that of the '}'.
    empty_unicode_escape_sequence: usize,
    /// Either a hex digit or '}' was required at this index.
    expected_hex_digit_or_rbrace: usize,
    /// The value of a `\u{...}` escape exceeds the Unicode codepoint range.
    unicode_escape_overflow: usize,
    /// A '{' was required at this index.
    expected_lbrace: usize,
    /// The closing single quote was required at this index.
    expected_end: usize,
    /// The byte at this index may only appear in a literal as an escape sequence.
    invalid_character: usize,
};
|
||||
|
||||
/// Parses the character literal in `slice` and returns its value.
///
/// `slice` must start and end with a single quote and be at least three bytes
/// long (asserted). When the literal is not an escape sequence, the bytes
/// between the quotes must be exactly one valid UTF-8 encoded codepoint; this
/// is NOT validated and violating it is illegal behavior.
///
/// Escape sequences are validated. The recognized forms are `\n`, `\r`, `\\`,
/// `\t`, `\'`, `\"`, `\xNN` (exactly two hex digits) and `\u{N...}` (one or
/// more hex digits, value at most 0x10ffff). Every escape sequence must be
/// followed immediately by the terminating quote, otherwise `expected_end`
/// is returned with the index just past the escape sequence.
///
/// A literal NUL byte right after the opening quote yields `invalid_character`.
/// All indices in failure results are byte offsets into `slice`.
pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
    assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\'');

    switch (slice[1]) {
        0 => return .{ .invalid_character = 1 },
        '\\' => {},
        else => {
            // Not an escape: the caller guarantees a single valid codepoint.
            const codepoint = std.unicode.utf8Decode(slice[1 .. slice.len - 1]) catch unreachable;
            return .{ .success = codepoint };
        },
    }

    // Escape sequence. Decode it into `value` and leave `i` on the first byte
    // after the sequence. Indexing below never runs past the slice: the final
    // byte is always the closing quote, which is neither a hex digit, '{' nor
    // '}', so each scan stops there at the latest.
    var value: u32 = 0;
    var i: usize = 3;
    switch (slice[2]) {
        'n' => value = '\n',
        'r' => value = '\r',
        '\\' => value = '\\',
        't' => value = '\t',
        '\'' => value = '\'',
        '"' => value = '"',
        'x' => {
            // Exactly two hex digits at indices 3 and 4.
            while (i < 5) : (i += 1) {
                const digit = std.fmt.charToDigit(slice[i], 16) catch
                    return .{ .expected_hex_digit = i };
                value = value * 16 + digit;
            }
        },
        'u' => {
            if (slice[i] != '{') return .{ .expected_lbrace = i };
            i += 1;
            if (slice[i] == '}') return .{ .empty_unicode_escape_sequence = i };

            while (i < slice.len) : (i += 1) {
                const c = slice[i];
                if (c == '}') {
                    i += 1;
                    break;
                }
                const digit = std.fmt.charToDigit(c, 16) catch
                    return .{ .expected_hex_digit_or_rbrace = i };
                value = value * 16 + digit;
                // Checked after every digit, so `value` stays far below the
                // u32 limit and the multiplication above cannot overflow.
                if (value > 0x10ffff) return .{ .unicode_escape_overflow = i };
            }
        },
        else => return .{ .invalid_escape_character = 2 },
    }

    // The escape must be directly followed by the terminating quote. Since the
    // last byte is known to be a quote, comparing positions is sufficient, and
    // it also covers `i == slice.len` (e.g. the unterminated literal `'\'`).
    if (i + 1 != slice.len) return .{ .expected_end = i };
    return .{ .success = value };
}
|
||||
|
||||
test "parseCharLiteral" {
    // Each entry pairs a literal (quotes included) with the exact result
    // parseCharLiteral must produce for it. Entries are checked in order.
    const Case = struct {
        input: []const u8,
        expected: ParsedCharLiteral,
    };

    const cases = [_]Case{
        // Literals that parse successfully.
        .{ .input = "'a'", .expected = .{ .success = 'a' } },
        .{ .input = "'ä'", .expected = .{ .success = 'ä' } },
        .{ .input = "'\\x00'", .expected = .{ .success = 0 } },
        .{ .input = "'\\x4f'", .expected = .{ .success = 0x4f } },
        .{ .input = "'\\x4F'", .expected = .{ .success = 0x4f } },
        .{ .input = "'ぁ'", .expected = .{ .success = 0x3041 } },
        .{ .input = "'\\u{0}'", .expected = .{ .success = 0 } },
        .{ .input = "'\\u{3041}'", .expected = .{ .success = 0x3041 } },
        .{ .input = "'\\u{7f}'", .expected = .{ .success = 0x7f } },
        .{ .input = "'\\u{7FFF}'", .expected = .{ .success = 0x7fff } },

        // Malformed literals and the index each error is reported at.
        .{ .input = "'\\x0'", .expected = .{ .expected_hex_digit = 4 } },
        .{ .input = "'\\x000'", .expected = .{ .expected_end = 5 } },
        .{ .input = "'\\y'", .expected = .{ .invalid_escape_character = 2 } },
        .{ .input = "'\\u'", .expected = .{ .expected_lbrace = 3 } },
        .{ .input = "'\\uFFFF'", .expected = .{ .expected_lbrace = 3 } },
        .{ .input = "'\\u{}'", .expected = .{ .empty_unicode_escape_sequence = 4 } },
        .{ .input = "'\\u{FFFFFF}'", .expected = .{ .unicode_escape_overflow = 9 } },
        .{ .input = "'\\u{FFFF'", .expected = .{ .expected_hex_digit_or_rbrace = 8 } },
        .{ .input = "'\\u{FFFF}x'", .expected = .{ .expected_end = 9 } },
        .{ .input = "'\x00'", .expected = .{ .invalid_character = 1 } },
    };

    for (cases) |case| {
        try std.testing.expectEqual(case.expected, parseCharLiteral(case.input));
    }
}
|
||||
|
||||
test {
    // Reference every declaration of this file so that nested tests are
    // picked up when this file's tests run.
    const testing = @import("std").testing;
    testing.refAllDecls(@This());
}
|
||||
|
||||
Reference in New Issue
Block a user