//! zig/src/link/tapi/Tokenizer.zig
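//! A small hand-written tokenizer for the YAML subset found in TAPI/TBD text stubs
//! (see the "part of tbd" test below). It turns a byte buffer into a flat stream of
//! `Token`s and never fails; the stream always ends with an `.eof` token.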
const Tokenizer = @This();
const std = @import("std");
const log = std.log.scoped(.yaml);
const testing = std.testing;
buffer: []const u8,
index: usize = 0,
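
/// A single lexical token: its kind plus the [start, end) byte range in `buffer`.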
pub const Token = struct {
id: Id,
start: usize,
end: usize,
pub const Id = enum {
// zig fmt: off
eof,
new_line,
doc_start, // ---
doc_end, // ...
seq_item_ind, // -
map_value_ind, // :
flow_map_start, // {
flow_map_end, // }
flow_seq_start, // [
flow_seq_end, // ]
comma,
space,
tab,
comment, // #
alias, // *
anchor, // &
tag, // !
single_quoted, // '...'
double_quoted, // "..."
literal,
// zig fmt: on
};
};
pub const TokenIndex = usize;
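
/// A cursor over a previously collected slice of tokens, with support for peeking,
/// rewinding, and absolute or relative seeking.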
pub const TokenIterator = struct {
buffer: []const Token,
pos: TokenIndex = 0,
pub fn next(self: *TokenIterator) ?Token {
const token = self.peek() orelse return null;
self.pos += 1;
return token;
}
pub fn peek(self: TokenIterator) ?Token {
if (self.pos >= self.buffer.len) return null;
return self.buffer[self.pos];
}
pub fn reset(self: *TokenIterator) void {
self.pos = 0;
}
pub fn seekTo(self: *TokenIterator, pos: TokenIndex) void {
self.pos = pos;
}
pub fn seekBy(self: *TokenIterator, offset: isize) void {
const new_pos = @bitCast(isize, self.pos) + offset;
if (new_pos < 0) {
self.pos = 0;
} else {
self.pos = @intCast(usize, new_pos);
}
}
};
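
/// Returns true if `slice` starts with the comptime-known `pattern`.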
fn stringMatchesPattern(comptime pattern: []const u8, slice: []const u8) bool {
comptime var count: usize = 0;
inline while (count < pattern.len) : (count += 1) {
if (count >= slice.len) return false;
const c = slice[count];
if (pattern[count] != c) return false;
}
return true;
}
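
/// Returns true if the unconsumed input at `self.index` starts with `pattern`.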
fn matchesPattern(self: Tokenizer, comptime pattern: []const u8) bool {
return stringMatchesPattern(pattern, self.buffer[self.index..]);
}
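
/// Scans and returns the next token, advancing `self.index` past it.
/// Tokenization cannot fail; once the buffer is exhausted the returned token has id `.eof`.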
pub fn next(self: *Tokenizer) Token {
var result = Token{
.id = .eof,
.start = self.index,
.end = undefined,
};
var state: enum {
start,
new_line,
space,
tab,
comment,
single_quoted,
double_quoted,
literal,
} = .start;
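// Consume one byte per iteration. Marker tokens (',', ':', "---", "- ", ...) are emitted
// directly from the .start state; spaces, comments, quoted scalars and literals switch
// into a sub-state and accumulate until a terminating byte (or the end of input) is seen.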
while (self.index < self.buffer.len) : (self.index += 1) {
const c = self.buffer[self.index];
switch (state) {
.start => switch (c) {
' ' => {
state = .space;
},
'\t' => {
state = .tab;
},
'\n' => {
result.id = .new_line;
self.index += 1;
break;
},
'\r' => {
state = .new_line;
},
'-' => if (self.matchesPattern("---")) {
result.id = .doc_start;
self.index += "---".len;
break;
} else if (self.matchesPattern("- ")) {
result.id = .seq_item_ind;
self.index += "- ".len;
break;
} else {
state = .literal;
},
'.' => if (self.matchesPattern("...")) {
result.id = .doc_end;
self.index += "...".len;
break;
} else {
state = .literal;
},
',' => {
result.id = .comma;
self.index += 1;
break;
},
'#' => {
state = .comment;
},
'*' => {
result.id = .alias;
self.index += 1;
break;
},
'&' => {
result.id = .anchor;
self.index += 1;
break;
},
'!' => {
result.id = .tag;
self.index += 1;
break;
},
'[' => {
result.id = .flow_seq_start;
self.index += 1;
break;
},
']' => {
result.id = .flow_seq_end;
self.index += 1;
break;
},
':' => {
result.id = .map_value_ind;
self.index += 1;
break;
},
'{' => {
result.id = .flow_map_start;
self.index += 1;
break;
},
'}' => {
result.id = .flow_map_end;
self.index += 1;
break;
},
'\'' => {
state = .single_quoted;
},
'"' => {
state = .double_quoted;
},
else => {
state = .literal;
},
},
.comment => switch (c) {
'\r', '\n' => {
result.id = .comment;
break;
},
else => {},
},
.space => switch (c) {
' ' => {},
else => {
result.id = .space;
break;
},
},
.tab => switch (c) {
'\t' => {},
else => {
result.id = .tab;
break;
},
},
.new_line => switch (c) {
'\n' => {
result.id = .new_line;
self.index += 1;
break;
},
else => {}, // TODO this should be an error condition
},
.single_quoted => switch (c) {
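// Within a single-quoted scalar, '' is an escaped quote; a lone ' closes the scalar.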
'\'' => if (!self.matchesPattern("''")) {
result.id = .single_quoted;
self.index += 1;
break;
} else {
self.index += "''".len - 1;
},
else => {},
},
.double_quoted => switch (c) {
'"' => {
// A '"' preceded by a backslash is escaped and does not close the scalar.
if (stringMatchesPattern("\\", self.buffer[self.index - 1 ..])) {
// Escaped quote: keep scanning past it.
} else {
result.id = .double_quoted;
self.index += 1;
break;
}
},
else => {},
},
.literal => switch (c) {
'\r', '\n', ' ', '\'', '"', ',', ':', ']', '}' => {
result.id = .literal;
break;
},
else => {
result.id = .literal;
},
},
}
}
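// The buffer ran out mid-token: only an unfinished literal is promoted to a token here;
// any other pending state falls through and the result keeps its `.eof` id.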
if (self.index >= self.buffer.len) {
switch (state) {
.literal => {
result.id = .literal;
},
else => {},
}
}
result.end = self.index;
log.debug("{any}", .{result});
log.debug(" | {s}", .{self.buffer[result.start..result.end]});
return result;
}
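
/// Test helper: tokenizes `source` to completion and compares the resulting token ids
/// with `expected`.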
fn testExpected(source: []const u8, expected: []const Token.Id) !void {
var tokenizer = Tokenizer{
.buffer = source,
};
var given = std.ArrayList(Token.Id).init(testing.allocator);
defer given.deinit();
while (true) {
const token = tokenizer.next();
try given.append(token.id);
if (token.id == .eof) break;
}
try testing.expectEqualSlices(Token.Id, expected, given.items);
}
test {
std.testing.refAllDecls(@This());
}
test "empty doc" {
try testExpected("", &[_]Token.Id{.eof});
}
test "empty doc with explicit markers" {
try testExpected(
\\---
\\...
, &[_]Token.Id{
.doc_start, .new_line, .doc_end, .eof,
});
}
test "empty doc with explicit markers and a directive" {
try testExpected(
\\--- !tbd-v1
\\...
, &[_]Token.Id{
.doc_start,
.space,
.tag,
.literal,
.new_line,
.doc_end,
.eof,
});
}
test "sequence of values" {
try testExpected(
\\- 0
\\- 1
\\- 2
, &[_]Token.Id{
.seq_item_ind,
.literal,
.new_line,
.seq_item_ind,
.literal,
.new_line,
.seq_item_ind,
.literal,
.eof,
});
}
test "sequence of sequences" {
try testExpected(
\\- [ val1, val2]
\\- [val3, val4 ]
, &[_]Token.Id{
.seq_item_ind,
.flow_seq_start,
.space,
.literal,
.comma,
.space,
.literal,
.flow_seq_end,
.new_line,
.seq_item_ind,
.flow_seq_start,
.literal,
.comma,
.space,
.literal,
.space,
.flow_seq_end,
.eof,
});
}
test "mappings" {
try testExpected(
\\key1: value1
\\key2: value2
, &[_]Token.Id{
.literal,
.map_value_ind,
.space,
.literal,
.new_line,
.literal,
.map_value_ind,
.space,
.literal,
.eof,
});
}
test "inline mapped sequence of values" {
try testExpected(
\\key : [ val1,
\\ val2 ]
, &[_]Token.Id{
.literal,
.space,
.map_value_ind,
.space,
.flow_seq_start,
.space,
.literal,
.comma,
.space,
.new_line,
.space,
.literal,
.space,
.flow_seq_end,
.eof,
});
}
test "part of tbd" {
try testExpected(
\\--- !tapi-tbd
\\tbd-version: 4
\\targets: [ x86_64-macos ]
\\
\\uuids:
\\ - target: x86_64-macos
\\ value: F86CC732-D5E4-30B5-AA7D-167DF5EC2708
\\
\\install-name: '/usr/lib/libSystem.B.dylib'
\\...
, &[_]Token.Id{
.doc_start,
.space,
.tag,
.literal,
.new_line,
.literal,
.map_value_ind,
.space,
.literal,
.new_line,
.literal,
.map_value_ind,
.space,
.flow_seq_start,
.space,
.literal,
.space,
.flow_seq_end,
.new_line,
.new_line,
.literal,
.map_value_ind,
.new_line,
.space,
.seq_item_ind,
.literal,
.map_value_ind,
.space,
.literal,
.new_line,
.space,
.literal,
.map_value_ind,
.space,
.literal,
.new_line,
.new_line,
.literal,
.map_value_ind,
.space,
.single_quoted,
.new_line,
.doc_end,
.eof,
});
}
test "Unindented list" {
try testExpected(
\\b:
\\- foo: 1
\\c: 1
, &[_]Token.Id{
.literal,
.map_value_ind,
.new_line,
.seq_item_ind,
.literal,
.map_value_ind,
.space,
.literal,
.new_line,
.literal,
.map_value_ind,
.space,
.literal,
.eof,
});
}
test "escape sequences" {
try testExpected(
\\a: 'here''s an apostrophe'
\\b: "a newline\nand a\ttab"
\\c: "\"here\" and there"
, &[_]Token.Id{
.literal,
.map_value_ind,
.space,
.single_quoted,
.new_line,
.literal,
.map_value_ind,
.space,
.double_quoted,
.new_line,
.literal,
.map_value_ind,
.space,
.double_quoted,
.eof,
});
}
test "comments" {
try testExpected(
\\key: # some comment about the key
\\# first value
\\- val1
\\# second value
\\- val2
, &[_]Token.Id{
.literal,
.map_value_ind,
.space,
.comment,
.new_line,
.comment,
.new_line,
.seq_item_ind,
.literal,
.new_line,
.comment,
.new_line,
.seq_item_ind,
.literal,
.eof,
});
}
test "quoted literals" {
try testExpected(
\\'#000000'
\\'[000000'
\\"&someString"
, &[_]Token.Id{
.single_quoted,
.new_line,
.single_quoted,
.new_line,
.double_quoted,
.eof,
});
}
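
// A minimal usage sketch of TokenIterator (not covered by the tests above): collect the
// tokens of a tiny document, then replay them with next/peek/seekBy/reset. The expected
// ids mirror the "mappings" test.
test "TokenIterator usage sketch" {
    var tokenizer = Tokenizer{ .buffer = "key: value" };
    var tokens = std.ArrayList(Token).init(testing.allocator);
    defer tokens.deinit();
    while (true) {
        const token = tokenizer.next();
        try tokens.append(token);
        if (token.id == .eof) break;
    }

    var it = TokenIterator{ .buffer = tokens.items };
    try testing.expectEqual(Token.Id.literal, it.next().?.id);
    try testing.expectEqual(Token.Id.map_value_ind, it.next().?.id);

    // Step back one token and read it again.
    it.seekBy(-1);
    try testing.expectEqual(Token.Id.map_value_ind, it.next().?.id);
    try testing.expectEqual(Token.Id.space, it.next().?.id);
    try testing.expectEqual(Token.Id.literal, it.next().?.id);
    try testing.expectEqual(Token.Id.eof, it.next().?.id);
    try testing.expect(it.next() == null);

    // reset rewinds to the first token; peek does not consume it.
    it.reset();
    try testing.expectEqual(Token.Id.literal, it.peek().?.id);
}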