blob 09774cf0 (24330B) - Raw
1 const std = @import("std"); 2 const assert = std.debug.assert; 3 const log = std.log.scoped(.yaml); 4 const mem = std.mem; 5 6 const Allocator = mem.Allocator; 7 const Tokenizer = @import("Tokenizer.zig"); 8 const Token = Tokenizer.Token; 9 const TokenIndex = Tokenizer.TokenIndex; 10 const TokenIterator = Tokenizer.TokenIterator; 11 12 pub const ParseError = error{ 13 InvalidEscapeSequence, 14 MalformedYaml, 15 NestedDocuments, 16 UnexpectedEof, 17 UnexpectedToken, 18 Unhandled, 19 } || Allocator.Error; 20 21 pub const Node = struct { 22 tag: Tag, 23 tree: *const Tree, 24 start: TokenIndex, 25 end: TokenIndex, 26 27 pub const Tag = enum { 28 doc, 29 map, 30 list, 31 value, 32 }; 33 34 pub fn cast(self: *const Node, comptime T: type) ?*const T { 35 if (self.tag != T.base_tag) { 36 return null; 37 } 38 return @fieldParentPtr(T, "base", self); 39 } 40 41 pub fn deinit(self: *Node, allocator: Allocator) void { 42 switch (self.tag) { 43 .doc => @fieldParentPtr(Node.Doc, "base", self).deinit(allocator), 44 .map => @fieldParentPtr(Node.Map, "base", self).deinit(allocator), 45 .list => @fieldParentPtr(Node.List, "base", self).deinit(allocator), 46 .value => @fieldParentPtr(Node.Value, "base", self).deinit(allocator), 47 } 48 } 49 50 pub fn format( 51 self: *const Node, 52 comptime fmt: []const u8, 53 options: std.fmt.FormatOptions, 54 writer: anytype, 55 ) !void { 56 return switch (self.tag) { 57 .doc => @fieldParentPtr(Node.Doc, "base", self).format(fmt, options, writer), 58 .map => @fieldParentPtr(Node.Map, "base", self).format(fmt, options, writer), 59 .list => @fieldParentPtr(Node.List, "base", self).format(fmt, options, writer), 60 .value => @fieldParentPtr(Node.Value, "base", self).format(fmt, options, writer), 61 }; 62 } 63 64 pub const Doc = struct { 65 base: Node = Node{ 66 .tag = Tag.doc, 67 .tree = undefined, 68 .start = undefined, 69 .end = undefined, 70 }, 71 directive: ?TokenIndex = null, 72 value: ?*Node = null, 73 74 pub const base_tag: Node.Tag = .doc; 75 76 pub fn deinit(self: *Doc, allocator: Allocator) void { 77 if (self.value) |node| { 78 node.deinit(allocator); 79 allocator.destroy(node); 80 } 81 } 82 83 pub fn format( 84 self: *const Doc, 85 comptime fmt: []const u8, 86 options: std.fmt.FormatOptions, 87 writer: anytype, 88 ) !void { 89 _ = options; 90 _ = fmt; 91 if (self.directive) |id| { 92 try std.fmt.format(writer, "{{ ", .{}); 93 const directive = self.base.tree.getRaw(id, id); 94 try std.fmt.format(writer, ".directive = {s}, ", .{directive}); 95 } 96 if (self.value) |node| { 97 try std.fmt.format(writer, "{}", .{node}); 98 } 99 if (self.directive != null) { 100 try std.fmt.format(writer, " }}", .{}); 101 } 102 } 103 }; 104 105 pub const Map = struct { 106 base: Node = Node{ 107 .tag = Tag.map, 108 .tree = undefined, 109 .start = undefined, 110 .end = undefined, 111 }, 112 values: std.ArrayListUnmanaged(Entry) = .{}, 113 114 pub const base_tag: Node.Tag = .map; 115 116 pub const Entry = struct { 117 key: TokenIndex, 118 value: ?*Node, 119 }; 120 121 pub fn deinit(self: *Map, allocator: Allocator) void { 122 for (self.values.items) |entry| { 123 if (entry.value) |value| { 124 value.deinit(allocator); 125 allocator.destroy(value); 126 } 127 } 128 self.values.deinit(allocator); 129 } 130 131 pub fn format( 132 self: *const Map, 133 comptime fmt: []const u8, 134 options: std.fmt.FormatOptions, 135 writer: anytype, 136 ) !void { 137 _ = options; 138 _ = fmt; 139 try std.fmt.format(writer, "{{ ", .{}); 140 for (self.values.items) |entry| { 141 const key = self.base.tree.getRaw(entry.key, entry.key); 142 if (entry.value) |value| { 143 try std.fmt.format(writer, "{s} => {}, ", .{ key, value }); 144 } else { 145 try std.fmt.format(writer, "{s} => null, ", .{key}); 146 } 147 } 148 return std.fmt.format(writer, " }}", .{}); 149 } 150 }; 151 152 pub const List = struct { 153 base: Node = Node{ 154 .tag = Tag.list, 155 .tree = undefined, 156 .start = undefined, 157 .end = undefined, 158 }, 159 values: std.ArrayListUnmanaged(*Node) = .{}, 160 161 pub const base_tag: Node.Tag = .list; 162 163 pub fn deinit(self: *List, allocator: Allocator) void { 164 for (self.values.items) |node| { 165 node.deinit(allocator); 166 allocator.destroy(node); 167 } 168 self.values.deinit(allocator); 169 } 170 171 pub fn format( 172 self: *const List, 173 comptime fmt: []const u8, 174 options: std.fmt.FormatOptions, 175 writer: anytype, 176 ) !void { 177 _ = options; 178 _ = fmt; 179 try std.fmt.format(writer, "[ ", .{}); 180 for (self.values.items) |node| { 181 try std.fmt.format(writer, "{}, ", .{node}); 182 } 183 return std.fmt.format(writer, " ]", .{}); 184 } 185 }; 186 187 pub const Value = struct { 188 base: Node = Node{ 189 .tag = Tag.value, 190 .tree = undefined, 191 .start = undefined, 192 .end = undefined, 193 }, 194 string_value: std.ArrayListUnmanaged(u8) = .{}, 195 196 pub const base_tag: Node.Tag = .value; 197 198 pub fn deinit(self: *Value, allocator: Allocator) void { 199 self.string_value.deinit(allocator); 200 } 201 202 pub fn format( 203 self: *const Value, 204 comptime fmt: []const u8, 205 options: std.fmt.FormatOptions, 206 writer: anytype, 207 ) !void { 208 _ = options; 209 _ = fmt; 210 const raw = self.base.tree.getRaw(self.base.start, self.base.end); 211 return std.fmt.format(writer, "{s}", .{raw}); 212 } 213 }; 214 }; 215 216 pub const LineCol = struct { 217 line: usize, 218 col: usize, 219 }; 220 221 pub const Tree = struct { 222 allocator: Allocator, 223 source: []const u8, 224 tokens: []Token, 225 line_cols: std.AutoHashMap(TokenIndex, LineCol), 226 docs: std.ArrayListUnmanaged(*Node) = .{}, 227 228 pub fn init(allocator: Allocator) Tree { 229 return .{ 230 .allocator = allocator, 231 .source = undefined, 232 .tokens = undefined, 233 .line_cols = std.AutoHashMap(TokenIndex, LineCol).init(allocator), 234 }; 235 } 236 237 pub fn deinit(self: *Tree) void { 238 self.allocator.free(self.tokens); 239 self.line_cols.deinit(); 240 for (self.docs.items) |doc| { 241 doc.deinit(self.allocator); 242 self.allocator.destroy(doc); 243 } 244 self.docs.deinit(self.allocator); 245 } 246 247 pub fn getDirective(self: Tree, doc_index: usize) ?[]const u8 { 248 assert(doc_index < self.docs.items.len); 249 const doc = self.docs.items[doc_index].cast(Node.Doc) orelse return null; 250 const id = doc.directive orelse return null; 251 return self.getRaw(id, id); 252 } 253 254 pub fn getRaw(self: Tree, start: TokenIndex, end: TokenIndex) []const u8 { 255 assert(start <= end); 256 assert(start < self.tokens.len and end < self.tokens.len); 257 const start_token = self.tokens[start]; 258 const end_token = self.tokens[end]; 259 return self.source[start_token.start..end_token.end]; 260 } 261 262 pub fn parse(self: *Tree, source: []const u8) !void { 263 var tokenizer = Tokenizer{ .buffer = source }; 264 var tokens = std.ArrayList(Token).init(self.allocator); 265 defer tokens.deinit(); 266 267 var line: usize = 0; 268 var prev_line_last_col: usize = 0; 269 270 while (true) { 271 const token = tokenizer.next(); 272 const tok_id = tokens.items.len; 273 try tokens.append(token); 274 275 try self.line_cols.putNoClobber(tok_id, .{ 276 .line = line, 277 .col = token.start - prev_line_last_col, 278 }); 279 280 switch (token.id) { 281 .eof => break, 282 .new_line => { 283 line += 1; 284 prev_line_last_col = token.end; 285 }, 286 else => {}, 287 } 288 } 289 290 self.source = source; 291 self.tokens = try tokens.toOwnedSlice(); 292 293 var it = TokenIterator{ .buffer = self.tokens }; 294 var parser = Parser{ 295 .allocator = self.allocator, 296 .tree = self, 297 .token_it = &it, 298 .line_cols = &self.line_cols, 299 }; 300 301 parser.eatCommentsAndSpace(&.{}); 302 303 while (true) { 304 parser.eatCommentsAndSpace(&.{}); 305 const token = parser.token_it.next() orelse break; 306 307 log.debug("(main) next {s}@{d}", .{ @tagName(token.id), parser.token_it.pos - 1 }); 308 309 switch (token.id) { 310 .eof => break, 311 else => { 312 parser.token_it.seekBy(-1); 313 const doc = try parser.doc(); 314 try self.docs.append(self.allocator, doc); 315 }, 316 } 317 } 318 } 319 }; 320 321 const Parser = struct { 322 allocator: Allocator, 323 tree: *Tree, 324 token_it: *TokenIterator, 325 line_cols: *const std.AutoHashMap(TokenIndex, LineCol), 326 327 fn value(self: *Parser) ParseError!?*Node { 328 self.eatCommentsAndSpace(&.{}); 329 330 const pos = self.token_it.pos; 331 const token = self.token_it.next() orelse return error.UnexpectedEof; 332 333 log.debug(" next {s}@{d}", .{ @tagName(token.id), pos }); 334 335 switch (token.id) { 336 .literal => if (self.eatToken(.map_value_ind, &.{ .new_line, .comment })) |_| { 337 // map 338 self.token_it.seekTo(pos); 339 return self.map(); 340 } else { 341 // leaf value 342 self.token_it.seekTo(pos); 343 return self.leaf_value(); 344 }, 345 .single_quoted, .double_quoted => { 346 // leaf value 347 self.token_it.seekBy(-1); 348 return self.leaf_value(); 349 }, 350 .seq_item_ind => { 351 // list 352 self.token_it.seekBy(-1); 353 return self.list(); 354 }, 355 .flow_seq_start => { 356 // list 357 self.token_it.seekBy(-1); 358 return self.list_bracketed(); 359 }, 360 else => return null, 361 } 362 } 363 364 fn doc(self: *Parser) ParseError!*Node { 365 const node = try self.allocator.create(Node.Doc); 366 errdefer self.allocator.destroy(node); 367 node.* = .{}; 368 node.base.tree = self.tree; 369 node.base.start = self.token_it.pos; 370 371 log.debug("(doc) begin {s}@{d}", .{ @tagName(self.tree.tokens[node.base.start].id), node.base.start }); 372 373 // Parse header 374 const explicit_doc: bool = if (self.eatToken(.doc_start, &.{})) |doc_pos| explicit_doc: { 375 if (self.getCol(doc_pos) > 0) return error.MalformedYaml; 376 if (self.eatToken(.tag, &.{ .new_line, .comment })) |_| { 377 node.directive = try self.expectToken(.literal, &.{ .new_line, .comment }); 378 } 379 break :explicit_doc true; 380 } else false; 381 382 // Parse value 383 node.value = try self.value(); 384 if (node.value == null) { 385 self.token_it.seekBy(-1); 386 } 387 errdefer if (node.value) |val| { 388 val.deinit(self.allocator); 389 self.allocator.destroy(val); 390 }; 391 392 // Parse footer 393 footer: { 394 if (self.eatToken(.doc_end, &.{})) |pos| { 395 if (!explicit_doc) return error.UnexpectedToken; 396 if (self.getCol(pos) > 0) return error.MalformedYaml; 397 node.base.end = pos; 398 break :footer; 399 } 400 if (self.eatToken(.doc_start, &.{})) |pos| { 401 if (!explicit_doc) return error.UnexpectedToken; 402 if (self.getCol(pos) > 0) return error.MalformedYaml; 403 self.token_it.seekBy(-1); 404 node.base.end = pos - 1; 405 break :footer; 406 } 407 if (self.eatToken(.eof, &.{})) |pos| { 408 node.base.end = pos - 1; 409 break :footer; 410 } 411 return error.UnexpectedToken; 412 } 413 414 log.debug("(doc) end {s}@{d}", .{ @tagName(self.tree.tokens[node.base.end].id), node.base.end }); 415 416 return &node.base; 417 } 418 419 fn map(self: *Parser) ParseError!*Node { 420 const node = try self.allocator.create(Node.Map); 421 errdefer self.allocator.destroy(node); 422 node.* = .{}; 423 node.base.tree = self.tree; 424 node.base.start = self.token_it.pos; 425 errdefer { 426 for (node.values.items) |entry| { 427 if (entry.value) |val| { 428 val.deinit(self.allocator); 429 self.allocator.destroy(val); 430 } 431 } 432 node.values.deinit(self.allocator); 433 } 434 435 log.debug("(map) begin {s}@{d}", .{ @tagName(self.tree.tokens[node.base.start].id), node.base.start }); 436 437 const col = self.getCol(node.base.start); 438 439 while (true) { 440 self.eatCommentsAndSpace(&.{}); 441 442 // Parse key 443 const key_pos = self.token_it.pos; 444 if (self.getCol(key_pos) < col) { 445 break; 446 } 447 448 const key = self.token_it.next() orelse return error.UnexpectedEof; 449 switch (key.id) { 450 .literal => {}, 451 .doc_start, .doc_end, .eof => { 452 self.token_it.seekBy(-1); 453 break; 454 }, 455 else => { 456 // TODO key not being a literal 457 return error.Unhandled; 458 }, 459 } 460 461 log.debug("(map) key {s}@{d}", .{ self.tree.getRaw(key_pos, key_pos), key_pos }); 462 463 // Separator 464 _ = try self.expectToken(.map_value_ind, &.{ .new_line, .comment }); 465 466 // Parse value 467 const val = try self.value(); 468 errdefer if (val) |v| { 469 v.deinit(self.allocator); 470 self.allocator.destroy(v); 471 }; 472 473 if (val) |v| { 474 if (self.getCol(v.start) < self.getCol(key_pos)) { 475 return error.MalformedYaml; 476 } 477 if (v.cast(Node.Value)) |_| { 478 if (self.getCol(v.start) == self.getCol(key_pos)) { 479 return error.MalformedYaml; 480 } 481 } 482 } 483 484 try node.values.append(self.allocator, .{ 485 .key = key_pos, 486 .value = val, 487 }); 488 } 489 490 node.base.end = self.token_it.pos - 1; 491 492 log.debug("(map) end {s}@{d}", .{ @tagName(self.tree.tokens[node.base.end].id), node.base.end }); 493 494 return &node.base; 495 } 496 497 fn list(self: *Parser) ParseError!*Node { 498 const node = try self.allocator.create(Node.List); 499 errdefer self.allocator.destroy(node); 500 node.* = .{}; 501 node.base.tree = self.tree; 502 node.base.start = self.token_it.pos; 503 errdefer { 504 for (node.values.items) |val| { 505 val.deinit(self.allocator); 506 self.allocator.destroy(val); 507 } 508 node.values.deinit(self.allocator); 509 } 510 511 log.debug("(list) begin {s}@{d}", .{ @tagName(self.tree.tokens[node.base.start].id), node.base.start }); 512 513 while (true) { 514 self.eatCommentsAndSpace(&.{}); 515 516 _ = self.eatToken(.seq_item_ind, &.{}) orelse break; 517 518 const val = (try self.value()) orelse return error.MalformedYaml; 519 try node.values.append(self.allocator, val); 520 } 521 522 node.base.end = self.token_it.pos - 1; 523 524 log.debug("(list) end {s}@{d}", .{ @tagName(self.tree.tokens[node.base.end].id), node.base.end }); 525 526 return &node.base; 527 } 528 529 fn list_bracketed(self: *Parser) ParseError!*Node { 530 const node = try self.allocator.create(Node.List); 531 errdefer self.allocator.destroy(node); 532 node.* = .{}; 533 node.base.tree = self.tree; 534 node.base.start = self.token_it.pos; 535 errdefer { 536 for (node.values.items) |val| { 537 val.deinit(self.allocator); 538 self.allocator.destroy(val); 539 } 540 node.values.deinit(self.allocator); 541 } 542 543 log.debug("(list) begin {s}@{d}", .{ @tagName(self.tree.tokens[node.base.start].id), node.base.start }); 544 545 _ = try self.expectToken(.flow_seq_start, &.{}); 546 547 while (true) { 548 self.eatCommentsAndSpace(&.{.comment}); 549 550 if (self.eatToken(.flow_seq_end, &.{.comment})) |pos| { 551 node.base.end = pos; 552 break; 553 } 554 _ = self.eatToken(.comma, &.{.comment}); 555 556 const val = (try self.value()) orelse return error.MalformedYaml; 557 try node.values.append(self.allocator, val); 558 } 559 560 log.debug("(list) end {s}@{d}", .{ @tagName(self.tree.tokens[node.base.end].id), node.base.end }); 561 562 return &node.base; 563 } 564 565 fn leaf_value(self: *Parser) ParseError!*Node { 566 const node = try self.allocator.create(Node.Value); 567 errdefer self.allocator.destroy(node); 568 node.* = .{ .string_value = .{} }; 569 node.base.tree = self.tree; 570 node.base.start = self.token_it.pos; 571 errdefer node.string_value.deinit(self.allocator); 572 573 // TODO handle multiline strings in new block scope 574 while (self.token_it.next()) |tok| { 575 switch (tok.id) { 576 .single_quoted => { 577 node.base.end = self.token_it.pos - 1; 578 const raw = self.tree.getRaw(node.base.start, node.base.end); 579 try self.parseSingleQuoted(node, raw); 580 break; 581 }, 582 .double_quoted => { 583 node.base.end = self.token_it.pos - 1; 584 const raw = self.tree.getRaw(node.base.start, node.base.end); 585 try self.parseDoubleQuoted(node, raw); 586 break; 587 }, 588 .literal => {}, 589 .space => { 590 const trailing = self.token_it.pos - 2; 591 self.eatCommentsAndSpace(&.{}); 592 if (self.token_it.peek()) |peek| { 593 if (peek.id != .literal) { 594 node.base.end = trailing; 595 const raw = self.tree.getRaw(node.base.start, node.base.end); 596 try node.string_value.appendSlice(self.allocator, raw); 597 break; 598 } 599 } 600 }, 601 else => { 602 self.token_it.seekBy(-1); 603 node.base.end = self.token_it.pos - 1; 604 const raw = self.tree.getRaw(node.base.start, node.base.end); 605 try node.string_value.appendSlice(self.allocator, raw); 606 break; 607 }, 608 } 609 } 610 611 log.debug("(leaf) {s}", .{self.tree.getRaw(node.base.start, node.base.end)}); 612 613 return &node.base; 614 } 615 616 fn eatCommentsAndSpace(self: *Parser, comptime exclusions: []const Token.Id) void { 617 log.debug("eatCommentsAndSpace", .{}); 618 outer: while (self.token_it.next()) |token| { 619 log.debug(" (token '{s}')", .{@tagName(token.id)}); 620 switch (token.id) { 621 .comment, .space, .new_line => |space| { 622 inline for (exclusions) |excl| { 623 if (excl == space) { 624 self.token_it.seekBy(-1); 625 break :outer; 626 } 627 } else continue; 628 }, 629 else => { 630 self.token_it.seekBy(-1); 631 break; 632 }, 633 } 634 } 635 } 636 637 fn eatToken(self: *Parser, id: Token.Id, comptime exclusions: []const Token.Id) ?TokenIndex { 638 log.debug("eatToken('{s}')", .{@tagName(id)}); 639 self.eatCommentsAndSpace(exclusions); 640 const pos = self.token_it.pos; 641 const token = self.token_it.next() orelse return null; 642 if (token.id == id) { 643 log.debug(" (found at {d})", .{pos}); 644 return pos; 645 } else { 646 log.debug(" (not found)", .{}); 647 self.token_it.seekBy(-1); 648 return null; 649 } 650 } 651 652 fn expectToken(self: *Parser, id: Token.Id, comptime exclusions: []const Token.Id) ParseError!TokenIndex { 653 log.debug("expectToken('{s}')", .{@tagName(id)}); 654 return self.eatToken(id, exclusions) orelse error.UnexpectedToken; 655 } 656 657 fn getLine(self: *Parser, index: TokenIndex) usize { 658 return self.line_cols.get(index).?.line; 659 } 660 661 fn getCol(self: *Parser, index: TokenIndex) usize { 662 return self.line_cols.get(index).?.col; 663 } 664 665 fn parseSingleQuoted(self: *Parser, node: *Node.Value, raw: []const u8) ParseError!void { 666 assert(raw[0] == '\'' and raw[raw.len - 1] == '\''); 667 668 const raw_no_quotes = raw[1 .. raw.len - 1]; 669 try node.string_value.ensureTotalCapacity(self.allocator, raw_no_quotes.len); 670 671 var state: enum { 672 start, 673 escape, 674 } = .start; 675 var index: usize = 0; 676 677 while (index < raw_no_quotes.len) : (index += 1) { 678 const c = raw_no_quotes[index]; 679 switch (state) { 680 .start => switch (c) { 681 '\'' => { 682 state = .escape; 683 }, 684 else => { 685 node.string_value.appendAssumeCapacity(c); 686 }, 687 }, 688 .escape => switch (c) { 689 '\'' => { 690 state = .start; 691 node.string_value.appendAssumeCapacity(c); 692 }, 693 else => return error.InvalidEscapeSequence, 694 }, 695 } 696 } 697 } 698 699 fn parseDoubleQuoted(self: *Parser, node: *Node.Value, raw: []const u8) ParseError!void { 700 assert(raw[0] == '"' and raw[raw.len - 1] == '"'); 701 702 const raw_no_quotes = raw[1 .. raw.len - 1]; 703 try node.string_value.ensureTotalCapacity(self.allocator, raw_no_quotes.len); 704 705 var state: enum { 706 start, 707 escape, 708 } = .start; 709 710 var index: usize = 0; 711 while (index < raw_no_quotes.len) : (index += 1) { 712 const c = raw_no_quotes[index]; 713 switch (state) { 714 .start => switch (c) { 715 '\\' => { 716 state = .escape; 717 }, 718 else => { 719 node.string_value.appendAssumeCapacity(c); 720 }, 721 }, 722 .escape => switch (c) { 723 'n' => { 724 state = .start; 725 node.string_value.appendAssumeCapacity('\n'); 726 }, 727 't' => { 728 state = .start; 729 node.string_value.appendAssumeCapacity('\t'); 730 }, 731 '"' => { 732 state = .start; 733 node.string_value.appendAssumeCapacity('"'); 734 }, 735 else => return error.InvalidEscapeSequence, 736 }, 737 } 738 } 739 } 740 }; 741 742 test { 743 std.testing.refAllDecls(@This()); 744 _ = @import("parse/test.zig"); 745 }