From fc469acbf987ac18c996bf053a2a606654654728 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?= Date: Tue, 8 Mar 2022 20:44:32 +0200 Subject: [PATCH] offsets are u64 --- README.md | 12 ++++--- src/sections.zig | 36 +++++++++----------- src/user.zig | 89 ++++++++++++++++++++++-------------------------- 3 files changed, 66 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 3244329..e3cbb6f 100644 --- a/README.md +++ b/README.md @@ -197,13 +197,13 @@ const PackedGroup = packed struct { pub const PackedUser = packed struct { uid: u32, gid: u32, - additional_gids_offset: u29, - shell_here: bool, + padding: u2 = 0, shell_len_or_idx: u6, - home_len: u6, + shell_here: bool, name_is_a_suffix: bool, + home_len: u6, name_len: u5, - gecos_len: u8, + gecos_len: u11, // pseudocode: variable-sized array that will be stored immediately after // this struct. stringdata []u8; @@ -215,6 +215,7 @@ pub const PackedUser = packed struct { - name (optional). - gecos. - shell (optional). +- `additional_gids_offset`: varint. First byte of home is stored right after the `gecos_len` field, and it's length is `home_len`. The same logic applies to all the `stringdata` fields: @@ -232,6 +233,9 @@ Additionally, there are two "easy" optimizations: 2. `name_is_a_suffix=false`: name begins one byte after home, and it's length is `name_len`. +The last field, `additional_gids_offset`, which is needed least frequently, +is stored at the end. 
+ Shells ------ diff --git a/src/sections.zig b/src/sections.zig index 59ff7c1..7c62d1f 100644 --- a/src/sections.zig +++ b/src/sections.zig @@ -35,10 +35,10 @@ const Corpus = struct { usersMulti: MultiArrayList(User), groupsMulti: MultiArrayList(Group), - name2user: StringHashMap(usize), - name2group: StringHashMap(usize), - groupname2users: StringHashMap([]usize), - username2groups: StringHashMap([]usize), + name2user: StringHashMap(u64), + name2group: StringHashMap(u64), + groupname2users: StringHashMap([]u64), + username2groups: StringHashMap([]u64), pub fn init( baseAllocator: Allocator, @@ -68,8 +68,8 @@ const Corpus = struct { for (groups) |group| groupsMulti.appendAssumeCapacity(group); - var name2user = StringHashMap(usize).init(allocator); - var name2group = StringHashMap(usize).init(allocator); + var name2user = StringHashMap(u64).init(allocator); + var name2group = StringHashMap(u64).init(allocator); for (users) |*user, i| { var res1 = try name2user.getOrPut(user.name); if (res1.found_existing) @@ -84,17 +84,17 @@ const Corpus = struct { res1.value_ptr.* = i; } - var groupname2users = StringHashMap([]usize).init(allocator); + var groupname2users = StringHashMap([]u64).init(allocator); // uses baseAllocator, because it will be freed before // returning from this function. This keeps the arena clean. 
var username2groups = StringHashMap( - ArrayListUnmanaged(usize), + ArrayListUnmanaged(u64), ).init(baseAllocator); defer username2groups.deinit(); for (groups) |*group, i| { - var members = try allocator.alloc(usize, group.members.count()); + var members = try allocator.alloc(u64, group.members.count()); members.len = 0; var it = group.members.iterator(); @@ -108,7 +108,7 @@ const Corpus = struct { var groupsOfMember = try username2groups.getOrPut(memberName.*); if (!groupsOfMember.found_existing) - groupsOfMember.value_ptr.* = ArrayListUnmanaged(usize){}; + groupsOfMember.value_ptr.* = ArrayListUnmanaged(u64){}; try groupsOfMember.value_ptr.*.append(allocator, i); } @@ -120,14 +120,14 @@ const Corpus = struct { var it1 = groupname2users.valueIterator(); while (it1.next()) |groupUsers| { - sort.sort(usize, groupUsers.*, {}, comptime sort.asc(usize)); + sort.sort(u64, groupUsers.*, {}, comptime sort.asc(u64)); } var it2 = username2groups.valueIterator(); while (it2.next()) |userGroups| - sort.sort(usize, userGroups.items, {}, comptime sort.asc(usize)); + sort.sort(u64, userGroups.items, {}, comptime sort.asc(u64)); - var username2groups_final = StringHashMap([]usize).init(allocator); + var username2groups_final = StringHashMap([]u64).init(allocator); var it = username2groups.iterator(); while (it.next()) |elem| { const username = elem.key_ptr.*; @@ -188,7 +188,7 @@ pub fn shellSections( pub const UserGids = struct { // user index -> offset in blob - idx2offset: []const u32, + idx2offset: []const u64, // compressed user gids blob. A blob contains N <= users.len items, // an item is: // len: varint @@ -211,7 +211,7 @@ pub fn userGids( ) error{ OutOfMemory, Overflow }!UserGids { var blob = ArrayList(u8).init(allocator); errdefer blob.deinit(); - var idx2offset = try allocator.alloc(u32, corpus.users.len); + var idx2offset = try allocator.alloc(u64, corpus.users.len); errdefer allocator.free(idx2offset); // zero'th entry is empty, so groupless users can refer to it. 
@@ -222,9 +222,7 @@ pub fn userGids( defer allocator.free(scratch); for (corpus.users) |user, user_idx| { if (corpus.username2groups.get(user.name)) |usergroups| { - const userOffset = try math.cast(u32, blob.items.len); - std.debug.assert(userOffset & 7 == 0); - idx2offset[user_idx] = userOffset; + idx2offset[user_idx] = blob.items.len; scratch = try allocator.realloc(scratch, usergroups.len); scratch.len = usergroups.len; for (usergroups) |group_idx, i| @@ -552,7 +550,7 @@ test "userGids" { var vit = try compress.VarintSliceIterator(user_gids.blob[offset..]); var it = compress.DeltaDecompressionIterator(&vit); try testing.expectEqual(it.remaining(), groups.?.len); - var i: usize = 0; + var i: u64 = 0; while (try it.next()) |gid| : (i += 1) { try testing.expectEqual(gid, corpus.groups[groups.?[i]].gid); } diff --git a/src/user.zig b/src/user.zig index 1e5f494..bf75710 100644 --- a/src/user.zig +++ b/src/user.zig @@ -2,6 +2,7 @@ const std = @import("std"); const pad = @import("padding.zig"); const validate = @import("validate.zig"); +const compress = @import("compress.zig"); const InvalidRecord = validate.InvalidRecord; const assert = std.debug.assert; @@ -85,17 +86,16 @@ fn packedUser(comptime ShellIndexType: type) type { const alignmentBits = 3; - const InnerSize = @divExact(@bitSizeOf(Inner), 8); const Inner = packed struct { uid: u32, gid: u32, - additional_gids_offset: u29, + padding: u2 = 0, shell_len_or_idx: u6, shell_here: bool, - home_len: u6, name_is_a_suffix: bool, + home_len: u6, name_len: u5, - gecos_len: u8, + gecos_len: u11, fn homeLen(self: *const Inner) usize { return @as(u32, self.home_len) + 1; @@ -135,8 +135,8 @@ fn packedUser(comptime ShellIndexType: type) type { return @as(u32, self.shell_len_or_idx) + 1; } - // blobLength returns the length of the blob storing string values. - fn blobLength(self: *const Inner) usize { + // stringLength returns the length of the blob storing string values. 
+ fn stringLength(self: *const Inner) usize { var result: usize = self.homeLen() + self.gecosLen(); if (!self.name_is_a_suffix) result += self.nameLen(); @@ -150,25 +150,30 @@ fn packedUser(comptime ShellIndexType: type) type { // field. Both of those fields are pointers to "our representation" of // that field. inner: *const Inner, - userdata: []const u8, + bytes: []const u8, + additional_gids_offset: u64, pub const Entry = struct { user: Self, next: ?[]const u8, }; - pub fn fromBytes(bytes: []const u8) Entry { + // TODO(motiejus) provide a way to return an entry without decoding the + // additional_gids_offset: + // - will not return the 'next' slice. + // - cannot throw an Overflow error. + pub fn fromBytes(bytes: []const u8) error{Overflow}!Entry { const inner = mem.bytesAsValue( Inner, - // Should use InnerSize instead of sizeOf, see - // https://github.com/ziglang/zig/issues/10958 bytes[0..@sizeOf(Inner)], ); - const startBlob = InnerSize; - const endBlob = startBlob + inner.blobLength(); + const start_blob = @sizeOf(Inner); + const end_strings = start_blob + inner.stringLength(); + const gids_offset = try compress.uvarint(bytes[end_strings..]); + const end_blob = end_strings + gids_offset.bytes_read; - const nextStart = pad.roundUp(usize, alignmentBits, endBlob); + const nextStart = pad.roundUp(usize, alignmentBits, end_blob); var next: ?[]const u8 = null; if (nextStart < bytes.len) next = bytes[nextStart..]; @@ -176,7 +181,8 @@ fn packedUser(comptime ShellIndexType: type) type { return Entry{ .user = Self{ .inner = inner, - .userdata = bytes[startBlob..endBlob], + .bytes = bytes[start_blob..end_blob], + .additional_gids_offset = gids_offset.value, }, .next = next, }; @@ -186,9 +192,9 @@ fn packedUser(comptime ShellIndexType: type) type { section: ?[]const u8, shellIndex: Idx2ShellProto, - pub fn next(it: *Iterator) ?Self { + pub fn next(it: *Iterator) error{Overflow}!?Self { if (it.section) |section| { - const entry = Self.fromBytes(section); + const entry 
= try Self.fromBytes(section); it.section = entry.next; return entry.user; } @@ -203,13 +209,12 @@ fn packedUser(comptime ShellIndexType: type) type { // packTo packs the User record and copies it to the given byte slice. // The slice must have at least maxRecordSize() bytes available. The // slice is passed as a pointer, so it can be mutated. - const packErr = InvalidRecord || Allocator.Error; pub fn packTo( arr: *ArrayList(u8), user: User, - additional_gids_offset: u29, + additional_gids_offset: usize, idxFn: ShellIndexType, - ) packErr!void { + ) error{ InvalidRecord, OutOfMemory }!void { // function arguments are consts. We need to mutate the underlying // slice, so passing it via pointer instead. const home_len = try validate.downCast(u6, user.home.len - 1); @@ -225,7 +230,6 @@ fn packedUser(comptime ShellIndexType: type) type { const inner = Inner{ .uid = user.uid, .gid = user.gid, - .additional_gids_offset = additional_gids_offset, .shell_here = idxFn.get(user.shell) == null, .shell_len_or_idx = idxFn.get(user.shell) orelse shell_len, .home_len = home_len, @@ -235,9 +239,9 @@ fn packedUser(comptime ShellIndexType: type) type { }; const innerBytes = mem.asBytes(&inner); - // innerBytes.len is longer than InnerSize. We want to copy - // only the InnerSize-number of bytes. - try arr.*.appendSlice(innerBytes[0..InnerSize]); + // innerBytes.len equals @sizeOf(Inner) (mem.asBytes yields exactly + // that many bytes), so this copies the whole packed struct. + try arr.*.appendSlice(innerBytes[0..@sizeOf(Inner)]); try arr.*.appendSlice(user.home); if (!inner.name_is_a_suffix) @@ -245,14 +249,15 @@ fn packedUser(comptime ShellIndexType: type) type { try arr.*.appendSlice(user.gecos); if (inner.shell_here) try arr.*.appendSlice(user.shell); + try compress.appendUvarint(arr, additional_gids_offset); try pad.arrayList(arr, alignmentBits); } // maxSize is the maximum number of records a PackedUser can take - // (struct + userdata). + // (struct + strings). 
pub fn maxSize() usize { comptime { - const unpadded = InnerSize + + const unpadded = @sizeOf(Inner) + math.maxInt(u6) + 1 + // home math.maxInt(u5) + 1 + // name math.maxInt(u6) + 1 + // shell @@ -269,55 +274,43 @@ fn packedUser(comptime ShellIndexType: type) type { return self.inner.gid; } - pub fn additionalGidsOffset(self: Self) u29 { - return self.inner.additional_gids_offset; + pub fn additionalGidsOffset(self: Self) u64 { + return self.additional_gids_offset; } pub fn home(self: Self) []const u8 { - return self.userdata[0..self.inner.homeLen()]; + return self.bytes[0..self.inner.homeLen()]; } pub fn name(self: Self) []const u8 { const name_pos = self.inner.nameStart(); const name_len = self.inner.nameLen(); - return self.userdata[name_pos .. name_pos + name_len]; + return self.bytes[name_pos .. name_pos + name_len]; } pub fn gecos(self: Self) []const u8 { const gecos_pos = self.inner.gecosStart(); const gecos_len = self.inner.gecosLen(); - return self.userdata[gecos_pos .. gecos_pos + gecos_len]; + return self.bytes[gecos_pos .. gecos_pos + gecos_len]; } pub fn shell(self: Self, idxFn: Idx2ShellProto) []const u8 { if (self.inner.shell_here) { const shell_pos = self.inner.maybeShellStart(); const shell_len = self.inner.shellLen(); - return self.userdata[shell_pos .. shell_pos + shell_len]; + return self.bytes[shell_pos .. shell_pos + shell_len]; } return idxFn(self.inner.shell_len_or_idx); } - - // mutable only: this function will refuse to compile otherwise. - pub fn setAdditionalGidsOffset(self: Self, new: u29) void { - self.inner.additional_gids_offset = new; - } }; } const testing = std.testing; test "PackedUser internal and external alignment" { - // External padding (alignmentBits) must be higher or equal to - // the "internal" PackedUser alignment. 
By aligning PackedUser we are also - // working around https://github.com/ziglang/zig/issues/10958 ; PackedUser - // cannot be converted from/to [@bitSizeOf(PackedUser)/8]u8; - // asBytes/bytesAsValue use @sizeOf, which is larger. Now we are putting no - // more than 1, but it probably could be higher. try testing.expectEqual( - 8, - @sizeOf(PackedUserHash.Inner) * 8 - - @bitSizeOf(PackedUserHash.Inner), + @sizeOf(PackedUserHash.Inner) * 8, + @bitSizeOf(PackedUserHash.Inner), ); } @@ -376,15 +369,15 @@ test "construct PackedUser section" { .shell = "/", } }; for (users) |user| - try PackedUserTest.packTo(&buf, user, math.maxInt(u29), TestShellIndex{}); + try PackedUserTest.packTo(&buf, user, math.maxInt(u64), TestShellIndex{}); var i: u29 = 0; var it1 = PackedUserTest.iterator(buf.items, testShell); - while (it1.next()) |user| : (i += 1) { + while (try it1.next()) |user| : (i += 1) { try testing.expectEqual(users[i].uid, user.uid()); try testing.expectEqual(users[i].gid, user.gid()); try testing.expectEqual( - @as(u29, math.maxInt(u29)), + @as(u64, math.maxInt(u64)), user.additionalGidsOffset(), ); try testing.expectEqualStrings(users[i].name, user.name());