From fc469acbf987ac18c996bf053a2a606654654728 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?= <motiejus@jakstys.lt>
Date: Tue, 8 Mar 2022 20:44:32 +0200
Subject: [PATCH] offsets are u64

---
 README.md        | 12 ++++---
 src/sections.zig | 36 +++++++++-----------
 src/user.zig     | 89 ++++++++++++++++++++++--------------------------
 3 files changed, 66 insertions(+), 71 deletions(-)

diff --git a/README.md b/README.md
index 3244329..e3cbb6f 100644
--- a/README.md
+++ b/README.md
@@ -197,13 +197,13 @@ const PackedGroup = packed struct {
 pub const PackedUser = packed struct {
     uid: u32,
     gid: u32,
-    additional_gids_offset: u29,
-    shell_here: bool,
+    padding: u2 = 0,
     shell_len_or_idx: u6,
-    home_len: u6,
+    shell_here: bool,
     name_is_a_suffix: bool,
+    home_len: u6,
     name_len: u5,
-    gecos_len: u8,
+    gecos_len: u11,
     // pseudocode: variable-sized array that will be stored immediately after
     // this struct.
     stringdata []u8;
@@ -215,6 +215,7 @@ pub const PackedUser = packed struct {
 - name (optional).
 - gecos.
 - shell (optional).
+- `additional_gids_offset`: varint.
 
 First byte of home is stored right after the `gecos_len` field, and it's
 length is `home_len`. The same logic applies to all the `stringdata` fields:
@@ -232,6 +233,9 @@ Additionally, there are two "easy" optimizations:
   2. `name_is_a_suffix=false`: name begins one byte after home, and it's length
   is `name_len`.
 
+The last field, `additional_gids_offset`, which is needed least frequently,
+is stored at the end.
+
 Shells
 ------
 
diff --git a/src/sections.zig b/src/sections.zig
index 59ff7c1..7c62d1f 100644
--- a/src/sections.zig
+++ b/src/sections.zig
@@ -35,10 +35,10 @@ const Corpus = struct {
     usersMulti: MultiArrayList(User),
     groupsMulti: MultiArrayList(Group),
 
-    name2user: StringHashMap(usize),
-    name2group: StringHashMap(usize),
-    groupname2users: StringHashMap([]usize),
-    username2groups: StringHashMap([]usize),
+    name2user: StringHashMap(u64),
+    name2group: StringHashMap(u64),
+    groupname2users: StringHashMap([]u64),
+    username2groups: StringHashMap([]u64),
 
     pub fn init(
         baseAllocator: Allocator,
@@ -68,8 +68,8 @@ const Corpus = struct {
         for (groups) |group|
             groupsMulti.appendAssumeCapacity(group);
 
-        var name2user = StringHashMap(usize).init(allocator);
-        var name2group = StringHashMap(usize).init(allocator);
+        var name2user = StringHashMap(u64).init(allocator);
+        var name2group = StringHashMap(u64).init(allocator);
         for (users) |*user, i| {
             var res1 = try name2user.getOrPut(user.name);
             if (res1.found_existing)
@@ -84,17 +84,17 @@ const Corpus = struct {
             res1.value_ptr.* = i;
         }
 
-        var groupname2users = StringHashMap([]usize).init(allocator);
+        var groupname2users = StringHashMap([]u64).init(allocator);
 
         // uses baseAllocator, because it will be freed before
         // returning from this function. This keeps the arena clean.
         var username2groups = StringHashMap(
-            ArrayListUnmanaged(usize),
+            ArrayListUnmanaged(u64),
         ).init(baseAllocator);
         defer username2groups.deinit();
 
         for (groups) |*group, i| {
-            var members = try allocator.alloc(usize, group.members.count());
+            var members = try allocator.alloc(u64, group.members.count());
             members.len = 0;
 
             var it = group.members.iterator();
@@ -108,7 +108,7 @@ const Corpus = struct {
 
                 var groupsOfMember = try username2groups.getOrPut(memberName.*);
                 if (!groupsOfMember.found_existing)
-                    groupsOfMember.value_ptr.* = ArrayListUnmanaged(usize){};
+                    groupsOfMember.value_ptr.* = ArrayListUnmanaged(u64){};
                 try groupsOfMember.value_ptr.*.append(allocator, i);
             }
 
@@ -120,14 +120,14 @@ const Corpus = struct {
 
         var it1 = groupname2users.valueIterator();
         while (it1.next()) |groupUsers| {
-            sort.sort(usize, groupUsers.*, {}, comptime sort.asc(usize));
+            sort.sort(u64, groupUsers.*, {}, comptime sort.asc(u64));
         }
 
         var it2 = username2groups.valueIterator();
         while (it2.next()) |userGroups|
-            sort.sort(usize, userGroups.items, {}, comptime sort.asc(usize));
+            sort.sort(u64, userGroups.items, {}, comptime sort.asc(u64));
 
-        var username2groups_final = StringHashMap([]usize).init(allocator);
+        var username2groups_final = StringHashMap([]u64).init(allocator);
         var it = username2groups.iterator();
         while (it.next()) |elem| {
             const username = elem.key_ptr.*;
@@ -188,7 +188,7 @@ pub fn shellSections(
 
 pub const UserGids = struct {
     // user index -> offset in blob
-    idx2offset: []const u32,
+    idx2offset: []const u64,
     // compressed user gids blob. A blob contains N <= users.len items,
     // an item is:
     //   len: varint
@@ -211,7 +211,7 @@ pub fn userGids(
 ) error{ OutOfMemory, Overflow }!UserGids {
     var blob = ArrayList(u8).init(allocator);
     errdefer blob.deinit();
-    var idx2offset = try allocator.alloc(u32, corpus.users.len);
+    var idx2offset = try allocator.alloc(u64, corpus.users.len);
     errdefer allocator.free(idx2offset);
 
     // zero'th entry is empty, so groupless users can refer to it.
@@ -222,9 +222,7 @@ pub fn userGids(
     defer allocator.free(scratch);
     for (corpus.users) |user, user_idx| {
         if (corpus.username2groups.get(user.name)) |usergroups| {
-            const userOffset = try math.cast(u32, blob.items.len);
-            std.debug.assert(userOffset & 7 == 0);
-            idx2offset[user_idx] = userOffset;
+            idx2offset[user_idx] = blob.items.len;
             scratch = try allocator.realloc(scratch, usergroups.len);
             scratch.len = usergroups.len;
             for (usergroups) |group_idx, i|
@@ -552,7 +550,7 @@ test "userGids" {
         var vit = try compress.VarintSliceIterator(user_gids.blob[offset..]);
         var it = compress.DeltaDecompressionIterator(&vit);
         try testing.expectEqual(it.remaining(), groups.?.len);
-        var i: usize = 0;
+        var i: u64 = 0;
         while (try it.next()) |gid| : (i += 1) {
             try testing.expectEqual(gid, corpus.groups[groups.?[i]].gid);
         }
diff --git a/src/user.zig b/src/user.zig
index 1e5f494..bf75710 100644
--- a/src/user.zig
+++ b/src/user.zig
@@ -2,6 +2,7 @@ const std = @import("std");
 
 const pad = @import("padding.zig");
 const validate = @import("validate.zig");
+const compress = @import("compress.zig");
 const InvalidRecord = validate.InvalidRecord;
 
 const assert = std.debug.assert;
@@ -85,17 +86,16 @@ fn packedUser(comptime ShellIndexType: type) type {
 
         const alignmentBits = 3;
 
-        const InnerSize = @divExact(@bitSizeOf(Inner), 8);
         const Inner = packed struct {
             uid: u32,
             gid: u32,
-            additional_gids_offset: u29,
+            padding: u2 = 0,
             shell_len_or_idx: u6,
             shell_here: bool,
-            home_len: u6,
             name_is_a_suffix: bool,
+            home_len: u6,
             name_len: u5,
-            gecos_len: u8,
+            gecos_len: u11,
 
             fn homeLen(self: *const Inner) usize {
                 return @as(u32, self.home_len) + 1;
@@ -135,8 +135,8 @@ fn packedUser(comptime ShellIndexType: type) type {
                 return @as(u32, self.shell_len_or_idx) + 1;
             }
 
-            // blobLength returns the length of the blob storing string values.
-            fn blobLength(self: *const Inner) usize {
+            // stringLength returns the length of the blob storing string values.
+            fn stringLength(self: *const Inner) usize {
                 var result: usize = self.homeLen() + self.gecosLen();
                 if (!self.name_is_a_suffix)
                     result += self.nameLen();
@@ -150,25 +150,30 @@ fn packedUser(comptime ShellIndexType: type) type {
         // field. Both of those fields are pointers to "our representation" of
         // that field.
         inner: *const Inner,
-        userdata: []const u8,
+        bytes: []const u8,
+        additional_gids_offset: u64,
 
         pub const Entry = struct {
             user: Self,
             next: ?[]const u8,
         };
 
-        pub fn fromBytes(bytes: []const u8) Entry {
+        // TODO(motiejus) provide a way to return an entry without decoding the
+        // additional_gids_offset:
+        // - will not return the 'next' slice.
+        // - cannot return an Overflow error.
+        pub fn fromBytes(bytes: []const u8) error{Overflow}!Entry {
             const inner = mem.bytesAsValue(
                 Inner,
-                // Should use InnerSize instead of sizeOf, see
-                // https://github.com/ziglang/zig/issues/10958
                 bytes[0..@sizeOf(Inner)],
             );
 
-            const startBlob = InnerSize;
-            const endBlob = startBlob + inner.blobLength();
+            const start_blob = @sizeOf(Inner);
+            const end_strings = start_blob + inner.stringLength();
+            const gids_offset = try compress.uvarint(bytes[end_strings..]);
+            const end_blob = end_strings + gids_offset.bytes_read;
 
-            const nextStart = pad.roundUp(usize, alignmentBits, endBlob);
+            const nextStart = pad.roundUp(usize, alignmentBits, end_blob);
             var next: ?[]const u8 = null;
             if (nextStart < bytes.len)
                 next = bytes[nextStart..];
@@ -176,7 +181,8 @@ fn packedUser(comptime ShellIndexType: type) type {
             return Entry{
                 .user = Self{
                     .inner = inner,
-                    .userdata = bytes[startBlob..endBlob],
+                    .bytes = bytes[start_blob..end_blob],
+                    .additional_gids_offset = gids_offset.value,
                 },
                 .next = next,
             };
@@ -186,9 +192,9 @@ fn packedUser(comptime ShellIndexType: type) type {
             section: ?[]const u8,
             shellIndex: Idx2ShellProto,
 
-            pub fn next(it: *Iterator) ?Self {
+            pub fn next(it: *Iterator) error{Overflow}!?Self {
                 if (it.section) |section| {
-                    const entry = Self.fromBytes(section);
+                    const entry = try Self.fromBytes(section);
                     it.section = entry.next;
                     return entry.user;
                 }
@@ -203,13 +209,12 @@ fn packedUser(comptime ShellIndexType: type) type {
         // packTo packs the User record and copies it to the given byte slice.
         // The slice must have at least maxRecordSize() bytes available. The
         // slice is passed as a pointer, so it can be mutated.
-        const packErr = InvalidRecord || Allocator.Error;
         pub fn packTo(
             arr: *ArrayList(u8),
             user: User,
-            additional_gids_offset: u29,
+            additional_gids_offset: usize,
             idxFn: ShellIndexType,
-        ) packErr!void {
+        ) error{ InvalidRecord, OutOfMemory }!void {
             // function arguments are consts. We need to mutate the underlying
             // slice, so passing it via pointer instead.
             const home_len = try validate.downCast(u6, user.home.len - 1);
@@ -225,7 +230,6 @@ fn packedUser(comptime ShellIndexType: type) type {
             const inner = Inner{
                 .uid = user.uid,
                 .gid = user.gid,
-                .additional_gids_offset = additional_gids_offset,
                 .shell_here = idxFn.get(user.shell) == null,
                 .shell_len_or_idx = idxFn.get(user.shell) orelse shell_len,
                 .home_len = home_len,
@@ -235,9 +239,9 @@ fn packedUser(comptime ShellIndexType: type) type {
             };
             const innerBytes = mem.asBytes(&inner);
 
-            // innerBytes.len is longer than InnerSize. We want to copy
-            // only the InnerSize-number of bytes.
-            try arr.*.appendSlice(innerBytes[0..InnerSize]);
+            // innerBytes.len is exactly @sizeOf(Inner), so this slice copies
+            // the whole packed header and nothing more.
+            try arr.*.appendSlice(innerBytes[0..@sizeOf(Inner)]);
             try arr.*.appendSlice(user.home);
 
             if (!inner.name_is_a_suffix)
@@ -245,14 +249,15 @@ fn packedUser(comptime ShellIndexType: type) type {
             try arr.*.appendSlice(user.gecos);
             if (inner.shell_here)
                 try arr.*.appendSlice(user.shell);
+            try compress.appendUvarint(arr, additional_gids_offset);
             try pad.arrayList(arr, alignmentBits);
         }
 
         // maxSize is the maximum number of records a PackedUser can take
-        // (struct + userdata).
+        // (struct + strings).
         pub fn maxSize() usize {
             comptime {
-                const unpadded = InnerSize +
+                const unpadded = @sizeOf(Inner) +
                     math.maxInt(u6) + 1 + // home
                     math.maxInt(u5) + 1 + // name
                     math.maxInt(u6) + 1 + // shell
@@ -269,55 +274,43 @@ fn packedUser(comptime ShellIndexType: type) type {
             return self.inner.gid;
         }
 
-        pub fn additionalGidsOffset(self: Self) u29 {
-            return self.inner.additional_gids_offset;
+        pub fn additionalGidsOffset(self: Self) u64 {
+            return self.additional_gids_offset;
         }
 
         pub fn home(self: Self) []const u8 {
-            return self.userdata[0..self.inner.homeLen()];
+            return self.bytes[0..self.inner.homeLen()];
         }
 
         pub fn name(self: Self) []const u8 {
             const name_pos = self.inner.nameStart();
             const name_len = self.inner.nameLen();
-            return self.userdata[name_pos .. name_pos + name_len];
+            return self.bytes[name_pos .. name_pos + name_len];
         }
 
         pub fn gecos(self: Self) []const u8 {
             const gecos_pos = self.inner.gecosStart();
             const gecos_len = self.inner.gecosLen();
-            return self.userdata[gecos_pos .. gecos_pos + gecos_len];
+            return self.bytes[gecos_pos .. gecos_pos + gecos_len];
         }
 
         pub fn shell(self: Self, idxFn: Idx2ShellProto) []const u8 {
             if (self.inner.shell_here) {
                 const shell_pos = self.inner.maybeShellStart();
                 const shell_len = self.inner.shellLen();
-                return self.userdata[shell_pos .. shell_pos + shell_len];
+                return self.bytes[shell_pos .. shell_pos + shell_len];
             }
             return idxFn(self.inner.shell_len_or_idx);
         }
-
-        // mutable only: this function will refuse to compile otherwise.
-        pub fn setAdditionalGidsOffset(self: Self, new: u29) void {
-            self.inner.additional_gids_offset = new;
-        }
     };
 }
 
 const testing = std.testing;
 
 test "PackedUser internal and external alignment" {
-    // External padding (alignmentBits) must be higher or equal to
-    // the "internal" PackedUser alignment. By aligning PackedUser we are also
-    // working around https://github.com/ziglang/zig/issues/10958 ; PackedUser
-    // cannot be converted from/to [@bitSizeOf(PackedUser)/8]u8;
-    // asBytes/bytesAsValue use @sizeOf, which is larger. Now we are putting no
-    // more than 1, but it probably could be higher.
     try testing.expectEqual(
-        8,
-        @sizeOf(PackedUserHash.Inner) * 8 -
-            @bitSizeOf(PackedUserHash.Inner),
+        @sizeOf(PackedUserHash.Inner) * 8,
+        @bitSizeOf(PackedUserHash.Inner),
     );
 }
 
@@ -376,15 +369,15 @@ test "construct PackedUser section" {
         .shell = "/",
     } };
     for (users) |user|
-        try PackedUserTest.packTo(&buf, user, math.maxInt(u29), TestShellIndex{});
+        try PackedUserTest.packTo(&buf, user, math.maxInt(u64), TestShellIndex{});
 
     var i: u29 = 0;
     var it1 = PackedUserTest.iterator(buf.items, testShell);
-    while (it1.next()) |user| : (i += 1) {
+    while (try it1.next()) |user| : (i += 1) {
         try testing.expectEqual(users[i].uid, user.uid());
         try testing.expectEqual(users[i].gid, user.gid());
         try testing.expectEqual(
-            @as(u29, math.maxInt(u29)),
+            @as(u64, math.maxInt(u64)),
             user.additionalGidsOffset(),
         );
         try testing.expectEqualStrings(users[i].name, user.name());