From 4e36d7850ecea9eabd6f89f706cc5bbe2515f089 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?= <motiejus@jakstys.lt>
Date: Thu, 17 Mar 2022 16:50:41 +0100
Subject: [PATCH] rewrite shells

- Shell is up to 256 bytes long.
- Store up to 255 shells in the Shells area.
- Remove padding from the User struct.
---
 README.md        |  46 ++++++++++----------
 src/group.zig    |  13 +++---
 src/header.zig   |   6 ---
 src/sections.zig |   3 +-
 src/shell.zig    | 108 +++++++++++++++++++----------------------------
 src/user.zig     |  39 ++++-------------
 6 files changed, 81 insertions(+), 134 deletions(-)

diff --git a/README.md b/README.md
index 1aa00e0..49bdcb9 100644
--- a/README.md
+++ b/README.md
@@ -65,9 +65,12 @@ regions are shared. Turbonss reads do not consume any heap space.
 Tight packing places some constraints on the underlying data:
 
 - Permitted length of username and groupname: 1-32 bytes.
-- Permitted length of shell and home: 1-64 bytes.
+- Permitted length of shell: 1-256 bytes; of home: 1-64 bytes.
 - Permitted comment ("gecos") length: 0-255 bytes.
 - User name, groupname, gecos and shell must be utf8-encoded.
+- Users and Groups sections are each up to 2^35B (~34GB) large. Assuming an
+  "average" user record takes 50 bytes, the Users section would fit ~660M users.
+  The worst-case upper bound is left as an exercise to the reader.
 
 Sorting is stable. In v0:
 - Groups are sorted by gid, ascending.
@@ -173,7 +176,8 @@ the beginning of the section.
 ```
 const PackedGroup = packed struct {
     gid: u32,
-    groupname_len: u8, // max is 32, but have too much space here.
+    padding: u3,
+    groupname_len: u5,
 }
 ```
 
@@ -186,8 +190,7 @@ PackedUser is a bit more involved:
 pub const PackedUser = packed struct {
     uid: u32,
     gid: u32,
-    padding: u2 = 0,
-    shell_len_or_idx: u6,
+    shell_len_or_idx: u8,
     shell_here: bool,
     name_is_a_suffix: bool,
     home_len: u6,
@@ -219,8 +222,8 @@ PackedUser employs two "simple" compression techniques:
   2. `name_is_a_suffix=false`: name begins one byte after home, and it's length
   is `name_len`.
 
-The last field `additional_gids_offset: varint` points to the `additional_gids` section for
-this user.
+The last field `additional_gids_offset: varint` points to the `additional_gids`
+section for this user.
 
 Shells
 ------
@@ -231,23 +234,20 @@ others. Therefore, "shells" have an optimization: they can be pointed by in the
 external list, or, if they are unique to the user, reside among the user's
 data.
 
-63 most popular shells (i.e. referred to by at least two User entries) are
+255 most popular shells (i.e. referred to by at least two User entries) are
 stored externally in "Shells" area. The less popular ones are stored with
 userdata.
 
 Shells section consists of two sub-sections: the index and the blob. The index
-is a list of structs which point to a location in the "blob" area:
+is an array of `u16` offsets into the blob: the i'th shell starts at byte
+`offsets[i]` and ends just before byte `offsets[i+1]`. If there is at least one
+shell in the shell section, the index has one extra sentinel element at the
+end, which holds the total length of the shell blob (the end of the last shell).
 
-```
-const ShellIndex = struct {
-    offset: u10,
-    len: u6,
-};
-```
-
-In the user's struct `shell_here=true` signifies that the shell is stored with
-userdata, and it's length is `shell_len_or_idx`. `shell_here=false` means it is
-stored in the `Shells` section, and it's index is `shell_len_or_idx`.
+`shell_here=true` in the User struct means the shell is stored with userdata,
+and its length is `shell_len_or_idx`. `shell_here=false` means it is stored in
+the `Shells` section, and its index is `shell_len_or_idx` (and the actual
+string start and end offsets are resolved as described in the paragraph above).
 
 Variable-length integers (varints)
 ----------------------------------
@@ -264,7 +264,6 @@ There are two group memberships at play:
 1. Given a group (gid/name), resolve the members' names (e.g. `getgrgid`).
 2. Given a username, resolve user's group gids (for `initgroups(3)`).
 
-
 When group's memberships are resolved in (1), the same call also requires other
 group information: gid and group name. Therefore it makes sense to store a
 pointer to the group members in the group information itself. However, the
@@ -323,9 +322,10 @@ will be pointing to a number `n ∈ [0,N-1]`, regardless whether the value was i
 the initial dictionary. Therefore one must always confirm, after calculating
 the hash, that the key matches what's been hashed.
 
-`idx_*` sections are of type `[]PackedIntArray(u29)` and are pointing to the
-respective `Groups` and `Users` entries (from the beginning of the respective
-section). Since User and Group records are 8-byte aligned, `u29` is used.
+`idx_*` sections are of type `[]u32` and are pointing to the respective
+`Groups` and `Users` entries (from the beginning of the respective section).
+Since User and Group records are 8-byte aligned, the actual offset to the
+record is acquired by left-shifting this value by 3 bits.
 
 Database file structure
 -----------------------
@@ -344,7 +344,7 @@ idx_groupname2group   len(group)*4     bdz->offset Groups
 idx_uid2user          len(user)*4      bdz->offset Users
 idx_name2user         len(user)*4      bdz->offset Users
 shell_index           len(shells)*2    shell index array
-shell_blob            <= 4032          shell data blob (max 63*64 bytes)
+shell_blob            <= 65280         shell data blob (max 255*256 bytes)
 groups                ?                packed Group entries (8b padding)
 users                 ?                packed User entries (8b padding)
 groupmembers          ?                per-group delta varint memberlist (no padding)
diff --git a/src/group.zig b/src/group.zig
index 1377b33..c8c22c6 100644
--- a/src/group.zig
+++ b/src/group.zig
@@ -42,10 +42,11 @@ pub const PackedGroup = struct {
 
     const Inner = packed struct {
         gid: u32,
-        groupname_len: u8,
+        padding: u3 = 0,
+        groupname_len: u5,
 
         pub fn groupnameLen(self: *const Inner) usize {
-            return self.groupname_len + 1;
+            return @as(usize, self.groupname_len) + 1;
         }
     };
 
@@ -120,13 +121,9 @@ pub const PackedGroup = struct {
         group: GroupStored,
     ) packErr!void {
         std.debug.assert(arr.items.len & 7 == 0);
-        const groupname_len = try validate.downCast(u5, group.name.len - 1);
         try validate.utf8(group.name);
-        const inner = Inner{
-            .gid = group.gid,
-            .groupname_len = groupname_len,
-        };
-
+        const len = try validate.downCast(u5, group.name.len - 1);
+        const inner = Inner{ .gid = group.gid, .groupname_len = len };
         try arr.*.appendSlice(mem.asBytes(&inner));
         try arr.*.appendSlice(group.name);
         try compress.appendUvarint(arr, group.members_offset);
diff --git a/src/header.zig b/src/header.zig
index 2eb49e9..9872271 100644
--- a/src/header.zig
+++ b/src/header.zig
@@ -110,12 +110,6 @@ test "header pack, unpack and validation" {
         try testing.expectError(error.InvalidBom, Header.init(header.asArray()));
     }
 
-    {
-        var header = goodHeader;
-        header.num_shells = shell.max_shells + 1;
-        try testing.expectError(error.TooManyShells, Header.init(header.asArray()));
-    }
-
     {
         var header = goodHeader;
         header.offset_bdz_uid2user = 65;
diff --git a/src/sections.zig b/src/sections.zig
index 97fb6c3..9bff7db 100644
--- a/src/sections.zig
+++ b/src/sections.zig
@@ -234,7 +234,7 @@ pub fn usersSection(
             &blob,
             user,
             gids.idx2offset[i],
-            shells.indices,
+            shells.shell2idx,
         );
         try pad.arrayList(&blob, userImport.PackedUser.alignment_bits);
     }
@@ -439,6 +439,7 @@ pub const AllSections = struct {
         var groups = try groupsSection(allocator, corpus, group_members.idx2offset);
         errdefer groups.deinit(allocator);
 
+        // TODO: these indices must point to the *offsets*, not the indices in "users"
         var idx_gid2group = try bdzIdx(u32, allocator, bdz_gid, gids);
         errdefer allocator.free(idx_gid2group);
 
diff --git a/src/shell.zig b/src/shell.zig
index ec1c09e..ff684f1 100644
--- a/src/shell.zig
+++ b/src/shell.zig
@@ -1,5 +1,4 @@
 const std = @import("std");
-const pad = @import("padding.zig");
 const Allocator = std.mem.Allocator;
 const PriorityDequeue = std.PriorityDequeue;
 const StringArrayHashMap = std.StringArrayHashMap;
@@ -7,40 +6,24 @@ const StringHashMap = std.StringHashMap;
 const BoundedArray = std.BoundedArray;
 const StringContext = std.hash_map.StringContext;
 
-// maxShells is the maximum number of "popular" shells.
-pub const max_shells = 63;
-pub const max_shell_len = 64;
-pub const shell_alignment_bits = 2; // bits
-
-// ShellIndex is an index to the shell strings. As shell can be up to 64 bytes
-// (1<<6), maximum number of shells is 63 (1<<6-1), the maximum location offset
-// is 1<<12. To make location resolvable in 10 bits, all shells will be padded
-// to 4 bytes.
-// The actual shell length is len+1: we don't allow empty shells, and the real
-// length of the shell is 1-64 bytes.
-pub const ShellIndex = packed struct {
-    offset: u10,
-    len: u6,
-};
+pub const max_shells = 255;
+pub const max_shell_len = 256;
 
 // ShellReader interprets "Shell Index" and "Shell Blob" sections.
 pub const ShellReader = struct {
-    section_index: []const ShellIndex,
-    section_blob: []const u8,
+    index: []const u16,
+    blob: []const u8,
 
-    pub fn init(index: []const u8, blob: []const u8) ShellReader {
+    pub fn init(index: []align(2) const u8, blob: []const u8) ShellReader {
         return ShellReader{
-            .section_index = std.mem.bytesAsSlice(ShellIndex, index),
-            .section_blob = blob,
+            .index = std.mem.bytesAsSlice(u16, index),
+            .blob = blob,
         };
     }
 
     // get returns a shell at the given index.
-    pub fn get(self: *const ShellReader, idx: u6) []const u8 {
-        const shell_index = self.section_index[idx];
-        const start = shell_index.offset << 2;
-        const end = start + shell_index.len + 1;
-        return self.section_blob[start..end];
+    pub fn get(self: *const ShellReader, idx: u8) []const u8 {
+        return self.blob[self.index[idx]..self.index[idx + 1]];
     }
 };
 
@@ -55,45 +38,42 @@ pub const ShellWriter = struct {
     };
 
     pub const ShellSections = struct {
-        index: BoundedArray(ShellIndex, max_shells),
-        blob: BoundedArray(u8, max_shells * max_shell_len),
-        indices: StringHashMap(u6),
+        // index maps the i'th shell to its start offset in blob; the i'th
+        // shell occupies blob[index[i]..index[i+1]] (end is exclusive).
+        index: BoundedArray(u16, max_shells),
+        // blob is all shells concatenated, without separators. For N shells
+        // the index holds N+1 entries: the last one is a sentinel equal to
+        // blob.len, so the length of the last shell can be calculated too.
+        blob: BoundedArray(u8, (max_shells + 1) * max_shell_len),
+        // shell2idx helps translate a shell (string) to its index.
+        shell2idx: StringHashMap(u8),
 
         // initializes and populates shell sections. All strings are copied,
         // nothing is owned.
-        pub const initErr = Allocator.Error || error{Overflow};
         pub fn init(
             allocator: Allocator,
             shells: BoundedArray([]const u8, max_shells),
-        ) initErr!ShellSections {
+        ) error{ Overflow, OutOfMemory }!ShellSections {
             var self = ShellSections{
-                .index = try BoundedArray(ShellIndex, max_shells).init(shells.len),
-                .blob = try BoundedArray(u8, max_shells * max_shell_len).init(0),
-                .indices = StringHashMap(u6).init(allocator),
+                .index = try BoundedArray(u16, max_shells).init(shells.len),
+                .blob = try BoundedArray(u8, (max_shells + 1) * max_shell_len).init(0),
+                .shell2idx = StringHashMap(u8).init(allocator),
             };
-            errdefer self.indices.deinit();
-            var full_offset: u12 = 0;
-            var idx: u6 = 0;
-            while (idx < shells.len) : (idx += 1) {
-                const len = try std.math.cast(u6, shells.get(idx).len);
-                try self.blob.appendSlice(shells.get(idx));
-                const our_shell = self.blob.constSlice()[full_offset .. full_offset + len];
-                try self.indices.put(our_shell, idx);
-                std.debug.assert(full_offset & 3 == 0);
-                self.index.set(idx, ShellIndex{
-                    .offset = try std.math.cast(u10, full_offset >> 2),
-                    .len = len - 1,
-                });
+            if (shells.len == 0) return self;
 
-                full_offset += len;
-                const padding = pad.roundUpPadding(u12, shell_alignment_bits, full_offset);
-                full_offset += padding;
-                try self.blob.appendNTimes(0, padding);
+            errdefer self.shell2idx.deinit();
+            for (shells.constSlice()) |shell, idx| {
+                const idx8 = @intCast(u8, idx);
+                const offset = @intCast(u16, self.blob.len);
+                try self.blob.appendSlice(shell);
+                try self.shell2idx.put(self.blob.constSlice()[offset..], idx8);
+                self.index.set(idx8, offset);
             }
+            try self.index.append(@intCast(u16, self.blob.len)); // sentinel: end of blob; u16 like the other offsets
             return self;
         }
 
-        pub fn section_index(self: *const ShellSections) []const u8 {
+        pub fn section_index(self: *const ShellSections) []align(2) const u8 {
             return std.mem.sliceAsBytes(self.index.constSlice());
         }
 
@@ -102,12 +82,12 @@ pub const ShellWriter = struct {
         }
 
         pub fn deinit(self: *ShellSections) void {
-            self.indices.deinit();
+            self.shell2idx.deinit();
             self.* = undefined;
         }
 
-        pub fn getIndex(self: *const ShellSections, shell: []const u8) ?u6 {
-            return self.indices.get(shell);
+        pub fn getIndex(self: *const ShellSections, shell: []const u8) ?u8 {
+            return self.shell2idx.get(shell);
         }
     };
 
@@ -143,8 +123,10 @@ pub const ShellWriter = struct {
     // toOwnedSections returns the analyzed ShellSections. Resets the shell
     // popularity contest. ShellSections memory is allocated by the ShellWriter
     // allocator, and must be deInit'ed by the caller.
-    const toOwnedSectionsErr = Allocator.Error || error{Overflow};
-    pub fn toOwnedSections(self: *ShellWriter, limit: u10) toOwnedSectionsErr!ShellSections {
+    pub fn toOwnedSections(
+        self: *ShellWriter,
+        limit: u10,
+    ) error{ Overflow, OutOfMemory }!ShellSections {
         var deque = PriorityDequeue(KV, void, cmpShells).init(self.allocator, {});
         defer deque.deinit();
 
@@ -164,9 +146,8 @@ pub const ShellWriter = struct {
             topShells.set(i, deque.removeMax().shell);
 
         const result = ShellSections.init(self.allocator, topShells);
-        const allocator = self.allocator;
         self.deinit();
-        self.* = init(allocator);
+        self.* = init(self.allocator);
         return result;
     }
 };
@@ -192,16 +173,13 @@ test "basic shellpopcon" {
 
     var sections = try popcon.toOwnedSections(max_shells);
     defer sections.deinit();
-    try testing.expectEqual(sections.index.len, 3); // all but "nobody" qualify
+    try testing.expectEqual(sections.index.len, 4); // 3 shells (all but "nobody") + 1 sentinel
 
     try testing.expectEqual(sections.getIndex(long).?, 0);
     try testing.expectEqual(sections.getIndex(zsh).?, 1);
     try testing.expectEqual(sections.getIndex(bash).?, 2);
     try testing.expectEqual(sections.getIndex(nobody), null);
-    try testing.expectEqual(
-        sections.section_blob().len,
-        pad.roundUp(u12, 2, bash.len) + pad.roundUp(u12, 2, zsh.len) + pad.roundUp(u12, 2, long.len),
-    );
+    try testing.expectEqual(sections.section_blob().len, bash.len + zsh.len + long.len);
 
     const shellReader = ShellReader.init(
         sections.section_index(),
@@ -211,5 +189,5 @@ test "basic shellpopcon" {
     try testing.expectEqualStrings(shellReader.get(1), zsh);
     try testing.expectEqualStrings(shellReader.get(2), bash);
 
-    try testing.expectEqual(shellReader.section_index.len, 3);
+    try testing.expectEqual(shellReader.index.len, 4);
 }
diff --git a/src/user.zig b/src/user.zig
index 1d30d63..403e0da 100644
--- a/src/user.zig
+++ b/src/user.zig
@@ -13,10 +13,6 @@ const Allocator = mem.Allocator;
 const ArrayList = std.ArrayList;
 const StringHashMap = std.StringHashMap;
 
-// Idx2ShellProto is a function prototype that, given a shell's index (in
-// global shell section), will return a shell string. Matches ShellReader.get.
-const Idx2ShellProto = fn (u6) []const u8;
-
 // User is a convenient public struct for record construction and
 // serialization.
 pub const User = struct {
@@ -65,21 +61,6 @@ pub const User = struct {
     }
 };
 
-pub fn Shell2Index(T: type) type {
-    return struct {
-        const Self = @This();
-        data: T,
-
-        pub fn init(data: T) Self {
-            return Self{ .data = data };
-        }
-
-        pub fn get(self: *const Self, str: []const u8) ?u6 {
-            return self.data.get(str);
-        }
-    };
-}
-
 pub const PackedUser = struct {
     const Self = @This();
 
@@ -88,8 +69,7 @@ pub const PackedUser = struct {
     const Inner = packed struct {
         uid: u32,
         gid: u32,
-        padding: u2 = 0,
-        shell_len_or_idx: u6,
+        shell_len_or_idx: u8,
         shell_here: bool,
         name_is_a_suffix: bool,
         home_len: u6,
@@ -204,14 +184,14 @@ pub const PackedUser = struct {
         arr: *ArrayList(u8),
         user: User,
         additional_gids_offset: u64,
-        idxFn: StringHashMap(u6),
+        idxFn: StringHashMap(u8),
     ) error{ InvalidRecord, OutOfMemory }!void {
         std.debug.assert(arr.items.len & 7 == 0);
         // function arguments are consts. We need to mutate the underlying
         // slice, so passing it via pointer instead.
         const home_len = try validate.downCast(u6, user.home.len - 1);
         const name_len = try validate.downCast(u5, user.name.len - 1);
-        const shell_len = try validate.downCast(u6, user.shell.len - 1);
+        const shell_len = try validate.downCast(u8, user.shell.len - 1);
         const gecos_len = try validate.downCast(u8, user.gecos.len);
 
         try validate.utf8(user.home);
@@ -289,19 +269,16 @@ test "PackedUser internal and external alignment" {
     );
 }
 
-fn testShellIndex(allocator: Allocator) StringHashMap(u6) {
-    var result = StringHashMap(u6).init(allocator);
+fn testShellIndex(allocator: Allocator) StringHashMap(u8) {
+    var result = StringHashMap(u8).init(allocator);
     result.put("/bin/bash", 0) catch unreachable;
     result.put("/bin/zsh", 1) catch unreachable;
     return result;
 }
 
 const test_shell_reader = shellImport.ShellReader{
-    .section_blob = "/bin/bash.../bin/zsh",
-    .section_index = &[_]shellImport.ShellIndex{
-        shellImport.ShellIndex{ .offset = 0, .len = 9 - 1 },
-        shellImport.ShellIndex{ .offset = 12 >> 2, .len = 8 - 1 },
-    },
+    .blob = "/bin/bash/bin/zsh",
+    .index = &[_]u16{ 0, 9, 17 },
 };
 
 test "construct PackedUser section" {
@@ -328,7 +305,7 @@ test "construct PackedUser section" {
         .name = "Name" ** 8,
         .gecos = "Gecos" ** 51,
         .home = "Home" ** 16,
-        .shell = "She.LllL" ** 8,
+        .shell = "She.LllL" ** 32,
     }, User{
         .uid = 1002,
         .gid = 1002,