rewrite shells
- Shell is up to 256 bytes long. - Store up to 255 shells in the Shells area. - Remove padding from the User struct.
This commit is contained in:
parent
85552c1302
commit
4e36d7850e
46
README.md
46
README.md
|
@ -65,9 +65,12 @@ regions are shared. Turbonss reads do not consume any heap space.
|
|||
Tight packing places some constraints on the underlying data:
|
||||
|
||||
- Permitted length of username and groupname: 1-32 bytes.
|
||||
- Permitted length of shell and home: 1-64 bytes.
|
||||
- Permitted length of shell and home: 1-256 bytes.
|
||||
- Permitted comment ("gecos") length: 0-255 bytes.
|
||||
- User name, groupname, gecos and shell must be utf8-encoded.
|
||||
- User and Groups sections are up to 2^35B (~34GB) large. Assuming an "average"
|
||||
user record takes 50 bytes, this section would fit ~660M users. The
|
||||
worst-case upper bound is left as an exercise to the reader.
|
||||
|
||||
Sorting is stable. In v0:
|
||||
- Groups are sorted by gid, ascending.
|
||||
|
@ -173,7 +176,8 @@ the beginning of the section.
|
|||
```
|
||||
const PackedGroup = packed struct {
|
||||
gid: u32,
|
||||
groupname_len: u8, // max is 32, but have too much space here.
|
||||
padding: u3,
|
||||
groupname_len: u5,
|
||||
}
|
||||
```
|
||||
|
||||
|
@ -186,8 +190,7 @@ PackedUser is a bit more involved:
|
|||
pub const PackedUser = packed struct {
|
||||
uid: u32,
|
||||
gid: u32,
|
||||
padding: u2 = 0,
|
||||
shell_len_or_idx: u6,
|
||||
shell_len_or_idx: u8,
|
||||
shell_here: bool,
|
||||
name_is_a_suffix: bool,
|
||||
home_len: u6,
|
||||
|
@ -219,8 +222,8 @@ PackedUser employs two "simple" compression techniques:
|
|||
2. `name_is_a_suffix=false`: name begins one byte after home, and its length
|
||||
is `name_len`.
|
||||
|
||||
The last field `additional_gids_offset: varint` points to the `additional_gids` section for
|
||||
this user.
|
||||
The last field `additional_gids_offset: varint` points to the `additional_gids`
|
||||
section for this user.
|
||||
|
||||
Shells
|
||||
------
|
||||
|
@ -231,23 +234,20 @@ others. Therefore, "shells" have an optimization: they can be pointed by in the
|
|||
external list, or, if they are unique to the user, reside among the user's
|
||||
data.
|
||||
|
||||
63 most popular shells (i.e. referred to by at least two User entries) are
|
||||
255 most popular shells (i.e. referred to by at least two User entries) are
|
||||
stored externally in "Shells" area. The less popular ones are stored with
|
||||
userdata.
|
||||
|
||||
Shells section consists of two sub-sections: the index and the blob. The index
|
||||
is a list of structs which point to a location in the "blob" area:
|
||||
is an array of offsets: the i'th shell starts at `offsets[i]` byte, and ends at
|
||||
`offsets[i+1]` byte. If there is at least one shell in the shell section, the
|
||||
index contains a sentinel index as the last element, which signifies the position
|
||||
of the last byte of the shell blob.
|
||||
|
||||
```
|
||||
const ShellIndex = struct {
|
||||
offset: u10,
|
||||
len: u6,
|
||||
};
|
||||
```
|
||||
|
||||
In the user's struct `shell_here=true` signifies that the shell is stored with
|
||||
userdata, and its length is `shell_len_or_idx`. `shell_here=false` means it is
|
||||
stored in the `Shells` section, and its index is `shell_len_or_idx`.
|
||||
`shell_here=true` in the User struct means the shell is stored with userdata,
|
||||
and its length is `shell_len_or_idx`. `shell_here=false` means it is stored in
|
||||
the `Shells` section, and its index is `shell_len_or_idx` (and the actual
|
||||
string start and end offsets are resolved as described in the paragraph above).
|
||||
|
||||
Variable-length integers (varints)
|
||||
----------------------------------
|
||||
|
@ -264,7 +264,6 @@ There are two group memberships at play:
|
|||
1. Given a group (gid/name), resolve the members' names (e.g. `getgrgid`).
|
||||
2. Given a username, resolve user's group gids (for `initgroups(3)`).
|
||||
|
||||
|
||||
When group's memberships are resolved in (1), the same call also requires other
|
||||
group information: gid and group name. Therefore it makes sense to store a
|
||||
pointer to the group members in the group information itself. However, the
|
||||
|
@ -323,9 +322,10 @@ will be pointing to a number `n ∈ [0,N-1]`, regardless whether the value was i
|
|||
the initial dictionary. Therefore one must always confirm, after calculating
|
||||
the hash, that the key matches what's been hashed.
|
||||
|
||||
`idx_*` sections are of type `[]PackedIntArray(u29)` and are pointing to the
|
||||
respective `Groups` and `Users` entries (from the beginning of the respective
|
||||
section). Since User and Group records are 8-byte aligned, `u29` is used.
|
||||
`idx_*` sections are of type `[]u32` and are pointing to the respective
|
||||
`Groups` and `Users` entries (from the beginning of the respective section).
|
||||
Since User and Group records are 8-byte aligned, the actual offset to the
|
||||
record is acquired by right-shifting this value by 3 bits.
|
||||
|
||||
Database file structure
|
||||
-----------------------
|
||||
|
@ -344,7 +344,7 @@ idx_groupname2group len(group)*4 bdz->offset Groups
|
|||
idx_uid2user len(user)*4 bdz->offset Users
|
||||
idx_name2user len(user)*4 bdz->offset Users
|
||||
shell_index len(shells)*2 shell index array
|
||||
shell_blob <= 4032 shell data blob (max 63*64 bytes)
|
||||
shell_blob <= 65280 shell data blob (max 255*256 bytes)
|
||||
groups ? packed Group entries (8b padding)
|
||||
users ? packed User entries (8b padding)
|
||||
groupmembers ? per-group delta varint memberlist (no padding)
|
||||
|
|
|
@ -42,10 +42,11 @@ pub const PackedGroup = struct {
|
|||
|
||||
const Inner = packed struct {
|
||||
gid: u32,
|
||||
groupname_len: u8,
|
||||
padding: u3 = 0,
|
||||
groupname_len: u5,
|
||||
|
||||
pub fn groupnameLen(self: *const Inner) usize {
|
||||
return self.groupname_len + 1;
|
||||
return @as(usize, self.groupname_len) + 1;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -120,13 +121,9 @@ pub const PackedGroup = struct {
|
|||
group: GroupStored,
|
||||
) packErr!void {
|
||||
std.debug.assert(arr.items.len & 7 == 0);
|
||||
const groupname_len = try validate.downCast(u5, group.name.len - 1);
|
||||
try validate.utf8(group.name);
|
||||
const inner = Inner{
|
||||
.gid = group.gid,
|
||||
.groupname_len = groupname_len,
|
||||
};
|
||||
|
||||
const len = try validate.downCast(u5, group.name.len - 1);
|
||||
const inner = Inner{ .gid = group.gid, .groupname_len = len };
|
||||
try arr.*.appendSlice(mem.asBytes(&inner));
|
||||
try arr.*.appendSlice(group.name);
|
||||
try compress.appendUvarint(arr, group.members_offset);
|
||||
|
|
|
@ -110,12 +110,6 @@ test "header pack, unpack and validation" {
|
|||
try testing.expectError(error.InvalidBom, Header.init(header.asArray()));
|
||||
}
|
||||
|
||||
{
|
||||
var header = goodHeader;
|
||||
header.num_shells = shell.max_shells + 1;
|
||||
try testing.expectError(error.TooManyShells, Header.init(header.asArray()));
|
||||
}
|
||||
|
||||
{
|
||||
var header = goodHeader;
|
||||
header.offset_bdz_uid2user = 65;
|
||||
|
|
|
@ -234,7 +234,7 @@ pub fn usersSection(
|
|||
&blob,
|
||||
user,
|
||||
gids.idx2offset[i],
|
||||
shells.indices,
|
||||
shells.shell2idx,
|
||||
);
|
||||
try pad.arrayList(&blob, userImport.PackedUser.alignment_bits);
|
||||
}
|
||||
|
@ -439,6 +439,7 @@ pub const AllSections = struct {
|
|||
var groups = try groupsSection(allocator, corpus, group_members.idx2offset);
|
||||
errdefer groups.deinit(allocator);
|
||||
|
||||
// TODO: these indices must point to the *offsets*, not the indices in "users"
|
||||
var idx_gid2group = try bdzIdx(u32, allocator, bdz_gid, gids);
|
||||
errdefer allocator.free(idx_gid2group);
|
||||
|
||||
|
|
108
src/shell.zig
108
src/shell.zig
|
@ -1,5 +1,4 @@
|
|||
const std = @import("std");
|
||||
const pad = @import("padding.zig");
|
||||
const Allocator = std.mem.Allocator;
|
||||
const PriorityDequeue = std.PriorityDequeue;
|
||||
const StringArrayHashMap = std.StringArrayHashMap;
|
||||
|
@ -7,40 +6,24 @@ const StringHashMap = std.StringHashMap;
|
|||
const BoundedArray = std.BoundedArray;
|
||||
const StringContext = std.hash_map.StringContext;
|
||||
|
||||
// max_shells is the maximum number of "popular" shells.
|
||||
pub const max_shells = 63;
|
||||
pub const max_shell_len = 64;
|
||||
pub const shell_alignment_bits = 2; // bits
|
||||
|
||||
// ShellIndex is an index to the shell strings. As shell can be up to 64 bytes
|
||||
// (1<<6), maximum number of shells is 63 (1<<6-1), the maximum location offset
|
||||
// is 1<<12. To make location resolvable in 10 bits, all shells will be padded
|
||||
// to 4 bytes.
|
||||
// The actual shell length is len+1: we don't allow empty shells, and the real
|
||||
// length of the shell is 1-64 bytes.
|
||||
pub const ShellIndex = packed struct {
|
||||
offset: u10,
|
||||
len: u6,
|
||||
};
|
||||
pub const max_shells = 255;
|
||||
pub const max_shell_len = 256;
|
||||
|
||||
// ShellReader interprets "Shell Index" and "Shell Blob" sections.
|
||||
pub const ShellReader = struct {
|
||||
section_index: []const ShellIndex,
|
||||
section_blob: []const u8,
|
||||
index: []const u16,
|
||||
blob: []const u8,
|
||||
|
||||
pub fn init(index: []const u8, blob: []const u8) ShellReader {
|
||||
pub fn init(index: []align(2) const u8, blob: []const u8) ShellReader {
|
||||
return ShellReader{
|
||||
.section_index = std.mem.bytesAsSlice(ShellIndex, index),
|
||||
.section_blob = blob,
|
||||
.index = std.mem.bytesAsSlice(u16, index),
|
||||
.blob = blob,
|
||||
};
|
||||
}
|
||||
|
||||
// get returns a shell at the given index.
|
||||
pub fn get(self: *const ShellReader, idx: u6) []const u8 {
|
||||
const shell_index = self.section_index[idx];
|
||||
const start = shell_index.offset << 2;
|
||||
const end = start + shell_index.len + 1;
|
||||
return self.section_blob[start..end];
|
||||
pub fn get(self: *const ShellReader, idx: u8) []const u8 {
|
||||
return self.blob[self.index[idx]..self.index[idx + 1]];
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -55,45 +38,42 @@ pub const ShellWriter = struct {
|
|||
};
|
||||
|
||||
pub const ShellSections = struct {
|
||||
index: BoundedArray(ShellIndex, max_shells),
|
||||
blob: BoundedArray(u8, max_shells * max_shell_len),
|
||||
indices: StringHashMap(u6),
|
||||
// index points the i'th shell to its offset in blob. The last
|
||||
// byte of the i'th shell is index[i+1].
|
||||
index: BoundedArray(u16, max_shells),
|
||||
// blob contains `index.len+1` number of records. The last record is
|
||||
// pointing to the end of the blob, so length of the last shell can be
|
||||
// calculated from the index array.
|
||||
blob: BoundedArray(u8, (max_shells + 1) * max_shell_len),
|
||||
// shell2idx helps translate a shell (string) to its index.
|
||||
shell2idx: StringHashMap(u8),
|
||||
|
||||
// initializes and populates shell sections. All strings are copied,
|
||||
// nothing is owned.
|
||||
pub const initErr = Allocator.Error || error{Overflow};
|
||||
pub fn init(
|
||||
allocator: Allocator,
|
||||
shells: BoundedArray([]const u8, max_shells),
|
||||
) initErr!ShellSections {
|
||||
) error{ Overflow, OutOfMemory }!ShellSections {
|
||||
var self = ShellSections{
|
||||
.index = try BoundedArray(ShellIndex, max_shells).init(shells.len),
|
||||
.blob = try BoundedArray(u8, max_shells * max_shell_len).init(0),
|
||||
.indices = StringHashMap(u6).init(allocator),
|
||||
.index = try BoundedArray(u16, max_shells).init(shells.len),
|
||||
.blob = try BoundedArray(u8, (max_shells + 1) * max_shell_len).init(0),
|
||||
.shell2idx = StringHashMap(u8).init(allocator),
|
||||
};
|
||||
errdefer self.indices.deinit();
|
||||
var full_offset: u12 = 0;
|
||||
var idx: u6 = 0;
|
||||
while (idx < shells.len) : (idx += 1) {
|
||||
const len = try std.math.cast(u6, shells.get(idx).len);
|
||||
try self.blob.appendSlice(shells.get(idx));
|
||||
const our_shell = self.blob.constSlice()[full_offset .. full_offset + len];
|
||||
try self.indices.put(our_shell, idx);
|
||||
std.debug.assert(full_offset & 3 == 0);
|
||||
self.index.set(idx, ShellIndex{
|
||||
.offset = try std.math.cast(u10, full_offset >> 2),
|
||||
.len = len - 1,
|
||||
});
|
||||
if (shells.len == 0) return self;
|
||||
|
||||
full_offset += len;
|
||||
const padding = pad.roundUpPadding(u12, shell_alignment_bits, full_offset);
|
||||
full_offset += padding;
|
||||
try self.blob.appendNTimes(0, padding);
|
||||
errdefer self.shell2idx.deinit();
|
||||
for (shells.constSlice()) |shell, idx| {
|
||||
const idx8 = @intCast(u8, idx);
|
||||
const offset = @intCast(u16, self.blob.len);
|
||||
try self.blob.appendSlice(shell);
|
||||
try self.shell2idx.put(self.blob.constSlice()[offset..], idx8);
|
||||
self.index.set(idx8, offset);
|
||||
}
|
||||
try self.index.append(@intCast(u8, self.blob.len));
|
||||
return self;
|
||||
}
|
||||
|
||||
pub fn section_index(self: *const ShellSections) []const u8 {
|
||||
pub fn section_index(self: *const ShellSections) []align(2) const u8 {
|
||||
return std.mem.sliceAsBytes(self.index.constSlice());
|
||||
}
|
||||
|
||||
|
@ -102,12 +82,12 @@ pub const ShellWriter = struct {
|
|||
}
|
||||
|
||||
pub fn deinit(self: *ShellSections) void {
|
||||
self.indices.deinit();
|
||||
self.shell2idx.deinit();
|
||||
self.* = undefined;
|
||||
}
|
||||
|
||||
pub fn getIndex(self: *const ShellSections, shell: []const u8) ?u6 {
|
||||
return self.indices.get(shell);
|
||||
pub fn getIndex(self: *const ShellSections, shell: []const u8) ?u8 {
|
||||
return self.shell2idx.get(shell);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -143,8 +123,10 @@ pub const ShellWriter = struct {
|
|||
// toOwnedSections returns the analyzed ShellSections. Resets the shell
|
||||
// popularity contest. ShellSections memory is allocated by the ShellWriter
|
||||
// allocator, and must be deInit'ed by the caller.
|
||||
const toOwnedSectionsErr = Allocator.Error || error{Overflow};
|
||||
pub fn toOwnedSections(self: *ShellWriter, limit: u10) toOwnedSectionsErr!ShellSections {
|
||||
pub fn toOwnedSections(
|
||||
self: *ShellWriter,
|
||||
limit: u10,
|
||||
) error{ Overflow, OutOfMemory }!ShellSections {
|
||||
var deque = PriorityDequeue(KV, void, cmpShells).init(self.allocator, {});
|
||||
defer deque.deinit();
|
||||
|
||||
|
@ -164,9 +146,8 @@ pub const ShellWriter = struct {
|
|||
topShells.set(i, deque.removeMax().shell);
|
||||
|
||||
const result = ShellSections.init(self.allocator, topShells);
|
||||
const allocator = self.allocator;
|
||||
self.deinit();
|
||||
self.* = init(allocator);
|
||||
self.* = init(self.allocator);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
@ -192,16 +173,13 @@ test "basic shellpopcon" {
|
|||
|
||||
var sections = try popcon.toOwnedSections(max_shells);
|
||||
defer sections.deinit();
|
||||
try testing.expectEqual(sections.index.len, 3); // all but "nobody" qualify
|
||||
try testing.expectEqual(sections.index.len, 4); // all but "nobody" qualify
|
||||
|
||||
try testing.expectEqual(sections.getIndex(long).?, 0);
|
||||
try testing.expectEqual(sections.getIndex(zsh).?, 1);
|
||||
try testing.expectEqual(sections.getIndex(bash).?, 2);
|
||||
try testing.expectEqual(sections.getIndex(nobody), null);
|
||||
try testing.expectEqual(
|
||||
sections.section_blob().len,
|
||||
pad.roundUp(u12, 2, bash.len) + pad.roundUp(u12, 2, zsh.len) + pad.roundUp(u12, 2, long.len),
|
||||
);
|
||||
try testing.expectEqual(sections.section_blob().len, bash.len + zsh.len + long.len);
|
||||
|
||||
const shellReader = ShellReader.init(
|
||||
sections.section_index(),
|
||||
|
@ -211,5 +189,5 @@ test "basic shellpopcon" {
|
|||
try testing.expectEqualStrings(shellReader.get(1), zsh);
|
||||
try testing.expectEqualStrings(shellReader.get(2), bash);
|
||||
|
||||
try testing.expectEqual(shellReader.section_index.len, 3);
|
||||
try testing.expectEqual(shellReader.index.len, 4);
|
||||
}
|
||||
|
|
39
src/user.zig
39
src/user.zig
|
@ -13,10 +13,6 @@ const Allocator = mem.Allocator;
|
|||
const ArrayList = std.ArrayList;
|
||||
const StringHashMap = std.StringHashMap;
|
||||
|
||||
// Idx2ShellProto is a function prototype that, given a shell's index (in
|
||||
// global shell section), will return a shell string. Matches ShellReader.get.
|
||||
const Idx2ShellProto = fn (u6) []const u8;
|
||||
|
||||
// User is a convenient public struct for record construction and
|
||||
// serialization.
|
||||
pub const User = struct {
|
||||
|
@ -65,21 +61,6 @@ pub const User = struct {
|
|||
}
|
||||
};
|
||||
|
||||
pub fn Shell2Index(T: type) type {
|
||||
return struct {
|
||||
const Self = @This();
|
||||
data: T,
|
||||
|
||||
pub fn init(data: T) Self {
|
||||
return Self{ .data = data };
|
||||
}
|
||||
|
||||
pub fn get(self: *const Self, str: []const u8) ?u6 {
|
||||
return self.data.get(str);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
pub const PackedUser = struct {
|
||||
const Self = @This();
|
||||
|
||||
|
@ -88,8 +69,7 @@ pub const PackedUser = struct {
|
|||
const Inner = packed struct {
|
||||
uid: u32,
|
||||
gid: u32,
|
||||
padding: u2 = 0,
|
||||
shell_len_or_idx: u6,
|
||||
shell_len_or_idx: u8,
|
||||
shell_here: bool,
|
||||
name_is_a_suffix: bool,
|
||||
home_len: u6,
|
||||
|
@ -204,14 +184,14 @@ pub const PackedUser = struct {
|
|||
arr: *ArrayList(u8),
|
||||
user: User,
|
||||
additional_gids_offset: u64,
|
||||
idxFn: StringHashMap(u6),
|
||||
idxFn: StringHashMap(u8),
|
||||
) error{ InvalidRecord, OutOfMemory }!void {
|
||||
std.debug.assert(arr.items.len & 7 == 0);
|
||||
// function arguments are consts. We need to mutate the underlying
|
||||
// slice, so passing it via pointer instead.
|
||||
const home_len = try validate.downCast(u6, user.home.len - 1);
|
||||
const name_len = try validate.downCast(u5, user.name.len - 1);
|
||||
const shell_len = try validate.downCast(u6, user.shell.len - 1);
|
||||
const shell_len = try validate.downCast(u8, user.shell.len - 1);
|
||||
const gecos_len = try validate.downCast(u8, user.gecos.len);
|
||||
|
||||
try validate.utf8(user.home);
|
||||
|
@ -289,19 +269,16 @@ test "PackedUser internal and external alignment" {
|
|||
);
|
||||
}
|
||||
|
||||
fn testShellIndex(allocator: Allocator) StringHashMap(u6) {
|
||||
var result = StringHashMap(u6).init(allocator);
|
||||
fn testShellIndex(allocator: Allocator) StringHashMap(u8) {
|
||||
var result = StringHashMap(u8).init(allocator);
|
||||
result.put("/bin/bash", 0) catch unreachable;
|
||||
result.put("/bin/zsh", 1) catch unreachable;
|
||||
return result;
|
||||
}
|
||||
|
||||
const test_shell_reader = shellImport.ShellReader{
|
||||
.section_blob = "/bin/bash.../bin/zsh",
|
||||
.section_index = &[_]shellImport.ShellIndex{
|
||||
shellImport.ShellIndex{ .offset = 0, .len = 9 - 1 },
|
||||
shellImport.ShellIndex{ .offset = 12 >> 2, .len = 8 - 1 },
|
||||
},
|
||||
.blob = "/bin/bash/bin/zsh",
|
||||
.index = &[_]u16{ 0, 9, 17 },
|
||||
};
|
||||
|
||||
test "construct PackedUser section" {
|
||||
|
@ -328,7 +305,7 @@ test "construct PackedUser section" {
|
|||
.name = "Name" ** 8,
|
||||
.gecos = "Gecos" ** 51,
|
||||
.home = "Home" ** 16,
|
||||
.shell = "She.LllL" ** 8,
|
||||
.shell = "She.LllL" ** 32,
|
||||
}, User{
|
||||
.uid = 1002,
|
||||
.gid = 1002,
|
||||
|
|
Loading…
Reference in New Issue