packing shell sections

This commit is contained in:
Motiejus Jakštys 2022-03-03 18:05:46 +02:00 committed by Motiejus Jakštys
parent e1bdb6c529
commit a4e3e08f5f
4 changed files with 104 additions and 37 deletions

View File

@ -354,24 +354,24 @@ STATUS SECTION SIZE DESCRIPTION
✅ bdz_gid ? bdz(gid)
✅ bdz_groupname ? bdz(groupname)
✅ bdz_uid ? bdz(uid)
✅ bdz_name ? bdz(username)
✅ bdz_username ? bdz(username)
idx_gid2group len(group)*29/8 bdz->offset Groups
idx_groupname2group len(group)*29/8 bdz->offset Groups
idx_uid2user len(user)*29/8 bdz->offset Users
idx_name2user len(user)*29/8 bdz->offset Users
idx_username2gids len(user)*29/8 bdz->offset UserGids
ShellIndex len(shells)*2 shell index array
ShellBlob <= 4032 shell data blob (max 63*64 bytes)
Groups ? packed Group entries (8b padding)
Users ? packed User entries (8b padding)
Groupmembers ? per-group memberlist (no padding)
UserGids ? per-user gidlist entries (8b padding)
shellIndex len(shells)*2 shell index array
shellBlob <= 4032 shell data blob (max 63*64 bytes)
groups ? packed Group entries (8b padding)
users ? packed User entries (8b padding)
groupMembers ? per-group memberlist (no padding)
userGids ? per-user gidlist entries (8b padding)
```
Section creation order:
1. `bdz_*`. No dependencies.
1. ShellIndex, ShellBlob. No dependencies.
1. `bdz_*`. No dependencies.
1. `shellIndex`, `shellBlob`. No dependencies.
1. UserGids. No dependencies.
1. Users, but without `additional_gids_offset`. No dependencies.
1. Groupmembers. Depends on Users, ex. `additional_gids_offset`.

View File

@ -43,7 +43,7 @@ const Header = packed struct {
if (self.bom != Bom) {
return error.InvalidBom;
}
if (self.num_shells > shell.MaxShells) {
if (self.num_shells > shell.max_shells) {
return error.TooManyShells;
}
@ -112,7 +112,7 @@ test "header pack, unpack and validation" {
{
var header = goodHeader;
header.num_shells = shell.MaxShells + 1;
header.num_shells = shell.max_shells + 1;
try testing.expectError(error.TooManyShells, Header.init(header.asArray()));
}

View File

@ -13,6 +13,7 @@ const BufSet = std.BufSet;
const pad = @import("padding.zig");
const compress = @import("compress.zig");
const shellImport = @import("shell.zig");
const userImport = @import("user.zig");
const groupImport = @import("group.zig");
const cmph = @import("cmph.zig");
@ -185,9 +186,49 @@ pub const Sections = struct {
bytes: []const u8,
};
const groupMembersErr = error{Overflow} || Allocator.Error;
// bdzGid builds the `bdz_gid` section by handing every group's gid to
// cmph.pack_u32. The caller owns the returned bytes (the test in this file
// frees them with the allocator that was passed to Sections.init).
pub fn bdzGid(self: *const Sections) cmph.Error![]const u8 {
    const gids = self.corpus.groupsMulti.items(.gid);
    return try cmph.pack_u32(self.allocator, gids);
}
pub fn groupMembers(self: *const Sections) groupMembersErr!GroupMembers {
// bdzGroupname builds the `bdz_groupname` section by handing every group's
// name to cmph.pack_str. The caller owns the returned bytes (the test in this
// file frees them with the allocator that was passed to Sections.init).
pub fn bdzGroupname(self: *const Sections) cmph.Error![]const u8 {
    const group_names = self.corpus.groupsMulti.items(.name);
    return try cmph.pack_str(self.allocator, group_names);
}
// bdzUid builds the `bdz_uid` section by handing every user's uid to
// cmph.pack_u32. The caller owns the returned bytes (the test in this file
// frees them with the allocator that was passed to Sections.init).
pub fn bdzUid(self: *const Sections) cmph.Error![]const u8 {
    const uids = self.corpus.usersMulti.items(.uid);
    return try cmph.pack_u32(self.allocator, uids);
}
// bdzUsername builds the `bdz_username` section by handing every user's name
// to cmph.pack_str. The caller owns the returned bytes (the test in this file
// frees them with the allocator that was passed to Sections.init).
pub fn bdzUsername(self: *const Sections) cmph.Error![]const u8 {
    const user_names = self.corpus.usersMulti.items(.name);
    return try cmph.pack_str(self.allocator, user_names);
}
// ShellSections is the result of shellSections(): the two shell-related
// sections as plain byte slices. Both slices are duplicated with the Sections
// allocator, so the caller frees each of them separately.
pub const ShellSections = struct {
// bytes of the shell index section
index: []const u8,
// bytes of the shell blob section
blob: []const u8,
};
// TODO(motiejus) there are a few problems:
// - memory management for shell sections is a mess. Make it easier by ...
// - shell module should accept a list of shells and spit out two slices
// (allocated with a given allocator). There is too much dancing around
// here.
const shellSectionsErr = Allocator.Error || error{Overflow};

// shellSections feeds the shell of every user in the corpus into a
// ShellWriter, asks it for at most shellImport.max_shells shells, and returns
// copies of the resulting index and blob sections.
//
// Both returned slices are allocated with self.allocator and are owned by the
// caller, who must free `index` and `blob` individually. On error nothing is
// leaked: the intermediate writer and its sections are released by `defer`,
// and a partially built result is released by `errdefer`.
pub fn shellSections(self: *const Sections) shellSectionsErr!ShellSections {
    var popcon = shellImport.ShellWriter.init(self.allocator);
    defer popcon.deinit();

    for (self.corpus.usersMulti.items(.shell)) |shell| {
        try popcon.put(shell);
    }

    var sections = try popcon.toOwnedSections(shellImport.max_shells);
    defer sections.deinit();

    // The sections above are freed when this function returns, so the bytes
    // have to be copied out before they are handed to the caller.
    const index = try self.allocator.dupe(u8, sections.sectionIndex());
    // If duplicating the blob fails below, the index copy must not leak.
    errdefer self.allocator.free(index);
    const blob = try self.allocator.dupe(u8, sections.sectionBlob());

    return ShellSections{
        .index = index,
        .blob = blob,
    };
}
pub fn groupMembers(self: *const Sections) Allocator.Error!GroupMembers {
var buf: [compress.maxVarintLen64]u8 = undefined;
var offsets = ArrayListUnmanaged(usize).initCapacity(
self.allocator,
@ -198,7 +239,7 @@ pub const Sections = struct {
for (self.corpus.groups) |group, i| {
offsets[i] = offset;
const users = self.corpus.groupname2users.get(group.name).?;
const len = try compress.putVarint(&buf, users.len);
const len = compress.putVarint(&buf, users.len);
offset += len;
try bytes.appendSlice(buf[0..len]);
for (users) |user| {
@ -332,6 +373,30 @@ test "test corpus" {
try testing.expectEqual(corpus.username2groups.get("404"), null);
}
// Smoke test: builds every section that Sections can currently produce from
// the test corpus and releases it again. testing.allocator reports any leak.
test "test sections" {
    const gpa = testing.allocator;

    var corpus = try testCorpus(gpa);
    defer corpus.deinit();

    var sections = Sections.init(gpa, &corpus);

    // perfect-hash sections over groups
    const gid_hash = try sections.bdzGid();
    defer gpa.free(gid_hash);
    const groupname_hash = try sections.bdzGroupname();
    defer gpa.free(groupname_hash);

    // perfect-hash sections over users
    const uid_hash = try sections.bdzUid();
    defer gpa.free(uid_hash);
    const username_hash = try sections.bdzUsername();
    defer gpa.free(username_hash);

    // shell index and blob are two separately owned slices
    const shell_sections = try sections.shellSections();
    defer gpa.free(shell_sections.index);
    defer gpa.free(shell_sections.blob);
}
test "pack gids" {
const allocator = testing.allocator;
var corpus = try testCorpus(allocator);

View File

@ -7,11 +7,22 @@ const StringHashMap = std.StringHashMap;
const BoundedArray = std.BoundedArray;
const StringContext = std.hash_map.StringContext;
// MaxShells is the maximum number of "popular" shells.
pub const MaxShells = 63;
pub const MaxShellLen = 64;
// max_shells is the maximum number of "popular" shells.
pub const max_shells = 63;
pub const max_shell_len = 64;
const ShellAlignment = 2; // bits
// ShellIndex is an index to the shell strings. As a shell can be up to 64
// bytes (1<<6) and the maximum number of shells is 63 ((1<<6)-1), the maximum
// location offset is 1<<12. To make the location resolvable in 10 bits, all
// shells will be padded to 4 bytes.
// The actual shell length is len+1: we don't allow empty shells, and the real
// length of the shell is 1-64 bytes.
const ShellIndex = packed struct {
// Location of the shell in the blob. Presumably counted in units of the
// 4-byte padding described above rather than in bytes — TODO confirm.
offset: u10,
// Shell length minus one: 0..63 encodes a real length of 1..64 bytes.
len: u6,
};
// ShellReader interprets "Shell Index" and "Shell Blob" sections.
pub const ShellReader = struct {
sectionIndex: []const ShellIndex,
@ -44,19 +55,20 @@ pub const ShellWriter = struct {
};
const ShellSections = struct {
index: BoundedArray(ShellIndex, MaxShells),
blob: BoundedArray(u8, MaxShells * MaxShellLen),
index: BoundedArray(ShellIndex, max_shells),
blob: BoundedArray(u8, max_shells * max_shell_len),
indices: StringHashMap(u6),
// initializes and populates shell sections. All strings are copied,
// nothing is owned.
pub const initErr = Allocator.Error || error{Overflow};
pub fn init(
allocator: Allocator,
shells: BoundedArray([]const u8, MaxShells),
) !ShellSections {
shells: BoundedArray([]const u8, max_shells),
) initErr!ShellSections {
var self = ShellSections{
.index = try BoundedArray(ShellIndex, MaxShells).init(shells.len),
.blob = try BoundedArray(u8, MaxShells * MaxShellLen).init(0),
.index = try BoundedArray(ShellIndex, max_shells).init(shells.len),
.blob = try BoundedArray(u8, max_shells * max_shell_len).init(0),
.indices = StringHashMap(u6).init(allocator),
};
var fullOffset: u12 = 0;
@ -132,7 +144,8 @@ pub const ShellWriter = struct {
// toOwnedSections returns the analyzed ShellSections. Resets the shell
// popularity contest. ShellSections memory is allocated by the ShellWriter
// allocator, and must be released by the caller with deinit().
pub fn toOwnedSections(self: *ShellWriter, limit: u10) !ShellSections {
const toOwnedSectionsErr = Allocator.Error || error{Overflow};
pub fn toOwnedSections(self: *ShellWriter, limit: u10) toOwnedSectionsErr!ShellSections {
var deque = PriorityDequeue(KV, void, cmpShells).init(self.allocator, {});
defer deque.deinit();
@ -145,7 +158,7 @@ pub const ShellWriter = struct {
}
const total = std.math.min(deque.count(), limit);
var topShells = try BoundedArray([]const u8, MaxShells).init(total);
var topShells = try BoundedArray([]const u8, max_shells).init(total);
var i: u32 = 0;
while (i < total) : (i += 1) {
@ -161,17 +174,6 @@ pub const ShellWriter = struct {
}
};
// ShellIndex is an index to the shell strings. As a shell can be up to 64
// bytes (1<<6) and the maximum number of shells is 63 ((1<<6)-1), the maximum
// location offset is 1<<12. To make the location resolvable in 10 bits, all
// shells will be padded to 4 bytes.
// The actual shell length is len+1: we don't allow empty shells, and the real
// length of the shell is 1-64 bytes.
const ShellIndex = packed struct {
// Location of the shell in the blob. Presumably counted in units of the
// 4-byte padding described above rather than in bytes — TODO confirm.
offset: u10,
// Shell length minus one: 0..63 encodes a real length of 1..64 bytes.
len: u6,
};
const testing = std.testing;
test "basic shellpopcon" {
@ -192,7 +194,7 @@ test "basic shellpopcon" {
try popcon.put(shell);
}
var sections = try popcon.toOwnedSections(MaxShells);
var sections = try popcon.toOwnedSections(max_shells);
defer sections.deinit();
try testing.expectEqual(sections.index.len, 3); // all but "nobody" qualify