shellpop skeleton

This commit is contained in:
Motiejus Jakštys 2022-02-15 10:49:03 +02:00 committed by Motiejus Jakštys
parent ce882b9086
commit f584642cca
6 changed files with 222 additions and 21 deletions

View File

@ -66,10 +66,10 @@ consumed heap space for each separate turbonss instance will be minimal.
Tight packing places some constraints on the underlying data: Tight packing places some constraints on the underlying data:
- Maximum database size: 4GB. - Maximum database size: 4GB.
- Maximum length of username and groupname: 32 bytes. - Permitted length of username and groupname: 1-32 bytes.
- Maximum length of shell and homedir: 64 bytes. - Permitted length of shell and homedir: 1-64 bytes.
- Maximum comment ("gecos") length: 256 bytes. - Permitted comment ("gecos") length: 0-255 bytes.
- Username and groupname must be utf8-encoded. - Username, groupname and gecos must be utf8-encoded.
Checking out and building Checking out and building
------------------------- -------------------------
@ -156,7 +156,7 @@ OFFSET TYPE NAME DESCRIPTION
0 [4]u8 magic always 0xf09fa4b7 0 [4]u8 magic always 0xf09fa4b7
4 u8 version now `0` 4 u8 version now `0`
5 u16 bom 0x1234 5 u16 bom 0x1234
7 u8 padding 7 u6 num_shells max value: 63
8 u32 num_users number of passwd entries 8 u32 num_users number of passwd entries
12 u32 num_groups number of group entries 12 u32 num_groups number of group entries
16 u32 offset_cmph_uid2user 16 u32 offset_cmph_uid2user
@ -165,9 +165,8 @@ OFFSET TYPE NAME DESCRIPTION
28 u32 offset_idx offset to the first idx_ section 28 u32 offset_idx offset to the first idx_ section
32 u32 offset_groups 32 u32 offset_groups
36 u32 offset_users 36 u32 offset_users
40 u32 offset_shells 40 u32 offset_groupmembers
44 u32 offset_groupmembers 44 u32 offset_additional_gids
48 u32 offset_additional_gids
``` ```
`magic` is 0xf09fa4b7, and `version` must be `0`. All integers are `magic` is 0xf09fa4b7, and `version` must be `0`. All integers are
@ -255,15 +254,25 @@ few examples: `/bin/bash`, `/usr/bin/nologin`, `/bin/zsh` among others.
Therefore, "shells" have an optimization: they can be pointed by in the Therefore, "shells" have an optimization: they can be pointed by in the
external list, or reside among the user's data. external list, or reside among the user's data.
64 (1>>6) most popular shells (i.e. referred to by at least two User entries) 63 most popular shells (i.e. referred to by at least two User entries) are
are stored externally in "Shells" area. The less popular ones are stored with stored externally in "Shells" area. The less popular ones are stored with
userdata. userdata.
The `shell_here=true` bit signifies that the shell is stored with userdata. There are two "Shells" areas: the index and the blob. The index is a list of
`false` means it is stored in the `Shells` section. If the shell is stored structs which point to a location in the "blob" area:
"here", it is the first element in `stringdata`, and it's length is
`shell_len_or_place`. If it is stored externally, the latter variable points ```
to it's index in the external storage. const ShellIndex = struct {
offset: u10,
len: u6,
};
```
In the user's struct, the `shell_here=true` bit signifies that the shell is
stored with userdata; `false` means it is stored in the `Shells` section. If
the shell is stored "here", it is the first element in `stringdata`, and its
length is `shell_len_or_place`. If it is stored externally, the latter variable
holds its index in the ShellIndex area.
Shells in the external storage are sorted by their weight, which is Shells in the external storage are sorted by their weight, which is
`length*frequency`. `length*frequency`.
@ -315,7 +324,7 @@ Each section is padded to 64 bytes.
``` ```
SECTION SIZE DESCRIPTION SECTION SIZE DESCRIPTION
Header 52 see "Turbonss header" section Header 48 see "Turbonss header" section
cmph_gid2group ? gid->group cmph cmph_gid2group ? gid->group cmph
cmph_uid2user ? uid->user cmph cmph_uid2user ? uid->user cmph
cmph_groupname2group ? groupname->group cmph cmph_groupname2group ? groupname->group cmph
@ -324,9 +333,10 @@ idx_gid2group len(group)*4*29/32 cmph->offset gid2group
idx_groupname2group len(group)*4*29/32 cmph->offset groupname2group idx_groupname2group len(group)*4*29/32 cmph->offset groupname2group
idx_uid2user len(user)*4*29/32 cmph->offset uid2user idx_uid2user len(user)*4*29/32 cmph->offset uid2user
idx_username2user len(user)*4*29/32 cmph->offset username2user idx_username2user len(user)*4*29/32 cmph->offset username2user
ShellIndex len(shells)*2 Shell index array
ShellBlob <= 4032 Shell data blob (max 63*64 bytes)
Groups ? packed Group entries (8b padding) Groups ? packed Group entries (8b padding)
Users ? packed User entries (8b padding) Users ? packed User entries (8b padding)
Shells ? See "Shells" section
groupmembers ? per-group memberlist (32b padding) groupmembers ? per-group memberlist (32b padding)
additional_gids ? per-user grouplist (8b padding) additional_gids ? per-user grouplist (8b padding)
``` ```

View File

@ -68,7 +68,7 @@ pub fn build(b: *zbs.Builder) void {
exe.install(); exe.install();
{ {
const turbonss_test = b.addTest("src/main.zig"); const turbonss_test = b.addTest("src/test_main.zig");
addCmphDeps(turbonss_test, cmph); addCmphDeps(turbonss_test, cmph);
const test_step = b.step("test", "Run the tests"); const test_step = b.step("test", "Run the tests");
test_step.dependOn(&turbonss_test.step); test_step.dependOn(&turbonss_test.step);

View File

@ -14,7 +14,7 @@ pub fn main() !void {}
test "simple cmph usage" { test "simple cmph usage" {
var arena_instance = std.heap.ArenaAllocator.init(std.heap.page_allocator); var arena_instance = std.heap.ArenaAllocator.init(std.heap.page_allocator);
const arena = arena_instance.allocator(); const arena = arena_instance.allocator();
const stderr = std.io.getStdErr().writer(); const stdout = std.io.getStdOut().writer();
var vector = std.ArrayList([*:0]const u8).init(arena); var vector = std.ArrayList([*:0]const u8).init(arena);
try vector.appendSlice(&.{ try vector.appendSlice(&.{
@ -50,10 +50,10 @@ test "simple cmph usage" {
hash = c.cmph_load(mphf_fd) orelse unreachable; hash = c.cmph_load(mphf_fd) orelse unreachable;
defer c.cmph_destroy(hash); defer c.cmph_destroy(hash);
try stderr.print("\n", .{}); try stdout.print("\n", .{});
for (vector.items) |key| { for (vector.items) |key| {
var id = c.cmph_search(hash, key, @truncate(c_uint, c.strlen(key))); var id = c.cmph_search(hash, key, @truncate(c_uint, c.strlen(key)));
try stderr.print("key: {s}, id: {d}\n", .{ key, id }); try stdout.print("key: {s}, id: {d}\n", .{ key, id });
} }
} }

153
src/shellpop.zig Normal file
View File

@ -0,0 +1,153 @@
const std = @import("std");
const Allocator = std.mem.Allocator;
const PriorityDequeue = std.PriorityDequeue;
const StringArrayHashMap = std.StringArrayHashMap;
const StringHashMap = std.StringHashMap;
const BoundedArray = std.BoundedArray;
const testing = std.testing;
// ShellIndex is one entry of the index over the externally stored shell
// strings: it records where a shell lives and how long it is.
//
// Sizing rationale: a shell can be up to 64 bytes (1<<6) and the maximum
// number of shells is 63 ((1<<6)-1), so the largest location offset stays
// below 1<<12. To make the location resolvable in 10 bits, all shells are
// padded to 4 bytes.
//
// NOTE(review): this is a plain (non-packed) struct, so its in-memory layout
// is not guaranteed to be the 2 bytes per entry that the README lists for the
// ShellIndex section -- confirm before writing it out verbatim.
const ShellIndex = struct {
// Location of the shell. NOTE(review): presumably counted in 4-byte units,
// per the padding rationale above -- confirm against the writer.
offset: u10,
// Length of the shell. NOTE(review): u6 holds 0..63 while shells may be up to
// 64 bytes; confirm whether this is meant to store length-1.
len: u6,
};
// MaxShells is the maximum number of "popular" shells. It is used as the
// capacity of the bounded list of shells handed to ShellSections.init.
const MaxShells = 63;
// ShellPopcon is a shell popularity contest: collect shells and return the
// popular ones, sorted by score. score := len(shell) * number_of_shells.
//
// Every string handed to `put` is copied, so callers may release their input
// right away. The sections returned by `getSections` are allocated with the
// same allocator and must be released with `ShellSections.deinit`.
const ShellPopcon = struct {
    // Number of times each shell was seen. Keys are owned copies.
    counts: std.StringHashMap(u32),
    allocator: Allocator,

    const Self = @This();

    // A candidate for the popular list. `shell` borrows a key of `counts`.
    const KV = struct {
        shell: []const u8,
        // u64, so that len * count cannot overflow for any u32 count.
        score: u64,
    };

    // ShellSections holds the two "Shells" areas (index and blob) plus a
    // lookup table from shell string to its location in the blob.
    const ShellSections = struct {
        // One entry per shell, in the order the shells were given to `init`.
        index: []ShellIndex,
        // All shells back to back, each zero-padded to `shell_alignment`.
        blob: []const u8,
        // Shell string -> ShellIndex.offset. Keys point into `blob`.
        offsets: StringHashMap(u10),

        // Shells are padded to this many bytes, so that a blob offset
        // expressed in these units fits in the u10 of ShellIndex.
        const shell_alignment = 4;
        // Longest permitted shell, in bytes.
        const max_shell_len = 64;

        // Returns len rounded up to a multiple of shell_alignment.
        fn paddedLen(len: usize) usize {
            return (len + shell_alignment - 1) / shell_alignment * shell_alignment;
        }

        // Returns the blob offset (in 4-byte units) of `shell`, or null if
        // the shell is not part of these sections.
        pub fn getOffset(self: *const ShellSections, shell: []const u8) ?u10 {
            return self.offsets.get(shell);
        }

        // Initializes ShellSections. All strings are copied into the blob;
        // `shells` itself is not retained. Release the result with `deinit`.
        //
        // ShellIndex.offset is the blob offset in 4-byte units.
        // ShellIndex.len stores length-1: shells are 1-64 bytes long, which
        // is the only way to fit the length into a u6.
        //
        // Returns error.InvalidShellLength for an empty shell or one longer
        // than 64 bytes, and error.OutOfMemory if allocation fails.
        pub fn init(allocator: Allocator, shells: BoundedArray([]const u8, MaxShells)) !ShellSections {
            const items = shells.constSlice();

            var blob_len: usize = 0;
            for (items) |shell| {
                if (shell.len == 0 or shell.len > max_shell_len)
                    return error.InvalidShellLength;
                blob_len += paddedLen(shell.len);
            }

            const index = try allocator.alloc(ShellIndex, items.len);
            errdefer allocator.free(index);
            const blob = try allocator.alloc(u8, blob_len);
            errdefer allocator.free(blob);
            var offsets = StringHashMap(u10).init(allocator);
            errdefer offsets.deinit();

            // Zero everything up front, so the padding bytes are defined.
            for (blob) |*byte| byte.* = 0;

            var byte_offset: usize = 0;
            var i: usize = 0;
            while (i < items.len) : (i += 1) {
                const shell = items[i];
                const stored = blob[byte_offset .. byte_offset + shell.len];
                std.mem.copy(u8, stored, shell);

                // At most 63 shells of at most 64 bytes: the last offset is
                // below 4096 bytes, i.e. below 1024 units, so the cast is safe.
                const unit_offset = @intCast(u10, byte_offset / shell_alignment);
                index[i] = ShellIndex{
                    .offset = unit_offset,
                    .len = @intCast(u6, shell.len - 1),
                };
                try offsets.put(stored, unit_offset);

                byte_offset += paddedLen(shell.len);
            }

            return ShellSections{
                .index = index,
                .blob = blob,
                .offsets = offsets,
            };
        }

        // Releases everything allocated by `init`.
        pub fn deinit(self: *ShellSections) void {
            // Grab the allocator first: deinit invalidates the map.
            const allocator = self.offsets.allocator;
            self.offsets.deinit();
            allocator.free(self.index);
            allocator.free(self.blob);
            self.* = undefined;
        }
    };

    pub fn init(allocator: Allocator) Self {
        return Self{
            .counts = std.StringHashMap(u32).init(allocator),
            .allocator = allocator,
        };
    }

    // Frees the copied shell strings and the counting table.
    pub fn deinit(self: *Self) void {
        var it = self.counts.keyIterator();
        while (it.next()) |key_ptr| {
            self.allocator.free(key_ptr.*);
        }
        self.counts.deinit();
        self.* = undefined;
    }

    // Registers one more occurrence of `shell`. The string is copied the
    // first time it is seen.
    pub fn put(self: *Self, shell: []const u8) !void {
        if (self.counts.getPtr(shell)) |count| {
            count.* += 1;
            return;
        }

        const owned = try self.allocator.dupe(u8, shell);
        // Do not leak the copy if the table cannot grow.
        errdefer self.allocator.free(owned);
        try self.counts.put(owned, 1);
    }

    // Orders candidates by score. Equal scores fall back to the string
    // itself (lexicographically smaller ranks higher), so that the result
    // does not depend on hash map iteration order.
    fn cmpShells(context: void, a: KV, b: KV) std.math.Order {
        _ = context;
        const by_score = std.math.order(a.score, b.score);
        if (by_score != .eq) return by_score;
        return std.mem.order(u8, b.shell, a.shell);
    }

    // Returns the sections for at most `limit` most popular shells, highest
    // score first. Shells seen only once are never included; `limit` is
    // clamped to MaxShells. The caller must `deinit` the result.
    pub fn getSections(self: *Self, limit: u32) !ShellSections {
        var deque = PriorityDequeue(KV, void, cmpShells).init(self.allocator, {});
        defer deque.deinit();

        var it = self.counts.iterator();
        while (it.next()) |entry| {
            const count = entry.value_ptr.*;
            if (count == 1) continue;

            const shell = entry.key_ptr.*;
            try deque.add(KV{
                .shell = shell,
                .score = @as(u64, shell.len) * @as(u64, count),
            });
        }

        const wanted: usize = if (limit < MaxShells) limit else MaxShells;
        var popular = try BoundedArray([]const u8, MaxShells).init(0);
        while (popular.len < wanted) {
            const best = deque.removeMaxOrNull() orelse break;
            try popular.append(best.shell);
        }

        return ShellSections.init(self.allocator, popular);
    }
};

test "[]u8 comparison" {
    const s1: []const u8 = "/bin/bash";
    const s2: []const u8 = "/bin/bash";
    // Compare contents; expectEqual on slices compares pointer and length.
    try testing.expectEqualStrings(s1, s2);
}

test "basic shellpop" {
    var popcon = ShellPopcon.init(testing.allocator);
    defer popcon.deinit();

    const long_shell = "/bin/very-long-shell-name-ought-to-be-first";

    try popcon.put("/bin/bash");
    try popcon.put("/bin/bash");
    try popcon.put("/bin/bash");
    try popcon.put("/bin/zsh");
    try popcon.put("/bin/zsh");
    try popcon.put("/bin/zsh");
    try popcon.put("/bin/zsh");
    try popcon.put("/bin/nobody");
    try popcon.put(long_shell);
    try popcon.put(long_shell);

    // Scores: long_shell 43*2=86, zsh 8*4=32, bash 9*3=27, nobody is unique.
    var sections = try popcon.getSections(2);
    defer sections.deinit();

    try testing.expectEqual(@as(usize, 2), sections.index.len);

    // long_shell comes first: 43 bytes, padded to 44.
    try testing.expectEqual(@as(u10, 0), sections.index[0].offset);
    try testing.expectEqual(@as(u6, 42), sections.index[0].len);
    try testing.expectEqualStrings(long_shell, sections.blob[0..43]);

    // zsh follows at byte 44, i.e. unit 11.
    try testing.expectEqual(@as(u10, 11), sections.index[1].offset);
    try testing.expectEqual(@as(u6, 7), sections.index[1].len);
    try testing.expectEqualStrings("/bin/zsh", sections.blob[44..52]);
    try testing.expectEqual(@as(usize, 52), sections.blob.len);

    try testing.expectEqual(@as(?u10, 0), sections.getOffset(long_shell));
    try testing.expectEqual(@as(?u10, 11), sections.getOffset("/bin/zsh"));
    try testing.expectEqual(@as(?u10, null), sections.getOffset("/bin/bash"));
    try testing.expectEqual(@as(?u10, null), sections.getOffset("/bin/nobody"));
}

4
src/test_main.zig Normal file
View File

@ -0,0 +1,4 @@
// Single entry point for the test build: references the modules below so
// that they are compiled as part of this test file.
// NOTE(review): presumably this is what makes their `test` blocks run under
// the "test" build step -- confirm with the targeted Zig version.
test "turbonss test suite" {
_ = @import("main.zig");
_ = @import("shellpop.zig");
}

34
src/usergroups.zig Normal file
View File

@ -0,0 +1,34 @@
const std = @import("std");
// DB groups the user and group tables, each a string-keyed hash map.
// NOTE(review): keys are presumably the user/group names -- confirm against
// the code that fills these maps.
const DB = struct {
users: std.StringHashMap(User),
groups: std.StringHashMap(Group),
};
// Group is the unpacked representation of one group entry.
const Group = struct {
// Numeric group id.
gid: u32,
// Group name.
name: []const u8,
// Set of member strings. NOTE(review): presumably usernames -- confirm.
members: std.BufSet,
};
// User is the unpacked representation of one passwd entry.
const User = struct {
// Numeric user id.
uid: u32,
// Numeric group id. NOTE(review): presumably the primary group -- confirm.
gid: u32,
// Username.
name: []const u8,
// Comment ("gecos") field.
gecos: []const u8,
// Home directory.
home: []const u8,
// Login shell.
shell: []const u8,
// Set of group strings. NOTE(review): presumably the names of the
// additional groups this user belongs to -- confirm.
groups: std.BufSet,
};
// PackedUser is the bit-packed fixed-size part of a user record. The declared
// fields add up to 120 bits.
// NOTE(review): the string data these lengths refer to is not part of this
// struct; presumably it follows the record -- confirm against the writer.
const PackedUser = packed struct {
uid: u32,
gid: u32,
// NOTE(review): presumably an offset into the additional_gids section.
additional_gids_offset: u29,
// 1: the shell is stored with the user's own data; 0: it is stored in the
// external shells section.
shell_here: u1,
// Length of the shell when shell_here is set, otherwise the index of the
// shell in the external shell index.
// NOTE(review): u6 holds 0..63 while shells may be 64 bytes long; confirm
// whether lengths are stored as length-1.
shell_len_or_place: u6,
// NOTE(review): same u6 range question applies to the home directory length.
homedir_len: u6,
// NOTE(review): presumably set when the username is a suffix of another
// stored string, in which case the next field is an offset rather than a
// length -- confirm.
username_is_a_suffix: u1,
username_offset_or_len: u5,
// Length of the comment ("gecos") field.
gecos_len: u8,
};