From f584642cca41c6ae9f9849883fae4970153a1492 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?= Date: Tue, 15 Feb 2022 10:49:03 +0200 Subject: [PATCH] shellpop skeleton --- README.md | 44 ++++++++----- build.zig | 2 +- src/main.zig | 6 +- src/shellpop.zig | 153 +++++++++++++++++++++++++++++++++++++++++++++ src/test_main.zig | 4 ++ src/usergroups.zig | 34 ++++++++++ 6 files changed, 222 insertions(+), 21 deletions(-) create mode 100644 src/shellpop.zig create mode 100644 src/test_main.zig create mode 100644 src/usergroups.zig diff --git a/README.md b/README.md index 5bb82cc..6874ea9 100644 --- a/README.md +++ b/README.md @@ -66,10 +66,10 @@ consumed heap space for each separate turbonss instance will be minimal. Tight packing places some constraints on the underlying data: - Maximum database size: 4GB. -- Maximum length of username and groupname: 32 bytes. -- Maximum length of shell and homedir: 64 bytes. -- Maximum comment ("gecos") length: 256 bytes. -- Username and groupname must be utf8-encoded. +- Permitted length of username and groupname: 1-32 bytes. +- Permitted length of shell and homedir: 1-64 bytes. +- Permitted comment ("gecos") length: 0-255 bytes. +- Username, groupname and gecos must be utf8-encoded. Checking out and building ------------------------- @@ -156,7 +156,7 @@ OFFSET TYPE NAME DESCRIPTION 0 [4]u8 magic always 0xf09fa4b7 4 u8 version now `0` 5 u16 bom 0x1234 - 7 u8 padding + 7 u6 num_shells max value: 63 8 u32 num_users number of passwd entries 12 u32 num_groups number of group entries 16 u32 offset_cmph_uid2user @@ -165,9 +165,8 @@ OFFSET TYPE NAME DESCRIPTION 28 u32 offset_idx offset to the first idx_ section 32 u32 offset_groups 36 u32 offset_users - 40 u32 offset_shells - 44 u32 offset_groupmembers - 48 u32 offset_additional_gids + 40 u32 offset_groupmembers + 44 u32 offset_additional_gids ``` `magic` is 0xf09fa4b7, and `version` must be `0`. 
All integers are @@ -255,15 +254,25 @@ few examples: `/bin/bash`, `/usr/bin/nologin`, `/bin/zsh` among others. Therefore, "shells" have an optimization: they can be pointed by in the external list, or reside among the user's data. -64 (1>>6) most popular shells (i.e. referred to by at least two User entries) -are stored externally in "Shells" area. The less popular ones are stored with +63 most popular shells (i.e. referred to by at least two User entries) are +stored externally in "Shells" area. The less popular ones are stored with userdata. -The `shell_here=true` bit signifies that the shell is stored with userdata. -`false` means it is stored in the `Shells` section. If the shell is stored -"here", it is the first element in `stringdata`, and it's length is -`shell_len_or_place`. If it is stored externally, the latter variable points -to it's index in the external storage. +There are two "Shells" areas: the index and the blob. The index is a list of +structs which point to a location in the "blob" area: + +``` +const ShellIndex = struct { + offset: u10, + len: u6, +}; +``` + +In the user's struct the `shell_here=true` bit signifies that the shell is +stored with userdata. `false` means it is stored in the `Shells` section. If +the shell is stored "here", it is the first element in `stringdata`, and its +length is `shell_len_or_place`. If it is stored externally, the latter variable +points to its index in the ShellIndex area. Shells in the external storage are sorted by their weight, which is `length*frequency`. @@ -315,7 +324,7 @@ Each section is padded to 64 bytes. ``` SECTION SIZE DESCRIPTION -Header 52 see "Turbonss header" section +Header 48 see "Turbonss header" section cmph_gid2group ? gid->group cmph cmph_uid2user ? uid->user cmph cmph_groupname2group ?
groupname->group cmph @@ -324,9 +333,10 @@ idx_gid2group len(group)*4*29/32 cmph->offset gid2group idx_groupname2group len(group)*4*29/32 cmph->offset groupname2group idx_uid2user len(user)*4*29/32 cmph->offset uid2user idx_username2user len(user)*4*29/32 cmph->offset username2user +ShellIndex len(shells)*2 Shell index array +ShellBlob <= 4032 Shell data blob (max 63*64 bytes) Groups ? packed Group entries (8b padding) Users ? packed User entries (8b padding) -Shells ? See "Shells" section groupmembers ? per-group memberlist (32b padding) additional_gids ? per-user grouplist (8b padding) ``` diff --git a/build.zig b/build.zig index 3f88864..2ee1a59 100644 --- a/build.zig +++ b/build.zig @@ -68,7 +68,7 @@ pub fn build(b: *zbs.Builder) void { exe.install(); { - const turbonss_test = b.addTest("src/main.zig"); + const turbonss_test = b.addTest("src/test_main.zig"); addCmphDeps(turbonss_test, cmph); const test_step = b.step("test", "Run the tests"); test_step.dependOn(&turbonss_test.step); diff --git a/src/main.zig b/src/main.zig index c5b51fe..a68ab3f 100644 --- a/src/main.zig +++ b/src/main.zig @@ -14,7 +14,7 @@ pub fn main() !void {} test "simple cmph usage" { var arena_instance = std.heap.ArenaAllocator.init(std.heap.page_allocator); const arena = arena_instance.allocator(); - const stderr = std.io.getStdErr().writer(); + const stdout = std.io.getStdOut().writer(); var vector = std.ArrayList([*:0]const u8).init(arena); try vector.appendSlice(&.{ @@ -50,10 +50,10 @@ test "simple cmph usage" { hash = c.cmph_load(mphf_fd) orelse unreachable; defer c.cmph_destroy(hash); - try stderr.print("\n", .{}); + try stdout.print("\n", .{}); for (vector.items) |key| { var id = c.cmph_search(hash, key, @truncate(c_uint, c.strlen(key))); - try stderr.print("key: {s}, id: {d}\n", .{ key, id }); + try stdout.print("key: {s}, id: {d}\n", .{ key, id }); } } diff --git a/src/shellpop.zig b/src/shellpop.zig new file mode 100644 index 0000000..a3cebfb --- /dev/null +++ b/src/shellpop.zig 
@@ -0,0 +1,226 @@
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+const PriorityDequeue = std.PriorityDequeue;
+const StringArrayHashMap = std.StringArrayHashMap;
+const StringHashMap = std.StringHashMap;
+const BoundedArray = std.BoundedArray;
+const testing = std.testing;
+
+// ShellIndex is an index to the shell strings. A shell is 1..64 bytes long
+// and there are at most 63 shells, so the blob is at most 63*64 = 4032
+// bytes. To make every location resolvable in 10 bits, shells are padded to
+// 4 bytes in the blob and `offset` counts 4-byte units. `len` stores the
+// shell length minus one, so that the whole 1..64 range fits in 6 bits.
+const ShellIndex = struct {
+    offset: u10,
+    len: u6,
+};
+
+// MaxShells is the maximum number of "popular" shells.
+const MaxShells = 63;
+
+// MaxShellLen is the maximum length of a single shell, in bytes.
+const MaxShellLen = 64;
+
+// ShellAlignment is the padding of every shell in the blob, in bytes.
+const ShellAlignment = 4;
+
+// ShellPopcon is a shell popularity contest: collect shells and return the
+// popular ones, sorted by score. score := len(shell) * number_of_shells.
+// String values are copied by put() and released by deinit().
+const ShellPopcon = struct {
+    counts: std.StringHashMap(u32),
+    allocator: Allocator,
+    const Self = @This();
+    const KV = struct {
+        shell: []const u8,
+        // u64: a 64-byte shell times a u32 counter does not fit in u32.
+        score: u64,
+    };
+
+    // ShellSections holds the two "Shells" areas, the index and the blob,
+    // plus a lookup table from a shell to its offset in the blob.
+    const ShellSections = struct {
+        index: []ShellIndex,
+        blob: []const u8,
+
+        // offsets maps a shell to its blob offset in ShellAlignment units.
+        // The keys are slices of blob.
+        offsets: StringHashMap(u10),
+
+        // getOffset returns the blob offset of shell in ShellAlignment
+        // units, or null if shell is not in the sections.
+        pub fn getOffset(self: *const ShellSections, shell: []const u8) ?u10 {
+            return self.offsets.get(shell);
+        }
+
+        // init builds the sections from shells, keeping their order. All
+        // strings are copied into the blob, nothing is borrowed. The caller
+        // must call deinit(). Returns error.InvalidShellLength when a shell
+        // is empty or longer than MaxShellLen.
+        pub fn init(
+            allocator: Allocator,
+            shells: BoundedArray([]const u8, MaxShells),
+        ) error{ OutOfMemory, InvalidShellLength }!ShellSections {
+            const items = shells.constSlice();
+
+            var blob_len: usize = 0;
+            for (items) |shell| {
+                if (shell.len == 0 or shell.len > MaxShellLen) {
+                    return error.InvalidShellLength;
+                }
+                blob_len += paddedLen(shell.len);
+            }
+
+            const index = try allocator.alloc(ShellIndex, items.len);
+            errdefer allocator.free(index);
+            const blob = try allocator.alloc(u8, blob_len);
+            errdefer allocator.free(blob);
+            std.mem.set(u8, blob, 0);
+
+            var offsets = StringHashMap(u10).init(allocator);
+            errdefer offsets.deinit();
+
+            var pos: usize = 0;
+            for (items) |shell, i| {
+                const stored = blob[pos .. pos + shell.len];
+                std.mem.copy(u8, stored, shell);
+                // pos is at most 62*64, so pos/4 always fits in 10 bits.
+                const offset = @intCast(u10, pos / ShellAlignment);
+                index[i] = ShellIndex{
+                    .offset = offset,
+                    .len = @intCast(u6, shell.len - 1),
+                };
+                try offsets.put(stored, offset);
+                pos += paddedLen(shell.len);
+            }
+
+            return ShellSections{
+                .index = index,
+                .blob = blob,
+                .offsets = offsets,
+            };
+        }
+
+        // deinit frees the index, the blob and the lookup table.
+        pub fn deinit(self: *ShellSections) void {
+            const allocator = self.offsets.allocator;
+            self.offsets.deinit();
+            allocator.free(self.index);
+            allocator.free(self.blob);
+            self.* = undefined;
+        }
+
+        // paddedLen rounds len up to a multiple of ShellAlignment.
+        fn paddedLen(len: usize) usize {
+            return (len + ShellAlignment - 1) / ShellAlignment * ShellAlignment;
+        }
+    };
+
+    pub fn init(allocator: Allocator) Self {
+        return Self{
+            .counts = std.StringHashMap(u32).init(allocator),
+            .allocator = allocator,
+        };
+    }
+
+    // deinit frees the copied shell strings and the counters.
+    pub fn deinit(self: *Self) void {
+        var it = self.counts.keyIterator();
+        while (it.next()) |key_ptr| {
+            self.counts.allocator.free(key_ptr.*);
+        }
+        self.counts.deinit();
+        self.* = undefined;
+    }
+
+    // put counts one more use of shell. The string is copied the first time
+    // it is seen.
+    pub fn put(self: *Self, shell: []const u8) !void {
+        // TODO getOrPutAdapted may be more elegant, not sure which
+        // context to pass.
+        if (self.counts.getPtr(shell)) |count| {
+            count.* += 1;
+            return;
+        }
+        const owned = try self.allocator.dupe(u8, shell);
+        // do not leak the copy if the map fails to grow.
+        errdefer self.allocator.free(owned);
+        try self.counts.put(owned, 1);
+    }
+
+    fn cmpShells(context: void, a: KV, b: KV) std.math.Order {
+        _ = context;
+        return std.math.order(a.score, b.score);
+    }
+
+    // getSections returns the sections for at most limit (and at most
+    // MaxShells) highest-scoring shells, best first. Shells that were put
+    // only once are skipped. The caller must deinit() the result.
+    pub fn getSections(self: *Self, limit: u32) !ShellSections {
+        var deque = PriorityDequeue(KV, void, cmpShells).init(self.allocator, {});
+        defer deque.deinit();
+
+        var it = self.counts.iterator();
+        while (it.next()) |entry| {
+            if (entry.value_ptr.* == 1) {
+                continue;
+            }
+            const score = @as(u64, entry.key_ptr.*.len) * entry.value_ptr.*;
+            try deque.add(KV{ .shell = entry.key_ptr.*, .score = score });
+        }
+
+        var total: usize = deque.count();
+        if (total > limit) total = limit;
+        if (total > MaxShells) total = MaxShells;
+
+        var popular = BoundedArray([]const u8, MaxShells).init(0) catch unreachable;
+        var i: usize = 0;
+        while (i < total) : (i += 1) {
+            popular.appendAssumeCapacity(deque.removeMax().shell);
+        }
+
+        return ShellSections.init(self.allocator, popular);
+    }
+};
+
+test "[]u8 comparison" {
+    const s1: []const u8 = "/bin/bash";
+    const s2: []const u8 = "/bin/bash";
+    // expectEqual on slices compares pointers, compare the contents instead.
+    try testing.expectEqualStrings(s1, s2);
+}
+
+test "basic shellpop" {
+    var popcon = ShellPopcon.init(testing.allocator);
+    defer popcon.deinit();
+
+    const long = "/bin/very-long-shell-name-ought-to-be-first";
+    try popcon.put("/bin/bash");
+    try popcon.put("/bin/bash");
+    try popcon.put("/bin/bash");
+    try popcon.put("/bin/zsh");
+    try popcon.put("/bin/zsh");
+    try popcon.put("/bin/zsh");
+    try popcon.put("/bin/zsh");
+    try popcon.put("/bin/nobody");
+    try popcon.put(long);
+    try popcon.put(long);
+
+    var sections = try popcon.getSections(2);
+    defer sections.deinit();
+
+    // scores: long 2*43=86, zsh 4*8=32, bash 3*9=27; nobody was put once.
+    try testing.expectEqual(@as(usize, 2), sections.index.len);
+    try testing.expectEqual(@as(?u10, 0), sections.getOffset(long));
+    // long is padded from 43 to 44 bytes, that is 11 units of 4 bytes.
+    try testing.expectEqual(@as(?u10, 11), sections.getOffset("/bin/zsh"));
+    try testing.expectEqual(@as(?u10, null), sections.getOffset("/bin/bash"));
+    try testing.expectEqual(@as(?u10, null), sections.getOffset("/bin/nobody"));
+
+    try testing.expectEqual(@as(usize, 52), sections.blob.len);
+    try testing.expectEqualStrings(long, sections.blob[0..long.len]);
+    try testing.expectEqualStrings("/bin/zsh", sections.blob[44..52]);
+    try testing.expectEqual(@as(u6, 42), sections.index[0].len);
+    try testing.expectEqual(@as(u6, 7), sections.index[1].len);
+}
diff --git a/src/test_main.zig b/src/test_main.zig
new file mode 100644
index 0000000..92f8977
--- /dev/null
+++ b/src/test_main.zig
@@ -0,0 +1,4 @@
+test "turbonss test suite" {
+    _ = @import("main.zig");
+    _ = @import("shellpop.zig");
+}
diff --git a/src/usergroups.zig b/src/usergroups.zig
new file mode 100644
index 0000000..2cf9008
--- /dev/null
+++ b/src/usergroups.zig
@@ -0,0 +1,40 @@
+const std = @import("std");
+
+// DB holds the users and groups, each in a string-keyed hash map.
+const DB = struct {
+    users: std.StringHashMap(User),
+    groups: std.StringHashMap(Group),
+};
+
+// Group is a single group entry: gid, name and a BufSet of members.
+const Group = struct {
+    gid: u32,
+    name: []const u8,
+    members: std.BufSet,
+};
+
+// User is a single user entry: ids, strings and a BufSet of groups.
+const User = struct {
+    uid: u32,
+    gid: u32,
+    name: []const u8,
+    gecos: []const u8,
+    home: []const u8,
+    shell: []const u8,
+    groups: std.BufSet,
+};
+
+// PackedUser is the bit-packed form of a user record; its fields add up to
+// 120 bits. NOTE(review): shell_here and shell_len_or_place presumably follow
+// the README "Shells" section; confirm once the packing code lands.
+const PackedUser = packed struct {
+    uid: u32,
+    gid: u32,
+    additional_gids_offset: u29,
+    shell_here: u1,
+    shell_len_or_place: u6,
+    homedir_len: u6,
+    username_is_a_suffix: u1,
+    username_offset_or_len: u5,
+    gecos_len: u8,
+};