From e1bdb6c5296ae3517ba9f31adce1effbd3f8bcd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?= Date: Wed, 2 Mar 2022 11:05:20 +0200 Subject: [PATCH] more robust bdz for numbers + helpers --- src/bdz.zig | 32 ++++++++++---- src/cmph.zig | 85 ++++++++++++++++++++++++++++++++--- src/group.zig | 3 +- src/sections.zig | 112 ++++++++++++++++++++++++++++++----------------- src/shell.zig | 5 +-- 5 files changed, 176 insertions(+), 61 deletions(-) diff --git a/src/bdz.zig b/src/bdz.zig index 628d9d2..6da3771 100644 --- a/src/bdz.zig +++ b/src/bdz.zig @@ -1,12 +1,28 @@ const std = @import("std"); -const Allocator = std.mem.Allocator; -const c = @cImport({ - @cInclude("bdz.h"); -}); +extern fn bdz_search_packed(packed_mphf: [*]const u8, key: [*]const u8, len: c_uint) u32; -pub fn search(packed_mphf: []const u8, key: []const u8) error{Overflow}!u32 { - const bdz_start = @intToPtr(?*anyopaque, @ptrToInt(&packed_mphf[0])); - const len = try std.math.cast(c_uint, key.len); - return @as(u32, c.bdz_search_packed(bdz_start, key.ptr, len)); +pub fn search(packed_mphf: []const u8, key: []const u8) u32 { + const len = std.math.cast(c_uint, key.len) catch unreachable; + return @as(u32, bdz_search_packed(packed_mphf.ptr, key.ptr, len)); +} + +pub fn search_u32(packed_mphf: []const u8, key: u32) u32 { + return search(packed_mphf, unzero(key)[0..]); +} + +// encode a u32 to 5 bytes so no bytes is a '\0'. +// +// TODO(motiejus) figure out how to use cmph_io_byte_vector_adapter, so cmph +// packing would accept zero bytes. For now we will be doing a dance of not +// passing zero bytes. +pub fn unzero(x: u32) [5]u8 { + const one: u8 = 0b10000000; + var buf: [5]u8 = undefined; + buf[0] = @truncate(u8, (x & 0b11111110_00000000_00000000_00000000) >> 25) | one; + buf[1] = @truncate(u8, (x & 0b00000001_11111100_00000000_00000000) >> 18) | one; + buf[2] = @truncate(u8, (x & 0b00000000_00000011_11110000_00000000) >> 12) | one; + buf[3] = @truncate(u8, (x & 0b00000000_00000000_00001111_11000000) >> 6) | one; + buf[4] = @truncate(u8, (x & 0b00000000_00000000_00000000_00111111) >> 0) | one; + return buf; } diff --git a/src/cmph.zig b/src/cmph.zig index 3f36577..eaefc75 100644 --- a/src/cmph.zig +++ b/src/cmph.zig @@ -1,5 +1,8 @@ const std = @import("std"); const Allocator = std.mem.Allocator; +const math = std.math; +const sort = std.sort; + const bdz = @import("bdz.zig"); const c = @cImport({ @@ -9,11 +12,12 @@ const c = @cImport({ // pack packs cmph hashes for the given input and returns a slice ("cmph pack // minus first 4 bytes") for further storage. The slice must be freed by the // caller. -const packErr = Allocator.Error || error{Overflow}; -pub fn pack(allocator: Allocator, input: [][*:0]const u8) packErr![]const u8 { - var cvector = @ptrCast([*c][*c]u8, input.ptr); - const len = try std.math.cast(c_uint, input.len); - var source = c.cmph_io_vector_adapter(cvector, len); +pub const Error = Allocator.Error || error{Overflow}; +pub fn pack(allocator: Allocator, input: [][*:0]const u8) Error![]const u8 { + var source = c.cmph_io_vector_adapter( + @ptrCast(*[*c]u8, input.ptr), + try math.cast(c_uint, input.len), + ); defer c.cmph_io_vector_adapter_destroy(source); var config: *c.cmph_config_t = c.cmph_config_new(source) orelse return error.OutOfMemory; @@ -29,6 +33,30 @@ pub fn pack(allocator: Allocator, input: [][*:0]const u8) packErr![]const u8 { return buf[4..]; } +// perfect-hash a list of numbers and return the packed mphf +pub fn pack_u32(allocator: Allocator, numbers: []const u32) Error![]const u8 { + var keys: [][6]u8 = try allocator.alloc([6]u8, numbers.len); + defer allocator.free(keys); + for (numbers) |n, i| + keys[i] = unzeroZ(n); + + var keys2 = try allocator.alloc([*:0]const u8, numbers.len); + defer allocator.free(keys2); + for (keys) |_, i| + keys2[i] = @ptrCast([*:0]const u8, &keys[i]); + return pack(allocator, keys2); +} + +// perfect-hash a list of strings and return the packed mphf +pub fn pack_str(allocator: Allocator, strings: []const []const u8) Error![]const u8 { + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + var keys = try arena.allocator().alloc([*:0]const u8, strings.len); + for (strings) |_, i| + keys[i] = try arena.allocator().dupeZ(u8, strings[i]); + return pack(allocator, keys); +} + const testing = std.testing; const items = .{ @@ -60,7 +88,7 @@ test "basic pack/unpack" { var used: [items_len]bool = undefined; inline for (items) |elem| { - const hashed = try bdz.search(buf, elem); + const hashed = bdz.search(buf, elem); used[hashed] = true; } @@ -68,3 +96,48 @@ test "basic pack/unpack" { try testing.expect(item); } } + +// encodes a u32 to 6 bytes so no bytes except the last one is a '\0'. +// This is useful for cmph-packing, where it accepts 0-terminated char*s. +pub fn unzeroZ(x: u32) [6]u8 { + var buf: [6]u8 = undefined; + std.mem.copy(u8, buf[0..], bdz.unzero(x)[0..]); + buf[5] = 0; + return buf; +} + +test "unzeroZ" { + const result = unzeroZ(0); + try testing.expect(result[0] != 0); + try testing.expect(result[1] != 0); + try testing.expect(result[2] != 0); + try testing.expect(result[3] != 0); + try testing.expect(result[4] != 0); + try testing.expect(result[5] == 0); +} + +test "pack u32" { + const keys = &[_]u32{ 42, 1, math.maxInt(u32), 2 }; + const packed_mphf = try pack_u32(testing.allocator, keys); + defer testing.allocator.free(packed_mphf); + var hashes: [keys.len]u32 = undefined; + for (keys) |key, i| { + hashes[i] = bdz.search_u32(packed_mphf, key); + } + sort.sort(u32, hashes[0..], {}, comptime sort.asc(u32)); + for (hashes) |hash, i| + try testing.expectEqual(i, hash); +} + +test "pack str" { + const keys = &[_][]const u8{ "foo", "bar", "baz", "1", "2", "3" }; + const packed_mphf = try pack_str(testing.allocator, keys[0..]); + defer testing.allocator.free(packed_mphf); + var hashes: [keys.len]u32 = undefined; + for (keys) |key, i| { + hashes[i] = bdz.search(packed_mphf, key); + } + sort.sort(u32, hashes[0..], {}, comptime sort.asc(u32)); + for (hashes) |hash, i| + try testing.expectEqual(i, hash); +} diff --git a/src/group.zig b/src/group.zig index 3187680..b2d3fd3 100644 --- a/src/group.zig +++ b/src/group.zig @@ -15,8 +15,7 @@ pub const Group = struct { members: BufSet, pub fn clone(self: *const Group, allocator: Allocator) Allocator.Error!Group { - var name = try allocator.alloc(u8, self.name.len); - mem.copy(u8, name, self.name); + var name = try allocator.dupe(u8, self.name); return Group{ .gid = self.gid, .name = name, diff --git a/src/sections.zig b/src/sections.zig index 996b0b1..806e672 100644 --- a/src/sections.zig +++ b/src/sections.zig @@ -1,9 +1,12 @@ const std = @import("std"); -const unicode = std.unicode; +const fmt = std.fmt; +const math = std.math; const sort = std.sort; +const unicode = std.unicode; const Allocator = std.mem.Allocator; const ArrayListUnmanaged = std.ArrayListUnmanaged; const ArrayList = std.ArrayList; +const MultiArrayList = std.MultiArrayList; const StringHashMap = std.StringHashMap; const AutoHashMap = std.AutoHashMap; const BufSet = std.BufSet; @@ -12,6 +15,8 @@ const pad = @import("padding.zig"); const compress = @import("compress.zig"); const userImport = @import("user.zig"); const groupImport = @import("group.zig"); +const cmph = @import("cmph.zig"); +const bdz = @import("bdz.zig"); const User = userImport.User; const Group = groupImport.Group; @@ -23,6 +28,10 @@ const Corpus = struct { // sorted by gid groups: []Group, + // convenience users and groups by column + usersMulti: MultiArrayList(User), + groupsMulti: MultiArrayList(Group), + // pointing to `users` and `groups` slices above. name2user: StringHashMap(*const User), uid2user: AutoHashMap(u32, *const User), @@ -54,6 +63,15 @@ const Corpus = struct { sort.sort(User, users, {}, cmpUser); sort.sort(Group, groups, {}, cmpGroup); + var usersMulti = MultiArrayList(User){}; + try usersMulti.ensureTotalCapacity(allocator, users.len); + for (users) |user| + usersMulti.appendAssumeCapacity(user); + var groupsMulti = MultiArrayList(Group){}; + try groupsMulti.ensureTotalCapacity(allocator, groups.len); + for (groups) |group| + groupsMulti.appendAssumeCapacity(group); + var name2user = StringHashMap(*const User).init(allocator); var uid2user = AutoHashMap(u32, *const User).init(allocator); var name2group = StringHashMap(*const Group).init(allocator); @@ -82,22 +100,22 @@ const Corpus = struct { res2.value_ptr.* = group; } - var groupname2users = StringHashMap( - ArrayListUnmanaged(*const User), - ).init(baseAllocator); + var groupname2users = StringHashMap([]*const User).init(allocator); + + // uses baseAllocator, because it will be freed before + // returning from this function. This keeps the arena clean. var username2groups = StringHashMap( ArrayListUnmanaged(*const Group), ).init(baseAllocator); for (groups) |*group| { - var members = try ArrayListUnmanaged(*const User).initCapacity( - allocator, - group.members.count(), - ); + var members = try allocator.alloc(*const User, group.members.count()); + members.len = 0; var it = group.members.iterator(); while (it.next()) |memberName| { if (name2user.get(memberName.*)) |user| { - members.appendAssumeCapacity(user); + members.len += 1; + members[members.len - 1] = user; } else { return error.NotFound; } @@ -114,24 +132,14 @@ const Corpus = struct { result.value_ptr.* = members; } - { - var it = groupname2users.valueIterator(); - while (it.next()) |groupUsers| - sort.sort(*const User, groupUsers.items, {}, cmpUserPtr); + var it1 = groupname2users.valueIterator(); + while (it1.next()) |groupUsers| { + sort.sort(*const User, groupUsers.*, {}, cmpUserPtr); } - { - var it = username2groups.valueIterator(); - while (it.next()) |userGroups| - sort.sort(*const Group, userGroups.items, {}, cmpGroupPtr); - } - - var groupname2users_final = StringHashMap([]*const User).init(allocator); - for (groups) |group| { - const groupUsers = groupname2users.get(group.name).?.toOwnedSlice(allocator); - try groupname2users_final.put(group.name, groupUsers); - } - groupname2users.deinit(); + var it2 = username2groups.valueIterator(); + while (it2.next()) |userGroups| + sort.sort(*const Group, userGroups.items, {}, cmpGroupPtr); var username2groups_final = StringHashMap([]*const Group).init(allocator); for (users) |user| { @@ -144,11 +152,13 @@ const Corpus = struct { .arena = arena, .users = users, .groups = groups, + .usersMulti = usersMulti, + .groupsMulti = groupsMulti, .name2user = name2user, .uid2user = uid2user, .name2group = name2group, .gid2group = gid2group, - .groupname2users = groupname2users_final, + .groupname2users = groupname2users, .username2groups = username2groups_final, }; } @@ -233,19 +243,17 @@ fn cmpGroupPtr(context: void, a: *const Group, b: *const Group) bool { const testing = std.testing; -test "test corpus" { - const allocator = testing.allocator; - +fn testCorpus(allocator: Allocator) !Corpus { const users = [_]User{ User{ - .uid = 1000, - .gid = 1000, + .uid = 128, + .gid = 128, .name = "vidmantas", .gecos = "Vidmantas Kaminskas", .home = "/home/vidmantas", .shell = "/bin/bash", }, User{ .uid = 0, - .gid = std.math.maxInt(u32), + .gid = math.maxInt(u32), .name = "Name" ** 8, .gecos = "Gecos" ** 51, .home = "Home" ** 16, @@ -278,7 +286,7 @@ test "test corpus" { defer members3.deinit(); const groups = [_]Group{ Group{ - .gid = 1000, + .gid = 128, .name = "vidmantas", .members = members1, }, Group{ @@ -286,25 +294,30 @@ test "test corpus" { .name = "all", .members = members3, }, Group{ - .gid = 1234, + .gid = 0, .name = "service-account", .members = members2, } }; - var corpus = try Corpus.init(allocator, users[0..], groups[0..]); + return try Corpus.init(allocator, users[0..], groups[0..]); +} + +test "test corpus" { + var corpus = try testCorpus(testing.allocator); defer corpus.deinit(); + try testing.expectEqualStrings(corpus.users[0].name, "Name" ** 8); try testing.expectEqualStrings(corpus.users[1].name, "svc-bar"); try testing.expectEqualStrings(corpus.users[2].name, "vidmantas"); try testing.expectEqual(corpus.name2user.get("404"), null); - try testing.expectEqual(corpus.name2user.get("vidmantas").?.uid, 1000); + try testing.expectEqual(corpus.name2user.get("vidmantas").?.uid, 128); try testing.expectEqual(corpus.uid2user.get(42), null); - try testing.expectEqual(corpus.uid2user.get(1000).?.gid, 1000); + try testing.expectEqual(corpus.uid2user.get(128).?.gid, 128); try testing.expectEqual(corpus.name2group.get("404"), null); - try testing.expectEqual(corpus.name2group.get("vidmantas").?.gid, 1000); + try testing.expectEqual(corpus.name2group.get("vidmantas").?.gid, 128); try testing.expectEqual(corpus.gid2group.get(42), null); - try testing.expectEqual(corpus.gid2group.get(1000).?.gid, 1000); + try testing.expectEqual(corpus.gid2group.get(128).?.gid, 128); const membersOfAll = corpus.groupname2users.get("all").?; try testing.expectEqualStrings(membersOfAll[0].name, "Name" ** 8); @@ -313,12 +326,29 @@ test "test corpus" { try testing.expectEqual(corpus.groupname2users.get("404"), null); const groupsOfVidmantas = corpus.username2groups.get("vidmantas").?; - try testing.expectEqual(groupsOfVidmantas[0].gid, 1000); - try testing.expectEqual(groupsOfVidmantas[1].gid, 1234); + try testing.expectEqual(groupsOfVidmantas[0].gid, 0); + try testing.expectEqual(groupsOfVidmantas[1].gid, 128); try testing.expectEqual(groupsOfVidmantas[2].gid, 9999); try testing.expectEqual(corpus.username2groups.get("404"), null); } +test "pack gids" { + const allocator = testing.allocator; + var corpus = try testCorpus(allocator); + defer corpus.deinit(); + + const cmph_gid = try cmph.pack_u32(allocator, corpus.groupsMulti.items(.gid)); + defer allocator.free(cmph_gid); + + const k1 = bdz.search_u32(cmph_gid, 0); + const k2 = bdz.search_u32(cmph_gid, 128); + const k3 = bdz.search_u32(cmph_gid, 9999); + var hashes = &[_]u32{ k1, k2, k3 }; + sort.sort(u32, hashes, {}, comptime sort.asc(u32)); + for (hashes) |hash, i| + try testing.expectEqual(i, hash); +} + fn testUser(name: []const u8) User { var result = std.mem.zeroes(User); result.name = name; diff --git a/src/shell.zig b/src/shell.zig index a6ad8f0..aaa591f 100644 --- a/src/shell.zig +++ b/src/shell.zig @@ -119,10 +119,7 @@ pub const ShellWriter = struct { if (res.found_existing) { res.value_ptr.* += 1; } else { - // TODO(motiejus): can we avoid `ourShell` variable here? - const ourShell = try self.allocator.alloc(u8, shell.len); - std.mem.copy(u8, ourShell, shell); - res.key_ptr.* = ourShell; + res.key_ptr.* = try self.allocator.dupe(u8, shell); res.value_ptr.* = 1; } }