1
Fork 0

more robust bdz for numbers + helpers

main
Motiejus Jakštys 2022-03-02 11:05:20 +02:00 committed by Motiejus Jakštys
parent 4fc54e5b65
commit e1bdb6c529
5 changed files with 176 additions and 61 deletions

View File

@ -1,12 +1,28 @@
const std = @import("std");
const Allocator = std.mem.Allocator;
const c = @cImport({
@cInclude("bdz.h");
});
extern fn bdz_search_packed(packed_mphf: [*]const u8, key: [*]const u8, len: c_uint) u32;
pub fn search(packed_mphf: []const u8, key: []const u8) error{Overflow}!u32 {
const bdz_start = @intToPtr(?*anyopaque, @ptrToInt(&packed_mphf[0]));
const len = try std.math.cast(c_uint, key.len);
return @as(u32, c.bdz_search_packed(bdz_start, key.ptr, len));
// Looks up `key` in the packed hash function `packed_mphf` and returns the
// value reported by cmph's bdz_search_packed.
//
// The key length has to fit in a c_uint; a longer key is treated as a
// programmer error (`unreachable`) rather than reported to the caller.
// NOTE(review): presumably `packed_mphf` is the slice produced by the cmph
// packing helpers -- confirm against the callers.
pub fn search(packed_mphf: []const u8, key: []const u8) u32 {
    const key_len = std.math.cast(c_uint, key.len) catch unreachable;
    const hash = bdz_search_packed(packed_mphf.ptr, key.ptr, key_len);
    return @as(u32, hash);
}
// Looks up a u32 key: the number is first encoded with unzero() and the
// resulting 5 bytes are searched like any other byte-string key.
pub fn search_u32(packed_mphf: []const u8, key: u32) u32 {
    const encoded = unzero(key);
    return search(packed_mphf, encoded[0..]);
}
// Encodes a u32 as 5 bytes, none of which is '\0'.
//
// The 32 input bits are split, most significant first, into groups of
// 7, 7, 6, 6 and 6 bits. Every output byte carries one group in its low bits
// and has its top bit set, which is what guarantees it is never zero.
//
// TODO(motiejus) figure out how to use cmph_io_byte_vector_adapter, so cmph
// packing would accept zero bytes. For now we will be doing a dance of not
// passing zero bytes.
pub fn unzero(x: u32) [5]u8 {
    const high_bit: u8 = 0x80;
    return [5]u8{
        @truncate(u8, (x >> 25) & 0x7f) | high_bit, // bits 31..25
        @truncate(u8, (x >> 18) & 0x7f) | high_bit, // bits 24..18
        @truncate(u8, (x >> 12) & 0x3f) | high_bit, // bits 17..12
        @truncate(u8, (x >> 6) & 0x3f) | high_bit, // bits 11..6
        @truncate(u8, x & 0x3f) | high_bit, // bits 5..0
    };
}

View File

@ -1,5 +1,8 @@
const std = @import("std");
const Allocator = std.mem.Allocator;
const math = std.math;
const sort = std.sort;
const bdz = @import("bdz.zig");
const c = @cImport({
@ -9,11 +12,12 @@ const c = @cImport({
// pack packs cmph hashes for the given input and returns a slice ("cmph pack
// minus first 4 bytes") for further storage. The slice must be freed by the
// caller.
const packErr = Allocator.Error || error{Overflow};
pub fn pack(allocator: Allocator, input: [][*:0]const u8) packErr![]const u8 {
var cvector = @ptrCast([*c][*c]u8, input.ptr);
const len = try std.math.cast(c_uint, input.len);
var source = c.cmph_io_vector_adapter(cvector, len);
pub const Error = Allocator.Error || error{Overflow};
pub fn pack(allocator: Allocator, input: [][*:0]const u8) Error![]const u8 {
var source = c.cmph_io_vector_adapter(
@ptrCast(*[*c]u8, input.ptr),
try math.cast(c_uint, input.len),
);
defer c.cmph_io_vector_adapter_destroy(source);
var config: *c.cmph_config_t = c.cmph_config_new(source) orelse
return error.OutOfMemory;
@ -29,6 +33,30 @@ pub fn pack(allocator: Allocator, input: [][*:0]const u8) packErr![]const u8 {
return buf[4..];
}
// Perfect-hashes a list of numbers and returns the packed mphf.
//
// Each number is encoded with unzeroZ() into a 0-terminated, otherwise
// zero-free 6-byte key, because pack() takes 0-terminated strings. Both
// scratch buffers are released before returning; the result is whatever
// pack() returns for `allocator`.
pub fn pack_u32(allocator: Allocator, numbers: []const u32) Error![]const u8 {
    // backing storage for the encoded keys
    var encoded: [][6]u8 = try allocator.alloc([6]u8, numbers.len);
    defer allocator.free(encoded);
    for (encoded) |*slot, i|
        slot.* = unzeroZ(numbers[i]);

    // pointer table handed to pack(); entries point into `encoded`
    var key_ptrs = try allocator.alloc([*:0]const u8, numbers.len);
    defer allocator.free(key_ptrs);
    for (encoded) |*key, i|
        key_ptrs[i] = @ptrCast([*:0]const u8, key);

    return pack(allocator, key_ptrs);
}
// Perfect-hashes a list of strings and returns the packed mphf.
//
// pack() takes 0-terminated keys, so every string is copied with a trailing
// 0 into a temporary arena. The arena (key table and copies) is torn down on
// return; only the slice returned by pack() outlives this call.
// NOTE(review): a string containing a 0 byte would presumably be seen by
// cmph as ending at that byte -- confirm if such keys can occur.
pub fn pack_str(allocator: Allocator, strings: []const []const u8) Error![]const u8 {
    var arena = std.heap.ArenaAllocator.init(allocator);
    defer arena.deinit();

    var keys = try arena.allocator().alloc([*:0]const u8, strings.len);
    for (strings) |string, i| {
        const terminated = try arena.allocator().dupeZ(u8, string);
        keys[i] = terminated;
    }
    return pack(allocator, keys);
}
const testing = std.testing;
const items = .{
@ -60,7 +88,7 @@ test "basic pack/unpack" {
var used: [items_len]bool = undefined;
inline for (items) |elem| {
const hashed = try bdz.search(buf, elem);
const hashed = bdz.search(buf, elem);
used[hashed] = true;
}
@ -68,3 +96,48 @@ test "basic pack/unpack" {
try testing.expect(item);
}
}
// Encodes a u32 as 6 bytes: the 5 non-zero bytes produced by bdz.unzero()
// followed by a terminating '\0'.
// This is useful for cmph-packing, which accepts 0-terminated char*s.
pub fn unzeroZ(x: u32) [6]u8 {
    const encoded = bdz.unzero(x);
    var result: [6]u8 = undefined;
    for (encoded) |byte, i|
        result[i] = byte;
    result[5] = 0; // terminator
    return result;
}
// Even for the input 0, only the terminator may be a zero byte.
test "unzeroZ" {
    const encoded = unzeroZ(0);
    for (encoded[0..5]) |byte|
        try testing.expect(byte != 0);
    try testing.expect(encoded[5] == 0);
}
// Packs a handful of numbers and checks that looking each of them up yields,
// once sorted, exactly the sequence 0, 1, 2, ...: every key gets its own slot
// and no slot is skipped.
test "pack u32" {
    const numbers = [_]u32{ 42, 1, math.maxInt(u32), 2 };
    const mphf = try pack_u32(testing.allocator, numbers[0..]);
    defer testing.allocator.free(mphf);

    var found: [numbers.len]u32 = undefined;
    for (numbers) |number, i|
        found[i] = bdz.search_u32(mphf, number);

    sort.sort(u32, found[0..], {}, comptime sort.asc(u32));
    for (found) |hash, expected|
        try testing.expectEqual(expected, hash);
}
// Packs a handful of strings and checks that looking each of them up yields,
// once sorted, exactly the sequence 0, 1, 2, ...: every key gets its own slot
// and no slot is skipped.
test "pack str" {
    const names = [_][]const u8{ "foo", "bar", "baz", "1", "2", "3" };
    const mphf = try pack_str(testing.allocator, names[0..]);
    defer testing.allocator.free(mphf);

    var found: [names.len]u32 = undefined;
    for (names) |name, i|
        found[i] = bdz.search(mphf, name);

    sort.sort(u32, found[0..], {}, comptime sort.asc(u32));
    for (found) |hash, expected|
        try testing.expectEqual(expected, hash);
}

View File

@ -15,8 +15,7 @@ pub const Group = struct {
members: BufSet,
pub fn clone(self: *const Group, allocator: Allocator) Allocator.Error!Group {
var name = try allocator.alloc(u8, self.name.len);
mem.copy(u8, name, self.name);
var name = try allocator.dupe(u8, self.name);
return Group{
.gid = self.gid,
.name = name,

View File

@ -1,9 +1,12 @@
const std = @import("std");
const unicode = std.unicode;
const fmt = std.fmt;
const math = std.math;
const sort = std.sort;
const unicode = std.unicode;
const Allocator = std.mem.Allocator;
const ArrayListUnmanaged = std.ArrayListUnmanaged;
const ArrayList = std.ArrayList;
const MultiArrayList = std.MultiArrayList;
const StringHashMap = std.StringHashMap;
const AutoHashMap = std.AutoHashMap;
const BufSet = std.BufSet;
@ -12,6 +15,8 @@ const pad = @import("padding.zig");
const compress = @import("compress.zig");
const userImport = @import("user.zig");
const groupImport = @import("group.zig");
const cmph = @import("cmph.zig");
const bdz = @import("bdz.zig");
const User = userImport.User;
const Group = groupImport.Group;
@ -23,6 +28,10 @@ const Corpus = struct {
// sorted by gid
groups: []Group,
// convenience users and groups by column
usersMulti: MultiArrayList(User),
groupsMulti: MultiArrayList(Group),
// pointing to `users` and `groups` slices above.
name2user: StringHashMap(*const User),
uid2user: AutoHashMap(u32, *const User),
@ -54,6 +63,15 @@ const Corpus = struct {
sort.sort(User, users, {}, cmpUser);
sort.sort(Group, groups, {}, cmpGroup);
var usersMulti = MultiArrayList(User){};
try usersMulti.ensureTotalCapacity(allocator, users.len);
for (users) |user|
usersMulti.appendAssumeCapacity(user);
var groupsMulti = MultiArrayList(Group){};
try groupsMulti.ensureTotalCapacity(allocator, groups.len);
for (groups) |group|
groupsMulti.appendAssumeCapacity(group);
var name2user = StringHashMap(*const User).init(allocator);
var uid2user = AutoHashMap(u32, *const User).init(allocator);
var name2group = StringHashMap(*const Group).init(allocator);
@ -82,22 +100,22 @@ const Corpus = struct {
res2.value_ptr.* = group;
}
var groupname2users = StringHashMap(
ArrayListUnmanaged(*const User),
).init(baseAllocator);
var groupname2users = StringHashMap([]*const User).init(allocator);
// uses baseAllocator, because it will be freed before
// returning from this function. This keeps the arena clean.
var username2groups = StringHashMap(
ArrayListUnmanaged(*const Group),
).init(baseAllocator);
for (groups) |*group| {
var members = try ArrayListUnmanaged(*const User).initCapacity(
allocator,
group.members.count(),
);
var members = try allocator.alloc(*const User, group.members.count());
members.len = 0;
var it = group.members.iterator();
while (it.next()) |memberName| {
if (name2user.get(memberName.*)) |user| {
members.appendAssumeCapacity(user);
members.len += 1;
members[members.len - 1] = user;
} else {
return error.NotFound;
}
@ -114,24 +132,14 @@ const Corpus = struct {
result.value_ptr.* = members;
}
{
var it = groupname2users.valueIterator();
while (it.next()) |groupUsers|
sort.sort(*const User, groupUsers.items, {}, cmpUserPtr);
var it1 = groupname2users.valueIterator();
while (it1.next()) |groupUsers| {
sort.sort(*const User, groupUsers.*, {}, cmpUserPtr);
}
{
var it = username2groups.valueIterator();
while (it.next()) |userGroups|
sort.sort(*const Group, userGroups.items, {}, cmpGroupPtr);
}
var groupname2users_final = StringHashMap([]*const User).init(allocator);
for (groups) |group| {
const groupUsers = groupname2users.get(group.name).?.toOwnedSlice(allocator);
try groupname2users_final.put(group.name, groupUsers);
}
groupname2users.deinit();
var it2 = username2groups.valueIterator();
while (it2.next()) |userGroups|
sort.sort(*const Group, userGroups.items, {}, cmpGroupPtr);
var username2groups_final = StringHashMap([]*const Group).init(allocator);
for (users) |user| {
@ -144,11 +152,13 @@ const Corpus = struct {
.arena = arena,
.users = users,
.groups = groups,
.usersMulti = usersMulti,
.groupsMulti = groupsMulti,
.name2user = name2user,
.uid2user = uid2user,
.name2group = name2group,
.gid2group = gid2group,
.groupname2users = groupname2users_final,
.groupname2users = groupname2users,
.username2groups = username2groups_final,
};
}
@ -233,19 +243,17 @@ fn cmpGroupPtr(context: void, a: *const Group, b: *const Group) bool {
const testing = std.testing;
test "test corpus" {
const allocator = testing.allocator;
fn testCorpus(allocator: Allocator) !Corpus {
const users = [_]User{ User{
.uid = 1000,
.gid = 1000,
.uid = 128,
.gid = 128,
.name = "vidmantas",
.gecos = "Vidmantas Kaminskas",
.home = "/home/vidmantas",
.shell = "/bin/bash",
}, User{
.uid = 0,
.gid = std.math.maxInt(u32),
.gid = math.maxInt(u32),
.name = "Name" ** 8,
.gecos = "Gecos" ** 51,
.home = "Home" ** 16,
@ -278,7 +286,7 @@ test "test corpus" {
defer members3.deinit();
const groups = [_]Group{ Group{
.gid = 1000,
.gid = 128,
.name = "vidmantas",
.members = members1,
}, Group{
@ -286,25 +294,30 @@ test "test corpus" {
.name = "all",
.members = members3,
}, Group{
.gid = 1234,
.gid = 0,
.name = "service-account",
.members = members2,
} };
var corpus = try Corpus.init(allocator, users[0..], groups[0..]);
return try Corpus.init(allocator, users[0..], groups[0..]);
}
test "test corpus" {
var corpus = try testCorpus(testing.allocator);
defer corpus.deinit();
try testing.expectEqualStrings(corpus.users[0].name, "Name" ** 8);
try testing.expectEqualStrings(corpus.users[1].name, "svc-bar");
try testing.expectEqualStrings(corpus.users[2].name, "vidmantas");
try testing.expectEqual(corpus.name2user.get("404"), null);
try testing.expectEqual(corpus.name2user.get("vidmantas").?.uid, 1000);
try testing.expectEqual(corpus.name2user.get("vidmantas").?.uid, 128);
try testing.expectEqual(corpus.uid2user.get(42), null);
try testing.expectEqual(corpus.uid2user.get(1000).?.gid, 1000);
try testing.expectEqual(corpus.uid2user.get(128).?.gid, 128);
try testing.expectEqual(corpus.name2group.get("404"), null);
try testing.expectEqual(corpus.name2group.get("vidmantas").?.gid, 1000);
try testing.expectEqual(corpus.name2group.get("vidmantas").?.gid, 128);
try testing.expectEqual(corpus.gid2group.get(42), null);
try testing.expectEqual(corpus.gid2group.get(1000).?.gid, 1000);
try testing.expectEqual(corpus.gid2group.get(128).?.gid, 128);
const membersOfAll = corpus.groupname2users.get("all").?;
try testing.expectEqualStrings(membersOfAll[0].name, "Name" ** 8);
@ -313,12 +326,29 @@ test "test corpus" {
try testing.expectEqual(corpus.groupname2users.get("404"), null);
const groupsOfVidmantas = corpus.username2groups.get("vidmantas").?;
try testing.expectEqual(groupsOfVidmantas[0].gid, 1000);
try testing.expectEqual(groupsOfVidmantas[1].gid, 1234);
try testing.expectEqual(groupsOfVidmantas[0].gid, 0);
try testing.expectEqual(groupsOfVidmantas[1].gid, 128);
try testing.expectEqual(groupsOfVidmantas[2].gid, 9999);
try testing.expectEqual(corpus.username2groups.get("404"), null);
}
// Packs the gid column of the test corpus and checks that looking up the
// gids 0, 128 and 9999 yields, once sorted, exactly 0, 1, 2.
test "pack gids" {
    const allocator = testing.allocator;
    var corpus = try testCorpus(allocator);
    defer corpus.deinit();

    const cmph_gid = try cmph.pack_u32(allocator, corpus.groupsMulti.items(.gid));
    defer allocator.free(cmph_gid);

    // The hashes live in a named local array and are sorted through a slice
    // of it. Sorting through `&[_]u32{ ... }` would mutate an array-literal
    // temporary, which is not something to rely on being writable.
    var hashes = [_]u32{
        bdz.search_u32(cmph_gid, 0),
        bdz.search_u32(cmph_gid, 128),
        bdz.search_u32(cmph_gid, 9999),
    };
    sort.sort(u32, hashes[0..], {}, comptime sort.asc(u32));
    for (hashes) |hash, i|
        try testing.expectEqual(i, hash);
}
fn testUser(name: []const u8) User {
var result = std.mem.zeroes(User);
result.name = name;

View File

@ -119,10 +119,7 @@ pub const ShellWriter = struct {
if (res.found_existing) {
res.value_ptr.* += 1;
} else {
// TODO(motiejus): can we avoid `ourShell` variable here?
const ourShell = try self.allocator.alloc(u8, shell.len);
std.mem.copy(u8, ourShell, shell);
res.key_ptr.* = ourShell;
res.key_ptr.* = try self.allocator.dupe(u8, shell);
res.value_ptr.* = 1;
}
}