more robust bdz for numbers + helpers

This commit is contained in:
2022-03-02 11:05:20 +02:00
committed by Motiejus Jakštys
parent 4fc54e5b65
commit e1bdb6c529
5 changed files with 176 additions and 61 deletions

View File

@@ -1,5 +1,8 @@
const std = @import("std");
const Allocator = std.mem.Allocator;
const math = std.math;
const sort = std.sort;
const bdz = @import("bdz.zig");
const c = @cImport({
@@ -9,11 +12,12 @@ const c = @cImport({
// pack packs cmph hashes for the given input and returns a slice ("cmph pack
// minus first 4 bytes") for further storage. The slice must be freed by the
// caller.
const packErr = Allocator.Error || error{Overflow};
pub fn pack(allocator: Allocator, input: [][*:0]const u8) packErr![]const u8 {
var cvector = @ptrCast([*c][*c]u8, input.ptr);
const len = try std.math.cast(c_uint, input.len);
var source = c.cmph_io_vector_adapter(cvector, len);
pub const Error = Allocator.Error || error{Overflow};
pub fn pack(allocator: Allocator, input: [][*:0]const u8) Error![]const u8 {
var source = c.cmph_io_vector_adapter(
@ptrCast(*[*c]u8, input.ptr),
try math.cast(c_uint, input.len),
);
defer c.cmph_io_vector_adapter_destroy(source);
var config: *c.cmph_config_t = c.cmph_config_new(source) orelse
return error.OutOfMemory;
@@ -29,6 +33,30 @@ pub fn pack(allocator: Allocator, input: [][*:0]const u8) packErr![]const u8 {
return buf[4..];
}
// Perfect-hashes a list of numbers and returns the packed mphf.
// pack takes 0-terminated keys, so each number is first encoded with
// unzeroZ into a 6-byte, 0-terminated buffer. Both temporary buffers are
// released on exit; the result is whatever pack(allocator, ...) returns.
pub fn pack_u32(allocator: Allocator, numbers: []const u32) Error![]const u8 {
    const encoded = try allocator.alloc([6]u8, numbers.len);
    defer allocator.free(encoded);
    const key_ptrs = try allocator.alloc([*:0]const u8, numbers.len);
    defer allocator.free(key_ptrs);

    for (numbers) |number, idx| {
        encoded[idx] = unzeroZ(number);
        // Point at the encoded bytes stored in `encoded`, which outlives
        // the pack call below.
        key_ptrs[idx] = @ptrCast([*:0]const u8, &encoded[idx]);
    }
    return pack(allocator, key_ptrs);
}
// Perfect-hashes a list of strings and returns the packed mphf.
// pack takes 0-terminated keys, so every string is duplicated with a
// trailing 0 into a temporary arena (backed by `allocator`) that is torn
// down when this function exits.
pub fn pack_str(allocator: Allocator, strings: []const []const u8) Error![]const u8 {
    var scratch = std.heap.ArenaAllocator.init(allocator);
    defer scratch.deinit();
    const tmp = scratch.allocator();

    const c_keys = try tmp.alloc([*:0]const u8, strings.len);
    for (strings) |string, idx| {
        c_keys[idx] = try tmp.dupeZ(u8, string);
    }
    return pack(allocator, c_keys);
}
const testing = std.testing;
const items = .{
@@ -60,7 +88,7 @@ test "basic pack/unpack" {
var used: [items_len]bool = undefined;
inline for (items) |elem| {
const hashed = try bdz.search(buf, elem);
const hashed = bdz.search(buf, elem);
used[hashed] = true;
}
@@ -68,3 +96,48 @@ test "basic pack/unpack" {
try testing.expect(item);
}
}
// Encodes a u32 into 6 bytes such that no byte except the last one is '\0'.
// The leading bytes come from bdz.unzero; the final byte is the terminator.
// Useful for cmph-packing, which accepts 0-terminated char*s.
pub fn unzeroZ(x: u32) [6]u8 {
    var out: [6]u8 = undefined;
    const encoded = bdz.unzero(x);
    std.mem.copy(u8, &out, encoded[0..]);
    out[out.len - 1] = 0;
    return out;
}
test "unzeroZ" {
    const encoded = unzeroZ(0);
    // The first five bytes must be non-zero, even when the input is 0.
    for (encoded[0..5]) |byte|
        try testing.expect(byte != 0);
    // The sixth byte is the 0 terminator.
    try testing.expect(encoded[5] == 0);
}
test "pack u32" {
    const numbers = &[_]u32{ 42, 1, math.maxInt(u32), 2 };
    const mphf = try pack_u32(testing.allocator, numbers);
    defer testing.allocator.free(mphf);

    // Look up every number in the packed mphf.
    var found: [numbers.len]u32 = undefined;
    for (numbers) |number, idx|
        found[idx] = bdz.search_u32(mphf, number);

    // Once sorted, the hashes must be exactly 0, 1, ..., numbers.len - 1.
    sort.sort(u32, &found, {}, comptime sort.asc(u32));
    for (found) |hash, idx|
        try testing.expectEqual(idx, hash);
}
test "pack str" {
    const words = &[_][]const u8{ "foo", "bar", "baz", "1", "2", "3" };
    const mphf = try pack_str(testing.allocator, words[0..]);
    defer testing.allocator.free(mphf);

    // Look up every string in the packed mphf.
    var found: [words.len]u32 = undefined;
    for (words) |word, idx|
        found[idx] = bdz.search(mphf, word);

    // Once sorted, the hashes must be exactly 0, 1, ..., words.len - 1.
    sort.sort(u32, &found, {}, comptime sort.asc(u32));
    for (found) |hash, idx|
        try testing.expectEqual(idx, hash);
}