2022-02-23 15:25:55 +02:00
|
|
|
const std = @import("std");
|
|
|
|
const Allocator = std.mem.Allocator;
|
2022-03-02 11:05:20 +02:00
|
|
|
const math = std.math;
|
|
|
|
const sort = std.sort;
|
|
|
|
|
2022-02-23 15:25:55 +02:00
|
|
|
const bdz = @import("bdz.zig");
|
|
|
|
|
|
|
|
const c = @cImport({
|
|
|
|
@cInclude("cmph.h");
|
|
|
|
});
|
|
|
|
|
2022-02-23 20:30:23 +02:00
|
|
|
/// Errors returned by the packing functions in this file: allocation failure
/// (also reported when cmph fails to create a config or a hash), or a key
/// count that does not fit into a `c_uint`.
pub const Error = Allocator.Error || error{Overflow};

/// pack packs cmph hashes for the given input and returns a slice ("cmph pack
/// minus first 4 bytes") for further storage.
///
/// `input` is a list of 0-terminated keys. The returned slice is a standalone
/// allocation made with `allocator`; the caller owns it and must release it
/// with `allocator.free`.
///
/// Returns `error.Overflow` if `input.len` does not fit into a `c_uint`, and
/// `error.OutOfMemory` if cmph returns null for the config or the hash, or if
/// `allocator` fails.
pub fn pack(allocator: Allocator, input: [][*:0]const u8) Error![]const u8 {
    var source = c.cmph_io_vector_adapter(
        @ptrCast(*[*c]u8, input.ptr),
        try math.cast(c_uint, input.len),
    );
    defer c.cmph_io_vector_adapter_destroy(source);

    var config: *c.cmph_config_t = c.cmph_config_new(source) orelse
        return error.OutOfMemory;
    c.cmph_config_set_algo(config, c.CMPH_BDZ);
    c.cmph_config_set_b(config, 7);

    // The config is released right after cmph_new regardless of its outcome,
    // so that a failed cmph_new does not leak it.
    const maybe_hash = c.cmph_new(config);
    c.cmph_config_destroy(config);
    const hash: *c.cmph_t = maybe_hash orelse return error.OutOfMemory;
    // Released on every exit path, including allocation failures below.
    defer c.cmph_destroy(hash);

    // Scratch buffer for the complete cmph pack.
    const size = c.cmph_packed_size(hash);
    const buf = try allocator.alloc(u8, size);
    defer allocator.free(buf);
    c.cmph_pack(hash, &buf[0]);

    // Hand out a fresh allocation holding everything but the first 4 bytes.
    // Returning `buf[4..]` directly would give the caller a slice that does
    // not start at the beginning of an allocation and thus cannot be freed.
    return try allocator.dupe(u8, buf[4..]);
}
|
|
|
|
|
2022-03-02 11:05:20 +02:00
|
|
|
/// Perfect-hashes a list of numbers and returns the packed mphf.
///
/// Every number is turned into a 0-terminated 6-byte key with `unzeroZ`, and
/// the resulting keys are passed to `pack`. The temporary key storage is
/// released before returning; the result is whatever `pack` returns.
pub fn packU32(allocator: Allocator, numbers: []const u32) Error![]const u8 {
    // Backing storage for the encoded keys.
    const encoded = try allocator.alloc([6]u8, numbers.len);
    defer allocator.free(encoded);

    // Pointers into `encoded`, in the shape `pack` expects.
    const pointers = try allocator.alloc([*:0]const u8, numbers.len);
    defer allocator.free(pointers);

    for (numbers) |number, idx| {
        encoded[idx] = unzeroZ(number);
        pointers[idx] = @ptrCast([*:0]const u8, &encoded[idx]);
    }

    return pack(allocator, pointers);
}
|
|
|
|
|
|
|
|
/// Perfect-hashes a list of strings and returns the packed mphf.
///
/// Each string is copied with a terminating 0 into a temporary arena (backed
/// by `allocator`) so it can be passed to `pack`; the arena is released
/// before returning. The result is whatever `pack` returns.
pub fn packStr(allocator: Allocator, strings: []const []const u8) Error![]const u8 {
    var arena = std.heap.ArenaAllocator.init(allocator);
    defer arena.deinit();
    const scratch = arena.allocator();

    const terminated = try scratch.alloc([*:0]const u8, strings.len);
    for (strings) |string, idx| {
        terminated[idx] = try scratch.dupeZ(u8, string);
    }

    return pack(allocator, terminated);
}
|
|
|
|
|
2022-02-23 15:25:55 +02:00
|
|
|
const testing = std.testing;
|
|
|
|
|
|
|
|
// Sample keys used by samplePack and the "basic pack/unpack" test:
// ten distinct strings of ten characters each.
const items = .{
    "aaaaaaaaaa", "bbbbbbbbbb", "cccccccccc", "dddddddddd", "eeeeeeeeee",
    "ffffffffff", "gggggggggg", "hhhhhhhhhh", "iiiiiiiiii", "jjjjjjjjjj",
};

// Number of sample keys, usable as a comptime array length.
const items_len = items.len;
|
|
|
|
|
|
|
|
// Packs the sample `items` with `pack` and returns the result.
// `pack` takes a mutable slice of keys, so the sample keys are first copied
// into a temporary heap slice, which is released before returning.
fn samplePack(allocator: Allocator) ![]const u8 {
    const sample_keys = try allocator.dupe([*:0]const u8, &items);
    defer allocator.free(sample_keys);

    return pack(allocator, sample_keys);
}
|
|
|
|
|
|
|
|
// Packs the sample items and checks that looking every item up in the packed
// mphf covers each of the `items_len` slots.
test "basic pack/unpack" {
    const buf = try samplePack(testing.allocator);
    defer testing.allocator.free(buf);
    try testing.expect(buf.len < 100);

    // Must start out all-false: every entry is read below, and an `undefined`
    // entry that no key maps to would otherwise be read uninitialized.
    var used = [_]bool{false} ** items_len;

    inline for (items) |elem| {
        const hashed = bdz.search(buf, elem);
        // Fail the test cleanly instead of indexing out of bounds.
        try testing.expect(hashed < items_len);
        used[hashed] = true;
    }

    for (used) |item| {
        try testing.expect(item);
    }
}
|
2022-03-02 11:05:20 +02:00
|
|
|
|
|
|
|
/// Encodes a u32 to 6 bytes so that no byte except the last one is a '\0'.
/// This is useful for cmph-packing, where it accepts 0-terminated char*s.
///
/// The leading bytes are whatever `bdz.unzero(x)` produces (this function
/// relies on `bdz.unzero` not emitting zero bytes); the final byte is always
/// set to 0.
pub fn unzeroZ(x: u32) [6]u8 {
    const encoded = bdz.unzero(x);

    var out: [6]u8 = undefined;
    std.mem.copy(u8, out[0..], encoded[0..]);
    out[5] = 0; // terminator
    return out;
}
|
|
|
|
|
|
|
|
// Encoding 0 must yield five non-zero bytes followed by a 0 terminator.
test "unzeroZ" {
    const encoded = unzeroZ(0);

    for (encoded[0..5]) |byte| {
        try testing.expect(byte != 0);
    }
    try testing.expect(encoded[5] == 0);
}
|
|
|
|
|
|
|
|
// Packs a few numbers and checks that, once sorted, their hashes are exactly
// 0, 1, ..., numbers.len - 1.
test "pack u32" {
    const numbers = &[_]u32{ 42, 1, math.maxInt(u32), 2 };

    const mphf = try packU32(testing.allocator, numbers);
    defer testing.allocator.free(mphf);

    var slots: [numbers.len]u32 = undefined;
    for (numbers) |number, idx| {
        slots[idx] = bdz.search_u32(mphf, number);
    }

    sort.sort(u32, slots[0..], {}, comptime sort.asc(u32));

    for (slots) |slot, expected| {
        try testing.expectEqual(expected, slot);
    }
}
|
|
|
|
|
|
|
|
// Packs a few strings and checks that, once sorted, their hashes are exactly
// 0, 1, ..., words.len - 1.
test "pack str" {
    const words = &[_][]const u8{ "foo", "bar", "baz", "1", "2", "3" };

    const mphf = try packStr(testing.allocator, words[0..]);
    defer testing.allocator.free(mphf);

    var slots: [words.len]u32 = undefined;
    for (words) |word, idx| {
        slots[idx] = bdz.search(mphf, word);
    }

    sort.sort(u32, slots[0..], {}, comptime sort.asc(u32));

    for (slots) |slot, expected| {
        try testing.expectEqual(expected, slot);
    }
}
|