1
Fork 0
turbonss/src/cmph.zig

147 lines
4.9 KiB
Zig
Raw Normal View History

2022-02-23 15:25:55 +02:00
const std = @import("std");
const Allocator = std.mem.Allocator;
2022-03-02 11:05:20 +02:00
const math = std.math;
const sort = std.sort;
const assert = std.debug.assert;
2022-03-02 11:05:20 +02:00
2022-02-23 15:25:55 +02:00
const bdz = @import("bdz.zig");
// CMPH_BDZ is the algorithm identifier pulled in from the C header
// cmph_types.h; pack() hands it to cmph_config_set_algo.
const CMPH_BDZ = @cImport({
@cInclude("cmph_types.h");
}).CMPH_BDZ;
// Hand-written declarations for the C cmph functions used in this file. The
// C-side handles (key source, config) are typed as plain [*]u8 here rather
// than as imported C structs.
// NOTE(review): cmph_io_vector_adapter is declared with a non-optional return
// type; confirm the C function cannot return NULL, otherwise this should be
// ?[*]u8 like cmph_config_new.
extern fn cmph_io_vector_adapter(vector: [*]const [*:0]const u8, len: c_uint) [*]u8;
extern fn cmph_io_vector_adapter_destroy(key_source: [*]u8) void;
// Declared as optional so that a null result can be handled by the caller.
extern fn cmph_config_new(key_source: [*]const u8) ?[*]u8;
extern fn cmph_config_set_algo(mph: [*]u8, algo: c_int) void;
extern fn cmph_config_set_b(mph: [*]u8, b: c_int) void;
2022-03-20 09:46:17 +02:00
// C cmph declarations for building, sizing, packing and destroying the hash
// function ("mphf"). As with the other handles in this file, the mphf is
// typed as a plain [*]u8.
// cmph_new is declared as optional so that a null result can be handled by
// the caller.
extern fn cmph_new(config: [*]const u8) ?[*]u8;
extern fn cmph_config_destroy(mph: [*]u8) void;
extern fn cmph_packed_size(mphf: [*]const u8) u32;
// NOTE(review): presumably writes cmph_packed_size(mphf) bytes into
// packed_mphf; pack() sizes its buffer on that assumption — confirm against
// the cmph documentation.
extern fn cmph_pack(mphf: [*]const u8, packed_mphf: [*]u8) void;
extern fn cmph_destroy(mphf: [*]u8) void;
2022-02-23 15:25:55 +02:00
2022-02-23 20:30:23 +02:00
// pack builds a perfect hash function over the given zero-terminated keys
// using cmph's BDZ algorithm and returns it in cmph's packed form (the bytes
// written by cmph_pack). The returned slice is allocated with `allocator` and
// must be freed by the caller.
//
// Returns error.OutOfMemory when cmph cannot create its config or the hash
// function, or when the output buffer cannot be allocated. All cmph-side
// resources are released on every exit path.
pub fn pack(allocator: Allocator, input: [][*:0]const u8) error{OutOfMemory}![]const u8 {
    const input_len = @intCast(c_uint, input.len);
    const source = cmph_io_vector_adapter(input.ptr, input_len);
    defer cmph_io_vector_adapter_destroy(source);

    const config = cmph_config_new(source) orelse return error.OutOfMemory;
    cmph_config_set_algo(config, CMPH_BDZ);
    cmph_config_set_b(config, 7);

    // Release the config right after cmph_new regardless of its outcome, so
    // that a failed cmph_new does not leak the config.
    const maybe_mph = cmph_new(config);
    cmph_config_destroy(config);
    const mph = maybe_mph orelse return error.OutOfMemory;
    // Runs on success and on a failed allocation below alike.
    defer cmph_destroy(mph);

    const buf = try allocator.alloc(u8, cmph_packed_size(mph));
    cmph_pack(mph, buf.ptr);
    return buf;
}
2022-03-02 11:05:20 +02:00
// packU32 perfect-hashes a list of u32 numbers and returns the packed mphf.
// Each number is first turned into a zero-terminated 6-byte key via unzeroZ,
// because pack() takes zero-terminated strings. The caller frees the result.
pub fn packU32(allocator: Allocator, numbers: []const u32) error{OutOfMemory}![]const u8 {
    // Backing storage for the encoded keys; c_strings only points into it.
    const encoded = try allocator.alloc([6]u8, numbers.len);
    defer allocator.free(encoded);
    const c_strings = try allocator.alloc([*:0]const u8, numbers.len);
    defer allocator.free(c_strings);

    for (numbers) |number, idx| {
        encoded[idx] = unzeroZ(number);
        c_strings[idx] = @ptrCast([*:0]const u8, &encoded[idx]);
    }
    return pack(allocator, c_strings);
}
// packStr perfect-hashes a list of strings and returns the packed mphf.
// Zero-terminated copies of the strings live in a temporary arena that is
// torn down before returning; only the result is allocated from `allocator`
// directly, and the caller frees it.
pub fn packStr(allocator: Allocator, strings: []const []const u8) error{OutOfMemory}![]const u8 {
    var arena = std.heap.ArenaAllocator.init(allocator);
    defer arena.deinit();
    const scratch = arena.allocator();

    var c_strings = try scratch.alloc([*:0]const u8, strings.len);
    for (strings) |string, idx| {
        c_strings[idx] = try scratch.dupeZ(u8, string);
    }
    return pack(allocator, c_strings);
}
2022-02-23 15:25:55 +02:00
const testing = std.testing;
// Test fixture: ten distinct string keys, kept as a comptime tuple so the
// tests can walk it with `inline for`.
const items = .{
"aaaaaaaaaa",
"bbbbbbbbbb",
"cccccccccc",
"dddddddddd",
"eeeeeeeeee",
"ffffffffff",
"gggggggggg",
"hhhhhhhhhh",
"iiiiiiiiii",
"jjjjjjjjjj",
};
// Number of fixture keys; used to size arrays in the tests.
const items_len = items.len;
// samplePack packs the fixture keys in `items` and returns the packed mphf.
// The caller frees the result with the same allocator.
fn samplePack(allocator: Allocator) ![]const u8 {
    var keys = std.ArrayList([*:0]const u8).init(allocator);
    defer keys.deinit();

    try keys.appendSlice(&items);
    return pack(allocator, keys.items);
}
2022-07-12 12:59:47 +03:00
// Packs the fixture keys and checks that bdz.search spreads them over every
// slot in 0..items_len, i.e. that no two keys share a hash.
test "cmph basic pack/unpack" {
    const buf = try samplePack(testing.allocator);
    defer testing.allocator.free(buf);
    try testing.expect(buf.len < 100);

    // Start from all-false: a slot that no key hashes to must be seen as
    // unused, not read back as uninitialized memory.
    var used = [_]bool{false} ** items_len;
    inline for (items) |elem| {
        const hashed = bdz.search(buf, elem);
        used[hashed] = true;
    }
    for (used) |item| try testing.expect(item);
}
2022-03-02 11:05:20 +02:00
// unzeroZ encodes a u32 into 6 bytes such that only the last byte is '\0'.
// cmph-packing takes 0-terminated char*s, so the bytes produced by bdz.unzero
// are copied to the front and a terminator is written at index 5.
pub fn unzeroZ(x: u32) [6]u8 {
    const encoded = bdz.unzero(x);
    var out: [6]u8 = undefined;
    std.mem.copy(u8, &out, &encoded);
    out[5] = 0;
    return out;
}
2022-07-12 12:59:47 +03:00
// Even for input 0, the first five bytes must be non-zero and the sixth must
// be the terminator.
test "cmph unzeroZ" {
    const result = unzeroZ(0);
    for (result[0..5]) |byte| {
        try testing.expect(byte != 0);
    }
    try testing.expect(result[5] == 0);
}
2022-07-12 12:59:47 +03:00
// Packs a few u32 keys and checks that their hashes, once sorted, are exactly
// 0, 1, ..., keys.len - 1.
test "cmph pack u32" {
    const keys = &[_]u32{ 42, 1, math.maxInt(u32), 2 };
    const packed_mphf = try packU32(testing.allocator, keys);
    defer testing.allocator.free(packed_mphf);

    var hashes: [keys.len]u32 = undefined;
    for (keys) |key, idx| {
        hashes[idx] = bdz.search_u32(packed_mphf, key);
    }
    sort.sort(u32, hashes[0..], {}, comptime sort.asc(u32));
    for (hashes) |hash, expected| {
        try testing.expectEqual(expected, hash);
    }
}
2022-07-12 12:59:47 +03:00
// Packs a few string keys and checks that their hashes, once sorted, are
// exactly 0, 1, ..., keys.len - 1.
test "cmph pack str" {
    const keys = &[_][]const u8{ "foo", "bar", "baz", "1", "2", "3" };
    const packed_mphf = try packStr(testing.allocator, keys[0..]);
    defer testing.allocator.free(packed_mphf);

    var hashes: [keys.len]u32 = undefined;
    for (keys) |key, idx| {
        hashes[idx] = bdz.search(packed_mphf, key);
    }
    sort.sort(u32, hashes[0..], {}, comptime sort.asc(u32));
    for (hashes) |hash, expected| {
        try testing.expectEqual(expected, hash);
    }
}