const std = @import("std");
const Allocator = std.mem.Allocator;
const math = std.math;
const sort = std.sort;
const assert = std.debug.assert;

const bdz = @import("bdz.zig");

// Algorithm identifiers handed to cmph_config_set_algo as a c_int.
// must be kept in sync with the definition in cmph_types.h
const CMPH_ALGO = enum(c_int) {
    CMPH_BMZ,
    CMPH_BMZ8,
    CMPH_CHM,
    CMPH_BRZ,
    CMPH_FCH,
    CMPH_BDZ,
    CMPH_BDZ_PH,
    CMPH_CHD_PH,
    CMPH_CHD,
    CMPH_COUNT,
};

// Externally linked cmph functions. Every handle (key source, config, mphf)
// is typed here as a raw many-item byte pointer rather than a dedicated
// type. Functions declared with an optional return (`?[*]u8`) can yield
// null, which callers must check.

// Wraps `len` zero-terminated keys in a key source; release it with
// cmph_io_vector_adapter_destroy.
extern fn cmph_io_vector_adapter(vector: [*]const [*:0]const u8, len: c_uint) [*]u8;
extern fn cmph_io_vector_adapter_destroy(key_source: [*]u8) void;
// Creates a config over a key source; release it with cmph_config_destroy.
extern fn cmph_config_new(key_source: [*]const u8) ?[*]u8;
// `algo` is the integer value of a CMPH_ALGO member.
extern fn cmph_config_set_algo(mph: [*]u8, algo: c_int) void;
extern fn cmph_config_set_b(mph: [*]u8, b: c_int) void;
// Builds the hash function from a config; release it with cmph_destroy.
extern fn cmph_new(config: [*]const u8) ?[*]u8;
extern fn cmph_config_destroy(mph: [*]u8) void;
// Number of bytes cmph_pack writes into `packed_mphf`.
extern fn cmph_packed_size(mphf: [*]const u8) u32;
extern fn cmph_pack(mphf: [*]const u8, packed_mphf: [*]u8) void;
extern fn cmph_destroy(mphf: [*]const u8) void;

// pack packs cmph hashes for the given input and returns a slice ("cmph pack
// minus first 4 bytes") for further storage. The slice must be freed by the
// caller.
// The returned slice is a standalone allocation made with `allocator`, so
// handing it to `allocator.free` is valid. A null result from cmph_config_new
// or cmph_new is reported as error.OutOfMemory. All cmph handles created here
// are released before returning, on both success and error paths.
pub fn pack(allocator: Allocator, input: [][*:0]const u8) error{OutOfMemory}![]const u8 {
    // cmph takes the key count as a C unsigned int.
    assert(input.len <= math.maxInt(c_uint));
    const input_len = @intCast(c_uint, input.len);

    const source = cmph_io_vector_adapter(input.ptr, input_len);
    defer cmph_io_vector_adapter_destroy(source);

    // The config is only needed while building the hash function. Scope it
    // to this block so it is destroyed right after cmph_new returns, whether
    // or not cmph_new succeeded.
    const mph = blk: {
        const config = cmph_config_new(source) orelse return error.OutOfMemory;
        defer cmph_config_destroy(config);
        cmph_config_set_algo(config, @enumToInt(CMPH_ALGO.CMPH_BDZ));
        cmph_config_set_b(config, 7);
        break :blk cmph_new(config) orelse return error.OutOfMemory;
    };
    // Deferred so the hash function is also released if an allocation below
    // fails.
    defer cmph_destroy(mph);

    // cmph writes the full packed form into a scratch buffer.
    const packed_size = cmph_packed_size(mph);
    const scratch = try allocator.alloc(u8, packed_size);
    defer allocator.free(scratch);
    cmph_pack(mph, scratch.ptr);

    // Return everything after the first 4 bytes as its own allocation.
    // Returning `scratch[4..]` directly would give the caller a slice that
    // does not start at the allocation base and therefore cannot be freed.
    return try allocator.dupe(u8, scratch[4..]);
}

// perfect-hash a list of numbers and return the packed mphf.
// Each number is first encoded with unzeroZ into a 6-byte zero-terminated
// key. The returned slice must be freed by the caller with `allocator`.
pub fn packU32(allocator: Allocator, numbers: []const u32) error{OutOfMemory}![]const u8 {
    // Backing storage for the encoded keys.
    const keys = try allocator.alloc([6]u8, numbers.len);
    defer allocator.free(keys);
    for (numbers) |n, i| keys[i] = unzeroZ(n);

    // pack wants an array of C-string pointers; point each one at its key.
    const key_ptrs = try allocator.alloc([*:0]const u8, numbers.len);
    defer allocator.free(key_ptrs);
    for (keys) |_, i| key_ptrs[i] = @ptrCast([*:0]const u8, &keys[i]);

    return pack(allocator, key_ptrs);
}

// perfect-hash a list of strings and return the packed mphf.
// The strings are copied into zero-terminated form in a temporary arena that
// is released before returning. The returned slice itself is allocated with
// `allocator` and must be freed by the caller.
pub fn packStr(allocator: Allocator, strings: []const []const u8) error{OutOfMemory}![]const u8 {
    var arena = std.heap.ArenaAllocator.init(allocator);
    defer arena.deinit();

    const keys = try arena.allocator().alloc([*:0]const u8, strings.len);
    for (strings) |string, i| keys[i] = try arena.allocator().dupeZ(u8, string);

    // Note: the result is allocated with `allocator`, not the arena, so it
    // outlives `arena.deinit()`.
    return pack(allocator, keys);
}

const testing = std.testing;

// Fixture keys shared by the tests below.
const items = .{
    "aaaaaaaaaa",
    "bbbbbbbbbb",
    "cccccccccc",
    "dddddddddd",
    "eeeeeeeeee",
    "ffffffffff",
    "gggggggggg",
    "hhhhhhhhhh",
    "iiiiiiiiii",
    "jjjjjjjjjj",
};
const items_len = items.len;

// Packs the fixture keys; the caller frees the result with `allocator`.
fn samplePack(allocator: Allocator) ![]const u8 {
    var vector = std.ArrayList([*:0]const u8).init(allocator);
    defer vector.deinit();
    try vector.appendSlice(&items);
    return pack(allocator, vector.items);
}

test "basic pack/unpack" {
    const buf = try samplePack(testing.allocator);
    defer testing.allocator.free(buf);
    try testing.expect(buf.len < 100);

    // Start from all-false so a slot that no key hashes to fails the check
    // below instead of being an undefined read.
    var used = [_]bool{false} ** items_len;
    inline for (items) |elem| {
        const hashed = bdz.search(buf, elem);
        used[hashed] = true;
    }
    for (used) |item| {
        try testing.expect(item);
    }
}

// encodes a u32 to 6 bytes so no bytes except the last one is a '\0'.
// This is useful for cmph-packing, where it accepts 0-terminated char*s.
// The first 5 bytes come from bdz.unzero (presumably never zero; the
// "unzeroZ" test checks this for input 0); the 6th is the terminator.
pub fn unzeroZ(x: u32) [6]u8 {
    var buf: [6]u8 = undefined;
    std.mem.copy(u8, buf[0..], bdz.unzero(x)[0..]);
    buf[5] = 0;
    return buf;
}

test "unzeroZ" {
    const result = unzeroZ(0);
    try testing.expect(result[0] != 0);
    try testing.expect(result[1] != 0);
    try testing.expect(result[2] != 0);
    try testing.expect(result[3] != 0);
    try testing.expect(result[4] != 0);
    try testing.expect(result[5] == 0);
}

test "pack u32" {
    const keys = &[_]u32{ 42, 1, math.maxInt(u32), 2 };
    const packed_mphf = try packU32(testing.allocator, keys);
    defer testing.allocator.free(packed_mphf);

    var hashes: [keys.len]u32 = undefined;
    for (keys) |key, i| {
        hashes[i] = bdz.search_u32(packed_mphf, key);
    }
    // A minimal perfect hash maps the n keys onto exactly 0..n-1.
    sort.sort(u32, hashes[0..], {}, comptime sort.asc(u32));
    for (hashes) |hash, i|
        try testing.expectEqual(i, hash);
}

test "pack str" {
    const keys = &[_][]const u8{ "foo", "bar", "baz", "1", "2", "3" };
    const packed_mphf = try packStr(testing.allocator, keys[0..]);
    defer testing.allocator.free(packed_mphf);

    var hashes: [keys.len]u32 = undefined;
    for (keys) |key, i| {
        hashes[i] = bdz.search(packed_mphf, key);
    }
    // A minimal perfect hash maps the n keys onto exactly 0..n-1.
    sort.sort(u32, hashes[0..], {}, comptime sort.asc(u32));
    for (hashes) |hash, i|
        try testing.expectEqual(i, hash);
}

test "CMPH_ALGO.CMPH_BDZ is in sync with our definition" {
    const c = @cImport({
        @cInclude("cmph_types.h");
    });
    try testing.expectEqual(c.CMPH_BDZ, @enumToInt(CMPH_ALGO.CMPH_BDZ));
}