turbonss/lib/shell.zig
Motiejus Jakštys 347f0a1392 Write the file
Also move DB and Corpus to their own files while doing that.
2022-03-24 12:36:08 +02:00

198 lines
7.0 KiB
Zig

const std = @import("std");
const Allocator = std.mem.Allocator;
const PriorityDequeue = std.PriorityDequeue;
const StringHashMap = std.StringHashMap;
const BoundedArray = std.BoundedArray;
const assert = std.debug.assert;
// max_shells is the upper bound on the number of shells kept in a shell
// section; shell indices are stored as u8.
pub const max_shells = 255;
// max_shell_len is the per-shell byte budget used to size the shell blob.
pub const max_shell_len = 256;
// ShellReader is a read-only view over the "Shell Index" and "Shell Blob"
// sections. It borrows both slices and allocates nothing.
pub const ShellReader = struct {
    // index[i] is the offset in blob where the i'th shell starts;
    // index[i+1] is the offset where it ends.
    index: []const u16,
    // blob holds the shell strings back to back.
    blob: []const u8,

    // init reinterprets the raw index bytes as u16 offsets and pairs them
    // with the blob.
    pub fn init(index: []align(2) const u8, blob: []const u8) ShellReader {
        const offsets = std.mem.bytesAsSlice(u16, index);
        return .{
            .index = offsets,
            .blob = blob,
        };
    }

    // get returns the shell stored at position idx, as a slice into blob.
    pub fn get(self: *const ShellReader, idx: u8) []const u8 {
        const start = self.index[idx];
        const end = self.index[idx + 1];
        return self.blob[start..end];
    }
};
// ShellWriter is a shell popularity contest: collect shells and return the
// popular ones, sorted by score. score := len(shell) * number_of_shells.
pub const ShellWriter = struct {
    // counts maps a shell to the number of times it was put(). Keys are
    // copies owned by the ShellWriter and allocated with `allocator`.
    counts: std.StringHashMap(u32),
    allocator: Allocator,

    // KV is a (shell, score) pair ordered by score in the priority dequeue.
    const KV = struct {
        shell: []const u8,
        score: u64,
    };

    pub const ShellSections = struct {
        // len is the number of shells in this section.
        len: u8,
        // index maps the i'th shell to its offset in blob. When len > 0 it
        // holds `len+1` records: the last record points to the end of the
        // blob, so the length of the last shell can be calculated from the
        // index array. Hence the capacity of max_shells + 1.
        index: BoundedArray(u16, max_shells + 1),
        // blob contains the shells themselves, stored back to back.
        blob: BoundedArray(u8, (max_shells + 1) * max_shell_len),
        // shell2idx helps translate a shell (string) to its index. Keys are
        // copies owned by this struct and released in deinit.
        shell2idx: StringHashMap(u8),

        // initializes and populates shell sections. All strings are copied:
        // the caller may free `shells` as soon as init returns. The result
        // must be released with deinit.
        pub fn init(
            allocator: Allocator,
            shells: BoundedArray([]const u8, max_shells),
        ) error{OutOfMemory}!ShellSections {
            assert(shells.len <= max_shells);
            var self = ShellSections{
                .len = @intCast(u8, shells.len),
                .index = BoundedArray(u16, max_shells + 1).init(shells.len) catch unreachable,
                .blob = BoundedArray(u8, (max_shells + 1) * max_shell_len).init(0) catch unreachable,
                .shell2idx = StringHashMap(u8).init(allocator),
            };
            if (shells.len == 0) return self;

            // Frees the map and every key copied so far.
            errdefer self.deinit();
            for (shells.constSlice()) |shell, idx| {
                const idx8 = @intCast(u8, idx);
                const offset = @intCast(u16, self.blob.len);
                self.blob.appendSliceAssumeCapacity(shell);
                self.index.set(idx8, offset);

                // The key must not point into self.blob: `self` is returned
                // by value, so such a pointer would dangle after return.
                const res = try self.shell2idx.getOrPut(shell);
                if (!res.found_existing) {
                    res.key_ptr.* = allocator.dupe(u8, shell) catch |err| {
                        // The new entry still borrows `shell`; drop it so
                        // deinit only ever frees keys we own.
                        _ = self.shell2idx.remove(shell);
                        return err;
                    };
                }
                res.value_ptr.* = idx8;
            }
            // End-of-blob sentinel. Offsets are u16: the blob routinely
            // grows beyond 255 bytes.
            self.index.appendAssumeCapacity(@intCast(u16, self.blob.len));
            return self;
        }

        // section_index returns the index as raw bytes.
        pub fn section_index(self: *const ShellSections) []align(2) const u8 {
            return std.mem.sliceAsBytes(self.index.constSlice());
        }

        // section_blob returns the concatenated shells.
        pub fn section_blob(self: *const ShellSections) []const u8 {
            return self.blob.constSlice();
        }

        // deinit frees the shell2idx map together with its keys.
        pub fn deinit(self: *ShellSections) void {
            const allocator = self.shell2idx.allocator;
            var it = self.shell2idx.keyIterator();
            while (it.next()) |key_ptr|
                allocator.free(key_ptr.*);
            self.shell2idx.deinit();
            self.* = undefined;
        }

        // getIndex returns the index of the given shell, or null if the
        // shell is not part of this section.
        pub fn getIndex(self: *const ShellSections, shell: []const u8) ?u8 {
            return self.shell2idx.get(shell);
        }
    };

    // init returns an empty popularity contest backed by allocator.
    pub fn init(allocator: Allocator) ShellWriter {
        return ShellWriter{
            .counts = std.StringHashMap(u32).init(allocator),
            .allocator = allocator,
        };
    }

    // deinit frees all shell copies and the counts map.
    pub fn deinit(self: *ShellWriter) void {
        var it = self.counts.keyIterator();
        while (it.next()) |key_ptr|
            self.counts.allocator.free(key_ptr.*);
        self.counts.deinit();
        self.* = undefined;
    }

    // put registers one occurrence of shell. The string is copied the first
    // time it is seen, so the caller keeps ownership of `shell`.
    pub fn put(self: *ShellWriter, shell: []const u8) !void {
        const res = try self.counts.getOrPut(shell);
        if (res.found_existing) {
            res.value_ptr.* += 1;
            return;
        }
        // The fresh entry borrows the caller's `shell`; replace the key with
        // an owned copy, or remove the entry if the copy cannot be made.
        res.key_ptr.* = self.allocator.dupe(u8, shell) catch |err| {
            _ = self.counts.remove(shell);
            return err;
        };
        res.value_ptr.* = 1;
    }

    // cmpShells orders two candidates by score.
    fn cmpShells(_: void, a: KV, b: KV) std.math.Order {
        return std.math.order(a.score, b.score);
    }

    // toOwnedSections returns the analyzed ShellSections holding at most
    // `limit` top-scoring shells. Shells seen only once, and shells longer
    // than max_shell_len (they could overflow the fixed-size blob), are left
    // out. Resets the shell popularity contest. ShellSections memory is
    // allocated by the ShellWriter allocator, and must be deinit'ed by the
    // caller.
    pub fn toOwnedSections(self: *ShellWriter, limit: u10) error{OutOfMemory}!ShellSections {
        assert(limit <= max_shells);
        // Keep a copy of the allocator: deinit() below sets self.* to
        // undefined, after which self.allocator must not be read.
        const allocator = self.allocator;

        var deque = PriorityDequeue(KV, void, cmpShells).init(allocator, {});
        defer deque.deinit();

        var it = self.counts.iterator();
        while (it.next()) |entry| {
            const shell = entry.key_ptr.*;
            const count = entry.value_ptr.*;
            if (count == 1)
                continue;
            if (shell.len > max_shell_len)
                continue;
            const score = @as(u64, shell.len) * count;
            try deque.add(KV{ .shell = shell, .score = score });
        }

        const total = std.math.min(deque.count(), limit);
        var top_shells = BoundedArray([]const u8, max_shells).init(total) catch |err| switch (err) {
            error.Overflow => unreachable,
        };
        var i: u32 = 0;
        while (i < total) : (i += 1)
            top_shells.set(i, deque.removeMax().shell);

        // top_shells borrows keys of self.counts; ShellSections.init copies
        // them before they are freed by deinit() below. The contest is
        // reset regardless of whether init succeeded.
        const result = ShellSections.init(allocator, top_shells);
        self.deinit();
        self.* = init(allocator);
        return result;
    }
};
const testing = std.testing;
// Feeds a handful of shells through the popularity contest, then checks the
// resulting sections both directly and through ShellReader.
test "basic shellpopcon" {
    var popcon = ShellWriter.init(testing.allocator);
    // toOwnedSections resets popcon on completion, but it can also return
    // early with an error; release the writer in either case.
    defer popcon.deinit();

    const bash = "/bin/bash"; // 9 chars
    const zsh = "/bin/zsh"; // 8 chars
    const long = "/bin/very-long-shell-name-ought-to-be-first"; // 43 chars
    const nobody = "/bin/nobody"; // only 1 instance, ought to ignore
    const input = [_][]const u8{
        zsh, zsh, zsh, zsh, // zsh score 8*4=32
        bash, bash, bash, nobody, // bash score 3*9=27
        long, long, // long score 2*43=86
    };
    for (input) |shell| {
        try popcon.put(shell);
    }

    var sections = try popcon.toOwnedSections(max_shells);
    defer sections.deinit();

    // 3 qualifying shells (all but "nobody") plus the end-of-blob record.
    try testing.expectEqual(@as(usize, 4), sections.index.len);
    try testing.expectEqual(@as(?u8, 0), sections.getIndex(long));
    try testing.expectEqual(@as(?u8, 1), sections.getIndex(zsh));
    try testing.expectEqual(@as(?u8, 2), sections.getIndex(bash));
    try testing.expectEqual(@as(?u8, null), sections.getIndex(nobody));
    try testing.expectEqual(bash.len + zsh.len + long.len, sections.section_blob().len);

    const shellReader = ShellReader.init(
        sections.section_index(),
        sections.section_blob(),
    );
    try testing.expectEqualStrings(long, shellReader.get(0));
    try testing.expectEqualStrings(zsh, shellReader.get(1));
    try testing.expectEqualStrings(bash, shellReader.get(2));
    try testing.expectEqual(@as(usize, 4), shellReader.index.len);
}