const std = @import("std");
const Allocator = std.mem.Allocator;
const PriorityDequeue = std.PriorityDequeue;
const StringArrayHashMap = std.StringArrayHashMap;
const StringHashMap = std.StringHashMap;
const BoundedArray = std.BoundedArray;
const StringContext = std.hash_map.StringContext;

// MaxShells is the maximum number of "popular" shells.
pub const MaxShells = 63;
// MaxShellLen is the maximum length of a single shell, in bytes.
pub const MaxShellLen = 64;

// ShellReader interprets "Shell Index" and "Shell Blob" sections.
pub const ShellReader = struct {
    sectionIndex: []const ShellIndex,
    sectionBlob: []const u8,

    // init reinterprets `index` as a slice of ShellIndex records and keeps
    // `blob` as-is. Nothing is copied: both slices are borrowed.
    pub fn init(index: []const u8, blob: []const u8) ShellReader {
        return ShellReader{
            .sectionIndex = std.mem.bytesAsSlice(ShellIndex, index),
            .sectionBlob = blob,
        };
    }

    // get returns a shell at the given index.
    pub fn get(self: *const ShellReader, idx: u10) []const u8 {
        const shellIndex = self.sectionIndex[idx];
        // Widen before shifting: `<<` on a u10 silently drops the high bits,
        // which corrupts every byte offset >= 1024.
        const start = @as(usize, shellIndex.offset) << 2;
        // The stored length is the real length minus one.
        const end = start + shellIndex.len + 1;
        return self.sectionBlob[start..end];
    }
};

// ShellWriter is a shell popularity contest: collect shells and return the
// popular ones, sorted by score. score := len(shell) * number_of_shells.
pub const ShellWriter = struct {
    // counts maps an owned copy of each shell to its number of occurrences.
    counts: std.StringHashMap(u32),
    allocator: Allocator,

    const KV = struct {
        shell: []const u8,
        score: u64,
    };

    const ShellSections = struct {
        // index holds one ShellIndex record per shell, in rank order.
        index: BoundedArray(ShellIndex, MaxShells),
        // blob holds the shell strings, each zero-padded to 4 bytes.
        blob: BoundedArray(u8, MaxShells * MaxShellLen),
        // indices translates a shell to its position in `index`. Keys are
        // heap copies owned by this struct and released by deinit.
        indices: StringHashMap(u6),

        // initializes and populates shell sections. All strings are copied,
        // none of the input slices is retained. Returns
        // error.InvalidShellLength for a shell that is empty or longer than
        // MaxShellLen. On error nothing is leaked.
        pub fn init(
            allocator: Allocator,
            shells: BoundedArray([]const u8, MaxShells),
        ) !ShellSections {
            var self = ShellSections{
                .index = try BoundedArray(ShellIndex, MaxShells).init(shells.len),
                .blob = try BoundedArray(u8, MaxShells * MaxShellLen).init(0),
                .indices = StringHashMap(u6).init(allocator),
            };
            errdefer self.deinit();

            var fullOffset: u12 = 0;
            var idx: u6 = 0;
            while (idx < shells.len) {
                const shell = shells.get(idx);
                if (shell.len == 0 or shell.len > MaxShellLen) {
                    return error.InvalidShellLength;
                }
                // u7, not u6: a shell of exactly MaxShellLen (64) bytes is
                // valid, only `len - 1` has to fit into 6 bits.
                const len = @intCast(u7, shell.len);

                try self.blob.appendSlice(shell);
                self.index.set(idx, ShellIndex{
                    .offset = @intCast(u10, fullOffset >> 2),
                    .len = @intCast(u6, len - 1),
                });

                fullOffset += len;
                const padding = roundUp4Padding(fullOffset);
                fullOffset += padding;
                try self.blob.appendNTimes(0, padding);

                // The key must not point into `self.blob`: `self` is returned
                // by value, so such a slice would dangle after the copy.
                if (self.indices.getPtr(shell)) |existing| {
                    existing.* = idx;
                } else {
                    const key = try allocator.dupe(u8, shell);
                    errdefer allocator.free(key);
                    try self.indices.put(key, idx);
                }

                idx += 1;
            }
            return self;
        }

        // sectionIndex returns the index records as raw bytes.
        pub fn sectionIndex(self: *const ShellSections) []const u8 {
            return std.mem.sliceAsBytes(self.index.constSlice());
        }

        // sectionBlob returns the padded shell strings.
        pub fn sectionBlob(self: *const ShellSections) []const u8 {
            return self.blob.constSlice();
        }

        // deinit releases the lookup table together with its keys.
        pub fn deinit(self: *ShellSections) void {
            var it = self.indices.keyIterator();
            while (it.next()) |key_ptr| {
                self.indices.allocator.free(key_ptr.*);
            }
            self.indices.deinit();
            self.* = undefined;
        }

        // getIndex returns the position of the shell, or null if the shell
        // is not in the sections.
        pub fn getIndex(self: *const ShellSections, shell: []const u8) ?u6 {
            return self.indices.get(shell);
        }
    };

    pub fn init(allocator: Allocator) ShellWriter {
        return ShellWriter{
            .counts = std.StringHashMap(u32).init(allocator),
            .allocator = allocator,
        };
    }

    // deinit frees the owned shell copies and the counting table.
    pub fn deinit(self: *ShellWriter) void {
        var it = self.counts.keyIterator();
        while (it.next()) |key_ptr| {
            self.counts.allocator.free(key_ptr.*);
        }
        self.counts.deinit();
        self.* = undefined;
    }

    // put records one occurrence of the shell. The shell is copied; the
    // argument does not need to outlive the call. If an allocation fails,
    // the table is left exactly as it was before the call.
    pub fn put(self: *ShellWriter, shell: []const u8) !void {
        if (self.counts.getPtr(shell)) |count| {
            count.* += 1;
            return;
        }
        // Copy the key before touching the table, so that a failed
        // allocation cannot leave an entry with an undefined key behind.
        const ourShell = try self.allocator.dupe(u8, shell);
        errdefer self.allocator.free(ourShell);
        try self.counts.putNoClobber(ourShell, 1);
    }

    // cmpShells orders candidates by score only.
    fn cmpShells(context: void, a: KV, b: KV) std.math.Order {
        _ = context;
        return std.math.order(a.score, b.score);
    }

    // toOwnedSections returns the analyzed ShellSections. Resets the shell
    // popularity contest. ShellSections memory is allocated by the ShellWriter
    // allocator, and must be deInit'ed by the caller.
    // Shells seen only once are ignored. At most `limit` shells are returned;
    // more than MaxShells qualifying shells within `limit` is an error.
    pub fn toOwnedSections(self: *ShellWriter, limit: u10) !ShellSections {
        var deque = PriorityDequeue(KV, void, cmpShells).init(self.allocator, {});
        defer deque.deinit();

        var it = self.counts.iterator();
        while (it.next()) |entry| {
            if (entry.value_ptr.* == 1) {
                continue;
            }
            const score = entry.key_ptr.*.len * entry.value_ptr.*;
            try deque.add(KV{ .shell = entry.key_ptr.*, .score = score });
        }

        const total = std.math.min(deque.count(), limit);
        var topShells = try BoundedArray([]const u8, MaxShells).init(total);

        var i: u32 = 0;
        while (i < total) {
            const elem = deque.removeMax().shell;
            topShells.set(i, elem);
            i += 1;
        }

        // topShells borrows keys from self.counts, so the sections have to
        // be built before the table is torn down below.
        const result = ShellSections.init(self.allocator, topShells);
        const allocator = self.allocator;
        self.deinit();
        self.* = init(allocator);
        return result;
    }
};

// rounds up a u12 to the nearest multiple of 4 and returns the difference
// (padding)
inline fn roundUp4Padding(n: u12) u12 {
    return roundUp4(n) - n;
}

// rounds up a u12 to the nearest multiple of 4.
inline fn roundUp4(n: u12) u12 {
    return ((n + 3) & ~@as(u12, 3));
}

// ShellIndex is an index to the shell strings. As shell can be up to 64 bytes
// (1<<6), maximum number of shells is 63 (1<<6-1), the maximum location offset
// is 1<<12. To make location resolvable in 10 bits, all shells will be padded
// to 4 bytes.
// The actual shell length is len+1: we don't allow empty shells, and the real
// length of the shell is 1-64 bytes.
const ShellIndex = packed struct {
    // byte offset of the shell in the blob, divided by 4.
    offset: u10,
    // shell length minus one.
    len: u6,
};

const testing = std.testing;

test "basic shellpopcon" {
    var popcon = ShellWriter.init(testing.allocator);
    defer popcon.deinit();

    const bash = "/bin/bash"; // 9 chars
    const zsh = "/bin/zsh"; // 8 chars
    const long = "/bin/very-long-shell-name-ought-to-be-first"; // 43 chars
    const nobody = "/bin/nobody"; // only 1 instance, ought to ignore

    // Each shell is fed to the contest `times` times, in this order.
    const Tally = struct { shell: []const u8, times: usize };
    const tallies = [_]Tally{
        .{ .shell = zsh, .times = 4 }, // zsh score 8*4=32
        .{ .shell = bash, .times = 3 }, // bash score 3*9=27
        .{ .shell = nobody, .times = 1 }, // single occurrence, not scored
        .{ .shell = long, .times = 2 }, // long score 2*43=86
    };
    for (tallies) |tally| {
        var n: usize = 0;
        while (n < tally.times) : (n += 1) {
            try popcon.put(tally.shell);
        }
    }

    var sections = try popcon.toOwnedSections(MaxShells);
    defer sections.deinit();

    // Expected ranking, best score first; "nobody" does not qualify.
    const ranked = [_][]const u8{ long, zsh, bash };

    try testing.expectEqual(sections.index.len, 3);
    var rank: usize = 0;
    while (rank < ranked.len) : (rank += 1) {
        try testing.expectEqual(sections.getIndex(ranked[rank]).?, @intCast(u6, rank));
    }
    try testing.expectEqual(sections.getIndex(nobody), null);

    const wantBlobLen = roundUp4(bash.len) + roundUp4(zsh.len) + roundUp4(long.len);
    try testing.expectEqual(sections.sectionBlob().len, wantBlobLen);

    const shellReader = ShellReader.init(
        sections.sectionIndex(),
        sections.sectionBlob(),
    );
    rank = 0;
    while (rank < ranked.len) : (rank += 1) {
        try testing.expectEqualStrings(shellReader.get(@intCast(u10, rank)), ranked[rank]);
    }
    try testing.expectEqual(shellReader.sectionIndex.len, 3);
}

test "padding" {
    // n is the input, want is the number of bytes missing up to the next
    // multiple of 4.
    const Case = struct { n: u12, want: u12 };
    const cases = [_]Case{
        .{ .n = 0, .want = 0 },
        .{ .n = 1, .want = 3 },
        .{ .n = 2, .want = 2 },
        .{ .n = 3, .want = 1 },
        .{ .n = 4, .want = 0 },
        .{ .n = 40, .want = 0 },
        .{ .n = 41, .want = 3 },
        .{ .n = 42, .want = 2 },
        .{ .n = 43, .want = 1 },
        .{ .n = 44, .want = 0 },
        .{ .n = 4091, .want = 1 },
        .{ .n = 4092, .want = 0 },
    };
    for (cases) |case| {
        try testing.expectEqual(roundUp4Padding(case.n), case.want);
    }
}