commit ea1ce2df9b20b5c91278eb0ed99a9cd0b0949e1a (tree)
parent 3fe981e1ad746f9e3dfa2006fc69c907c92ddce6
Author: Andrew Kelley <andrew@ziglang.org>
Date: Sat, 8 Feb 2025 04:54:38 -0800
Merge pull request #22808 from ziglang/fast-gpa
introduce std.heap.SmpAllocator
Diffstat:
11 files changed, 326 insertions(+), 43 deletions(-)
diff --git a/bootstrap.c b/bootstrap.c
@@ -139,7 +139,7 @@ int main(int argc, char **argv) {
"pub const enable_tracy = false;\n"
"pub const value_tracing = false;\n"
"pub const skip_non_native = false;\n"
- "pub const force_gpa = false;\n"
+ "pub const debug_gpa = false;\n"
"pub const dev = .core;\n"
"pub const value_interpret_mode = .direct;\n"
, zig_version);
diff --git a/build.zig b/build.zig
@@ -171,7 +171,7 @@ pub fn build(b: *std.Build) !void {
const tracy_callstack = b.option(bool, "tracy-callstack", "Include callstack information with Tracy data. Does nothing if -Dtracy is not provided") orelse (tracy != null);
const tracy_allocation = b.option(bool, "tracy-allocation", "Include allocation information with Tracy data. Does nothing if -Dtracy is not provided") orelse (tracy != null);
const tracy_callstack_depth: u32 = b.option(u32, "tracy-callstack-depth", "Declare callstack depth for Tracy data. Does nothing if -Dtracy_callstack is not provided") orelse 10;
- const force_gpa = b.option(bool, "force-gpa", "Force the compiler to use GeneralPurposeAllocator") orelse false;
+ const debug_gpa = b.option(bool, "debug-allocator", "Force the compiler to use DebugAllocator") orelse false;
const link_libc = b.option(bool, "force-link-libc", "Force self-hosted compiler to link libc") orelse (enable_llvm or only_c);
const sanitize_thread = b.option(bool, "sanitize-thread", "Enable thread-sanitization") orelse false;
const strip = b.option(bool, "strip", "Omit debug information");
@@ -233,7 +233,7 @@ pub fn build(b: *std.Build) !void {
exe_options.addOption(bool, "llvm_has_csky", llvm_has_csky);
exe_options.addOption(bool, "llvm_has_arc", llvm_has_arc);
exe_options.addOption(bool, "llvm_has_xtensa", llvm_has_xtensa);
- exe_options.addOption(bool, "force_gpa", force_gpa);
+ exe_options.addOption(bool, "debug_gpa", debug_gpa);
exe_options.addOption(DevEnv, "dev", b.option(DevEnv, "dev", "Build a compiler with a reduced feature set for development of specific features") orelse if (only_c) .bootstrap else .full);
exe_options.addOption(ValueInterpretMode, "value_interpret_mode", value_interpret_mode);
@@ -608,7 +608,7 @@ fn addWasiUpdateStep(b: *std.Build, version: [:0]const u8) !void {
exe_options.addOption(u32, "mem_leak_frames", 0);
exe_options.addOption(bool, "have_llvm", false);
- exe_options.addOption(bool, "force_gpa", false);
+ exe_options.addOption(bool, "debug_gpa", false);
exe_options.addOption([:0]const u8, "version", version);
exe_options.addOption(std.SemanticVersion, "semver", semver);
exe_options.addOption(bool, "enable_debug_extensions", false);
diff --git a/lib/libc/musl/src/thread/riscv32/clone.s b/lib/libc/musl/src/thread/riscv32/clone.s
@@ -7,6 +7,8 @@
.global __clone
.type __clone, %function
__clone:
+ andi a1, a1, -16
+
# Save func and arg to stack
addi a1, a1, -16
sw a0, 0(a1)
diff --git a/lib/libc/musl/src/thread/riscv64/clone.s b/lib/libc/musl/src/thread/riscv64/clone.s
@@ -7,6 +7,8 @@
.global __clone
.type __clone, %function
__clone:
+ andi a1, a1, -16
+
# Save func and arg to stack
addi a1, a1, -16
sd a0, 0(a1)
diff --git a/lib/std/heap.zig b/lib/std/heap.zig
@@ -9,11 +9,12 @@ const Allocator = std.mem.Allocator;
const windows = std.os.windows;
pub const ArenaAllocator = @import("heap/arena_allocator.zig").ArenaAllocator;
-pub const WasmAllocator = @import("heap/WasmAllocator.zig");
+pub const SmpAllocator = @import("heap/SmpAllocator.zig");
+pub const FixedBufferAllocator = @import("heap/FixedBufferAllocator.zig");
pub const PageAllocator = @import("heap/PageAllocator.zig");
-pub const ThreadSafeAllocator = @import("heap/ThreadSafeAllocator.zig");
pub const SbrkAllocator = @import("heap/sbrk_allocator.zig").SbrkAllocator;
-pub const FixedBufferAllocator = @import("heap/FixedBufferAllocator.zig");
+pub const ThreadSafeAllocator = @import("heap/ThreadSafeAllocator.zig");
+pub const WasmAllocator = @import("heap/WasmAllocator.zig");
pub const DebugAllocatorConfig = @import("heap/debug_allocator.zig").Config;
pub const DebugAllocator = @import("heap/debug_allocator.zig").DebugAllocator;
@@ -358,6 +359,11 @@ else if (builtin.target.isWasm()) .{
.vtable = &PageAllocator.vtable,
};
+/// `Allocator` handle whose vtable is `SmpAllocator.vtable`.
+///
+/// `ptr` is left `undefined`: the `SmpAllocator` vtable functions discard
+/// their context argument and operate on that file's global state instead.
+pub const smp_allocator: Allocator = .{
+ .ptr = undefined,
+ .vtable = &SmpAllocator.vtable,
+};
+
/// This allocator is fast, small, and specific to WebAssembly. In the future,
/// this will be the implementation automatically selected by
/// `GeneralPurposeAllocator` when compiling in `ReleaseSmall` mode for wasm32
@@ -475,7 +481,7 @@ pub fn StackFallbackAllocator(comptime size: usize) type {
};
}
-test "c_allocator" {
+test c_allocator {
if (builtin.link_libc) {
try testAllocator(c_allocator);
try testAllocatorAligned(c_allocator);
@@ -484,12 +490,20 @@ test "c_allocator" {
}
}
-test "raw_c_allocator" {
+test raw_c_allocator {
if (builtin.link_libc) {
try testAllocator(raw_c_allocator);
}
}
+test smp_allocator {
+ // SmpAllocator.zig contains a comptime assertion that the build is not
+ // single-threaded, so the allocator is not exercised in that mode.
+ if (builtin.single_threaded) return;
+ try testAllocator(smp_allocator);
+ try testAllocatorAligned(smp_allocator);
+ try testAllocatorLargeAlignment(smp_allocator);
+ try testAllocatorAlignedShrink(smp_allocator);
+}
+
test PageAllocator {
const allocator = page_allocator;
try testAllocator(allocator);
@@ -978,4 +992,5 @@ test {
if (builtin.target.isWasm()) {
_ = WasmAllocator;
}
+ if (!builtin.single_threaded) _ = smp_allocator;
}
diff --git a/lib/std/heap/PageAllocator.zig b/lib/std/heap/PageAllocator.zig
@@ -16,11 +16,7 @@ pub const vtable: Allocator.VTable = .{
.free = free,
};
-fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
- _ = context;
- _ = ra;
- assert(n > 0);
-
+pub fn map(n: usize, alignment: mem.Alignment) ?[*]u8 {
const page_size = std.heap.pageSize();
if (n >= maxInt(usize) - page_size) return null;
const alignment_bytes = alignment.toByteUnits();
@@ -101,6 +97,13 @@ fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*
return result_ptr;
}
+/// `Allocator.VTable.alloc` entry point for the page allocator.
+///
+/// `context` and `ra` are unused. Asserts that `n` is non-zero and then
+/// forwards to `map`, returning whatever `map` returns.
+fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
+ _ = context;
+ _ = ra;
+ assert(n > 0);
+ return map(n, alignment);
+}
+
fn resize(
context: *anyopaque,
memory: []u8,
@@ -114,7 +117,7 @@ fn resize(
return realloc(memory, new_len, false) != null;
}
-pub fn remap(
+fn remap(
context: *anyopaque,
memory: []u8,
alignment: mem.Alignment,
@@ -127,21 +130,24 @@ pub fn remap(
return realloc(memory, new_len, true);
}
-fn free(context: *anyopaque, slice: []u8, alignment: mem.Alignment, return_address: usize) void {
+fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, return_address: usize) void {
_ = context;
_ = alignment;
_ = return_address;
+ return unmap(@alignCast(memory));
+}
+pub fn unmap(memory: []align(page_size_min) u8) void {
if (native_os == .windows) {
- windows.VirtualFree(slice.ptr, 0, windows.MEM_RELEASE);
+ windows.VirtualFree(memory.ptr, 0, windows.MEM_RELEASE);
} else {
- const buf_aligned_len = mem.alignForward(usize, slice.len, std.heap.pageSize());
- posix.munmap(@alignCast(slice.ptr[0..buf_aligned_len]));
+ const page_aligned_len = mem.alignForward(usize, memory.len, std.heap.pageSize());
+ posix.munmap(memory.ptr[0..page_aligned_len]);
}
}
-fn realloc(uncasted_memory: []u8, new_len: usize, may_move: bool) ?[*]u8 {
- const memory: []align(std.heap.page_size_min) u8 = @alignCast(uncasted_memory);
+pub fn realloc(uncasted_memory: []u8, new_len: usize, may_move: bool) ?[*]u8 {
+ const memory: []align(page_size_min) u8 = @alignCast(uncasted_memory);
const page_size = std.heap.pageSize();
const new_size_aligned = mem.alignForward(usize, new_len, page_size);
diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig
@@ -0,0 +1,261 @@
+//! An allocator that is designed for ReleaseFast optimization mode, with
+//! multi-threading enabled.
+//!
+//! This allocator is a singleton; it uses global state and only one should be
+//! instantiated for the entire process.
+//!
+//! ## Basic Design
+//!
+//! Each thread gets a separate freelist, however, the data must be recoverable
+//! when the thread exits. We do not directly learn when a thread exits, so
+//! occasionally, one thread must attempt to reclaim another thread's
+//! resources.
+//!
+//! Above a certain size, those allocations are memory mapped directly, with no
+//! storage of allocation metadata. This works because the implementation
+//! refuses resizes that would move an allocation from small category to large
+//! category or vice versa.
+//!
+//! Each allocator operation checks the thread identifier from a threadlocal
+//! variable to find out which metadata in the global state to access, and
+//! attempts to grab its lock. This will usually succeed without contention,
+//! unless another thread has been assigned the same id. In the case of such
+//! contention, the thread moves on to the next thread metadata slot and
+//! repeats the process of attempting to obtain the lock.
+//!
+//! Limiting the thread-local metadata array to the same number as the CPU
+//! count ensures that as threads are created and destroyed, they cycle
+//! through the full set of freelists.
+
+const builtin = @import("builtin");
+
+const std = @import("../std.zig");
+const assert = std.debug.assert;
+const mem = std.mem;
+const math = std.math;
+const Allocator = std.mem.Allocator;
+const SmpAllocator = @This();
+const PageAllocator = std.heap.PageAllocator;
+
+/// Cached CPU count, clamped to `max_thread_count`. Zero means it has not
+/// been queried yet; `getCpuCount` fills it in lazily.
+cpu_count: u32,
+/// One freelist slot per potential thread id. Rotation between slots wraps
+/// at `cpu_count`, not at the array length.
+threads: [max_thread_count]Thread,
+
+/// The single process-wide instance that every vtable function operates on.
+var global: SmpAllocator = .{
+ .threads = @splat(.{}),
+ .cpu_count = 0,
+};
+/// Index into `global.threads` that the current thread tries first. It is
+/// updated whenever the thread rotates to a different slot.
+threadlocal var thread_index: u32 = 0;
+
+/// Upper bound on the number of `Thread` slots (and on `cpu_count`).
+const max_thread_count = 128;
+/// Size in bytes of each slab mapped for small allocations; slabs are also
+/// mapped with this alignment.
+const slab_len: usize = @max(std.heap.page_size_max, 64 * 1024);
+/// Because a freed slot must be able to store a free list pointer, the
+/// minimum size class is `log2(@sizeOf(usize))` (3 when `usize` is 8 bytes).
+const min_class = math.log2(@sizeOf(usize));
+/// Number of slab-backed size classes. A request whose class index is at or
+/// above this value is memory mapped directly instead.
+const size_class_count = math.log2(slab_len) - min_class;
+/// When a freelist length reaches this number, a `free` will rotate up to
+/// `max_free_search` times before pushing.
+const max_freelist_len: u8 = 16;
+const max_free_search = 1;
+/// Before mapping a fresh page, `alloc` will rotate this many times.
+const max_alloc_search = 1;
+
+const Thread = struct {
+    /// Zero-sized field that gives the struct cache-line alignment so that
+    /// neighbouring slots do not suffer false sharing.
+    _: void align(std.atomic.cache_line) = {},
+
+    /// Guards all remaining fields of this slot.
+    ///
+    /// A thread takes this lock even for the slot it normally uses, which is
+    /// what makes it possible for other threads to reclaim its freelists.
+    mutex: std.Thread.Mutex = .{},
+
+    /// Per size class: the address `alloc` returns next when the freelist
+    /// for that class is empty.
+    next_addrs: [size_class_count]usize = @splat(0),
+    /// Per size class: address of the most recently freed slot, or 0.
+    frees: [size_class_count]usize = @splat(0),
+    /// Per size class: number of items currently in the freelist.
+    freelist_lens: [size_class_count]u8 = @splat(0),
+
+    /// Returns a slot whose mutex is held by the caller.
+    ///
+    /// The slot named by `thread_index` is tried first. If it is contended,
+    /// the following slots (wrapping at the CPU count) are tried in turn
+    /// until one can be locked, and `thread_index` is updated to that slot.
+    fn lock() *Thread {
+        const preferred = &global.threads[thread_index];
+        if (preferred.mutex.tryLock()) {
+            @branchHint(.likely);
+            return preferred;
+        }
+
+        const cpu_count = getCpuCount();
+        assert(cpu_count != 0);
+
+        var index = thread_index;
+        while (true) {
+            index = (index + 1) % cpu_count;
+            const candidate = &global.threads[index];
+            if (!candidate.mutex.tryLock()) continue;
+            thread_index = index;
+            return candidate;
+        }
+    }
+
+    /// Releases the mutex acquired through `lock`.
+    fn unlock(t: *Thread) void {
+        t.mutex.unlock();
+    }
+};
+
+/// Returns the CPU count used to bound slot rotation, computing and caching
+/// it in `global.cpu_count` on first use.
+///
+/// The value is clamped to `max_thread_count`; if the query fails,
+/// `max_thread_count` itself is used. When several threads race to
+/// initialize the cache, all of them return the value that was stored first.
+fn getCpuCount() u32 {
+    const cached = @atomicLoad(u32, &global.cpu_count, .unordered);
+    if (cached != 0) return cached;
+
+    const detected = std.Thread.getCpuCount() catch max_thread_count;
+    const n: u32 = @min(detected, max_thread_count);
+
+    // A non-null result means another thread published its value first.
+    if (@cmpxchgStrong(u32, &global.cpu_count, 0, n, .monotonic, .monotonic)) |other| {
+        return other;
+    }
+    return n;
+}
+
+/// Function table wiring this file's `alloc`, `resize`, `remap` and `free`
+/// into the `Allocator` interface.
+pub const vtable: Allocator.VTable = .{
+ .alloc = alloc,
+ .resize = resize,
+ .remap = remap,
+ .free = free,
+};
+
+// Compile-time guard: referencing this file from a single-threaded build
+// fails the assertion below.
+comptime {
+ assert(!builtin.single_threaded); // you're holding it wrong
+}
+
+/// `Allocator.VTable.alloc` implementation.
+///
+/// Requests whose size class is at or above `size_class_count` are forwarded
+/// to `PageAllocator.map`. Smaller requests are served from a locked
+/// `Thread` slot, in this order of preference:
+///   1. pop the head of the slot's freelist for the class,
+///   2. bump-allocate the next unused slot of the current slab,
+///   3. rotate to another `Thread` slot and retry (at most
+///      `max_alloc_search` times),
+///   4. map a fresh slab and return its first slot.
+///
+/// Returns null only when mapping memory fails. `context` and `ra` are
+/// unused.
+fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
+ _ = context;
+ _ = ra;
+ const class = sizeClassIndex(len, alignment);
+ if (class >= size_class_count) {
+ @branchHint(.unlikely);
+ return PageAllocator.map(len, alignment);
+ }
+
+ const slot_size = slotSize(class);
+ assert(slab_len % slot_size == 0);
+ var search_count: u8 = 0;
+
+ // `t` is always a slot whose mutex this thread currently holds.
+ var t = Thread.lock();
+
+ outer: while (true) {
+ const top_free_ptr = t.frees[class];
+ if (top_free_ptr != 0) {
+ @branchHint(.likely);
+ defer t.unlock();
+ // The first word of a freed slot stores the next freelist entry.
+ const node: *usize = @ptrFromInt(top_free_ptr);
+ t.frees[class] = node.*;
+ t.freelist_lens[class] -|= 1;
+ return @ptrFromInt(top_free_ptr);
+ }
+
+ // A value that is a multiple of `slab_len` (including the initial 0)
+ // means there is no partially used slab to bump-allocate from.
+ const next_addr = t.next_addrs[class];
+ if ((next_addr % slab_len) != 0) {
+ @branchHint(.likely);
+ defer t.unlock();
+ t.next_addrs[class] = next_addr + slot_size;
+ return @ptrFromInt(next_addr);
+ }
+
+ if (search_count >= max_alloc_search) {
+ @branchHint(.likely);
+ defer t.unlock();
+ // slab alignment here ensures the % slab len earlier catches the end of slots.
+ const slab = PageAllocator.map(slab_len, .fromByteUnits(slab_len)) orelse return null;
+ t.next_addrs[class] = @intFromPtr(slab) + slot_size;
+ t.freelist_lens[class] = 0;
+ return slab;
+ }
+
+ // Nothing available in this slot: release it and spin over the other
+ // slots until one can be locked, then retry from the top.
+ t.unlock();
+ const cpu_count = getCpuCount();
+ assert(cpu_count != 0);
+ var index = thread_index;
+ while (true) {
+ index = (index + 1) % cpu_count;
+ t = &global.threads[index];
+ if (t.mutex.tryLock()) {
+ thread_index = index;
+ search_count += 1;
+ continue :outer;
+ }
+ }
+ }
+}
+
+/// `Allocator.VTable.resize` implementation.
+///
+/// A slab-backed allocation can be resized in place only when the new
+/// length maps to the same size class. A directly mapped allocation is never
+/// allowed to shrink into a slab-backed class; otherwise the request is
+/// delegated to `PageAllocator.realloc` with moving disallowed.
+fn resize(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) bool {
+    _ = context;
+    _ = ra;
+    const old_class = sizeClassIndex(memory.len, alignment);
+    const new_class = sizeClassIndex(new_len, alignment);
+
+    if (old_class < size_class_count) return old_class == new_class;
+    if (new_class < size_class_count) return false;
+    return PageAllocator.realloc(memory, new_len, false) != null;
+}
+
+/// `Allocator.VTable.remap` implementation.
+///
+/// A slab-backed allocation keeps its address when the new length stays in
+/// the same size class and is refused (null) otherwise. A directly mapped
+/// allocation is refused if the new length would fall into a slab-backed
+/// class; otherwise the request is delegated to `PageAllocator.realloc` with
+/// moving allowed.
+fn remap(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) ?[*]u8 {
+    _ = context;
+    _ = ra;
+    const old_class = sizeClassIndex(memory.len, alignment);
+    const new_class = sizeClassIndex(new_len, alignment);
+
+    if (old_class < size_class_count) {
+        return if (old_class == new_class) memory.ptr else null;
+    }
+    if (new_class < size_class_count) return null;
+    return PageAllocator.realloc(memory, new_len, true);
+}
+
+/// `Allocator.VTable.free` implementation.
+///
+/// Allocations whose size class is at or above `size_class_count` were
+/// memory mapped directly and are unmapped here. Smaller allocations are
+/// pushed onto the freelist of a locked `Thread` slot: the first word of the
+/// freed memory is overwritten with the previous freelist head.
+///
+/// When the chosen slot's freelist already holds `max_freelist_len` items,
+/// the caller rotates to another slot up to `max_free_search` times; after
+/// that the push happens regardless of the list length.
+///
+/// `context` and `ra` are unused.
+fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) void {
+    _ = context;
+    _ = ra;
+    const class = sizeClassIndex(memory.len, alignment);
+    if (class >= size_class_count) {
+        @branchHint(.unlikely);
+        return PageAllocator.unmap(@alignCast(memory));
+    }
+
+    const node: *usize = @alignCast(@ptrCast(memory.ptr));
+    var search_count: u8 = 0;
+
+    // `t` is always a slot whose mutex this thread currently holds.
+    var t = Thread.lock();
+
+    outer: while (true) {
+        const freelist_len = t.freelist_lens[class];
+        if (freelist_len < max_freelist_len) {
+            @branchHint(.likely);
+            defer t.unlock();
+            // Count the push. `alloc` decrements this counter on every pop,
+            // so leaving it untouched here would pin it at zero and the
+            // rotation below could never trigger.
+            t.freelist_lens[class] = freelist_len +| 1;
+            node.* = t.frees[class];
+            t.frees[class] = @intFromPtr(node);
+            return;
+        }
+
+        if (search_count >= max_free_search) {
+            // Rotation budget spent: push here even though the list is long.
+            defer t.unlock();
+            t.freelist_lens[class] = freelist_len +| 1;
+            node.* = t.frees[class];
+            t.frees[class] = @intFromPtr(node);
+            return;
+        }
+
+        // This slot's freelist is full: release it and spin over the other
+        // slots until one can be locked, then re-evaluate from the top.
+        t.unlock();
+        const cpu_count = getCpuCount();
+        assert(cpu_count != 0);
+        var index = thread_index;
+        while (true) {
+            index = (index + 1) % cpu_count;
+            t = &global.threads[index];
+            if (t.mutex.tryLock()) {
+                thread_index = index;
+                search_count += 1;
+                continue :outer;
+            }
+        }
+    }
+}
+
+/// Maps a request to its size class index.
+///
+/// The class is the larger of ceil(log2(`len`)) and the log2 alignment,
+/// floored at `min_class`, and then rebased so that `min_class` becomes
+/// index 0. `len` must be non-zero.
+fn sizeClassIndex(len: usize, alignment: mem.Alignment) usize {
+    const len_log2 = @bitSizeOf(usize) - @clz(len - 1);
+    const align_log2 = @intFromEnum(alignment);
+    return @max(len_log2, align_log2, min_class) - min_class;
+}
+
+/// Returns the slot size in bytes for a size class index, i.e. two raised
+/// to the power `class + min_class`.
+fn slotSize(class: usize) usize {
+    const shift: math.Log2Int(usize) = @intCast(class + min_class);
+    return @as(usize, 1) << shift;
+}
diff --git a/lib/std/heap/WasmAllocator.zig b/lib/std/heap/WasmAllocator.zig
@@ -1,5 +1,3 @@
-//! This is intended to be merged into GeneralPurposeAllocator at some point.
-
const std = @import("../std.zig");
const builtin = @import("builtin");
const Allocator = std.mem.Allocator;
diff --git a/lib/std/heap/debug_allocator.zig b/lib/std/heap/debug_allocator.zig
@@ -851,8 +851,6 @@ pub fn DebugAllocator(comptime config: Config) type {
self.mutex.lock();
defer self.mutex.unlock();
- assert(old_memory.len != 0);
-
const size_class_index: usize = @max(@bitSizeOf(usize) - @clz(old_memory.len - 1), @intFromEnum(alignment));
if (size_class_index >= self.buckets.len) {
@branchHint(.unlikely);
diff --git a/src/main.zig b/src/main.zig
@@ -171,30 +171,31 @@ pub fn log(
std.debug.print(prefix1 ++ prefix2 ++ format ++ "\n", args);
}
-var general_purpose_allocator = std.heap.GeneralPurposeAllocator(.{
+var debug_allocator: std.heap.DebugAllocator(.{
.stack_trace_frames = build_options.mem_leak_frames,
-}){};
+}) = .init;
pub fn main() anyerror!void {
crash_report.initialize();
- const use_gpa = (build_options.force_gpa or !builtin.link_libc) and native_os != .wasi;
- const gpa = gpa: {
- if (native_os == .wasi) {
- break :gpa std.heap.wasm_allocator;
- }
- if (use_gpa) {
- break :gpa general_purpose_allocator.allocator();
- }
- // We would prefer to use raw libc allocator here, but cannot
- // use it if it won't support the alignment we need.
- if (@alignOf(std.c.max_align_t) < @max(@alignOf(i128), std.atomic.cache_line)) {
- break :gpa std.heap.c_allocator;
+ const gpa, const is_debug = gpa: {
+ if (build_options.debug_gpa) break :gpa .{ debug_allocator.allocator(), true };
+ if (native_os == .wasi) break :gpa .{ std.heap.wasm_allocator, false };
+ if (builtin.link_libc) {
+ // We would prefer to use raw libc allocator here, but cannot use
+ // it if it won't support the alignment we need.
+ if (@alignOf(std.c.max_align_t) < @max(@alignOf(i128), std.atomic.cache_line)) {
+ break :gpa .{ std.heap.c_allocator, false };
+ }
+ break :gpa .{ std.heap.raw_c_allocator, false };
}
- break :gpa std.heap.raw_c_allocator;
+ break :gpa switch (builtin.mode) {
+ .Debug, .ReleaseSafe => .{ debug_allocator.allocator(), true },
+ .ReleaseFast, .ReleaseSmall => .{ std.heap.smp_allocator, false },
+ };
};
- defer if (use_gpa) {
- _ = general_purpose_allocator.deinit();
+ defer if (is_debug) {
+ _ = debug_allocator.deinit();
};
var arena_instance = std.heap.ArenaAllocator.init(gpa);
defer arena_instance.deinit();
diff --git a/stage1/config.zig.in b/stage1/config.zig.in
@@ -11,6 +11,6 @@ pub const enable_link_snapshots = false;
pub const enable_tracy = false;
pub const value_tracing = false;
pub const skip_non_native = false;
-pub const force_gpa = false;
+pub const debug_gpa = false;
pub const dev = .core;
pub const value_interpret_mode = .direct;