zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

commit 9cf28d1e9bc940b13cd47efb778fd42a4b3b88de (tree)
parent f3227598ebe9ac7e330fea0259d4290ee31e96b9
Author: Matthew Lugg <mlugg@mlugg.co.uk>
Date:   Fri,  8 Mar 2024 21:59:07 +0000

Merge pull request #19214 from mlugg/fuck-usingnamespace

std: fuck usingnamespace
Diffstat:
MCMakeLists.txt | 3++-
Mlib/std/c.zig | 63++++++++++++++++++++++++++++-----------------------------------
Mlib/std/c/openbsd.zig | 5-----
Mlib/std/enums.zig | 1534+++++++++++++++++++++++++++++++++++++------------------------------------------
Mlib/std/os/linux.zig | 594++++++++++++++++++++++++++++++++++++++-----------------------------------------
Alib/std/os/linux/IoUring.zig | 3670+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dlib/std/os/linux/io_uring.zig | 4228-------------------------------------------------------------------------------
Alib/std/os/linux/io_uring_sqe.zig | 579+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mlib/std/os/linux/test.zig | 4++++
Mlib/std/os/windows.zig | 546++++++++++++++++++++++++++++++++++++++++---------------------------------------
10 files changed, 5565 insertions(+), 5661 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -291,7 +291,8 @@ set(ZIG_STAGE2_SOURCES "${CMAKE_SOURCE_DIR}/lib/std/os/linux/errno/generic.zig" "${CMAKE_SOURCE_DIR}/lib/std/os/linux/x86_64.zig" "${CMAKE_SOURCE_DIR}/lib/std/os/linux.zig" - "${CMAKE_SOURCE_DIR}/lib/std/os/linux/io_uring.zig" + "${CMAKE_SOURCE_DIR}/lib/std/os/linux/IoUring.zig" + "${CMAKE_SOURCE_DIR}/lib/std/os/linux/io_uring_sqe.zig" "${CMAKE_SOURCE_DIR}/lib/std/os/linux/x86_64.zig" "${CMAKE_SOURCE_DIR}/lib/std/os/windows.zig" "${CMAKE_SOURCE_DIR}/lib/std/os/windows/ntstatus.zig" diff --git a/lib/std/c.zig b/lib/std/c.zig @@ -1494,38 +1494,33 @@ pub const speed_t = switch (native_os) { pub const whence_t = if (native_os == .wasi) std.os.wasi.whence_t else c_int; // Unix-like systems -pub usingnamespace switch (native_os) { - .netbsd, .windows => struct {}, - else => struct { - pub const DIR = opaque {}; - pub extern "c" fn opendir(pathname: [*:0]const u8) ?*DIR; - pub extern "c" fn fdopendir(fd: c_int) ?*DIR; - pub extern "c" fn rewinddir(dp: *DIR) void; - pub extern "c" fn closedir(dp: *DIR) c_int; - pub extern "c" fn telldir(dp: *DIR) c_long; - pub extern "c" fn seekdir(dp: *DIR, loc: c_long) void; +pub const DIR = opaque {}; +pub extern "c" fn opendir(pathname: [*:0]const u8) ?*DIR; +pub extern "c" fn fdopendir(fd: c_int) ?*DIR; +pub extern "c" fn rewinddir(dp: *DIR) void; +pub extern "c" fn closedir(dp: *DIR) c_int; +pub extern "c" fn telldir(dp: *DIR) c_long; +pub extern "c" fn seekdir(dp: *DIR, loc: c_long) void; - pub extern "c" fn clock_gettime(clk_id: c_int, tp: *c.timespec) c_int; - pub extern "c" fn clock_getres(clk_id: c_int, tp: *c.timespec) c_int; - pub extern "c" fn gettimeofday(noalias tv: ?*c.timeval, noalias tz: ?*c.timezone) c_int; - pub extern "c" fn nanosleep(rqtp: *const c.timespec, rmtp: ?*c.timespec) c_int; +pub extern "c" fn clock_gettime(clk_id: c_int, tp: *c.timespec) c_int; +pub extern "c" fn clock_getres(clk_id: c_int, tp: *c.timespec) c_int; +pub extern "c" fn 
gettimeofday(noalias tv: ?*c.timeval, noalias tz: ?*c.timezone) c_int; +pub extern "c" fn nanosleep(rqtp: *const c.timespec, rmtp: ?*c.timespec) c_int; - pub extern "c" fn getrusage(who: c_int, usage: *c.rusage) c_int; +pub extern "c" fn getrusage(who: c_int, usage: *c.rusage) c_int; - pub extern "c" fn sched_yield() c_int; +pub extern "c" fn sched_yield() c_int; - pub extern "c" fn sigaction(sig: c_int, noalias act: ?*const c.Sigaction, noalias oact: ?*c.Sigaction) c_int; - pub extern "c" fn sigprocmask(how: c_int, noalias set: ?*const c.sigset_t, noalias oset: ?*c.sigset_t) c_int; - pub extern "c" fn sigfillset(set: ?*c.sigset_t) void; - pub extern "c" fn sigwait(set: ?*c.sigset_t, sig: ?*c_int) c_int; +pub extern "c" fn sigaction(sig: c_int, noalias act: ?*const c.Sigaction, noalias oact: ?*c.Sigaction) c_int; +pub extern "c" fn sigprocmask(how: c_int, noalias set: ?*const c.sigset_t, noalias oset: ?*c.sigset_t) c_int; +pub extern "c" fn sigfillset(set: ?*c.sigset_t) void; +pub extern "c" fn sigwait(set: ?*c.sigset_t, sig: ?*c_int) c_int; - pub extern "c" fn socket(domain: c_uint, sock_type: c_uint, protocol: c_uint) c_int; +pub extern "c" fn socket(domain: c_uint, sock_type: c_uint, protocol: c_uint) c_int; - pub extern "c" fn alarm(seconds: c_uint) c_uint; +pub extern "c" fn alarm(seconds: c_uint) c_uint; - pub extern "c" fn msync(addr: *align(page_size) const anyopaque, len: usize, flags: c_int) c_int; - }, -}; +pub extern "c" fn msync(addr: *align(page_size) const anyopaque, len: usize, flags: c_int) c_int; pub const fstat = switch (native_os) { .macos => switch (native_arch) { @@ -1863,16 +1858,14 @@ pub extern "c" fn setlogmask(maskpri: c_int) c_int; pub extern "c" fn if_nametoindex([*:0]const u8) c_int; -pub usingnamespace if (builtin.target.isAndroid()) struct { - // android bionic libc does not implement getcontext, - // and std.os.linux.getcontext also cannot be built for - // bionic libc currently. 
-} else if (native_os == .linux and builtin.target.isMusl()) struct { - // musl does not implement getcontext - pub const getcontext = std.os.linux.getcontext; -} else struct { - pub extern "c" fn getcontext(ucp: *std.os.ucontext_t) c_int; -}; +pub const getcontext = if (builtin.target.isAndroid()) + @compileError("android bionic libc does not implement getcontext") +else if (native_os == .linux and builtin.target.isMusl()) + std.os.linux.getcontext +else + struct { + extern fn getcontext(ucp: *std.os.ucontext_t) c_int; + }.getcontext; pub const max_align_t = if (native_abi == .msvc) f64 diff --git a/lib/std/c/openbsd.zig b/lib/std/c/openbsd.zig @@ -894,11 +894,6 @@ comptime { std.debug.assert(@sizeOf(siginfo_t) == 136); } -pub usingnamespace switch (builtin.cpu.arch) { - .x86_64 => struct {}, - else => struct {}, -}; - pub const ucontext_t = switch (builtin.cpu.arch) { .x86_64 => extern struct { sc_rdi: c_long, diff --git a/lib/std/enums.zig b/lib/std/enums.zig @@ -241,957 +241,794 @@ test nameCast { /// to dense indices. This type does no dynamic allocation and /// can be copied by value. pub fn EnumSet(comptime E: type) type { - const mixin = struct { - fn EnumSetExt(comptime Self: type) type { - const Indexer = Self.Indexer; - return struct { - /// Initializes the set using a struct of bools - pub fn init(init_values: EnumFieldStruct(E, bool, false)) Self { - var result = Self{}; - comptime var i: usize = 0; - inline while (i < Self.len) : (i += 1) { - const key = comptime Indexer.keyForIndex(i); - const tag = comptime @tagName(key); - if (@field(init_values, tag)) { - result.bits.set(i); - } - } - return result; - } - }; - } - }; - return IndexedSet(EnumIndexer(E), mixin.EnumSetExt); -} + return struct { + const Self = @This(); -/// A map keyed by an enum, backed by a bitfield and a dense array. -/// If the enum is not dense, a mapping will be constructed from -/// enum values to dense indices. 
This type does no dynamic -/// allocation and can be copied by value. -pub fn EnumMap(comptime E: type, comptime V: type) type { - const mixin = struct { - fn EnumMapExt(comptime Self: type) type { - const Indexer = Self.Indexer; - return struct { - /// Initializes the map using a sparse struct of optionals - pub fn init(init_values: EnumFieldStruct(E, ?V, @as(?V, null))) Self { - var result = Self{}; - comptime var i: usize = 0; - inline while (i < Self.len) : (i += 1) { - const key = comptime Indexer.keyForIndex(i); - const tag = comptime @tagName(key); - if (@field(init_values, tag)) |*v| { - result.bits.set(i); - result.values[i] = v.*; - } - } - return result; - } - /// Initializes a full mapping with all keys set to value. - /// Consider using EnumArray instead if the map will remain full. - pub fn initFull(value: V) Self { - var result = Self{ - .bits = Self.BitSet.initFull(), - .values = undefined, - }; - @memset(&result.values, value); - return result; - } - /// Initializes a full mapping with supplied values. - /// Consider using EnumArray instead if the map will remain full. - pub fn initFullWith(init_values: EnumFieldStruct(E, V, @as(?V, null))) Self { - return initFullWithDefault(@as(?V, null), init_values); - } - /// Initializes a full mapping with a provided default. - /// Consider using EnumArray instead if the map will remain full. - pub fn initFullWithDefault(comptime default: ?V, init_values: EnumFieldStruct(E, V, default)) Self { - var result = Self{ - .bits = Self.BitSet.initFull(), - .values = undefined, - }; - comptime var i: usize = 0; - inline while (i < Self.len) : (i += 1) { - const key = comptime Indexer.keyForIndex(i); - const tag = comptime @tagName(key); - result.values[i] = @field(init_values, tag); - } - return result; - } - }; - } - }; - return IndexedMap(EnumIndexer(E), V, mixin.EnumMapExt); -} + /// The indexing rules for converting between keys and indices. 
+ pub const Indexer = EnumIndexer(E); + /// The element type for this set. + pub const Key = Indexer.Key; -/// A multiset of enum elements up to a count of usize. Backed -/// by an EnumArray. This type does no dynamic allocation and can -/// be copied by value. -pub fn EnumMultiset(comptime E: type) type { - return BoundedEnumMultiset(E, usize); -} + const BitSet = std.StaticBitSet(Indexer.count); -/// A multiset of enum elements up to CountSize. Backed by an -/// EnumArray. This type does no dynamic allocation and can be -/// copied by value. -pub fn BoundedEnumMultiset(comptime E: type, comptime CountSize: type) type { - return struct { - const Self = @This(); + /// The maximum number of items in this set. + pub const len = Indexer.count; - counts: EnumArray(E, CountSize), + bits: BitSet = BitSet.initEmpty(), - /// Initializes the multiset using a struct of counts. - pub fn init(init_counts: EnumFieldStruct(E, CountSize, 0)) Self { - var self = initWithCount(0); - inline for (@typeInfo(E).Enum.fields) |field| { - const c = @field(init_counts, field.name); - const key = @as(E, @enumFromInt(field.value)); - self.counts.set(key, c); + /// Initializes the set using a struct of bools + pub fn init(init_values: EnumFieldStruct(E, bool, false)) Self { + var result: Self = .{}; + inline for (0..Self.len) |i| { + const key = comptime Indexer.keyForIndex(i); + const tag = @tagName(key); + if (@field(init_values, tag)) { + result.bits.set(i); + } } - return self; + return result; } - /// Initializes the multiset with a count of zero. + /// Returns a set containing no keys. pub fn initEmpty() Self { - return initWithCount(0); + return .{ .bits = BitSet.initEmpty() }; } - /// Initializes the multiset with all keys at the - /// same count. - pub fn initWithCount(comptime c: CountSize) Self { - return .{ - .counts = EnumArray(E, CountSize).initDefault(c, .{}), - }; + /// Returns a set containing all possible keys. 
+ pub fn initFull() Self { + return .{ .bits = BitSet.initFull() }; } - /// Returns the total number of key counts in the multiset. - pub fn count(self: Self) usize { - var sum: usize = 0; - for (self.counts.values) |c| { - sum += c; - } - return sum; + /// Returns a set containing multiple keys. + pub fn initMany(keys: []const Key) Self { + var set = initEmpty(); + for (keys) |key| set.insert(key); + return set; } - /// Checks if at least one key in multiset. - pub fn contains(self: Self, key: E) bool { - return self.counts.get(key) > 0; + /// Returns a set containing a single key. + pub fn initOne(key: Key) Self { + return initMany(&[_]Key{key}); } - /// Removes all instance of a key from multiset. Same as - /// setCount(key, 0). - pub fn removeAll(self: *Self, key: E) void { - return self.counts.set(key, 0); + /// Returns the number of keys in the set. + pub fn count(self: Self) usize { + return self.bits.count(); } - /// Increases the key count by given amount. Caller asserts - /// operation will not overflow. - pub fn addAssertSafe(self: *Self, key: E, c: CountSize) void { - self.counts.getPtr(key).* += c; + /// Checks if a key is in the set. + pub fn contains(self: Self, key: Key) bool { + return self.bits.isSet(Indexer.indexOf(key)); } - /// Increases the key count by given amount. - pub fn add(self: *Self, key: E, c: CountSize) error{Overflow}!void { - self.counts.set(key, try std.math.add(CountSize, self.counts.get(key), c)); + /// Puts a key in the set. + pub fn insert(self: *Self, key: Key) void { + self.bits.set(Indexer.indexOf(key)); } - /// Decreases the key count by given amount. If amount is - /// greater than the number of keys in multset, then key count - /// will be set to zero. - pub fn remove(self: *Self, key: E, c: CountSize) void { - self.counts.getPtr(key).* -= @min(self.getCount(key), c); + /// Removes a key from the set. 
+ pub fn remove(self: *Self, key: Key) void { + self.bits.unset(Indexer.indexOf(key)); } - /// Returns the count for a key. - pub fn getCount(self: Self, key: E) CountSize { - return self.counts.get(key); + /// Changes the presence of a key in the set to match the passed bool. + pub fn setPresent(self: *Self, key: Key, present: bool) void { + self.bits.setValue(Indexer.indexOf(key), present); } - /// Set the count for a key. - pub fn setCount(self: *Self, key: E, c: CountSize) void { - self.counts.set(key, c); + /// Toggles the presence of a key in the set. If the key is in + /// the set, removes it. Otherwise adds it. + pub fn toggle(self: *Self, key: Key) void { + self.bits.toggle(Indexer.indexOf(key)); } - /// Increases the all key counts by given multiset. Caller - /// asserts operation will not overflow any key. - pub fn addSetAssertSafe(self: *Self, other: Self) void { - inline for (@typeInfo(E).Enum.fields) |field| { - const key = @as(E, @enumFromInt(field.value)); - self.addAssertSafe(key, other.getCount(key)); - } + /// Toggles the presence of all keys in the passed set. + pub fn toggleSet(self: *Self, other: Self) void { + self.bits.toggleSet(other.bits); } - /// Increases the all key counts by given multiset. - pub fn addSet(self: *Self, other: Self) error{Overflow}!void { - inline for (@typeInfo(E).Enum.fields) |field| { - const key = @as(E, @enumFromInt(field.value)); - try self.add(key, other.getCount(key)); - } + /// Toggles all possible keys in the set. + pub fn toggleAll(self: *Self) void { + self.bits.toggleAll(); } - /// Decreases the all key counts by given multiset. If - /// the given multiset has more key counts than this, - /// then that key will have a key count of zero. - pub fn removeSet(self: *Self, other: Self) void { - inline for (@typeInfo(E).Enum.fields) |field| { - const key = @as(E, @enumFromInt(field.value)); - self.remove(key, other.getCount(key)); - } + /// Adds all keys in the passed set to this set. 
+ pub fn setUnion(self: *Self, other: Self) void { + self.bits.setUnion(other.bits); } - /// Returns true iff all key counts are the same as - /// given multiset. + /// Removes all keys which are not in the passed set. + pub fn setIntersection(self: *Self, other: Self) void { + self.bits.setIntersection(other.bits); + } + + /// Returns true iff both sets have the same keys. pub fn eql(self: Self, other: Self) bool { - inline for (@typeInfo(E).Enum.fields) |field| { - const key = @as(E, @enumFromInt(field.value)); - if (self.getCount(key) != other.getCount(key)) { - return false; - } - } - return true; + return self.bits.eql(other.bits); } - /// Returns true iff all key counts less than or - /// equal to the given multiset. + /// Returns true iff all the keys in this set are + /// in the other set. The other set may have keys + /// not found in this set. pub fn subsetOf(self: Self, other: Self) bool { - inline for (@typeInfo(E).Enum.fields) |field| { - const key = @as(E, @enumFromInt(field.value)); - if (self.getCount(key) > other.getCount(key)) { - return false; - } - } - return true; + return self.bits.subsetOf(other.bits); } - /// Returns true iff all key counts greater than or - /// equal to the given multiset. + /// Returns true iff this set contains all the keys + /// in the other set. This set may have keys not + /// found in the other set. pub fn supersetOf(self: Self, other: Self) bool { - inline for (@typeInfo(E).Enum.fields) |field| { - const key = @as(E, @enumFromInt(field.value)); - if (self.getCount(key) < other.getCount(key)) { - return false; - } - } - return true; + return self.bits.supersetOf(other.bits); } - /// Returns a multiset with the total key count of this - /// multiset and the other multiset. Caller asserts - /// operation will not overflow any key. - pub fn plusAssertSafe(self: Self, other: Self) Self { - var result = self; - result.addSetAssertSafe(other); - return result; + /// Returns a set with all the keys not in this set. 
+ pub fn complement(self: Self) Self { + return .{ .bits = self.bits.complement() }; } - /// Returns a multiset with the total key count of this - /// multiset and the other multiset. - pub fn plus(self: Self, other: Self) error{Overflow}!Self { - var result = self; - try result.addSet(other); - return result; + /// Returns a set with keys that are in either this + /// set or the other set. + pub fn unionWith(self: Self, other: Self) Self { + return .{ .bits = self.bits.unionWith(other.bits) }; } - /// Returns a multiset with the key count of this - /// multiset minus the corresponding key count in the - /// other multiset. If the other multiset contains - /// more key count than this set, that key will have - /// a count of zero. - pub fn minus(self: Self, other: Self) Self { - var result = self; - result.removeSet(other); - return result; + /// Returns a set with keys that are in both this + /// set and the other set. + pub fn intersectWith(self: Self, other: Self) Self { + return .{ .bits = self.bits.intersectWith(other.bits) }; } - pub const Entry = EnumArray(E, CountSize).Entry; - pub const Iterator = EnumArray(E, CountSize).Iterator; - - /// Returns an iterator over this multiset. Keys with zero - /// counts are included. Modifications to the set during - /// iteration may or may not be observed by the iterator, - /// but will not invalidate it. - pub fn iterator(self: *Self) Iterator { - return self.counts.iterator(); + /// Returns a set with keys that are in either this + /// set or the other set, but not both. + pub fn xorWith(self: Self, other: Self) Self { + return .{ .bits = self.bits.xorWith(other.bits) }; } - }; -} -test EnumMultiset { - const Ball = enum { red, green, blue }; + /// Returns a set with keys that are in this set + /// except for keys in the other set. 
+ pub fn differenceWith(self: Self, other: Self) Self { + return .{ .bits = self.bits.differenceWith(other.bits) }; + } - const empty = EnumMultiset(Ball).initEmpty(); - const r0_g1_b2 = EnumMultiset(Ball).init(.{ - .red = 0, - .green = 1, - .blue = 2, - }); - const ten_of_each = EnumMultiset(Ball).initWithCount(10); - - try testing.expectEqual(empty.count(), 0); - try testing.expectEqual(r0_g1_b2.count(), 3); - try testing.expectEqual(ten_of_each.count(), 30); - - try testing.expect(!empty.contains(.red)); - try testing.expect(!empty.contains(.green)); - try testing.expect(!empty.contains(.blue)); - - try testing.expect(!r0_g1_b2.contains(.red)); - try testing.expect(r0_g1_b2.contains(.green)); - try testing.expect(r0_g1_b2.contains(.blue)); - - try testing.expect(ten_of_each.contains(.red)); - try testing.expect(ten_of_each.contains(.green)); - try testing.expect(ten_of_each.contains(.blue)); - - { - var copy = ten_of_each; - copy.removeAll(.red); - try testing.expect(!copy.contains(.red)); - - // removeAll second time does nothing - copy.removeAll(.red); - try testing.expect(!copy.contains(.red)); - } + /// Returns an iterator over this set, which iterates in + /// index order. Modifications to the set during iteration + /// may or may not be observed by the iterator, but will + /// not invalidate it. 
+ pub fn iterator(self: *const Self) Iterator { + return .{ .inner = self.bits.iterator(.{}) }; + } - { - var copy = ten_of_each; - copy.addAssertSafe(.red, 6); - try testing.expectEqual(copy.getCount(.red), 16); - } + pub const Iterator = struct { + inner: BitSet.Iterator(.{}), - { - var copy = ten_of_each; - try copy.add(.red, 6); - try testing.expectEqual(copy.getCount(.red), 16); + pub fn next(self: *Iterator) ?Key { + return if (self.inner.next()) |index| + Indexer.keyForIndex(index) + else + null; + } + }; + }; +} - try testing.expectError(error.Overflow, copy.add(.red, std.math.maxInt(usize))); - } +/// A map keyed by an enum, backed by a bitfield and a dense array. +/// If the enum is not dense, a mapping will be constructed from +/// enum values to dense indices. This type does no dynamic +/// allocation and can be copied by value. +pub fn EnumMap(comptime E: type, comptime V: type) type { + return struct { + const Self = @This(); - { - var copy = ten_of_each; - copy.remove(.red, 4); - try testing.expectEqual(copy.getCount(.red), 6); + /// The index mapping for this map + pub const Indexer = EnumIndexer(E); + /// The key type used to index this map + pub const Key = Indexer.Key; + /// The value type stored in this map + pub const Value = V; + /// The number of possible keys in the map + pub const len = Indexer.count; - // subtracting more it contains does not underflow - copy.remove(.green, 14); - try testing.expectEqual(copy.getCount(.green), 0); - } + const BitSet = std.StaticBitSet(Indexer.count); - try testing.expectEqual(empty.getCount(.green), 0); - try testing.expectEqual(r0_g1_b2.getCount(.green), 1); - try testing.expectEqual(ten_of_each.getCount(.green), 10); + /// Bits determining whether items are in the map + bits: BitSet = BitSet.initEmpty(), + /// Values of items in the map. If the associated + /// bit is zero, the value is undefined. 
+ values: [Indexer.count]Value = undefined, - { - var copy = empty; - copy.setCount(.red, 6); - try testing.expectEqual(copy.getCount(.red), 6); - } + /// Initializes the map using a sparse struct of optionals + pub fn init(init_values: EnumFieldStruct(E, ?Value, null)) Self { + var result: Self = .{}; + inline for (0..Self.len) |i| { + const key = comptime Indexer.keyForIndex(i); + const tag = @tagName(key); + if (@field(init_values, tag)) |*v| { + result.bits.set(i); + result.values[i] = v.*; + } + } + } - { - var copy = r0_g1_b2; - copy.addSetAssertSafe(ten_of_each); - try testing.expectEqual(copy.getCount(.red), 10); - try testing.expectEqual(copy.getCount(.green), 11); - try testing.expectEqual(copy.getCount(.blue), 12); - } + /// Initializes a full mapping with all keys set to value. + /// Consider using EnumArray instead if the map will remain full. + pub fn initFull(value: Value) Self { + var result: Self = .{ + .bits = Self.BitSet.initFull(), + .values = undefined, + }; + @memset(&result.values, value); + return result; + } - { - var copy = r0_g1_b2; - try copy.addSet(ten_of_each); - try testing.expectEqual(copy.getCount(.red), 10); - try testing.expectEqual(copy.getCount(.green), 11); - try testing.expectEqual(copy.getCount(.blue), 12); + /// Initializes a full mapping with supplied values. + /// Consider using EnumArray instead if the map will remain full. + pub fn initFullWith(init_values: EnumFieldStruct(E, Value, null)) Self { + return initFullWithDefault(null, init_values); + } - const full = EnumMultiset(Ball).initWithCount(std.math.maxInt(usize)); - try testing.expectError(error.Overflow, copy.addSet(full)); - } + /// Initializes a full mapping with a provided default. + /// Consider using EnumArray instead if the map will remain full. 
+ pub fn initFullWithDefault(comptime default: ?Value, init_values: EnumFieldStruct(E, Value, default)) Self { + var result: Self = .{ + .bits = Self.BitSet.initFull(), + .values = undefined, + }; + inline for (0..Self.len) |i| { + const key = comptime Indexer.keyForIndex(i); + const tag = @tagName(key); + result.values[i] = @field(init_values, tag); + } + return result; + } - { - var copy = ten_of_each; - copy.removeSet(r0_g1_b2); - try testing.expectEqual(copy.getCount(.red), 10); - try testing.expectEqual(copy.getCount(.green), 9); - try testing.expectEqual(copy.getCount(.blue), 8); + /// The number of items in the map. + pub fn count(self: Self) usize { + return self.bits.count(); + } - copy.removeSet(ten_of_each); - try testing.expectEqual(copy.getCount(.red), 0); - try testing.expectEqual(copy.getCount(.green), 0); - try testing.expectEqual(copy.getCount(.blue), 0); - } + /// Checks if the map contains an item. + pub fn contains(self: Self, key: Key) bool { + return self.bits.isSet(Indexer.indexOf(key)); + } - try testing.expect(empty.eql(empty)); - try testing.expect(r0_g1_b2.eql(r0_g1_b2)); - try testing.expect(ten_of_each.eql(ten_of_each)); - try testing.expect(!empty.eql(r0_g1_b2)); - try testing.expect(!r0_g1_b2.eql(ten_of_each)); - try testing.expect(!ten_of_each.eql(empty)); + /// Gets the value associated with a key. + /// If the key is not in the map, returns null. + pub fn get(self: Self, key: Key) ?Value { + const index = Indexer.indexOf(key); + return if (self.bits.isSet(index)) self.values[index] else null; + } - try testing.expect(empty.subsetOf(empty)); - try testing.expect(r0_g1_b2.subsetOf(r0_g1_b2)); - try testing.expect(empty.subsetOf(r0_g1_b2)); - try testing.expect(r0_g1_b2.subsetOf(ten_of_each)); - try testing.expect(!ten_of_each.subsetOf(r0_g1_b2)); - try testing.expect(!r0_g1_b2.subsetOf(empty)); + /// Gets the value associated with a key, which must + /// exist in the map. 
+ pub fn getAssertContains(self: Self, key: Key) Value { + const index = Indexer.indexOf(key); + assert(self.bits.isSet(index)); + return self.values[index]; + } - try testing.expect(empty.supersetOf(empty)); - try testing.expect(r0_g1_b2.supersetOf(r0_g1_b2)); - try testing.expect(r0_g1_b2.supersetOf(empty)); - try testing.expect(ten_of_each.supersetOf(r0_g1_b2)); - try testing.expect(!r0_g1_b2.supersetOf(ten_of_each)); - try testing.expect(!empty.supersetOf(r0_g1_b2)); + /// Gets the address of the value associated with a key. + /// If the key is not in the map, returns null. + pub fn getPtr(self: *Self, key: Key) ?*Value { + const index = Indexer.indexOf(key); + return if (self.bits.isSet(index)) &self.values[index] else null; + } - { - // with multisets it could be the case where two - // multisets are neither subset nor superset of each - // other. + /// Gets the address of the const value associated with a key. + /// If the key is not in the map, returns null. + pub fn getPtrConst(self: *const Self, key: Key) ?*const Value { + const index = Indexer.indexOf(key); + return if (self.bits.isSet(index)) &self.values[index] else null; + } - const r10 = EnumMultiset(Ball).init(.{ - .red = 10, - }); - const b10 = EnumMultiset(Ball).init(.{ - .blue = 10, - }); + /// Gets the address of the value associated with a key. + /// The key must be present in the map. + pub fn getPtrAssertContains(self: *Self, key: Key) *Value { + const index = Indexer.indexOf(key); + assert(self.bits.isSet(index)); + return &self.values[index]; + } - try testing.expect(!r10.subsetOf(b10)); - try testing.expect(!b10.subsetOf(r10)); - try testing.expect(!r10.supersetOf(b10)); - try testing.expect(!b10.supersetOf(r10)); - } + /// Gets the address of the const value associated with a key. + /// The key must be present in the map. 
+ pub fn getPtrConstAssertContains(self: *const Self, key: Key) *const Value { + const index = Indexer.indexOf(key); + assert(self.bits.isSet(index)); + return &self.values[index]; + } - { - const result = r0_g1_b2.plusAssertSafe(ten_of_each); - try testing.expectEqual(result.getCount(.red), 10); - try testing.expectEqual(result.getCount(.green), 11); - try testing.expectEqual(result.getCount(.blue), 12); - } + /// Adds the key to the map with the supplied value. + /// If the key is already in the map, overwrites the value. + pub fn put(self: *Self, key: Key, value: Value) void { + const index = Indexer.indexOf(key); + self.bits.set(index); + self.values[index] = value; + } - { - const result = try r0_g1_b2.plus(ten_of_each); - try testing.expectEqual(result.getCount(.red), 10); - try testing.expectEqual(result.getCount(.green), 11); - try testing.expectEqual(result.getCount(.blue), 12); + /// Adds the key to the map with an undefined value. + /// If the key is already in the map, the value becomes undefined. + /// A pointer to the value is returned, which should be + /// used to initialize the value. + pub fn putUninitialized(self: *Self, key: Key) *Value { + const index = Indexer.indexOf(key); + self.bits.set(index); + self.values[index] = undefined; + return &self.values[index]; + } - const full = EnumMultiset(Ball).initWithCount(std.math.maxInt(usize)); - try testing.expectError(error.Overflow, result.plus(full)); - } + /// Sets the value associated with the key in the map, + /// and returns the old value. If the key was not in + /// the map, returns null. 
+ pub fn fetchPut(self: *Self, key: Key, value: Value) ?Value { + const index = Indexer.indexOf(key); + const result: ?Value = if (self.bits.isSet(index)) self.values[index] else null; + self.bits.set(index); + self.values[index] = value; + return result; + } - { - const result = ten_of_each.minus(r0_g1_b2); - try testing.expectEqual(result.getCount(.red), 10); - try testing.expectEqual(result.getCount(.green), 9); - try testing.expectEqual(result.getCount(.blue), 8); - } + /// Removes a key from the map. If the key was not in the map, + /// does nothing. + pub fn remove(self: *Self, key: Key) void { + const index = Indexer.indexOf(key); + self.bits.unset(index); + self.values[index] = undefined; + } - { - const result = ten_of_each.minus(r0_g1_b2).minus(ten_of_each); - try testing.expectEqual(result.getCount(.red), 0); - try testing.expectEqual(result.getCount(.green), 0); - try testing.expectEqual(result.getCount(.blue), 0); - } + /// Removes a key from the map, and returns the old value. + /// If the key was not in the map, returns null. + pub fn fetchRemove(self: *Self, key: Key) ?Value { + const index = Indexer.indexOf(key); + const result: ?Value = if (self.bits.isSet(index)) self.values[index] else null; + self.bits.unset(index); + self.values[index] = undefined; + return result; + } - { - var copy = empty; - var it = copy.iterator(); - var entry = it.next().?; - try testing.expectEqual(entry.key, .red); - try testing.expectEqual(entry.value.*, 0); - entry = it.next().?; - try testing.expectEqual(entry.key, .green); - try testing.expectEqual(entry.value.*, 0); - entry = it.next().?; - try testing.expectEqual(entry.key, .blue); - try testing.expectEqual(entry.value.*, 0); - try testing.expectEqual(it.next(), null); - } + /// Returns an iterator over the map, which visits items in index order. + /// Modifications to the underlying map may or may not be observed by + /// the iterator, but will not invalidate it. 
+ pub fn iterator(self: *Self) Iterator { + return .{ + .inner = self.bits.iterator(.{}), + .values = &self.values, + }; + } - { - var copy = r0_g1_b2; - var it = copy.iterator(); - var entry = it.next().?; - try testing.expectEqual(entry.key, .red); - try testing.expectEqual(entry.value.*, 0); - entry = it.next().?; - try testing.expectEqual(entry.key, .green); - try testing.expectEqual(entry.value.*, 1); - entry = it.next().?; - try testing.expectEqual(entry.key, .blue); - try testing.expectEqual(entry.value.*, 2); - try testing.expectEqual(it.next(), null); - } -} + /// An entry in the map. + pub const Entry = struct { + /// The key associated with this entry. + /// Modifying this key will not change the map. + key: Key, -/// An array keyed by an enum, backed by a dense array. -/// If the enum is not dense, a mapping will be constructed from -/// enum values to dense indices. This type does no dynamic -/// allocation and can be copied by value. -pub fn EnumArray(comptime E: type, comptime V: type) type { - const mixin = struct { - fn EnumArrayExt(comptime Self: type) type { - const Indexer = Self.Indexer; - return struct { - /// Initializes all values in the enum array - pub fn init(init_values: EnumFieldStruct(E, V, @as(?V, null))) Self { - return initDefault(@as(?V, null), init_values); - } + /// A pointer to the value in the map associated + /// with this key. Modifications through this + /// pointer will modify the underlying data. + value: *Value, + }; + + pub const Iterator = struct { + inner: BitSet.Iterator(.{}), + values: *[Indexer.count]Value, - /// Initializes values in the enum array, with the specified default. 
- pub fn initDefault(comptime default: ?V, init_values: EnumFieldStruct(E, V, default)) Self { - var result = Self{ .values = undefined }; - comptime var i: usize = 0; - inline while (i < Self.len) : (i += 1) { - const key = comptime Indexer.keyForIndex(i); - const tag = @tagName(key); - result.values[i] = @field(init_values, tag); + pub fn next(self: *Iterator) ?Entry { + return if (self.inner.next()) |index| + Entry{ + .key = Indexer.keyForIndex(index), + .value = &self.values[index], } - return result; - } - }; - } + else + null; + } + }; }; - return IndexedArray(EnumIndexer(E), V, mixin.EnumArrayExt); } -fn NoExtension(comptime Self: type) type { - _ = Self; - return NoExt; +/// A multiset of enum elements up to a count of usize. Backed +/// by an EnumArray. This type does no dynamic allocation and can +/// be copied by value. +pub fn EnumMultiset(comptime E: type) type { + return BoundedEnumMultiset(E, usize); } -const NoExt = struct {}; -/// A set type with an Indexer mapping from keys to indices. -/// Presence or absence is stored as a dense bitfield. This -/// type does no allocation and can be copied by value. -pub fn IndexedSet(comptime I: type, comptime Ext: ?fn (type) type) type { - comptime ensureIndexer(I); +/// A multiset of enum elements up to CountSize. Backed by an +/// EnumArray. This type does no dynamic allocation and can be +/// copied by value. +pub fn BoundedEnumMultiset(comptime E: type, comptime CountSize: type) type { return struct { const Self = @This(); - pub usingnamespace (Ext orelse NoExtension)(Self); - - /// The indexing rules for converting between keys and indices. - pub const Indexer = I; - /// The element type for this set. - pub const Key = Indexer.Key; - - const BitSet = std.StaticBitSet(Indexer.count); - - /// The maximum number of items in this set. - pub const len = Indexer.count; - - bits: BitSet = BitSet.initEmpty(), + counts: EnumArray(E, CountSize), - /// Returns a set containing no keys. 
- pub fn initEmpty() Self { - return .{ .bits = BitSet.initEmpty() }; + /// Initializes the multiset using a struct of counts. + pub fn init(init_counts: EnumFieldStruct(E, CountSize, 0)) Self { + var self = initWithCount(0); + inline for (@typeInfo(E).Enum.fields) |field| { + const c = @field(init_counts, field.name); + const key = @as(E, @enumFromInt(field.value)); + self.counts.set(key, c); + } + return self; } - /// Returns a set containing all possible keys. - pub fn initFull() Self { - return .{ .bits = BitSet.initFull() }; + /// Initializes the multiset with a count of zero. + pub fn initEmpty() Self { + return initWithCount(0); } - /// Returns a set containing multiple keys. - pub fn initMany(keys: []const Key) Self { - var set = initEmpty(); - for (keys) |key| set.insert(key); - return set; + /// Initializes the multiset with all keys at the + /// same count. + pub fn initWithCount(comptime c: CountSize) Self { + return .{ + .counts = EnumArray(E, CountSize).initDefault(c, .{}), + }; } - /// Returns a set containing a single key. - pub fn initOne(key: Key) Self { - return initMany(&[_]Key{key}); + /// Returns the total number of key counts in the multiset. + pub fn count(self: Self) usize { + var sum: usize = 0; + for (self.counts.values) |c| { + sum += c; + } + return sum; } - /// Returns the number of keys in the set. - pub fn count(self: Self) usize { - return self.bits.count(); + /// Checks if at least one key in multiset. + pub fn contains(self: Self, key: E) bool { + return self.counts.get(key) > 0; } - /// Checks if a key is in the set. - pub fn contains(self: Self, key: Key) bool { - return self.bits.isSet(Indexer.indexOf(key)); + /// Removes all instance of a key from multiset. Same as + /// setCount(key, 0). + pub fn removeAll(self: *Self, key: E) void { + return self.counts.set(key, 0); } - /// Puts a key in the set. 
- pub fn insert(self: *Self, key: Key) void { - self.bits.set(Indexer.indexOf(key)); + /// Increases the key count by given amount. Caller asserts + /// operation will not overflow. + pub fn addAssertSafe(self: *Self, key: E, c: CountSize) void { + self.counts.getPtr(key).* += c; } - /// Removes a key from the set. - pub fn remove(self: *Self, key: Key) void { - self.bits.unset(Indexer.indexOf(key)); + /// Increases the key count by given amount. + pub fn add(self: *Self, key: E, c: CountSize) error{Overflow}!void { + self.counts.set(key, try std.math.add(CountSize, self.counts.get(key), c)); } - /// Changes the presence of a key in the set to match the passed bool. - pub fn setPresent(self: *Self, key: Key, present: bool) void { - self.bits.setValue(Indexer.indexOf(key), present); + /// Decreases the key count by given amount. If amount is + /// greater than the number of keys in multset, then key count + /// will be set to zero. + pub fn remove(self: *Self, key: E, c: CountSize) void { + self.counts.getPtr(key).* -= @min(self.getCount(key), c); } - /// Toggles the presence of a key in the set. If the key is in - /// the set, removes it. Otherwise adds it. - pub fn toggle(self: *Self, key: Key) void { - self.bits.toggle(Indexer.indexOf(key)); + /// Returns the count for a key. + pub fn getCount(self: Self, key: E) CountSize { + return self.counts.get(key); } - /// Toggles the presence of all keys in the passed set. - pub fn toggleSet(self: *Self, other: Self) void { - self.bits.toggleSet(other.bits); + /// Set the count for a key. + pub fn setCount(self: *Self, key: E, c: CountSize) void { + self.counts.set(key, c); } - /// Toggles all possible keys in the set. - pub fn toggleAll(self: *Self) void { - self.bits.toggleAll(); + /// Increases the all key counts by given multiset. Caller + /// asserts operation will not overflow any key. 
+ pub fn addSetAssertSafe(self: *Self, other: Self) void { + inline for (@typeInfo(E).Enum.fields) |field| { + const key = @as(E, @enumFromInt(field.value)); + self.addAssertSafe(key, other.getCount(key)); + } } - /// Adds all keys in the passed set to this set. - pub fn setUnion(self: *Self, other: Self) void { - self.bits.setUnion(other.bits); + /// Increases the all key counts by given multiset. + pub fn addSet(self: *Self, other: Self) error{Overflow}!void { + inline for (@typeInfo(E).Enum.fields) |field| { + const key = @as(E, @enumFromInt(field.value)); + try self.add(key, other.getCount(key)); + } } - /// Removes all keys which are not in the passed set. - pub fn setIntersection(self: *Self, other: Self) void { - self.bits.setIntersection(other.bits); + /// Decreases the all key counts by given multiset. If + /// the given multiset has more key counts than this, + /// then that key will have a key count of zero. + pub fn removeSet(self: *Self, other: Self) void { + inline for (@typeInfo(E).Enum.fields) |field| { + const key = @as(E, @enumFromInt(field.value)); + self.remove(key, other.getCount(key)); + } } - /// Returns true iff both sets have the same keys. + /// Returns true iff all key counts are the same as + /// given multiset. pub fn eql(self: Self, other: Self) bool { - return self.bits.eql(other.bits); + inline for (@typeInfo(E).Enum.fields) |field| { + const key = @as(E, @enumFromInt(field.value)); + if (self.getCount(key) != other.getCount(key)) { + return false; + } + } + return true; } - /// Returns true iff all the keys in this set are - /// in the other set. The other set may have keys - /// not found in this set. + /// Returns true iff all key counts less than or + /// equal to the given multiset. 
pub fn subsetOf(self: Self, other: Self) bool { - return self.bits.subsetOf(other.bits); + inline for (@typeInfo(E).Enum.fields) |field| { + const key = @as(E, @enumFromInt(field.value)); + if (self.getCount(key) > other.getCount(key)) { + return false; + } + } + return true; } - /// Returns true iff this set contains all the keys - /// in the other set. This set may have keys not - /// found in the other set. + /// Returns true iff all key counts greater than or + /// equal to the given multiset. pub fn supersetOf(self: Self, other: Self) bool { - return self.bits.supersetOf(other.bits); - } - - /// Returns a set with all the keys not in this set. - pub fn complement(self: Self) Self { - return .{ .bits = self.bits.complement() }; - } - - /// Returns a set with keys that are in either this - /// set or the other set. - pub fn unionWith(self: Self, other: Self) Self { - return .{ .bits = self.bits.unionWith(other.bits) }; - } - - /// Returns a set with keys that are in both this - /// set and the other set. - pub fn intersectWith(self: Self, other: Self) Self { - return .{ .bits = self.bits.intersectWith(other.bits) }; + inline for (@typeInfo(E).Enum.fields) |field| { + const key = @as(E, @enumFromInt(field.value)); + if (self.getCount(key) < other.getCount(key)) { + return false; + } + } + return true; } - /// Returns a set with keys that are in either this - /// set or the other set, but not both. - pub fn xorWith(self: Self, other: Self) Self { - return .{ .bits = self.bits.xorWith(other.bits) }; + /// Returns a multiset with the total key count of this + /// multiset and the other multiset. Caller asserts + /// operation will not overflow any key. + pub fn plusAssertSafe(self: Self, other: Self) Self { + var result = self; + result.addSetAssertSafe(other); + return result; } - /// Returns a set with keys that are in this set - /// except for keys in the other set. 
- pub fn differenceWith(self: Self, other: Self) Self { - return .{ .bits = self.bits.differenceWith(other.bits) }; + /// Returns a multiset with the total key count of this + /// multiset and the other multiset. + pub fn plus(self: Self, other: Self) error{Overflow}!Self { + var result = self; + try result.addSet(other); + return result; } - /// Returns an iterator over this set, which iterates in - /// index order. Modifications to the set during iteration - /// may or may not be observed by the iterator, but will - /// not invalidate it. - pub fn iterator(self: *const Self) Iterator { - return .{ .inner = self.bits.iterator(.{}) }; + /// Returns a multiset with the key count of this + /// multiset minus the corresponding key count in the + /// other multiset. If the other multiset contains + /// more key count than this set, that key will have + /// a count of zero. + pub fn minus(self: Self, other: Self) Self { + var result = self; + result.removeSet(other); + return result; } - pub const Iterator = struct { - inner: BitSet.Iterator(.{}), + pub const Entry = EnumArray(E, CountSize).Entry; + pub const Iterator = EnumArray(E, CountSize).Iterator; - pub fn next(self: *Iterator) ?Key { - return if (self.inner.next()) |index| - Indexer.keyForIndex(index) - else - null; - } - }; + /// Returns an iterator over this multiset. Keys with zero + /// counts are included. Modifications to the set during + /// iteration may or may not be observed by the iterator, + /// but will not invalidate it. 
+ pub fn iterator(self: *Self) Iterator { + return self.counts.iterator(); + } }; } -test "pure EnumSet fns" { - const Suit = enum { spades, hearts, clubs, diamonds }; - - const empty = EnumSet(Suit).initEmpty(); - const full = EnumSet(Suit).initFull(); - const black = EnumSet(Suit).initMany(&[_]Suit{ .spades, .clubs }); - const red = EnumSet(Suit).initMany(&[_]Suit{ .hearts, .diamonds }); - - try testing.expect(empty.eql(empty)); - try testing.expect(full.eql(full)); - try testing.expect(!empty.eql(full)); - try testing.expect(!full.eql(empty)); - try testing.expect(!empty.eql(black)); - try testing.expect(!full.eql(red)); - try testing.expect(!red.eql(empty)); - try testing.expect(!black.eql(full)); - - try testing.expect(empty.subsetOf(empty)); - try testing.expect(empty.subsetOf(full)); - try testing.expect(full.subsetOf(full)); - try testing.expect(!black.subsetOf(red)); - try testing.expect(!red.subsetOf(black)); - - try testing.expect(full.supersetOf(full)); - try testing.expect(full.supersetOf(empty)); - try testing.expect(empty.supersetOf(empty)); - try testing.expect(!black.supersetOf(red)); - try testing.expect(!red.supersetOf(black)); - - try testing.expect(empty.complement().eql(full)); - try testing.expect(full.complement().eql(empty)); - try testing.expect(black.complement().eql(red)); - try testing.expect(red.complement().eql(black)); +test EnumMultiset { + const Ball = enum { red, green, blue }; - try testing.expect(empty.unionWith(empty).eql(empty)); - try testing.expect(empty.unionWith(full).eql(full)); - try testing.expect(full.unionWith(full).eql(full)); - try testing.expect(full.unionWith(empty).eql(full)); - try testing.expect(black.unionWith(red).eql(full)); - try testing.expect(red.unionWith(black).eql(full)); + const empty = EnumMultiset(Ball).initEmpty(); + const r0_g1_b2 = EnumMultiset(Ball).init(.{ + .red = 0, + .green = 1, + .blue = 2, + }); + const ten_of_each = EnumMultiset(Ball).initWithCount(10); - try 
testing.expect(empty.intersectWith(empty).eql(empty)); - try testing.expect(empty.intersectWith(full).eql(empty)); - try testing.expect(full.intersectWith(full).eql(full)); - try testing.expect(full.intersectWith(empty).eql(empty)); - try testing.expect(black.intersectWith(red).eql(empty)); - try testing.expect(red.intersectWith(black).eql(empty)); + try testing.expectEqual(empty.count(), 0); + try testing.expectEqual(r0_g1_b2.count(), 3); + try testing.expectEqual(ten_of_each.count(), 30); - try testing.expect(empty.xorWith(empty).eql(empty)); - try testing.expect(empty.xorWith(full).eql(full)); - try testing.expect(full.xorWith(full).eql(empty)); - try testing.expect(full.xorWith(empty).eql(full)); - try testing.expect(black.xorWith(red).eql(full)); - try testing.expect(red.xorWith(black).eql(full)); + try testing.expect(!empty.contains(.red)); + try testing.expect(!empty.contains(.green)); + try testing.expect(!empty.contains(.blue)); - try testing.expect(empty.differenceWith(empty).eql(empty)); - try testing.expect(empty.differenceWith(full).eql(empty)); - try testing.expect(full.differenceWith(full).eql(empty)); - try testing.expect(full.differenceWith(empty).eql(full)); - try testing.expect(full.differenceWith(red).eql(black)); - try testing.expect(full.differenceWith(black).eql(red)); -} + try testing.expect(!r0_g1_b2.contains(.red)); + try testing.expect(r0_g1_b2.contains(.green)); + try testing.expect(r0_g1_b2.contains(.blue)); -test "EnumSet empty" { - const E = enum {}; - const empty = EnumSet(E).initEmpty(); - const full = EnumSet(E).initFull(); + try testing.expect(ten_of_each.contains(.red)); + try testing.expect(ten_of_each.contains(.green)); + try testing.expect(ten_of_each.contains(.blue)); - try std.testing.expect(empty.eql(full)); - try std.testing.expect(empty.complement().eql(full)); - try std.testing.expect(empty.complement().eql(full.complement())); - try std.testing.expect(empty.eql(full.complement())); -} + { + var copy = ten_of_each; + 
copy.removeAll(.red); + try testing.expect(!copy.contains(.red)); -test "EnumSet const iterator" { - const Direction = enum { up, down, left, right }; - const diag_move = init: { - var move = EnumSet(Direction).initEmpty(); - move.insert(.right); - move.insert(.up); - break :init move; - }; + // removeAll second time does nothing + copy.removeAll(.red); + try testing.expect(!copy.contains(.red)); + } - var result = EnumSet(Direction).initEmpty(); - var it = diag_move.iterator(); - while (it.next()) |dir| { - result.insert(dir); + { + var copy = ten_of_each; + copy.addAssertSafe(.red, 6); + try testing.expectEqual(copy.getCount(.red), 16); } - try testing.expect(result.eql(diag_move)); -} + { + var copy = ten_of_each; + try copy.add(.red, 6); + try testing.expectEqual(copy.getCount(.red), 16); -/// A map from keys to values, using an index lookup. Uses a -/// bitfield to track presence and a dense array of values. -/// This type does no allocation and can be copied by value. -pub fn IndexedMap(comptime I: type, comptime V: type, comptime Ext: ?fn (type) type) type { - comptime ensureIndexer(I); - return struct { - const Self = @This(); + try testing.expectError(error.Overflow, copy.add(.red, std.math.maxInt(usize))); + } - pub usingnamespace (Ext orelse NoExtension)(Self); + { + var copy = ten_of_each; + copy.remove(.red, 4); + try testing.expectEqual(copy.getCount(.red), 6); - /// The index mapping for this map - pub const Indexer = I; - /// The key type used to index this map - pub const Key = Indexer.Key; - /// The value type stored in this map - pub const Value = V; - /// The number of possible keys in the map - pub const len = Indexer.count; + // subtracting more it contains does not underflow + copy.remove(.green, 14); + try testing.expectEqual(copy.getCount(.green), 0); + } - const BitSet = std.StaticBitSet(Indexer.count); + try testing.expectEqual(empty.getCount(.green), 0); + try testing.expectEqual(r0_g1_b2.getCount(.green), 1); + try 
testing.expectEqual(ten_of_each.getCount(.green), 10); - /// Bits determining whether items are in the map - bits: BitSet = BitSet.initEmpty(), - /// Values of items in the map. If the associated - /// bit is zero, the value is undefined. - values: [Indexer.count]Value = undefined, + { + var copy = empty; + copy.setCount(.red, 6); + try testing.expectEqual(copy.getCount(.red), 6); + } - /// The number of items in the map. - pub fn count(self: Self) usize { - return self.bits.count(); - } + { + var copy = r0_g1_b2; + copy.addSetAssertSafe(ten_of_each); + try testing.expectEqual(copy.getCount(.red), 10); + try testing.expectEqual(copy.getCount(.green), 11); + try testing.expectEqual(copy.getCount(.blue), 12); + } - /// Checks if the map contains an item. - pub fn contains(self: Self, key: Key) bool { - return self.bits.isSet(Indexer.indexOf(key)); - } + { + var copy = r0_g1_b2; + try copy.addSet(ten_of_each); + try testing.expectEqual(copy.getCount(.red), 10); + try testing.expectEqual(copy.getCount(.green), 11); + try testing.expectEqual(copy.getCount(.blue), 12); - /// Gets the value associated with a key. - /// If the key is not in the map, returns null. - pub fn get(self: Self, key: Key) ?Value { - const index = Indexer.indexOf(key); - return if (self.bits.isSet(index)) self.values[index] else null; - } + const full = EnumMultiset(Ball).initWithCount(std.math.maxInt(usize)); + try testing.expectError(error.Overflow, copy.addSet(full)); + } - /// Gets the value associated with a key, which must - /// exist in the map. - pub fn getAssertContains(self: Self, key: Key) Value { - const index = Indexer.indexOf(key); - assert(self.bits.isSet(index)); - return self.values[index]; - } + { + var copy = ten_of_each; + copy.removeSet(r0_g1_b2); + try testing.expectEqual(copy.getCount(.red), 10); + try testing.expectEqual(copy.getCount(.green), 9); + try testing.expectEqual(copy.getCount(.blue), 8); - /// Gets the address of the value associated with a key. 
- /// If the key is not in the map, returns null. - pub fn getPtr(self: *Self, key: Key) ?*Value { - const index = Indexer.indexOf(key); - return if (self.bits.isSet(index)) &self.values[index] else null; - } + copy.removeSet(ten_of_each); + try testing.expectEqual(copy.getCount(.red), 0); + try testing.expectEqual(copy.getCount(.green), 0); + try testing.expectEqual(copy.getCount(.blue), 0); + } - /// Gets the address of the const value associated with a key. - /// If the key is not in the map, returns null. - pub fn getPtrConst(self: *const Self, key: Key) ?*const Value { - const index = Indexer.indexOf(key); - return if (self.bits.isSet(index)) &self.values[index] else null; - } + try testing.expect(empty.eql(empty)); + try testing.expect(r0_g1_b2.eql(r0_g1_b2)); + try testing.expect(ten_of_each.eql(ten_of_each)); + try testing.expect(!empty.eql(r0_g1_b2)); + try testing.expect(!r0_g1_b2.eql(ten_of_each)); + try testing.expect(!ten_of_each.eql(empty)); - /// Gets the address of the value associated with a key. - /// The key must be present in the map. - pub fn getPtrAssertContains(self: *Self, key: Key) *Value { - const index = Indexer.indexOf(key); - assert(self.bits.isSet(index)); - return &self.values[index]; - } + try testing.expect(empty.subsetOf(empty)); + try testing.expect(r0_g1_b2.subsetOf(r0_g1_b2)); + try testing.expect(empty.subsetOf(r0_g1_b2)); + try testing.expect(r0_g1_b2.subsetOf(ten_of_each)); + try testing.expect(!ten_of_each.subsetOf(r0_g1_b2)); + try testing.expect(!r0_g1_b2.subsetOf(empty)); - /// Gets the address of the const value associated with a key. - /// The key must be present in the map. 
- pub fn getPtrConstAssertContains(self: *const Self, key: Key) *const Value { - const index = Indexer.indexOf(key); - assert(self.bits.isSet(index)); - return &self.values[index]; - } + try testing.expect(empty.supersetOf(empty)); + try testing.expect(r0_g1_b2.supersetOf(r0_g1_b2)); + try testing.expect(r0_g1_b2.supersetOf(empty)); + try testing.expect(ten_of_each.supersetOf(r0_g1_b2)); + try testing.expect(!r0_g1_b2.supersetOf(ten_of_each)); + try testing.expect(!empty.supersetOf(r0_g1_b2)); - /// Adds the key to the map with the supplied value. - /// If the key is already in the map, overwrites the value. - pub fn put(self: *Self, key: Key, value: Value) void { - const index = Indexer.indexOf(key); - self.bits.set(index); - self.values[index] = value; - } + { + // with multisets it could be the case where two + // multisets are neither subset nor superset of each + // other. - /// Adds the key to the map with an undefined value. - /// If the key is already in the map, the value becomes undefined. - /// A pointer to the value is returned, which should be - /// used to initialize the value. - pub fn putUninitialized(self: *Self, key: Key) *Value { - const index = Indexer.indexOf(key); - self.bits.set(index); - self.values[index] = undefined; - return &self.values[index]; - } + const r10 = EnumMultiset(Ball).init(.{ + .red = 10, + }); + const b10 = EnumMultiset(Ball).init(.{ + .blue = 10, + }); - /// Sets the value associated with the key in the map, - /// and returns the old value. If the key was not in - /// the map, returns null. 
- pub fn fetchPut(self: *Self, key: Key, value: Value) ?Value { - const index = Indexer.indexOf(key); - const result: ?Value = if (self.bits.isSet(index)) self.values[index] else null; - self.bits.set(index); - self.values[index] = value; - return result; - } + try testing.expect(!r10.subsetOf(b10)); + try testing.expect(!b10.subsetOf(r10)); + try testing.expect(!r10.supersetOf(b10)); + try testing.expect(!b10.supersetOf(r10)); + } - /// Removes a key from the map. If the key was not in the map, - /// does nothing. - pub fn remove(self: *Self, key: Key) void { - const index = Indexer.indexOf(key); - self.bits.unset(index); - self.values[index] = undefined; - } + { + const result = r0_g1_b2.plusAssertSafe(ten_of_each); + try testing.expectEqual(result.getCount(.red), 10); + try testing.expectEqual(result.getCount(.green), 11); + try testing.expectEqual(result.getCount(.blue), 12); + } - /// Removes a key from the map, and returns the old value. - /// If the key was not in the map, returns null. - pub fn fetchRemove(self: *Self, key: Key) ?Value { - const index = Indexer.indexOf(key); - const result: ?Value = if (self.bits.isSet(index)) self.values[index] else null; - self.bits.unset(index); - self.values[index] = undefined; - return result; - } + { + const result = try r0_g1_b2.plus(ten_of_each); + try testing.expectEqual(result.getCount(.red), 10); + try testing.expectEqual(result.getCount(.green), 11); + try testing.expectEqual(result.getCount(.blue), 12); - /// Returns an iterator over the map, which visits items in index order. - /// Modifications to the underlying map may or may not be observed by - /// the iterator, but will not invalidate it. - pub fn iterator(self: *Self) Iterator { - return .{ - .inner = self.bits.iterator(.{}), - .values = &self.values, - }; - } + const full = EnumMultiset(Ball).initWithCount(std.math.maxInt(usize)); + try testing.expectError(error.Overflow, result.plus(full)); + } - /// An entry in the map. 
- pub const Entry = struct { - /// The key associated with this entry. - /// Modifying this key will not change the map. - key: Key, + { + const result = ten_of_each.minus(r0_g1_b2); + try testing.expectEqual(result.getCount(.red), 10); + try testing.expectEqual(result.getCount(.green), 9); + try testing.expectEqual(result.getCount(.blue), 8); + } - /// A pointer to the value in the map associated - /// with this key. Modifications through this - /// pointer will modify the underlying data. - value: *Value, - }; + { + const result = ten_of_each.minus(r0_g1_b2).minus(ten_of_each); + try testing.expectEqual(result.getCount(.red), 0); + try testing.expectEqual(result.getCount(.green), 0); + try testing.expectEqual(result.getCount(.blue), 0); + } - pub const Iterator = struct { - inner: BitSet.Iterator(.{}), - values: *[Indexer.count]Value, + { + var copy = empty; + var it = copy.iterator(); + var entry = it.next().?; + try testing.expectEqual(entry.key, .red); + try testing.expectEqual(entry.value.*, 0); + entry = it.next().?; + try testing.expectEqual(entry.key, .green); + try testing.expectEqual(entry.value.*, 0); + entry = it.next().?; + try testing.expectEqual(entry.key, .blue); + try testing.expectEqual(entry.value.*, 0); + try testing.expectEqual(it.next(), null); + } - pub fn next(self: *Iterator) ?Entry { - return if (self.inner.next()) |index| - Entry{ - .key = Indexer.keyForIndex(index), - .value = &self.values[index], - } - else - null; - } - }; - }; + { + var copy = r0_g1_b2; + var it = copy.iterator(); + var entry = it.next().?; + try testing.expectEqual(entry.key, .red); + try testing.expectEqual(entry.value.*, 0); + entry = it.next().?; + try testing.expectEqual(entry.key, .green); + try testing.expectEqual(entry.value.*, 1); + entry = it.next().?; + try testing.expectEqual(entry.key, .blue); + try testing.expectEqual(entry.value.*, 2); + try testing.expectEqual(it.next(), null); + } } -/// A dense array of values, using an indexed lookup. 
-/// This type does no allocation and can be copied by value. -pub fn IndexedArray(comptime I: type, comptime V: type, comptime Ext: ?fn (type) type) type { - comptime ensureIndexer(I); +/// An array keyed by an enum, backed by a dense array. +/// If the enum is not dense, a mapping will be constructed from +/// enum values to dense indices. This type does no dynamic +/// allocation and can be copied by value. +pub fn EnumArray(comptime E: type, comptime V: type) type { return struct { const Self = @This(); - pub usingnamespace (Ext orelse NoExtension)(Self); - /// The index mapping for this map - pub const Indexer = I; + pub const Indexer = EnumIndexer(E); /// The key type used to index this map pub const Key = Indexer.Key; /// The value type stored in this map @@ -1201,6 +1038,21 @@ pub fn IndexedArray(comptime I: type, comptime V: type, comptime Ext: ?fn (type) values: [Indexer.count]Value, + pub fn init(init_values: EnumFieldStruct(E, Value, null)) Self { + return initDefault(null, init_values); + } + + /// Initializes values in the enum array, with the specified default. + pub fn initDefault(comptime default: ?Value, init_values: EnumFieldStruct(E, Value, default)) Self { + var result: Self = .{ .values = undefined }; + inline for (0..Self.len) |i| { + const key = comptime Indexer.keyForIndex(i); + const tag = @tagName(key); + result.values[i] = @field(init_values, tag); + } + return result; + } + pub fn initUndefined() Self { return Self{ .values = undefined }; } @@ -1269,46 +1121,96 @@ pub fn IndexedArray(comptime I: type, comptime V: type, comptime Ext: ?fn (type) }; } -/// Verifies that a type is a valid Indexer, providing a helpful -/// compile error if not. An Indexer maps a comptime-known set -/// of keys to a dense set of zero-based indices. 
-/// The indexer interface must look like this: -/// ``` -/// struct { -/// /// The key type which this indexer converts to indices -/// pub const Key: type, -/// /// The number of indexes in the dense mapping -/// pub const count: comptime_int, -/// /// Converts from a key to an index -/// pub fn indexOf(Key) usize; -/// /// Converts from an index to a key -/// pub fn keyForIndex(usize) Key; -/// } -/// ``` -pub fn ensureIndexer(comptime T: type) void { - comptime { - if (!@hasDecl(T, "Key")) @compileError("Indexer must have decl Key: type."); - if (@TypeOf(T.Key) != type) @compileError("Indexer.Key must be a type."); - if (!@hasDecl(T, "count")) @compileError("Indexer must have decl count: comptime_int."); - if (@TypeOf(T.count) != comptime_int) @compileError("Indexer.count must be a comptime_int."); - if (!@hasDecl(T, "indexOf")) @compileError("Indexer.indexOf must be a fn (Key) usize."); - if (@TypeOf(T.indexOf) != fn (T.Key) usize) @compileError("Indexer must have decl indexOf: fn (Key) usize."); - if (!@hasDecl(T, "keyForIndex")) @compileError("Indexer must have decl keyForIndex: fn (usize) Key."); - if (@TypeOf(T.keyForIndex) != fn (usize) T.Key) @compileError("Indexer.keyForIndex must be a fn (usize) Key."); - } +test "pure EnumSet fns" { + const Suit = enum { spades, hearts, clubs, diamonds }; + + const empty = EnumSet(Suit).initEmpty(); + const full = EnumSet(Suit).initFull(); + const black = EnumSet(Suit).initMany(&[_]Suit{ .spades, .clubs }); + const red = EnumSet(Suit).initMany(&[_]Suit{ .hearts, .diamonds }); + + try testing.expect(empty.eql(empty)); + try testing.expect(full.eql(full)); + try testing.expect(!empty.eql(full)); + try testing.expect(!full.eql(empty)); + try testing.expect(!empty.eql(black)); + try testing.expect(!full.eql(red)); + try testing.expect(!red.eql(empty)); + try testing.expect(!black.eql(full)); + + try testing.expect(empty.subsetOf(empty)); + try testing.expect(empty.subsetOf(full)); + try 
testing.expect(full.subsetOf(full)); + try testing.expect(!black.subsetOf(red)); + try testing.expect(!red.subsetOf(black)); + + try testing.expect(full.supersetOf(full)); + try testing.expect(full.supersetOf(empty)); + try testing.expect(empty.supersetOf(empty)); + try testing.expect(!black.supersetOf(red)); + try testing.expect(!red.supersetOf(black)); + + try testing.expect(empty.complement().eql(full)); + try testing.expect(full.complement().eql(empty)); + try testing.expect(black.complement().eql(red)); + try testing.expect(red.complement().eql(black)); + + try testing.expect(empty.unionWith(empty).eql(empty)); + try testing.expect(empty.unionWith(full).eql(full)); + try testing.expect(full.unionWith(full).eql(full)); + try testing.expect(full.unionWith(empty).eql(full)); + try testing.expect(black.unionWith(red).eql(full)); + try testing.expect(red.unionWith(black).eql(full)); + + try testing.expect(empty.intersectWith(empty).eql(empty)); + try testing.expect(empty.intersectWith(full).eql(empty)); + try testing.expect(full.intersectWith(full).eql(full)); + try testing.expect(full.intersectWith(empty).eql(empty)); + try testing.expect(black.intersectWith(red).eql(empty)); + try testing.expect(red.intersectWith(black).eql(empty)); + + try testing.expect(empty.xorWith(empty).eql(empty)); + try testing.expect(empty.xorWith(full).eql(full)); + try testing.expect(full.xorWith(full).eql(empty)); + try testing.expect(full.xorWith(empty).eql(full)); + try testing.expect(black.xorWith(red).eql(full)); + try testing.expect(red.xorWith(black).eql(full)); + + try testing.expect(empty.differenceWith(empty).eql(empty)); + try testing.expect(empty.differenceWith(full).eql(empty)); + try testing.expect(full.differenceWith(full).eql(empty)); + try testing.expect(full.differenceWith(empty).eql(full)); + try testing.expect(full.differenceWith(red).eql(black)); + try testing.expect(full.differenceWith(black).eql(red)); } -test ensureIndexer { - ensureIndexer(struct { - pub const 
Key = u32; - pub const count: comptime_int = 8; - pub fn indexOf(k: Key) usize { - return @as(usize, @intCast(k)); - } - pub fn keyForIndex(index: usize) Key { - return @as(Key, @intCast(index)); - } - }); +test "EnumSet empty" { + const E = enum {}; + const empty = EnumSet(E).initEmpty(); + const full = EnumSet(E).initFull(); + + try std.testing.expect(empty.eql(full)); + try std.testing.expect(empty.complement().eql(full)); + try std.testing.expect(empty.complement().eql(full.complement())); + try std.testing.expect(empty.eql(full.complement())); +} + +test "EnumSet const iterator" { + const Direction = enum { up, down, left, right }; + const diag_move = init: { + var move = EnumSet(Direction).initEmpty(); + move.insert(.right); + move.insert(.up); + break :init move; + }; + + var result = EnumSet(Direction).initEmpty(); + var it = diag_move.iterator(); + while (it.next()) |dir| { + result.insert(dir); + } + + try testing.expect(result.eql(diag_move)); } pub fn EnumIndexer(comptime E: type) type { @@ -1438,7 +1340,6 @@ test "EnumIndexer non-exhaustive" { _, }; const Indexer = EnumIndexer(E); - ensureIndexer(Indexer); const min_tag: E = @enumFromInt(std.math.minInt(BackingInt)); const max_tag: E = @enumFromInt(std.math.maxInt(BackingInt)); @@ -1466,7 +1367,6 @@ test "EnumIndexer non-exhaustive" { test "EnumIndexer dense zeroed" { const E = enum(u2) { b = 1, a = 0, c = 2 }; const Indexer = EnumIndexer(E); - ensureIndexer(Indexer); try testing.expectEqual(E, Indexer.Key); try testing.expectEqual(3, Indexer.count); @@ -1482,7 +1382,6 @@ test "EnumIndexer dense zeroed" { test "EnumIndexer dense positive" { const E = enum(u4) { c = 6, a = 4, b = 5 }; const Indexer = EnumIndexer(E); - ensureIndexer(Indexer); try testing.expectEqual(E, Indexer.Key); try testing.expectEqual(3, Indexer.count); @@ -1498,7 +1397,6 @@ test "EnumIndexer dense positive" { test "EnumIndexer dense negative" { const E = enum(i4) { a = -6, c = -4, b = -5 }; const Indexer = EnumIndexer(E); - 
ensureIndexer(Indexer); try testing.expectEqual(E, Indexer.Key); try testing.expectEqual(3, Indexer.count); @@ -1514,7 +1412,6 @@ test "EnumIndexer dense negative" { test "EnumIndexer sparse" { const E = enum(i4) { a = -2, c = 6, b = 4 }; const Indexer = EnumIndexer(E); - ensureIndexer(Indexer); try testing.expectEqual(E, Indexer.Key); try testing.expectEqual(3, Indexer.count); @@ -1530,7 +1427,6 @@ test "EnumIndexer sparse" { test "EnumIndexer empty" { const E = enum {}; const Indexer = EnumIndexer(E); - ensureIndexer(Indexer); try testing.expectEqual(E, Indexer.Key); try testing.expectEqual(0, Indexer.count); } diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig @@ -383,25 +383,24 @@ pub const O = switch (native_arch) { else => @compileError("missing std.os.linux.O constants for this architecture"), }; -pub usingnamespace @import("linux/io_uring.zig"); - /// Set by startup code, used by `getauxval`. pub var elf_aux_maybe: ?[*]std.elf.Auxv = null; -pub usingnamespace if (switch (builtin.zig_backend) { +const extern_getauxval = switch (builtin.zig_backend) { // Calling extern functions is not yet supported with these backends .stage2_aarch64, .stage2_arm, .stage2_riscv64, .stage2_sparc64 => false, else => !builtin.link_libc, -}) struct { - /// See `std.elf` for the constants. - /// This matches the libc getauxval function. 
- pub extern fn getauxval(index: usize) usize; - comptime { +}; + +comptime { + if (extern_getauxval) { @export(getauxvalImpl, .{ .name = "getauxval", .linkage = .Weak }); } -} else struct { - pub const getauxval = getauxvalImpl; -}; +} + +pub const getauxval = if (extern_getauxval) struct { + extern fn getauxval(index: usize) usize; +}.getauxval else getauxvalImpl; fn getauxvalImpl(index: usize) callconv(.C) usize { const auxv = elf_aux_maybe orelse return 0; @@ -2823,284 +2822,282 @@ pub const AF = struct { pub const MAX = PF.MAX; }; -pub const SO = struct { - pub usingnamespace if (is_mips) struct { - pub const DEBUG = 1; - pub const REUSEADDR = 0x0004; - pub const KEEPALIVE = 0x0008; - pub const DONTROUTE = 0x0010; - pub const BROADCAST = 0x0020; - pub const LINGER = 0x0080; - pub const OOBINLINE = 0x0100; - pub const REUSEPORT = 0x0200; - pub const SNDBUF = 0x1001; - pub const RCVBUF = 0x1002; - pub const SNDLOWAT = 0x1003; - pub const RCVLOWAT = 0x1004; - pub const RCVTIMEO = 0x1006; - pub const SNDTIMEO = 0x1005; - pub const ERROR = 0x1007; - pub const TYPE = 0x1008; - pub const ACCEPTCONN = 0x1009; - pub const PROTOCOL = 0x1028; - pub const DOMAIN = 0x1029; - pub const NO_CHECK = 11; - pub const PRIORITY = 12; - pub const BSDCOMPAT = 14; - pub const PASSCRED = 17; - pub const PEERCRED = 18; - pub const PEERSEC = 30; - pub const SNDBUFFORCE = 31; - pub const RCVBUFFORCE = 33; - pub const SECURITY_AUTHENTICATION = 22; - pub const SECURITY_ENCRYPTION_TRANSPORT = 23; - pub const SECURITY_ENCRYPTION_NETWORK = 24; - pub const BINDTODEVICE = 25; - pub const ATTACH_FILTER = 26; - pub const DETACH_FILTER = 27; - pub const GET_FILTER = ATTACH_FILTER; - pub const PEERNAME = 28; - pub const TIMESTAMP_OLD = 29; - pub const PASSSEC = 34; - pub const TIMESTAMPNS_OLD = 35; - pub const MARK = 36; - pub const TIMESTAMPING_OLD = 37; - pub const RXQ_OVFL = 40; - pub const WIFI_STATUS = 41; - pub const PEEK_OFF = 42; - pub const NOFCS = 43; - pub const LOCK_FILTER = 44; - pub 
const SELECT_ERR_QUEUE = 45; - pub const BUSY_POLL = 46; - pub const MAX_PACING_RATE = 47; - pub const BPF_EXTENSIONS = 48; - pub const INCOMING_CPU = 49; - pub const ATTACH_BPF = 50; - pub const DETACH_BPF = DETACH_FILTER; - pub const ATTACH_REUSEPORT_CBPF = 51; - pub const ATTACH_REUSEPORT_EBPF = 52; - pub const CNX_ADVICE = 53; - pub const MEMINFO = 55; - pub const INCOMING_NAPI_ID = 56; - pub const COOKIE = 57; - pub const PEERGROUPS = 59; - pub const ZEROCOPY = 60; - pub const TXTIME = 61; - pub const BINDTOIFINDEX = 62; - pub const TIMESTAMP_NEW = 63; - pub const TIMESTAMPNS_NEW = 64; - pub const TIMESTAMPING_NEW = 65; - pub const RCVTIMEO_NEW = 66; - pub const SNDTIMEO_NEW = 67; - pub const DETACH_REUSEPORT_BPF = 68; - } else if (is_ppc or is_ppc64) struct { - pub const DEBUG = 1; - pub const REUSEADDR = 2; - pub const TYPE = 3; - pub const ERROR = 4; - pub const DONTROUTE = 5; - pub const BROADCAST = 6; - pub const SNDBUF = 7; - pub const RCVBUF = 8; - pub const KEEPALIVE = 9; - pub const OOBINLINE = 10; - pub const NO_CHECK = 11; - pub const PRIORITY = 12; - pub const LINGER = 13; - pub const BSDCOMPAT = 14; - pub const REUSEPORT = 15; - pub const RCVLOWAT = 16; - pub const SNDLOWAT = 17; - pub const RCVTIMEO = 18; - pub const SNDTIMEO = 19; - pub const PASSCRED = 20; - pub const PEERCRED = 21; - pub const ACCEPTCONN = 30; - pub const PEERSEC = 31; - pub const SNDBUFFORCE = 32; - pub const RCVBUFFORCE = 33; - pub const PROTOCOL = 38; - pub const DOMAIN = 39; - pub const SECURITY_AUTHENTICATION = 22; - pub const SECURITY_ENCRYPTION_TRANSPORT = 23; - pub const SECURITY_ENCRYPTION_NETWORK = 24; - pub const BINDTODEVICE = 25; - pub const ATTACH_FILTER = 26; - pub const DETACH_FILTER = 27; - pub const GET_FILTER = ATTACH_FILTER; - pub const PEERNAME = 28; - pub const TIMESTAMP_OLD = 29; - pub const PASSSEC = 34; - pub const TIMESTAMPNS_OLD = 35; - pub const MARK = 36; - pub const TIMESTAMPING_OLD = 37; - pub const RXQ_OVFL = 40; - pub const WIFI_STATUS = 41; - 
pub const PEEK_OFF = 42; - pub const NOFCS = 43; - pub const LOCK_FILTER = 44; - pub const SELECT_ERR_QUEUE = 45; - pub const BUSY_POLL = 46; - pub const MAX_PACING_RATE = 47; - pub const BPF_EXTENSIONS = 48; - pub const INCOMING_CPU = 49; - pub const ATTACH_BPF = 50; - pub const DETACH_BPF = DETACH_FILTER; - pub const ATTACH_REUSEPORT_CBPF = 51; - pub const ATTACH_REUSEPORT_EBPF = 52; - pub const CNX_ADVICE = 53; - pub const MEMINFO = 55; - pub const INCOMING_NAPI_ID = 56; - pub const COOKIE = 57; - pub const PEERGROUPS = 59; - pub const ZEROCOPY = 60; - pub const TXTIME = 61; - pub const BINDTOIFINDEX = 62; - pub const TIMESTAMP_NEW = 63; - pub const TIMESTAMPNS_NEW = 64; - pub const TIMESTAMPING_NEW = 65; - pub const RCVTIMEO_NEW = 66; - pub const SNDTIMEO_NEW = 67; - pub const DETACH_REUSEPORT_BPF = 68; - } else if (is_sparc) struct { - pub const DEBUG = 1; - pub const REUSEADDR = 4; - pub const TYPE = 4104; - pub const ERROR = 4103; - pub const DONTROUTE = 16; - pub const BROADCAST = 32; - pub const SNDBUF = 4097; - pub const RCVBUF = 4098; - pub const KEEPALIVE = 8; - pub const OOBINLINE = 256; - pub const NO_CHECK = 11; - pub const PRIORITY = 12; - pub const LINGER = 128; - pub const BSDCOMPAT = 1024; - pub const REUSEPORT = 512; - pub const PASSCRED = 2; - pub const PEERCRED = 64; - pub const RCVLOWAT = 2048; - pub const SNDLOWAT = 4096; - pub const RCVTIMEO = 8192; - pub const SNDTIMEO = 16384; - pub const ACCEPTCONN = 32768; - pub const PEERSEC = 30; - pub const SNDBUFFORCE = 4106; - pub const RCVBUFFORCE = 4107; - pub const PROTOCOL = 4136; - pub const DOMAIN = 4137; - pub const SECURITY_AUTHENTICATION = 20481; - pub const SECURITY_ENCRYPTION_TRANSPORT = 20482; - pub const SECURITY_ENCRYPTION_NETWORK = 20484; - pub const BINDTODEVICE = 13; - pub const ATTACH_FILTER = 26; - pub const DETACH_FILTER = 27; - pub const GET_FILTER = 26; - pub const PEERNAME = 28; - pub const TIMESTAMP_OLD = 29; - pub const PASSSEC = 31; - pub const TIMESTAMPNS_OLD = 33; - pub 
const MARK = 34; - pub const TIMESTAMPING_OLD = 35; - pub const RXQ_OVFL = 36; - pub const WIFI_STATUS = 37; - pub const PEEK_OFF = 38; - pub const NOFCS = 39; - pub const LOCK_FILTER = 40; - pub const SELECT_ERR_QUEUE = 41; - pub const BUSY_POLL = 48; - pub const MAX_PACING_RATE = 49; - pub const BPF_EXTENSIONS = 50; - pub const INCOMING_CPU = 51; - pub const ATTACH_BPF = 52; - pub const DETACH_BPF = 27; - pub const ATTACH_REUSEPORT_CBPF = 53; - pub const ATTACH_REUSEPORT_EBPF = 54; - pub const CNX_ADVICE = 55; - pub const MEMINFO = 57; - pub const INCOMING_NAPI_ID = 58; - pub const COOKIE = 59; - pub const PEERGROUPS = 61; - pub const ZEROCOPY = 62; - pub const TXTIME = 63; - pub const BINDTOIFINDEX = 65; - pub const TIMESTAMP_NEW = 70; - pub const TIMESTAMPNS_NEW = 66; - pub const TIMESTAMPING_NEW = 67; - pub const RCVTIMEO_NEW = 68; - pub const SNDTIMEO_NEW = 69; - pub const DETACH_REUSEPORT_BPF = 71; - } else struct { - pub const DEBUG = 1; - pub const REUSEADDR = 2; - pub const TYPE = 3; - pub const ERROR = 4; - pub const DONTROUTE = 5; - pub const BROADCAST = 6; - pub const SNDBUF = 7; - pub const RCVBUF = 8; - pub const KEEPALIVE = 9; - pub const OOBINLINE = 10; - pub const NO_CHECK = 11; - pub const PRIORITY = 12; - pub const LINGER = 13; - pub const BSDCOMPAT = 14; - pub const REUSEPORT = 15; - pub const PASSCRED = 16; - pub const PEERCRED = 17; - pub const RCVLOWAT = 18; - pub const SNDLOWAT = 19; - pub const RCVTIMEO = 20; - pub const SNDTIMEO = 21; - pub const ACCEPTCONN = 30; - pub const PEERSEC = 31; - pub const SNDBUFFORCE = 32; - pub const RCVBUFFORCE = 33; - pub const PROTOCOL = 38; - pub const DOMAIN = 39; - pub const SECURITY_AUTHENTICATION = 22; - pub const SECURITY_ENCRYPTION_TRANSPORT = 23; - pub const SECURITY_ENCRYPTION_NETWORK = 24; - pub const BINDTODEVICE = 25; - pub const ATTACH_FILTER = 26; - pub const DETACH_FILTER = 27; - pub const GET_FILTER = ATTACH_FILTER; - pub const PEERNAME = 28; - pub const TIMESTAMP_OLD = 29; - pub const 
PASSSEC = 34; - pub const TIMESTAMPNS_OLD = 35; - pub const MARK = 36; - pub const TIMESTAMPING_OLD = 37; - pub const RXQ_OVFL = 40; - pub const WIFI_STATUS = 41; - pub const PEEK_OFF = 42; - pub const NOFCS = 43; - pub const LOCK_FILTER = 44; - pub const SELECT_ERR_QUEUE = 45; - pub const BUSY_POLL = 46; - pub const MAX_PACING_RATE = 47; - pub const BPF_EXTENSIONS = 48; - pub const INCOMING_CPU = 49; - pub const ATTACH_BPF = 50; - pub const DETACH_BPF = DETACH_FILTER; - pub const ATTACH_REUSEPORT_CBPF = 51; - pub const ATTACH_REUSEPORT_EBPF = 52; - pub const CNX_ADVICE = 53; - pub const MEMINFO = 55; - pub const INCOMING_NAPI_ID = 56; - pub const COOKIE = 57; - pub const PEERGROUPS = 59; - pub const ZEROCOPY = 60; - pub const TXTIME = 61; - pub const BINDTOIFINDEX = 62; - pub const TIMESTAMP_NEW = 63; - pub const TIMESTAMPNS_NEW = 64; - pub const TIMESTAMPING_NEW = 65; - pub const RCVTIMEO_NEW = 66; - pub const SNDTIMEO_NEW = 67; - pub const DETACH_REUSEPORT_BPF = 68; - }; +pub const SO = if (is_mips) struct { + pub const DEBUG = 1; + pub const REUSEADDR = 0x0004; + pub const KEEPALIVE = 0x0008; + pub const DONTROUTE = 0x0010; + pub const BROADCAST = 0x0020; + pub const LINGER = 0x0080; + pub const OOBINLINE = 0x0100; + pub const REUSEPORT = 0x0200; + pub const SNDBUF = 0x1001; + pub const RCVBUF = 0x1002; + pub const SNDLOWAT = 0x1003; + pub const RCVLOWAT = 0x1004; + pub const RCVTIMEO = 0x1006; + pub const SNDTIMEO = 0x1005; + pub const ERROR = 0x1007; + pub const TYPE = 0x1008; + pub const ACCEPTCONN = 0x1009; + pub const PROTOCOL = 0x1028; + pub const DOMAIN = 0x1029; + pub const NO_CHECK = 11; + pub const PRIORITY = 12; + pub const BSDCOMPAT = 14; + pub const PASSCRED = 17; + pub const PEERCRED = 18; + pub const PEERSEC = 30; + pub const SNDBUFFORCE = 31; + pub const RCVBUFFORCE = 33; + pub const SECURITY_AUTHENTICATION = 22; + pub const SECURITY_ENCRYPTION_TRANSPORT = 23; + pub const SECURITY_ENCRYPTION_NETWORK = 24; + pub const BINDTODEVICE = 25; + pub 
const ATTACH_FILTER = 26; + pub const DETACH_FILTER = 27; + pub const GET_FILTER = ATTACH_FILTER; + pub const PEERNAME = 28; + pub const TIMESTAMP_OLD = 29; + pub const PASSSEC = 34; + pub const TIMESTAMPNS_OLD = 35; + pub const MARK = 36; + pub const TIMESTAMPING_OLD = 37; + pub const RXQ_OVFL = 40; + pub const WIFI_STATUS = 41; + pub const PEEK_OFF = 42; + pub const NOFCS = 43; + pub const LOCK_FILTER = 44; + pub const SELECT_ERR_QUEUE = 45; + pub const BUSY_POLL = 46; + pub const MAX_PACING_RATE = 47; + pub const BPF_EXTENSIONS = 48; + pub const INCOMING_CPU = 49; + pub const ATTACH_BPF = 50; + pub const DETACH_BPF = DETACH_FILTER; + pub const ATTACH_REUSEPORT_CBPF = 51; + pub const ATTACH_REUSEPORT_EBPF = 52; + pub const CNX_ADVICE = 53; + pub const MEMINFO = 55; + pub const INCOMING_NAPI_ID = 56; + pub const COOKIE = 57; + pub const PEERGROUPS = 59; + pub const ZEROCOPY = 60; + pub const TXTIME = 61; + pub const BINDTOIFINDEX = 62; + pub const TIMESTAMP_NEW = 63; + pub const TIMESTAMPNS_NEW = 64; + pub const TIMESTAMPING_NEW = 65; + pub const RCVTIMEO_NEW = 66; + pub const SNDTIMEO_NEW = 67; + pub const DETACH_REUSEPORT_BPF = 68; +} else if (is_ppc or is_ppc64) struct { + pub const DEBUG = 1; + pub const REUSEADDR = 2; + pub const TYPE = 3; + pub const ERROR = 4; + pub const DONTROUTE = 5; + pub const BROADCAST = 6; + pub const SNDBUF = 7; + pub const RCVBUF = 8; + pub const KEEPALIVE = 9; + pub const OOBINLINE = 10; + pub const NO_CHECK = 11; + pub const PRIORITY = 12; + pub const LINGER = 13; + pub const BSDCOMPAT = 14; + pub const REUSEPORT = 15; + pub const RCVLOWAT = 16; + pub const SNDLOWAT = 17; + pub const RCVTIMEO = 18; + pub const SNDTIMEO = 19; + pub const PASSCRED = 20; + pub const PEERCRED = 21; + pub const ACCEPTCONN = 30; + pub const PEERSEC = 31; + pub const SNDBUFFORCE = 32; + pub const RCVBUFFORCE = 33; + pub const PROTOCOL = 38; + pub const DOMAIN = 39; + pub const SECURITY_AUTHENTICATION = 22; + pub const SECURITY_ENCRYPTION_TRANSPORT = 23; 
+ pub const SECURITY_ENCRYPTION_NETWORK = 24; + pub const BINDTODEVICE = 25; + pub const ATTACH_FILTER = 26; + pub const DETACH_FILTER = 27; + pub const GET_FILTER = ATTACH_FILTER; + pub const PEERNAME = 28; + pub const TIMESTAMP_OLD = 29; + pub const PASSSEC = 34; + pub const TIMESTAMPNS_OLD = 35; + pub const MARK = 36; + pub const TIMESTAMPING_OLD = 37; + pub const RXQ_OVFL = 40; + pub const WIFI_STATUS = 41; + pub const PEEK_OFF = 42; + pub const NOFCS = 43; + pub const LOCK_FILTER = 44; + pub const SELECT_ERR_QUEUE = 45; + pub const BUSY_POLL = 46; + pub const MAX_PACING_RATE = 47; + pub const BPF_EXTENSIONS = 48; + pub const INCOMING_CPU = 49; + pub const ATTACH_BPF = 50; + pub const DETACH_BPF = DETACH_FILTER; + pub const ATTACH_REUSEPORT_CBPF = 51; + pub const ATTACH_REUSEPORT_EBPF = 52; + pub const CNX_ADVICE = 53; + pub const MEMINFO = 55; + pub const INCOMING_NAPI_ID = 56; + pub const COOKIE = 57; + pub const PEERGROUPS = 59; + pub const ZEROCOPY = 60; + pub const TXTIME = 61; + pub const BINDTOIFINDEX = 62; + pub const TIMESTAMP_NEW = 63; + pub const TIMESTAMPNS_NEW = 64; + pub const TIMESTAMPING_NEW = 65; + pub const RCVTIMEO_NEW = 66; + pub const SNDTIMEO_NEW = 67; + pub const DETACH_REUSEPORT_BPF = 68; +} else if (is_sparc) struct { + pub const DEBUG = 1; + pub const REUSEADDR = 4; + pub const TYPE = 4104; + pub const ERROR = 4103; + pub const DONTROUTE = 16; + pub const BROADCAST = 32; + pub const SNDBUF = 4097; + pub const RCVBUF = 4098; + pub const KEEPALIVE = 8; + pub const OOBINLINE = 256; + pub const NO_CHECK = 11; + pub const PRIORITY = 12; + pub const LINGER = 128; + pub const BSDCOMPAT = 1024; + pub const REUSEPORT = 512; + pub const PASSCRED = 2; + pub const PEERCRED = 64; + pub const RCVLOWAT = 2048; + pub const SNDLOWAT = 4096; + pub const RCVTIMEO = 8192; + pub const SNDTIMEO = 16384; + pub const ACCEPTCONN = 32768; + pub const PEERSEC = 30; + pub const SNDBUFFORCE = 4106; + pub const RCVBUFFORCE = 4107; + pub const PROTOCOL = 4136; + pub 
const DOMAIN = 4137; + pub const SECURITY_AUTHENTICATION = 20481; + pub const SECURITY_ENCRYPTION_TRANSPORT = 20482; + pub const SECURITY_ENCRYPTION_NETWORK = 20484; + pub const BINDTODEVICE = 13; + pub const ATTACH_FILTER = 26; + pub const DETACH_FILTER = 27; + pub const GET_FILTER = 26; + pub const PEERNAME = 28; + pub const TIMESTAMP_OLD = 29; + pub const PASSSEC = 31; + pub const TIMESTAMPNS_OLD = 33; + pub const MARK = 34; + pub const TIMESTAMPING_OLD = 35; + pub const RXQ_OVFL = 36; + pub const WIFI_STATUS = 37; + pub const PEEK_OFF = 38; + pub const NOFCS = 39; + pub const LOCK_FILTER = 40; + pub const SELECT_ERR_QUEUE = 41; + pub const BUSY_POLL = 48; + pub const MAX_PACING_RATE = 49; + pub const BPF_EXTENSIONS = 50; + pub const INCOMING_CPU = 51; + pub const ATTACH_BPF = 52; + pub const DETACH_BPF = 27; + pub const ATTACH_REUSEPORT_CBPF = 53; + pub const ATTACH_REUSEPORT_EBPF = 54; + pub const CNX_ADVICE = 55; + pub const MEMINFO = 57; + pub const INCOMING_NAPI_ID = 58; + pub const COOKIE = 59; + pub const PEERGROUPS = 61; + pub const ZEROCOPY = 62; + pub const TXTIME = 63; + pub const BINDTOIFINDEX = 65; + pub const TIMESTAMP_NEW = 70; + pub const TIMESTAMPNS_NEW = 66; + pub const TIMESTAMPING_NEW = 67; + pub const RCVTIMEO_NEW = 68; + pub const SNDTIMEO_NEW = 69; + pub const DETACH_REUSEPORT_BPF = 71; +} else struct { + pub const DEBUG = 1; + pub const REUSEADDR = 2; + pub const TYPE = 3; + pub const ERROR = 4; + pub const DONTROUTE = 5; + pub const BROADCAST = 6; + pub const SNDBUF = 7; + pub const RCVBUF = 8; + pub const KEEPALIVE = 9; + pub const OOBINLINE = 10; + pub const NO_CHECK = 11; + pub const PRIORITY = 12; + pub const LINGER = 13; + pub const BSDCOMPAT = 14; + pub const REUSEPORT = 15; + pub const PASSCRED = 16; + pub const PEERCRED = 17; + pub const RCVLOWAT = 18; + pub const SNDLOWAT = 19; + pub const RCVTIMEO = 20; + pub const SNDTIMEO = 21; + pub const ACCEPTCONN = 30; + pub const PEERSEC = 31; + pub const SNDBUFFORCE = 32; + pub const 
RCVBUFFORCE = 33; + pub const PROTOCOL = 38; + pub const DOMAIN = 39; + pub const SECURITY_AUTHENTICATION = 22; + pub const SECURITY_ENCRYPTION_TRANSPORT = 23; + pub const SECURITY_ENCRYPTION_NETWORK = 24; + pub const BINDTODEVICE = 25; + pub const ATTACH_FILTER = 26; + pub const DETACH_FILTER = 27; + pub const GET_FILTER = ATTACH_FILTER; + pub const PEERNAME = 28; + pub const TIMESTAMP_OLD = 29; + pub const PASSSEC = 34; + pub const TIMESTAMPNS_OLD = 35; + pub const MARK = 36; + pub const TIMESTAMPING_OLD = 37; + pub const RXQ_OVFL = 40; + pub const WIFI_STATUS = 41; + pub const PEEK_OFF = 42; + pub const NOFCS = 43; + pub const LOCK_FILTER = 44; + pub const SELECT_ERR_QUEUE = 45; + pub const BUSY_POLL = 46; + pub const MAX_PACING_RATE = 47; + pub const BPF_EXTENSIONS = 48; + pub const INCOMING_CPU = 49; + pub const ATTACH_BPF = 50; + pub const DETACH_BPF = DETACH_FILTER; + pub const ATTACH_REUSEPORT_CBPF = 51; + pub const ATTACH_REUSEPORT_EBPF = 52; + pub const CNX_ADVICE = 53; + pub const MEMINFO = 55; + pub const INCOMING_NAPI_ID = 56; + pub const COOKIE = 57; + pub const PEERGROUPS = 59; + pub const ZEROCOPY = 60; + pub const TXTIME = 61; + pub const BINDTOIFINDEX = 62; + pub const TIMESTAMP_NEW = 63; + pub const TIMESTAMPNS_NEW = 64; + pub const TIMESTAMPING_NEW = 65; + pub const RCVTIMEO_NEW = 66; + pub const SNDTIMEO_NEW = 67; + pub const DETACH_REUSEPORT_BPF = 68; }; pub const SCM = struct { @@ -4189,22 +4186,9 @@ pub const IORING_SETUP_SINGLE_ISSUER = 1 << 12; pub const IORING_SETUP_DEFER_TASKRUN = 1 << 13; /// IO submission data structure (Submission Queue Entry) -pub const io_uring_sqe = extern struct { - opcode: IORING_OP, - flags: u8, - ioprio: u16, - fd: i32, - off: u64, - addr: u64, - len: u32, - rw_flags: u32, - user_data: u64, - buf_index: u16, - personality: u16, - splice_fd_in: i32, - addr3: u64, - resv: u64, -}; +pub const io_uring_sqe = @import("linux/io_uring_sqe.zig").io_uring_sqe; + +pub const IoUring = @import("linux/IoUring.zig"); /// If 
sqe->file_index is set to this for opcodes that instantiate a new /// direct descriptor (like openat/openat2/accept), then io_uring will allocate diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig @@ -0,0 +1,3670 @@ +const IoUring = @This(); +const std = @import("../../std.zig"); +const builtin = @import("builtin"); +const assert = std.debug.assert; +const mem = std.mem; +const net = std.net; +const os = std.os; +const posix = std.posix; +const linux = os.linux; +const testing = std.testing; + +fd: os.fd_t = -1, +sq: SubmissionQueue, +cq: CompletionQueue, +flags: u32, +features: u32, + +/// A friendly way to setup an io_uring, with default linux.io_uring_params. +/// `entries` must be a power of two between 1 and 32768, although the kernel will make the final +/// call on how many entries the submission and completion queues will ultimately have, +/// see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L8027-L8050. +/// Matches the interface of io_uring_queue_init() in liburing. +pub fn init(entries: u16, flags: u32) !IoUring { + var params = mem.zeroInit(linux.io_uring_params, .{ + .flags = flags, + .sq_thread_idle = 1000, + }); + return try IoUring.init_params(entries, &params); +} + +/// A powerful way to setup an io_uring, if you want to tweak linux.io_uring_params such as submission +/// queue thread cpu affinity or thread idle timeout (the kernel and our default is 1 second). +/// `params` is passed by reference because the kernel needs to modify the parameters. +/// Matches the interface of io_uring_queue_init_params() in liburing. 
+pub fn init_params(entries: u16, p: *linux.io_uring_params) !IoUring { + if (entries == 0) return error.EntriesZero; + if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; + + assert(p.sq_entries == 0); + assert(p.cq_entries == 0 or p.flags & linux.IORING_SETUP_CQSIZE != 0); + assert(p.features == 0); + assert(p.wq_fd == 0 or p.flags & linux.IORING_SETUP_ATTACH_WQ != 0); + assert(p.resv[0] == 0); + assert(p.resv[1] == 0); + assert(p.resv[2] == 0); + + const res = linux.io_uring_setup(entries, p); + switch (linux.getErrno(res)) { + .SUCCESS => {}, + .FAULT => return error.ParamsOutsideAccessibleAddressSpace, + // The resv array contains non-zero data, p.flags contains an unsupported flag, + // entries out of bounds, IORING_SETUP_SQ_AFF was specified without IORING_SETUP_SQPOLL, + // or IORING_SETUP_CQSIZE was specified but linux.io_uring_params.cq_entries was invalid: + .INVAL => return error.ArgumentsInvalid, + .MFILE => return error.ProcessFdQuotaExceeded, + .NFILE => return error.SystemFdQuotaExceeded, + .NOMEM => return error.SystemResources, + // IORING_SETUP_SQPOLL was specified but effective user ID lacks sufficient privileges, + // or a container seccomp policy prohibits io_uring syscalls: + .PERM => return error.PermissionDenied, + .NOSYS => return error.SystemOutdated, + else => |errno| return os.unexpectedErrno(errno), + } + const fd = @as(os.fd_t, @intCast(res)); + assert(fd >= 0); + errdefer os.close(fd); + + // Kernel versions 5.4 and up use only one mmap() for the submission and completion queues. + // This is not an optional feature for us... if the kernel does it, we have to do it. + // The thinking on this by the kernel developers was that both the submission and the + // completion queue rings have sizes just over a power of two, but the submission queue ring + // is significantly smaller with u32 slots. By bundling both in a single mmap, the kernel + // gets the submission queue ring for free. 
+ // See https://patchwork.kernel.org/patch/11115257 for the kernel patch. + // We do not support the double mmap() done before 5.4, because we want to keep the + // init/deinit mmap paths simple and because io_uring has had many bug fixes even since 5.4. + if ((p.features & linux.IORING_FEAT_SINGLE_MMAP) == 0) { + return error.SystemOutdated; + } + + // Check that the kernel has actually set params and that "impossible is nothing". + assert(p.sq_entries != 0); + assert(p.cq_entries != 0); + assert(p.cq_entries >= p.sq_entries); + + // From here on, we only need to read from params, so pass `p` by value as immutable. + // The completion queue shares the mmap with the submission queue, so pass `sq` there too. + var sq = try SubmissionQueue.init(fd, p.*); + errdefer sq.deinit(); + var cq = try CompletionQueue.init(fd, p.*, sq); + errdefer cq.deinit(); + + // Check that our starting state is as we expect. + assert(sq.head.* == 0); + assert(sq.tail.* == 0); + assert(sq.mask == p.sq_entries - 1); + // Allow flags.* to be non-zero, since the kernel may set IORING_SQ_NEED_WAKEUP at any time. + assert(sq.dropped.* == 0); + assert(sq.array.len == p.sq_entries); + assert(sq.sqes.len == p.sq_entries); + assert(sq.sqe_head == 0); + assert(sq.sqe_tail == 0); + + assert(cq.head.* == 0); + assert(cq.tail.* == 0); + assert(cq.mask == p.cq_entries - 1); + assert(cq.overflow.* == 0); + assert(cq.cqes.len == p.cq_entries); + + return IoUring{ + .fd = fd, + .sq = sq, + .cq = cq, + .flags = p.flags, + .features = p.features, + }; +} + +pub fn deinit(self: *IoUring) void { + assert(self.fd >= 0); + // The mmaps depend on the fd, so the order of these calls is important: + self.cq.deinit(); + self.sq.deinit(); + os.close(self.fd); + self.fd = -1; +} + +/// Returns a pointer to a vacant SQE, or an error if the submission queue is full. +/// We follow the implementation (and atomics) of liburing's `io_uring_get_sqe()` exactly. 
+/// However, instead of a null we return an error to force safe handling. +/// Any situation where the submission queue is full tends more towards a control flow error, +/// and the null return in liburing is more a C idiom than anything else, for lack of a better +/// alternative. In Zig, we have first-class error handling... so let's use it. +/// Matches the implementation of io_uring_get_sqe() in liburing. +pub fn get_sqe(self: *IoUring) !*linux.io_uring_sqe { + const head = @atomicLoad(u32, self.sq.head, .Acquire); + // Remember that these head and tail offsets wrap around every four billion operations. + // We must therefore use wrapping addition and subtraction to avoid a runtime crash. + const next = self.sq.sqe_tail +% 1; + if (next -% head > self.sq.sqes.len) return error.SubmissionQueueFull; + const sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask]; + self.sq.sqe_tail = next; + return sqe; +} + +/// Submits the SQEs acquired via get_sqe() to the kernel. You can call this once after you have +/// called get_sqe() multiple times to set up multiple I/O requests. +/// Returns the number of SQEs submitted, if not used alongside IORING_SETUP_SQPOLL. +/// If the io_uring instance uses IORING_SETUP_SQPOLL, the value returned on success is not +/// guaranteed to match the amount of actually submitted sqes during this call. A value higher +/// or lower, including 0, may be returned. +/// Matches the implementation of io_uring_submit() in liburing. +pub fn submit(self: *IoUring) !u32 { + return self.submit_and_wait(0); +} + +/// Like submit(), but allows waiting for events as well. +/// Returns the number of SQEs submitted. +/// Matches the implementation of io_uring_submit_and_wait() in liburing. 
+pub fn submit_and_wait(self: *IoUring, wait_nr: u32) !u32 { + const submitted = self.flush_sq(); + var flags: u32 = 0; + if (self.sq_ring_needs_enter(&flags) or wait_nr > 0) { + if (wait_nr > 0 or (self.flags & linux.IORING_SETUP_IOPOLL) != 0) { + flags |= linux.IORING_ENTER_GETEVENTS; + } + return try self.enter(submitted, wait_nr, flags); + } + return submitted; +} + +/// Tell the kernel we have submitted SQEs and/or want to wait for CQEs. +/// Returns the number of SQEs submitted. +pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: u32) !u32 { + assert(self.fd >= 0); + const res = linux.io_uring_enter(self.fd, to_submit, min_complete, flags, null); + switch (linux.getErrno(res)) { + .SUCCESS => {}, + // The kernel was unable to allocate memory or ran out of resources for the request. + // The application should wait for some completions and try again: + .AGAIN => return error.SystemResources, + // The SQE `fd` is invalid, or IOSQE_FIXED_FILE was set but no files were registered: + .BADF => return error.FileDescriptorInvalid, + // The file descriptor is valid, but the ring is not in the right state. + // See io_uring_register(2) for how to enable the ring. + .BADFD => return error.FileDescriptorInBadState, + // The application attempted to overcommit the number of requests it can have pending. 
+ // The application should wait for some completions and try again: + .BUSY => return error.CompletionQueueOvercommitted, + // The SQE is invalid, or valid but the ring was setup with IORING_SETUP_IOPOLL: + .INVAL => return error.SubmissionQueueEntryInvalid, + // The buffer is outside the process' accessible address space, or IORING_OP_READ_FIXED + // or IORING_OP_WRITE_FIXED was specified but no buffers were registered, or the range + // described by `addr` and `len` is not within the buffer registered at `buf_index`: + .FAULT => return error.BufferInvalid, + .NXIO => return error.RingShuttingDown, + // The kernel believes our `self.fd` does not refer to an io_uring instance, + // or the opcode is valid but not supported by this kernel (more likely): + .OPNOTSUPP => return error.OpcodeNotSupported, + // The operation was interrupted by a delivery of a signal before it could complete. + // This can happen while waiting for events with IORING_ENTER_GETEVENTS: + .INTR => return error.SignalInterrupt, + else => |errno| return os.unexpectedErrno(errno), + } + return @as(u32, @intCast(res)); +} + +/// Sync internal state with kernel ring state on the SQ side. +/// Returns the number of all pending events in the SQ ring, for the shared ring. +/// This return value includes previously flushed SQEs, as per liburing. +/// The rationale is to suggest that an io_uring_enter() call is needed rather than not. +/// Matches the implementation of __io_uring_flush_sq() in liburing. +pub fn flush_sq(self: *IoUring) u32 { + if (self.sq.sqe_head != self.sq.sqe_tail) { + // Fill in SQEs that we have queued up, adding them to the kernel ring. 
+ const to_submit = self.sq.sqe_tail -% self.sq.sqe_head; + var tail = self.sq.tail.*; + var i: usize = 0; + while (i < to_submit) : (i += 1) { + self.sq.array[tail & self.sq.mask] = self.sq.sqe_head & self.sq.mask; + tail +%= 1; + self.sq.sqe_head +%= 1; + } + // Ensure that the kernel can actually see the SQE updates when it sees the tail update. + @atomicStore(u32, self.sq.tail, tail, .Release); + } + return self.sq_ready(); +} + +/// Returns true if we are not using an SQ thread (thus nobody submits but us), +/// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly awakened. +/// For the latter case, we set the SQ thread wakeup flag. +/// Matches the implementation of sq_ring_needs_enter() in liburing. +pub fn sq_ring_needs_enter(self: *IoUring, flags: *u32) bool { + assert(flags.* == 0); + if ((self.flags & linux.IORING_SETUP_SQPOLL) == 0) return true; + if ((@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_NEED_WAKEUP) != 0) { + flags.* |= linux.IORING_ENTER_SQ_WAKEUP; + return true; + } + return false; +} + +/// Returns the number of flushed and unflushed SQEs pending in the submission queue. +/// In other words, this is the number of SQEs in the submission queue, i.e. its length. +/// These are SQEs that the kernel is yet to consume. +/// Matches the implementation of io_uring_sq_ready in liburing. +pub fn sq_ready(self: *IoUring) u32 { + // Always use the shared ring state (i.e. head and not sqe_head) to avoid going out of sync, + // see https://github.com/axboe/liburing/issues/92. + return self.sq.sqe_tail -% @atomicLoad(u32, self.sq.head, .Acquire); +} + +/// Returns the number of CQEs in the completion queue, i.e. its length. +/// These are CQEs that the application is yet to consume. +/// Matches the implementation of io_uring_cq_ready in liburing. 
+pub fn cq_ready(self: *IoUring) u32 {
+    return @atomicLoad(u32, self.cq.tail, .Acquire) -% self.cq.head.*;
+}
+
+/// Copies as many CQEs as are ready, and that can fit into the destination `cqes` slice.
+/// If none are available, enters into the kernel to wait for at most `wait_nr` CQEs.
+/// Returns the number of CQEs copied, advancing the CQ ring.
+/// Provides all the wait/peek methods found in liburing, but with batching and a single method.
+/// The rationale for copying CQEs rather than copying pointers is that pointers are 8 bytes
+/// whereas CQEs are not much more at only 16 bytes, and this provides a safer faster interface.
+/// Safer, because you no longer need to call cqe_seen(), avoiding idempotency bugs.
+/// Faster, because we can now amortize the atomic store release to `cq.head` across the batch.
+/// See https://github.com/axboe/liburing/issues/103#issuecomment-686665007.
+/// Matches the implementation of io_uring_peek_batch_cqe() in liburing, but supports waiting.
+pub fn copy_cqes(self: *IoUring, cqes: []linux.io_uring_cqe, wait_nr: u32) !u32 {
+    const count = self.copy_cqes_ready(cqes);
+    if (count > 0) return count;
+    if (self.cq_ring_needs_flush() or wait_nr > 0) {
+        _ = try self.enter(0, wait_nr, linux.IORING_ENTER_GETEVENTS);
+        return self.copy_cqes_ready(cqes);
+    }
+    return 0;
+}
+
+/// Copies up to `cqes.len` ready CQEs out of the completion ring into `cqes`, then advances
+/// `cq.head` past them via `cq_advance()`. Does not call `enter()`.
+/// Returns the number of CQEs copied, which is 0 when nothing is ready or `cqes` is empty.
+fn copy_cqes_ready(self: *IoUring, cqes: []linux.io_uring_cqe) u32 {
+    const ready = self.cq_ready();
+    const count = @min(cqes.len, ready);
+    const head = self.cq.head.* & self.cq.mask;
+
+    // First run: from `head` to the end of the ring storage, or `count` entries if fewer.
+    // The runs are sized by length instead of by comparing the masked head and tail, because
+    // those two are equal both for an empty batch and for a batch that fills the whole ring,
+    // and @memcpy requires source and destination to have exactly the same length.
+    const n = @min(self.cq.cqes.len - head, count);
+    @memcpy(cqes[0..n], self.cq.cqes[head..][0..n]);
+
+    if (count > n) {
+        // Second run: the batch wraps around, so the remainder starts at slot 0.
+        const w = count - n;
+        @memcpy(cqes[n..][0..w], self.cq.cqes[0..w]);
+    }
+
+    self.cq_advance(count);
+    return count;
+}
+
+/// Returns a copy of an I/O completion, waiting for it if necessary, and advancing the CQ ring.
+/// A convenience method for `copy_cqes()` for when you don't need to batch or peek.
+pub fn copy_cqe(ring: *IoUring) !linux.io_uring_cqe {
+    var cqes: [1]linux.io_uring_cqe = undefined;
+    while (true) {
+        const count = try ring.copy_cqes(&cqes, 1);
+        if (count > 0) return cqes[0];
+    }
+}
+
+/// Matches the implementation of cq_ring_needs_flush() in liburing.
+pub fn cq_ring_needs_flush(self: *IoUring) bool {
+    return (@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_CQ_OVERFLOW) != 0;
+}
+
+/// For advanced use cases only that implement custom completion queue methods.
+/// If you use copy_cqes() or copy_cqe() you must not call cqe_seen() or cq_advance().
+/// Must be called exactly once after a zero-copy CQE has been processed by your application.
+/// Not idempotent, calling more than once will result in other CQEs being lost.
+/// Matches the implementation of cqe_seen() in liburing.
+pub fn cqe_seen(self: *IoUring, cqe: *linux.io_uring_cqe) void {
+    _ = cqe;
+    self.cq_advance(1);
+}
+
+/// For advanced use cases only that implement custom completion queue methods.
+/// Matches the implementation of cq_advance() in liburing.
+pub fn cq_advance(self: *IoUring, count: u32) void {
+    if (count > 0) {
+        // Ensure the kernel only sees the new head value after the CQEs have been read.
+        @atomicStore(u32, self.cq.head, self.cq.head.* +% count, .Release);
+    }
+}
+
+/// Queues (but does not submit) an SQE to perform an `fsync(2)`.
+/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
+/// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the SQE's `rw_flags`.
+/// N.B. 
/// While SQEs are initiated in the order in which they appear in the submission queue,
/// operations execute in parallel and completions are unordered. Therefore, an application that
/// submits a write followed by an fsync in the submission queue cannot expect the fsync to
/// apply to the write, since the fsync may complete before the write is issued to the disk.
/// You should preferably use `link_with_next_sqe()` on a write's SQE to link it with an fsync,
/// or else insert a full write barrier using `drain_previous_sqes()` when queueing an fsync.
pub fn fsync(self: *IoUring, user_data: u64, fd: os.fd_t, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_fsync(fd, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) a no-op SQE and returns a pointer to it, so that the SQE can be
/// modified further for advanced use cases.
/// A no-op is more useful than it may first appear: for example, calling
/// `drain_previous_sqes()` on the returned SQE lets you use the no-op to learn when the ring
/// is idle before acting on a kill signal.
pub fn nop(self: *IoUring, user_data: u64) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_nop();
    entry.user_data = user_data;
    return entry;
}

/// Selects where the data of a `read` should be placed.
pub const ReadBuffer = union(enum) {
    /// io_uring reads directly into this buffer.
    buffer: []u8,

    /// io_uring reads directly into these buffers using readv.
    iovecs: []const os.iovec,

    /// io_uring picks a buffer that was previously provided with `provide_buffers`.
    /// The buffer group referenced by `group_id` must contain at least one buffer for the read
    /// to work. `len` controls the number of bytes to read into the selected buffer.
    buffer_selection: struct {
        group_id: u16,
        len: usize,
    },
};

/// Queues (without submitting) an SQE to perform a `read(2)` or `preadv(2)`, depending on the
/// buffer type:
/// * Reading into a `ReadBuffer.buffer` uses `read(2)`
/// * Reading into a `ReadBuffer.iovecs` uses `preadv(2)`
/// If you want to do a `preadv2(2)` then set `rw_flags` on the returned SQE.
/// See https://man7.org/linux/man-pages/man2/preadv2.2.html
///
/// Returns a pointer to the SQE.
pub fn read(self: *IoUring, user_data: u64, fd: os.fd_t, buffer: ReadBuffer, offset: u64) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    switch (buffer) {
        .buffer => |bytes| entry.prep_read(fd, bytes, offset),
        .iovecs => |vectors| entry.prep_readv(fd, vectors, offset),
        .buffer_selection => |choice| {
            // No address is passed; the SQE is flagged for buffer selection and `buf_index`
            // carries the buffer group id.
            entry.prep_rw(.READ, fd, 0, choice.len, offset);
            entry.flags |= linux.IOSQE_BUFFER_SELECT;
            entry.buf_index = choice.group_id;
        },
    }
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform a `write(2)`.
/// Returns a pointer to the SQE.
pub fn write(self: *IoUring, user_data: u64, fd: os.fd_t, buffer: []const u8, offset: u64) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_write(fd, buffer, offset);
    entry.user_data = user_data;
    return entry;
}

/// Queues (but does not submit) an SQE to perform a `splice(2)`
/// Either `fd_in` or `fd_out` must be a pipe.
/// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to std.math.maxInt(u64).
/// If `fd_in` does not refer to a pipe and `off_in` is maxInt(u64), then `len` are read
/// from `fd_in` starting from the file offset, which is incremented by the number of bytes read.
/// If `fd_in` does not refer to a pipe and `off_in` is not maxInt(u64), then the starting offset of `fd_in` will be `off_in`.
/// This splice operation can be used to implement sendfile by splicing to an intermediate pipe first,
/// then splice to the final destination. In fact, the implementation of sendfile in kernel uses splice internally.
///
/// NOTE that even if fd_in or fd_out refers to a pipe, the splice operation can still fail with EINVAL if one of the
/// fd doesn't explicitly support splice operation, e.g. reading from terminal is unsupported from kernel 5.7 to 5.11.
/// See https://github.com/axboe/liburing/issues/291
///
/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
pub fn splice(self: *IoUring, user_data: u64, fd_in: os.fd_t, off_in: u64, fd_out: os.fd_t, off_out: u64, len: usize) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_splice(fd_in, off_in, fd_out, off_out, len);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform an IORING_OP_READ_FIXED.
/// The `buffer` must first have been registered with the kernel via `register_buffers`, and
/// `buffer_index` must equal its index in the array that was passed to `register_buffers`.
///
/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
pub fn read_fixed(self: *IoUring, user_data: u64, fd: os.fd_t, buffer: *os.iovec, offset: u64, buffer_index: u16) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_read_fixed(fd, buffer, offset, buffer_index);
    entry.user_data = user_data;
    return entry;
}

/// Queues (but does not submit) an SQE to perform a `pwritev()`.
/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
/// For example, if you want to do a `pwritev2()` then set `rw_flags` on the returned SQE.
/// See https://linux.die.net/man/2/pwritev.
pub fn writev(self: *IoUring, user_data: u64, fd: os.fd_t, iovecs: []const os.iovec_const, offset: u64) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_writev(fd, iovecs, offset);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform an IORING_OP_WRITE_FIXED.
/// The `buffer` must first have been registered with the kernel via `register_buffers`, and
/// `buffer_index` must equal its index in the array that was passed to `register_buffers`.
///
/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
pub fn write_fixed(self: *IoUring, user_data: u64, fd: os.fd_t, buffer: *os.iovec, offset: u64, buffer_index: u16) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_write_fixed(fd, buffer, offset, buffer_index);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform an `accept4(2)` on a socket.
/// Returns a pointer to the SQE.
/// Available since 5.5
pub fn accept(self: *IoUring, user_data: u64, fd: os.fd_t, addr: ?*os.sockaddr, addrlen: ?*os.socklen_t, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_accept(fd, addr, addrlen, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues a multishot accept on a socket.
///
/// Multishot variant allows an application to issue a single accept request,
/// which will repeatedly trigger a CQE when a connection request comes in.
/// While IORING_CQE_F_MORE flag is set in CQE flags accept will generate
/// further CQEs.
///
/// Available since 5.19
pub fn accept_multishot(self: *IoUring, user_data: u64, fd: os.fd_t, addr: ?*os.sockaddr, addrlen: ?*os.socklen_t, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_multishot_accept(fd, addr, addrlen, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues an accept that uses direct (registered) file descriptors.
///
/// Before using an accept direct variant, the application must have registered
/// a file table (with register_files). An unused table index is chosen
/// dynamically and returned in the CQE res field.
///
/// Once created, direct descriptors are used by setting IOSQE_FIXED_FILE in the
/// SQE flags member and setting the SQE fd field to the direct descriptor value
/// instead of the regular file descriptor.
///
/// Available since 5.19
pub fn accept_direct(self: *IoUring, user_data: u64, fd: os.fd_t, addr: ?*os.sockaddr, addrlen: ?*os.socklen_t, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    // IORING_FILE_INDEX_ALLOC is passed as the file index argument.
    entry.prep_accept_direct(fd, addr, addrlen, flags, linux.IORING_FILE_INDEX_ALLOC);
    entry.user_data = user_data;
    return entry;
}

/// Queues a multishot accept that uses direct (registered) file descriptors.
/// Available since 5.19
pub fn accept_multishot_direct(self: *IoUring, user_data: u64, fd: os.fd_t, addr: ?*os.sockaddr, addrlen: ?*os.socklen_t, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_multishot_accept_direct(fd, addr, addrlen, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queue (but does not submit) an SQE to perform a `connect(2)` on a socket.
/// Returns a pointer to the SQE.
pub fn connect(self: *IoUring, user_data: u64, fd: os.fd_t, addr: *const os.sockaddr, addrlen: os.socklen_t) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_connect(fd, addr, addrlen);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform an `epoll_ctl(2)`.
/// Returns a pointer to the SQE.
pub fn epoll_ctl(self: *IoUring, user_data: u64, epfd: os.fd_t, fd: os.fd_t, op: u32, ev: ?*linux.epoll_event) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_epoll_ctl(epfd, fd, op, ev);
    entry.user_data = user_data;
    return entry;
}

/// Selects where the data of a `recv` should be placed.
pub const RecvBuffer = union(enum) {
    /// io_uring receives directly into this buffer.
    buffer: []u8,

    /// io_uring picks a buffer that was previously provided with `provide_buffers`.
    /// The buffer group referenced by `group_id` must contain at least one buffer for the recv
    /// call to work. `len` controls the number of bytes to read into the selected buffer.
    buffer_selection: struct {
        group_id: u16,
        len: usize,
    },
};

/// Queues (without submitting) an SQE to perform a `recv(2)`.
/// Returns a pointer to the SQE.
/// Available since 5.6
pub fn recv(self: *IoUring, user_data: u64, fd: os.fd_t, buffer: RecvBuffer, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    switch (buffer) {
        .buffer => |bytes| entry.prep_recv(fd, bytes, flags),
        .buffer_selection => |choice| {
            // No address is passed; the SQE is flagged for buffer selection and `buf_index`
            // carries the buffer group id. The recv flags go into `rw_flags`.
            entry.prep_rw(.RECV, fd, 0, choice.len, 0);
            entry.rw_flags = flags;
            entry.flags |= linux.IOSQE_BUFFER_SELECT;
            entry.buf_index = choice.group_id;
        },
    }
    entry.user_data = user_data;
    return entry;
}

/// Queues (but does not submit) an SQE to perform a `send(2)`.
/// Returns a pointer to the SQE.
/// Available since 5.6
pub fn send(self: *IoUring, user_data: u64, fd: os.fd_t, buffer: []const u8, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_send(fd, buffer, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform an async zerocopy `send(2)`.
///
/// This operation will most likely produce two CQEs. The flags field of the
/// first cqe will likely contain IORING_CQE_F_MORE, meaning that a second cqe
/// with the same user_data value will follow. The user must not modify the
/// data buffer until that notification is posted. The first cqe follows the
/// usual rules, so its res field contains the number of bytes sent or a
/// negative error code. The notification's res field is set to zero and its
/// flags field contains IORING_CQE_F_NOTIF. The two step model is needed
/// because the kernel may hold on to buffers for a long time, e.g. while
/// waiting for a TCP ACK. Notifications are responsible for controlling the
/// lifetime of the buffers. Even errored requests may generate a notification.
///
/// Available since 6.0
pub fn send_zc(self: *IoUring, user_data: u64, fd: os.fd_t, buffer: []const u8, send_flags: u32, zc_flags: u16) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_send_zc(fd, buffer, send_flags, zc_flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`.
/// Returns a pointer to the SQE.
/// Available since 6.0
pub fn send_zc_fixed(self: *IoUring, user_data: u64, fd: os.fd_t, buffer: []const u8, send_flags: u32, zc_flags: u16, buf_index: u16) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_send_zc_fixed(fd, buffer, send_flags, zc_flags, buf_index);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform a `recvmsg(2)`.
/// Returns a pointer to the SQE.
/// Available since 5.3
pub fn recvmsg(self: *IoUring, user_data: u64, fd: os.fd_t, msg: *os.msghdr, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_recvmsg(fd, msg, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform a `sendmsg(2)`.
/// Returns a pointer to the SQE.
/// Available since 5.3
pub fn sendmsg(self: *IoUring, user_data: u64, fd: os.fd_t, msg: *const os.msghdr_const, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_sendmsg(fd, msg, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform an async zerocopy `sendmsg(2)`.
/// Returns a pointer to the SQE.
/// Available since 6.1
pub fn sendmsg_zc(self: *IoUring, user_data: u64, fd: os.fd_t, msg: *const os.msghdr_const, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_sendmsg_zc(fd, msg, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform an `openat(2)`.
/// Returns a pointer to the SQE.
/// Available since 5.6.
pub fn openat(self: *IoUring, user_data: u64, fd: os.fd_t, path: [*:0]const u8, flags: linux.O, mode: os.mode_t) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_openat(fd, path, flags, mode);
    entry.user_data = user_data;
    return entry;
}

/// Queues an openat using direct (registered) file descriptors.
///
/// To use an openat direct variant, the application must first have registered
/// a file table (with register_files). An unused table index will be
/// dynamically chosen and returned in the CQE res field.
///
/// After creation, they can be used by setting IOSQE_FIXED_FILE in the SQE
/// flags member, and setting the SQE fd field to the direct descriptor value
/// rather than the regular file descriptor.
///
/// Available since 5.15
pub fn openat_direct(self: *IoUring, user_data: u64, fd: os.fd_t, path: [*:0]const u8, flags: linux.O, mode: os.mode_t, file_index: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_openat_direct(fd, path, flags, mode, file_index);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform a `close(2)`.
/// Returns a pointer to the SQE.
/// Available since 5.6.
pub fn close(self: *IoUring, user_data: u64, fd: os.fd_t) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_close(fd);
    entry.user_data = user_data;
    return entry;
}

/// Queues the close of a registered file descriptor.
/// Available since 5.15
pub fn close_direct(self: *IoUring, user_data: u64, file_index: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_close_direct(file_index);
    entry.user_data = user_data;
    return entry;
}

/// Queues (but does not submit) an SQE to register a timeout operation.
/// Returns a pointer to the SQE.
///
/// The timeout will complete when either the timeout expires, or after the specified number of
/// events complete (if `count` is greater than `0`).
///
/// `flags` may be `0` for a relative timeout, or `IORING_TIMEOUT_ABS` for an absolute timeout.
///
/// The completion event result will be `-ETIME` if the timeout completed through expiration,
/// `0` if the timeout completed after the specified number of events, or `-ECANCELED` if the
/// timeout was removed before it expired.
///
/// io_uring timeouts use the `CLOCK.MONOTONIC` clock source.
pub fn timeout(self: *IoUring, user_data: u64, ts: *const os.linux.kernel_timespec, count: u32, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_timeout(ts, count, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to remove an existing timeout operation.
/// Returns a pointer to the SQE.
///
/// The timeout to remove is identified by its `user_data`, passed as `timeout_user_data`.
///
/// The completion event result will be `0` if the timeout was found and cancelled successfully,
/// `-EBUSY` if the timeout was found but expiration was already in progress, or
/// `-ENOENT` if the timeout was not found.
pub fn timeout_remove(self: *IoUring, user_data: u64, timeout_user_data: u64, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_timeout_remove(timeout_user_data, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (but does not submit) an SQE to add a link timeout operation.
/// Returns a pointer to the SQE.
///
/// You need to set linux.IOSQE_IO_LINK to flags of the target operation
/// and then call this method right after the target operation.
/// See https://lwn.net/Articles/803932/ for detail.
///
/// If the dependent request finishes before the linked timeout, the timeout
/// is canceled. If the timeout finishes before the dependent request, the
/// dependent request will be canceled.
///
/// The completion event result of the link_timeout will be
/// `-ETIME` if the timeout finishes before the dependent request
/// (in this case, the completion event result of the dependent request will
/// be `-ECANCELED`), or
/// `-EALREADY` if the dependent request finishes before the linked timeout.
pub fn link_timeout(self: *IoUring, user_data: u64, ts: *const os.linux.kernel_timespec, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_link_timeout(ts, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform a `poll(2)`.
/// Returns a pointer to the SQE.
pub fn poll_add(self: *IoUring, user_data: u64, fd: os.fd_t, poll_mask: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_poll_add(fd, poll_mask);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to remove an existing poll operation.
/// Returns a pointer to the SQE.
pub fn poll_remove(self: *IoUring, user_data: u64, target_user_data: u64) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_poll_remove(target_user_data);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to update the user data of an existing poll
/// operation. Returns a pointer to the SQE.
pub fn poll_update(self: *IoUring, user_data: u64, old_user_data: u64, new_user_data: u64, poll_mask: u32, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_poll_update(old_user_data, new_user_data, poll_mask, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform a `fallocate(2)`.
/// Returns a pointer to the SQE.
pub fn fallocate(self: *IoUring, user_data: u64, fd: os.fd_t, mode: i32, offset: u64, len: u64) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_fallocate(fd, mode, offset, len);
    entry.user_data = user_data;
    return entry;
}

/// Queues (but does not submit) an SQE to perform an `statx(2)`.
/// Returns a pointer to the SQE.
pub fn statx(self: *IoUring, user_data: u64, fd: os.fd_t, path: [:0]const u8, flags: u32, mask: u32, buf: *linux.Statx) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_statx(fd, path, flags, mask, buf);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to remove an existing operation.
/// Returns a pointer to the SQE.
///
/// The operation to cancel is identified by its `user_data`, passed as `cancel_user_data`.
///
/// The completion event result will be `0` if the operation was found and cancelled successfully,
/// `-EALREADY` if the operation was found but was already in progress, or
/// `-ENOENT` if the operation was not found.
pub fn cancel(self: *IoUring, user_data: u64, cancel_user_data: u64, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_cancel(cancel_user_data, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform a `shutdown(2)`.
/// Returns a pointer to the SQE.
///
/// The queued SQE carries the given `user_data`.
pub fn shutdown(self: *IoUring, user_data: u64, sockfd: os.socket_t, how: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_shutdown(sockfd, how);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform a `renameat2(2)`.
/// Returns a pointer to the SQE.
pub fn renameat(self: *IoUring, user_data: u64, old_dir_fd: os.fd_t, old_path: [*:0]const u8, new_dir_fd: os.fd_t, new_path: [*:0]const u8, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_renameat(old_dir_fd, old_path, new_dir_fd, new_path, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (but does not submit) an SQE to perform a `unlinkat(2)`.
/// Returns a pointer to the SQE.
pub fn unlinkat(self: *IoUring, user_data: u64, dir_fd: os.fd_t, path: [*:0]const u8, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_unlinkat(dir_fd, path, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform a `mkdirat(2)`.
/// Returns a pointer to the SQE.
pub fn mkdirat(self: *IoUring, user_data: u64, dir_fd: os.fd_t, path: [*:0]const u8, mode: os.mode_t) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_mkdirat(dir_fd, path, mode);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform a `symlinkat(2)`.
/// Returns a pointer to the SQE.
pub fn symlinkat(self: *IoUring, user_data: u64, target: [*:0]const u8, new_dir_fd: os.fd_t, link_path: [*:0]const u8) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_symlinkat(target, new_dir_fd, link_path);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform a `linkat(2)`.
/// Returns a pointer to the SQE.
pub fn linkat(self: *IoUring, user_data: u64, old_dir_fd: os.fd_t, old_path: [*:0]const u8, new_dir_fd: os.fd_t, new_path: [*:0]const u8, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_linkat(old_dir_fd, old_path, new_dir_fd, new_path, flags);
    entry.user_data = user_data;
    return entry;
}

/// Queues (but does not submit) an SQE to provide a group of buffers used for commands that read/receive data.
/// Returns a pointer to the SQE.
///
/// Provided buffers can be used in `read`, `recv` or `recvmsg` commands via .buffer_selection.
///
/// The kernel expects a contiguous block of memory of size (buffers_count * buffer_size).
pub fn provide_buffers(self: *IoUring, user_data: u64, buffers: [*]u8, buffer_size: usize, buffers_count: usize, group_id: usize, buffer_id: usize) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_provide_buffers(buffers, buffer_size, buffers_count, group_id, buffer_id);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to remove a group of provided buffers.
/// Returns a pointer to the SQE.
pub fn remove_buffers(self: *IoUring, user_data: u64, buffers_count: usize, group_id: usize) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_remove_buffers(buffers_count, group_id);
    entry.user_data = user_data;
    return entry;
}

/// Queues (without submitting) an SQE to perform a `waitid(2)`.
/// Returns a pointer to the SQE.
pub fn waitid(self: *IoUring, user_data: u64, id_type: linux.P, id: i32, infop: *linux.siginfo_t, options: u32, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_waitid(id_type, id, infop, options, flags);
    entry.user_data = user_data;
    return entry;
}

/// Registers an array of file descriptors.
/// Every time a file descriptor is put in an SQE and submitted to the kernel, the kernel must
/// retrieve a reference to the file, and once I/O has completed the file reference must be
/// dropped. The atomic nature of this file reference can be a slowdown for high IOPS workloads.
/// This slowdown can be avoided by pre-registering file descriptors.
/// To refer to a registered file descriptor, IOSQE_FIXED_FILE must be set in the SQE's flags,
/// and the SQE's fd must be set to the index of the file descriptor in the registered array.
/// Registering file descriptors will wait for the ring to idle.
/// Files are automatically unregistered by the kernel when the ring is torn down.
/// An application need unregister only if it wants to register a new array of file descriptors.
pub fn register_files(self: *IoUring, fds: []const os.fd_t) !void {
    assert(self.fd >= 0);
    const fds_ptr: *const anyopaque = @ptrCast(fds.ptr);
    const fds_len: u32 = @intCast(fds.len);
    try handle_registration_result(linux.io_uring_register(self.fd, .REGISTER_FILES, fds_ptr, fds_len));
}

/// Updates registered file descriptors.
///
/// Updates are applied starting at the provided offset in the original file descriptors slice.
/// There are three kinds of updates:
/// * turning a sparse entry (where the fd is -1) into a real one
/// * removing an existing entry (set the fd to -1)
/// * replacing an existing entry with a new fd
/// Adding new file descriptors must be done with `register_files`.
pub fn register_files_update(self: *IoUring, offset: u32, fds: []const os.fd_t) !void {
    assert(self.fd >= 0);

    // Argument block handed to the kernel: the start offset, a reserved word that is zeroed,
    // and the address of the fd array stored as a 64-bit integer.
    const FilesUpdate = extern struct {
        offset: u32,
        resv: u32,
        fds: u64 align(8),
    };
    const update = FilesUpdate{
        .offset = offset,
        .resv = @as(u32, 0),
        .fds = @as(u64, @intFromPtr(fds.ptr)),
    };

    const update_ptr: *const anyopaque = @ptrCast(&update);
    const fds_len: u32 = @intCast(fds.len);
    try handle_registration_result(linux.io_uring_register(self.fd, .REGISTER_FILES_UPDATE, update_ptr, fds_len));
}

/// Registers the file descriptor for an eventfd that will be notified of completion events on
/// an io_uring instance.
/// Only a single eventfd can be registered at any given point in time.
pub fn register_eventfd(self: *IoUring, fd: os.fd_t) !void {
    assert(self.fd >= 0);
    const fd_ptr: *const anyopaque = @ptrCast(&fd);
    try handle_registration_result(linux.io_uring_register(self.fd, .REGISTER_EVENTFD, fd_ptr, 1));
}

/// Registers the file descriptor for an eventfd that will be notified of completion events on
/// an io_uring instance. Notifications are only posted for events that complete in an async manner.
/// This means that events that complete inline while being submitted do not trigger a notification event.
/// Only a single eventfd can be registered at any given point in time.
pub fn register_eventfd_async(self: *IoUring, fd: os.fd_t) !void {
    assert(self.fd >= 0);
    const fd_ptr: *const anyopaque = @ptrCast(&fd);
    try handle_registration_result(linux.io_uring_register(self.fd, .REGISTER_EVENTFD_ASYNC, fd_ptr, 1));
}

/// Unregisters the currently registered eventfd file descriptor.
pub fn unregister_eventfd(self: *IoUring) !void {
    assert(self.fd >= 0);
    try handle_registration_result(linux.io_uring_register(self.fd, .UNREGISTER_EVENTFD, null, 0));
}

/// Registers an array of buffers for use with `read_fixed` and `write_fixed`.
pub fn register_buffers(self: *IoUring, buffers: []const os.iovec) !void {
    assert(self.fd >= 0);
    const buffers_len: u32 = @intCast(buffers.len);
    try handle_registration_result(linux.io_uring_register(self.fd, .REGISTER_BUFFERS, buffers.ptr, buffers_len));
}

/// Unregisters the registered buffers.
/// Returns `error.BuffersNotRegistered` when the kernel reports ENXIO.
pub fn unregister_buffers(self: *IoUring) !void {
    assert(self.fd >= 0);
    switch (linux.getErrno(linux.io_uring_register(self.fd, .UNREGISTER_BUFFERS, null, 0))) {
        .SUCCESS => {},
        .NXIO => return error.BuffersNotRegistered,
        else => |code| return os.unexpectedErrno(code),
    }
}

/// Translates the raw return value of an `io_uring_register` call into a Zig error.
/// Returns normally on success; any errno not listed below is reported via `os.unexpectedErrno`.
fn handle_registration_result(res: usize) !void {
    const code = linux.getErrno(res);
    switch (code) {
        .SUCCESS => {},
        // One or more fds in the array are invalid, or the kernel does not support sparse sets:
        .BADF => return error.FileDescriptorInvalid,
        .BUSY => return error.FilesAlreadyRegistered,
        .INVAL => return error.FilesEmpty,
        // Adding `nr_args` file references would exceed the maximum allowed number of files the
        // user is allowed to have according to the per-user RLIMIT_NOFILE resource limit and
        // the CAP_SYS_RESOURCE capability is not set, or `nr_args` exceeds the maximum allowed
        // for a fixed file set (older kernels have a limit of 1024 files vs 64K files):
        .MFILE => return error.UserFdQuotaExceeded,
        // Insufficient kernel resources, or the caller had a non-zero RLIMIT_MEMLOCK soft
        // resource limit but tried to lock more memory than the limit permitted (not enforced
        // when the process is privileged with CAP_IPC_LOCK):
        .NOMEM => return error.SystemResources,
        // Attempt to register files on a ring already registering files or being torn down:
        .NXIO => return error.RingShuttingDownOrAlreadyRegisteringFiles,
        else => return os.unexpectedErrno(code),
    }
}

/// Unregisters all registered file descriptors previously associated with the ring.
/// Returns `error.FilesNotRegistered` when the kernel reports ENXIO.
pub fn unregister_files(self: *IoUring) !void {
    assert(self.fd >= 0);
    switch (linux.getErrno(linux.io_uring_register(self.fd, .UNREGISTER_FILES, null, 0))) {
        .SUCCESS => {},
        .NXIO => return error.FilesNotRegistered,
        else => |code| return os.unexpectedErrno(code),
    }
}

/// Prepares a socket creation request.
/// The new socket fd is returned in the completion result.
/// Available since 5.19
pub fn socket(self: *IoUring, user_data: u64, domain: u32, socket_type: u32, protocol: u32, flags: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_socket(domain, socket_type, protocol, flags);
    entry.user_data = user_data;
    return entry;
}

/// Prepares a socket creation request for the registered file at index `file_index`.
/// Available since 5.19
pub fn socket_direct(self: *IoUring, user_data: u64, domain: u32, socket_type: u32, protocol: u32, flags: u32, file_index: u32) !*linux.io_uring_sqe {
    const entry = try self.get_sqe();
    entry.prep_socket_direct(domain, socket_type, protocol, flags, file_index);
    entry.user_data = user_data;
    return entry;
}

/// Prepares a socket creation request for registered file, index chosen by kernel (file index alloc).
/// File index will be returned in CQE res field.
/// Available since 5.19
pub fn socket_direct_alloc(
    self: *IoUring,
    user_data: u64,
    domain: u32,
    socket_type: u32,
    protocol: u32,
    flags: u32,
) !*linux.io_uring_sqe {
    const sqe = try self.get_sqe();
    sqe.prep_socket_direct_alloc(domain, socket_type, protocol, flags);
    sqe.user_data = user_data;
    return sqe;
}

/// Submission-queue view over two shared mappings of the ring fd: the ring mapping
/// (`mmap`, at `IORING_OFF_SQ_RING`) and the SQE array mapping (`mmap_sqes`, at
/// `IORING_OFF_SQES`). `head`, `tail`, `flags`, `dropped` and `array` point into `mmap`;
/// `sqes` points into `mmap_sqes`.
pub const SubmissionQueue = struct {
    head: *u32,
    tail: *u32,
    mask: u32,
    flags: *u32,
    dropped: *u32,
    array: []u32,
    sqes: []linux.io_uring_sqe,
    mmap: []align(mem.page_size) u8,
    mmap_sqes: []align(mem.page_size) u8,

    // We use `sqe_head` and `sqe_tail` in the same way as liburing:
    // We increment `sqe_tail` (but not `tail`) for each call to `get_sqe()`.
    // We then set `tail` to `sqe_tail` once, only when these events are actually submitted.
    // This allows us to amortize the cost of the @atomicStore to `tail` across multiple SQEs.
    sqe_head: u32 = 0,
    sqe_tail: u32 = 0,

    /// Maps the ring and the SQE array of `fd` and builds the queue view from the offsets
    /// in `p`. Asserts that `fd` is non-negative and that `p.features` advertises
    /// `IORING_FEAT_SINGLE_MMAP`. If the second mapping fails, the first one is unmapped
    /// again before the error is returned.
    pub fn init(fd: os.fd_t, p: linux.io_uring_params) !SubmissionQueue {
        assert(fd >= 0);
        assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0);
        // One mapping must cover both rings, so size it to whichever ends further out:
        // the SQ index array or the CQE array.
        const size = @max(
            p.sq_off.array + p.sq_entries * @sizeOf(u32),
            p.cq_off.cqes + p.cq_entries * @sizeOf(linux.io_uring_cqe),
        );
        const mmap = try os.mmap(
            null,
            size,
            os.PROT.READ | os.PROT.WRITE,
            .{ .TYPE = .SHARED, .POPULATE = true },
            fd,
            linux.IORING_OFF_SQ_RING,
        );
        errdefer os.munmap(mmap);
        assert(mmap.len == size);

        // The motivation for the `sqes` and `array` indirection is to make it possible for the
        // application to preallocate static linux.io_uring_sqe entries and then replay them when needed.
        const size_sqes = p.sq_entries * @sizeOf(linux.io_uring_sqe);
        const mmap_sqes = try os.mmap(
            null,
            size_sqes,
            os.PROT.READ | os.PROT.WRITE,
            .{ .TYPE = .SHARED, .POPULATE = true },
            fd,
            linux.IORING_OFF_SQES,
        );
        errdefer os.munmap(mmap_sqes);
        assert(mmap_sqes.len == size_sqes);

        const array: [*]u32 = @ptrCast(@alignCast(&mmap[p.sq_off.array]));
        const sqes: [*]linux.io_uring_sqe = @ptrCast(@alignCast(&mmap_sqes[0]));
        // We expect the kernel copies p.sq_entries to the u32 pointed to by p.sq_off.ring_entries,
        // see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844.
        assert(p.sq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_entries]))).*);
        return SubmissionQueue{
            .head = @ptrCast(@alignCast(&mmap[p.sq_off.head])),
            .tail = @ptrCast(@alignCast(&mmap[p.sq_off.tail])),
            // The mask is read once here and stored by value, unlike head/tail which stay pointers.
            .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_mask]))).*,
            .flags = @ptrCast(@alignCast(&mmap[p.sq_off.flags])),
            .dropped = @ptrCast(@alignCast(&mmap[p.sq_off.dropped])),
            .array = array[0..p.sq_entries],
            .sqes = sqes[0..p.sq_entries],
            .mmap = mmap,
            .mmap_sqes = mmap_sqes,
        };
    }

    /// Unmaps the SQE array mapping and then the ring mapping.
    pub fn deinit(self: *SubmissionQueue) void {
        os.munmap(self.mmap_sqes);
        os.munmap(self.mmap);
    }
};

/// Completion-queue view. It owns no mapping of its own: every pointer and the `cqes`
/// slice point into the mapping held by the `SubmissionQueue` passed to `init`.
pub const CompletionQueue = struct {
    head: *u32,
    tail: *u32,
    mask: u32,
    overflow: *u32,
    cqes: []linux.io_uring_cqe,

    /// Builds the view from `sq.mmap` using the completion-queue offsets in `p`.
    /// Asserts that `fd` is non-negative and that `IORING_FEAT_SINGLE_MMAP` is set.
    pub fn init(fd: os.fd_t, p: linux.io_uring_params, sq: SubmissionQueue) !CompletionQueue {
        assert(fd >= 0);
        assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0);
        const mmap = sq.mmap;
        const cqes: [*]linux.io_uring_cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes]));
        assert(p.cq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_entries]))).*);
        return CompletionQueue{
            .head = @ptrCast(@alignCast(&mmap[p.cq_off.head])),
            .tail = @ptrCast(@alignCast(&mmap[p.cq_off.tail])),
            .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_mask]))).*,
            .overflow = @ptrCast(@alignCast(&mmap[p.cq_off.overflow])),
            .cqes = cqes[0..p.cq_entries],
        };
    }

    pub fn deinit(self: *CompletionQueue) void {
        _ = self;
        // A no-op since we now share the mmap with the submission queue.
        // Here for symmetry with the submission queue, and for any future feature support.
    }
};

// Pins the struct sizes and mmap offset constants this file relies on, and checks that
// `IoUring.init` rejects an entry count of zero and one that is not a power of two.
test "structs/offsets/entries" {
    if (builtin.os.tag != .linux) return error.SkipZigTest;

    try testing.expectEqual(@as(usize, 120), @sizeOf(linux.io_uring_params));
    try testing.expectEqual(@as(usize, 64), @sizeOf(linux.io_uring_sqe));
    try testing.expectEqual(@as(usize, 16), @sizeOf(linux.io_uring_cqe));

    try testing.expectEqual(0, linux.IORING_OFF_SQ_RING);
    try testing.expectEqual(0x8000000, linux.IORING_OFF_CQ_RING);
    try testing.expectEqual(0x10000000, linux.IORING_OFF_SQES);

    try testing.expectError(error.EntriesZero, IoUring.init(0, 0));
    try testing.expectError(error.EntriesNotPowerOfTwo, IoUring.init(3, 0));
}

// Submits NOPs and tracks the queue counters (`sqe_head`, `sqe_tail`, `tail`, `cq.head`)
// before and after each submit and each reaped completion.
test "nop" {
    if (builtin.os.tag != .linux) return error.SkipZigTest;

    var ring = IoUring.init(1, 0) catch |err| switch (err) {
        error.SystemOutdated => return error.SkipZigTest,
        error.PermissionDenied => return error.SkipZigTest,
        else => return err,
    };
    defer {
        ring.deinit();
        // `try` cannot be used inside `defer`, so a failed expectation panics instead.
        testing.expectEqual(@as(os.fd_t, -1), ring.fd) catch @panic("test failed");
    }

    const sqe = try ring.nop(0xaaaaaaaa);
    try testing.expectEqual(linux.io_uring_sqe{
        .opcode = .NOP,
        .flags = 0,
        .ioprio = 0,
        .fd = 0,
        .off = 0,
        .addr = 0,
        .len = 0,
        .rw_flags = 0,
        .user_data = 0xaaaaaaaa,
        .buf_index = 0,
        .personality = 0,
        .splice_fd_in = 0,
        .addr3 = 0,
        .resv = 0,
    }, sqe.*);

    // Queued but not yet submitted: only `sqe_tail` has advanced.
    try testing.expectEqual(@as(u32, 0), ring.sq.sqe_head);
    try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail);
    try testing.expectEqual(@as(u32, 0), ring.sq.tail.*);
    try testing.expectEqual(@as(u32, 0), ring.cq.head.*);
    try testing.expectEqual(@as(u32, 1), ring.sq_ready());
    try testing.expectEqual(@as(u32, 0), ring.cq_ready());

    try testing.expectEqual(@as(u32, 1), try ring.submit());
    try testing.expectEqual(@as(u32, 1), ring.sq.sqe_head);
    try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail);
    try testing.expectEqual(@as(u32, 1), ring.sq.tail.*);
    try testing.expectEqual(@as(u32, 0), ring.cq.head.*);
    try testing.expectEqual(@as(u32, 0), ring.sq_ready());

    try testing.expectEqual(linux.io_uring_cqe{
        .user_data = 0xaaaaaaaa,
        .res = 0,
        .flags = 0,
    }, try ring.copy_cqe());
    try testing.expectEqual(@as(u32, 1), ring.cq.head.*);
    try testing.expectEqual(@as(u32, 0), ring.cq_ready());

    // Second NOP, this time with the IOSQE_IO_DRAIN flag set on the SQE.
    const sqe_barrier = try ring.nop(0xbbbbbbbb);
    sqe_barrier.flags |= linux.IOSQE_IO_DRAIN;
    try testing.expectEqual(@as(u32, 1), try ring.submit());
    try testing.expectEqual(linux.io_uring_cqe{
        .user_data = 0xbbbbbbbb,
        .res = 0,
        .flags = 0,
    }, try ring.copy_cqe());
    try testing.expectEqual(@as(u32, 2), ring.sq.sqe_head);
    try testing.expectEqual(@as(u32, 2), ring.sq.sqe_tail);
    try testing.expectEqual(@as(u32, 2), ring.sq.tail.*);
    try testing.expectEqual(@as(u32, 2), ring.cq.head.*);
}

// Reads from /dev/zero through a registered (fixed) file using an iovec read.
test "readv" {
    if (builtin.os.tag != .linux) return error.SkipZigTest;

    var ring = IoUring.init(1, 0) catch |err| switch (err) {
        error.SystemOutdated => return error.SkipZigTest,
        error.PermissionDenied => return error.SkipZigTest,
        else => return err,
    };
    defer ring.deinit();

    const fd = try os.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
    defer os.close(fd);

    // Linux Kernel 5.4 supports IORING_REGISTER_FILES but not sparse fd sets (i.e. an fd of -1).
    // Linux Kernel 5.5 adds support for sparse fd sets.
+ // Compare: + // https://github.com/torvalds/linux/blob/v5.4/fs/io_uring.c#L3119-L3124 vs + // https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L6687-L6691 + // We therefore avoid stressing sparse fd sets here: + var registered_fds = [_]os.fd_t{0} ** 1; + const fd_index = 0; + registered_fds[fd_index] = fd; + try ring.register_files(registered_fds[0..]); + + var buffer = [_]u8{42} ** 128; + var iovecs = [_]os.iovec{os.iovec{ .iov_base = &buffer, .iov_len = buffer.len }}; + const sqe = try ring.read(0xcccccccc, fd_index, .{ .iovecs = iovecs[0..] }, 0); + try testing.expectEqual(linux.IORING_OP.READV, sqe.opcode); + sqe.flags |= linux.IOSQE_FIXED_FILE; + + try testing.expectError(error.SubmissionQueueFull, ring.nop(0)); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xcccccccc, + .res = buffer.len, + .flags = 0, + }, try ring.copy_cqe()); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + + try ring.unregister_files(); +} + +test "writev/fsync/readv" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(4, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const path = "test_io_uring_writev_fsync_readv"; + const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true }); + defer file.close(); + const fd = file.handle; + + const buffer_write = [_]u8{42} ** 128; + const iovecs_write = [_]os.iovec_const{ + os.iovec_const{ .iov_base = &buffer_write, .iov_len = buffer_write.len }, + }; + var buffer_read = [_]u8{0} ** 128; + var iovecs_read = [_]os.iovec{ + os.iovec{ .iov_base = &buffer_read, .iov_len = buffer_read.len }, + }; + + const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); + try 
testing.expectEqual(linux.IORING_OP.WRITEV, sqe_writev.opcode); + try testing.expectEqual(@as(u64, 17), sqe_writev.off); + sqe_writev.flags |= linux.IOSQE_IO_LINK; + + const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, 0); + try testing.expectEqual(linux.IORING_OP.FSYNC, sqe_fsync.opcode); + try testing.expectEqual(fd, sqe_fsync.fd); + sqe_fsync.flags |= linux.IOSQE_IO_LINK; + + const sqe_readv = try ring.read(0xffffffff, fd, .{ .iovecs = iovecs_read[0..] }, 17); + try testing.expectEqual(linux.IORING_OP.READV, sqe_readv.opcode); + try testing.expectEqual(@as(u64, 17), sqe_readv.off); + + try testing.expectEqual(@as(u32, 3), ring.sq_ready()); + try testing.expectEqual(@as(u32, 3), try ring.submit_and_wait(3)); + try testing.expectEqual(@as(u32, 0), ring.sq_ready()); + try testing.expectEqual(@as(u32, 3), ring.cq_ready()); + + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xdddddddd, + .res = buffer_write.len, + .flags = 0, + }, try ring.copy_cqe()); + try testing.expectEqual(@as(u32, 2), ring.cq_ready()); + + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xeeeeeeee, + .res = 0, + .flags = 0, + }, try ring.copy_cqe()); + try testing.expectEqual(@as(u32, 1), ring.cq_ready()); + + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xffffffff, + .res = buffer_read.len, + .flags = 0, + }, try ring.copy_cqe()); + try testing.expectEqual(@as(u32, 0), ring.cq_ready()); + + try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); +} + +test "write/read" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(2, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const path = "test_io_uring_write_read"; + const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true }); + defer 
file.close(); + const fd = file.handle; + + const buffer_write = [_]u8{97} ** 20; + var buffer_read = [_]u8{98} ** 20; + const sqe_write = try ring.write(0x11111111, fd, buffer_write[0..], 10); + try testing.expectEqual(linux.IORING_OP.WRITE, sqe_write.opcode); + try testing.expectEqual(@as(u64, 10), sqe_write.off); + sqe_write.flags |= linux.IOSQE_IO_LINK; + const sqe_read = try ring.read(0x22222222, fd, .{ .buffer = buffer_read[0..] }, 10); + try testing.expectEqual(linux.IORING_OP.READ, sqe_read.opcode); + try testing.expectEqual(@as(u64, 10), sqe_read.off); + try testing.expectEqual(@as(u32, 2), try ring.submit()); + + const cqe_write = try ring.copy_cqe(); + const cqe_read = try ring.copy_cqe(); + // Prior to Linux Kernel 5.6 this is the only way to test for read/write support: + // https://lwn.net/Articles/809820/ + if (cqe_write.err() == .INVAL) return error.SkipZigTest; + if (cqe_read.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x11111111, + .res = buffer_write.len, + .flags = 0, + }, cqe_write); + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x22222222, + .res = buffer_read.len, + .flags = 0, + }, cqe_read); + try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); +} + +test "splice/read" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(4, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var tmp = std.testing.tmpDir(.{}); + const path_src = "test_io_uring_splice_src"; + const file_src = try tmp.dir.createFile(path_src, .{ .read = true, .truncate = true }); + defer file_src.close(); + const fd_src = file_src.handle; + + const path_dst = "test_io_uring_splice_dst"; + const file_dst = try tmp.dir.createFile(path_dst, .{ .read = true, .truncate = true }); + defer file_dst.close(); + const 
fd_dst = file_dst.handle; + + const buffer_write = [_]u8{97} ** 20; + var buffer_read = [_]u8{98} ** 20; + _ = try file_src.write(&buffer_write); + + const fds = try os.pipe(); + const pipe_offset: u64 = std.math.maxInt(u64); + + const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len); + try testing.expectEqual(linux.IORING_OP.SPLICE, sqe_splice_to_pipe.opcode); + try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr); + try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off); + sqe_splice_to_pipe.flags |= linux.IOSQE_IO_LINK; + + const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len); + try testing.expectEqual(linux.IORING_OP.SPLICE, sqe_splice_from_pipe.opcode); + try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr); + try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off); + sqe_splice_from_pipe.flags |= linux.IOSQE_IO_LINK; + + const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..] 
}, 10); + try testing.expectEqual(linux.IORING_OP.READ, sqe_read.opcode); + try testing.expectEqual(@as(u64, 10), sqe_read.off); + try testing.expectEqual(@as(u32, 3), try ring.submit()); + + const cqe_splice_to_pipe = try ring.copy_cqe(); + const cqe_splice_from_pipe = try ring.copy_cqe(); + const cqe_read = try ring.copy_cqe(); + // Prior to Linux Kernel 5.6 this is the only way to test for splice/read support: + // https://lwn.net/Articles/809820/ + if (cqe_splice_to_pipe.err() == .INVAL) return error.SkipZigTest; + if (cqe_splice_from_pipe.err() == .INVAL) return error.SkipZigTest; + if (cqe_read.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x11111111, + .res = buffer_write.len, + .flags = 0, + }, cqe_splice_to_pipe); + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x22222222, + .res = buffer_write.len, + .flags = 0, + }, cqe_splice_from_pipe); + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x33333333, + .res = buffer_read.len, + .flags = 0, + }, cqe_read); + try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); +} + +test "write_fixed/read_fixed" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(2, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const path = "test_io_uring_write_read_fixed"; + const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true }); + defer file.close(); + const fd = file.handle; + + var raw_buffers: [2][11]u8 = undefined; + // First buffer will be written to the file. 
+ @memset(&raw_buffers[0], 'z'); + raw_buffers[0][0.."foobar".len].* = "foobar".*; + + var buffers = [2]os.iovec{ + .{ .iov_base = &raw_buffers[0], .iov_len = raw_buffers[0].len }, + .{ .iov_base = &raw_buffers[1], .iov_len = raw_buffers[1].len }, + }; + ring.register_buffers(&buffers) catch |err| switch (err) { + error.SystemResources => { + // See https://github.com/ziglang/zig/issues/15362 + return error.SkipZigTest; + }, + else => |e| return e, + }; + + const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0); + try testing.expectEqual(linux.IORING_OP.WRITE_FIXED, sqe_write.opcode); + try testing.expectEqual(@as(u64, 3), sqe_write.off); + sqe_write.flags |= linux.IOSQE_IO_LINK; + + const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1); + try testing.expectEqual(linux.IORING_OP.READ_FIXED, sqe_read.opcode); + try testing.expectEqual(@as(u64, 0), sqe_read.off); + + try testing.expectEqual(@as(u32, 2), try ring.submit()); + + const cqe_write = try ring.copy_cqe(); + const cqe_read = try ring.copy_cqe(); + + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x45454545, + .res = @as(i32, @intCast(buffers[0].iov_len)), + .flags = 0, + }, cqe_write); + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x12121212, + .res = @as(i32, @intCast(buffers[1].iov_len)), + .flags = 0, + }, cqe_read); + + try testing.expectEqualSlices(u8, "\x00\x00\x00", buffers[1].iov_base[0..3]); + try testing.expectEqualSlices(u8, "foobar", buffers[1].iov_base[3..9]); + try testing.expectEqualSlices(u8, "zz", buffers[1].iov_base[9..11]); +} + +test "openat" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const path = "test_io_uring_openat"; + + // 
Workaround for LLVM bug: https://github.com/ziglang/zig/issues/12014 + const path_addr = if (builtin.zig_backend == .stage2_llvm) p: { + var workaround = path; + _ = &workaround; + break :p @intFromPtr(workaround); + } else @intFromPtr(path); + + const flags: linux.O = .{ .CLOEXEC = true, .ACCMODE = .RDWR, .CREAT = true }; + const mode: os.mode_t = 0o666; + const sqe_openat = try ring.openat(0x33333333, tmp.dir.fd, path, flags, mode); + try testing.expectEqual(linux.io_uring_sqe{ + .opcode = .OPENAT, + .flags = 0, + .ioprio = 0, + .fd = tmp.dir.fd, + .off = 0, + .addr = path_addr, + .len = mode, + .rw_flags = @bitCast(flags), + .user_data = 0x33333333, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }, sqe_openat.*); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe_openat = try ring.copy_cqe(); + try testing.expectEqual(@as(u64, 0x33333333), cqe_openat.user_data); + if (cqe_openat.err() == .INVAL) return error.SkipZigTest; + if (cqe_openat.err() == .BADF) return error.SkipZigTest; + if (cqe_openat.res <= 0) std.debug.print("\ncqe_openat.res={}\n", .{cqe_openat.res}); + try testing.expect(cqe_openat.res > 0); + try testing.expectEqual(@as(u32, 0), cqe_openat.flags); + + os.close(cqe_openat.res); +} + +test "close" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const path = "test_io_uring_close"; + const file = try tmp.dir.createFile(path, .{}); + errdefer file.close(); + + const sqe_close = try ring.close(0x44444444, file.handle); + try testing.expectEqual(linux.IORING_OP.CLOSE, sqe_close.opcode); + try testing.expectEqual(file.handle, sqe_close.fd); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + 
const cqe_close = try ring.copy_cqe(); + if (cqe_close.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x44444444, + .res = 0, + .flags = 0, + }, cqe_close); +} + +test "accept/connect/send/recv" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(16, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const socket_test_harness = try createSocketTestHarness(&ring); + defer socket_test_harness.close(); + + const buffer_send = [_]u8{ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 }; + var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; + + const sqe_send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0); + sqe_send.flags |= linux.IOSQE_IO_LINK; + _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); + try testing.expectEqual(@as(u32, 2), try ring.submit()); + + const cqe_send = try ring.copy_cqe(); + if (cqe_send.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xeeeeeeee, + .res = buffer_send.len, + .flags = 0, + }, cqe_send); + + const cqe_recv = try ring.copy_cqe(); + if (cqe_recv.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xffffffff, + .res = buffer_recv.len, + // ignore IORING_CQE_F_SOCK_NONEMPTY since it is only set on some systems + .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, + }, cqe_recv); + + try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); +} + +test "sendmsg/recvmsg" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(2, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer 
ring.deinit(); + + var address_server = try net.Address.parseIp4("127.0.0.1", 0); + + const server = try os.socket(address_server.any.family, os.SOCK.DGRAM, 0); + defer os.close(server); + try os.setsockopt(server, os.SOL.SOCKET, os.SO.REUSEPORT, &mem.toBytes(@as(c_int, 1))); + try os.setsockopt(server, os.SOL.SOCKET, os.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try os.bind(server, &address_server.any, address_server.getOsSockLen()); + + // set address_server to the OS-chosen IP/port. + var slen: os.socklen_t = address_server.getOsSockLen(); + try os.getsockname(server, &address_server.any, &slen); + + const client = try os.socket(address_server.any.family, os.SOCK.DGRAM, 0); + defer os.close(client); + + const buffer_send = [_]u8{42} ** 128; + const iovecs_send = [_]os.iovec_const{ + os.iovec_const{ .iov_base = &buffer_send, .iov_len = buffer_send.len }, + }; + const msg_send = os.msghdr_const{ + .name = &address_server.any, + .namelen = address_server.getOsSockLen(), + .iov = &iovecs_send, + .iovlen = 1, + .control = null, + .controllen = 0, + .flags = 0, + }; + const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, 0); + sqe_sendmsg.flags |= linux.IOSQE_IO_LINK; + try testing.expectEqual(linux.IORING_OP.SENDMSG, sqe_sendmsg.opcode); + try testing.expectEqual(client, sqe_sendmsg.fd); + + var buffer_recv = [_]u8{0} ** 128; + var iovecs_recv = [_]os.iovec{ + os.iovec{ .iov_base = &buffer_recv, .iov_len = buffer_recv.len }, + }; + const addr = [_]u8{0} ** 4; + var address_recv = net.Address.initIp4(addr, 0); + var msg_recv: os.msghdr = os.msghdr{ + .name = &address_recv.any, + .namelen = address_recv.getOsSockLen(), + .iov = &iovecs_recv, + .iovlen = 1, + .control = null, + .controllen = 0, + .flags = 0, + }; + const sqe_recvmsg = try ring.recvmsg(0x22222222, server, &msg_recv, 0); + try testing.expectEqual(linux.IORING_OP.RECVMSG, sqe_recvmsg.opcode); + try testing.expectEqual(server, sqe_recvmsg.fd); + + try testing.expectEqual(@as(u32, 2), 
ring.sq_ready()); + try testing.expectEqual(@as(u32, 2), try ring.submit_and_wait(2)); + try testing.expectEqual(@as(u32, 0), ring.sq_ready()); + try testing.expectEqual(@as(u32, 2), ring.cq_ready()); + + const cqe_sendmsg = try ring.copy_cqe(); + if (cqe_sendmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x11111111, + .res = buffer_send.len, + .flags = 0, + }, cqe_sendmsg); + + const cqe_recvmsg = try ring.copy_cqe(); + if (cqe_recvmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x22222222, + .res = buffer_recv.len, + // ignore IORING_CQE_F_SOCK_NONEMPTY since it is set non-deterministically + .flags = cqe_recvmsg.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, + }, cqe_recvmsg); + + try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); +} + +test "timeout (after a relative time)" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const ms = 10; + const margin = 5; + const ts = os.linux.kernel_timespec{ .tv_sec = 0, .tv_nsec = ms * 1000000 }; + + const started = std.time.milliTimestamp(); + const sqe = try ring.timeout(0x55555555, &ts, 0, 0); + try testing.expectEqual(linux.IORING_OP.TIMEOUT, sqe.opcode); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe = try ring.copy_cqe(); + const stopped = std.time.milliTimestamp(); + + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x55555555, + .res = -@as(i32, @intFromEnum(linux.E.TIME)), + .flags = 0, + }, cqe); + + // Tests should not depend on timings: skip test if outside margin. 
+ if (!std.math.approxEqAbs(f64, ms, @as(f64, @floatFromInt(stopped - started)), margin)) return error.SkipZigTest; +} + +test "timeout (after a number of completions)" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(2, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const ts = os.linux.kernel_timespec{ .tv_sec = 3, .tv_nsec = 0 }; + const count_completions: u64 = 1; + const sqe_timeout = try ring.timeout(0x66666666, &ts, count_completions, 0); + try testing.expectEqual(linux.IORING_OP.TIMEOUT, sqe_timeout.opcode); + try testing.expectEqual(count_completions, sqe_timeout.off); + _ = try ring.nop(0x77777777); + try testing.expectEqual(@as(u32, 2), try ring.submit()); + + const cqe_nop = try ring.copy_cqe(); + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x77777777, + .res = 0, + .flags = 0, + }, cqe_nop); + + const cqe_timeout = try ring.copy_cqe(); + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x66666666, + .res = 0, + .flags = 0, + }, cqe_timeout); +} + +test "timeout_remove" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(2, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const ts = os.linux.kernel_timespec{ .tv_sec = 3, .tv_nsec = 0 }; + const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, 0); + try testing.expectEqual(linux.IORING_OP.TIMEOUT, sqe_timeout.opcode); + try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout.user_data); + + const sqe_timeout_remove = try ring.timeout_remove(0x99999999, 0x88888888, 0); + try testing.expectEqual(linux.IORING_OP.TIMEOUT_REMOVE, sqe_timeout_remove.opcode); + try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout_remove.addr); 
+ try testing.expectEqual(@as(u64, 0x99999999), sqe_timeout_remove.user_data); + + try testing.expectEqual(@as(u32, 2), try ring.submit()); + + // The order in which the CQE arrive is not clearly documented and it changed with kernel 5.18: + // * kernel 5.10 gives user data 0x88888888 first, 0x99999999 second + // * kernel 5.18 gives user data 0x99999999 first, 0x88888888 second + + var cqes: [2]os.linux.io_uring_cqe = undefined; + cqes[0] = try ring.copy_cqe(); + cqes[1] = try ring.copy_cqe(); + + for (cqes) |cqe| { + // IORING_OP_TIMEOUT_REMOVE is not supported by this kernel version: + // Timeout remove operations set the fd to -1, which results in EBADF before EINVAL. + // We use IORING_FEAT_RW_CUR_POS as a safety check here to make sure we are at least pre-5.6. + // We don't want to skip this test for newer kernels. + if (cqe.user_data == 0x99999999 and + cqe.err() == .BADF and + (ring.features & linux.IORING_FEAT_RW_CUR_POS) == 0) + { + return error.SkipZigTest; + } + + try testing.expect(cqe.user_data == 0x88888888 or cqe.user_data == 0x99999999); + + if (cqe.user_data == 0x88888888) { + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x88888888, + .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), + .flags = 0, + }, cqe); + } else if (cqe.user_data == 0x99999999) { + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x99999999, + .res = 0, + .flags = 0, + }, cqe); + } + } +} + +test "accept/connect/recv/link_timeout" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(16, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const socket_test_harness = try createSocketTestHarness(&ring); + defer socket_test_harness.close(); + + var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; + + const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = 
buffer_recv[0..] }, 0); + sqe_recv.flags |= linux.IOSQE_IO_LINK; + + const ts = os.linux.kernel_timespec{ .tv_sec = 0, .tv_nsec = 1000000 }; + _ = try ring.link_timeout(0x22222222, &ts, 0); + + const nr_wait = try ring.submit(); + try testing.expectEqual(@as(u32, 2), nr_wait); + + var i: usize = 0; + while (i < nr_wait) : (i += 1) { + const cqe = try ring.copy_cqe(); + switch (cqe.user_data) { + 0xffffffff => { + if (cqe.res != -@as(i32, @intFromEnum(linux.E.INTR)) and + cqe.res != -@as(i32, @intFromEnum(linux.E.CANCELED))) + { + std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res }); + try testing.expect(false); + } + }, + 0x22222222 => { + if (cqe.res != -@as(i32, @intFromEnum(linux.E.ALREADY)) and + cqe.res != -@as(i32, @intFromEnum(linux.E.TIME))) + { + std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res }); + try testing.expect(false); + } + }, + else => @panic("should not happen"), + } + } +} + +test "fallocate" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const path = "test_io_uring_fallocate"; + const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); + defer file.close(); + + try testing.expectEqual(@as(u64, 0), (try file.stat()).size); + + const len: u64 = 65536; + const sqe = try ring.fallocate(0xaaaaaaaa, file.handle, 0, 0, len); + try testing.expectEqual(linux.IORING_OP.FALLOCATE, sqe.opcode); + try testing.expectEqual(file.handle, sqe.fd); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement fallocate(): + .INVAL => return error.SkipZigTest, + // This kernel does not implement 
fallocate(): + .NOSYS => return error.SkipZigTest, + // The filesystem containing the file referred to by fd does not support this operation; + // or the mode is not supported by the filesystem containing the file referred to by fd: + .OPNOTSUPP => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xaaaaaaaa, + .res = 0, + .flags = 0, + }, cqe); + + try testing.expectEqual(len, (try file.stat()).size); +} + +test "statx" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const path = "test_io_uring_statx"; + const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); + defer file.close(); + + try testing.expectEqual(@as(u64, 0), (try file.stat()).size); + + try file.writeAll("foobar"); + + var buf: linux.Statx = undefined; + const sqe = try ring.statx( + 0xaaaaaaaa, + tmp.dir.fd, + path, + 0, + linux.STATX_SIZE, + &buf, + ); + try testing.expectEqual(linux.IORING_OP.STATX, sqe.opcode); + try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement statx(): + .INVAL => return error.SkipZigTest, + // This kernel does not implement statx(): + .NOSYS => return error.SkipZigTest, + // The filesystem containing the file referred to by fd does not support this operation; + // or the mode is not supported by the filesystem containing the file referred to by fd: + .OPNOTSUPP => return error.SkipZigTest, + // not supported on older kernels (5.4) + .BADF => return error.SkipZigTest, + 
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xaaaaaaaa, + .res = 0, + .flags = 0, + }, cqe); + + try testing.expect(buf.mask & os.linux.STATX_SIZE == os.linux.STATX_SIZE); + try testing.expectEqual(@as(u64, 6), buf.size); +} + +test "accept/connect/recv/cancel" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(16, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const socket_test_harness = try createSocketTestHarness(&ring); + defer socket_test_harness.close(); + + var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; + + _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, 0); + try testing.expectEqual(linux.IORING_OP.ASYNC_CANCEL, sqe_cancel.opcode); + try testing.expectEqual(@as(u64, 0xffffffff), sqe_cancel.addr); + try testing.expectEqual(@as(u64, 0x99999999), sqe_cancel.user_data); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + var cqe_recv = try ring.copy_cqe(); + if (cqe_recv.err() == .INVAL) return error.SkipZigTest; + var cqe_cancel = try ring.copy_cqe(); + if (cqe_cancel.err() == .INVAL) return error.SkipZigTest; + + // The recv/cancel CQEs may arrive in any order, the recv CQE will sometimes come first: + if (cqe_recv.user_data == 0x99999999 and cqe_cancel.user_data == 0xffffffff) { + const a = cqe_recv; + const b = cqe_cancel; + cqe_recv = b; + cqe_cancel = a; + } + + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xffffffff, + .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), + .flags = 0, + }, cqe_recv); + + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x99999999, + .res = 0, + .flags = 0, + }, 
cqe_cancel); +} + +test "register_files_update" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const fd = try os.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); + defer os.close(fd); + + var registered_fds = [_]os.fd_t{0} ** 2; + const fd_index = 0; + const fd_index2 = 1; + registered_fds[fd_index] = fd; + registered_fds[fd_index2] = -1; + + ring.register_files(registered_fds[0..]) catch |err| switch (err) { + // Happens when the kernel doesn't support sparse entry (-1) in the file descriptors array. + error.FileDescriptorInvalid => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + }; + + // Test IORING_REGISTER_FILES_UPDATE + // Only available since Linux 5.5 + + const fd2 = try os.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); + defer os.close(fd2); + + registered_fds[fd_index] = fd2; + registered_fds[fd_index2] = -1; + try ring.register_files_update(0, registered_fds[0..]); + + var buffer = [_]u8{42} ** 128; + { + const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); + try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); + sqe.flags |= linux.IOSQE_FIXED_FILE; + + try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xcccccccc, + .res = buffer.len, + .flags = 0, + }, try ring.copy_cqe()); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + } + + // Test with a non-zero offset + + registered_fds[fd_index] = -1; + registered_fds[fd_index2] = -1; + try ring.register_files_update(1, registered_fds[1..]); + + { + // Next read should still work since fd_index in the registered file descriptors hasn't been updated yet. 
+ const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); + try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); + sqe.flags |= linux.IOSQE_FIXED_FILE; + + try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xcccccccc, + .res = buffer.len, + .flags = 0, + }, try ring.copy_cqe()); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + } + + try ring.register_files_update(0, registered_fds[0..]); + + { + // Now this should fail since both fds are sparse (-1) + const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); + try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); + sqe.flags |= linux.IOSQE_FIXED_FILE; + + try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe = try ring.copy_cqe(); + try testing.expectEqual(os.linux.E.BADF, cqe.err()); + } + + try ring.unregister_files(); +} + +test "shutdown" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(16, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var address = try net.Address.parseIp4("127.0.0.1", 0); + + // Socket bound, expect shutdown to work + { + const server = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); + defer os.close(server); + try os.setsockopt(server, os.SOL.SOCKET, os.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try os.bind(server, &address.any, address.getOsSockLen()); + try os.listen(server, 1); + + // set address to the OS-chosen IP/port. 
+ var slen: os.socklen_t = address.getOsSockLen(); + try os.getsockname(server, &address.any, &slen); + + const shutdown_sqe = try ring.shutdown(0x445445445, server, os.linux.SHUT.RD); + try testing.expectEqual(linux.IORING_OP.SHUTDOWN, shutdown_sqe.opcode); + try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); + + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement shutdown (kernel version < 5.11) + .INVAL => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x445445445, + .res = 0, + .flags = 0, + }, cqe); + } + + // Socket not bound, expect to fail with ENOTCONN + { + const server = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); + defer os.close(server); + + const shutdown_sqe = ring.shutdown(0x445445445, server, os.linux.SHUT.RD) catch |err| switch (err) { + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + }; + try testing.expectEqual(linux.IORING_OP.SHUTDOWN, shutdown_sqe.opcode); + try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); + + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + try testing.expectEqual(@as(u64, 0x445445445), cqe.user_data); + try testing.expectEqual(os.linux.E.NOTCONN, cqe.err()); + } +} + +test "renameat" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const old_path = "test_io_uring_renameat_old"; + const new_path = "test_io_uring_renameat_new"; + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + // Write old file with data + + const old_file = 
try tmp.dir.createFile(old_path, .{ .truncate = true, .mode = 0o666 }); + defer old_file.close(); + try old_file.writeAll("hello"); + + // Submit renameat + + const sqe = try ring.renameat( + 0x12121212, + tmp.dir.fd, + old_path, + tmp.dir.fd, + new_path, + 0, + ); + try testing.expectEqual(linux.IORING_OP.RENAMEAT, sqe.opcode); + try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); + try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement renameat (kernel version < 5.11) + .BADF, .INVAL => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x12121212, + .res = 0, + .flags = 0, + }, cqe); + + // Validate that the old file doesn't exist anymore + { + _ = tmp.dir.openFile(old_path, .{}) catch |err| switch (err) { + error.FileNotFound => {}, + else => std.debug.panic("unexpected error: {}", .{err}), + }; + } + + // Validate that the new file exists with the proper content + { + const new_file = try tmp.dir.openFile(new_path, .{}); + defer new_file.close(); + + var new_file_data: [16]u8 = undefined; + const bytes_read = try new_file.readAll(&new_file_data); + try testing.expectEqualStrings("hello", new_file_data[0..bytes_read]); + } +} + +test "unlinkat" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const path = "test_io_uring_unlinkat"; + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + // Write old file with data + + const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); + defer 
file.close(); + + // Submit unlinkat + + const sqe = try ring.unlinkat( + 0x12121212, + tmp.dir.fd, + path, + 0, + ); + try testing.expectEqual(linux.IORING_OP.UNLINKAT, sqe.opcode); + try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement unlinkat (kernel version < 5.11) + .BADF, .INVAL => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x12121212, + .res = 0, + .flags = 0, + }, cqe); + + // Validate that the file doesn't exist anymore + _ = tmp.dir.openFile(path, .{}) catch |err| switch (err) { + error.FileNotFound => {}, + else => std.debug.panic("unexpected error: {}", .{err}), + }; +} + +test "mkdirat" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const path = "test_io_uring_mkdirat"; + + // Submit mkdirat + + const sqe = try ring.mkdirat( + 0x12121212, + tmp.dir.fd, + path, + 0o0755, + ); + try testing.expectEqual(linux.IORING_OP.MKDIRAT, sqe.opcode); + try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement mkdirat (kernel version < 5.15) + .BADF, .INVAL => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x12121212, + .res = 0, + .flags = 0, + }, cqe); + + // Validate that the 
directory exist + _ = try tmp.dir.openDir(path, .{}); +} + +test "symlinkat" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const path = "test_io_uring_symlinkat"; + const link_path = "test_io_uring_symlinkat_link"; + + const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); + defer file.close(); + + // Submit symlinkat + + const sqe = try ring.symlinkat( + 0x12121212, + path, + tmp.dir.fd, + link_path, + ); + try testing.expectEqual(linux.IORING_OP.SYMLINKAT, sqe.opcode); + try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement symlinkat (kernel version < 5.15) + .BADF, .INVAL => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x12121212, + .res = 0, + .flags = 0, + }, cqe); + + // Validate that the symlink exist + _ = try tmp.dir.openFile(link_path, .{}); +} + +test "linkat" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const first_path = "test_io_uring_linkat_first"; + const second_path = "test_io_uring_linkat_second"; + + // Write file with data + + const first_file = try tmp.dir.createFile(first_path, .{ .truncate = true, .mode = 0o666 }); + defer 
first_file.close(); + try first_file.writeAll("hello"); + + // Submit linkat + + const sqe = try ring.linkat( + 0x12121212, + tmp.dir.fd, + first_path, + tmp.dir.fd, + second_path, + 0, + ); + try testing.expectEqual(linux.IORING_OP.LINKAT, sqe.opcode); + try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); + try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement linkat (kernel version < 5.15) + .BADF, .INVAL => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0x12121212, + .res = 0, + .flags = 0, + }, cqe); + + // Validate the second file + const second_file = try tmp.dir.openFile(second_path, .{}); + defer second_file.close(); + + var second_file_data: [16]u8 = undefined; + const bytes_read = try second_file.readAll(&second_file_data); + try testing.expectEqualStrings("hello", second_file_data[0..bytes_read]); +} + +test "provide_buffers: read" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const fd = try os.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); + defer os.close(fd); + + const group_id = 1337; + const buffer_id = 0; + + const buffer_len = 128; + + var buffers: [4][buffer_len]u8 = undefined; + + // Provide 4 buffers + + { + const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); + try testing.expectEqual(linux.IORING_OP.PROVIDE_BUFFERS, sqe.opcode); + try testing.expectEqual(@as(i32, buffers.len), sqe.fd); + try 
testing.expectEqual(@as(u32, buffers[0].len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + // Happens when the kernel is < 5.7 + .INVAL => return error.SkipZigTest, + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + } + + // Do 4 reads which should consume all buffers + + var i: usize = 0; + while (i < buffers.len) : (i += 1) { + const sqe = try ring.read(0xdededede, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); + try testing.expectEqual(@as(i32, fd), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + + try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); + const used_buffer_id = cqe.flags >> 16; + try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); + try testing.expectEqual(@as(i32, buffer_len), cqe.res); + + try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + } + + // This read should fail + + { + const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); + try testing.expectEqual(@as(i32, fd), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, 
buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + // Expected + .NOBUFS => {}, + .SUCCESS => std.debug.panic("unexpected success", .{}), + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + } + + // Provide 1 buffer again + + // Deliberately put something we don't expect in the buffers + @memset(mem.sliceAsBytes(&buffers), 42); + + const reprovided_buffer_id = 2; + + { + _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + } + + // Final read which should work + + { + const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); + try testing.expectEqual(@as(i32, fd), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + + try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); + const used_buffer_id = cqe.flags >> 16; + try testing.expectEqual(used_buffer_id, reprovided_buffer_id); + try testing.expectEqual(@as(i32, buffer_len), cqe.res); + try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + try testing.expectEqualSlices(u8, &([_]u8{0} ** 
buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + } +} + +test "remove_buffers" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const fd = try os.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); + defer os.close(fd); + + const group_id = 1337; + const buffer_id = 0; + + const buffer_len = 128; + + var buffers: [4][buffer_len]u8 = undefined; + + // Provide 4 buffers + + { + _ = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .INVAL => return error.SkipZigTest, + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + } + + // Remove 3 buffers + + { + const sqe = try ring.remove_buffers(0xbababababa, 3, group_id); + try testing.expectEqual(linux.IORING_OP.REMOVE_BUFFERS, sqe.opcode); + try testing.expectEqual(@as(i32, 3), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xbababababa), cqe.user_data); + } + + // This read should work + + { + _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| 
std.debug.panic("unhandled errno: {}", .{errno}), + } + + try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); + const used_buffer_id = cqe.flags >> 16; + try testing.expect(used_buffer_id >= 0 and used_buffer_id < 4); + try testing.expectEqual(@as(i32, buffer_len), cqe.res); + try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + } + + // Final read should _not_ work + + { + _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + // Expected + .NOBUFS => {}, + .SUCCESS => std.debug.panic("unexpected success", .{}), + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + } +} + +test "provide_buffers: accept/connect/send/recv" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(16, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const group_id = 1337; + const buffer_id = 0; + + const buffer_len = 128; + var buffers: [4][buffer_len]u8 = undefined; + + // Provide 4 buffers + + { + const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); + try testing.expectEqual(linux.IORING_OP.PROVIDE_BUFFERS, sqe.opcode); + try testing.expectEqual(@as(i32, buffers.len), sqe.fd); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + // Happens when the kernel is < 5.7 + .INVAL => return 
error.SkipZigTest, + // Happens on the kernel 5.4 + .BADF => return error.SkipZigTest, + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + } + + const socket_test_harness = try createSocketTestHarness(&ring); + defer socket_test_harness.close(); + + // Do 4 send on the socket + + { + var i: usize = 0; + while (i < buffers.len) : (i += 1) { + _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'z'} ** buffer_len), 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + } + + var cqes: [4]linux.io_uring_cqe = undefined; + try testing.expectEqual(@as(u32, 4), try ring.copy_cqes(&cqes, 4)); + } + + // Do 4 recv which should consume all buffers + + // Deliberately put something we don't expect in the buffers + @memset(mem.sliceAsBytes(&buffers), 1); + + var i: usize = 0; + while (i < buffers.len) : (i += 1) { + const sqe = try ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(linux.IORING_OP.RECV, sqe.opcode); + try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 0), sqe.rw_flags); + try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + + try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); + const used_buffer_id = cqe.flags >> 16; + try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); + try testing.expectEqual(@as(i32, buffer_len), cqe.res); + + try 
testing.expectEqual(@as(u64, 0xdededede), cqe.user_data); + const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]; + try testing.expectEqualSlices(u8, &([_]u8{'z'} ** buffer_len), buffer); + } + + // This recv should fail + + { + const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(linux.IORING_OP.RECV, sqe.opcode); + try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 0), sqe.rw_flags); + try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + // Expected + .NOBUFS => {}, + .SUCCESS => std.debug.panic("unexpected success", .{}), + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + } + + // Provide 1 buffer again + + const reprovided_buffer_id = 2; + + { + _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + } + + // Redo 1 send on the server socket + + { + _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'w'} ** buffer_len), 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + _ = try ring.copy_cqe(); + } + + // Final recv which should work + + // Deliberately put something we don't expect in the buffers + @memset(mem.sliceAsBytes(&buffers), 1); + + { + const sqe = try 
ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(linux.IORING_OP.RECV, sqe.opcode); + try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 0), sqe.rw_flags); + try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + + try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); + const used_buffer_id = cqe.flags >> 16; + try testing.expectEqual(used_buffer_id, reprovided_buffer_id); + try testing.expectEqual(@as(i32, buffer_len), cqe.res); + try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]; + try testing.expectEqualSlices(u8, &([_]u8{'w'} ** buffer_len), buffer); + } +} + +/// Used for testing server/client interactions. 
+const SocketTestHarness = struct { + listener: os.socket_t, + server: os.socket_t, + client: os.socket_t, + + fn close(self: SocketTestHarness) void { + posix.close(self.client); + posix.close(self.listener); + } +}; + +fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness { + // Create a TCP server socket + var address = try net.Address.parseIp4("127.0.0.1", 0); + const listener_socket = try createListenerSocket(&address); + errdefer posix.close(listener_socket); + + // Submit 1 accept + var accept_addr: os.sockaddr = undefined; + var accept_addr_len: os.socklen_t = @sizeOf(@TypeOf(accept_addr)); + _ = try ring.accept(0xaaaaaaaa, listener_socket, &accept_addr, &accept_addr_len, 0); + + // Create a TCP client socket + const client = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); + errdefer posix.close(client); + _ = try ring.connect(0xcccccccc, client, &address.any, address.getOsSockLen()); + + try testing.expectEqual(@as(u32, 2), try ring.submit()); + + var cqe_accept = try ring.copy_cqe(); + if (cqe_accept.err() == .INVAL) return error.SkipZigTest; + var cqe_connect = try ring.copy_cqe(); + if (cqe_connect.err() == .INVAL) return error.SkipZigTest; + + // The accept/connect CQEs may arrive in any order, the connect CQE will sometimes come first: + if (cqe_accept.user_data == 0xcccccccc and cqe_connect.user_data == 0xaaaaaaaa) { + const a = cqe_accept; + const b = cqe_connect; + cqe_accept = b; + cqe_connect = a; + } + + try testing.expectEqual(@as(u64, 0xaaaaaaaa), cqe_accept.user_data); + if (cqe_accept.res <= 0) std.debug.print("\ncqe_accept.res={}\n", .{cqe_accept.res}); + try testing.expect(cqe_accept.res > 0); + try testing.expectEqual(@as(u32, 0), cqe_accept.flags); + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xcccccccc, + .res = 0, + .flags = 0, + }, cqe_connect); + + // All good + + return SocketTestHarness{ + .listener = listener_socket, + .server = cqe_accept.res, + .client = client, + }; +} + +fn 
createListenerSocket(address: *net.Address) !os.socket_t { + const kernel_backlog = 1; + const listener_socket = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); + errdefer posix.close(listener_socket); + + try os.setsockopt(listener_socket, os.SOL.SOCKET, os.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try os.bind(listener_socket, &address.any, address.getOsSockLen()); + try os.listen(listener_socket, kernel_backlog); + + // set address to the OS-chosen IP/port. + var slen: os.socklen_t = address.getOsSockLen(); + try os.getsockname(listener_socket, &address.any, &slen); + + return listener_socket; +} + +test "accept multishot" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IoUring.init(16, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var address = try net.Address.parseIp4("127.0.0.1", 0); + const listener_socket = try createListenerSocket(&address); + defer posix.close(listener_socket); + + // submit multishot accept operation + var addr: os.sockaddr = undefined; + var addr_len: os.socklen_t = @sizeOf(@TypeOf(addr)); + const userdata: u64 = 0xaaaaaaaa; + _ = try ring.accept_multishot(userdata, listener_socket, &addr, &addr_len, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + var nr: usize = 4; // number of clients to connect + while (nr > 0) : (nr -= 1) { + // connect client + const client = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); + errdefer posix.close(client); + try os.connect(client, &address.any, address.getOsSockLen()); + + // test accept completion + var cqe = try ring.copy_cqe(); + if (cqe.err() == .INVAL) return error.SkipZigTest; + try testing.expect(cqe.res > 0); + try testing.expect(cqe.user_data == userdata); + try testing.expect(cqe.flags & linux.IORING_CQE_F_MORE > 0); // more flag is set + + 
posix.close(client); + } +} + +test "accept/connect/send_zc/recv" { + try skipKernelLessThan(.{ .major = 6, .minor = 0, .patch = 0 }); + + var ring = IoUring.init(16, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const socket_test_harness = try createSocketTestHarness(&ring); + defer socket_test_harness.close(); + + const buffer_send = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; + var buffer_recv = [_]u8{0} ** 10; + + // zero-copy send + const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0, 0); + sqe_send.flags |= linux.IOSQE_IO_LINK; + _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); + try testing.expectEqual(@as(u32, 2), try ring.submit()); + + // First completion of zero-copy send. + // IORING_CQE_F_MORE, means that there + // will be a second completion event / notification for the + // request, with the user_data field set to the same value. + // buffer_send must be keep alive until second cqe. + var cqe_send = try ring.copy_cqe(); + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xeeeeeeee, + .res = buffer_send.len, + .flags = linux.IORING_CQE_F_MORE, + }, cqe_send); + + const cqe_recv = try ring.copy_cqe(); + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xffffffff, + .res = buffer_recv.len, + .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, + }, cqe_recv); + + try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); + + // Second completion of zero-copy send. 
+ // IORING_CQE_F_NOTIF in flags signals that kernel is done with send_buffer + cqe_send = try ring.copy_cqe(); + try testing.expectEqual(linux.io_uring_cqe{ + .user_data = 0xeeeeeeee, + .res = 0, + .flags = linux.IORING_CQE_F_NOTIF, + }, cqe_send); +} + +test "accept_direct" { + try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + var address = try net.Address.parseIp4("127.0.0.1", 0); + + // register direct file descriptors + var registered_fds = [_]os.fd_t{-1} ** 2; + try ring.register_files(registered_fds[0..]); + + const listener_socket = try createListenerSocket(&address); + defer posix.close(listener_socket); + + const accept_userdata: u64 = 0xaaaaaaaa; + const read_userdata: u64 = 0xbbbbbbbb; + const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; + + for (0..2) |_| { + for (registered_fds, 0..) |_, i| { + var buffer_recv = [_]u8{0} ** 16; + const buffer_send: []const u8 = data[0 .. 
data.len - i]; // make it different at each loop + + // submit accept, will chose registered fd and return index in cqe + _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + // connect + const client = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); + try os.connect(client, &address.any, address.getOsSockLen()); + defer posix.close(client); + + // accept completion + const cqe_accept = try ring.copy_cqe(); + try testing.expectEqual(os.E.SUCCESS, cqe_accept.err()); + const fd_index = cqe_accept.res; + try testing.expect(fd_index < registered_fds.len); + try testing.expect(cqe_accept.user_data == accept_userdata); + + // send data + _ = try os.send(client, buffer_send, 0); + + // Example of how to use registered fd: + // Submit receive to fixed file returned by accept (fd_index). + // Fd field is set to registered file index, returned by accept. + // Flag linux.IOSQE_FIXED_FILE must be set. 
+ const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, 0); + recv_sqe.flags |= linux.IOSQE_FIXED_FILE; + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + // accept receive + const recv_cqe = try ring.copy_cqe(); + try testing.expect(recv_cqe.user_data == read_userdata); + try testing.expect(recv_cqe.res == buffer_send.len); + try testing.expectEqualSlices(u8, buffer_send, buffer_recv[0..buffer_send.len]); + } + // no more available fds, accept will get NFILE error + { + // submit accept + _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + // connect + const client = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); + try os.connect(client, &address.any, address.getOsSockLen()); + defer posix.close(client); + // completion with error + const cqe_accept = try ring.copy_cqe(); + try testing.expect(cqe_accept.user_data == accept_userdata); + try testing.expectEqual(os.E.NFILE, cqe_accept.err()); + } + // return file descriptors to kernel + try ring.register_files_update(0, registered_fds[0..]); + } + try ring.unregister_files(); +} + +test "accept_multishot_direct" { + try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var address = try net.Address.parseIp4("127.0.0.1", 0); + + var registered_fds = [_]os.fd_t{-1} ** 2; + try ring.register_files(registered_fds[0..]); + + const listener_socket = try createListenerSocket(&address); + defer posix.close(listener_socket); + + const accept_userdata: u64 = 0xaaaaaaaa; + + for (0..2) |_| { + // submit multishot accept + // Will chose registered fd and return index of the selected registered file in cqe. 
+ _ = try ring.accept_multishot_direct(accept_userdata, listener_socket, null, null, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + for (registered_fds) |_| { + // connect + const client = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); + try os.connect(client, &address.any, address.getOsSockLen()); + defer posix.close(client); + + // accept completion + const cqe_accept = try ring.copy_cqe(); + const fd_index = cqe_accept.res; + try testing.expect(fd_index < registered_fds.len); + try testing.expect(cqe_accept.user_data == accept_userdata); + try testing.expect(cqe_accept.flags & linux.IORING_CQE_F_MORE > 0); // has more is set + } + // No more available fds, accept will get NFILE error. + // Multishot is terminated (more flag is not set). + { + // connect + const client = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); + try os.connect(client, &address.any, address.getOsSockLen()); + defer posix.close(client); + // completion with error + const cqe_accept = try ring.copy_cqe(); + try testing.expect(cqe_accept.user_data == accept_userdata); + try testing.expectEqual(os.E.NFILE, cqe_accept.err()); + try testing.expect(cqe_accept.flags & linux.IORING_CQE_F_MORE == 0); // has more is not set + } + // return file descriptors to kernel + try ring.register_files_update(0, registered_fds[0..]); + } + try ring.unregister_files(); +} + +test "socket" { + try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); + + var ring = IoUring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + // prepare, submit socket operation + _ = try ring.socket(0, linux.AF.INET, os.SOCK.STREAM, 0, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + // test completion + var cqe = try ring.copy_cqe(); + try testing.expectEqual(os.E.SUCCESS, cqe.err()); + const 
fd: os.fd_t = @intCast(cqe.res); + try testing.expect(fd > 2); + + os.close(fd); +} + +test "socket_direct/socket_direct_alloc/close_direct" { + try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); + + var ring = IoUring.init(2, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var registered_fds = [_]os.fd_t{-1} ** 3; + try ring.register_files(registered_fds[0..]); + + // create socket in registered file descriptor at index 0 (last param) + _ = try ring.socket_direct(0, linux.AF.INET, os.SOCK.STREAM, 0, 0, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + var cqe_socket = try ring.copy_cqe(); + try testing.expectEqual(os.E.SUCCESS, cqe_socket.err()); + try testing.expect(cqe_socket.res == 0); + + // create socket in registered file descriptor at index 1 (last param) + _ = try ring.socket_direct(0, linux.AF.INET, os.SOCK.STREAM, 0, 0, 1); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + cqe_socket = try ring.copy_cqe(); + try testing.expectEqual(os.E.SUCCESS, cqe_socket.err()); + try testing.expect(cqe_socket.res == 0); // res is 0 when index is specified + + // create socket in kernel chosen file descriptor index (_alloc version) + // completion res has index from registered files + _ = try ring.socket_direct_alloc(0, linux.AF.INET, os.SOCK.STREAM, 0, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + cqe_socket = try ring.copy_cqe(); + try testing.expectEqual(os.E.SUCCESS, cqe_socket.err()); + try testing.expect(cqe_socket.res == 2); // returns registered file index + + // use sockets from registered_fds in connect operation + var address = try net.Address.parseIp4("127.0.0.1", 0); + const listener_socket = try createListenerSocket(&address); + defer posix.close(listener_socket); + const accept_userdata: u64 = 0xaaaaaaaa; + const connect_userdata: u64 = 0xbbbbbbbb; + const 
close_userdata: u64 = 0xcccccccc; + for (registered_fds, 0..) |_, fd_index| { + // prepare accept + _ = try ring.accept(accept_userdata, listener_socket, null, null, 0); + // prepare connect with fixed socket + const connect_sqe = try ring.connect(connect_userdata, @intCast(fd_index), &address.any, address.getOsSockLen()); + connect_sqe.flags |= linux.IOSQE_FIXED_FILE; // fd is fixed file index + // submit both + try testing.expectEqual(@as(u32, 2), try ring.submit()); + // get completions + var cqe_connect = try ring.copy_cqe(); + var cqe_accept = try ring.copy_cqe(); + // ignore order + if (cqe_connect.user_data == accept_userdata and cqe_accept.user_data == connect_userdata) { + const a = cqe_accept; + const b = cqe_connect; + cqe_accept = b; + cqe_connect = a; + } + // test connect completion + try testing.expect(cqe_connect.user_data == connect_userdata); + try testing.expectEqual(os.E.SUCCESS, cqe_connect.err()); + // test accept completion + try testing.expect(cqe_accept.user_data == accept_userdata); + try testing.expectEqual(os.E.SUCCESS, cqe_accept.err()); + + // submit and test close_direct + _ = try ring.close_direct(close_userdata, @intCast(fd_index)); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + var cqe_close = try ring.copy_cqe(); + try testing.expect(cqe_close.user_data == close_userdata); + try testing.expectEqual(os.E.SUCCESS, cqe_close.err()); + } + + try ring.unregister_files(); +} + +test "openat_direct/close_direct" { + try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); + + var ring = IoUring.init(2, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var registered_fds = [_]os.fd_t{-1} ** 3; + try ring.register_files(registered_fds[0..]); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const path = "test_io_uring_close_direct"; + const flags: linux.O = .{ 
.ACCMODE = .RDWR, .CREAT = true }; + const mode: os.mode_t = 0o666; + const user_data: u64 = 0; + + // use registered file at index 0 (last param) + _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + var cqe = try ring.copy_cqe(); + try testing.expectEqual(os.E.SUCCESS, cqe.err()); + try testing.expect(cqe.res == 0); + + // use registered file at index 1 + _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 1); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + cqe = try ring.copy_cqe(); + try testing.expectEqual(os.E.SUCCESS, cqe.err()); + try testing.expect(cqe.res == 0); // res is 0 when we specify index + + // let kernel choose registered file index + _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, linux.IORING_FILE_INDEX_ALLOC); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + cqe = try ring.copy_cqe(); + try testing.expectEqual(os.E.SUCCESS, cqe.err()); + try testing.expect(cqe.res == 2); // chosen index is in res + + // close all open file descriptors + for (registered_fds, 0..) 
|_, fd_index| { + _ = try ring.close_direct(user_data, @intCast(fd_index)); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + var cqe_close = try ring.copy_cqe(); + try testing.expectEqual(os.E.SUCCESS, cqe_close.err()); + } + try ring.unregister_files(); +} + +test "waitid" { + try skipKernelLessThan(.{ .major = 6, .minor = 7, .patch = 0 }); + + var ring = IoUring.init(16, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const pid = try os.fork(); + if (pid == 0) { + os.exit(7); + } + + var siginfo: os.siginfo_t = undefined; + _ = try ring.waitid(0, .PID, pid, &siginfo, os.W.EXITED, 0); + + try testing.expectEqual(1, try ring.submit()); + + const cqe_waitid = try ring.copy_cqe(); + try testing.expectEqual(0, cqe_waitid.res); + try testing.expectEqual(pid, siginfo.fields.common.first.piduid.pid); + try testing.expectEqual(7, siginfo.fields.common.second.sigchld.status); +} + +/// For use in tests. Returns SkipZigTest is kernel version is less than required. 
+inline fn skipKernelLessThan(required: std.SemanticVersion) !void { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var uts: linux.utsname = undefined; + const res = linux.uname(&uts); + switch (linux.getErrno(res)) { + .SUCCESS => {}, + else => |errno| return os.unexpectedErrno(errno), + } + + const release = mem.sliceTo(&uts.release, 0); + var current = try std.SemanticVersion.parse(release); + current.pre = null; // don't check pre field + if (required.order(current) == .gt) return error.SkipZigTest; +} diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig @@ -1,4228 +0,0 @@ -const std = @import("../../std.zig"); -const builtin = @import("builtin"); -const assert = std.debug.assert; -const mem = std.mem; -const net = std.net; -const os = std.os; -const posix = std.posix; -const linux = os.linux; -const testing = std.testing; - -pub const IO_Uring = struct { - fd: os.fd_t = -1, - sq: SubmissionQueue, - cq: CompletionQueue, - flags: u32, - features: u32, - - /// A friendly way to setup an io_uring, with default linux.io_uring_params. - /// `entries` must be a power of two between 1 and 32768, although the kernel will make the final - /// call on how many entries the submission and completion queues will ultimately have, - /// see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L8027-L8050. - /// Matches the interface of io_uring_queue_init() in liburing. - pub fn init(entries: u16, flags: u32) !IO_Uring { - var params = mem.zeroInit(linux.io_uring_params, .{ - .flags = flags, - .sq_thread_idle = 1000, - }); - return try IO_Uring.init_params(entries, &params); - } - - /// A powerful way to setup an io_uring, if you want to tweak linux.io_uring_params such as submission - /// queue thread cpu affinity or thread idle timeout (the kernel and our default is 1 second). - /// `params` is passed by reference because the kernel needs to modify the parameters. 
- /// Matches the interface of io_uring_queue_init_params() in liburing. - pub fn init_params(entries: u16, p: *linux.io_uring_params) !IO_Uring { - if (entries == 0) return error.EntriesZero; - if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; - - assert(p.sq_entries == 0); - assert(p.cq_entries == 0 or p.flags & linux.IORING_SETUP_CQSIZE != 0); - assert(p.features == 0); - assert(p.wq_fd == 0 or p.flags & linux.IORING_SETUP_ATTACH_WQ != 0); - assert(p.resv[0] == 0); - assert(p.resv[1] == 0); - assert(p.resv[2] == 0); - - const res = linux.io_uring_setup(entries, p); - switch (linux.getErrno(res)) { - .SUCCESS => {}, - .FAULT => return error.ParamsOutsideAccessibleAddressSpace, - // The resv array contains non-zero data, p.flags contains an unsupported flag, - // entries out of bounds, IORING_SETUP_SQ_AFF was specified without IORING_SETUP_SQPOLL, - // or IORING_SETUP_CQSIZE was specified but linux.io_uring_params.cq_entries was invalid: - .INVAL => return error.ArgumentsInvalid, - .MFILE => return error.ProcessFdQuotaExceeded, - .NFILE => return error.SystemFdQuotaExceeded, - .NOMEM => return error.SystemResources, - // IORING_SETUP_SQPOLL was specified but effective user ID lacks sufficient privileges, - // or a container seccomp policy prohibits io_uring syscalls: - .PERM => return error.PermissionDenied, - .NOSYS => return error.SystemOutdated, - else => |errno| return os.unexpectedErrno(errno), - } - const fd = @as(os.fd_t, @intCast(res)); - assert(fd >= 0); - errdefer os.close(fd); - - // Kernel versions 5.4 and up use only one mmap() for the submission and completion queues. - // This is not an optional feature for us... if the kernel does it, we have to do it. - // The thinking on this by the kernel developers was that both the submission and the - // completion queue rings have sizes just over a power of two, but the submission queue ring - // is significantly smaller with u32 slots. 
By bundling both in a single mmap, the kernel - // gets the submission queue ring for free. - // See https://patchwork.kernel.org/patch/11115257 for the kernel patch. - // We do not support the double mmap() done before 5.4, because we want to keep the - // init/deinit mmap paths simple and because io_uring has had many bug fixes even since 5.4. - if ((p.features & linux.IORING_FEAT_SINGLE_MMAP) == 0) { - return error.SystemOutdated; - } - - // Check that the kernel has actually set params and that "impossible is nothing". - assert(p.sq_entries != 0); - assert(p.cq_entries != 0); - assert(p.cq_entries >= p.sq_entries); - - // From here on, we only need to read from params, so pass `p` by value as immutable. - // The completion queue shares the mmap with the submission queue, so pass `sq` there too. - var sq = try SubmissionQueue.init(fd, p.*); - errdefer sq.deinit(); - var cq = try CompletionQueue.init(fd, p.*, sq); - errdefer cq.deinit(); - - // Check that our starting state is as we expect. - assert(sq.head.* == 0); - assert(sq.tail.* == 0); - assert(sq.mask == p.sq_entries - 1); - // Allow flags.* to be non-zero, since the kernel may set IORING_SQ_NEED_WAKEUP at any time. - assert(sq.dropped.* == 0); - assert(sq.array.len == p.sq_entries); - assert(sq.sqes.len == p.sq_entries); - assert(sq.sqe_head == 0); - assert(sq.sqe_tail == 0); - - assert(cq.head.* == 0); - assert(cq.tail.* == 0); - assert(cq.mask == p.cq_entries - 1); - assert(cq.overflow.* == 0); - assert(cq.cqes.len == p.cq_entries); - - return IO_Uring{ - .fd = fd, - .sq = sq, - .cq = cq, - .flags = p.flags, - .features = p.features, - }; - } - - pub fn deinit(self: *IO_Uring) void { - assert(self.fd >= 0); - // The mmaps depend on the fd, so the order of these calls is important: - self.cq.deinit(); - self.sq.deinit(); - os.close(self.fd); - self.fd = -1; - } - - /// Returns a pointer to a vacant SQE, or an error if the submission queue is full. 
- /// We follow the implementation (and atomics) of liburing's `io_uring_get_sqe()` exactly. - /// However, instead of a null we return an error to force safe handling. - /// Any situation where the submission queue is full tends more towards a control flow error, - /// and the null return in liburing is more a C idiom than anything else, for lack of a better - /// alternative. In Zig, we have first-class error handling... so let's use it. - /// Matches the implementation of io_uring_get_sqe() in liburing. - pub fn get_sqe(self: *IO_Uring) !*linux.io_uring_sqe { - const head = @atomicLoad(u32, self.sq.head, .Acquire); - // Remember that these head and tail offsets wrap around every four billion operations. - // We must therefore use wrapping addition and subtraction to avoid a runtime crash. - const next = self.sq.sqe_tail +% 1; - if (next -% head > self.sq.sqes.len) return error.SubmissionQueueFull; - const sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask]; - self.sq.sqe_tail = next; - return sqe; - } - - /// Submits the SQEs acquired via get_sqe() to the kernel. You can call this once after you have - /// called get_sqe() multiple times to setup multiple I/O requests. - /// Returns the number of SQEs submitted, if not used alongside IORING_SETUP_SQPOLL. - /// If the io_uring instance is uses IORING_SETUP_SQPOLL, the value returned on success is not - /// guaranteed to match the amount of actually submitted sqes during this call. A value higher - /// or lower, including 0, may be returned. - /// Matches the implementation of io_uring_submit() in liburing. - pub fn submit(self: *IO_Uring) !u32 { - return self.submit_and_wait(0); - } - - /// Like submit(), but allows waiting for events as well. - /// Returns the number of SQEs submitted. - /// Matches the implementation of io_uring_submit_and_wait() in liburing. 
- pub fn submit_and_wait(self: *IO_Uring, wait_nr: u32) !u32 { - const submitted = self.flush_sq(); - var flags: u32 = 0; - if (self.sq_ring_needs_enter(&flags) or wait_nr > 0) { - if (wait_nr > 0 or (self.flags & linux.IORING_SETUP_IOPOLL) != 0) { - flags |= linux.IORING_ENTER_GETEVENTS; - } - return try self.enter(submitted, wait_nr, flags); - } - return submitted; - } - - /// Tell the kernel we have submitted SQEs and/or want to wait for CQEs. - /// Returns the number of SQEs submitted. - pub fn enter(self: *IO_Uring, to_submit: u32, min_complete: u32, flags: u32) !u32 { - assert(self.fd >= 0); - const res = linux.io_uring_enter(self.fd, to_submit, min_complete, flags, null); - switch (linux.getErrno(res)) { - .SUCCESS => {}, - // The kernel was unable to allocate memory or ran out of resources for the request. - // The application should wait for some completions and try again: - .AGAIN => return error.SystemResources, - // The SQE `fd` is invalid, or IOSQE_FIXED_FILE was set but no files were registered: - .BADF => return error.FileDescriptorInvalid, - // The file descriptor is valid, but the ring is not in the right state. - // See io_uring_register(2) for how to enable the ring. - .BADFD => return error.FileDescriptorInBadState, - // The application attempted to overcommit the number of requests it can have pending. 
- // The application should wait for some completions and try again: - .BUSY => return error.CompletionQueueOvercommitted, - // The SQE is invalid, or valid but the ring was setup with IORING_SETUP_IOPOLL: - .INVAL => return error.SubmissionQueueEntryInvalid, - // The buffer is outside the process' accessible address space, or IORING_OP_READ_FIXED - // or IORING_OP_WRITE_FIXED was specified but no buffers were registered, or the range - // described by `addr` and `len` is not within the buffer registered at `buf_index`: - .FAULT => return error.BufferInvalid, - .NXIO => return error.RingShuttingDown, - // The kernel believes our `self.fd` does not refer to an io_uring instance, - // or the opcode is valid but not supported by this kernel (more likely): - .OPNOTSUPP => return error.OpcodeNotSupported, - // The operation was interrupted by a delivery of a signal before it could complete. - // This can happen while waiting for events with IORING_ENTER_GETEVENTS: - .INTR => return error.SignalInterrupt, - else => |errno| return os.unexpectedErrno(errno), - } - return @as(u32, @intCast(res)); - } - - /// Sync internal state with kernel ring state on the SQ side. - /// Returns the number of all pending events in the SQ ring, for the shared ring. - /// This return value includes previously flushed SQEs, as per liburing. - /// The rationale is to suggest that an io_uring_enter() call is needed rather than not. - /// Matches the implementation of __io_uring_flush_sq() in liburing. - pub fn flush_sq(self: *IO_Uring) u32 { - if (self.sq.sqe_head != self.sq.sqe_tail) { - // Fill in SQEs that we have queued up, adding them to the kernel ring. 
- const to_submit = self.sq.sqe_tail -% self.sq.sqe_head; - var tail = self.sq.tail.*; - var i: usize = 0; - while (i < to_submit) : (i += 1) { - self.sq.array[tail & self.sq.mask] = self.sq.sqe_head & self.sq.mask; - tail +%= 1; - self.sq.sqe_head +%= 1; - } - // Ensure that the kernel can actually see the SQE updates when it sees the tail update. - @atomicStore(u32, self.sq.tail, tail, .Release); - } - return self.sq_ready(); - } - - /// Returns true if we are not using an SQ thread (thus nobody submits but us), - /// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly awakened. - /// For the latter case, we set the SQ thread wakeup flag. - /// Matches the implementation of sq_ring_needs_enter() in liburing. - pub fn sq_ring_needs_enter(self: *IO_Uring, flags: *u32) bool { - assert(flags.* == 0); - if ((self.flags & linux.IORING_SETUP_SQPOLL) == 0) return true; - if ((@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_NEED_WAKEUP) != 0) { - flags.* |= linux.IORING_ENTER_SQ_WAKEUP; - return true; - } - return false; - } - - /// Returns the number of flushed and unflushed SQEs pending in the submission queue. - /// In other words, this is the number of SQEs in the submission queue, i.e. its length. - /// These are SQEs that the kernel is yet to consume. - /// Matches the implementation of io_uring_sq_ready in liburing. - pub fn sq_ready(self: *IO_Uring) u32 { - // Always use the shared ring state (i.e. head and not sqe_head) to avoid going out of sync, - // see https://github.com/axboe/liburing/issues/92. - return self.sq.sqe_tail -% @atomicLoad(u32, self.sq.head, .Acquire); - } - - /// Returns the number of CQEs in the completion queue, i.e. its length. - /// These are CQEs that the application is yet to consume. - /// Matches the implementation of io_uring_cq_ready in liburing. 
- pub fn cq_ready(self: *IO_Uring) u32 { - return @atomicLoad(u32, self.cq.tail, .Acquire) -% self.cq.head.*; - } - - /// Copies as many CQEs as are ready, and that can fit into the destination `cqes` slice. - /// If none are available, enters into the kernel to wait for at most `wait_nr` CQEs. - /// Returns the number of CQEs copied, advancing the CQ ring. - /// Provides all the wait/peek methods found in liburing, but with batching and a single method. - /// The rationale for copying CQEs rather than copying pointers is that pointers are 8 bytes - /// whereas CQEs are not much more at only 16 bytes, and this provides a safer faster interface. - /// Safer, because you no longer need to call cqe_seen(), avoiding idempotency bugs. - /// Faster, because we can now amortize the atomic store release to `cq.head` across the batch. - /// See https://github.com/axboe/liburing/issues/103#issuecomment-686665007. - /// Matches the implementation of io_uring_peek_batch_cqe() in liburing, but supports waiting. - pub fn copy_cqes(self: *IO_Uring, cqes: []linux.io_uring_cqe, wait_nr: u32) !u32 { - const count = self.copy_cqes_ready(cqes); - if (count > 0) return count; - if (self.cq_ring_needs_flush() or wait_nr > 0) { - _ = try self.enter(0, wait_nr, linux.IORING_ENTER_GETEVENTS); - return self.copy_cqes_ready(cqes); - } - return 0; - } - - fn copy_cqes_ready(self: *IO_Uring, cqes: []linux.io_uring_cqe) u32 { - const ready = self.cq_ready(); - const count = @min(cqes.len, ready); - const head = self.cq.head.* & self.cq.mask; - const tail = (self.cq.head.* +% count) & self.cq.mask; - - if (head <= tail) { - // head behind tail -> no wrapping - @memcpy(cqes[0..count], self.cq.cqes[head..tail]); - } else { - // head in front of tail -> buffer wraps - const two_copies_required: bool = self.cq.cqes.len - head < count; - const amount_to_copy_in_first = if (two_copies_required) self.cq.cqes.len - head else count; - @memcpy(cqes[0..amount_to_copy_in_first], self.cq.cqes[head .. 
head + amount_to_copy_in_first]); - if (two_copies_required) { - @memcpy(cqes[amount_to_copy_in_first..count], self.cq.cqes[0..tail]); - } - } - - self.cq_advance(count); - return count; - } - - /// Returns a copy of an I/O completion, waiting for it if necessary, and advancing the CQ ring. - /// A convenience method for `copy_cqes()` for when you don't need to batch or peek. - pub fn copy_cqe(ring: *IO_Uring) !linux.io_uring_cqe { - var cqes: [1]linux.io_uring_cqe = undefined; - while (true) { - const count = try ring.copy_cqes(&cqes, 1); - if (count > 0) return cqes[0]; - } - } - - /// Matches the implementation of cq_ring_needs_flush() in liburing. - pub fn cq_ring_needs_flush(self: *IO_Uring) bool { - return (@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_CQ_OVERFLOW) != 0; - } - - /// For advanced use cases only that implement custom completion queue methods. - /// If you use copy_cqes() or copy_cqe() you must not call cqe_seen() or cq_advance(). - /// Must be called exactly once after a zero-copy CQE has been processed by your application. - /// Not idempotent, calling more than once will result in other CQEs being lost. - /// Matches the implementation of cqe_seen() in liburing. - pub fn cqe_seen(self: *IO_Uring, cqe: *linux.io_uring_cqe) void { - _ = cqe; - self.cq_advance(1); - } - - /// For advanced use cases only that implement custom completion queue methods. - /// Matches the implementation of cq_advance() in liburing. - pub fn cq_advance(self: *IO_Uring, count: u32) void { - if (count > 0) { - // Ensure the kernel only sees the new head value after the CQEs have been read. - @atomicStore(u32, self.cq.head, self.cq.head.* +% count, .Release); - } - } - - /// Queues (but does not submit) an SQE to perform an `fsync(2)`. - /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. - /// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the SQE's `rw_flags`. - /// N.B. 
While SQEs are initiated in the order in which they appear in the submission queue, - /// operations execute in parallel and completions are unordered. Therefore, an application that - /// submits a write followed by an fsync in the submission queue cannot expect the fsync to - /// apply to the write, since the fsync may complete before the write is issued to the disk. - /// You should preferably use `link_with_next_sqe()` on a write's SQE to link it with an fsync, - /// or else insert a full write barrier using `drain_previous_sqes()` when queueing an fsync. - pub fn fsync(self: *IO_Uring, user_data: u64, fd: os.fd_t, flags: u32) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_fsync(sqe, fd, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a no-op. - /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. - /// A no-op is more useful than may appear at first glance. - /// For example, you could call `drain_previous_sqes()` on the returned SQE, to use the no-op to - /// know when the ring is idle before acting on a kill signal. - pub fn nop(self: *IO_Uring, user_data: u64) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_nop(sqe); - sqe.user_data = user_data; - return sqe; - } - - /// Used to select how the read should be handled. - pub const ReadBuffer = union(enum) { - /// io_uring will read directly into this buffer - buffer: []u8, - - /// io_uring will read directly into these buffers using readv. - iovecs: []const os.iovec, - - /// io_uring will select a buffer that has previously been provided with `provide_buffers`. - /// The buffer group reference by `group_id` must contain at least one buffer for the read to work. - /// `len` controls the number of bytes to read into the selected buffer. 
- buffer_selection: struct { - group_id: u16, - len: usize, - }, - }; - - /// Queues (but does not submit) an SQE to perform a `read(2)` or `preadv(2)` depending on the buffer type. - /// * Reading into a `ReadBuffer.buffer` uses `read(2)` - /// * Reading into a `ReadBuffer.iovecs` uses `preadv(2)` - /// If you want to do a `preadv2(2)` then set `rw_flags` on the returned SQE. See https://man7.org/linux/man-pages/man2/preadv2.2.html - /// - /// Returns a pointer to the SQE. - pub fn read( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - buffer: ReadBuffer, - offset: u64, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - switch (buffer) { - .buffer => |slice| io_uring_prep_read(sqe, fd, slice, offset), - .iovecs => |vecs| io_uring_prep_readv(sqe, fd, vecs, offset), - .buffer_selection => |selection| { - io_uring_prep_rw(.READ, sqe, fd, 0, selection.len, offset); - sqe.flags |= linux.IOSQE_BUFFER_SELECT; - sqe.buf_index = selection.group_id; - }, - } - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `write(2)`. - /// Returns a pointer to the SQE. - pub fn write( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - buffer: []const u8, - offset: u64, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_write(sqe, fd, buffer, offset); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `splice(2)` - /// Either `fd_in` or `fd_out` must be a pipe. - /// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to std.math.maxInt(u64). - /// If `fd_in` does not refer to a pipe and `off_in` is maxInt(u64), then `len` are read - /// from `fd_in` starting from the file offset, which is incremented by the number of bytes read. - /// If `fd_in` does not refer to a pipe and `off_in` is not maxInt(u64), then the starting offset of `fd_in` will be `off_in`. 
- /// This splice operation can be used to implement sendfile by splicing to an intermediate pipe first, - /// then splice to the final destination. In fact, the implementation of sendfile in kernel uses splice internally. - /// - /// NOTE that even if fd_in or fd_out refers to a pipe, the splice operation can still fail with EINVAL if one of the - /// fd doesn't explicitly support splice peration, e.g. reading from terminal is unsupported from kernel 5.7 to 5.11. - /// See https://github.com/axboe/liburing/issues/291 - /// - /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. - pub fn splice(self: *IO_Uring, user_data: u64, fd_in: os.fd_t, off_in: u64, fd_out: os.fd_t, off_out: u64, len: usize) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_splice(sqe, fd_in, off_in, fd_out, off_out, len); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a IORING_OP_READ_FIXED. - /// The `buffer` provided must be registered with the kernel by calling `register_buffers` first. - /// The `buffer_index` must be the same as its index in the array provided to `register_buffers`. - /// - /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. - pub fn read_fixed( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - buffer: *os.iovec, - offset: u64, - buffer_index: u16, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_read_fixed(sqe, fd, buffer, offset, buffer_index); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `pwritev()`. - /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. - /// For example, if you want to do a `pwritev2()` then set `rw_flags` on the returned SQE. - /// See https://linux.die.net/man/2/pwritev. 
- pub fn writev( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - iovecs: []const os.iovec_const, - offset: u64, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_writev(sqe, fd, iovecs, offset); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a IORING_OP_WRITE_FIXED. - /// The `buffer` provided must be registered with the kernel by calling `register_buffers` first. - /// The `buffer_index` must be the same as its index in the array provided to `register_buffers`. - /// - /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. - pub fn write_fixed( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - buffer: *os.iovec, - offset: u64, - buffer_index: u16, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_write_fixed(sqe, fd, buffer, offset, buffer_index); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform an `accept4(2)` on a socket. - /// Returns a pointer to the SQE. - /// Available since 5.5 - pub fn accept( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - addr: ?*os.sockaddr, - addrlen: ?*os.socklen_t, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_accept(sqe, fd, addr, addrlen, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues an multishot accept on a socket. - /// - /// Multishot variant allows an application to issue a single accept request, - /// which will repeatedly trigger a CQE when a connection request comes in. - /// While IORING_CQE_F_MORE flag is set in CQE flags accept will generate - /// further CQEs. 
- /// - /// Available since 5.19 - pub fn accept_multishot( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - addr: ?*os.sockaddr, - addrlen: ?*os.socklen_t, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_multishot_accept(sqe, fd, addr, addrlen, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues an accept using direct (registered) file descriptors. - /// - /// To use an accept direct variant, the application must first have registered - /// a file table (with register_files). An unused table index will be - /// dynamically chosen and returned in the CQE res field. - /// - /// After creation, they can be used by setting IOSQE_FIXED_FILE in the SQE - /// flags member, and setting the SQE fd field to the direct descriptor value - /// rather than the regular file descriptor. - /// - /// Available since 5.19 - pub fn accept_direct( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - addr: ?*os.sockaddr, - addrlen: ?*os.socklen_t, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_accept_direct(sqe, fd, addr, addrlen, flags, linux.IORING_FILE_INDEX_ALLOC); - sqe.user_data = user_data; - return sqe; - } - - /// Queues an multishot accept using direct (registered) file descriptors. - /// Available since 5.19 - pub fn accept_multishot_direct( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - addr: ?*os.sockaddr, - addrlen: ?*os.socklen_t, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_multishot_accept_direct(sqe, fd, addr, addrlen, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queue (but does not submit) an SQE to perform a `connect(2)` on a socket. - /// Returns a pointer to the SQE. 
- pub fn connect( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - addr: *const os.sockaddr, - addrlen: os.socklen_t, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_connect(sqe, fd, addr, addrlen); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `epoll_ctl(2)`. - /// Returns a pointer to the SQE. - pub fn epoll_ctl( - self: *IO_Uring, - user_data: u64, - epfd: os.fd_t, - fd: os.fd_t, - op: u32, - ev: ?*linux.epoll_event, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_epoll_ctl(sqe, epfd, fd, op, ev); - sqe.user_data = user_data; - return sqe; - } - - /// Used to select how the recv call should be handled. - pub const RecvBuffer = union(enum) { - /// io_uring will recv directly into this buffer - buffer: []u8, - - /// io_uring will select a buffer that has previously been provided with `provide_buffers`. - /// The buffer group referenced by `group_id` must contain at least one buffer for the recv call to work. - /// `len` controls the number of bytes to read into the selected buffer. - buffer_selection: struct { - group_id: u16, - len: usize, - }, - }; - - /// Queues (but does not submit) an SQE to perform a `recv(2)`. - /// Returns a pointer to the SQE. - /// Available since 5.6 - pub fn recv( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - buffer: RecvBuffer, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - switch (buffer) { - .buffer => |slice| io_uring_prep_recv(sqe, fd, slice, flags), - .buffer_selection => |selection| { - io_uring_prep_rw(.RECV, sqe, fd, 0, selection.len, 0); - sqe.rw_flags = flags; - sqe.flags |= linux.IOSQE_BUFFER_SELECT; - sqe.buf_index = selection.group_id; - }, - } - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `send(2)`. - /// Returns a pointer to the SQE. 
- /// Available since 5.6 - pub fn send( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - buffer: []const u8, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_send(sqe, fd, buffer, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`. - /// - /// This operation will most likely produce two CQEs. The flags field of the - /// first cqe may likely contain IORING_CQE_F_MORE, which means that there will - /// be a second cqe with the user_data field set to the same value. The user - /// must not modify the data buffer until the notification is posted. The first - /// cqe follows the usual rules and so its res field will contain the number of - /// bytes sent or a negative error code. The notification's res field will be - /// set to zero and the flags field will contain IORING_CQE_F_NOTIF. The two - /// step model is needed because the kernel may hold on to buffers for a long - /// time, e.g. waiting for a TCP ACK. Notifications responsible for controlling - /// the lifetime of the buffers. Even errored requests may generate a - /// notification. - /// - /// Available since 6.0 - pub fn send_zc( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - buffer: []const u8, - send_flags: u32, - zc_flags: u16, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_send_zc(sqe, fd, buffer, send_flags, zc_flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`. - /// Returns a pointer to the SQE. 
- /// Available since 6.0 - pub fn send_zc_fixed( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - buffer: []const u8, - send_flags: u32, - zc_flags: u16, - buf_index: u16, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_send_zc_fixed(sqe, fd, buffer, send_flags, zc_flags, buf_index); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `recvmsg(2)`. - /// Returns a pointer to the SQE. - /// Available since 5.3 - pub fn recvmsg( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - msg: *os.msghdr, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_recvmsg(sqe, fd, msg, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `sendmsg(2)`. - /// Returns a pointer to the SQE. - /// Available since 5.3 - pub fn sendmsg( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - msg: *const os.msghdr_const, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_sendmsg(sqe, fd, msg, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform an async zerocopy `sendmsg(2)`. - /// Returns a pointer to the SQE. - /// Available since 6.1 - pub fn sendmsg_zc( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - msg: *const os.msghdr_const, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_sendmsg_zc(sqe, fd, msg, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform an `openat(2)`. - /// Returns a pointer to the SQE. - /// Available since 5.6. 
- pub fn openat( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - path: [*:0]const u8, - flags: linux.O, - mode: os.mode_t, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_openat(sqe, fd, path, flags, mode); - sqe.user_data = user_data; - return sqe; - } - - /// Queues an openat using direct (registered) file descriptors. - /// - /// To use an accept direct variant, the application must first have registered - /// a file table (with register_files). An unused table index will be - /// dynamically chosen and returned in the CQE res field. - /// - /// After creation, they can be used by setting IOSQE_FIXED_FILE in the SQE - /// flags member, and setting the SQE fd field to the direct descriptor value - /// rather than the regular file descriptor. - /// - /// Available since 5.15 - pub fn openat_direct( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - path: [*:0]const u8, - flags: linux.O, - mode: os.mode_t, - file_index: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_openat_direct(sqe, fd, path, flags, mode, file_index); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `close(2)`. - /// Returns a pointer to the SQE. - /// Available since 5.6. - pub fn close(self: *IO_Uring, user_data: u64, fd: os.fd_t) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_close(sqe, fd); - sqe.user_data = user_data; - return sqe; - } - - /// Queues close of registered file descriptor. - /// Available since 5.15 - pub fn close_direct(self: *IO_Uring, user_data: u64, file_index: u32) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_close_direct(sqe, file_index); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to register a timeout operation. - /// Returns a pointer to the SQE. 
- /// - /// The timeout will complete when either the timeout expires, or after the specified number of - /// events complete (if `count` is greater than `0`). - /// - /// `flags` may be `0` for a relative timeout, or `IORING_TIMEOUT_ABS` for an absolute timeout. - /// - /// The completion event result will be `-ETIME` if the timeout completed through expiration, - /// `0` if the timeout completed after the specified number of events, or `-ECANCELED` if the - /// timeout was removed before it expired. - /// - /// io_uring timeouts use the `CLOCK.MONOTONIC` clock source. - pub fn timeout( - self: *IO_Uring, - user_data: u64, - ts: *const os.linux.kernel_timespec, - count: u32, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_timeout(sqe, ts, count, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to remove an existing timeout operation. - /// Returns a pointer to the SQE. - /// - /// The timeout is identified by its `user_data`. - /// - /// The completion event result will be `0` if the timeout was found and cancelled successfully, - /// `-EBUSY` if the timeout was found but expiration was already in progress, or - /// `-ENOENT` if the timeout was not found. - pub fn timeout_remove( - self: *IO_Uring, - user_data: u64, - timeout_user_data: u64, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_timeout_remove(sqe, timeout_user_data, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to add a link timeout operation. - /// Returns a pointer to the SQE. - /// - /// You need to set linux.IOSQE_IO_LINK to flags of the target operation - /// and then call this method right after the target operation. - /// See https://lwn.net/Articles/803932/ for detail. - /// - /// If the dependent request finishes before the linked timeout, the timeout - /// is canceled. 
If the timeout finishes before the dependent request, the - /// dependent request will be canceled. - /// - /// The completion event result of the link_timeout will be - /// `-ETIME` if the timeout finishes before the dependent request - /// (in this case, the completion event result of the dependent request will - /// be `-ECANCELED`), or - /// `-EALREADY` if the dependent request finishes before the linked timeout. - pub fn link_timeout( - self: *IO_Uring, - user_data: u64, - ts: *const os.linux.kernel_timespec, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_link_timeout(sqe, ts, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `poll(2)`. - /// Returns a pointer to the SQE. - pub fn poll_add( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - poll_mask: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_poll_add(sqe, fd, poll_mask); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to remove an existing poll operation. - /// Returns a pointer to the SQE. - pub fn poll_remove( - self: *IO_Uring, - user_data: u64, - target_user_data: u64, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_poll_remove(sqe, target_user_data); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to update the user data of an existing poll - /// operation. Returns a pointer to the SQE. - pub fn poll_update( - self: *IO_Uring, - user_data: u64, - old_user_data: u64, - new_user_data: u64, - poll_mask: u32, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_poll_update(sqe, old_user_data, new_user_data, poll_mask, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform an `fallocate(2)`. - /// Returns a pointer to the SQE. 
- pub fn fallocate( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - mode: i32, - offset: u64, - len: u64, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_fallocate(sqe, fd, mode, offset, len); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform an `statx(2)`. - /// Returns a pointer to the SQE. - pub fn statx( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - path: [:0]const u8, - flags: u32, - mask: u32, - buf: *linux.Statx, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_statx(sqe, fd, path, flags, mask, buf); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to remove an existing operation. - /// Returns a pointer to the SQE. - /// - /// The operation is identified by its `user_data`. - /// - /// The completion event result will be `0` if the operation was found and cancelled successfully, - /// `-EALREADY` if the operation was found but was already in progress, or - /// `-ENOENT` if the operation was not found. - pub fn cancel( - self: *IO_Uring, - user_data: u64, - cancel_user_data: u64, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_cancel(sqe, cancel_user_data, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `shutdown(2)`. - /// Returns a pointer to the SQE. - /// - /// The operation is identified by its `user_data`. - pub fn shutdown( - self: *IO_Uring, - user_data: u64, - sockfd: os.socket_t, - how: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_shutdown(sqe, sockfd, how); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `renameat2(2)`. - /// Returns a pointer to the SQE. 
- pub fn renameat( - self: *IO_Uring, - user_data: u64, - old_dir_fd: os.fd_t, - old_path: [*:0]const u8, - new_dir_fd: os.fd_t, - new_path: [*:0]const u8, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_renameat(sqe, old_dir_fd, old_path, new_dir_fd, new_path, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `unlinkat(2)`. - /// Returns a pointer to the SQE. - pub fn unlinkat( - self: *IO_Uring, - user_data: u64, - dir_fd: os.fd_t, - path: [*:0]const u8, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_unlinkat(sqe, dir_fd, path, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `mkdirat(2)`. - /// Returns a pointer to the SQE. - pub fn mkdirat( - self: *IO_Uring, - user_data: u64, - dir_fd: os.fd_t, - path: [*:0]const u8, - mode: os.mode_t, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_mkdirat(sqe, dir_fd, path, mode); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `symlinkat(2)`. - /// Returns a pointer to the SQE. - pub fn symlinkat( - self: *IO_Uring, - user_data: u64, - target: [*:0]const u8, - new_dir_fd: os.fd_t, - link_path: [*:0]const u8, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_symlinkat(sqe, target, new_dir_fd, link_path); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `linkat(2)`. - /// Returns a pointer to the SQE. 
- pub fn linkat( - self: *IO_Uring, - user_data: u64, - old_dir_fd: os.fd_t, - old_path: [*:0]const u8, - new_dir_fd: os.fd_t, - new_path: [*:0]const u8, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_linkat(sqe, old_dir_fd, old_path, new_dir_fd, new_path, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to provide a group of buffers used for commands that read/receive data. - /// Returns a pointer to the SQE. - /// - /// Provided buffers can be used in `read`, `recv` or `recvmsg` commands via .buffer_selection. - /// - /// The kernel expects a contiguous block of memory of size (buffers_count * buffer_size). - pub fn provide_buffers( - self: *IO_Uring, - user_data: u64, - buffers: [*]u8, - buffer_size: usize, - buffers_count: usize, - group_id: usize, - buffer_id: usize, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_provide_buffers(sqe, buffers, buffer_size, buffers_count, group_id, buffer_id); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to remove a group of provided buffers. - /// Returns a pointer to the SQE. - pub fn remove_buffers( - self: *IO_Uring, - user_data: u64, - buffers_count: usize, - group_id: usize, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_remove_buffers(sqe, buffers_count, group_id); - sqe.user_data = user_data; - return sqe; - } - - /// Queues (but does not submit) an SQE to perform a `waitid(2)`. - /// Returns a pointer to the SQE. - pub fn waitid( - self: *IO_Uring, - user_data: u64, - id_type: linux.P, - id: i32, - infop: *linux.siginfo_t, - options: u32, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_waitid(sqe, id_type, id, infop, options, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Registers an array of file descriptors. 
- /// Every time a file descriptor is put in an SQE and submitted to the kernel, the kernel must - /// retrieve a reference to the file, and once I/O has completed the file reference must be - /// dropped. The atomic nature of this file reference can be a slowdown for high IOPS workloads. - /// This slowdown can be avoided by pre-registering file descriptors. - /// To refer to a registered file descriptor, IOSQE_FIXED_FILE must be set in the SQE's flags, - /// and the SQE's fd must be set to the index of the file descriptor in the registered array. - /// Registering file descriptors will wait for the ring to idle. - /// Files are automatically unregistered by the kernel when the ring is torn down. - /// An application need unregister only if it wants to register a new array of file descriptors. - pub fn register_files(self: *IO_Uring, fds: []const os.fd_t) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .REGISTER_FILES, - @as(*const anyopaque, @ptrCast(fds.ptr)), - @as(u32, @intCast(fds.len)), - ); - try handle_registration_result(res); - } - - /// Updates registered file descriptors. - /// - /// Updates are applied starting at the provided offset in the original file descriptors slice. - /// There are three kind of updates: - /// * turning a sparse entry (where the fd is -1) into a real one - /// * removing an existing entry (set the fd to -1) - /// * replacing an existing entry with a new fd - /// Adding new file descriptors must be done with `register_files`. 
- pub fn register_files_update(self: *IO_Uring, offset: u32, fds: []const os.fd_t) !void { - assert(self.fd >= 0); - - const FilesUpdate = extern struct { - offset: u32, - resv: u32, - fds: u64 align(8), - }; - var update = FilesUpdate{ - .offset = offset, - .resv = @as(u32, 0), - .fds = @as(u64, @intFromPtr(fds.ptr)), - }; - - const res = linux.io_uring_register( - self.fd, - .REGISTER_FILES_UPDATE, - @as(*const anyopaque, @ptrCast(&update)), - @as(u32, @intCast(fds.len)), - ); - try handle_registration_result(res); - } - - /// Registers the file descriptor for an eventfd that will be notified of completion events on - /// an io_uring instance. - /// Only a single a eventfd can be registered at any given point in time. - pub fn register_eventfd(self: *IO_Uring, fd: os.fd_t) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .REGISTER_EVENTFD, - @as(*const anyopaque, @ptrCast(&fd)), - 1, - ); - try handle_registration_result(res); - } - - /// Registers the file descriptor for an eventfd that will be notified of completion events on - /// an io_uring instance. Notifications are only posted for events that complete in an async manner. - /// This means that events that complete inline while being submitted do not trigger a notification event. - /// Only a single eventfd can be registered at any given point in time. - pub fn register_eventfd_async(self: *IO_Uring, fd: os.fd_t) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .REGISTER_EVENTFD_ASYNC, - @as(*const anyopaque, @ptrCast(&fd)), - 1, - ); - try handle_registration_result(res); - } - - /// Unregister the registered eventfd file descriptor. - pub fn unregister_eventfd(self: *IO_Uring) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .UNREGISTER_EVENTFD, - null, - 0, - ); - try handle_registration_result(res); - } - - /// Registers an array of buffers for use with `read_fixed` and `write_fixed`. 
- pub fn register_buffers(self: *IO_Uring, buffers: []const os.iovec) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .REGISTER_BUFFERS, - buffers.ptr, - @as(u32, @intCast(buffers.len)), - ); - try handle_registration_result(res); - } - - /// Unregister the registered buffers. - pub fn unregister_buffers(self: *IO_Uring) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .UNREGISTER_BUFFERS, null, 0); - switch (linux.getErrno(res)) { - .SUCCESS => {}, - .NXIO => return error.BuffersNotRegistered, - else => |errno| return os.unexpectedErrno(errno), - } - } - - fn handle_registration_result(res: usize) !void { - switch (linux.getErrno(res)) { - .SUCCESS => {}, - // One or more fds in the array are invalid, or the kernel does not support sparse sets: - .BADF => return error.FileDescriptorInvalid, - .BUSY => return error.FilesAlreadyRegistered, - .INVAL => return error.FilesEmpty, - // Adding `nr_args` file references would exceed the maximum allowed number of files the - // user is allowed to have according to the per-user RLIMIT_NOFILE resource limit and - // the CAP_SYS_RESOURCE capability is not set, or `nr_args` exceeds the maximum allowed - // for a fixed file set (older kernels have a limit of 1024 files vs 64K files): - .MFILE => return error.UserFdQuotaExceeded, - // Insufficient kernel resources, or the caller had a non-zero RLIMIT_MEMLOCK soft - // resource limit but tried to lock more memory than the limit permitted (not enforced - // when the process is privileged with CAP_IPC_LOCK): - .NOMEM => return error.SystemResources, - // Attempt to register files on a ring already registering files or being torn down: - .NXIO => return error.RingShuttingDownOrAlreadyRegisteringFiles, - else => |errno| return os.unexpectedErrno(errno), - } - } - - /// Unregisters all registered file descriptors previously associated with the ring. 
- pub fn unregister_files(self: *IO_Uring) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .UNREGISTER_FILES, null, 0); - switch (linux.getErrno(res)) { - .SUCCESS => {}, - .NXIO => return error.FilesNotRegistered, - else => |errno| return os.unexpectedErrno(errno), - } - } - - /// Prepares a socket creation request. - /// New socket fd will be returned in completion result. - /// Available since 5.19 - pub fn socket( - self: *IO_Uring, - user_data: u64, - domain: u32, - socket_type: u32, - protocol: u32, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_socket(sqe, domain, socket_type, protocol, flags); - sqe.user_data = user_data; - return sqe; - } - - /// Prepares a socket creation request for registered file at index `file_index`. - /// Available since 5.19 - pub fn socket_direct( - self: *IO_Uring, - user_data: u64, - domain: u32, - socket_type: u32, - protocol: u32, - flags: u32, - file_index: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_socket_direct(sqe, domain, socket_type, protocol, flags, file_index); - sqe.user_data = user_data; - return sqe; - } - - /// Prepares a socket creation request for registered file, index chosen by kernel (file index alloc). - /// File index will be returned in CQE res field. 
- /// Available since 5.19 - pub fn socket_direct_alloc( - self: *IO_Uring, - user_data: u64, - domain: u32, - socket_type: u32, - protocol: u32, - flags: u32, - ) !*linux.io_uring_sqe { - const sqe = try self.get_sqe(); - io_uring_prep_socket_direct_alloc(sqe, domain, socket_type, protocol, flags); - sqe.user_data = user_data; - return sqe; - } -}; - -pub const SubmissionQueue = struct { - head: *u32, - tail: *u32, - mask: u32, - flags: *u32, - dropped: *u32, - array: []u32, - sqes: []linux.io_uring_sqe, - mmap: []align(mem.page_size) u8, - mmap_sqes: []align(mem.page_size) u8, - - // We use `sqe_head` and `sqe_tail` in the same way as liburing: - // We increment `sqe_tail` (but not `tail`) for each call to `get_sqe()`. - // We then set `tail` to `sqe_tail` once, only when these events are actually submitted. - // This allows us to amortize the cost of the @atomicStore to `tail` across multiple SQEs. - sqe_head: u32 = 0, - sqe_tail: u32 = 0, - - pub fn init(fd: os.fd_t, p: linux.io_uring_params) !SubmissionQueue { - assert(fd >= 0); - assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0); - const size = @max( - p.sq_off.array + p.sq_entries * @sizeOf(u32), - p.cq_off.cqes + p.cq_entries * @sizeOf(linux.io_uring_cqe), - ); - const mmap = try os.mmap( - null, - size, - os.PROT.READ | os.PROT.WRITE, - .{ .TYPE = .SHARED, .POPULATE = true }, - fd, - linux.IORING_OFF_SQ_RING, - ); - errdefer os.munmap(mmap); - assert(mmap.len == size); - - // The motivation for the `sqes` and `array` indirection is to make it possible for the - // application to preallocate static linux.io_uring_sqe entries and then replay them when needed. 
- const size_sqes = p.sq_entries * @sizeOf(linux.io_uring_sqe); - const mmap_sqes = try os.mmap( - null, - size_sqes, - os.PROT.READ | os.PROT.WRITE, - .{ .TYPE = .SHARED, .POPULATE = true }, - fd, - linux.IORING_OFF_SQES, - ); - errdefer os.munmap(mmap_sqes); - assert(mmap_sqes.len == size_sqes); - - const array: [*]u32 = @ptrCast(@alignCast(&mmap[p.sq_off.array])); - const sqes: [*]linux.io_uring_sqe = @ptrCast(@alignCast(&mmap_sqes[0])); - // We expect the kernel copies p.sq_entries to the u32 pointed to by p.sq_off.ring_entries, - // see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844. - assert(p.sq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_entries]))).*); - return SubmissionQueue{ - .head = @ptrCast(@alignCast(&mmap[p.sq_off.head])), - .tail = @ptrCast(@alignCast(&mmap[p.sq_off.tail])), - .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_mask]))).*, - .flags = @ptrCast(@alignCast(&mmap[p.sq_off.flags])), - .dropped = @ptrCast(@alignCast(&mmap[p.sq_off.dropped])), - .array = array[0..p.sq_entries], - .sqes = sqes[0..p.sq_entries], - .mmap = mmap, - .mmap_sqes = mmap_sqes, - }; - } - - pub fn deinit(self: *SubmissionQueue) void { - os.munmap(self.mmap_sqes); - os.munmap(self.mmap); - } -}; - -pub const CompletionQueue = struct { - head: *u32, - tail: *u32, - mask: u32, - overflow: *u32, - cqes: []linux.io_uring_cqe, - - pub fn init(fd: os.fd_t, p: linux.io_uring_params, sq: SubmissionQueue) !CompletionQueue { - assert(fd >= 0); - assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0); - const mmap = sq.mmap; - const cqes: [*]linux.io_uring_cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes])); - assert(p.cq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_entries]))).*); - return CompletionQueue{ - .head = @ptrCast(@alignCast(&mmap[p.cq_off.head])), - .tail = @ptrCast(@alignCast(&mmap[p.cq_off.tail])), - .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_mask]))).*, - .overflow = 
@ptrCast(@alignCast(&mmap[p.cq_off.overflow])), - .cqes = cqes[0..p.cq_entries], - }; - } - - pub fn deinit(self: *CompletionQueue) void { - _ = self; - // A no-op since we now share the mmap with the submission queue. - // Here for symmetry with the submission queue, and for any future feature support. - } -}; - -pub fn io_uring_prep_nop(sqe: *linux.io_uring_sqe) void { - sqe.* = .{ - .opcode = .NOP, - .flags = 0, - .ioprio = 0, - .fd = 0, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; -} - -pub fn io_uring_prep_fsync(sqe: *linux.io_uring_sqe, fd: os.fd_t, flags: u32) void { - sqe.* = .{ - .opcode = .FSYNC, - .flags = 0, - .ioprio = 0, - .fd = fd, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = flags, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; -} - -pub fn io_uring_prep_rw( - op: linux.IORING_OP, - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - addr: u64, - len: usize, - offset: u64, -) void { - sqe.* = .{ - .opcode = op, - .flags = 0, - .ioprio = 0, - .fd = fd, - .off = offset, - .addr = addr, - .len = @as(u32, @intCast(len)), - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; -} - -pub fn io_uring_prep_read(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: []u8, offset: u64) void { - io_uring_prep_rw(.READ, sqe, fd, @intFromPtr(buffer.ptr), buffer.len, offset); -} - -pub fn io_uring_prep_write(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: []const u8, offset: u64) void { - io_uring_prep_rw(.WRITE, sqe, fd, @intFromPtr(buffer.ptr), buffer.len, offset); -} - -pub fn io_uring_prep_splice(sqe: *linux.io_uring_sqe, fd_in: os.fd_t, off_in: u64, fd_out: os.fd_t, off_out: u64, len: usize) void { - io_uring_prep_rw(.SPLICE, sqe, fd_out, undefined, len, off_out); - sqe.addr = off_in; - sqe.splice_fd_in = fd_in; -} - -pub fn 
io_uring_prep_readv( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - iovecs: []const os.iovec, - offset: u64, -) void { - io_uring_prep_rw(.READV, sqe, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); -} - -pub fn io_uring_prep_writev( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - iovecs: []const os.iovec_const, - offset: u64, -) void { - io_uring_prep_rw(.WRITEV, sqe, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); -} - -pub fn io_uring_prep_read_fixed(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: *os.iovec, offset: u64, buffer_index: u16) void { - io_uring_prep_rw(.READ_FIXED, sqe, fd, @intFromPtr(buffer.iov_base), buffer.iov_len, offset); - sqe.buf_index = buffer_index; -} - -pub fn io_uring_prep_write_fixed(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: *os.iovec, offset: u64, buffer_index: u16) void { - io_uring_prep_rw(.WRITE_FIXED, sqe, fd, @intFromPtr(buffer.iov_base), buffer.iov_len, offset); - sqe.buf_index = buffer_index; -} - -/// Poll masks previously used to comprise of 16 bits in the flags union of -/// a SQE, but were then extended to comprise of 32 bits in order to make -/// room for additional option flags. To ensure that the correct bits of -/// poll masks are consistently and properly read across multiple kernel -/// versions, poll masks are enforced to be little-endian. -/// https://www.spinics.net/lists/io-uring/msg02848.html -pub inline fn __io_uring_prep_poll_mask(poll_mask: u32) u32 { - return std.mem.nativeToLittle(u32, poll_mask); -} - -pub fn io_uring_prep_accept( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - addr: ?*os.sockaddr, - addrlen: ?*os.socklen_t, - flags: u32, -) void { - // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to socklen_t`. - // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). 
- io_uring_prep_rw(.ACCEPT, sqe, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen)); - sqe.rw_flags = flags; -} - -pub fn io_uring_prep_accept_direct( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - addr: ?*os.sockaddr, - addrlen: ?*os.socklen_t, - flags: u32, - file_index: u32, -) void { - io_uring_prep_accept(sqe, fd, addr, addrlen, flags); - __io_uring_set_target_fixed_file(sqe, file_index); -} - -pub fn io_uring_prep_multishot_accept_direct( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - addr: ?*os.sockaddr, - addrlen: ?*os.socklen_t, - flags: u32, -) void { - io_uring_prep_multishot_accept(sqe, fd, addr, addrlen, flags); - __io_uring_set_target_fixed_file(sqe, linux.IORING_FILE_INDEX_ALLOC); -} - -fn __io_uring_set_target_fixed_file(sqe: *linux.io_uring_sqe, file_index: u32) void { - const sqe_file_index: u32 = if (file_index == linux.IORING_FILE_INDEX_ALLOC) - linux.IORING_FILE_INDEX_ALLOC - else - // 0 means no fixed files, indexes should be encoded as "index + 1" - file_index + 1; - // This filed is overloaded in liburing: - // splice_fd_in: i32 - // sqe_file_index: u32 - sqe.splice_fd_in = @bitCast(sqe_file_index); -} - -pub fn io_uring_prep_connect( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - addr: *const os.sockaddr, - addrlen: os.socklen_t, -) void { - // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). 
- io_uring_prep_rw(.CONNECT, sqe, fd, @intFromPtr(addr), 0, addrlen); -} - -pub fn io_uring_prep_epoll_ctl( - sqe: *linux.io_uring_sqe, - epfd: os.fd_t, - fd: os.fd_t, - op: u32, - ev: ?*linux.epoll_event, -) void { - io_uring_prep_rw(.EPOLL_CTL, sqe, epfd, @intFromPtr(ev), op, @as(u64, @intCast(fd))); -} - -pub fn io_uring_prep_recv(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: []u8, flags: u32) void { - io_uring_prep_rw(.RECV, sqe, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; -} - -pub fn io_uring_prep_send(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: []const u8, flags: u32) void { - io_uring_prep_rw(.SEND, sqe, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; -} - -pub fn io_uring_prep_send_zc(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: []const u8, flags: u32, zc_flags: u16) void { - io_uring_prep_rw(.SEND_ZC, sqe, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; - sqe.ioprio = zc_flags; -} - -pub fn io_uring_prep_send_zc_fixed(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: []const u8, flags: u32, zc_flags: u16, buf_index: u16) void { - io_uring_prep_send_zc(sqe, fd, buffer, flags, zc_flags); - sqe.ioprio |= linux.IORING_RECVSEND_FIXED_BUF; - sqe.buf_index = buf_index; -} - -pub fn io_uring_prep_sendmsg_zc( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - msg: *const os.msghdr_const, - flags: u32, -) void { - io_uring_prep_sendmsg(sqe, fd, msg, flags); - sqe.opcode = .SENDMSG_ZC; -} - -pub fn io_uring_prep_recvmsg( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - msg: *os.msghdr, - flags: u32, -) void { - linux.io_uring_prep_rw(.RECVMSG, sqe, fd, @intFromPtr(msg), 1, 0); - sqe.rw_flags = flags; -} - -pub fn io_uring_prep_sendmsg( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - msg: *const os.msghdr_const, - flags: u32, -) void { - linux.io_uring_prep_rw(.SENDMSG, sqe, fd, @intFromPtr(msg), 1, 0); - sqe.rw_flags = flags; -} - -pub fn io_uring_prep_openat( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - path: 
[*:0]const u8, - flags: linux.O, - mode: os.mode_t, -) void { - io_uring_prep_rw(.OPENAT, sqe, fd, @intFromPtr(path), mode, 0); - sqe.rw_flags = @bitCast(flags); -} - -pub fn io_uring_prep_openat_direct( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - path: [*:0]const u8, - flags: linux.O, - mode: os.mode_t, - file_index: u32, -) void { - io_uring_prep_openat(sqe, fd, path, flags, mode); - __io_uring_set_target_fixed_file(sqe, file_index); -} - -pub fn io_uring_prep_close(sqe: *linux.io_uring_sqe, fd: os.fd_t) void { - sqe.* = .{ - .opcode = .CLOSE, - .flags = 0, - .ioprio = 0, - .fd = fd, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; -} - -pub fn io_uring_prep_close_direct(sqe: *linux.io_uring_sqe, file_index: u32) void { - io_uring_prep_close(sqe, 0); - __io_uring_set_target_fixed_file(sqe, file_index); -} - -pub fn io_uring_prep_timeout( - sqe: *linux.io_uring_sqe, - ts: *const os.linux.kernel_timespec, - count: u32, - flags: u32, -) void { - io_uring_prep_rw(.TIMEOUT, sqe, -1, @intFromPtr(ts), 1, count); - sqe.rw_flags = flags; -} - -pub fn io_uring_prep_timeout_remove(sqe: *linux.io_uring_sqe, timeout_user_data: u64, flags: u32) void { - sqe.* = .{ - .opcode = .TIMEOUT_REMOVE, - .flags = 0, - .ioprio = 0, - .fd = -1, - .off = 0, - .addr = timeout_user_data, - .len = 0, - .rw_flags = flags, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; -} - -pub fn io_uring_prep_link_timeout( - sqe: *linux.io_uring_sqe, - ts: *const os.linux.kernel_timespec, - flags: u32, -) void { - linux.io_uring_prep_rw(.LINK_TIMEOUT, sqe, -1, @intFromPtr(ts), 1, 0); - sqe.rw_flags = flags; -} - -pub fn io_uring_prep_poll_add( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - poll_mask: u32, -) void { - io_uring_prep_rw(.POLL_ADD, sqe, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); - sqe.rw_flags = 
__io_uring_prep_poll_mask(poll_mask); -} - -pub fn io_uring_prep_poll_remove( - sqe: *linux.io_uring_sqe, - target_user_data: u64, -) void { - io_uring_prep_rw(.POLL_REMOVE, sqe, -1, target_user_data, 0, 0); -} - -pub fn io_uring_prep_poll_update( - sqe: *linux.io_uring_sqe, - old_user_data: u64, - new_user_data: u64, - poll_mask: u32, - flags: u32, -) void { - io_uring_prep_rw(.POLL_REMOVE, sqe, -1, old_user_data, flags, new_user_data); - sqe.rw_flags = __io_uring_prep_poll_mask(poll_mask); -} - -pub fn io_uring_prep_fallocate( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - mode: i32, - offset: u64, - len: u64, -) void { - sqe.* = .{ - .opcode = .FALLOCATE, - .flags = 0, - .ioprio = 0, - .fd = fd, - .off = offset, - .addr = len, - .len = @as(u32, @intCast(mode)), - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; -} - -pub fn io_uring_prep_statx( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - path: [*:0]const u8, - flags: u32, - mask: u32, - buf: *linux.Statx, -) void { - io_uring_prep_rw(.STATX, sqe, fd, @intFromPtr(path), mask, @intFromPtr(buf)); - sqe.rw_flags = flags; -} - -pub fn io_uring_prep_cancel( - sqe: *linux.io_uring_sqe, - cancel_user_data: u64, - flags: u32, -) void { - io_uring_prep_rw(.ASYNC_CANCEL, sqe, -1, cancel_user_data, 0, 0); - sqe.rw_flags = flags; -} - -pub fn io_uring_prep_shutdown( - sqe: *linux.io_uring_sqe, - sockfd: os.socket_t, - how: u32, -) void { - io_uring_prep_rw(.SHUTDOWN, sqe, sockfd, 0, how, 0); -} - -pub fn io_uring_prep_renameat( - sqe: *linux.io_uring_sqe, - old_dir_fd: os.fd_t, - old_path: [*:0]const u8, - new_dir_fd: os.fd_t, - new_path: [*:0]const u8, - flags: u32, -) void { - io_uring_prep_rw( - .RENAMEAT, - sqe, - old_dir_fd, - @intFromPtr(old_path), - 0, - @intFromPtr(new_path), - ); - sqe.len = @bitCast(new_dir_fd); - sqe.rw_flags = flags; -} - -pub fn io_uring_prep_unlinkat( - sqe: *linux.io_uring_sqe, - dir_fd: os.fd_t, - path: [*:0]const u8, - 
flags: u32, -) void { - io_uring_prep_rw(.UNLINKAT, sqe, dir_fd, @intFromPtr(path), 0, 0); - sqe.rw_flags = flags; -} - -pub fn io_uring_prep_mkdirat( - sqe: *linux.io_uring_sqe, - dir_fd: os.fd_t, - path: [*:0]const u8, - mode: os.mode_t, -) void { - io_uring_prep_rw(.MKDIRAT, sqe, dir_fd, @intFromPtr(path), mode, 0); -} - -pub fn io_uring_prep_symlinkat( - sqe: *linux.io_uring_sqe, - target: [*:0]const u8, - new_dir_fd: os.fd_t, - link_path: [*:0]const u8, -) void { - io_uring_prep_rw( - .SYMLINKAT, - sqe, - new_dir_fd, - @intFromPtr(target), - 0, - @intFromPtr(link_path), - ); -} - -pub fn io_uring_prep_linkat( - sqe: *linux.io_uring_sqe, - old_dir_fd: os.fd_t, - old_path: [*:0]const u8, - new_dir_fd: os.fd_t, - new_path: [*:0]const u8, - flags: u32, -) void { - io_uring_prep_rw( - .LINKAT, - sqe, - old_dir_fd, - @intFromPtr(old_path), - 0, - @intFromPtr(new_path), - ); - sqe.len = @bitCast(new_dir_fd); - sqe.rw_flags = flags; -} - -pub fn io_uring_prep_provide_buffers( - sqe: *linux.io_uring_sqe, - buffers: [*]u8, - buffer_len: usize, - num: usize, - group_id: usize, - buffer_id: usize, -) void { - const ptr = @intFromPtr(buffers); - io_uring_prep_rw(.PROVIDE_BUFFERS, sqe, @as(i32, @intCast(num)), ptr, buffer_len, buffer_id); - sqe.buf_index = @intCast(group_id); -} - -pub fn io_uring_prep_remove_buffers( - sqe: *linux.io_uring_sqe, - num: usize, - group_id: usize, -) void { - io_uring_prep_rw(.REMOVE_BUFFERS, sqe, @as(i32, @intCast(num)), 0, 0, 0); - sqe.buf_index = @intCast(group_id); -} - -pub fn io_uring_prep_multishot_accept( - sqe: *linux.io_uring_sqe, - fd: os.fd_t, - addr: ?*os.sockaddr, - addrlen: ?*os.socklen_t, - flags: u32, -) void { - io_uring_prep_accept(sqe, fd, addr, addrlen, flags); - sqe.ioprio |= linux.IORING_ACCEPT_MULTISHOT; -} - -pub fn io_uring_prep_socket( - sqe: *linux.io_uring_sqe, - domain: u32, - socket_type: u32, - protocol: u32, - flags: u32, -) void { - io_uring_prep_rw(.SOCKET, sqe, @intCast(domain), 0, protocol, socket_type); - 
sqe.rw_flags = flags; -} - -pub fn io_uring_prep_socket_direct( - sqe: *linux.io_uring_sqe, - domain: u32, - socket_type: u32, - protocol: u32, - flags: u32, - file_index: u32, -) void { - io_uring_prep_socket(sqe, domain, socket_type, protocol, flags); - __io_uring_set_target_fixed_file(sqe, file_index); -} - -pub fn io_uring_prep_socket_direct_alloc( - sqe: *linux.io_uring_sqe, - domain: u32, - socket_type: u32, - protocol: u32, - flags: u32, -) void { - io_uring_prep_socket(sqe, domain, socket_type, protocol, flags); - __io_uring_set_target_fixed_file(sqe, linux.IORING_FILE_INDEX_ALLOC); -} - -pub fn io_uring_prep_waitid( - sqe: *linux.io_uring_sqe, - id_type: linux.P, - id: i32, - infop: *linux.siginfo_t, - options: u32, - flags: u32, -) void { - io_uring_prep_rw(.WAITID, sqe, id, 0, @intFromEnum(id_type), @intFromPtr(infop)); - sqe.rw_flags = flags; - sqe.splice_fd_in = @bitCast(options); -} - -test "structs/offsets/entries" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - try testing.expectEqual(@as(usize, 120), @sizeOf(linux.io_uring_params)); - try testing.expectEqual(@as(usize, 64), @sizeOf(linux.io_uring_sqe)); - try testing.expectEqual(@as(usize, 16), @sizeOf(linux.io_uring_cqe)); - - try testing.expectEqual(0, linux.IORING_OFF_SQ_RING); - try testing.expectEqual(0x8000000, linux.IORING_OFF_CQ_RING); - try testing.expectEqual(0x10000000, linux.IORING_OFF_SQES); - - try testing.expectError(error.EntriesZero, IO_Uring.init(0, 0)); - try testing.expectError(error.EntriesNotPowerOfTwo, IO_Uring.init(3, 0)); -} - -test "nop" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer { - ring.deinit(); - testing.expectEqual(@as(os.fd_t, -1), ring.fd) catch @panic("test failed"); - } - - const sqe = try ring.nop(0xaaaaaaaa); - try 
testing.expectEqual(linux.io_uring_sqe{ - .opcode = .NOP, - .flags = 0, - .ioprio = 0, - .fd = 0, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = 0, - .user_data = 0xaaaaaaaa, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }, sqe.*); - - try testing.expectEqual(@as(u32, 0), ring.sq.sqe_head); - try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail); - try testing.expectEqual(@as(u32, 0), ring.sq.tail.*); - try testing.expectEqual(@as(u32, 0), ring.cq.head.*); - try testing.expectEqual(@as(u32, 1), ring.sq_ready()); - try testing.expectEqual(@as(u32, 0), ring.cq_ready()); - - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(@as(u32, 1), ring.sq.sqe_head); - try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail); - try testing.expectEqual(@as(u32, 1), ring.sq.tail.*); - try testing.expectEqual(@as(u32, 0), ring.cq.head.*); - try testing.expectEqual(@as(u32, 0), ring.sq_ready()); - - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xaaaaaaaa, - .res = 0, - .flags = 0, - }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 1), ring.cq.head.*); - try testing.expectEqual(@as(u32, 0), ring.cq_ready()); - - const sqe_barrier = try ring.nop(0xbbbbbbbb); - sqe_barrier.flags |= linux.IOSQE_IO_DRAIN; - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xbbbbbbbb, - .res = 0, - .flags = 0, - }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 2), ring.sq.sqe_head); - try testing.expectEqual(@as(u32, 2), ring.sq.sqe_tail); - try testing.expectEqual(@as(u32, 2), ring.sq.tail.*); - try testing.expectEqual(@as(u32, 2), ring.cq.head.*); -} - -test "readv" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer 
ring.deinit(); - - const fd = try os.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); - defer os.close(fd); - - // Linux Kernel 5.4 supports IORING_REGISTER_FILES but not sparse fd sets (i.e. an fd of -1). - // Linux Kernel 5.5 adds support for sparse fd sets. - // Compare: - // https://github.com/torvalds/linux/blob/v5.4/fs/io_uring.c#L3119-L3124 vs - // https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L6687-L6691 - // We therefore avoid stressing sparse fd sets here: - var registered_fds = [_]os.fd_t{0} ** 1; - const fd_index = 0; - registered_fds[fd_index] = fd; - try ring.register_files(registered_fds[0..]); - - var buffer = [_]u8{42} ** 128; - var iovecs = [_]os.iovec{os.iovec{ .iov_base = &buffer, .iov_len = buffer.len }}; - const sqe = try ring.read(0xcccccccc, fd_index, .{ .iovecs = iovecs[0..] }, 0); - try testing.expectEqual(linux.IORING_OP.READV, sqe.opcode); - sqe.flags |= linux.IOSQE_FIXED_FILE; - - try testing.expectError(error.SubmissionQueueFull, ring.nop(0)); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xcccccccc, - .res = buffer.len, - .flags = 0, - }, try ring.copy_cqe()); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); - - try ring.unregister_files(); -} - -test "writev/fsync/readv" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(4, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - - const path = "test_io_uring_writev_fsync_readv"; - const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true }); - defer file.close(); - const fd = file.handle; - - const buffer_write = [_]u8{42} ** 128; - const iovecs_write = [_]os.iovec_const{ - os.iovec_const{ .iov_base = 
&buffer_write, .iov_len = buffer_write.len }, - }; - var buffer_read = [_]u8{0} ** 128; - var iovecs_read = [_]os.iovec{ - os.iovec{ .iov_base = &buffer_read, .iov_len = buffer_read.len }, - }; - - const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); - try testing.expectEqual(linux.IORING_OP.WRITEV, sqe_writev.opcode); - try testing.expectEqual(@as(u64, 17), sqe_writev.off); - sqe_writev.flags |= linux.IOSQE_IO_LINK; - - const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, 0); - try testing.expectEqual(linux.IORING_OP.FSYNC, sqe_fsync.opcode); - try testing.expectEqual(fd, sqe_fsync.fd); - sqe_fsync.flags |= linux.IOSQE_IO_LINK; - - const sqe_readv = try ring.read(0xffffffff, fd, .{ .iovecs = iovecs_read[0..] }, 17); - try testing.expectEqual(linux.IORING_OP.READV, sqe_readv.opcode); - try testing.expectEqual(@as(u64, 17), sqe_readv.off); - - try testing.expectEqual(@as(u32, 3), ring.sq_ready()); - try testing.expectEqual(@as(u32, 3), try ring.submit_and_wait(3)); - try testing.expectEqual(@as(u32, 0), ring.sq_ready()); - try testing.expectEqual(@as(u32, 3), ring.cq_ready()); - - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xdddddddd, - .res = buffer_write.len, - .flags = 0, - }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 2), ring.cq_ready()); - - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xeeeeeeee, - .res = 0, - .flags = 0, - }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 1), ring.cq_ready()); - - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xffffffff, - .res = buffer_read.len, - .flags = 0, - }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 0), ring.cq_ready()); - - try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); -} - -test "write/read" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(2, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => 
return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - const path = "test_io_uring_write_read"; - const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true }); - defer file.close(); - const fd = file.handle; - - const buffer_write = [_]u8{97} ** 20; - var buffer_read = [_]u8{98} ** 20; - const sqe_write = try ring.write(0x11111111, fd, buffer_write[0..], 10); - try testing.expectEqual(linux.IORING_OP.WRITE, sqe_write.opcode); - try testing.expectEqual(@as(u64, 10), sqe_write.off); - sqe_write.flags |= linux.IOSQE_IO_LINK; - const sqe_read = try ring.read(0x22222222, fd, .{ .buffer = buffer_read[0..] }, 10); - try testing.expectEqual(linux.IORING_OP.READ, sqe_read.opcode); - try testing.expectEqual(@as(u64, 10), sqe_read.off); - try testing.expectEqual(@as(u32, 2), try ring.submit()); - - const cqe_write = try ring.copy_cqe(); - const cqe_read = try ring.copy_cqe(); - // Prior to Linux Kernel 5.6 this is the only way to test for read/write support: - // https://lwn.net/Articles/809820/ - if (cqe_write.err() == .INVAL) return error.SkipZigTest; - if (cqe_read.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x11111111, - .res = buffer_write.len, - .flags = 0, - }, cqe_write); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x22222222, - .res = buffer_read.len, - .flags = 0, - }, cqe_read); - try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); -} - -test "splice/read" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(4, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var tmp = std.testing.tmpDir(.{}); - const path_src = "test_io_uring_splice_src"; - const file_src = try tmp.dir.createFile(path_src, .{ 
.read = true, .truncate = true }); - defer file_src.close(); - const fd_src = file_src.handle; - - const path_dst = "test_io_uring_splice_dst"; - const file_dst = try tmp.dir.createFile(path_dst, .{ .read = true, .truncate = true }); - defer file_dst.close(); - const fd_dst = file_dst.handle; - - const buffer_write = [_]u8{97} ** 20; - var buffer_read = [_]u8{98} ** 20; - _ = try file_src.write(&buffer_write); - - const fds = try os.pipe(); - const pipe_offset: u64 = std.math.maxInt(u64); - - const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len); - try testing.expectEqual(linux.IORING_OP.SPLICE, sqe_splice_to_pipe.opcode); - try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr); - try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off); - sqe_splice_to_pipe.flags |= linux.IOSQE_IO_LINK; - - const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len); - try testing.expectEqual(linux.IORING_OP.SPLICE, sqe_splice_from_pipe.opcode); - try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr); - try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off); - sqe_splice_from_pipe.flags |= linux.IOSQE_IO_LINK; - - const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..] 
}, 10); - try testing.expectEqual(linux.IORING_OP.READ, sqe_read.opcode); - try testing.expectEqual(@as(u64, 10), sqe_read.off); - try testing.expectEqual(@as(u32, 3), try ring.submit()); - - const cqe_splice_to_pipe = try ring.copy_cqe(); - const cqe_splice_from_pipe = try ring.copy_cqe(); - const cqe_read = try ring.copy_cqe(); - // Prior to Linux Kernel 5.6 this is the only way to test for splice/read support: - // https://lwn.net/Articles/809820/ - if (cqe_splice_to_pipe.err() == .INVAL) return error.SkipZigTest; - if (cqe_splice_from_pipe.err() == .INVAL) return error.SkipZigTest; - if (cqe_read.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x11111111, - .res = buffer_write.len, - .flags = 0, - }, cqe_splice_to_pipe); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x22222222, - .res = buffer_write.len, - .flags = 0, - }, cqe_splice_from_pipe); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x33333333, - .res = buffer_read.len, - .flags = 0, - }, cqe_read); - try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); -} - -test "write_fixed/read_fixed" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(2, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - - const path = "test_io_uring_write_read_fixed"; - const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true }); - defer file.close(); - const fd = file.handle; - - var raw_buffers: [2][11]u8 = undefined; - // First buffer will be written to the file. 
- @memset(&raw_buffers[0], 'z'); - raw_buffers[0][0.."foobar".len].* = "foobar".*; - - var buffers = [2]os.iovec{ - .{ .iov_base = &raw_buffers[0], .iov_len = raw_buffers[0].len }, - .{ .iov_base = &raw_buffers[1], .iov_len = raw_buffers[1].len }, - }; - ring.register_buffers(&buffers) catch |err| switch (err) { - error.SystemResources => { - // See https://github.com/ziglang/zig/issues/15362 - return error.SkipZigTest; - }, - else => |e| return e, - }; - - const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0); - try testing.expectEqual(linux.IORING_OP.WRITE_FIXED, sqe_write.opcode); - try testing.expectEqual(@as(u64, 3), sqe_write.off); - sqe_write.flags |= linux.IOSQE_IO_LINK; - - const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1); - try testing.expectEqual(linux.IORING_OP.READ_FIXED, sqe_read.opcode); - try testing.expectEqual(@as(u64, 0), sqe_read.off); - - try testing.expectEqual(@as(u32, 2), try ring.submit()); - - const cqe_write = try ring.copy_cqe(); - const cqe_read = try ring.copy_cqe(); - - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x45454545, - .res = @as(i32, @intCast(buffers[0].iov_len)), - .flags = 0, - }, cqe_write); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x12121212, - .res = @as(i32, @intCast(buffers[1].iov_len)), - .flags = 0, - }, cqe_read); - - try testing.expectEqualSlices(u8, "\x00\x00\x00", buffers[1].iov_base[0..3]); - try testing.expectEqualSlices(u8, "foobar", buffers[1].iov_base[3..9]); - try testing.expectEqualSlices(u8, "zz", buffers[1].iov_base[9..11]); -} - -test "openat" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - - const path = "test_io_uring_openat"; - - // 
Workaround for LLVM bug: https://github.com/ziglang/zig/issues/12014 - const path_addr = if (builtin.zig_backend == .stage2_llvm) p: { - var workaround = path; - _ = &workaround; - break :p @intFromPtr(workaround); - } else @intFromPtr(path); - - const flags: linux.O = .{ .CLOEXEC = true, .ACCMODE = .RDWR, .CREAT = true }; - const mode: os.mode_t = 0o666; - const sqe_openat = try ring.openat(0x33333333, tmp.dir.fd, path, flags, mode); - try testing.expectEqual(linux.io_uring_sqe{ - .opcode = .OPENAT, - .flags = 0, - .ioprio = 0, - .fd = tmp.dir.fd, - .off = 0, - .addr = path_addr, - .len = mode, - .rw_flags = @bitCast(flags), - .user_data = 0x33333333, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }, sqe_openat.*); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe_openat = try ring.copy_cqe(); - try testing.expectEqual(@as(u64, 0x33333333), cqe_openat.user_data); - if (cqe_openat.err() == .INVAL) return error.SkipZigTest; - if (cqe_openat.err() == .BADF) return error.SkipZigTest; - if (cqe_openat.res <= 0) std.debug.print("\ncqe_openat.res={}\n", .{cqe_openat.res}); - try testing.expect(cqe_openat.res > 0); - try testing.expectEqual(@as(u32, 0), cqe_openat.flags); - - os.close(cqe_openat.res); -} - -test "close" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - - const path = "test_io_uring_close"; - const file = try tmp.dir.createFile(path, .{}); - errdefer file.close(); - - const sqe_close = try ring.close(0x44444444, file.handle); - try testing.expectEqual(linux.IORING_OP.CLOSE, sqe_close.opcode); - try testing.expectEqual(file.handle, sqe_close.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - 
const cqe_close = try ring.copy_cqe(); - if (cqe_close.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x44444444, - .res = 0, - .flags = 0, - }, cqe_close); -} - -test "accept/connect/send/recv" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(16, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const socket_test_harness = try createSocketTestHarness(&ring); - defer socket_test_harness.close(); - - const buffer_send = [_]u8{ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 }; - var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; - - const send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0); - send.flags |= linux.IOSQE_IO_LINK; - _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); - try testing.expectEqual(@as(u32, 2), try ring.submit()); - - const cqe_send = try ring.copy_cqe(); - if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xeeeeeeee, - .res = buffer_send.len, - .flags = 0, - }, cqe_send); - - const cqe_recv = try ring.copy_cqe(); - if (cqe_recv.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xffffffff, - .res = buffer_recv.len, - // ignore IORING_CQE_F_SOCK_NONEMPTY since it is only set on some systems - .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, - }, cqe_recv); - - try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); -} - -test "sendmsg/recvmsg" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(2, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer 
ring.deinit(); - - var address_server = try net.Address.parseIp4("127.0.0.1", 0); - - const server = try os.socket(address_server.any.family, os.SOCK.DGRAM, 0); - defer os.close(server); - try os.setsockopt(server, os.SOL.SOCKET, os.SO.REUSEPORT, &mem.toBytes(@as(c_int, 1))); - try os.setsockopt(server, os.SOL.SOCKET, os.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); - try os.bind(server, &address_server.any, address_server.getOsSockLen()); - - // set address_server to the OS-chosen IP/port. - var slen: os.socklen_t = address_server.getOsSockLen(); - try os.getsockname(server, &address_server.any, &slen); - - const client = try os.socket(address_server.any.family, os.SOCK.DGRAM, 0); - defer os.close(client); - - const buffer_send = [_]u8{42} ** 128; - const iovecs_send = [_]os.iovec_const{ - os.iovec_const{ .iov_base = &buffer_send, .iov_len = buffer_send.len }, - }; - const msg_send = os.msghdr_const{ - .name = &address_server.any, - .namelen = address_server.getOsSockLen(), - .iov = &iovecs_send, - .iovlen = 1, - .control = null, - .controllen = 0, - .flags = 0, - }; - const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, 0); - sqe_sendmsg.flags |= linux.IOSQE_IO_LINK; - try testing.expectEqual(linux.IORING_OP.SENDMSG, sqe_sendmsg.opcode); - try testing.expectEqual(client, sqe_sendmsg.fd); - - var buffer_recv = [_]u8{0} ** 128; - var iovecs_recv = [_]os.iovec{ - os.iovec{ .iov_base = &buffer_recv, .iov_len = buffer_recv.len }, - }; - const addr = [_]u8{0} ** 4; - var address_recv = net.Address.initIp4(addr, 0); - var msg_recv: os.msghdr = os.msghdr{ - .name = &address_recv.any, - .namelen = address_recv.getOsSockLen(), - .iov = &iovecs_recv, - .iovlen = 1, - .control = null, - .controllen = 0, - .flags = 0, - }; - const sqe_recvmsg = try ring.recvmsg(0x22222222, server, &msg_recv, 0); - try testing.expectEqual(linux.IORING_OP.RECVMSG, sqe_recvmsg.opcode); - try testing.expectEqual(server, sqe_recvmsg.fd); - - try testing.expectEqual(@as(u32, 2), 
ring.sq_ready()); - try testing.expectEqual(@as(u32, 2), try ring.submit_and_wait(2)); - try testing.expectEqual(@as(u32, 0), ring.sq_ready()); - try testing.expectEqual(@as(u32, 2), ring.cq_ready()); - - const cqe_sendmsg = try ring.copy_cqe(); - if (cqe_sendmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x11111111, - .res = buffer_send.len, - .flags = 0, - }, cqe_sendmsg); - - const cqe_recvmsg = try ring.copy_cqe(); - if (cqe_recvmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x22222222, - .res = buffer_recv.len, - // ignore IORING_CQE_F_SOCK_NONEMPTY since it is set non-deterministically - .flags = cqe_recvmsg.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, - }, cqe_recvmsg); - - try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); -} - -test "timeout (after a relative time)" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const ms = 10; - const margin = 5; - const ts = os.linux.kernel_timespec{ .tv_sec = 0, .tv_nsec = ms * 1000000 }; - - const started = std.time.milliTimestamp(); - const sqe = try ring.timeout(0x55555555, &ts, 0, 0); - try testing.expectEqual(linux.IORING_OP.TIMEOUT, sqe.opcode); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe = try ring.copy_cqe(); - const stopped = std.time.milliTimestamp(); - - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x55555555, - .res = -@as(i32, @intFromEnum(linux.E.TIME)), - .flags = 0, - }, cqe); - - // Tests should not depend on timings: skip test if outside margin. 
- if (!std.math.approxEqAbs(f64, ms, @as(f64, @floatFromInt(stopped - started)), margin)) return error.SkipZigTest; -} - -test "timeout (after a number of completions)" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(2, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const ts = os.linux.kernel_timespec{ .tv_sec = 3, .tv_nsec = 0 }; - const count_completions: u64 = 1; - const sqe_timeout = try ring.timeout(0x66666666, &ts, count_completions, 0); - try testing.expectEqual(linux.IORING_OP.TIMEOUT, sqe_timeout.opcode); - try testing.expectEqual(count_completions, sqe_timeout.off); - _ = try ring.nop(0x77777777); - try testing.expectEqual(@as(u32, 2), try ring.submit()); - - const cqe_nop = try ring.copy_cqe(); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x77777777, - .res = 0, - .flags = 0, - }, cqe_nop); - - const cqe_timeout = try ring.copy_cqe(); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x66666666, - .res = 0, - .flags = 0, - }, cqe_timeout); -} - -test "timeout_remove" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(2, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const ts = os.linux.kernel_timespec{ .tv_sec = 3, .tv_nsec = 0 }; - const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, 0); - try testing.expectEqual(linux.IORING_OP.TIMEOUT, sqe_timeout.opcode); - try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout.user_data); - - const sqe_timeout_remove = try ring.timeout_remove(0x99999999, 0x88888888, 0); - try testing.expectEqual(linux.IORING_OP.TIMEOUT_REMOVE, sqe_timeout_remove.opcode); - try testing.expectEqual(@as(u64, 0x88888888), 
sqe_timeout_remove.addr); - try testing.expectEqual(@as(u64, 0x99999999), sqe_timeout_remove.user_data); - - try testing.expectEqual(@as(u32, 2), try ring.submit()); - - // The order in which the CQE arrive is not clearly documented and it changed with kernel 5.18: - // * kernel 5.10 gives user data 0x88888888 first, 0x99999999 second - // * kernel 5.18 gives user data 0x99999999 first, 0x88888888 second - - var cqes: [2]os.linux.io_uring_cqe = undefined; - cqes[0] = try ring.copy_cqe(); - cqes[1] = try ring.copy_cqe(); - - for (cqes) |cqe| { - // IORING_OP_TIMEOUT_REMOVE is not supported by this kernel version: - // Timeout remove operations set the fd to -1, which results in EBADF before EINVAL. - // We use IORING_FEAT_RW_CUR_POS as a safety check here to make sure we are at least pre-5.6. - // We don't want to skip this test for newer kernels. - if (cqe.user_data == 0x99999999 and - cqe.err() == .BADF and - (ring.features & linux.IORING_FEAT_RW_CUR_POS) == 0) - { - return error.SkipZigTest; - } - - try testing.expect(cqe.user_data == 0x88888888 or cqe.user_data == 0x99999999); - - if (cqe.user_data == 0x88888888) { - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x88888888, - .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), - .flags = 0, - }, cqe); - } else if (cqe.user_data == 0x99999999) { - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x99999999, - .res = 0, - .flags = 0, - }, cqe); - } - } -} - -test "accept/connect/recv/link_timeout" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(16, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const socket_test_harness = try createSocketTestHarness(&ring); - defer socket_test_harness.close(); - - var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; - - const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, 
.{ .buffer = buffer_recv[0..] }, 0); - sqe_recv.flags |= linux.IOSQE_IO_LINK; - - const ts = os.linux.kernel_timespec{ .tv_sec = 0, .tv_nsec = 1000000 }; - _ = try ring.link_timeout(0x22222222, &ts, 0); - - const nr_wait = try ring.submit(); - try testing.expectEqual(@as(u32, 2), nr_wait); - - var i: usize = 0; - while (i < nr_wait) : (i += 1) { - const cqe = try ring.copy_cqe(); - switch (cqe.user_data) { - 0xffffffff => { - if (cqe.res != -@as(i32, @intFromEnum(linux.E.INTR)) and - cqe.res != -@as(i32, @intFromEnum(linux.E.CANCELED))) - { - std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res }); - try testing.expect(false); - } - }, - 0x22222222 => { - if (cqe.res != -@as(i32, @intFromEnum(linux.E.ALREADY)) and - cqe.res != -@as(i32, @intFromEnum(linux.E.TIME))) - { - std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res }); - try testing.expect(false); - } - }, - else => @panic("should not happen"), - } - } -} - -test "fallocate" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - - const path = "test_io_uring_fallocate"; - const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); - defer file.close(); - - try testing.expectEqual(@as(u64, 0), (try file.stat()).size); - - const len: u64 = 65536; - const sqe = try ring.fallocate(0xaaaaaaaa, file.handle, 0, 0, len); - try testing.expectEqual(linux.IORING_OP.FALLOCATE, sqe.opcode); - try testing.expectEqual(file.handle, sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement fallocate(): - .INVAL => return error.SkipZigTest, - // This kernel does 
not implement fallocate(): - .NOSYS => return error.SkipZigTest, - // The filesystem containing the file referred to by fd does not support this operation; - // or the mode is not supported by the filesystem containing the file referred to by fd: - .OPNOTSUPP => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xaaaaaaaa, - .res = 0, - .flags = 0, - }, cqe); - - try testing.expectEqual(len, (try file.stat()).size); -} - -test "statx" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - const path = "test_io_uring_statx"; - const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); - defer file.close(); - - try testing.expectEqual(@as(u64, 0), (try file.stat()).size); - - try file.writeAll("foobar"); - - var buf: linux.Statx = undefined; - const sqe = try ring.statx( - 0xaaaaaaaa, - tmp.dir.fd, - path, - 0, - linux.STATX_SIZE, - &buf, - ); - try testing.expectEqual(linux.IORING_OP.STATX, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement statx(): - .INVAL => return error.SkipZigTest, - // This kernel does not implement statx(): - .NOSYS => return error.SkipZigTest, - // The filesystem containing the file referred to by fd does not support this operation; - // or the mode is not supported by the filesystem containing the file referred to by fd: - .OPNOTSUPP => return error.SkipZigTest, - // not supported on older kernels (5.4) - .BADF => return 
error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xaaaaaaaa, - .res = 0, - .flags = 0, - }, cqe); - - try testing.expect(buf.mask & os.linux.STATX_SIZE == os.linux.STATX_SIZE); - try testing.expectEqual(@as(u64, 6), buf.size); -} - -test "accept/connect/recv/cancel" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(16, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const socket_test_harness = try createSocketTestHarness(&ring); - defer socket_test_harness.close(); - - var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; - - _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, 0); - try testing.expectEqual(linux.IORING_OP.ASYNC_CANCEL, sqe_cancel.opcode); - try testing.expectEqual(@as(u64, 0xffffffff), sqe_cancel.addr); - try testing.expectEqual(@as(u64, 0x99999999), sqe_cancel.user_data); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - var cqe_recv = try ring.copy_cqe(); - if (cqe_recv.err() == .INVAL) return error.SkipZigTest; - var cqe_cancel = try ring.copy_cqe(); - if (cqe_cancel.err() == .INVAL) return error.SkipZigTest; - - // The recv/cancel CQEs may arrive in any order, the recv CQE will sometimes come first: - if (cqe_recv.user_data == 0x99999999 and cqe_cancel.user_data == 0xffffffff) { - const a = cqe_recv; - const b = cqe_cancel; - cqe_recv = b; - cqe_cancel = a; - } - - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xffffffff, - .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), - .flags = 0, - }, cqe_recv); - - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x99999999, - .res = 0, - 
.flags = 0, - }, cqe_cancel); -} - -test "register_files_update" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const fd = try os.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); - defer os.close(fd); - - var registered_fds = [_]os.fd_t{0} ** 2; - const fd_index = 0; - const fd_index2 = 1; - registered_fds[fd_index] = fd; - registered_fds[fd_index2] = -1; - - ring.register_files(registered_fds[0..]) catch |err| switch (err) { - // Happens when the kernel doesn't support sparse entry (-1) in the file descriptors array. - error.FileDescriptorInvalid => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - }; - - // Test IORING_REGISTER_FILES_UPDATE - // Only available since Linux 5.5 - - const fd2 = try os.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); - defer os.close(fd2); - - registered_fds[fd_index] = fd2; - registered_fds[fd_index2] = -1; - try ring.register_files_update(0, registered_fds[0..]); - - var buffer = [_]u8{42} ** 128; - { - const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); - sqe.flags |= linux.IOSQE_FIXED_FILE; - - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xcccccccc, - .res = buffer.len, - .flags = 0, - }, try ring.copy_cqe()); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); - } - - // Test with a non-zero offset - - registered_fds[fd_index] = -1; - registered_fds[fd_index2] = -1; - try ring.register_files_update(1, registered_fds[1..]); - - { - // Next read should still work since fd_index in the registered file descriptors hasn't been updated yet. 
- const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); - sqe.flags |= linux.IOSQE_FIXED_FILE; - - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xcccccccc, - .res = buffer.len, - .flags = 0, - }, try ring.copy_cqe()); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); - } - - try ring.register_files_update(0, registered_fds[0..]); - - { - // Now this should fail since both fds are sparse (-1) - const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); - sqe.flags |= linux.IOSQE_FIXED_FILE; - - try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe = try ring.copy_cqe(); - try testing.expectEqual(os.linux.E.BADF, cqe.err()); - } - - try ring.unregister_files(); -} - -test "shutdown" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(16, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var address = try net.Address.parseIp4("127.0.0.1", 0); - - // Socket bound, expect shutdown to work - { - const server = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); - defer os.close(server); - try os.setsockopt(server, os.SOL.SOCKET, os.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); - try os.bind(server, &address.any, address.getOsSockLen()); - try os.listen(server, 1); - - // set address to the OS-chosen IP/port. 
- var slen: os.socklen_t = address.getOsSockLen(); - try os.getsockname(server, &address.any, &slen); - - const shutdown_sqe = try ring.shutdown(0x445445445, server, os.linux.SHUT.RD); - try testing.expectEqual(linux.IORING_OP.SHUTDOWN, shutdown_sqe.opcode); - try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); - - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement shutdown (kernel version < 5.11) - .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x445445445, - .res = 0, - .flags = 0, - }, cqe); - } - - // Socket not bound, expect to fail with ENOTCONN - { - const server = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); - defer os.close(server); - - const shutdown_sqe = ring.shutdown(0x445445445, server, os.linux.SHUT.RD) catch |err| switch (err) { - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - }; - try testing.expectEqual(linux.IORING_OP.SHUTDOWN, shutdown_sqe.opcode); - try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); - - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - try testing.expectEqual(@as(u64, 0x445445445), cqe.user_data); - try testing.expectEqual(os.linux.E.NOTCONN, cqe.err()); - } -} - -test "renameat" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const old_path = "test_io_uring_renameat_old"; - const new_path = "test_io_uring_renameat_new"; - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - - // Write old file with data - - const old_file = 
try tmp.dir.createFile(old_path, .{ .truncate = true, .mode = 0o666 }); - defer old_file.close(); - try old_file.writeAll("hello"); - - // Submit renameat - - const sqe = try ring.renameat( - 0x12121212, - tmp.dir.fd, - old_path, - tmp.dir.fd, - new_path, - 0, - ); - try testing.expectEqual(linux.IORING_OP.RENAMEAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement renameat (kernel version < 5.11) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = 0, - }, cqe); - - // Validate that the old file doesn't exist anymore - { - _ = tmp.dir.openFile(old_path, .{}) catch |err| switch (err) { - error.FileNotFound => {}, - else => std.debug.panic("unexpected error: {}", .{err}), - }; - } - - // Validate that the new file exists with the proper content - { - const new_file = try tmp.dir.openFile(new_path, .{}); - defer new_file.close(); - - var new_file_data: [16]u8 = undefined; - const read = try new_file.readAll(&new_file_data); - try testing.expectEqualStrings("hello", new_file_data[0..read]); - } -} - -test "unlinkat" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const path = "test_io_uring_unlinkat"; - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - - // Write old file with data - - const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); - defer file.close(); - 
- // Submit unlinkat - - const sqe = try ring.unlinkat( - 0x12121212, - tmp.dir.fd, - path, - 0, - ); - try testing.expectEqual(linux.IORING_OP.UNLINKAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement unlinkat (kernel version < 5.11) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = 0, - }, cqe); - - // Validate that the file doesn't exist anymore - _ = tmp.dir.openFile(path, .{}) catch |err| switch (err) { - error.FileNotFound => {}, - else => std.debug.panic("unexpected error: {}", .{err}), - }; -} - -test "mkdirat" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - - const path = "test_io_uring_mkdirat"; - - // Submit mkdirat - - const sqe = try ring.mkdirat( - 0x12121212, - tmp.dir.fd, - path, - 0o0755, - ); - try testing.expectEqual(linux.IORING_OP.MKDIRAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement mkdirat (kernel version < 5.15) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = 0, - }, cqe); - - // Validate that the directory exist - _ 
= try tmp.dir.openDir(path, .{}); -} - -test "symlinkat" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - - const path = "test_io_uring_symlinkat"; - const link_path = "test_io_uring_symlinkat_link"; - - const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); - defer file.close(); - - // Submit symlinkat - - const sqe = try ring.symlinkat( - 0x12121212, - path, - tmp.dir.fd, - link_path, - ); - try testing.expectEqual(linux.IORING_OP.SYMLINKAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement symlinkat (kernel version < 5.15) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = 0, - }, cqe); - - // Validate that the symlink exist - _ = try tmp.dir.openFile(link_path, .{}); -} - -test "linkat" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - - const first_path = "test_io_uring_linkat_first"; - const second_path = "test_io_uring_linkat_second"; - - // Write file with data - - const first_file = try tmp.dir.createFile(first_path, .{ .truncate = true, .mode = 0o666 }); - defer first_file.close(); - try 
first_file.writeAll("hello"); - - // Submit linkat - - const sqe = try ring.linkat( - 0x12121212, - tmp.dir.fd, - first_path, - tmp.dir.fd, - second_path, - 0, - ); - try testing.expectEqual(linux.IORING_OP.LINKAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement linkat (kernel version < 5.15) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = 0, - }, cqe); - - // Validate the second file - const second_file = try tmp.dir.openFile(second_path, .{}); - defer second_file.close(); - - var second_file_data: [16]u8 = undefined; - const read = try second_file.readAll(&second_file_data); - try testing.expectEqualStrings("hello", second_file_data[0..read]); -} - -test "provide_buffers: read" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const fd = try os.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); - defer os.close(fd); - - const group_id = 1337; - const buffer_id = 0; - - const buffer_len = 128; - - var buffers: [4][buffer_len]u8 = undefined; - - // Provide 4 buffers - - { - const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(linux.IORING_OP.PROVIDE_BUFFERS, sqe.opcode); - try testing.expectEqual(@as(i32, buffers.len), sqe.fd); - try testing.expectEqual(@as(u32, buffers[0].len), 
sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Happens when the kernel is < 5.7 - .INVAL => return error.SkipZigTest, - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); - } - - // Do 4 reads which should consume all buffers - - var i: usize = 0; - while (i < buffers.len) : (i += 1) { - const sqe = try ring.read(0xdededede, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); - const used_buffer_id = cqe.flags >> 16; - try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - - try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); - } - - // This read should fail - - { - const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try 
testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Expected - .NOBUFS => {}, - .SUCCESS => std.debug.panic("unexpected success", .{}), - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - } - - // Provide 1 buffer again - - // Deliberately put something we don't expect in the buffers - @memset(mem.sliceAsBytes(&buffers), 42); - - const reprovided_buffer_id = 2; - - { - _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - } - - // Final read which should work - - { - const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); - const used_buffer_id = cqe.flags >> 16; - try testing.expectEqual(used_buffer_id, reprovided_buffer_id); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), 
buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); - } -} - -test "remove_buffers" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const fd = try os.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); - defer os.close(fd); - - const group_id = 1337; - const buffer_id = 0; - - const buffer_len = 128; - - var buffers: [4][buffer_len]u8 = undefined; - - // Provide 4 buffers - - { - _ = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .INVAL => return error.SkipZigTest, - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); - } - - // Remove 3 buffers - - { - const sqe = try ring.remove_buffers(0xbababababa, 3, group_id); - try testing.expectEqual(linux.IORING_OP.REMOVE_BUFFERS, sqe.opcode); - try testing.expectEqual(@as(i32, 3), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(@as(u64, 0xbababababa), cqe.user_data); - } - - // This read should work - - { - _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| 
std.debug.panic("unhandled errno: {}", .{errno}), - } - - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); - const used_buffer_id = cqe.flags >> 16; - try testing.expect(used_buffer_id >= 0 and used_buffer_id < 4); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); - } - - // Final read should _not_ work - - { - _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Expected - .NOBUFS => {}, - .SUCCESS => std.debug.panic("unexpected success", .{}), - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - } -} - -test "provide_buffers: accept/connect/send/recv" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(16, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const group_id = 1337; - const buffer_id = 0; - - const buffer_len = 128; - var buffers: [4][buffer_len]u8 = undefined; - - // Provide 4 buffers - - { - const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(linux.IORING_OP.PROVIDE_BUFFERS, sqe.opcode); - try testing.expectEqual(@as(i32, buffers.len), sqe.fd); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Happens when the kernel is < 5.7 - .INVAL => return 
error.SkipZigTest, - // Happens on the kernel 5.4 - .BADF => return error.SkipZigTest, - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); - } - - const socket_test_harness = try createSocketTestHarness(&ring); - defer socket_test_harness.close(); - - // Do 4 send on the socket - - { - var i: usize = 0; - while (i < buffers.len) : (i += 1) { - _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'z'} ** buffer_len), 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - } - - var cqes: [4]linux.io_uring_cqe = undefined; - try testing.expectEqual(@as(u32, 4), try ring.copy_cqes(&cqes, 4)); - } - - // Do 4 recv which should consume all buffers - - // Deliberately put something we don't expect in the buffers - @memset(mem.sliceAsBytes(&buffers), 1); - - var i: usize = 0; - while (i < buffers.len) : (i += 1) { - const sqe = try ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.RECV, sqe.opcode); - try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); - const used_buffer_id = cqe.flags >> 16; - try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - - try 
testing.expectEqual(@as(u64, 0xdededede), cqe.user_data); - const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]; - try testing.expectEqualSlices(u8, &([_]u8{'z'} ** buffer_len), buffer); - } - - // This recv should fail - - { - const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.RECV, sqe.opcode); - try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Expected - .NOBUFS => {}, - .SUCCESS => std.debug.panic("unexpected success", .{}), - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - } - - // Provide 1 buffer again - - const reprovided_buffer_id = 2; - - { - _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - } - - // Redo 1 send on the server socket - - { - _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'w'} ** buffer_len), 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - _ = try ring.copy_cqe(); - } - - // Final recv which should work - - // Deliberately put something we don't expect in the buffers - @memset(mem.sliceAsBytes(&buffers), 1); - - { - const sqe = try 
ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.RECV, sqe.opcode); - try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); - const used_buffer_id = cqe.flags >> 16; - try testing.expectEqual(used_buffer_id, reprovided_buffer_id); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]; - try testing.expectEqualSlices(u8, &([_]u8{'w'} ** buffer_len), buffer); - } -} - -/// Used for testing server/client interactions. 
-const SocketTestHarness = struct { - listener: os.socket_t, - server: os.socket_t, - client: os.socket_t, - - fn close(self: SocketTestHarness) void { - posix.close(self.client); - posix.close(self.listener); - } -}; - -fn createSocketTestHarness(ring: *IO_Uring) !SocketTestHarness { - // Create a TCP server socket - var address = try net.Address.parseIp4("127.0.0.1", 0); - const listener_socket = try createListenerSocket(&address); - errdefer posix.close(listener_socket); - - // Submit 1 accept - var accept_addr: os.sockaddr = undefined; - var accept_addr_len: os.socklen_t = @sizeOf(@TypeOf(accept_addr)); - _ = try ring.accept(0xaaaaaaaa, listener_socket, &accept_addr, &accept_addr_len, 0); - - // Create a TCP client socket - const client = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); - errdefer posix.close(client); - _ = try ring.connect(0xcccccccc, client, &address.any, address.getOsSockLen()); - - try testing.expectEqual(@as(u32, 2), try ring.submit()); - - var cqe_accept = try ring.copy_cqe(); - if (cqe_accept.err() == .INVAL) return error.SkipZigTest; - var cqe_connect = try ring.copy_cqe(); - if (cqe_connect.err() == .INVAL) return error.SkipZigTest; - - // The accept/connect CQEs may arrive in any order, the connect CQE will sometimes come first: - if (cqe_accept.user_data == 0xcccccccc and cqe_connect.user_data == 0xaaaaaaaa) { - const a = cqe_accept; - const b = cqe_connect; - cqe_accept = b; - cqe_connect = a; - } - - try testing.expectEqual(@as(u64, 0xaaaaaaaa), cqe_accept.user_data); - if (cqe_accept.res <= 0) std.debug.print("\ncqe_accept.res={}\n", .{cqe_accept.res}); - try testing.expect(cqe_accept.res > 0); - try testing.expectEqual(@as(u32, 0), cqe_accept.flags); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xcccccccc, - .res = 0, - .flags = 0, - }, cqe_connect); - - // All good - - return SocketTestHarness{ - .listener = listener_socket, - .server = cqe_accept.res, - .client = client, - }; -} - -fn 
createListenerSocket(address: *net.Address) !os.socket_t { - const kernel_backlog = 1; - const listener_socket = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); - errdefer posix.close(listener_socket); - - try os.setsockopt(listener_socket, os.SOL.SOCKET, os.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); - try os.bind(listener_socket, &address.any, address.getOsSockLen()); - try os.listen(listener_socket, kernel_backlog); - - // set address to the OS-chosen IP/port. - var slen: os.socklen_t = address.getOsSockLen(); - try os.getsockname(listener_socket, &address.any, &slen); - - return listener_socket; -} - -test "accept multishot" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var ring = IO_Uring.init(16, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var address = try net.Address.parseIp4("127.0.0.1", 0); - const listener_socket = try createListenerSocket(&address); - defer posix.close(listener_socket); - - // submit multishot accept operation - var addr: os.sockaddr = undefined; - var addr_len: os.socklen_t = @sizeOf(@TypeOf(addr)); - const userdata: u64 = 0xaaaaaaaa; - _ = try ring.accept_multishot(userdata, listener_socket, &addr, &addr_len, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - var nr: usize = 4; // number of clients to connect - while (nr > 0) : (nr -= 1) { - // connect client - const client = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); - errdefer posix.close(client); - try os.connect(client, &address.any, address.getOsSockLen()); - - // test accept completion - var cqe = try ring.copy_cqe(); - if (cqe.err() == .INVAL) return error.SkipZigTest; - try testing.expect(cqe.res > 0); - try testing.expect(cqe.user_data == userdata); - try testing.expect(cqe.flags & linux.IORING_CQE_F_MORE > 0); // more flag is set - - 
posix.close(client); - } -} - -test "accept/connect/send_zc/recv" { - try skipKernelLessThan(.{ .major = 6, .minor = 0, .patch = 0 }); - - var ring = IO_Uring.init(16, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const socket_test_harness = try createSocketTestHarness(&ring); - defer socket_test_harness.close(); - - const buffer_send = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; - var buffer_recv = [_]u8{0} ** 10; - - // zero-copy send - const send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0, 0); - send.flags |= linux.IOSQE_IO_LINK; - _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); - try testing.expectEqual(@as(u32, 2), try ring.submit()); - - // First completion of zero-copy send. - // IORING_CQE_F_MORE, means that there - // will be a second completion event / notification for the - // request, with the user_data field set to the same value. - // buffer_send must be keep alive until second cqe. - var cqe_send = try ring.copy_cqe(); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xeeeeeeee, - .res = buffer_send.len, - .flags = linux.IORING_CQE_F_MORE, - }, cqe_send); - - const cqe_recv = try ring.copy_cqe(); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xffffffff, - .res = buffer_recv.len, - .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, - }, cqe_recv); - - try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); - - // Second completion of zero-copy send. 
- // IORING_CQE_F_NOTIF in flags signals that kernel is done with send_buffer - cqe_send = try ring.copy_cqe(); - try testing.expectEqual(linux.io_uring_cqe{ - .user_data = 0xeeeeeeee, - .res = 0, - .flags = linux.IORING_CQE_F_NOTIF, - }, cqe_send); -} - -test "accept_direct" { - try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - var address = try net.Address.parseIp4("127.0.0.1", 0); - - // register direct file descriptors - var registered_fds = [_]os.fd_t{-1} ** 2; - try ring.register_files(registered_fds[0..]); - - const listener_socket = try createListenerSocket(&address); - defer posix.close(listener_socket); - - const accept_userdata: u64 = 0xaaaaaaaa; - const read_userdata: u64 = 0xbbbbbbbb; - const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; - - for (0..2) |_| { - for (registered_fds, 0..) |_, i| { - var buffer_recv = [_]u8{0} ** 16; - const buffer_send: []const u8 = data[0 .. 
data.len - i]; // make it different at each loop - - // submit accept, will chose registered fd and return index in cqe - _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - // connect - const client = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); - try os.connect(client, &address.any, address.getOsSockLen()); - defer posix.close(client); - - // accept completion - const cqe_accept = try ring.copy_cqe(); - try testing.expectEqual(os.E.SUCCESS, cqe_accept.err()); - const fd_index = cqe_accept.res; - try testing.expect(fd_index < registered_fds.len); - try testing.expect(cqe_accept.user_data == accept_userdata); - - // send data - _ = try os.send(client, buffer_send, 0); - - // Example of how to use registered fd: - // Submit receive to fixed file returned by accept (fd_index). - // Fd field is set to registered file index, returned by accept. - // Flag linux.IOSQE_FIXED_FILE must be set. 
- const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, 0); - recv_sqe.flags |= linux.IOSQE_FIXED_FILE; - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - // accept receive - const recv_cqe = try ring.copy_cqe(); - try testing.expect(recv_cqe.user_data == read_userdata); - try testing.expect(recv_cqe.res == buffer_send.len); - try testing.expectEqualSlices(u8, buffer_send, buffer_recv[0..buffer_send.len]); - } - // no more available fds, accept will get NFILE error - { - // submit accept - _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - // connect - const client = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); - try os.connect(client, &address.any, address.getOsSockLen()); - defer posix.close(client); - // completion with error - const cqe_accept = try ring.copy_cqe(); - try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expectEqual(os.E.NFILE, cqe_accept.err()); - } - // return file descriptors to kernel - try ring.register_files_update(0, registered_fds[0..]); - } - try ring.unregister_files(); -} - -test "accept_multishot_direct" { - try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var address = try net.Address.parseIp4("127.0.0.1", 0); - - var registered_fds = [_]os.fd_t{-1} ** 2; - try ring.register_files(registered_fds[0..]); - - const listener_socket = try createListenerSocket(&address); - defer posix.close(listener_socket); - - const accept_userdata: u64 = 0xaaaaaaaa; - - for (0..2) |_| { - // submit multishot accept - // Will chose registered fd and return index of the selected registered file in cqe. 
- _ = try ring.accept_multishot_direct(accept_userdata, listener_socket, null, null, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - for (registered_fds) |_| { - // connect - const client = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); - try os.connect(client, &address.any, address.getOsSockLen()); - defer posix.close(client); - - // accept completion - const cqe_accept = try ring.copy_cqe(); - const fd_index = cqe_accept.res; - try testing.expect(fd_index < registered_fds.len); - try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expect(cqe_accept.flags & linux.IORING_CQE_F_MORE > 0); // has more is set - } - // No more available fds, accept will get NFILE error. - // Multishot is terminated (more flag is not set). - { - // connect - const client = try os.socket(address.any.family, os.SOCK.STREAM | os.SOCK.CLOEXEC, 0); - try os.connect(client, &address.any, address.getOsSockLen()); - defer posix.close(client); - // completion with error - const cqe_accept = try ring.copy_cqe(); - try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expectEqual(os.E.NFILE, cqe_accept.err()); - try testing.expect(cqe_accept.flags & linux.IORING_CQE_F_MORE == 0); // has more is not set - } - // return file descriptors to kernel - try ring.register_files_update(0, registered_fds[0..]); - } - try ring.unregister_files(); -} - -test "socket" { - try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - - var ring = IO_Uring.init(1, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - // prepare, submit socket operation - _ = try ring.socket(0, linux.AF.INET, os.SOCK.STREAM, 0, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - // test completion - var cqe = try ring.copy_cqe(); - try testing.expectEqual(os.E.SUCCESS, cqe.err()); - const 
fd: os.fd_t = @intCast(cqe.res); - try testing.expect(fd > 2); - - os.close(fd); -} - -test "socket_direct/socket_direct_alloc/close_direct" { - try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - - var ring = IO_Uring.init(2, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var registered_fds = [_]os.fd_t{-1} ** 3; - try ring.register_files(registered_fds[0..]); - - // create socket in registered file descriptor at index 0 (last param) - _ = try ring.socket_direct(0, linux.AF.INET, os.SOCK.STREAM, 0, 0, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - var cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(os.E.SUCCESS, cqe_socket.err()); - try testing.expect(cqe_socket.res == 0); - - // create socket in registered file descriptor at index 1 (last param) - _ = try ring.socket_direct(0, linux.AF.INET, os.SOCK.STREAM, 0, 0, 1); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(os.E.SUCCESS, cqe_socket.err()); - try testing.expect(cqe_socket.res == 0); // res is 0 when index is specified - - // create socket in kernel chosen file descriptor index (_alloc version) - // completion res has index from registered files - _ = try ring.socket_direct_alloc(0, linux.AF.INET, os.SOCK.STREAM, 0, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(os.E.SUCCESS, cqe_socket.err()); - try testing.expect(cqe_socket.res == 2); // returns registered file index - - // use sockets from registered_fds in connect operation - var address = try net.Address.parseIp4("127.0.0.1", 0); - const listener_socket = try createListenerSocket(&address); - defer posix.close(listener_socket); - const accept_userdata: u64 = 0xaaaaaaaa; - const connect_userdata: u64 = 0xbbbbbbbb; - 
const close_userdata: u64 = 0xcccccccc; - for (registered_fds, 0..) |_, fd_index| { - // prepare accept - _ = try ring.accept(accept_userdata, listener_socket, null, null, 0); - // prepare connect with fixed socket - const connect_sqe = try ring.connect(connect_userdata, @intCast(fd_index), &address.any, address.getOsSockLen()); - connect_sqe.flags |= linux.IOSQE_FIXED_FILE; // fd is fixed file index - // submit both - try testing.expectEqual(@as(u32, 2), try ring.submit()); - // get completions - var cqe_connect = try ring.copy_cqe(); - var cqe_accept = try ring.copy_cqe(); - // ignore order - if (cqe_connect.user_data == accept_userdata and cqe_accept.user_data == connect_userdata) { - const a = cqe_accept; - const b = cqe_connect; - cqe_accept = b; - cqe_connect = a; - } - // test connect completion - try testing.expect(cqe_connect.user_data == connect_userdata); - try testing.expectEqual(os.E.SUCCESS, cqe_connect.err()); - // test accept completion - try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expectEqual(os.E.SUCCESS, cqe_accept.err()); - - // submit and test close_direct - _ = try ring.close_direct(close_userdata, @intCast(fd_index)); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - var cqe_close = try ring.copy_cqe(); - try testing.expect(cqe_close.user_data == close_userdata); - try testing.expectEqual(os.E.SUCCESS, cqe_close.err()); - } - - try ring.unregister_files(); -} - -test "openat_direct/close_direct" { - try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - - var ring = IO_Uring.init(2, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var registered_fds = [_]os.fd_t{-1} ** 3; - try ring.register_files(registered_fds[0..]); - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - const path = "test_io_uring_close_direct"; - const flags: linux.O = 
.{ .ACCMODE = .RDWR, .CREAT = true }; - const mode: os.mode_t = 0o666; - const user_data: u64 = 0; - - // use registered file at index 0 (last param) - _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - var cqe = try ring.copy_cqe(); - try testing.expectEqual(os.E.SUCCESS, cqe.err()); - try testing.expect(cqe.res == 0); - - // use registered file at index 1 - _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 1); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - cqe = try ring.copy_cqe(); - try testing.expectEqual(os.E.SUCCESS, cqe.err()); - try testing.expect(cqe.res == 0); // res is 0 when we specify index - - // let kernel choose registered file index - _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, linux.IORING_FILE_INDEX_ALLOC); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - cqe = try ring.copy_cqe(); - try testing.expectEqual(os.E.SUCCESS, cqe.err()); - try testing.expect(cqe.res == 2); // chosen index is in res - - // close all open file descriptors - for (registered_fds, 0..) 
|_, fd_index| { - _ = try ring.close_direct(user_data, @intCast(fd_index)); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - var cqe_close = try ring.copy_cqe(); - try testing.expectEqual(os.E.SUCCESS, cqe_close.err()); - } - try ring.unregister_files(); -} - -test "waitid" { - try skipKernelLessThan(.{ .major = 6, .minor = 7, .patch = 0 }); - - var ring = IO_Uring.init(16, 0) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const pid = try os.fork(); - if (pid == 0) { - os.exit(7); - } - - var siginfo: os.siginfo_t = undefined; - _ = try ring.waitid(0, .PID, pid, &siginfo, os.W.EXITED, 0); - - try testing.expectEqual(1, try ring.submit()); - - const cqe_waitid = try ring.copy_cqe(); - try testing.expectEqual(0, cqe_waitid.res); - try testing.expectEqual(pid, siginfo.fields.common.first.piduid.pid); - try testing.expectEqual(7, siginfo.fields.common.second.sigchld.status); -} - -/// For use in tests. Returns SkipZigTest is kernel version is less than required. -inline fn skipKernelLessThan(required: std.SemanticVersion) !void { - if (builtin.os.tag != .linux) return error.SkipZigTest; - - var uts: linux.utsname = undefined; - const res = linux.uname(&uts); - switch (linux.getErrno(res)) { - .SUCCESS => {}, - else => |errno| return os.unexpectedErrno(errno), - } - - const release = mem.sliceTo(&uts.release, 0); - var current = try std.SemanticVersion.parse(release); - current.pre = null; // don't check pre field - if (required.order(current) == .gt) return error.SkipZigTest; -} diff --git a/lib/std/os/linux/io_uring_sqe.zig b/lib/std/os/linux/io_uring_sqe.zig @@ -0,0 +1,579 @@ +//! Contains only the definition of `io_uring_sqe`. +//! Split into its own file to compartmentalize the initialization methods. 
+ +const std = @import("../../std.zig"); +const os = std.os; +const linux = os.linux; + +pub const io_uring_sqe = extern struct { + opcode: linux.IORING_OP, + flags: u8, + ioprio: u16, + fd: i32, + off: u64, + addr: u64, + len: u32, + rw_flags: u32, + user_data: u64, + buf_index: u16, + personality: u16, + splice_fd_in: i32, + addr3: u64, + resv: u64, + + pub fn prep_nop(sqe: *linux.io_uring_sqe) void { + sqe.* = .{ + .opcode = .NOP, + .flags = 0, + .ioprio = 0, + .fd = 0, + .off = 0, + .addr = 0, + .len = 0, + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } + + pub fn prep_fsync(sqe: *linux.io_uring_sqe, fd: os.fd_t, flags: u32) void { + sqe.* = .{ + .opcode = .FSYNC, + .flags = 0, + .ioprio = 0, + .fd = fd, + .off = 0, + .addr = 0, + .len = 0, + .rw_flags = flags, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } + + pub fn prep_rw( + sqe: *linux.io_uring_sqe, + op: linux.IORING_OP, + fd: os.fd_t, + addr: u64, + len: usize, + offset: u64, + ) void { + sqe.* = .{ + .opcode = op, + .flags = 0, + .ioprio = 0, + .fd = fd, + .off = offset, + .addr = addr, + .len = @intCast(len), + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } + + pub fn prep_read(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: []u8, offset: u64) void { + sqe.prep_rw(.READ, fd, @intFromPtr(buffer.ptr), buffer.len, offset); + } + + pub fn prep_write(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: []const u8, offset: u64) void { + sqe.prep_rw(.WRITE, fd, @intFromPtr(buffer.ptr), buffer.len, offset); + } + + pub fn prep_splice(sqe: *linux.io_uring_sqe, fd_in: os.fd_t, off_in: u64, fd_out: os.fd_t, off_out: u64, len: usize) void { + sqe.prep_rw(.SPLICE, fd_out, undefined, len, off_out); + sqe.addr = off_in; + sqe.splice_fd_in = fd_in; + } + + pub fn prep_readv( + sqe: 
*linux.io_uring_sqe, + fd: os.fd_t, + iovecs: []const os.iovec, + offset: u64, + ) void { + sqe.prep_rw(.READV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); + } + + pub fn prep_writev( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + iovecs: []const os.iovec_const, + offset: u64, + ) void { + sqe.prep_rw(.WRITEV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); + } + + pub fn prep_read_fixed(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: *os.iovec, offset: u64, buffer_index: u16) void { + sqe.prep_rw(.READ_FIXED, fd, @intFromPtr(buffer.iov_base), buffer.iov_len, offset); + sqe.buf_index = buffer_index; + } + + pub fn prep_write_fixed(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: *os.iovec, offset: u64, buffer_index: u16) void { + sqe.prep_rw(.WRITE_FIXED, fd, @intFromPtr(buffer.iov_base), buffer.iov_len, offset); + sqe.buf_index = buffer_index; + } + + pub fn prep_accept( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + addr: ?*os.sockaddr, + addrlen: ?*os.socklen_t, + flags: u32, + ) void { + // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to socklen_t`. + // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). 
+ sqe.prep_rw(.ACCEPT, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen)); + sqe.rw_flags = flags; + } + + pub fn prep_accept_direct( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + addr: ?*os.sockaddr, + addrlen: ?*os.socklen_t, + flags: u32, + file_index: u32, + ) void { + prep_accept(sqe, fd, addr, addrlen, flags); + __io_uring_set_target_fixed_file(sqe, file_index); + } + + pub fn prep_multishot_accept_direct( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + addr: ?*os.sockaddr, + addrlen: ?*os.socklen_t, + flags: u32, + ) void { + prep_multishot_accept(sqe, fd, addr, addrlen, flags); + __io_uring_set_target_fixed_file(sqe, linux.IORING_FILE_INDEX_ALLOC); + } + + fn __io_uring_set_target_fixed_file(sqe: *linux.io_uring_sqe, file_index: u32) void { + const sqe_file_index: u32 = if (file_index == linux.IORING_FILE_INDEX_ALLOC) + linux.IORING_FILE_INDEX_ALLOC + else + // 0 means no fixed files, indexes should be encoded as "index + 1" + file_index + 1; + // This filed is overloaded in liburing: + // splice_fd_in: i32 + // sqe_file_index: u32 + sqe.splice_fd_in = @bitCast(sqe_file_index); + } + + pub fn prep_connect( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + addr: *const os.sockaddr, + addrlen: os.socklen_t, + ) void { + // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). 
+ sqe.prep_rw(.CONNECT, fd, @intFromPtr(addr), 0, addrlen); + } + + pub fn prep_epoll_ctl( + sqe: *linux.io_uring_sqe, + epfd: os.fd_t, + fd: os.fd_t, + op: u32, + ev: ?*linux.epoll_event, + ) void { + sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), op, @intCast(fd)); + } + + pub fn prep_recv(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: []u8, flags: u32) void { + sqe.prep_rw(.RECV, fd, @intFromPtr(buffer.ptr), buffer.len, 0); + sqe.rw_flags = flags; + } + + pub fn prep_send(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: []const u8, flags: u32) void { + sqe.prep_rw(.SEND, fd, @intFromPtr(buffer.ptr), buffer.len, 0); + sqe.rw_flags = flags; + } + + pub fn prep_send_zc(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: []const u8, flags: u32, zc_flags: u16) void { + sqe.prep_rw(.SEND_ZC, fd, @intFromPtr(buffer.ptr), buffer.len, 0); + sqe.rw_flags = flags; + sqe.ioprio = zc_flags; + } + + pub fn prep_send_zc_fixed(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: []const u8, flags: u32, zc_flags: u16, buf_index: u16) void { + prep_send_zc(sqe, fd, buffer, flags, zc_flags); + sqe.ioprio |= linux.IORING_RECVSEND_FIXED_BUF; + sqe.buf_index = buf_index; + } + + pub fn prep_sendmsg_zc( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + msg: *const os.msghdr_const, + flags: u32, + ) void { + prep_sendmsg(sqe, fd, msg, flags); + sqe.opcode = .SENDMSG_ZC; + } + + pub fn prep_recvmsg( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + msg: *os.msghdr, + flags: u32, + ) void { + sqe.prep_rw(.RECVMSG, fd, @intFromPtr(msg), 1, 0); + sqe.rw_flags = flags; + } + + pub fn prep_sendmsg( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + msg: *const os.msghdr_const, + flags: u32, + ) void { + sqe.prep_rw(.SENDMSG, fd, @intFromPtr(msg), 1, 0); + sqe.rw_flags = flags; + } + + pub fn prep_openat( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + path: [*:0]const u8, + flags: linux.O, + mode: os.mode_t, + ) void { + sqe.prep_rw(.OPENAT, fd, @intFromPtr(path), mode, 0); + sqe.rw_flags = @bitCast(flags); + } + + pub fn 
prep_openat_direct( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + path: [*:0]const u8, + flags: linux.O, + mode: os.mode_t, + file_index: u32, + ) void { + prep_openat(sqe, fd, path, flags, mode); + __io_uring_set_target_fixed_file(sqe, file_index); + } + + pub fn prep_close(sqe: *linux.io_uring_sqe, fd: os.fd_t) void { + sqe.* = .{ + .opcode = .CLOSE, + .flags = 0, + .ioprio = 0, + .fd = fd, + .off = 0, + .addr = 0, + .len = 0, + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } + + pub fn prep_close_direct(sqe: *linux.io_uring_sqe, file_index: u32) void { + prep_close(sqe, 0); + __io_uring_set_target_fixed_file(sqe, file_index); + } + + pub fn prep_timeout( + sqe: *linux.io_uring_sqe, + ts: *const os.linux.kernel_timespec, + count: u32, + flags: u32, + ) void { + sqe.prep_rw(.TIMEOUT, -1, @intFromPtr(ts), 1, count); + sqe.rw_flags = flags; + } + + pub fn prep_timeout_remove(sqe: *linux.io_uring_sqe, timeout_user_data: u64, flags: u32) void { + sqe.* = .{ + .opcode = .TIMEOUT_REMOVE, + .flags = 0, + .ioprio = 0, + .fd = -1, + .off = 0, + .addr = timeout_user_data, + .len = 0, + .rw_flags = flags, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } + + pub fn prep_link_timeout( + sqe: *linux.io_uring_sqe, + ts: *const os.linux.kernel_timespec, + flags: u32, + ) void { + sqe.prep_rw(.LINK_TIMEOUT, -1, @intFromPtr(ts), 1, 0); + sqe.rw_flags = flags; + } + + pub fn prep_poll_add( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + poll_mask: u32, + ) void { + sqe.prep_rw(.POLL_ADD, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); + // Poll masks previously used to comprise of 16 bits in the flags union of + // a SQE, but were then extended to comprise of 32 bits in order to make + // room for additional option flags. 
To ensure that the correct bits of + // poll masks are consistently and properly read across multiple kernel + // versions, poll masks are enforced to be little-endian. + // https://www.spinics.net/lists/io-uring/msg02848.html + sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); + } + + pub fn prep_poll_remove( + sqe: *linux.io_uring_sqe, + target_user_data: u64, + ) void { + sqe.prep_rw(.POLL_REMOVE, -1, target_user_data, 0, 0); + } + + pub fn prep_poll_update( + sqe: *linux.io_uring_sqe, + old_user_data: u64, + new_user_data: u64, + poll_mask: u32, + flags: u32, + ) void { + sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, flags, new_user_data); + // Poll masks previously used to comprise of 16 bits in the flags union of + // a SQE, but were then extended to comprise of 32 bits in order to make + // room for additional option flags. To ensure that the correct bits of + // poll masks are consistently and properly read across multiple kernel + // versions, poll masks are enforced to be little-endian. 
+ // https://www.spinics.net/lists/io-uring/msg02848.html + sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); + } + + pub fn prep_fallocate( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + mode: i32, + offset: u64, + len: u64, + ) void { + sqe.* = .{ + .opcode = .FALLOCATE, + .flags = 0, + .ioprio = 0, + .fd = fd, + .off = offset, + .addr = len, + .len = @intCast(mode), + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } + + pub fn prep_statx( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + path: [*:0]const u8, + flags: u32, + mask: u32, + buf: *linux.Statx, + ) void { + sqe.prep_rw(.STATX, fd, @intFromPtr(path), mask, @intFromPtr(buf)); + sqe.rw_flags = flags; + } + + pub fn prep_cancel( + sqe: *linux.io_uring_sqe, + cancel_user_data: u64, + flags: u32, + ) void { + sqe.prep_rw(.ASYNC_CANCEL, -1, cancel_user_data, 0, 0); + sqe.rw_flags = flags; + } + + pub fn prep_shutdown( + sqe: *linux.io_uring_sqe, + sockfd: os.socket_t, + how: u32, + ) void { + sqe.prep_rw(.SHUTDOWN, sockfd, 0, how, 0); + } + + pub fn prep_renameat( + sqe: *linux.io_uring_sqe, + old_dir_fd: os.fd_t, + old_path: [*:0]const u8, + new_dir_fd: os.fd_t, + new_path: [*:0]const u8, + flags: u32, + ) void { + sqe.prep_rw( + .RENAMEAT, + old_dir_fd, + @intFromPtr(old_path), + 0, + @intFromPtr(new_path), + ); + sqe.len = @bitCast(new_dir_fd); + sqe.rw_flags = flags; + } + + pub fn prep_unlinkat( + sqe: *linux.io_uring_sqe, + dir_fd: os.fd_t, + path: [*:0]const u8, + flags: u32, + ) void { + sqe.prep_rw(.UNLINKAT, dir_fd, @intFromPtr(path), 0, 0); + sqe.rw_flags = flags; + } + + pub fn prep_mkdirat( + sqe: *linux.io_uring_sqe, + dir_fd: os.fd_t, + path: [*:0]const u8, + mode: os.mode_t, + ) void { + sqe.prep_rw(.MKDIRAT, dir_fd, @intFromPtr(path), mode, 0); + } + + pub fn prep_symlinkat( + sqe: *linux.io_uring_sqe, + target: [*:0]const u8, + new_dir_fd: os.fd_t, + link_path: [*:0]const u8, + ) void { + sqe.prep_rw( + .SYMLINKAT, 
+ new_dir_fd, + @intFromPtr(target), + 0, + @intFromPtr(link_path), + ); + } + + pub fn prep_linkat( + sqe: *linux.io_uring_sqe, + old_dir_fd: os.fd_t, + old_path: [*:0]const u8, + new_dir_fd: os.fd_t, + new_path: [*:0]const u8, + flags: u32, + ) void { + sqe.prep_rw( + .LINKAT, + old_dir_fd, + @intFromPtr(old_path), + 0, + @intFromPtr(new_path), + ); + sqe.len = @bitCast(new_dir_fd); + sqe.rw_flags = flags; + } + + pub fn prep_provide_buffers( + sqe: *linux.io_uring_sqe, + buffers: [*]u8, + buffer_len: usize, + num: usize, + group_id: usize, + buffer_id: usize, + ) void { + const ptr = @intFromPtr(buffers); + sqe.prep_rw(.PROVIDE_BUFFERS, @intCast(num), ptr, buffer_len, buffer_id); + sqe.buf_index = @intCast(group_id); + } + + pub fn prep_remove_buffers( + sqe: *linux.io_uring_sqe, + num: usize, + group_id: usize, + ) void { + sqe.prep_rw(.REMOVE_BUFFERS, @intCast(num), 0, 0, 0); + sqe.buf_index = @intCast(group_id); + } + + pub fn prep_multishot_accept( + sqe: *linux.io_uring_sqe, + fd: os.fd_t, + addr: ?*os.sockaddr, + addrlen: ?*os.socklen_t, + flags: u32, + ) void { + prep_accept(sqe, fd, addr, addrlen, flags); + sqe.ioprio |= linux.IORING_ACCEPT_MULTISHOT; + } + + pub fn prep_socket( + sqe: *linux.io_uring_sqe, + domain: u32, + socket_type: u32, + protocol: u32, + flags: u32, + ) void { + sqe.prep_rw(.SOCKET, @intCast(domain), 0, protocol, socket_type); + sqe.rw_flags = flags; + } + + pub fn prep_socket_direct( + sqe: *linux.io_uring_sqe, + domain: u32, + socket_type: u32, + protocol: u32, + flags: u32, + file_index: u32, + ) void { + prep_socket(sqe, domain, socket_type, protocol, flags); + __io_uring_set_target_fixed_file(sqe, file_index); + } + + pub fn prep_socket_direct_alloc( + sqe: *linux.io_uring_sqe, + domain: u32, + socket_type: u32, + protocol: u32, + flags: u32, + ) void { + prep_socket(sqe, domain, socket_type, protocol, flags); + __io_uring_set_target_fixed_file(sqe, linux.IORING_FILE_INDEX_ALLOC); + } + + pub fn prep_waitid( + sqe: 
*linux.io_uring_sqe, + id_type: linux.P, + id: i32, + infop: *linux.siginfo_t, + options: u32, + flags: u32, + ) void { + sqe.prep_rw(.WAITID, id, 0, @intFromEnum(id_type), @intFromPtr(infop)); + sqe.rw_flags = flags; + sqe.splice_fd_in = @bitCast(options); + } +}; diff --git a/lib/std/os/linux/test.zig b/lib/std/os/linux/test.zig @@ -120,3 +120,7 @@ test "fadvise" { const ret = linux.fadvise(file.handle, 0, 0, linux.POSIX_FADV.SEQUENTIAL); try expectEqual(@as(usize, 0), ret); } + +test { + _ = linux.IoUring; +} diff --git a/lib/std/os/windows.zig b/lib/std/os/windows.zig @@ -3796,290 +3796,300 @@ pub const EXCEPTION_RECORD = extern struct { ExceptionInformation: [15]usize, }; -pub usingnamespace switch (native_arch) { - .x86 => struct { - pub const FLOATING_SAVE_AREA = extern struct { - ControlWord: DWORD, - StatusWord: DWORD, - TagWord: DWORD, - ErrorOffset: DWORD, - ErrorSelector: DWORD, - DataOffset: DWORD, - DataSelector: DWORD, - RegisterArea: [80]BYTE, - Cr0NpxState: DWORD, - }; +pub const FLOATING_SAVE_AREA = switch (native_arch) { + .x86 => extern struct { + ControlWord: DWORD, + StatusWord: DWORD, + TagWord: DWORD, + ErrorOffset: DWORD, + ErrorSelector: DWORD, + DataOffset: DWORD, + DataSelector: DWORD, + RegisterArea: [80]BYTE, + Cr0NpxState: DWORD, + }, + else => @compileError("FLOATING_SAVE_AREA only defined on x86"), +}; - pub const CONTEXT = extern struct { - ContextFlags: DWORD, - Dr0: DWORD, - Dr1: DWORD, - Dr2: DWORD, - Dr3: DWORD, - Dr6: DWORD, - Dr7: DWORD, - FloatSave: FLOATING_SAVE_AREA, - SegGs: DWORD, - SegFs: DWORD, - SegEs: DWORD, - SegDs: DWORD, - Edi: DWORD, - Esi: DWORD, - Ebx: DWORD, - Edx: DWORD, - Ecx: DWORD, - Eax: DWORD, - Ebp: DWORD, - Eip: DWORD, - SegCs: DWORD, - EFlags: DWORD, - Esp: DWORD, - SegSs: DWORD, - ExtendedRegisters: [512]BYTE, - - pub fn getRegs(ctx: *const CONTEXT) struct { bp: usize, ip: usize } { - return .{ .bp = ctx.Ebp, .ip = ctx.Eip }; - } - }; +pub const M128A = switch (native_arch) { + .x86_64 => extern 
struct { + Low: ULONGLONG, + High: LONGLONG, + }, + else => @compileError("M128A only defined on x86_64"), +}; + +pub const XMM_SAVE_AREA32 = switch (native_arch) { + .x86_64 => extern struct { + ControlWord: WORD, + StatusWord: WORD, + TagWord: BYTE, + Reserved1: BYTE, + ErrorOpcode: WORD, + ErrorOffset: DWORD, + ErrorSelector: WORD, + Reserved2: WORD, + DataOffset: DWORD, + DataSelector: WORD, + Reserved3: WORD, + MxCsr: DWORD, + MxCsr_Mask: DWORD, + FloatRegisters: [8]M128A, + XmmRegisters: [16]M128A, + Reserved4: [96]BYTE, }, - .x86_64 => struct { - pub const M128A = extern struct { + else => @compileError("XMM_SAVE_AREA32 only defined on x86_64"), +}; + +pub const NEON128 = switch (native_arch) { + .aarch64 => extern union { + DUMMYSTRUCTNAME: extern struct { Low: ULONGLONG, High: LONGLONG, - }; - - pub const XMM_SAVE_AREA32 = extern struct { - ControlWord: WORD, - StatusWord: WORD, - TagWord: BYTE, - Reserved1: BYTE, - ErrorOpcode: WORD, - ErrorOffset: DWORD, - ErrorSelector: WORD, - Reserved2: WORD, - DataOffset: DWORD, - DataSelector: WORD, - Reserved3: WORD, - MxCsr: DWORD, - MxCsr_Mask: DWORD, - FloatRegisters: [8]M128A, - XmmRegisters: [16]M128A, - Reserved4: [96]BYTE, - }; - - pub const CONTEXT = extern struct { - P1Home: DWORD64 align(16), - P2Home: DWORD64, - P3Home: DWORD64, - P4Home: DWORD64, - P5Home: DWORD64, - P6Home: DWORD64, - ContextFlags: DWORD, - MxCsr: DWORD, - SegCs: WORD, - SegDs: WORD, - SegEs: WORD, - SegFs: WORD, - SegGs: WORD, - SegSs: WORD, - EFlags: DWORD, - Dr0: DWORD64, - Dr1: DWORD64, - Dr2: DWORD64, - Dr3: DWORD64, - Dr6: DWORD64, - Dr7: DWORD64, - Rax: DWORD64, - Rcx: DWORD64, - Rdx: DWORD64, - Rbx: DWORD64, - Rsp: DWORD64, - Rbp: DWORD64, - Rsi: DWORD64, - Rdi: DWORD64, - R8: DWORD64, - R9: DWORD64, - R10: DWORD64, - R11: DWORD64, - R12: DWORD64, - R13: DWORD64, - R14: DWORD64, - R15: DWORD64, - Rip: DWORD64, - DUMMYUNIONNAME: extern union { - FltSave: XMM_SAVE_AREA32, - FloatSave: XMM_SAVE_AREA32, - DUMMYSTRUCTNAME: extern 
struct { - Header: [2]M128A, - Legacy: [8]M128A, - Xmm0: M128A, - Xmm1: M128A, - Xmm2: M128A, - Xmm3: M128A, - Xmm4: M128A, - Xmm5: M128A, - Xmm6: M128A, - Xmm7: M128A, - Xmm8: M128A, - Xmm9: M128A, - Xmm10: M128A, - Xmm11: M128A, - Xmm12: M128A, - Xmm13: M128A, - Xmm14: M128A, - Xmm15: M128A, - }, + }, + D: [2]f64, + S: [4]f32, + H: [8]WORD, + B: [16]BYTE, + }, + else => @compileError("NEON128 only defined on aarch64"), +}; + +pub const CONTEXT = switch (native_arch) { + .x86 => extern struct { + ContextFlags: DWORD, + Dr0: DWORD, + Dr1: DWORD, + Dr2: DWORD, + Dr3: DWORD, + Dr6: DWORD, + Dr7: DWORD, + FloatSave: FLOATING_SAVE_AREA, + SegGs: DWORD, + SegFs: DWORD, + SegEs: DWORD, + SegDs: DWORD, + Edi: DWORD, + Esi: DWORD, + Ebx: DWORD, + Edx: DWORD, + Ecx: DWORD, + Eax: DWORD, + Ebp: DWORD, + Eip: DWORD, + SegCs: DWORD, + EFlags: DWORD, + Esp: DWORD, + SegSs: DWORD, + ExtendedRegisters: [512]BYTE, + + pub fn getRegs(ctx: *const CONTEXT) struct { bp: usize, ip: usize } { + return .{ .bp = ctx.Ebp, .ip = ctx.Eip }; + } + }, + .x86_64 => extern struct { + P1Home: DWORD64 align(16), + P2Home: DWORD64, + P3Home: DWORD64, + P4Home: DWORD64, + P5Home: DWORD64, + P6Home: DWORD64, + ContextFlags: DWORD, + MxCsr: DWORD, + SegCs: WORD, + SegDs: WORD, + SegEs: WORD, + SegFs: WORD, + SegGs: WORD, + SegSs: WORD, + EFlags: DWORD, + Dr0: DWORD64, + Dr1: DWORD64, + Dr2: DWORD64, + Dr3: DWORD64, + Dr6: DWORD64, + Dr7: DWORD64, + Rax: DWORD64, + Rcx: DWORD64, + Rdx: DWORD64, + Rbx: DWORD64, + Rsp: DWORD64, + Rbp: DWORD64, + Rsi: DWORD64, + Rdi: DWORD64, + R8: DWORD64, + R9: DWORD64, + R10: DWORD64, + R11: DWORD64, + R12: DWORD64, + R13: DWORD64, + R14: DWORD64, + R15: DWORD64, + Rip: DWORD64, + DUMMYUNIONNAME: extern union { + FltSave: XMM_SAVE_AREA32, + FloatSave: XMM_SAVE_AREA32, + DUMMYSTRUCTNAME: extern struct { + Header: [2]M128A, + Legacy: [8]M128A, + Xmm0: M128A, + Xmm1: M128A, + Xmm2: M128A, + Xmm3: M128A, + Xmm4: M128A, + Xmm5: M128A, + Xmm6: M128A, + Xmm7: M128A, + Xmm8: 
M128A, + Xmm9: M128A, + Xmm10: M128A, + Xmm11: M128A, + Xmm12: M128A, + Xmm13: M128A, + Xmm14: M128A, + Xmm15: M128A, }, - VectorRegister: [26]M128A, - VectorControl: DWORD64, - DebugControl: DWORD64, - LastBranchToRip: DWORD64, - LastBranchFromRip: DWORD64, - LastExceptionToRip: DWORD64, - LastExceptionFromRip: DWORD64, - - pub fn getRegs(ctx: *const CONTEXT) struct { bp: usize, ip: usize, sp: usize } { - return .{ .bp = ctx.Rbp, .ip = ctx.Rip, .sp = ctx.Rsp }; - } - - pub fn setIp(ctx: *CONTEXT, ip: usize) void { - ctx.Rip = ip; - } - - pub fn setSp(ctx: *CONTEXT, sp: usize) void { - ctx.Rsp = sp; - } - }; + }, + VectorRegister: [26]M128A, + VectorControl: DWORD64, + DebugControl: DWORD64, + LastBranchToRip: DWORD64, + LastBranchFromRip: DWORD64, + LastExceptionToRip: DWORD64, + LastExceptionFromRip: DWORD64, + + pub fn getRegs(ctx: *const CONTEXT) struct { bp: usize, ip: usize, sp: usize } { + return .{ .bp = ctx.Rbp, .ip = ctx.Rip, .sp = ctx.Rsp }; + } - pub const RUNTIME_FUNCTION = extern struct { - BeginAddress: DWORD, - EndAddress: DWORD, - UnwindData: DWORD, - }; + pub fn setIp(ctx: *CONTEXT, ip: usize) void { + ctx.Rip = ip; + } - pub const KNONVOLATILE_CONTEXT_POINTERS = extern struct { - FloatingContext: [16]?*M128A, - IntegerContext: [16]?*ULONG64, - }; + pub fn setSp(ctx: *CONTEXT, sp: usize) void { + ctx.Rsp = sp; + } }, - .aarch64 => struct { - pub const NEON128 = extern union { + .aarch64 => extern struct { + ContextFlags: ULONG align(16), + Cpsr: ULONG, + DUMMYUNIONNAME: extern union { DUMMYSTRUCTNAME: extern struct { - Low: ULONGLONG, - High: LONGLONG, - }, - D: [2]f64, - S: [4]f32, - H: [8]WORD, - B: [16]BYTE, - }; - - pub const CONTEXT = extern struct { - ContextFlags: ULONG align(16), - Cpsr: ULONG, - DUMMYUNIONNAME: extern union { - DUMMYSTRUCTNAME: extern struct { - X0: DWORD64, - X1: DWORD64, - X2: DWORD64, - X3: DWORD64, - X4: DWORD64, - X5: DWORD64, - X6: DWORD64, - X7: DWORD64, - X8: DWORD64, - X9: DWORD64, - X10: DWORD64, - X11: DWORD64, 
- X12: DWORD64, - X13: DWORD64, - X14: DWORD64, - X15: DWORD64, - X16: DWORD64, - X17: DWORD64, - X18: DWORD64, - X19: DWORD64, - X20: DWORD64, - X21: DWORD64, - X22: DWORD64, - X23: DWORD64, - X24: DWORD64, - X25: DWORD64, - X26: DWORD64, - X27: DWORD64, - X28: DWORD64, - Fp: DWORD64, - Lr: DWORD64, - }, - X: [31]DWORD64, + X0: DWORD64, + X1: DWORD64, + X2: DWORD64, + X3: DWORD64, + X4: DWORD64, + X5: DWORD64, + X6: DWORD64, + X7: DWORD64, + X8: DWORD64, + X9: DWORD64, + X10: DWORD64, + X11: DWORD64, + X12: DWORD64, + X13: DWORD64, + X14: DWORD64, + X15: DWORD64, + X16: DWORD64, + X17: DWORD64, + X18: DWORD64, + X19: DWORD64, + X20: DWORD64, + X21: DWORD64, + X22: DWORD64, + X23: DWORD64, + X24: DWORD64, + X25: DWORD64, + X26: DWORD64, + X27: DWORD64, + X28: DWORD64, + Fp: DWORD64, + Lr: DWORD64, }, - Sp: DWORD64, - Pc: DWORD64, - V: [32]NEON128, - Fpcr: DWORD, - Fpsr: DWORD, - Bcr: [8]DWORD, - Bvr: [8]DWORD64, - Wcr: [2]DWORD, - Wvr: [2]DWORD64, - - pub fn getRegs(ctx: *const CONTEXT) struct { bp: usize, ip: usize, sp: usize } { - return .{ - .bp = ctx.DUMMYUNIONNAME.DUMMYSTRUCTNAME.Fp, - .ip = ctx.Pc, - .sp = ctx.Sp, - }; - } + X: [31]DWORD64, + }, + Sp: DWORD64, + Pc: DWORD64, + V: [32]NEON128, + Fpcr: DWORD, + Fpsr: DWORD, + Bcr: [8]DWORD, + Bvr: [8]DWORD64, + Wcr: [2]DWORD, + Wvr: [2]DWORD64, + + pub fn getRegs(ctx: *const CONTEXT) struct { bp: usize, ip: usize, sp: usize } { + return .{ + .bp = ctx.DUMMYUNIONNAME.DUMMYSTRUCTNAME.Fp, + .ip = ctx.Pc, + .sp = ctx.Sp, + }; + } - pub fn setIp(ctx: *CONTEXT, ip: usize) void { - ctx.Pc = ip; - } + pub fn setIp(ctx: *CONTEXT, ip: usize) void { + ctx.Pc = ip; + } - pub fn setSp(ctx: *CONTEXT, sp: usize) void { - ctx.Sp = sp; - } - }; + pub fn setSp(ctx: *CONTEXT, sp: usize) void { + ctx.Sp = sp; + } + }, + else => @compileError("CONTEXT is not defined for this architecture"), +}; - pub const RUNTIME_FUNCTION = extern struct { - BeginAddress: DWORD, - DUMMYUNIONNAME: extern union { - UnwindData: DWORD, - 
DUMMYSTRUCTNAME: packed struct { - Flag: u2, - FunctionLength: u11, - RegF: u3, - RegI: u4, - H: u1, - CR: u2, - FrameSize: u9, - }, +pub const RUNTIME_FUNCTION = switch (native_arch) { + .x86_64 => extern struct { + BeginAddress: DWORD, + EndAddress: DWORD, + UnwindData: DWORD, + }, + .aarch64 => extern struct { + BeginAddress: DWORD, + DUMMYUNIONNAME: extern union { + UnwindData: DWORD, + DUMMYSTRUCTNAME: packed struct { + Flag: u2, + FunctionLength: u11, + RegF: u3, + RegI: u4, + H: u1, + CR: u2, + FrameSize: u9, }, - }; + }, + }, + else => @compileError("RUNTIME_FUNCTION is not defined for this architecture"), +}; - pub const KNONVOLATILE_CONTEXT_POINTERS = extern struct { - X19: ?*DWORD64, - X20: ?*DWORD64, - X21: ?*DWORD64, - X22: ?*DWORD64, - X23: ?*DWORD64, - X24: ?*DWORD64, - X25: ?*DWORD64, - X26: ?*DWORD64, - X27: ?*DWORD64, - X28: ?*DWORD64, - Fp: ?*DWORD64, - Lr: ?*DWORD64, - D8: ?*DWORD64, - D9: ?*DWORD64, - D10: ?*DWORD64, - D11: ?*DWORD64, - D12: ?*DWORD64, - D13: ?*DWORD64, - D14: ?*DWORD64, - D15: ?*DWORD64, - }; +pub const KNONVOLATILE_CONTEXT_POINTERS = switch (native_arch) { + .x86_64 => extern struct { + FloatingContext: [16]?*M128A, + IntegerContext: [16]?*ULONG64, + }, + .aarch64 => extern struct { + X19: ?*DWORD64, + X20: ?*DWORD64, + X21: ?*DWORD64, + X22: ?*DWORD64, + X23: ?*DWORD64, + X24: ?*DWORD64, + X25: ?*DWORD64, + X26: ?*DWORD64, + X27: ?*DWORD64, + X28: ?*DWORD64, + Fp: ?*DWORD64, + Lr: ?*DWORD64, + D8: ?*DWORD64, + D9: ?*DWORD64, + D10: ?*DWORD64, + D11: ?*DWORD64, + D12: ?*DWORD64, + D13: ?*DWORD64, + D14: ?*DWORD64, + D15: ?*DWORD64, }, - else => struct {}, + else => @compileError("KNONVOLATILE_CONTEXT_POINTERS is not defined for this architecture"), }; pub const EXCEPTION_POINTERS = extern struct {