zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

blob c2e0d9f9 (22478B) - Raw


      //! This module provides functions for working conveniently with SIMD (Single Instruction, Multiple Data),
      2 //! which may offer a potential boost in performance on some targets by performing the same operations on
      3 //! multiple elements at once.
      4 //! Please be aware that some functions are known to not work on MIPS.
      5 
      6 const std = @import("std");
      7 const builtin = @import("builtin");
      8 
      /// Suggests how many elements of type `T` a vector should hold on the given CPU.
      /// Returns null when no SIMD feature known to this function is enabled for `cpu`, or when a
      /// single (rounded-up) element is already at least as wide as the suggested vector.
      pub fn suggestVectorSizeForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?usize {
          // This is guesswork; if you have better suggestions, add them or edit the current ones here.
          // The element size is rounded up to a power of two, with a minimum of 8 bits.
          const element_bit_size = @max(8, std.math.ceilPowerOfTwo(u16, @bitSizeOf(T)) catch unreachable);
          const vector_bit_size: u16 = blk: {
              if (cpu.arch.isX86()) {
                  // The feature set has to be passed explicitly: `featureSetHas` takes the set as its
                  // first argument, and omitting it broke compilation whenever `T` was `bool`.
                  // When mask registers are preferred, 64 bools per vector are suggested directly.
                  if (T == bool and std.Target.x86.featureSetHas(cpu.features, .prefer_mask_registers)) return 64;
                  if (std.Target.x86.featureSetHas(cpu.features, .avx512f) and !std.Target.x86.featureSetHasAny(cpu.features, .{ .prefer_256_bit, .prefer_128_bit })) break :blk 512;
                  if (std.Target.x86.featureSetHasAny(cpu.features, .{ .prefer_256_bit, .avx2 }) and !std.Target.x86.featureSetHas(cpu.features, .prefer_128_bit)) break :blk 256;
                  if (std.Target.x86.featureSetHas(cpu.features, .sse)) break :blk 128;
                  if (std.Target.x86.featureSetHasAny(cpu.features, .{ .mmx, .@"3dnow" })) break :blk 64;
              } else if (cpu.arch.isARM()) {
                  if (std.Target.arm.featureSetHas(cpu.features, .neon)) break :blk 128;
              } else if (cpu.arch.isAARCH64()) {
                  // SVE allows up to 2048 bits in the specification; as of 2022 the most powerful machine implements 512 bits.
                  // It seems safer to stay at 128 until wider implementations are more common.
                  // TODO: Revisit this value when bigger vectors are more common
                  if (std.Target.aarch64.featureSetHas(cpu.features, .sve)) break :blk 128;
                  if (std.Target.aarch64.featureSetHas(cpu.features, .neon)) break :blk 128;
              } else if (cpu.arch.isPPC() or cpu.arch.isPPC64()) {
                  if (std.Target.powerpc.featureSetHas(cpu.features, .altivec)) break :blk 128;
              } else if (cpu.arch.isMIPS()) {
                  if (std.Target.mips.featureSetHas(cpu.features, .msa)) break :blk 128;
                  // TODO: Test MIPS capability to handle bigger vectors
                  //       In theory MDMX and by extension mips3d have 32 registers of 64 bits which can be used in parallel
                  //       for multiple processing, but it is unclear what is optimal here: using
                  //       the 2048 bits, just 64 per vector, or something in between
                  if (std.Target.mips.featureSetHas(cpu.features, .mips3d)) break :blk 64;
              } else if (cpu.arch.isRISCV()) {
                  // In RISC-V the Vector Extension allows configurable vector sizes, but a standard size of 128 is a safe estimate
                  if (std.Target.riscv.featureSetHas(cpu.features, .v)) break :blk 128;
              } else if (cpu.arch.isSPARC()) {
                  // TODO: Test Sparc capability to handle bigger vectors
                  //       In theory Sparc has 32 registers of 64 bits which can be used in parallel
                  //       for multiple processing, but it is unclear what is optimal here: using
                  //       the 2048 bits, just 64 per vector, or something in between
                  if (std.Target.sparc.featureSetHasAny(cpu.features, .{ .vis, .vis2, .vis3 })) break :blk 64;
              } else if (cpu.arch.isWasm()) {
                  if (std.Target.wasm.featureSetHas(cpu.features, .simd128)) break :blk 128;
              }
              // No recognised SIMD feature: recommend scalars.
              return null;
          };
          // A vector that cannot hold more than one element is not worth suggesting.
          if (vector_bit_size <= element_bit_size) return null;

          return @divExact(vector_bit_size, element_bit_size);
      }
     55 
     /// Recommends a vector length for elements of type `T` on the CPU being compiled for.
     /// Yields null when scalars are recommended instead.
     /// Not every CPU architecture has a recommendation implemented yet.
     pub fn suggestVectorSize(comptime T: type) ?usize {
         const target_cpu = builtin.cpu;
         return suggestVectorSizeForCpu(T, target_cpu);
     }
     61 
     test "suggestVectorSizeForCpu works with signed and unsigned values" {
         // Baseline x86_64 with AVX-512F switched on.
         const cpu = comptime blk: {
             var with_avx512 = std.Target.Cpu.baseline(std.Target.Cpu.Arch.x86_64);
             with_avx512.features.addFeature(@enumToInt(std.Target.x86.Feature.avx512f));
             break :blk with_avx512;
         };

         // Signedness must not influence the suggestion for 32-bit integers.
         const expected_len: usize = 16;
         try std.testing.expectEqual(expected_len, suggestVectorSizeForCpu(u32, cpu).?);
         try std.testing.expectEqual(expected_len, suggestVectorSizeForCpu(i32, cpu).?);
     }
     70 
     /// Reports the number of elements of a vector or array type as a comptime integer.
     /// Any other kind of type is rejected with a compile error.
     fn vectorLength(comptime VectorType: type) comptime_int {
         switch (@typeInfo(VectorType)) {
             .Vector => |vector_info| return vector_info.len,
             .Array => |array_info| return array_info.len,
             else => @compileError("Invalid type " ++ @typeName(VectorType)),
         }
     }
     78 
     /// Returns the narrowest unsigned integer type that can represent every element index of the given vector type.
     pub fn VectorIndex(comptime VectorType: type) type {
         const last_index = vectorLength(VectorType) - 1;
         return std.math.IntFittingRange(0, last_index);
     }
     83 
     /// Returns the narrowest unsigned integer type that can represent the length of the given vector type.
     pub fn VectorCount(comptime VectorType: type) type {
         const element_count = vectorLength(VectorType);
         return std.math.IntFittingRange(0, element_count);
     }
     88 
     /// Builds, at compile time, a vector of `len` elements counting upwards from 0 to `len`-1.
     /// For example, `iota(i32, 8)` will return a vector containing `.{0, 1, 2, 3, 4, 5, 6, 7}`.
     /// `T` must be an integer or float type; anything else is a compile error.
     pub inline fn iota(comptime T: type, comptime len: usize) @Vector(len, T) {
         comptime {
             var sequence: [len]T = undefined;
             for (&sequence, 0..) |*slot, position| {
                 // Convert the position into the element type.
                 slot.* = switch (@typeInfo(T)) {
                     .Int => @intCast(T, position),
                     .Float => @intToFloat(T, position),
                     else => @compileError("Can't use type " ++ @typeName(T) ++ " in iota."),
                 };
             }
             const result: @Vector(len, T) = sequence;
             return result;
         }
     }
    104 
    /// Produces a vector of `len` elements by cycling through the elements of the input from its start.
    /// For example, `repeat(8, [_]u32{1, 2, 3})` will return a vector containing `.{1, 2, 3, 1, 2, 3, 1, 2}`.
    pub fn repeat(comptime len: usize, vec: anytype) @Vector(len, std.meta.Child(@TypeOf(vec))) {
        const Input = @TypeOf(vec);
        const input_len = @intCast(i32, vectorLength(Input));

        // Output element i is read from input element (i modulo the input length).
        const source_indices = comptime iota(i32, len) % @splat(len, input_len);
        return @shuffle(std.meta.Child(Input), vec, undefined, source_indices);
    }
    112 
    /// Concatenates two vectors: the result starts with every element of `a` and continues with
    /// every element of `b`.
    pub fn join(a: anytype, b: anytype) @Vector(vectorLength(@TypeOf(a)) + vectorLength(@TypeOf(b)), std.meta.Child(@TypeOf(a))) {
        const Element = std.meta.Child(@TypeOf(a));
        const len_a = vectorLength(@TypeOf(a));
        const len_b = vectorLength(@TypeOf(b));

        // In a @shuffle mask, index i picks from the first operand and ~i picks from the second.
        const picks_from_a: [len_a]i32 = iota(i32, len_a);
        const picks_from_b: [len_b]i32 = ~iota(i32, len_b);

        return @shuffle(Element, a, b, picks_from_a ++ picks_from_b);
    }
    122 
    /// Weaves the input vectors together: the result holds the first element of every input in order,
    /// then the second element of every input, and so on.
    /// For example, `interlace(.{[4]u32{11, 12, 13, 14}, [4]u32{21, 22, 23, 24}})` returns a vector containing `.{11, 21, 12, 22, 13, 23, 14, 24}`.
    pub fn interlace(vecs: anytype) @Vector(vectorLength(@TypeOf(vecs[0])) * vecs.len, std.meta.Child(@TypeOf(vecs[0]))) {
        // interlace doesn't work on MIPS, for some reason.
        // Notes from an earlier debugging attempt:
        //  The indices are correct. The problem seems to be with the @shuffle builtin.
        //  On MIPS, the test that interlaces small_base gives { 0, 2, 0, 0, 64, 255, 248, 200, 0, 0 }.
        //  Calling this with two inputs seems to work fine, but the compile error triggers for all inputs, just to be safe.
        comptime if (builtin.cpu.arch.isMIPS()) @compileError("TODO: Find out why interlace() doesn't work on MIPS");

        const VecType = @TypeOf(vecs[0]);
        const inputs = @as([vecs.len]VecType, vecs);
        const Element = std.meta.Child(@TypeOf(inputs[0]));

        // A single vector is already "interlaced"; this also terminates the recursion below.
        if (inputs.len == 1) return inputs[0];

        // Split the inputs into two groups; the first group gets the extra vector when the count is odd.
        const first_count = (inputs.len + 1) / 2;
        const second_count = inputs.len / 2;

        // Interlace each group on its own, then merge the two partial results with one shuffle.
        const first = interlace(@ptrCast(*const [first_count]VecType, inputs[0..first_count]).*);
        const second = interlace(@ptrCast(*const [second_count]VecType, inputs[first_count..]).*);

        const first_len = vectorLength(@TypeOf(first));
        const second_len = vectorLength(@TypeOf(second));
        const total_len = first_len + second_len;

        const mask = comptime blk: {
            const positions = iota(i32, total_len);
            // Which round of "one element from every input" each output position belongs to.
            const round = @divFloor(positions, @splat(total_len, @intCast(i32, inputs.len)));
            // True where the output element comes from `first`, false where it comes from `second`.
            const take_first = repeat(total_len, join(@splat(first_count, true), @splat(second_count, false)));
            const first_indices = positions - round * @splat(total_len, @intCast(i32, second_count));
            const second_indices = shiftElementsRight(positions - round * @splat(total_len, @intCast(i32, first_count)), first_count, 0);
            break :blk @select(i32, take_first, first_indices, ~second_indices);
        };

        return @shuffle(Element, first, second, mask);
    }
    160 
    /// Splits `interlaced` evenly into `vec_count` vectors, returned as an array. The output vectors
    /// "take turns" receiving one element at a time: output vector `i` gets the elements at indices
    /// `i`, `i + vec_count`, `i + 2 * vec_count`, and so on.
    pub fn deinterlace(
        comptime vec_count: usize,
        interlaced: anytype,
    ) [vec_count]@Vector(
        vectorLength(@TypeOf(interlaced)) / vec_count,
        std.meta.Child(@TypeOf(interlaced)),
    ) {
        const Element = std.meta.Child(@TypeOf(interlaced));
        const lanes_per_vec = vectorLength(@TypeOf(interlaced)) / vec_count;

        var result: [vec_count]@Vector(lanes_per_vec, Element) = undefined;

        // The shuffle mask must be comptime-known, hence the unrolled loop over a comptime counter.
        comptime var which: usize = 0;
        inline while (which < result.len) : (which += 1) {
            const picks = comptime iota(i32, lanes_per_vec) * @splat(lanes_per_vec, @intCast(i32, vec_count)) + @splat(lanes_per_vec, @intCast(i32, which));
            result[which] = @shuffle(Element, interlaced, undefined, picks);
        }

        return result;
    }
    183 
    /// Returns a vector made of `count` consecutive elements of `vec`, beginning at index `first`.
    /// Asserts that `first + count` does not exceed the length of `vec`.
    pub fn extract(
        vec: anytype,
        comptime first: VectorIndex(@TypeOf(vec)),
        comptime count: VectorCount(@TypeOf(vec)),
    ) @Vector(count, std.meta.Child(@TypeOf(vec))) {
        const Element = std.meta.Child(@TypeOf(vec));
        const total = vectorLength(@TypeOf(vec));

        // Widen to comptime_int so the sum cannot overflow the narrow index/count types.
        const end = @intCast(comptime_int, first) + @intCast(comptime_int, count);
        std.debug.assert(end <= total);

        const picks = comptime iota(i32, count) + @splat(count, @intCast(i32, first));
        return @shuffle(Element, vec, undefined, picks);
    }
    196 
    test "vector patterns" {
        if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .aarch64) {
            // https://github.com/ziglang/zig/issues/12012
            return error.SkipZigTest;
        }
        const expectEqual = std.testing.expectEqual;

        const first = @Vector(4, u32){ 10, 20, 30, 40 };
        const second = @Vector(4, u32){ 55, 66, 77, 88 };

        try expectEqual([6]u32{ 10, 20, 30, 40, 10, 20 }, repeat(6, first));
        try expectEqual([8]u32{ 10, 20, 30, 40, 55, 66, 77, 88 }, join(first, second));
        try expectEqual([2]u32{ 20, 30 }, extract(first, 1, 2));

        // interlace() is a compile error on MIPS, so everything that relies on it is gated.
        if (comptime !builtin.cpu.arch.isMIPS()) {
            try expectEqual([8]u32{ 10, 55, 20, 66, 30, 77, 40, 88 }, interlace(.{ first, second }));

            const small_bases = [5]@Vector(2, u8){
                @Vector(2, u8){ 0, 1 },
                @Vector(2, u8){ 2, 3 },
                @Vector(2, u8){ 4, 5 },
                @Vector(2, u8){ 6, 7 },
                @Vector(2, u8){ 8, 9 },
            };

            // Interlacing and then deinterlacing must give back the original vectors.
            const small_braid = interlace(small_bases);
            try expectEqual([10]u8{ 0, 2, 4, 6, 8, 1, 3, 5, 7, 9 }, small_braid);
            try expectEqual(small_bases, deinterlace(small_bases.len, small_braid));
        }
    }
    225 
    /// Treats `a` followed by `b` as one long vector, skips its first `shift` elements and returns
    /// the next elements as a vector with the same size as `a` and `b`.
    pub fn mergeShift(a: anytype, b: anytype, comptime shift: VectorCount(@TypeOf(a, b))) @TypeOf(a, b) {
        const Vec = @TypeOf(a, b);
        const joined = join(a, b);

        return extract(joined, shift, vectorLength(Vec));
    }
    232 
    /// Moves every element `amount` positions towards higher indices. The vacated low indices are
    /// filled with `shift_in`, and elements pushed past the end are dropped, so the length is unchanged.
    pub fn shiftElementsRight(vec: anytype, comptime amount: VectorCount(@TypeOf(vec)), shift_in: std.meta.Child(@TypeOf(vec))) @TypeOf(vec) {
        // It may be possible to implement shifts and rotates with a runtime-friendly slice of two joined vectors, as the length of the
        // slice would be comptime-known. This would permit vector shifts and rotates by a non-comptime-known amount.
        // However, it is unclear whether compiler optimizations would handle that well enough on all platforms.
        const len = vectorLength(@TypeOf(vec));
        const filler = @splat(len, shift_in);

        // Put the filler in front of `vec` and keep the window that ends `amount` elements early.
        return mergeShift(filler, vec, len - amount);
    }
    243 
    /// Moves every element `amount` positions towards lower indices. The vacated high indices are
    /// filled with `shift_in`, and elements that would end up below index 0 are dropped.
    pub fn shiftElementsLeft(vec: anytype, comptime amount: VectorCount(@TypeOf(vec)), shift_in: std.meta.Child(@TypeOf(vec))) @TypeOf(vec) {
        const len = vectorLength(@TypeOf(vec));
        const filler = @splat(len, shift_in);

        // Put the filler behind `vec` and keep the window that starts `amount` elements in.
        return mergeShift(vec, filler, amount);
    }
    251 
    /// Rotates the elements `amount` positions towards lower indices. Elements that fall off the
    /// low end reappear at the high end, keeping their relative order.
    pub fn rotateElementsLeft(vec: anytype, comptime amount: VectorCount(@TypeOf(vec))) @TypeOf(vec) {
        // Two copies of `vec` back to back contain every rotation as a contiguous window.
        const doubled = join(vec, vec);

        return extract(doubled, amount, vectorLength(@TypeOf(vec)));
    }
    256 
    /// Rotates the elements `amount` positions towards higher indices. Elements that fall off the
    /// high end reappear at the low end, keeping their relative order.
    pub fn rotateElementsRight(vec: anytype, comptime amount: VectorCount(@TypeOf(vec))) @TypeOf(vec) {
        // Rotating right by `amount` is the same as rotating left by the remaining distance.
        const len = vectorLength(@TypeOf(vec));

        return rotateElementsLeft(vec, len - amount);
    }
    261 
    /// Returns a vector with the same elements as `vec`, in the opposite order.
    pub fn reverseOrder(vec: anytype) @TypeOf(vec) {
        const Element = std.meta.Child(@TypeOf(vec));
        const len = vectorLength(@TypeOf(vec));

        // Count down from the last index to 0.
        const last_index = @intCast(i32, len) - 1;
        const descending = comptime @splat(len, last_index) - iota(i32, len);

        return @shuffle(Element, vec, undefined, descending);
    }
    268 
    test "vector shifting" {
        const expectEqual = std.testing.expectEqual;
        const source = @Vector(4, u32){ 10, 20, 30, 40 };

        // Shifts fill the vacated positions with the given value.
        try expectEqual([4]u32{ 30, 40, 999, 999 }, shiftElementsLeft(source, 2, 999));
        try expectEqual([4]u32{ 999, 999, 10, 20 }, shiftElementsRight(source, 2, 999));

        // Rotations wrap elements around instead of dropping them.
        try expectEqual([4]u32{ 20, 30, 40, 10 }, rotateElementsLeft(source, 1));
        try expectEqual([4]u32{ 40, 10, 20, 30 }, rotateElementsRight(source, 1));

        try expectEqual([4]u32{ 40, 30, 20, 10 }, reverseOrder(source));
    }
    278 
    /// Returns the lowest index at which `vec` holds `true`, or null if no element is `true`.
    pub fn firstTrue(vec: anytype) ?VectorIndex(@TypeOf(vec)) {
        const Index = VectorIndex(@TypeOf(vec));
        const lanes = vectorLength(@TypeOf(vec));

        const any_set = @reduce(.Or, vec);
        if (any_set) {
            // False lanes get the largest index value so they cannot win the minimum.
            const losing_value = @splat(lanes, ~@as(Index, 0));
            const candidates = @select(Index, vec, iota(Index, lanes), losing_value);
            return @reduce(.Min, candidates);
        }
        return null;
    }
    289 
    /// Returns the highest index at which `vec` holds `true`, or null if no element is `true`.
    pub fn lastTrue(vec: anytype) ?VectorIndex(@TypeOf(vec)) {
        const Index = VectorIndex(@TypeOf(vec));
        const lanes = vectorLength(@TypeOf(vec));

        const any_set = @reduce(.Or, vec);
        if (any_set) {
            // False lanes get index 0 so they cannot win the maximum.
            const losing_value = @splat(lanes, @as(Index, 0));
            const candidates = @select(Index, vec, iota(Index, lanes), losing_value);
            return @reduce(.Max, candidates);
        }
        return null;
    }
    300 
    /// Returns how many elements of `vec` are `true`.
    pub fn countTrues(vec: anytype) VectorCount(@TypeOf(vec)) {
        const Count = VectorCount(@TypeOf(vec));
        const lanes = vectorLength(@TypeOf(vec));

        // Map true to 1 and false to 0, then add all lanes together.
        const ones = @splat(lanes, @as(Count, 1));
        const zeros = @splat(lanes, @as(Count, 0));
        return @reduce(.Add, @select(Count, vec, ones, zeros));
    }
    308 
    /// Returns the lowest index whose element equals `value`, or null if `value` does not occur in `vec`.
    pub fn firstIndexOfValue(vec: anytype, value: std.meta.Child(@TypeOf(vec))) ?VectorIndex(@TypeOf(vec)) {
        const needle = @splat(vectorLength(@TypeOf(vec)), value);
        const matches = vec == needle;

        return firstTrue(matches);
    }
    314 
    /// Returns the highest index whose element equals `value`, or null if `value` does not occur in `vec`.
    pub fn lastIndexOfValue(vec: anytype, value: std.meta.Child(@TypeOf(vec))) ?VectorIndex(@TypeOf(vec)) {
        const needle = @splat(vectorLength(@TypeOf(vec)), value);
        const matches = vec == needle;

        return lastTrue(matches);
    }
    320 
    /// Returns how many elements of `vec` are equal to `value`.
    pub fn countElementsWithValue(vec: anytype, value: std.meta.Child(@TypeOf(vec))) VectorCount(@TypeOf(vec)) {
        const needle = @splat(vectorLength(@TypeOf(vec)), value);
        const matches = vec == needle;

        return countTrues(matches);
    }
    326 
    test "vector searching" {
        const expectEqual = std.testing.expectEqual;
        const haystack = @Vector(8, u32){ 6, 4, 7, 4, 4, 2, 3, 7 };

        // The value 4 sits at indices 1, 3 and 4.
        try expectEqual(@as(?u3, 1), firstIndexOfValue(haystack, 4));
        try expectEqual(@as(?u3, 4), lastIndexOfValue(haystack, 4));
        try expectEqual(@as(u4, 3), countElementsWithValue(haystack, 4));

        // A value that is absent yields null.
        try expectEqual(@as(?u3, null), lastIndexOfValue(haystack, 99));
    }
    335 
    /// Like prefixScan, except that the combining operation is supplied by the caller.
    /// `func` must be mathematically associative.
    pub fn prefixScanWithFunc(
        comptime hop: isize,
        vec: anytype,
        /// The error type that `func` might return. Set this to `void` if `func` doesn't return an error union.
        comptime ErrorType: type,
        comptime func: fn (@TypeOf(vec), @TypeOf(vec)) if (ErrorType == void) @TypeOf(vec) else ErrorType!@TypeOf(vec),
        /// When one operand of the operation performed by `func` is this value, the result must equal the other operand.
        /// For example, this should be 0 for addition or 1 for multiplication.
        comptime identity: std.meta.Child(@TypeOf(vec)),
    ) if (ErrorType == void) @TypeOf(vec) else ErrorType!@TypeOf(vec) {
        // This hasn't been debugged, but it might be a cousin of sorts to what's going on with interlace.
        comptime if (builtin.cpu.arch.isMIPS()) @compileError("TODO: Find out why prefixScan doesn't work on MIPS");

        const len = vectorLength(@TypeOf(vec));

        if (hop == 0) @compileError("hop can not be 0; you'd be going nowhere forever!");
        const backwards = hop < 0;
        const stride = if (backwards) -hop else hop;

        // Each round combines the accumulator with a copy of itself shifted by twice the previous
        // distance; the positions vacated by the shift are filled with `identity`.
        var acc = vec;
        comptime var round = 0;
        inline while ((stride << round) < len) : (round += 1) {
            const shifted = if (backwards)
                shiftElementsLeft(acc, stride << round, identity)
            else
                shiftElementsRight(acc, stride << round, identity);

            acc = if (ErrorType == void) func(acc, shifted) else try func(acc, shifted);
        }
        return acc;
    }
    364 
    /// Returns a vector whose elements are the result of performing the specified operation on the corresponding
    /// element of the input vector and every hop'th element that came before it (or after, if hop is negative).
    /// Supports the same operations as the @reduce() builtin. Takes O(logN) to compute.
    /// The scan is not linear, which may affect floating point errors. This may affect the determinism of
    /// algorithms that use this function.
    /// Combinations of element type and operation that are not supported produce a compile error.
    pub fn prefixScan(comptime op: std.builtin.ReduceOp, comptime hop: isize, vec: anytype) @TypeOf(vec) {
        const VecType = @TypeOf(vec);
        const Child = std.meta.Child(VecType);
        const len = vectorLength(VecType);

        // The identity is the value that leaves the other operand of `op` unchanged. It is shifted
        // into the positions that have no earlier (or later) element to combine with.
        const identity = comptime switch (@typeInfo(Child)) {
            .Bool => switch (op) {
                .Or, .Xor => false,
                .And => true,
                else => @compileError("Invalid prefixScan operation " ++ @tagName(op) ++ " for vector of booleans."),
            },
            .Int => switch (op) {
                .Max => std.math.minInt(Child),
                .Add, .Or, .Xor => 0,
                .Mul => 1,
                // All bits set. For unsigned types this equals maxInt; for signed types it is -1.
                // maxInt must not be used here: its cleared sign bit would strip the sign bit from
                // every element that gets combined with the identity.
                .And => ~@as(Child, 0),
                .Min => std.math.maxInt(Child),
            },
            .Float => switch (op) {
                .Max => -std.math.inf(Child),
                .Add => 0,
                .Mul => 1,
                .Min => std.math.inf(Child),
                else => @compileError("Invalid prefixScan operation " ++ @tagName(op) ++ " for vector of floats."),
            },
            else => @compileError("Invalid type " ++ @typeName(VecType) ++ " for prefixScan."),
        };

        // Wraps `op` in a function with the signature expected by prefixScanWithFunc.
        const fn_container = struct {
            fn opFn(a: VecType, b: VecType) VecType {
                // Vectors of bool are combined with @select and comparison rather than bitwise operators.
                return if (Child == bool) switch (op) {
                    .And => @select(bool, a, b, @splat(len, false)),
                    .Or => @select(bool, a, @splat(len, true), b),
                    .Xor => a != b,
                    // Other operations were already rejected while choosing the identity.
                    else => unreachable,
                } else switch (op) {
                    .And => a & b,
                    .Or => a | b,
                    .Xor => a ^ b,
                    .Add => a + b,
                    .Mul => a * b,
                    .Min => @min(a, b),
                    .Max => @max(a, b),
                };
            }
        };

        return prefixScanWithFunc(hop, vec, void, fn_container.opFn, identity);
    }
    418 
    test "vector prefix scan" {
        if (comptime builtin.cpu.arch.isMIPS()) {
            return error.SkipZigTest;
        }

        if (builtin.zig_backend == .stage2_llvm) {
            // Regressed in LLVM 14:
            // https://github.com/llvm/llvm-project/issues/55522
            return error.SkipZigTest;
        }

        const expectEqual = std.testing.expectEqual;
        const I32x4 = @Vector(4, i32);
        const F32x4 = @Vector(4, f32);
        const Boolx4 = @Vector(4, bool);

        const ints = I32x4{ 11, 23, 9, -21 };
        const floats = F32x4{ 2, 0.5, -10, 6.54321 };
        const bools = Boolx4{ true, false, true, false };

        // A running sum over 32 ones counts from 1 to 32.
        try expectEqual(iota(u8, 32) + @splat(32, @as(u8, 1)), prefixScan(.Add, 1, @splat(32, @as(u8, 1))));

        // Every integer operation with a hop of 1.
        try expectEqual(I32x4{ 11, 3, 1, 1 }, prefixScan(.And, 1, ints));
        try expectEqual(I32x4{ 11, 31, 31, -1 }, prefixScan(.Or, 1, ints));
        try expectEqual(I32x4{ 11, 28, 21, -2 }, prefixScan(.Xor, 1, ints));
        try expectEqual(I32x4{ 11, 34, 43, 22 }, prefixScan(.Add, 1, ints));
        try expectEqual(I32x4{ 11, 253, 2277, -47817 }, prefixScan(.Mul, 1, ints));
        try expectEqual(I32x4{ 11, 11, 9, -21 }, prefixScan(.Min, 1, ints));
        try expectEqual(I32x4{ 11, 23, 23, 23 }, prefixScan(.Max, 1, ints));

        // Trying to predict all inaccuracies when adding and multiplying floats with prefixScans would be a mess, so we don't test those.
        try expectEqual(F32x4{ 2, 0.5, -10, -10 }, prefixScan(.Min, 1, floats));
        try expectEqual(F32x4{ 2, 2, 2, 6.54321 }, prefixScan(.Max, 1, floats));

        try expectEqual(Boolx4{ true, true, false, false }, prefixScan(.Xor, 1, bools));
        try expectEqual(Boolx4{ true, true, true, true }, prefixScan(.Or, 1, bools));
        try expectEqual(Boolx4{ true, false, false, false }, prefixScan(.And, 1, bools));

        // Other hops, including a negative one that scans from the high end.
        try expectEqual(I32x4{ 11, 23, 20, 2 }, prefixScan(.Add, 2, ints));
        try expectEqual(I32x4{ 22, 11, -12, -21 }, prefixScan(.Add, -1, ints));
        try expectEqual(I32x4{ 11, 23, 9, -10 }, prefixScan(.Add, 3, ints));
    }