Merge pull request #23924 from jacobly0/x86_64-rewrite

x86_64: implement reduce
This commit is contained in:
Jacob Young
2025-05-28 21:37:02 -04:00
committed by GitHub
40 changed files with 46862 additions and 1121 deletions

View File

@@ -607,7 +607,7 @@ const Date = struct {
while (month < date.month) : (month += 1) {
const days: u64 = std.time.epoch.getDaysInMonth(
date.year,
@as(std.time.epoch.Month, @enumFromInt(month)),
@enumFromInt(month),
);
sec += days * std.time.epoch.secs_per_day;
}
@@ -623,15 +623,13 @@ const Date = struct {
};
pub fn parseTimeDigits(text: *const [2]u8, min: u8, max: u8) !u8 {
const result = if (use_vectors) result: {
const nn: @Vector(2, u16) = .{ text[0], text[1] };
const zero: @Vector(2, u16) = .{ '0', '0' };
const mm: @Vector(2, u16) = .{ 10, 1 };
break :result @reduce(.Add, (nn -% zero) *% mm);
} else std.fmt.parseInt(u8, text, 10) catch return error.CertificateTimeInvalid;
const nn: @Vector(2, u16) = .{ text[0], text[1] };
const zero: @Vector(2, u16) = .{ '0', '0' };
const mm: @Vector(2, u16) = .{ 10, 1 };
const result = @reduce(.Add, (nn -% zero) *% mm);
if (result < min) return error.CertificateTimeInvalid;
if (result > max) return error.CertificateTimeInvalid;
return @truncate(result);
return @intCast(result);
}
test parseTimeDigits {
@@ -647,14 +645,12 @@ test parseTimeDigits {
}
pub fn parseYear4(text: *const [4]u8) !u16 {
const result = if (use_vectors) result: {
const nnnn: @Vector(4, u32) = .{ text[0], text[1], text[2], text[3] };
const zero: @Vector(4, u32) = .{ '0', '0', '0', '0' };
const mmmm: @Vector(4, u32) = .{ 1000, 100, 10, 1 };
break :result @reduce(.Add, (nnnn -% zero) *% mmmm);
} else std.fmt.parseInt(u16, text, 10) catch return error.CertificateTimeInvalid;
const nnnn: @Vector(4, u32) = .{ text[0], text[1], text[2], text[3] };
const zero: @Vector(4, u32) = .{ '0', '0', '0', '0' };
const mmmm: @Vector(4, u32) = .{ 1000, 100, 10, 1 };
const result = @reduce(.Add, (nnnn -% zero) *% mmmm);
if (result > 9999) return error.CertificateTimeInvalid;
return @truncate(result);
return @intCast(result);
}
test parseYear4 {
@@ -858,7 +854,7 @@ pub const der = struct {
pub fn parse(bytes: []const u8, index: u32) Element.ParseError!Element {
var i = index;
const identifier = @as(Identifier, @bitCast(bytes[i]));
const identifier: Identifier = @bitCast(bytes[i]);
i += 1;
const size_byte = bytes[i];
i += 1;
@@ -872,7 +868,7 @@ pub const der = struct {
};
}
const len_size = @as(u7, @truncate(size_byte));
const len_size: u7 = @truncate(size_byte);
if (len_size > @sizeOf(u32)) {
return error.CertificateFieldHasInvalidLength;
}
@@ -1244,5 +1240,3 @@ pub const rsa = struct {
return res;
}
};
const use_vectors = @import("builtin").zig_backend != .stage2_x86_64;

View File

@@ -192,8 +192,6 @@ test eql {
}
test "eql (vectors)" {
if (@import("builtin").zig_backend == .stage2_x86_64) return error.SkipZigTest;
const random = std.crypto.random;
const expect = std.testing.expect;
var a: [100]u8 = undefined;

View File

@@ -13,7 +13,6 @@ const net = std.net;
const Uri = std.Uri;
const Allocator = mem.Allocator;
const assert = std.debug.assert;
const use_vectors = builtin.zig_backend != .stage2_x86_64;
const Client = @This();
const proto = @import("protocol.zig");
@@ -594,13 +593,10 @@ pub const Response = struct {
}
fn parseInt3(text: *const [3]u8) u10 {
if (use_vectors) {
const nnn: @Vector(3, u8) = text.*;
const zero: @Vector(3, u8) = .{ '0', '0', '0' };
const mmm: @Vector(3, u10) = .{ 100, 10, 1 };
return @reduce(.Add, @as(@Vector(3, u10), nnn -% zero) *% mmm);
}
return std.fmt.parseInt(u10, text, 10) catch unreachable;
const nnn: @Vector(3, u8) = text.*;
const zero: @Vector(3, u8) = .{ '0', '0', '0' };
const mmm: @Vector(3, u10) = .{ 100, 10, 1 };
return @reduce(.Add, (nnn -% zero) *% mmm);
}
test parseInt3 {
@@ -1796,5 +1792,6 @@ pub fn fetch(client: *Client, options: FetchOptions) !FetchResult {
}
test {
_ = Response;
_ = &initDefaultProxies;
}

View File

@@ -109,27 +109,21 @@ pub fn feed(p: *HeadParser, bytes: []const u8) usize {
continue;
},
else => {
const Vector = @Vector(vector_len, u8);
// const BoolVector = @Vector(vector_len, bool);
const BitVector = @Vector(vector_len, u1);
const SizeVector = @Vector(vector_len, u8);
const chunk = bytes[index..][0..vector_len];
const matches = if (use_vectors) matches: {
const Vector = @Vector(vector_len, u8);
// const BoolVector = @Vector(vector_len, bool);
const BitVector = @Vector(vector_len, u1);
const SizeVector = @Vector(vector_len, u8);
const v: Vector = chunk.*;
// depends on https://github.com/ziglang/zig/issues/19755
// const matches_r: BitVector = @bitCast(v == @as(Vector, @splat('\r')));
// const matches_n: BitVector = @bitCast(v == @as(Vector, @splat('\n')));
const matches_r: BitVector = @select(u1, v == @as(Vector, @splat('\r')), @as(Vector, @splat(1)), @as(Vector, @splat(0)));
const matches_n: BitVector = @select(u1, v == @as(Vector, @splat('\n')), @as(Vector, @splat(1)), @as(Vector, @splat(0)));
const matches_or: SizeVector = matches_r | matches_n;
const v: Vector = chunk.*;
const matches_r: BitVector = @bitCast(v == @as(Vector, @splat('\r')));
const matches_n: BitVector = @bitCast(v == @as(Vector, @splat('\n')));
const matches_or: SizeVector = matches_r | matches_n;
break :matches @reduce(.Add, matches_or);
} else matches: {
var matches: u8 = 0;
for (chunk) |byte| switch (byte) {
'\r', '\n' => matches += 1,
else => {},
};
break :matches matches;
};
const matches = @reduce(.Add, matches_or);
switch (matches) {
0 => {},
1 => switch (chunk[vector_len - 1]) {
@@ -357,7 +351,6 @@ inline fn intShift(comptime T: type, x: anytype) T {
const HeadParser = @This();
const std = @import("std");
const use_vectors = builtin.zig_backend != .stage2_x86_64;
const builtin = @import("builtin");
test feed {

View File

@@ -368,7 +368,8 @@ pub fn countElementsWithValue(vec: anytype, value: std.meta.Child(@TypeOf(vec)))
}
test "vector searching" {
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_x86_64 and
!comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) return error.SkipZigTest;
const base = @Vector(8, u32){ 6, 4, 7, 4, 4, 2, 3, 7 };

View File

@@ -2142,7 +2142,7 @@ pub const Inst = struct {
ref_start_index = static_len,
_,
pub const static_len = 101;
pub const static_len = 118;
pub fn toRef(i: Index) Inst.Ref {
return @enumFromInt(@intFromEnum(Index.ref_start_index) + @intFromEnum(i));
@@ -2190,6 +2190,7 @@ pub const Inst = struct {
u80_type,
u128_type,
i128_type,
u256_type,
usize_type,
isize_type,
c_char_type,
@@ -2228,34 +2229,50 @@ pub const Inst = struct {
vector_8_i8_type,
vector_16_i8_type,
vector_32_i8_type,
vector_64_i8_type,
vector_1_u8_type,
vector_2_u8_type,
vector_4_u8_type,
vector_8_u8_type,
vector_16_u8_type,
vector_32_u8_type,
vector_64_u8_type,
vector_2_i16_type,
vector_4_i16_type,
vector_8_i16_type,
vector_16_i16_type,
vector_32_i16_type,
vector_4_u16_type,
vector_8_u16_type,
vector_16_u16_type,
vector_32_u16_type,
vector_2_i32_type,
vector_4_i32_type,
vector_8_i32_type,
vector_16_i32_type,
vector_4_u32_type,
vector_8_u32_type,
vector_16_u32_type,
vector_2_i64_type,
vector_4_i64_type,
vector_8_i64_type,
vector_2_u64_type,
vector_4_u64_type,
vector_8_u64_type,
vector_1_u128_type,
vector_2_u128_type,
vector_1_u256_type,
vector_4_f16_type,
vector_8_f16_type,
vector_16_f16_type,
vector_32_f16_type,
vector_2_f32_type,
vector_4_f32_type,
vector_8_f32_type,
vector_16_f32_type,
vector_2_f64_type,
vector_4_f64_type,
vector_8_f64_type,
optional_noreturn_type,
anyerror_void_error_union_type,
adhoc_inferred_error_set_type,

View File

@@ -3091,7 +3091,6 @@ test "std.zon free on error" {
test "std.zon vector" {
if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/15330
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/15329
const gpa = std.testing.allocator;

View File

@@ -973,6 +973,7 @@ pub const Inst = struct {
u80_type = @intFromEnum(InternPool.Index.u80_type),
u128_type = @intFromEnum(InternPool.Index.u128_type),
i128_type = @intFromEnum(InternPool.Index.i128_type),
u256_type = @intFromEnum(InternPool.Index.u256_type),
usize_type = @intFromEnum(InternPool.Index.usize_type),
isize_type = @intFromEnum(InternPool.Index.isize_type),
c_char_type = @intFromEnum(InternPool.Index.c_char_type),
@@ -1011,34 +1012,50 @@ pub const Inst = struct {
vector_8_i8_type = @intFromEnum(InternPool.Index.vector_8_i8_type),
vector_16_i8_type = @intFromEnum(InternPool.Index.vector_16_i8_type),
vector_32_i8_type = @intFromEnum(InternPool.Index.vector_32_i8_type),
vector_64_i8_type = @intFromEnum(InternPool.Index.vector_64_i8_type),
vector_1_u8_type = @intFromEnum(InternPool.Index.vector_1_u8_type),
vector_2_u8_type = @intFromEnum(InternPool.Index.vector_2_u8_type),
vector_4_u8_type = @intFromEnum(InternPool.Index.vector_4_u8_type),
vector_8_u8_type = @intFromEnum(InternPool.Index.vector_8_u8_type),
vector_16_u8_type = @intFromEnum(InternPool.Index.vector_16_u8_type),
vector_32_u8_type = @intFromEnum(InternPool.Index.vector_32_u8_type),
vector_64_u8_type = @intFromEnum(InternPool.Index.vector_64_u8_type),
vector_2_i16_type = @intFromEnum(InternPool.Index.vector_2_i16_type),
vector_4_i16_type = @intFromEnum(InternPool.Index.vector_4_i16_type),
vector_8_i16_type = @intFromEnum(InternPool.Index.vector_8_i16_type),
vector_16_i16_type = @intFromEnum(InternPool.Index.vector_16_i16_type),
vector_32_i16_type = @intFromEnum(InternPool.Index.vector_32_i16_type),
vector_4_u16_type = @intFromEnum(InternPool.Index.vector_4_u16_type),
vector_8_u16_type = @intFromEnum(InternPool.Index.vector_8_u16_type),
vector_16_u16_type = @intFromEnum(InternPool.Index.vector_16_u16_type),
vector_32_u16_type = @intFromEnum(InternPool.Index.vector_32_u16_type),
vector_2_i32_type = @intFromEnum(InternPool.Index.vector_2_i32_type),
vector_4_i32_type = @intFromEnum(InternPool.Index.vector_4_i32_type),
vector_8_i32_type = @intFromEnum(InternPool.Index.vector_8_i32_type),
vector_16_i32_type = @intFromEnum(InternPool.Index.vector_16_i32_type),
vector_4_u32_type = @intFromEnum(InternPool.Index.vector_4_u32_type),
vector_8_u32_type = @intFromEnum(InternPool.Index.vector_8_u32_type),
vector_16_u32_type = @intFromEnum(InternPool.Index.vector_16_u32_type),
vector_2_i64_type = @intFromEnum(InternPool.Index.vector_2_i64_type),
vector_4_i64_type = @intFromEnum(InternPool.Index.vector_4_i64_type),
vector_8_i64_type = @intFromEnum(InternPool.Index.vector_8_i64_type),
vector_2_u64_type = @intFromEnum(InternPool.Index.vector_2_u64_type),
vector_4_u64_type = @intFromEnum(InternPool.Index.vector_4_u64_type),
vector_8_u64_type = @intFromEnum(InternPool.Index.vector_8_u64_type),
vector_1_u128_type = @intFromEnum(InternPool.Index.vector_1_u128_type),
vector_2_u128_type = @intFromEnum(InternPool.Index.vector_2_u128_type),
vector_1_u256_type = @intFromEnum(InternPool.Index.vector_1_u256_type),
vector_4_f16_type = @intFromEnum(InternPool.Index.vector_4_f16_type),
vector_8_f16_type = @intFromEnum(InternPool.Index.vector_8_f16_type),
vector_16_f16_type = @intFromEnum(InternPool.Index.vector_16_f16_type),
vector_32_f16_type = @intFromEnum(InternPool.Index.vector_32_f16_type),
vector_2_f32_type = @intFromEnum(InternPool.Index.vector_2_f32_type),
vector_4_f32_type = @intFromEnum(InternPool.Index.vector_4_f32_type),
vector_8_f32_type = @intFromEnum(InternPool.Index.vector_8_f32_type),
vector_16_f32_type = @intFromEnum(InternPool.Index.vector_16_f32_type),
vector_2_f64_type = @intFromEnum(InternPool.Index.vector_2_f64_type),
vector_4_f64_type = @intFromEnum(InternPool.Index.vector_4_f64_type),
vector_8_f64_type = @intFromEnum(InternPool.Index.vector_8_f64_type),
optional_noreturn_type = @intFromEnum(InternPool.Index.optional_noreturn_type),
anyerror_void_error_union_type = @intFromEnum(InternPool.Index.anyerror_void_error_union_type),
adhoc_inferred_error_set_type = @intFromEnum(InternPool.Index.adhoc_inferred_error_set_type),

View File

@@ -4548,6 +4548,7 @@ pub const Index = enum(u32) {
u80_type,
u128_type,
i128_type,
u256_type,
usize_type,
isize_type,
c_char_type,
@@ -4588,34 +4589,50 @@ pub const Index = enum(u32) {
vector_8_i8_type,
vector_16_i8_type,
vector_32_i8_type,
vector_64_i8_type,
vector_1_u8_type,
vector_2_u8_type,
vector_4_u8_type,
vector_8_u8_type,
vector_16_u8_type,
vector_32_u8_type,
vector_64_u8_type,
vector_2_i16_type,
vector_4_i16_type,
vector_8_i16_type,
vector_16_i16_type,
vector_32_i16_type,
vector_4_u16_type,
vector_8_u16_type,
vector_16_u16_type,
vector_32_u16_type,
vector_2_i32_type,
vector_4_i32_type,
vector_8_i32_type,
vector_16_i32_type,
vector_4_u32_type,
vector_8_u32_type,
vector_16_u32_type,
vector_2_i64_type,
vector_4_i64_type,
vector_8_i64_type,
vector_2_u64_type,
vector_4_u64_type,
vector_8_u64_type,
vector_1_u128_type,
vector_2_u128_type,
vector_1_u256_type,
vector_4_f16_type,
vector_8_f16_type,
vector_16_f16_type,
vector_32_f16_type,
vector_2_f32_type,
vector_4_f32_type,
vector_8_f32_type,
vector_16_f32_type,
vector_2_f64_type,
vector_4_f64_type,
vector_8_f64_type,
optional_noreturn_type,
anyerror_void_error_union_type,
@@ -4946,7 +4963,7 @@ pub const Index = enum(u32) {
}
};
pub const static_keys = [_]Key{
pub const static_keys: [static_len]Key = .{
.{ .int_type = .{
.signedness = .unsigned,
.bits = 0,
@@ -5022,6 +5039,11 @@ pub const static_keys = [_]Key{
.bits = 128,
} },
.{ .int_type = .{
.signedness = .unsigned,
.bits = 256,
} },
.{ .simple_type = .usize },
.{ .simple_type = .isize },
.{ .simple_type = .c_char },
@@ -5113,6 +5135,8 @@ pub const static_keys = [_]Key{
.{ .vector_type = .{ .len = 16, .child = .i8_type } },
// @Vector(32, i8)
.{ .vector_type = .{ .len = 32, .child = .i8_type } },
// @Vector(64, i8)
.{ .vector_type = .{ .len = 64, .child = .i8_type } },
// @Vector(1, u8)
.{ .vector_type = .{ .len = 1, .child = .u8_type } },
// @Vector(2, u8)
@@ -5125,50 +5149,80 @@ pub const static_keys = [_]Key{
.{ .vector_type = .{ .len = 16, .child = .u8_type } },
// @Vector(32, u8)
.{ .vector_type = .{ .len = 32, .child = .u8_type } },
// @Vector(64, u8)
.{ .vector_type = .{ .len = 64, .child = .u8_type } },
// @Vector(2, i16)
.{ .vector_type = .{ .len = 2, .child = .i16_type } },
// @Vector(4, i16)
.{ .vector_type = .{ .len = 4, .child = .i16_type } },
// @Vector(8, i16)
.{ .vector_type = .{ .len = 8, .child = .i16_type } },
// @Vector(16, i16)
.{ .vector_type = .{ .len = 16, .child = .i16_type } },
// @Vector(32, i16)
.{ .vector_type = .{ .len = 32, .child = .i16_type } },
// @Vector(4, u16)
.{ .vector_type = .{ .len = 4, .child = .u16_type } },
// @Vector(8, u16)
.{ .vector_type = .{ .len = 8, .child = .u16_type } },
// @Vector(16, u16)
.{ .vector_type = .{ .len = 16, .child = .u16_type } },
// @Vector(32, u16)
.{ .vector_type = .{ .len = 32, .child = .u16_type } },
// @Vector(2, i32)
.{ .vector_type = .{ .len = 2, .child = .i32_type } },
// @Vector(4, i32)
.{ .vector_type = .{ .len = 4, .child = .i32_type } },
// @Vector(8, i32)
.{ .vector_type = .{ .len = 8, .child = .i32_type } },
// @Vector(16, i32)
.{ .vector_type = .{ .len = 16, .child = .i32_type } },
// @Vector(4, u32)
.{ .vector_type = .{ .len = 4, .child = .u32_type } },
// @Vector(8, u32)
.{ .vector_type = .{ .len = 8, .child = .u32_type } },
// @Vector(16, u32)
.{ .vector_type = .{ .len = 16, .child = .u32_type } },
// @Vector(2, i64)
.{ .vector_type = .{ .len = 2, .child = .i64_type } },
// @Vector(4, i64)
.{ .vector_type = .{ .len = 4, .child = .i64_type } },
// @Vector(8, i64)
.{ .vector_type = .{ .len = 8, .child = .i64_type } },
// @Vector(2, u64)
.{ .vector_type = .{ .len = 2, .child = .u64_type } },
// @Vector(8, u64)
// @Vector(4, u64)
.{ .vector_type = .{ .len = 4, .child = .u64_type } },
// @Vector(8, u64)
.{ .vector_type = .{ .len = 8, .child = .u64_type } },
// @Vector(1, u128)
.{ .vector_type = .{ .len = 1, .child = .u128_type } },
// @Vector(2, u128)
.{ .vector_type = .{ .len = 2, .child = .u128_type } },
// @Vector(1, u256)
.{ .vector_type = .{ .len = 1, .child = .u256_type } },
// @Vector(4, f16)
.{ .vector_type = .{ .len = 4, .child = .f16_type } },
// @Vector(8, f16)
.{ .vector_type = .{ .len = 8, .child = .f16_type } },
// @Vector(16, f16)
.{ .vector_type = .{ .len = 16, .child = .f16_type } },
// @Vector(32, f16)
.{ .vector_type = .{ .len = 32, .child = .f16_type } },
// @Vector(2, f32)
.{ .vector_type = .{ .len = 2, .child = .f32_type } },
// @Vector(4, f32)
.{ .vector_type = .{ .len = 4, .child = .f32_type } },
// @Vector(8, f32)
.{ .vector_type = .{ .len = 8, .child = .f32_type } },
// @Vector(16, f32)
.{ .vector_type = .{ .len = 16, .child = .f32_type } },
// @Vector(2, f64)
.{ .vector_type = .{ .len = 2, .child = .f64_type } },
// @Vector(4, f64)
.{ .vector_type = .{ .len = 4, .child = .f64_type } },
// @Vector(8, f64)
.{ .vector_type = .{ .len = 8, .child = .f64_type } },
// ?noreturn
.{ .opt_type = .noreturn_type },
@@ -5246,10 +5300,6 @@ pub const static_keys = [_]Key{
/// assert below to break an unfortunate and arguably incorrect dependency loop
/// when compiling.
pub const static_len = Zir.Inst.Index.static_len;
comptime {
//@compileLog(static_keys.len);
assert(static_len == static_keys.len);
}
pub const Tag = enum(u8) {
/// This special tag represents a value which was removed from this pool via
@@ -11767,6 +11817,7 @@ pub fn typeOf(ip: *const InternPool, index: Index) Index {
.u80_type,
.u128_type,
.i128_type,
.u256_type,
.usize_type,
.isize_type,
.c_char_type,
@@ -11805,34 +11856,50 @@ pub fn typeOf(ip: *const InternPool, index: Index) Index {
.vector_8_i8_type,
.vector_16_i8_type,
.vector_32_i8_type,
.vector_64_i8_type,
.vector_1_u8_type,
.vector_2_u8_type,
.vector_4_u8_type,
.vector_8_u8_type,
.vector_16_u8_type,
.vector_32_u8_type,
.vector_64_u8_type,
.vector_2_i16_type,
.vector_4_i16_type,
.vector_8_i16_type,
.vector_16_i16_type,
.vector_32_i16_type,
.vector_4_u16_type,
.vector_8_u16_type,
.vector_16_u16_type,
.vector_32_u16_type,
.vector_2_i32_type,
.vector_4_i32_type,
.vector_8_i32_type,
.vector_16_i32_type,
.vector_4_u32_type,
.vector_8_u32_type,
.vector_16_u32_type,
.vector_2_i64_type,
.vector_4_i64_type,
.vector_8_i64_type,
.vector_2_u64_type,
.vector_4_u64_type,
.vector_8_u64_type,
.vector_1_u128_type,
.vector_2_u128_type,
.vector_1_u256_type,
.vector_4_f16_type,
.vector_8_f16_type,
.vector_16_f16_type,
.vector_32_f16_type,
.vector_2_f32_type,
.vector_4_f32_type,
.vector_8_f32_type,
.vector_16_f32_type,
.vector_2_f64_type,
.vector_4_f64_type,
.vector_8_f64_type,
.optional_noreturn_type,
.anyerror_void_error_union_type,
.adhoc_inferred_error_set_type,
@@ -12084,6 +12151,7 @@ pub fn zigTypeTag(ip: *const InternPool, index: Index) std.builtin.TypeId {
.u80_type,
.u128_type,
.i128_type,
.u256_type,
.usize_type,
.isize_type,
.c_char_type,
@@ -12129,34 +12197,50 @@ pub fn zigTypeTag(ip: *const InternPool, index: Index) std.builtin.TypeId {
.vector_8_i8_type,
.vector_16_i8_type,
.vector_32_i8_type,
.vector_64_i8_type,
.vector_1_u8_type,
.vector_2_u8_type,
.vector_4_u8_type,
.vector_8_u8_type,
.vector_16_u8_type,
.vector_32_u8_type,
.vector_64_u8_type,
.vector_2_i16_type,
.vector_4_i16_type,
.vector_8_i16_type,
.vector_16_i16_type,
.vector_32_i16_type,
.vector_4_u16_type,
.vector_8_u16_type,
.vector_16_u16_type,
.vector_32_u16_type,
.vector_2_i32_type,
.vector_4_i32_type,
.vector_8_i32_type,
.vector_16_i32_type,
.vector_4_u32_type,
.vector_8_u32_type,
.vector_16_u32_type,
.vector_2_i64_type,
.vector_4_i64_type,
.vector_8_i64_type,
.vector_2_u64_type,
.vector_4_u64_type,
.vector_8_u64_type,
.vector_1_u128_type,
.vector_2_u128_type,
.vector_1_u256_type,
.vector_4_f16_type,
.vector_8_f16_type,
.vector_16_f16_type,
.vector_32_f16_type,
.vector_2_f32_type,
.vector_4_f32_type,
.vector_8_f32_type,
.vector_16_f32_type,
.vector_2_f64_type,
.vector_4_f64_type,
.vector_8_f64_type,
=> .vector,
.optional_noreturn_type => .optional,

View File

@@ -753,6 +753,29 @@ pub const Block = struct {
});
}
fn addReduce(block: *Block, operand: Air.Inst.Ref, operation: std.builtin.ReduceOp) !Air.Inst.Ref {
const sema = block.sema;
const zcu = sema.pt.zcu;
const vector_ty = sema.typeOf(operand);
switch (vector_ty.vectorLen(zcu)) {
0 => unreachable,
1 => return block.addBinOp(.array_elem_val, operand, .zero_usize),
else => {},
}
const allow_optimized = switch (vector_ty.childType(zcu).zigTypeTag(zcu)) {
.float => true,
.bool, .int => false,
else => unreachable,
};
return block.addInst(.{
.tag = if (allow_optimized and block.float_mode == .optimized) .reduce_optimized else .reduce,
.data = .{ .reduce = .{
.operand = operand,
.operation = operation,
} },
});
}
fn addAggregateInit(
block: *Block,
aggregate_ty: Type,
@@ -10307,10 +10330,7 @@ fn intCast(
const zeros = try sema.splat(operand_ty, try pt.intValue(operand_scalar_ty, 0));
const zero_inst = Air.internedToRef(zeros.toIntern());
const is_in_range = try block.addCmpVector(operand, zero_inst, .eq);
const all_in_range = try block.addInst(.{
.tag = .reduce,
.data = .{ .reduce = .{ .operand = is_in_range, .operation = .And } },
});
const all_in_range = try block.addReduce(is_in_range, .And);
break :ok all_in_range;
} else ok: {
const zero_inst = Air.internedToRef((try pt.intValue(operand_ty, 0)).toIntern());
@@ -10374,13 +10394,7 @@ fn intCast(
const ok = if (is_vector) ok: {
const is_in_range = try block.addCmpVector(diff_unsigned, dest_range, .lte);
const all_in_range = try block.addInst(.{
.tag = if (block.float_mode == .optimized) .reduce_optimized else .reduce,
.data = .{ .reduce = .{
.operand = is_in_range,
.operation = .And,
} },
});
const all_in_range = try block.addReduce(is_in_range, .And);
break :ok all_in_range;
} else ok: {
const is_in_range = try block.addBinOp(.cmp_lte, diff_unsigned, dest_range);
@@ -10391,13 +10405,7 @@ fn intCast(
} else {
const ok = if (is_vector) ok: {
const is_in_range = try block.addCmpVector(operand, dest_max, .lte);
const all_in_range = try block.addInst(.{
.tag = if (block.float_mode == .optimized) .reduce_optimized else .reduce,
.data = .{ .reduce = .{
.operand = is_in_range,
.operation = .And,
} },
});
const all_in_range = try block.addReduce(is_in_range, .And);
break :ok all_in_range;
} else ok: {
const is_in_range = try block.addBinOp(.cmp_lte, operand, dest_max);
@@ -10413,13 +10421,7 @@ fn intCast(
const zero_val = try sema.splat(operand_ty, scalar_zero);
const zero_inst = Air.internedToRef(zero_val.toIntern());
const is_in_range = try block.addCmpVector(operand, zero_inst, .gte);
const all_in_range = try block.addInst(.{
.tag = if (block.float_mode == .optimized) .reduce_optimized else .reduce,
.data = .{ .reduce = .{
.operand = is_in_range,
.operation = .And,
} },
});
const all_in_range = try block.addReduce(is_in_range, .And);
break :ok all_in_range;
} else ok: {
const zero_inst = Air.internedToRef((try pt.intValue(operand_ty, 0)).toIntern());
@@ -14330,13 +14332,7 @@ fn zirShl(
const ok = if (rhs_ty.zigTypeTag(zcu) == .vector) ok: {
const bit_count_inst = Air.internedToRef((try sema.splat(rhs_ty, bit_count_val)).toIntern());
const lt = try block.addCmpVector(rhs, bit_count_inst, .lt);
break :ok try block.addInst(.{
.tag = .reduce,
.data = .{ .reduce = .{
.operand = lt,
.operation = .And,
} },
});
break :ok try block.addReduce(lt, .And);
} else ok: {
const bit_count_inst = Air.internedToRef(bit_count_val.toIntern());
break :ok try block.addBinOp(.cmp_lt, rhs, bit_count_inst);
@@ -14358,13 +14354,7 @@ fn zirShl(
});
const ov_bit = try sema.tupleFieldValByIndex(block, op_ov, 1, op_ov_tuple_ty);
const any_ov_bit = if (lhs_ty.zigTypeTag(zcu) == .vector)
try block.addInst(.{
.tag = if (block.float_mode == .optimized) .reduce_optimized else .reduce,
.data = .{ .reduce = .{
.operand = ov_bit,
.operation = .Or,
} },
})
try block.addReduce(ov_bit, .Or)
else
ov_bit;
const zero_ov = Air.internedToRef((try pt.intValue(Type.u1, 0)).toIntern());
@@ -14490,13 +14480,7 @@ fn zirShr(
const ok = if (rhs_ty.zigTypeTag(zcu) == .vector) ok: {
const bit_count_inst = Air.internedToRef((try sema.splat(rhs_ty, bit_count_val)).toIntern());
const lt = try block.addCmpVector(rhs, bit_count_inst, .lt);
break :ok try block.addInst(.{
.tag = .reduce,
.data = .{ .reduce = .{
.operand = lt,
.operation = .And,
} },
});
break :ok try block.addReduce(lt, .And);
} else ok: {
const bit_count_inst = Air.internedToRef(bit_count_val.toIntern());
break :ok try block.addBinOp(.cmp_lt, rhs, bit_count_inst);
@@ -14509,13 +14493,7 @@ fn zirShr(
const ok = if (rhs_ty.zigTypeTag(zcu) == .vector) ok: {
const eql = try block.addCmpVector(lhs, back, .eq);
break :ok try block.addInst(.{
.tag = if (block.float_mode == .optimized) .reduce_optimized else .reduce,
.data = .{ .reduce = .{
.operand = eql,
.operation = .And,
} },
});
break :ok try block.addReduce(eql, .And);
} else try block.addBinOp(.cmp_eq, lhs, back);
try sema.addSafetyCheck(block, src, ok, .shr_overflow);
}
@@ -15565,16 +15543,7 @@ fn zirDivExact(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Ai
if (resolved_type.zigTypeTag(zcu) == .vector) {
const eql = try block.addCmpVector(result, floored, .eq);
break :ok try block.addInst(.{
.tag = switch (block.float_mode) {
.strict => .reduce,
.optimized => .reduce_optimized,
},
.data = .{ .reduce = .{
.operand = eql,
.operation = .And,
} },
});
break :ok try block.addReduce(eql, .And);
} else {
const is_in_range = try block.addBinOp(switch (block.float_mode) {
.strict => .cmp_eq,
@@ -15594,13 +15563,7 @@ fn zirDivExact(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Ai
const zero_val = try sema.splat(resolved_type, scalar_zero);
const zero = Air.internedToRef(zero_val.toIntern());
const eql = try block.addCmpVector(remainder, zero, .eq);
break :ok try block.addInst(.{
.tag = .reduce,
.data = .{ .reduce = .{
.operand = eql,
.operation = .And,
} },
});
break :ok try block.addReduce(eql, .And);
} else {
const zero = Air.internedToRef(scalar_zero.toIntern());
const is_in_range = try block.addBinOp(.cmp_eq, remainder, zero);
@@ -15829,13 +15792,7 @@ fn addDivIntOverflowSafety(
break :ok try block.addCmpVector(casted_rhs, neg_one_ref, .neq);
};
const ok = try block.addInst(.{
.tag = .reduce,
.data = .{ .reduce = .{
.operand = try block.addBinOp(.bool_or, lhs_ok, rhs_ok),
.operation = .And,
} },
});
const ok = try block.addReduce(try block.addBinOp(.bool_or, lhs_ok, rhs_ok), .And);
try sema.addSafetyCheck(block, src, ok, .integer_overflow);
} else {
const lhs_ok: Air.Inst.Ref = if (maybe_lhs_val == null) ok: {
@@ -15886,13 +15843,7 @@ fn addDivByZeroSafety(
const zero_val = try sema.splat(resolved_type, scalar_zero);
const zero = Air.internedToRef(zero_val.toIntern());
const ok = try block.addCmpVector(casted_rhs, zero, .neq);
break :ok try block.addInst(.{
.tag = if (is_int) .reduce else .reduce_optimized,
.data = .{ .reduce = .{
.operand = ok,
.operation = .And,
} },
});
break :ok try block.addReduce(ok, .And);
} else ok: {
const zero = Air.internedToRef(scalar_zero.toIntern());
break :ok try block.addBinOp(if (is_int) .cmp_neq else .cmp_neq_optimized, casted_rhs, zero);
@@ -16579,13 +16530,7 @@ fn analyzeArithmetic(
});
const ov_bit = try sema.tupleFieldValByIndex(block, op_ov, 1, op_ov_tuple_ty);
const any_ov_bit = if (resolved_type.zigTypeTag(zcu) == .vector)
try block.addInst(.{
.tag = if (block.float_mode == .optimized) .reduce_optimized else .reduce,
.data = .{ .reduce = .{
.operand = ov_bit,
.operation = .Or,
} },
})
try block.addReduce(ov_bit, .Or)
else
ov_bit;
const zero_ov = Air.internedToRef((try pt.intValue(Type.u1, 0)).toIntern());
@@ -22406,13 +22351,7 @@ fn zirIntFromFloat(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileErro
const ok_pos = try block.addCmpVector(diff, Air.internedToRef((try sema.splat(operand_ty, try pt.floatValue(operand_scalar_ty, 1.0))).toIntern()), .lt);
const ok_neg = try block.addCmpVector(diff, Air.internedToRef((try sema.splat(operand_ty, try pt.floatValue(operand_scalar_ty, -1.0))).toIntern()), .gt);
const ok = try block.addBinOp(.bit_and, ok_pos, ok_neg);
break :ok try block.addInst(.{
.tag = .reduce,
.data = .{ .reduce = .{
.operand = ok,
.operation = .And,
} },
});
break :ok try block.addReduce(ok, .And);
} else ok: {
const ok_pos = try block.addBinOp(if (block.float_mode == .optimized) .cmp_lt_optimized else .cmp_lt, diff, Air.internedToRef((try pt.floatValue(operand_ty, 1.0)).toIntern()));
const ok_neg = try block.addBinOp(if (block.float_mode == .optimized) .cmp_gt_optimized else .cmp_gt, diff, Air.internedToRef((try pt.floatValue(operand_ty, -1.0)).toIntern()));
@@ -22555,13 +22494,7 @@ fn zirPtrFromInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!
const is_non_zero = if (is_vector) all_non_zero: {
const zero_usize = Air.internedToRef((try sema.splat(operand_ty, .zero_usize)).toIntern());
const is_non_zero = try block.addCmpVector(operand_coerced, zero_usize, .neq);
break :all_non_zero try block.addInst(.{
.tag = .reduce,
.data = .{ .reduce = .{
.operand = is_non_zero,
.operation = .And,
} },
});
break :all_non_zero try block.addReduce(is_non_zero, .And);
} else try block.addBinOp(.cmp_neq, operand_coerced, .zero_usize);
try sema.addSafetyCheck(block, src, is_non_zero, .cast_to_null);
}
@@ -22578,13 +22511,7 @@ fn zirPtrFromInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!
const is_aligned = if (is_vector) all_aligned: {
const splat_zero_usize = Air.internedToRef((try sema.splat(operand_ty, .zero_usize)).toIntern());
const is_aligned = try block.addCmpVector(remainder, splat_zero_usize, .eq);
break :all_aligned try block.addInst(.{
.tag = .reduce,
.data = .{ .reduce = .{
.operand = is_aligned,
.operation = .And,
} },
});
break :all_aligned try block.addReduce(is_aligned, .And);
} else try block.addBinOp(.cmp_eq, remainder, .zero_usize);
try sema.addSafetyCheck(block, src, is_aligned, .incorrect_alignment);
}
@@ -24540,13 +24467,7 @@ fn zirReduce(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.
}
try sema.requireRuntimeBlock(block, block.nodeOffset(inst_data.src_node), operand_src);
return block.addInst(.{
.tag = if (block.float_mode == .optimized) .reduce_optimized else .reduce,
.data = .{ .reduce = .{
.operand = operand,
.operation = operation,
} },
});
return block.addReduce(operand, operation);
}
fn zirShuffle(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
@@ -27324,13 +27245,7 @@ fn addSafetyCheckSentinelMismatch(
const ok = if (sentinel_ty.zigTypeTag(zcu) == .vector) ok: {
const eql = try parent_block.addCmpVector(expected_sentinel, actual_sentinel, .eq);
break :ok try parent_block.addInst(.{
.tag = .reduce,
.data = .{ .reduce = .{
.operand = eql,
.operation = .And,
} },
});
break :ok try parent_block.addReduce(eql, .And);
} else ok: {
assert(sentinel_ty.isSelfComparable(zcu, true));
break :ok try parent_block.addBinOp(.cmp_eq, expected_sentinel, actual_sentinel);
@@ -36595,6 +36510,7 @@ pub fn typeHasOnePossibleValue(sema: *Sema, ty: Type) CompileError!?Value {
.u80_type,
.u128_type,
.i128_type,
.u256_type,
.usize_type,
.isize_type,
.c_char_type,
@@ -36629,34 +36545,50 @@ pub fn typeHasOnePossibleValue(sema: *Sema, ty: Type) CompileError!?Value {
.vector_8_i8_type,
.vector_16_i8_type,
.vector_32_i8_type,
.vector_64_i8_type,
.vector_1_u8_type,
.vector_2_u8_type,
.vector_4_u8_type,
.vector_8_u8_type,
.vector_16_u8_type,
.vector_32_u8_type,
.vector_64_u8_type,
.vector_2_i16_type,
.vector_4_i16_type,
.vector_8_i16_type,
.vector_16_i16_type,
.vector_32_i16_type,
.vector_4_u16_type,
.vector_8_u16_type,
.vector_16_u16_type,
.vector_32_u16_type,
.vector_2_i32_type,
.vector_4_i32_type,
.vector_8_i32_type,
.vector_16_i32_type,
.vector_4_u32_type,
.vector_8_u32_type,
.vector_16_u32_type,
.vector_2_i64_type,
.vector_4_i64_type,
.vector_8_i64_type,
.vector_2_u64_type,
.vector_4_u64_type,
.vector_8_u64_type,
.vector_1_u128_type,
.vector_2_u128_type,
.vector_1_u256_type,
.vector_4_f16_type,
.vector_8_f16_type,
.vector_16_f16_type,
.vector_32_f16_type,
.vector_2_f32_type,
.vector_4_f32_type,
.vector_8_f32_type,
.vector_16_f32_type,
.vector_2_f64_type,
.vector_4_f64_type,
.vector_8_f64_type,
.anyerror_void_error_union_type,
=> null,
.void_type => Value.void,

View File

@@ -994,7 +994,7 @@ pub fn abiAlignmentInner(
.stage2_x86_64 => {
if (vector_type.child == .bool_type) {
if (vector_type.len > 256 and std.Target.x86.featureSetHas(target.cpu.features, .avx512f)) return .{ .scalar = .@"64" };
if (vector_type.len > 128 and std.Target.x86.featureSetHas(target.cpu.features, .avx2)) return .{ .scalar = .@"32" };
if (vector_type.len > 128 and std.Target.x86.featureSetHas(target.cpu.features, .avx)) return .{ .scalar = .@"32" };
if (vector_type.len > 64) return .{ .scalar = .@"16" };
const bytes = std.math.divCeil(u32, vector_type.len, 8) catch unreachable;
const alignment = std.math.ceilPowerOfTwoAssert(u32, bytes);
@@ -4060,6 +4060,7 @@ pub const @"u32": Type = .{ .ip_index = .u32_type };
pub const @"u64": Type = .{ .ip_index = .u64_type };
pub const @"u80": Type = .{ .ip_index = .u80_type };
pub const @"u128": Type = .{ .ip_index = .u128_type };
pub const @"u256": Type = .{ .ip_index = .u256_type };
pub const @"i8": Type = .{ .ip_index = .i8_type };
pub const @"i16": Type = .{ .ip_index = .i16_type };
@@ -4109,34 +4110,50 @@ pub const slice_const_u8_sentinel_0: Type = .{ .ip_index = .slice_const_u8_senti
pub const vector_8_i8: Type = .{ .ip_index = .vector_8_i8_type };
pub const vector_16_i8: Type = .{ .ip_index = .vector_16_i8_type };
pub const vector_32_i8: Type = .{ .ip_index = .vector_32_i8_type };
pub const vector_64_i8: Type = .{ .ip_index = .vector_64_i8_type };
pub const vector_1_u8: Type = .{ .ip_index = .vector_1_u8_type };
pub const vector_2_u8: Type = .{ .ip_index = .vector_2_u8_type };
pub const vector_4_u8: Type = .{ .ip_index = .vector_4_u8_type };
pub const vector_8_u8: Type = .{ .ip_index = .vector_8_u8_type };
pub const vector_16_u8: Type = .{ .ip_index = .vector_16_u8_type };
pub const vector_32_u8: Type = .{ .ip_index = .vector_32_u8_type };
pub const vector_64_u8: Type = .{ .ip_index = .vector_64_u8_type };
pub const vector_2_i16: Type = .{ .ip_index = .vector_2_i16_type };
pub const vector_4_i16: Type = .{ .ip_index = .vector_4_i16_type };
pub const vector_8_i16: Type = .{ .ip_index = .vector_8_i16_type };
pub const vector_16_i16: Type = .{ .ip_index = .vector_16_i16_type };
pub const vector_32_i16: Type = .{ .ip_index = .vector_32_i16_type };
pub const vector_4_u16: Type = .{ .ip_index = .vector_4_u16_type };
pub const vector_8_u16: Type = .{ .ip_index = .vector_8_u16_type };
pub const vector_16_u16: Type = .{ .ip_index = .vector_16_u16_type };
pub const vector_32_u16: Type = .{ .ip_index = .vector_32_u16_type };
pub const vector_2_i32: Type = .{ .ip_index = .vector_2_i32_type };
pub const vector_4_i32: Type = .{ .ip_index = .vector_4_i32_type };
pub const vector_8_i32: Type = .{ .ip_index = .vector_8_i32_type };
pub const vector_16_i32: Type = .{ .ip_index = .vector_16_i32_type };
pub const vector_4_u32: Type = .{ .ip_index = .vector_4_u32_type };
pub const vector_8_u32: Type = .{ .ip_index = .vector_8_u32_type };
pub const vector_16_u32: Type = .{ .ip_index = .vector_16_u32_type };
pub const vector_2_i64: Type = .{ .ip_index = .vector_2_i64_type };
pub const vector_4_i64: Type = .{ .ip_index = .vector_4_i64_type };
pub const vector_8_i64: Type = .{ .ip_index = .vector_8_i64_type };
pub const vector_2_u64: Type = .{ .ip_index = .vector_2_u64_type };
pub const vector_4_u64: Type = .{ .ip_index = .vector_4_u64_type };
pub const vector_8_u64: Type = .{ .ip_index = .vector_8_u64_type };
pub const vector_1_u128: Type = .{ .ip_index = .vector_1_u128_type };
pub const vector_2_u128: Type = .{ .ip_index = .vector_2_u128_type };
pub const vector_1_u256: Type = .{ .ip_index = .vector_1_u256_type };
pub const vector_4_f16: Type = .{ .ip_index = .vector_4_f16_type };
pub const vector_8_f16: Type = .{ .ip_index = .vector_8_f16_type };
pub const vector_16_f16: Type = .{ .ip_index = .vector_16_f16_type };
pub const vector_32_f16: Type = .{ .ip_index = .vector_32_f16_type };
pub const vector_2_f32: Type = .{ .ip_index = .vector_2_f32_type };
pub const vector_4_f32: Type = .{ .ip_index = .vector_4_f32_type };
pub const vector_8_f32: Type = .{ .ip_index = .vector_8_f32_type };
pub const vector_16_f32: Type = .{ .ip_index = .vector_16_f32_type };
pub const vector_2_f64: Type = .{ .ip_index = .vector_2_f64_type };
pub const vector_4_f64: Type = .{ .ip_index = .vector_4_f64_type };
pub const vector_8_f64: Type = .{ .ip_index = .vector_8_f64_type };
pub const empty_tuple: Type = .{ .ip_index = .empty_tuple_type };

File diff suppressed because it is too large Load Diff

View File

@@ -313,7 +313,7 @@ pub const Mnemonic = enum {
@"or", out, outs, outsb, outsd, outsw,
pause, pop, popf, popfd, popfq, push, pushfq,
rcl, rcr,
rdfsbase, rdgsbase, rdmsr, rdpid, rdpkru, rdpmc, rdrand, rdseed, rdssd, rdssq, rdtsc, rdtscp,
rdfsbase, rdgsbase, rdmsr, rdpid, rdpkru, rdpmc, rdrand, rdseed, rdsspd, rdsspq, rdtsc, rdtscp,
ret, rol, ror, rsm,
sahf, sal, sar, sbb,
scas, scasb, scasd, scasq, scasw,
@@ -336,7 +336,7 @@ pub const Mnemonic = enum {
fcom, fcomi, fcomip, fcomp, fcompp, fcos,
fdecstp, fdiv, fdivp, fdivr, fdivrp, ffree,
fiadd, ficom, ficomp, fidiv, fidivr, fild, fimul, fincstp, finit,
fist, fistp, fisttp, fisub, fisubr,
fist, fistp, fisub, fisubr,
fld, fld1, fldcw, fldenv, fldl2e, fldl2t, fldlg2, fldln2, fldpi, fldz,
fmul, fmulp,
fnclex, fninit, fnop, fnsave, fnstcw, fnstenv, fnstsw,
@@ -349,19 +349,18 @@ pub const Mnemonic = enum {
// MMX
emms, movd, movq,
packssdw, packsswb, packuswb,
paddb, paddd, paddq, paddsb, paddsw, paddusb, paddusw, paddw,
paddb, paddd, paddsb, paddsw, paddusb, paddusw, paddw,
pand, pandn, por, pxor,
pcmpeqb, pcmpeqd, pcmpeqw,
pcmpgtb, pcmpgtd, pcmpgtw,
pmulhw, pmullw,
pmaddwd, pmulhw, pmullw,
pslld, psllq, psllw,
psrad, psraw,
psrld, psrlq, psrlw,
psubb, psubd, psubq, psubsb, psubsw, psubusb, psubusw, psubw,
psubb, psubd, psubsb, psubsw, psubusb, psubusw, psubw,
// SSE
addps, addss,
andps,
andnps,
andnps, andps,
cmpps, cmpss, comiss,
cvtpi2ps, cvtps2pi, cvtsi2ss, cvtss2si, cvttps2pi, cvttss2si,
divps, divss,
@@ -374,9 +373,11 @@ pub const Mnemonic = enum {
movss, movups,
mulps, mulss,
orps,
pavgb, pavgw,
pextrw, pinsrw,
pmaxsw, pmaxub, pminsw, pminub, pmovmskb,
pmaxsw, pmaxub, pminsw, pminub, pmovmskb, pmulhuw,
prefetchit0, prefetchit1, prefetchnta, prefetcht0, prefetcht1, prefetcht2, prefetchw, prefetchwt1,
psadbw, pshufw,
shufps,
sqrtps, sqrtss,
stmxcsr,
@@ -397,15 +398,16 @@ pub const Mnemonic = enum {
maxpd, maxsd,
minpd, minsd,
movapd,
movdqa, movdqu,
movdq2q, movdqa, movdqu,
movhpd, movlpd,
movmskpd,
movmskpd, movq2dq,
//movsd,
movupd,
mulpd, mulsd,
orpd,
paddq, pmuludq,
pshufd, pshufhw, pshuflw,
pslldq, psrldq,
pslldq, psrldq, psubq,
punpckhbw, punpckhdq, punpckhqdq, punpckhwd,
punpcklbw, punpckldq, punpcklqdq, punpcklwd,
shufpd,
@@ -414,9 +416,17 @@ pub const Mnemonic = enum {
ucomisd, unpckhpd, unpcklpd,
xorpd,
// SSE3
addsubpd, addsubps, haddpd, haddps, lddqu, movddup, movshdup, movsldup,
addsubpd, addsubps,
fisttp,
haddpd, haddps,
hsubpd, hsubps,
lddqu,
movddup, movshdup, movsldup,
// SSSE3
pabsb, pabsd, pabsw, palignr, pshufb,
pabsb, pabsd, pabsw, palignr,
phaddw, phaddsw, phaddd, phsubw, phsubsw, phsubd,
pmaddubsw, pmulhrsw, pshufb,
psignb, psignd, psignw,
// SSE4.1
blendpd, blendps, blendvpd, blendvps,
dppd, dpps,
@@ -426,11 +436,12 @@ pub const Mnemonic = enum {
pblendvb, pblendw,
pcmpeqq,
pextrb, pextrd, pextrq,
phminposuw,
pinsrb, pinsrd, pinsrq,
pmaxsb, pmaxsd, pmaxud, pmaxuw, pminsb, pminsd, pminud, pminuw,
pmovsxbd, pmovsxbq, pmovsxbw, pmovsxdq, pmovsxwd, pmovsxwq,
pmovzxbd, pmovzxbq, pmovzxbw, pmovzxdq, pmovzxwd, pmovzxwq,
pmulld,
pmuldq, pmulld,
ptest,
roundpd, roundps, roundsd, roundss,
// SSE4.2
@@ -458,7 +469,7 @@ pub const Mnemonic = enum {
vdppd, vdpps,
vextractf128, vextractps,
vgf2p8affineinvqb, vgf2p8affineqb, vgf2p8mulb,
vhaddpd, vhaddps,
vhaddpd, vhaddps, vhsubpd, vhsubps,
vinsertf128, vinsertps,
vlddqu, vldmxcsr,
vmaskmovpd, vmaskmovps,
@@ -480,21 +491,24 @@ pub const Mnemonic = enum {
vpabsb, vpabsd, vpabsw,
vpackssdw, vpacksswb, vpackusdw, vpackuswb,
vpaddb, vpaddd, vpaddq, vpaddsb, vpaddsw, vpaddusb, vpaddusw, vpaddw,
vpalignr, vpand, vpandn,
vpalignr, vpand, vpandn, vpavgb, vpavgw,
vpblendvb, vpblendw, vpclmulqdq,
vpcmpeqb, vpcmpeqd, vpcmpeqq, vpcmpeqw,
vpcmpgtb, vpcmpgtd, vpcmpgtq, vpcmpgtw,
vperm2f128, vpermilpd, vpermilps,
vpextrb, vpextrd, vpextrq, vpextrw,
vphaddw, vphaddsw, vphaddd, vphminposuw, vphsubw, vphsubsw, vphsubd,
vpinsrb, vpinsrd, vpinsrq, vpinsrw,
vpmaddubsw, vpmaddwd,
vpmaxsb, vpmaxsd, vpmaxsw, vpmaxub, vpmaxud, vpmaxuw,
vpminsb, vpminsd, vpminsw, vpminub, vpminud, vpminuw,
vpmovmskb,
vpmovsxbd, vpmovsxbq, vpmovsxbw, vpmovsxdq, vpmovsxwd, vpmovsxwq,
vpmovzxbd, vpmovzxbq, vpmovzxbw, vpmovzxdq, vpmovzxwd, vpmovzxwq,
vpmulhw, vpmulld, vpmullw,
vpmuldq, vpmulhrsw, vpmulhuw, vpmulhw, vpmulld, vpmullw, vpmuludq,
vpor,
vpshufb, vpshufd, vpshufhw, vpshuflw,
vpsadbw, vpshufb, vpshufd, vpshufhw, vpshuflw,
vpsignb, vpsignd, vpsignw,
vpslld, vpslldq, vpsllq, vpsllw,
vpsrad, vpsraq, vpsraw,
vpsrld, vpsrldq, vpsrlq, vpsrlw,
@@ -779,7 +793,7 @@ pub const Op = enum {
pub fn isImmediate(op: Op) bool {
// zig fmt: off
return switch (op) {
.imm8, .imm16, .imm32, .imm64,
.imm8, .imm16, .imm32, .imm64,
.imm8s, .imm16s, .imm32s,
.rel8, .rel16, .rel32,
.unity,
@@ -986,6 +1000,7 @@ pub const Feature = enum {
sse,
sse2,
sse3,
@"sse3 x87",
sse4_1,
sse4_2,
ssse3,
@@ -1015,7 +1030,7 @@ fn estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Op
}
const mnemonic_to_encodings_map = init: {
@setEvalBranchQuota(5_800);
@setEvalBranchQuota(5_900);
const ModrmExt = u3;
const Entry = struct { Mnemonic, OpEn, []const Op, []const u8, ModrmExt, Mode, Feature };
const encodings: []const Entry = @import("encodings.zon");
@@ -1024,17 +1039,17 @@ const mnemonic_to_encodings_map = init: {
var mnemonic_map: [mnemonic_count][]Data = @splat(&.{});
for (encodings) |entry| mnemonic_map[@intFromEnum(entry[0])].len += 1;
var data_storage: [encodings.len]Data = undefined;
var storage_i: usize = 0;
var storage_index: usize = 0;
for (&mnemonic_map) |*value| {
value.ptr = data_storage[storage_i..].ptr;
storage_i += value.len;
value.ptr = data_storage[storage_index..].ptr;
storage_index += value.len;
}
var mnemonic_i: [mnemonic_count]usize = @splat(0);
var mnemonic_index: [mnemonic_count]usize = @splat(0);
const ops_len = @typeInfo(@FieldType(Data, "ops")).array.len;
const opc_len = @typeInfo(@FieldType(Data, "opc")).array.len;
for (encodings) |entry| {
const i = &mnemonic_i[@intFromEnum(entry[0])];
mnemonic_map[@intFromEnum(entry[0])][i.*] = .{
const index = &mnemonic_index[@intFromEnum(entry[0])];
mnemonic_map[@intFromEnum(entry[0])][index.*] = .{
.op_en = entry[1],
.ops = (entry[2] ++ .{.none} ** (ops_len - entry[2].len)).*,
.opc_len = entry[3].len,
@@ -1043,14 +1058,14 @@ const mnemonic_to_encodings_map = init: {
.mode = entry[5],
.feature = entry[6],
};
i.* += 1;
index.* += 1;
}
const final_storage = data_storage;
var final_map: [mnemonic_count][]const Data = @splat(&.{});
storage_i = 0;
storage_index = 0;
for (&final_map, mnemonic_map) |*final_value, value| {
final_value.* = final_storage[storage_i..][0..value.len];
storage_i += value.len;
final_value.* = final_storage[storage_index..][0..value.len];
storage_index += value.len;
}
break :init final_map;
};

View File

@@ -567,7 +567,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
}
fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
@setEvalBranchQuota(2_500);
@setEvalBranchQuota(2_800);
const fixes = switch (inst.ops) {
.none => inst.data.none.fixes,
.inst => inst.data.inst.fixes,
@@ -601,9 +601,9 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
var buf: [max_len]u8 = undefined;
const fixes_name = @tagName(fixes);
const pattern = fixes_name[if (std.mem.indexOfScalar(u8, fixes_name, ' ')) |i| i + 1 else 0..];
const wildcard_i = std.mem.indexOfScalar(u8, pattern, '_').?;
const parts = .{ pattern[0..wildcard_i], @tagName(inst.tag), pattern[wildcard_i + 1 ..] };
const pattern = fixes_name[if (std.mem.indexOfScalar(u8, fixes_name, ' ')) |i| i + " ".len else 0..];
const wildcard_index = std.mem.indexOfScalar(u8, pattern, '_').?;
const parts = .{ pattern[0..wildcard_index], @tagName(inst.tag), pattern[wildcard_index + "_".len ..] };
const err_msg = "unsupported mnemonic: ";
const mnemonic = std.fmt.bufPrint(&buf, "{s}{s}{s}", parts) catch
return lower.fail(err_msg ++ "'{s}{s}{s}'", parts);

View File

@@ -34,6 +34,16 @@ pub const Inst = struct {
/// ___ 4
_4,
/// ___ Demote
_demote,
/// ___ Flush
_flush,
/// ___ Flush Optimized
_flushopt,
/// ___ Instructions With T0 Hint
_it0,
/// ___ Instructions With T1 Hint
_it1,
/// ___ With NTA Hint
_nta,
/// System Call ___
@@ -44,6 +54,8 @@ pub const Inst = struct {
_t1,
/// ___ With T2 Hint
_t2,
/// ___ Write Back
_wb,
/// ___ With Intent to Write and T1 Hint
_wt1,
@@ -53,6 +65,8 @@ pub const Inst = struct {
_csspq,
/// ___ FS Segment Base
_fsbase,
/// ___ GS
_gs,
/// ___ GS Segment Base
_gsbase,
/// ___ Model Specific Register
@@ -67,8 +81,14 @@ pub const Inst = struct {
_pmc,
/// ___ Random Number
_rand,
/// ___ r Busy Flag in a Supervisor Shadow Stack token
_rssbsy,
/// ___ Random Seed
_seed,
/// ___ Shadow Stack Doubleword
_ssd,
/// ___ Shadow Stack Quadword
_ssq,
/// ___ Shadow Stack Pointer Doubleword
_sspd,
/// ___ Shadow Stack Pointer Quadword
@@ -77,9 +97,15 @@ pub const Inst = struct {
_tsc,
/// ___ Time-Stamp Counter And Processor ID
_tscp,
/// ___ User Shadow Stack Doubleword
_ussd,
/// ___ User Shadow Stack Quadword
_ussq,
/// VEX-Encoded ___ MXCSR
v_mxcsr,
/// Byte ___
b_,
/// Interrupt ___
/// Integer ___
i_,
@@ -118,6 +144,8 @@ pub const Inst = struct {
_ld,
/// ___ Left Without Affecting Flags
_lx,
/// ___ Mask
_msk,
/// ___ Right
/// ___ For Reading
/// ___ Register
@@ -139,6 +167,7 @@ pub const Inst = struct {
/// ___ Below
_b,
/// ___ Below Or Equal
/// ___ Big Endian
_be,
/// ___ Carry
/// ___ Carry Flag
@@ -212,8 +241,12 @@ pub const Inst = struct {
_w,
/// ___ Doubleword
//_d,
/// ___ Double Quadword to Quadword
_dq2q,
/// ___ QuadWord
_q,
/// ___ Quadword to Double Quadword
_q2dq,
/// ___ String
//_s,
@@ -369,6 +402,8 @@ pub const Inst = struct {
fn_sw,
/// Float Extended ___
fx_,
/// Float Extended ___ 64
fx_64,
/// ___ in 32-bit and Compatibility Mode
_32,
@@ -386,7 +421,18 @@ pub const Inst = struct {
/// Packed ___ Quadword
p_q,
/// Packed ___ Double Quadword
/// Packed ___ Doubleword to Quadword
p_dq,
/// Packed ___ Unsigned Doubleword to Quadword
p_udq,
/// Packed Carry-Less ___ Quadword to Double Quadword
pcl_qdq,
/// Packed Half ___ Doubleword
ph_d,
/// Packed Half ___ Saturate Word
ph_sw,
/// Packed Half ___ Word
ph_w,
/// ___ Aligned Packed Integer Values
_dqa,
/// ___ Unaligned Packed Integer Values
@@ -400,6 +446,10 @@ pub const Inst = struct {
//_sd,
/// ___ Packed Double-Precision Values
_pd,
/// Half ___ Packed Single-Precision Values
h_ps,
/// Half ___ Packed Double-Precision Values
h_pd,
/// ___ Internal Caches
//_d,
@@ -427,7 +477,7 @@ pub const Inst = struct {
v_w,
/// VEX-Encoded ___ Doubleword
v_d,
/// VEX-Encoded ___ QuadWord
/// VEX-Encoded ___ Quadword
v_q,
/// VEX-Encoded ___ Aligned Packed Integer Values
v_dqa,
@@ -446,7 +496,18 @@ pub const Inst = struct {
/// VEX-Encoded Packed ___ Quadword
vp_q,
/// VEX-Encoded Packed ___ Double Quadword
/// VEX-Encoded Packed ___ Doubleword to Quadword
vp_dq,
/// VEX-Encoded Packed ___ Unsigned Doubleword to Quadword
vp_udq,
/// VEX-Encoded Packed Carry-Less ___ Quadword to Double Quadword
vpcl_qdq,
/// VEX-Encoded Packed Half ___ Doubleword
vph_d,
/// VEX-Encoded Packed Half ___ Saturate Word
vph_sw,
/// VEX-Encoded Packed Half ___ Word
vph_w,
/// VEX-Encoded ___ Scalar Single-Precision Values
v_ss,
/// VEX-Encoded ___ Packed Single-Precision Values
@@ -457,6 +518,10 @@ pub const Inst = struct {
v_pd,
/// VEX-Encoded ___ 128-Bits Of Floating-Point Data
v_f128,
/// VEX-Encoded Half ___ Packed Single-Precision Values
vh_ps,
/// VEX-Encoded Half ___ Packed Double-Precision Values
vh_pd,
/// ___ 128-bit key with key locker
_128,
@@ -504,6 +569,10 @@ pub const Inst = struct {
/// Add scalar single-precision floating-point values
/// Add packed double-precision floating-point values
/// Add scalar double-precision floating-point values
/// Packed single-precision floating-point horizontal add
/// Packed double-precision floating-point horizontal add
/// Packed horizontal add
/// Packed horizontal add and saturate
add,
/// Logical and
/// Bitwise logical and of packed single-precision floating-point values
@@ -515,12 +584,15 @@ pub const Inst = struct {
/// Bit scan reverse
bs,
/// Byte swap
bswap,
/// Swap GS base register
swap,
/// Bit test
/// Bit test and complement
/// Bit test and reset
/// Bit test and set
bt,
/// Check array index against bounds
bound,
/// Call
/// Fast system call
call,
@@ -536,17 +608,12 @@ pub const Inst = struct {
/// Clear interrupt flag
/// Clear task-switched flag in CR0
/// Clear user interrupt flag
cl,
/// Cache line demote
cldemote,
/// Flush cache line
clflush,
/// Flush cache line optimized
clflushopt,
/// Clear busy flag in a supervisor shadow stack token
clrssbsy,
/// Cache line write back
clwb,
cl,
/// Complement carry flag
cmc,
/// Conditional move
@@ -644,15 +711,16 @@ pub const Inst = struct {
lzcnt,
/// Move
/// Move data from string to string
/// Move data after swapping bytes
/// Move scalar single-precision floating-point value
/// Move scalar double-precision floating-point value
/// Move doubleword
/// Move quadword
/// Move aligned packed integer values
/// Move unaligned packed integer values
/// Move quadword from XMM to MMX technology register
/// Move quadword from MMX technology to XMM register
mov,
/// Move data after swapping bytes
movbe,
/// Move with sign extension
movsx,
/// Move with zero extension
@@ -663,6 +731,9 @@ pub const Inst = struct {
/// Multiply scalar single-precision floating-point values
/// Multiply packed double-precision floating-point values
/// Multiply scalar double-precision floating-point values
/// Multiply packed unsigned doubleword integers
/// Multiply packed doubleword integers
/// Carry-less multiplication quadword
mul,
/// Two's complement negation
neg,
@@ -729,6 +800,8 @@ pub const Inst = struct {
sca,
/// Send user interprocessor interrupt
senduipi,
/// Serialize instruction execution
serialize,
/// Set byte on condition
set,
/// Logical shift left
@@ -750,6 +823,10 @@ pub const Inst = struct {
/// Subtract scalar single-precision floating-point values
/// Subtract packed double-precision floating-point values
/// Subtract scalar double-precision floating-point values
/// Packed single-precision floating-point horizontal subtract
/// Packed double-precision floating-point horizontal subtract
/// Packed horizontal subtract
/// Packed horizontal subtract and saturate
sub,
/// Set carry flag
/// Set direction flag
@@ -764,8 +841,6 @@ pub const Inst = struct {
st,
/// Store string
sto,
/// Swap GS base register
swapgs,
/// Test condition
/// Logical compare
/// Packed bit test
@@ -780,6 +855,8 @@ pub const Inst = struct {
/// Write to model specific register
/// Write to model specific register
/// Write to model specific register
/// Write to shadow stack
/// Write to user shadow stack
wr,
/// Exchange and add
xadd,
@@ -896,6 +973,10 @@ pub const Inst = struct {
cmpgt,
/// Empty MMX technology state
emms,
/// Multiply and add packed signed and unsigned bytes
maddubs,
/// Multiply and add packed integers
maddw,
/// Multiply packed signed integers and store low result
mull,
/// Multiply packed signed integers and store high result
@@ -924,6 +1005,8 @@ pub const Inst = struct {
unpcklwd,
// SSE
/// Average packed integers
avg,
/// Convert packed doubleword integers to packed single-precision floating-point values
/// Convert packed doubleword integers to packed double-precision floating-point values
cvtpi2,
@@ -986,9 +1069,13 @@ pub const Inst = struct {
/// Move unaligned packed single-precision floating-point values
/// Move unaligned packed double-precision floating-point values
movu,
/// Multiply packed unsigned integers and store high result
mulhu,
/// Prefetch data into caches
/// Prefetch data into caches with intent to write
prefetch,
/// Compute sum of absolute differences
sadb,
/// Packed interleave shuffle of quadruplets of single-precision floating-point values
/// Packed interleave shuffle of pairs of double-precision floating-point values
/// Shuffle packed doublewords
@@ -1048,9 +1135,6 @@ pub const Inst = struct {
/// Packed single-precision floating-point add/subtract
/// Packed double-precision floating-point add/subtract
addsub,
/// Packed single-precision floating-point horizontal add
/// Packed double-precision floating-point horizontal add
hadd,
/// Replicate double floating-point values
movddup,
/// Replicate single floating-point values
@@ -1061,6 +1145,10 @@ pub const Inst = struct {
// SSSE3
/// Packed align right
alignr,
/// Packed multiply high with round and scale
mulhrs,
/// Packed sign
sign,
// SSE4.1
/// Pack with unsigned saturation
@@ -1085,6 +1173,8 @@ pub const Inst = struct {
/// Insert scalar single-precision floating-point value
/// Insert packed floating-point values
insert,
/// Packed horizontal word minimum
minposu,
/// Packed move with sign extend
movsxb,
movsxd,
@@ -1103,10 +1193,6 @@ pub const Inst = struct {
/// Accumulate CRC32 value
crc32,
// PCLMUL
/// Carry-less multiplication quadword
clmulq,
// AES
/// Perform one round of an AES decryption flow
/// Perform ten rounds of AES decryption flow with key locker using 128-bit key
@@ -1626,12 +1712,51 @@ pub const Inst = struct {
reg_list: RegisterList,
};
// Make sure we don't accidentally make instructions bigger than expected.
// Note that in safety builds, Zig is allowed to insert a secret field for safety checks.
comptime {
if (!std.debug.runtime_safety) {
// Make sure we don't accidentally make instructions bigger than expected.
// Note that in safety builds, Zig is allowed to insert a secret field for safety checks.
assert(@sizeOf(Data) == 8);
}
const Mnemonic = @import("Encoding.zig").Mnemonic;
if (@typeInfo(Mnemonic).@"enum".fields.len != 977 or
@typeInfo(Fixes).@"enum".fields.len != 231 or
@typeInfo(Tag).@"enum".fields.len != 251)
{
const cond_src = (struct {
fn src() std.builtin.SourceLocation {
return @src();
}
}).src();
@setEvalBranchQuota(1_750_000);
for (@typeInfo(Mnemonic).@"enum".fields) |mnemonic| {
if (mnemonic.name[0] == '.') continue;
for (@typeInfo(Fixes).@"enum".fields) |fixes| {
const pattern = fixes.name[if (std.mem.indexOfScalar(u8, fixes.name, ' ')) |index| index + " ".len else 0..];
const wildcard_index = std.mem.indexOfScalar(u8, pattern, '_').?;
const mnem_prefix = pattern[0..wildcard_index];
const mnem_suffix = pattern[wildcard_index + "_".len ..];
if (!std.mem.startsWith(u8, mnemonic.name, mnem_prefix)) continue;
if (!std.mem.endsWith(u8, mnemonic.name, mnem_suffix)) continue;
if (@hasField(
Tag,
mnemonic.name[mnem_prefix.len .. mnemonic.name.len - mnem_suffix.len],
)) break;
} else @compileError("'" ++ mnemonic.name ++ "' is not encodable in Mir");
}
@compileError(std.fmt.comptimePrint(
\\All mnemonics are encodable in Mir! You may now change the condition at {s}:{d} to:
\\if (@typeInfo(Mnemonic).@"enum".fields.len != {d} or
\\ @typeInfo(Fixes).@"enum".fields.len != {d} or
\\ @typeInfo(Tag).@"enum".fields.len != {d})
, .{
cond_src.file,
cond_src.line - 6,
@typeInfo(Mnemonic).@"enum".fields.len,
@typeInfo(Fixes).@"enum".fields.len,
@typeInfo(Tag).@"enum".fields.len,
}));
}
}
};

View File

@@ -684,8 +684,8 @@
.{ .rdseed, .m, .{ .r32 }, .{ 0x0f, 0xc7 }, 7, .none, .rdseed },
.{ .rdseed, .m, .{ .r64 }, .{ 0x0f, 0xc7 }, 7, .long, .rdseed },
.{ .rdssd, .m, .{ .r32 }, .{ 0xf3, 0x0f, 0x1e }, 1, .none, .shstk },
.{ .rdssq, .m, .{ .r64 }, .{ 0xf3, 0x0f, 0x1e }, 1, .long, .shstk },
.{ .rdsspd, .m, .{ .r32 }, .{ 0xf3, 0x0f, 0x1e }, 1, .none, .shstk },
.{ .rdsspq, .m, .{ .r64 }, .{ 0xf3, 0x0f, 0x1e }, 1, .long, .shstk },
.{ .rdtsc, .z, .{}, .{ 0x0f, 0x31 }, 0, .none, .none },
@@ -1111,10 +1111,10 @@
.{ .fcomp, .z, .{ }, .{ 0xd8, 0xd9 }, 0, .none, .x87 },
.{ .fcompp, .z, .{ }, .{ 0xde, 0xd9 }, 0, .none, .x87 },
.{ .fcomi, .zo, .{ .st0, .st }, .{ 0xdb, 0xf0 }, 0, .none, .x87 },
.{ .fcomip, .zo, .{ .st0, .st }, .{ 0xdf, 0xf0 }, 0, .none, .x87 },
.{ .fucomi, .zo, .{ .st0, .st }, .{ 0xdb, 0xe8 }, 0, .none, .x87 },
.{ .fucomip, .zo, .{ .st0, .st }, .{ 0xdf, 0xe8 }, 0, .none, .x87 },
.{ .fcomi, .zo, .{ .st0, .st }, .{ 0xdb, 0xf0 }, 0, .none, .@"cmov x87" },
.{ .fcomip, .zo, .{ .st0, .st }, .{ 0xdf, 0xf0 }, 0, .none, .@"cmov x87" },
.{ .fucomi, .zo, .{ .st0, .st }, .{ 0xdb, 0xe8 }, 0, .none, .@"cmov x87" },
.{ .fucomip, .zo, .{ .st0, .st }, .{ 0xdf, 0xe8 }, 0, .none, .@"cmov x87" },
.{ .fcos, .z, .{}, .{ 0xd9, 0xff }, 0, .none, .x87 },
@@ -1160,10 +1160,6 @@
.{ .fistp, .m, .{ .m32 }, .{ 0xdb }, 3, .none, .x87 },
.{ .fistp, .m, .{ .m64 }, .{ 0xdf }, 7, .none, .x87 },
.{ .fisttp, .m, .{ .m16 }, .{ 0xdf }, 1, .none, .x87 },
.{ .fisttp, .m, .{ .m32 }, .{ 0xdb }, 1, .none, .x87 },
.{ .fisttp, .m, .{ .m64 }, .{ 0xdd }, 1, .none, .x87 },
.{ .fld, .m, .{ .m32 }, .{ 0xd9 }, 0, .none, .x87 },
.{ .fld, .m, .{ .m64 }, .{ 0xdd }, 0, .none, .x87 },
.{ .fld, .m, .{ .m80 }, .{ 0xdb }, 5, .none, .x87 },
@@ -1528,6 +1524,8 @@
.{ .pinsrw, .rmi, .{ .xmm, .r32_m16, .imm8 }, .{ 0x66, 0x0f, 0xc4 }, 0, .none, .sse2 },
.{ .pmaddwd, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf5 }, 0, .none, .sse2 },
.{ .pmaxsw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xee }, 0, .none, .sse2 },
.{ .pmaxub, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xde }, 0, .none, .sse2 },
@@ -1536,12 +1534,18 @@
.{ .pminub, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xda }, 0, .none, .sse2 },
.{ .pmulhuw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xe4 }, 0, .none, .sse2 },
.{ .pmulhw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xe5 }, 0, .none, .sse2 },
.{ .pmullw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xd5 }, 0, .none, .sse2 },
.{ .pmuludq, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf4 }, 0, .none, .sse2 },
.{ .por, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xeb }, 0, .none, .sse2 },
.{ .psadbw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf6 }, 0, .none, .sse2 },
.{ .pshufd, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0x66, 0x0f, 0x70 }, 0, .none, .sse2 },
.{ .pshufhw, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0xf3, 0x0f, 0x70 }, 0, .none, .sse2 },
@@ -1618,6 +1622,10 @@
.{ .addsubps, .rm, .{ .xmm, .xmm_m128 }, .{ 0xf2, 0x0f, 0xd0 }, 0, .none, .sse3 },
.{ .fisttp, .m, .{ .m16 }, .{ 0xdf }, 1, .none, .@"sse3 x87" },
.{ .fisttp, .m, .{ .m32 }, .{ 0xdb }, 1, .none, .@"sse3 x87" },
.{ .fisttp, .m, .{ .m64 }, .{ 0xdd }, 1, .none, .@"sse3 x87" },
.{ .haddpd, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x7c }, 0, .none, .sse3 },
.{ .haddps, .rm, .{ .xmm, .xmm_m128 }, .{ 0xf2, 0x0f, 0x7c }, 0, .none, .sse3 },
@@ -1640,8 +1648,26 @@
.{ .palignr, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x0f }, 0, .none, .ssse3 },
.{ .phaddw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x01 }, 0, .none, .ssse3 },
.{ .phaddd, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x02 }, 0, .none, .ssse3 },
.{ .phaddsw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x03 }, 0, .none, .ssse3 },
.{ .phsubw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x05 }, 0, .none, .ssse3 },
.{ .phsubd, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x06 }, 0, .none, .ssse3 },
.{ .phsubsw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x07 }, 0, .none, .ssse3 },
.{ .pmaddubsw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x04 }, 0, .none, .ssse3 },
.{ .pmulhrsw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x0b }, 0, .none, .ssse3 },
.{ .pshufb, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x00 }, 0, .none, .ssse3 },
.{ .psignb, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x08 }, 0, .none, .ssse3 },
.{ .psignw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x09 }, 0, .none, .ssse3 },
.{ .psignd, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x0a }, 0, .none, .ssse3 },
// SSE4.1
.{ .blendpd, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x0d }, 0, .none, .sse4_1 },
@@ -1676,6 +1702,8 @@
.{ .pextrw, .mri, .{ .r32_m16, .xmm, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 },
.{ .phminposuw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x41 }, 0, .none, .sse4_1 },
.{ .pinsrb, .rmi, .{ .xmm, .r32_m8, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .none, .sse4_1 },
.{ .pinsrd, .rmi, .{ .xmm, .rm32, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .none, .sse4_1 },
.{ .pinsrq, .rmi, .{ .xmm, .rm64, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .long, .sse4_1 },
@@ -1708,6 +1736,8 @@
.{ .pmovzxwq, .rm, .{ .xmm, .xmm_m32 }, .{ 0x66, 0x0f, 0x38, 0x34 }, 0, .none, .sse4_1 },
.{ .pmovzxdq, .rm, .{ .xmm, .xmm_m64 }, .{ 0x66, 0x0f, 0x38, 0x35 }, 0, .none, .sse4_1 },
.{ .pmuldq, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x28 }, 0, .none, .sse4_1 },
.{ .pmulld, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x40 }, 0, .none, .sse4_1 },
.{ .ptest, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x17 }, 0, .none, .sse4_1 },
@@ -2125,12 +2155,28 @@
.{ .vpextrw, .rmi, .{ .r32, .xmm, .imm8 }, .{ 0x66, 0x0f, 0xc5 }, 0, .vex_128_w0, .avx },
.{ .vpextrw, .mri, .{ .r32_m16, .xmm, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_w0, .avx },
.{ .vphaddw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x01 }, 0, .vex_128_wig, .avx },
.{ .vphaddd, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x02 }, 0, .vex_128_wig, .avx },
.{ .vphaddsw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x03 }, 0, .vex_128_wig, .avx },
.{ .vphminposuw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x41 }, 0, .vex_128_wig, .avx },
.{ .vphsubw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x05 }, 0, .vex_128_wig, .avx },
.{ .vphsubd, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x06 }, 0, .vex_128_wig, .avx },
.{ .vphsubsw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x07 }, 0, .vex_128_wig, .avx },
.{ .vpinsrb, .rvmi, .{ .xmm, .xmm, .r32_m8, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .vex_128_w0, .avx },
.{ .vpinsrd, .rvmi, .{ .xmm, .xmm, .rm32, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w0, .avx },
.{ .vpinsrq, .rvmi, .{ .xmm, .xmm, .rm64, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w1, .avx },
.{ .vpinsrw, .rvmi, .{ .xmm, .xmm, .r32_m16, .imm8 }, .{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_w0, .avx },
.{ .vpmaddubsw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x04 }, 0, .vex_128_wig, .avx },
.{ .vpmaddwd, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf5 }, 0, .vex_128_wig, .avx },
.{ .vpmaxsb, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x3c }, 0, .vex_128_wig, .avx },
.{ .vpmaxsw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xee }, 0, .vex_128_wig, .avx },
.{ .vpmaxsd, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x3d }, 0, .vex_128_wig, .avx },
@@ -2166,14 +2212,24 @@
.{ .vpmovzxwq, .rm, .{ .xmm, .xmm_m32 }, .{ 0x66, 0x0f, 0x38, 0x34 }, 0, .vex_128_wig, .avx },
.{ .vpmovzxdq, .rm, .{ .xmm, .xmm_m64 }, .{ 0x66, 0x0f, 0x38, 0x35 }, 0, .vex_128_wig, .avx },
.{ .vpmuldq, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x28 }, 0, .vex_128_wig, .avx },
.{ .vpmulhrsw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x0b }, 0, .vex_128_wig, .avx },
.{ .vpmulhuw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xe4 }, 0, .vex_128_wig, .avx },
.{ .vpmulhw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xe5 }, 0, .vex_128_wig, .avx },
.{ .vpmulld, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_128_wig, .avx },
.{ .vpmullw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xd5 }, 0, .vex_128_wig, .avx },
.{ .vpmuludq, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf4 }, 0, .vex_128_wig, .avx },
.{ .vpor, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xeb }, 0, .vex_128_wig, .avx },
.{ .vpsadbw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf6 }, 0, .vex_128_wig, .avx },
.{ .vpshufb, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_128_wig, .avx },
.{ .vpshufd, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0x66, 0x0f, 0x70 }, 0, .vex_128_wig, .avx },
@@ -2182,6 +2238,10 @@
.{ .vpshuflw, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0xf2, 0x0f, 0x70 }, 0, .vex_128_wig, .avx },
.{ .vpsignb, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x08 }, 0, .vex_128_wig, .avx },
.{ .vpsignw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x09 }, 0, .vex_128_wig, .avx },
.{ .vpsignd, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x0a }, 0, .vex_128_wig, .avx },
.{ .vpsllw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf1 }, 0, .vex_128_wig, .avx },
.{ .vpsllw, .vmi, .{ .xmm, .xmm, .imm8 }, .{ 0x66, 0x0f, 0x71 }, 6, .vex_128_wig, .avx },
.{ .vpslld, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf2 }, 0, .vex_128_wig, .avx },
@@ -2439,6 +2499,16 @@
.{ .vpcmpgtq, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x37 }, 0, .vex_256_wig, .avx2 },
.{ .vphaddw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x01 }, 0, .vex_256_wig, .avx2 },
.{ .vphaddd, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x02 }, 0, .vex_256_wig, .avx2 },
.{ .vphaddsw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x03 }, 0, .vex_256_wig, .avx2 },
.{ .vphaddw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x05 }, 0, .vex_256_wig, .avx2 },
.{ .vphaddd, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x06 }, 0, .vex_256_wig, .avx2 },
.{ .vphaddsw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x07 }, 0, .vex_256_wig, .avx2 },
.{ .vperm2i128, .rvmi, .{ .ymm, .ymm, .ymm_m256, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x46 }, 0, .vex_256_w0, .avx2 },
.{ .vpermd, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x36 }, 0, .vex_256_w0, .avx2 },
@@ -2449,6 +2519,10 @@
.{ .vpermq, .rmi, .{ .ymm, .ymm_m256, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x00 }, 0, .vex_256_w1, .avx2 },
.{ .vpmaddubsw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x04 }, 0, .vex_256_wig, .avx2 },
.{ .vpmaddwd, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xf5 }, 0, .vex_256_wig, .avx2 },
.{ .vpmaskmovd, .rvm, .{ .xmm, .xmm, .m128 }, .{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_128_w0, .avx2 },
.{ .vpmaskmovd, .rvm, .{ .ymm, .ymm, .m256 }, .{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_256_w0, .avx2 },
.{ .vpmaskmovq, .rvm, .{ .xmm, .xmm, .m128 }, .{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_128_w1, .avx2 },
@@ -2493,14 +2567,24 @@
.{ .vpmovzxwq, .rm, .{ .ymm, .xmm_m64 }, .{ 0x66, 0x0f, 0x38, 0x34 }, 0, .vex_256_wig, .avx2 },
.{ .vpmovzxdq, .rm, .{ .ymm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x35 }, 0, .vex_256_wig, .avx2 },
.{ .vpmuldq, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x28 }, 0, .vex_256_wig, .avx2 },
.{ .vpmulhrsw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x0b }, 0, .vex_256_wig, .avx2 },
.{ .vpmulhuw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xe4 }, 0, .vex_256_wig, .avx2 },
.{ .vpmulhw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xe5 }, 0, .vex_256_wig, .avx2 },
.{ .vpmulld, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_256_wig, .avx2 },
.{ .vpmullw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xd5 }, 0, .vex_256_wig, .avx2 },
.{ .vpmuludq, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xf4 }, 0, .vex_256_wig, .avx2 },
.{ .vpor, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xeb }, 0, .vex_256_wig, .avx2 },
.{ .vpsadbw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xf6 }, 0, .vex_256_wig, .avx2 },
.{ .vpshufb, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_256_wig, .avx2 },
.{ .vpshufd, .rmi, .{ .ymm, .ymm_m256, .imm8 }, .{ 0x66, 0x0f, 0x70 }, 0, .vex_256_wig, .avx2 },
@@ -2508,6 +2592,10 @@
.{ .vpshuflw, .rmi, .{ .ymm, .ymm_m256, .imm8 }, .{ 0xf2, 0x0f, 0x70 }, 0, .vex_256_wig, .avx2 },
.{ .vpsignb, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x08 }, 0, .vex_256_wig, .avx2 },
.{ .vpsignw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x09 }, 0, .vex_256_wig, .avx2 },
.{ .vpsignd, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x0a }, 0, .vex_256_wig, .avx2 },
.{ .vpsllw, .rvm, .{ .ymm, .ymm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf1 }, 0, .vex_256_wig, .avx2 },
.{ .vpsllw, .vmi, .{ .ymm, .ymm, .imm8 }, .{ 0x66, 0x0f, 0x71 }, 6, .vex_256_wig, .avx2 },
.{ .vpslld, .rvm, .{ .ymm, .ymm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf2 }, 0, .vex_256_wig, .avx2 },

View File

@@ -1374,6 +1374,10 @@ pub const Pool = struct {
.i64_type => return .i64,
.u80_type, .u128_type => return .u128,
.i128_type => return .i128,
.u256_type => return pool.fromIntInfo(allocator, .{
.signedness = .unsigned,
.bits = 256,
}, mod, kind),
.usize_type => return .usize,
.isize_type => return .isize,
.c_char_type => return .{ .index = .char },
@@ -1488,6 +1492,21 @@ pub const Pool = struct {
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_64_i8_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .i8,
.len = 64,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.i8.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_1_u8_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .u8,
@@ -1578,6 +1597,36 @@ pub const Pool = struct {
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_64_u8_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .u8,
.len = 64,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.u8.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_2_i16_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .i16,
.len = 2,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.i16.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_4_i16_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .i16,
@@ -1623,6 +1672,21 @@ pub const Pool = struct {
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_32_i16_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .i16,
.len = 32,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.i16.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_4_u16_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .u16,
@@ -1668,6 +1732,36 @@ pub const Pool = struct {
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_32_u16_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .u16,
.len = 32,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.u16.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_2_i32_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .i32,
.len = 2,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.i32.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_4_i32_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .i32,
@@ -1698,6 +1792,21 @@ pub const Pool = struct {
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_16_i32_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .i32,
.len = 16,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.i32.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_4_u32_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .u32,
@@ -1728,6 +1837,21 @@ pub const Pool = struct {
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_16_u32_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .u32,
.len = 16,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.u32.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_2_i64_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .i64,
@@ -1758,6 +1882,21 @@ pub const Pool = struct {
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_8_i64_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .i64,
.len = 8,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.i64.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_2_u64_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .u64,
@@ -1788,6 +1927,36 @@ pub const Pool = struct {
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_8_u64_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .u64,
.len = 8,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.u64.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_1_u128_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .u128,
.len = 1,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.u128.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_2_u128_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .u128,
@@ -1803,6 +1972,24 @@ pub const Pool = struct {
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_1_u256_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = try pool.fromIntInfo(allocator, .{
.signedness = .unsigned,
.bits = 256,
}, mod, kind),
.len = 1,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.u256.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_4_f16_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .f16,
@@ -1833,6 +2020,36 @@ pub const Pool = struct {
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_16_f16_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .f16,
.len = 16,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.f16.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_32_f16_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .f16,
.len = 32,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.f16.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_2_f32_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .f32,
@@ -1878,6 +2095,21 @@ pub const Pool = struct {
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_16_f32_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .f32,
.len = 16,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.f32.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_2_f64_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .f64,
@@ -1908,6 +2140,21 @@ pub const Pool = struct {
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.vector_8_f64_type => {
const vector_ctype = try pool.getVector(allocator, .{
.elem_ctype = .f64,
.len = 8,
});
if (!kind.isParameter()) return vector_ctype;
var fields = [_]Info.Field{
.{
.name = .{ .index = .array },
.ctype = vector_ctype,
.alignas = AlignAs.fromAbiAlignment(Type.f64.abiAlignment(zcu)),
},
};
return pool.fromFields(allocator, .@"struct", &fields, kind);
},
.undef,
.zero,

View File

@@ -970,7 +970,6 @@ test "store array of array of structs at comptime" {
}
test "accessing multidimensional global array at comptime" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;

View File

@@ -2714,7 +2714,6 @@ test "result type is preserved into comptime block" {
}
test "bitcast vector" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO

View File

@@ -236,11 +236,11 @@ test "vector cmp f16" {
if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
if (builtin.cpu.arch.isArm()) return error.SkipZigTest;
if (builtin.cpu.arch.isPowerPC64()) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .hexagon) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
try testCmpVector(f16);
try comptime testCmpVector(f16);
@@ -250,11 +250,11 @@ test "vector cmp f32" {
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
if (builtin.cpu.arch.isArm()) return error.SkipZigTest;
if (builtin.cpu.arch.isPowerPC64()) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .hexagon) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
try testCmpVector(f32);
try comptime testCmpVector(f32);
@@ -263,11 +263,11 @@ test "vector cmp f32" {
test "vector cmp f64" {
if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
if (builtin.cpu.arch.isArm()) return error.SkipZigTest;
if (builtin.cpu.arch.isPowerPC64()) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .hexagon) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
try testCmpVector(f64);
try comptime testCmpVector(f64);
@@ -279,25 +279,32 @@ test "vector cmp f128" {
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_c and builtin.cpu.arch.isArm()) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
if (builtin.cpu.arch.isArm()) return error.SkipZigTest;
if (builtin.cpu.arch.isPowerPC64()) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .hexagon) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
try testCmpVector(f128);
try comptime testCmpVector(f128);
}
test "vector cmp f80/c_longdouble" {
if (true) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .hexagon) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .powerpc64le) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
try testCmpVector(f80);
try comptime testCmpVector(f80);
try testCmpVector(c_longdouble);
try comptime testCmpVector(c_longdouble);
}
fn testCmpVector(comptime T: type) !void {
@setEvalBranchQuota(2_000);
var edges = [_]T{
-math.inf(T),
-math.floatMax(T),

View File

@@ -1164,7 +1164,6 @@ test "assignment to non-byte-aligned field in packed struct" {
}
test "packed struct field pointer aligned properly" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO

View File

@@ -326,7 +326,6 @@ test "lazy abi size used in comparison" {
}
test "peer type resolution with @TypeOf doesn't trigger dependency loop check" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;

View File

@@ -994,7 +994,6 @@ test "modify slice length at comptime" {
}
test "slicing zero length array field of struct" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
@@ -1011,7 +1010,6 @@ test "slicing zero length array field of struct" {
}
test "slicing slices gives correct result" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
@@ -1026,7 +1024,6 @@ test "slicing slices gives correct result" {
}
test "get address of element of zero-sized slice" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
@@ -1040,7 +1037,6 @@ test "get address of element of zero-sized slice" {
}
test "sentinel-terminated 0-length slices" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;

View File

@@ -1835,7 +1835,6 @@ test "tuple with comptime-only field" {
}
test "extern struct fields are aligned to 1" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -1905,7 +1904,6 @@ test "array of structs inside struct initialized with undefined" {
}
test "runtime call in nested initializer" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
@@ -1938,7 +1936,6 @@ test "runtime call in nested initializer" {
}
test "runtime value in nested initializer passed as pointer to function" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -1963,7 +1960,6 @@ test "runtime value in nested initializer passed as pointer to function" {
}
test "struct field default value is a call" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO

View File

@@ -47,7 +47,6 @@ test "try then not executed with assignment" {
}
test "`try`ing an if/else expression" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
@@ -69,7 +68,6 @@ test "`try`ing an if/else expression" {
}
test "'return try' of empty error set in function returning non-error" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;

View File

@@ -89,7 +89,6 @@ test "type name of undefined" {
var buf: []u8 = undefined;
test "reslice of undefined global var slice" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;

View File

@@ -2129,7 +2129,6 @@ test "copied union field doesn't alias source" {
}
test "create union(enum) from other union(enum)" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
@@ -2254,7 +2253,6 @@ test "matching captures causes union equivalence" {
}
test "signed enum tag with negative value" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;

View File

@@ -756,15 +756,15 @@ test "vector shift operators" {
test "vector reduce operation" {
if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_c and builtin.cpu.arch.isArm()) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
if (builtin.cpu.arch.isMIPS64()) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/21091
if (builtin.cpu.arch.isSPARC()) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/23719
if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch.isMIPS64()) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/21091
if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch.isSPARC()) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/23719
if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
const S = struct {
fn testReduce(comptime op: std.builtin.ReduceOp, x: anytype, expected: anytype) !void {
@@ -1548,7 +1548,6 @@ test "index into comptime-known vector is comptime-known" {
}
test "arithmetic on zero-length vectors" {
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -1567,7 +1566,6 @@ test "arithmetic on zero-length vectors" {
test "@reduce on bool vector" {
if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_x86) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO

View File

@@ -7,8 +7,8 @@ test {
if (builtin.object_format == .macho) return error.SkipZigTest;
// COFF linker does not support the new backend.
if (builtin.object_format == .coff) return error.SkipZigTest;
_ = @import("x86_64/access.zig");
_ = @import("x86_64/binary.zig");
_ = @import("x86_64/cast.zig");
_ = @import("x86_64/mem.zig");
_ = @import("x86_64/unary.zig");
}

View File

@@ -5434,6 +5434,60 @@ test optionalsNotEqual {
try test_optionals_not_equal.testFloats();
}
inline fn reduceAndEqual(comptime Type: type, lhs: Type, rhs: Type) bool {
return @reduce(.And, lhs == rhs);
}
test reduceAndEqual {
const test_reduce_and_equal = binary(reduceAndEqual, .{});
try test_reduce_and_equal.testIntVectors();
try test_reduce_and_equal.testFloatVectors();
}
inline fn reduceAndNotEqual(comptime Type: type, lhs: Type, rhs: Type) bool {
return @reduce(.And, lhs != rhs);
}
test reduceAndNotEqual {
const test_reduce_and_not_equal = binary(reduceAndNotEqual, .{});
try test_reduce_and_not_equal.testIntVectors();
try test_reduce_and_not_equal.testFloatVectors();
}
inline fn reduceOrEqual(comptime Type: type, lhs: Type, rhs: Type) bool {
return @reduce(.Or, lhs == rhs);
}
test reduceOrEqual {
const test_reduce_or_equal = binary(reduceOrEqual, .{});
try test_reduce_or_equal.testIntVectors();
try test_reduce_or_equal.testFloatVectors();
}
inline fn reduceOrNotEqual(comptime Type: type, lhs: Type, rhs: Type) bool {
return @reduce(.Or, lhs != rhs);
}
test reduceOrNotEqual {
const test_reduce_or_not_equal = binary(reduceOrNotEqual, .{});
try test_reduce_or_not_equal.testIntVectors();
try test_reduce_or_not_equal.testFloatVectors();
}
inline fn reduceXorEqual(comptime Type: type, lhs: Type, rhs: Type) bool {
return @reduce(.Xor, lhs == rhs);
}
test reduceXorEqual {
const test_reduce_xor_equal = binary(reduceXorEqual, .{});
try test_reduce_xor_equal.testIntVectors();
try test_reduce_xor_equal.testFloatVectors();
}
inline fn reduceXorNotEqual(comptime Type: type, lhs: Type, rhs: Type) bool {
return @reduce(.Xor, lhs != rhs);
}
test reduceXorNotEqual {
const test_reduce_xor_not_equal = binary(reduceXorNotEqual, .{});
try test_reduce_xor_not_equal.testIntVectors();
try test_reduce_xor_not_equal.testFloatVectors();
}
inline fn mulAdd(comptime Type: type, lhs: Type, rhs: Type) @TypeOf(@mulAdd(Type, lhs, rhs, rhs)) {
return @mulAdd(Type, lhs, rhs, rhs);
}

View File

@@ -87,7 +87,7 @@ pub fn build(b: *std.Build) void {
.{
.cpu_arch = .x86_64,
.cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v2 },
.cpu_features_add = std.Target.x86.featureSet(&.{ .adx, .gfni }),
.cpu_features_add = std.Target.x86.featureSet(&.{ .adx, .fast_hops, .gfni, .pclmul, .slow_incdec }),
},
.{
.cpu_arch = .x86_64,
@@ -97,6 +97,7 @@ pub fn build(b: *std.Build) void {
.{
.cpu_arch = .x86_64,
.cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v3 },
.cpu_features_add = std.Target.x86.featureSet(&.{ .adx, .fast_hops, .gfni, .pclmul, .slow_incdec }),
.cpu_features_sub = std.Target.x86.featureSet(&.{.avx2}),
},
.{
@@ -106,7 +107,7 @@ pub fn build(b: *std.Build) void {
.{
.cpu_arch = .x86_64,
.cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v3 },
.cpu_features_add = std.Target.x86.featureSet(&.{ .adx, .gfni }),
.cpu_features_add = std.Target.x86.featureSet(&.{ .adx, .fast_hops, .gfni, .slow_incdec, .vpclmulqdq }),
},
.{
.cpu_arch = .x86_64,
@@ -116,9 +117,9 @@ pub fn build(b: *std.Build) void {
const target = b.resolveTargetQuery(query);
const cpu = query.serializeCpuAlloc(b.allocator) catch @panic("OOM");
for ([_][]const u8{
"access.zig",
"binary.zig",
"cast.zig",
"mem.zig",
"unary.zig",
}) |path| {
const test_mod = b.createModule(.{

View File

@@ -125,7 +125,7 @@ fn boolOr(lhs: anytype, rhs: @TypeOf(lhs)) @TypeOf(lhs) {
@compileError("unsupported boolOr type: " ++ @typeName(@TypeOf(lhs)));
}
pub const Compare = enum { strict, relaxed, approx, approx_int };
pub const Compare = enum { strict, relaxed, approx, approx_int, approx_or_overflow };
// noinline for a more helpful stack trace
pub noinline fn checkExpected(expected: anytype, actual: @TypeOf(expected), comptime compare: Compare) !void {
const Expected = @TypeOf(expected);
@@ -137,20 +137,32 @@ pub noinline fn checkExpected(expected: anytype, actual: @TypeOf(expected), comp
break :unexpected switch (compare) {
.strict => boolOr(unequal, sign(expected) != sign(actual)),
.relaxed => unequal,
.approx, .approx_int => comptime unreachable,
.approx, .approx_int, .approx_or_overflow => comptime unreachable,
};
},
.approx, .approx_int => {
.approx, .approx_int, .approx_or_overflow => {
const epsilon = math.floatEps(Scalar(Expected));
const tolerance = @sqrt(epsilon);
break :unexpected @abs(expected - actual) > @max(
const tolerance = switch (compare) {
.strict, .relaxed => comptime unreachable,
.approx, .approx_int => @sqrt(epsilon),
.approx_or_overflow => @exp2(@log2(epsilon) * 0.4),
};
const approx_unequal = @abs(expected - actual) > @max(
@abs(expected) * splat(Expected, tolerance),
splat(Expected, switch (compare) {
.strict, .relaxed => comptime unreachable,
.approx => tolerance,
.approx, .approx_or_overflow => tolerance,
.approx_int => 1,
}),
);
break :unexpected switch (compare) {
.strict, .relaxed => comptime unreachable,
.approx, .approx_int => approx_unequal,
.approx_or_overflow => boolAnd(approx_unequal, boolOr(boolAnd(
@abs(expected) != splat(Expected, inf(Expected)),
@abs(actual) != splat(Expected, inf(Expected)),
), sign(expected) != sign(actual))),
};
},
},
.@"struct" => |@"struct"| inline for (@"struct".fields) |field| {

File diff suppressed because it is too large Load Diff

View File

@@ -117,9 +117,9 @@ export fn testMutablePointer() void {
// tmp.zig:37:38: note: imported here
// neg_inf.zon:1:1: error: expected type '?u8'
// tmp.zig:57:28: note: imported here
// neg_inf.zon:1:1: error: expected type 'tmp.testNonExhaustiveEnum__enum_501'
// neg_inf.zon:1:1: error: expected type 'tmp.testNonExhaustiveEnum__enum_518'
// tmp.zig:62:39: note: imported here
// neg_inf.zon:1:1: error: expected type 'tmp.testUntaggedUnion__union_503'
// neg_inf.zon:1:1: error: expected type 'tmp.testUntaggedUnion__union_520'
// tmp.zig:67:44: note: imported here
// neg_inf.zon:1:1: error: expected type 'tmp.testTaggedUnionVoid__union_506'
// neg_inf.zon:1:1: error: expected type 'tmp.testTaggedUnionVoid__union_523'
// tmp.zig:72:50: note: imported here

View File

@@ -15,6 +15,6 @@ pub export fn entry() void {
// error
//
// :7:25: error: unable to resolve comptime value
// :7:25: note: initializer of comptime-only struct 'tmp.S.foo__anon_475.C' must be comptime-known
// :7:25: note: initializer of comptime-only struct 'tmp.S.foo__anon_492.C' must be comptime-known
// :4:16: note: struct requires comptime because of this field
// :4:16: note: types are not available at runtime

View File

@@ -16,5 +16,5 @@ pub export fn entry2() void {
//
// :3:6: error: no field or member function named 'copy' in '[]const u8'
// :9:8: error: no field or member function named 'bar' in '@TypeOf(.{})'
// :12:18: error: no field or member function named 'bar' in 'tmp.entry2__struct_479'
// :12:18: error: no field or member function named 'bar' in 'tmp.entry2__struct_496'
// :12:6: note: struct declared here

View File

@@ -6,6 +6,6 @@ export fn foo() void {
// error
//
// :4:16: error: expected type 'tmp.T', found 'tmp.foo__struct_468'
// :4:16: error: expected type 'tmp.T', found 'tmp.foo__struct_485'
// :3:16: note: struct declared here
// :1:11: note: struct declared here

View File

@@ -44,9 +44,9 @@ comptime {
//
// :5:23: error: expected error union type, found 'comptime_int'
// :10:23: error: expected error union type, found '@TypeOf(.{})'
// :15:23: error: expected error union type, found 'tmp.test2__struct_505'
// :15:23: error: expected error union type, found 'tmp.test2__struct_522'
// :15:23: note: struct declared here
// :20:27: error: expected error union type, found 'tmp.test3__struct_507'
// :20:27: error: expected error union type, found 'tmp.test3__struct_524'
// :20:27: note: struct declared here
// :25:23: error: expected error union type, found 'struct { comptime *const [5:0]u8 = "hello" }'
// :31:13: error: expected error union type, found 'u32'

View File

@@ -8,5 +8,5 @@ pub fn main() void {
}
// run
// backend=llvm
//
// backend=stage2,llvm
// target=x86_64-linux