Merge pull request #18906 from jacobly0/x86_64-tests - zig - fork of https://codeberg.org/ziglang/zig

commit 91fb211faa3f37d08da55b0c8df92a6475624316 (tree)
parent d656c2a7abe90d00ef6dbc3731b82bd26180038a
Author: Andrew Kelley <andrew@ziglang.org>
Date:   Sun, 25 Feb 2024 21:43:20 -0800

Merge pull request #18906 from jacobly0/x86_64-tests

x86_64: pass more tests
Diffstat:
M lib/std/crypto/aes.zig  | 2 +-
M lib/std/crypto/blake3.zig  | 2 +-
M lib/std/crypto/salsa20.zig  | 5 ++++-
M lib/std/crypto/sha2.zig  | 2 +-
M lib/std/meta.zig  | 3 ++-
M lib/std/unicode.zig  | 299 +++++++++++++++++++++++++++++++++++++++----------------------------------------
M lib/std/zig/c_translation.zig  | 8 +++-----
M src/InternPool.zig  | 9 +++++++--
M src/Sema.zig  | 33 +++++++++++++++++++++++++++++++--
M src/arch/x86_64/CodeGen.zig  | 1986 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M src/arch/x86_64/Encoding.zig  | 15 +++++++++++----
M src/arch/x86_64/Lower.zig  | 14 +++++++++++++-
M src/arch/x86_64/Mir.zig  | 30 +++++++++++++++++++++++++++---
M src/arch/x86_64/encodings.zig  | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M src/codegen.zig  | 57 +++++++++++++++++++++++++++------------------------------
M src/codegen/c.zig  | 98 ++++++++++++++++++++++++++++++++++++-------------------------------------------
M src/codegen/llvm.zig  | 10 ++++++++--
M src/type.zig  | 49 ++++++++++++++++++++++++++++++++++++++++---------
M test/behavior/bitcast.zig  | 2 +-
M test/behavior/cast.zig  | 54 +++++++++++++++++++++++++++++++++---------------------
M test/behavior/optional.zig  | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
M test/behavior/select.zig  | 4 ++--
M test/behavior/shuffle.zig  | 3 ++-
M test/behavior/vector.zig  | 42 +++++++++++++++++++++++++++++-------------

24 files changed, 2239 insertions(+), 666 deletions(-)
diff --git a/lib/std/crypto/aes.zig b/lib/std/crypto/aes.zig
@@ -6,7 +6,7 @@ const has_aesni = std.Target.x86.featureSetHas(builtin.cpu.features, .aes);
 const has_avx = std.Target.x86.featureSetHas(builtin.cpu.features, .avx);
 const has_armaes = std.Target.aarch64.featureSetHas(builtin.cpu.features, .aes);
 // C backend doesn't currently support passing vectors to inline asm.
-const impl = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_c and builtin.zig_backend != .stage2_x86_64 and has_aesni and has_avx) impl: {
+const impl = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_c and has_aesni and has_avx) impl: {
     break :impl @import("aes/aesni.zig");
 } else if (builtin.cpu.arch == .aarch64 and builtin.zig_backend != .stage2_c and has_armaes)
 impl: {
diff --git a/lib/std/crypto/blake3.zig b/lib/std/crypto/blake3.zig
@@ -200,7 +200,7 @@ const CompressGeneric = struct {
     }
 };
 
-const compress = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_x86_64)
+const compress = if (builtin.cpu.arch == .x86_64)
     CompressVectorized.compress
 else
     CompressGeneric.compress;
diff --git a/lib/std/crypto/salsa20.zig b/lib/std/crypto/salsa20.zig
@@ -302,7 +302,10 @@ fn SalsaNonVecImpl(comptime rounds: comptime_int) type {
     };
 }
 
-const SalsaImpl = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_x86_64) SalsaVecImpl else SalsaNonVecImpl;
+const SalsaImpl = if (builtin.cpu.arch == .x86_64)
+    SalsaVecImpl
+else
+    SalsaNonVecImpl;
 
 fn keyToWords(key: [32]u8) [8]u32 {
     var k: [8]u32 = undefined;
diff --git a/lib/std/crypto/sha2.zig b/lib/std/crypto/sha2.zig
@@ -238,7 +238,7 @@ fn Sha2x32(comptime params: Sha2Params32) type {
                         return;
                     },
                     // C backend doesn't currently support passing vectors to inline asm.
-                    .x86_64 => if (builtin.zig_backend != .stage2_c and builtin.zig_backend != .stage2_x86_64 and comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .sha, .avx2 })) {
+                    .x86_64 => if (builtin.zig_backend != .stage2_c and comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .sha, .avx2 })) {
                         var x: v4u32 = [_]u32{ d.s[5], d.s[4], d.s[1], d.s[0] };
                         var y: v4u32 = [_]u32{ d.s[7], d.s[6], d.s[3], d.s[2] };
                         const s_v = @as(*[16]v4u32, @ptrCast(&s));
diff --git a/lib/std/meta.zig b/lib/std/meta.zig
@@ -1286,5 +1286,6 @@ test "hasUniqueRepresentation" {
     try testing.expect(!hasUniqueRepresentation([]u8));
     try testing.expect(!hasUniqueRepresentation([]const u8));
 
-    try testing.expect(hasUniqueRepresentation(@Vector(4, u16)));
+    try testing.expect(hasUniqueRepresentation(@Vector(std.simd.suggestVectorLength(u8) orelse 1, u8)));
+    try testing.expect(@sizeOf(@Vector(3, u8)) == 3 or !hasUniqueRepresentation(@Vector(3, u8)));
 }
diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
@@ -239,18 +239,19 @@ pub fn utf8ValidateSlice(input: []const u8) bool {
 fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool {
     var remaining = input;
 
-    const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
-    const Chunk = @Vector(chunk_len, u8);
-
-    // Fast path. Check for and skip ASCII characters at the start of the input.
-    while (remaining.len >= chunk_len) {
-        const chunk: Chunk = remaining[0..chunk_len].*;
-        const mask: Chunk = @splat(0x80);
-        if (@reduce(.Or, chunk & mask == mask)) {
-            // found a non ASCII byte
-            break;
+    if (std.simd.suggestVectorLength(u8)) |chunk_len| {
+        const Chunk = @Vector(chunk_len, u8);
+
+        // Fast path. Check for and skip ASCII characters at the start of the input.
+        while (remaining.len >= chunk_len) {
+            const chunk: Chunk = remaining[0..chunk_len].*;
+            const mask: Chunk = @splat(0x80);
+            if (@reduce(.Or, chunk & mask == mask)) {
+                // found a non ASCII byte
+                break;
+            }
+            remaining = remaining[chunk_len..];
         }
-        remaining = remaining[chunk_len..];
     }
 
     // default lowest and highest continuation byte
@@ -601,9 +602,9 @@ fn testUtf8IteratorOnAscii() !void {
     const s = Utf8View.initComptime("abc");
 
     var it1 = s.iterator();
-    try testing.expect(std.mem.eql(u8, "a", it1.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "b", it1.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "c", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "a", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "b", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "c", it1.nextCodepointSlice().?));
     try testing.expect(it1.nextCodepointSlice() == null);
 
     var it2 = s.iterator();
@@ -631,9 +632,9 @@ fn testUtf8ViewOk() !void {
     const s = Utf8View.initComptime("東京市");
 
     var it1 = s.iterator();
-    try testing.expect(std.mem.eql(u8, "東", it1.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "京", it1.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "市", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "東", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "京", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "市", it1.nextCodepointSlice().?));
     try testing.expect(it1.nextCodepointSlice() == null);
 
     var it2 = s.iterator();
@@ -771,20 +772,20 @@ fn testUtf8Peeking() !void {
     const s = Utf8View.initComptime("noël");
     var it = s.iterator();
 
-    try testing.expect(std.mem.eql(u8, "n", it.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "n", it.nextCodepointSlice().?));
 
-    try testing.expect(std.mem.eql(u8, "o", it.peek(1)));
-    try testing.expect(std.mem.eql(u8, "oë", it.peek(2)));
-    try testing.expect(std.mem.eql(u8, "oël", it.peek(3)));
-    try testing.expect(std.mem.eql(u8, "oël", it.peek(4)));
-    try testing.expect(std.mem.eql(u8, "oël", it.peek(10)));
+    try testing.expect(mem.eql(u8, "o", it.peek(1)));
+    try testing.expect(mem.eql(u8, "oë", it.peek(2)));
+    try testing.expect(mem.eql(u8, "oël", it.peek(3)));
+    try testing.expect(mem.eql(u8, "oël", it.peek(4)));
+    try testing.expect(mem.eql(u8, "oël", it.peek(10)));
 
-    try testing.expect(std.mem.eql(u8, "o", it.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "ë", it.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "l", it.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "o", it.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "ë", it.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "l", it.nextCodepointSlice().?));
     try testing.expect(it.nextCodepointSlice() == null);
 
-    try testing.expect(std.mem.eql(u8, &[_]u8{}, it.peek(1)));
+    try testing.expect(mem.eql(u8, &[_]u8{}, it.peek(1)));
 }
 
 fn testError(bytes: []const u8, expected_err: anyerror) !void {
@@ -926,59 +927,50 @@ test "fmtUtf8" {
 }
 
 fn utf16LeToUtf8ArrayListImpl(
-    array_list: *std.ArrayList(u8),
+    result: *std.ArrayList(u8),
     utf16le: []const u16,
     comptime surrogates: Surrogates,
 ) (switch (surrogates) {
     .cannot_encode_surrogate_half => Utf16LeToUtf8AllocError,
     .can_encode_surrogate_half => mem.Allocator.Error,
 })!void {
-    // optimistically guess that it will all be ascii.
-    try array_list.ensureTotalCapacityPrecise(utf16le.len);
+    assert(result.capacity >= utf16le.len);
 
     var remaining = utf16le;
-    if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
+    vectorized: {
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
         const Chunk = @Vector(chunk_len, u16);
 
         // Fast path. Check for and encode ASCII characters at the start of the input.
         while (remaining.len >= chunk_len) {
             const chunk: Chunk = remaining[0..chunk_len].*;
-            const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F));
+            const mask: Chunk = @splat(mem.nativeToLittle(u16, 0x7F));
             if (@reduce(.Or, chunk | mask != mask)) {
                 // found a non ASCII code unit
                 break;
             }
-            const chunk_byte_len = chunk_len * 2;
-            const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*;
-            const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes);
-            const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0];
+            const ascii_chunk: @Vector(chunk_len, u8) = @truncate(mem.nativeToLittle(Chunk, chunk));
             // We allocated enough space to encode every UTF-16 code unit
             // as ASCII, so if the entire string is ASCII then we are
             // guaranteed to have enough space allocated
-            array_list.appendSliceAssumeCapacity(&ascii_bytes);
+            result.addManyAsArrayAssumeCapacity(chunk_len).* = ascii_chunk;
             remaining = remaining[chunk_len..];
         }
     }
 
-    var out_index: usize = array_list.items.len;
     switch (surrogates) {
         .cannot_encode_surrogate_half => {
             var it = Utf16LeIterator.init(remaining);
             while (try it.nextCodepoint()) |codepoint| {
                 const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
-                try array_list.resize(array_list.items.len + utf8_len);
-                assert((utf8Encode(codepoint, array_list.items[out_index..]) catch unreachable) == utf8_len);
-                out_index += utf8_len;
+                assert((utf8Encode(codepoint, try result.addManyAsSlice(utf8_len)) catch unreachable) == utf8_len);
             }
         },
         .can_encode_surrogate_half => {
             var it = Wtf16LeIterator.init(remaining);
             while (it.nextCodepoint()) |codepoint| {
                 const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
-                try array_list.resize(array_list.items.len + utf8_len);
-                assert((wtf8Encode(codepoint, array_list.items[out_index..]) catch unreachable) == utf8_len);
-                out_index += utf8_len;
+                assert((wtf8Encode(codepoint, try result.addManyAsSlice(utf8_len)) catch unreachable) == utf8_len);
             }
         },
     }
@@ -986,8 +978,9 @@ fn utf16LeToUtf8ArrayListImpl(
 
 pub const Utf16LeToUtf8AllocError = mem.Allocator.Error || Utf16LeToUtf8Error;
 
-pub fn utf16LeToUtf8ArrayList(array_list: *std.ArrayList(u8), utf16le: []const u16) Utf16LeToUtf8AllocError!void {
-    return utf16LeToUtf8ArrayListImpl(array_list, utf16le, .cannot_encode_surrogate_half);
+pub fn utf16LeToUtf8ArrayList(result: *std.ArrayList(u8), utf16le: []const u16) Utf16LeToUtf8AllocError!void {
+    try result.ensureTotalCapacityPrecise(utf16le.len);
+    return utf16LeToUtf8ArrayListImpl(result, utf16le, .cannot_encode_surrogate_half);
 }
 
 /// Deprecated; renamed to utf16LeToUtf8Alloc
@@ -999,8 +992,7 @@ pub fn utf16LeToUtf8Alloc(allocator: mem.Allocator, utf16le: []const u16) Utf16L
     var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len);
     errdefer result.deinit();
 
-    try utf16LeToUtf8ArrayList(&result, utf16le);
-
+    try utf16LeToUtf8ArrayListImpl(&result, utf16le, .cannot_encode_surrogate_half);
     return result.toOwnedSlice();
 }
 
@@ -1013,8 +1005,7 @@ pub fn utf16LeToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) Utf16
     var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len + 1);
     errdefer result.deinit();
 
-    try utf16LeToUtf8ArrayList(&result, utf16le);
-
+    try utf16LeToUtf8ArrayListImpl(&result, utf16le, .cannot_encode_surrogate_half);
     return result.toOwnedSliceSentinel(0);
 }
 
@@ -1026,27 +1017,24 @@ fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surr
     .cannot_encode_surrogate_half => Utf16LeToUtf8Error,
     .can_encode_surrogate_half => error{},
 })!usize {
-    var end_index: usize = 0;
+    var dest_index: usize = 0;
 
     var remaining = utf16le;
-    if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
+    vectorized: {
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
         const Chunk = @Vector(chunk_len, u16);
 
         // Fast path. Check for and encode ASCII characters at the start of the input.
         while (remaining.len >= chunk_len) {
             const chunk: Chunk = remaining[0..chunk_len].*;
-            const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F));
+            const mask: Chunk = @splat(mem.nativeToLittle(u16, 0x7F));
             if (@reduce(.Or, chunk | mask != mask)) {
                 // found a non ASCII code unit
                 break;
             }
-            const chunk_byte_len = chunk_len * 2;
-            const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*;
-            const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes);
-            const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0];
-            @memcpy(utf8[end_index .. end_index + chunk_len], &ascii_bytes);
-            end_index += chunk_len;
+            const ascii_chunk: @Vector(chunk_len, u8) = @truncate(mem.nativeToLittle(Chunk, chunk));
+            utf8[dest_index..][0..chunk_len].* = ascii_chunk;
+            dest_index += chunk_len;
             remaining = remaining[chunk_len..];
         }
     }
@@ -1055,7 +1043,7 @@ fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surr
         .cannot_encode_surrogate_half => {
             var it = Utf16LeIterator.init(remaining);
             while (try it.nextCodepoint()) |codepoint| {
-                end_index += utf8Encode(codepoint, utf8[end_index..]) catch |err| switch (err) {
+                dest_index += utf8Encode(codepoint, utf8[dest_index..]) catch |err| switch (err) {
                     // The maximum possible codepoint encoded by UTF-16 is U+10FFFF,
                     // which is within the valid codepoint range.
                     error.CodepointTooLarge => unreachable,
@@ -1068,7 +1056,7 @@ fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surr
         .can_encode_surrogate_half => {
             var it = Wtf16LeIterator.init(remaining);
             while (it.nextCodepoint()) |codepoint| {
-                end_index += wtf8Encode(codepoint, utf8[end_index..]) catch |err| switch (err) {
+                dest_index += wtf8Encode(codepoint, utf8[dest_index..]) catch |err| switch (err) {
                     // The maximum possible codepoint encoded by UTF-16 is U+10FFFF,
                     // which is within the valid codepoint range.
                     error.CodepointTooLarge => unreachable,
@@ -1076,7 +1064,7 @@ fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surr
             }
         },
     }
-    return end_index;
+    return dest_index;
 }
 
 /// Deprecated; renamed to utf16LeToUtf8
@@ -1149,14 +1137,12 @@ test utf16LeToUtf8 {
     }
 }
 
-fn utf8ToUtf16LeArrayListImpl(array_list: *std.ArrayList(u16), utf8: []const u8, comptime surrogates: Surrogates) !void {
-    // optimistically guess that it will not require surrogate pairs
-    try array_list.ensureTotalCapacityPrecise(utf8.len);
+fn utf8ToUtf16LeArrayListImpl(result: *std.ArrayList(u16), utf8: []const u8, comptime surrogates: Surrogates) !void {
+    assert(result.capacity >= utf8.len);
 
     var remaining = utf8;
-    // Need support for std.simd.interlace
-    if (builtin.zig_backend != .stage2_x86_64 and comptime !builtin.cpu.arch.isMIPS()) {
-        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
+    vectorized: {
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
         const Chunk = @Vector(chunk_len, u8);
 
         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -1167,9 +1153,8 @@ fn utf8ToUtf16LeArrayListImpl(array_list: *std.ArrayList(u16), utf8: []const u8,
                 // found a non ASCII code unit
                 break;
             }
-            const zeroes: Chunk = @splat(0);
-            const utf16_chunk: [chunk_len * 2]u8 align(@alignOf(u16)) = std.simd.interlace(.{ chunk, zeroes });
-            array_list.appendSliceAssumeCapacity(std.mem.bytesAsSlice(u16, &utf16_chunk));
+            const utf16_chunk = mem.nativeToLittle(@Vector(chunk_len, u16), chunk);
+            result.addManyAsArrayAssumeCapacity(chunk_len).* = utf16_chunk;
             remaining = remaining[chunk_len..];
         }
     }
@@ -1181,21 +1166,18 @@ fn utf8ToUtf16LeArrayListImpl(array_list: *std.ArrayList(u16), utf8: []const u8,
     var it = view.iterator();
     while (it.nextCodepoint()) |codepoint| {
         if (codepoint < 0x10000) {
-            const short = @as(u16, @intCast(codepoint));
-            try array_list.append(mem.nativeToLittle(u16, short));
+            try result.append(mem.nativeToLittle(u16, @intCast(codepoint)));
         } else {
             const high = @as(u16, @intCast((codepoint - 0x10000) >> 10)) + 0xD800;
             const low = @as(u16, @intCast(codepoint & 0x3FF)) + 0xDC00;
-            var out: [2]u16 = undefined;
-            out[0] = mem.nativeToLittle(u16, high);
-            out[1] = mem.nativeToLittle(u16, low);
-            try array_list.appendSlice(out[0..]);
+            try result.appendSlice(&.{ mem.nativeToLittle(u16, high), mem.nativeToLittle(u16, low) });
         }
     }
 }
 
-pub fn utf8ToUtf16LeArrayList(array_list: *std.ArrayList(u16), utf8: []const u8) error{ InvalidUtf8, OutOfMemory }!void {
-    return utf8ToUtf16LeArrayListImpl(array_list, utf8, .cannot_encode_surrogate_half);
+pub fn utf8ToUtf16LeArrayList(result: *std.ArrayList(u16), utf8: []const u8) error{ InvalidUtf8, OutOfMemory }!void {
+    try result.ensureTotalCapacityPrecise(utf8.len);
+    return utf8ToUtf16LeArrayListImpl(result, utf8, .cannot_encode_surrogate_half);
 }
 
 pub fn utf8ToUtf16LeAlloc(allocator: mem.Allocator, utf8: []const u8) error{ InvalidUtf8, OutOfMemory }![]u16 {
@@ -1204,7 +1186,6 @@ pub fn utf8ToUtf16LeAlloc(allocator: mem.Allocator, utf8: []const u8) error{ Inv
     errdefer result.deinit();
 
     try utf8ToUtf16LeArrayListImpl(&result, utf8, .cannot_encode_surrogate_half);
-
     return result.toOwnedSlice();
 }
 
@@ -1217,7 +1198,6 @@ pub fn utf8ToUtf16LeAllocZ(allocator: mem.Allocator, utf8: []const u8) error{ In
     errdefer result.deinit();
 
     try utf8ToUtf16LeArrayListImpl(&result, utf8, .cannot_encode_surrogate_half);
-
     return result.toOwnedSliceSentinel(0);
 }
 
@@ -1228,12 +1208,11 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) error{InvalidUtf8}!usize 
 }
 
 pub fn utf8ToUtf16LeImpl(utf16le: []u16, utf8: []const u8, comptime surrogates: Surrogates) !usize {
-    var dest_i: usize = 0;
+    var dest_index: usize = 0;
 
     var remaining = utf8;
-    // Need support for std.simd.interlace
-    if (builtin.zig_backend != .stage2_x86_64 and comptime !builtin.cpu.arch.isMIPS()) {
-        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
+    vectorized: {
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
         const Chunk = @Vector(chunk_len, u8);
 
         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -1244,57 +1223,60 @@ pub fn utf8ToUtf16LeImpl(utf16le: []u16, utf8: []const u8, comptime surrogates: 
                 // found a non ASCII code unit
                 break;
             }
-            const zeroes: Chunk = @splat(0);
-            const utf16_bytes: [chunk_len * 2]u8 align(@alignOf(u16)) = std.simd.interlace(.{ chunk, zeroes });
-            @memcpy(utf16le[dest_i..][0..chunk_len], std.mem.bytesAsSlice(u16, &utf16_bytes));
-            dest_i += chunk_len;
+            const utf16_chunk = mem.nativeToLittle(@Vector(chunk_len, u16), chunk);
+            utf16le[dest_index..][0..chunk_len].* = utf16_chunk;
+            dest_index += chunk_len;
             remaining = remaining[chunk_len..];
         }
     }
 
-    var src_i: usize = 0;
-    while (src_i < remaining.len) {
-        const n = utf8ByteSequenceLength(remaining[src_i]) catch return switch (surrogates) {
-            .cannot_encode_surrogate_half => error.InvalidUtf8,
-            .can_encode_surrogate_half => error.InvalidWtf8,
-        };
-        const next_src_i = src_i + n;
-        const codepoint = switch (surrogates) {
-            .cannot_encode_surrogate_half => utf8Decode(remaining[src_i..next_src_i]) catch return error.InvalidUtf8,
-            .can_encode_surrogate_half => wtf8Decode(remaining[src_i..next_src_i]) catch return error.InvalidWtf8,
-        };
+    const view = switch (surrogates) {
+        .cannot_encode_surrogate_half => try Utf8View.init(remaining),
+        .can_encode_surrogate_half => try Wtf8View.init(remaining),
+    };
+    var it = view.iterator();
+    while (it.nextCodepoint()) |codepoint| {
         if (codepoint < 0x10000) {
-            const short = @as(u16, @intCast(codepoint));
-            utf16le[dest_i] = mem.nativeToLittle(u16, short);
-            dest_i += 1;
+            utf16le[dest_index] = mem.nativeToLittle(u16, @intCast(codepoint));
+            dest_index += 1;
         } else {
             const high = @as(u16, @intCast((codepoint - 0x10000) >> 10)) + 0xD800;
             const low = @as(u16, @intCast(codepoint & 0x3FF)) + 0xDC00;
-            utf16le[dest_i] = mem.nativeToLittle(u16, high);
-            utf16le[dest_i + 1] = mem.nativeToLittle(u16, low);
-            dest_i += 2;
+            utf16le[dest_index..][0..2].* = .{ mem.nativeToLittle(u16, high), mem.nativeToLittle(u16, low) };
+            dest_index += 2;
         }
-        src_i = next_src_i;
     }
-    return dest_i;
+    return dest_index;
 }
 
 test "utf8ToUtf16Le" {
-    var utf16le: [2]u16 = [_]u16{0} ** 2;
+    var utf16le: [128]u16 = undefined;
     {
         const length = try utf8ToUtf16Le(utf16le[0..], "𐐷");
-        try testing.expectEqual(@as(usize, 2), length);
-        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16le[0..]));
+        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16le[0..length]));
     }
     {
         const length = try utf8ToUtf16Le(utf16le[0..], "\u{10FFFF}");
-        try testing.expectEqual(@as(usize, 2), length);
-        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16le[0..]));
+        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16le[0..length]));
     }
     {
         const result = utf8ToUtf16Le(utf16le[0..], "\xf4\x90\x80\x80");
         try testing.expectError(error.InvalidUtf8, result);
     }
+    {
+        const length = try utf8ToUtf16Le(utf16le[0..], "This string has been designed to test the vectorized implementat" ++
+            "ion by beginning with one hundred twenty-seven ASCII characters¡");
+        try testing.expectEqualSlices(u8, &.{
+            'T', 0, 'h', 0, 'i', 0, 's', 0, ' ', 0, 's', 0, 't', 0, 'r', 0, 'i', 0, 'n', 0, 'g', 0, ' ', 0, 'h', 0, 'a', 0, 's', 0, ' ',  0,
+            'b', 0, 'e', 0, 'e', 0, 'n', 0, ' ', 0, 'd', 0, 'e', 0, 's', 0, 'i', 0, 'g', 0, 'n', 0, 'e', 0, 'd', 0, ' ', 0, 't', 0, 'o',  0,
+            ' ', 0, 't', 0, 'e', 0, 's', 0, 't', 0, ' ', 0, 't', 0, 'h', 0, 'e', 0, ' ', 0, 'v', 0, 'e', 0, 'c', 0, 't', 0, 'o', 0, 'r',  0,
+            'i', 0, 'z', 0, 'e', 0, 'd', 0, ' ', 0, 'i', 0, 'm', 0, 'p', 0, 'l', 0, 'e', 0, 'm', 0, 'e', 0, 'n', 0, 't', 0, 'a', 0, 't',  0,
+            'i', 0, 'o', 0, 'n', 0, ' ', 0, 'b', 0, 'y', 0, ' ', 0, 'b', 0, 'e', 0, 'g', 0, 'i', 0, 'n', 0, 'n', 0, 'i', 0, 'n', 0, 'g',  0,
+            ' ', 0, 'w', 0, 'i', 0, 't', 0, 'h', 0, ' ', 0, 'o', 0, 'n', 0, 'e', 0, ' ', 0, 'h', 0, 'u', 0, 'n', 0, 'd', 0, 'r', 0, 'e',  0,
+            'd', 0, ' ', 0, 't', 0, 'w', 0, 'e', 0, 'n', 0, 't', 0, 'y', 0, '-', 0, 's', 0, 'e', 0, 'v', 0, 'e', 0, 'n', 0, ' ', 0, 'A',  0,
+            'S', 0, 'C', 0, 'I', 0, 'I', 0, ' ', 0, 'c', 0, 'h', 0, 'a', 0, 'r', 0, 'a', 0, 'c', 0, 't', 0, 'e', 0, 'r', 0, 's', 0, '¡', 0,
+        }, mem.sliceAsBytes(utf16le[0..length]));
+    }
 }
 
 test utf8ToUtf16LeArrayList {
@@ -1339,25 +1321,40 @@ test utf8ToUtf16LeAllocZ {
     {
         const utf16 = try utf8ToUtf16LeAllocZ(testing.allocator, "𐐷");
         defer testing.allocator.free(utf16);
-        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16[0..]));
+        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16));
         try testing.expect(utf16[2] == 0);
     }
     {
         const utf16 = try utf8ToUtf16LeAllocZ(testing.allocator, "\u{10FFFF}");
         defer testing.allocator.free(utf16);
-        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16[0..]));
+        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16));
         try testing.expect(utf16[2] == 0);
     }
     {
         const result = utf8ToUtf16LeAllocZ(testing.allocator, "\xf4\x90\x80\x80");
         try testing.expectError(error.InvalidUtf8, result);
     }
+    {
+        const utf16 = try utf8ToUtf16LeWithNull(testing.allocator, "This string has been designed to test the vectorized implementat" ++
+            "ion by beginning with one hundred twenty-seven ASCII characters¡");
+        defer testing.allocator.free(utf16);
+        try testing.expectEqualSlices(u8, &.{
+            'T', 0, 'h', 0, 'i', 0, 's', 0, ' ', 0, 's', 0, 't', 0, 'r', 0, 'i', 0, 'n', 0, 'g', 0, ' ', 0, 'h', 0, 'a', 0, 's', 0, ' ',  0,
+            'b', 0, 'e', 0, 'e', 0, 'n', 0, ' ', 0, 'd', 0, 'e', 0, 's', 0, 'i', 0, 'g', 0, 'n', 0, 'e', 0, 'd', 0, ' ', 0, 't', 0, 'o',  0,
+            ' ', 0, 't', 0, 'e', 0, 's', 0, 't', 0, ' ', 0, 't', 0, 'h', 0, 'e', 0, ' ', 0, 'v', 0, 'e', 0, 'c', 0, 't', 0, 'o', 0, 'r',  0,
+            'i', 0, 'z', 0, 'e', 0, 'd', 0, ' ', 0, 'i', 0, 'm', 0, 'p', 0, 'l', 0, 'e', 0, 'm', 0, 'e', 0, 'n', 0, 't', 0, 'a', 0, 't',  0,
+            'i', 0, 'o', 0, 'n', 0, ' ', 0, 'b', 0, 'y', 0, ' ', 0, 'b', 0, 'e', 0, 'g', 0, 'i', 0, 'n', 0, 'n', 0, 'i', 0, 'n', 0, 'g',  0,
+            ' ', 0, 'w', 0, 'i', 0, 't', 0, 'h', 0, ' ', 0, 'o', 0, 'n', 0, 'e', 0, ' ', 0, 'h', 0, 'u', 0, 'n', 0, 'd', 0, 'r', 0, 'e',  0,
+            'd', 0, ' ', 0, 't', 0, 'w', 0, 'e', 0, 'n', 0, 't', 0, 'y', 0, '-', 0, 's', 0, 'e', 0, 'v', 0, 'e', 0, 'n', 0, ' ', 0, 'A',  0,
+            'S', 0, 'C', 0, 'I', 0, 'I', 0, ' ', 0, 'c', 0, 'h', 0, 'a', 0, 'r', 0, 'a', 0, 'c', 0, 't', 0, 'e', 0, 'r', 0, 's', 0, '¡', 0,
+        }, mem.sliceAsBytes(utf16));
+    }
 }
 
 /// Converts a UTF-8 string literal into a UTF-16LE string literal.
-pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch unreachable:0]u16 {
+pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch |err| @compileError(err):0]u16 {
     return comptime blk: {
-        const len: usize = calcUtf16LeLen(utf8) catch |err| @compileError(err);
+        const len: usize = calcUtf16LeLen(utf8) catch unreachable;
         var utf16le: [len:0]u16 = [_:0]u16{0} ** len;
         const utf16le_len = utf8ToUtf16Le(&utf16le, utf8[0..]) catch |err| @compileError(err);
         assert(len == utf16le_len);
@@ -1438,12 +1435,12 @@ test "fmtUtf16Le" {
     try expectFmt("", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral(""))});
     try expectFmt("foo", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("foo"))});
     try expectFmt("𐐷", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("𐐷"))});
-    try expectFmt("퟿", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\xff\xd7", native_endian)})});
-    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\x00\xd8", native_endian)})});
-    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\xff\xdb", native_endian)})});
-    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\x00\xdc", native_endian)})});
-    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\xff\xdf", native_endian)})});
-    try expectFmt("", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\x00\xe0", native_endian)})});
+    try expectFmt("퟿", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xd7", native_endian)})});
+    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xd8", native_endian)})});
+    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xdb", native_endian)})});
+    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xdc", native_endian)})});
+    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xdf", native_endian)})});
+    try expectFmt("", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xe0", native_endian)})});
 }
 
 test "utf8ToUtf16LeStringLiteral" {
@@ -1686,8 +1683,9 @@ pub const Wtf8Iterator = struct {
     }
 };
 
-pub fn wtf16LeToWtf8ArrayList(array_list: *std.ArrayList(u8), utf16le: []const u16) mem.Allocator.Error!void {
-    return utf16LeToUtf8ArrayListImpl(array_list, utf16le, .can_encode_surrogate_half);
+pub fn wtf16LeToWtf8ArrayList(result: *std.ArrayList(u8), utf16le: []const u16) mem.Allocator.Error!void {
+    try result.ensureTotalCapacityPrecise(utf16le.len);
+    return utf16LeToUtf8ArrayListImpl(result, utf16le, .can_encode_surrogate_half);
 }
 
 /// Caller must free returned memory.
@@ -1696,8 +1694,7 @@ pub fn wtf16LeToWtf8Alloc(allocator: mem.Allocator, wtf16le: []const u16) mem.Al
     var result = try std.ArrayList(u8).initCapacity(allocator, wtf16le.len);
     errdefer result.deinit();
 
-    try wtf16LeToWtf8ArrayList(&result, wtf16le);
-
+    try utf16LeToUtf8ArrayListImpl(&result, wtf16le, .can_encode_surrogate_half);
     return result.toOwnedSlice();
 }
 
@@ -1707,8 +1704,7 @@ pub fn wtf16LeToWtf8AllocZ(allocator: mem.Allocator, wtf16le: []const u16) mem.A
     var result = try std.ArrayList(u8).initCapacity(allocator, wtf16le.len + 1);
     errdefer result.deinit();
 
-    try wtf16LeToWtf8ArrayList(&result, wtf16le);
-
+    try utf16LeToUtf8ArrayListImpl(&result, wtf16le, .can_encode_surrogate_half);
     return result.toOwnedSliceSentinel(0);
 }
 
@@ -1716,8 +1712,9 @@ pub fn wtf16LeToWtf8(wtf8: []u8, wtf16le: []const u16) usize {
     return utf16LeToUtf8Impl(wtf8, wtf16le, .can_encode_surrogate_half) catch |err| switch (err) {};
 }
 
-pub fn wtf8ToWtf16LeArrayList(array_list: *std.ArrayList(u16), wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }!void {
-    return utf8ToUtf16LeArrayListImpl(array_list, wtf8, .can_encode_surrogate_half);
+pub fn wtf8ToWtf16LeArrayList(result: *std.ArrayList(u16), wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }!void {
+    try result.ensureUnusedCapacity(wtf8.len); // reserve room beyond items already in `result`; a total-capacity request may reserve nothing for a non-empty list
+    return utf8ToUtf16LeArrayListImpl(result, wtf8, .can_encode_surrogate_half);
 }
 
 pub fn wtf8ToWtf16LeAlloc(allocator: mem.Allocator, wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }![]u16 {
@@ -1726,7 +1723,6 @@ pub fn wtf8ToWtf16LeAlloc(allocator: mem.Allocator, wtf8: []const u8) error{ Inv
     errdefer result.deinit();
 
     try utf8ToUtf16LeArrayListImpl(&result, wtf8, .can_encode_surrogate_half);
-
     return result.toOwnedSlice();
 }
 
@@ -1736,7 +1732,6 @@ pub fn wtf8ToWtf16LeAllocZ(allocator: mem.Allocator, wtf8: []const u8) error{ In
     errdefer result.deinit();
 
     try utf8ToUtf16LeArrayListImpl(&result, wtf8, .can_encode_surrogate_half);
-
     return result.toOwnedSliceSentinel(0);
 }
 
@@ -1895,7 +1890,7 @@ pub const Wtf16LeIterator = struct {
 
     pub fn init(s: []const u16) Wtf16LeIterator {
         return Wtf16LeIterator{
-            .bytes = std.mem.sliceAsBytes(s),
+            .bytes = mem.sliceAsBytes(s), // the iterator keeps a byte view of `s` and starts at offset 0
             .i = 0,
         };
     }
@@ -1908,12 +1903,12 @@ pub const Wtf16LeIterator = struct {
         assert(it.i <= it.bytes.len);
         if (it.i == it.bytes.len) return null;
         var code_units: [2]u16 = undefined;
-        code_units[0] = std.mem.readInt(u16, it.bytes[it.i..][0..2], .little);
+        code_units[0] = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
         it.i += 2;
         surrogate_pair: {
             if (utf16IsHighSurrogate(code_units[0])) {
                 if (it.i >= it.bytes.len) break :surrogate_pair;
-                code_units[1] = std.mem.readInt(u16, it.bytes[it.i..][0..2], .little);
+                code_units[1] = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
                 const codepoint = utf16DecodeSurrogatePair(&code_units) catch break :surrogate_pair;
                 it.i += 2;
                 return codepoint;
@@ -2030,31 +2025,31 @@ fn testRoundtripWtf16(wtf16le: []const u16) !void {
 
 test "well-formed WTF-16 roundtrips" {
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD83D), // high surrogate
-        std.mem.nativeToLittle(u16, 0xDCA9), // low surrogate
+        mem.nativeToLittle(u16, 0xD83D), // high surrogate
+        mem.nativeToLittle(u16, 0xDCA9), // low surrogate
     });
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD83D), // high surrogate
-        std.mem.nativeToLittle(u16, ' '), // not surrogate
-        std.mem.nativeToLittle(u16, 0xDCA9), // low surrogate
+        mem.nativeToLittle(u16, 0xD83D), // high surrogate
+        mem.nativeToLittle(u16, ' '), // not surrogate
+        mem.nativeToLittle(u16, 0xDCA9), // low surrogate
     });
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD800), // high surrogate
-        std.mem.nativeToLittle(u16, 0xDBFF), // high surrogate
+        mem.nativeToLittle(u16, 0xD800), // high surrogate
+        mem.nativeToLittle(u16, 0xDBFF), // high surrogate
     });
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD800), // high surrogate
-        std.mem.nativeToLittle(u16, 0xE000), // not surrogate
+        mem.nativeToLittle(u16, 0xD800), // high surrogate
+        mem.nativeToLittle(u16, 0xE000), // not surrogate
     });
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD7FF), // not surrogate
-        std.mem.nativeToLittle(u16, 0xDC00), // low surrogate
+        mem.nativeToLittle(u16, 0xD7FF), // not surrogate
+        mem.nativeToLittle(u16, 0xDC00), // low surrogate
     });
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0x61), // not surrogate
-        std.mem.nativeToLittle(u16, 0xDC00), // low surrogate
+        mem.nativeToLittle(u16, 0x61), // not surrogate
+        mem.nativeToLittle(u16, 0xDC00), // low surrogate
     });
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xDC00), // low surrogate
+        mem.nativeToLittle(u16, 0xDC00), // low surrogate
     });
 }
diff --git a/lib/std/zig/c_translation.zig b/lib/std/zig/c_translation.zig
@@ -308,14 +308,12 @@ test "promoteIntLiteral" {
 
 /// Convert from clang __builtin_shufflevector index to Zig @shuffle index
 /// clang requires __builtin_shufflevector index arguments to be integer constants.
-/// negative values for `this_index` indicate "don't care" so we arbitrarily choose 0
+/// negative values for `this_index` indicate "don't care", in which case the returned index is `undefined`.
 /// clang enforces that `this_index` is less than the total number of vector elements
 /// See https://ziglang.org/documentation/master/#shuffle
 /// See https://clang.llvm.org/docs/LanguageExtensions.html#langext-builtin-shufflevector
 pub fn shuffleVectorIndex(comptime this_index: c_int, comptime source_vector_len: usize) i32 {
-    if (this_index <= 0) return 0;
-
-    const positive_index = @as(usize, @intCast(this_index));
+    const positive_index = std.math.cast(usize, this_index) orelse return undefined;
     if (positive_index < source_vector_len) return @as(i32, @intCast(this_index));
     const b_index = positive_index - source_vector_len;
     return ~@as(i32, @intCast(b_index));
@@ -324,7 +322,7 @@ pub fn shuffleVectorIndex(comptime this_index: c_int, comptime source_vector_len
 test "shuffleVectorIndex" {
     const vector_len: usize = 4;
 
-    try testing.expect(shuffleVectorIndex(-1, vector_len) == 0);
+    _ = shuffleVectorIndex(-1, vector_len);
 
     try testing.expect(shuffleVectorIndex(0, vector_len) == 0);
     try testing.expect(shuffleVectorIndex(1, vector_len) == 1);
diff --git a/src/InternPool.zig b/src/InternPool.zig
@@ -3587,6 +3587,7 @@ pub const Alignment = enum(u6) {
     @"8" = 3,
     @"16" = 4,
     @"32" = 5,
+    @"64" = 6,
     none = std.math.maxInt(u6),
     _,
 
@@ -7403,10 +7404,14 @@ pub fn isIntegerType(ip: *const InternPool, ty: Index) bool {
         .c_ulong_type,
         .c_longlong_type,
         .c_ulonglong_type,
-        .c_longdouble_type,
         .comptime_int_type,
         => true,
-        else => ip.indexToKey(ty) == .int_type,
+        else => switch (ip.items.items(.tag)[@intFromEnum(ty)]) {
+            .type_int_signed,
+            .type_int_unsigned,
+            => true,
+            else => false,
+        },
     };
 }
 
diff --git a/src/Sema.zig b/src/Sema.zig
@@ -23315,7 +23315,8 @@ fn checkVectorElemType(
     const mod = sema.mod;
     switch (ty.zigTypeTag(mod)) {
         .Int, .Float, .Bool => return,
-        else => if (ty.isPtrAtRuntime(mod)) return,
+        .Optional, .Pointer => if (ty.isPtrAtRuntime(mod)) return,
+        else => {},
     }
     return sema.fail(block, ty_src, "expected integer, float, bool, or pointer for the vector element type; found '{}'", .{ty.fmt(mod)});
 }
@@ -28442,7 +28443,7 @@ const CoerceOpts = struct {
     report_err: bool = true,
     /// Ignored if `report_err == false`.
     is_ret: bool = false,
-    /// Should coercion to comptime_int ermit an error message.
+    /// Should coercion to comptime_int emit an error message.
     no_cast_to_comptime_int: bool = false,
 
     param_src: struct {
@@ -31845,6 +31846,34 @@ fn coerceArrayLike(
     }
 
     const dest_elem_ty = dest_ty.childType(mod);
+    if (dest_ty.isVector(mod) and inst_ty.isVector(mod) and (try sema.resolveValue(inst)) == null) {
+        const inst_elem_ty = inst_ty.childType(mod);
+        switch (dest_elem_ty.zigTypeTag(mod)) {
+            .Int => if (inst_elem_ty.isInt(mod)) {
+                // integer widening
+                const dst_info = dest_elem_ty.intInfo(mod);
+                const src_info = inst_elem_ty.intInfo(mod);
+                if ((src_info.signedness == dst_info.signedness and dst_info.bits >= src_info.bits) or
+                    // small enough unsigned ints can get casted to large enough signed ints
+                    (dst_info.signedness == .signed and dst_info.bits > src_info.bits))
+                {
+                    try sema.requireRuntimeBlock(block, inst_src, null);
+                    return block.addTyOp(.intcast, dest_ty, inst);
+                }
+            },
+            .Float => if (inst_elem_ty.isRuntimeFloat()) {
+                // float widening
+                const src_bits = inst_elem_ty.floatBits(target);
+                const dst_bits = dest_elem_ty.floatBits(target);
+                if (dst_bits >= src_bits) {
+                    try sema.requireRuntimeBlock(block, inst_src, null);
+                    return block.addTyOp(.fpext, dest_ty, inst);
+                }
+            },
+            else => {},
+        }
+    }
+
     const element_vals = try sema.arena.alloc(InternPool.Index, dest_len);
     const element_refs = try sema.arena.alloc(Air.Inst.Ref, dest_len);
     var runtime_src: ?LazySrcLoc = null;
diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig
@@ -1547,6 +1547,27 @@ fn asmRegisterRegisterMemory(
     });
 }
 
+fn asmRegisterRegisterMemoryRegister(
+    self: *Self,
+    tag: Mir.Inst.FixedTag,
+    reg1: Register,
+    reg2: Register,
+    m: Memory,
+    reg3: Register,
+) !void {
+    _ = try self.addInst(.{ // append one instruction; the value addInst returns is discarded
+        .tag = tag[1], // second element of the fixed tag
+        .ops = .rrmr, // NOTE(review): presumably register, register, memory, register, matching the parameter order; confirm against Lower
+        .data = .{ .rrrx = .{
+            .fixes = tag[0], // first element of the fixed tag
+            .r1 = reg1,
+            .r2 = reg2,
+            .r3 = reg3,
+            .payload = try self.addExtra(Mir.Memory.encode(m)), // encoded memory operand is handed to addExtra; its result is stored as the payload
+        } },
+    });
+}
+
 fn asmMemory(self: *Self, tag: Mir.Inst.FixedTag, m: Memory) !void {
     _ = try self.addInst(.{
         .tag = tag[1],
@@ -1570,6 +1591,25 @@ fn asmRegisterMemory(self: *Self, tag: Mir.Inst.FixedTag, reg: Register, m: Memo
     });
 }
 
+fn asmRegisterMemoryRegister(
+    self: *Self,
+    tag: Mir.Inst.FixedTag,
+    reg1: Register,
+    m: Memory,
+    reg2: Register,
+) !void {
+    _ = try self.addInst(.{ // append one instruction; the value addInst returns is discarded
+        .tag = tag[1], // second element of the fixed tag
+        .ops = .rmr, // NOTE(review): presumably register, memory, register, matching the parameter order; confirm against Lower
+        .data = .{ .rrx = .{
+            .fixes = tag[0], // first element of the fixed tag
+            .r1 = reg1,
+            .r2 = reg2,
+            .payload = try self.addExtra(Mir.Memory.encode(m)), // encoded memory operand is handed to addExtra; its result is stored as the payload
+        } },
+    });
+}
+
 fn asmRegisterMemoryImmediate(
     self: *Self,
     tag: Mir.Inst.FixedTag,
@@ -2570,7 +2610,8 @@ fn restoreState(self: *Self, state: State, deaths: []const Air.Inst.Index, compt
 
     const ExpectedContents = [@typeInfo(RegisterManager.TrackedRegisters).Array.len]RegisterLock;
     var stack align(@max(@alignOf(ExpectedContents), @alignOf(std.heap.StackFallbackAllocator(0)))) =
-        if (opts.update_tracking) ({}) else std.heap.stackFallback(@sizeOf(ExpectedContents), self.gpa);
+        if (opts.update_tracking)
+    {} else std.heap.stackFallback(@sizeOf(ExpectedContents), self.gpa);
 
     var reg_locks = if (opts.update_tracking) {} else try std.ArrayList(RegisterLock).initCapacity(
         stack.get(),
@@ -2812,11 +2853,14 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
 }
 
 fn airFpext(self: *Self, inst: Air.Inst.Index) !void {
+    const mod = self.bin_file.comp.module.?;
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const dst_ty = self.typeOfIndex(inst);
-    const dst_bits = dst_ty.floatBits(self.target.*);
+    const dst_scalar_ty = dst_ty.scalarType(mod);
+    const dst_bits = dst_scalar_ty.floatBits(self.target.*);
     const src_ty = self.typeOf(ty_op.operand);
-    const src_bits = src_ty.floatBits(self.target.*);
+    const src_scalar_ty = src_ty.scalarType(mod);
+    const src_bits = src_scalar_ty.floatBits(self.target.*);
 
     const result = result: {
         if (switch (src_bits) {
@@ -2840,94 +2884,290 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void {
             },
             else => unreachable,
         }) {
+            if (dst_ty.isVector(mod)) break :result null;
             var callee_buf: ["__extend?f?f2".len]u8 = undefined;
             break :result try self.genCall(.{ .lib = .{
-                .return_type = self.floatCompilerRtAbiType(dst_ty, src_ty).toIntern(),
-                .param_types = &.{self.floatCompilerRtAbiType(src_ty, dst_ty).toIntern()},
+                .return_type = self.floatCompilerRtAbiType(dst_scalar_ty, src_scalar_ty).toIntern(),
+                .param_types = &.{self.floatCompilerRtAbiType(src_scalar_ty, dst_scalar_ty).toIntern()},
                 .callee = std.fmt.bufPrint(&callee_buf, "__extend{c}f{c}f2", .{
                     floatCompilerRtAbiName(src_bits),
                     floatCompilerRtAbiName(dst_bits),
                 }) catch unreachable,
-            } }, &.{src_ty}, &.{.{ .air_ref = ty_op.operand }});
+            } }, &.{src_scalar_ty}, &.{.{ .air_ref = ty_op.operand }});
         }
 
+        const src_abi_size: u32 = @intCast(src_ty.abiSize(mod));
         const src_mcv = try self.resolveInst(ty_op.operand);
         const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
             src_mcv
         else
             try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv);
-        const dst_reg = dst_mcv.getReg().?.to128();
+        const dst_reg = dst_mcv.getReg().?;
+        const dst_alias = registerAlias(dst_reg, @intCast(@max(dst_ty.abiSize(mod), 16)));
         const dst_lock = self.register_manager.lockReg(dst_reg);
         defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
 
+        const vec_len = if (dst_ty.isVector(mod)) dst_ty.vectorLen(mod) else 1;
         if (src_bits == 16) {
             assert(self.hasFeature(.f16c));
             const mat_src_reg = if (src_mcv.isRegister())
                 src_mcv.getReg().?
             else
                 try self.copyToTmpRegister(src_ty, src_mcv);
-            try self.asmRegisterRegister(.{ .v_ps, .cvtph2 }, dst_reg, mat_src_reg.to128());
+            try self.asmRegisterRegister(
+                .{ .v_ps, .cvtph2 },
+                dst_alias,
+                registerAlias(mat_src_reg, src_abi_size),
+            );
             switch (dst_bits) {
                 32 => {},
                 64 => try self.asmRegisterRegisterRegister(
                     .{ .v_sd, .cvtss2 },
-                    dst_reg,
-                    dst_reg,
-                    dst_reg,
+                    dst_alias,
+                    dst_alias,
+                    dst_alias,
                 ),
                 else => unreachable,
             }
         } else {
             assert(src_bits == 32 and dst_bits == 64);
-            if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
-                .{ .v_sd, .cvtss2 },
-                dst_reg,
-                dst_reg,
-                try src_mcv.mem(self, .dword),
-            ) else try self.asmRegisterRegisterRegister(
-                .{ .v_sd, .cvtss2 },
-                dst_reg,
-                dst_reg,
-                (if (src_mcv.isRegister())
-                    src_mcv.getReg().?
-                else
-                    try self.copyToTmpRegister(src_ty, src_mcv)).to128(),
-            ) else if (src_mcv.isMemory()) try self.asmRegisterMemory(
-                .{ ._sd, .cvtss2 },
-                dst_reg,
-                try src_mcv.mem(self, .dword),
+            if (self.hasFeature(.avx)) switch (vec_len) {
+                1 => if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
+                    .{ .v_sd, .cvtss2 },
+                    dst_alias,
+                    dst_alias,
+                    try src_mcv.mem(self, self.memSize(src_ty)),
+                ) else try self.asmRegisterRegisterRegister(
+                    .{ .v_sd, .cvtss2 },
+                    dst_alias,
+                    dst_alias,
+                    registerAlias(if (src_mcv.isRegister())
+                        src_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(src_ty, src_mcv), src_abi_size),
+                ),
+                2...4 => if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                    .{ .v_pd, .cvtps2 },
+                    dst_alias,
+                    try src_mcv.mem(self, self.memSize(src_ty)),
+                ) else try self.asmRegisterRegister(
+                    .{ .v_pd, .cvtps2 },
+                    dst_alias,
+                    registerAlias(if (src_mcv.isRegister())
+                        src_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(src_ty, src_mcv), src_abi_size),
+                ),
+                else => break :result null,
+            } else if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                switch (vec_len) {
+                    1 => .{ ._sd, .cvtss2 },
+                    2 => .{ ._pd, .cvtps2 },
+                    else => break :result null,
+                },
+                dst_alias,
+                try src_mcv.mem(self, self.memSize(src_ty)),
             ) else try self.asmRegisterRegister(
-                .{ ._sd, .cvtss2 },
-                dst_reg,
-                (if (src_mcv.isRegister())
+                switch (vec_len) {
+                    1 => .{ ._sd, .cvtss2 },
+                    2 => .{ ._pd, .cvtps2 },
+                    else => break :result null,
+                },
+                dst_alias,
+                registerAlias(if (src_mcv.isRegister())
                     src_mcv.getReg().?
                 else
-                    try self.copyToTmpRegister(src_ty, src_mcv)).to128(),
+                    try self.copyToTmpRegister(src_ty, src_mcv), src_abi_size),
             );
         }
         break :result dst_mcv;
-    };
+    } orelse return self.fail("TODO implement airFpext from {} to {}", .{
+        src_ty.fmt(mod), dst_ty.fmt(mod),
+    });
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
 fn airIntCast(self: *Self, inst: Air.Inst.Index) !void {
     const mod = self.bin_file.comp.module.?;
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
-    const result: MCValue = result: {
-        const src_ty = self.typeOf(ty_op.operand);
-        const src_int_info = src_ty.intInfo(mod);
+    const src_ty = self.typeOf(ty_op.operand);
+    const dst_ty = self.typeOfIndex(inst);
 
-        const dst_ty = self.typeOfIndex(inst);
-        const dst_int_info = dst_ty.intInfo(mod);
-        const abi_size: u32 = @intCast(dst_ty.abiSize(mod));
+    const result = @as(?MCValue, result: {
+        const dst_abi_size: u32 = @intCast(dst_ty.abiSize(mod));
 
-        const min_ty = if (dst_int_info.bits < src_int_info.bits) dst_ty else src_ty;
+        const src_int_info = src_ty.intInfo(mod);
+        const dst_int_info = dst_ty.intInfo(mod);
         const extend = switch (src_int_info.signedness) {
             .signed => dst_int_info,
             .unsigned => src_int_info,
         }.signedness;
 
         const src_mcv = try self.resolveInst(ty_op.operand);
+        if (dst_ty.isVector(mod)) {
+            const src_abi_size: u32 = @intCast(src_ty.abiSize(mod));
+            const max_abi_size = @max(dst_abi_size, src_abi_size);
+            if (max_abi_size > @as(u32, if (self.hasFeature(.avx2)) 32 else 16)) break :result null;
+            const has_avx = self.hasFeature(.avx);
+
+            const dst_elem_abi_size = dst_ty.childType(mod).abiSize(mod);
+            const src_elem_abi_size = src_ty.childType(mod).abiSize(mod);
+            switch (math.order(dst_elem_abi_size, src_elem_abi_size)) {
+                .lt => {
+                    const mir_tag: Mir.Inst.FixedTag = switch (dst_elem_abi_size) {
+                        else => break :result null,
+                        1 => switch (src_elem_abi_size) {
+                            else => break :result null,
+                            2 => switch (dst_int_info.signedness) {
+                                .signed => if (has_avx) .{ .vp_b, .ackssw } else .{ .p_b, .ackssw },
+                                .unsigned => if (has_avx) .{ .vp_b, .ackusw } else .{ .p_b, .ackusw },
+                            },
+                        },
+                        2 => switch (src_elem_abi_size) {
+                            else => break :result null,
+                            4 => switch (dst_int_info.signedness) {
+                                .signed => if (has_avx) .{ .vp_w, .ackssd } else .{ .p_w, .ackssd },
+                                .unsigned => if (has_avx)
+                                    .{ .vp_w, .ackusd }
+                                else if (self.hasFeature(.sse4_1))
+                                    .{ .p_w, .ackusd }
+                                else
+                                    break :result null,
+                            },
+                        },
+                    };
+
+                    const dst_mcv: MCValue = if (src_mcv.isRegister() and
+                        self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
+                        src_mcv
+                    else if (has_avx and src_mcv.isRegister())
+                        .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) }
+                    else
+                        try self.copyToRegisterWithInstTracking(inst, src_ty, src_mcv);
+                    const dst_reg = dst_mcv.getReg().?;
+                    const dst_alias = registerAlias(dst_reg, dst_abi_size);
+
+                    if (has_avx) try self.asmRegisterRegisterRegister(
+                        mir_tag,
+                        dst_alias,
+                        registerAlias(if (src_mcv.isRegister())
+                            src_mcv.getReg().?
+                        else
+                            dst_reg, src_abi_size),
+                        dst_alias,
+                    ) else try self.asmRegisterRegister(
+                        mir_tag,
+                        dst_alias,
+                        dst_alias,
+                    );
+                    break :result dst_mcv;
+                },
+                .eq => if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
+                    break :result src_mcv
+                else {
+                    const dst_mcv = try self.allocRegOrMem(inst, true);
+                    try self.genCopy(dst_ty, dst_mcv, src_mcv, .{});
+                    break :result dst_mcv;
+                },
+                .gt => if (self.hasFeature(.sse4_1)) {
+                    const mir_tag: Mir.Inst.FixedTag = .{ switch (dst_elem_abi_size) {
+                        else => break :result null,
+                        2 => if (has_avx) .vp_w else .p_w,
+                        4 => if (has_avx) .vp_d else .p_d,
+                        8 => if (has_avx) .vp_q else .p_q,
+                    }, switch (src_elem_abi_size) {
+                        else => break :result null,
+                        1 => switch (extend) {
+                            .signed => .movsxb,
+                            .unsigned => .movzxb,
+                        },
+                        2 => switch (extend) {
+                            .signed => .movsxw,
+                            .unsigned => .movzxw,
+                        },
+                        4 => switch (extend) {
+                            .signed => .movsxd,
+                            .unsigned => .movzxd,
+                        },
+                    } };
+
+                    const dst_mcv: MCValue = if (src_mcv.isRegister() and
+                        self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
+                        src_mcv
+                    else
+                        .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) };
+                    const dst_reg = dst_mcv.getReg().?;
+                    const dst_alias = registerAlias(dst_reg, dst_abi_size);
+
+                    if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                        mir_tag,
+                        dst_alias,
+                        try src_mcv.mem(self, self.memSize(src_ty)),
+                    ) else try self.asmRegisterRegister(
+                        mir_tag,
+                        dst_alias,
+                        registerAlias(if (src_mcv.isRegister())
+                            src_mcv.getReg().?
+                        else
+                            try self.copyToTmpRegister(src_ty, src_mcv), src_abi_size),
+                    );
+                    break :result dst_mcv;
+                } else {
+                    const mir_tag: Mir.Inst.FixedTag = switch (dst_elem_abi_size) {
+                        else => break :result null,
+                        2 => switch (src_elem_abi_size) {
+                            else => break :result null,
+                            1 => .{ .p_, .unpcklbw },
+                        },
+                        4 => switch (src_elem_abi_size) {
+                            else => break :result null,
+                            2 => .{ .p_, .unpcklwd },
+                        },
+                        8 => switch (src_elem_abi_size) {
+                            else => break :result null,
+                            4 => .{ .p_, .unpckldq }, // punpckldq interleaves dwords, so only 4-byte source elements widen to 8 bytes this way
+                        },
+                    };
+
+                    const dst_mcv: MCValue = if (src_mcv.isRegister() and
+                        self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
+                        src_mcv
+                    else
+                        try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv);
+                    const dst_reg = dst_mcv.getReg().?;
+
+                    const ext_reg = try self.register_manager.allocReg(null, abi.RegisterClass.sse);
+                    const ext_alias = registerAlias(ext_reg, src_abi_size);
+                    const ext_lock = self.register_manager.lockRegAssumeUnused(ext_reg);
+                    defer self.register_manager.unlockReg(ext_lock);
+
+                    try self.asmRegisterRegister(.{ .p_, .xor }, ext_alias, ext_alias);
+                    switch (extend) {
+                        .signed => try self.asmRegisterRegister(
+                            .{ switch (src_elem_abi_size) {
+                                else => unreachable,
+                                1 => .p_b,
+                                2 => .p_w,
+                                4 => .p_d,
+                            }, .cmpgt },
+                            ext_alias,
+                            registerAlias(dst_reg, src_abi_size),
+                        ),
+                        .unsigned => {},
+                    }
+                    try self.asmRegisterRegister(
+                        mir_tag,
+                        registerAlias(dst_reg, dst_abi_size),
+                        registerAlias(ext_reg, dst_abi_size),
+                    );
+                    break :result dst_mcv;
+                },
+            }
+            @compileError("unreachable");
+        }
+
+        const min_ty = if (dst_int_info.bits < src_int_info.bits) dst_ty else src_ty;
+
         const src_storage_bits: u16 = switch (src_mcv) {
             .register, .register_offset => 64,
             .register_pair => 128,
@@ -2945,13 +3185,13 @@ fn airIntCast(self: *Self, inst: Air.Inst.Index) !void {
         };
 
         if (dst_int_info.bits <= src_int_info.bits) break :result if (dst_mcv.isRegister())
-            .{ .register = registerAlias(dst_mcv.getReg().?, abi_size) }
+            .{ .register = registerAlias(dst_mcv.getReg().?, dst_abi_size) }
         else
             dst_mcv;
 
         if (dst_mcv.isRegister()) {
             try self.truncateRegister(src_ty, dst_mcv.getReg().?);
-            break :result .{ .register = registerAlias(dst_mcv.getReg().?, abi_size) };
+            break :result .{ .register = registerAlias(dst_mcv.getReg().?, dst_abi_size) };
         }
 
         const src_limbs_len = math.divCeil(u16, src_int_info.bits, 64) catch unreachable;
@@ -2999,7 +3239,9 @@ fn airIntCast(self: *Self, inst: Air.Inst.Index) !void {
         );
 
         break :result dst_mcv;
-    };
+    }) orelse return self.fail("TODO implement airIntCast from {} to {}", .{
+        src_ty.fmt(mod), dst_ty.fmt(mod),
+    });
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
@@ -3022,7 +3264,7 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
             src_mcv
         else if (dst_abi_size <= 8)
             try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv)
-        else if (dst_abi_size <= 16) dst: {
+        else if (dst_abi_size <= 16 and !dst_ty.isVector(mod)) dst: {
             const dst_regs =
                 try self.register_manager.allocRegs(2, .{ inst, inst }, abi.RegisterClass.gp);
             const dst_mcv: MCValue = .{ .register_pair = dst_regs };
@@ -3032,26 +3274,29 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
             try self.genCopy(dst_ty, dst_mcv, src_mcv, .{});
             break :dst dst_mcv;
         } else dst: {
-            const dst_mcv = try self.allocRegOrMem(inst, true);
-            try self.genCopy(dst_ty, dst_mcv, src_mcv, .{});
+            const dst_mcv = try self.allocRegOrMemAdvanced(src_ty, inst, true);
+            try self.genCopy(src_ty, dst_mcv, src_mcv, .{});
             break :dst dst_mcv;
         };
 
         if (dst_ty.zigTypeTag(mod) == .Vector) {
             assert(src_ty.zigTypeTag(mod) == .Vector and dst_ty.vectorLen(mod) == src_ty.vectorLen(mod));
-            const dst_info = dst_ty.childType(mod).intInfo(mod);
-            const src_info = src_ty.childType(mod).intInfo(mod);
-            const mir_tag = @as(?Mir.Inst.FixedTag, switch (dst_info.bits) {
-                8 => switch (src_info.bits) {
-                    16 => switch (dst_ty.vectorLen(mod)) {
+            const dst_elem_ty = dst_ty.childType(mod);
+            const dst_elem_abi_size: u32 = @intCast(dst_elem_ty.abiSize(mod));
+            const src_elem_ty = src_ty.childType(mod);
+            const src_elem_abi_size: u32 = @intCast(src_elem_ty.abiSize(mod));
+
+            const mir_tag = @as(?Mir.Inst.FixedTag, switch (dst_elem_abi_size) {
+                1 => switch (src_elem_abi_size) {
+                    2 => switch (dst_ty.vectorLen(mod)) {
                         1...8 => if (self.hasFeature(.avx)) .{ .vp_b, .ackusw } else .{ .p_b, .ackusw },
                         9...16 => if (self.hasFeature(.avx2)) .{ .vp_b, .ackusw } else null,
                         else => null,
                     },
                     else => null,
                 },
-                16 => switch (src_info.bits) {
-                    32 => switch (dst_ty.vectorLen(mod)) {
+                2 => switch (src_elem_abi_size) {
+                    4 => switch (dst_ty.vectorLen(mod)) {
                         1...4 => if (self.hasFeature(.avx))
                             .{ .vp_w, .ackusd }
                         else if (self.hasFeature(.sse4_1))
@@ -3066,12 +3311,14 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
                 else => null,
             }) orelse return self.fail("TODO implement airTrunc for {}", .{dst_ty.fmt(mod)});
 
-            const elem_ty = src_ty.childType(mod);
-            const mask_val = try mod.intValue(elem_ty, @as(u64, math.maxInt(u64)) >> @intCast(64 - dst_info.bits));
+            const dst_info = dst_elem_ty.intInfo(mod);
+            const src_info = src_elem_ty.intInfo(mod);
+
+            const mask_val = try mod.intValue(src_elem_ty, @as(u64, math.maxInt(u64)) >> @intCast(64 - dst_info.bits));
 
             const splat_ty = try mod.vectorType(.{
                 .len = @intCast(@divExact(@as(u64, if (src_abi_size > 16) 256 else 128), src_info.bits)),
-                .child = elem_ty.ip_index,
+                .child = src_elem_ty.ip_index,
             });
             const splat_abi_size: u32 = @intCast(splat_ty.abiSize(mod));
 
@@ -3086,22 +3333,40 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
                 else => .{ .register = try self.copyToTmpRegister(Type.usize, splat_mcv.address()) },
             };
 
-            const dst_reg = registerAlias(dst_mcv.getReg().?, src_abi_size);
+            const dst_reg = dst_mcv.getReg().?;
+            const dst_alias = registerAlias(dst_reg, src_abi_size);
             if (self.hasFeature(.avx)) {
                 try self.asmRegisterRegisterMemory(
                     .{ .vp_, .@"and" },
-                    dst_reg,
-                    dst_reg,
+                    dst_alias,
+                    dst_alias,
                     try splat_addr_mcv.deref().mem(self, Memory.Size.fromSize(splat_abi_size)),
                 );
-                try self.asmRegisterRegisterRegister(mir_tag, dst_reg, dst_reg, dst_reg);
+                if (src_abi_size > 16) {
+                    const temp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.sse);
+                    const temp_lock = self.register_manager.lockRegAssumeUnused(temp_reg);
+                    defer self.register_manager.unlockReg(temp_lock);
+
+                    try self.asmRegisterRegisterImmediate(
+                        .{ if (self.hasFeature(.avx2)) .v_i128 else .v_f128, .extract },
+                        registerAlias(temp_reg, dst_abi_size),
+                        dst_alias,
+                        Immediate.u(1),
+                    );
+                    try self.asmRegisterRegisterRegister(
+                        mir_tag,
+                        registerAlias(dst_reg, dst_abi_size),
+                        registerAlias(dst_reg, dst_abi_size),
+                        registerAlias(temp_reg, dst_abi_size),
+                    );
+                } else try self.asmRegisterRegisterRegister(mir_tag, dst_alias, dst_alias, dst_alias);
             } else {
                 try self.asmRegisterMemory(
                     .{ .p_, .@"and" },
-                    dst_reg,
+                    dst_alias,
                     try splat_addr_mcv.deref().mem(self, Memory.Size.fromSize(splat_abi_size)),
                 );
-                try self.asmRegisterRegister(mir_tag, dst_reg, dst_reg);
+                try self.asmRegisterRegister(mir_tag, dst_alias, dst_alias);
             }
             break :result dst_mcv;
         }
@@ -4045,7 +4310,7 @@ fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
             if (dst_info.bits > 128 and dst_info.signedness == .unsigned) {
                 const slow_inc = self.hasFeature(.slow_incdec);
                 const abi_size: u32 = @intCast(dst_ty.abiSize(mod));
-                const limb_len = std.math.divCeil(u32, abi_size, 8) catch unreachable;
+                const limb_len = math.divCeil(u32, abi_size, 8) catch unreachable;
 
                 try self.spillRegisters(&.{ .rax, .rcx, .rdx });
                 const reg_locks = self.register_manager.lockRegsAssumeUnused(3, .{ .rax, .rcx, .rdx });
@@ -4534,7 +4799,7 @@ fn airShlShrBinOp(self: *Self, inst: Air.Inst.Index) !void {
         switch (lhs_ty.zigTypeTag(mod)) {
             .Int => {
                 try self.spillRegisters(&.{.rcx});
-                try self.register_manager.getReg(.rcx, null);
+                try self.register_manager.getKnownReg(.rcx, null);
                 const lhs_mcv = try self.resolveInst(bin_op.lhs);
                 const rhs_mcv = try self.resolveInst(bin_op.rhs);
 
@@ -6560,7 +6825,7 @@ fn floatSign(self: *Self, inst: Air.Inst.Index, operand: Air.Inst.Ref, ty: Type)
 
             const dst_mcv: MCValue = .{ .register = .st0 };
             if (!std.meta.eql(src_mcv, dst_mcv) or !self.reuseOperand(inst, operand, 0, src_mcv))
-                try self.register_manager.getReg(.st0, inst);
+                try self.register_manager.getKnownReg(.st0, inst);
 
             try self.genCopy(ty, dst_mcv, src_mcv, .{});
             switch (tag) {
@@ -6894,7 +7159,7 @@ fn airAbs(self: *Self, inst: Air.Inst.Index) !void {
                 },
                 else => {
                     const abi_size: u31 = @intCast(ty.abiSize(mod));
-                    const limb_len = std.math.divCeil(u31, abi_size, 8) catch unreachable;
+                    const limb_len = math.divCeil(u31, abi_size, 8) catch unreachable;
 
                     const tmp_regs =
                         try self.register_manager.allocRegs(3, .{null} ** 3, abi.RegisterClass.gp);
@@ -8181,7 +8446,7 @@ fn genShiftBinOpMir(
                         try self.asmRegisterImmediate(
                             .{ ._, .@"and" },
                             .cl,
-                            Immediate.u(std.math.maxInt(u6)),
+                            Immediate.u(math.maxInt(u6)),
                         );
                         try self.asmRegisterImmediate(
                             .{ ._r, .sh },
@@ -8218,7 +8483,7 @@ fn genShiftBinOpMir(
                         try self.asmRegisterImmediate(
                             .{ ._, .@"and" },
                             .cl,
-                            Immediate.u(std.math.maxInt(u6)),
+                            Immediate.u(math.maxInt(u6)),
                         );
                         try self.asmRegisterImmediate(
                             .{ ._r, .sh },
@@ -8283,7 +8548,7 @@ fn genShiftBinOpMir(
                     }, .sh },
                     temp_regs[2].to64(),
                     temp_regs[3].to64(),
-                    Immediate.u(shift_imm & std.math.maxInt(u6)),
+                    Immediate.u(shift_imm & math.maxInt(u6)),
                 ),
                 else => try self.asmRegisterRegisterRegister(.{ switch (tag[0]) {
                     ._l => ._ld,
@@ -8338,7 +8603,7 @@ fn genShiftBinOpMir(
             .immediate => |shift_imm| try self.asmRegisterImmediate(
                 tag,
                 temp_regs[2].to64(),
-                Immediate.u(shift_imm & std.math.maxInt(u6)),
+                Immediate.u(shift_imm & math.maxInt(u6)),
             ),
             else => try self.asmRegisterRegister(tag, temp_regs[2].to64(), .cl),
         }
@@ -8794,7 +9059,7 @@ fn genShiftBinOp(
         lhs_ty.fmt(mod),
     });
 
-    try self.register_manager.getReg(.rcx, null);
+    try self.register_manager.getKnownReg(.rcx, null);
     const rcx_lock = self.register_manager.lockReg(.rcx);
     defer if (rcx_lock) |lock| self.register_manager.unlockReg(lock);
 
@@ -8933,7 +9198,7 @@ fn genMulDivBinOp(
         switch (tag) {
             .mul, .mul_wrap => {
                 const slow_inc = self.hasFeature(.slow_incdec);
-                const limb_len = std.math.divCeil(u32, src_abi_size, 8) catch unreachable;
+                const limb_len = math.divCeil(u32, src_abi_size, 8) catch unreachable;
 
                 try self.spillRegisters(&.{ .rax, .rcx, .rdx });
                 const reg_locks = self.register_manager.lockRegs(3, .{ .rax, .rcx, .rdx });
@@ -9117,8 +9382,8 @@ fn genMulDivBinOp(
                 .rem => maybe_inst,
                 else => null,
             };
-            try self.register_manager.getReg(.rax, track_inst_rax);
-            try self.register_manager.getReg(.rdx, track_inst_rdx);
+            try self.register_manager.getKnownReg(.rax, track_inst_rax);
+            try self.register_manager.getKnownReg(.rdx, track_inst_rdx);
 
             try self.genIntMulDivOpMir(switch (signedness) {
                 .signed => switch (tag) {
@@ -9158,8 +9423,11 @@ fn genMulDivBinOp(
         },
 
         .mod => {
-            try self.register_manager.getReg(.rax, null);
-            try self.register_manager.getReg(.rdx, if (signedness == .unsigned) maybe_inst else null);
+            try self.register_manager.getKnownReg(.rax, null);
+            try self.register_manager.getKnownReg(
+                .rdx,
+                if (signedness == .unsigned) maybe_inst else null,
+            );
 
             switch (signedness) {
                 .signed => {
@@ -9200,8 +9468,11 @@ fn genMulDivBinOp(
         },
 
         .div_floor => {
-            try self.register_manager.getReg(.rax, if (signedness == .unsigned) maybe_inst else null);
-            try self.register_manager.getReg(.rdx, null);
+            try self.register_manager.getKnownReg(
+                .rax,
+                if (signedness == .unsigned) maybe_inst else null,
+            );
+            try self.register_manager.getKnownReg(.rdx, null);
 
             const lhs_lock: ?RegisterLock = switch (lhs_mcv) {
                 .register => |reg| self.register_manager.lockRegAssumeUnused(reg),
@@ -9445,7 +9716,7 @@ fn genBinOp(
         .rem, .mod => unreachable,
         .max, .min => if (lhs_ty.scalarType(mod).isRuntimeFloat()) registerAlias(
             if (!self.hasFeature(.avx) and self.hasFeature(.sse4_1)) mask: {
-                try self.register_manager.getReg(.xmm0, null);
+                try self.register_manager.getKnownReg(.xmm0, null);
                 break :mask .xmm0;
             } else try self.register_manager.allocReg(null, abi.RegisterClass.sse),
             abi_size,
@@ -10820,96 +11091,35 @@ fn genBinOp(
                 lhs_copy_reg.?,
                 mask_reg,
             ) else {
-                try self.asmRegisterRegister(
-                    @as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag(mod)) {
-                        .Float => switch (lhs_ty.floatBits(self.target.*)) {
-                            32 => .{ ._ps, .@"and" },
-                            64 => .{ ._pd, .@"and" },
-                            16, 80, 128 => null,
-                            else => unreachable,
-                        },
-                        .Vector => switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
-                            .Float => switch (lhs_ty.childType(mod).floatBits(self.target.*)) {
-                                32 => switch (lhs_ty.vectorLen(mod)) {
-                                    1...4 => .{ ._ps, .@"and" },
-                                    else => null,
-                                },
-                                64 => switch (lhs_ty.vectorLen(mod)) {
-                                    1...2 => .{ ._pd, .@"and" },
-                                    else => null,
-                                },
-                                16, 80, 128 => null,
-                                else => unreachable,
-                            },
-                            else => unreachable,
-                        },
+                const mir_fixes = @as(?Mir.Inst.Fixes, switch (lhs_ty.zigTypeTag(mod)) {
+                    .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                        32 => ._ps,
+                        64 => ._pd,
+                        16, 80, 128 => null,
                         else => unreachable,
-                    }) orelse return self.fail("TODO implement genBinOp for {s} {}", .{
-                        @tagName(air_tag), lhs_ty.fmt(mod),
-                    }),
-                    dst_reg,
-                    mask_reg,
-                );
-                try self.asmRegisterRegister(
-                    @as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag(mod)) {
-                        .Float => switch (lhs_ty.floatBits(self.target.*)) {
-                            32 => .{ ._ps, .andn },
-                            64 => .{ ._pd, .andn },
-                            16, 80, 128 => null,
-                            else => unreachable,
-                        },
-                        .Vector => switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
-                            .Float => switch (lhs_ty.childType(mod).floatBits(self.target.*)) {
-                                32 => switch (lhs_ty.vectorLen(mod)) {
-                                    1...4 => .{ ._ps, .andn },
-                                    else => null,
-                                },
-                                64 => switch (lhs_ty.vectorLen(mod)) {
-                                    1...2 => .{ ._pd, .andn },
-                                    else => null,
-                                },
-                                16, 80, 128 => null,
-                                else => unreachable,
+                    },
+                    .Vector => switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
+                        .Float => switch (lhs_ty.childType(mod).floatBits(self.target.*)) {
+                            32 => switch (lhs_ty.vectorLen(mod)) {
+                                1...4 => ._ps,
+                                else => null,
                             },
-                            else => unreachable,
-                        },
-                        else => unreachable,
-                    }) orelse return self.fail("TODO implement genBinOp for {s} {}", .{
-                        @tagName(air_tag), lhs_ty.fmt(mod),
-                    }),
-                    mask_reg,
-                    lhs_copy_reg.?,
-                );
-                try self.asmRegisterRegister(
-                    @as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag(mod)) {
-                        .Float => switch (lhs_ty.floatBits(self.target.*)) {
-                            32 => .{ ._ps, .@"or" },
-                            64 => .{ ._pd, .@"or" },
-                            16, 80, 128 => null,
-                            else => unreachable,
-                        },
-                        .Vector => switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
-                            .Float => switch (lhs_ty.childType(mod).floatBits(self.target.*)) {
-                                32 => switch (lhs_ty.vectorLen(mod)) {
-                                    1...4 => .{ ._ps, .@"or" },
-                                    else => null,
-                                },
-                                64 => switch (lhs_ty.vectorLen(mod)) {
-                                    1...2 => .{ ._pd, .@"or" },
-                                    else => null,
-                                },
-                                16, 80, 128 => null,
-                                else => unreachable,
+                            64 => switch (lhs_ty.vectorLen(mod)) {
+                                1...2 => ._pd,
+                                else => null,
                             },
+                            16, 80, 128 => null,
                             else => unreachable,
                         },
                         else => unreachable,
-                    }) orelse return self.fail("TODO implement genBinOp for {s} {}", .{
-                        @tagName(air_tag), lhs_ty.fmt(mod),
-                    }),
-                    dst_reg,
-                    mask_reg,
-                );
+                    },
+                    else => unreachable,
+                }) orelse return self.fail("TODO implement genBinOp for {s} {}", .{
+                    @tagName(air_tag), lhs_ty.fmt(mod),
+                });
+                try self.asmRegisterRegister(.{ mir_fixes, .@"and" }, dst_reg, mask_reg);
+                try self.asmRegisterRegister(.{ mir_fixes, .andn }, mask_reg, lhs_copy_reg.?);
+                try self.asmRegisterRegister(.{ mir_fixes, .@"or" }, dst_reg, mask_reg);
             }
         },
         .cmp_lt, .cmp_lte, .cmp_eq, .cmp_gte, .cmp_gt, .cmp_neq => {
@@ -12192,48 +12402,10 @@ fn airRetLoad(self: *Self, inst: Air.Inst.Index) !void {
 fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
     const mod = self.bin_file.comp.module.?;
     const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op;
-    const ty = self.typeOf(bin_op.lhs);
+    var ty = self.typeOf(bin_op.lhs);
+    var null_compare: ?Mir.Inst.Index = null;
 
     const result: Condition = result: {
-        switch (ty.zigTypeTag(mod)) {
-            .Float => {
-                const float_bits = ty.floatBits(self.target.*);
-                if (switch (float_bits) {
-                    16 => !self.hasFeature(.f16c),
-                    32, 64 => false,
-                    80, 128 => true,
-                    else => unreachable,
-                }) {
-                    var callee_buf: ["__???f2".len]u8 = undefined;
-                    const ret = try self.genCall(.{ .lib = .{
-                        .return_type = .i32_type,
-                        .param_types = &.{ ty.toIntern(), ty.toIntern() },
-                        .callee = std.fmt.bufPrint(&callee_buf, "__{s}{c}f2", .{
-                            switch (op) {
-                                .eq => "eq",
-                                .neq => "ne",
-                                .lt => "lt",
-                                .lte => "le",
-                                .gt => "gt",
-                                .gte => "ge",
-                            },
-                            floatCompilerRtAbiName(float_bits),
-                        }) catch unreachable,
-                    } }, &.{ ty, ty }, &.{ .{ .air_ref = bin_op.lhs }, .{ .air_ref = bin_op.rhs } });
-                    try self.genBinOpMir(.{ ._, .@"test" }, Type.i32, ret, ret);
-                    break :result switch (op) {
-                        .eq => .e,
-                        .neq => .ne,
-                        .lt => .l,
-                        .lte => .le,
-                        .gt => .g,
-                        .gte => .ge,
-                    };
-                }
-            },
-            else => {},
-        }
-
         try self.spillEflagsIfOccupied();
 
         const lhs_mcv = try self.resolveInst(bin_op.lhs);
@@ -12261,6 +12433,103 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
         defer for (rhs_locks) |rhs_lock| if (rhs_lock) |lock| self.register_manager.unlockReg(lock);
 
         switch (ty.zigTypeTag(mod)) {
+            .Float => {
+                const float_bits = ty.floatBits(self.target.*);
+                if (switch (float_bits) {
+                    16 => !self.hasFeature(.f16c),
+                    32, 64 => false,
+                    80, 128 => true,
+                    else => unreachable,
+                }) {
+                    var callee_buf: ["__???f2".len]u8 = undefined;
+                    const ret = try self.genCall(.{ .lib = .{
+                        .return_type = .i32_type,
+                        .param_types = &.{ ty.toIntern(), ty.toIntern() },
+                        .callee = std.fmt.bufPrint(&callee_buf, "__{s}{c}f2", .{
+                            switch (op) {
+                                .eq => "eq",
+                                .neq => "ne",
+                                .lt => "lt",
+                                .lte => "le",
+                                .gt => "gt",
+                                .gte => "ge",
+                            },
+                            floatCompilerRtAbiName(float_bits),
+                        }) catch unreachable,
+                    } }, &.{ ty, ty }, &.{ .{ .air_ref = bin_op.lhs }, .{ .air_ref = bin_op.rhs } });
+                    try self.genBinOpMir(.{ ._, .@"test" }, Type.i32, ret, ret);
+                    break :result switch (op) {
+                        .eq => .e,
+                        .neq => .ne,
+                        .lt => .l,
+                        .lte => .le,
+                        .gt => .g,
+                        .gte => .ge,
+                    };
+                }
+            },
+            .Optional => if (!ty.optionalReprIsPayload(mod)) {
+                const opt_ty = ty;
+                const opt_abi_size: u31 = @intCast(opt_ty.abiSize(mod));
+                ty = opt_ty.optionalChild(mod);
+                const payload_abi_size: u31 = @intCast(ty.abiSize(mod));
+
+                const temp_lhs_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+                const temp_lhs_lock = self.register_manager.lockRegAssumeUnused(temp_lhs_reg);
+                defer self.register_manager.unlockReg(temp_lhs_lock);
+
+                if (lhs_mcv.isMemory()) try self.asmRegisterMemory(
+                    .{ ._, .mov },
+                    temp_lhs_reg.to8(),
+                    try lhs_mcv.address().offset(payload_abi_size).deref().mem(self, .byte),
+                ) else {
+                    try self.genSetReg(temp_lhs_reg, opt_ty, lhs_mcv, .{});
+                    try self.asmRegisterImmediate(
+                        .{ ._r, .sh },
+                        registerAlias(temp_lhs_reg, opt_abi_size),
+                        Immediate.u(payload_abi_size * 8),
+                    );
+                }
+
+                const payload_compare = payload_compare: {
+                    if (rhs_mcv.isMemory()) {
+                        const rhs_mem =
+                            try rhs_mcv.address().offset(payload_abi_size).deref().mem(self, .byte);
+                        try self.asmMemoryRegister(.{ ._, .@"test" }, rhs_mem, temp_lhs_reg.to8());
+                        const payload_compare = try self.asmJccReloc(.nz, undefined);
+                        try self.asmRegisterMemory(.{ ._, .cmp }, temp_lhs_reg.to8(), rhs_mem);
+                        break :payload_compare payload_compare;
+                    }
+
+                    const temp_rhs_reg = try self.copyToTmpRegister(opt_ty, rhs_mcv);
+                    const temp_rhs_lock = self.register_manager.lockRegAssumeUnused(temp_rhs_reg);
+                    defer self.register_manager.unlockReg(temp_rhs_lock);
+
+                    try self.asmRegisterImmediate(
+                        .{ ._r, .sh },
+                        registerAlias(temp_rhs_reg, opt_abi_size),
+                        Immediate.u(payload_abi_size * 8),
+                    );
+                    try self.asmRegisterRegister(
+                        .{ ._, .@"test" },
+                        temp_lhs_reg.to8(),
+                        temp_rhs_reg.to8(),
+                    );
+                    const payload_compare = try self.asmJccReloc(.nz, undefined);
+                    try self.asmRegisterRegister(
+                        .{ ._, .cmp },
+                        temp_lhs_reg.to8(),
+                        temp_rhs_reg.to8(),
+                    );
+                    break :payload_compare payload_compare;
+                };
+                null_compare = try self.asmJmpReloc(undefined);
+                self.performReloc(payload_compare);
+            },
+            else => {},
+        }
+
+        switch (ty.zigTypeTag(mod)) {
             else => {
                 const abi_size: u16 = @intCast(ty.abiSize(mod));
                 const may_flip: enum {
@@ -12571,6 +12840,7 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
         }
     };
 
+    if (null_compare) |reloc| self.performReloc(reloc);
     self.eflags_inst = inst;
     return self.finishAir(inst, .{ .eflags = result }, .{ bin_op.lhs, bin_op.rhs, .none });
 }
@@ -13521,6 +13791,7 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
         } else if (constraint.len == 1 and std.ascii.isDigit(constraint[0])) arg: {
             const index = std.fmt.charToDigit(constraint[0], 10) catch unreachable;
             if (index >= args.items.len) return self.fail("constraint out of bounds: '{s}'", .{constraint});
+            try self.genCopy(ty, args.items[index], input_mcv, .{});
             break :arg args.items[index];
         } else return self.fail("invalid constraint: '{s}'", .{constraint});
         if (arg_mcv.getReg()) |reg| if (RegisterManager.indexOfRegIntoTracked(reg)) |_| {
@@ -13619,25 +13890,26 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
             label_gop.value_ptr.target = @intCast(self.mir_instructions.len);
         } else continue;
 
-        var mnem_size: ?Memory.Size = null;
-        const mnem_tag = mnem: {
-            mnem_size = if (mem.endsWith(u8, mnem_str, "b"))
-                .byte
-            else if (mem.endsWith(u8, mnem_str, "w"))
-                .word
-            else if (mem.endsWith(u8, mnem_str, "l"))
-                .dword
-            else if (mem.endsWith(u8, mnem_str, "q"))
-                .qword
-            else if (mem.endsWith(u8, mnem_str, "t"))
-                .tbyte
-            else
-                break :mnem null;
-            break :mnem std.meta.stringToEnum(Instruction.Mnemonic, mnem_str[0 .. mnem_str.len - 1]);
-        } orelse mnem: {
+        var mnem_size: ?Memory.Size = if (mem.endsWith(u8, mnem_str, "b"))
+            .byte
+        else if (mem.endsWith(u8, mnem_str, "w"))
+            .word
+        else if (mem.endsWith(u8, mnem_str, "l"))
+            .dword
+        else if (mem.endsWith(u8, mnem_str, "q") and
+            (std.mem.indexOfScalar(u8, "vp", mnem_str[0]) == null or !mem.endsWith(u8, mnem_str, "dq")))
+            .qword
+        else if (mem.endsWith(u8, mnem_str, "t"))
+            .tbyte
+        else
+            null;
+        const mnem_tag = while (true) break std.meta.stringToEnum(
+            Instruction.Mnemonic,
+            mnem_str[0 .. mnem_str.len - @intFromBool(mnem_size != null)],
+        ) orelse if (mnem_size) |_| {
             mnem_size = null;
-            break :mnem std.meta.stringToEnum(Instruction.Mnemonic, mnem_str);
-        } orelse return self.fail("invalid mnemonic: '{s}'", .{mnem_str});
+            continue;
+        } else return self.fail("invalid mnemonic: '{s}'", .{mnem_str});
         if (@as(?Memory.Size, switch (mnem_tag) {
             .clflush => .byte,
             .fldenv, .fnstenv, .fstenv => .none,
@@ -14135,30 +14407,8 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo
                     else => {},
                 },
                 .Int => switch (ty.childType(mod).intInfo(mod).bits) {
-                    8 => switch (ty.vectorLen(mod)) {
-                        1 => if (self.hasFeature(.avx)) return .{ .vex_insert_extract = .{
-                            .insert = .{ .vp_b, .insr },
-                            .extract = .{ .vp_b, .extr },
-                        } } else if (self.hasFeature(.sse4_2)) return .{ .insert_extract = .{
-                            .insert = .{ .p_b, .insr },
-                            .extract = .{ .p_b, .extr },
-                        } },
-                        2 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{
-                            .insert = .{ .vp_w, .insr },
-                            .extract = .{ .vp_w, .extr },
-                        } } else .{ .insert_extract = .{
-                            .insert = .{ .p_w, .insr },
-                            .extract = .{ .p_w, .extr },
-                        } },
-                        3...4 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_d, .mov }
-                        else
-                            .{ ._d, .mov } },
-                        5...8 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_q, .mov }
-                        else
-                            .{ ._q, .mov } },
-                        9...16 => return .{ .move = if (self.hasFeature(.avx))
+                    1...8 => switch (ty.vectorLen(mod)) {
+                        1...16 => return .{ .move = if (self.hasFeature(.avx))
                             if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
                         else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
                         17...32 => if (self.hasFeature(.avx))
@@ -14168,23 +14418,8 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo
                                 .{ .v_, .movdqu } },
                         else => {},
                     },
-                    16 => switch (ty.vectorLen(mod)) {
-                        1 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{
-                            .insert = .{ .vp_w, .insr },
-                            .extract = .{ .vp_w, .extr },
-                        } } else .{ .insert_extract = .{
-                            .insert = .{ .p_w, .insr },
-                            .extract = .{ .p_w, .extr },
-                        } },
-                        2 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_d, .mov }
-                        else
-                            .{ ._d, .mov } },
-                        3...4 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_q, .mov }
-                        else
-                            .{ ._q, .mov } },
-                        5...8 => return .{ .move = if (self.hasFeature(.avx))
+                    9...16 => switch (ty.vectorLen(mod)) {
+                        1...8 => return .{ .move = if (self.hasFeature(.avx))
                             if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
                         else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
                         9...16 => if (self.hasFeature(.avx))
@@ -14194,16 +14429,8 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo
                                 .{ .v_, .movdqu } },
                         else => {},
                     },
-                    32 => switch (ty.vectorLen(mod)) {
-                        1 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_d, .mov }
-                        else
-                            .{ ._d, .mov } },
-                        2 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_q, .mov }
-                        else
-                            .{ ._q, .mov } },
-                        3...4 => return .{ .move = if (self.hasFeature(.avx))
+                    17...32 => switch (ty.vectorLen(mod)) {
+                        1...4 => return .{ .move = if (self.hasFeature(.avx))
                             if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
                         else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
                         5...8 => if (self.hasFeature(.avx))
@@ -14213,12 +14440,8 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo
                                 .{ .v_, .movdqu } },
                         else => {},
                     },
-                    64 => switch (ty.vectorLen(mod)) {
-                        1 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_q, .mov }
-                        else
-                            .{ ._q, .mov } },
-                        2 => return .{ .move = if (self.hasFeature(.avx))
+                    33...64 => switch (ty.vectorLen(mod)) {
+                        1...2 => return .{ .move = if (self.hasFeature(.avx))
                             if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
                         else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
                         3...4 => if (self.hasFeature(.avx))
@@ -14228,7 +14451,7 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo
                                 .{ .v_, .movdqu } },
                         else => {},
                     },
-                    128 => switch (ty.vectorLen(mod)) {
+                    65...128 => switch (ty.vectorLen(mod)) {
                         1 => return .{ .move = if (self.hasFeature(.avx))
                             if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
                         else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
@@ -14239,7 +14462,7 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo
                                 .{ .v_, .movdqu } },
                         else => {},
                     },
-                    256 => switch (ty.vectorLen(mod)) {
+                    129...256 => switch (ty.vectorLen(mod)) {
                         1 => if (self.hasFeature(.avx))
                             return .{ .move = if (aligned)
                                 .{ .v_, .movdqa }
@@ -14251,11 +14474,7 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo
                 },
                 .Pointer, .Optional => if (ty.childType(mod).isPtrAtRuntime(mod))
                     switch (ty.vectorLen(mod)) {
-                        1 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_q, .mov }
-                        else
-                            .{ ._q, .mov } },
-                        2 => return .{ .move = if (self.hasFeature(.avx))
+                        1...2 => return .{ .move = if (self.hasFeature(.avx))
                             if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
                         else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
                         3...4 => if (self.hasFeature(.avx))
@@ -14269,22 +14488,7 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo
                     unreachable,
                 .Float => switch (ty.childType(mod).floatBits(self.target.*)) {
                     16 => switch (ty.vectorLen(mod)) {
-                        1 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{
-                            .insert = .{ .vp_w, .insr },
-                            .extract = .{ .vp_w, .extr },
-                        } } else .{ .insert_extract = .{
-                            .insert = .{ .p_w, .insr },
-                            .extract = .{ .p_w, .extr },
-                        } },
-                        2 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_d, .mov }
-                        else
-                            .{ ._d, .mov } },
-                        3...4 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_q, .mov }
-                        else
-                            .{ ._q, .mov } },
-                        5...8 => return .{ .move = if (self.hasFeature(.avx))
+                        1...8 => return .{ .move = if (self.hasFeature(.avx))
                             if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
                         else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
                         9...16 => if (self.hasFeature(.avx))
@@ -14295,15 +14499,7 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo
                         else => {},
                     },
                     32 => switch (ty.vectorLen(mod)) {
-                        1 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_ss, .mov }
-                        else
-                            .{ ._ss, .mov } },
-                        2 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_sd, .mov }
-                        else
-                            .{ ._sd, .mov } },
-                        3...4 => return .{ .move = if (self.hasFeature(.avx))
+                        1...4 => return .{ .move = if (self.hasFeature(.avx))
                             if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu }
                         else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu } },
                         5...8 => if (self.hasFeature(.avx))
@@ -14314,11 +14510,7 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo
                         else => {},
                     },
                     64 => switch (ty.vectorLen(mod)) {
-                        1 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_sd, .mov }
-                        else
-                            .{ ._sd, .mov } },
-                        2 => return .{ .move = if (self.hasFeature(.avx))
+                        1...2 => return .{ .move = if (self.hasFeature(.avx))
                             if (aligned) .{ .v_pd, .mova } else .{ .v_pd, .movu }
                         else if (aligned) .{ ._pd, .mova } else .{ ._pd, .movu } },
                         3...4 => if (self.hasFeature(.avx))
@@ -14633,7 +14825,7 @@ fn genSetReg(
                 ty,
                 dst_reg.class(),
                 self.getFrameAddrAlignment(frame_addr).compare(.gte, Alignment.fromLog2Units(
-                    std.math.log2_int_ceil(u10, @divExact(dst_reg.bitSize(), 8)),
+                    math.log2_int_ceil(u10, @divExact(dst_reg.bitSize(), 8)),
                 )),
             ),
             .lea_frame => .{ .move = .{ ._, .lea } },
@@ -16296,7 +16488,7 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
                     },
                     65...128 => switch (vector_len) {
                         else => null,
-                        1...2 => .{ .vp_i128, .broadcast },
+                        1...2 => .{ .v_i128, .broadcast },
                     },
                 }) orelse break :avx2;
 
@@ -16310,7 +16502,7 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
                     registerAlias(dst_reg, @intCast(vector_ty.abiSize(mod))),
                     try src_mcv.mem(self, self.memSize(scalar_ty)),
                 ) else {
-                    if (mir_tag[0] == .vp_i128) break :avx2;
+                    if (mir_tag[0] == .v_i128) break :avx2;
                     try self.genSetReg(dst_reg, scalar_ty, src_mcv, .{});
                     try self.asmRegisterRegister(
                         mir_tag,
@@ -16352,7 +16544,7 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
                     .{ if (self.hasFeature(.avx)) .vp_w else .p_w, .shufl },
                     dst_alias,
                     dst_alias,
-                    Immediate.u(0),
+                    Immediate.u(0b00_00_00_00),
                 );
                 if (switch (scalar_bits) {
                     1...8 => vector_len > 4,
@@ -16563,18 +16755,1158 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
 }
 
 fn airSelect(self: *Self, inst: Air.Inst.Index) !void {
+    // Lowers a `select` AIR instruction (operands: predicate, lhs, rhs).
+    // First a per-element mask register is materialized from the predicate
+    // operand, then lhs and rhs are combined either with a blendv instruction
+    // (when SSE4.1 or AVX is available) or with an and/andn/or sequence.
+    // Type/feature combinations that are not handled fail with a TODO error.
+    const mod = self.bin_file.comp.module.?;
     const pl_op = self.air.instructions.items(.data)[@intFromEnum(inst)].pl_op;
     const extra = self.air.extraData(Air.Bin, pl_op.payload).data;
-    _ = extra;
-    return self.fail("TODO implement airSelect for x86_64", .{});
-    //return self.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs });
+    const ty = self.typeOfIndex(inst);
+    const vec_len = ty.vectorLen(mod);
+    const elem_ty = ty.childType(mod);
+    const elem_abi_size: u32 = @intCast(elem_ty.abiSize(mod));
+    const abi_size: u32 = @intCast(ty.abiSize(mod));
+    const pred_ty = self.typeOf(pl_op.operand);
+
+    const result = result: {
+        const has_blend = self.hasFeature(.sse4_1);
+        const has_avx = self.hasFeature(.avx);
+        // With SSE4.1 but without AVX the mask is forced into xmm0 below
+        // (the legacy-encoded blendv forms take their mask implicitly in xmm0).
+        const need_xmm0 = has_blend and !has_avx;
+        const pred_mcv = try self.resolveInst(pl_op.operand);
+        const mask_reg = mask: {
+            switch (pred_mcv) {
+                .register => |pred_reg| switch (pred_reg.class()) {
+                    .general_purpose => {},
+                    .sse => if (need_xmm0 and pred_reg.id() != comptime Register.xmm0.id()) {
+                        try self.register_manager.getKnownReg(.xmm0, null);
+                        try self.genSetReg(.xmm0, pred_ty, pred_mcv, .{});
+                        break :mask .xmm0;
+                    } else break :mask if (has_blend)
+                        pred_reg
+                    else
+                        try self.copyToTmpRegister(pred_ty, pred_mcv),
+                    else => unreachable,
+                },
+                else => {},
+            }
+            const mask_reg: Register = if (need_xmm0) mask_reg: {
+                try self.register_manager.getKnownReg(.xmm0, null);
+                break :mask_reg .xmm0;
+            } else try self.register_manager.allocReg(null, abi.RegisterClass.sse);
+            const mask_alias = registerAlias(mask_reg, abi_size);
+            const mask_lock = self.register_manager.lockRegAssumeUnused(mask_reg);
+            defer self.register_manager.unlockReg(mask_lock);
+
+            const pred_fits_in_elem = vec_len <= elem_abi_size;
+            if (self.hasFeature(.avx2) and abi_size <= 32) {
+                if (pred_mcv.isRegister()) broadcast: {
+                    try self.asmRegisterRegister(
+                        .{ .v_d, .mov },
+                        mask_reg.to128(),
+                        pred_mcv.getReg().?.to32(),
+                    );
+                    if (pred_fits_in_elem and vec_len > 1) try self.asmRegisterRegister(
+                        .{ switch (elem_abi_size) {
+                            1 => .vp_b,
+                            2 => .vp_w,
+                            3...4 => .vp_d,
+                            5...8 => .vp_q,
+                            9...16 => {
+                                try self.asmRegisterRegisterRegisterImmediate(
+                                    .{ .v_f128, .insert },
+                                    mask_alias,
+                                    mask_alias,
+                                    mask_reg.to128(),
+                                    Immediate.u(1),
+                                );
+                                break :broadcast;
+                            },
+                            17...32 => break :broadcast,
+                            else => unreachable,
+                        }, .broadcast },
+                        mask_alias,
+                        mask_reg.to128(),
+                    );
+                } else try self.asmRegisterMemory(
+                    .{ switch (vec_len) {
+                        1...8 => .vp_b,
+                        9...16 => .vp_w,
+                        17...32 => .vp_d,
+                        else => unreachable,
+                    }, .broadcast },
+                    mask_alias,
+                    if (pred_mcv.isMemory()) try pred_mcv.mem(self, .byte) else .{
+                        .base = .{ .reg = (try self.copyToTmpRegister(
+                            Type.usize,
+                            pred_mcv.address(),
+                        )).to64() },
+                        .mod = .{ .rm = .{ .size = .byte } },
+                    },
+                );
+            } else if (abi_size <= 16) broadcast: {
+                try self.asmRegisterRegister(
+                    .{ if (has_avx) .v_d else ._d, .mov },
+                    mask_alias,
+                    (if (pred_mcv.isRegister())
+                        pred_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(pred_ty, pred_mcv.address())).to32(),
+                );
+                if (!pred_fits_in_elem or vec_len == 1) break :broadcast;
+                if (elem_abi_size <= 1) {
+                    if (has_avx) try self.asmRegisterRegisterRegister(
+                        .{ .vp_, .unpcklbw },
+                        mask_alias,
+                        mask_alias,
+                        mask_alias,
+                    ) else try self.asmRegisterRegister(
+                        .{ .p_, .unpcklbw },
+                        mask_alias,
+                        mask_alias,
+                    );
+                    if (abi_size <= 2) break :broadcast;
+                }
+                if (elem_abi_size <= 2) {
+                    try self.asmRegisterRegisterImmediate(
+                        .{ if (has_avx) .vp_w else .p_w, .shufl },
+                        mask_alias,
+                        mask_alias,
+                        Immediate.u(0b00_00_00_00),
+                    );
+                    if (abi_size <= 8) break :broadcast;
+                }
+                try self.asmRegisterRegisterImmediate(
+                    .{ if (has_avx) .vp_d else .p_d, .shuf },
+                    mask_alias,
+                    mask_alias,
+                    Immediate.u(switch (elem_abi_size) {
+                        1...2, 5...8 => 0b01_00_01_00,
+                        3...4 => 0b00_00_00_00,
+                        else => unreachable,
+                    }),
+                );
+            } else return self.fail("TODO implement airSelect for {}", .{ty.fmt(mod)});
+            const elem_bits: u16 = @intCast(elem_abi_size * 8);
+            const mask_elem_ty = try mod.intType(.unsigned, elem_bits);
+            const mask_ty = try mod.vectorType(.{ .len = vec_len, .child = mask_elem_ty.toIntern() });
+            if (!pred_fits_in_elem) if (self.hasFeature(.ssse3)) {
+                var mask_elems: [32]InternPool.Index = undefined;
+                for (mask_elems[0..vec_len], 0..) |*elem, bit| elem.* = try mod.intern(.{ .int = .{
+                    .ty = mask_elem_ty.toIntern(),
+                    .storage = .{ .u64 = bit / elem_bits },
+                } });
+                const mask_mcv = try self.genTypedValue(.{
+                    .ty = mask_ty,
+                    .val = Value.fromInterned(try mod.intern(.{ .aggregate = .{
+                        .ty = mask_ty.toIntern(),
+                        .storage = .{ .elems = mask_elems[0..vec_len] },
+                    } })),
+                });
+                const mask_mem: Memory = .{
+                    .base = .{ .reg = try self.copyToTmpRegister(Type.usize, mask_mcv.address()) },
+                    .mod = .{ .rm = .{ .size = self.memSize(ty) } },
+                };
+                if (has_avx) try self.asmRegisterRegisterMemory(
+                    .{ .vp_b, .shuf },
+                    mask_alias,
+                    mask_alias,
+                    mask_mem,
+                ) else try self.asmRegisterMemory(
+                    .{ .p_b, .shuf },
+                    mask_alias,
+                    mask_mem,
+                );
+            } else return self.fail("TODO implement airSelect for {}", .{ty.fmt(mod)});
+            {
+                // Isolate each element's predicate bit by and-ing with a
+                // per-element single-bit constant, then compare against the
+                // same constant so a set bit becomes an all-ones lane.
+                // NOTE(review): the compare below always uses 32-bit lanes
+                // (`.vp_d`/`.p_d`) regardless of `elem_abi_size` - confirm
+                // this is intended for element sizes other than 4 bytes.
+                var mask_elems: [32]InternPool.Index = undefined;
+                for (mask_elems[0..vec_len], 0..) |*elem, bit| elem.* = try mod.intern(.{ .int = .{
+                    .ty = mask_elem_ty.toIntern(),
+                    .storage = .{ .u64 = @as(u32, 1) << @intCast(bit & (elem_bits - 1)) },
+                } });
+                const mask_mcv = try self.genTypedValue(.{
+                    .ty = mask_ty,
+                    .val = Value.fromInterned(try mod.intern(.{ .aggregate = .{
+                        .ty = mask_ty.toIntern(),
+                        .storage = .{ .elems = mask_elems[0..vec_len] },
+                    } })),
+                });
+                const mask_mem: Memory = .{
+                    .base = .{ .reg = try self.copyToTmpRegister(Type.usize, mask_mcv.address()) },
+                    .mod = .{ .rm = .{ .size = self.memSize(ty) } },
+                };
+                if (has_avx) {
+                    try self.asmRegisterRegisterMemory(
+                        .{ .vp_, .@"and" },
+                        mask_alias,
+                        mask_alias,
+                        mask_mem,
+                    );
+                    try self.asmRegisterRegisterMemory(
+                        .{ .vp_d, .cmpeq },
+                        mask_alias,
+                        mask_alias,
+                        mask_mem,
+                    );
+                } else {
+                    try self.asmRegisterMemory(
+                        .{ .p_, .@"and" },
+                        mask_alias,
+                        mask_mem,
+                    );
+                    try self.asmRegisterMemory(
+                        .{ .p_d, .cmpeq },
+                        mask_alias,
+                        mask_mem,
+                    );
+                }
+            }
+            break :mask mask_reg;
+        };
+        const mask_alias = registerAlias(mask_reg, abi_size);
+        const mask_lock = self.register_manager.lockRegAssumeUnused(mask_reg);
+        defer self.register_manager.unlockReg(mask_lock);
+
+        const lhs_mcv = try self.resolveInst(extra.lhs);
+        const lhs_lock = switch (lhs_mcv) {
+            .register => |lhs_reg| self.register_manager.lockRegAssumeUnused(lhs_reg),
+            else => null,
+        };
+        defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock);
+
+        const rhs_mcv = try self.resolveInst(extra.rhs);
+        const rhs_lock = switch (rhs_mcv) {
+            .register => |rhs_reg| self.register_manager.lockReg(rhs_reg),
+            else => null,
+        };
+        defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock);
+
+        // The blendv paths start from rhs in the destination, the and/andn/or
+        // fallback starts from lhs. The operand index passed to reuseOperand
+        // must match the finishAir tuple below: 0 = predicate, 1 = lhs, 2 = rhs.
+        const reuse_mcv = if (has_blend) rhs_mcv else lhs_mcv;
+        const dst_mcv: MCValue = if (reuse_mcv.isRegister() and self.reuseOperand(
+            inst,
+            if (has_blend) extra.rhs else extra.lhs,
+            @as(u2, @intFromBool(has_blend)) + 1,
+            reuse_mcv,
+        )) reuse_mcv else if (has_avx)
+            .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) }
+        else
+            try self.copyToRegisterWithInstTracking(inst, ty, reuse_mcv);
+        const dst_reg = dst_mcv.getReg().?;
+        const dst_alias = registerAlias(dst_reg, abi_size);
+        const dst_lock = self.register_manager.lockReg(dst_reg);
+        defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
+
+        const mir_tag = @as(?Mir.Inst.FixedTag, switch (ty.childType(mod).zigTypeTag(mod)) {
+            else => null,
+            .Int => switch (abi_size) {
+                0 => unreachable,
+                1...16 => if (has_avx)
+                    .{ .vp_b, .blendv }
+                else if (has_blend)
+                    .{ .p_b, .blendv }
+                else
+                    .{ .p_, undefined },
+                17...32 => if (self.hasFeature(.avx2))
+                    .{ .vp_b, .blendv }
+                else
+                    null,
+                else => null,
+            },
+            .Float => switch (ty.childType(mod).floatBits(self.target.*)) {
+                else => unreachable,
+                16, 80, 128 => null,
+                32 => switch (vec_len) {
+                    0 => unreachable,
+                    1...4 => if (has_avx) .{ .v_ps, .blendv } else .{ ._ps, .blendv },
+                    5...8 => if (has_avx) .{ .v_ps, .blendv } else null,
+                    else => null,
+                },
+                64 => switch (vec_len) {
+                    0 => unreachable,
+                    1...2 => if (has_avx) .{ .v_pd, .blendv } else .{ ._pd, .blendv },
+                    3...4 => if (has_avx) .{ .v_pd, .blendv } else null,
+                    else => null,
+                },
+            },
+        }) orelse return self.fail("TODO implement airSelect for {}", .{ty.fmt(mod)});
+        if (has_avx) {
+            const rhs_alias = if (rhs_mcv.isRegister())
+                registerAlias(rhs_mcv.getReg().?, abi_size)
+            else rhs: {
+                try self.genSetReg(dst_reg, ty, rhs_mcv, .{});
+                break :rhs dst_alias;
+            };
+            if (lhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryRegister(
+                mir_tag,
+                dst_alias,
+                rhs_alias,
+                try lhs_mcv.mem(self, self.memSize(ty)),
+                mask_alias,
+            ) else try self.asmRegisterRegisterRegisterRegister(
+                mir_tag,
+                dst_alias,
+                rhs_alias,
+                registerAlias(if (lhs_mcv.isRegister())
+                    lhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(ty, lhs_mcv), abi_size),
+                mask_alias,
+            );
+        } else if (has_blend) if (lhs_mcv.isMemory()) try self.asmRegisterMemoryRegister(
+            mir_tag,
+            dst_alias,
+            try lhs_mcv.mem(self, self.memSize(ty)),
+            mask_alias,
+        ) else try self.asmRegisterRegisterRegister(
+            mir_tag,
+            dst_alias,
+            registerAlias(if (lhs_mcv.isRegister())
+                lhs_mcv.getReg().?
+            else
+                try self.copyToTmpRegister(ty, lhs_mcv), abi_size),
+            mask_alias,
+        ) else {
+            // No blendv available: dst currently holds lhs, so compute
+            // dst = (dst & mask) | (~mask & rhs), clobbering the mask register.
+            const mir_fixes = @as(?Mir.Inst.Fixes, switch (elem_ty.zigTypeTag(mod)) {
+                else => null,
+                .Int => .p_,
+                .Float => switch (elem_ty.floatBits(self.target.*)) {
+                    32 => ._ps,
+                    64 => ._pd,
+                    16, 80, 128 => null,
+                    else => unreachable,
+                },
+            }) orelse return self.fail("TODO implement airSelect for {}", .{ty.fmt(mod)});
+            try self.asmRegisterRegister(.{ mir_fixes, .@"and" }, dst_alias, mask_alias);
+            if (rhs_mcv.isMemory()) try self.asmRegisterMemory(
+                .{ mir_fixes, .andn },
+                mask_alias,
+                try rhs_mcv.mem(self, Memory.Size.fromSize(abi_size)),
+            ) else try self.asmRegisterRegister(
+                .{ mir_fixes, .andn },
+                mask_alias,
+                if (rhs_mcv.isRegister())
+                    rhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(ty, rhs_mcv),
+            );
+            try self.asmRegisterRegister(.{ mir_fixes, .@"or" }, dst_alias, mask_alias);
+        }
+        break :result dst_mcv;
+    };
+    return self.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs });
 }
 
 fn airShuffle(self: *Self, inst: Air.Inst.Index) !void {
+    const mod = self.bin_file.comp.module.?;
     const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl;
-    _ = ty_pl;
-    return self.fail("TODO implement airShuffle for x86_64", .{});
-    //return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
+    const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data;
+
+    const dst_ty = self.typeOfIndex(inst);
+    const elem_ty = dst_ty.childType(mod);
+    const elem_abi_size: u16 = @intCast(elem_ty.abiSize(mod));
+    const dst_abi_size: u32 = @intCast(dst_ty.abiSize(mod));
+    const lhs_ty = self.typeOf(extra.a);
+    const lhs_abi_size: u32 = @intCast(lhs_ty.abiSize(mod));
+    const rhs_ty = self.typeOf(extra.b);
+    const rhs_abi_size: u32 = @intCast(rhs_ty.abiSize(mod));
+    const max_abi_size = @max(dst_abi_size, lhs_abi_size, rhs_abi_size);
+
+    const ExpectedContents = [32]?i32;
+    var stack align(@max(@alignOf(ExpectedContents), @alignOf(std.heap.StackFallbackAllocator(0)))) =
+        std.heap.stackFallback(@sizeOf(ExpectedContents), self.gpa);
+    const allocator = stack.get();
+
+    const mask_elems = try allocator.alloc(?i32, extra.mask_len);
+    defer allocator.free(mask_elems);
+    for (mask_elems, 0..) |*mask_elem, elem_index| {
+        const mask_elem_val =
+            Value.fromInterned(extra.mask).elemValue(mod, elem_index) catch unreachable;
+        mask_elem.* = if (mask_elem_val.isUndef(mod))
+            null
+        else
+            @intCast(mask_elem_val.toSignedInt(mod));
+    }
+
+    const has_avx = self.hasFeature(.avx);
+    const result = @as(?MCValue, result: {
+        for (mask_elems) |mask_elem| {
+            if (mask_elem) |_| break;
+        } else break :result try self.allocRegOrMem(inst, true);
+
+        for (mask_elems, 0..) |mask_elem, elem_index| {
+            if (mask_elem orelse continue != elem_index) break;
+        } else {
+            const lhs_mcv = try self.resolveInst(extra.a);
+            if (self.reuseOperand(inst, extra.a, 0, lhs_mcv)) break :result lhs_mcv;
+            const dst_mcv = try self.allocRegOrMem(inst, true);
+            try self.genCopy(dst_ty, dst_mcv, lhs_mcv, .{});
+            break :result dst_mcv;
+        }
+
+        for (mask_elems, 0..) |mask_elem, elem_index| {
+            if (~(mask_elem orelse continue) != elem_index) break;
+        } else {
+            const rhs_mcv = try self.resolveInst(extra.b);
+            if (self.reuseOperand(inst, extra.b, 1, rhs_mcv)) break :result rhs_mcv;
+            const dst_mcv = try self.allocRegOrMem(inst, true);
+            try self.genCopy(dst_ty, dst_mcv, rhs_mcv, .{});
+            break :result dst_mcv;
+        }
+
+        for ([_]Mir.Inst.Tag{ .unpckl, .unpckh }) |variant| unpck: {
+            if (elem_abi_size > 8) break :unpck;
+            if (dst_abi_size > @as(u32, if (if (elem_abi_size >= 4)
+                has_avx
+            else
+                self.hasFeature(.avx2)) 32 else 16)) break :unpck;
+
+            var sources = [1]?u1{null} ** 2;
+            for (mask_elems, 0..) |maybe_mask_elem, elem_index| {
+                const mask_elem = maybe_mask_elem orelse continue;
+                const mask_elem_index =
+                    math.cast(u5, if (mask_elem < 0) ~mask_elem else mask_elem) orelse break :unpck;
+                const elem_byte = (elem_index >> 1) * elem_abi_size;
+                if (mask_elem_index * elem_abi_size != (elem_byte & 0b0111) | @as(u4, switch (variant) {
+                    .unpckl => 0b0000,
+                    .unpckh => 0b1000,
+                    else => unreachable,
+                }) | (elem_byte << 1 & 0b10000)) break :unpck;
+
+                const source = @intFromBool(mask_elem < 0);
+                if (sources[elem_index & 0b00001]) |prev_source| {
+                    if (source != prev_source) break :unpck;
+                } else sources[elem_index & 0b00001] = source;
+            }
+            if (sources[0] orelse break :unpck == sources[1] orelse break :unpck) break :unpck;
+
+            const operands = [2]Air.Inst.Ref{ extra.a, extra.b };
+            const operand_tys = [2]Type{ lhs_ty, rhs_ty };
+            const lhs_mcv = try self.resolveInst(operands[sources[0].?]);
+            const rhs_mcv = try self.resolveInst(operands[sources[1].?]);
+
+            const dst_mcv: MCValue = if (lhs_mcv.isRegister() and
+                self.reuseOperand(inst, operands[sources[0].?], sources[0].?, lhs_mcv))
+                lhs_mcv
+            else if (has_avx and lhs_mcv.isRegister())
+                .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) }
+            else
+                try self.copyToRegisterWithInstTracking(inst, operand_tys[sources[0].?], lhs_mcv);
+            const dst_reg = dst_mcv.getReg().?;
+            const dst_alias = registerAlias(dst_reg, max_abi_size);
+
+            const mir_tag: Mir.Inst.FixedTag = if ((elem_abi_size >= 4 and elem_ty.isRuntimeFloat()) or
+                (dst_abi_size > 16 and !self.hasFeature(.avx2))) .{ switch (elem_abi_size) {
+                4 => if (has_avx) .v_ps else ._ps,
+                8 => if (has_avx) .v_pd else ._pd,
+                else => unreachable,
+            }, variant } else .{ if (has_avx) .vp_ else .p_, switch (variant) {
+                .unpckl => switch (elem_abi_size) {
+                    1 => .unpcklbw,
+                    2 => .unpcklwd,
+                    4 => .unpckldq,
+                    8 => .unpcklqdq,
+                    else => unreachable,
+                },
+                .unpckh => switch (elem_abi_size) {
+                    1 => .unpckhbw,
+                    2 => .unpckhwd,
+                    4 => .unpckhdq,
+                    8 => .unpckhqdq,
+                    else => unreachable,
+                },
+                else => unreachable,
+            } };
+            if (has_avx) if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemory(
+                mir_tag,
+                dst_alias,
+                registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size),
+                try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)),
+            ) else try self.asmRegisterRegisterRegister(
+                mir_tag,
+                dst_alias,
+                registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size),
+                registerAlias(if (rhs_mcv.isRegister())
+                    rhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(operand_tys[sources[1].?], rhs_mcv), max_abi_size),
+            ) else if (rhs_mcv.isMemory()) try self.asmRegisterMemory(
+                mir_tag,
+                dst_alias,
+                try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)),
+            ) else try self.asmRegisterRegister(
+                mir_tag,
+                dst_alias,
+                registerAlias(if (rhs_mcv.isRegister())
+                    rhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(operand_tys[sources[1].?], rhs_mcv), max_abi_size),
+            );
+            break :result dst_mcv;
+        }
+
+        pshufd: {
+            // Lowers a shuffle of 4-byte elements whose non-null mask elements all read
+            // from the same operand to a single `.shuf` instruction on that operand.
+            if (elem_abi_size != 4) break :pshufd;
+            // NOTE(review): the 256-bit integer form of this shuffle is an AVX2
+            // instruction; confirm that `has_avx` alone is a sufficient guard here.
+            if (max_abi_size > @as(u32, if (has_avx) 32 else 16)) break :pshufd;
+
+            var control: u8 = 0b00_00_00_00;
+            // Operand read by every non-null mask element seen so far (0 selects `extra.a`,
+            // 1 selects `extra.b`); null until the first non-null mask element.
+            // This used to be a one-element array that was written at index
+            // `(elem_index & 0b010) >> 1`, which is out of bounds whenever the first
+            // non-null mask element sits at position 2, 3, 6 or 7.
+            var source: ?u1 = null;
+            for (mask_elems, 0..) |maybe_mask_elem, elem_index| {
+                // Null mask elements place no constraint on the result.
+                const mask_elem = maybe_mask_elem orelse continue;
+                // Negative mask elements hold the bitwise complement of their element index.
+                const mask_elem_index: u3 = @intCast(if (mask_elem < 0) ~mask_elem else mask_elem);
+                // An element may only move within its own group of four elements.
+                if (mask_elem_index & 0b100 != elem_index & 0b100) break :pshufd;
+
+                const elem_source = @intFromBool(mask_elem < 0);
+                if (source) |prev_source| {
+                    if (elem_source != prev_source) break :pshufd;
+                } else source = elem_source;
+
+                const select_bit: u3 = @intCast((elem_index & 0b011) << 1);
+                const select = @as(u8, @intCast(mask_elem_index & 0b011)) << select_bit;
+                // Elements 0-3 fill in the control byte; elements 4-7 share that same
+                // control byte and therefore have to agree with what is already there.
+                if (elem_index & 0b100 == 0)
+                    control |= select
+                else if (control & @as(u8, 0b11) << select_bit != select) break :pshufd;
+            }
+
+            const operands = [2]Air.Inst.Ref{ extra.a, extra.b };
+            const operand_tys = [2]Type{ lhs_ty, rhs_ty };
+            // A mask with no non-null elements is left to the other strategies.
+            const source_index = source orelse break :pshufd;
+            const src_mcv = try self.resolveInst(operands[source_index]);
+
+            const dst_reg = if (src_mcv.isRegister() and
+                self.reuseOperand(inst, operands[source_index], source_index, src_mcv))
+                src_mcv.getReg().?
+            else
+                try self.register_manager.allocReg(inst, abi.RegisterClass.sse);
+            const dst_alias = registerAlias(dst_reg, max_abi_size);
+
+            if (src_mcv.isMemory()) try self.asmRegisterMemoryImmediate(
+                .{ if (has_avx) .vp_d else .p_d, .shuf },
+                dst_alias,
+                try src_mcv.mem(self, Memory.Size.fromSize(max_abi_size)),
+                Immediate.u(control),
+            ) else try self.asmRegisterRegisterImmediate(
+                .{ if (has_avx) .vp_d else .p_d, .shuf },
+                dst_alias,
+                registerAlias(if (src_mcv.isRegister())
+                    src_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(operand_tys[source_index], src_mcv), max_abi_size),
+                Immediate.u(control),
+            );
+            break :result .{ .register = dst_reg };
+        }
+
+        shufps: {
+            // Lowers a shuffle of 4-byte elements to one two-operand `.shuf` instruction:
+            // within each group of four, result elements 0-1 must all read from one
+            // operand and result elements 2-3 must all read from the other.
+            const widest_vector: u32 = if (has_avx) 32 else 16;
+            if (elem_abi_size != 4 or max_abi_size > widest_vector) break :shufps;
+
+            var imm: u8 = 0b00_00_00_00;
+            // Operand (0 = `extra.a`, 1 = `extra.b`) read by each pair of result elements.
+            var pair_sources = [1]?u1{null} ** 2;
+            for (mask_elems, 0..) |maybe_mask_elem, elem_index| {
+                const mask_elem = maybe_mask_elem orelse continue;
+                const from_rhs = mask_elem < 0;
+                const src_index: u3 = @intCast(if (from_rhs) ~mask_elem else mask_elem);
+                // An element may only move within its own group of four elements.
+                if (src_index & 0b100 != elem_index & 0b100) break :shufps;
+
+                const pair = (elem_index & 0b010) >> 1;
+                const operand_index = @intFromBool(from_rhs);
+                if (pair_sources[pair]) |seen| {
+                    if (operand_index != seen) break :shufps;
+                } else pair_sources[pair] = operand_index;
+
+                const shift: u3 = @intCast((elem_index & 0b011) << 1);
+                const field = @as(u8, @intCast(src_index & 0b011)) << shift;
+                if (elem_index & 0b100 != 0) {
+                    // The second group of four reuses the same immediate.
+                    if (imm & (@as(u8, 0b11) << shift) != field) break :shufps;
+                } else imm |= field;
+            }
+            const first = pair_sources[0] orelse break :shufps;
+            const second = pair_sources[1] orelse break :shufps;
+            if (first == second) break :shufps;
+
+            const inputs = [2]Air.Inst.Ref{ extra.a, extra.b };
+            const input_tys = [2]Type{ lhs_ty, rhs_ty };
+            const lhs_mcv = try self.resolveInst(inputs[first]);
+            const rhs_mcv = try self.resolveInst(inputs[second]);
+
+            const dst_mcv: MCValue = dst: {
+                if (lhs_mcv.isRegister()) {
+                    if (self.reuseOperand(inst, inputs[first], first, lhs_mcv)) break :dst lhs_mcv;
+                    if (has_avx) break :dst .{
+                        .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse),
+                    };
+                }
+                break :dst try self.copyToRegisterWithInstTracking(inst, input_tys[first], lhs_mcv);
+            };
+            const dst_reg = dst_mcv.getReg().?;
+            const dst_alias = registerAlias(dst_reg, max_abi_size);
+
+            if (has_avx) {
+                // Three-operand form: the first source does not have to be the destination.
+                const lhs_alias = registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size);
+                if (rhs_mcv.isMemory()) {
+                    try self.asmRegisterRegisterMemoryImmediate(
+                        .{ .v_ps, .shuf },
+                        dst_alias,
+                        lhs_alias,
+                        try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)),
+                        Immediate.u(imm),
+                    );
+                } else {
+                    const rhs_reg = if (rhs_mcv.isRegister())
+                        rhs_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(input_tys[second], rhs_mcv);
+                    try self.asmRegisterRegisterRegisterImmediate(
+                        .{ .v_ps, .shuf },
+                        dst_alias,
+                        lhs_alias,
+                        registerAlias(rhs_reg, max_abi_size),
+                        Immediate.u(imm),
+                    );
+                }
+            } else if (rhs_mcv.isMemory()) {
+                try self.asmRegisterMemoryImmediate(
+                    .{ ._ps, .shuf },
+                    dst_alias,
+                    try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)),
+                    Immediate.u(imm),
+                );
+            } else {
+                const rhs_reg = if (rhs_mcv.isRegister())
+                    rhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(input_tys[second], rhs_mcv);
+                try self.asmRegisterRegisterImmediate(
+                    .{ ._ps, .shuf },
+                    dst_alias,
+                    registerAlias(rhs_reg, max_abi_size),
+                    Immediate.u(imm),
+                );
+            }
+            break :result dst_mcv;
+        }
+
+        shufpd: {
+            // Lowers a shuffle of 8-byte elements to one two-operand `.shuf` instruction:
+            // even result elements must all read from one operand and odd result
+            // elements must all read from the other.
+            const widest_vector: u32 = if (has_avx) 32 else 16;
+            if (elem_abi_size != 8 or max_abi_size > widest_vector) break :shufpd;
+
+            var imm: u4 = 0b0_0_0_0;
+            // Operand (0 = `extra.a`, 1 = `extra.b`) read by even/odd result elements.
+            var parity_sources = [1]?u1{null} ** 2;
+            for (mask_elems, 0..) |maybe_mask_elem, elem_index| {
+                const mask_elem = maybe_mask_elem orelse continue;
+                const from_rhs = mask_elem < 0;
+                const src_index: u2 = @intCast(if (from_rhs) ~mask_elem else mask_elem);
+                // An element may only move within its own pair of elements.
+                if (src_index & 0b10 != elem_index & 0b10) break :shufpd;
+
+                const parity = elem_index & 0b01;
+                const operand_index = @intFromBool(from_rhs);
+                if (parity_sources[parity]) |seen| {
+                    if (operand_index != seen) break :shufpd;
+                } else parity_sources[parity] = operand_index;
+
+                // One immediate bit per result element.
+                imm |= @as(u4, @intCast(src_index & 0b01)) << @intCast(elem_index);
+            }
+            const first = parity_sources[0] orelse break :shufpd;
+            const second = parity_sources[1] orelse break :shufpd;
+            if (first == second) break :shufpd;
+
+            const inputs = [2]Air.Inst.Ref{ extra.a, extra.b };
+            const input_tys = [2]Type{ lhs_ty, rhs_ty };
+            const lhs_mcv = try self.resolveInst(inputs[first]);
+            const rhs_mcv = try self.resolveInst(inputs[second]);
+
+            const dst_mcv: MCValue = dst: {
+                if (lhs_mcv.isRegister()) {
+                    if (self.reuseOperand(inst, inputs[first], first, lhs_mcv)) break :dst lhs_mcv;
+                    if (has_avx) break :dst .{
+                        .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse),
+                    };
+                }
+                break :dst try self.copyToRegisterWithInstTracking(inst, input_tys[first], lhs_mcv);
+            };
+            const dst_reg = dst_mcv.getReg().?;
+            const dst_alias = registerAlias(dst_reg, max_abi_size);
+
+            if (has_avx) {
+                // Three-operand form: the first source does not have to be the destination.
+                const lhs_alias = registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size);
+                if (rhs_mcv.isMemory()) {
+                    try self.asmRegisterRegisterMemoryImmediate(
+                        .{ .v_pd, .shuf },
+                        dst_alias,
+                        lhs_alias,
+                        try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)),
+                        Immediate.u(imm),
+                    );
+                } else {
+                    const rhs_reg = if (rhs_mcv.isRegister())
+                        rhs_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(input_tys[second], rhs_mcv);
+                    try self.asmRegisterRegisterRegisterImmediate(
+                        .{ .v_pd, .shuf },
+                        dst_alias,
+                        lhs_alias,
+                        registerAlias(rhs_reg, max_abi_size),
+                        Immediate.u(imm),
+                    );
+                }
+            } else if (rhs_mcv.isMemory()) {
+                try self.asmRegisterMemoryImmediate(
+                    .{ ._pd, .shuf },
+                    dst_alias,
+                    try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)),
+                    Immediate.u(imm),
+                );
+            } else {
+                const rhs_reg = if (rhs_mcv.isRegister())
+                    rhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(input_tys[second], rhs_mcv);
+                try self.asmRegisterRegisterImmediate(
+                    .{ ._pd, .shuf },
+                    dst_alias,
+                    registerAlias(rhs_reg, max_abi_size),
+                    Immediate.u(imm),
+                );
+            }
+            break :result dst_mcv;
+        }
+
+        blend: {
+            // Lowers a shuffle that leaves every element at its own position, taking it
+            // from either operand, to a single `.blend` instruction with an immediate.
+            const widest_vector: u32 = if (has_avx) 32 else 16;
+            if (elem_abi_size < 2 or dst_abi_size > widest_vector) break :blend;
+            if (!self.hasFeature(.sse4_1)) break :blend;
+
+            // Bit N is set when result element N is taken from `extra.b`.
+            var rhs_lanes: u8 = 0b0_0_0_0_0_0_0_0;
+            for (mask_elems, 0..) |maybe_mask_elem, elem_index| {
+                const mask_elem = maybe_mask_elem orelse continue;
+                const from_rhs = mask_elem < 0;
+                const src_index =
+                    math.cast(u4, if (from_rhs) ~mask_elem else mask_elem) orelse break :blend;
+                if (src_index != elem_index) break :blend;
+
+                const lane_bit = @as(u8, @intFromBool(from_rhs)) << @truncate(elem_index);
+                if (elem_index & 0b1000 != 0) {
+                    // Elements 8 and up reuse the bits recorded for elements 0-7.
+                    if (rhs_lanes & (@as(u8, 0b1) << @truncate(elem_index)) != lane_bit) break :blend;
+                } else rhs_lanes |= lane_bit;
+            }
+
+            if (!elem_ty.isRuntimeFloat() and self.hasFeature(.avx2)) vpblendd: {
+                // Immediate with one bit per 4 bytes.
+                const dword_lanes: u8 = switch (elem_abi_size) {
+                    4 => rhs_lanes,
+                    8 => widen: {
+                        var widened: u8 = 0b00_00_00_00;
+                        for (0..4) |qword| {
+                            if (rhs_lanes & (@as(u8, 0b1) << @intCast(qword)) != 0)
+                                widened |= @as(u8, 0b11) << @intCast(2 * qword);
+                        }
+                        break :widen widened;
+                    },
+                    else => break :vpblendd,
+                };
+
+                const lhs_mcv = try self.resolveInst(extra.a);
+                const lhs_reg = if (lhs_mcv.isRegister())
+                    lhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(dst_ty, lhs_mcv);
+                const lhs_lock = self.register_manager.lockReg(lhs_reg);
+                defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock);
+
+                const rhs_mcv = try self.resolveInst(extra.b);
+                const out_reg = try self.register_manager.allocReg(inst, abi.RegisterClass.sse);
+                const out_alias = registerAlias(out_reg, dst_abi_size);
+                const lhs_alias = registerAlias(lhs_reg, dst_abi_size);
+                if (rhs_mcv.isMemory()) {
+                    try self.asmRegisterRegisterMemoryImmediate(
+                        .{ .vp_d, .blend },
+                        out_alias,
+                        lhs_alias,
+                        try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                        Immediate.u(dword_lanes),
+                    );
+                } else {
+                    const rhs_reg = if (rhs_mcv.isRegister())
+                        rhs_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(dst_ty, rhs_mcv);
+                    try self.asmRegisterRegisterRegisterImmediate(
+                        .{ .vp_d, .blend },
+                        out_alias,
+                        lhs_alias,
+                        registerAlias(rhs_reg, dst_abi_size),
+                        Immediate.u(dword_lanes),
+                    );
+                }
+                break :result .{ .register = out_reg };
+            }
+
+            if (!elem_ty.isRuntimeFloat() or elem_abi_size == 2) pblendw: {
+                // Immediate with one bit per 2 bytes. For vectors wider than 16 bytes the
+                // second half must request the same selection as the first half.
+                const word_lanes: u8 = switch (elem_abi_size) {
+                    2 => rhs_lanes,
+                    4 => widen_dwords: {
+                        const halves_agree = dst_abi_size <= 16 or
+                            @as(u4, @intCast(rhs_lanes >> 4)) == @as(u4, @truncate(rhs_lanes >> 0));
+                        if (!halves_agree) break :pblendw;
+                        var widened: u8 = 0b00_00_00_00;
+                        for (0..4) |dword| {
+                            if (rhs_lanes & (@as(u8, 0b1) << @intCast(dword)) != 0)
+                                widened |= @as(u8, 0b11) << @intCast(2 * dword);
+                        }
+                        break :widen_dwords widened;
+                    },
+                    8 => widen_qwords: {
+                        const halves_agree = dst_abi_size <= 16 or
+                            @as(u2, @intCast(rhs_lanes >> 2)) == @as(u2, @truncate(rhs_lanes >> 0));
+                        if (!halves_agree) break :pblendw;
+                        var widened: u8 = 0b0000_0000;
+                        for (0..2) |qword| {
+                            if (rhs_lanes & (@as(u8, 0b1) << @intCast(qword)) != 0)
+                                widened |= @as(u8, 0b1111) << @intCast(4 * qword);
+                        }
+                        break :widen_qwords widened;
+                    },
+                    16 => break :pblendw,
+                    else => unreachable,
+                };
+
+                const lhs_mcv = try self.resolveInst(extra.a);
+                const rhs_mcv = try self.resolveInst(extra.b);
+
+                const dst_mcv: MCValue = dst: {
+                    if (lhs_mcv.isRegister()) {
+                        if (self.reuseOperand(inst, extra.a, 0, lhs_mcv)) break :dst lhs_mcv;
+                        if (has_avx) break :dst .{
+                            .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse),
+                        };
+                    }
+                    break :dst try self.copyToRegisterWithInstTracking(inst, dst_ty, lhs_mcv);
+                };
+                const out_reg = dst_mcv.getReg().?;
+                const out_alias = registerAlias(out_reg, dst_abi_size);
+
+                if (has_avx) {
+                    const lhs_alias = registerAlias(if (lhs_mcv.isRegister())
+                        lhs_mcv.getReg().?
+                    else
+                        out_reg, dst_abi_size);
+                    if (rhs_mcv.isMemory()) {
+                        try self.asmRegisterRegisterMemoryImmediate(
+                            .{ .vp_w, .blend },
+                            out_alias,
+                            lhs_alias,
+                            try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                            Immediate.u(word_lanes),
+                        );
+                    } else {
+                        const rhs_reg = if (rhs_mcv.isRegister())
+                            rhs_mcv.getReg().?
+                        else
+                            try self.copyToTmpRegister(dst_ty, rhs_mcv);
+                        try self.asmRegisterRegisterRegisterImmediate(
+                            .{ .vp_w, .blend },
+                            out_alias,
+                            lhs_alias,
+                            registerAlias(rhs_reg, dst_abi_size),
+                            Immediate.u(word_lanes),
+                        );
+                    }
+                } else if (rhs_mcv.isMemory()) {
+                    try self.asmRegisterMemoryImmediate(
+                        .{ .p_w, .blend },
+                        out_alias,
+                        try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                        Immediate.u(word_lanes),
+                    );
+                } else {
+                    const rhs_reg = if (rhs_mcv.isRegister())
+                        rhs_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(dst_ty, rhs_mcv);
+                    try self.asmRegisterRegisterImmediate(
+                        .{ .p_w, .blend },
+                        out_alias,
+                        registerAlias(rhs_reg, dst_abi_size),
+                        Immediate.u(word_lanes),
+                    );
+                }
+                break :result .{ .register = out_reg };
+            }
+
+            // Remaining cases use the `_ps`/`_pd` blends; 16-byte elements are blended
+            // as two 8-byte halves each.
+            const float_lanes: u8 = switch (elem_abi_size) {
+                4, 8 => rhs_lanes,
+                16 => @as(u8, if (rhs_lanes & 0b01 != 0) 0b00_11 else 0b00_00) |
+                    @as(u8, if (rhs_lanes & 0b10 != 0) 0b11_00 else 0b00_00),
+                else => unreachable,
+            };
+            const blend_fixes: Mir.Inst.Fixes = switch (elem_abi_size) {
+                4 => if (has_avx) .v_ps else ._ps,
+                8, 16 => if (has_avx) .v_pd else ._pd,
+                else => unreachable,
+            };
+
+            const lhs_mcv = try self.resolveInst(extra.a);
+            const rhs_mcv = try self.resolveInst(extra.b);
+
+            const dst_mcv: MCValue = dst: {
+                if (lhs_mcv.isRegister()) {
+                    if (self.reuseOperand(inst, extra.a, 0, lhs_mcv)) break :dst lhs_mcv;
+                    if (has_avx) break :dst .{
+                        .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse),
+                    };
+                }
+                break :dst try self.copyToRegisterWithInstTracking(inst, dst_ty, lhs_mcv);
+            };
+            const out_reg = dst_mcv.getReg().?;
+            const out_alias = registerAlias(out_reg, dst_abi_size);
+
+            if (has_avx) {
+                const lhs_alias = registerAlias(if (lhs_mcv.isRegister())
+                    lhs_mcv.getReg().?
+                else
+                    out_reg, dst_abi_size);
+                if (rhs_mcv.isMemory()) {
+                    try self.asmRegisterRegisterMemoryImmediate(
+                        .{ blend_fixes, .blend },
+                        out_alias,
+                        lhs_alias,
+                        try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                        Immediate.u(float_lanes),
+                    );
+                } else {
+                    const rhs_reg = if (rhs_mcv.isRegister())
+                        rhs_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(dst_ty, rhs_mcv);
+                    try self.asmRegisterRegisterRegisterImmediate(
+                        .{ blend_fixes, .blend },
+                        out_alias,
+                        lhs_alias,
+                        registerAlias(rhs_reg, dst_abi_size),
+                        Immediate.u(float_lanes),
+                    );
+                }
+            } else if (rhs_mcv.isMemory()) {
+                try self.asmRegisterMemoryImmediate(
+                    .{ blend_fixes, .blend },
+                    out_alias,
+                    try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                    Immediate.u(float_lanes),
+                );
+            } else {
+                const rhs_reg = if (rhs_mcv.isRegister())
+                    rhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(dst_ty, rhs_mcv);
+                try self.asmRegisterRegisterImmediate(
+                    .{ blend_fixes, .blend },
+                    out_alias,
+                    registerAlias(rhs_reg, dst_abi_size),
+                    Immediate.u(float_lanes),
+                );
+            }
+            break :result .{ .register = out_reg };
+        }
+
+        blendv: {
+            // Lowers a shuffle that leaves every element at its own position, taking it
+            // from either operand, to a `.blendv` instruction driven by a constant
+            // per-element select mask, or to and/andn/or when sse4_1 is unavailable.
+            if (dst_abi_size > @as(u32, if (if (elem_abi_size >= 4)
+                has_avx
+            else
+                self.hasFeature(.avx2)) 32 else 16)) break :blendv;
+
+            const select_mask_elem_ty = try mod.intType(.unsigned, elem_abi_size * 8);
+            const select_mask_ty = try mod.vectorType(.{
+                .len = @intCast(mask_elems.len),
+                .child = select_mask_elem_ty.toIntern(),
+            });
+            // An all-zero mask element keeps the `extra.a` element, an all-one mask
+            // element takes the `extra.b` element.
+            const select_lhs_elem =
+                (try select_mask_elem_ty.minIntScalar(mod, select_mask_elem_ty)).toIntern();
+            const select_rhs_elem =
+                (try select_mask_elem_ty.maxIntScalar(mod, select_mask_elem_ty)).toIntern();
+            var select_mask_elems: [32]InternPool.Index = undefined;
+            for (
+                select_mask_elems[0..mask_elems.len],
+                mask_elems,
+                0..,
+            ) |*select_mask_elem, maybe_mask_elem, elem_index| {
+                const mask_elem = maybe_mask_elem orelse {
+                    // A null mask element places no constraint on the result, but its
+                    // slot must still be initialized before the aggregate is interned;
+                    // it used to be left `undefined`.
+                    select_mask_elem.* = select_lhs_elem;
+                    continue;
+                };
+                const mask_elem_index =
+                    math.cast(u5, if (mask_elem < 0) ~mask_elem else mask_elem) orelse break :blendv;
+                if (mask_elem_index != elem_index) break :blendv;
+
+                select_mask_elem.* = if (mask_elem < 0) select_rhs_elem else select_lhs_elem;
+            }
+            const select_mask_mcv = try self.genTypedValue(.{
+                .ty = select_mask_ty,
+                .val = Value.fromInterned(try mod.intern(.{ .aggregate = .{
+                    .ty = select_mask_ty.toIntern(),
+                    .storage = .{ .elems = select_mask_elems[0..mask_elems.len] },
+                } })),
+            });
+
+            if (self.hasFeature(.sse4_1)) {
+                const mir_tag: Mir.Inst.FixedTag = .{
+                    if ((elem_abi_size >= 4 and elem_ty.isRuntimeFloat()) or
+                        (dst_abi_size > 16 and !self.hasFeature(.avx2))) switch (elem_abi_size) {
+                        4 => if (has_avx) .v_ps else ._ps,
+                        8 => if (has_avx) .v_pd else ._pd,
+                        else => unreachable,
+                    } else if (has_avx) .vp_b else .p_b,
+                    .blendv,
+                };
+
+                // Without avx the select mask is placed in xmm0. The whole mask vector
+                // is loaded: this used to pass the scalar element type, unlike the avx
+                // path below which already used the vector type.
+                const select_mask_reg = if (!has_avx) reg: {
+                    try self.register_manager.getKnownReg(.xmm0, null);
+                    try self.genSetReg(.xmm0, select_mask_ty, select_mask_mcv, .{});
+                    break :reg .xmm0;
+                } else try self.copyToTmpRegister(select_mask_ty, select_mask_mcv);
+                const select_mask_alias = registerAlias(select_mask_reg, dst_abi_size);
+                const select_mask_lock = self.register_manager.lockRegAssumeUnused(select_mask_reg);
+                defer self.register_manager.unlockReg(select_mask_lock);
+
+                // Operands are resolved only after the mask register has been claimed.
+                const lhs_mcv = try self.resolveInst(extra.a);
+                const rhs_mcv = try self.resolveInst(extra.b);
+
+                const dst_mcv: MCValue = if (lhs_mcv.isRegister() and
+                    self.reuseOperand(inst, extra.a, 0, lhs_mcv))
+                    lhs_mcv
+                else if (has_avx and lhs_mcv.isRegister())
+                    .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) }
+                else
+                    try self.copyToRegisterWithInstTracking(inst, dst_ty, lhs_mcv);
+                const dst_reg = dst_mcv.getReg().?;
+                const dst_alias = registerAlias(dst_reg, dst_abi_size);
+
+                if (has_avx) if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryRegister(
+                    mir_tag,
+                    dst_alias,
+                    if (lhs_mcv.isRegister())
+                        registerAlias(lhs_mcv.getReg().?, dst_abi_size)
+                    else
+                        dst_alias,
+                    try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                    select_mask_alias,
+                ) else try self.asmRegisterRegisterRegisterRegister(
+                    mir_tag,
+                    dst_alias,
+                    if (lhs_mcv.isRegister())
+                        registerAlias(lhs_mcv.getReg().?, dst_abi_size)
+                    else
+                        dst_alias,
+                    registerAlias(if (rhs_mcv.isRegister())
+                        rhs_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size),
+                    select_mask_alias,
+                ) else if (rhs_mcv.isMemory()) try self.asmRegisterMemoryRegister(
+                    mir_tag,
+                    dst_alias,
+                    try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                    select_mask_alias,
+                ) else try self.asmRegisterRegisterRegister(
+                    mir_tag,
+                    dst_alias,
+                    registerAlias(if (rhs_mcv.isRegister())
+                        rhs_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size),
+                    select_mask_alias,
+                );
+                break :result dst_mcv;
+            }
+
+            // Fallback without sse4_1: dst = (rhs & mask) | (lhs & ~mask).
+            const lhs_mcv = try self.resolveInst(extra.a);
+            const rhs_mcv = try self.resolveInst(extra.b);
+
+            const dst_mcv: MCValue = if (rhs_mcv.isRegister() and
+                self.reuseOperand(inst, extra.b, 1, rhs_mcv))
+                rhs_mcv
+            else
+                try self.copyToRegisterWithInstTracking(inst, dst_ty, rhs_mcv);
+            const dst_reg = dst_mcv.getReg().?;
+            const dst_alias = registerAlias(dst_reg, dst_abi_size);
+
+            const mask_reg = try self.copyToTmpRegister(select_mask_ty, select_mask_mcv);
+            const mask_alias = registerAlias(mask_reg, dst_abi_size);
+            const mask_lock = self.register_manager.lockRegAssumeUnused(mask_reg);
+            defer self.register_manager.unlockReg(mask_lock);
+
+            const mir_fixes: Mir.Inst.Fixes = if (elem_ty.isRuntimeFloat())
+                switch (elem_ty.floatBits(self.target.*)) {
+                    16, 80, 128 => .p_,
+                    32 => ._ps,
+                    64 => ._pd,
+                    else => unreachable,
+                }
+            else
+                .p_;
+            try self.asmRegisterRegister(.{ mir_fixes, .@"and" }, dst_alias, mask_alias);
+            if (lhs_mcv.isMemory()) try self.asmRegisterMemory(
+                .{ mir_fixes, .andn },
+                mask_alias,
+                try lhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+            ) else try self.asmRegisterRegister(
+                .{ mir_fixes, .andn },
+                mask_alias,
+                // Aliased to the operation size like every other register operand here.
+                registerAlias(if (lhs_mcv.isRegister())
+                    lhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(dst_ty, lhs_mcv), dst_abi_size),
+            );
+            try self.asmRegisterRegister(.{ mir_fixes, .@"or" }, dst_alias, mask_alias);
+            break :result dst_mcv;
+        }
+
+        pshufb: {
+            // General fallback for vectors of at most 16 bytes: byte-shuffle a copy of
+            // each operand with a constant selector that zeroes every byte belonging to
+            // the other operand, then OR the two shuffled copies together.
+            if (max_abi_size > 16 or !self.hasFeature(.ssse3)) break :pshufb;
+
+            // Index 0 holds the `extra.a` copy (and the result), index 1 the `extra.b` copy.
+            const side_regs =
+                try self.register_manager.allocRegs(2, .{ inst, null }, abi.RegisterClass.sse);
+            const side_locks = self.register_manager.lockRegsAssumeUnused(2, side_regs);
+            defer for (side_locks) |lock| self.register_manager.unlockReg(lock);
+
+            try self.genSetReg(side_regs[0], lhs_ty, .{ .air_ref = extra.a }, .{});
+            try self.genSetReg(side_regs[1], rhs_ty, .{ .air_ref = extra.b }, .{});
+
+            for (side_regs, 0..) |side_reg, side| {
+                const wants_rhs = side == 1;
+                const side_alias = registerAlias(side_reg, max_abi_size);
+
+                // One selector byte per result byte; 0b1_00_00000 produces a zero byte.
+                var selector_elems: [16]InternPool.Index = undefined;
+                for (selector_elems[0..max_abi_size], 0..) |*selector_elem, byte_index| {
+                    const elem_index = byte_index / elem_abi_size;
+                    const selector: u64 = value: {
+                        if (elem_index >= mask_elems.len) break :value 0b1_00_00000;
+                        const mask_elem = mask_elems[elem_index] orelse break :value 0b1_00_00000;
+                        // Negative mask elements read `extra.b`, the rest read `extra.a`.
+                        if ((mask_elem < 0) != wants_rhs) break :value 0b1_00_00000;
+                        const src_index: u31 = @intCast(if (wants_rhs) ~mask_elem else mask_elem);
+                        const byte_off: u32 = @intCast(byte_index % elem_abi_size);
+                        break :value @intCast(src_index * elem_abi_size + byte_off);
+                    };
+                    selector_elem.* = try mod.intern(.{ .int = .{
+                        .ty = .u8_type,
+                        .storage = .{ .u64 = selector },
+                    } });
+                }
+                const selector_ty = try mod.vectorType(.{ .len = max_abi_size, .child = .u8_type });
+                const selector_mcv = try self.genTypedValue(.{
+                    .ty = selector_ty,
+                    .val = Value.fromInterned(try mod.intern(.{ .aggregate = .{
+                        .ty = selector_ty.toIntern(),
+                        .storage = .{ .elems = selector_elems[0..max_abi_size] },
+                    } })),
+                });
+                const selector_mem: Memory = .{
+                    .base = .{ .reg = try self.copyToTmpRegister(Type.usize, selector_mcv.address()) },
+                    .mod = .{ .rm = .{ .size = Memory.Size.fromSize(@max(max_abi_size, 16)) } },
+                };
+                if (has_avx) {
+                    try self.asmRegisterRegisterMemory(
+                        .{ .vp_b, .shuf },
+                        side_alias,
+                        side_alias,
+                        selector_mem,
+                    );
+                } else {
+                    try self.asmRegisterMemory(.{ .p_b, .shuf }, side_alias, selector_mem);
+                }
+            }
+
+            // Element types without a matching OR instruction give up on the whole shuffle.
+            const or_fixes: Mir.Inst.Fixes = switch (elem_ty.zigTypeTag(mod)) {
+                else => break :result null,
+                .Int => if (has_avx) .vp_ else .p_,
+                .Float => switch (elem_ty.floatBits(self.target.*)) {
+                    32 => if (has_avx) .v_ps else ._ps,
+                    64 => if (has_avx) .v_pd else ._pd,
+                    16, 80, 128 => break :result null,
+                    else => unreachable,
+                },
+            };
+            const lhs_side_alias = registerAlias(side_regs[0], max_abi_size);
+            const rhs_side_alias = registerAlias(side_regs[1], max_abi_size);
+            if (has_avx) {
+                try self.asmRegisterRegisterRegister(
+                    .{ or_fixes, .@"or" },
+                    lhs_side_alias,
+                    lhs_side_alias,
+                    rhs_side_alias,
+                );
+            } else {
+                try self.asmRegisterRegister(.{ or_fixes, .@"or" }, lhs_side_alias, rhs_side_alias);
+            }
+            break :result .{ .register = side_regs[0] };
+        }
+
+        break :result null;
+    }) orelse return self.fail("TODO implement airShuffle from {} and {} to {} with {}", .{
+        lhs_ty.fmt(mod), rhs_ty.fmt(mod), dst_ty.fmt(mod),
+        Value.fromInterned(extra.mask).fmtValue(
+            Type.fromInterned(mod.intern_pool.typeOf(extra.mask)),
+            mod,
+        ),
+    });
+    return self.finishAir(inst, result, .{ extra.a, extra.b, .none });
 }
 
 fn airReduce(self: *Self, inst: Air.Inst.Index) !void {
@@ -16751,7 +18083,7 @@ fn airAggregateInit(self: *Self, inst: Air.Inst.Index) !void {
             },
             .Array, .Vector => {
                 const elem_ty = result_ty.childType(mod);
-                if (result_ty.isVector(mod) and elem_ty.bitSize(mod) == 1) {
+                if (result_ty.isVector(mod) and elem_ty.toIntern() == .bool_type) {
                     const result_size: u32 = @intCast(result_ty.abiSize(mod));
                     const dst_reg = try self.register_manager.allocReg(inst, abi.RegisterClass.gp);
                     try self.asmRegisterRegister(
@@ -17801,7 +19133,7 @@ fn splitType(self: *Self, ty: Type) ![2]Type {
                 else => unreachable,
             },
             .float => Type.f32,
-            .float_combine => try mod.vectorType(.{ .len = 2, .child = .f32_type }),
+            .float_combine => try mod.arrayType(.{ .len = 2, .child = .f32_type }),
             .sse => Type.f64,
             else => break,
         };
diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig
@@ -324,16 +324,19 @@ pub const Mnemonic = enum {
     // SSE3
     movddup, movshdup, movsldup,
     // SSSE3
-    pabsb, pabsd, pabsw, palignr,
+    pabsb, pabsd, pabsw, palignr, pshufb,
     // SSE4.1
     blendpd, blendps, blendvpd, blendvps,
     extractps,
     insertps,
     packusdw,
+    pblendvb, pblendw,
     pcmpeqq,
     pextrb, pextrd, pextrq,
     pinsrb, pinsrd, pinsrq,
     pmaxsb, pmaxsd, pmaxud, pmaxuw, pminsb, pminsd, pminud, pminuw,
+    pmovsxbd, pmovsxbq, pmovsxbw, pmovsxdq, pmovsxwd, pmovsxwq,
+    pmovzxbd, pmovzxbq, pmovzxbw, pmovzxdq, pmovzxwd, pmovzxwq,
     pmulld,
     roundpd, roundps, roundsd, roundss,
     // SSE4.2
@@ -377,7 +380,8 @@ pub const Mnemonic = enum {
     vpabsb, vpabsd, vpabsw,
     vpackssdw, vpacksswb, vpackusdw, vpackuswb,
     vpaddb, vpaddd, vpaddq, vpaddsb, vpaddsw, vpaddusb, vpaddusw, vpaddw,
-    vpalignr, vpand, vpandn, vpclmulqdq,
+    vpalignr, vpand, vpandn,
+    vpblendvb, vpblendw, vpclmulqdq,
     vpcmpeqb, vpcmpeqd, vpcmpeqq, vpcmpeqw,
     vpcmpgtb, vpcmpgtd, vpcmpgtq, vpcmpgtw,
     vpextrb, vpextrd, vpextrq, vpextrw,
@@ -385,9 +389,11 @@ pub const Mnemonic = enum {
     vpmaxsb, vpmaxsd, vpmaxsw, vpmaxub, vpmaxud, vpmaxuw,
     vpminsb, vpminsd, vpminsw, vpminub, vpminud, vpminuw,
     vpmovmskb,
+    vpmovsxbd, vpmovsxbq, vpmovsxbw, vpmovsxdq, vpmovsxwd, vpmovsxwq,
+    vpmovzxbd, vpmovzxbq, vpmovzxbw, vpmovzxdq, vpmovzxwd, vpmovzxwq,
     vpmulhw, vpmulld, vpmullw,
     vpor,
-    vpshufd, vpshufhw, vpshuflw,
+    vpshufb, vpshufd, vpshufhw, vpshuflw,
     vpslld, vpslldq, vpsllq, vpsllw,
     vpsrad, vpsraq, vpsraw,
     vpsrld, vpsrldq, vpsrlq, vpsrlw,
@@ -409,7 +415,8 @@ pub const Mnemonic = enum {
     vfmadd132sd, vfmadd213sd, vfmadd231sd,
     vfmadd132ss, vfmadd213ss, vfmadd231ss,
     // AVX2
-    vpbroadcastb, vpbroadcastd, vpbroadcasti128, vpbroadcastq, vpbroadcastw,
+    vbroadcasti128, vpbroadcastb, vpbroadcastd, vpbroadcastq, vpbroadcastw,
+    vextracti128, vinserti128, vpblendd,
     // zig fmt: on
 };
 
diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig
@@ -477,8 +477,9 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
         .rri_s, .rri_u => inst.data.rri.fixes,
         .ri_s, .ri_u => inst.data.ri.fixes,
         .ri64, .rm, .rmi_s, .mr => inst.data.rx.fixes,
-        .mrr, .rrm => inst.data.rrx.fixes,
+        .mrr, .rrm, .rmr => inst.data.rrx.fixes,
         .rmi, .mri => inst.data.rix.fixes,
+        .rrmr => inst.data.rrrx.fixes,
         .rrmi => inst.data.rrix.fixes,
         .mi_u, .mi_s => inst.data.x.fixes,
         .m => inst.data.x.fixes,
@@ -565,6 +566,11 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
             .{ .reg = inst.data.rx.r1 },
             .{ .mem = lower.mem(inst.data.rx.payload) },
         },
+        .rmr => &.{
+            .{ .reg = inst.data.rrx.r1 },
+            .{ .mem = lower.mem(inst.data.rrx.payload) },
+            .{ .reg = inst.data.rrx.r2 },
+        },
         .rmi => &.{
             .{ .reg = inst.data.rix.r1 },
             .{ .mem = lower.mem(inst.data.rix.payload) },
@@ -597,6 +603,12 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
             .{ .reg = inst.data.rrx.r2 },
             .{ .mem = lower.mem(inst.data.rrx.payload) },
         },
+        .rrmr => &.{
+            .{ .reg = inst.data.rrrx.r1 },
+            .{ .reg = inst.data.rrrx.r2 },
+            .{ .mem = lower.mem(inst.data.rrrx.payload) },
+            .{ .reg = inst.data.rrrx.r3 },
+        },
         .rrmi => &.{
             .{ .reg = inst.data.rrix.r1 },
             .{ .reg = inst.data.rrix.r2 },
diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig
@@ -230,6 +230,8 @@ pub const Inst = struct {
         v_d,
         /// VEX-Encoded ___ QuadWord
         v_q,
+        /// VEX-Encoded ___ 128-Bits Of Integer Data
+        v_i128,
         /// VEX-Encoded Packed ___
         vp_,
         /// VEX-Encoded Packed ___ Byte
@@ -242,8 +244,6 @@ pub const Inst = struct {
         vp_q,
         /// VEX-Encoded Packed ___ Double Quadword
         vp_dq,
-        /// VEX-Encoded Packed ___ Integer Data
-        vp_i128,
         /// VEX-Encoded ___ Scalar Single-Precision Values
         v_ss,
         /// VEX-Encoded ___ Packed Single-Precision Values
@@ -654,10 +654,19 @@ pub const Inst = struct {
         /// Variable blend scalar double-precision floating-point values
         blendv,
         /// Extract packed floating-point values
+        /// Extract packed integer values
         extract,
         /// Insert scalar single-precision floating-point value
         /// Insert packed floating-point values
         insert,
+        /// Packed move with sign extend
+        movsxb,
+        movsxd,
+        movsxw,
+        /// Packed move with zero extend
+        movzxb,
+        movzxd,
+        movzxw,
         /// Round packed single-precision floating-point values
         /// Round scalar single-precision floating-point value
         /// Round packed double-precision floating-point values
@@ -688,6 +697,7 @@ pub const Inst = struct {
         sha256rnds2,
 
         /// Load with broadcast floating-point data
+        /// Load integer and broadcast
         broadcast,
 
         /// Convert 16-bit floating-point values to single-precision floating-point values
@@ -762,8 +772,11 @@ pub const Inst = struct {
         /// Uses `imm` payload.
         rel,
         /// Register, memory operands.
-        /// Uses `rx` payload.
+        /// Uses `rx` payload with extra data of type `Memory`.
         rm,
+        /// Register, memory, register operands.
+        /// Uses `rrx` payload with extra data of type `Memory`.
+        rmr,
         /// Register, memory, immediate (word) operands.
         /// Uses `rix` payload with extra data of type `Memory`.
         rmi,
@@ -776,6 +789,9 @@ pub const Inst = struct {
         /// Register, register, memory.
         /// Uses `rrx` payload with extra data of type `Memory`.
         rrm,
+        /// Register, register, memory, register.
+        /// Uses `rrrx` payload with extra data of type `Memory`.
+        rrmr,
         /// Register, register, memory, immediate (byte) operands.
         /// Uses `rrix` payload with extra data of type `Memory`.
         rrmi,
@@ -953,6 +969,14 @@ pub const Inst = struct {
             r2: Register,
             payload: u32,
         },
+        /// Register, register, register, followed by Custom payload found in extra.
+        rrrx: struct {
+            fixes: Fixes = ._,
+            r1: Register,
+            r2: Register,
+            r3: Register,
+            payload: u32,
+        },
         /// Register, byte immediate, followed by Custom payload found in extra.
         rix: struct {
             fixes: Fixes = ._,
diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig
@@ -1185,6 +1185,8 @@ pub const table = [_]Entry{
 
     .{ .palignr, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0f }, 0, .none, .ssse3 },
 
+    .{ .pshufb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x00 }, 0, .none, .ssse3 },
+
     // SSE4.1
     .{ .blendpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0d }, 0, .none, .sse4_1 },
 
@@ -1202,6 +1204,11 @@ pub const table = [_]Entry{
 
     .{ .packusdw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x2b }, 0, .none, .sse4_1 },
 
+    .{ .pblendvb, .rm, &.{ .xmm, .xmm_m128        }, &.{ 0x66, 0x0f, 0x38, 0x10 }, 0, .none, .sse4_1 },
+    .{ .pblendvb, .rm, &.{ .xmm, .xmm_m128, .xmm0 }, &.{ 0x66, 0x0f, 0x38, 0x10 }, 0, .none, .sse4_1 },
+
+    .{ .pblendw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0e }, 0, .none, .sse4_1 },
+
     .{ .pcmpeqq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x29 }, 0, .none, .sse4_1 },
 
     .{ .pextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 },
@@ -1228,6 +1235,20 @@ pub const table = [_]Entry{
 
     .{ .pminud, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3b }, 0, .none, .sse4_1 },
 
+    .{ .pmovsxbw, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x20 }, 0, .none, .sse4_1 },
+    .{ .pmovsxbd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x21 }, 0, .none, .sse4_1 },
+    .{ .pmovsxbq, .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x22 }, 0, .none, .sse4_1 },
+    .{ .pmovsxwd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x23 }, 0, .none, .sse4_1 },
+    .{ .pmovsxwq, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x24 }, 0, .none, .sse4_1 },
+    .{ .pmovsxdq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x25 }, 0, .none, .sse4_1 },
+
+    .{ .pmovzxbw, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x30 }, 0, .none, .sse4_1 },
+    .{ .pmovzxbd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x31 }, 0, .none, .sse4_1 },
+    .{ .pmovzxbq, .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x32 }, 0, .none, .sse4_1 },
+    .{ .pmovzxwd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x33 }, 0, .none, .sse4_1 },
+    .{ .pmovzxwq, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x34 }, 0, .none, .sse4_1 },
+    .{ .pmovzxdq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x35 }, 0, .none, .sse4_1 },
+
     .{ .pmulld, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .none, .sse4_1 },
 
     .{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 },
@@ -1528,6 +1549,10 @@ pub const table = [_]Entry{
 
     .{ .vpandn, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_128_wig, .avx },
 
+    .{ .vpblendvb, .rvmr, &.{ .xmm, .xmm, .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x3a, 0x4c }, 0, .vex_128_w0, .avx },
+
+    .{ .vpblendw, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0e }, 0, .vex_128_wig, .avx },
+
     .{ .vpclmulqdq, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x44 }, 0, .vex_128_wig, .@"pclmul avx" },
 
     .{ .vpcmpeqb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x74 }, 0, .vex_128_wig, .avx },
@@ -1576,6 +1601,20 @@ pub const table = [_]Entry{
     .{ .vpmovmskb, .rm, &.{ .r32, .xmm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .vex_128_wig, .avx },
     .{ .vpmovmskb, .rm, &.{ .r64, .xmm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .vex_128_wig, .avx },
 
+    .{ .vpmovsxbw, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x20 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovsxbd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x21 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovsxbq, .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x22 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovsxwd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x23 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovsxwq, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x24 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovsxdq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x25 }, 0, .vex_128_wig, .avx },
+
+    .{ .vpmovzxbw, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x30 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovzxbd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x31 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovzxbq, .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x32 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovzxwd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x33 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovzxwq, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x34 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovzxdq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x35 }, 0, .vex_128_wig, .avx },
+
     .{ .vpmulhw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .vex_128_wig, .avx },
 
     .{ .vpmulld, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_128_wig, .avx },
@@ -1584,6 +1623,8 @@ pub const table = [_]Entry{
 
     .{ .vpor, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xeb }, 0, .vex_128_wig, .avx },
 
+    .{ .vpshufb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_128_wig, .avx },
+
     .{ .vpshufd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x70 }, 0, .vex_128_wig, .avx },
 
     .{ .vpshufhw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0xf3, 0x0f, 0x70 }, 0, .vex_128_wig, .avx },
@@ -1728,6 +1769,10 @@ pub const table = [_]Entry{
     .{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 },
     .{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 },
 
+    .{ .vextracti128, .mri, &.{ .xmm_m128, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x39 }, 0, .vex_256_w0, .avx2 },
+
+    .{ .vinserti128, .rvmi, &.{ .ymm, .ymm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x38 }, 0, .vex_256_w0, .avx2 },
+
     .{ .vpabsb, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1c }, 0, .vex_256_wig, .avx2 },
     .{ .vpabsd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1e }, 0, .vex_256_wig, .avx2 },
     .{ .vpabsw, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1d }, 0, .vex_256_wig, .avx2 },
@@ -1756,6 +1801,13 @@ pub const table = [_]Entry{
 
     .{ .vpandn, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_256_wig, .avx2 },
 
+    .{ .vpblendd, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x02 }, 0, .vex_128_w0, .avx2 },
+    .{ .vpblendd, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x02 }, 0, .vex_256_w0, .avx2 },
+
+    .{ .vpblendvb, .rvmr, &.{ .ymm, .ymm, .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x3a, 0x4c }, 0, .vex_256_w0, .avx2 },
+
+    .{ .vpblendw, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0e }, 0, .vex_256_wig, .avx2 },
+
     .{ .vpbroadcastb,    .rm, &.{ .xmm, .xmm_m8  }, &.{ 0x66, 0x0f, 0x38, 0x78 }, 0, .vex_128_w0, .avx2 },
     .{ .vpbroadcastb,    .rm, &.{ .ymm, .xmm_m8  }, &.{ 0x66, 0x0f, 0x38, 0x78 }, 0, .vex_256_w0, .avx2 },
     .{ .vpbroadcastw,    .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x79 }, 0, .vex_128_w0, .avx2 },
@@ -1764,7 +1816,7 @@ pub const table = [_]Entry{
     .{ .vpbroadcastd,    .rm, &.{ .ymm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x58 }, 0, .vex_256_w0, .avx2 },
     .{ .vpbroadcastq,    .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x59 }, 0, .vex_128_w0, .avx2 },
     .{ .vpbroadcastq,    .rm, &.{ .ymm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x59 }, 0, .vex_256_w0, .avx2 },
-    .{ .vpbroadcasti128, .rm, &.{ .ymm, .m128    }, &.{ 0x66, 0x0f, 0x38, 0x5a }, 0, .vex_256_w0, .avx2 },
+    .{ .vbroadcasti128,  .rm, &.{ .ymm, .m128    }, &.{ 0x66, 0x0f, 0x38, 0x5a }, 0, .vex_256_w0, .avx2 },
 
     .{ .vpcmpeqb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x74 }, 0, .vex_256_wig, .avx2 },
     .{ .vpcmpeqw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x75 }, 0, .vex_256_wig, .avx2 },
@@ -1799,6 +1851,20 @@ pub const table = [_]Entry{
     .{ .vpmovmskb, .rm, &.{ .r32, .ymm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .vex_256_wig, .avx2 },
     .{ .vpmovmskb, .rm, &.{ .r64, .ymm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .vex_256_wig, .avx2 },
 
+    .{ .vpmovsxbw, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x20 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovsxbd, .rm, &.{ .ymm, .xmm_m64  }, &.{ 0x66, 0x0f, 0x38, 0x21 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovsxbq, .rm, &.{ .ymm, .xmm_m32  }, &.{ 0x66, 0x0f, 0x38, 0x22 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovsxwd, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x23 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovsxwq, .rm, &.{ .ymm, .xmm_m64  }, &.{ 0x66, 0x0f, 0x38, 0x24 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovsxdq, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x25 }, 0, .vex_256_wig, .avx2 },
+
+    .{ .vpmovzxbw, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x30 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovzxbd, .rm, &.{ .ymm, .xmm_m64  }, &.{ 0x66, 0x0f, 0x38, 0x31 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovzxbq, .rm, &.{ .ymm, .xmm_m32  }, &.{ 0x66, 0x0f, 0x38, 0x32 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovzxwd, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x33 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovzxwq, .rm, &.{ .ymm, .xmm_m64  }, &.{ 0x66, 0x0f, 0x38, 0x34 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovzxdq, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x35 }, 0, .vex_256_wig, .avx2 },
+
     .{ .vpmulhw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .vex_256_wig, .avx2 },
 
     .{ .vpmulld, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_256_wig, .avx2 },
@@ -1807,6 +1873,7 @@ pub const table = [_]Entry{
 
     .{ .vpor, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xeb }, 0, .vex_256_wig, .avx2 },
 
+    .{ .vpshufb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_256_wig, .avx2 },
     .{ .vpshufd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x70 }, 0, .vex_256_wig, .avx2 },
 
     .{ .vpshufhw, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0xf3, 0x0f, 0x70 }, 0, .vex_256_wig, .avx2 },
diff --git a/src/codegen.zig b/src/codegen.zig
@@ -405,7 +405,7 @@ pub fn generateSymbol(
             .vector_type => |vector_type| {
                 const abi_size = math.cast(usize, typed_value.ty.abiSize(mod)) orelse
                     return error.Overflow;
-                if (Type.fromInterned(vector_type.child).bitSize(mod) == 1) {
+                if (vector_type.child == .bool_type) {
                     const bytes = try code.addManyAsSlice(abi_size);
                     @memset(bytes, 0xaa);
                     var index: usize = 0;
@@ -443,37 +443,34 @@ pub fn generateSymbol(
                             },
                         }) byte.* |= mask else byte.* &= ~mask;
                     }
-                } else switch (aggregate.storage) {
-                    .bytes => |bytes| try code.appendSlice(bytes),
-                    .elems, .repeated_elem => {
-                        var index: u64 = 0;
-                        while (index < vector_type.len) : (index += 1) {
-                            switch (try generateSymbol(bin_file, src_loc, .{
-                                .ty = Type.fromInterned(vector_type.child),
-                                .val = Value.fromInterned(switch (aggregate.storage) {
-                                    .bytes => unreachable,
-                                    .elems => |elems| elems[
-                                        math.cast(usize, index) orelse return error.Overflow
-                                    ],
-                                    .repeated_elem => |elem| elem,
-                                }),
-                            }, code, debug_output, reloc_info)) {
-                                .ok => {},
-                                .fail => |em| return .{ .fail = em },
+                } else {
+                    switch (aggregate.storage) {
+                        .bytes => |bytes| try code.appendSlice(bytes),
+                        .elems, .repeated_elem => {
+                            var index: u64 = 0;
+                            while (index < vector_type.len) : (index += 1) {
+                                switch (try generateSymbol(bin_file, src_loc, .{
+                                    .ty = Type.fromInterned(vector_type.child),
+                                    .val = Value.fromInterned(switch (aggregate.storage) {
+                                        .bytes => unreachable,
+                                        .elems => |elems| elems[
+                                            math.cast(usize, index) orelse return error.Overflow
+                                        ],
+                                        .repeated_elem => |elem| elem,
+                                    }),
+                                }, code, debug_output, reloc_info)) {
+                                    .ok => {},
+                                    .fail => |em| return .{ .fail = em },
+                                }
                             }
-                        }
-                    },
-                }
+                        },
+                    }
 
-                const padding = abi_size - (math.cast(usize, math.divCeil(
-                    u64,
-                    Type.fromInterned(vector_type.child).bitSize(mod) * vector_type.len,
-                    8,
-                ) catch |err| switch (err) {
-                    error.DivisionByZero => unreachable,
-                    else => |e| return e,
-                }) orelse return error.Overflow);
-                if (padding > 0) try code.appendNTimes(0, padding);
+                    const padding = abi_size -
+                        (math.cast(usize, Type.fromInterned(vector_type.child).abiSize(mod) * vector_type.len) orelse
+                        return error.Overflow);
+                    if (padding > 0) try code.appendNTimes(0, padding);
+                }
             },
             .anon_struct_type => |tuple| {
                 const struct_begin = code.items.len;
diff --git a/src/codegen/c.zig b/src/codegen/c.zig
@@ -4140,9 +4140,7 @@ fn airCmpOp(
     if (need_cast) try writer.writeAll("(void*)");
     try f.writeCValue(writer, lhs, .Other);
     try v.elem(f, writer);
-    try writer.writeByte(' ');
     try writer.writeAll(compareOperatorC(operator));
-    try writer.writeByte(' ');
     if (need_cast) try writer.writeAll("(void*)");
     try f.writeCValue(writer, rhs, .Other);
     try v.elem(f, writer);
@@ -4181,41 +4179,28 @@ fn airEquality(
     const writer = f.object.writer();
     const inst_ty = f.typeOfIndex(inst);
     const local = try f.allocLocal(inst, inst_ty);
+    const a = try Assignment.start(f, writer, inst_ty);
     try f.writeCValue(writer, local, .Other);
-    try writer.writeAll(" = ");
+    try a.assign(f, writer);
 
     if (operand_ty.zigTypeTag(mod) == .Optional and !operand_ty.optionalReprIsPayload(mod)) {
-        // (A && B)  || (C && (A == B))
-        // A = lhs.is_null  ;  B = rhs.is_null  ;  C = rhs.payload == lhs.payload
-
-        switch (operator) {
-            .eq => {},
-            .neq => try writer.writeByte('!'),
-            else => unreachable,
-        }
-        try writer.writeAll("((");
-        try f.writeCValue(writer, lhs, .Other);
-        try writer.writeAll(".is_null && ");
-        try f.writeCValue(writer, rhs, .Other);
-        try writer.writeAll(".is_null) || (");
-        try f.writeCValue(writer, lhs, .Other);
-        try writer.writeAll(".payload == ");
-        try f.writeCValue(writer, rhs, .Other);
-        try writer.writeAll(".payload && ");
+        try f.writeCValueMember(writer, lhs, .{ .identifier = "is_null" });
+        try writer.writeAll(" || ");
+        try f.writeCValueMember(writer, rhs, .{ .identifier = "is_null" });
+        try writer.writeAll(" ? ");
+        try f.writeCValueMember(writer, lhs, .{ .identifier = "is_null" });
+        try writer.writeAll(compareOperatorC(operator));
+        try f.writeCValueMember(writer, rhs, .{ .identifier = "is_null" });
+        try writer.writeAll(" : ");
+        try f.writeCValueMember(writer, lhs, .{ .identifier = "payload" });
+        try writer.writeAll(compareOperatorC(operator));
+        try f.writeCValueMember(writer, rhs, .{ .identifier = "payload" });
+    } else {
         try f.writeCValue(writer, lhs, .Other);
-        try writer.writeAll(".is_null == ");
+        try writer.writeAll(compareOperatorC(operator));
         try f.writeCValue(writer, rhs, .Other);
-        try writer.writeAll(".is_null));\n");
-
-        return local;
     }
-
-    try f.writeCValue(writer, lhs, .Other);
-    try writer.writeByte(' ');
-    try writer.writeAll(compareOperatorC(operator));
-    try writer.writeByte(' ');
-    try f.writeCValue(writer, rhs, .Other);
-    try writer.writeAll(";\n");
+    try a.end(f, writer);
 
     return local;
 }
@@ -6109,41 +6094,48 @@ fn airFloatCast(f: *Function, inst: Air.Inst.Index) !CValue {
     const ty_op = f.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
 
     const inst_ty = f.typeOfIndex(inst);
+    const inst_scalar_ty = inst_ty.scalarType(mod);
     const operand = try f.resolveInst(ty_op.operand);
     try reap(f, inst, &.{ty_op.operand});
     const operand_ty = f.typeOf(ty_op.operand);
+    const scalar_ty = operand_ty.scalarType(mod);
     const target = f.object.dg.module.getTarget();
-    const operation = if (inst_ty.isRuntimeFloat() and operand_ty.isRuntimeFloat())
-        if (inst_ty.floatBits(target) < operand_ty.floatBits(target)) "trunc" else "extend"
-    else if (inst_ty.isInt(mod) and operand_ty.isRuntimeFloat())
-        if (inst_ty.isSignedInt(mod)) "fix" else "fixuns"
-    else if (inst_ty.isRuntimeFloat() and operand_ty.isInt(mod))
-        if (operand_ty.isSignedInt(mod)) "float" else "floatun"
+    const operation = if (inst_scalar_ty.isRuntimeFloat() and scalar_ty.isRuntimeFloat())
+        if (inst_scalar_ty.floatBits(target) < scalar_ty.floatBits(target)) "trunc" else "extend"
+    else if (inst_scalar_ty.isInt(mod) and scalar_ty.isRuntimeFloat())
+        if (inst_scalar_ty.isSignedInt(mod)) "fix" else "fixuns"
+    else if (inst_scalar_ty.isRuntimeFloat() and scalar_ty.isInt(mod))
+        if (scalar_ty.isSignedInt(mod)) "float" else "floatun"
     else
         unreachable;
 
     const writer = f.object.writer();
     const local = try f.allocLocal(inst, inst_ty);
+    const v = try Vectorize.start(f, inst, writer, operand_ty);
+    const a = try Assignment.start(f, writer, scalar_ty);
     try f.writeCValue(writer, local, .Other);
-
-    try writer.writeAll(" = ");
-    if (inst_ty.isInt(mod) and operand_ty.isRuntimeFloat()) {
+    try v.elem(f, writer);
+    try a.assign(f, writer);
+    if (inst_scalar_ty.isInt(mod) and scalar_ty.isRuntimeFloat()) {
         try writer.writeAll("zig_wrap_");
-        try f.object.dg.renderTypeForBuiltinFnName(writer, inst_ty);
+        try f.object.dg.renderTypeForBuiltinFnName(writer, inst_scalar_ty);
         try writer.writeByte('(');
     }
     try writer.writeAll("zig_");
     try writer.writeAll(operation);
-    try writer.writeAll(compilerRtAbbrev(operand_ty, mod));
-    try writer.writeAll(compilerRtAbbrev(inst_ty, mod));
+    try writer.writeAll(compilerRtAbbrev(scalar_ty, mod));
+    try writer.writeAll(compilerRtAbbrev(inst_scalar_ty, mod));
     try writer.writeByte('(');
     try f.writeCValue(writer, operand, .FunctionArgument);
+    try v.elem(f, writer);
     try writer.writeByte(')');
-    if (inst_ty.isInt(mod) and operand_ty.isRuntimeFloat()) {
-        try f.object.dg.renderBuiltinInfo(writer, inst_ty, .bits);
+    if (inst_scalar_ty.isInt(mod) and scalar_ty.isRuntimeFloat()) {
+        try f.object.dg.renderBuiltinInfo(writer, inst_scalar_ty, .bits);
         try writer.writeByte(')');
     }
-    try writer.writeAll(";\n");
+    try a.end(f, writer);
+    try v.end(f, inst, writer);
+
     return local;
 }
 
@@ -6315,7 +6307,7 @@ fn airCmpBuiltinCall(
     try v.elem(f, writer);
     try f.object.dg.renderBuiltinInfo(writer, scalar_ty, info);
     try writer.writeByte(')');
-    if (!ref_ret) try writer.print(" {s} {}", .{
+    if (!ref_ret) try writer.print("{s}{}", .{
         compareOperatorC(operator),
         try f.fmtIntLiteral(Type.i32, try mod.intValue(Type.i32, 0)),
     });
@@ -7661,12 +7653,12 @@ fn compareOperatorAbbrev(operator: std.math.CompareOperator) []const u8 {
 
 fn compareOperatorC(operator: std.math.CompareOperator) []const u8 {
     return switch (operator) {
-        .lt => "<",
-        .lte => "<=",
-        .eq => "==",
-        .gte => ">=",
-        .gt => ">",
-        .neq => "!=",
+        .lt => " < ",
+        .lte => " <= ",
+        .eq => " == ",
+        .gte => " >= ",
+        .gt => " > ",
+        .neq => " != ",
     };
 }
 
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
@@ -8646,8 +8646,6 @@ pub const FuncGen = struct {
         const operand_ty = self.typeOf(ty_op.operand);
         const dest_ty = self.typeOfIndex(inst);
         const target = mod.getTarget();
-        const dest_bits = dest_ty.floatBits(target);
-        const src_bits = operand_ty.floatBits(target);
 
         if (intrinsicsAllowed(dest_ty, target) and intrinsicsAllowed(operand_ty, target)) {
             return self.wip.cast(.fpext, operand, try o.lowerType(dest_ty), "");
@@ -8655,11 +8653,19 @@ pub const FuncGen = struct {
             const operand_llvm_ty = try o.lowerType(operand_ty);
             const dest_llvm_ty = try o.lowerType(dest_ty);
 
+            const dest_bits = dest_ty.scalarType(mod).floatBits(target);
+            const src_bits = operand_ty.scalarType(mod).floatBits(target);
             const fn_name = try o.builder.fmt("__extend{s}f{s}f2", .{
                 compilerRtFloatAbbrev(src_bits), compilerRtFloatAbbrev(dest_bits),
             });
 
             const libc_fn = try self.getLibcFunction(fn_name, &.{operand_llvm_ty}, dest_llvm_ty);
+            if (dest_ty.isVector(mod)) return self.buildElementwiseCall(
+                libc_fn,
+                &.{operand},
+                try o.builder.poisonValue(dest_llvm_ty),
+                dest_ty.vectorLen(mod),
+            );
             return self.wip.call(
                 .normal,
                 .ccc,
diff --git a/src/type.zig b/src/type.zig
@@ -905,11 +905,32 @@ pub const Type = struct {
                     return Type.fromInterned(array_type.child).abiAlignmentAdvanced(mod, strat);
                 },
                 .vector_type => |vector_type| {
-                    const bits_u64 = try bitSizeAdvanced(Type.fromInterned(vector_type.child), mod, opt_sema);
-                    const bits: u32 = @intCast(bits_u64);
-                    const bytes = ((bits * vector_type.len) + 7) / 8;
-                    const alignment = std.math.ceilPowerOfTwoAssert(u32, bytes);
-                    return .{ .scalar = Alignment.fromByteUnits(alignment) };
+                    if (vector_type.len == 0) return .{ .scalar = .@"1" };
+                    switch (mod.comp.getZigBackend()) {
+                        else => {
+                            const elem_bits: u32 = @intCast(try Type.fromInterned(vector_type.child).bitSizeAdvanced(mod, opt_sema));
+                            if (elem_bits == 0) return .{ .scalar = .@"1" };
+                            const bytes = ((elem_bits * vector_type.len) + 7) / 8;
+                            const alignment = std.math.ceilPowerOfTwoAssert(u32, bytes);
+                            return .{ .scalar = Alignment.fromByteUnits(alignment) };
+                        },
+                        .stage2_x86_64 => {
+                            if (vector_type.child == .bool_type) {
+                                if (vector_type.len > 256 and std.Target.x86.featureSetHas(target.cpu.features, .avx512f)) return .{ .scalar = .@"64" };
+                                if (vector_type.len > 128 and std.Target.x86.featureSetHas(target.cpu.features, .avx2)) return .{ .scalar = .@"32" };
+                                if (vector_type.len > 64) return .{ .scalar = .@"16" };
+                                const bytes = std.math.divCeil(u32, vector_type.len, 8) catch unreachable;
+                                const alignment = std.math.ceilPowerOfTwoAssert(u32, bytes);
+                                return .{ .scalar = Alignment.fromByteUnits(alignment) };
+                            }
+                            const elem_bytes: u32 = @intCast((try Type.fromInterned(vector_type.child).abiSizeAdvanced(mod, strat)).scalar);
+                            if (elem_bytes == 0) return .{ .scalar = .@"1" };
+                            const bytes = elem_bytes * vector_type.len;
+                            if (bytes > 32 and std.Target.x86.featureSetHas(target.cpu.features, .avx512f)) return .{ .scalar = .@"64" };
+                            if (bytes > 16 and std.Target.x86.featureSetHas(target.cpu.features, .avx)) return .{ .scalar = .@"32" };
+                            return .{ .scalar = .@"16" };
+                        },
+                    }
                 },
 
                 .opt_type => return abiAlignmentAdvancedOptional(ty, mod, strat),
@@ -1237,9 +1258,6 @@ pub const Type = struct {
                             .storage = .{ .lazy_size = ty.toIntern() },
                         } }))) },
                     };
-                    const elem_bits = try Type.fromInterned(vector_type.child).bitSizeAdvanced(mod, opt_sema);
-                    const total_bits = elem_bits * vector_type.len;
-                    const total_bytes = (total_bits + 7) / 8;
                     const alignment = switch (try ty.abiAlignmentAdvanced(mod, strat)) {
                         .scalar => |x| x,
                         .val => return .{ .val = Value.fromInterned((try mod.intern(.{ .int = .{
@@ -1247,6 +1265,18 @@ pub const Type = struct {
                             .storage = .{ .lazy_size = ty.toIntern() },
                         } }))) },
                     };
+                    const total_bytes = switch (mod.comp.getZigBackend()) {
+                        else => total_bytes: {
+                            const elem_bits = try Type.fromInterned(vector_type.child).bitSizeAdvanced(mod, opt_sema);
+                            const total_bits = elem_bits * vector_type.len;
+                            break :total_bytes (total_bits + 7) / 8;
+                        },
+                        .stage2_x86_64 => total_bytes: {
+                            if (vector_type.child == .bool_type) break :total_bytes std.math.divCeil(u32, vector_type.len, 8) catch unreachable;
+                            const elem_bytes: u32 = @intCast((try Type.fromInterned(vector_type.child).abiSizeAdvanced(mod, strat)).scalar);
+                            break :total_bytes elem_bytes * vector_type.len;
+                        },
+                    };
                     return AbiSizeAdvanced{ .scalar = alignment.forward(total_bytes) };
                 },
 
@@ -2108,7 +2138,8 @@ pub const Type = struct {
 
     /// Returns true if and only if the type is a fixed-width integer.
     pub fn isInt(self: Type, mod: *const Module) bool {
-        return self.isSignedInt(mod) or self.isUnsignedInt(mod);
+        return self.toIntern() != .comptime_int_type and
+            mod.intern_pool.isIntegerType(self.toIntern());
     }
 
     /// Returns true if and only if the type is a fixed-width, signed integer.
diff --git a/test/behavior/bitcast.zig b/test/behavior/bitcast.zig
@@ -336,7 +336,7 @@ test "comptime @bitCast packed struct to int and back" {
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     if (builtin.zig_backend == .stage2_llvm and native_endian == .big) {
         // https://github.com/ziglang/zig/issues/13782
diff --git a/test/behavior/cast.zig b/test/behavior/cast.zig
@@ -601,25 +601,25 @@ test "cast *[1][*]const u8 to [*]const ?[*]const u8" {
 
 test "@intCast on vector" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     const S = struct {
         fn doTheTest() !void {
             // Upcast (implicit, equivalent to @intCast)
             var up0: @Vector(2, u8) = [_]u8{ 0x55, 0xaa };
             _ = &up0;
-            const up1 = @as(@Vector(2, u16), up0);
-            const up2 = @as(@Vector(2, u32), up0);
-            const up3 = @as(@Vector(2, u64), up0);
+            const up1: @Vector(2, u16) = up0;
+            const up2: @Vector(2, u32) = up0;
+            const up3: @Vector(2, u64) = up0;
             // Downcast (safety-checked)
             var down0 = up3;
             _ = &down0;
-            const down1 = @as(@Vector(2, u32), @intCast(down0));
-            const down2 = @as(@Vector(2, u16), @intCast(down0));
-            const down3 = @as(@Vector(2, u8), @intCast(down0));
+            const down1: @Vector(2, u32) = @intCast(down0);
+            const down2: @Vector(2, u16) = @intCast(down0);
+            const down3: @Vector(2, u8) = @intCast(down0);
 
             try expect(mem.eql(u16, &@as([2]u16, up1), &[2]u16{ 0x55, 0xaa }));
             try expect(mem.eql(u32, &@as([2]u32, up2), &[2]u32{ 0x55, 0xaa }));
@@ -629,20 +629,10 @@ test "@intCast on vector" {
             try expect(mem.eql(u16, &@as([2]u16, down2), &[2]u16{ 0x55, 0xaa }));
             try expect(mem.eql(u8, &@as([2]u8, down3), &[2]u8{ 0x55, 0xaa }));
         }
-
-        fn doTheTestFloat() !void {
-            var vec: @Vector(2, f32) = @splat(1234.0);
-            _ = &vec;
-            const wider: @Vector(2, f64) = vec;
-            try expect(wider[0] == 1234.0);
-            try expect(wider[1] == 1234.0);
-        }
     };
 
     try S.doTheTest();
     try comptime S.doTheTest();
-    try S.doTheTestFloat();
-    try comptime S.doTheTestFloat();
 }
 
 test "@floatCast cast down" {
@@ -2340,10 +2330,31 @@ test "@floatCast on vector" {
 
     const S = struct {
         fn doTheTest() !void {
-            var a: @Vector(3, f64) = .{ 1.5, 2.5, 3.5 };
-            _ = &a;
-            const b: @Vector(3, f32) = @floatCast(a);
-            try expectEqual(@Vector(3, f32){ 1.5, 2.5, 3.5 }, b);
+            {
+                var a: @Vector(2, f64) = .{ 1.5, 2.5 };
+                _ = &a;
+                const b: @Vector(2, f32) = @floatCast(a);
+                try expectEqual(@Vector(2, f32){ 1.5, 2.5 }, b);
+            }
+            {
+                var a: @Vector(2, f32) = .{ 3.25, 4.25 };
+                _ = &a;
+                const b: @Vector(2, f64) = @floatCast(a);
+                try expectEqual(@Vector(2, f64){ 3.25, 4.25 }, b);
+            }
+            {
+                var a: @Vector(2, f32) = .{ 5.75, 6.75 };
+                _ = &a;
+                const b: @Vector(2, f64) = a;
+                try expectEqual(@Vector(2, f64){ 5.75, 6.75 }, b);
+            }
+            {
+                var vec: @Vector(2, f32) = @splat(1234.0);
+                _ = &vec;
+                const wider: @Vector(2, f64) = vec;
+                try expect(wider[0] == 1234.0);
+                try expect(wider[1] == 1234.0);
+            }
         }
     };
 
@@ -2441,6 +2452,7 @@ test "@intFromBool on vector" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     const S = struct {
         fn doTheTest() !void {
diff --git a/test/behavior/optional.zig b/test/behavior/optional.zig
@@ -110,44 +110,89 @@ test "nested optional field in struct" {
     try expect(s.x.?.y == 127);
 }
 
-test "equality compare optional with non-optional" {
+test "equality compare optionals and non-optionals" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
 
-    try test_cmp_optional_non_optional();
-    try comptime test_cmp_optional_non_optional();
+    const S = struct {
+        fn doTheTest() !void {
+            var five: isize = 5;
+            var ten: isize = 10;
+            var opt_null: ?isize = null;
+            var opt_ten: ?isize = 10;
+            _ = .{ &five, &ten, &opt_null, &opt_ten };
+            try expect(opt_null != five);
+            try expect(opt_null != ten);
+            try expect(opt_ten != five);
+            try expect(opt_ten == ten);
+
+            var opt_int: ?isize = null;
+            try expect(opt_int != five);
+            try expect(opt_int != ten);
+            try expect(opt_int == opt_null);
+            try expect(opt_int != opt_ten);
+
+            opt_int = 10;
+            try expect(opt_int != five);
+            try expect(opt_int == ten);
+            try expect(opt_int != opt_null);
+            try expect(opt_int == opt_ten);
+
+            opt_int = five;
+            try expect(opt_int == five);
+            try expect(opt_int != ten);
+            try expect(opt_int != opt_null);
+            try expect(opt_int != opt_ten);
+
+            // test evaluation is always lexical
+            // ensure that the optional isn't always computed before the non-optional
+            var mutable_state: i32 = 0;
+            _ = blk1: {
+                mutable_state += 1;
+                break :blk1 @as(?f64, 10.0);
+            } != blk2: {
+                try expect(mutable_state == 1);
+                break :blk2 @as(f64, 5.0);
+            };
+            _ = blk1: {
+                mutable_state += 1;
+                break :blk1 @as(f64, 10.0);
+            } != blk2: {
+                try expect(mutable_state == 2);
+                break :blk2 @as(?f64, 5.0);
+            };
+        }
+    };
+
+    try S.doTheTest();
+    try comptime S.doTheTest();
 }
 
-fn test_cmp_optional_non_optional() !void {
-    var ten: i32 = 10;
-    var opt_ten: ?i32 = 10;
-    var five: i32 = 5;
-    var int_n: ?i32 = null;
-
-    _ = .{ &ten, &opt_ten, &five, &int_n };
-
-    try expect(int_n != ten);
-    try expect(opt_ten == ten);
-    try expect(opt_ten != five);
-
-    // test evaluation is always lexical
-    // ensure that the optional isn't always computed before the non-optional
-    var mutable_state: i32 = 0;
-    _ = blk1: {
-        mutable_state += 1;
-        break :blk1 @as(?f64, 10.0);
-    } != blk2: {
-        try expect(mutable_state == 1);
-        break :blk2 @as(f64, 5.0);
-    };
-    _ = blk1: {
-        mutable_state += 1;
-        break :blk1 @as(f64, 10.0);
-    } != blk2: {
-        try expect(mutable_state == 2);
-        break :blk2 @as(?f64, 5.0);
-    };
+test "compare optionals with modified payloads" {
+    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
+
+    var lhs: ?bool = false;
+    const lhs_payload = &lhs.?;
+    var rhs: ?bool = true;
+    const rhs_payload = &rhs.?;
+    try expect(lhs != rhs and !(lhs == rhs));
+
+    lhs = null;
+    lhs_payload.* = false;
+    rhs = false;
+    try expect(lhs != rhs and !(lhs == rhs));
+
+    lhs = true;
+    rhs = null;
+    rhs_payload.* = true;
+    try expect(lhs != rhs and !(lhs == rhs));
+
+    lhs = null;
+    lhs_payload.* = false;
+    rhs = null;
+    rhs_payload.* = true;
+    try expect(lhs == rhs and !(lhs != rhs));
 }
 
 test "unwrap function call with optional pointer return value" {
diff --git a/test/behavior/select.zig b/test/behavior/select.zig
@@ -5,7 +5,6 @@ const expect = std.testing.expect;
 
 test "@select vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -36,11 +35,12 @@ fn selectVectors() !void {
 
 test "@select arrays" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) return error.SkipZigTest;
 
     try comptime selectArrays();
     try selectArrays();
diff --git a/test/behavior/shuffle.zig b/test/behavior/shuffle.zig
@@ -4,10 +4,11 @@ const mem = std.mem;
 const expect = std.testing.expect;
 
 test "@shuffle int" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64 and
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) return error.SkipZigTest;
 
     const S = struct {
         fn doTheTest() !void {
diff --git a/test/behavior/vector.zig b/test/behavior/vector.zig
@@ -29,7 +29,7 @@ test "vector wrap operators" {
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse4_1)) return error.SkipZigTest; // TODO
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse4_1)) return error.SkipZigTest;
 
     const S = struct {
         fn doTheTest() !void {
@@ -906,22 +906,26 @@ test "vector @reduce comptime" {
 }
 
 test "mask parameter of @shuffle is comptime scope" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64 and
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) return error.SkipZigTest;
 
     const __v4hi = @Vector(4, i16);
-    var v4_a = __v4hi{ 0, 0, 0, 0 };
-    var v4_b = __v4hi{ 0, 0, 0, 0 };
+    var v4_a = __v4hi{ 1, 2, 3, 4 };
+    var v4_b = __v4hi{ 5, 6, 7, 8 };
     _ = .{ &v4_a, &v4_b };
     const shuffled: __v4hi = @shuffle(i16, v4_a, v4_b, @Vector(4, i32){
         std.zig.c_translation.shuffleVectorIndex(0, @typeInfo(@TypeOf(v4_a)).Vector.len),
-        std.zig.c_translation.shuffleVectorIndex(0, @typeInfo(@TypeOf(v4_a)).Vector.len),
-        std.zig.c_translation.shuffleVectorIndex(0, @typeInfo(@TypeOf(v4_a)).Vector.len),
-        std.zig.c_translation.shuffleVectorIndex(0, @typeInfo(@TypeOf(v4_a)).Vector.len),
+        std.zig.c_translation.shuffleVectorIndex(2, @typeInfo(@TypeOf(v4_a)).Vector.len),
+        std.zig.c_translation.shuffleVectorIndex(4, @typeInfo(@TypeOf(v4_a)).Vector.len),
+        std.zig.c_translation.shuffleVectorIndex(6, @typeInfo(@TypeOf(v4_a)).Vector.len),
     });
-    _ = shuffled;
+    try expect(shuffled[0] == 1);
+    try expect(shuffled[1] == 3);
+    try expect(shuffled[2] == 5);
+    try expect(shuffled[3] == 7);
 }
 
 test "saturating add" {
@@ -1177,10 +1181,22 @@ test "@shlWithOverflow" {
 }
 
 test "alignment of vectors" {
-    try expect(@alignOf(@Vector(2, u8)) == 2);
-    try expect(@alignOf(@Vector(2, u1)) == 1);
-    try expect(@alignOf(@Vector(1, u1)) == 1);
-    try expect(@alignOf(@Vector(2, u16)) == 4);
+    try expect(@alignOf(@Vector(2, u8)) == switch (builtin.zig_backend) {
+        else => 2,
+        .stage2_x86_64 => 16,
+    });
+    try expect(@alignOf(@Vector(2, u1)) == switch (builtin.zig_backend) {
+        else => 1,
+        .stage2_x86_64 => 16,
+    });
+    try expect(@alignOf(@Vector(1, u1)) == switch (builtin.zig_backend) {
+        else => 1,
+        .stage2_x86_64 => 16,
+    });
+    try expect(@alignOf(@Vector(2, u16)) == switch (builtin.zig_backend) {
+        else => 4,
+        .stage2_x86_64 => 16,
+    });
 }
 
 test "loading the second vector from a slice of vectors" {
@@ -1316,10 +1332,10 @@ test "modRem with zero divisor" {
 
 test "array operands to shuffle are coerced to vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     const mask = [5]i32{ -1, 0, 1, 2, 3 };

	zig fork of https://codeberg.org/ziglang/zig
	Log \| Files \| Refs \| README \| LICENSE

M	lib/std/crypto/aes.zig	\|	2	+-
M	lib/std/crypto/blake3.zig	\|	2	+-
M	lib/std/crypto/salsa20.zig	\|	5	++++-
M	lib/std/crypto/sha2.zig	\|	2	+-
M	lib/std/meta.zig	\|	3	++-
M	lib/std/unicode.zig	\|	299	+++++++++++++++++++++++++++++++++++++++----------------------------------------
M	lib/std/zig/c_translation.zig	\|	8	+++-----
M	src/InternPool.zig	\|	9	+++++++--
M	src/Sema.zig	\|	33	+++++++++++++++++++++++++++++++--
M	src/arch/x86_64/CodeGen.zig	\|	1986	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M	src/arch/x86_64/Encoding.zig	\|	15	+++++++++++----
M	src/arch/x86_64/Lower.zig	\|	14	+++++++++++++-
M	src/arch/x86_64/Mir.zig	\|	30	+++++++++++++++++++++++++++---
M	src/arch/x86_64/encodings.zig	\|	69	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M	src/codegen.zig	\|	57	+++++++++++++++++++++++++++------------------------------
M	src/codegen/c.zig	\|	98	++++++++++++++++++++++++++++++++++++-------------------------------------------
M	src/codegen/llvm.zig	\|	10	++++++++--
M	src/type.zig	\|	49	++++++++++++++++++++++++++++++++++++++++---------
M	test/behavior/bitcast.zig	\|	2	+-
M	test/behavior/cast.zig	\|	54	+++++++++++++++++++++++++++++++++---------------------
M	test/behavior/optional.zig	\|	109	++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
M	test/behavior/select.zig	\|	4	++--
M	test/behavior/shuffle.zig	\|	3	++-
M	test/behavior/vector.zig	\|	42	+++++++++++++++++++++++++++++-------------