compiler: fully switch to new `@bitCast` semantics - zig - fork of https://codeberg.org/ziglang/zig

commit 6d2c8349c53707f9eeee7554bd7717667718ce4d (tree)
parent f9a6149b34af662a0d524f2f64f00e8857afc13b
Author: Matthew Lugg <mlugg@mlugg.co.uk>
Date:   Wed, 17 Jun 2026 12:11:23 +0100

compiler: fully switch to new `@bitCast` semantics

Diffstat:
M CMakeLists.txt  | 2 +-
M src/Air.zig  | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M src/Air/Legalize.zig  | 207 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M src/Air/Liveness.zig  | 13 ++++++++++---
M src/Air/Liveness/Verify.zig  | 13 ++++++++++---
A src/Air/Verify.zig  | 465 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/Air/print.zig  | 13 ++++++++++---
M src/Sema.zig  | 854 +++++++++++++++++++++++++++++++++++--------------------------------------------
M src/Sema/LowerZon.zig  | 15 ++++-----------
D src/Sema/bitcast.zig  | 774 -------------------------------------------------------------------------------
M src/Sema/comptime_ptr_access.zig  | 321 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
A src/Sema/reinterpret.zig  | 576 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/Type.zig  | 153 ++++++++++++++++++++++++-------------------------------------------------------
M src/Value.zig  | 269 ++++++++++++++++++++++++++++++++++++++++---------------------------------------
M src/Zcu/PerThread.zig  | 4 ++++
M src/codegen/aarch64/Select.zig  | 30 +++++++++++++++++++++++-------
M src/codegen/aarch64/abi.zig  | 4 ++--
M src/codegen/arm/abi.zig  | 12 ++++++------
M src/codegen/c.zig  | 374 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M src/codegen/c/type.zig  | 34 +++++++++++++++-------------------
M src/codegen/llvm.zig  | 253 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
M src/codegen/llvm/FuncGen.zig  | 1046 +++++++++++++++++++++++++++++++++++--------------------------------------------
M src/codegen/mips/abi.zig  | 9 ++++-----
M src/codegen/riscv64/CodeGen.zig  | 19 ++++++++++++-------
M src/codegen/riscv64/abi.zig  | 11 +++++------
M src/codegen/sparc64/CodeGen.zig  | 15 +++++++++++----
M src/codegen/spirv/CodeGen.zig  | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
M src/codegen/wasm/CodeGen.zig  | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
M src/codegen/x86_64/CodeGen.zig  | 51 ++++++++++++++++++++++++++++++++-------------------

29 files changed, 3159 insertions(+), 2673 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -342,7 +342,7 @@ set(ZIG_STAGE2_SOURCES
     src/Package/Module.zig
     src/RangeSet.zig
     src/Sema.zig
-    src/Sema/bitcast.zig
+    src/Sema/reinterpret.zig
     src/Sema/comptime_ptr_access.zig
     src/Sema/type_resolution.zig
     src/Type.zig
diff --git a/src/Air.zig b/src/Air.zig
@@ -17,6 +17,7 @@ const print = @import("Air/print.zig");
 
 pub const Legalize = @import("Air/Legalize.zig");
 pub const Liveness = @import("Air/Liveness.zig");
+pub const Verify = @import("Air/Verify.zig");
 
 instructions: std.MultiArrayList(Inst).Slice,
 /// The meaning of this data is determined by `Inst.Tag` value.
@@ -276,10 +277,50 @@ pub const Inst = struct {
         /// Boolean or binary NOT.
         /// Uses the `ty_op` field.
         not,
-        /// Reinterpret the bits of a value as a different type.  This is like `@bitCast` but
-        /// also supports enums and pointers.
+        /// Implements `@bitCast`.
+        ///
+        /// Uses the `ty_op` field.
+        bit_cast,
+        /// Cast a pointer to a different pointer type. The result type is a slice iff the operand
+        /// type is a slice (the length of the slice does not change). All other pointer attributes
+        /// except for the address space may change.
+        ///
+        /// Supports vectors of pointers.
+        ///
+        /// Uses the `ty_op` field.
+        ptr_cast,
+        /// Cast an integer to a pointer (not a slice). Operand type is always `usize`.
+        ///
+        /// Supports vectors of integers.
+        ///
+        /// Uses the `ty_op` field.
+        ptr_from_int,
+        /// Cast a pointer (not a slice) to an integer. Result type is always `usize`.
+        ///
+        /// Supports vectors of pointers.
+        ///
+        /// Uses the `ty_op` field.
+        int_from_ptr,
+        /// Cast an error set `E1` to a different error set `E2`, or cast an error union `E1!T` to
+        /// an error union `E2!T` with the same payload type but a different error set type.
+        ///
+        /// Uses the `ty_op` field.
+        error_cast,
+        /// Cast an integer to an error set type. The integer operand type is unsigned and has bit
+        /// width equal to `zcu.errorSetBits()`.
+        ///
+        /// Uses the `ty_op` field.
+        error_from_int,
+        /// Cast an error set to an integer type. The integer destination type is unsigned and has
+        /// bit width equal to `zcu.errorSetBits()`.
+        ///
+        /// Uses the `ty_op` field.
+        int_from_error,
+        /// Cast an enum value to a tagged union, whose tag type is that enum, and which has no
+        /// payload bits (i.e. all payloads are equivalent to `void`).
+        ///
         /// Uses the `ty_op` field.
-        bitcast,
+        union_from_enum,
         /// A block runs its body which always ends with a `noreturn` instruction,
         /// so the only way to proceed to the code after the `block` is to encounter a `br`
         /// that targets this `block`.  If the `block` type is `noreturn`,
@@ -589,13 +630,13 @@ pub const Inst = struct {
         /// the integer tag type of the enum.
         /// See `trunc` for integer truncation.
         /// Uses the `ty_op` field.
-        intcast,
-        /// Like `intcast`, but includes two safety checks:
+        int_cast,
+        /// Like `int_cast`, but includes two safety checks:
         /// * triggers a safety panic if the cast truncates bits
         /// * triggers a safety panic if the destination type is an exhaustive enum
         ///   and the operand is not a valid value of this type; i.e. equivalent to
         ///   a safety check based on `.is_named_enum_value`
-        intcast_safe,
+        int_cast_safe,
         /// Truncate higher bits from an integer, resulting in an integer type with the same
         /// sign but an equal or smaller number of bits.
         /// Uses the `ty_op` field.
@@ -1667,12 +1708,19 @@ pub fn typeOfIndex(air: *const Air, inst: Air.Inst.Index, ip: *const InternPool)
         => return datas[@intFromEnum(inst)].ty_pl.ty.toType(),
 
         .not,
-        .bitcast,
+        .bit_cast,
+        .ptr_cast,
+        .ptr_from_int,
+        .int_from_ptr,
+        .error_cast,
+        .error_from_int,
+        .int_from_error,
+        .union_from_enum,
         .load,
         .fpext,
         .fptrunc,
-        .intcast,
-        .intcast_safe,
+        .int_cast,
+        .int_cast_safe,
         .trunc,
         .optional_payload,
         .optional_payload_ptr,
@@ -1913,7 +1961,7 @@ pub fn mustLower(air: Air, inst: Air.Inst.Index, ip: *const InternPool) bool {
         .add_safe,
         .sub_safe,
         .mul_safe,
-        .intcast_safe,
+        .int_cast_safe,
         .int_from_float_safe,
         .int_from_float_optimized_safe,
         .legalize_vec_store_elem,
@@ -1965,7 +2013,14 @@ pub fn mustLower(air: Air, inst: Air.Inst.Index, ip: *const InternPool) bool {
         .shl_sat,
         .xor,
         .not,
-        .bitcast,
+        .bit_cast,
+        .ptr_cast,
+        .ptr_from_int,
+        .int_from_ptr,
+        .error_cast,
+        .error_from_int,
+        .int_from_error,
+        .union_from_enum,
         .ret_addr,
         .frame_addr,
         .clz,
@@ -2009,7 +2064,7 @@ pub fn mustLower(air: Air, inst: Air.Inst.Index, ip: *const InternPool) bool {
         .is_non_err,
         .fptrunc,
         .fpext,
-        .intcast,
+        .int_cast,
         .trunc,
         .optional_payload,
         .optional_payload_ptr,
diff --git a/src/Air/Legalize.zig b/src/Air/Legalize.zig
@@ -75,6 +75,9 @@ pub const Feature = enum {
     scalarize_shl_sat,
     scalarize_xor,
     scalarize_not,
+    scalarize_ptr_cast,
+    scalarize_ptr_from_int,
+    scalarize_int_from_ptr,
     scalarize_clz,
     scalarize_ctz,
     scalarize_popcount,
@@ -100,8 +103,8 @@ pub const Feature = enum {
     scalarize_cmp_vector_optimized,
     scalarize_fptrunc,
     scalarize_fpext,
-    scalarize_intcast,
-    scalarize_intcast_safe,
+    scalarize_int_cast,
+    scalarize_int_cast_safe,
     scalarize_trunc,
     scalarize_int_from_float,
     scalarize_int_from_float_optimized,
@@ -115,24 +118,24 @@ pub const Feature = enum {
     scalarize_select,
     scalarize_mul_add,
 
-    // Below are several different features for scalarizing `bitcast` in different scenarios. It is
+    // Below are several different features for scalarizing `bit_cast` in different scenarios. It is
     // valid to enable any combination of these features.
 
-    /// Scalarize `bitcast` where the operand or result type is an array.
-    scalarize_bitcast_array,
-    /// Scalarize `bitcast` where either:
+    /// Scalarize `bit_cast` where the operand or result type is an array.
+    scalarize_bit_cast_array,
+    /// Scalarize `bit_cast` where either:
     ///
     /// * operand type is `@Vector(n, A)`, but result type is not `@Vector(n, B)`; or
     /// * result type is `@Vector(n, A)`, but operand type is not `@Vector(n, B)`
     ///
-    /// This effectively scalarizes any `bitcast` to/from a vector, *unless* the operation can be
+    /// This effectively scalarizes any `bit_cast` to/from a vector, *unless* the operation can be
     /// performed by bitcasting each vector element and returning a vector of the results.
     ///
     /// If this feature is enabled, the following AIR instruction tags may be emitted:
     /// * `.legalize_vec_elem_val`
     /// * `.legalize_vec_store_elem`
-    scalarize_bitcast_vector_non_elementwise,
-    /// Scalarize `bitcast` where the operand or result type is an array or vector whose element
+    scalarize_bit_cast_vector_non_elementwise,
+    /// Scalarize `bit_cast` where the operand or result type is an array or vector whose element
     /// type `E` has `@bitSizeOf(E) != 8 * @sizeOf(E)`. These are the cases where the backend may
     /// need to sign- or zero-extend multiple elements to populate "padding" bits.
     ///
@@ -142,18 +145,18 @@ pub const Feature = enum {
     /// If this feature is enabled, the following AIR instruction tags may be emitted:
     /// * `.legalize_vec_elem_val`
     /// * `.legalize_vec_store_elem`
-    scalarize_bitcast_padded_elems,
+    scalarize_bit_cast_padded_elems,
 
     /// Legalize (shift lhs, (splat rhs)) -> (shift lhs, rhs)
     unsplat_shift_rhs,
     /// Legalize reduce of a one element vector to a bitcast.
-    reduce_one_elem_to_bitcast,
+    reduce_one_elem_to_bit_cast,
     /// Legalize splat to a one element vector to a bitcast.
-    splat_one_elem_to_bitcast,
+    splat_one_elem_to_bit_cast,
 
-    /// Replace `intcast_safe` with an explicit safety check which `call`s the panic function on failure.
-    /// Not compatible with `scalarize_intcast_safe`.
-    expand_intcast_safe,
+    /// Replace `int_cast_safe` with an explicit safety check which `call`s the panic function on failure.
+    /// Not compatible with `scalarize_int_cast_safe`.
+    expand_int_cast_safe,
     /// Replace `int_from_float_safe` with an explicit safety check which `call`s the panic function on failure.
     /// Not compatible with `scalarize_int_from_float_safe`.
     expand_int_from_float_safe,
@@ -178,9 +181,9 @@ pub const Feature = enum {
     /// Currently assumes little endian and a specific integer layout where the lsb of every integer is the lsb of the
     /// first byte of memory until bit pointers know their backing type.
     expand_packed_store,
-    /// Replace `struct_field_val` of a packed field with a `bitcast` to integer, `shr`, `trunc`, and `bitcast` to field type.
+    /// Replace `struct_field_val` of a packed field with a `bit_cast` to integer, `shr`, `trunc`, and `bit_cast` to field type.
     expand_packed_struct_field_val,
-    /// Replace `aggregate_init` of a packed struct with a sequence of `shl_exact`, `bitcast`, `intcast`, and `bit_or`.
+    /// Replace `aggregate_init` of a packed struct with a sequence of `shl_exact`, `bit_cast`, `int_cast`, and `bit_or`.
     expand_packed_aggregate_init,
 
     /// Replace all arithmetic operations on 16-bit floating-point types with calls to soft-float
@@ -274,8 +277,11 @@ pub const Feature = enum {
             .cmp_vector_optimized => .scalarize_cmp_vector_optimized,
             .fptrunc => .scalarize_fptrunc,
             .fpext => .scalarize_fpext,
-            .intcast => .scalarize_intcast,
-            .intcast_safe => .scalarize_intcast_safe,
+            .int_cast => .scalarize_int_cast,
+            .int_cast_safe => .scalarize_int_cast_safe,
+            .ptr_cast => .scalarize_ptr_cast,
+            .ptr_from_int => .scalarize_ptr_from_int,
+            .int_from_ptr => .scalarize_int_from_ptr,
             .trunc => .scalarize_trunc,
             .int_from_float => .scalarize_int_from_float,
             .int_from_float_optimized => .scalarize_int_from_float_optimized,
@@ -495,7 +501,10 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void {
             .popcount,
             .byte_swap,
             .bit_reverse,
-            .intcast,
+            .int_cast,
+            .ptr_cast,
+            .ptr_from_int,
+            .int_from_ptr,
             .trunc,
             => |air_tag| if (l.features.has(comptime .scalarize(air_tag))) {
                 const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
@@ -569,19 +578,19 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void {
                     },
                 }
             },
-            .bitcast => if (l.features.hasAny(&.{
-                .scalarize_bitcast_array,
-                .scalarize_bitcast_vector_non_elementwise,
-                .scalarize_bitcast_padded_elems,
+            .bit_cast => if (l.features.hasAny(&.{
+                .scalarize_bit_cast_array,
+                .scalarize_bit_cast_vector_non_elementwise,
+                .scalarize_bit_cast_padded_elems,
             })) {
                 if (try l.scalarizeBitcastBlockPayload(inst)) |payload| {
                     continue :inst l.replaceInst(inst, .block, payload);
                 }
             },
-            .intcast_safe => if (l.features.has(.expand_intcast_safe)) {
-                assert(!l.features.has(.scalarize_intcast_safe)); // it doesn't make sense to do both
+            .int_cast_safe => if (l.features.has(.expand_int_cast_safe)) {
+                assert(!l.features.has(.scalarize_int_cast_safe)); // it doesn't make sense to do both
                 continue :inst l.replaceInst(inst, .block, try l.safeIntcastBlockPayload(inst));
-            } else if (l.features.has(.scalarize_intcast_safe)) {
+            } else if (l.features.has(.scalarize_int_cast_safe)) {
                 const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
                 if (ty_op.ty.toType().isVector(zcu)) {
                     continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .ty_op));
@@ -797,10 +806,10 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void {
             inline .reduce, .reduce_optimized => |air_tag| {
                 const reduce = l.air_instructions.items(.data)[@intFromEnum(inst)].reduce;
                 const vector_ty = l.typeOf(reduce.operand);
-                if (l.features.has(.reduce_one_elem_to_bitcast)) {
+                if (l.features.has(.reduce_one_elem_to_bit_cast)) {
                     switch (vector_ty.vectorLen(zcu)) {
                         0 => unreachable,
-                        1 => continue :inst l.replaceInst(inst, .bitcast, .{ .ty_op = .{
+                        1 => continue :inst l.replaceInst(inst, .bit_cast, .{ .ty_op = .{
                             .ty = .fromType(vector_ty.childType(zcu)),
                             .operand = reduce.operand,
                         } }),
@@ -817,11 +826,11 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void {
                     .soft_float => unreachable, // the operand is not a scalar
                 }
             },
-            .splat => if (l.features.has(.splat_one_elem_to_bitcast)) {
+            .splat => if (l.features.has(.splat_one_elem_to_bit_cast)) {
                 const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
                 switch (ty_op.ty.toType().vectorLen(zcu)) {
                     0 => unreachable,
-                    1 => continue :inst l.replaceInst(inst, .bitcast, .{ .ty_op = .{
+                    1 => continue :inst l.replaceInst(inst, .bit_cast, .{ .ty_op = .{
                         .ty = ty_op.ty,
                         .operand = ty_op.operand,
                     } }),
@@ -887,7 +896,7 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void {
                                 const field_bits = agg_ty.fieldType(field_index, zcu).bitSize(zcu);
                                 if (field_bits == struct_bits) {
                                     // Just bitcast this field.
-                                    continue :inst l.replaceInst(inst, .bitcast, .{ .ty_op = .{
+                                    continue :inst l.replaceInst(inst, .bit_cast, .{ .ty_op = .{
                                         .ty = .fromType(agg_ty),
                                         .operand = @enumFromInt(l.air_extra.items[ty_pl.payload + field_index]),
                                     } });
@@ -934,6 +943,10 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void {
             .legalize_vec_store_elem,
             .legalize_compiler_rt_call,
             .spirv_runtime_array_len,
+            .error_cast,
+            .error_from_int,
+            .int_from_error,
+            .union_from_enum,
             => {},
         }
     }
@@ -956,7 +969,7 @@ fn scalarizeBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, form: Scalariz
 
     if (result_is_array) {
         // This is only allowed when legalizing an elementwise bitcast.
-        assert(orig.tag == .bitcast);
+        assert(orig.tag == .bit_cast);
         assert(form == .ty_op);
     }
 
@@ -1474,7 +1487,7 @@ fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!?
         var operand_to_int: bool = true;
         var int_to_dest: bool = true;
 
-        if (l.features.has(.scalarize_bitcast_array)) {
+        if (l.features.has(.scalarize_bit_cast_array)) {
             if (operand_tag == .array) {
                 operand_to_dest = false;
                 operand_to_int = false;
@@ -1485,7 +1498,7 @@ fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!?
             }
         }
 
-        if (l.features.has(.scalarize_bitcast_vector_non_elementwise)) {
+        if (l.features.has(.scalarize_bit_cast_vector_non_elementwise)) {
             if (operand_tag == .vector) operand_to_int = false;
             if (dest_tag == .vector) int_to_dest = false;
 
@@ -1499,7 +1512,7 @@ fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!?
             }
         }
 
-        if (l.features.has(.scalarize_bitcast_padded_elems)) {
+        if (l.features.has(.scalarize_bit_cast_padded_elems)) {
             if (operand_tag == .array or operand_tag == .vector) {
                 const elem_ty = operand_ty.childType(zcu);
                 if (elem_ty.bitSize(zcu) != 8 * elem_ty.abiSize(zcu)) {
@@ -1554,6 +1567,12 @@ fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!?
             break :uint_val main_block.addBitCast(l, uint_ty, ty_op.operand);
         }
 
+        if (operand_ty.arrayLenIncludingSentinel(zcu) == 1) {
+            _ = main_block.stealCapacity(18);
+            const elem = main_block.addBinOp(l, .array_elem_val, ty_op.operand, .zero_usize).toRef();
+            break :uint_val main_block.addBitCast(l, uint_ty, elem);
+        }
+
         // %1 = block({
         //   %2 = alloc(*usize)
         //   %3 = alloc(*uN)
@@ -1562,8 +1581,8 @@ fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!?
         //   %6 = loop({
         //     %7 = load(%2)
         //     %8 = array_elem_val(orig_operand, %7)
-        //     %9 = bitcast(uE, %8)
-        //     %10 = intcast(uN, %9)
+        //     %9 = bit_cast(uE, %8)
+        //     %10 = int_cast(uN, %9)
         //     %11 = load(%3)
         //     %12 = shl_exact(%11, <uS, E>)
         //     %13 = bit_or(%12, %10)
@@ -1613,7 +1632,7 @@ fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!?
             index_val,
         ).toRef();
         const elem_uint = loop.block.addBitCast(l, elem_uint_ty, raw_elem);
-        const elem_extended = loop.block.addTyOp(l, .intcast, uint_ty, elem_uint).toRef();
+        const elem_extended = loop.block.addTyOp(l, .int_cast, uint_ty, elem_uint).toRef();
         const old_result = loop.block.addTyOp(l, .load, uint_ty, result_ptr).toRef();
         const shifted_result = loop.block.addBinOp(l, .shl_exact, old_result, .fromValue(elem_bits_val)).toRef();
         const new_result = loop.block.addBinOp(l, .bit_or, shifted_result, elem_extended).toRef();
@@ -1648,6 +1667,19 @@ fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!?
         _ = main_block.stealCapacity(17);
         const result = main_block.addBitCast(l, dest_ty, uint_val);
         main_block.addBr(l, orig_inst, result);
+    } else if (dest_ty.arrayLenIncludingSentinel(zcu) == 1) {
+        _ = main_block.stealCapacity(16);
+        const elem = main_block.addBitCast(l, dest_ty.childType(zcu), uint_val);
+        const aggregate_init_payload_start = l.air_extra.items.len;
+        try l.air_extra.append(zcu.gpa, @intFromEnum(elem));
+        const result = main_block.add(l, .{
+            .tag = .aggregate_init,
+            .data = .{ .ty_pl = .{
+                .ty = .fromType(dest_ty),
+                .payload = @intCast(aggregate_init_payload_start),
+            } },
+        }).toRef();
+        main_block.addBr(l, orig_inst, result);
     } else {
         // %1 = alloc(*usize)
         // %2 = alloc(*@Vector(N, Result))
@@ -1655,10 +1687,10 @@ fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!?
         // %4 = loop({
         //   %5 = load(%1)
         //   %6 = mul(%5, <usize, E>)
-        //   %7 = intcast(uS, %6)
+        //   %7 = int_cast(uS, %6)
         //   %8 = shr(uint_val, %7)
         //   %9 = trunc(uE, %8)
-        //   %10 = bitcast(Result, %9)
+        //   %10 = bit_cast(Result, %9)
         //   %11 = legalize_vec_store_elem(%2, %5, %10)
         //   %12 = cmp_eq(%5, <usize, vec_len>)
         //   %13 = cond_br(%12, {
@@ -1687,7 +1719,7 @@ fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!?
 
         const index_val = loop.block.addTyOp(l, .load, .usize, index_ptr).toRef();
         const bit_offset = loop.block.addBinOp(l, .mul, index_val, .fromValue(try pt.intValue(.usize, elem_bits))).toRef();
-        const casted_bit_offset = loop.block.addTyOp(l, .intcast, shift_ty, bit_offset).toRef();
+        const casted_bit_offset = loop.block.addTyOp(l, .int_cast, shift_ty, bit_offset).toRef();
         const shifted_uint = loop.block.addBinOp(l, .shr, uint_val, casted_bit_offset).toRef();
         const elem_uint = loop.block.addTyOp(l, .trunc, elem_uint_ty, shifted_uint).toRef();
         const elem_val = loop.block.addBitCast(l, elem_ty, elem_uint);
@@ -2065,7 +2097,7 @@ fn safeIntcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.In
     //     %5 = call(@panic.invalidEnumValue, [])
     //     %6 = unreach()
     //   }, {
-    //     %7 = intcast(@res_ty, %y)
+    //     %7 = int_cast(@res_ty, %y)
     //     %8 = is_named_enum_value(%7)
     //     %9 = cond_br(%8, {
     //       %10 = br(%x, %7)
@@ -2087,7 +2119,7 @@ fn safeIntcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.In
     //     %6 = call(@panic.invalidEnumValue, [])
     //     %7 = unreach()
     //   }, {
-    //     %8 = intcast(@res_ty, %y)
+    //     %8 = int_cast(@res_ty, %y)
     //     %9 = br(%x, %8)
     //   })
     // })
@@ -2140,9 +2172,9 @@ fn safeIntcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.In
         cur_block = &condbr.else_block;
     }
 
-    // Now we know we're in-range, we can intcast:
+    // Now we know we're in-range, we can int_cast:
     const cast_inst = cur_block.add(l, .{
-        .tag = .intcast,
+        .tag = .int_cast,
         .data = .{ .ty_op = .{
             .ty = Air.internedToRef(dest_ty.toIntern()),
             .operand = operand_ref,
@@ -2313,7 +2345,7 @@ fn safeArithmeticBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, overflow_
     // %1 = add_with_overflow(%x, %y)
     // %2 = struct_field_val(%1, .@"1")
     // %3 = reduce(%2, .@"or")
-    // %4 = bitcast(%3, @bool_type)
+    // %4 = bit_cast(%3, @bool_type)
     // %5 = cond_br(%4, {
     //   %6 = call(@panic.integerOverflow, [])
     //   %7 = unreach()
@@ -2419,7 +2451,7 @@ fn packedLoadBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.Ins
                                 .tag = .load,
                                 .data = .{ .ty_op = .{
                                     .ty = Air.internedToRef(load_ty.toIntern()),
-                                    .operand = res_block.addBitCast(l, load_ptr_ty: {
+                                    .operand = res_block.addPtrCast(l, load_ptr_ty: {
                                         var load_ptr_info = ptr_info;
                                         load_ptr_info.child = load_ty.toIntern();
                                         load_ptr_info.flags.vector_index = .none;
@@ -2462,23 +2494,17 @@ fn packedStoreBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.In
 
     var res_block: Block = .init(&inst_buf);
     {
-        const backing_ptr_inst = res_block.add(l, .{
-            .tag = .bitcast,
-            .data = .{ .ty_op = .{
-                .ty = Air.internedToRef((load_store_ptr_ty: {
-                    var load_ptr_info = ptr_info;
-                    load_ptr_info.child = load_store_ty.toIntern();
-                    load_ptr_info.flags.vector_index = .none;
-                    load_ptr_info.packed_offset = .{ .host_size = 0, .bit_offset = 0 };
-                    break :load_store_ptr_ty try pt.ptrType(load_ptr_info);
-                }).toIntern()),
-                .operand = orig_bin_op.lhs,
-            } },
-        });
+        const backing_ptr = res_block.addPtrCast(l, load_store_ptr_ty: {
+            var load_ptr_info = ptr_info;
+            load_ptr_info.child = load_store_ty.toIntern();
+            load_ptr_info.flags.vector_index = .none;
+            load_ptr_info.packed_offset = .{ .host_size = 0, .bit_offset = 0 };
+            break :load_store_ptr_ty try pt.ptrType(load_ptr_info);
+        }, orig_bin_op.lhs);
         _ = res_block.add(l, .{
             .tag = .store,
             .data = .{ .bin_op = .{
-                .lhs = backing_ptr_inst.toRef(),
+                .lhs = backing_ptr,
                 .rhs = res_block.add(l, .{
                     .tag = .bit_or,
                     .data = .{ .bin_op = .{
@@ -2489,7 +2515,7 @@ fn packedStoreBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.In
                                     .tag = .load,
                                     .data = .{ .ty_op = .{
                                         .ty = Air.internedToRef(load_store_ty.toIntern()),
-                                        .operand = backing_ptr_inst.toRef(),
+                                        .operand = backing_ptr,
                                     } },
                                 }).toRef(),
                                 .rhs = Air.internedToRef((keep_mask: {
@@ -2518,7 +2544,7 @@ fn packedStoreBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.In
                             .tag = .shl_exact,
                             .data = .{ .bin_op = .{
                                 .lhs = res_block.add(l, .{
-                                    .tag = .intcast,
+                                    .tag = .int_cast,
                                     .data = .{ .ty_op = .{
                                         .ty = Air.internedToRef(load_store_ty.toIntern()),
                                         .operand = res_block.addBitCast(l, operand_int_ty, orig_bin_op.rhs),
@@ -2616,7 +2642,7 @@ fn packedAggregateInitBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Erro
 
         const shifted = main_block.addBinOp(l, .shl_exact, cur_uint, field_bit_size_ref).toRef();
         const field_as_uint = main_block.addBitCast(l, field_uint_ty, field_val);
-        const field_extended = main_block.addTyOp(l, .intcast, uint_ty, field_as_uint).toRef();
+        const field_extended = main_block.addTyOp(l, .int_cast, uint_ty, field_as_uint).toRef();
         cur_uint = main_block.addBinOp(l, .bit_or, shifted, field_extended).toRef();
     }
 
@@ -2805,18 +2831,51 @@ const Block = struct {
         });
     }
 
-    /// Adds a `bitcast` instruction to `b`. This is a thin wrapper that omits the instruction for
+    /// Adds a `bit_cast` instruction to `b`. This is a thin wrapper that omits the instruction for
     /// no-op casts.
     fn addBitCast(
         b: *Block,
         l: *Legalize,
-        ty: Type,
+        result_ty: Type,
         operand: Air.Inst.Ref,
     ) Air.Inst.Ref {
-        if (ty.toIntern() != l.typeOf(operand).toIntern()) return b.add(l, .{
-            .tag = .bitcast,
+        const zcu = l.pt.zcu;
+        const operand_ty = l.typeOf(operand);
+        assert(!operand_ty.isPtrAtRuntime(zcu));
+        assert(!operand_ty.isSliceAtRuntime(zcu));
+        assert(!result_ty.isPtrAtRuntime(zcu));
+        assert(!result_ty.isSliceAtRuntime(zcu));
+        if (result_ty.toIntern() != operand_ty.toIntern()) return b.add(l, .{
+            .tag = .bit_cast,
+            .data = .{ .ty_op = .{
+                .ty = .fromType(result_ty),
+                .operand = operand,
+            } },
+        }).toRef();
+        _ = b.stealCapacity(1);
+        return operand;
+    }
+
+    /// Adds a `ptr_cast` instruction to `b`. This is a thin wrapper that omits the instruction for
+    /// no-op casts.
+    fn addPtrCast(
+        b: *Block,
+        l: *Legalize,
+        result_ty: Type,
+        operand: Air.Inst.Ref,
+    ) Air.Inst.Ref {
+        const zcu = l.pt.zcu;
+        const operand_ty = l.typeOf(operand);
+        if (operand_ty.isSliceAtRuntime(zcu)) {
+            assert(result_ty.isSliceAtRuntime(zcu));
+        } else {
+            assert(operand_ty.isPtrAtRuntime(zcu));
+            assert(result_ty.isPtrAtRuntime(zcu));
+        }
+        if (result_ty.toIntern() != operand_ty.toIntern()) return b.add(l, .{
+            .tag = .ptr_cast,
             .data = .{ .ty_op = .{
-                .ty = Air.internedToRef(ty.toIntern()),
+                .ty = .fromType(result_ty),
                 .operand = operand,
             } },
         }).toRef();
@@ -3157,7 +3216,7 @@ fn softFloatFromInt(l: *Legalize, orig_inst: Air.Inst.Index) Error!union(enum) {
         var main_block: Block = .init(&inst_buf);
         try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);
 
-        const extended_val = main_block.addTyOp(l, .intcast, extended_ty, ty_op.operand).toRef();
+        const extended_val = main_block.addTyOp(l, .int_cast, extended_ty, ty_op.operand).toRef();
         const call_inst = try main_block.addCompilerRtCall(l, func, &.{extended_val});
         const casted_result = main_block.addBitCast(l, dest_ty, call_inst.toRef());
         main_block.addBr(l, orig_inst, casted_result);
@@ -3184,7 +3243,7 @@ fn softFloatFromInt(l: *Legalize, orig_inst: Air.Inst.Index) Error!union(enum) {
     try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);
 
     const extended_val: Air.Inst.Ref = if (extended_ty.toIntern() != src_ty.toIntern()) ext: {
-        break :ext main_block.addTyOp(l, .intcast, extended_ty, ty_op.operand).toRef();
+        break :ext main_block.addTyOp(l, .int_cast, extended_ty, ty_op.operand).toRef();
     } else ext: {
         _ = main_block.stealCapacity(1);
         break :ext ty_op.operand;
@@ -3249,7 +3308,7 @@ fn softIntFromFloat(l: *Legalize, orig_inst: Air.Inst.Index) Error!union(enum) {
         try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);
 
         const call_inst = try main_block.addCompilerRtCall(l, func, &.{ty_op.operand});
-        const casted_val = main_block.addTyOp(l, .intcast, dest_ty, call_inst.toRef()).toRef();
+        const casted_val = main_block.addTyOp(l, .int_cast, dest_ty, call_inst.toRef()).toRef();
         main_block.addBr(l, orig_inst, casted_val);
 
         return .{ .block_payload = .{ .ty_pl = .{
@@ -3273,7 +3332,7 @@ fn softIntFromFloat(l: *Legalize, orig_inst: Air.Inst.Index) Error!union(enum) {
     const bits_val = try pt.intValue(.usize, dest_info.bits);
     _ = try main_block.addCompilerRtCall(l, func, &.{ extended_ptr, .fromValue(bits_val), ty_op.operand });
     const extended_val = main_block.addTyOp(l, .load, extended_ty, extended_ptr).toRef();
-    const result_val = main_block.addTyOp(l, .intcast, dest_ty, extended_val).toRef();
+    const result_val = main_block.addTyOp(l, .int_cast, dest_ty, extended_val).toRef();
     main_block.addBr(l, orig_inst, result_val);
 
     return .{ .block_payload = .{ .ty_pl = .{
diff --git a/src/Air/Liveness.zig b/src/Air/Liveness.zig
@@ -488,12 +488,19 @@ fn analyzeInst(
         => return analyzeFuncEnd(a, pass, data, inst, .{ .none, .none, .none }),
 
         .not,
-        .bitcast,
+        .bit_cast,
+        .ptr_cast,
+        .ptr_from_int,
+        .int_from_ptr,
+        .error_cast,
+        .error_from_int,
+        .int_from_error,
+        .union_from_enum,
         .load,
         .fpext,
         .fptrunc,
-        .intcast,
-        .intcast_safe,
+        .int_cast,
+        .int_cast_safe,
         .trunc,
         .optional_payload,
         .optional_payload_ptr,
diff --git a/src/Air/Liveness/Verify.zig b/src/Air/Liveness/Verify.zig
@@ -78,12 +78,19 @@ fn verifyBody(self: *Verify, body: []const Air.Inst.Index) Error!void {
 
             // unary
             .not,
-            .bitcast,
+            .bit_cast,
+            .ptr_cast,
+            .ptr_from_int,
+            .int_from_ptr,
+            .error_cast,
+            .error_from_int,
+            .int_from_error,
+            .union_from_enum,
             .load,
             .fpext,
             .fptrunc,
-            .intcast,
-            .intcast_safe,
+            .int_cast,
+            .int_cast_safe,
             .trunc,
             .optional_payload,
             .optional_payload_ptr,
diff --git a/src/Air/Verify.zig b/src/Air/Verify.zig
@@ -0,0 +1,465 @@
+/// Verifies that AIR is valid, in that every instruction has valid operands and types. In compiler
+/// builds with debug extensions, this is run on all AIR, both before `Air.Legalize` is run and (if
+/// it is run) after it.
+///
+/// This verification pass is currently highly incomplete---expand it as needed.
+const Verify = @This();
+
+/// Provides the intern pool and the type queries used by the checks.
+zcu: *Zcu,
+/// The function whose AIR is being verified; used to name the function in log messages.
+func_index: InternPool.Index,
+/// The return type of `func_index`; `ret`, `ret_safe` and `ret_load` are checked against it.
+ret_ty: Type,
+/// The AIR being verified.
+air: *const Air,
+/// The instruction being checked, as reported by `fail`. Assigned by `body` for each instruction.
+cur_inst: Air.Inst.Index,
+
+/// Verifies the AIR of the function `func_index`. On failure, the problem has already been logged
+/// by `fail`; this function then dumps the whole AIR to stderr to aid debugging.
+///
+/// Does nothing unless the compiler was built with debug extensions enabled.
+pub fn run(pt: Zcu.PerThread, func_index: InternPool.Index, air: *const Air) void {
+    // `Air.Verify` is a debugging feature---it should not be used in release builds because it
+    // has little benefit and negatively affects compiler performance.
+    if (!@import("build_options").enable_debug_extensions) return;
+
+    const zcu = pt.zcu;
+
+    var state: Verify = .{
+        .zcu = zcu,
+        .func_index = func_index,
+        .ret_ty = Value.fromInterned(func_index).typeOf(zcu).fnReturnType(zcu),
+        .air = air,
+        .cur_inst = undefined, // assigned by `body(...)` as it visits each instruction
+    };
+    state.body(air.getMainBody()) catch |verify_err| switch (verify_err) {
+        error.VerifyFail => {
+            // Announce which function the dump belongs to, then print its AIR.
+            const ip = &zcu.intern_pool;
+            const owner_nav = ip.indexToKey(func_index).func.owner_nav;
+            log.info("AIR for '{s}':", .{ip.getNav(owner_nav).fqn.toSlice(ip)});
+
+            const io = zcu.comp.io;
+            const stderr = io.lockStderr(&.{}, null) catch |err| switch (err) {
+                error.Canceled => return io.recancel(),
+            };
+            defer io.unlockStderr();
+
+            air.write(&stderr.file_writer.interface, pt, null) catch |err| switch (err) {
+                error.WriteFailed => switch (stderr.file_writer.err.?) {
+                    error.Canceled => return io.recancel(),
+                    // Any other write failure is ignored: the dump is best-effort.
+                    else => {},
+                },
+            };
+        },
+    };
+}
+
+/// Returned by `fail` once a verification failure has been logged.
+const Error = error{VerifyFail};
+
+/// Logs `msg` together with the name of the function being verified and the current instruction
+/// (`verify.cur_inst`), then returns `error.VerifyFail` for the caller to propagate.
+fn fail(verify: *Verify, msg: []const u8) Error {
+    const ip = &verify.zcu.intern_pool;
+    const owner_nav = ip.indexToKey(verify.func_index).func.owner_nav;
+    log.err("'{s}', %{d}: {s}", .{
+        ip.getNav(owner_nav).fqn.toSlice(ip),
+        verify.cur_inst,
+        msg,
+    });
+    return error.VerifyFail;
+}
+
+/// Verifies every instruction in `body_insts`, recursing into nested bodies (blocks, loops,
+/// conditional branches, switch cases, and the else-bodies of `try` instructions).
+///
+/// In addition to the per-instruction checks, a body must end with a `noreturn` instruction, and
+/// no instruction may follow a `noreturn` one (other than after the `call` variants; see the HACK
+/// comment below).
+///
+/// Returns `error.VerifyFail` after logging a message which names the offending instruction.
+fn body(verify: *Verify, body_insts: []const Air.Inst.Index) Error!void {
+    const zcu = verify.zcu;
+    const ip = &zcu.intern_pool;
+    const air = verify.air;
+    const tags = air.instructions.items(.tag);
+    const data = air.instructions.items(.data);
+    for (body_insts, 0..) |inst, body_index| {
+        verify.cur_inst = inst;
+        switch (tags[@intFromEnum(inst)]) {
+            .block => {
+                const block = air.unwrapBlock(inst);
+                try verify.body(block.body);
+            },
+            .dbg_inline_block => {
+                const block = air.unwrapDbgBlock(inst);
+                try verify.body(block.body);
+            },
+            .@"try", .try_cold => {
+                const @"try" = air.unwrapTry(inst);
+                try verify.body(@"try".else_body);
+            },
+            .try_ptr, .try_ptr_cold => {
+                const try_ptr = air.unwrapTryPtr(inst);
+                try verify.body(try_ptr.else_body);
+            },
+            .loop => {
+                const block = air.unwrapBlock(inst);
+                try verify.body(block.body);
+            },
+            .cond_br => {
+                const cond_br = air.unwrapCondBr(inst);
+                try verify.body(cond_br.then_body);
+                try verify.body(cond_br.else_body);
+            },
+            .switch_br, .loop_switch_br => {
+                const switch_br = air.unwrapSwitch(inst);
+                var it = switch_br.iterateCases();
+                while (it.next()) |case| {
+                    try verify.body(case.body);
+                }
+                const else_body = it.elseBody();
+                if (else_body.len > 0) {
+                    try verify.body(else_body);
+                }
+            },
+            .ret, .ret_safe => {
+                const operand = data[@intFromEnum(inst)].un_op;
+                if (air.typeOf(operand, ip).toIntern() != verify.ret_ty.toIntern()) return verify.fail("bad return type");
+            },
+            .ret_load => {
+                const operand = data[@intFromEnum(inst)].un_op;
+                const ptr_ty = air.typeOf(operand, ip);
+                if (ptr_ty.zigTypeTag(zcu) != .pointer) return verify.fail("operand is not a pointer");
+                if (ptr_ty.ptrSize(zcu) != .one) return verify.fail("pointer size is not '.one'");
+                if (ptr_ty.childType(zcu).toIntern() != verify.ret_ty.toIntern()) return verify.fail("bad return type");
+            },
+
+            .bit_cast => {
+                const ty_op = data[@intFromEnum(inst)].ty_op;
+                const operand_ty = air.typeOf(ty_op.operand, ip);
+                const result_ty = ty_op.ty.toType();
+                // Enums are allowed here even if their backing type is implicit.
+                if (!operand_ty.hasBitRepresentation(zcu) and operand_ty.zigTypeTag(zcu) != .@"enum") {
+                    return verify.fail("bad operand type");
+                }
+                if (!result_ty.hasBitRepresentation(zcu) and result_ty.zigTypeTag(zcu) != .@"enum") {
+                    return verify.fail("bad result type");
+                }
+                if (operand_ty.isPtrAtRuntime(zcu)) return verify.fail("bad operand type (pointer)");
+                if (result_ty.isPtrAtRuntime(zcu)) return verify.fail("bad result type (pointer)");
+                if (operand_ty.bitSize(zcu) != result_ty.bitSize(zcu)) return verify.fail("bit size mismatch");
+            },
+            .ptr_cast => {
+                const ty_op = data[@intFromEnum(inst)].ty_op;
+                const operand_ty = air.typeOf(ty_op.operand, ip);
+                const result_ty = ty_op.ty.toType();
+                const operand_scalar_ty = operand_ty.scalarType(zcu);
+                const result_scalar_ty = result_ty.scalarType(zcu);
+                if (operand_ty.isSliceAtRuntime(zcu)) {
+                    if (!result_ty.isSliceAtRuntime(zcu)) return verify.fail("operand is slice, but result is not");
+                } else {
+                    if (!operand_scalar_ty.isPtrAtRuntime(zcu)) return verify.fail("bad operand type");
+                    if (!result_scalar_ty.isPtrAtRuntime(zcu)) return verify.fail("operand is pointer, but result is not");
+                    if (operand_ty.isVector(zcu) and !result_ty.isVector(zcu)) return verify.fail("operand is vector, but result is not");
+                    if (!operand_ty.isVector(zcu) and result_ty.isVector(zcu)) return verify.fail("result is vector, but operand is not");
+                }
+                if (operand_scalar_ty.ptrAddressSpace(zcu) != result_scalar_ty.ptrAddressSpace(zcu)) {
+                    return verify.fail("illegal change to address space");
+                }
+            },
+            .ptr_from_int => {
+                const ty_op = data[@intFromEnum(inst)].ty_op;
+                const operand_ty = air.typeOf(ty_op.operand, ip);
+                const result_ty = ty_op.ty.toType();
+                const operand_scalar_ty = operand_ty.scalarType(zcu);
+                const result_scalar_ty = result_ty.scalarType(zcu);
+                if (operand_scalar_ty.toIntern() != .usize_type) return verify.fail("bad operand type");
+                if (!result_scalar_ty.isPtrAtRuntime(zcu)) return verify.fail("bad result type");
+                if (operand_ty.isVector(zcu) and !result_ty.isVector(zcu)) return verify.fail("operand is vector, but result is not");
+                if (!operand_ty.isVector(zcu) and result_ty.isVector(zcu)) return verify.fail("result is vector, but operand is not");
+            },
+            .int_from_ptr => {
+                const ty_op = data[@intFromEnum(inst)].ty_op;
+                const operand_ty = air.typeOf(ty_op.operand, ip);
+                const result_ty = ty_op.ty.toType();
+                const operand_scalar_ty = operand_ty.scalarType(zcu);
+                const result_scalar_ty = result_ty.scalarType(zcu);
+                if (!operand_scalar_ty.isPtrAtRuntime(zcu)) return verify.fail("bad operand type");
+                if (result_scalar_ty.toIntern() != .usize_type) return verify.fail("bad result type");
+                if (operand_ty.isVector(zcu) and !result_ty.isVector(zcu)) return verify.fail("operand is vector, but result is not");
+                if (!operand_ty.isVector(zcu) and result_ty.isVector(zcu)) return verify.fail("result is vector, but operand is not");
+            },
+            .error_cast => {
+                const ty_op = data[@intFromEnum(inst)].ty_op;
+                const operand_ty = air.typeOf(ty_op.operand, ip);
+                const result_ty = ty_op.ty.toType();
+                switch (operand_ty.zigTypeTag(zcu)) {
+                    else => return verify.fail("bad operand type"),
+                    .error_union => {
+                        if (result_ty.zigTypeTag(zcu) != .error_union) {
+                            return verify.fail("operand is error union, but result is not");
+                        }
+                        if (operand_ty.errorUnionPayload(zcu).toIntern() != result_ty.errorUnionPayload(zcu).toIntern()) {
+                            return verify.fail("error union payload type differs");
+                        }
+                    },
+                    .error_set => if (result_ty.zigTypeTag(zcu) != .error_set) {
+                        return verify.fail("operand is error set, but result is not");
+                    },
+                }
+            },
+            .error_from_int => {
+                const ty_op = data[@intFromEnum(inst)].ty_op;
+                const operand_ty = air.typeOf(ty_op.operand, ip);
+                const result_ty = ty_op.ty.toType();
+                if (!operand_ty.isUnsignedInt(zcu)) return verify.fail("bad operand type");
+                if (operand_ty.bitSize(zcu) != zcu.errorSetBits()) return verify.fail("bad operand bit size");
+                if (result_ty.zigTypeTag(zcu) != .error_set) return verify.fail("bad result type");
+            },
+            .int_from_error => {
+                const ty_op = data[@intFromEnum(inst)].ty_op;
+                const operand_ty = air.typeOf(ty_op.operand, ip);
+                const result_ty = ty_op.ty.toType();
+                if (operand_ty.zigTypeTag(zcu) != .error_set) return verify.fail("bad operand type");
+                if (!result_ty.isUnsignedInt(zcu)) return verify.fail("bad result type");
+                if (result_ty.bitSize(zcu) != zcu.errorSetBits()) return verify.fail("bad result bit size");
+            },
+            .union_from_enum => {
+                const ty_op = data[@intFromEnum(inst)].ty_op;
+                const operand_ty = air.typeOf(ty_op.operand, ip);
+                const result_ty = ty_op.ty.toType();
+                if (operand_ty.zigTypeTag(zcu) != .@"enum") return verify.fail("bad operand type");
+                if (result_ty.zigTypeTag(zcu) != .@"union") return verify.fail("bad result type");
+                const union_tag_ty = result_ty.unionTagType(zcu) orelse return verify.fail("union type is not tagged");
+                if (union_tag_ty.toIntern() != operand_ty.toIntern()) return verify.fail("union tag type does not match operand type");
+            },
+
+            .ptr_elem_ptr => {
+                const ty_pl = data[@intFromEnum(inst)].ty_pl;
+                const bin_op = air.extraData(Air.Bin, ty_pl.payload).data;
+                const ptr_ty = air.typeOf(bin_op.lhs, ip);
+                const result_ty = ty_pl.ty.toType();
+                if (ptr_ty.zigTypeTag(zcu) != .pointer) return verify.fail("bad pointer type");
+                if (result_ty.zigTypeTag(zcu) != .pointer) return verify.fail("bad result type");
+                const ptr_info = ptr_ty.ptrInfo(zcu);
+                const result_ptr_info = result_ty.ptrInfo(zcu);
+                if (ptr_info.packed_offset.host_size != 0) return verify.fail("pointer type is bitpacked pointer");
+                if (result_ptr_info.packed_offset.host_size != 0) return verify.fail("result type is bitpacked pointer");
+            },
+
+            // No per-instruction checks are implemented for the remaining tags yet.
+            .arg,
+            .add,
+            .add_safe,
+            .add_optimized,
+            .add_wrap,
+            .add_sat,
+            .sub,
+            .sub_safe,
+            .sub_optimized,
+            .sub_wrap,
+            .sub_sat,
+            .mul,
+            .mul_safe,
+            .mul_optimized,
+            .mul_wrap,
+            .mul_sat,
+            .div_float,
+            .div_float_optimized,
+            .div_trunc,
+            .div_trunc_optimized,
+            .div_floor,
+            .div_floor_optimized,
+            .div_exact,
+            .div_exact_optimized,
+            .rem,
+            .rem_optimized,
+            .mod,
+            .mod_optimized,
+            .ptr_add,
+            .ptr_sub,
+            .max,
+            .min,
+            .add_with_overflow,
+            .sub_with_overflow,
+            .mul_with_overflow,
+            .shl_with_overflow,
+            .alloc,
+            .inferred_alloc,
+            .inferred_alloc_comptime,
+            .ret_ptr,
+            .assembly,
+            .bit_and,
+            .bit_or,
+            .shr,
+            .shr_exact,
+            .shl,
+            .shl_exact,
+            .shl_sat,
+            .xor,
+            .not,
+            .repeat,
+            .br,
+            .trap,
+            .breakpoint,
+            .ret_addr,
+            .frame_addr,
+            .call,
+            .call_always_tail,
+            .call_never_tail,
+            .call_never_inline,
+            .clz,
+            .ctz,
+            .popcount,
+            .byte_swap,
+            .bit_reverse,
+            .sqrt,
+            .sin,
+            .cos,
+            .tan,
+            .exp,
+            .exp2,
+            .log,
+            .log2,
+            .log10,
+            .abs,
+            .floor,
+            .ceil,
+            .round,
+            .trunc_float,
+            .neg,
+            .neg_optimized,
+            .cmp_lt,
+            .cmp_lt_optimized,
+            .cmp_lte,
+            .cmp_lte_optimized,
+            .cmp_eq,
+            .cmp_eq_optimized,
+            .cmp_gte,
+            .cmp_gte_optimized,
+            .cmp_gt,
+            .cmp_gt_optimized,
+            .cmp_neq,
+            .cmp_neq_optimized,
+            .cmp_vector,
+            .cmp_vector_optimized,
+            .switch_dispatch,
+            .dbg_stmt,
+            .dbg_empty_stmt,
+            .dbg_var_ptr,
+            .dbg_var_val,
+            .dbg_arg_inline,
+            .is_null,
+            .is_non_null,
+            .is_null_ptr,
+            .is_non_null_ptr,
+            .is_err,
+            .is_non_err,
+            .is_err_ptr,
+            .is_non_err_ptr,
+            .load,
+            .store,
+            .store_safe,
+            .unreach,
+            .fptrunc,
+            .fpext,
+            .int_cast,
+            .int_cast_safe,
+            .trunc,
+            .optional_payload,
+            .optional_payload_ptr,
+            .optional_payload_ptr_set,
+            .wrap_optional,
+            .unwrap_errunion_payload,
+            .unwrap_errunion_err,
+            .unwrap_errunion_payload_ptr,
+            .unwrap_errunion_err_ptr,
+            .errunion_payload_ptr_set,
+            .wrap_errunion_payload,
+            .wrap_errunion_err,
+            .struct_field_ptr,
+            .struct_field_ptr_index_0,
+            .struct_field_ptr_index_1,
+            .struct_field_ptr_index_2,
+            .struct_field_ptr_index_3,
+            .struct_field_val,
+            .set_union_tag,
+            .get_union_tag,
+            .slice,
+            .slice_len,
+            .slice_ptr,
+            .ptr_slice_len_ptr,
+            .ptr_slice_ptr_ptr,
+            .array_elem_val,
+            .slice_elem_val,
+            .slice_elem_ptr,
+            .ptr_elem_val,
+            .array_to_slice,
+            .int_from_float,
+            .int_from_float_optimized,
+            .int_from_float_safe,
+            .int_from_float_optimized_safe,
+            .float_from_int,
+            .reduce,
+            .reduce_optimized,
+            .splat,
+            .shuffle_one,
+            .shuffle_two,
+            .select,
+            .memset,
+            .memset_safe,
+            .memcpy,
+            .memmove,
+            .cmpxchg_weak,
+            .cmpxchg_strong,
+            .atomic_load,
+            .atomic_store_unordered,
+            .atomic_store_monotonic,
+            .atomic_store_release,
+            .atomic_store_seq_cst,
+            .atomic_rmw,
+            .is_named_enum_value,
+            .tag_name,
+            .error_name,
+            .error_set_has_value,
+            .aggregate_init,
+            .union_init,
+            .prefetch,
+            .mul_add,
+            .field_parent_ptr,
+            .wasm_memory_size,
+            .wasm_memory_grow,
+            .cmp_lte_errors_len,
+            .err_return_trace,
+            .set_err_return_trace,
+            .addrspace_cast,
+            .save_err_return_trace_index,
+            .runtime_nav_ptr,
+            .c_va_arg,
+            .c_va_copy,
+            .c_va_end,
+            .c_va_start,
+            .spirv_runtime_array_len,
+            .work_item_id,
+            .work_group_size,
+            .work_group_id,
+            .legalize_vec_store_elem,
+            .legalize_vec_elem_val,
+            .legalize_compiler_rt_call,
+            => {},
+        }
+        // Verifying a nested body above overwrites `cur_inst`. Restore it so that the failures
+        // reported below name this instruction rather than the last instruction of a nested body.
+        verify.cur_inst = inst;
+        if (air.typeOfIndex(inst, ip).isNoReturn(zcu)) {
+            if (body_index == body_insts.len - 1) return;
+
+            // HACK: right now, we emit the safety check for noreturn functions returning in a weird
+            // way, where the `call` instruction is `noreturn` but there are still instructions
+            // following it. We need to figure out a better way to represent that! That safety check
+            // probably just needs to live exclusively in backends; putting AIR instructions after a
+            // call implies that we have e.g. a valid stack at that point, which we can't actually
+            // assume when the user has gotten a function's ABI wrong.
+            switch (tags[@intFromEnum(inst)]) {
+                .call,
+                .call_always_tail,
+                .call_never_tail,
+                .call_never_inline,
+                => continue,
+                else => {},
+            }
+
+            return verify.fail("body contains instructions after noreturn");
+        }
+    }
+    return verify.fail("body does not terminate noreturn");
+}
+
+const std = @import("std");
+const log = std.log.scoped(.air_verify);
+
+const Zcu = @import("../Zcu.zig");
+const InternPool = @import("../InternPool.zig");
+const Air = @import("../Air.zig");
+const Type = @import("../Type.zig");
+const Value = @import("../Value.zig");
diff --git a/src/Air/print.zig b/src/Air/print.zig
@@ -232,12 +232,19 @@ const Writer = struct {
             .arg => try w.writeArg(s, inst),
 
             .not,
-            .bitcast,
+            .bit_cast,
+            .ptr_cast,
+            .ptr_from_int,
+            .int_from_ptr,
+            .error_cast,
+            .error_from_int,
+            .int_from_error,
+            .union_from_enum,
             .load,
             .fptrunc,
             .fpext,
-            .intcast,
-            .intcast_safe,
+            .int_cast,
+            .int_cast_safe,
             .trunc,
             .optional_payload,
             .optional_payload_ptr,
diff --git a/src/Sema.zig b/src/Sema.zig
@@ -583,16 +583,6 @@ pub const Block = struct {
         });
     }
 
-    fn addBitCast(block: *Block, ty: Type, operand: Air.Inst.Ref) Allocator.Error!Air.Inst.Ref {
-        return block.addInst(.{
-            .tag = .bitcast,
-            .data = .{ .ty_op = .{
-                .ty = Air.internedToRef(ty.toIntern()),
-                .operand = operand,
-            } },
-        });
-    }
-
     fn addNoOp(block: *Block, tag: Air.Inst.Tag) error{OutOfMemory}!Air.Inst.Ref {
         return block.addInst(.{
             .tag = tag,
@@ -3113,14 +3103,14 @@ fn zirRefDeref(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Ai
             // https://github.com/ziglang/zig/issues/6597
             if (sema.resolveValue(operand)) |operand_val| {
                 if (!operand_val.isNull(zcu)) {
-                    break :single_ptr try sema.coerceInMemory(operand_val, single_ptr_ty);
+                    break :single_ptr .fromValue(try pt.getCoerced(operand_val, single_ptr_ty));
                 }
             }
             if (block.wantSafety()) {
                 const is_non_null = try block.addUnOp(.is_non_null, operand);
                 try sema.addSafetyCheck(block, src, is_non_null, .unwrap_null);
             }
-            const single_ptr = try block.addBitCast(single_ptr_ty, operand);
+            const single_ptr = try block.addTyOp(.ptr_cast, single_ptr_ty, operand);
             try sema.checkKnownAllocPtr(block, operand, single_ptr);
             break :single_ptr single_ptr;
         },
@@ -3586,7 +3576,7 @@ fn resolveComptimeKnownAllocPtr(sema: *Sema, block: *Block, alloc: Air.Inst.Ref,
                     .{ .elem = idx_val.toUnsignedInt(zcu) },
                 };
             },
-            .bitcast => .{
+            .ptr_cast => .{
                 tmp_air.instructions.items(.data)[@intFromEnum(air_ptr)].ty_op.operand,
                 .same_addr,
             },
@@ -3729,7 +3719,7 @@ fn finishResolveComptimeKnownAllocPtr(
     // This instruction has type `alloc_ty`, meaning we can rewrite the `alloc` AIR instruction to
     // this one to drop the side effect. We also need to rewrite the stores; we'll turn them to this
     // too because it doesn't really matter what they become.
-    const nop_inst: Air.Inst = .{ .tag = .bitcast, .data = .{ .ty_op = .{
+    const nop_inst: Air.Inst = .{ .tag = .ptr_from_int, .data = .{ .ty_op = .{
         .ty = .fromIntern(alloc_ty.toIntern()),
         .operand = .zero_usize,
     } } };
@@ -3779,7 +3769,7 @@ fn makePtrConst(sema: *Sema, block: *Block, alloc: Air.Inst.Ref) CompileError!Ai
         return Air.internedToRef((try sema.pt.getCoerced(val, const_ptr_ty)).toIntern());
     }
 
-    return block.addBitCast(const_ptr_ty, alloc);
+    return block.addTyOp(.ptr_cast, const_ptr_ty, alloc);
 }
 
 fn zirAllocInferredComptime(
@@ -7330,7 +7320,7 @@ fn analyzeCall(
         if (resolved_ty == .none) break :r result_raw;
         // TODO: mutate in place the previous instruction if possible
         // rather than adding a bitcast instruction.
-        break :r try block.addBitCast(.fromInterned(resolved_ty), result_raw);
+        break :r try block.addTyOp(.error_cast, .fromInterned(resolved_ty), result_raw);
     };
 
     if (block.isComptime()) {
@@ -7636,7 +7626,7 @@ fn zirIntFromError(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.InstD
     }
 
     try sema.requireRuntimeBlock(block, src, operand_src);
-    return block.addBitCast(err_int_ty, operand);
+    return block.addTyOp(.int_from_error, err_int_ty, operand);
 }
 
 fn zirErrorFromInt(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.InstData) CompileError!Air.Inst.Ref {
@@ -7674,13 +7664,7 @@ fn zirErrorFromInt(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.InstD
         const ok = try block.addBinOp(.bit_and, is_lte_len, is_non_zero);
         try sema.addSafetyCheck(block, src, ok, .invalid_error_code);
     }
-    return block.addInst(.{
-        .tag = .bitcast,
-        .data = .{ .ty_op = .{
-            .ty = .anyerror_type,
-            .operand = operand,
-        } },
-    });
+    return block.addTyOp(.error_from_int, .anyerror, operand);
 }
 
 fn zirMergeErrorSets(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
@@ -7863,7 +7847,7 @@ fn zirIntFromEnum(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError
     }
 
     try sema.requireRuntimeBlock(block, src, operand_src);
-    return block.addBitCast(int_tag_ty, enum_tag);
+    return block.addTyOp(.bit_cast, int_tag_ty, enum_tag);
 }
 
 fn zirEnumFromInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
@@ -7922,9 +7906,9 @@ fn zirEnumFromInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError
     try sema.requireRuntimeBlock(block, src, operand_src);
     if (block.wantSafety()) {
         try sema.preparePanicId(src, .invalid_enum_value);
-        return block.addTyOp(.intcast_safe, dest_ty, operand);
+        return block.addTyOp(.int_cast_safe, dest_ty, operand);
     }
-    return block.addTyOp(.intcast, dest_ty, operand);
+    return block.addTyOp(.int_cast, dest_ty, operand);
 }
 
 /// Pointer in, pointer out.
@@ -9152,7 +9136,7 @@ fn zirIntFromPtr(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!
     try sema.requireRuntimeBlock(block, block.nodeOffset(inst_data.src_node), ptr_src);
     try sema.validateRuntimeValue(block, ptr_src, operand);
     try sema.checkLogicalPtrOperation(block, ptr_src, ptr_ty);
-    return block.addBitCast(dest_ty, operand);
+    return block.addTyOp(.int_from_ptr, dest_ty, operand);
 }
 
 fn zirFieldPtrLoad(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
@@ -9314,9 +9298,9 @@ fn intCast(
     try sema.requireRuntimeBlock(block, src, operand_src);
     if (block.wantSafety()) {
         try sema.preparePanicId(src, .integer_out_of_bounds);
-        return block.addTyOp(.intcast_safe, dest_ty, operand);
+        return block.addTyOp(.int_cast_safe, dest_ty, operand);
     }
-    return block.addTyOp(.intcast, dest_ty, operand);
+    return block.addTyOp(.int_cast, dest_ty, operand);
 }
 
 fn zirBitcast(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
@@ -9330,158 +9314,53 @@ fn zirBitcast(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air
     const dest_ty = try sema.resolveDestType(block, src, extra.lhs, .remove_eu_opt, "@bitCast");
     const operand = sema.resolveInst(extra.rhs);
     const operand_ty = sema.typeOf(operand);
-    switch (dest_ty.zigTypeTag(zcu)) {
-        .@"anyframe",
-        .comptime_float,
-        .comptime_int,
-        .enum_literal,
-        .error_set,
-        .error_union,
-        .@"fn",
-        .frame,
-        .noreturn,
-        .null,
-        .@"opaque",
-        .spirv,
-        .optional,
-        .type,
-        .undefined,
-        .void,
-        => return sema.fail(block, src, "cannot @bitCast to '{f}'", .{dest_ty.fmt(pt)}),
-
-        .@"enum" => {
-            const msg = msg: {
-                const msg = try sema.errMsg(src, "cannot @bitCast to '{f}'", .{dest_ty.fmt(pt)});
-                errdefer msg.destroy(sema.gpa);
-                switch (operand_ty.zigTypeTag(zcu)) {
-                    .int, .comptime_int => try sema.errNote(src, msg, "use @enumFromInt to cast from '{f}'", .{operand_ty.fmt(pt)}),
-                    else => {},
-                }
-
-                break :msg msg;
-            };
-            return sema.failWithOwnedErrorMsg(block, msg);
-        },
 
-        .pointer => {
-            const msg = msg: {
-                const msg = try sema.errMsg(src, "cannot @bitCast to '{f}'", .{dest_ty.fmt(pt)});
-                errdefer msg.destroy(sema.gpa);
-                switch (operand_ty.zigTypeTag(zcu)) {
-                    .int, .comptime_int => try sema.errNote(src, msg, "use @ptrFromInt to cast from '{f}'", .{operand_ty.fmt(pt)}),
-                    .pointer => try sema.errNote(src, msg, "use @ptrCast to cast from '{f}'", .{operand_ty.fmt(pt)}),
-                    else => {},
-                }
-
-                break :msg msg;
-            };
-            return sema.failWithOwnedErrorMsg(block, msg);
-        },
-        .@"struct", .@"union" => if (dest_ty.containerLayout(zcu) == .auto) {
-            const container = switch (dest_ty.zigTypeTag(zcu)) {
-                .@"struct" => "struct",
-                .@"union" => "union",
-                else => unreachable,
-            };
-            return sema.fail(block, src, "cannot @bitCast to '{f}'; {s} does not have a guaranteed in-memory layout", .{
-                dest_ty.fmt(pt), container,
-            });
-        },
-        .array => {
-            const elem_ty = dest_ty.childType(zcu);
-            if (!elem_ty.hasWellDefinedLayout(zcu)) {
-                const msg = msg: {
-                    const msg = try sema.errMsg(src, "cannot @bitCast to '{f}'", .{dest_ty.fmt(pt)});
-                    errdefer msg.destroy(sema.gpa);
-                    try sema.errNote(src, msg, "array element type '{f}' does not have a guaranteed in-memory layout", .{elem_ty.fmt(pt)});
-                    break :msg msg;
-                };
-                return sema.failWithOwnedErrorMsg(block, msg);
+    // Check for pointers before checking `hasBitRepresentation` so we can emit a better message for slices.
+    switch (dest_ty.scalarType(zcu).zigTypeTag(zcu)) {
+        .pointer, .optional => return sema.failWithOwnedErrorMsg(block, msg: {
+            const msg = try sema.errMsg(src, "cannot @bitCast to '{f}'", .{dest_ty.fmt(pt)});
+            errdefer msg.destroy(sema.gpa);
+            switch (operand_ty.zigTypeTag(zcu)) {
+                .int, .comptime_int => try sema.errNote(src, msg, "use @ptrFromInt to cast from '{f}'", .{operand_ty.fmt(pt)}),
+                .pointer => try sema.errNote(src, msg, "use @ptrCast to cast from '{f}'", .{operand_ty.fmt(pt)}),
+                else => {},
             }
-        },
 
-        .bool,
-        .float,
-        .int,
-        .vector,
-        => {},
-    }
-    switch (operand_ty.zigTypeTag(zcu)) {
-        .@"anyframe",
-        .comptime_float,
-        .comptime_int,
-        .enum_literal,
-        .error_set,
-        .error_union,
-        .@"fn",
-        .frame,
-        .noreturn,
-        .null,
-        .@"opaque",
-        .spirv,
-        .optional,
-        .type,
-        .undefined,
-        .void,
-        => return sema.fail(block, operand_src, "cannot @bitCast from '{f}'", .{operand_ty.fmt(pt)}),
-
-        .@"enum" => {
-            const msg = msg: {
-                const msg = try sema.errMsg(operand_src, "cannot @bitCast from '{f}'", .{operand_ty.fmt(pt)});
-                errdefer msg.destroy(sema.gpa);
-                switch (dest_ty.zigTypeTag(zcu)) {
-                    .int, .comptime_int => try sema.errNote(operand_src, msg, "use @intFromEnum to cast to '{f}'", .{dest_ty.fmt(pt)}),
-                    else => {},
-                }
-
-                break :msg msg;
-            };
-            return sema.failWithOwnedErrorMsg(block, msg);
+            break :msg msg;
+        }),
+        .array => switch (dest_ty.arrayBase(zcu)[0].zigTypeTag(zcu)) {
+            .pointer, .optional => return sema.fail(block, src, "cannot @bitCast to '{f}'", .{dest_ty.fmt(pt)}),
+            else => {},
         },
-        .pointer => {
-            const msg = msg: {
-                const msg = try sema.errMsg(operand_src, "cannot @bitCast from '{f}'", .{operand_ty.fmt(pt)});
-                errdefer msg.destroy(sema.gpa);
-                switch (dest_ty.zigTypeTag(zcu)) {
-                    .int, .comptime_int => try sema.errNote(operand_src, msg, "use @intFromPtr to cast to '{f}'", .{dest_ty.fmt(pt)}),
-                    .pointer => try sema.errNote(operand_src, msg, "use @ptrCast to cast to '{f}'", .{dest_ty.fmt(pt)}),
-                    else => {},
-                }
+        else => {},
+    }
+    if (!dest_ty.hasBitRepresentation(zcu)) {
+        return sema.fail(block, src, "cannot @bitCast to '{f}'", .{dest_ty.fmt(pt)});
+    }
 
-                break :msg msg;
-            };
-            return sema.failWithOwnedErrorMsg(block, msg);
-        },
-        .@"struct", .@"union" => if (operand_ty.containerLayout(zcu) == .auto) {
-            const container = switch (operand_ty.zigTypeTag(zcu)) {
-                .@"struct" => "struct",
-                .@"union" => "union",
-                else => unreachable,
-            };
-            return sema.fail(block, operand_src, "cannot @bitCast from '{f}'; {s} does not have a guaranteed in-memory layout", .{
-                operand_ty.fmt(pt), container,
-            });
-        },
-        .array => {
-            const elem_ty = operand_ty.childType(zcu);
-            if (!elem_ty.hasWellDefinedLayout(zcu)) {
-                const msg = msg: {
-                    const msg = try sema.errMsg(src, "cannot @bitCast from '{f}'", .{operand_ty.fmt(pt)});
-                    errdefer msg.destroy(sema.gpa);
-                    try sema.errNote(src, msg, "array element type '{f}' does not have a guaranteed in-memory layout", .{elem_ty.fmt(pt)});
-                    break :msg msg;
-                };
-                return sema.failWithOwnedErrorMsg(block, msg);
+    // Check for pointers before checking `hasBitRepresentation` so we can emit a better message for slices.
+    switch (operand_ty.scalarType(zcu).zigTypeTag(zcu)) {
+        .pointer, .optional => return sema.failWithOwnedErrorMsg(block, msg: {
+            const msg = try sema.errMsg(operand_src, "cannot @bitCast from '{f}'", .{operand_ty.fmt(pt)});
+            errdefer msg.destroy(sema.gpa);
+            switch (dest_ty.zigTypeTag(zcu)) {
+                .int, .comptime_int => try sema.errNote(operand_src, msg, "use @intFromPtr to cast to '{f}'", .{dest_ty.fmt(pt)}),
+                .pointer => try sema.errNote(operand_src, msg, "use @ptrCast to cast to '{f}'", .{dest_ty.fmt(pt)}),
+                else => {},
             }
+            break :msg msg;
+        }),
+        .array => switch (operand_ty.arrayBase(zcu)[0].zigTypeTag(zcu)) { // Reject operand arrays whose base type is a pointer or optional.
+            .pointer, .optional => return sema.fail(block, operand_src, "cannot @bitCast from '{f}'", .{operand_ty.fmt(pt)}), // "from" must name the operand type, not the destination.
+            else => {},
         },
-
-        .bool,
-        .float,
-        .int,
-        .vector,
-        => {},
+        else => {},
+    }
+    if (!operand_ty.hasBitRepresentation(zcu)) {
+        return sema.fail(block, operand_src, "cannot @bitCast from '{f}'", .{operand_ty.fmt(pt)});
     }
-    return sema.bitCast(block, dest_ty, operand, block.nodeOffset(inst_data.src_node), operand_src);
+
+    return sema.bitCast(block, dest_ty, operand, block.nodeOffset(inst_data.src_node));
 }
 
 fn zirFloatCast(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
@@ -12026,11 +11905,17 @@ fn analyzeSwitchCaptures(
                 .@"inline" => unreachable, // handled above
                 .has_ranges => unreachable, // not possible for error set
                 .special => {
-                    if (else_err_ty) |err_ty| {
-                        break :payload_ref try sema.bitCast(case_block, err_ty, loaded_operand, operand_src, null);
-                    } else {
+                    const capture_err_ty = else_err_ty orelse {
                         try sema.analyzeUnreachable(case_block, operand_src, false);
                         break :payload_ref .unreachable_value;
+                    };
+                    if (sema.resolveValue(loaded_operand)) |err_val| {
+                        break :payload_ref .fromIntern(try pt.intern(.{ .err = .{
+                            .ty = capture_err_ty.toIntern(),
+                            .name = zcu.intern_pool.indexToKey(err_val.toIntern()).err.name,
+                        } }));
+                    } else {
+                        break :payload_ref try case_block.addTyOp(.error_cast, capture_err_ty, loaded_operand);
                     }
                 },
                 .item_refs => |item_refs| {
@@ -12040,8 +11925,15 @@ fn analyzeSwitchCaptures(
                         const item_val = sema.resolveValue(item_ref).?;
                         names.putAssumeCapacityNoClobber(item_val.getErrorName(zcu).unwrap().?, {});
                     }
-                    const narrowed_ty = try pt.errorSetFromUnsortedNames(names.keys());
-                    break :payload_ref try sema.bitCast(case_block, narrowed_ty, loaded_operand, operand_src, null);
+                    const capture_err_ty = try pt.errorSetFromUnsortedNames(names.keys());
+                    if (sema.resolveValue(loaded_operand)) |err_val| {
+                        break :payload_ref .fromIntern(try pt.intern(.{ .err = .{
+                            .ty = capture_err_ty.toIntern(),
+                            .name = zcu.intern_pool.indexToKey(err_val.toIntern()).err.name,
+                        } }));
+                    } else {
+                        break :payload_ref try case_block.addTyOp(.error_cast, capture_err_ty, loaded_operand);
+                    }
                 },
             }
         }
@@ -12259,40 +12151,9 @@ fn analyzeSwitchPayloadCaptureTaggedUnion(
         return case_block.addStructFieldVal(loaded_operand, first_field_index, capture_ty);
     }
 
-    // We may have to emit a switch block which coerces the operand to the capture type.
-    // If we can, try to avoid that using in-memory coercions.
-    const first_non_imc = in_mem: {
-        for (field_indices, 0..) |field_idx, i| {
-            const field_ty: Type = .fromInterned(union_obj.field_types.get(ip)[field_idx]);
-            if (.ok != try sema.coerceInMemoryAllowed(case_block, capture_ty, field_ty, false, zcu.getTarget(), .unneeded, .unneeded, null)) {
-                break :in_mem i;
-            }
-        }
-        // All fields are in-memory coercible to the resolved type!
-        // Just take the first field and bitcast the result.
-        const uncoerced = try case_block.addStructFieldVal(loaded_operand, first_field_index, first_field_ty);
-        return case_block.addBitCast(capture_ty, uncoerced);
-    };
-
     // By-val capture with heterogeneous types which are not all in-memory coercible to
     // the resolved capture type. We finally have to fall back to the ugly method.
 
-    // However, let's first track which operands are in-memory coercible. There may well
-    // be several, and we can squash all of these cases into the same switch prong using
-    // a simple bitcast. We'll make this the 'else' prong.
-
-    var in_mem_coercible: std.bit_set.Dynamic = try .initFull(sema.arena, field_indices.len);
-    in_mem_coercible.unset(first_non_imc);
-    {
-        const next = first_non_imc + 1;
-        for (field_indices[next..], next..) |field_idx, i| {
-            const field_ty: Type = .fromInterned(union_obj.field_types.get(ip)[field_idx]);
-            if (.ok != try sema.coerceInMemoryAllowed(case_block, capture_ty, field_ty, false, zcu.getTarget(), .unneeded, .unneeded, null)) {
-                in_mem_coercible.unset(i);
-            }
-        }
-    }
-
     const capture_block_inst = try case_block.addInstAsIndex(.{
         .tag = .block,
         .data = .{
@@ -12303,23 +12164,19 @@ fn analyzeSwitchPayloadCaptureTaggedUnion(
         },
     });
 
-    const prong_count = field_indices.len - in_mem_coercible.count();
-
-    const estimated_extra = prong_count * 6 + (prong_count / 10); // 2 for Case, 1 item, probably 3 insts; plus hints
+    const estimated_extra = field_indices.len * 6 + (field_indices.len / 10); // 2 for Case, 1 item, probably 3 insts; plus hints
     var cases_extra = try std.ArrayList(u32).initCapacity(gpa, estimated_extra);
     defer cases_extra.deinit(gpa);
 
     {
         // All branch hints are `.none`, so just add zero elems.
         comptime assert(@intFromEnum(std.lang.BranchHint.none) == 0);
-        const need_elems = std.math.divCeil(usize, prong_count + 1, 10) catch unreachable;
+        const need_elems = std.math.divCeil(usize, field_indices.len + 1, 10) catch unreachable;
         try cases_extra.appendNTimes(gpa, 0, need_elems);
     }
 
     {
-        // Non-bitcast cases
-        var it = in_mem_coercible.iterator(.{ .kind = .unset });
-        while (it.next()) |idx| {
+        for (field_indices, item_refs, 0..) |field_index, item, item_index| {
             var coerce_block = case_block.makeSubBlock();
             defer coerce_block.instructions.deinit(sema.gpa);
 
@@ -12328,13 +12185,12 @@ fn analyzeSwitchPayloadCaptureTaggedUnion(
                 .offset = .{ .switch_case_item = .{
                     .switch_node_offset = switch_node_offset,
                     .case_idx = capture_src.offset.switch_capture.case_idx,
-                    .item_idx = .{ .kind = .single, .value = @intCast(idx) },
+                    .item_idx = .{ .kind = .single, .value = @intCast(item_index) },
                 } },
             };
 
-            const field_idx = field_indices[idx];
-            const field_ty: Type = .fromInterned(union_obj.field_types.get(ip)[field_idx]);
-            const uncoerced = try coerce_block.addStructFieldVal(loaded_operand, field_idx, field_ty);
+            const field_ty: Type = .fromInterned(union_obj.field_types.get(ip)[field_index]);
+            const uncoerced = try coerce_block.addStructFieldVal(loaded_operand, field_index, field_ty);
             const coerced = try sema.coerce(&coerce_block, capture_ty, uncoerced, case_src);
             _ = try coerce_block.addBr(capture_block_inst, coerced);
 
@@ -12346,24 +12202,16 @@ fn analyzeSwitchPayloadCaptureTaggedUnion(
                 .ranges_len = 0,
                 .body_len = @intCast(coerce_block.instructions.items.len),
             }));
-            cases_extra.appendAssumeCapacity(@intFromEnum(item_refs[idx])); // item
+            cases_extra.appendAssumeCapacity(@intFromEnum(item)); // item
             cases_extra.appendSliceAssumeCapacity(@ptrCast(coerce_block.instructions.items)); // body
         }
     }
     const else_body_len = len: {
-        // 'else' prong uses a bitcast
-        var coerce_block = case_block.makeSubBlock();
-        defer coerce_block.instructions.deinit(sema.gpa);
-
-        const first_imc_item_idx = in_mem_coercible.findFirstSet().?;
-        const first_imc_field_idx = field_indices[first_imc_item_idx];
-        const first_imc_field_ty: Type = .fromInterned(union_obj.field_types.get(ip)[first_imc_field_idx]);
-        const uncoerced = try coerce_block.addStructFieldVal(loaded_operand, first_imc_field_idx, first_imc_field_ty);
-        const coerced = try coerce_block.addBitCast(capture_ty, uncoerced);
-        _ = try coerce_block.addBr(capture_block_inst, coerced);
-
-        try cases_extra.appendSlice(gpa, @ptrCast(coerce_block.instructions.items));
-        break :len coerce_block.instructions.items.len;
+        // 'else' prong is unreachable
+        const result_index: Air.Inst.Index = @enumFromInt(sema.air_instructions.len);
+        try sema.air_instructions.append(gpa, .{ .tag = .unreach, .data = .{ .no_op = {} } });
+        try cases_extra.append(gpa, @intFromEnum(result_index));
+        break :len 1;
     };
 
     try sema.air_extra.ensureUnusedCapacity(gpa, @typeInfo(Air.SwitchBr).@"struct".field_names.len +
@@ -12378,7 +12226,7 @@ fn analyzeSwitchPayloadCaptureTaggedUnion(
             .pl_op = .{
                 .operand = undefined, // set by switch below
                 .payload = sema.addExtraAssumeCapacity(Air.SwitchBr{
-                    .cases_len = @intCast(prong_count),
+                    .cases_len = @intCast(field_indices.len),
                     .else_body_len = @intCast(else_body_len),
                 }),
             },
@@ -12488,7 +12336,7 @@ fn resolveSwitchItem(
             // being switched on if their prong body is `=> comptime unreachable,`.
             switch (try sema.coerceInMemoryAllowedErrorSets(block, item_ty, uncoerced_ty, item_src, item_src)) {
                 .ok => if (sema.resolveValue(uncoerced)) |uncoerced_val| {
-                    break :item_ref try sema.coerceInMemory(uncoerced_val, item_ty);
+                    break :item_ref .fromValue(try pt.getCoerced(uncoerced_val, item_ty));
                 },
                 .missing_error => if (prong_is_comptime_unreach) {
                     break :item_ref uncoerced;
@@ -13394,8 +13242,8 @@ fn zirArrayCat(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Ai
         defer trash_block.instructions.deinit(sema.gpa);
 
         const instructions = [_]Air.Inst.Ref{
-            try trash_block.addBitCast(lhs_info.elem_type, .void_value),
-            try trash_block.addBitCast(rhs_info.elem_type, .void_value),
+            try trash_block.addTyOp(.bit_cast, lhs_info.elem_type, .void_value),
+            try trash_block.addTyOp(.bit_cast, rhs_info.elem_type, .void_value),
         };
         break :t try sema.resolvePeerTypes(block, src, &instructions, .{
             .override = &[_]?LazySrcLoc{ lhs_src, rhs_src },
@@ -13552,7 +13400,7 @@ fn zirArrayCat(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Ai
             });
 
             const many_ty = slice_ty.slicePtrFieldType(zcu);
-            const many_alloc = try block.addBitCast(many_ty, mutable_alloc);
+            const many_alloc = try block.addTyOp(.ptr_cast, many_ty, mutable_alloc);
 
             // lhs_dest_slice = dest[0..lhs.len]
             if (lhs_len > 0) {
@@ -13601,7 +13449,7 @@ fn zirArrayCat(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Ai
                 try sema.storePtr2(block, src, elem_ptr, src, init, lhs_src, .store);
             }
 
-            return block.addBitCast(constant_alloc_ty, mutable_alloc);
+            return block.addTyOp(.ptr_cast, constant_alloc_ty, mutable_alloc);
         }
 
         var elem_i: u32 = 0;
@@ -13634,7 +13482,7 @@ fn zirArrayCat(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Ai
             try sema.storePtr2(block, src, elem_ptr, src, init, lhs_src, .store);
         }
 
-        return block.addBitCast(constant_alloc_ty, mutable_alloc);
+        return block.addTyOp(.ptr_cast, constant_alloc_ty, mutable_alloc);
     }
 
     const element_refs = try sema.arena.alloc(Air.Inst.Ref, result_len);
@@ -14917,8 +14765,8 @@ fn analyzeArithmetic(
                 try sema.requireRuntimeBlock(block, src, runtime_src);
                 try sema.checkLogicalPtrOperation(block, src, lhs_ty);
                 try sema.checkLogicalPtrOperation(block, src, rhs_ty);
-                const lhs_int = try block.addBitCast(.usize, lhs);
-                const rhs_int = try block.addBitCast(.usize, rhs);
+                const lhs_int = try block.addTyOp(.int_from_ptr, .usize, lhs);
+                const rhs_int = try block.addTyOp(.int_from_ptr, .usize, rhs);
                 const address = try block.addBinOp(.sub_wrap, lhs_int, rhs_int);
                 return try block.addBinOp(.div_exact, address, try pt.intRef(.usize, elem_size));
             }
@@ -15186,15 +15034,76 @@ fn zirAsm(
                 break :out_ty sema.typeOf(inst).childType(zcu);
             }
         };
-        if (!out_ty.hasWellDefinedLayout(zcu)) {
-            return sema.failWithOwnedErrorMsg(block, msg: {
-                const msg = try sema.errMsg(output_src, "invalid inline assembly output type; '{f}' does not have a guaranteed in-memory layout", .{
-                    out_ty.fmt(pt),
-                });
+        switch (out_ty.zigTypeTag(zcu)) {
+            .int, .float, .bool, .vector => {},
+
+            .pointer => if (out_ty.isSlice(zcu)) return sema.failWithOwnedErrorMsg(block, msg: {
+                const msg = try sema.errMsg(output_src, "invalid inline assembly output type '{f}'", .{out_ty.fmt(pt)});
                 errdefer msg.destroy(gpa);
-                try sema.addDeclaredHereNote(msg, out_ty);
+                try sema.errNote(output_src, msg, "consider separate outputs for 'ptr' and 'len'", .{});
                 break :msg msg;
-            });
+            }),
+
+            .optional => if (!out_ty.isPtrLikeOptional(zcu)) {
+                return sema.fail(block, output_src, "invalid inline assembly output type '{f}'", .{out_ty.fmt(pt)});
+            },
+
+            .@"enum" => switch (ip.loadEnumType(out_ty.toIntern()).int_tag_mode) {
+                .explicit => {},
+                .auto => return sema.failWithOwnedErrorMsg(block, msg: {
+                    const msg = try sema.errMsg(output_src, "invalid inline assembly output type '{f}'", .{out_ty.fmt(pt)});
+                    errdefer msg.destroy(gpa);
+                    try sema.errNote(out_ty.srcLoc(zcu), msg, "integer tag type of enum is inferred", .{});
+                    try sema.errNote(out_ty.srcLoc(zcu), msg, "consider explicitly specifying the integer tag type", .{});
+                    break :msg msg;
+                }),
+            },
+
+            .@"struct" => switch (out_ty.containerLayout(zcu)) {
+                .@"packed" => {},
+                .auto, .@"extern" => return sema.failWithOwnedErrorMsg(block, msg: {
+                    const msg = try sema.errMsg(output_src, "invalid inline assembly output type '{f}'", .{out_ty.fmt(pt)});
+                    errdefer msg.destroy(gpa);
+                    try sema.errNote(output_src, msg, "struct types cannot be passed to inline assembly", .{});
+                    try sema.addDeclaredHereNote(msg, out_ty);
+                    break :msg msg;
+                }),
+            },
+
+            .@"union" => switch (out_ty.containerLayout(zcu)) {
+                .@"packed" => {},
+                .auto, .@"extern" => return sema.failWithOwnedErrorMsg(block, msg: {
+                    const msg = try sema.errMsg(output_src, "invalid inline assembly output type '{f}'", .{out_ty.fmt(pt)});
+                    errdefer msg.destroy(gpa);
+                    try sema.errNote(output_src, msg, "union types cannot be passed to inline assembly", .{});
+                    try sema.addDeclaredHereNote(msg, out_ty);
+                    break :msg msg;
+                }),
+            },
+
+            .array => return sema.failWithOwnedErrorMsg(block, msg: {
+                const msg = try sema.errMsg(output_src, "invalid inline assembly output type '{f}'", .{out_ty.fmt(pt)});
+                errdefer msg.destroy(gpa);
+                try sema.errNote(output_src, msg, "array types cannot be passed to inline assembly", .{});
+                break :msg msg;
+            }),
+
+            .void,
+            .type,
+            .noreturn,
+            .comptime_float,
+            .comptime_int,
+            .undefined,
+            .null,
+            .error_union,
+            .error_set,
+            .@"fn",
+            .@"opaque",
+            .frame,
+            .@"anyframe",
+            .enum_literal,
+            .spirv,
+            => return sema.fail(block, output_src, "invalid inline assembly output type '{f}'", .{out_ty.fmt(pt)}),
         }
 
         const constraint = sema.code.nullTerminatedString(output.data.constraint);
@@ -15598,37 +15507,13 @@ fn zirBitSizeOf(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A
     const inst_data = sema.code.instructions.items(.data)[@intFromEnum(inst)].un_node;
     const operand_src = block.builtinCallArgSrc(inst_data.src_node, 0);
     const operand_ty = try sema.resolveType(block, operand_src, inst_data.operand);
-    switch (operand_ty.zigTypeTag(zcu)) {
-        .@"fn",
-        .noreturn,
-        .undefined,
-        .null,
-        .@"opaque",
-        .spirv,
-        .type,
-        .enum_literal,
-        .comptime_float,
-        .comptime_int,
-        => return sema.fail(block, operand_src, "no size available for type '{f}'", .{operand_ty.fmt(pt)}),
-
-        .void,
-        => return .zero,
-
-        .bool,
-        .int,
-        .float,
-        .pointer,
-        .array,
-        .@"struct",
-        .optional,
-        .error_union,
-        .error_set,
-        .@"enum",
-        .@"union",
-        .vector,
-        .frame,
-        .@"anyframe",
-        => {},
+    if (!operand_ty.hasBitRepresentation(zcu) and
+        // TODO: allow these types too for now because this is used in some places. We need to
+        // figure out whether we think errors and auto-enums have bit representations!
+        operand_ty.zigTypeTag(zcu) != .error_set and
+        operand_ty.zigTypeTag(zcu) != .@"enum")
+    {
+        return sema.fail(block, operand_src, "no bit size available for type '{f}'", .{operand_ty.fmt(pt)});
     }
     try sema.ensureLayoutResolved(operand_ty, operand_src, .size_of);
     return .fromValue(try pt.intValue(.comptime_int, operand_ty.bitSize(zcu)));
@@ -18179,13 +18064,19 @@ fn zirPtrType(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air
     } else 0;
 
     if (host_size != 0) {
+        try sema.ensureLayoutResolved(elem_ty, elem_ty_src, .bit_ptr_child);
+        if (elem_ty.unpackable(zcu)) |reason| return sema.failWithOwnedErrorMsg(block, msg: {
+            const msg = try sema.errMsg(elem_ty_src, "bit-pointer cannot refer to value of type '{f}'", .{elem_ty.fmt(pt)});
+            errdefer msg.destroy(sema.gpa);
+            try sema.explainWhyTypeIsUnpackable(msg, elem_ty_src, reason);
+            break :msg msg;
+        });
+        const elem_bit_size = elem_ty.bitSize(zcu);
         if (bit_offset >= host_size * 8) {
             return sema.fail(block, bitoffset_src, "packed type '{f}' at bit offset {d} starts {d} bits after the end of a {d} byte host integer", .{
                 elem_ty.fmt(pt), bit_offset, bit_offset - host_size * 8, host_size,
             });
         }
-        try sema.ensureLayoutResolved(elem_ty, elem_ty_src, .bit_ptr_child);
-        const elem_bit_size = elem_ty.bitSize(zcu);
         if (elem_bit_size > host_size * 8 - bit_offset) {
             return sema.fail(block, bitoffset_src, "packed type '{f}' at bit offset {d} ends {d} bits after the end of a {d} byte host integer", .{
                 elem_ty.fmt(pt), bit_offset, elem_bit_size - (host_size * 8 - bit_offset), host_size,
@@ -18201,15 +18092,6 @@ fn zirPtrType(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air
         return sema.fail(block, elem_ty_src, "indexable pointer to opaque type '{f}' not allowed", .{elem_ty.fmt(pt)});
     }
 
-    if (host_size != 0) {
-        if (elem_ty.unpackable(zcu)) |reason| return sema.failWithOwnedErrorMsg(block, msg: {
-            const msg = try sema.errMsg(elem_ty_src, "bit-pointer cannot refer to value of type '{f}'", .{elem_ty.fmt(pt)});
-            errdefer msg.destroy(sema.gpa);
-            try sema.explainWhyTypeIsUnpackable(msg, elem_ty_src, reason);
-            break :msg msg;
-        });
-    }
-
     const ty = try pt.ptrType(.{
         .child = elem_ty.toIntern(),
         .sentinel = sentinel,
@@ -18379,7 +18261,7 @@ fn zirUnionInit(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A
     const payload = try sema.coerce(block, field_ty, sema.resolveInst(extra.init), payload_src);
 
     if (union_ty.containerLayout(zcu) == .@"packed") {
-        return sema.bitCast(block, union_ty, payload, block.nodeOffset(inst_data.src_node), payload_src);
+        return sema.bitCast(block, union_ty, payload, block.nodeOffset(inst_data.src_node));
     }
 
     if (sema.resolveValue(payload)) |payload_val| {
@@ -18516,7 +18398,7 @@ fn zirStructInit(
         const init_inst = try sema.coerce(block, field_ty, uncoerced_init_inst, field_src);
 
         if (resolved_ty.containerLayout(zcu) == .@"packed") {
-            const union_val = try sema.bitCast(block, resolved_ty, init_inst, src, field_src);
+            const union_val = try sema.bitCast(block, resolved_ty, init_inst, src);
             const result_val = try sema.coerce(block, result_ty, union_val, src);
             if (is_ref) {
                 return sema.analyzeRef(block, src, result_val, .none);
@@ -18680,20 +18562,15 @@ fn finishStructInit(
         },
         .@"packed" => {
             const buf = try sema.arena.alloc(u8, @intCast((struct_ty.bitSize(zcu) + 7) / 8));
+            @memset(buf, 0);
             var bit_offset: u16 = 0;
             for (field_inits) |field_init| {
                 const field_val = sema.resolveValue(field_init).?;
-                field_val.writeToPackedMemory(zcu, buf, bit_offset) catch |err| switch (err) {
-                    error.ReinterpretDeclRef => unreachable, // bitpack fields cannot be pointers
-                    error.OutOfMemory => |e| return e,
-                };
+                field_val.writeToPackedMemory(zcu, buf, bit_offset);
                 bit_offset += @intCast(field_val.typeOf(zcu).bitSize(zcu));
             }
             assert(bit_offset == struct_ty.bitSize(zcu));
-            const struct_val = Value.readFromPackedMemory(struct_ty, pt, buf, 0, sema.arena) catch |err| switch (err) {
-                error.IllDefinedMemoryLayout => unreachable, // bitpacks have well-defined layout
-                error.OutOfMemory => |e| return e,
-            };
+            const struct_val: Value = try .readFromPackedMemory(struct_ty, pt, buf, 0);
             const final_val_ref = try sema.coerce(block, result_ty, .fromValue(struct_val), init_src);
             return sema.addConstantMaybeRef(sema.resolveValue(final_val_ref).?, is_ref);
         },
@@ -19387,7 +19264,7 @@ fn zirIntFromBool(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError
         }
         return Air.internedToRef((try pt.aggregateValue(dest_ty, new_elems)).toIntern());
     }
-    return block.addBitCast(dest_ty, operand);
+    return block.addTyOp(.bit_cast, dest_ty, operand);
 }
 
 fn zirErrorName(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
@@ -21245,7 +21122,7 @@ fn zirPtrFromInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!
             try sema.addSafetyCheck(block, src, is_aligned, .incorrect_alignment);
         }
     }
-    return block.addBitCast(dest_ty, operand_coerced);
+    return block.addTyOp(.ptr_from_int, dest_ty, operand_coerced);
 }
 
 fn ptrFromIntVal(
@@ -21444,7 +21321,7 @@ fn zirErrorCast(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.InstData
             .error_union => try block.addTyOp(.unwrap_errunion_err, operand_err_ty, operand),
             else => unreachable,
         };
-        const err_int_inst = try block.addBitCast(err_int_ty, err_code_inst);
+        const err_int_inst = try block.addTyOp(.int_from_error, err_int_ty, err_code_inst);
         if (dest_tag == .error_union) {
             const zero_err = try pt.intRef(err_int_ty, 0);
             const is_zero = try block.addBinOp(.cmp_eq, err_int_inst, zero_err);
@@ -21464,10 +21341,10 @@ fn zirErrorCast(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.InstData
     }
 
     if (operand_tag == .error_set and dest_tag == .error_union) {
-        const err_val = try block.addBitCast(dest_err_ty, operand);
+        const err_val = try block.addTyOp(.error_cast, dest_err_ty, operand);
         return block.addTyOp(.wrap_errunion_err, dest_ty, err_val);
     } else {
-        return block.addBitCast(dest_ty, operand);
+        return block.addTyOp(.error_cast, dest_ty, operand);
     }
 }
 
@@ -22015,7 +21892,7 @@ fn ptrCastFull(
     // `operand_ptr` converted to an integer, for safety checks.
     const operand_ptr_int: Air.Inst.Ref = if (need_null_check or need_align_check) i: {
         assert(need_operand_ptr);
-        break :i try block.addBitCast(.usize, operand_ptr);
+        break :i try block.addTyOp(.int_from_ptr, .usize, operand_ptr);
     } else .none;
 
     if (need_null_check) {
@@ -22042,8 +21919,8 @@ fn ptrCastFull(
 
     if (dest_info.flags.size == .slice) {
         if (src_info.flags.size == .slice and !flags.addrspace_cast and !slice_needs_len_change) {
-            // Fast path: just bitcast!
-            return block.addBitCast(dest_ty, operand);
+            // Fast path: just pointer cast!
+            return block.addTyOp(.ptr_cast, dest_ty, operand);
         }
 
         // We need to deconstruct the slice (if applicable) and reconstruct it.
@@ -22101,7 +21978,7 @@ fn ptrCastFull(
             else => unreachable,
         };
         const coerced_ptr = if (operand_ptr_ty.toIntern() != want_ptr_ty.toIntern()) ptr: {
-            break :ptr try block.addBitCast(want_ptr_ty, operand_ptr);
+            break :ptr try block.addTyOp(.ptr_cast, want_ptr_ty, operand_ptr);
         } else operand_ptr;
 
         return block.addInst(.{
@@ -22116,12 +21993,11 @@ fn ptrCastFull(
         });
     } else {
         assert(need_operand_ptr);
-        // We just need to bitcast the pointer, if necessary.
-        // It might not be necessary, since we might have just needed the `addrspace_cast`.
+        // We just need a ptr_cast, if even that (we might only have needed the `addrspace_cast`).
         const result = if (sema.typeOf(operand_ptr).toIntern() == dest_ty.toIntern())
             operand_ptr
         else
-            try block.addBitCast(dest_ty, operand_ptr);
+            try block.addTyOp(.ptr_cast, dest_ty, operand_ptr);
 
         try sema.checkKnownAllocPtr(block, operand, result);
         return result;
@@ -22157,7 +22033,7 @@ fn zirPtrCastNoDest(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.Inst
     }
 
     try sema.requireRuntimeBlock(block, src, null);
-    const new_ptr = try block.addBitCast(dest_ty, operand);
+    const new_ptr = try block.addTyOp(.ptr_cast, dest_ty, operand);
     try sema.checkKnownAllocPtr(block, operand, new_ptr);
     return new_ptr;
 }
@@ -24259,7 +24135,7 @@ fn analyzeMinMax(
     // where we have refined the range, so we should be doing an intcast.
     assert(intermediate_scalar_ty.zigTypeTag(zcu) == .int);
     assert(result_scalar_ty.zigTypeTag(zcu) == .int);
-    return block.addTyOp(.intcast, result_ty, cur_result);
+    return block.addTyOp(.int_cast, result_ty, cur_result);
 }
 
 fn upgradeToArrayPtr(sema: *Sema, block: *Block, ptr: Air.Inst.Ref, len: u64) !Air.Inst.Ref {
@@ -24289,7 +24165,7 @@ fn upgradeToArrayPtr(sema: *Sema, block: *Block, ptr: Air.Inst.Ref, len: u64) !A
         try block.addTyOp(.slice_ptr, ptr_ty.slicePtrFieldType(zcu), ptr)
     else
         ptr;
-    return block.addBitCast(new_ty, non_slice_ptr);
+    return block.addTyOp(.ptr_cast, new_ty, non_slice_ptr);
 }
 
 fn zirMemcpy(
@@ -25087,7 +24963,7 @@ fn zirBuiltinExtern(
         const casted_ptr_val = try pt.getCoerced(uncasted_ptr_val, result_ptr_ty);
         return Air.internedToRef(casted_ptr_val.toIntern());
     } else {
-        return block.addBitCast(result_ptr_ty, uncasted_ptr);
+        return block.addTyOp(.ptr_cast, result_ptr_ty, uncasted_ptr);
     }
 }
 
@@ -25533,7 +25409,7 @@ pub fn explainWhyTypeIsNotExtern(
             .param_ty => try sema.errNote(src_loc, msg, "arrays are not allowed as a parameter type", .{}),
             else => try sema.explainWhyTypeIsNotExtern(msg, src_loc, ty.childType(zcu), .element),
         },
-        .vector => try sema.explainWhyTypeIsNotExtern(msg, src_loc, ty.childType(zcu), .element),
+        .vector => try sema.errNote(src_loc, msg, "vectors have no guaranteed in-memory representation", .{}),
         .optional => try sema.errNote(src_loc, msg, "non-pointer optionals have no guaranteed in-memory representation", .{}),
     }
 }
@@ -25801,7 +25677,7 @@ fn addSafetyCheckSentinelMismatch(
                     .address_space = ptr_info.flags.address_space,
                 },
             });
-            const many_ptr = try parent_block.addBitCast(many_ptr_ty, ptr);
+            const many_ptr = try parent_block.addTyOp(.ptr_cast, many_ptr_ty, ptr);
             break :s try parent_block.addBinOp(.ptr_elem_val, many_ptr, sentinel_index);
         },
         .many => unreachable,
@@ -26278,7 +26154,7 @@ fn fieldPtr(
                     },
                     .packed_offset = ptr_ptr_info.packed_offset,
                 });
-                return sema.bitCast(block, result_ty, object_ptr, src, null);
+                return block.addTyOp(.ptr_cast, result_ty, object_ptr);
             } else {
                 return sema.fail(
                     block,
@@ -26996,16 +26872,13 @@ fn unionFieldVal(
                     break :msg msg;
                 });
             },
-            .@"extern" => if (try sema.bitCastVal(union_val, field_ty, 0, 0, 0)) |field_val| {
+            .@"extern" => if (try sema.castMemory(union_val, field_ty, 0)) |field_val| {
                 return .fromValue(field_val);
             } else {
                 // Runtime-known due to a pointer-to-integer conversion.
             },
             .@"packed" => {
-                const field_val = try sema.bitCastVal(union_val, field_ty, 0, union_ty.bitSize(zcu), 0) orelse {
-                    unreachable; // `null` is only possible if the input value contains a pointer, which a packed union cannot.
-                };
-                return .fromValue(field_val);
+                return .fromValue(try sema.bitCastVal(union_val, field_ty));
             },
         }
     }
@@ -27104,7 +26977,7 @@ fn elemPtrOneLayerOnly(
 
             if (child_ty.abiSize(zcu) == 0) {
                 // zero-bit child type; just bitcast the pointer
-                return block.addBitCast(result_ty, indexable);
+                return block.addTyOp(.ptr_cast, result_ty, indexable);
             }
 
             return block.addPtrElemPtr(indexable, elem_index, result_ty);
@@ -27401,68 +27274,36 @@ fn elemPtrVector(
     }
 
     const elem_ty = vector_ty.childType(zcu);
-    const elem_bits = elem_ty.bitSize(zcu);
-    // Exiting this block means the operation is a runtime one.
-    const elem_ptr_ty: Type = if (elem_bits < 8 or !std.math.isPowerOfTwo(elem_bits)) elem_ptr_ty: {
-        // Use a packed pointer (i.e. vector_index != 0)
-        const vector_ptr_info = vector_ptr_ty.ptrInfo(zcu);
-        const elem_ptr_ty = try pt.ptrType(.{
-            .child = elem_ty.toIntern(),
-            .flags = .{
-                .size = .one,
-                .alignment = vector_ptr_info.flags.alignment,
-                .is_const = vector_ptr_info.flags.is_const,
-                .is_volatile = vector_ptr_info.flags.is_volatile,
-                .is_allowzero = vector_ptr_info.flags.is_allowzero,
-                .address_space = vector_ptr_info.flags.address_space,
-                .vector_index = @enumFromInt(index),
-            },
-            .packed_offset = .{
-                .host_size = @intCast(vector_len),
-                .bit_offset = 0,
-            },
-        });
-        if (maybe_vector_ptr_val) |ptr_val| {
-            if (ptr_val.isUndef(zcu)) return pt.undefRef(elem_ptr_ty);
-            return .fromValue(try pt.getCoerced(ptr_val, elem_ptr_ty));
-        }
-        break :elem_ptr_ty elem_ptr_ty;
-    } else elem_ptr_ty: {
-        // Use a normal pointer (i.e. vector_index == 0)
-        const vector_ptr_info = vector_ptr_ty.ptrInfo(zcu);
-        const elem_ptr_ty = try pt.ptrType(.{
-            .child = elem_ty.toIntern(),
-            .flags = .{
-                .size = .one,
-                // TODO: this logic was ported from old code, but it's bogus. This entire block will
-                // go away when https://github.com/ziglang/zig/issues/24061 is implemented anyway.
-                .alignment = switch (vector_ptr_info.flags.alignment) {
-                    .none => .none,
-                    else => |vec_align| switch (index * elem_ty.abiSize(zcu)) {
-                        0 => vec_align,
-                        else => |byte_offset| .minStrict(vec_align, .fromLog2Units(@ctz(byte_offset))),
-                    },
-                },
-                .is_const = vector_ptr_info.flags.is_const,
-                .is_volatile = vector_ptr_info.flags.is_volatile,
-                .is_allowzero = vector_ptr_info.flags.is_allowzero,
-                .address_space = vector_ptr_info.flags.address_space,
-            },
-        });
-        if (maybe_vector_ptr_val) |ptr_val| {
-            if (ptr_val.isUndef(zcu)) return pt.undefRef(elem_ptr_ty);
-            const bit_offset = index * @divExact(elem_ty.bitSize(zcu), 8);
-            return .fromValue(try ptr_val.getOffsetPtr(bit_offset, elem_ptr_ty, pt));
-        }
-        break :elem_ptr_ty elem_ptr_ty;
-    };
+
+    const vector_ptr_info = vector_ptr_ty.ptrInfo(zcu);
+    const elem_ptr_ty = try pt.ptrType(.{
+        .child = elem_ty.toIntern(),
+        .flags = .{
+            .size = .one,
+            .alignment = vector_ptr_info.flags.alignment,
+            .is_const = vector_ptr_info.flags.is_const,
+            .is_volatile = vector_ptr_info.flags.is_volatile,
+            .is_allowzero = vector_ptr_info.flags.is_allowzero,
+            .address_space = vector_ptr_info.flags.address_space,
+            .vector_index = @enumFromInt(index),
+        },
+        .packed_offset = .{
+            .host_size = @intCast(vector_len),
+            .bit_offset = 0,
+        },
+    });
+
+    if (maybe_vector_ptr_val) |ptr_val| {
+        if (ptr_val.isUndef(zcu)) return pt.undefRef(elem_ptr_ty);
+        return .fromValue(try pt.getCoerced(ptr_val, elem_ptr_ty));
+    }
 
     if (!init) {
         try sema.validateRuntimeElemAccess(block, elem_index_src, elem_ty, vector_ptr_src);
         try sema.validateRuntimeValue(block, vector_ptr_src, vector_ptr);
     }
 
-    return block.addPtrElemPtr(vector_ptr, elem_index, elem_ptr_ty);
+    return block.addTyOp(.ptr_cast, elem_ptr_ty, vector_ptr);
 }
 
 fn elemPtrSpirvRuntimeArray(
@@ -27544,7 +27385,7 @@ fn elemPtrArray(
 
     if (array_ty.childType(zcu).abiSize(zcu) == 0) {
         // zero-bit child type; just bitcast the pointer
-        return block.addBitCast(elem_ptr_ty, array_ptr);
+        return block.addTyOp(.ptr_cast, elem_ptr_ty, array_ptr);
     }
 
     return block.addPtrElemPtr(array_ptr, elem_index, elem_ptr_ty);
@@ -27670,7 +27511,7 @@ fn elemPtrSlice(
     if (elem_ty.abiSize(zcu) == 0) {
         // zero-bit child type; just extract the pointer and bitcast it
         const slice_ptr = try block.addTyOp(.slice_ptr, slice_ty.slicePtrFieldType(zcu), slice);
-        return block.addBitCast(elem_ptr_ty, slice_ptr);
+        return block.addTyOp(.ptr_cast, elem_ptr_ty, slice_ptr);
     }
     return block.addSliceElemPtr(slice, elem_index, elem_ptr_ty);
 }
@@ -27753,12 +27594,31 @@ fn coerceExtra(
     var in_memory_result = try sema.coerceInMemoryAllowed(block, dest_ty, inst_ty, false, target, dest_ty_src, inst_src, maybe_inst_val);
     if (in_memory_result == .ok) {
         if (maybe_inst_val) |val| {
-            return sema.coerceInMemory(val, dest_ty);
-        }
-        try sema.requireRuntimeBlock(block, inst_src, null);
-        const new_val = try block.addBitCast(dest_ty, inst);
-        try sema.checkKnownAllocPtr(block, inst, new_val);
-        return new_val;
+            return .fromValue(try pt.getCoerced(val, dest_ty));
+        }
+        const coerced: Air.Inst.Ref = switch (in_memory_result.ok) {
+            .none => coerced: {
+                const @"addrspace" = target_util.defaultAddressSpace(zcu.getTarget(), .local);
+                const src_ptr_ty = try pt.ptrType(.{
+                    .child = inst_ty.toIntern(),
+                    .flags = .{ .size = .one, .address_space = @"addrspace" },
+                });
+                const dest_ptr_ty = try pt.ptrType(.{
+                    .child = dest_ty.toIntern(),
+                    .flags = .{ .size = .one, .address_space = @"addrspace" },
+                });
+                const ptr = try block.addTy(.alloc, src_ptr_ty);
+                _ = try block.addBinOp(.store_safe, ptr, inst);
+                const casted_ptr = try block.addTyOp(.ptr_cast, dest_ptr_ty, ptr);
+                break :coerced try block.addTyOp(.load, dest_ty, casted_ptr);
+            },
+            .same_type => unreachable, // we checked for equal types just above
+            .bit_cast => try block.addTyOp(.bit_cast, dest_ty, inst),
+            .ptr_cast => try block.addTyOp(.ptr_cast, dest_ty, inst),
+            .error_cast => try block.addTyOp(.error_cast, dest_ty, inst),
+        };
+        try sema.checkKnownAllocPtr(block, inst, coerced);
+        return coerced;
     }
 
     switch (dest_ty.zigTypeTag(zcu)) {
@@ -27872,8 +27732,8 @@ fn coerceExtra(
 
                 if (dest_info.sentinel != .none) {
                     if (array_ty.sentinel(zcu)) |inst_sent| {
-                        if (Air.internedToRef(dest_info.sentinel) !=
-                            try sema.coerceInMemory(inst_sent, dst_elem_type))
+                        if (dest_info.sentinel !=
+                            (try pt.getCoerced(inst_sent, dst_elem_type)).toIntern())
                         {
                             in_memory_result = .{ .ptr_sentinel = .{
                                 .actual = inst_sent,
@@ -28067,8 +27927,8 @@ fn coerceExtra(
                     }
 
                     if (dest_info.sentinel == .none or inst_info.sentinel == .none or
-                        Air.internedToRef(dest_info.sentinel) !=
-                            try sema.coerceInMemory(Value.fromInterned(inst_info.sentinel), .fromInterned(dest_info.child)))
+                        dest_info.sentinel !=
+                            (try pt.getCoerced(.fromInterned(inst_info.sentinel), .fromInterned(dest_info.child))).toIntern())
                         break :p;
 
                     const slice_ptr = try sema.analyzeSlicePtr(block, inst_src, inst, inst_ty);
@@ -28117,7 +27977,7 @@ fn coerceExtra(
                     (dst_info.signedness == .signed and dst_info.bits > src_info.bits))
                 {
                     try sema.requireRuntimeBlock(block, inst_src, null);
-                    return block.addTyOp(.intcast, dest_ty, inst);
+                    return block.addTyOp(.int_cast, dest_ty, inst);
                 }
             },
             else => {},
@@ -28379,16 +28239,8 @@ fn coerceExtra(
     return sema.failWithOwnedErrorMsg(block, msg);
 }
 
-fn coerceInMemory(
-    sema: *Sema,
-    val: Value,
-    dst_ty: Type,
-) CompileError!Air.Inst.Ref {
-    return Air.internedToRef((try sema.pt.getCoerced(val, dst_ty)).toIntern());
-}
-
 const InMemoryCoercionResult = union(enum) {
-    ok,
+    ok: Strategy,
     no_match: Pair,
     int_not_coercible: Int,
     comptime_int_not_coercible: TypeValuePair,
@@ -28424,6 +28276,21 @@ const InMemoryCoercionResult = union(enum) {
     double_ptr_to_anyopaque: Pair,
     slice_to_anyopaque: Pair,
 
+    /// Payload of the `ok` result: describes how a permitted in-memory coercion of a
+    /// runtime value can be lowered. `coerceExtra` switches on this to choose which AIR
+    /// instruction (if any single one) implements the coercion.
+    const Strategy = enum {
+        /// There isn't a special strategy for this particular coercion---we'll just need to
+        /// reinterpret the bytes in memory.
+        none,
+
+        /// The source and destination types are equal, so no explicit cast operation is necessary.
+        same_type,
+        /// The coercion can be lowered to `Air.Inst.Tag.bit_cast`.
+        bit_cast,
+        /// The coercion can be lowered to `Air.Inst.Tag.ptr_cast`.
+        ptr_cast,
+        /// The coercion can be lowered to `Air.Inst.Tag.error_cast`.
+        error_cast,
+    };
+
     const Pair = struct {
         actual: Type,
         wanted: Type,
@@ -28797,7 +28664,7 @@ pub fn coerceInMemoryAllowed(
     }
 
     if (dest_ty.eql(src_ty))
-        return .ok;
+        return .{ .ok = .same_type };
 
     const dest_tag = dest_ty.zigTypeTag(zcu);
     const src_tag = src_ty.zigTypeTag(zcu);
@@ -28810,7 +28677,7 @@ pub fn coerceInMemoryAllowed(
         if (dest_info.signedness == src_info.signedness and
             dest_info.bits == src_info.bits)
         {
-            return .ok;
+            return .{ .ok = .bit_cast };
         }
 
         if ((src_info.signedness == dest_info.signedness and dest_info.bits < src_info.bits) or
@@ -28818,7 +28685,7 @@ pub fn coerceInMemoryAllowed(
             (dest_info.signedness == .signed and src_info.signedness == .unsigned and dest_info.bits <= src_info.bits) or
             (dest_info.signedness == .unsigned and src_info.signedness == .signed))
         {
-            return InMemoryCoercionResult{ .int_not_coercible = .{
+            return .{ .int_not_coercible = .{
                 .actual_signedness = src_info.signedness,
                 .wanted_signedness = dest_info.signedness,
                 .actual_bits = src_info.bits,
@@ -28841,7 +28708,7 @@ pub fn coerceInMemoryAllowed(
         const dest_bits = dest_ty.floatBits(target);
         const src_bits = src_ty.floatBits(target);
         if (dest_bits == src_bits) {
-            return .ok;
+            return .{ .ok = .bit_cast };
         }
     }
 
@@ -28864,24 +28731,38 @@ pub fn coerceInMemoryAllowed(
     if (dest_tag == .error_union and src_tag == .error_union) {
         const dest_payload = dest_ty.errorUnionPayload(zcu);
         const src_payload = src_ty.errorUnionPayload(zcu);
-        const child = try sema.coerceInMemoryAllowed(block, dest_payload, src_payload, dest_is_mut, target, dest_src, src_src, null);
-        if (child != .ok) {
-            return .{ .error_union_payload = .{
-                .child = try child.dupe(sema.arena),
+        const payload_strat = switch (try sema.coerceInMemoryAllowed(block, dest_payload, src_payload, dest_is_mut, target, dest_src, src_src, null)) {
+            .ok => |strat| strat,
+            else => |payload_result| return .{ .error_union_payload = .{
+                .child = try payload_result.dupe(sema.arena),
                 .actual = src_payload,
                 .wanted = dest_payload,
-            } };
+            } },
+        };
+        switch (try sema.coerceInMemoryAllowed(block, dest_ty.errorUnionSet(zcu), src_ty.errorUnionSet(zcu), dest_is_mut, target, dest_src, src_src, null)) {
+            .ok => {},
+            else => |err_set_result| return err_set_result,
         }
-        return try sema.coerceInMemoryAllowed(block, dest_ty.errorUnionSet(zcu), src_ty.errorUnionSet(zcu), dest_is_mut, target, dest_src, src_src, null);
+        return switch (payload_strat) {
+            .same_type => .{ .ok = .error_cast },
+            else => .{ .ok = .none },
+        };
     }
 
     // Error Sets
     if (dest_tag == .error_set and src_tag == .error_set) {
-        const res1 = try sema.coerceInMemoryAllowedErrorSets(block, dest_ty, src_ty, dest_src, src_src);
-        if (!dest_is_mut or res1 != .ok) return res1;
-        // src -> dest is okay, but `dest_is_mut`, so it needs to be allowed in the other direction.
-        const res2 = try sema.coerceInMemoryAllowedErrorSets(block, src_ty, dest_ty, src_src, dest_src);
-        return res2;
+        switch (try sema.coerceInMemoryAllowedErrorSets(block, dest_ty, src_ty, dest_src, src_src)) {
+            .ok => |strat| assert(strat == .error_cast),
+            else => |result| return result,
+        }
+        if (dest_is_mut) {
+            // src -> dest is okay, but `dest_is_mut`, so it needs to be allowed in the other direction.
+            switch (try sema.coerceInMemoryAllowedErrorSets(block, src_ty, dest_ty, src_src, dest_src)) {
+                .ok => |strat| assert(strat == .error_cast),
+                else => |result| return result,
+            }
+        }
+        return .{ .ok = .error_cast };
     }
 
     // Arrays
@@ -28896,9 +28777,9 @@ pub fn coerceInMemoryAllowed(
         }
 
         const child = try sema.coerceInMemoryAllowed(block, dest_info.elem_type, src_info.elem_type, dest_is_mut, target, dest_src, src_src, null);
-        switch (child) {
-            .ok => {},
-            .no_match => return child,
+        const child_strat = switch (child) {
+            .ok => |strat| strat,
+            .no_match => |no_match| return .{ .no_match = no_match },
             else => {
                 return .{ .array_elem = .{
                     .child = try child.dupe(sema.arena),
@@ -28906,7 +28787,7 @@ pub fn coerceInMemoryAllowed(
                     .wanted = dest_info.elem_type,
                 } };
             },
-        }
+        };
         const ok_sent = (dest_info.sentinel == null and src_info.sentinel == null) or
             (src_info.sentinel != null and
                 dest_info.sentinel != null and
@@ -28922,7 +28803,10 @@ pub fn coerceInMemoryAllowed(
                 .ty = dest_info.elem_type,
             } };
         }
-        return .ok;
+        return .{ .ok = switch (child_strat) {
+            .bit_cast => .bit_cast,
+            else => .none,
+        } };
     }
 
     // Vectors
@@ -28938,16 +28822,18 @@ pub fn coerceInMemoryAllowed(
 
         const dest_elem_ty = dest_ty.scalarType(zcu);
         const src_elem_ty = src_ty.scalarType(zcu);
-        const child = try sema.coerceInMemoryAllowed(block, dest_elem_ty, src_elem_ty, dest_is_mut, target, dest_src, src_src, null);
-        if (child != .ok) {
-            return .{ .vector_elem = .{
-                .child = try child.dupe(sema.arena),
+        switch (try sema.coerceInMemoryAllowed(block, dest_elem_ty, src_elem_ty, dest_is_mut, target, dest_src, src_src, null)) {
+            .ok => |child_strat| return .{ .ok = switch (child_strat) {
+                .bit_cast => .bit_cast,
+                .ptr_cast => .ptr_cast,
+                else => .none,
+            } },
+            else => |child_result| return .{ .vector_elem = .{
+                .child = try child_result.dupe(sema.arena),
                 .actual = src_elem_ty,
                 .wanted = dest_elem_ty,
-            } };
+            } },
         }
-
-        return .ok;
     }
 
     // Optionals
@@ -28971,7 +28857,7 @@ pub fn coerceInMemoryAllowed(
             } };
         }
 
-        return .ok;
+        return .{ .ok = .none };
     }
 
     // Tuples (with in-memory-coercible fields)
@@ -28985,7 +28871,7 @@ pub fn coerceInMemoryAllowed(
             const field = try sema.coerceInMemoryAllowed(block, dest_field_ty, src_field_ty, dest_is_mut, target, dest_src, src_src, null);
             if (field != .ok) break :tuple;
         }
-        return .ok;
+        return .{ .ok = .none };
     }
 
     return .{ .no_match = .{
@@ -29008,13 +28894,13 @@ fn coerceInMemoryAllowedErrorSets(
     const ip = &zcu.intern_pool;
 
     const dest_set: InternPool.Key.ErrorSetType = err_set: switch (dest_ty.toIntern()) {
-        .anyerror_type => return .ok,
+        .anyerror_type => return .{ .ok = .error_cast },
         .adhoc_inferred_error_set_type => {
             // We are trying to coerce an error set to the current function's
             // inferred error set.
             const dst_ies = sema.fn_ret_ty_ies.?;
             try dst_ies.addErrorSet(src_ty, ip, sema.arena);
-            return .ok;
+            return .{ .ok = .error_cast };
         },
         else => |err_set_ty| switch (ip.indexToKey(err_set_ty)) {
             .inferred_error_set_type => |func_index| {
@@ -29023,7 +28909,7 @@ fn coerceInMemoryAllowedErrorSets(
                         // We are trying to coerce an error set to the current function's
                         // inferred error set.
                         try dst_ies.addErrorSet(src_ty, ip, sema.arena);
-                        return .ok;
+                        return .{ .ok = .error_cast };
                     }
                 }
                 try sema.ensureFuncIesResolved(block, dest_src, func_index);
@@ -29062,7 +28948,7 @@ fn coerceInMemoryAllowedErrorSets(
         ) };
     }
 
-    return .ok;
+    return .{ .ok = .error_cast };
 }
 
 fn coerceInMemoryAllowedFns(
@@ -29179,7 +29065,7 @@ fn coerceInMemoryAllowedFns(
         }
     }
 
-    return .ok;
+    return .{ .ok = .none };
 }
 
 fn callconvCoerceAllowed(
@@ -29256,7 +29142,7 @@ fn coerceInMemoryAllowedPtrs(
     const ok_ptr_size = src_info.flags.size == dest_info.flags.size or
         src_info.flags.size == .c or dest_info.flags.size == .c;
     if (!ok_ptr_size) {
-        return InMemoryCoercionResult{ .ptr_size = .{
+        return .{ .ptr_size = .{
             .actual = src_info.flags.size,
             .wanted = dest_info.flags.size,
         } };
@@ -29392,14 +29278,14 @@ fn coerceInMemoryAllowedPtrs(
             break :a dest_child.abiAlignment(zcu);
         } else dest_info.flags.alignment;
         if (dest_align.compare(if (dest_is_mut) .neq else .gt, src_align)) {
-            return InMemoryCoercionResult{ .ptr_alignment = .{
+            return .{ .ptr_alignment = .{
                 .actual = src_align,
                 .wanted = dest_align,
             } };
         }
     }
 
-    return .ok;
+    return .{ .ok = .ptr_cast };
 }
 
 fn coerceVarArgParam(
@@ -29703,7 +29589,6 @@ fn bitCast(
     dest_ty: Type,
     inst: Air.Inst.Ref,
     inst_src: LazySrcLoc,
-    operand_src: ?LazySrcLoc,
 ) CompileError!Air.Inst.Ref {
     const pt = sema.pt;
     const zcu = pt.zcu;
@@ -29712,6 +29597,11 @@ fn bitCast(
     old_ty.assertHasLayout(zcu);
     try sema.ensureLayoutResolved(dest_ty, inst_src, .init);
 
+    assert(old_ty.hasBitRepresentation(zcu));
+    assert(dest_ty.hasBitRepresentation(zcu));
+    assert(old_ty.scalarType(zcu).zigTypeTag(zcu) != .pointer);
+    assert(dest_ty.scalarType(zcu).zigTypeTag(zcu) != .pointer);
+
     const dest_bits = dest_ty.bitSize(zcu);
     const old_bits = old_ty.bitSize(zcu);
 
@@ -29725,20 +29615,30 @@ fn bitCast(
     }
 
     if (sema.resolveValue(inst)) |val| {
-        if (val.isUndef(zcu))
-            return pt.undefRef(dest_ty);
-        if (old_ty.zigTypeTag(zcu) == .error_set and dest_ty.zigTypeTag(zcu) == .error_set) {
-            // Special case: we sometimes call `bitCast` on error set values, but they
-            // don't have a well-defined layout, so we can't use `bitCastVal` on them.
-            return Air.internedToRef((try pt.getCoerced(val, dest_ty)).toIntern());
-        }
-        if (try sema.bitCastVal(val, dest_ty, 0, 0, 0)) |result_val| {
-            return Air.internedToRef(result_val.toIntern());
-        }
+        return .fromValue(try sema.bitCastVal(val, dest_ty));
     }
-    try sema.requireRuntimeBlock(block, inst_src, operand_src);
     try sema.validateRuntimeValue(block, inst_src, inst);
-    return block.addBitCast(dest_ty, inst);
+    return block.addTyOp(.bit_cast, dest_ty, inst);
+}
+
+/// Reinterprets the bits of the comptime-known value `val` as a value of type `dest_ty`.
+///
+/// Supports only types which `@bitCast` supports, so pointers are *not* supported.
+///
+/// Asserts that the type of `val` and `dest_ty` have the same bit size. If `val` is undefined,
+/// the result is the undefined value of `dest_ty`. Otherwise `val` is written into a temporary
+/// buffer (allocated from `sema.arena`) at bit offset 0, and a `dest_ty` value is read back
+/// from that same buffer.
+pub fn bitCastVal(
+    sema: *Sema,
+    val: Value,
+    dest_ty: Type,
+) Allocator.Error!Value {
+    const pt = sema.pt;
+    const zcu = pt.zcu;
+    const bit_size = dest_ty.bitSize(zcu);
+    assert(val.typeOf(zcu).bitSize(zcu) == bit_size);
+    if (val.isUndef(zcu)) {
+        return pt.undefValue(dest_ty);
+    } else {
+        // Round the bit size up to a whole number of bytes for the scratch buffer.
+        const buf = try sema.arena.alloc(u8, @intCast((bit_size + 7) / 8));
+        // NOTE(review): presumably zeroed so that any bits of the final byte not covered by
+        // the write below hold a deterministic value --- confirm against `writeToPackedMemory`.
+        @memset(buf, 0);
+        val.writeToPackedMemory(zcu, buf, 0);
+        return .readFromPackedMemory(dest_ty, pt, buf, 0);
+    }
 }
 
 fn coerceArrayPtrToSlice(
@@ -29855,14 +29755,17 @@ fn coerceCompatiblePtrs(
         );
     }
     try sema.requireRuntimeBlock(block, inst_src, null);
-    const inst_allows_zero = inst_ty.zigTypeTag(zcu) != .pointer or inst_ty.ptrAllowsZero(zcu);
-    if (block.wantSafety() and inst_allows_zero and !dest_ty.ptrAllowsZero(zcu)) {
+    const maybe_zero: bool = switch (inst_ty.toIntern()) {
+        .usize_type, .isize_type => true,
+        else => inst_ty.ptrAllowsZero(zcu),
+    };
+    if (block.wantSafety() and maybe_zero and !dest_ty.ptrAllowsZero(zcu)) {
         try sema.checkLogicalPtrOperation(block, inst_src, inst_ty);
         const actual_ptr = if (inst_ty.isSlice(zcu))
             try sema.analyzeSlicePtr(block, inst_src, inst, inst_ty)
         else
             inst;
-        const ptr_int = try block.addBitCast(.usize, actual_ptr);
+        const ptr_int = try block.addTyOp(.int_from_ptr, .usize, actual_ptr);
         const is_non_zero = try block.addBinOp(.cmp_neq, ptr_int, .zero_usize);
         const ok = if (inst_ty.isSlice(zcu)) ok: {
             const len = try sema.analyzeSliceLen(block, inst_src, inst);
@@ -29871,7 +29774,14 @@ fn coerceCompatiblePtrs(
         } else is_non_zero;
         try sema.addSafetyCheck(block, inst_src, ok, .cast_to_null);
     }
-    const new_ptr = try sema.bitCast(block, dest_ty, inst, inst_src, null);
+    const new_ptr: Air.Inst.Ref = switch (inst_ty.toIntern()) {
+        .usize_type => try block.addTyOp(.ptr_from_int, dest_ty, inst),
+        .isize_type => new_ptr: {
+            const usize_inst = try block.addTyOp(.bit_cast, .usize, inst);
+            break :new_ptr try block.addTyOp(.ptr_from_int, dest_ty, usize_inst);
+        },
+        else => try block.addTyOp(.ptr_cast, dest_ty, inst),
+    };
     try sema.checkKnownAllocPtr(block, inst, new_ptr);
     return new_ptr;
 }
@@ -29968,7 +29878,7 @@ fn coerceEnumToUnion(
             return .fromValue(opv);
         } else {
             // The union layout is just the tag, so we can bitcast the enum straight to the union.
-            return block.addBitCast(union_ty, enum_tag);
+            return block.addTyOp(.union_from_enum, union_ty, enum_tag);
         }
     }
 
@@ -30017,18 +29927,6 @@ fn coerceArrayLike(
     const inst_ty = sema.typeOf(inst);
     const target = zcu.getTarget();
 
-    // try coercion of the whole array
-    const in_memory_result = try sema.coerceInMemoryAllowed(block, dest_ty, inst_ty, false, target, dest_ty_src, inst_src, null);
-    if (in_memory_result == .ok) {
-        if (sema.resolveValue(inst)) |inst_val| {
-            // These types share the same comptime value representation.
-            return sema.coerceInMemory(inst_val, dest_ty);
-        }
-        try sema.requireRuntimeBlock(block, inst_src, null);
-        return block.addBitCast(dest_ty, inst);
-    }
-
-    // otherwise, try element by element
     const inst_len = inst_ty.arrayLen(zcu);
     const dest_len = try sema.usizeCast(block, dest_ty_src, dest_ty.arrayLen(zcu));
     if (dest_len != inst_len) {
@@ -30055,7 +29953,7 @@ fn coerceArrayLike(
                     (dst_info.signedness == .signed and dst_info.bits > src_info.bits))
                 {
                     try sema.requireRuntimeBlock(block, inst_src, null);
-                    return block.addTyOp(.intcast, dest_ty, inst);
+                    return block.addTyOp(.int_cast, dest_ty, inst);
                 }
             },
             .float => if (inst_elem_ty.isRuntimeFloat()) {
@@ -30582,7 +30480,7 @@ fn analyzeRef(
 
     // Cast to the constant pointer type. We do this directly rather than going via `coerce` to
     // avoid errors in the `block.isComptime()` case.
-    return block.addBitCast(ptr_type, alloc);
+    return block.addTyOp(.ptr_cast, ptr_type, alloc);
 }
 
 fn analyzeLoad(
@@ -31334,7 +31232,7 @@ fn analyzeSlice(
 
         const opt_new_ptr_val = sema.resolveValue(new_ptr);
         const new_ptr_val = opt_new_ptr_val orelse {
-            const result = try block.addBitCast(return_ty, new_ptr);
+            const result = try block.addTyOp(.ptr_cast, return_ty, new_ptr);
             if (block.wantSafety()) {
                 // requirement: slicing C ptr is non-null
                 if (ptr_ptr_child_ty.isCPtr(zcu)) {
@@ -34366,8 +34264,8 @@ pub fn flushExports(sema: *Sema) !void {
     }
 }
 
-pub const bitCastVal = @import("Sema/bitcast.zig").bitCast;
-pub const bitCastSpliceVal = @import("Sema/bitcast.zig").bitCastSplice;
+pub const castMemory = @import("Sema/reinterpret.zig").castMemory;
+pub const spliceMemory = @import("Sema/reinterpret.zig").spliceMemory;
 
 const loadComptimePtr = @import("Sema/comptime_ptr_access.zig").loadComptimePtr;
 const ComptimeLoadResult = @import("Sema/comptime_ptr_access.zig").ComptimeLoadResult;
diff --git a/src/Sema/LowerZon.zig b/src/Sema/LowerZon.zig
@@ -815,20 +815,15 @@ fn lowerStruct(self: *LowerZon, node: Zoir.Node.Index, res_ty: Type) !InternPool
         .@"packed" => result: {
             const arena = self.sema.arena;
             const buf = try arena.alloc(u8, @intCast((res_ty.bitSize(zcu) + 7) / 8));
+            @memset(buf, 0);
             var bit_offset: u16 = 0;
             for (field_values) |field_ip| {
                 const field_val: Value = .fromInterned(field_ip);
-                field_val.writeToPackedMemory(zcu, buf, bit_offset) catch |err| switch (err) {
-                    error.ReinterpretDeclRef => unreachable, // bitpack fields cannot be pointers
-                    error.OutOfMemory => |e| return e,
-                };
+                field_val.writeToPackedMemory(zcu, buf, bit_offset);
                 bit_offset += @intCast(field_val.typeOf(zcu).bitSize(zcu));
             }
             assert(bit_offset == res_ty.bitSize(zcu));
-            break :result Value.readFromPackedMemory(res_ty, pt, buf, 0, arena) catch |err| switch (err) {
-                error.IllDefinedMemoryLayout => unreachable, // bitpacks have well-defined layout
-                error.OutOfMemory => |e| return e,
-            };
+            break :result try .readFromPackedMemory(res_ty, pt, buf, 0);
         },
     };
     return result.toIntern();
@@ -981,9 +976,7 @@ fn lowerUnion(self: *LowerZon, node: Zoir.Node.Index, res_ty: Type) !InternPool.
     };
     const result: Value = switch (union_info.layout) {
         .auto, .@"extern" => try pt.unionValue(res_ty, tag, val),
-        .@"packed" => try self.sema.bitCastVal(val, res_ty, 0, 0, 0) orelse {
-            unreachable; // `null` is only possible if the input value contains a pointer, which a packed union cannot.
-        },
+        .@"packed" => try self.sema.bitCastVal(val, res_ty),
     };
     return result.toIntern();
 }
diff --git a/src/Sema/bitcast.zig b/src/Sema/bitcast.zig
@@ -1,774 +0,0 @@
-//! This file contains logic for bit-casting arbitrary values at comptime, including splicing
-//! bits together for comptime stores of bit-pointers. The strategy is to "flatten" values to
-//! a sequence of values in *packed* memory, and then unflatten through a combination of special
-//! cases (particularly for pointers and `undefined` values) and in-memory buffer reinterprets.
-//!
-//! This is a little awkward on big-endian targets, as non-packed datastructures (e.g. `extern struct`)
-//! have their fields reversed when represented as packed memory on such targets.
-
-/// If `host_bits` is `0`, attempts to convert the memory at offset
-/// `byte_offset` into `val` to a non-packed value of type `dest_ty`,
-/// ignoring `bit_offset`.
-///
-/// Otherwise, `byte_offset` is an offset in bytes into `val` to a
-/// non-packed value consisting of `host_bits` bits. A value of type
-/// `dest_ty` will be interpreted at a packed offset of `bit_offset`
-/// into this value.
-///
-/// Returns `null` if the operation must be performed at runtime.
-pub fn bitCast(
-    sema: *Sema,
-    val: Value,
-    dest_ty: Type,
-    byte_offset: u64,
-    host_bits: u64,
-    bit_offset: u64,
-) CompileError!?Value {
-    return bitCastInner(sema, val, dest_ty, byte_offset, host_bits, bit_offset) catch |err| switch (err) {
-        error.ReinterpretDeclRef => return null,
-        error.IllDefinedMemoryLayout => unreachable,
-        error.Unimplemented => @panic("unimplemented bitcast"),
-        else => |e| return e,
-    };
-}
-
-/// Uses bitcasting to splice the value `splice_val` into `val`,
-/// replacing overlapping bits and returning the modified value.
-///
-/// If `host_bits` is `0`, splices `splice_val` at an offset
-/// `byte_offset` bytes into the virtual memory of `val`, ignoring
-/// `bit_offset`.
-///
-/// Otherwise, `byte_offset` is an offset into bytes into `val` to
-/// a non-packed value consisting of `host_bits` bits. The value
-/// `splice_val` will be placed at a packed offset of `bit_offset`
-/// into this value.
-pub fn bitCastSplice(
-    sema: *Sema,
-    val: Value,
-    splice_val: Value,
-    byte_offset: u64,
-    host_bits: u64,
-    bit_offset: u64,
-) CompileError!?Value {
-    return bitCastSpliceInner(sema, val, splice_val, byte_offset, host_bits, bit_offset) catch |err| switch (err) {
-        error.ReinterpretDeclRef => return null,
-        error.IllDefinedMemoryLayout => unreachable,
-        error.Unimplemented => @panic("unimplemented bitcast"),
-        else => |e| return e,
-    };
-}
-
-const BitCastError = CompileError || error{ ReinterpretDeclRef, IllDefinedMemoryLayout, Unimplemented };
-
-fn bitCastInner(
-    sema: *Sema,
-    val: Value,
-    dest_ty: Type,
-    byte_offset: u64,
-    host_bits: u64,
-    bit_offset: u64,
-) BitCastError!Value {
-    const pt = sema.pt;
-    const zcu = pt.zcu;
-    const endian = zcu.getTarget().cpu.arch.endian();
-
-    if (dest_ty.toIntern() == val.typeOf(zcu).toIntern() and bit_offset == 0) {
-        return val;
-    }
-
-    const val_ty = val.typeOf(zcu);
-
-    val_ty.assertHasLayout(zcu);
-    dest_ty.assertHasLayout(zcu);
-
-    assert(val_ty.hasWellDefinedLayout(zcu));
-
-    const abi_pad_bits, const host_pad_bits = if (host_bits > 0)
-        .{ val_ty.abiSize(zcu) * 8 - host_bits, host_bits - val_ty.bitSize(zcu) }
-    else
-        .{ val_ty.abiSize(zcu) * 8 - val_ty.bitSize(zcu), 0 };
-
-    const skip_bits = switch (endian) {
-        .little => bit_offset + byte_offset * 8,
-        .big => if (host_bits > 0)
-            val_ty.abiSize(zcu) * 8 - byte_offset * 8 - host_bits + bit_offset
-        else
-            val_ty.abiSize(zcu) * 8 - byte_offset * 8 - dest_ty.bitSize(zcu),
-    };
-
-    var unpack: UnpackValueBits = .{
-        .pt = sema.pt,
-        .arena = sema.arena,
-        .skip_bits = skip_bits,
-        .remaining_bits = dest_ty.bitSize(zcu),
-        .unpacked = std.array_list.Managed(InternPool.Index).init(sema.arena),
-    };
-    switch (endian) {
-        .little => {
-            try unpack.add(val);
-            try unpack.padding(abi_pad_bits);
-        },
-        .big => {
-            try unpack.padding(abi_pad_bits);
-            try unpack.add(val);
-        },
-    }
-    try unpack.padding(host_pad_bits);
-
-    var pack: PackValueBits = .{
-        .pt = sema.pt,
-        .arena = sema.arena,
-        .unpacked = unpack.unpacked.items,
-    };
-    return pack.get(dest_ty);
-}
-
-fn bitCastSpliceInner(
-    sema: *Sema,
-    val: Value,
-    splice_val: Value,
-    byte_offset: u64,
-    host_bits: u64,
-    bit_offset: u64,
-) BitCastError!Value {
-    const pt = sema.pt;
-    const zcu = pt.zcu;
-    const endian = zcu.getTarget().cpu.arch.endian();
-    const val_ty = val.typeOf(zcu);
-    const splice_val_ty = splice_val.typeOf(zcu);
-
-    val_ty.assertHasLayout(zcu);
-    splice_val_ty.assertHasLayout(zcu);
-
-    const splice_bits = splice_val_ty.bitSize(zcu);
-
-    const splice_offset = switch (endian) {
-        .little => bit_offset + byte_offset * 8,
-        .big => if (host_bits > 0)
-            val_ty.abiSize(zcu) * 8 - byte_offset * 8 - host_bits + bit_offset
-        else
-            val_ty.abiSize(zcu) * 8 - byte_offset * 8 - splice_bits,
-    };
-
-    assert(splice_offset + splice_bits <= val_ty.abiSize(zcu) * 8);
-
-    const abi_pad_bits, const host_pad_bits = if (host_bits > 0)
-        .{ val_ty.abiSize(zcu) * 8 - host_bits, host_bits - val_ty.bitSize(zcu) }
-    else
-        .{ val_ty.abiSize(zcu) * 8 - val_ty.bitSize(zcu), 0 };
-
-    var unpack: UnpackValueBits = .{
-        .pt = pt,
-        .arena = sema.arena,
-        .skip_bits = 0,
-        .remaining_bits = splice_offset,
-        .unpacked = std.array_list.Managed(InternPool.Index).init(sema.arena),
-    };
-    switch (endian) {
-        .little => {
-            try unpack.add(val);
-            try unpack.padding(abi_pad_bits);
-        },
-        .big => {
-            try unpack.padding(abi_pad_bits);
-            try unpack.add(val);
-        },
-    }
-    try unpack.padding(host_pad_bits);
-
-    unpack.remaining_bits = splice_bits;
-    try unpack.add(splice_val);
-
-    unpack.skip_bits = splice_offset + splice_bits;
-    unpack.remaining_bits = val_ty.abiSize(zcu) * 8 - splice_offset - splice_bits;
-    switch (endian) {
-        .little => {
-            try unpack.add(val);
-            try unpack.padding(abi_pad_bits);
-        },
-        .big => {
-            try unpack.padding(abi_pad_bits);
-            try unpack.add(val);
-        },
-    }
-    try unpack.padding(host_pad_bits);
-
-    var pack: PackValueBits = .{
-        .pt = pt,
-        .arena = sema.arena,
-        .unpacked = unpack.unpacked.items,
-    };
-    switch (endian) {
-        .little => {},
-        .big => try pack.padding(abi_pad_bits),
-    }
-    return pack.get(val_ty);
-}
-
-/// Recurses through struct fields, array elements, etc, to get a sequence of "primitive" values
-/// which are bit-packed in memory to represent a single value. `unpacked` represents a series
-/// of values in *packed* memory - therefore, on big-endian targets, the first element of this
-/// list contains bits from the *final* byte of the value.
-const UnpackValueBits = struct {
-    pt: Zcu.PerThread,
-    arena: Allocator,
-    skip_bits: u64,
-    remaining_bits: u64,
-    extra_bits: u64 = undefined,
-    unpacked: std.array_list.Managed(InternPool.Index),
-
-    fn add(unpack: *UnpackValueBits, val: Value) BitCastError!void {
-        const pt = unpack.pt;
-        const zcu = pt.zcu;
-        const endian = zcu.getTarget().cpu.arch.endian();
-        const ip = &zcu.intern_pool;
-
-        if (unpack.remaining_bits == 0) {
-            return;
-        }
-
-        const ty = val.typeOf(zcu);
-        const bit_size = ty.bitSize(zcu);
-
-        if (unpack.skip_bits >= bit_size) {
-            unpack.skip_bits -= bit_size;
-            return;
-        }
-
-        switch (ip.indexToKey(val.toIntern())) {
-            .int_type,
-            .ptr_type,
-            .array_type,
-            .vector_type,
-            .opt_type,
-            .anyframe_type,
-            .error_union_type,
-            .simple_type,
-            .struct_type,
-            .tuple_type,
-            .union_type,
-            .opaque_type,
-            .spirv_type,
-            .enum_type,
-            .func_type,
-            .error_set_type,
-            .inferred_error_set_type,
-            .@"extern",
-            .func,
-            .err,
-            .error_union,
-            .enum_literal,
-            .slice,
-            .memoized_call,
-            => unreachable, // ill-defined layout or not real values
-
-            .undef,
-            .int,
-            .enum_tag,
-            .simple_value,
-            .float,
-            .ptr,
-            .opt,
-            => try unpack.primitive(val),
-
-            .bitpack => |bitpack| try unpack.primitive(.fromInterned(bitpack.backing_int_val)),
-
-            .aggregate => switch (ty.zigTypeTag(zcu)) {
-                .vector => {
-                    const len: usize = @intCast(ty.arrayLen(zcu));
-                    for (0..len) |i| {
-                        // We reverse vector elements in packed memory on BE targets.
-                        const real_idx = switch (endian) {
-                            .little => i,
-                            .big => len - i - 1,
-                        };
-                        const elem_val = try val.elemValue(pt, real_idx);
-                        try unpack.add(elem_val);
-                    }
-                },
-                .array => {
-                    // Each element is padded up to its ABI size. Padding bits are undefined.
-                    // The final element does not have trailing padding.
-                    // Elements are reversed in packed memory on BE targets.
-                    const elem_ty = ty.childType(zcu);
-                    const pad_bits = elem_ty.abiSize(zcu) * 8 - elem_ty.bitSize(zcu);
-                    const len = ty.arrayLen(zcu);
-                    const maybe_sent = ty.sentinel(zcu);
-
-                    if (endian == .big) if (maybe_sent) |s| {
-                        try unpack.add(s);
-                        if (len != 0) try unpack.padding(pad_bits);
-                    };
-
-                    for (0..@intCast(len)) |i| {
-                        // We reverse array elements in packed memory on BE targets.
-                        const real_idx = switch (endian) {
-                            .little => i,
-                            .big => len - i - 1,
-                        };
-                        const elem_val = try val.elemValue(pt, @intCast(real_idx));
-                        try unpack.add(elem_val);
-                        if (i != len - 1) try unpack.padding(pad_bits);
-                    }
-
-                    if (endian == .little) if (maybe_sent) |s| {
-                        if (len != 0) try unpack.padding(pad_bits);
-                        try unpack.add(s);
-                    };
-                },
-                .@"struct" => switch (ty.containerLayout(zcu)) {
-                    .auto => unreachable, // ill-defined layout
-                    .@"extern" => switch (endian) {
-                        .little => {
-                            var cur_bit_off: u64 = 0;
-                            var it = zcu.typeToStruct(ty).?.iterateRuntimeOrder(ip);
-                            while (it.next()) |field_idx| {
-                                const want_bit_off = ty.structFieldOffset(field_idx, zcu) * 8;
-                                const pad_bits = want_bit_off - cur_bit_off;
-                                const field_val = try val.fieldValue(pt, field_idx);
-                                try unpack.padding(pad_bits);
-                                try unpack.add(field_val);
-                                cur_bit_off = want_bit_off + field_val.typeOf(zcu).bitSize(zcu);
-                            }
-                            // Add trailing padding bits.
-                            try unpack.padding(bit_size - cur_bit_off);
-                        },
-                        .big => {
-                            var cur_bit_off: u64 = bit_size;
-                            var it = zcu.typeToStruct(ty).?.iterateRuntimeOrderReverse(ip);
-                            while (it.next()) |field_idx| {
-                                const field_val = try val.fieldValue(pt, field_idx);
-                                const field_ty = field_val.typeOf(zcu);
-                                const want_bit_off = ty.structFieldOffset(field_idx, zcu) * 8 + field_ty.bitSize(zcu);
-                                const pad_bits = cur_bit_off - want_bit_off;
-                                try unpack.padding(pad_bits);
-                                try unpack.add(field_val);
-                                cur_bit_off = want_bit_off - field_ty.bitSize(zcu);
-                            }
-                            assert(cur_bit_off == 0);
-                        },
-                    },
-                    .@"packed" => {
-                        // Just add all fields in order. There are no padding bits.
-                        // This is identical between LE and BE targets.
-                        for (0..ty.structFieldCount(zcu)) |i| {
-                            const field_val = try val.fieldValue(pt, i);
-                            try unpack.add(field_val);
-                        }
-                    },
-                },
-                else => unreachable,
-            },
-
-            .un => |un| {
-                // We actually don't care about the tag here!
-                // Instead, we just need to write the payload value, plus any necessary padding.
-                // This correctly handles the case where `tag == .none`, since the payload is then
-                // either an integer or a byte array, both of which we can unpack.
-                const payload_val = Value.fromInterned(un.val);
-                const pad_bits = bit_size - payload_val.typeOf(zcu).bitSize(zcu);
-                if (endian == .little or ty.containerLayout(zcu) == .@"packed") {
-                    try unpack.add(payload_val);
-                    try unpack.padding(pad_bits);
-                } else {
-                    try unpack.padding(pad_bits);
-                    try unpack.add(payload_val);
-                }
-            },
-        }
-    }
-
-    fn padding(unpack: *UnpackValueBits, pad_bits: u64) BitCastError!void {
-        if (pad_bits == 0) return;
-        const pt = unpack.pt;
-        // Figure out how many full bytes and leftover bits there are.
-        const bytes = pad_bits / 8;
-        const bits = pad_bits % 8;
-        // Add undef u8 values for the bytes...
-        const undef_u8 = try pt.undefValue(Type.u8);
-        for (0..@intCast(bytes)) |_| {
-            try unpack.primitive(undef_u8);
-        }
-        // ...and an undef int for the leftover bits.
-        if (bits == 0) return;
-        const bits_ty = try pt.intType(.unsigned, @intCast(bits));
-        const bits_val = try pt.undefValue(bits_ty);
-        try unpack.primitive(bits_val);
-    }
-
-    fn primitive(unpack: *UnpackValueBits, val: Value) BitCastError!void {
-        const pt = unpack.pt;
-        const zcu = pt.zcu;
-
-        if (unpack.remaining_bits == 0) {
-            return;
-        }
-
-        const ty = val.typeOf(pt.zcu);
-        const bit_size = ty.bitSize(zcu);
-
-        // Note that this skips all zero-bit types.
-        if (unpack.skip_bits >= bit_size) {
-            unpack.skip_bits -= bit_size;
-            return;
-        }
-
-        if (unpack.skip_bits > 0) {
-            const skip = unpack.skip_bits;
-            unpack.skip_bits = 0;
-            return unpack.splitPrimitive(val, skip, bit_size - skip);
-        }
-
-        if (unpack.remaining_bits < bit_size) {
-            return unpack.splitPrimitive(val, 0, unpack.remaining_bits);
-        }
-
-        unpack.remaining_bits -|= bit_size;
-
-        try unpack.unpacked.append(val.toIntern());
-    }
-
-    fn splitPrimitive(unpack: *UnpackValueBits, val: Value, bit_offset: u64, bit_count: u64) BitCastError!void {
-        const pt = unpack.pt;
-        const zcu = pt.zcu;
-        const ty = val.typeOf(pt.zcu);
-
-        const val_bits = ty.bitSize(zcu);
-        assert(bit_offset + bit_count <= val_bits);
-
-        switch (pt.zcu.intern_pool.indexToKey(val.toIntern())) {
-            // In the `ptr` case, this will return `error.ReinterpretDeclRef`
-            // if we're trying to split a non-integer pointer value.
-            .int, .float, .enum_tag, .ptr, .opt => {
-                // This @intCast is okay because no primitive can exceed the size of a u16.
-                const int_ty = try unpack.pt.intType(.unsigned, @intCast(bit_count));
-                const buf = try unpack.arena.alloc(u8, @intCast((val_bits + 7) / 8));
-                try val.writeToPackedMemory(zcu, buf, 0);
-                const sub_val = try Value.readFromPackedMemory(int_ty, unpack.pt, buf, @intCast(bit_offset), unpack.arena);
-                try unpack.primitive(sub_val);
-            },
-            .undef => try unpack.padding(bit_count),
-            // The only values here with runtime bits are `true` and `false.
-            // These are both 1 bit, so will never need truncating.
-            .simple_value => unreachable,
-            else => unreachable, // zero-bit or not primitives
-        }
-    }
-};
-
-/// Given a sequence of bit-packed values in packed memory (see `UnpackValueBits`),
-/// reconstructs a value of an arbitrary type, with correct handling of `undefined`
-/// values and of pointers which align in virtual memory.
-const PackValueBits = struct {
-    pt: Zcu.PerThread,
-    arena: Allocator,
-    bit_offset: u64 = 0,
-    unpacked: []const InternPool.Index,
-
-    fn get(pack: *PackValueBits, ty: Type) BitCastError!Value {
-        const pt = pack.pt;
-        const zcu = pt.zcu;
-        const endian = zcu.getTarget().cpu.arch.endian();
-        const ip = &zcu.intern_pool;
-        const arena = pack.arena;
-        switch (ty.zigTypeTag(zcu)) {
-            .vector => {
-                // Elements are bit-packed.
-                const len = ty.arrayLen(zcu);
-                const elem_ty = ty.childType(zcu);
-                const elems = try arena.alloc(InternPool.Index, @intCast(len));
-                // We reverse vector elements in packed memory on BE targets.
-                switch (endian) {
-                    .little => for (elems) |*elem| {
-                        elem.* = (try pack.get(elem_ty)).toIntern();
-                    },
-                    .big => {
-                        var i = elems.len;
-                        while (i > 0) {
-                            i -= 1;
-                            elems[i] = (try pack.get(elem_ty)).toIntern();
-                        }
-                    },
-                }
-                return pt.aggregateValue(ty, elems);
-            },
-            .array => {
-                // Each element is padded up to its ABI size. The final element does not have trailing padding.
-                const len = ty.arrayLen(zcu);
-                const elem_ty = ty.childType(zcu);
-                const maybe_sent = ty.sentinel(zcu);
-                const pad_bits = elem_ty.abiSize(zcu) * 8 - elem_ty.bitSize(zcu);
-                const elems = try arena.alloc(InternPool.Index, @intCast(len));
-
-                if (endian == .big and maybe_sent != null) {
-                    // TODO: validate sentinel was preserved!
-                    try pack.padding(elem_ty.bitSize(zcu));
-                    if (len != 0) try pack.padding(pad_bits);
-                }
-
-                for (0..elems.len) |i| {
-                    const real_idx = switch (endian) {
-                        .little => i,
-                        .big => len - i - 1,
-                    };
-                    elems[@intCast(real_idx)] = (try pack.get(elem_ty)).toIntern();
-                    if (i != len - 1) try pack.padding(pad_bits);
-                }
-
-                if (endian == .little and maybe_sent != null) {
-                    // TODO: validate sentinel was preserved!
-                    if (len != 0) try pack.padding(pad_bits);
-                    try pack.padding(elem_ty.bitSize(zcu));
-                }
-
-                return pt.aggregateValue(ty, elems);
-            },
-            .@"struct" => switch (ty.containerLayout(zcu)) {
-                .auto => unreachable, // ill-defined layout
-                .@"extern" => {
-                    const elems = try arena.alloc(InternPool.Index, ty.structFieldCount(zcu));
-                    @memset(elems, .none);
-                    switch (endian) {
-                        .little => {
-                            var cur_bit_off: u64 = 0;
-                            var it = zcu.typeToStruct(ty).?.iterateRuntimeOrder(ip);
-                            while (it.next()) |field_idx| {
-                                const want_bit_off = ty.structFieldOffset(field_idx, zcu) * 8;
-                                try pack.padding(want_bit_off - cur_bit_off);
-                                const field_ty = ty.fieldType(field_idx, zcu);
-                                elems[field_idx] = (try pack.get(field_ty)).toIntern();
-                                cur_bit_off = want_bit_off + field_ty.bitSize(zcu);
-                            }
-                            try pack.padding(ty.bitSize(zcu) - cur_bit_off);
-                        },
-                        .big => {
-                            var cur_bit_off: u64 = ty.bitSize(zcu);
-                            var it = zcu.typeToStruct(ty).?.iterateRuntimeOrderReverse(ip);
-                            while (it.next()) |field_idx| {
-                                const field_ty = ty.fieldType(field_idx, zcu);
-                                const want_bit_off = ty.structFieldOffset(field_idx, zcu) * 8 + field_ty.bitSize(zcu);
-                                try pack.padding(cur_bit_off - want_bit_off);
-                                elems[field_idx] = (try pack.get(field_ty)).toIntern();
-                                cur_bit_off = want_bit_off - field_ty.bitSize(zcu);
-                            }
-                            assert(cur_bit_off == 0);
-                        },
-                    }
-                    // Any fields which do not have runtime bits should be OPV or comptime fields.
-                    // Fill those values now.
-                    for (elems, 0..) |*elem, field_idx| {
-                        if (elem.* != .none) continue;
-                        const val = (try ty.structFieldValueComptime(pt, field_idx)).?;
-                        elem.* = val.toIntern();
-                    }
-                    return pt.aggregateValue(ty, elems);
-                },
-                .@"packed" => {
-                    const backing_int_val = try pack.primitive(ty.bitpackBackingInt(zcu));
-                    if (backing_int_val.isUndef(zcu)) return pt.undefValue(ty);
-                    return pt.bitpackValue(ty, backing_int_val);
-                },
-            },
-            .@"union" => switch (ty.containerLayout(zcu)) {
-                .auto => unreachable, // ill-defined layout
-                .@"extern" => {
-                    // We will attempt to read as the backing representation. If this emits
-                    // `error.ReinterpretDeclRef`, we will try each union field, preferring larger ones.
-                    // We will also attempt smaller fields when we get `undefined`, as if some bits are
-                    // defined we want to include them.
-                    // TODO: this is very very bad. We need a more sophisticated union representation.
-
-                    const prev_unpacked = pack.unpacked;
-                    const prev_bit_offset = pack.bit_offset;
-
-                    const backing_ty = try ty.externUnionBackingType(pt);
-
-                    backing: {
-                        const backing_val = pack.get(backing_ty) catch |err| switch (err) {
-                            error.ReinterpretDeclRef => {
-                                pack.unpacked = prev_unpacked;
-                                pack.bit_offset = prev_bit_offset;
-                                break :backing;
-                            },
-                            else => |e| return e,
-                        };
-                        if (backing_val.isUndef(zcu)) {
-                            pack.unpacked = prev_unpacked;
-                            pack.bit_offset = prev_bit_offset;
-                            break :backing;
-                        }
-                        return Value.fromInterned(try pt.internUnion(.{
-                            .ty = ty.toIntern(),
-                            .tag = .none,
-                            .val = backing_val.toIntern(),
-                        }));
-                    }
-
-                    const field_order = try pack.arena.alloc(u32, ty.unionTagTypeHypothetical(zcu).enumFieldCount(zcu));
-                    for (field_order, 0..) |*f, i| f.* = @intCast(i);
-                    // Sort `field_order` to put the fields with the largest bit sizes first.
-                    const SizeSortCtx = struct {
-                        zcu: *Zcu,
-                        field_types: []const InternPool.Index,
-                        fn lessThan(ctx: @This(), a_idx: u32, b_idx: u32) bool {
-                            const a_ty = Type.fromInterned(ctx.field_types[a_idx]);
-                            const b_ty = Type.fromInterned(ctx.field_types[b_idx]);
-                            return a_ty.bitSize(ctx.zcu) > b_ty.bitSize(ctx.zcu);
-                        }
-                    };
-                    std.mem.sortUnstable(u32, field_order, SizeSortCtx{
-                        .zcu = zcu,
-                        .field_types = zcu.typeToUnion(ty).?.field_types.get(ip),
-                    }, SizeSortCtx.lessThan);
-
-                    const padding_after = endian == .little or ty.containerLayout(zcu) == .@"packed";
-
-                    for (field_order) |field_idx| {
-                        const field_ty = Type.fromInterned(zcu.typeToUnion(ty).?.field_types.get(ip)[field_idx]);
-                        const pad_bits = ty.bitSize(zcu) - field_ty.bitSize(zcu);
-                        if (!padding_after) try pack.padding(pad_bits);
-                        const field_val = pack.get(field_ty) catch |err| switch (err) {
-                            error.ReinterpretDeclRef => {
-                                pack.unpacked = prev_unpacked;
-                                pack.bit_offset = prev_bit_offset;
-                                continue;
-                            },
-                            else => |e| return e,
-                        };
-                        if (padding_after) try pack.padding(pad_bits);
-                        if (field_val.isUndef(zcu)) {
-                            pack.unpacked = prev_unpacked;
-                            pack.bit_offset = prev_bit_offset;
-                            continue;
-                        }
-                        const tag_val = try pt.enumValueFieldIndex(ty.unionTagTypeHypothetical(zcu), field_idx);
-                        return Value.fromInterned(try pt.internUnion(.{
-                            .ty = ty.toIntern(),
-                            .tag = tag_val.toIntern(),
-                            .val = field_val.toIntern(),
-                        }));
-                    }
-
-                    // No field could represent the value. Just do whatever happens when we try to read
-                    // the backing type - either `undefined` or `error.ReinterpretDeclRef`.
-                    const backing_val = try pack.get(backing_ty);
-                    return Value.fromInterned(try pt.internUnion(.{
-                        .ty = ty.toIntern(),
-                        .tag = .none,
-                        .val = backing_val.toIntern(),
-                    }));
-                },
-                .@"packed" => {
-                    const backing_int_val = try pack.primitive(ty.bitpackBackingInt(zcu));
-                    if (backing_int_val.isUndef(zcu)) return pt.undefValue(ty);
-                    return pt.bitpackValue(ty, backing_int_val);
-                },
-            },
-            else => return pack.primitive(ty),
-        }
-    }
-
-    fn padding(pack: *PackValueBits, pad_bits: u64) BitCastError!void {
-        _ = pack.prepareBits(pad_bits);
-    }
-
-    fn primitive(pack: *PackValueBits, want_ty: Type) BitCastError!Value {
-        const pt = pack.pt;
-        const zcu = pt.zcu;
-
-        if (try want_ty.onePossibleValue(pt)) |opv| return opv;
-
-        const vals, const bit_offset = pack.prepareBits(want_ty.bitSize(zcu));
-
-        for (vals) |val| {
-            if (!Value.fromInterned(val).isUndef(zcu)) break;
-        } else {
-            // All bits of the value are `undefined`.
-            return pt.undefValue(want_ty);
-        }
-
-        // TODO: we need to decide how to handle partially-undef values here.
-        // Currently, a value with some undefined bits becomes `0xAA` so that we
-        // preserve the well-defined bits, because we can't currently represent
-        // a partially-undefined primitive (e.g. an int with some undef bits).
-        // In future, we probably want to take one of these two routes:
-        // * Define that if any bits are `undefined`, the entire value is `undefined`.
-        //   This is a major breaking change, and probably a footgun.
-        // * Introduce tracking for partially-undef values at comptime.
-        //   This would complicate a lot of operations in Sema, such as basic
-        //   arithmetic.
-        // This design complexity is tracked by #19634.
-
-        ptr_cast: {
-            if (vals.len != 1) break :ptr_cast;
-            const val = Value.fromInterned(vals[0]);
-            if (!val.typeOf(zcu).isPtrAtRuntime(zcu)) break :ptr_cast;
-            if (!want_ty.isPtrAtRuntime(zcu)) break :ptr_cast;
-            return pt.getCoerced(val, want_ty);
-        }
-
-        // Reinterpret via an in-memory buffer.
-
-        var buf_bits: u64 = 0;
-        for (vals) |ip_val| {
-            const val = Value.fromInterned(ip_val);
-            const ty = val.typeOf(pt.zcu);
-            buf_bits += ty.bitSize(zcu);
-        }
-
-        const buf = try pack.arena.alloc(u8, @intCast((buf_bits + 7) / 8));
-        // We will skip writing undefined values, so mark the buffer as `0xAA` so we get "undefined" bits.
-        @memset(buf, 0xAA);
-        var cur_bit_off: usize = 0;
-        for (vals) |ip_val| {
-            const val = Value.fromInterned(ip_val);
-            const ty = val.typeOf(zcu);
-            if (!val.isUndef(zcu)) {
-                try val.writeToPackedMemory(zcu, buf, cur_bit_off);
-            }
-            cur_bit_off += @intCast(ty.bitSize(zcu));
-        }
-
-        return Value.readFromPackedMemory(want_ty, pt, buf, @intCast(bit_offset), pack.arena);
-    }
-
-    fn prepareBits(pack: *PackValueBits, need_bits: u64) struct { []const InternPool.Index, u64 } {
-        if (need_bits == 0) return .{ &.{}, 0 };
-
-        const pt = pack.pt;
-        const zcu = pt.zcu;
-
-        var bits: u64 = 0;
-        var len: usize = 0;
-        while (bits < pack.bit_offset + need_bits) {
-            bits += Value.fromInterned(pack.unpacked[len]).typeOf(pt.zcu).bitSize(zcu);
-            len += 1;
-        }
-
-        const result_vals = pack.unpacked[0..len];
-        const result_offset = pack.bit_offset;
-
-        const extra_bits = bits - pack.bit_offset - need_bits;
-        if (extra_bits == 0) {
-            pack.unpacked = pack.unpacked[len..];
-            pack.bit_offset = 0;
-        } else {
-            pack.unpacked = pack.unpacked[len - 1 ..];
-            pack.bit_offset = Value.fromInterned(pack.unpacked[0]).typeOf(pt.zcu).bitSize(zcu) - extra_bits;
-        }
-
-        return .{ result_vals, result_offset };
-    }
-};
-
-const std = @import("std");
-const Allocator = std.mem.Allocator;
-const assert = std.debug.assert;
-
-const Sema = @import("../Sema.zig");
-const Zcu = @import("../Zcu.zig");
-const InternPool = @import("../InternPool.zig");
-const Type = @import("../Type.zig");
-const Value = @import("../Value.zig");
-const CompileError = Zcu.CompileError;
diff --git a/src/Sema/comptime_ptr_access.zig b/src/Sema/comptime_ptr_access.zig
@@ -14,27 +14,46 @@ pub const ComptimeLoadResult = union(enum) {
 pub fn loadComptimePtr(sema: *Sema, block: *Block, src: LazySrcLoc, ptr: Value) !ComptimeLoadResult {
     const pt = sema.pt;
     const zcu = pt.zcu;
+
     const ptr_info = ptr.typeOf(pt.zcu).ptrInfo(pt.zcu);
-    // TODO: host size for vectors is terrible
-    const host_bits = switch (ptr_info.flags.vector_index) {
-        .none => ptr_info.packed_offset.host_size * 8,
-        else => ptr_info.packed_offset.host_size * Type.fromInterned(ptr_info.child).bitSize(zcu),
-    };
-    const bit_offset = if (host_bits != 0) bit_offset: {
-        const child_bits = Type.fromInterned(ptr_info.child).bitSize(zcu);
-        const bit_offset = ptr_info.packed_offset.bit_offset + switch (ptr_info.flags.vector_index) {
-            .none => 0,
-            else => |idx| switch (pt.zcu.getTarget().cpu.arch.endian()) {
-                .little => child_bits * @intFromEnum(idx),
-                .big => host_bits - child_bits * (@intFromEnum(idx) + 1), // element order reversed on big endian
-            },
-        };
-        if (child_bits + bit_offset > host_bits) {
+    const elem_ty: Type = .fromInterned(ptr_info.child);
+    const host_size = ptr_info.packed_offset.host_size;
+
+    if (host_size == 0) {
+        return loadComptimePtrInner(sema, block, src, ptr, elem_ty, 0);
+    }
+
+    assert(elem_ty.hasBitRepresentation(zcu));
+    if (ptr_info.flags.vector_index == .none) {
+        if (ptr_info.packed_offset.bit_offset + elem_ty.bitSize(zcu) > host_size * 8) {
             return .exceeds_host_size;
         }
-        break :bit_offset bit_offset;
-    } else 0;
-    return loadComptimePtrInner(sema, block, src, ptr, bit_offset, host_bits, Type.fromInterned(ptr_info.child), 0);
+        const load_ty: Type = try pt.intType(.unsigned, host_size * 8);
+        const backing_int_mv = switch (try loadComptimePtrInner(sema, block, src, ptr, load_ty, 0)) {
+            else => |result| return result,
+            .success => |mv| mv,
+        };
+        const backing_int_val = try backing_int_mv.intern(pt, sema.arena);
+        const buf = try sema.arena.alloc(u8, host_size);
+        @memset(buf, 0);
+        backing_int_val.writeToPackedMemory(zcu, buf, 0);
+        const result_val: Value = try .readFromPackedMemory(elem_ty, pt, buf, ptr_info.packed_offset.bit_offset);
+        return .{ .success = .{ .interned = result_val.toIntern() } };
+    }
+    if (@intFromEnum(ptr_info.flags.vector_index) >= host_size) {
+        return .exceeds_host_size;
+    }
+    const load_ty: Type = try pt.vectorType(.{
+        .len = host_size,
+        .child = elem_ty.toIntern(),
+    });
+    const vector_mv = switch (try loadComptimePtrInner(sema, block, src, ptr, load_ty, 0)) {
+        else => |result| return result,
+        .success => |mv| mv,
+    };
+    const vector_val = try vector_mv.intern(pt, sema.arena);
+    const result_val = try vector_val.elemValue(pt, @intFromEnum(ptr_info.flags.vector_index));
+    return .{ .success = .{ .interned = result_val.toIntern() } };
 }
 
 pub const ComptimeStoreResult = union(enum) {
@@ -52,7 +71,8 @@ pub const ComptimeStoreResult = union(enum) {
 };
 
 /// Perform a comptime load of value `store_val` to a pointer.
-/// The pointer's type is ignored.
+///
+/// Asserts that the type of `store_val` equals the element type of the pointer type.
 pub fn storeComptimePtr(
     sema: *Sema,
     block: *Block,
@@ -62,42 +82,84 @@ pub fn storeComptimePtr(
 ) !ComptimeStoreResult {
     const pt = sema.pt;
     const zcu = pt.zcu;
-    const ptr_info = ptr.typeOf(zcu).ptrInfo(zcu);
-    assert(store_val.typeOf(zcu).toIntern() == ptr_info.child);
 
-    {
-        const store_ty: Type = .fromInterned(ptr_info.child);
-        if (!store_ty.comptimeOnly(zcu) and !store_ty.hasRuntimeBits(zcu)) {
-            // zero-bit store; nothing to do
-            return .success;
-        }
+    const ptr_info = ptr.typeOf(pt.zcu).ptrInfo(pt.zcu);
+    const elem_ty: Type = .fromInterned(ptr_info.child);
+    const host_size = ptr_info.packed_offset.host_size;
+    assert(store_val.typeOf(zcu).toIntern() == elem_ty.toIntern());
+
+    if (host_size == 0) {
+        return storeComptimePtrInner(sema, block, src, ptr, store_val);
     }
 
-    // TODO: host size for vectors is terrible
-    const host_bits = switch (ptr_info.flags.vector_index) {
-        .none => ptr_info.packed_offset.host_size * 8,
-        else => ptr_info.packed_offset.host_size * Type.fromInterned(ptr_info.child).bitSize(zcu),
-    };
-    const bit_offset = ptr_info.packed_offset.bit_offset + switch (ptr_info.flags.vector_index) {
-        .none => 0,
-        else => |idx| switch (zcu.getTarget().cpu.arch.endian()) {
-            .little => Type.fromInterned(ptr_info.child).bitSize(zcu) * @intFromEnum(idx),
-            .big => host_bits - Type.fromInterned(ptr_info.child).bitSize(zcu) * (@intFromEnum(idx) + 1), // element order reversed on big endian
-        },
-    };
-    const pseudo_store_ty = if (host_bits > 0) t: {
-        const need_bits = Type.fromInterned(ptr_info.child).bitSize(zcu);
-        if (need_bits + bit_offset > host_bits) {
+    assert(elem_ty.hasBitRepresentation(zcu));
+    if (ptr_info.flags.vector_index == .none) {
+        if (ptr_info.packed_offset.bit_offset + elem_ty.bitSize(zcu) > host_size * 8) {
             return .exceeds_host_size;
         }
-        break :t try sema.pt.intType(.unsigned, @intCast(host_bits));
-    } else Type.fromInterned(ptr_info.child);
+        const backing_ty: Type = try pt.intType(.unsigned, host_size * 8);
+        const backing_int_mv = switch (try loadComptimePtrInner(sema, block, src, ptr, backing_ty, 0)) {
+            .success => |mv| mv,
+            .runtime_load => return .runtime_store,
+            inline else => |payload, tag| return @unionInit(ComptimeStoreResult, @tagName(tag), payload),
+        };
+        const old_backing_int_val = try backing_int_mv.intern(pt, sema.arena);
+        const buf = try sema.arena.alloc(u8, host_size);
+        @memset(buf, 0);
+        old_backing_int_val.writeToPackedMemory(zcu, buf, 0);
+        // Write the new element...
+        store_val.writeToPackedMemory(zcu, buf, ptr_info.packed_offset.bit_offset);
+        // ...then read the resulting backing integer value...
+        const new_backing_int_val: Value = try .readFromPackedMemory(backing_ty, pt, buf, 0);
+        // ...and store that back into memory
+        return storeComptimePtrInner(sema, block, src, ptr, new_backing_int_val);
+    }
+
+    if (@intFromEnum(ptr_info.flags.vector_index) >= host_size) {
+        return .exceeds_host_size;
+    }
+    const vec_ty: Type = try pt.vectorType(.{
+        .len = host_size,
+        .child = elem_ty.toIntern(),
+    });
+    const vector_mv = switch (try loadComptimePtrInner(sema, block, src, ptr, vec_ty, 0)) {
+        .success => |mv| mv,
+        .runtime_load => return .runtime_store,
+        inline else => |payload, tag| return @unionInit(ComptimeStoreResult, @tagName(tag), payload),
+    };
+    const old_vector_val = try vector_mv.intern(pt, sema.arena);
+    const elems_buf = try sema.arena.alloc(InternPool.Index, host_size);
+    for (elems_buf, 0..) |*elem, elem_index| {
+        const elem_val = try old_vector_val.elemValue(pt, elem_index);
+        elem.* = elem_val.toIntern();
+    }
+    elems_buf[@intFromEnum(ptr_info.flags.vector_index)] = store_val.toIntern();
+    const new_vector_val = try pt.aggregateValue(vec_ty, elems_buf);
+    return storeComptimePtrInner(sema, block, src, ptr, new_vector_val);
+}
 
-    const strat = try prepareComptimePtrStore(sema, block, src, ptr, pseudo_store_ty, 0);
+/// Like `storeComptimePtr`, except ignores the type of `ptr`, instead treating it as a single-item
+/// pointer to `store_val.typeOf(zcu)`.
+fn storeComptimePtrInner(
+    sema: *Sema,
+    block: *Block,
+    src: LazySrcLoc,
+    ptr: Value,
+    store_val: Value,
+) !ComptimeStoreResult {
+    const pt = sema.pt;
+    const zcu = pt.zcu;
+    const store_ty = store_val.typeOf(zcu);
+
+    if (store_ty.classify(zcu) == .one_possible_value) {
+        // zero-bit store; nothing to do
+        return .success;
+    }
+
+    const strat = try prepareComptimePtrStore(sema, block, src, ptr, store_ty, 0);
 
     // Propagate errors and handle comptime fields.
     switch (strat) {
-        .direct, .index, .flat_index, .reinterpret => {},
         .comptime_field => {
             // To "store" to a comptime field, just perform a load of the field
             // and see if the store value matches.
@@ -125,79 +187,60 @@ pub fn storeComptimePtr(
         .inactive_union_field => return .inactive_union_field,
         .needed_well_defined => |ty| return .{ .needed_well_defined = ty },
         .out_of_bounds => |ty| return .{ .out_of_bounds = ty },
-    }
-
-    // Check the store is not inside a runtime condition
-    try checkComptimeVarStore(sema, block, src, strat.alloc());
-
-    if (host_bits == 0) {
-        // We can attempt a direct store depending on the strategy.
-        switch (strat) {
-            .direct => |direct| {
-                const want_ty = direct.val.typeOf(zcu);
-                const coerced_store_val = try pt.getCoerced(store_val, want_ty);
-                direct.val.* = .{ .interned = coerced_store_val.toIntern() };
-                return .success;
-            },
-            .index => |index| {
-                const want_ty = index.val.typeOf(zcu).childType(zcu);
-                const coerced_store_val = try pt.getCoerced(store_val, want_ty);
-                try index.val.setElem(pt, sema.arena, @intCast(index.elem_index), .{ .interned = coerced_store_val.toIntern() });
-                return .success;
-            },
-            .flat_index => |flat| {
-                const store_elems = store_val.typeOf(zcu).arrayBase(zcu)[1];
-                const flat_elems = try sema.arena.alloc(InternPool.Index, @intCast(store_elems));
-                {
-                    var next_idx: u64 = 0;
-                    var skip: u64 = 0;
-                    try flattenArray(sema, .{ .interned = store_val.toIntern() }, &skip, &next_idx, flat_elems);
-                }
-                for (flat_elems, 0..) |elem, idx| {
-                    // TODO: recursiveIndex in a loop does a lot of redundant work!
-                    // Better would be to gather all the store targets into an array.
-                    var index: u64 = flat.flat_elem_index + idx;
-                    const val_ptr, const final_idx = (try recursiveIndex(sema, flat.val, &index)).?;
-                    try val_ptr.setElem(pt, sema.arena, @intCast(final_idx), .{ .interned = elem });
-                }
-                return .success;
-            },
-            .reinterpret => {},
-            else => unreachable,
-        }
-    }
 
-    // Either there is a bit offset, or the strategy required reinterpreting.
-    // Therefore, we must perform a bitcast.
+        .direct => |direct| {
+            try checkComptimeVarStore(sema, block, src, direct.alloc);
+            const want_ty = direct.val.typeOf(zcu);
+            const coerced_store_val = try pt.getCoerced(store_val, want_ty);
+            direct.val.* = .{ .interned = coerced_store_val.toIntern() };
+            return .success;
+        },
 
-    const val_ptr: *MutableValue, const byte_offset: u64 = switch (strat) {
-        .direct => |direct| .{ direct.val, 0 },
-        .index => |index| .{
-            index.val,
-            index.elem_index * index.val.typeOf(zcu).childType(zcu).abiSize(zcu),
+        .index => |index| {
+            try checkComptimeVarStore(sema, block, src, index.alloc);
+            const want_ty = index.val.typeOf(zcu).childType(zcu);
+            const coerced_store_val = try pt.getCoerced(store_val, want_ty);
+            try index.val.setElem(pt, sema.arena, @intCast(index.elem_index), .{ .interned = coerced_store_val.toIntern() });
+            return .success;
         },
-        .flat_index => |flat| .{ flat.val, flat.flat_elem_index * flat.val.typeOf(zcu).arrayBase(zcu)[0].abiSize(zcu) },
-        .reinterpret => |reinterpret| .{ reinterpret.val, reinterpret.byte_offset },
-        else => unreachable,
-    };
 
-    if (!val_ptr.typeOf(zcu).hasWellDefinedLayout(zcu)) {
-        return .{ .needed_well_defined = val_ptr.typeOf(zcu) };
-    }
+        .flat_index => |flat| {
+            try checkComptimeVarStore(sema, block, src, flat.alloc);
+            const store_elems = store_val.typeOf(zcu).arrayBase(zcu)[1];
+            const flat_elems = try sema.arena.alloc(InternPool.Index, @intCast(store_elems));
+            {
+                var next_idx: u64 = 0;
+                var skip: u64 = 0;
+                try flattenArray(sema, .{ .interned = store_val.toIntern() }, &skip, &next_idx, flat_elems);
+            }
+            for (flat_elems, 0..) |elem, idx| {
+                // TODO: recursiveIndex in a loop does a lot of redundant work!
+                // Better would be to gather all the store targets into an array.
+                var index: u64 = flat.flat_elem_index + idx;
+                const val_ptr, const final_idx = (try recursiveIndex(sema, flat.val, &index)).?;
+                try val_ptr.setElem(pt, sema.arena, @intCast(final_idx), .{ .interned = elem });
+            }
+            return .success;
+        },
 
-    if (!store_val.typeOf(zcu).hasWellDefinedLayout(zcu)) {
-        return .{ .needed_well_defined = store_val.typeOf(zcu) };
+        .reinterpret => |reinterpret| {
+            try checkComptimeVarStore(sema, block, src, reinterpret.alloc);
+            if (!reinterpret.val.typeOf(zcu).hasWellDefinedLayout(zcu)) {
+                return .{ .needed_well_defined = reinterpret.val.typeOf(zcu) };
+            }
+            if (!store_ty.hasWellDefinedLayout(zcu)) {
+                return .{ .needed_well_defined = store_ty };
+            }
+            const old_val = try reinterpret.val.intern(pt, sema.arena);
+            const new_val = try sema.spliceMemory(
+                old_val,
+                store_val,
+                reinterpret.byte_offset,
+            ) orelse return .runtime_store;
+            reinterpret.val.* = .{ .interned = new_val.toIntern() };
+            return .success;
+        },
     }
-
-    const new_val = try sema.bitCastSpliceVal(
-        try val_ptr.intern(pt, sema.arena),
-        store_val,
-        byte_offset,
-        host_bits,
-        bit_offset,
-    ) orelse return .runtime_store;
-    val_ptr.* = .{ .interned = new_val.toIntern() };
-    return .success;
 }
 
 /// Perform a comptime load of type `load_ty` from a pointer.
@@ -207,8 +250,6 @@ fn loadComptimePtrInner(
     block: *Block,
     src: LazySrcLoc,
     ptr_val: Value,
-    bit_offset: u64,
-    host_bits: u64,
     load_ty: Type,
     /// If `load_ty` is an array, this is the number of array elements to skip
     /// before `load_ty`. Otherwise, it is ignored and may be `undefined`.
@@ -244,7 +285,7 @@ fn loadComptimePtrInner(
         .eu_payload => |base_ptr_ip| val: {
             const base_ptr = Value.fromInterned(base_ptr_ip);
             const base_ty = base_ptr.typeOf(zcu).childType(zcu);
-            switch (try loadComptimePtrInner(sema, block, src, base_ptr, 0, 0, base_ty, undefined)) {
+            switch (try loadComptimePtrInner(sema, block, src, base_ptr, base_ty, undefined)) {
                 .success => |eu_val| switch (eu_val.unpackErrorUnion(zcu)) {
                     .undef => return .undef,
                     .err => |err| return .{ .err_payload = err },
@@ -256,7 +297,7 @@ fn loadComptimePtrInner(
         .opt_payload => |base_ptr_ip| val: {
             const base_ptr = Value.fromInterned(base_ptr_ip);
             const base_ty = base_ptr.typeOf(zcu).childType(zcu);
-            switch (try loadComptimePtrInner(sema, block, src, base_ptr, 0, 0, base_ty, undefined)) {
+            switch (try loadComptimePtrInner(sema, block, src, base_ptr, base_ty, undefined)) {
                 .success => |eu_val| switch (eu_val.unpackOptional(zcu)) {
                     .undef => return .undef,
                     .null => return .null_payload,
@@ -283,7 +324,7 @@ fn loadComptimePtrInner(
                 .child = base_ty.toIntern(),
             });
 
-            switch (try loadComptimePtrInner(sema, block, src, base_ptr, 0, 0, want_ty, base_index.index)) {
+            switch (try loadComptimePtrInner(sema, block, src, base_ptr, want_ty, base_index.index)) {
                 .success => |arr_val| break :val arr_val,
                 else => |err| return err,
             }
@@ -293,7 +334,7 @@ fn loadComptimePtrInner(
             const base_ty = base_ptr.typeOf(zcu).childType(zcu);
 
             // Field of a slice, or of an auto-layout struct or union.
-            const agg_val = switch (try loadComptimePtrInner(sema, block, src, base_ptr, 0, 0, base_ty, undefined)) {
+            const agg_val = switch (try loadComptimePtrInner(sema, block, src, base_ptr, base_ty, undefined)) {
                 .success => |val| val,
                 else => |err| return err,
             };
@@ -324,7 +365,7 @@ fn loadComptimePtrInner(
         },
     };
 
-    if (ptr.byte_offset == 0 and host_bits == 0) {
+    if (ptr.byte_offset == 0) {
         if (load_ty.zigTypeTag(zcu) != .array or array_offset == 0) {
             if (.ok == try sema.coerceInMemoryAllowed(
                 block,
@@ -343,8 +384,6 @@ fn loadComptimePtrInner(
     }
 
     restructure_array: {
-        if (host_bits != 0) break :restructure_array;
-
         // We might also be changing the length of an array, or restructuring it.
         // e.g. [1][2][3]T -> [3][2]T.
         // This case is important because it's permitted for types with ill-defined layouts.
@@ -402,7 +441,7 @@ fn loadComptimePtrInner(
         cur_offset += load_ty.childType(zcu).abiSize(zcu) * array_offset;
     }
 
-    const need_bytes = if (host_bits > 0) (host_bits + 7) / 8 else load_ty.abiSize(zcu);
+    const need_bytes = load_ty.abiSize(zcu);
 
     if (cur_offset + need_bytes > cur_val.typeOf(zcu).abiSize(zcu)) {
         return .{ .out_of_bounds = cur_val.typeOf(zcu) };
@@ -453,7 +492,7 @@ fn loadComptimePtrInner(
             },
             .@"struct" => switch (cur_ty.containerLayout(zcu)) {
                 .auto => unreachable, // ill-defined layout
-                .@"packed" => break, // let the bitcast logic handle this
+                .@"packed" => break, // let the memory reinterpret logic handle this
                 .@"extern" => for (0..cur_ty.structFieldCount(zcu)) |field_idx| {
                     const start_off = cur_ty.structFieldOffset(field_idx, zcu);
                     const end_off = start_off + cur_ty.fieldType(field_idx, zcu).abiSize(zcu);
@@ -466,9 +505,9 @@ fn loadComptimePtrInner(
             },
             .@"union" => switch (cur_ty.containerLayout(zcu)) {
                 .auto => unreachable, // ill-defined layout
-                .@"packed" => break, // let the bitcast logic handle this
+                .@"packed" => break, // let the memory reinterpret logic handle this
                 .@"extern" => {
-                    // TODO: we have to let bitcast logic handle this for now.
+                    // TODO: we have to let the memory reinterpret logic handle this for now.
                     // Otherwise, we might traverse into a union field which doesn't allow pointers.
                     // Figure out a solution!
                     if (true) break;
@@ -495,27 +534,13 @@ fn loadComptimePtrInner(
 
     // Fast path: check again if we're now at the type we want to load.
     // If so, just return the loaded value.
-    if (cur_offset == 0 and host_bits == 0 and cur_val.typeOf(zcu).toIntern() == load_ty.toIntern()) {
+    if (cur_offset == 0 and cur_val.typeOf(zcu).toIntern() == load_ty.toIntern()) {
         return .{ .success = cur_val };
     }
 
-    var bitcast_src_val = try cur_val.intern(sema.pt, sema.arena);
-
-    if (host_bits != 0) {
-        const src_bit_size = bitcast_src_val.typeOf(zcu).bitSize(zcu);
-        if (src_bit_size > host_bits) {
-            const truncate_ty = try pt.intType(.unsigned, @intCast(host_bits));
-            bitcast_src_val = try pt.getCoerced(bitcast_src_val, truncate_ty);
-        }
-    }
-
-    const result_val = try sema.bitCastVal(
-        bitcast_src_val,
-        load_ty,
-        cur_offset,
-        host_bits,
-        bit_offset,
-    ) orelse return .runtime_load;
+    // Otherwise, use the memory reinterpretation logic to pull out the bytes we need.
+    const reinterpret_val = try cur_val.intern(pt, sema.arena);
+    const result_val = try sema.castMemory(reinterpret_val, load_ty, cur_offset) orelse return .runtime_load;
     return .{ .success = .{ .interned = result_val.toIntern() } };
 }
 
@@ -546,7 +571,7 @@ const ComptimeStoreStrategy = union(enum) {
         val: *MutableValue,
         flat_elem_index: u64,
     },
-    /// This value should be reinterpreted using bitcast logic to perform the
+    /// This value should be reinterpreted using `Sema.spliceMemory` to perform the
     /// store. Only returned if `store_ty` and the type of `val` both have
     /// well-defined layouts.
     reinterpret: struct {
@@ -886,7 +911,7 @@ fn prepareComptimePtrStore(
             },
             .@"struct" => switch (cur_ty.containerLayout(zcu)) {
                 .auto => unreachable, // ill-defined layout
-                .@"packed" => break, // let the bitcast logic handle this
+                .@"packed" => break, // let the memory reinterpret logic handle this
                 .@"extern" => for (0..cur_ty.structFieldCount(zcu)) |field_idx| {
                     const start_off = cur_ty.structFieldOffset(field_idx, zcu);
                     const end_off = start_off + cur_ty.fieldType(field_idx, zcu).abiSize(zcu);
@@ -899,9 +924,9 @@ fn prepareComptimePtrStore(
             },
             .@"union" => switch (cur_ty.containerLayout(zcu)) {
                 .auto => unreachable, // ill-defined layout
-                .@"packed" => break, // let the bitcast logic handle this
+                .@"packed" => break, // let the memory reinterpret logic handle this
                 .@"extern" => {
-                    // TODO: we have to let bitcast logic handle this for now.
+                    // TODO: we have to let the memory reinterpret logic handle this for now.
                     // Otherwise, we might traverse into a union field which doesn't allow pointers.
                     // Figure out a solution!
                     if (true) break;
diff --git a/src/Sema/reinterpret.zig b/src/Sema/reinterpret.zig
@@ -0,0 +1,576 @@
+//! This file contains logic for reinterpreting the in-memory bytes of arbitrary values at comptime,
+//! including splicing bytes into an existing value for comptime stores. The strategy is to "flatten"
+//! a value to a sequence of primitive values in memory order, and then unflatten through a combination
+//! of special cases (particularly for pointers and `undefined` values) and in-memory buffer reinterprets.
+//!
+//! All offsets and lengths in this file are measured in bytes. Padding bytes are represented by
+//! `undefined` values of type `u8`.
+
+/// Reinterprets the memory of `val`, beginning `byte_offset` bytes into it,
+/// as a value of type `dest_ty`. At most `dest_ty.abiSize(zcu)` bytes of
+/// `val` are read.
+///
+/// If `dest_ty` is exactly the type of `val`, `byte_offset` is asserted to
+/// be `0` and `val` is returned unchanged. Otherwise, both types are passed
+/// to `assertHasLayout` before any bytes are read.
+///
+/// Returns `null` if the operation must be performed at runtime, which is
+/// signalled internally by `error.ReinterpretDeclRef`.
+pub fn castMemory(
+    sema: *Sema,
+    val: Value,
+    dest_ty: Type,
+    byte_offset: u64,
+) CompileError!?Value {
+    const pt = sema.pt;
+    const zcu = pt.zcu;
+
+    const val_ty = val.typeOf(zcu);
+
+    if (dest_ty.toIntern() == val_ty.toIntern()) {
+        assert(byte_offset == 0);
+        return val;
+    }
+
+    val_ty.assertHasLayout(zcu);
+    dest_ty.assertHasLayout(zcu);
+    // Flatten the wanted byte range of `val` into a list of primitive values.
+    var unpack: UnpackValueBytes = .{
+        .pt = pt,
+        .arena = sema.arena,
+        .skip_bytes = byte_offset,
+        .remaining_bytes = dest_ty.abiSize(zcu),
+        .unpacked = .init(sema.arena),
+    };
+    unpack.add(val) catch |err| switch (err) {
+        error.ReinterpretDeclRef => return null,
+        error.OutOfMemory => |e| return e,
+    };
+    // Rebuild a value of type `dest_ty` from that list.
+    var pack: PackValueBytes = .{
+        .pt = pt,
+        .arena = sema.arena,
+        .unpacked = unpack.unpacked.items,
+    };
+    return pack.get(dest_ty) catch |err| switch (err) {
+        error.ReinterpretDeclRef => return null,
+        error.OutOfMemory => |e| return e,
+    };
+}
+
+/// Splice the value `splice_val` into `val` at the given `byte_offset`, replacing overlapping bytes
+/// and returning the modified value, or `null` if the operation must be performed at runtime.
+pub fn spliceMemory(
+    sema: *Sema,
+    val: Value,
+    splice_val: Value,
+    byte_offset: u64,
+) CompileError!?Value {
+    const pt = sema.pt;
+    const zcu = pt.zcu;
+    const val_ty = val.typeOf(zcu);
+    const splice_val_ty = splice_val.typeOf(zcu);
+
+    val_ty.assertHasLayout(zcu);
+    splice_val_ty.assertHasLayout(zcu);
+    // First, the bytes of `val` which precede the spliced region.
+    var unpack: UnpackValueBytes = .{
+        .pt = pt,
+        .arena = sema.arena,
+        .skip_bytes = 0,
+        .remaining_bytes = byte_offset,
+        .unpacked = .init(sema.arena),
+    };
+    unpack.add(val) catch |err| switch (err) {
+        error.ReinterpretDeclRef => return null,
+        error.OutOfMemory => |e| return e,
+    };
+    // Next, every byte of `splice_val`.
+    const splice_len = splice_val_ty.abiSize(zcu);
+
+    unpack.remaining_bytes = splice_len;
+    unpack.add(splice_val) catch |err| switch (err) {
+        error.ReinterpretDeclRef => return null,
+        error.OutOfMemory => |e| return e,
+    };
+    // Finally, the bytes of `val` which follow the spliced region. The budget is a byte count.
+    unpack.skip_bytes = byte_offset + splice_len;
+    unpack.remaining_bytes = val_ty.abiSize(zcu) -| (byte_offset + splice_len);
+    unpack.add(val) catch |err| switch (err) {
+        error.ReinterpretDeclRef => return null,
+        error.OutOfMemory => |e| return e,
+    };
+    // Rebuild a value of the original type from the combined byte sequence.
+    var pack: PackValueBytes = .{
+        .pt = pt,
+        .arena = sema.arena,
+        .unpacked = unpack.unpacked.items,
+    };
+    return pack.get(val_ty) catch |err| switch (err) {
+        error.ReinterpretDeclRef => return null,
+        error.OutOfMemory => |e| return e,
+    };
+}
+
+/// Recurses through struct fields, array elements, etc, to get a sequence of "primitive" values
+/// which are laid out back-to-back in memory to represent a single value. The first `skip_bytes`
+/// bytes are dropped, and at most `remaining_bytes` bytes are then appended to `unpacked`; values
+/// straddling either boundary are split into individual `u8` values.
+const UnpackValueBytes = struct {
+    pt: Zcu.PerThread,
+    arena: Allocator,
+    skip_bytes: u64,
+    remaining_bytes: u64,
+    unpacked: std.array_list.Managed(InternPool.Index),
+
+    fn add(unpack: *UnpackValueBytes, val: Value) (error{ReinterpretDeclRef} || Allocator.Error)!void {
+        const pt = unpack.pt;
+        const zcu = pt.zcu;
+        const ip = &zcu.intern_pool;
+        // The requested byte budget is already used up, so nothing more is appended.
+        if (unpack.remaining_bytes == 0) {
+            return;
+        }
+
+        const ty = val.typeOf(zcu);
+        const size = ty.abiSize(zcu);
+        // The whole of `val` lies within the bytes still to be skipped.
+        if (unpack.skip_bytes >= size) {
+            unpack.skip_bytes -= size;
+            return;
+        }
+
+        switch (ip.indexToKey(val.toIntern())) {
+            .int_type,
+            .ptr_type,
+            .array_type,
+            .vector_type,
+            .opt_type,
+            .anyframe_type,
+            .error_union_type,
+            .simple_type,
+            .struct_type,
+            .tuple_type,
+            .union_type,
+            .opaque_type,
+            .spirv_type,
+            .enum_type,
+            .func_type,
+            .error_set_type,
+            .inferred_error_set_type,
+            .@"extern",
+            .func,
+            .err,
+            .error_union,
+            .enum_literal,
+            .slice,
+            .memoized_call,
+            => unreachable, // ill-defined layout or not real values
+
+            .undef,
+            .int,
+            .enum_tag,
+            .simple_value,
+            .float,
+            .ptr,
+            .opt,
+            => try unpack.primitive(val),
+
+            .bitpack => |bitpack| try unpack.primitive(.fromInterned(bitpack.backing_int_val)),
+
+            .aggregate => switch (ty.zigTypeTag(zcu)) {
+                .vector => unreachable, // ill-defined layout
+                .array => {
+                    for (0..@intCast(ty.arrayLen(zcu))) |elem_index| {
+                        const elem_val = try val.elemValue(pt, @intCast(elem_index));
+                        try unpack.add(elem_val);
+                    }
+                    if (ty.sentinel(zcu)) |s| {
+                        try unpack.add(s);
+                    }
+                },
+                .@"struct" => switch (ty.containerLayout(zcu)) {
+                    .auto => unreachable, // ill-defined layout
+                    .@"packed" => unreachable, // uses `.bitpack`, not `.aggregate`
+                    .@"extern" => {
+                        var it = ip.loadStructType(ty.toIntern()).iterateRuntimeOrder(ip);
+                        var offset: u64 = 0;
+                        while (it.next()) |field_index| {
+                            const pad_bytes = ty.structFieldOffset(field_index, zcu) - offset;
+                            const field_val = try val.fieldValue(pt, field_index);
+                            try unpack.padding(pad_bytes);
+                            try unpack.add(field_val);
+                            offset += pad_bytes + field_val.typeOf(zcu).abiSize(zcu);
+                        }
+                        try unpack.padding(size - offset);
+                    },
+                },
+                else => unreachable,
+            },
+
+            .un => |un| {
+                const payload_val = Value.fromInterned(un.val);
+                const pad_bytes = size - payload_val.typeOf(zcu).abiSize(zcu);
+                try unpack.add(payload_val);
+                try unpack.padding(pad_bytes);
+            },
+        }
+    }
+
+    fn padding(unpack: *UnpackValueBytes, num_bytes: u64) Allocator.Error!void {
+        if (num_bytes == 0) return;
+        const undef_u8 = try unpack.pt.undefValue(Type.u8);
+        for (0..@intCast(num_bytes)) |_| {
+            unpack.primitive(undef_u8) catch |err| switch (err) {
+                error.OutOfMemory => |e| return e,
+                error.ReinterpretDeclRef => unreachable, // an `undefined` value is never passed to `writeToMemory`
+            };
+        }
+    }
+
+    fn primitive(unpack: *UnpackValueBytes, val: Value) (error{ReinterpretDeclRef} || Allocator.Error)!void {
+        const pt = unpack.pt;
+        const zcu = pt.zcu;
+
+        if (unpack.remaining_bytes == 0) {
+            return;
+        }
+
+        const ty = val.typeOf(pt.zcu);
+        const size = ty.abiSize(zcu);
+
+        if (unpack.skip_bytes >= size) {
+            unpack.skip_bytes -= size;
+            return;
+        }
+        // `val` straddles the start of the wanted range: drop its first `skip_bytes` bytes.
+        if (unpack.skip_bytes > 0) {
+            const offset = unpack.skip_bytes;
+            unpack.skip_bytes = 0;
+            return unpack.splitPrimitive(val, offset, @min(size - offset, unpack.remaining_bytes));
+        }
+        // `val` straddles the end of the wanted range: keep only its leading bytes.
+        if (unpack.remaining_bytes < size) {
+            return unpack.splitPrimitive(val, 0, unpack.remaining_bytes);
+        }
+        // `val` lies entirely within the wanted range, so it is kept whole.
+        unpack.remaining_bytes -= size;
+        try unpack.unpacked.append(val.toIntern());
+    }
+
+    fn splitPrimitive(unpack: *UnpackValueBytes, val: Value, offset: u64, len: u64) (error{ReinterpretDeclRef} || Allocator.Error)!void {
+        const pt = unpack.pt;
+        const zcu = pt.zcu;
+        const ty = val.typeOf(pt.zcu);
+
+        assert(offset + len <= ty.abiSize(zcu));
+        // Exactly `len` single-byte values are appended below, one per byte of the requested range.
+        try unpack.unpacked.ensureUnusedCapacity(@intCast(len));
+        unpack.remaining_bytes -= len;
+
+        switch (pt.zcu.intern_pool.indexToKey(val.toIntern())) {
+            // In the `ptr` case, this will return `error.ReinterpretDeclRef`
+            // if we're trying to split a non-integer pointer value.
+            .int, .float, .enum_tag, .ptr, .opt => {
+                const buf = try unpack.arena.alloc(u8, @intCast(ty.abiSize(zcu)));
+                val.writeToMemory(zcu, buf) catch |err| switch (err) {
+                    error.IllDefinedMemoryLayout => unreachable,
+                    else => |e| return e,
+                };
+                for (buf[@intCast(offset)..][0..@intCast(len)]) |byte_raw| {
+                    const byte_val = try pt.intValue(.u8, byte_raw);
+                    unpack.unpacked.appendAssumeCapacity(byte_val.toIntern());
+                }
+            },
+            .undef => {
+                const undef_u8 = try pt.undefValue(.u8);
+                for (0..@intCast(len)) |_| {
+                    unpack.unpacked.appendAssumeCapacity(undef_u8.toIntern());
+                }
+            },
+            // The only values here with runtime bits are `true` and `false`.
+            // These are both 1 byte, so will never need splitting.
+            .simple_value => unreachable,
+            else => unreachable, // zero-bit or not primitives
+        }
+    }
+};
+
+/// Given a sequence of values laid out back-to-back in memory (see `UnpackValueBytes`),
+/// reconstructs a value of an arbitrary type, with correct handling of `undefined`
+/// values and of pointers which align in virtual memory.
+const PackValueBytes = struct {
+    pt: Zcu.PerThread,
+    arena: Allocator,
+    byte_offset: u64 = 0,
+    unpacked: []const InternPool.Index,
+
+    fn get(pack: *PackValueBytes, ty: Type) (Allocator.Error || error{ReinterpretDeclRef})!Value {
+        const pt = pack.pt;
+        const zcu = pt.zcu;
+        const ip = &zcu.intern_pool;
+        const arena = pack.arena;
+        switch (ty.zigTypeTag(zcu)) {
+            .vector => unreachable, // ill-defined layout
+            .array => {
+                // Elements are read back-to-back, each one consuming the bytes of one `elem_ty`.
+                const elem_ty = ty.childType(zcu);
+                const elems = try arena.alloc(InternPool.Index, @intCast(ty.arrayLen(zcu)));
+
+                for (elems) |*elem| {
+                    elem.* = (try pack.get(elem_ty)).toIntern();
+                }
+
+                if (ty.sentinel(zcu)) |s| {
+                    _ = s; // TODO: validate sentinel was preserved!
+                    pack.padding(elem_ty.abiSize(zcu));
+                }
+
+                return pt.aggregateValue(ty, elems);
+            },
+            .@"struct" => switch (ty.containerLayout(zcu)) {
+                .auto => unreachable, // ill-defined layout
+                .@"extern" => {
+                    const elems = try arena.alloc(InternPool.Index, ty.structFieldCount(zcu));
+                    @memset(elems, .none);
+                    var offset: u64 = 0;
+                    var it = ip.loadStructType(ty.toIntern()).iterateRuntimeOrder(ip);
+                    while (it.next()) |field_index| {
+                        const field_ty = ty.fieldType(field_index, zcu);
+                        const pad_bytes = ty.structFieldOffset(field_index, zcu) - offset;
+                        pack.padding(pad_bytes);
+                        elems[field_index] = (try pack.get(field_ty)).toIntern();
+                        offset += pad_bytes + field_ty.abiSize(zcu);
+                    }
+                    pack.padding(ty.abiSize(zcu) - offset);
+                    // Any fields which do not have runtime bits should be OPV or comptime fields.
+                    // Fill those values now.
+                    for (elems, 0..) |*elem, field_index| {
+                        if (elem.* != .none) continue;
+                        const val = (try ty.structFieldValueComptime(pt, field_index)).?;
+                        elem.* = val.toIntern();
+                    }
+                    return pt.aggregateValue(ty, elems);
+                },
+                .@"packed" => {
+                    const backing_int_val = try pack.primitive(ty.bitpackBackingInt(zcu));
+                    if (backing_int_val.isUndef(zcu)) return pt.undefValue(ty);
+                    return pt.bitpackValue(ty, backing_int_val);
+                },
+            },
+            .@"union" => switch (ty.containerLayout(zcu)) {
+                .auto => unreachable, // ill-defined layout
+                .@"extern" => {
+                    // We will attempt to read as the backing representation. If this emits
+                    // `error.ReinterpretDeclRef`, we will try each union field, preferring larger ones.
+                    // We will also attempt smaller fields when we get `undefined`, as if some bits are
+                    // defined we want to include them.
+                    // TODO: this is very very bad. We need a more sophisticated union representation.
+
+                    const prev_unpacked = pack.unpacked;
+                    const prev_byte_offset = pack.byte_offset;
+
+                    const backing_ty = try ty.externUnionBackingType(pt);
+
+                    const backing_result: enum { undef, reinterpret_decl_ref } = backing: {
+                        const backing_val = pack.get(backing_ty) catch |err| switch (err) {
+                            error.ReinterpretDeclRef => break :backing .reinterpret_decl_ref,
+                            else => |e| return e,
+                        };
+                        if (backing_val.isUndef(zcu)) break :backing .undef;
+                        return .fromInterned(try pt.internUnion(.{
+                            .ty = ty.toIntern(),
+                            .tag = .none,
+                            .val = backing_val.toIntern(),
+                        }));
+                    };
+
+                    const field_order = try pack.arena.alloc(u32, ty.unionTagTypeHypothetical(zcu).enumFieldCount(zcu));
+                    for (field_order, 0..) |*f, i| f.* = @intCast(i);
+                    // Sort `field_order` to put the fields with the largest ABI sizes first.
+                    const SizeSortCtx = struct {
+                        zcu: *const Zcu,
+                        field_types: []const InternPool.Index,
+                        fn lessThan(ctx: @This(), a_idx: u32, b_idx: u32) bool {
+                            const a_ty: Type = .fromInterned(ctx.field_types[a_idx]);
+                            const b_ty: Type = .fromInterned(ctx.field_types[b_idx]);
+                            return a_ty.abiSize(ctx.zcu) > b_ty.abiSize(ctx.zcu);
+                        }
+                    };
+                    std.mem.sortUnstable(u32, field_order, SizeSortCtx{
+                        .zcu = zcu,
+                        .field_types = zcu.typeToUnion(ty).?.field_types.get(ip),
+                    }, SizeSortCtx.lessThan);
+
+                    for (field_order) |field_index| {
+                        pack.unpacked = prev_unpacked;
+                        pack.byte_offset = prev_byte_offset;
+                        const field_ty = ty.fieldType(field_index, zcu);
+                        const field_val = pack.get(field_ty) catch |err| switch (err) {
+                            error.ReinterpretDeclRef => continue,
+                            else => |e| return e,
+                        };
+                        if (field_val.isUndef(zcu)) continue;
+                        pack.padding(ty.abiSize(zcu) - field_ty.abiSize(zcu));
+                        const tag_val = try pt.enumValueFieldIndex(ty.unionTagTypeHypothetical(zcu), field_index);
+                        return pt.unionValue(ty, tag_val, field_val);
+                    }
+
+                    // No field fit: mirror the backing read, consuming the whole union if `undefined`.
+                    if (backing_result == .reinterpret_decl_ref) return error.ReinterpretDeclRef;
+                    pack.unpacked = prev_unpacked;
+                    pack.byte_offset = prev_byte_offset;
+                    pack.padding(ty.abiSize(zcu));
+                    return pt.undefValue(ty);
+                },
+                .@"packed" => {
+                    const backing_int_val = try pack.primitive(ty.bitpackBackingInt(zcu));
+                    if (backing_int_val.isUndef(zcu)) return pt.undefValue(ty);
+                    return pt.bitpackValue(ty, backing_int_val);
+                },
+            },
+            .@"enum" => {
+                const tag_int_val = try pack.primitive(ty.intTagType(zcu));
+                if (tag_int_val.isUndef(zcu)) return pt.undefValue(ty);
+                return pt.enumValue(ty, tag_int_val.toIntern());
+            },
+            else => return pack.primitive(ty),
+        }
+    }
+
+    fn padding(pack: *PackValueBytes, num_bytes: u64) void {
+        _ = pack.prepareBytes(num_bytes);
+    }
+
+    fn primitive(pack: *PackValueBytes, want_ty: Type) (Allocator.Error || error{ReinterpretDeclRef})!Value {
+        const pt = pack.pt;
+        const zcu = pt.zcu;
+
+        if (try want_ty.onePossibleValue(pt)) |opv| return opv;
+
+        const vals, const byte_offset = pack.prepareBytes(want_ty.abiSize(zcu));
+
+        for (vals) |val| {
+            if (!Value.fromInterned(val).isUndef(zcu)) break;
+        } else {
+            // All bits of the value are `undefined`.
+            return pt.undefValue(want_ty);
+        }
+
+        // TODO: we need to decide how to handle partially-undef values here.
+        // Currently, a value with some undefined bits becomes `0xAA` so that we
+        // preserve the well-defined bits, because we can't currently represent
+        // a partially-undefined primitive (e.g. an int with some undef bits).
+        // In future, we probably want to take one of these two routes:
+        // * Define that if any bits are `undefined`, the entire value is `undefined`.
+        //   This is a major breaking change, and probably a footgun.
+        // * Introduce tracking for partially-undef values at comptime.
+        //   This would complicate a lot of operations in Sema, such as basic
+        //   arithmetic.
+        // This design complexity is tracked by #19634.
+        // A lone pointer-like value read as a pointer-like type is coerced directly, not via bytes.
+        if (vals.len == 1 and
+            want_ty.isPtrAtRuntime(zcu) and
+            Value.fromInterned(vals[0]).typeOf(zcu).isPtrAtRuntime(zcu))
+        {
+            return pt.getCoerced(.fromInterned(vals[0]), want_ty);
+        }
+
+        // Reinterpret via an in-memory buffer.
+
+        var buf_len: u64 = 0;
+        for (vals) |ip_val| {
+            const val: Value = .fromInterned(ip_val);
+            buf_len += val.typeOf(zcu).abiSize(zcu);
+        }
+
+        const buf = try pack.arena.alloc(u8, @intCast(buf_len));
+        {
+            var offset: usize = 0;
+            for (vals) |ip_val| {
+                const val: Value = .fromInterned(ip_val);
+                const ty = val.typeOf(zcu);
+                const size = ty.abiSize(zcu);
+                if (val.isUndef(zcu)) {
+                    @memset(buf[offset..][0..@intCast(size)], 0xAA);
+                } else {
+                    val.writeToMemory(zcu, buf[offset..][0..@intCast(size)]) catch |err| switch (err) {
+                        error.IllDefinedMemoryLayout => unreachable,
+                        else => |e| return e,
+                    };
+                }
+                offset += @intCast(size);
+            }
+        }
+        const bytes = buf[@intCast(byte_offset)..];
+
+        const target = zcu.getTarget();
+        const endian = target.cpu.arch.endian();
+        switch (want_ty.zigTypeTag(zcu)) {
+            .bool => return .makeBool(bytes[0] != 0),
+            .int => return .readIntFromMemory(want_ty, pt, bytes, pack.arena),
+            .float => switch (want_ty.floatBits(target)) {
+                16 => return pt.floatValue(want_ty, @as(f16, @bitCast(std.mem.readInt(u16, bytes[0..2], endian)))),
+                32 => return pt.floatValue(want_ty, @as(f32, @bitCast(std.mem.readInt(u32, bytes[0..4], endian)))),
+                64 => return pt.floatValue(want_ty, @as(f64, @bitCast(std.mem.readInt(u64, bytes[0..8], endian)))),
+                80 => return pt.floatValue(want_ty, @as(f80, @bitCast(std.mem.readInt(u80, bytes[0..10], endian)))),
+                128 => return pt.floatValue(want_ty, @as(f128, @bitCast(std.mem.readInt(u128, bytes[0..16], endian)))),
+                else => unreachable,
+            },
+            .pointer => {
+                assert(!want_ty.isSlice(zcu));
+                const ptr_addr = std.mem.readVarInt(u64, bytes[0..@intCast(want_ty.abiSize(zcu))], endian);
+                return pt.ptrIntValue(want_ty, ptr_addr);
+            },
+            .optional => {
+                assert(want_ty.isPtrLikeOptional(zcu));
+                const ptr_ty = want_ty.optionalChild(zcu);
+                const ptr_addr = std.mem.readVarInt(u64, bytes[0..@intCast(want_ty.abiSize(zcu))], endian);
+                return .fromInterned(try pt.intern(.{ .opt = .{
+                    .ty = want_ty.toIntern(),
+                    .val = if (ptr_addr == 0) .none else (try pt.ptrIntValue(ptr_ty, ptr_addr)).toIntern(),
+                } }));
+            },
+            else => unreachable,
+        }
+    }
+
+    fn prepareBytes(pack: *PackValueBytes, need_bytes: u64) struct { []const InternPool.Index, u64 } {
+        if (need_bytes == 0) return .{ &.{}, 0 };
+
+        const pt = pack.pt;
+        const zcu = pt.zcu;
+        // Count how many leading values of `unpacked` are needed to cover the requested bytes.
+        var bytes: u64 = 0;
+        var len: usize = 0;
+        while (bytes < pack.byte_offset + need_bytes) {
+            bytes += Value.fromInterned(pack.unpacked[len]).typeOf(zcu).abiSize(zcu);
+            len += 1;
+        }
+
+        const result_vals = pack.unpacked[0..len];
+        const result_offset = pack.byte_offset;
+        // If the final value was only partially consumed, keep it at the front for the next request.
+        const extra_bytes = bytes - pack.byte_offset - need_bytes;
+        if (extra_bytes == 0) {
+            pack.unpacked = pack.unpacked[len..];
+            pack.byte_offset = 0;
+        } else {
+            pack.unpacked = pack.unpacked[len - 1 ..];
+            pack.byte_offset = Value.fromInterned(pack.unpacked[0]).typeOf(zcu).abiSize(zcu) - extra_bytes;
+        }
+
+        return .{ result_vals, result_offset };
+    }
+};
+
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+const assert = std.debug.assert;
+
+const Sema = @import("../Sema.zig");
+const Zcu = @import("../Zcu.zig");
+const InternPool = @import("../InternPool.zig");
+const Type = @import("../Type.zig");
+const Value = @import("../Value.zig");
+const CompileError = Zcu.CompileError;
diff --git a/src/Type.zig b/src/Type.zig
@@ -757,9 +757,9 @@ pub fn hasWellDefinedLayout(ty: Type, zcu: *const Zcu) bool {
     const ip = &zcu.intern_pool;
     return switch (ip.indexToKey(ty.toIntern())) {
         .int_type,
-        .vector_type,
         => true,
 
+        .vector_type,
         .error_union_type,
         .error_set_type,
         .inferred_error_set_type,
@@ -1241,112 +1241,17 @@ pub fn errorAbiSize(zcu: *const Zcu) u64 {
 }
 
 /// Asserts that `ty` is not an opaque or comptime-only type.
-/// Once #19755 is implemented, this query will only work on types with a defined bit-level representation.
 pub fn bitSize(ty: Type, zcu: *const Zcu) u64 {
-    const target = zcu.getTarget();
-    const ip = &zcu.intern_pool;
-    assertHasLayout(ty, zcu);
-    return switch (ip.indexToKey(ty.toIntern())) {
-        .int_type => |int_type| int_type.bits,
-        .ptr_type => |ptr_type| switch (ptr_type.flags.size) {
-            .slice => target.ptrBitWidth() * 2,
-            else => target.ptrBitWidth(),
-        },
-        .anyframe_type => target.ptrBitWidth(),
-        .array_type => |array_type| {
-            const elem_ty: Type = .fromInterned(array_type.child);
-            const len = array_type.lenIncludingSentinel();
-            return switch (zcu.comp.getZigBackend()) {
-                .stage2_x86_64, .stage2_llvm => len * elem_ty.bitSize(zcu),
-                // this case will be removed under #19755
-                else => switch (len) {
-                    0 => 0,
-                    else => (len - 1) * 8 * elem_ty.abiSize(zcu) + elem_ty.bitSize(zcu),
-                },
-            };
-        },
-        .vector_type => |vec| vec.len * Type.fromInterned(vec.child).bitSize(zcu),
-        .error_set_type, .inferred_error_set_type => zcu.errorSetBits(),
-        .func_type => unreachable,
-
-        .simple_type => |t| switch (t) {
-            .void => 0,
-            .bool => 1,
-            .anyerror, .adhoc_inferred_error_set => zcu.errorSetBits(),
-            .usize, .isize => target.ptrBitWidth(),
-
-            .c_char => target.cTypeBitSize(.char),
-            .c_short => target.cTypeBitSize(.short),
-            .c_ushort => target.cTypeBitSize(.ushort),
-            .c_int => target.cTypeBitSize(.int),
-            .c_uint => target.cTypeBitSize(.uint),
-            .c_long => target.cTypeBitSize(.long),
-            .c_ulong => target.cTypeBitSize(.ulong),
-            .c_longlong => target.cTypeBitSize(.longlong),
-            .c_ulonglong => target.cTypeBitSize(.ulonglong),
-            .c_longdouble => target.cTypeBitSize(.longdouble),
-
-            .f16 => 16,
-            .f32 => 32,
-            .f64 => 64,
-            .f80 => 80,
-            .f128 => 128,
-
-            .anyopaque => unreachable,
-            .type => unreachable,
-            .comptime_int => unreachable,
-            .comptime_float => unreachable,
-            .noreturn => unreachable,
-            .null => unreachable,
-            .undefined => unreachable,
-            .enum_literal => unreachable,
-            .generic_poison => unreachable,
-        },
-
-        .struct_type => {
-            const struct_obj = ip.loadStructType(ty.toIntern());
-            switch (struct_obj.layout) {
-                .@"packed" => return Type.fromInterned(struct_obj.packed_backing_int_type).bitSize(zcu),
-                .auto, .@"extern" => return struct_obj.size * 8, // will be `unreachable` under #19755
-            }
-        },
-        .union_type => {
-            const union_obj = ip.loadUnionType(ty.toIntern());
-            switch (union_obj.layout) {
-                .@"packed" => return Type.fromInterned(union_obj.packed_backing_int_type).bitSize(zcu),
-                .auto, .@"extern" => return union_obj.size * 8, // will be `unreachable` under #19755
-            }
+    return switch (ty.zigTypeTag(zcu)) {
+        .void => 0,
+        .bool => 1,
+        .float => ty.floatBits(zcu.getTarget()),
+        .pointer, .optional => {
+            assert(ty.isPtrAtRuntime(zcu));
+            return zcu.getTarget().ptrBitWidth();
         },
-        .enum_type => Type.fromInterned(ip.loadEnumType(ty.toIntern()).int_tag_type).bitSize(zcu),
-
-        // will be `unreachable` under #19755
-        .opt_type,
-        .error_union_type,
-        .tuple_type,
-        => ty.abiSize(zcu) * 8,
-
-        .opaque_type, .spirv_type => unreachable,
-
-        // values, not types
-        .undef,
-        .simple_value,
-        .@"extern",
-        .func,
-        .int,
-        .err,
-        .error_union,
-        .enum_literal,
-        .enum_tag,
-        .float,
-        .ptr,
-        .slice,
-        .opt,
-        .aggregate,
-        .un,
-        .bitpack,
-        // memoization, not types
-        .memoized_call,
-        => unreachable,
+        .array, .vector => ty.arrayLenIncludingSentinel(zcu) * ty.childType(zcu).bitSize(zcu),
+        else => ty.intInfo(zcu).bits,
     };
 }
 
@@ -1528,6 +1433,7 @@ pub fn nullablePtrElem(ty: Type, zcu: *const Zcu) Type {
 /// * `[*]T`
 /// * `[*c]T`
 /// * `@SpirvType(.{ .runtime_array = T })`
+/// * `*@SpirvType(.{ .runtime_array = T })`
 pub fn indexableElem(ty: Type, zcu: *const Zcu) Type {
     const ip = &zcu.intern_pool;
     return switch (ip.indexToKey(ty.toIntern())) {
@@ -3181,6 +3087,8 @@ pub fn validateExtern(ty: Type, position: ExternPosition, zcu: *const Zcu) bool 
         .frame,
         => false,
 
+        .vector => position == .param_ty or position == .ret_ty,
+
         .void => switch (position) {
             .ret_ty,
             .union_field,
@@ -3259,7 +3167,6 @@ pub fn validateExtern(ty: Type, position: ExternPosition, zcu: *const Zcu) bool 
             .other,
             => ty.childType(zcu).validateExtern(.element, zcu),
         },
-        .vector => ty.childType(zcu).validateExtern(.element, zcu),
         .optional => ty.isPtrLikeOptional(zcu),
     };
 }
@@ -3272,6 +3179,40 @@ fn validateExternCallconv(cc: std.lang.CallingConvention) bool {
     };
 }
 
+/// Returns whether `ty` is considered by Zig to have a bit-level representation, meaning it is
+/// allowed as the operand to `@bitSizeOf`. This is a superset of packable types.
+pub fn hasBitRepresentation(ty: Type, zcu: *const Zcu) bool {
+    return switch (ty.zigTypeTag(zcu)) {
+        .@"fn",
+        .noreturn,
+        .undefined,
+        .null,
+        .@"opaque",
+        .spirv,
+        .type,
+        .enum_literal,
+        .comptime_float,
+        .comptime_int,
+        .error_set,
+        .error_union,
+        .frame,
+        .@"anyframe",
+        => false,
+
+        .void,
+        .bool,
+        .int,
+        .float,
+        => true,
+        // The remaining answers depend on properties of the specific type.
+        .@"enum" => zcu.intern_pool.loadEnumType(ty.toIntern()).int_tag_mode == .explicit,
+        .pointer, .optional => ty.isPtrAtRuntime(zcu),
+        .@"struct", .@"union" => ty.containerLayout(zcu) == .@"packed",
+        // Arrays and vectors defer to their element type.
+        .array, .vector => ty.childType(zcu).hasBitRepresentation(zcu),
+    };
+}
+
 /// Asserts that `ty` has resolved layout.
 pub fn assertHasLayout(ty: Type, zcu: *const Zcu) void {
     if (!std.debug.runtime_safety) {
diff --git a/src/Value.zig b/src/Value.zig
@@ -248,7 +248,6 @@ pub fn toBool(val: Value) bool {
 pub fn writeToMemory(val: Value, zcu: *const Zcu, buffer: []u8) error{
     ReinterpretDeclRef,
     IllDefinedMemoryLayout,
-    Unimplemented,
     OutOfMemory,
 }!void {
     const target = zcu.getTarget();
@@ -257,35 +256,50 @@ pub fn writeToMemory(val: Value, zcu: *const Zcu, buffer: []u8) error{
     const ty = val.typeOf(zcu);
     if (val.isUndef(zcu)) {
         const size: usize = @intCast(ty.abiSize(zcu));
-        @memset(buffer[0..size], 0xaa);
+        @memset(buffer[0..size], 0xAA);
         return;
     }
-    switch (ty.zigTypeTag(zcu)) {
+    tag: switch (ty.zigTypeTag(zcu)) {
+        .type => return error.IllDefinedMemoryLayout,
+        .comptime_float => return error.IllDefinedMemoryLayout,
+        .comptime_int => return error.IllDefinedMemoryLayout,
+        .undefined => return error.IllDefinedMemoryLayout,
+        .null => return error.IllDefinedMemoryLayout,
+        .error_union => return error.IllDefinedMemoryLayout,
+        .enum_literal => return error.IllDefinedMemoryLayout,
+        .@"fn" => return error.IllDefinedMemoryLayout,
+        .spirv => return error.IllDefinedMemoryLayout,
+        .@"opaque" => unreachable,
+        .frame => unreachable,
+        .@"anyframe" => unreachable,
+        .noreturn => unreachable,
         .void => {},
         .bool => {
             buffer[0] = @intFromBool(val.toBool());
         },
-        .int, .@"enum", .error_set, .pointer => |tag| {
-            const int_ty = if (tag == .pointer) int_ty: {
-                if (ty.isSlice(zcu)) return error.IllDefinedMemoryLayout;
-                if (ip.getBackingAddrTag(val.toIntern()).? != .int) return error.ReinterpretDeclRef;
-                break :int_ty Type.usize;
-            } else ty;
-            const int_info = int_ty.intInfo(zcu);
-            const bits = int_info.bits;
-            const byte_count: u16 = @intCast((@as(u17, bits) + 7) / 8);
-
+        .pointer => {
+            if (ty.isSlice(zcu)) return error.IllDefinedMemoryLayout;
+            if (ip.getBackingAddrTag(val.toIntern()).? != .int) return error.ReinterpretDeclRef;
+            continue :tag .int;
+        },
+        .int, .@"enum", .error_set => {
             var bigint_buffer: BigIntSpace = undefined;
             const bigint = val.toBigInt(&bigint_buffer, zcu);
-            bigint.writeTwosComplement(buffer[0..byte_count], endian);
-        },
-        .float => switch (ty.floatBits(target)) {
-            16 => std.mem.writeInt(u16, buffer[0..2], @bitCast(val.toFloat(f16, zcu)), endian),
-            32 => std.mem.writeInt(u32, buffer[0..4], @bitCast(val.toFloat(f32, zcu)), endian),
-            64 => std.mem.writeInt(u64, buffer[0..8], @bitCast(val.toFloat(f64, zcu)), endian),
-            80 => std.mem.writeInt(u80, buffer[0..10], @bitCast(val.toFloat(f80, zcu)), endian),
-            128 => std.mem.writeInt(u128, buffer[0..16], @bitCast(val.toFloat(f128, zcu)), endian),
-            else => unreachable,
+            bigint.writeTwosComplement(buffer[0..@intCast(ty.abiSize(zcu))], endian);
+        },
+        .float => {
+            const float_bits = ty.floatBits(target);
+            switch (float_bits) {
+                16 => std.mem.writeInt(u16, buffer[0..2], @bitCast(val.toFloat(f16, zcu)), endian),
+                32 => std.mem.writeInt(u32, buffer[0..4], @bitCast(val.toFloat(f32, zcu)), endian),
+                64 => std.mem.writeInt(u64, buffer[0..8], @bitCast(val.toFloat(f64, zcu)), endian),
+                80 => std.mem.writeInt(u80, buffer[0..10], @bitCast(val.toFloat(f80, zcu)), endian),
+                128 => std.mem.writeInt(u128, buffer[0..16], @bitCast(val.toFloat(f128, zcu)), endian),
+                else => unreachable,
+            }
+            const float_bytes = @divExact(float_bits, 8);
+            const total_bytes: usize = @intCast(ty.abiSize(zcu));
+            @memset(buffer[float_bytes..total_bytes], 0); // padding
         },
         .array => {
             const aggregate = ip.indexToKey(val.toIntern()).aggregate;
@@ -302,28 +316,33 @@ pub fn writeToMemory(val: Value, zcu: *const Zcu, buffer: []u8) error{
                 }
                 buf_off += elem_size;
             }
+            if (ty.sentinel(zcu)) |sentinel_val| {
+                try sentinel_val.writeToMemory(zcu, buffer[buf_off..]);
+            }
         },
-        .vector => {
-            // We use byte_count instead of abi_size here, so that any padding bytes
-            // follow the data bytes, on both big- and little-endian systems.
-            const byte_count = (@as(usize, @intCast(ty.bitSize(zcu))) + 7) / 8;
-            return writeToPackedMemory(val, zcu, buffer[0..byte_count], 0);
-        },
+        .vector => return error.IllDefinedMemoryLayout,
         .@"struct" => {
             const struct_type = zcu.typeToStruct(ty) orelse return error.IllDefinedMemoryLayout;
             switch (struct_type.layout) {
                 .auto => return error.IllDefinedMemoryLayout,
-                .@"extern" => for (0..struct_type.field_types.len) |field_index| {
-                    const off: usize = @intCast(ty.structFieldOffset(field_index, zcu));
-                    const field_val = Value.fromInterned(switch (ip.indexToKey(val.toIntern()).aggregate.storage) {
-                        .bytes => |bytes| {
-                            buffer[off] = bytes.at(field_index, ip);
-                            continue;
-                        },
-                        .elems => |elems| elems[field_index],
-                        .repeated_elem => |elem| elem,
-                    });
-                    try writeToMemory(field_val, zcu, buffer[off..]);
+                .@"extern" => {
+                    var last_off: usize = 0; // end offset of the most recently visited field
+                    for (struct_type.field_types.get(ip), 0..) |field_ty_ip, field_index| {
+                        const off: usize = @intCast(ty.structFieldOffset(field_index, zcu));
+                        @memset(buffer[last_off..off], 0xAA); // padding before this field; 0xAA is the byte used above for undefined values
+                        last_off = @intCast(off + Type.fromInterned(field_ty_ip).abiSize(zcu)); // advance before the switch: the `.bytes` path `continue`s and must not skip this
+                        const field_val = Value.fromInterned(switch (ip.indexToKey(val.toIntern()).aggregate.storage) {
+                            .bytes => |bytes| {
+                                buffer[off] = bytes.at(field_index, ip);
+                                continue;
+                            },
+                            .elems => |elems| elems[field_index],
+                            .repeated_elem => |elem| elem,
+                        });
+                        try writeToMemory(field_val, zcu, buffer[off..]);
+                    }
+                    const struct_size: usize = @intCast(ty.abiSize(zcu));
+                    @memset(buffer[last_off..struct_size], 0xAA); // trailing padding after the last field
                 },
                 .@"packed" => {
                     const int_index = ip.indexToKey(val.toIntern()).bitpack.backing_int_val;
@@ -335,6 +354,9 @@ pub fn writeToMemory(val: Value, zcu: *const Zcu, buffer: []u8) error{
             .auto => return error.IllDefinedMemoryLayout, // Sema is supposed to have emitted a compile error already
             .@"extern" => {
                 const payload_val = val.unionPayload(zcu);
+                const payload_size: usize = @intCast(payload_val.typeOf(zcu).abiSize(zcu));
+                const union_size: usize = @intCast(ty.abiSize(zcu));
+                @memset(buffer[payload_size..union_size], 0xAA);
                 return writeToMemory(payload_val, zcu, buffer);
             },
             .@"packed" => {
@@ -352,7 +374,6 @@ pub fn writeToMemory(val: Value, zcu: *const Zcu, buffer: []u8) error{
                 @memset(buffer[0..@intCast(byte_count)], 0); // null pointer
             }
         },
-        else => return error.Unimplemented,
     }
 }
 
@@ -360,12 +381,15 @@ pub fn writeToMemory(val: Value, zcu: *const Zcu, buffer: []u8) error{
 ///
 /// Both the start and the end of the provided buffer must be tight, since
 /// big-endian packed memory layouts start at the end of the buffer.
+///
+/// Supports arrays and vectors, for which the value is written in logical bit
+/// order, i.e. with the first element at bit offset 0.
 pub fn writeToPackedMemory(
     val: Value,
     zcu: *const Zcu,
     buffer: []u8,
     bit_offset: usize,
-) error{ ReinterpretDeclRef, OutOfMemory }!void {
+) void {
     const ip = &zcu.intern_pool;
     const target = zcu.getTarget();
     const endian = target.cpu.arch.endian();
@@ -392,13 +416,7 @@ pub fn writeToPackedMemory(
         },
         .@"enum" => {
             const int_val = val.intFromEnum(zcu);
-            return int_val.writeToPackedMemory(zcu, buffer, bit_offset);
-        },
-        .pointer => {
-            assert(!ty.isSlice(zcu)); // No well defined layout.
-            if (ip.getBackingAddrTag(val.toIntern()).? != .int) return error.ReinterpretDeclRef;
-            const addr = val.toUnsignedInt(zcu);
-            std.mem.writeVarPackedInt(buffer, bit_offset, zcu.getTarget().ptrBitWidth(), addr, endian);
+            int_val.writeToPackedMemory(zcu, buffer, bit_offset);
         },
         .int => {
             const bits = ty.intInfo(zcu).bits;
@@ -416,47 +434,46 @@ pub fn writeToPackedMemory(
             128 => std.mem.writePackedInt(u128, buffer, bit_offset, @bitCast(val.toFloat(f128, zcu)), endian),
             else => unreachable,
         },
-        .vector => {
-            const elem_ty = ty.childType(zcu);
-            const elem_bit_size: u16 = @intCast(elem_ty.bitSize(zcu));
-            const len: usize = @intCast(ty.arrayLen(zcu));
-
-            var bits: u16 = 0;
-            var elem_i: usize = 0;
-            const aggregate = ip.indexToKey(val.toIntern()).aggregate;
-            while (elem_i < len) : (elem_i += 1) {
-                // On big-endian systems, LLVM reverses the element order of vectors by default
-                const tgt_elem_i = if (endian == .big) len - elem_i - 1 else elem_i;
-                switch (aggregate.storage) {
-                    .bytes => |bytes| std.mem.writePackedInt(u8, buffer, bit_offset + bits, bytes.at(tgt_elem_i, ip), endian),
-                    .elems => |elems| try Value.fromInterned(elems[tgt_elem_i]).writeToPackedMemory(zcu, buffer, bit_offset + bits),
-                    .repeated_elem => |elem| try Value.fromInterned(elem).writeToPackedMemory(zcu, buffer, bit_offset + bits),
-                }
-                bits += elem_bit_size;
-            }
-        },
         .@"struct", .@"union" => {
             assert(ty.containerLayout(zcu) == .@"packed");
             const int_val: Value = .fromInterned(ip.indexToKey(val.toIntern()).bitpack.backing_int_val);
-            return int_val.writeToPackedMemory(zcu, buffer, bit_offset);
+            int_val.writeToPackedMemory(zcu, buffer, bit_offset);
         },
-        .optional => {
-            assert(ty.isPtrLikeOptional(zcu));
-            if (val.optionalValue(zcu)) |ptr_val| {
-                return ptr_val.writeToPackedMemory(zcu, buffer, bit_offset);
-            } else {
-                return Value.zero_usize.writeToPackedMemory(zcu, buffer, bit_offset);
+        .array, .vector => {
+            const elem_bits: usize = @intCast(ty.childType(zcu).bitSize(zcu));
+            const len: usize = @intCast(ty.arrayLen(zcu));
+            var elem_bit_off: usize = bit_offset;
+            switch (ip.indexToKey(val.toIntern()).aggregate.storage) {
+                .repeated_elem => |elem_val_ip| {
+                    const elem_val: Value = .fromInterned(elem_val_ip);
+                    for (0..len) |_| {
+                        elem_val.writeToPackedMemory(zcu, buffer, elem_bit_off);
+                        elem_bit_off += elem_bits;
+                    }
+                },
+                .elems => |elems| for (elems[0..len]) |elem_val_ip| {
+                    const elem_val: Value = .fromInterned(elem_val_ip);
+                    elem_val.writeToPackedMemory(zcu, buffer, elem_bit_off);
+                    elem_bit_off += elem_bits;
+                },
+                .bytes => |bytes| for (bytes.toSlice(len, ip)) |raw_byte| {
+                    std.mem.writeVarPackedInt(buffer, elem_bit_off, elem_bits, raw_byte, endian);
+                    elem_bit_off += elem_bits;
+                },
+            }
+            if (ty.sentinel(zcu)) |sentinel_val| {
+                sentinel_val.writeToPackedMemory(zcu, buffer, elem_bit_off);
             }
         },
-        else => @panic("TODO implement writeToPackedMemory for more types"),
+        else => unreachable,
     }
 }
 
-/// Load a Value from the contents of `buffer`, where `ty` is an unsigned integer type.
+/// Load a Value from the contents of `buffer`, where `ty` is any integer type.
 ///
 /// Asserts that buffer.len >= ty.abiSize(). The buffer is allowed to extend past
 /// the end of the value in memory.
-pub fn readUintFromMemory(
+pub fn readIntFromMemory(
     ty: Type,
     pt: Zcu.PerThread,
     buffer: []const u8,
@@ -465,23 +482,28 @@ pub fn readUintFromMemory(
     const zcu = pt.zcu;
     const endian = zcu.getTarget().cpu.arch.endian();
 
-    assert(ty.isUnsignedInt(zcu));
-    const bits = ty.intInfo(zcu).bits;
-    const byte_count: u16 = @intCast((@as(u17, bits) + 7) / 8);
+    const int = ty.intInfo(zcu);
+    const abi_size: usize = @intCast(ty.abiSize(zcu));
+    const exact_buf = buffer[0..abi_size];
 
-    assert(buffer.len >= byte_count);
-
-    if (bits <= 64) {
-        const val = std.mem.readVarInt(u64, buffer[0..byte_count], endian);
-        const result = (val << @as(u6, @intCast(64 - bits))) >> @as(u6, @intCast(64 - bits));
-        return pt.intValue(ty, result);
+    if (abi_size <= 8) {
+        const shift: u6 = @intCast(64 - int.bits);
+        switch (int.signedness) {
+            .unsigned => {
+                const x = std.mem.readVarInt(u64, exact_buf, endian);
+                return pt.intValue(ty, (x << shift) >> shift);
+            },
+            .signed => {
+                const x = std.mem.readVarInt(i64, exact_buf, endian);
+                return pt.intValue(ty, (x << shift) >> shift);
+            },
+        }
     } else {
-        const Limb = std.math.big.Limb;
-        const limb_count = (byte_count + @sizeOf(Limb) - 1) / @sizeOf(Limb);
-        const limbs_buffer = try arena.alloc(Limb, limb_count);
+        const limb_count = std.math.big.int.calcTwosCompLimbCount(int.bits);
+        const limbs_buffer = try arena.alloc(std.math.big.Limb, limb_count);
 
         var bigint: BigIntMutable = .init(limbs_buffer, 0);
-        bigint.readTwosComplement(buffer[0..byte_count], bits, endian, .unsigned);
+        bigint.readTwosComplement(exact_buf, int.bits, endian, int.signedness);
         return pt.intValue_big(ty, bigint.toConst());
     }
 }
@@ -490,17 +512,17 @@ pub fn readUintFromMemory(
 ///
 /// Both the start and the end of the provided buffer must be tight, since
 /// big-endian packed memory layouts start at the end of the buffer.
+///
+/// Supports arrays and vectors, for which the value is read in logical bit
+/// order, i.e. with the first element at bit offset 0.
 pub fn readFromPackedMemory(
     ty: Type,
     pt: Zcu.PerThread,
     buffer: []const u8,
     bit_offset: usize,
-    gpa: Allocator,
-) error{
-    IllDefinedMemoryLayout,
-    OutOfMemory,
-}!Value {
+) Allocator.Error!Value {
     const zcu = pt.zcu;
+    const gpa = zcu.comp.gpa;
     const target = zcu.getTarget();
     const endian = target.cpu.arch.endian();
     switch (ty.zigTypeTag(zcu)) {
@@ -543,7 +565,7 @@ pub fn readFromPackedMemory(
         },
         .@"enum" => {
             const int_ty = ty.intTagType(zcu);
-            const int_val = try Value.readFromPackedMemory(int_ty, pt, buffer, bit_offset, gpa);
+            const int_val: Value = try .readFromPackedMemory(int_ty, pt, buffer, bit_offset);
             return pt.getCoerced(int_val, ty);
         },
         .float => return Value.fromInterned(try pt.intern(.{ .float = .{
@@ -557,40 +579,25 @@ pub fn readFromPackedMemory(
                 else => unreachable,
             },
         } })),
-        .vector => {
-            const elem_ty = ty.childType(zcu);
-            const elems = try gpa.alloc(InternPool.Index, @intCast(ty.arrayLen(zcu)));
-            defer gpa.free(elems);
-
-            var bits: u16 = 0;
-            const elem_bit_size: u16 = @intCast(elem_ty.bitSize(zcu));
-            for (elems, 0..) |_, i| {
-                // On big-endian systems, LLVM reverses the element order of vectors by default
-                const tgt_elem_i = if (endian == .big) elems.len - i - 1 else i;
-                elems[tgt_elem_i] = (try readFromPackedMemory(elem_ty, pt, buffer, bit_offset + bits, gpa)).toIntern();
-                bits += elem_bit_size;
-            }
-            return pt.aggregateValue(ty, elems);
-        },
         .@"struct", .@"union" => {
             assert(ty.containerLayout(zcu) == .@"packed");
-            const int_val: Value = try .readFromPackedMemory(ty.bitpackBackingInt(zcu), pt, buffer, bit_offset, gpa);
+            const int_val: Value = try .readFromPackedMemory(ty.bitpackBackingInt(zcu), pt, buffer, bit_offset);
             return pt.bitpackValue(ty, int_val);
         },
-        .pointer => {
-            assert(!ty.isSlice(zcu)); // No well defined layout.
-            const addr = (try readFromPackedMemory(Type.usize, pt, buffer, bit_offset, gpa)).toUnsignedInt(zcu);
-            return pt.ptrIntValue(ty, addr);
-        },
-        .optional => {
-            assert(ty.isPtrLikeOptional(zcu));
-            const addr = (try readFromPackedMemory(Type.usize, pt, buffer, bit_offset, gpa)).toUnsignedInt(zcu);
-            return .fromInterned(try pt.intern(.{ .opt = .{
-                .ty = ty.toIntern(),
-                .val = if (addr == 0) .none else (try pt.ptrIntValue(ty.childType(zcu), addr)).toIntern(),
-            } }));
+        .array, .vector => {
+            const elem_ty = ty.childType(zcu);
+            const elem_bits: usize = @intCast(elem_ty.bitSize(zcu));
+            const elems_buf = try gpa.alloc(InternPool.Index, @intCast(ty.arrayLen(zcu)));
+            defer gpa.free(elems_buf);
+            var elem_bit_off: usize = bit_offset;
+            for (elems_buf) |*elem| {
+                const elem_val = try readFromPackedMemory(elem_ty, pt, buffer, elem_bit_off);
+                elem.* = elem_val.toIntern();
+                elem_bit_off += elem_bits;
+            }
+            return pt.aggregateValue(ty, elems_buf);
         },
-        else => @panic("TODO implement readFromPackedMemory for more types"),
+        else => unreachable,
     }
 }
 
@@ -887,14 +894,9 @@ pub fn fieldValue(val: Value, pt: Zcu.PerThread, index: usize) !Value {
             const bfa = bfa_state.allocator();
             const buf = try bfa.alloc(u8, @intCast((ty.bitSize(zcu) + 7) / 8));
             defer bfa.free(buf);
-            int_val.writeToPackedMemory(zcu, buf, 0) catch |err| switch (err) {
-                error.ReinterpretDeclRef => unreachable, // it's an integer
-                error.OutOfMemory => |e| return e,
-            };
-            return Value.readFromPackedMemory(field_ty, pt, buf, field_bit_offset, bfa) catch |err| switch (err) {
-                error.IllDefinedMemoryLayout => unreachable, // it's a bitpack
-                error.OutOfMemory => |e| return e,
-            };
+            @memset(buf, 0);
+            int_val.writeToPackedMemory(zcu, buf, 0);
+            return .readFromPackedMemory(field_ty, pt, buf, field_bit_offset);
         },
         else => unreachable,
     };
@@ -1619,7 +1621,6 @@ pub fn hasRepeatedByteRepr(val: Value, zcu: *const Zcu) !?u8 {
         // code late in compilation. So, this error handling is too aggressive and
         // causes some false negatives, causing less-than-ideal code generation.
         error.IllDefinedMemoryLayout => return null,
-        error.Unimplemented => return null,
     };
     const first_byte = byte_buffer[0];
     for (byte_buffer[1..]) |byte| {
diff --git a/src/Zcu/PerThread.zig b/src/Zcu/PerThread.zig
@@ -4544,8 +4544,12 @@ fn runCodegenInner(pt: Zcu.PerThread, func_index: InternPool.Index, air: *Air) e
     tracy_trace.addText(fqn.toSlice(ip));
     tracy_trace.addTextFmt("func_ip_index={d}", .{func_index});
 
+    Air.Verify.run(pt, func_index, air);
+
     if (codegen.legalizeFeatures(pt, nav)) |features| {
         try air.legalize(pt, features);
+        // Verify the AIR again post-legalization.
+        Air.Verify.run(pt, func_index, air);
     }
 
     var liveness: ?Air.Liveness = if (codegen.wantsLiveness(pt, nav))
diff --git a/src/codegen/aarch64/Select.zig b/src/codegen/aarch64/Select.zig
@@ -292,8 +292,8 @@ pub fn analyze(isel: *Select, air_body: []const Air.Inst.Index) !void {
         .load,
         .fptrunc,
         .fpext,
-        .intcast,
-        .intcast_safe,
+        .int_cast,
+        .int_cast_safe,
         .trunc,
         .optional_payload,
         .optional_payload_ptr,
@@ -334,7 +334,15 @@ pub fn analyze(isel: *Select, air_body: []const Air.Inst.Index) !void {
             air_inst_index = air_body[air_body_index];
             continue :air_tag air_tags[@intFromEnum(air_inst_index)];
         },
-        .bitcast => {
+        .bit_cast,
+        .ptr_cast,
+        .ptr_from_int,
+        .int_from_ptr,
+        .error_cast,
+        .error_from_int,
+        .int_from_error,
+        .union_from_enum,
+        => {
             const ty_op = air_data[@intFromEnum(air_inst_index)].ty_op;
             maybe_noop: {
                 if (ty_op.ty.toInterned().? != isel.air.typeOf(ty_op.operand, ip).toIntern()) break :maybe_noop;
@@ -3190,7 +3198,15 @@ pub fn body(isel: *Select, air_body: []const Air.Inst.Index) error{ OutOfMemory,
             }
             if (air.next()) |next_air_tag| continue :air_tag next_air_tag;
         },
-        .bitcast => |air_tag| {
+        .bit_cast,
+        .ptr_cast,
+        .ptr_from_int,
+        .int_from_ptr,
+        .error_cast,
+        .error_from_int,
+        .int_from_error,
+        .union_from_enum,
+        => |air_tag| {
             if (isel.live_values.fetchRemove(air.inst_index)) |dst_vi| unused: {
                 defer dst_vi.value.deref(isel);
                 const ty_op = air.data(air.inst_index).ty_op;
@@ -5221,7 +5237,7 @@ pub fn body(isel: *Select, air_body: []const Air.Inst.Index) error{ OutOfMemory,
             }
             if (air.next()) |next_air_tag| continue :air_tag next_air_tag;
         },
-        .intcast => |air_tag| {
+        .int_cast => |air_tag| {
             if (isel.live_values.fetchRemove(air.inst_index)) |dst_vi| unused: {
                 defer dst_vi.value.deref(isel);
 
@@ -5312,7 +5328,7 @@ pub fn body(isel: *Select, air_body: []const Air.Inst.Index) error{ OutOfMemory,
             }
             if (air.next()) |next_air_tag| continue :air_tag next_air_tag;
         },
-        .intcast_safe => |air_tag| {
+        .int_cast_safe => |air_tag| {
             if (isel.live_values.fetchRemove(air.inst_index)) |dst_vi| unused: {
                 defer dst_vi.value.deref(isel);
 
@@ -11355,7 +11371,7 @@ fn writeToMemory(isel: *Select, constant: Constant, buffer: []u8) error{OutOfMem
     if (try isel.writeKeyToMemory(ip.indexToKey(constant.toIntern()), buffer)) return true;
     constant.writeToMemory(zcu, buffer) catch |err| switch (err) {
         error.OutOfMemory => |e| return e,
-        error.ReinterpretDeclRef, error.Unimplemented, error.IllDefinedMemoryLayout => return false,
+        error.ReinterpretDeclRef, error.IllDefinedMemoryLayout => return false,
     };
     return true;
 }
diff --git a/src/codegen/aarch64/abi.zig b/src/codegen/aarch64/abi.zig
@@ -21,7 +21,7 @@ pub fn classifyType(ty: Type, zcu: *Zcu) Class {
             if (ty.containerLayout(zcu) == .@"packed") return .byval;
             if (countFloats(ty, zcu)) |float| return .{ .float_array = float.count };
 
-            const bit_size = ty.bitSize(zcu);
+            const bit_size = ty.abiSize(zcu) * 8;
             if (bit_size > 128) return .memory;
             if (bit_size > 64) return .double_integer;
             return .integer;
@@ -30,7 +30,7 @@ pub fn classifyType(ty: Type, zcu: *Zcu) Class {
             if (ty.containerLayout(zcu) == .@"packed") return .byval;
             if (countFloats(ty, zcu)) |float| return .{ .float_array = float.count };
 
-            const bit_size = ty.bitSize(zcu);
+            const bit_size = ty.abiSize(zcu) * 8;
             if (bit_size > 128) return .memory;
             if (bit_size > 64) return .double_integer;
             return .integer;
diff --git a/src/codegen/arm/abi.zig b/src/codegen/arm/abi.zig
@@ -30,11 +30,11 @@ pub fn classifyType(ty: Type, zcu: *Zcu, ctx: Context) Class {
     const ip = &zcu.intern_pool;
     switch (ty.zigTypeTag(zcu)) {
         .@"struct" => {
-            const bit_size = ty.bitSize(zcu);
             if (ty.containerLayout(zcu) == .@"packed") {
-                if (bit_size > 64) return .memory;
+                if (ty.bitSize(zcu) > 64) return .memory;
                 return .byval;
             }
+            const bit_size = ty.abiSize(zcu) * 8;
             if (bit_size > max_byval_size) return .memory;
             const float_count = countFloats(ty, zcu, &maybe_float_bits);
             if (float_count <= byval_float_count) return .byval;
@@ -47,17 +47,17 @@ pub fn classifyType(ty: Type, zcu: *Zcu, ctx: Context) Class {
             var i: u32 = 0;
             while (i < fields) : (i += 1) {
                 const field_ty = ty.fieldType(i, zcu);
-                if (field_ty.bitSize(zcu) > 32) return Class.arrSize(bit_size, 64);
+                if (field_ty.abiSize(zcu) > 4) return Class.arrSize(bit_size, 64);
             }
             return Class.arrSize(bit_size, 32);
         },
         .@"union" => {
-            const bit_size = ty.bitSize(zcu);
             const union_obj = zcu.typeToUnion(ty).?;
             if (union_obj.layout == .@"packed") {
-                if (bit_size > 64) return .memory;
+                if (ty.bitSize(zcu) > 64) return .memory;
                 return .byval;
             }
+            const bit_size = ty.abiSize(zcu) * 8;
             if (bit_size > max_byval_size) return .memory;
             const float_count = countFloats(ty, zcu, &maybe_float_bits);
             if (float_count <= byval_float_count) return .byval;
@@ -67,7 +67,7 @@ pub fn classifyType(ty: Type, zcu: *Zcu, ctx: Context) Class {
             }
 
             for (union_obj.field_types.get(ip)) |field_ty| {
-                if (Type.fromInterned(field_ty).bitSize(zcu) > 32) {
+                if (Type.fromInterned(field_ty).abiSize(zcu) > 4) {
                     return Class.arrSize(bit_size, 64);
                 }
             }
diff --git a/src/codegen/c.zig b/src/codegen/c.zig
@@ -27,7 +27,7 @@ pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features {
     return comptime switch (dev.env.supports(.legalize)) {
         inline false, true => |supports_legalize| &.init(.{
             // we don't currently ask zig1 to use safe optimization modes
-            .expand_intcast_safe = supports_legalize,
+            .expand_int_cast_safe = supports_legalize,
             .expand_int_from_float_safe = supports_legalize,
             .expand_int_from_float_optimized_safe = supports_legalize,
             .expand_add_safe = supports_legalize,
@@ -38,6 +38,9 @@ pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features {
             .expand_packed_store = true,
             .expand_packed_struct_field_val = true,
             .expand_packed_aggregate_init = true,
+
+            .scalarize_bit_cast_array = true,
+            .scalarize_bit_cast_vector_non_elementwise = true,
         }),
     };
 }
@@ -2636,9 +2639,9 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) Error!void {
             // zig fmt: off
             .inferred_alloc, .inferred_alloc_comptime => unreachable,
 
-            // No "scalarize" legalizations are enabled, so these instructions never appear.
-            .legalize_vec_elem_val   => unreachable,
-            .legalize_vec_store_elem => unreachable,
+            // Possible because `Air.Legalize.scalarize_bit_cast_vector_non_elementwise` is enabled.
+            .legalize_vec_elem_val   => try airArrayElemVal(f, inst),
+            .legalize_vec_store_elem => try airLegalizeVecStoreElem(f, inst),
             // No soft float legalizations are enabled.
             .legalize_compiler_rt_call => unreachable,
 
@@ -2751,8 +2754,15 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) Error!void {
             .alloc            => try airAlloc(f, inst),
             .ret_ptr          => try airRetPtr(f, inst),
             .assembly         => try airAsm(f, inst),
-            .bitcast          => try airBitcast(f, inst),
-            .intcast          => try airIntCast(f, inst),
+            .ptr_cast         => try airPtrCast(f, inst),
+            .ptr_from_int     => try airSimpleCast(f, inst),
+            .int_from_ptr     => try airSimpleCast(f, inst),
+            .error_cast       => try airNopCast(f, inst),
+            .error_from_int   => try airNopCast(f, inst),
+            .int_from_error   => try airNopCast(f, inst),
+            .union_from_enum  => try airUnionFromEnum(f, inst),
+            .bit_cast         => try airBitCast(f, inst),
+            .int_cast         => try airIntCast(f, inst),
             .trunc            => try airTrunc(f, inst),
             .load             => try airLoad(f, inst),
             .store            => try airStore(f, inst, false),
@@ -2864,7 +2874,7 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) Error!void {
             .add_safe,
             .sub_safe,
             .mul_safe,
-            .intcast_safe,
+            .int_cast_safe,
             .int_from_float_safe,
             .int_from_float_optimized_safe,
             => return f.fail("TODO implement safety_checked_instructions", .{}),
@@ -3083,6 +3093,28 @@ fn airArrayElemVal(f: *Function, inst: Air.Inst.Index) !CValue {
     return local;
 }
 
+fn airLegalizeVecStoreElem(f: *Function, inst: Air.Inst.Index) !CValue {
+    const pl_op = f.air.instructions.items(.data)[@intFromEnum(inst)].pl_op;
+    const extra = f.air.extraData(Air.Bin, pl_op.payload).data;
+    // `pl_op.operand` is used as the vector pointer; the `Air.Bin` payload supplies the index (`lhs`) and the element (`rhs`).
+    const vec_ptr = try f.resolveInst(pl_op.operand);
+    const index = try f.resolveInst(extra.lhs);
+    const elem = try f.resolveInst(extra.rhs);
+    try reap(f, inst, &.{ pl_op.operand, extra.lhs, extra.rhs });
+
+    const w = &f.code.writer;
+    // Emit one C statement: the dereferenced pointer's `array` member, `[index] = elem;`, then a newline.
+    try f.writeCValueDerefMember(w, vec_ptr, .{ .identifier = "array" });
+    try w.writeByte('[');
+    try f.writeCValue(w, index, .other);
+    try w.writeAll("] = ");
+    try f.writeCValue(w, elem, .other);
+    try w.writeByte(';');
+    try f.newline();
+    // A store yields no result value.
+    return .none;
+}
+
 fn airAlloc(f: *Function, inst: Air.Inst.Index) !CValue {
     const pt = f.dg.pt;
     const zcu = pt.zcu;
@@ -3190,35 +3222,42 @@ fn airLoad(f: *Function, inst: Air.Inst.Index) !CValue {
 
     try reap(f, inst, &.{ty_op.operand});
 
-    const is_aligned = if (ptr_info.flags.alignment != .none)
-        ptr_info.flags.alignment.order(src_ty.abiAlignment(zcu)).compare(.gte)
-    else
-        true;
+    const is_aligned = switch (ptr_info.flags.alignment) {
+        .none => true,
+        else => |ptr_align| ptr_align.compare(.gte, src_ty.abiAlignment(zcu)),
+    };
 
     const w = &f.code.writer;
     const local = try f.allocLocal(inst, src_ty);
-    const v = try Vectorize.start(f, inst, w, ptr_ty);
 
     if (!is_aligned) {
         try w.writeAll("memcpy(&");
         try f.writeCValue(w, local, .other);
-        try v.elem(f, w);
         try w.writeAll(", (const char *)");
-        try f.writeCValue(w, operand, .other);
-        try v.elem(f, w);
+        switch (ptr_info.flags.vector_index) {
+            .none => try f.writeCValue(w, operand, .other),
+            else => |index| {
+                try w.writeByte('&');
+                try f.writeCValue(w, operand, .other);
+                try w.print("[{d}]", .{@intFromEnum(index)});
+            },
+        }
         try w.writeAll(", sizeof(");
         try f.renderType(w, src_ty);
         try w.writeAll("))");
     } else {
         try f.writeCValue(w, local, .other);
-        try v.elem(f, w);
         try w.writeAll(" = ");
-        try f.writeCValueDeref(w, operand);
-        try v.elem(f, w);
+        switch (ptr_info.flags.vector_index) {
+            .none => try f.writeCValueDeref(w, operand),
+            else => |index| {
+                try f.writeCValue(w, operand, .other);
+                try w.print("[{d}]", .{@intFromEnum(index)});
+            },
+        }
     }
     try w.writeByte(';');
     try f.newline();
-    try v.end(f, inst, w);
 
     return local;
 }
@@ -3433,21 +3472,24 @@ fn airStore(f: *Function, inst: Air.Inst.Index, safety: bool) !CValue {
         // underlying type as the lhs (i.e. they must both be arrays of the same underlying type).
         assert(src_ty.eql(.fromInterned(ptr_info.child)));
 
-        const v = try Vectorize.start(f, inst, w, ptr_ty);
         try w.writeAll("memcpy((char *)");
-        try f.writeCValue(w, ptr_val, .other);
-        try v.elem(f, w);
+        switch (ptr_info.flags.vector_index) {
+            .none => try f.writeCValue(w, ptr_val, .other),
+            else => |index| {
+                try w.writeByte('&');
+                try f.writeCValue(w, ptr_val, .other);
+                try w.print("[{d}]", .{@intFromEnum(index)});
+            },
+        }
         try w.writeAll(", &");
         switch (src_val) {
             .constant => |val| try f.dg.renderValueAsLvalue(w, val),
             else => try f.writeCValue(w, src_val, .other),
         }
-        try v.elem(f, w);
         try w.writeAll(", sizeof(");
         try f.renderType(w, src_ty);
         try w.writeAll("));");
         try f.newline();
-        try v.end(f, inst, w);
     } else {
         switch (ptr_val) {
             .local_ref => |ptr_local_index| switch (src_val) {
@@ -3457,15 +3499,18 @@ fn airStore(f: *Function, inst: Air.Inst.Index, safety: bool) !CValue {
             },
             else => {},
         }
-        const v = try Vectorize.start(f, inst, w, ptr_ty);
-        try f.writeCValueDeref(w, ptr_val);
-        try v.elem(f, w);
+
+        switch (ptr_info.flags.vector_index) {
+            .none => try f.writeCValueDeref(w, ptr_val),
+            else => |index| {
+                try f.writeCValue(w, ptr_val, .other);
+                try w.print("[{d}]", .{@intFromEnum(index)});
+            },
+        }
         try w.writeAll(" = ");
         try f.writeCValue(w, src_val, .other);
-        try v.elem(f, w);
         try w.writeByte(';');
         try f.newline();
-        try v.end(f, inst, w);
     }
     return .none;
 }
@@ -3613,9 +3658,9 @@ fn airCmpOp(
     const lhs_ty = f.typeOf(data.lhs);
     const scalar_ty = lhs_ty.scalarType(zcu);
 
-    const scalar_bits = scalar_ty.bitSize(zcu);
-    if (scalar_ty.isInt(zcu) and scalar_bits > 64)
-        return airCmpBuiltinCall(
+    if (scalar_ty.isInt(zcu)) {
+        const scalar_bits = scalar_ty.bitSize(zcu);
+        if (scalar_bits > 64) return airCmpBuiltinCall(
             f,
             inst,
             data,
@@ -3623,6 +3668,7 @@ fn airCmpOp(
             .cmp,
             if (scalar_bits > 128) .bits else .none,
         );
+    }
     if (scalar_ty.isRuntimeFloat())
         return airCmpBuiltinCall(f, inst, data, operator, .operator, .none);
 
@@ -3668,9 +3714,9 @@ fn airEquality(
     const bin_op = f.air.instructions.items(.data)[@intFromEnum(inst)].bin_op;
 
     const operand_ty = f.typeOf(bin_op.lhs);
-    const operand_bits = operand_ty.bitSize(zcu);
-    if (operand_ty.isAbiInt(zcu) and operand_bits > 64)
-        return airCmpBuiltinCall(
+    if (operand_ty.isAbiInt(zcu)) {
+        const operand_bits = operand_ty.bitSize(zcu);
+        if (operand_bits > 64) return airCmpBuiltinCall(
             f,
             inst,
             bin_op,
@@ -3678,6 +3724,7 @@ fn airEquality(
             .cmp,
             if (operand_bits > 128) .bits else .none,
         );
+    }
     if (operand_ty.isRuntimeFloat())
         return airCmpBuiltinCall(f, inst, bin_op, operator, .operator, .none);
 
@@ -4258,125 +4305,240 @@ fn airSwitchDispatch(f: *Function, inst: Air.Inst.Index) !void {
     try w.print("goto zig_switch_{d}_loop;\n", .{@intFromEnum(br.block_inst)});
 }
 
-fn airBitcast(f: *Function, inst: Air.Inst.Index) !CValue {
+fn airPtrCast(f: *Function, inst: Air.Inst.Index) Error!CValue {
+    const zcu = f.dg.pt.zcu;
+    // The result is a pointer, or an optional whose child is taken as the pointer type; nothing else is expected.
+    const dest_ty = f.typeOfIndex(inst);
+    const ptr_ty = switch (dest_ty.zigTypeTag(zcu)) {
+        .optional => dest_ty.childType(zcu),
+        .pointer => dest_ty,
+        else => unreachable,
+    };
+    // Non-slice pointers are fully handled by the plain C cast emitted by `airSimpleCast`.
+    if (!ptr_ty.isSlice(zcu)) {
+        return airSimpleCast(f, inst);
+    }
+
+    // For slice casts we need to assign both fields.
+
     const ty_op = f.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
-    const inst_ty = f.typeOfIndex(inst);
+    const operand = try f.resolveInst(ty_op.operand);
+
+    const w = &f.code.writer;
+    const dest_local = try f.allocLocal(inst, dest_ty);
+    // `ptr` is cast to the destination's slice pointer field type.
+    try f.writeCValueMember(w, dest_local, .{ .identifier = "ptr" });
+    try w.writeAll(" = (");
+    try f.renderType(w, ptr_ty.slicePtrFieldType(zcu));
+    try w.writeByte(')');
+    try f.writeCValueMember(w, operand, .{ .identifier = "ptr" });
+    try w.writeByte(';');
+    try f.newline();
+    // `len` is copied without any cast.
+    try f.writeCValueMember(w, dest_local, .{ .identifier = "len" });
+    try w.writeAll(" = ");
+    try f.writeCValueMember(w, operand, .{ .identifier = "len" });
+    try w.writeByte(';');
+    try f.newline();
+
+    try reap(f, inst, &.{ty_op.operand});
+    return dest_local;
+}
+
+fn airSimpleCast(f: *Function, inst: Air.Inst.Index) Error!CValue {
+    const zcu = f.dg.pt.zcu;
+    // Lowers a `ty_op` cast to the C statement `<dest> = (<dest scalar type>)<operand>;`.
+    const ty_op = f.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
+    const dest_ty = f.typeOfIndex(inst);
+    const operand_ty = f.typeOf(ty_op.operand);
+    const operand = try f.resolveInst(ty_op.operand);
+    // `Vectorize` brackets the statement and `v.elem` follows both sides; presumably per-element for vectors (confirm against `Vectorize`).
+    const w = &f.code.writer;
+    const dest_local = try f.allocLocal(inst, dest_ty);
+    const v: Vectorize = try .start(f, inst, w, operand_ty);
+    try f.writeCValue(w, dest_local, .other);
+    try v.elem(f, w);
+    try w.writeAll(" = (");
+    try f.renderType(w, dest_ty.scalarType(zcu));
+    try w.writeByte(')');
+    try f.writeCValue(w, operand, .other);
+    try v.elem(f, w);
+    try w.writeByte(';');
+    try f.newline();
+    try v.end(f, inst, w);
+
+    try reap(f, inst, &.{ty_op.operand});
+    return dest_local;
+}
 
+fn airNopCast(f: *Function, inst: Air.Inst.Index) Error!CValue {
+    const zcu = f.dg.pt.zcu;
+    // Writes no conversion code itself: the operand's CValue is handed to `moveCValue` under the destination type.
+    const ty_op = f.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
+    const dest_ty = f.typeOfIndex(inst);
+    const operand_ty = f.typeOf(ty_op.operand);
     const operand = try f.resolveInst(ty_op.operand);
+    // Preconditions: equal ABI size, and both or neither of the two types is an ABI integer.
+    assert(operand_ty.abiSize(zcu) == dest_ty.abiSize(zcu));
+    assert(operand_ty.isAbiInt(zcu) == dest_ty.isAbiInt(zcu));
+
+    try reap(f, inst, &.{ty_op.operand});
+    return f.moveCValue(inst, dest_ty, operand);
+}
+
+fn airUnionFromEnum(f: *Function, inst: Air.Inst.Index) Error!CValue {
+    const zcu = f.dg.pt.zcu;
+    // Lowers an enum -> union conversion by storing the enum operand into the `tag` member of a new union local.
+    const ty_op = f.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
+    const dest_ty = f.typeOfIndex(inst);
     const operand_ty = f.typeOf(ty_op.operand);
+    const operand = try f.resolveInst(ty_op.operand);
+    // Only a union destination and an enum operand are expected here.
+    assert(dest_ty.zigTypeTag(zcu) == .@"union");
+    assert(operand_ty.zigTypeTag(zcu) == .@"enum");
+    // No member other than `tag` is written by this function.
+    const w = &f.code.writer;
+    const dest_local = try f.allocLocal(inst, dest_ty);
+    try f.writeCValueMember(w, dest_local, .{ .identifier = "tag" });
+    try w.writeAll(" = ");
+    try f.writeCValue(w, operand, .other);
+    try w.writeByte(';');
+    try f.newline();
 
-    const bitcasted = try bitcast(f, inst_ty, operand, operand_ty);
     try reap(f, inst, &.{ty_op.operand});
-    return f.moveCValue(inst, inst_ty, bitcasted);
+    return dest_local;
 }
 
-fn bitcast(f: *Function, dest_ty: Type, operand: CValue, operand_ty: Type) !CValue {
+fn airBitCast(f: *Function, inst: Air.Inst.Index) Error!CValue {
     const pt = f.dg.pt;
     const zcu = pt.zcu;
-    const target = &f.dg.mod.resolved_target.result;
     const w = &f.code.writer;
 
-    if (operand_ty.isAbiInt(zcu) and dest_ty.isAbiInt(zcu)) {
-        const src_info = dest_ty.intInfo(zcu);
-        const dest_info = operand_ty.intInfo(zcu);
-        if (src_info.signedness == dest_info.signedness and
-            src_info.bits == dest_info.bits) return operand;
-    }
+    const ty_op = f.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
+    const dest_ty = f.typeOfIndex(inst);
 
-    if (dest_ty.isPtrAtRuntime(zcu) or operand_ty.isPtrAtRuntime(zcu)) {
-        const local = try f.allocLocal(null, dest_ty);
-        try f.writeCValue(w, local, .other);
+    const operand = try f.resolveInst(ty_op.operand);
+    const operand_ty = f.typeOf(ty_op.operand);
+    // The result always lives in a fresh local of the destination type, filled in by one branch below.
+    const dest_local = try f.allocLocal(inst, dest_ty);
+
+    // Because we have `scalarize_bit_cast_array` and `scalarize_bit_cast_vector_non_elementwise`
+    // enabled, we usually only see scalars here. The only case in which we may see vectors is when
+    // the operation happens elementwise, which we can handle with `Vectorize`.
+    const v: Vectorize = try .start(f, inst, w, operand_ty);
+    const operand_scalar_ty = operand_ty.scalarType(zcu);
+    const dest_scalar_ty = dest_ty.scalarType(zcu);
+    // NOTE(review): the bool -> int branch is a bare C cast with no `zig_wrap_*` step; confirm this is intended for a signed destination.
+    // Some cases are handled with a simple cast:
+    // * float -> float
+    // * bool -> int
+    if ((operand_scalar_ty.isRuntimeFloat() and dest_scalar_ty.isRuntimeFloat()) or
+        (operand_scalar_ty.toIntern() == .bool_type and dest_scalar_ty.isAbiInt(zcu)))
+    {
+        try f.writeCValue(w, dest_local, .other);
+        try v.elem(f, w);
         try w.writeAll(" = (");
-        try f.renderType(w, dest_ty);
+        try f.renderType(w, dest_scalar_ty);
         try w.writeByte(')');
         try f.writeCValue(w, operand, .other);
+        try v.elem(f, w);
         try w.writeByte(';');
         try f.newline();
-        return local;
-    }
-
-    const local = try f.allocLocal(null, dest_ty);
-    // On big-endian targets, copying ABI integers with padding bits is awkward, because the padding bits are at the low bytes of the value.
-    // We need to offset the source or destination pointer appropriately and copy the right number of bytes.
-    if (target.cpu.arch.endian() == .big and dest_ty.isAbiInt(zcu) and !operand_ty.isAbiInt(zcu)) {
-        // e.g. [10]u8 -> u80. We need to offset the destination so that we copy to the least significant bits of the integer.
-        const offset = dest_ty.abiSize(zcu) - operand_ty.abiSize(zcu);
-        try w.writeAll("memcpy((char *)&");
-        try f.writeCValue(w, local, .other);
-        try w.print(" + {d}, &", .{offset});
-        switch (operand) {
-            .constant => |val| try f.dg.renderValueAsLvalue(w, val),
-            else => try f.writeCValue(w, operand, .other),
-        }
-        try w.print(", {d});", .{operand_ty.abiSize(zcu)});
-    } else if (target.cpu.arch.endian() == .big and operand_ty.isAbiInt(zcu) and !dest_ty.isAbiInt(zcu)) {
-        // e.g. u80 -> [10]u8. We need to offset the source so that we copy from the least significant bits of the integer.
-        const offset = operand_ty.abiSize(zcu) - dest_ty.abiSize(zcu);
+    } else if (dest_scalar_ty.toIntern() == .bool_type) {
+        // If the result is a boolean type, just check if the operand is non-zero.
+        assert(operand_scalar_ty.isAbiInt(zcu));
+        try f.writeCValue(w, dest_local, .other);
+        try v.elem(f, w);
+        try w.writeAll(" = ");
+        try f.writeCValue(w, operand, .other);
+        try v.elem(f, w);
+        try w.writeAll(" != 0;");
+        try f.newline();
+    } else if (dest_scalar_ty.isRuntimeFloat()) {
+        // For int->float, just do a memcpy.
+        assert(operand_scalar_ty.isAbiInt(zcu));
         try w.writeAll("memcpy(&");
-        try f.writeCValue(w, local, .other);
-        try w.writeAll(", (const char *)&");
+        try f.writeCValue(w, dest_local, .other);
+        try v.elem(f, w);
+        try w.writeAll(", &");
         switch (operand) {
             .constant => |val| try f.dg.renderValueAsLvalue(w, val),
             else => try f.writeCValue(w, operand, .other),
         }
-        try w.print(" + {d}, {d});", .{ offset, dest_ty.abiSize(zcu) });
+        try v.elem(f, w);
+        try w.print(", {d});", .{@min(operand_scalar_ty.abiSize(zcu), dest_scalar_ty.abiSize(zcu))});
+        try f.newline();
     } else {
+        // The only remaining possibility is that the result is an integer. We will need to use
+        // `zig_wrap_*` to correct the "padding" bits after we populate the value bits.
+        assert(dest_scalar_ty.isAbiInt(zcu));
+        assert(operand_scalar_ty.isRuntimeFloat() or operand_scalar_ty.isAbiInt(zcu));
+
+        // memcpy the value...
         try w.writeAll("memcpy(&");
-        try f.writeCValue(w, local, .other);
+        try f.writeCValue(w, dest_local, .other);
+        try v.elem(f, w);
         try w.writeAll(", &");
         switch (operand) {
             .constant => |val| try f.dg.renderValueAsLvalue(w, val),
             else => try f.writeCValue(w, operand, .other),
         }
-        try w.print(", {d});", .{@min(dest_ty.abiSize(zcu), operand_ty.abiSize(zcu))});
-    }
-
-    try f.newline();
+        try v.elem(f, w);
+        try w.print(", {d});", .{@min(operand_scalar_ty.abiSize(zcu), dest_scalar_ty.abiSize(zcu))});
+        try f.newline();
 
-    // Ensure padding bits have the expected value.
-    if (dest_ty.isAbiInt(zcu)) {
-        switch (CType.classifyInt(dest_ty, zcu)) {
+        // ...and ensure padding bits have the correct value.
+        switch (CType.classifyInt(dest_scalar_ty, zcu)) {
             .void => unreachable, // opv
             .small => {
-                try f.writeCValue(w, local, .other);
+                try f.writeCValue(w, dest_local, .other);
+                try v.elem(f, w);
                 try w.writeAll(" = zig_wrap_");
-                try f.dg.renderTypeForBuiltinFnName(w, dest_ty);
+                try f.dg.renderTypeForBuiltinFnName(w, dest_scalar_ty);
                 try w.writeByte('(');
-                try f.writeCValue(w, local, .other);
-                try f.dg.renderBuiltinInfo(w, dest_ty, .bits);
+                try f.writeCValue(w, dest_local, .other);
+                try v.elem(f, w);
+                try f.dg.renderBuiltinInfo(w, dest_scalar_ty, .bits);
                 try w.writeAll(");");
                 try f.newline();
             },
             .big => |big| {
-                const dest_info = dest_ty.intInfo(zcu);
-                const padding_index: u16 = switch (target.cpu.arch.endian()) {
+                const dest_info = dest_scalar_ty.intInfo(zcu);
+                const padding_index: u16 = switch (f.dg.mod.resolved_target.result.cpu.arch.endian()) {
                     .little => big.limbs_len - 1,
                     .big => 0,
                 };
                 const wrap_bits = ((dest_info.bits - 1) % big.limb_size.bits()) + 1;
                 if (big.limb_size != .@"128" or dest_info.signedness == .unsigned) {
-                    try f.writeCValueMember(w, local, .{ .identifier = "limbs" });
-                    try w.print("[{d}] = zig_wrap_{c}{d}(", .{
+                    try f.writeCValue(w, dest_local, .other);
+                    try v.elem(f, w);
+                    try w.print(".limbs[{d}] = zig_wrap_{c}{d}(", .{
                         padding_index,
                         signAbbrev(dest_info.signedness),
                         big.limb_size.bits(),
                     });
-                    try f.writeCValueMember(w, local, .{ .identifier = "limbs" });
-                    try w.print("[{d}], {d});", .{ padding_index, wrap_bits });
+                    try f.writeCValue(w, dest_local, .other);
+                    try v.elem(f, w);
+                    try w.print(".limbs[{d}], {d});", .{ padding_index, wrap_bits }); // NOTE(review): no `f.newline()` follows, unlike the else branch.
                 } else {
-                    try f.writeCValueMember(w, local, .{ .identifier = "limbs" });
-                    try w.print("[{d}] = zig_bitCast_u128(zig_wrap_i128(zig_bitCast_i128(", .{
+                    try f.writeCValue(w, dest_local, .other);
+                    try v.elem(f, w);
+                    try w.print(".limbs[{d}] = zig_bitCast_u128(zig_wrap_i128(zig_bitCast_i128(", .{
                         padding_index,
                     });
-                    try f.writeCValueMember(w, local, .{ .identifier = "limbs" });
-                    try w.print("[{d}]), {d}));", .{ padding_index, wrap_bits });
+                    try f.writeCValue(w, dest_local, .other);
+                    try v.elem(f, w);
+                    try w.print(".limbs[{d}]), {d}));", .{ padding_index, wrap_bits });
                     try f.newline();
                 }
             },
         }
     }
 
-    return local;
+    try v.end(f, inst, w);
+
+    try reap(f, inst, &.{ty_op.operand});
+    return dest_local;
 }
 
 fn airTrap(f: *Function) !void {
@@ -6151,28 +6313,32 @@ fn airMemset(f: *Function, inst: Air.Inst.Index, safety: bool) !CValue {
         return .none;
     }
 
-    if (elem_abi_size == 1 and !dest_ty.isVolatilePtr(zcu)) {
-        const bitcasted = try bitcast(f, .u8, value, elem_ty);
+    if (elem_abi_size == 1 and elem_ty.isAbiInt(zcu) and !dest_ty.isVolatilePtr(zcu)) {
         try w.writeAll("memset(");
         switch (dest_ty.ptrSize(zcu)) {
             .slice => {
                 try f.writeCValueMember(w, dest_slice, .{ .identifier = "ptr" });
-                try w.writeAll(", ");
-                try f.writeCValue(w, bitcasted, .other);
+                try w.writeAll(", *(const char *)&");
+                switch (value) {
+                    .constant => |v| try f.dg.renderValueAsLvalue(w, v),
+                    else => try f.writeCValue(w, value, .other),
+                }
                 try w.writeAll(", ");
                 try f.writeCValueMember(w, dest_slice, .{ .identifier = "len" });
             },
             .one => {
                 try f.writeCValue(w, dest_slice, .other);
-                try w.writeAll(", ");
-                try f.writeCValue(w, bitcasted, .other);
+                try w.writeAll(", *(const char *)&");
+                switch (value) {
+                    .constant => |v| try f.dg.renderValueAsLvalue(w, v),
+                    else => try f.writeCValue(w, value, .other),
+                }
                 try w.print(", {d}", .{dest_ty.childType(zcu).arrayLen(zcu)});
             },
             .many, .c => unreachable,
         }
         try w.writeAll(");");
         try f.newline();
-        try f.freeCValue(inst, bitcasted);
         try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs });
         return .none;
     }
diff --git a/src/codegen/c/type.zig b/src/codegen/c/type.zig
@@ -514,40 +514,39 @@ pub const CType = union(enum) {
         }
     }
     fn classifyBitInt(signedness: std.lang.Signedness, bits: u16, zcu: *const Zcu) IntClass {
-        const is_ez80 = zcu.getTarget().cpu.arch == .ez80;
-        return switch (bits) {
+        const target = zcu.getTarget();
+        return switch (std.zig.target.intByteSize(target, bits)) {
             0 => .void,
-            1...8 => switch (signedness) {
+            1 => switch (signedness) {
                 .unsigned => .{ .small = .uint8_t },
                 .signed => .{ .small = .int8_t },
             },
-            9...16 => switch (signedness) {
+            2 => switch (signedness) {
                 .unsigned => .{ .small = .uint16_t },
                 .signed => .{ .small = .int16_t },
             },
-            17...24 => switch (signedness) {
-                .unsigned => .{ .small = if (is_ez80) .uint24_t else .uint32_t },
-                .signed => .{ .small = if (is_ez80) .int24_t else .int32_t },
+            3 => switch (signedness) {
+                .unsigned => .{ .small = .uint24_t },
+                .signed => .{ .small = .int24_t },
             },
-            25...32 => switch (signedness) {
+            4 => switch (signedness) {
                 .unsigned => .{ .small = .uint32_t },
                 .signed => .{ .small = .int32_t },
             },
-            33...48 => switch (signedness) {
-                .unsigned => .{ .small = if (is_ez80) .uint48_t else .uint64_t },
-                .signed => .{ .small = if (is_ez80) .int48_t else .int64_t },
+            6 => switch (signedness) {
+                .unsigned => .{ .small = .uint48_t },
+                .signed => .{ .small = .int48_t },
             },
-            49...64 => switch (signedness) {
+            8 => switch (signedness) {
                 .unsigned => .{ .small = .uint64_t },
                 .signed => .{ .small = .int64_t },
             },
-            65...128 => switch (signedness) {
+            16 => switch (signedness) {
                 .unsigned => .{ .small = .zig_u128 },
                 .signed => .{ .small = .zig_i128 },
             },
-            else => {
+            else => |n| {
                 @branchHint(.unlikely);
-                const target = zcu.getTarget();
                 const limb_bytes = std.zig.target.intAlignment(target, bits);
                 return .{ .big = .{
                     .limb_size = switch (limb_bytes) {
@@ -558,10 +557,7 @@ pub const CType = union(enum) {
                         16 => .@"128",
                         else => unreachable,
                     },
-                    .limbs_len = @divExact(
-                        std.zig.target.intByteSize(target, bits),
-                        limb_bytes,
-                    ),
+                    .limbs_len = @divExact(n, limb_bytes),
                 } };
             },
         };
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
@@ -21,8 +21,7 @@ const Zcu = @import("../Zcu.zig");
 const aarch64_c_abi = @import("aarch64/abi.zig");
 const FuncGen = @import("llvm/FuncGen.zig");
 const isByRef = FuncGen.isByRef;
-const firstParamSRet = FuncGen.firstParamSRet;
-const lowerFnRetTy = FuncGen.lowerFnRetTy;
+const fnReturnStrat = FuncGen.fnReturnStrat;
 const iterateParamTypes = FuncGen.iterateParamTypes;
 const ccAbiPromoteInt = FuncGen.ccAbiPromoteInt;
 
@@ -37,10 +36,10 @@ pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features {
         .expand_int_from_float_safe,
         .expand_int_from_float_optimized_safe,
 
-        .scalarize_bitcast_array,
+        .scalarize_bit_cast_array,
         // Needed because LLVM's `bitcast` on vectors is endian-specific unless the source and dest
         // types are vectors with equal length (hence also with equal bits-per-element).
-        .scalarize_bitcast_vector_non_elementwise,
+        .scalarize_bit_cast_vector_non_elementwise,
     });
 }
 
@@ -732,8 +731,8 @@ pub const Object = struct {
 
         // TODO: Address space
         const slice_ty = Type.slice_const_u8_sentinel_0;
-        const llvm_usize_ty = try o.lowerType(.usize);
-        const llvm_slice_ty = try o.lowerType(slice_ty);
+        const llvm_usize_ty = try o.lowerType(.usize, .in_memory);
+        const llvm_slice_ty = try o.lowerType(slice_ty, .in_memory);
         const llvm_table_ty = try o.builder.arrayType(1 + error_name_list.len, llvm_slice_ty);
 
         llvm_errors[0] = try o.builder.undefConst(llvm_slice_ty);
@@ -799,7 +798,7 @@ pub const Object = struct {
         {
             if (o.errors_len_variable != .none) {
                 const errors_len = zcu.intern_pool.global_error_set.getNamesFromMainThread().len;
-                const init_val = try o.builder.intConst(try o.errorIntType(), errors_len);
+                const init_val = try o.builder.intConst(try o.errorIntType(.in_memory), errors_len);
                 try o.errors_len_variable.setInitializer(init_val, &o.builder);
             }
             try o.genErrorNameTable();
@@ -1191,7 +1190,7 @@ pub const Object = struct {
         };
         {
             const global = llvm_function.ptrConst(&o.builder).global.ptr(&o.builder);
-            global.type = try o.lowerType(fn_ty);
+            global.type = try o.lowerType(fn_ty, .in_memory);
             global.addr_space = toLlvmAddressSpace(nav.resolved.?.@"addrspace", target);
             global.linkage = if (o.builder.strip) .private else .internal;
             global.visibility = .default;
@@ -1440,10 +1439,10 @@ pub const Object = struct {
             // represent (because it doesn't have runtime bits), we instead lower as the zero-size
             // type `[0 x i8]`. I don't think the type on an extern declaration actually does much
             // anyway.
-            if (nav_ty.isRuntimeFnOrHasRuntimeBits(zcu)) break :ty try o.lowerType(nav_ty);
+            if (nav_ty.isRuntimeFnOrHasRuntimeBits(zcu)) break :ty try o.lowerType(nav_ty, .in_memory);
             break :ty try o.builder.arrayType(0, .i8);
         } else if (nav_ty.hasRuntimeBits(zcu)) ty: {
-            break :ty try o.lowerType(nav_ty);
+            break :ty try o.lowerType(nav_ty, .in_memory);
         } else {
             // This is a non-extern zero-bit `Nav`---we're not interested in it.
             // TODO: we might need to rethink this a little under incremental compilation. If a
@@ -1536,7 +1535,7 @@ pub const Object = struct {
             llvm_variable.setAlignment(llvm_align, &o.builder);
             llvm_variable.setSection(llvm_section, &o.builder);
             llvm_variable.setMutability(if (resolved.@"const") .constant else .global, &o.builder);
-            try llvm_variable.setInitializer(if (opt_extern != null) .no_init else try o.lowerValue(resolved.value), &o.builder);
+            try llvm_variable.setInitializer(if (opt_extern != null) .no_init else try o.lowerValue(resolved.value, .in_memory), &o.builder);
             llvm_variable.setThreadLocal(tl: {
                 if (resolved.@"threadlocal" and !mod.single_threaded) break :tl .generaldynamic;
                 break :tl .default;
@@ -2134,7 +2133,7 @@ pub const Object = struct {
                 defer debug_param_types.deinit(gpa);
 
                 // Return type goes first.
-                if (firstParamSRet(fn_info, zcu, target)) {
+                if (try fnReturnStrat(o, fn_info) == .sret) {
                     // Actual return type is void, then first arg is the sret pointer.
                     const ptr_ty = try pt.singleMutPtrType(.fromInterned(fn_info.return_type));
                     debug_param_types.appendAssumeCapacity(try o.getDebugType(pt, .void));
@@ -2682,12 +2681,12 @@ pub const Object = struct {
         if (fn_info.return_type == .noreturn_type) try attributes.addFnAttr(.noreturn, &o.builder);
 
         var it = iterateParamTypes(o, fn_info);
-        if (firstParamSRet(fn_info, zcu, target)) {
+        if (try fnReturnStrat(o, fn_info) == .sret) {
             // Sret pointers must not be address 0
             try attributes.addParamAttr(it.llvm_index, .nonnull, &o.builder);
             try attributes.addParamAttr(it.llvm_index, .@"noalias", &o.builder);
 
-            const raw_llvm_ret_ty = try o.lowerType(.fromInterned(fn_info.return_type));
+            const raw_llvm_ret_ty = try o.lowerType(.fromInterned(fn_info.return_type), .in_memory);
             try attributes.addParamAttr(it.llvm_index, .{ .sret = raw_llvm_ret_ty }, &o.builder);
             it.llvm_index += 1;
         } else if (ccAbiPromoteInt(fn_info.cc, zcu, Type.fromInterned(fn_info.return_type))) |s| switch (s) {
@@ -2729,9 +2728,7 @@ pub const Object = struct {
             },
             .byref => {
                 const param_ty: Type = .fromInterned(fn_info.param_types.get(ip)[it.zig_index - 1]);
-                const param_llvm_ty = try o.lowerType(param_ty);
-                const alignment = param_ty.abiAlignment(zcu);
-                try o.addByRefParamAttrs(&attributes, it.llvm_index - 1, alignment.toLlvm(), it.byval_attr, param_llvm_ty);
+                try o.addByRefParamAttrs(&attributes, it.llvm_index - 1, it.byval_attr, param_ty);
             },
             .byref_mut => try attributes.addParamAttr(it.llvm_index - 1, .noundef, &o.builder),
             .slice => {
@@ -2841,14 +2838,31 @@ pub const Object = struct {
         }
     }
 
-    pub fn errorIntType(o: *Object) Allocator.Error!Builder.Type {
-        return o.builder.intType(o.zcu.errorSetBits());
+    pub const TypeRepr = enum { // Selects which LLVM representation `errorIntType` and `lowerType` return.
+        /// The representation of the type when it is being manipulated as a value in a function.
+        /// e.g. Zig `u5` -> LLVM `i5`
+        by_value,
+        /// The representation of the type when it is stored in memory.
+        /// e.g. Zig `u5` -> LLVM `i8`
+        in_memory,
+    };
+
+    pub fn errorIntType(o: *Object, repr: TypeRepr) Allocator.Error!Builder.Type { // LLVM integer type that `lowerType` uses for `anyerror` under `repr`.
+        return o.builder.intType(switch (repr) {
+            .by_value => o.zcu.errorSetBits(), // bit width reported by `errorSetBits`
+            .in_memory => @intCast(Type.anyerror.abiSize(o.zcu) * 8), // ABI size of `anyerror`, in bits
+        });
     }
 
-    pub fn lowerType(o: *Object, t: Type) Allocator.Error!Builder.Type {
+    pub fn lowerType(o: *Object, t: Type, repr: TypeRepr) Allocator.Error!Builder.Type {
         const zcu = o.zcu;
         const target = zcu.getTarget();
         const ip = &zcu.intern_pool;
+
+        if (repr == .by_value) {
+            assert(!isByRef(t, zcu)); // by-ref types must only be manipulated in memory
+        }
+
         return switch (t.toIntern()) {
             .u0_type => unreachable, // no runtime bits
             inline .u1_type,
@@ -2864,7 +2878,10 @@ pub const Object = struct {
             .u80_type,
             .u128_type,
             .i128_type,
-            => |tag| @field(Builder.Type, "i" ++ @tagName(tag)[1 .. @tagName(tag).len - "_type".len]),
+            => |tag| switch (repr) {
+                .by_value => @field(Builder.Type, "i" ++ @tagName(tag)[1 .. @tagName(tag).len - "_type".len]),
+                .in_memory => try o.builder.intType(@intCast(t.abiSize(zcu) * 8)),
+            },
             .usize_type, .isize_type => try o.builder.intType(target.ptrBitWidth()),
             inline .c_char_type,
             .c_short_type,
@@ -2899,7 +2916,7 @@ pub const Object = struct {
                 return .i8;
             },
             .bool_type => .i1,
-            .anyerror_type => try o.errorIntType(),
+            .anyerror_type => try o.errorIntType(repr),
             .void_type => unreachable, // no runtime bits
             .type_type => unreachable, // no runtime bits
             .comptime_int_type => unreachable, // no runtime bits
@@ -2919,10 +2936,10 @@ pub const Object = struct {
             => .ptr,
             .slice_const_u8_type,
             .slice_const_u8_sentinel_0_type,
-            => try o.builder.structType(.normal, &.{ .ptr, try o.lowerType(.usize) }),
+            => try o.builder.structType(.normal, &.{ .ptr, try o.lowerType(.usize, repr) }),
             .anyerror_void_error_union_type,
             .adhoc_inferred_error_set_type,
-            => try o.errorIntType(),
+            => try o.errorIntType(repr),
             .generic_poison_type => unreachable,
             // values, not types
             .undef,
@@ -2948,7 +2965,10 @@ pub const Object = struct {
             .none,
             => unreachable,
             else => switch (ip.indexToKey(t.toIntern())) {
-                .int_type => |int_type| try o.builder.intType(int_type.bits),
+                .int_type => |int_type| switch (repr) {
+                    .by_value => try o.builder.intType(int_type.bits),
+                    .in_memory => try o.builder.intType(@intCast(t.abiSize(zcu) * 8)),
+                },
                 .ptr_type => |ptr_type| type: {
                     const ptr_ty = try o.builder.ptrType(
                         toLlvmAddressSpace(ptr_type.flags.address_space, target),
@@ -2957,18 +2977,18 @@ pub const Object = struct {
                         .one, .many, .c => ptr_ty,
                         .slice => try o.builder.structType(.normal, &.{
                             ptr_ty,
-                            try o.lowerType(.usize),
+                            try o.lowerType(.usize, repr),
                         }),
                     };
                 },
                 .array_type => |array_type| o.builder.arrayType(
                     array_type.lenIncludingSentinel(),
-                    try o.lowerType(.fromInterned(array_type.child)),
+                    try o.lowerType(.fromInterned(array_type.child), repr),
                 ),
                 .vector_type => |vector_type| o.builder.vectorType(
                     .normal,
                     vector_type.len,
-                    try o.lowerType(.fromInterned(vector_type.child)),
+                    try o.lowerType(.fromInterned(vector_type.child), .by_value),
                 ),
                 .opt_type => |child_ty| {
                     // Must stay in sync with `opt_payload` logic in `lowerPtr`.
@@ -2978,8 +2998,11 @@ pub const Object = struct {
                         .runtime, .partially_comptime => {},
                     }
 
-                    const payload_ty = try o.lowerType(.fromInterned(child_ty));
-                    if (t.optionalReprIsPayload(zcu)) return payload_ty;
+                    if (t.optionalReprIsPayload(zcu)) {
+                        return o.lowerType(.fromInterned(child_ty), repr);
+                    }
+
+                    const payload_ty = try o.lowerType(.fromInterned(child_ty), repr);
 
                     comptime assert(optional_layout_version == 3);
                     var fields: [3]Builder.Type = .{ payload_ty, .i8, undefined };
@@ -2997,7 +3020,7 @@ pub const Object = struct {
                 .error_union_type => |error_union_type| {
                     // Must stay in sync with `codegen.errUnionPayloadOffset`.
                     // See logic in `lowerPtr`.
-                    const error_type = try o.errorIntType();
+                    const error_type = try o.errorIntType(repr);
 
                     switch (Type.fromInterned(error_union_type.payload_type).classify(zcu)) {
                         .fully_comptime => unreachable,
@@ -3005,7 +3028,7 @@ pub const Object = struct {
                         .runtime, .partially_comptime => {},
                     }
 
-                    const payload_type = try o.lowerType(.fromInterned(error_union_type.payload_type));
+                    const payload_type = try o.lowerType(.fromInterned(error_union_type.payload_type), repr);
 
                     const payload_align = Type.fromInterned(error_union_type.payload_type).abiAlignment(zcu);
                     const error_align: InternPool.Alignment = .fromByteUnits(std.zig.target.intAlignment(target, zcu.errorSetBits()));
@@ -3040,16 +3063,14 @@ pub const Object = struct {
                 },
                 .simple_type => unreachable,
                 .struct_type => {
-                    if (o.type_map.get(t.toIntern())) |value| return value;
-
                     const struct_type = ip.loadStructType(t.toIntern());
 
                     if (struct_type.layout == .@"packed") {
-                        const int_ty = try o.lowerType(.fromInterned(struct_type.packed_backing_int_type));
-                        try o.type_map.put(o.gpa, t.toIntern(), int_ty);
-                        return int_ty;
+                        return o.lowerType(.fromInterned(struct_type.packed_backing_int_type), repr);
                     }
 
+                    if (o.type_map.get(t.toIntern())) |value| return value;
+
                     assert(struct_type.size > 0);
 
                     var llvm_field_types: std.ArrayList(Builder.Type) = .empty;
@@ -3083,7 +3104,7 @@ pub const Object = struct {
 
                         if (!field_ty.hasRuntimeBits(zcu)) continue;
 
-                        try llvm_field_types.append(o.gpa, try o.lowerType(field_ty));
+                        try llvm_field_types.append(o.gpa, try o.lowerType(field_ty, repr));
 
                         offset += field_ty.abiSize(zcu);
                     }
@@ -3139,7 +3160,7 @@ pub const Object = struct {
                         if (!Type.fromInterned(field_ty).hasRuntimeBits(zcu)) {
                             continue;
                         }
-                        try llvm_field_types.append(o.gpa, try o.lowerType(.fromInterned(field_ty)));
+                        try llvm_field_types.append(o.gpa, try o.lowerType(.fromInterned(field_ty), repr));
 
                         offset += Type.fromInterned(field_ty).abiSize(zcu);
                     }
@@ -3156,28 +3177,24 @@ pub const Object = struct {
                     return o.builder.structType(.normal, llvm_field_types.items);
                 },
                 .union_type => {
-                    if (o.type_map.get(t.toIntern())) |value| return value;
-
                     const union_obj = ip.loadUnionType(t.toIntern());
 
                     if (union_obj.layout == .@"packed") {
-                        const int_ty = try o.lowerType(.fromInterned(union_obj.packed_backing_int_type));
-                        try o.type_map.put(o.gpa, t.toIntern(), int_ty);
-                        return int_ty;
+                        return o.lowerType(.fromInterned(union_obj.packed_backing_int_type), repr);
                     }
 
-                    assert(union_obj.size > 0);
-
                     const layout = Type.getUnionLayout(union_obj, zcu);
 
                     if (layout.payload_size == 0) {
-                        const enum_tag_ty = try o.lowerType(.fromInterned(union_obj.enum_tag_type));
-                        try o.type_map.put(o.gpa, t.toIntern(), enum_tag_ty);
-                        return enum_tag_ty;
+                        return o.lowerType(.fromInterned(union_obj.enum_tag_type), repr);
                     }
 
+                    if (o.type_map.get(t.toIntern())) |value| return value;
+
+                    assert(union_obj.size > 0);
+
                     const aligned_field_ty = Type.fromInterned(union_obj.field_types.get(ip)[layout.most_aligned_field]);
-                    const aligned_field_llvm_ty = try o.lowerType(aligned_field_ty);
+                    const aligned_field_llvm_ty = try o.lowerType(aligned_field_ty, repr);
 
                     const payload_ty = ty: {
                         if (layout.most_aligned_field_size == layout.payload_size) {
@@ -3203,7 +3220,7 @@ pub const Object = struct {
                         );
                         return ty;
                     }
-                    const enum_tag_ty = try o.lowerType(.fromInterned(union_obj.enum_tag_type));
+                    const enum_tag_ty = try o.lowerType(.fromInterned(union_obj.enum_tag_type), repr);
 
                     // Put the tag before or after the payload depending on which one's
                     // alignment is greater.
@@ -3232,9 +3249,9 @@ pub const Object = struct {
                     return ty;
                 },
                 .opaque_type, .spirv_type => unreachable, // no runtime bits
-                .enum_type => try o.lowerType(t.intTagType(zcu)),
+                .enum_type => try o.lowerType(t.intTagType(zcu), repr),
                 .func_type => |func_type| try o.lowerFnType(t, func_type),
-                .error_set_type, .inferred_error_set_type => try o.errorIntType(),
+                .error_set_type, .inferred_error_set_type => try o.errorIntType(repr),
                 // values, not types
                 .undef,
                 .simple_value,
@@ -3266,12 +3283,12 @@ pub const Object = struct {
 
         assert(fn_ty.fnHasRuntimeBits(zcu));
 
-        const ret_ty = try lowerFnRetTy(o, fn_info);
+        const ret_strat = try fnReturnStrat(o, fn_info);
 
         var llvm_params: std.ArrayList(Builder.Type) = .empty;
         defer llvm_params.deinit(o.gpa);
 
-        if (firstParamSRet(fn_info, zcu, target)) {
+        if (ret_strat == .sret) {
             try llvm_params.append(o.gpa, .ptr);
         }
 
@@ -3286,7 +3303,7 @@ pub const Object = struct {
             .no_bits => continue,
             .byval => {
                 const param_ty = Type.fromInterned(fn_info.param_types.get(ip)[it.zig_index - 1]);
-                try llvm_params.append(o.gpa, try o.lowerType(param_ty));
+                try llvm_params.append(o.gpa, try o.lowerType(param_ty, if (isByRef(param_ty, zcu)) .in_memory else .by_value));
             },
             .byref, .byref_mut => {
                 try llvm_params.append(o.gpa, .ptr);
@@ -3301,7 +3318,7 @@ pub const Object = struct {
                 const param_ty = Type.fromInterned(fn_info.param_types.get(ip)[it.zig_index - 1]);
                 try llvm_params.appendSlice(o.gpa, &.{
                     try o.builder.ptrType(toLlvmAddressSpace(param_ty.ptrAddressSpace(zcu), target)),
-                    try o.lowerType(.usize),
+                    try o.lowerType(.usize, .by_value),
                 });
             },
             .multiple_llvm_types => {
@@ -3309,7 +3326,7 @@ pub const Object = struct {
             },
             .float_array => |count| {
                 const param_ty = Type.fromInterned(fn_info.param_types.get(ip)[it.zig_index - 1]);
-                const float_ty = try o.lowerType(aarch64_c_abi.getFloatArrayType(param_ty, zcu).?);
+                const float_ty = try o.lowerType(aarch64_c_abi.getFloatArrayType(param_ty, zcu).?, .in_memory);
                 try llvm_params.append(o.gpa, try o.builder.arrayType(count, float_ty));
             },
             .i32_array, .i64_array => |arr_len| {
@@ -3321,14 +3338,19 @@ pub const Object = struct {
             },
         };
 
-        return o.builder.fnType(
-            ret_ty,
-            llvm_params.items,
-            if (fn_info.is_var_args) .vararg else .normal,
-        );
+        const llvm_ret_ty: Builder.Type = switch (ret_strat) {
+            .void, .sret => .void,
+            .by_val => try o.lowerType(.fromInterned(fn_info.return_type), .by_value),
+            .mem_cast => |llvm_ret_ty| llvm_ret_ty,
+        };
+        const llvm_fn_kind: Builder.Type.Function.Kind = switch (fn_info.is_var_args) {
+            true => .vararg,
+            false => .normal,
+        };
+        return o.builder.fnType(llvm_ret_ty, llvm_params.items, llvm_fn_kind);
     }
 
-    pub fn lowerValue(o: *Object, arg_val: InternPool.Index) Allocator.Error!Builder.Constant {
+    pub fn lowerValue(o: *Object, arg_val: InternPool.Index, repr: TypeRepr) Allocator.Error!Builder.Constant {
         const zcu = o.zcu;
         const ip = &zcu.intern_pool;
         const target = zcu.getTarget();
@@ -3360,7 +3382,7 @@ pub const Object = struct {
             .inferred_error_set_type,
             => unreachable, // types, not values
 
-            .undef => return o.builder.undefConst(try o.lowerType(ty)),
+            .undef => return o.builder.undefConst(try o.lowerType(ty, repr)),
             .simple_value => |simple_value| switch (simple_value) {
                 .void => unreachable, // non-runtime value
                 .null => unreachable, // non-runtime value
@@ -3375,15 +3397,15 @@ pub const Object = struct {
             .int => {
                 var bigint_space: Value.BigIntSpace = undefined;
                 const bigint = val.toBigInt(&bigint_space, zcu);
-                const llvm_int_ty = try o.builder.intType(ty.intInfo(zcu).bits);
+                const llvm_int_ty = try o.lowerType(ty, repr);
                 return o.builder.bigIntConst(llvm_int_ty, bigint);
             },
             .err => |err| {
                 const int = zcu.intern_pool.getErrorValueIfExists(err.name).?;
-                return o.builder.intConst(try o.errorIntType(), int);
+                return o.builder.intConst(try o.errorIntType(repr), int);
             },
             .error_union => |error_union| {
-                const llvm_error_ty = try o.errorIntType();
+                const llvm_error_ty = try o.errorIntType(repr);
                 const llvm_error_value = switch (error_union.val) {
                     .err_name => |name| try o.builder.intConst(
                         llvm_error_ty,
@@ -3401,8 +3423,8 @@ pub const Object = struct {
                 const payload_align = payload_type.abiAlignment(zcu);
                 const error_align = Type.errorAbiAlignment(zcu);
                 const llvm_payload_value = switch (error_union.val) {
-                    .err_name => try o.builder.undefConst(try o.lowerType(payload_type)),
-                    .payload => |payload| try o.lowerValue(payload),
+                    .err_name => try o.builder.undefConst(try o.lowerType(payload_type, repr)),
+                    .payload => |payload| try o.lowerValue(payload, repr),
                 };
 
                 var fields: [3]Builder.Type = undefined;
@@ -3417,7 +3439,7 @@ pub const Object = struct {
                 fields[0] = vals[0].typeOf(&o.builder);
                 fields[1] = vals[1].typeOf(&o.builder);
 
-                const llvm_ty = try o.lowerType(ty);
+                const llvm_ty = try o.lowerType(ty, repr);
                 const llvm_ty_fields = llvm_ty.structFields(&o.builder);
                 if (llvm_ty_fields.len > 2) {
                     assert(llvm_ty_fields.len == 3);
@@ -3429,7 +3451,7 @@ pub const Object = struct {
                     fields[0..llvm_ty_fields.len],
                 ), vals[0..llvm_ty_fields.len]);
             },
-            .enum_tag => |enum_tag| o.lowerValue(enum_tag.int),
+            .enum_tag => |enum_tag| o.lowerValue(enum_tag.int, repr),
             .float => switch (ty.floatBits(target)) {
                 16 => if (backendSupportsF16(target))
                     try o.builder.halfConst(val.toFloat(f16, zcu))
@@ -3445,9 +3467,9 @@ pub const Object = struct {
                 else => unreachable,
             },
             .ptr => try o.lowerPtr(arg_val, 0),
-            .slice => |slice| return o.builder.structConst(try o.lowerType(ty), &.{
-                try o.lowerValue(slice.ptr),
-                try o.lowerValue(slice.len),
+            .slice => |slice| return o.builder.structConst(try o.lowerType(ty, repr), &.{
+                try o.lowerValue(slice.ptr, repr),
+                try o.lowerValue(slice.len, repr),
             }),
             .opt => |opt| {
                 comptime assert(optional_layout_version == 3);
@@ -3457,7 +3479,7 @@ pub const Object = struct {
                 if (!payload_ty.hasRuntimeBits(zcu)) {
                     return non_null_bit;
                 }
-                const llvm_ty = try o.lowerType(ty);
+                const llvm_ty = try o.lowerType(ty, repr);
                 if (ty.optionalReprIsPayload(zcu)) return switch (opt.val) {
                     .none => switch (llvm_ty.tag(&o.builder)) {
                         .integer => try o.builder.intConst(llvm_ty, 0),
@@ -3465,15 +3487,15 @@ pub const Object = struct {
                         .structure => try o.builder.zeroInitConst(llvm_ty),
                         else => unreachable,
                     },
-                    else => |payload| try o.lowerValue(payload),
+                    else => |payload| try o.lowerValue(payload, repr),
                 };
                 assert(payload_ty.zigTypeTag(zcu) != .@"fn");
 
                 var fields: [3]Builder.Type = undefined;
                 var vals: [3]Builder.Constant = undefined;
                 vals[0] = switch (opt.val) {
-                    .none => try o.builder.undefConst(try o.lowerType(payload_ty)),
-                    else => |payload| try o.lowerValue(payload),
+                    .none => try o.builder.undefConst(try o.lowerType(payload_ty, repr)),
+                    else => |payload| try o.lowerValue(payload, repr),
                 };
                 vals[1] = non_null_bit;
                 fields[0] = vals[0].typeOf(&o.builder);
@@ -3490,14 +3512,14 @@ pub const Object = struct {
                     fields[0..llvm_ty_fields.len],
                 ), vals[0..llvm_ty_fields.len]);
             },
-            .bitpack => |bitpack| return o.lowerValue(bitpack.backing_int_val),
+            .bitpack => |bitpack| return o.lowerValue(bitpack.backing_int_val, repr),
             .aggregate => |aggregate| switch (ip.indexToKey(ty.toIntern())) {
                 .array_type => |array_type| switch (aggregate.storage) {
                     .bytes => |bytes| try o.builder.stringConst(try o.builder.string(
                         bytes.toSlice(array_type.lenIncludingSentinel(), ip),
                     )),
                     .elems => |elems| {
-                        const array_ty = try o.lowerType(ty);
+                        const array_ty = try o.lowerType(ty, repr);
                         const elem_ty = array_ty.childType(&o.builder);
                         assert(elems.len == array_ty.aggregateLen(&o.builder));
 
@@ -3515,7 +3537,7 @@ pub const Object = struct {
 
                         var need_unnamed = false;
                         for (vals, fields, elems) |*result_val, *result_field, elem| {
-                            result_val.* = try o.lowerValue(elem);
+                            result_val.* = try o.lowerValue(elem, repr);
                             result_field.* = result_val.typeOf(&o.builder);
                             if (result_field.* != elem_ty) need_unnamed = true;
                         }
@@ -3527,7 +3549,7 @@ pub const Object = struct {
                     .repeated_elem => |elem| {
                         const len: usize = @intCast(array_type.len);
                         const len_including_sentinel: usize = @intCast(array_type.lenIncludingSentinel());
-                        const array_ty = try o.lowerType(ty);
+                        const array_ty = try o.lowerType(ty, repr);
                         const elem_ty = array_ty.childType(&o.builder);
 
                         const ExpectedContents = extern struct {
@@ -3543,12 +3565,12 @@ pub const Object = struct {
                         defer allocator.free(fields);
 
                         var need_unnamed = false;
-                        @memset(vals[0..len], try o.lowerValue(elem));
+                        @memset(vals[0..len], try o.lowerValue(elem, repr));
                         @memset(fields[0..len], vals[0].typeOf(&o.builder));
                         if (fields[0] != elem_ty) need_unnamed = true;
 
                         if (array_type.sentinel != .none) {
-                            vals[len] = try o.lowerValue(array_type.sentinel);
+                            vals[len] = try o.lowerValue(array_type.sentinel, repr);
                             fields[len] = vals[len].typeOf(&o.builder);
                             if (fields[len] != elem_ty) need_unnamed = true;
                         }
@@ -3560,7 +3582,7 @@ pub const Object = struct {
                     },
                 },
                 .vector_type => |vector_type| {
-                    const vector_ty = try o.lowerType(ty);
+                    const vector_ty = try o.lowerType(ty, repr);
                     switch (aggregate.storage) {
                         .bytes, .elems => {
                             const ExpectedContents = [Builder.expected_fields_len]Builder.Constant;
@@ -3575,7 +3597,7 @@ pub const Object = struct {
                                     result_val.* = try o.builder.intConst(.i8, byte);
                                 },
                                 .elems => |elems| for (vals, elems) |*result_val, elem| {
-                                    result_val.* = try o.lowerValue(elem);
+                                    result_val.* = try o.lowerValue(elem, .by_value);
                                 },
                                 .repeated_elem => unreachable,
                             }
@@ -3583,12 +3605,12 @@ pub const Object = struct {
                         },
                         .repeated_elem => |elem| return o.builder.splatConst(
                             vector_ty,
-                            try o.lowerValue(elem),
+                            try o.lowerValue(elem, .by_value),
                         ),
                     }
                 },
                 .tuple_type => |tuple| {
-                    const struct_ty = try o.lowerType(ty);
+                    const struct_ty = try o.lowerType(ty, repr);
                     const llvm_len = struct_ty.aggregateLen(&o.builder);
 
                     const ExpectedContents = extern struct {
@@ -3633,8 +3655,8 @@ pub const Object = struct {
 
                         vals[llvm_index] = switch (aggregate.storage) {
                             .bytes => |bytes| try o.builder.intConst(.i8, bytes.at(field_index, ip)),
-                            .elems => |elems| try o.lowerValue(elems[field_index]),
-                            .repeated_elem => |elem| try o.lowerValue(elem),
+                            .elems => |elems| try o.lowerValue(elems[field_index], repr),
+                            .repeated_elem => |elem| try o.lowerValue(elem, repr),
                         };
                         fields[llvm_index] = vals[llvm_index].typeOf(&o.builder);
                         if (fields[llvm_index] != struct_ty.structFields(&o.builder)[llvm_index])
@@ -3663,7 +3685,7 @@ pub const Object = struct {
                 },
                 .struct_type => {
                     const struct_type = ip.loadStructType(ty.toIntern());
-                    const struct_ty = try o.lowerType(ty);
+                    const struct_ty = try o.lowerType(ty, repr);
                     assert(struct_type.layout != .@"packed");
                     const llvm_len = struct_ty.aggregateLen(&o.builder);
 
@@ -3707,8 +3729,8 @@ pub const Object = struct {
 
                         vals[llvm_index] = switch (aggregate.storage) {
                             .bytes => |bytes| try o.builder.intConst(.i8, bytes.at(field_index, ip)),
-                            .elems => |elems| try o.lowerValue(elems[field_index]),
-                            .repeated_elem => |elem| try o.lowerValue(elem),
+                            .elems => |elems| try o.lowerValue(elems[field_index], repr),
+                            .repeated_elem => |elem| try o.lowerValue(elem, repr),
                         };
                         fields[llvm_index] = vals[llvm_index].typeOf(&o.builder);
                         if (fields[llvm_index] != struct_ty.structFields(&o.builder)[llvm_index])
@@ -3738,9 +3760,9 @@ pub const Object = struct {
                 else => unreachable,
             },
             .un => |un| {
-                const union_ty = try o.lowerType(ty);
+                const union_ty = try o.lowerType(ty, repr);
                 const layout = ty.unionGetLayout(zcu);
-                if (layout.payload_size == 0) return o.lowerValue(un.tag);
+                if (layout.payload_size == 0) return o.lowerValue(un.tag, repr);
 
                 const union_obj = zcu.typeToUnion(ty).?;
                 const container_layout = union_obj.layout;
@@ -3761,7 +3783,7 @@ pub const Object = struct {
                         const padding_len = layout.payload_size;
                         break :p try o.builder.undefConst(try o.builder.arrayType(padding_len, .i8));
                     }
-                    const payload = try o.lowerValue(un.val);
+                    const payload = try o.lowerValue(un.val, repr);
                     const payload_ty = payload.typeOf(&o.builder);
                     if (payload_ty != union_ty.structFields(&o.builder)[
                         @intFromBool(layout.tag_size > 0 and layout.tag_align.compare(.gte, layout.payload_align))
@@ -3776,7 +3798,7 @@ pub const Object = struct {
                     );
                 } else p: {
                     assert(layout.tag_size == 0);
-                    const union_val = try o.lowerValue(un.val);
+                    const union_val = try o.lowerValue(un.val, repr);
                     need_unnamed = true;
                     break :p union_val;
                 };
@@ -3786,7 +3808,7 @@ pub const Object = struct {
                     try o.builder.structType(union_ty.structKind(&o.builder), &.{payload_ty})
                 else
                     union_ty, &.{payload});
-                const tag = try o.lowerValue(un.tag);
+                const tag = try o.lowerValue(un.tag, repr);
                 const tag_ty = tag.typeOf(&o.builder);
                 var fields: [3]Builder.Type = undefined;
                 var vals: [3]Builder.Constant = undefined;
@@ -3840,8 +3862,8 @@ pub const Object = struct {
             },
             .int => try o.builder.castConst(
                 .inttoptr,
-                try o.builder.intConst(try o.lowerType(.usize), offset),
-                try o.lowerType(.fromInterned(ptr.ty)),
+                try o.builder.intConst(try o.lowerType(.usize, .by_value), offset),
+                try o.lowerType(.fromInterned(ptr.ty), .by_value),
             ),
             .eu_payload => |eu_ptr| try o.lowerPtr(
                 eu_ptr,
@@ -3888,7 +3910,7 @@ pub const Object = struct {
         @"addrspace": std.lang.AddressSpace,
     ) Allocator.Error!Builder.Constant {
         const addr: u64 = @"align".toByteUnits().?;
-        const llvm_usize = try o.lowerType(.usize);
+        const llvm_usize = try o.lowerType(.usize, .by_value);
         const llvm_addr = try o.builder.intConst(llvm_usize, addr);
         const llvm_ptr_ty = try o.builder.ptrType(toLlvmAddressSpace(@"addrspace", o.zcu.getTarget()));
         return o.builder.castConst(.inttoptr, llvm_addr, llvm_ptr_ty);
@@ -3932,11 +3954,11 @@ pub const Object = struct {
         }
         errdefer assert(o.uav_map.remove(.{ .val = uav_val, .@"addrspace" = @"addrspace" }));
 
-        const llvm_ty = try o.lowerType(uav_ty);
+        const llvm_ty = try o.lowerType(uav_ty, .in_memory);
         const llvm_name = try o.builder.strtabStringFmt("__anon_{d}", .{@intFromEnum(uav_val)});
         const llvm_variable = try o.builder.addVariable(llvm_name, llvm_ty, llvm_addrspace);
         gop.value_ptr.* = llvm_variable;
-        try llvm_variable.setInitializer(try o.lowerValue(uav_val), &o.builder);
+        try llvm_variable.setInitializer(try o.lowerValue(uav_val, .in_memory), &o.builder);
         llvm_variable.setMutability(.constant, &o.builder);
         llvm_variable.setAlignment(@"align".toLlvm(), &o.builder);
         const llvm_global = llvm_variable.ptrConst(&o.builder).global;
@@ -4008,7 +4030,7 @@ pub const Object = struct {
                 .x86_64_interrupt,
                 .x86_interrupt,
                 => {
-                    const child_type = try lowerType(o, Type.fromInterned(ptr_info.child));
+                    const child_type = try o.lowerType(.fromInterned(ptr_info.child), .in_memory);
                     try attributes.addParamAttr(llvm_arg_i, .{ .byval = child_type }, &o.builder);
                 },
             }
@@ -4030,14 +4052,15 @@ pub const Object = struct {
         o: *Object,
         attributes: *Builder.FunctionAttributes.Wip,
         llvm_arg_i: u32,
-        alignment: Builder.Alignment,
         byval: bool,
-        param_llvm_ty: Builder.Type,
+        param_ty: Type,
     ) Allocator.Error!void {
+        const llvm_param_ty = try o.lowerType(param_ty, .in_memory);
+        const alignment = param_ty.abiAlignment(o.zcu).toLlvm();
         try attributes.addParamAttr(llvm_arg_i, .nonnull, &o.builder);
         try attributes.addParamAttr(llvm_arg_i, .readonly, &o.builder);
         try attributes.addParamAttr(llvm_arg_i, .{ .@"align" = .wrap(alignment) }, &o.builder);
-        if (byval) try attributes.addParamAttr(llvm_arg_i, .{ .byval = param_llvm_ty }, &o.builder);
+        if (byval) try attributes.addParamAttr(llvm_arg_i, .{ .byval = llvm_param_ty }, &o.builder);
     }
 
     pub fn getErrorNameTable(o: *Object) Allocator.Error!Builder.Variable.Index {
@@ -4062,7 +4085,7 @@ pub const Object = struct {
     pub fn getErrorsLen(o: *Object) Allocator.Error!Builder.Variable.Index {
         const builder = &o.builder;
         if (o.errors_len_variable == .none) {
-            const llvm_err_int_ty = try o.errorIntType();
+            const llvm_err_int_ty = try o.errorIntType(.in_memory);
             const name = try builder.strtabString("__zig_errors_len");
             const variable_index = try builder.addVariable(name, llvm_err_int_ty, .default);
             variable_index.setMutability(.constant, builder);
@@ -4102,9 +4125,9 @@ pub const Object = struct {
         const ip = &zcu.intern_pool;
         const loaded_enum = ip.loadEnumType(enum_ty.toIntern());
 
-        const llvm_usize_ty = try o.lowerType(.usize);
-        const llvm_ret_ty = try o.lowerType(.slice_const_u8_sentinel_0);
-        const llvm_int_ty = try o.lowerType(.fromInterned(loaded_enum.int_tag_type));
+        const llvm_usize_ty = try o.lowerType(.usize, .by_value);
+        const llvm_ret_ty = try o.lowerType(.slice_const_u8_sentinel_0, .by_value);
+        const llvm_int_ty = try o.lowerType(.fromInterned(loaded_enum.int_tag_type), .by_value);
 
         function_index.ptrConst(&o.builder).global.ptr(&o.builder).type =
             try o.builder.fnType(llvm_ret_ty, &.{llvm_int_ty}, .normal);
@@ -4153,7 +4176,7 @@ pub const Object = struct {
             const return_block = try wip.block(1, "Name");
             const llvm_tag_val = switch (loaded_enum.field_values.getOrNone(ip, field_index)) {
                 .none => try o.builder.intConst(llvm_int_ty, field_index), // auto-numbered
-                else => |tag_val_ip| try o.lowerValue(tag_val_ip),
+                else => |tag_val_ip| try o.lowerValue(tag_val_ip, .by_value),
             };
             try wip_switch.addCase(llvm_tag_val, return_block, &wip);
 
@@ -4199,7 +4222,7 @@ pub const Object = struct {
         const ip = &zcu.intern_pool;
         const loaded_enum = ip.loadEnumType(enum_ty.toIntern());
 
-        const llvm_int_ty = try o.lowerType(.fromInterned(loaded_enum.int_tag_type));
+        const llvm_int_ty = try o.lowerType(.fromInterned(loaded_enum.int_tag_type), .by_value);
         function_index.ptrConst(&o.builder).global.ptr(&o.builder).type =
             try o.builder.fnType(.i1, &.{llvm_int_ty}, .normal);
 
@@ -4226,7 +4249,7 @@ pub const Object = struct {
 
         if (loaded_enum.field_values.len > 0) {
             for (loaded_enum.field_values.get(ip)) |tag_val_ip| {
-                const llvm_tag_val = try o.lowerValue(tag_val_ip);
+                const llvm_tag_val = try o.lowerValue(tag_val_ip, .by_value);
                 try wip_switch.addCase(llvm_tag_val, named_block, &wip);
             }
         } else {
diff --git a/src/codegen/llvm/FuncGen.zig b/src/codegen/llvm/FuncGen.zig
@@ -164,7 +164,7 @@ fn resolveValue(self: *FuncGen, val: Value) Allocator.Error!Builder.Constant {
     const zcu = o.zcu;
     const ty = val.typeOf(zcu);
     if (!isByRef(ty, zcu)) {
-        return o.lowerValue(val.toIntern());
+        return o.lowerValue(val.toIntern(), .by_value);
     } else {
         // We need a pointer to a global constant, i.e. a UAV.
         return o.lowerUavRef(
@@ -193,12 +193,13 @@ pub fn genMainBody(fg: *FuncGen) TodoError!void {
     var it = iterateParamTypes(o, fn_info);
 
     // Populate `fg.ret_ptr`...
-    if (firstParamSRet(fn_info, zcu, zcu.getTarget())) {
-        fg.ret_ptr = fg.wip.arg(it.llvm_index);
-        it.llvm_index += 1;
-    } else {
-        fg.ret_ptr = .none;
-    }
+    fg.ret_ptr = switch (try fnReturnStrat(o, fn_info)) {
+        .sret => rp: {
+            defer it.llvm_index += 1;
+            break :rp fg.wip.arg(it.llvm_index);
+        },
+        else => .none,
+    };
     // ...and `fg.err_ret_trace`...
     if (fn_info.cc == .auto and comp.config.any_error_tracing) {
         fg.err_ret_trace = fg.wip.arg(it.llvm_index);
@@ -224,7 +225,7 @@ pub fn genMainBody(fg: *FuncGen) TodoError!void {
 
                 if (isByRef(param_ty, zcu)) {
                     const alignment = param_ty.abiAlignment(zcu).toLlvm();
-                    const arg_ptr = try fg.buildAlloca(try o.lowerType(param_ty), alignment);
+                    const arg_ptr = try fg.buildZigAlloca(param_ty, .none);
                     // We don't need to handle non-ABI-sized integer types in memory here since they
                     // are never by-ref.
                     _ = try fg.wip.store(.normal, param, arg_ptr, alignment);
@@ -248,9 +249,8 @@ pub fn genMainBody(fg: *FuncGen) TodoError!void {
                 const param_ty: Type = .fromInterned(param_types[it.zig_index - 1]);
                 const param = fg.wip.arg(it.llvm_index - 1);
 
-                const param_llvm_ty = try o.lowerType(param_ty);
                 const alignment = param_ty.abiAlignment(zcu).toLlvm();
-                const arg_ptr = try fg.buildAlloca(param_llvm_ty, alignment);
+                const arg_ptr = try fg.buildZigAlloca(param_ty, .none);
                 _ = try fg.wip.store(.normal, param, arg_ptr, alignment);
 
                 if (isByRef(param_ty, zcu)) {
@@ -264,7 +264,7 @@ pub fn genMainBody(fg: *FuncGen) TodoError!void {
                 const param_ty: Type = .fromInterned(param_types[it.zig_index - 1]);
                 assert(!isByRef(param_ty, zcu));
                 const slice_val = try fg.wip.buildAggregate(
-                    try o.lowerType(param_ty),
+                    try o.lowerType(param_ty, .by_value),
                     &.{ fg.wip.arg(it.llvm_index - 2), fg.wip.arg(it.llvm_index - 1) },
                     "",
                 );
@@ -291,11 +291,10 @@ pub fn genMainBody(fg: *FuncGen) TodoError!void {
             },
             .float_array => {
                 const param_ty: Type = .fromInterned(param_types[it.zig_index - 1]);
-                const param_llvm_ty = try o.lowerType(param_ty);
                 const param = fg.wip.arg(it.llvm_index - 1);
 
                 const alignment = param_ty.abiAlignment(zcu).toLlvm();
-                const arg_ptr = try fg.buildAlloca(param_llvm_ty, alignment);
+                const arg_ptr = try fg.buildZigAlloca(param_ty, .none);
                 _ = try fg.wip.store(.normal, param, arg_ptr, alignment);
 
                 if (isByRef(param_ty, zcu)) {
@@ -349,13 +348,13 @@ fn genBody(self: *FuncGen, body: []const Air.Inst.Index, coverage_point: Air.Cov
             try fuzz.pcs.append(gpa, pc);
         },
     }
-    for (body, 0..) |inst, i| {
+    for (body) |inst| {
         if (self.liveness.isUnused(inst) and !self.air.mustLower(inst, ip)) continue;
 
         const val: Builder.Value = switch (air_tags[@intFromEnum(inst)]) {
             // zig fmt: off
 
-            // Required due to `.scalarize_bitcast_vector_non_elementwise` being enabled.
+            // Required due to `.scalarize_bit_cast_vector_non_elementwise` being enabled.
             .legalize_vec_elem_val   => try self.airLegalizeVecElemVal(inst),
             .legalize_vec_store_elem => try self.airLegalizeVecStoreElem(inst),
 
@@ -461,29 +460,36 @@ fn genBody(self: *FuncGen, body: []const Air.Inst.Index, coverage_point: Air.Cov
             .is_err          => try self.airIsErr(inst, .ne, false),
             .is_err_ptr      => try self.airIsErr(inst, .ne, true),
 
-            .alloc          => try self.airAlloc(inst),
-            .ret_ptr        => try self.airRetPtr(inst),
-            .arg            => try self.airArg(inst),
-            .bitcast        => try self.airBitCast(inst),
-            .breakpoint     => try self.airBreakpoint(inst),
-            .ret_addr       => try self.airRetAddr(inst),
-            .frame_addr     => try self.airFrameAddress(inst),
-            .@"try"         => try self.airTry(inst, false),
-            .try_cold       => try self.airTry(inst, true),
-            .try_ptr        => try self.airTryPtr(inst, false),
-            .try_ptr_cold   => try self.airTryPtr(inst, true),
-            .intcast        => try self.airIntCast(inst, false),
-            .intcast_safe   => try self.airIntCast(inst, true),
-            .trunc          => try self.airTrunc(inst),
-            .fptrunc        => try self.airFptrunc(inst),
-            .fpext          => try self.airFpext(inst),
-            .load           => try self.airLoad(inst),
-            .not            => try self.airNot(inst),
-            .store          => try self.airStore(inst, false),
-            .store_safe     => try self.airStore(inst, true),
-            .assembly       => try self.airAssembly(inst),
-            .slice_ptr      => try self.airSliceField(inst, 0),
-            .slice_len      => try self.airSliceField(inst, 1),
+            .alloc           => try self.airAlloc(inst),
+            .ret_ptr         => try self.airRetPtr(inst),
+            .arg             => try self.airArg(inst),
+            .bit_cast        => try self.airBitCast(inst),
+            .ptr_cast        => try self.airNopCast(inst),
+            .ptr_from_int    => try self.airPtrFromInt(inst),
+            .int_from_ptr    => try self.airIntFromPtr(inst),
+            .error_cast      => try self.airNopCast(inst),
+            .error_from_int  => try self.airNopCast(inst),
+            .int_from_error  => try self.airNopCast(inst),
+            .union_from_enum => try self.airUnionFromEnum(inst),
+            .breakpoint      => try self.airBreakpoint(inst),
+            .ret_addr        => try self.airRetAddr(inst),
+            .frame_addr      => try self.airFrameAddress(inst),
+            .@"try"          => try self.airTry(inst, false),
+            .try_cold        => try self.airTry(inst, true),
+            .try_ptr         => try self.airTryPtr(inst, false),
+            .try_ptr_cold    => try self.airTryPtr(inst, true),
+            .int_cast        => try self.airIntCast(inst, false),
+            .int_cast_safe   => try self.airIntCast(inst, true),
+            .trunc           => try self.airTrunc(inst),
+            .fptrunc         => try self.airFptrunc(inst),
+            .fpext           => try self.airFpext(inst),
+            .load            => try self.airLoad(inst),
+            .not             => try self.airNot(inst),
+            .store           => try self.airStore(inst, false),
+            .store_safe      => try self.airStore(inst, true),
+            .assembly        => try self.airAssembly(inst),
+            .slice_ptr       => try self.airSliceField(inst, 0),
+            .slice_len       => try self.airSliceField(inst, 1),
 
             .ptr_slice_ptr_ptr => try self.airPtrSliceFieldPtr(inst, 0),
             .ptr_slice_len_ptr => try self.airPtrSliceFieldPtr(inst, 1),
@@ -561,9 +567,9 @@ fn genBody(self: *FuncGen, body: []const Air.Inst.Index, coverage_point: Air.Cov
             .set_err_return_trace        => try self.airSetErrReturnTrace(inst),
             .save_err_return_trace_index => try self.airSaveErrReturnTraceIndex(inst),
 
-            .wrap_optional         => try self.airWrapOptional(body[i..]),
-            .wrap_errunion_payload => try self.airWrapErrUnionPayload(body[i..]),
-            .wrap_errunion_err     => try self.airWrapErrUnionErr(body[i..]),
+            .wrap_optional         => try self.airWrapOptional(inst),
+            .wrap_errunion_payload => try self.airWrapErrUnionPayload(inst),
+            .wrap_errunion_err     => try self.airWrapErrUnionErr(inst),
 
             .wasm_memory_size => try self.airWasmMemorySize(inst),
             .wasm_memory_grow => try self.airWasmMemoryGrow(inst),
@@ -746,7 +752,7 @@ fn airCall(self: *FuncGen, inst: Air.Inst.Index, modifier: std.lang.CallModifier
         break :llvm_fn try self.resolveInst(air_call.callee);
     };
     const target = zcu.getTarget();
-    const sret = firstParamSRet(fn_info, zcu, target);
+    const ret_strat = try fnReturnStrat(o, fn_info);
 
     var llvm_args = std.array_list.Managed(Builder.Value).init(self.gpa);
     defer llvm_args.deinit();
@@ -764,20 +770,21 @@ fn airCall(self: *FuncGen, inst: Air.Inst.Index, modifier: std.lang.CallModifier
         .no_suspend, .always_inline, .compile_time => unreachable,
     }
 
-    const ret_ptr = if (sret) ret_ptr: {
-        const llvm_ret_ty = try o.lowerType(return_type);
-        try attributes.addParamAttr(0, .{ .sret = llvm_ret_ty }, &o.builder);
+    const sret_alloc: ?Builder.Value = switch (ret_strat) {
+        .sret => sret_alloc: {
+            try attributes.addParamAttr(0, .{ .sret = try o.lowerType(return_type, .in_memory) }, &o.builder);
 
-        const alignment = return_type.abiAlignment(zcu).toLlvm();
-        const ret_ptr = try self.buildAlloca(llvm_ret_ty, alignment);
-        try llvm_args.append(ret_ptr);
-        break :ret_ptr ret_ptr;
-    } else ret_ptr: {
-        if (ccAbiPromoteInt(fn_info.cc, zcu, Type.fromInterned(fn_info.return_type))) |s| switch (s) {
-            .signed => try attributes.addRetAttr(.signext, &o.builder),
-            .unsigned => try attributes.addRetAttr(.zeroext, &o.builder),
-        };
-        break :ret_ptr null;
+            const ptr = try self.buildZigAlloca(return_type, .none);
+            try llvm_args.append(ptr);
+            break :sret_alloc ptr;
+        },
+        else => sret_alloc: {
+            if (ccAbiPromoteInt(fn_info.cc, zcu, .fromInterned(fn_info.return_type))) |s| switch (s) {
+                .signed => try attributes.addRetAttr(.signext, &o.builder),
+                .unsigned => try attributes.addRetAttr(.zeroext, &o.builder),
+            };
+            break :sret_alloc null;
+        },
     };
 
     const err_return_tracing = fn_info.cc == .auto and zcu.comp.config.any_error_tracing;
@@ -793,11 +800,11 @@ fn airCall(self: *FuncGen, inst: Air.Inst.Index, modifier: std.lang.CallModifier
             const arg = args[it.zig_index - 1];
             const param_ty = self.typeOf(arg);
             const llvm_arg = try self.resolveInst(arg);
-            const llvm_param_ty = try o.lowerType(param_ty);
             if (isByRef(param_ty, zcu)) {
                 const alignment = param_ty.abiAlignment(zcu).toLlvm();
                 // We don't need to handle non-ABI-sized integer types in memory here since they are
                 // never by-ref.
+                const llvm_param_ty = try o.lowerType(param_ty, .in_memory);
                 const loaded = try self.wip.load(.normal, llvm_param_ty, llvm_arg, alignment, "");
                 try llvm_args.append(loaded);
             } else {
@@ -811,9 +818,7 @@ fn airCall(self: *FuncGen, inst: Air.Inst.Index, modifier: std.lang.CallModifier
             if (isByRef(param_ty, zcu)) {
                 try llvm_args.append(llvm_arg);
             } else {
-                const alignment = param_ty.abiAlignment(zcu).toLlvm();
-                const param_llvm_ty = llvm_arg.typeOfWip(&self.wip);
-                const arg_ptr = try self.buildAlloca(param_llvm_ty, alignment);
+                const arg_ptr = try self.buildZigAlloca(param_ty, .none);
                 try self.store(arg_ptr, .none, llvm_arg, param_ty, .normal);
                 try llvm_args.append(arg_ptr);
             }
@@ -823,9 +828,7 @@ fn airCall(self: *FuncGen, inst: Air.Inst.Index, modifier: std.lang.CallModifier
             const param_ty = self.typeOf(arg);
             const llvm_arg = try self.resolveInst(arg);
 
-            const alignment = param_ty.abiAlignment(zcu).toLlvm();
-            const param_llvm_ty = try o.lowerType(param_ty);
-            const arg_ptr = try self.buildAlloca(param_llvm_ty, alignment);
+            const arg_ptr = try self.buildZigAlloca(param_ty, .none);
             try self.store(arg_ptr, .none, llvm_arg, param_ty, .normal);
             try llvm_args.append(arg_ptr);
         },
@@ -877,18 +880,16 @@ fn airCall(self: *FuncGen, inst: Air.Inst.Index, modifier: std.lang.CallModifier
             const arg_ty = self.typeOf(arg);
             const arg_val = try self.resolveInst(arg);
 
-            const arg_align = arg_ty.abiAlignment(zcu);
-
             const arg_ptr: Builder.Value = if (!isByRef(arg_ty, zcu)) ptr: {
-                const ptr = try self.buildAlloca(try o.lowerType(arg_ty), arg_align.toLlvm());
+                const ptr = try self.buildZigAlloca(arg_ty, .none);
                 try self.store(ptr, .none, arg_val, arg_ty, .normal);
                 break :ptr ptr;
             } else arg_val;
 
-            const float_ty = try o.lowerType(aarch64_c_abi.getFloatArrayType(arg_ty, zcu).?);
+            const float_ty = try o.lowerType(aarch64_c_abi.getFloatArrayType(arg_ty, zcu).?, .in_memory);
             const array_ty = try o.builder.arrayType(count, float_ty);
 
-            const loaded = try self.wip.load(.normal, array_ty, arg_ptr, arg_align.toLlvm(), "");
+            const loaded = try self.wip.load(.normal, array_ty, arg_ptr, arg_ty.abiAlignment(zcu).toLlvm(), "");
             try llvm_args.append(loaded);
         },
         .i32_array, .i64_array => |arr_len| {
@@ -897,16 +898,14 @@ fn airCall(self: *FuncGen, inst: Air.Inst.Index, modifier: std.lang.CallModifier
             const arg_ty = self.typeOf(arg);
             const arg_val = try self.resolveInst(arg);
 
-            const arg_align = arg_ty.abiAlignment(zcu);
-
             const arg_ptr: Builder.Value = if (!isByRef(arg_ty, zcu)) ptr: {
-                const ptr = try self.buildAlloca(try o.lowerType(arg_ty), arg_align.toLlvm());
+                const ptr = try self.buildZigAlloca(arg_ty, .none);
                 try self.store(ptr, .none, arg_val, arg_ty, .normal);
                 break :ptr ptr;
             } else arg_val;
 
             const array_ty = try o.builder.arrayType(arr_len, try o.builder.intType(@intCast(elem_size)));
-            const loaded = try self.wip.load(.normal, array_ty, arg_ptr, arg_align.toLlvm(), "");
+            const loaded = try self.wip.load(.normal, array_ty, arg_ptr, arg_ty.abiAlignment(zcu).toLlvm(), "");
             try llvm_args.append(loaded);
         },
     };
@@ -916,7 +915,7 @@ fn airCall(self: *FuncGen, inst: Air.Inst.Index, modifier: std.lang.CallModifier
     {
         // Add argument attributes.
         it = iterateParamTypes(o, fn_info);
-        it.llvm_index += @intFromBool(sret);
+        it.llvm_index += @intFromBool(ret_strat == .sret);
         it.llvm_index += @intFromBool(err_return_tracing);
         var remaining_inreg_int = cc_info.inreg_int_params;
         var remaining_inreg_float = cc_info.inreg_float_params;
@@ -945,10 +944,8 @@ fn airCall(self: *FuncGen, inst: Air.Inst.Index, modifier: std.lang.CallModifier
             },
             .byref => {
                 const param_index = it.zig_index - 1;
-                const param_ty = Type.fromInterned(fn_info.param_types.get(ip)[param_index]);
-                const param_llvm_ty = try o.lowerType(param_ty);
-                const alignment = param_ty.abiAlignment(zcu).toLlvm();
-                try o.addByRefParamAttrs(&attributes, it.llvm_index - 1, alignment, it.byval_attr, param_llvm_ty);
+                const param_ty: Type = .fromInterned(fn_info.param_types.get(ip)[param_index]);
+                try o.addByRefParamAttrs(&attributes, it.llvm_index - 1, it.byval_attr, param_ty);
             },
             .byref_mut => try attributes.addParamAttr(it.llvm_index - 1, .noundef, &o.builder),
             // No attributes needed for these.
@@ -998,7 +995,7 @@ fn airCall(self: *FuncGen, inst: Air.Inst.Index, modifier: std.lang.CallModifier
         },
         cc_info.llvm_cc,
         try attributes.finish(&o.builder),
-        try o.lowerType(zig_fn_ty),
+        try o.lowerType(zig_fn_ty, .by_value),
         llvm_fn,
         llvm_args.items,
         "",
@@ -1008,48 +1005,27 @@ fn airCall(self: *FuncGen, inst: Air.Inst.Index, modifier: std.lang.CallModifier
         return .none;
     }
 
-    if (self.liveness.isUnused(inst) or !return_type.hasRuntimeBits(zcu)) {
+    if (self.liveness.isUnused(inst)) {
         return .none;
     }
 
-    const llvm_ret_ty = try o.lowerType(return_type);
-    if (ret_ptr) |rp| {
-        if (isByRef(return_type, zcu)) {
-            return rp;
-        } else {
-            // our by-ref status disagrees with sret so we must load.
-            return self.load(rp, .none, return_type, .normal);
-        }
-    }
-
-    const abi_ret_ty = try lowerFnRetTy(o, fn_info);
-
-    if (abi_ret_ty != llvm_ret_ty) {
-        // In this case the function return type is honoring the calling convention by having
-        // a different LLVM type than the usual one. We solve this here at the callsite
-        // by using our canonical type, then loading it if necessary.
-        const alignment = return_type.abiAlignment(zcu).toLlvm();
-        const rp = try self.buildAlloca(abi_ret_ty, alignment);
-        // We don't need to handle non-ABI-sized integer types in memory here since they can only be
-        // returned from `CallingConvention.auto` functions, in which case `abi_ret_ty` will equal
-        // `llvm_ret_ty` anyway.
-        _ = try self.wip.store(.normal, call, rp, alignment);
-        return if (isByRef(return_type, zcu))
-            rp
-        else
-            try self.load(rp, .none, return_type, .normal);
-    }
+    // We exit this `switch` if we have a pointer to the return value.
+    const ret_val_ptr: Builder.Value = switch (ret_strat) {
+        .void => return .none,
+        .by_val => return call,
 
+        .sret => sret_alloc.?,
+        .mem_cast => |llvm_ret_ty| ret_val_ptr: {
+            const alignment = return_type.abiAlignment(zcu).toLlvm();
+            const ptr = try self.buildAlloca(llvm_ret_ty, alignment);
+            _ = try self.wip.store(.normal, call, ptr, alignment);
+            break :ret_val_ptr ptr;
+        },
+    };
     if (isByRef(return_type, zcu)) {
-        // our by-ref status disagrees with sret so we must allocate, store,
-        // and return the allocation pointer.
-        const alignment = return_type.abiAlignment(zcu).toLlvm();
-        const rp = try self.buildAlloca(llvm_ret_ty, alignment);
-        // We don't need to handle non-ABI-sized integer types here since they are never by-ref.
-        _ = try self.wip.store(.normal, call, rp, alignment);
-        return rp;
+        return ret_val_ptr;
     } else {
-        return call;
+        return self.load(ret_val_ptr, .none, return_type, .normal);
     }
 }
 
@@ -1059,7 +1035,7 @@ fn buildSimplePanic(fg: *FuncGen, panic_id: Zcu.SimplePanicId) Allocator.Error!v
     const target = zcu.getTarget();
     const panic_func = zcu.funcInfo(zcu.std_lang_decl_values.get(panic_id.toStdLangDecl()));
     const fn_info = zcu.typeToFunc(.fromInterned(panic_func.ty)).?;
-    const llvm_panic_fn_ty = try o.lowerType(.fromInterned(panic_func.ty));
+    const llvm_panic_fn_ty = try o.lowerType(.fromInterned(panic_func.ty), .by_value);
 
     const llvm_panic_fn_ref = try o.lowerNavRef(panic_func.owner_nav);
 
@@ -1083,64 +1059,21 @@ fn airRet(self: *FuncGen, inst: Air.Inst.Index, safety: bool) Allocator.Error!vo
     const zcu = o.zcu;
     const ip = &zcu.intern_pool;
     const un_op = self.air.instructions.items(.data)[@intFromEnum(inst)].un_op;
-    const ret_ty = self.typeOf(un_op);
-
-    if (self.ret_ptr != .none) {
-        const operand = try self.resolveInst(un_op);
-        const val_is_undef = if (un_op.toInterned()) |i| Value.fromInterned(i).isUndef(zcu) else false;
-        if (val_is_undef and safety and !self.needMemsetWorkaround(ret_ty.abiSize(zcu))) {
-            const len = try o.builder.intValue(try o.lowerType(.usize), ret_ty.abiSize(zcu));
-            _ = try self.wip.callMemSet(
-                self.ret_ptr,
-                ret_ty.abiAlignment(zcu).toLlvm(),
-                try o.builder.intValue(.i8, 0xaa),
-                len,
-                .normal,
-                self.disable_intrinsics,
-            );
-            const owner_mod = self.ownerModule();
-            if (owner_mod.valgrind) {
-                try self.valgrindMarkUndef(self.ret_ptr, len);
-            }
-            _ = try self.wip.retVoid();
-            return;
-        }
-
-        const unwrapped_operand = operand.unwrap();
-        const unwrapped_ret = self.ret_ptr.unwrap();
 
-        // Return value was stored previously
-        if (unwrapped_operand == .instruction and unwrapped_ret == .instruction and unwrapped_operand.instruction == unwrapped_ret.instruction) {
-            _ = try self.wip.retVoid();
-            return;
-        }
+    const ret_ty = self.typeOf(un_op);
 
-        try self.store(self.ret_ptr, .none, operand, ret_ty, .normal);
-        _ = try self.wip.retVoid();
-        return;
-    }
     const fn_info = zcu.typeToFunc(Type.fromInterned(ip.getNav(self.nav_index).resolved.?.type)).?;
-    if (!ret_ty.hasRuntimeBits(zcu)) {
-        if (Type.fromInterned(fn_info.return_type).isError(zcu)) {
-            // Functions with an empty error set are emitted with an error code
-            // return type and return zero so they can be function pointers coerced
-            // to functions that return anyerror.
-            _ = try self.wip.ret(try o.builder.intValue(try o.errorIntType(), 0));
-        } else {
-            _ = try self.wip.retVoid();
-        }
-        return;
-    }
 
-    const llvm_ret_ty = try o.lowerType(ret_ty);
-    const abi_ret_ty = try lowerFnRetTy(o, fn_info);
-    const operand = try self.resolveInst(un_op);
+    const ret_strat = try fnReturnStrat(o, fn_info);
     const val_is_undef = if (un_op.toInterned()) |i| Value.fromInterned(i).isUndef(zcu) else false;
     const ret_ty_align = ret_ty.abiAlignment(zcu);
 
     if (val_is_undef and safety and !self.needMemsetWorkaround(ret_ty.abiSize(zcu))) {
-        const rp = try self.buildAlloca(llvm_ret_ty, ret_ty_align.toLlvm());
-        const len = try o.builder.intValue(try o.lowerType(.usize), ret_ty.abiSize(zcu));
+        const rp = switch (self.ret_ptr) {
+            .none => try self.buildZigAlloca(ret_ty, .none),
+            else => |rp| rp,
+        };
+        const len = try o.builder.intValue(try o.lowerType(.usize, .by_value), ret_ty.abiSize(zcu));
         _ = try self.wip.callMemSet(
             rp,
             ret_ty_align.toLlvm(),
@@ -1153,36 +1086,46 @@ fn airRet(self: *FuncGen, inst: Air.Inst.Index, safety: bool) Allocator.Error!vo
         if (owner_mod.valgrind) {
             try self.valgrindMarkUndef(rp, len);
         }
-        if (fn_info.cc == .auto and abi_ret_ty == llvm_ret_ty) {
-            assert(!isByRef(ret_ty, zcu));
-            // The return type could be a non-ABI-sized integer, so use `FuncGen.load` to make sure
-            // we load it from memory correctly.
-            const loaded = try self.load(rp, .none, ret_ty, .normal);
-            _ = try self.wip.ret(loaded);
-        } else {
-            const loaded = try self.wip.load(.normal, abi_ret_ty, rp, ret_ty_align.toLlvm(), "");
-            _ = try self.wip.ret(loaded);
+        switch (ret_strat) {
+            .void => unreachable, // value is undef so return type cannot be OPV
+            .sret => {
+                // We just stored directly to `self.ret_ptr`.
+                _ = try self.wip.retVoid();
+            },
+            .by_val => {
+                const loaded = try self.load(rp, .none, ret_ty, .normal);
+                _ = try self.wip.ret(loaded);
+            },
+            .mem_cast => |llvm_abi_ret_ty| {
+                const loaded = try self.wip.load(.normal, llvm_abi_ret_ty, rp, ret_ty_align.toLlvm(), "");
+                _ = try self.wip.ret(loaded);
+            },
         }
         return;
     }
 
-    if (isByRef(ret_ty, zcu)) {
-        // operand is a pointer however self.ret_ptr is null so that means we need to return a value.
-        // No need to handle non-ABI-sized integer types in memory here since they are never by-ref.
-        _ = try self.wip.ret(try self.wip.load(.normal, abi_ret_ty, operand, ret_ty_align.toLlvm(), ""));
-        return;
-    }
-
-    if (abi_ret_ty == llvm_ret_ty) {
-        _ = try self.wip.ret(operand);
-    } else {
-        const rp = try self.buildAlloca(llvm_ret_ty, ret_ty_align.toLlvm());
-        try self.store(rp, .none, operand, ret_ty, .normal);
-        // No need to handle non-ABI-sized integer types in memory here since they can only be
-        // returned from `CallingConvention.auto` functions, in which case `abi_ret_ty` will equal
-        // `llvm_ret_ty` anyway.
-        const ret_val = try self.wip.load(.normal, abi_ret_ty, rp, ret_ty_align.toLlvm(), "");
-        _ = try self.wip.ret(ret_val);
+    switch (ret_strat) {
+        .void => _ = try self.wip.retVoid(),
+        .sret => {
+            const operand = try self.resolveInst(un_op);
+            try self.store(self.ret_ptr, .none, operand, ret_ty, .normal);
+            _ = try self.wip.retVoid();
+        },
+        .by_val => {
+            assert(!isByRef(ret_ty, zcu));
+            const operand = try self.resolveInst(un_op);
+            _ = try self.wip.ret(operand);
+        },
+        .mem_cast => |llvm_ret_ty| {
+            const operand = try self.resolveInst(un_op);
+            const ptr: Builder.Value = if (!isByRef(ret_ty, zcu)) ptr: {
+                const ptr = try self.buildZigAlloca(ret_ty, .none);
+                try self.store(ptr, .none, operand, ret_ty, .normal);
+                break :ptr ptr;
+            } else operand;
+            const ret_val = try self.wip.load(.normal, llvm_ret_ty, ptr, ret_ty_align.toLlvm(), "");
+            _ = try self.wip.ret(ret_val);
+        },
     }
 }
 
@@ -1194,23 +1137,24 @@ fn airRetLoad(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!void {
     const ptr_ty = self.typeOf(un_op);
     const ret_ty = ptr_ty.childType(zcu);
     const fn_info = zcu.typeToFunc(.fromInterned(ip.getNav(self.nav_index).resolved.?.type)).?;
-    if (!ret_ty.hasRuntimeBits(zcu) or self.ret_ptr != .none) {
-        _ = try self.wip.retVoid();
-        return;
-    }
     const ptr = try self.resolveInst(un_op);
-    const llvm_ret_ty = try o.lowerType(ret_ty);
-    const abi_ret_ty = try lowerFnRetTy(o, fn_info);
-    if (fn_info.cc == .auto and abi_ret_ty == llvm_ret_ty) {
-        assert(!isByRef(ret_ty, zcu));
-        // The return type could be a non-ABI-sized integer, so use `FuncGen.load` to make sure we
-        // load it from memory correctly.
-        const loaded = try self.load(ptr, .none, ret_ty, .normal);
-        _ = try self.wip.ret(loaded);
-    } else {
-        const ret_ty_align = ret_ty.abiAlignment(zcu);
-        const loaded = try self.wip.load(.normal, abi_ret_ty, ptr, ret_ty_align.toLlvm(), "");
-        _ = try self.wip.ret(loaded);
+    switch (try fnReturnStrat(o, fn_info)) {
+        .void => _ = try self.wip.retVoid(),
+        .sret => {
+            assert(self.ret_ptr != .none);
+            _ = try self.wip.retVoid();
+        },
+        .by_val => {
+            assert(self.ret_ptr == .none);
+            const loaded = try self.load(ptr, .none, ret_ty, .normal);
+            _ = try self.wip.ret(loaded);
+        },
+        .mem_cast => |llvm_abi_ret_ty| {
+            assert(self.ret_ptr == .none);
+            const ret_ty_align = ret_ty.abiAlignment(zcu);
+            const loaded = try self.wip.load(.normal, llvm_abi_ret_ty, ptr, ret_ty_align.toLlvm(), "");
+            _ = try self.wip.ret(loaded);
+        },
     }
 }
 
@@ -1218,7 +1162,7 @@ fn airCVaArg(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const list = try self.resolveInst(ty_op.operand);
     const arg_ty = ty_op.ty.toType();
-    const llvm_arg_ty = try self.object.lowerType(arg_ty);
+    const llvm_arg_ty = try self.object.lowerType(arg_ty, .by_value);
 
     return self.wip.vaArg(list, llvm_arg_ty, "");
 }
@@ -1229,10 +1173,8 @@ fn airCVaCopy(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Valu
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const src_list = try self.resolveInst(ty_op.operand);
     const va_list_ty = ty_op.ty.toType();
-    const llvm_va_list_ty = try o.lowerType(va_list_ty);
 
-    const result_alignment = va_list_ty.abiAlignment(zcu).toLlvm();
-    const dest_list = try self.buildAlloca(llvm_va_list_ty, result_alignment);
+    const dest_list = try self.buildZigAlloca(va_list_ty, .none);
 
     _ = try self.wip.callIntrinsic(.normal, .none, .va_copy, &.{dest_list.typeOfWip(&self.wip)}, &.{ dest_list, src_list }, "");
     return if (isByRef(va_list_ty, zcu))
@@ -1253,10 +1195,8 @@ fn airCVaStart(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Val
     const o = self.object;
     const zcu = o.zcu;
     const va_list_ty = self.typeOfIndex(inst);
-    const llvm_va_list_ty = try o.lowerType(va_list_ty);
 
-    const result_alignment = va_list_ty.abiAlignment(zcu).toLlvm();
-    const dest_list = try self.buildAlloca(llvm_va_list_ty, result_alignment);
+    const dest_list = try self.buildZigAlloca(va_list_ty, .none);
 
     _ = try self.wip.callIntrinsic(.normal, .none, .va_start, &.{dest_list.typeOfWip(&self.wip)}, &.{dest_list}, "");
     return if (isByRef(va_list_ty, zcu))
@@ -1434,18 +1374,10 @@ fn lowerBlock(
 
     // Create a phi node only if the block returns a value.
     if (have_block_result) {
-        const raw_llvm_ty = try o.lowerType(inst_ty);
-        const llvm_ty: Builder.Type = ty: {
-            // If the zig tag type is a function, this represents an actual function body; not
-            // a pointer to it. LLVM IR allows the call instruction to use function bodies instead
-            // of function pointers, however the phi makes it a runtime value and therefore
-            // the LLVM type has to be wrapped in a pointer.
-            if (inst_ty.zigTypeTag(zcu) == .@"fn" or isByRef(inst_ty, zcu)) {
-                break :ty .ptr;
-            }
-            break :ty raw_llvm_ty;
+        const llvm_ty: Builder.Type = switch (isByRef(inst_ty, zcu)) {
+            true => .ptr,
+            false => try o.lowerType(inst_ty, .by_value),
         };
-
         parent_bb.ptr(&self.wip).incoming = @intCast(breaks.list.len);
         const phi = try self.wip.phi(llvm_ty, "");
         phi.finish(breaks.list.items(.val), breaks.list.items(.bb), &self.wip);
@@ -1551,7 +1483,7 @@ fn lowerSwitchDispatch(
         const table_index = try self.wip.conv(
             .unsigned,
             try self.wip.bin(.@"sub nuw", cond, jmp_table.min.toValue(), ""),
-            try o.lowerType(.usize),
+            try o.lowerType(.usize, .by_value),
             "",
         );
         const target_ptr_ptr = try self.ptraddScaled(
@@ -1576,7 +1508,7 @@ fn lowerSwitchDispatch(
     // The switch prongs will correspond to our scalar cases. Ranges will
     // be handled by conditional branches in the `else` prong.
 
-    const llvm_usize = try o.lowerType(.usize);
+    const llvm_usize = try o.lowerType(.usize, .by_value);
     const cond_int = if (cond_ty.zigTypeTag(zcu) == .pointer)
         try self.wip.cast(.ptrtoint, cond, llvm_usize, "")
     else
@@ -1771,7 +1703,6 @@ fn lowerTry(
     const zcu = o.zcu;
     const payload_ty = err_union_ty.errorUnionPayload(zcu);
     const payload_has_bits = payload_ty.hasRuntimeBits(zcu);
-    const error_type = try o.errorIntType();
 
     const operand_align: InternPool.Alignment = if (operand_is_ptr) operand_ptr_align else err_union_ty.abiAlignment(zcu);
 
@@ -1792,7 +1723,7 @@ fn lowerTry(
                 if (err_union_ty.isVolatilePtr(zcu)) .@"volatile" else .normal,
             );
         };
-        const zero = try o.builder.intValue(error_type, 0);
+        const zero = try o.builder.intValue(try o.errorIntType(.by_value), 0);
         const is_err = try fg.wip.icmp(.ne, loaded, zero, "");
 
         const return_block = try fg.wip.block(1, "TryRet");
@@ -1929,8 +1860,8 @@ fn airSwitchBr(self: *FuncGen, inst: Air.Inst.Index, is_dispatch_loop: bool) Tod
         const table_includes_else = item_count != table_len;
 
         break :jmp_table .{
-            .min = try o.lowerValue(min.toIntern()),
-            .max = try o.lowerValue(max.toIntern()),
+            .min = try o.lowerValue(min.toIntern(), .by_value),
+            .max = try o.lowerValue(max.toIntern(), .by_value),
             .in_bounds_hint = if (table_includes_else) .none else switch (switch_br.getElseHint()) {
                 .none, .cold => .none,
                 .unpredictable => .unpredictable,
@@ -2088,9 +2019,9 @@ fn airArrayToSlice(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const operand_ty = self.typeOf(ty_op.operand);
     const array_ty = operand_ty.childType(zcu);
-    const llvm_usize = try o.lowerType(.usize);
+    const llvm_usize = try o.lowerType(.usize, .by_value);
     const len = try o.builder.intValue(llvm_usize, array_ty.arrayLen(zcu));
-    const slice_llvm_ty = try o.lowerType(self.typeOfIndex(inst));
+    const slice_llvm_ty = try o.lowerType(self.typeOfIndex(inst), .by_value);
     const operand = try self.resolveInst(ty_op.operand);
     return self.wip.buildAggregate(slice_llvm_ty, &.{ operand, len }, "");
 }
@@ -2107,7 +2038,7 @@ fn airFloatFromInt(self: *FuncGen, inst: Air.Inst.Index) TodoError!Builder.Value
 
     const dest_ty = self.typeOfIndex(inst);
     const dest_scalar_ty = dest_ty.scalarType(zcu);
-    const dest_llvm_ty = try o.lowerType(dest_ty);
+    const dest_llvm_ty = try o.lowerType(dest_ty, .by_value);
     const target = zcu.getTarget();
 
     if (intrinsicsAllowed(dest_scalar_ty, target)) return self.wip.conv(
@@ -2175,7 +2106,7 @@ fn airIntFromFloat(
 
     const dest_ty = self.typeOfIndex(inst);
     const dest_scalar_ty = dest_ty.scalarType(zcu);
-    const dest_llvm_ty = try o.lowerType(dest_ty);
+    const dest_llvm_ty = try o.lowerType(dest_ty, .by_value);
 
     if (intrinsicsAllowed(operand_scalar_ty, target)) {
         // TODO set fast math flag
@@ -2209,7 +2140,7 @@ fn airIntFromFloat(
         compiler_rt_dest_abbrev,
     });
 
-    const operand_llvm_ty = try o.lowerType(operand_ty);
+    const operand_llvm_ty = try o.lowerType(operand_ty, .by_value);
     const libc_fn = try o.getLibcFunction(fn_name, &.{operand_llvm_ty}, libc_ret_ty);
     var result = try self.wip.call(
         .normal,
@@ -2234,7 +2165,7 @@ fn sliceOrArrayPtr(fg: *FuncGen, ptr: Builder.Value, ty: Type) Allocator.Error!B
 fn sliceOrArrayLenInBytes(fg: *FuncGen, ptr: Builder.Value, ty: Type) Allocator.Error!Builder.Value {
     const o = fg.object;
     const zcu = o.zcu;
-    const llvm_usize = try o.lowerType(.usize);
+    const llvm_usize = try o.lowerType(.usize, .by_value);
     switch (ty.ptrSize(zcu)) {
         .slice => {
             const len = try fg.wip.extractValue(ptr, &.{1}, "");
@@ -2370,9 +2301,6 @@ fn airPtrElemPtr(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.V
     const base_ptr = try self.resolveInst(bin_op.lhs);
     const rhs = try self.resolveInst(bin_op.rhs);
 
-    const elem_ptr = ty_pl.ty.toType();
-    if (elem_ptr.ptrInfo(zcu).flags.vector_index != .none) return base_ptr;
-
     return self.ptraddScaled(base_ptr, rhs, elem_ty.abiSize(zcu));
 }
 
@@ -2435,7 +2363,7 @@ fn airStructFieldVal(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Build
             },
             .float => {
                 // bitcast int->float
-                return self.wip.cast(.bitcast, field_int_val, try o.lowerType(field_ty), "");
+                return self.wip.cast(.bitcast, field_int_val, try o.lowerType(field_ty, .by_value), "");
             },
         }
     }
@@ -2465,8 +2393,8 @@ fn airFieldParentPtr(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Build
     const field_offset = parent_ty.structFieldOffset(extra.field_index, zcu);
     if (field_offset == 0) return field_ptr;
 
-    const res_ty = try o.lowerType(ty_pl.ty.toType());
-    const llvm_usize = try o.lowerType(.usize);
+    const res_ty = try o.lowerType(ty_pl.ty.toType(), .by_value);
+    const llvm_usize = try o.lowerType(.usize, .by_value);
 
     const field_ptr_int = try self.wip.cast(.ptrtoint, field_ptr, llvm_usize, "");
     const base_ptr_int = try self.wip.bin(
@@ -2590,8 +2518,7 @@ fn airDbgVarVal(self: *FuncGen, inst: Air.Inst.Index, is_arg: bool) Allocator.Er
         // We avoid taking this path for naked functions because there's no guarantee that such
         // functions even have a valid stack pointer, making the `alloca` + `store` unsafe.
 
-        const alignment = operand_ty.abiAlignment(zcu).toLlvm();
-        const alloca = try self.buildAlloca(try o.lowerType(operand_ty), alignment);
+        const alloca = try self.buildZigAlloca(operand_ty, .none);
         try self.store(alloca, .none, operand, operand_ty, .normal);
         _ = try self.wip.callIntrinsic(
             .normal,
@@ -2683,7 +2610,7 @@ fn airAssembly(self: *FuncGen, inst: Air.Inst.Index) TodoError!Builder.Value {
             const output_inst = try self.resolveInst(output.operand);
             const output_ty = self.typeOf(output.operand);
             assert(output_ty.zigTypeTag(zcu) == .pointer);
-            const elem_llvm_ty = try o.lowerType(output_ty.childType(zcu));
+            const elem_llvm_ty = try o.lowerType(output_ty.childType(zcu), .by_value);
 
             switch (constraint[0]) {
                 '=' => {},
@@ -2721,7 +2648,7 @@ fn airAssembly(self: *FuncGen, inst: Air.Inst.Index) TodoError!Builder.Value {
             llvm_ret_indirect[output.index] = false;
 
             const ret_ty = self.typeOfIndex(inst);
-            llvm_ret_types[llvm_ret_i] = try o.lowerType(ret_ty);
+            llvm_ret_types[llvm_ret_i] = try o.lowerType(ret_ty, .by_value);
             llvm_ret_i += 1;
         }
 
@@ -2760,7 +2687,7 @@ fn airAssembly(self: *FuncGen, inst: Air.Inst.Index) TodoError!Builder.Value {
                 llvm_param_types[llvm_param_i] = arg_llvm_value.typeOfWip(&self.wip);
             } else {
                 const alignment = arg_ty.abiAlignment(zcu).toLlvm();
-                const arg_llvm_ty = try o.lowerType(arg_ty);
+                const arg_llvm_ty = try o.lowerType(arg_ty, .by_value);
                 const load_inst = try self.wip.load(.normal, arg_llvm_ty, arg_llvm_value, alignment, "");
                 llvm_param_values[llvm_param_i] = load_inst;
                 llvm_param_types[llvm_param_i] = arg_llvm_ty;
@@ -2800,7 +2727,7 @@ fn airAssembly(self: *FuncGen, inst: Air.Inst.Index) TodoError!Builder.Value {
         llvm_param_attrs[llvm_param_i] = if (constraint[0] == '*') blk: {
             if (!is_by_ref) self.maybeMarkAllowZeroAccess(arg_ty.ptrInfo(zcu));
 
-            break :blk try o.lowerType(if (is_by_ref) arg_ty else arg_ty.childType(zcu));
+            break :blk try o.lowerType(if (is_by_ref) arg_ty else arg_ty.childType(zcu), .by_value);
         } else .none;
 
         llvm_param_i += 1;
@@ -2814,7 +2741,7 @@ fn airAssembly(self: *FuncGen, inst: Air.Inst.Index) TodoError!Builder.Value {
         if (constraint[0] != '+') continue;
 
         const rw_ty = self.typeOf(output.operand);
-        const llvm_elem_ty = try o.lowerType(rw_ty.childType(zcu));
+        const llvm_elem_ty = try o.lowerType(rw_ty.childType(zcu), .by_value);
         if (llvm_ret_indirect[output.index]) {
             llvm_param_values[llvm_param_i] = llvm_rw_vals[output.index];
             llvm_param_types[llvm_param_i] = llvm_rw_vals[output.index].typeOfWip(&self.wip);
@@ -3028,7 +2955,7 @@ fn airIsNonNull(
             ));
             return self.wip.icmp(cond, slice_ptr, try o.builder.nullValue(ptr_ty), "");
         }
-        return self.wip.icmp(cond, loaded, try o.builder.zeroInitValue(try o.lowerType(optional_ty)), "");
+        return self.wip.icmp(cond, loaded, try o.builder.zeroInitValue(try o.lowerType(optional_ty, .by_value)), "");
     }
 
     comptime assert(optional_layout_version == 3);
@@ -3057,8 +2984,7 @@ fn airIsErr(
     const operand_ty = self.typeOf(un_op);
     const err_union_ty = if (operand_is_ptr) operand_ty.childType(zcu) else operand_ty;
     const payload_ty = err_union_ty.errorUnionPayload(zcu);
-    const error_type = try o.errorIntType();
-    const zero = try o.builder.intValue(error_type, 0);
+    const zero_err = try o.builder.intValue(try o.errorIntType(.by_value), 0);
 
     const access_kind: Builder.MemoryAccessKind =
         if (operand_is_ptr and operand_ty.isVolatilePtr(zcu)) .@"volatile" else .normal;
@@ -3079,7 +3005,7 @@ fn airIsErr(
             try self.load(operand, operand_ty.ptrAlignment(zcu), err_union_ty, access_kind)
         else
             operand;
-        return self.wip.icmp(cond, loaded, zero, "");
+        return self.wip.icmp(cond, loaded, zero_err, "");
     }
     assert(isByRef(err_union_ty, zcu)); // error unions with runtime bits are always by-ref
 
@@ -3089,7 +3015,7 @@ fn airIsErr(
         .none;
     const err_field_ptr = try self.ptraddConst(operand, codegen.errUnionErrorOffset(payload_ty, zcu));
     const loaded = try self.load(err_field_ptr, err_align, .anyerror, access_kind);
-    return self.wip.icmp(cond, loaded, zero, "");
+    return self.wip.icmp(cond, loaded, zero_err, "");
 }
 
 fn airOptionalPayloadPtr(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
@@ -3228,7 +3154,7 @@ fn airErrUnionPayloadPtrSet(self: *FuncGen, inst: Air.Inst.Index) Allocator.Erro
     const err_union_ptr_align = err_union_ptr_ty.ptrAlignment(zcu);
 
     const payload_ty = err_union_ty.errorUnionPayload(zcu);
-    const non_error_val = try o.builder.intValue(try o.errorIntType(), 0);
+    const non_error_val = try o.builder.intValue(try o.errorIntType(.by_value), 0);
 
     const access_kind: Builder.MemoryAccessKind =
         if (err_union_ptr_ty.isVolatilePtr(zcu)) .@"volatile" else .normal;
@@ -3274,33 +3200,9 @@ fn airSaveErrReturnTraceIndex(self: *FuncGen, inst: Air.Inst.Index) Allocator.Er
     return self.load(field_ptr, field_align, field_ty, .normal);
 }
 
-/// As an optimization, we want to avoid unnecessary copies of
-/// error union/optional types when returning from a function.
-/// Here, we scan forward in the current block, looking to see
-/// if the next instruction is a return (ignoring debug instructions).
-///
-/// The first instruction of `body_tail` is a wrap instruction.
-fn isNextRet(
-    self: *FuncGen,
-    body_tail: []const Air.Inst.Index,
-) bool {
-    const air_tags = self.air.instructions.items(.tag);
-    for (body_tail[1..]) |body_inst| {
-        switch (air_tags[@intFromEnum(body_inst)]) {
-            .ret => return true,
-            .dbg_stmt => continue,
-            else => return false,
-        }
-    }
-    // The only way to get here is to hit the end of a loop instruction
-    // (implicit repeat).
-    return false;
-}
-
-fn airWrapOptional(self: *FuncGen, body_tail: []const Air.Inst.Index) Allocator.Error!Builder.Value {
+fn airWrapOptional(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
     const o = self.object;
     const zcu = o.zcu;
-    const inst = body_tail[0];
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const payload_ty = self.typeOf(ty_op.operand);
     comptime assert(optional_layout_version == 3);
@@ -3309,14 +3211,7 @@ fn airWrapOptional(self: *FuncGen, body_tail: []const Air.Inst.Index) Allocator.
     const optional_ty = self.typeOfIndex(inst);
     if (optional_ty.optionalReprIsPayload(zcu)) return operand;
     assert(isByRef(optional_ty, zcu)); // optionals with runtime bits are by-ref unless `optionalReprIsPayload`
-    const llvm_optional_ty = try o.lowerType(optional_ty);
-    const optional_ptr = if (self.isNextRet(body_tail))
-        self.ret_ptr
-    else brk: {
-        const alignment = optional_ty.abiAlignment(zcu).toLlvm();
-        const optional_ptr = try self.buildAlloca(llvm_optional_ty, alignment);
-        break :brk optional_ptr;
-    };
+    const optional_ptr = try self.buildZigAlloca(optional_ty, .none);
 
     const payload_ptr = optional_ptr; // payload always at offset 0
     try self.store(payload_ptr, .none, operand, payload_ty, .normal);
@@ -3328,26 +3223,18 @@ fn airWrapOptional(self: *FuncGen, body_tail: []const Air.Inst.Index) Allocator.
     return optional_ptr;
 }
 
-fn airWrapErrUnionPayload(self: *FuncGen, body_tail: []const Air.Inst.Index) Allocator.Error!Builder.Value {
+fn airWrapErrUnionPayload(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
     const o = self.object;
     const zcu = o.zcu;
-    const inst = body_tail[0];
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const err_un_ty = self.typeOfIndex(inst);
     const operand = try self.resolveInst(ty_op.operand);
     const payload_ty = self.typeOf(ty_op.operand);
     assert(payload_ty.hasRuntimeBits(zcu));
     assert(isByRef(err_un_ty, zcu)); // error unions with runtime bits are always by-ref
-    const ok_err_code = try o.builder.intValue(try o.errorIntType(), 0);
-    const err_un_llvm_ty = try o.lowerType(err_un_ty);
-
-    const result_ptr = if (self.isNextRet(body_tail))
-        self.ret_ptr
-    else brk: {
-        const alignment = err_un_ty.abiAlignment(o.zcu).toLlvm();
-        const result_ptr = try self.buildAlloca(err_un_llvm_ty, alignment);
-        break :brk result_ptr;
-    };
+    const ok_err_code = try o.builder.intValue(try o.errorIntType(.by_value), 0);
+
+    const result_ptr = try self.buildZigAlloca(err_un_ty, .none);
 
     const err_ptr = try self.ptraddConst(result_ptr, codegen.errUnionErrorOffset(payload_ty, zcu));
     try self.store(err_ptr, .none, ok_err_code, .anyerror, .normal);
@@ -3358,25 +3245,17 @@ fn airWrapErrUnionPayload(self: *FuncGen, body_tail: []const Air.Inst.Index) All
     return result_ptr;
 }
 
-fn airWrapErrUnionErr(self: *FuncGen, body_tail: []const Air.Inst.Index) Allocator.Error!Builder.Value {
+fn airWrapErrUnionErr(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
     const o = self.object;
     const zcu = o.zcu;
-    const inst = body_tail[0];
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const err_un_ty = self.typeOfIndex(inst);
     const payload_ty = err_un_ty.errorUnionPayload(zcu);
     const operand = try self.resolveInst(ty_op.operand);
     if (!payload_ty.hasRuntimeBits(zcu)) return operand;
     assert(isByRef(err_un_ty, zcu)); // error unions with runtime bits are always by-ref
-    const err_un_llvm_ty = try o.lowerType(err_un_ty);
-
-    const result_ptr = if (self.isNextRet(body_tail))
-        self.ret_ptr
-    else brk: {
-        const alignment = err_un_ty.abiAlignment(zcu).toLlvm();
-        const result_ptr = try self.buildAlloca(err_un_llvm_ty, alignment);
-        break :brk result_ptr;
-    };
+
+    const result_ptr = try self.buildZigAlloca(err_un_ty, .none);
 
     const err_ptr = try self.ptraddConst(result_ptr, codegen.errUnionErrorOffset(payload_ty, zcu));
     try self.store(err_ptr, .none, operand, .anyerror, .normal);
@@ -3392,7 +3271,7 @@ fn airWasmMemorySize(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Build
     const o = self.object;
     const pl_op = self.air.instructions.items(.data)[@intFromEnum(inst)].pl_op;
     const index = pl_op.payload;
-    const llvm_usize = try o.lowerType(.usize);
+    const llvm_usize = try o.lowerType(.usize, .by_value);
     return self.wip.callIntrinsic(.normal, .none, .@"wasm.memory.size", &.{llvm_usize}, &.{
         try o.builder.intValue(.i32, index),
     }, "");
@@ -3402,7 +3281,7 @@ fn airWasmMemoryGrow(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Build
     const o = self.object;
     const pl_op = self.air.instructions.items(.data)[@intFromEnum(inst)].pl_op;
     const index = pl_op.payload;
-    const llvm_isize = try o.lowerType(.isize);
+    const llvm_isize = try o.lowerType(.isize, .by_value);
     return self.wip.callIntrinsic(.normal, .none, .@"wasm.memory.grow", &.{llvm_isize}, &.{
         try o.builder.intValue(.i32, index), try self.resolveInst(pl_op.operand),
     }, "");
@@ -3429,7 +3308,7 @@ fn airMin(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
         .normal,
         .none,
         if (scalar_ty.isSignedInt(zcu)) .smin else .umin,
-        &.{try o.lowerType(inst_ty)},
+        &.{try o.lowerType(inst_ty, .by_value)},
         &.{ lhs, rhs },
         "",
     );
@@ -3449,7 +3328,7 @@ fn airMax(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
         .normal,
         .none,
         if (scalar_ty.isSignedInt(zcu)) .smax else .umax,
-        &.{try o.lowerType(inst_ty)},
+        &.{try o.lowerType(inst_ty, .by_value)},
         &.{ lhs, rhs },
         "",
     );
@@ -3461,7 +3340,7 @@ fn airSlice(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value 
     const ptr = try self.resolveInst(bin_op.lhs);
     const len = try self.resolveInst(bin_op.rhs);
     const inst_ty = self.typeOfIndex(inst);
-    return self.wip.buildAggregate(try self.object.lowerType(inst_ty), &.{ ptr, len }, "");
+    return self.wip.buildAggregate(try self.object.lowerType(inst_ty, .by_value), &.{ ptr, len }, "");
 }
 
 fn airAdd(self: *FuncGen, inst: Air.Inst.Index, fast: Builder.FastMathKind) Allocator.Error!Builder.Value {
@@ -3492,7 +3371,7 @@ fn airSafeArithmetic(
     const scalar_ty = inst_ty.scalarType(zcu);
 
     const intrinsic = if (scalar_ty.isSignedInt(zcu)) signed_intrinsic else unsigned_intrinsic;
-    const llvm_inst_ty = try o.lowerType(inst_ty);
+    const llvm_inst_ty = try o.lowerType(inst_ty, .by_value);
     const results =
         try fg.wip.callIntrinsic(.normal, .none, intrinsic, &.{llvm_inst_ty}, &.{ lhs, rhs }, "");
 
@@ -3542,7 +3421,7 @@ fn airAddSat(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value
         .normal,
         .none,
         if (scalar_ty.isSignedInt(zcu)) .@"sadd.sat" else .@"uadd.sat",
-        &.{try o.lowerType(inst_ty)},
+        &.{try o.lowerType(inst_ty, .by_value)},
         &.{ lhs, rhs },
         "",
     );
@@ -3581,7 +3460,7 @@ fn airSubSat(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value
         .normal,
         .none,
         if (scalar_ty.isSignedInt(zcu)) .@"ssub.sat" else .@"usub.sat",
-        &.{try o.lowerType(inst_ty)},
+        &.{try o.lowerType(inst_ty, .by_value)},
         &.{ lhs, rhs },
         "",
     );
@@ -3620,7 +3499,7 @@ fn airMulSat(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value
         .normal,
         .none,
         if (scalar_ty.isSignedInt(zcu)) .@"smul.fix.sat" else .@"umul.fix.sat",
-        &.{try o.lowerType(inst_ty)},
+        &.{try o.lowerType(inst_ty, .by_value)},
         &.{ lhs, rhs, .@"0" },
         "",
     );
@@ -3664,8 +3543,8 @@ fn airDivFloor(self: *FuncGen, inst: Air.Inst.Index, fast: Builder.FastMathKind)
         return self.buildFloatOp(.floor, fast, inst_ty, 1, .{result});
     }
     if (scalar_ty.isSignedInt(zcu)) {
-        const scalar_llvm_ty = try o.lowerType(scalar_ty);
-        const inst_llvm_ty = try o.lowerType(inst_ty);
+        const scalar_llvm_ty = try o.lowerType(scalar_ty, .by_value);
+        const inst_llvm_ty = try o.lowerType(inst_ty, .by_value);
 
         const ExpectedContents = [std.math.big.int.calcTwosCompLimbCount(256)]std.math.big.Limb;
         var bfa_buf: ExpectedContents = undefined;
@@ -3739,7 +3618,7 @@ fn airMod(self: *FuncGen, inst: Air.Inst.Index, fast: Builder.FastMathKind) Allo
     const lhs = try self.resolveInst(bin_op.lhs);
     const rhs = try self.resolveInst(bin_op.rhs);
     const inst_ty = self.typeOfIndex(inst);
-    const inst_llvm_ty = try o.lowerType(inst_ty);
+    const inst_llvm_ty = try o.lowerType(inst_ty, .by_value);
     const scalar_ty = inst_ty.scalarType(zcu);
 
     if (scalar_ty.isRuntimeFloat()) {
@@ -3768,7 +3647,7 @@ fn airMod(self: *FuncGen, inst: Air.Inst.Index, fast: Builder.FastMathKind) Allo
         defer allocator.free(smin_big_int.limbs);
         smin_big_int.setTwosCompIntLimit(.min, .signed, scalar_bits);
         const smin = try o.builder.splatValue(inst_llvm_ty, try o.builder.bigIntConst(
-            try o.lowerType(scalar_ty),
+            try o.lowerType(scalar_ty, .by_value),
             smin_big_int.toConst(),
         ));
 
@@ -3804,7 +3683,7 @@ fn airPtrSub(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value
     const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl;
     const bin_op = self.air.extraData(Air.Bin, ty_pl.payload).data;
     const ptr_or_slice = try self.resolveInst(bin_op.lhs);
-    const llvm_usize_ty = try o.lowerType(.usize);
+    const llvm_usize_ty = try o.lowerType(.usize, .by_value);
     const ptr_ty = self.typeOf(bin_op.lhs);
     const elem_ty = ptr_ty.indexableElem(zcu);
     const ptr = switch (ptr_ty.ptrSize(zcu)) {
@@ -3837,8 +3716,7 @@ fn airOverflow(
     assert(isByRef(inst_ty, zcu)); // auto structs are by-ref
 
     const intrinsic = if (scalar_ty.isSignedInt(zcu)) signed_intrinsic else unsigned_intrinsic;
-    const llvm_inst_ty = try o.lowerType(inst_ty);
-    const llvm_lhs_ty = try o.lowerType(lhs_ty);
+    const llvm_lhs_ty = try o.lowerType(lhs_ty, .by_value);
     const results =
         try self.wip.callIntrinsic(.normal, .none, intrinsic, &.{llvm_lhs_ty}, &.{ lhs, rhs }, "");
 
@@ -3846,7 +3724,7 @@ fn airOverflow(
     const overflow_bit = try self.wip.extractValue(results, &.{1}, "");
 
     const result_alignment = inst_ty.abiAlignment(zcu);
-    const alloca_inst = try self.buildAlloca(llvm_inst_ty, result_alignment.toLlvm());
+    const alloca_inst = try self.buildZigAlloca(inst_ty, .none);
 
     {
         // Store to 'result: IntType' field
@@ -3911,7 +3789,7 @@ fn buildFloatCmp(
     const zcu = o.zcu;
     const target = zcu.getTarget();
     const scalar_ty = ty.scalarType(zcu);
-    const scalar_llvm_ty = try o.lowerType(scalar_ty);
+    const scalar_llvm_ty = try o.lowerType(scalar_ty, .by_value);
 
     if (intrinsicsAllowed(scalar_ty, target)) {
         const cond: Builder.FloatCondition = switch (pred) {
@@ -4017,7 +3895,7 @@ fn buildFloatOp(
     const zcu = o.zcu;
     const target = zcu.getTarget();
     const scalar_ty = ty.scalarType(zcu);
-    const llvm_ty = try o.lowerType(ty);
+    const llvm_ty = try o.lowerType(ty, .by_value);
 
     if (op != .tan and intrinsicsAllowed(scalar_ty, target)) switch (op) {
         // Some operations are dedicated LLVM instructions, not available as intrinsics
@@ -4122,7 +4000,7 @@ fn buildFloatOp(
         }),
     };
 
-    const scalar_llvm_ty = try o.lowerType(scalar_ty);
+    const scalar_llvm_ty = try o.lowerType(scalar_ty, .by_value);
     const libc_fn = try o.getLibcFunction(
         fn_name,
         @as([3]Builder.Type, @splat(scalar_llvm_ty))[0..params.len],
@@ -4176,9 +4054,8 @@ fn airShlWithOverflow(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Buil
 
     const dest_ty = self.typeOfIndex(inst);
     assert(isByRef(dest_ty, zcu)); // auto structs are by-ref
-    const llvm_dest_ty = try o.lowerType(dest_ty);
 
-    const casted_rhs = try self.wip.conv(.unsigned, rhs, try o.lowerType(lhs_ty), "");
+    const casted_rhs = try self.wip.conv(.unsigned, rhs, try o.lowerType(lhs_ty, .by_value), "");
 
     const result = try self.wip.bin(.shl, lhs, casted_rhs, "");
     const reconstructed = try self.wip.bin(if (lhs_scalar_ty.isSignedInt(zcu))
@@ -4189,7 +4066,7 @@ fn airShlWithOverflow(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Buil
     const overflow_bit = try self.wip.icmp(.ne, lhs, reconstructed, "");
 
     const result_alignment = dest_ty.abiAlignment(zcu);
-    const alloca_inst = try self.buildAlloca(llvm_dest_ty, result_alignment.toLlvm());
+    const alloca_inst = try self.buildZigAlloca(dest_ty, .none);
 
     {
         // Store to 'result: IntType' field
@@ -4245,7 +4122,7 @@ fn airShlExact(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Val
     }
     const lhs_scalar_ty = lhs_ty.scalarType(zcu);
 
-    const casted_rhs = try self.wip.conv(.unsigned, rhs, try o.lowerType(lhs_ty), "");
+    const casted_rhs = try self.wip.conv(.unsigned, rhs, try o.lowerType(lhs_ty, .by_value), "");
     return self.wip.bin(if (lhs_scalar_ty.isSignedInt(zcu))
         .@"shl nsw"
     else
@@ -4266,7 +4143,7 @@ fn airShl(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
         // features which we do not use. Therefore this branch is currently impossible.
         unreachable;
     }
-    const casted_rhs = try self.wip.conv(.unsigned, rhs, try o.lowerType(lhs_ty), "");
+    const casted_rhs = try self.wip.conv(.unsigned, rhs, try o.lowerType(lhs_ty, .by_value), "");
     return self.wip.bin(.shl, lhs, casted_rhs, "");
 }
 
@@ -4280,8 +4157,8 @@ fn airShlSat(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value
 
     const lhs_ty = self.typeOf(bin_op.lhs);
     const lhs_info = lhs_ty.intInfo(zcu);
-    const llvm_lhs_ty = try o.lowerType(lhs_ty);
-    const llvm_lhs_scalar_ty = try o.lowerType(lhs_ty.scalarType(zcu));
+    const llvm_lhs_ty = try o.lowerType(lhs_ty, .by_value);
+    const llvm_lhs_scalar_ty = try o.lowerType(lhs_ty.scalarType(zcu), .by_value);
 
     const rhs_ty = self.typeOf(bin_op.rhs);
     if (lhs_ty.isVector(zcu) and !rhs_ty.isVector(zcu)) {
@@ -4291,8 +4168,8 @@ fn airShlSat(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value
     }
     const rhs_info = rhs_ty.intInfo(zcu);
     assert(rhs_info.signedness == .unsigned);
-    const llvm_rhs_ty = try o.lowerType(rhs_ty);
-    const llvm_rhs_scalar_ty = try o.lowerType(rhs_ty.scalarType(zcu));
+    const llvm_rhs_ty = try o.lowerType(rhs_ty, .by_value);
+    const llvm_rhs_scalar_ty = try o.lowerType(rhs_ty.scalarType(zcu), .by_value);
 
     const result = try self.wip.callIntrinsic(
         .normal,
@@ -4368,7 +4245,7 @@ fn airShr(self: *FuncGen, inst: Air.Inst.Index, is_exact: bool) Allocator.Error!
     }
     const lhs_scalar_ty = lhs_ty.scalarType(zcu);
 
-    const casted_rhs = try self.wip.conv(.unsigned, rhs, try o.lowerType(lhs_ty), "");
+    const casted_rhs = try self.wip.conv(.unsigned, rhs, try o.lowerType(lhs_ty, .by_value), "");
     const is_signed_int = lhs_scalar_ty.isSignedInt(zcu);
 
     return self.wip.bin(if (is_exact)
@@ -4389,7 +4266,7 @@ fn airAbs(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
             .normal,
             .none,
             .abs,
-            &.{try o.lowerType(operand_ty)},
+            &.{try o.lowerType(operand_ty, .by_value)},
             &.{ operand, .false },
             "",
         ),
@@ -4403,7 +4280,7 @@ fn airIntCast(fg: *FuncGen, inst: Air.Inst.Index, safety: bool) Allocator.Error!
     const zcu = o.zcu;
     const ty_op = fg.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const dest_ty = fg.typeOfIndex(inst);
-    const dest_llvm_ty = try o.lowerType(dest_ty);
+    const dest_llvm_ty = try o.lowerType(dest_ty, .by_value);
     const operand = try fg.resolveInst(ty_op.operand);
     const operand_ty = fg.typeOf(ty_op.operand);
     const operand_info = operand_ty.intInfo(zcu);
@@ -4431,8 +4308,8 @@ fn airIntCast(fg: *FuncGen, inst: Air.Inst.Index, safety: bool) Allocator.Error!
 
         if (!have_min_check and !have_max_check) break :bounds_check;
 
-        const operand_llvm_ty = try o.lowerType(operand_ty);
-        const operand_scalar_llvm_ty = try o.lowerType(operand_scalar);
+        const operand_llvm_ty = try o.lowerType(operand_ty, .by_value);
+        const operand_scalar_llvm_ty = try o.lowerType(operand_scalar, .by_value);
 
         const is_vector = operand_ty.zigTypeTag(zcu) == .vector;
         assert(is_vector == (dest_ty.zigTypeTag(zcu) == .vector));
@@ -4510,7 +4387,7 @@ fn airIntCast(fg: *FuncGen, inst: Air.Inst.Index, safety: bool) Allocator.Error!
 fn airTrunc(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const operand = try self.resolveInst(ty_op.operand);
-    const dest_llvm_ty = try self.object.lowerType(self.typeOfIndex(inst));
+    const dest_llvm_ty = try self.object.lowerType(self.typeOfIndex(inst), .by_value);
     return self.wip.cast(.trunc, operand, dest_llvm_ty, "");
 }
 
@@ -4524,10 +4401,10 @@ fn airFptrunc(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Valu
     const target = zcu.getTarget();
 
     if (intrinsicsAllowed(dest_ty, target) and intrinsicsAllowed(operand_ty, target)) {
-        return self.wip.cast(.fptrunc, operand, try o.lowerType(dest_ty), "");
+        return self.wip.cast(.fptrunc, operand, try o.lowerType(dest_ty, .by_value), "");
     } else {
-        const operand_llvm_ty = try o.lowerType(operand_ty);
-        const dest_llvm_ty = try o.lowerType(dest_ty);
+        const operand_llvm_ty = try o.lowerType(operand_ty, .by_value);
+        const dest_llvm_ty = try o.lowerType(dest_ty, .by_value);
 
         const dest_bits = dest_ty.floatBits(target);
         const src_bits = operand_ty.floatBits(target);
@@ -4558,10 +4435,10 @@ fn airFpext(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value 
     const target = zcu.getTarget();
 
     if (intrinsicsAllowed(dest_ty, target) and intrinsicsAllowed(operand_ty, target)) {
-        return self.wip.cast(.fpext, operand, try o.lowerType(dest_ty), "");
+        return self.wip.cast(.fpext, operand, try o.lowerType(dest_ty, .by_value), "");
     } else {
-        const operand_llvm_ty = try o.lowerType(operand_ty);
-        const dest_llvm_ty = try o.lowerType(dest_ty);
+        const operand_llvm_ty = try o.lowerType(operand_ty, .by_value);
+        const dest_llvm_ty = try o.lowerType(dest_ty, .by_value);
 
         const dest_bits = dest_ty.scalarType(zcu).floatBits(target);
         const src_bits = operand_ty.scalarType(zcu).floatBits(target);
@@ -4599,39 +4476,69 @@ fn airBitCast(fg: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value 
 
     // We have the following `Air.Legalize` features enabled:
     //
-    // * `.scalarize_bitcast_array`
-    // * `.scalarize_bitcast_vector_non_elementwise`
+    // * `.scalarize_bit_cast_array`
+    // * `.scalarize_bit_cast_vector_non_elementwise`
     //
-    // That means the set of bitcasts we might see is limited to the following:
+    // That means the `bit_cast` instructions we might see are limited to the following:
     //
     // * bool/int/float <-> bool/int/float
     // * `@Vector(n, A)` <-> `@Vector(n, B)`
-    // * pointer <-> pointer
-    // * pointer <-> int
-    // * slice <-> slice
     //
-    // Most of these can be handled by LLVM's `bitcast` instruction. We will check for the few cases
-    // that aren't, and otherwise use `bitcast`.
-
-    if (operand_ty.isSlice(zcu) and dest_ty.isSlice(zcu)) {
-        // The slice types are the same type in LLVM IR, so this conversion is a nop.
-        return operand;
-    }
+    // All of these cases can be handled by LLVM's `bitcast` instruction.
 
     assert(!isByRef(operand_ty, zcu));
     assert(!isByRef(dest_ty, zcu));
 
-    const llvm_dest_ty = try o.lowerType(dest_ty);
+    const llvm_dest_ty = try o.lowerType(dest_ty, .by_value);
+    return fg.wip.cast(.bitcast, operand, llvm_dest_ty, "");
+}
 
-    if (operand_ty.scalarType(zcu).zigTypeTag(zcu) == .int and dest_ty.scalarType(zcu).isPtrAtRuntime(zcu)) {
-        return fg.wip.cast(.inttoptr, operand, llvm_dest_ty, "");
-    }
+fn airNopCast(fg: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value { // Lowers a `ty_op` cast by returning the resolved operand as the result; no cast is built in this function.
+    const zcu = fg.object.zcu;
+    const ty_op = fg.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
+    const operand_ty = fg.typeOf(ty_op.operand);
+    const dest_ty = fg.typeOfIndex(inst);
+    assert(isByRef(operand_ty, zcu) == isByRef(dest_ty, zcu)); // operand and result types must agree on `isByRef`
+    assert(operand_ty.abiSize(zcu) == dest_ty.abiSize(zcu)); // operand and result types must have equal ABI size
+    return fg.resolveInst(ty_op.operand); // the operand's resolved value is reused as-is
+}
 
-    if (operand_ty.scalarType(zcu).isPtrAtRuntime(zcu) and dest_ty.scalarType(zcu).zigTypeTag(zcu) == .int) {
-        return fg.wip.cast(.ptrtoint, operand, llvm_dest_ty, "");
-    }
+fn airPtrFromInt(fg: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value { // Lowers this `ty_op` instruction to an LLVM `inttoptr` cast of its operand.
+    const o = fg.object;
+    const zcu = o.zcu;
+    const ty_op = fg.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
+    const operand_ty = fg.typeOf(ty_op.operand);
+    const dest_ty = fg.typeOfIndex(inst);
+    assert(operand_ty.scalarType(zcu).toIntern() == .usize_type); // the operand's scalar type must be exactly `usize`
+    assert(dest_ty.scalarType(zcu).isPtrAtRuntime(zcu)); // the result's scalar type must be a pointer at runtime
 
-    return fg.wip.cast(.bitcast, operand, llvm_dest_ty, "");
+    const operand = try fg.resolveInst(ty_op.operand);
+    const llvm_dest_ty = try o.lowerType(dest_ty, .by_value); // cast target is the result type lowered with `.by_value`
+    return fg.wip.cast(.inttoptr, operand, llvm_dest_ty, "");
+}
+
+fn airIntFromPtr(fg: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value { // Lowers this `ty_op` instruction to an LLVM `ptrtoint` cast of its operand.
+    const o = fg.object;
+    const zcu = o.zcu;
+    const ty_op = fg.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
+    const operand_ty = fg.typeOf(ty_op.operand);
+    const dest_ty = fg.typeOfIndex(inst);
+    assert(operand_ty.scalarType(zcu).isPtrAtRuntime(zcu)); // the operand's scalar type must be a pointer at runtime
+    assert(dest_ty.scalarType(zcu).toIntern() == .usize_type); // the result's scalar type must be exactly `usize`
+
+    const operand = try fg.resolveInst(ty_op.operand);
+    const llvm_dest_ty = try o.lowerType(dest_ty, .by_value); // cast target is the result type lowered with `.by_value`
+    return fg.wip.cast(.ptrtoint, operand, llvm_dest_ty, "");
+}
+
+fn airUnionFromEnum(fg: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value { // Builds an alloca of the result (union) type, stores the enum operand into it, and returns the alloca pointer.
+    const ty_op = fg.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
+    const enum_ty = fg.typeOf(ty_op.operand);
+    const union_ty = fg.typeOfIndex(inst);
+    const enum_val = try fg.resolveInst(ty_op.operand);
+    const union_ptr = try fg.buildZigAlloca(union_ty, .none); // `.none` makes `buildZigAlloca` use the union type's ABI alignment
+    try fg.store(union_ptr, .none, enum_val, enum_ty, .normal); // written at the alloca address itself, stored as `enum_ty`
+    return union_ptr;
 }
 
 fn airArg(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
@@ -4690,8 +4597,7 @@ fn airArg(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
             "",
         );
     } else if (mod.optimize_mode == .Debug) {
-        const alignment = inst_ty.abiAlignment(zcu).toLlvm();
-        const alloca = try self.buildAlloca(try o.lowerType(inst_ty), alignment);
+        const alloca = try self.buildZigAlloca(inst_ty, .none);
         try self.store(alloca, .none, arg_val, inst_ty, .normal);
         _ = try self.wip.callIntrinsic(
             .normal,
@@ -4733,8 +4639,7 @@ fn airAlloc(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value 
     if (!elem_ty.hasRuntimeBits(zcu)) {
         return (try o.lowerPtrToVoid(ptr_align, ptr_ty.ptrAddressSpace(zcu))).toValue();
     }
-    const llvm_elem_ty = try o.lowerType(elem_ty);
-    return self.buildAlloca(llvm_elem_ty, ptr_align.toLlvm());
+    return self.buildZigAlloca(elem_ty, ptr_align);
 }
 
 fn airRetPtr(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
@@ -4747,8 +4652,19 @@ fn airRetPtr(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value
     if (!elem_ty.hasRuntimeBits(zcu)) {
         return (try o.lowerPtrToVoid(ptr_align, ptr_ty.ptrAddressSpace(zcu))).toValue();
     }
-    const llvm_elem_ty = try o.lowerType(elem_ty);
-    return self.buildAlloca(llvm_elem_ty, ptr_align.toLlvm());
+    return self.buildZigAlloca(elem_ty, ptr_align);
+}
+
+fn buildZigAlloca(fg: *FuncGen, ty: Type, @"align": InternPool.Alignment) Allocator.Error!Builder.Value { // Wraps `buildAlloca`: lowers `ty` with `.in_memory` and resolves a `.none` alignment to the ABI alignment of `ty`.
+    const o = fg.object;
+    const resolved_align: InternPool.Alignment = switch (@"align") {
+        .none => ty.abiAlignment(o.zcu), // no explicit alignment given: fall back to the type's ABI alignment
+        else => |a| a, // an explicit alignment is passed through unchanged
+    };
+    return fg.buildAlloca(
+        try o.lowerType(ty, .in_memory),
+        resolved_align.toLlvm(),
+    );
 }
 
 /// Unlike `WipFunction.alloca`, this puts the alloca instruction at the top of the function.
@@ -4823,7 +4739,7 @@ fn airStore(fg: *FuncGen, inst: Air.Inst.Index, safety: bool) Allocator.Error!Bu
             return .none;
         }
 
-        const len = try o.builder.intValue(try o.lowerType(.usize), elem_ty.abiSize(zcu));
+        const len = try o.builder.intValue(try o.lowerType(.usize, .by_value), elem_ty.abiSize(zcu));
         _ = try fg.wip.callMemSet(
             ptr,
             ptr_alignment.toLlvm(),
@@ -4858,7 +4774,7 @@ fn airStore(fg: *FuncGen, inst: Air.Inst.Index, safety: bool) Allocator.Error!Bu
     if (ptr_info.packed_offset.host_size != 0) {
         // Accepted proposal https://github.com/ziglang/zig/issues/24061 will eliminate this usage of `pt`.
         const backing_int_ty = try fg.pt.intType(.unsigned, @intCast(ptr_info.packed_offset.host_size * 8));
-        const llvm_backing_int_ty = try o.lowerType(backing_int_ty);
+        const llvm_backing_int_ty = try o.lowerType(backing_int_ty, .by_value);
 
         const backing_int_val = try fg.load(ptr, ptr_alignment, backing_int_ty, access_kind);
 
@@ -4936,14 +4852,14 @@ fn airLoad(fg: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
 
     // Accepted proposal https://github.com/ziglang/zig/issues/24061 will eliminate this usage of `pt`.
     const backing_int_ty = try fg.pt.intType(.unsigned, @intCast(ptr_info.packed_offset.host_size * 8));
-    const llvm_backing_int_ty = try o.lowerType(backing_int_ty);
+    const llvm_backing_int_ty = try o.lowerType(backing_int_ty, .by_value);
 
     const backing_int_val = try fg.load(ptr, ptr_align, backing_int_ty, .normal);
 
     const elem_bits = ptr_ty.childType(zcu).bitSize(zcu);
     const shift_amt = try o.builder.intValue(llvm_backing_int_ty, ptr_info.packed_offset.bit_offset);
     const shifted_value = try fg.wip.bin(.lshr, backing_int_val, shift_amt, "");
-    const elem_llvm_ty = try o.lowerType(elem_ty);
+    const elem_llvm_ty = try o.lowerType(elem_ty, .by_value);
 
     if (elem_ty.zigTypeTag(zcu) == .float or elem_ty.zigTypeTag(zcu) == .vector) {
         const same_size_int = try o.builder.intType(@intCast(elem_bits));
@@ -4993,7 +4909,7 @@ fn airBreakpoint(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.V
 fn airRetAddr(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
     _ = inst;
     const o = self.object;
-    const llvm_usize = try o.lowerType(.usize);
+    const llvm_usize = try o.lowerType(.usize, .by_value);
     if (!target_util.supportsReturnAddress(self.object.zcu.getTarget(), self.ownerModule().optimize_mode)) {
         // https://github.com/ziglang/zig/issues/11946
         return o.builder.intValue(llvm_usize, 0);
@@ -5005,7 +4921,7 @@ fn airRetAddr(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Valu
 fn airFrameAddress(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
     _ = inst;
     const result = try self.wip.callIntrinsic(.normal, .none, .frameaddress, &.{.ptr}, &.{.@"0"}, "");
-    return self.wip.cast(.ptrtoint, result, try self.object.lowerType(.usize), "");
+    return self.wip.cast(.ptrtoint, result, try self.object.lowerType(.usize, .by_value), ""); // the intrinsic's pointer result is returned as a lowered `usize` via `ptrtoint`
 }
 
 fn airCmpxchg(
@@ -5022,7 +4938,7 @@ fn airCmpxchg(
     var expected_value = try self.resolveInst(extra.expected_value);
     var new_value = try self.resolveInst(extra.new_value);
     const operand_ty = ptr_ty.childType(zcu);
-    const llvm_operand_ty = try o.lowerType(operand_ty);
+    const llvm_operand_ty = try o.lowerType(operand_ty, .by_value);
     const llvm_abi_ty = try self.getAtomicAbiType(operand_ty, false);
     if (llvm_abi_ty != .none) {
         // operand needs widening and truncating
@@ -5066,7 +4982,7 @@ fn airCmpxchg(
     const non_null_bit = try self.wip.not(success_bit, "");
 
     const payload_align = operand_ty.abiAlignment(zcu);
-    const alloca_inst = try self.buildAlloca(try o.lowerType(optional_ty), payload_align.toLlvm());
+    const alloca_inst = try self.buildZigAlloca(optional_ty, .none);
 
     // Payload is always the first field at offset 0, so address is `alloca_inst`
     try self.store(alloca_inst, .none, payload, operand_ty, .normal);
@@ -5092,7 +5008,7 @@ fn airAtomicRmw(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Va
     const op = toLlvmAtomicRmwBinOp(extra.op(), is_signed_int, is_float);
     const ordering = toLlvmAtomicOrdering(extra.ordering());
     const llvm_abi_ty = try self.getAtomicAbiType(operand_ty, op == .xchg);
-    const llvm_operand_ty = try o.lowerType(operand_ty);
+    const llvm_operand_ty = try o.lowerType(operand_ty, .by_value);
 
     const access_kind: Builder.MemoryAccessKind =
         if (ptr_ty.isVolatilePtr(zcu)) .@"volatile" else .normal;
@@ -5121,7 +5037,7 @@ fn airAtomicRmw(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Va
 
     // If we are storing a pointer we need to convert to and from a plain old integer.
     const non_ptr_operand = switch (operand_ty.zigTypeTag(zcu)) {
-        .pointer => try self.wip.cast(.ptrtoint, operand, try o.lowerType(.usize), ""),
+        .pointer => try self.wip.cast(.ptrtoint, operand, try o.lowerType(.usize, .by_value), ""),
         else => operand,
     };
 
@@ -5160,7 +5076,7 @@ fn airAtomicLoad(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.V
         Type.fromInterned(info.child).abiAlignment(zcu)).toLlvm();
     const access_kind: Builder.MemoryAccessKind =
         if (info.flags.is_volatile) .@"volatile" else .normal;
-    const elem_llvm_ty = try o.lowerType(elem_ty);
+    const elem_llvm_ty = try o.lowerType(elem_ty, .by_value);
 
     self.maybeMarkAllowZeroAccess(info);
 
@@ -5306,10 +5222,13 @@ fn airMemset(self: *FuncGen, inst: Air.Inst.Index, safety: bool) Allocator.Error
             }
             if (elem_ty.isAbiInt(zcu)) {
                 const info = elem_ty.intInfo(zcu);
-                break :byte self.wip.conv(info.signedness, value, .i8, "");
+                break :byte try self.wip.conv(switch (info.signedness) {
+                    .unsigned => .unsigned,
+                    .signed => .signed,
+                }, value, .i8, "");
             }
-            if (elem_ty == .bool) {
-                break :byte self.wip.cast(.zext, value, .i8, "");
+            if (elem_ty.toIntern() == .bool_type) {
+                break :byte try self.wip.cast(.zext, value, .i8, "");
             }
             break :intrinsic;
         };
@@ -5459,15 +5378,9 @@ fn airGetUnionTag(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.
     const layout = un_ty.unionGetLayout(zcu);
     assert(layout.tag_size != 0);
     const operand = try self.resolveInst(ty_op.operand);
-    if (isByRef(un_ty, zcu)) {
-        const tag_field_ptr = try self.ptraddConst(operand, layout.tagOffset());
-        return self.load(tag_field_ptr, .none, un_ty.unionTagTypeRuntime(zcu).?, .normal);
-    } else {
-        // This is only possible if all fields are zero-bit, in which case `operand` is already an
-        // integer value (the union is lowered as its enum tag).
-        assert(layout.payload_size == 0);
-        return operand;
-    }
+    assert(isByRef(un_ty, zcu));
+    const tag_field_ptr = try self.ptraddConst(operand, layout.tagOffset());
+    return self.load(tag_field_ptr, .none, un_ty.unionTagTypeRuntime(zcu).?, .normal);
 }
 
 fn airUnaryOp(self: *FuncGen, inst: Air.Inst.Index, comptime op: FloatOp) Allocator.Error!Builder.Value {
@@ -5497,11 +5410,11 @@ fn airClzCtz(self: *FuncGen, inst: Air.Inst.Index, intrinsic: Builder.Intrinsic)
         .normal,
         .none,
         intrinsic,
-        &.{try o.lowerType(operand_ty)},
+        &.{try o.lowerType(operand_ty, .by_value)},
         &.{ operand, .false },
         "",
     );
-    return self.wip.conv(.unsigned, result, try o.lowerType(inst_ty), "");
+    return self.wip.conv(.unsigned, result, try o.lowerType(inst_ty, .by_value), "");
 }
 
 fn airBitOp(self: *FuncGen, inst: Air.Inst.Index, intrinsic: Builder.Intrinsic) Allocator.Error!Builder.Value {
@@ -5515,11 +5428,11 @@ fn airBitOp(self: *FuncGen, inst: Air.Inst.Index, intrinsic: Builder.Intrinsic) 
         .normal,
         .none,
         intrinsic,
-        &.{try o.lowerType(operand_ty)},
+        &.{try o.lowerType(operand_ty, .by_value)},
         &.{operand},
         "",
     );
-    return self.wip.conv(.unsigned, result, try o.lowerType(inst_ty), "");
+    return self.wip.conv(.unsigned, result, try o.lowerType(inst_ty, .by_value), "");
 }
 
 fn airByteSwap(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
@@ -5532,7 +5445,7 @@ fn airByteSwap(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Val
 
     const inst_ty = self.typeOfIndex(inst);
     var operand = try self.resolveInst(ty_op.operand);
-    var llvm_operand_ty = try o.lowerType(operand_ty);
+    var llvm_operand_ty = try o.lowerType(operand_ty, .by_value);
 
     if (bits % 16 == 8) {
         // If not an even byte-multiple, we need zero-extend + shift-left 1 byte
@@ -5553,7 +5466,7 @@ fn airByteSwap(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Val
 
     const result =
         try self.wip.callIntrinsic(.normal, .none, .bswap, &.{llvm_operand_ty}, &.{operand}, "");
-    return self.wip.conv(.unsigned, result, try o.lowerType(inst_ty), "");
+    return self.wip.conv(.unsigned, result, try o.lowerType(inst_ty, .by_value), "");
 }
 
 fn airErrorSetHasValue(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
@@ -5573,7 +5486,7 @@ fn airErrorSetHasValue(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Bui
 
     for (0..names.len) |name_index| {
         const err_int = ip.getErrorValueIfExists(names.get(ip)[name_index]).?;
-        const this_tag_int_value = try o.builder.intConst(try o.errorIntType(), err_int);
+        const this_tag_int_value = try o.builder.intConst(try o.errorIntType(.by_value), err_int);
         try wip_switch.addCase(this_tag_int_value, valid_block, &self.wip);
     }
     self.wip.cursor = .{ .block = valid_block };
@@ -5632,7 +5545,7 @@ fn airErrorName(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Va
     const slice_ty = self.typeOfIndex(inst);
 
     // If operand is small (e.g. `u8`), then signedness becomes a problem -- GEP always treats the index as signed.
-    const operand_usize = try self.wip.conv(.unsigned, operand, try o.lowerType(.usize), "");
+    const operand_usize = try self.wip.conv(.unsigned, operand, try o.lowerType(.usize, .by_value), "");
 
     const error_name_table_ptr = try o.getErrorNameTable();
     const error_name_ptr = try self.ptraddScaled(error_name_table_ptr.toValue(&o.builder), operand_usize, slice_ty.abiSize(zcu));
@@ -5643,7 +5556,7 @@ fn airSplat(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value 
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const scalar = try self.resolveInst(ty_op.operand);
     const vector_ty = self.typeOfIndex(inst);
-    return self.wip.splatVector(try self.object.lowerType(vector_ty), scalar, "");
+    return self.wip.splatVector(try self.object.lowerType(vector_ty, .by_value), scalar, "");
 }
 
 fn airSelect(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Value {
@@ -5666,9 +5579,9 @@ fn airShuffleOne(fg: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Val
     const operand = try fg.resolveInst(unwrapped.operand);
     const mask = unwrapped.mask;
     const operand_ty = fg.typeOf(unwrapped.operand);
-    const llvm_operand_ty = try o.lowerType(operand_ty);
-    const llvm_result_ty = try o.lowerType(unwrapped.result_ty);
-    const llvm_elem_ty = try o.lowerType(unwrapped.result_ty.childType(zcu));
+    const llvm_operand_ty = try o.lowerType(operand_ty, .by_value);
+    const llvm_result_ty = try o.lowerType(unwrapped.result_ty, .by_value);
+    const llvm_elem_ty = try o.lowerType(unwrapped.result_ty.childType(zcu), .by_value);
     const llvm_poison_elem = try o.builder.poisonConst(llvm_elem_ty);
     const llvm_poison_mask_elem = try o.builder.poisonConst(.i32);
     const llvm_mask_ty = try o.builder.vectorType(.normal, @intCast(mask.len), .i32);
@@ -5698,7 +5611,7 @@ fn airShuffleOne(fg: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Val
             .elem => llvm_poison_elem,
             .value => |val| if (!Value.fromInterned(val).isUndef(zcu)) elem: {
                 any_defined_comptime_value = true;
-                break :elem try o.lowerValue(val);
+                break :elem try o.lowerValue(val, .by_value);
             } else llvm_poison_elem,
         };
     }
@@ -5770,7 +5683,7 @@ fn airShuffleTwo(fg: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Val
     const unwrapped = fg.air.unwrapShuffleTwo(zcu, inst);
 
     const mask = unwrapped.mask;
-    const llvm_elem_ty = try o.lowerType(unwrapped.result_ty.childType(zcu));
+    const llvm_elem_ty = try o.lowerType(unwrapped.result_ty.childType(zcu), .by_value);
     const llvm_mask_ty = try o.builder.vectorType(.normal, @intCast(mask.len), .i32);
     const llvm_poison_mask_elem = try o.builder.poisonConst(.i32);
 
@@ -5860,7 +5773,7 @@ fn buildReducedCall(
     accum_init: Builder.Value,
 ) Allocator.Error!Builder.Value {
     const o = self.object;
-    const llvm_usize_ty = try o.lowerType(.usize);
+    const llvm_usize_ty = try o.lowerType(.usize, .by_value);
     const llvm_vector_len = try o.builder.intValue(llvm_usize_ty, vector_len);
     const llvm_result_ty = accum_init.typeOfWip(&self.wip);
 
@@ -5907,7 +5820,7 @@ fn buildReducedCall(
     accum.finish(&.{ accum_init, new_accum }, &.{ entry_block, body_block }, &self.wip);
 
     self.wip.cursor = .{ .block = exit_block };
-    return new_accum;
+    return accum.toValue();
 }
 
 fn airReduce(self: *FuncGen, inst: Air.Inst.Index, fast: Builder.FastMathKind) Allocator.Error!Builder.Value {
@@ -5918,9 +5831,9 @@ fn airReduce(self: *FuncGen, inst: Air.Inst.Index, fast: Builder.FastMathKind) A
     const reduce = self.air.instructions.items(.data)[@intFromEnum(inst)].reduce;
     const operand = try self.resolveInst(reduce.operand);
     const operand_ty = self.typeOf(reduce.operand);
-    const llvm_operand_ty = try o.lowerType(operand_ty);
+    const llvm_operand_ty = try o.lowerType(operand_ty, .by_value);
     const scalar_ty = self.typeOfIndex(inst);
-    const llvm_scalar_ty = try o.lowerType(scalar_ty);
+    const llvm_scalar_ty = try o.lowerType(scalar_ty, .by_value);
 
     switch (reduce.operation) {
         .And, .Or, .Xor => return self.wip.callIntrinsic(.normal, .none, switch (reduce.operation) {
@@ -6027,10 +5940,10 @@ fn airAggregateInit(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builde
     const result_ty = self.typeOfIndex(inst);
     const len: usize = @intCast(result_ty.arrayLen(zcu));
     const elements: []const Air.Inst.Ref = @ptrCast(self.air.extra.items[ty_pl.payload..][0..len]);
-    const llvm_result_ty = try o.lowerType(result_ty);
 
     switch (result_ty.zigTypeTag(zcu)) {
         .vector => {
+            const llvm_result_ty = try o.lowerType(result_ty, .by_value);
             var vector = try o.builder.poisonValue(llvm_result_ty);
             for (elements, 0..) |elem, i| {
                 const index_u32 = try o.builder.intValue(.i32, i);
@@ -6072,7 +5985,7 @@ fn airAggregateInit(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builde
                 // TODO in debug builds init to undef so that the padding will be 0xaa
                 // even if we fully populate the fields.
                 const struct_align = result_ty.abiAlignment(zcu);
-                const alloca_inst = try self.buildAlloca(llvm_result_ty, struct_align.toLlvm());
+                const alloca_inst = try self.buildZigAlloca(result_ty, .none);
 
                 for (elements, 0..) |elem, field_index| {
                     if (result_ty.structFieldIsComptime(field_index, zcu)) continue;
@@ -6093,8 +6006,7 @@ fn airAggregateInit(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builde
         .array => {
             assert(isByRef(result_ty, zcu));
 
-            const alignment = result_ty.abiAlignment(zcu).toLlvm();
-            const alloca_inst = try self.buildAlloca(llvm_result_ty, alignment);
+            const alloca_inst = try self.buildZigAlloca(result_ty, .none);
 
             const array_info = result_ty.arrayInfo(zcu);
 
@@ -6124,7 +6036,6 @@ fn airUnionInit(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Va
     const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl;
     const extra = self.air.extraData(Air.UnionInit, ty_pl.payload).data;
     const union_ty = self.typeOfIndex(inst);
-    const union_llvm_ty = try o.lowerType(union_ty);
     const union_obj = zcu.typeToUnion(union_ty).?;
 
     assert(union_obj.layout != .@"packed");
@@ -6134,8 +6045,7 @@ fn airUnionInit(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Va
     assert(layout.payload_size != 0); // otherwise the value would be comptime-known
     assert(isByRef(union_ty, zcu));
 
-    const alignment = layout.abi_align.toLlvm();
-    const result_ptr = try self.buildAlloca(union_llvm_ty, alignment);
+    const result_ptr = try self.buildZigAlloca(union_ty, layout.abi_align);
     const llvm_payload = try self.resolveInst(extra.init);
     const field_ty = Type.fromInterned(union_obj.field_types.get(ip)[extra.field_index]);
     assert(field_ty.hasRuntimeBits(zcu));
@@ -6150,10 +6060,10 @@ fn airUnionInit(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builder.Va
         const loaded_enum = ip.loadEnumType(tag_ty.toIntern());
         const llvm_tag_val = switch (loaded_enum.field_values.getOrNone(ip, extra.field_index)) {
             .none => try o.builder.intConst(
-                try o.lowerType(.fromInterned(union_obj.enum_tag_type)),
+                try o.lowerType(.fromInterned(union_obj.enum_tag_type), .by_value),
                 extra.field_index, // auto-numbered
             ),
-            else => |tag_val_ip| try o.lowerValue(tag_val_ip),
+            else => |tag_val_ip| try o.lowerValue(tag_val_ip, .by_value),
         };
         const tag_ptr = try self.ptraddConst(result_ptr, layout.tagOffset());
         try self.store(tag_ptr, layout.tag_align, llvm_tag_val.toValue(), tag_ty, .normal);
@@ -6215,7 +6125,7 @@ fn airAddrSpaceCast(self: *FuncGen, inst: Air.Inst.Index) Allocator.Error!Builde
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const inst_ty = self.typeOfIndex(inst);
     const operand = try self.resolveInst(ty_op.operand);
-    return self.wip.cast(.addrspacecast, operand, try self.object.lowerType(inst_ty), "");
+    return self.wip.cast(.addrspacecast, operand, try self.object.lowerType(inst_ty, .by_value), "");
 }
 
 fn workIntrinsic(
@@ -6361,15 +6271,14 @@ fn load(
     const abi_align = load_ty.abiAlignment(zcu);
     const abi_size = load_ty.abiSize(zcu);
 
-    const llvm_load_ty = try o.lowerType(load_ty);
     const llvm_ptr_align: Builder.Alignment = switch (ptr_align) {
         .none => abi_align.toLlvm(),
         else => |a| a.toLlvm(),
     };
 
     if (isByRef(load_ty, zcu)) {
-        const llvm_usize_ty = try o.lowerType(.usize);
-        const result_ptr = try fg.buildAlloca(llvm_load_ty, abi_align.toLlvm());
+        const llvm_usize_ty = try o.lowerType(.usize, .by_value);
+        const result_ptr = try fg.buildZigAlloca(load_ty, .none);
         _ = try fg.wip.callMemCpy(
             result_ptr,
             abi_align.toLlvm(),
@@ -6382,7 +6291,11 @@ fn load(
         return result_ptr;
     }
 
-    if (load_ty.isAbiInt(zcu) and load_ty.bitSize(zcu) != abi_size * 8) {
+    const llvm_memory_ty = try o.lowerType(load_ty, .in_memory);
+    const llvm_value_ty = try o.lowerType(load_ty, .by_value);
+
+    if (llvm_memory_ty != llvm_value_ty) {
+        assert(load_ty.isAbiInt(zcu));
         // `load_ty` is an integer type with padding bits. In theory, we shouldn't need any special
         // handling for these, as LLVM's documented semantics are a valid implementation of Zig's
         // semantics. However:
@@ -6395,8 +6308,7 @@ fn load(
         //
         // Therefore, we handle these memory accesses specially: in this case we will actually load
         // the next-largest "natural" integer type and then truncate to `load_ty`.
-        const llvm_abi_ty = try o.builder.intType(@intCast(abi_size * 8));
-        const loaded = try fg.wip.load(access_kind, llvm_abi_ty, ptr, llvm_ptr_align, "");
+        const loaded = try fg.wip.load(access_kind, llvm_memory_ty, ptr, llvm_ptr_align, "");
         // For packed structs, current Zig semantics don't really allow us to make the padding bits
         // well-defined. This should be solved once https://github.com/ziglang/zig/issues/24061 is
         // implemented, but until then, do a normal trunc for packed types.
@@ -6406,11 +6318,11 @@ fn load(
                 .unsigned => .@"trunc nuw",
                 .signed => .@"trunc nsw",
             },
-        }, loaded, llvm_load_ty, "");
+        }, loaded, llvm_value_ty, "");
     }
 
     // `load_ty` is a simple by-val type which requires no special handling.
-    return fg.wip.load(access_kind, llvm_load_ty, ptr, llvm_ptr_align, "");
+    return fg.wip.load(access_kind, llvm_value_ty, ptr, llvm_ptr_align, "");
 }
 
 /// Non-atomic, non-bitpacked store of `elem` to pointer `ptr`.
@@ -6438,7 +6350,7 @@ fn store(
     };
 
     if (isByRef(elem_ty, zcu)) {
-        const llvm_usize_ty = try o.lowerType(.usize);
+        const llvm_usize_ty = try o.lowerType(.usize, .by_value);
         _ = try fg.wip.callMemCpy(
             ptr,
             llvm_ptr_align,
@@ -6451,16 +6363,19 @@ fn store(
         return;
     }
 
-    assert(elem.typeOfWip(&fg.wip) == try o.lowerType(elem_ty));
+    assert(elem.typeOfWip(&fg.wip) == try o.lowerType(elem_ty, .by_value));
 
-    if (elem_ty.isAbiInt(zcu) and elem_ty.bitSize(zcu) != abi_size * 8) {
+    const llvm_memory_ty = try o.lowerType(elem_ty, .in_memory);
+    const llvm_value_ty = try o.lowerType(elem_ty, .by_value);
+
+    if (llvm_memory_ty != llvm_value_ty) {
+        assert(elem_ty.isAbiInt(zcu));
         // `elem_ty` is an integer type with padding bits, so we need to handle it specially---see
         // the corresponding comment in `FuncGen.load` for more details.
-        const llvm_abi_ty = try o.builder.intType(@intCast(abi_size * 8));
         const extended = try fg.wip.cast(switch (elem_ty.intInfo(zcu).signedness) {
             .unsigned => .zext,
             .signed => .sext,
-        }, elem, llvm_abi_ty, "");
+        }, elem, llvm_memory_ty, "");
         _ = try fg.wip.storeAtomic(
             access_kind,
             extended,
@@ -6486,7 +6401,7 @@ fn store(
 fn valgrindMarkUndef(fg: *FuncGen, ptr: Builder.Value, len: Builder.Value) Allocator.Error!void {
     const VG_USERREQ__MAKE_MEM_UNDEFINED = 1296236545;
     const o = fg.object;
-    const usize_ty = try o.lowerType(.usize);
+    const usize_ty = try o.lowerType(.usize, .by_value);
     const zero = try o.builder.intValue(usize_ty, 0);
     const req = try o.builder.intValue(usize_ty, VG_USERREQ__MAKE_MEM_UNDEFINED);
     const ptr_as_usize = try fg.wip.cast(.ptrtoint, ptr, usize_ty, "");
@@ -6508,7 +6423,7 @@ fn valgrindClientRequest(
     const target = zcu.getTarget();
     if (!target_util.hasValgrindSupport(target, .stage2_llvm)) return default_value;
 
-    const llvm_usize = try o.lowerType(.usize);
+    const llvm_usize = try o.lowerType(.usize, .by_value);
     const usize_align = Type.usize.abiAlignment(zcu).toLlvm();
 
     const array_llvm_ty = try o.builder.arrayType(6, llvm_usize);
@@ -6790,7 +6705,7 @@ const ParamTypeIterator = struct {
                         while (field_it.next()) |field_index| {
                             const field_ty = ty.fieldType(field_index, zcu);
                             if (!field_ty.hasRuntimeBits(zcu)) continue;
-                            it.types_buffer[it.types_len] = try it.object.lowerType(field_ty);
+                            it.types_buffer[it.types_len] = try it.object.lowerType(field_ty, .by_value);
                             it.offsets_buffer[it.types_len] = ty.structFieldOffset(field_index, zcu);
                             it.types_len += 1;
                         }
@@ -6807,7 +6722,7 @@ const ParamTypeIterator = struct {
                         it.llvm_index += 1;
                         return .byval;
                     } else {
-                        it.types_buffer[0..1].* = .{try it.object.lowerType(scalar_ty)};
+                        it.types_buffer[0..1].* = .{try it.object.lowerType(scalar_ty, .by_value)};
                         it.offsets_buffer[0..2].* = .{ 0, scalar_ty.abiSize(zcu) };
                         it.types_len = 1;
                         it.llvm_index += 1;
@@ -6988,166 +6903,138 @@ pub fn iterateParamTypes(object: *Object, fn_info: InternPool.Key.FuncType) Para
     };
 }
 
-fn returnTypeByRef(zcu: *Zcu, target: *const std.Target, ty: Type) bool {
-    if (isByRef(ty, zcu)) {
-        return true;
-    } else if (target.cpu.arch.isX86() and
-        !target.cpu.has(.x86, .avx512f) and
-        ty.totalVectorBits(zcu) >= 512)
-    {
-        // As of LLVM 18, passing a vector byval with fastcc that is 512 bits or more returns
-        // "512-bit vector arguments require 'avx512f' for AVX512"
-        return true;
-    } else {
-        return false;
-    }
-}
-
-pub fn firstParamSRet(fn_info: InternPool.Key.FuncType, zcu: *Zcu, target: *const std.Target) bool {
-    const return_type = Type.fromInterned(fn_info.return_type);
-    if (!return_type.hasRuntimeBits(zcu)) return false;
-
-    return switch (fn_info.cc) {
-        .auto => returnTypeByRef(zcu, target, return_type),
-        .x86_64_sysv, .x86_64_x32 => firstParamSRetSystemV(return_type, zcu, target),
-        .x86_64_win => x86_64_abi.classifyWindows(return_type, zcu, target, .ret) == .memory,
-        .x86_sysv, .x86_win => isByRef(return_type, zcu),
-        .x86_stdcall => !isScalar(zcu, return_type),
-        .x86_fastcall => firstParamSRetX86Fastcall(zcu, return_type),
-        .wasm_mvp => wasm_c_abi.classifyType(return_type, zcu) == .indirect,
-        .aarch64_aapcs,
-        .aarch64_aapcs_darwin,
-        .aarch64_aapcs_win,
-        => aarch64_c_abi.classifyType(return_type, zcu) == .memory,
-        .arm_aapcs, .arm_aapcs_vfp => switch (arm_c_abi.classifyType(return_type, zcu, .ret)) {
-            .memory, .i64_array => true,
-            .i32_array => |size| size != 1,
-            .byval => false,
-        },
-        .riscv64_lp64, .riscv32_ilp32 => riscv_c_abi.classifyType(return_type, zcu) == .memory,
-        .mips_o32 => switch (mips_c_abi.classifyType(return_type, zcu, .ret)) {
-            .memory, .i32_array => true,
-            .byval => false,
-        },
-        else => false, // TODO: investigate other targets/callconvs
-    };
-}
+pub const FnReturnStrat = union(enum) {
+    /// The function return type is OPV (zero-bit), so the LLVM function return type is `void`.
+    void,
+    /// An sret parameter is used. The LLVM function return type is `void`.
+    sret,
+    /// The function's return type directly corresponds to the LLVM function return type.
+    ///
+    /// The return type is by-val, i.e. `isByRef` returns `false`.
+    by_val,
+    /// The LLVM function returns the given `Builder.Type` by reinterpreting memory containing the
+    /// actual return value. The actual return type may be by-val or by-ref.
+    mem_cast: Builder.Type,
 
-fn firstParamSRetX86Fastcall(zcu: *Zcu, ty: Type) bool {
-    if (isScalar(zcu, ty)) {
-        return false;
-    }
-    const tag = ty.zigTypeTag(zcu);
-    if (tag == .@"struct" or tag == .@"union") {
-        const size = ty.abiSize(zcu);
-        if (size == 1 or size == 2 or size == 4 or size == 8) {
-            return false;
-        }
+    fn forceByVal(o: *Object, ret_ty: Type) Allocator.Error!FnReturnStrat {
+        if (!isByRef(ret_ty, o.zcu)) return .by_val;
+        return .{ .mem_cast = try o.lowerType(ret_ty, .in_memory) };
     }
-    return true;
-}
-
-fn firstParamSRetSystemV(ty: Type, zcu: *Zcu, target: *const std.Target) bool {
-    if (isScalar(zcu, ty)) return false;
-    const class = x86_64_abi.classifySystemV(ty, zcu, target, .ret);
-    if (class[0] == .memory) return true;
-    if (class[0] == .x87 and class[2] != .none) return true;
-    return false;
-}
-
+};
 /// In order to support the C calling convention, some return types need to be lowered
 /// completely differently in the function prototype to honor the C ABI, and then
 /// be effectively bitcasted to the actual return type.
-pub fn lowerFnRetTy(o: *Object, fn_info: InternPool.Key.FuncType) Allocator.Error!Builder.Type {
+pub fn fnReturnStrat(o: *Object, fn_info: InternPool.Key.FuncType) Allocator.Error!FnReturnStrat {
     const zcu = o.zcu;
-    const return_type = Type.fromInterned(fn_info.return_type);
-    if (!return_type.hasRuntimeBits(zcu)) {
-        assert(!return_type.isError(zcu));
-        return .void;
-    }
-    const target = zcu.getTarget();
+    const ret_ty: Type = .fromInterned(fn_info.return_type);
+    ret_ty.assertHasLayout(zcu);
+    if (!ret_ty.hasRuntimeBits(zcu)) return .void;
     switch (fn_info.cc) {
         .@"inline" => unreachable,
-        .auto => return if (returnTypeByRef(zcu, target, return_type)) .void else o.lowerType(return_type),
+        .auto => {
+            if (isByRef(ret_ty, zcu)) return .sret;
+
+            const target = zcu.getTarget();
+            if (target.cpu.arch.isX86() and
+                !target.cpu.has(.x86, .avx512f) and
+                ret_ty.totalVectorBits(zcu) >= 512)
+            {
+                // As of LLVM 18, passing a vector byval with fastcc that is 512 bits or more returns
+                // "512-bit vector arguments require 'avx512f' for AVX512"
+                return .sret;
+            }
+
+            return .by_val;
+        },
         .x86_64_sysv, .x86_64_x32 => return lowerSystemVFnRetTy(o, fn_info),
         .x86_64_win => return lowerWin64FnRetTy(o, fn_info),
-        .x86_stdcall => return if (isScalar(zcu, return_type)) o.lowerType(return_type) else .void,
-        .x86_fastcall => return lowerX86FastcallFnRetTy(o, zcu, return_type),
-        .x86_sysv, .x86_win => return if (isByRef(return_type, zcu)) .void else o.lowerType(return_type),
-        .aarch64_aapcs, .aarch64_aapcs_darwin, .aarch64_aapcs_win => switch (aarch64_c_abi.classifyType(return_type, zcu)) {
-            .memory => return .void,
-            .float_array => return o.lowerType(return_type),
-            .byval => return o.lowerType(return_type),
-            .integer => return .i64,
-            .double_integer => return o.builder.arrayType(2, .i64),
+        .x86_stdcall => if (isScalar(zcu, ret_ty)) {
+            assert(!isByRef(ret_ty, zcu));
+            return .by_val;
+        } else return .sret,
+        .x86_fastcall => return lowerX86FastcallFnRetTy(o, zcu, ret_ty),
+        .x86_sysv, .x86_win => return if (isByRef(ret_ty, zcu)) .sret else .by_val,
+        .aarch64_aapcs, .aarch64_aapcs_darwin, .aarch64_aapcs_win => switch (aarch64_c_abi.classifyType(ret_ty, zcu)) {
+            .memory => return .sret,
+            .float_array, .byval => return .forceByVal(o, ret_ty),
+            .integer => return .{ .mem_cast = .i64 },
+            .double_integer => return .{ .mem_cast = try o.builder.arrayType(2, .i64) },
         },
-        .arm_aapcs, .arm_aapcs_vfp => switch (arm_c_abi.classifyType(return_type, zcu, .ret)) {
-            .memory, .i64_array => return .void,
-            .i32_array => |len| return if (len == 1) .i32 else .void,
-            .byval => return o.lowerType(return_type),
+        .arm_aapcs, .arm_aapcs_vfp => switch (arm_c_abi.classifyType(ret_ty, zcu, .ret)) {
+            .memory, .i64_array => return .sret,
+            .i32_array => |len| return if (len == 1) .{ .mem_cast = .i32 } else .sret,
+            .byval => return .forceByVal(o, ret_ty),
         },
-        .mips_o32 => switch (mips_c_abi.classifyType(return_type, zcu, .ret)) {
-            .memory, .i32_array => return .void,
-            .byval => return o.lowerType(return_type),
+        .mips_o32 => switch (mips_c_abi.classifyType(ret_ty, zcu, .ret)) {
+            .memory, .i32_array => return .sret,
+            .byval => return .forceByVal(o, ret_ty),
         },
-        .riscv64_lp64, .riscv32_ilp32 => switch (riscv_c_abi.classifyType(return_type, zcu)) {
-            .memory => return .void,
-            .integer => return o.builder.intType(@intCast(return_type.bitSize(zcu))),
+        .riscv64_lp64, .riscv32_ilp32 => switch (riscv_c_abi.classifyType(ret_ty, zcu)) {
+            .memory => return .sret,
+            .integer => return .{ .mem_cast = try o.builder.intType(@intCast(ret_ty.abiSize(zcu) * 8)) },
             .double_integer => {
                 const integer: Builder.Type = switch (zcu.getTarget().cpu.arch) {
                     .riscv64, .riscv64be => .i64,
                     .riscv32, .riscv32be => .i32,
                     else => unreachable,
                 };
-                return o.builder.structType(.normal, &.{ integer, integer });
+                return .{ .mem_cast = try o.builder.structType(.normal, &.{ integer, integer }) };
             },
-            .byval => return o.lowerType(return_type),
+            .byval => return .forceByVal(o, ret_ty),
             .fields => {
                 var types_len: usize = 0;
                 var types: [8]Builder.Type = undefined;
-                for (0..return_type.structFieldCount(zcu)) |field_index| {
-                    const field_ty = return_type.fieldType(field_index, zcu);
+                for (0..ret_ty.structFieldCount(zcu)) |field_index| {
+                    const field_ty = ret_ty.fieldType(field_index, zcu);
                     if (!field_ty.hasRuntimeBits(zcu)) continue;
-                    types[types_len] = try o.lowerType(field_ty);
+                    types[types_len] = try o.lowerType(field_ty, .by_value);
                     types_len += 1;
                 }
-                return o.builder.structType(.normal, types[0..types_len]);
+                return .{ .mem_cast = try o.builder.structType(.normal, types[0..types_len]) };
             },
         },
-        .wasm_mvp => switch (wasm_c_abi.classifyType(return_type, zcu)) {
-            .direct => |scalar_ty| return o.lowerType(scalar_ty),
-            .indirect => return .void,
+        .wasm_mvp => switch (wasm_c_abi.classifyType(ret_ty, zcu)) {
+            .direct => |scalar_ty| if (scalar_ty.toIntern() == ret_ty.toIntern()) {
+                assert(!isByRef(ret_ty, zcu));
+                return .by_val;
+            } else {
+                return .{ .mem_cast = try o.lowerType(scalar_ty, .by_value) };
+            },
+            .indirect => return .sret,
         },
         // TODO investigate other callconvs
-        else => return o.lowerType(return_type),
+        else => return .forceByVal(o, ret_ty),
     }
 }
 
-fn lowerX86FastcallFnRetTy(o: *Object, zcu: *Zcu, ty: Type) Allocator.Error!Builder.Type {
+fn lowerX86FastcallFnRetTy(o: *Object, zcu: *Zcu, ty: Type) Allocator.Error!FnReturnStrat {
     if (isScalar(zcu, ty)) {
-        return o.lowerType(ty);
+        assert(!isByRef(ty, zcu));
+        return .by_val;
     }
     const tag = ty.zigTypeTag(zcu);
     if (tag == .@"struct" or tag == .@"union") {
         const size = ty.abiSize(zcu);
         if (size == 1 or size == 2 or size == 4 or size == 8) {
-            return o.builder.intType(@intCast(size * 8));
+            return .{ .mem_cast = try o.builder.intType(@intCast(size * 8)) };
         }
     }
-    return .void;
+    return .sret;
 }
 
-fn lowerWin64FnRetTy(o: *Object, fn_info: InternPool.Key.FuncType) Allocator.Error!Builder.Type {
+fn lowerWin64FnRetTy(o: *Object, fn_info: InternPool.Key.FuncType) Allocator.Error!FnReturnStrat {
     const zcu = o.zcu;
-    const return_type = Type.fromInterned(fn_info.return_type);
-    switch (x86_64_abi.classifyWindows(return_type, zcu, zcu.getTarget(), .ret)) {
-        .integer => {
-            if (isScalar(zcu, return_type)) {
-                return o.lowerType(return_type);
-            } else {
-                return o.builder.intType(@intCast(return_type.abiSize(zcu) * 8));
-            }
+    const ret_ty = Type.fromInterned(fn_info.return_type);
+    switch (x86_64_abi.classifyWindows(ret_ty, zcu, zcu.getTarget(), .ret)) {
+        .integer => if (isScalar(zcu, ret_ty)) {
+            assert(!isByRef(ret_ty, zcu));
+            return .by_val;
+        } else {
+            return .{ .mem_cast = try o.builder.intType(@intCast(ret_ty.abiSize(zcu) * 8)) };
         },
+        .win_i128 => return .{ .mem_cast = try o.builder.vectorType(.normal, 2, .i64) },
+        .memory => return .sret,
+
         .sse,
         .bool_vector_mask,
         .integer_per_element,
@@ -7156,7 +7043,10 @@ fn lowerWin64FnRetTy(o: *Object, fn_info: InternPool.Key.FuncType) Allocator.Err
         .sse_per_xword,
         .sse_per_yword,
         .sse_per_zword,
-        => return o.lowerType(return_type),
+        => {
+            assert(!isByRef(ret_ty, zcu));
+            return .by_val;
+        },
         .sseup,
         .x87,
         .x87up,
@@ -7164,20 +7054,18 @@ fn lowerWin64FnRetTy(o: *Object, fn_info: InternPool.Key.FuncType) Allocator.Err
         .float,
         .float_combine,
         => unreachable,
-        .win_i128 => return o.builder.vectorType(.normal, 2, .i64),
-        .memory => return .void,
     }
 }
 
-fn lowerSystemVFnRetTy(o: *Object, fn_info: InternPool.Key.FuncType) Allocator.Error!Builder.Type {
+fn lowerSystemVFnRetTy(o: *Object, fn_info: InternPool.Key.FuncType) Allocator.Error!FnReturnStrat {
     const zcu = o.zcu;
     const ip = &zcu.intern_pool;
-    const return_type = Type.fromInterned(fn_info.return_type);
-    return_type.assertHasLayout(zcu);
-    if (isScalar(zcu, return_type)) {
-        return o.lowerType(return_type);
+    const ret_ty = Type.fromInterned(fn_info.return_type);
+    if (isScalar(zcu, ret_ty)) {
+        assert(!isByRef(ret_ty, zcu));
+        return .by_val;
     }
-    const classes = x86_64_abi.classifySystemV(return_type, zcu, zcu.getTarget(), .ret);
+    const classes = x86_64_abi.classifySystemV(ret_ty, zcu, zcu.getTarget(), .ret);
     var types_index: u32 = 0;
     var types_buffer: [8]Builder.Type = undefined;
     for (classes) |class| {
@@ -7207,13 +7095,13 @@ fn lowerSystemVFnRetTy(o: *Object, fn_info: InternPool.Key.FuncType) Allocator.E
                 types_index += 1;
             },
             .x87 => {
-                if (types_index != 0 or classes[2] != .none) return .void;
+                if (types_index != 0 or classes[2] != .none) return .sret;
                 types_buffer[types_index] = .x86_fp80;
                 types_index += 1;
             },
             .x87up => continue,
             .none => break,
-            .memory => return .void,
+            .memory => return .sret,
             .win_i128 => unreachable, // windows only
             .bool_vector_mask,
             .integer_per_element,
@@ -7228,9 +7116,9 @@ fn lowerSystemVFnRetTy(o: *Object, fn_info: InternPool.Key.FuncType) Allocator.E
     const first_non_integer = std.mem.indexOfNone(x86_64_abi.Class, &classes, &.{.integer});
     if (first_non_integer == null or classes[first_non_integer.?] == .none) {
         assert(first_non_integer orelse classes.len == types_index);
-        switch (ip.indexToKey(return_type.toIntern())) {
+        switch (ip.indexToKey(ret_ty.toIntern())) {
             .struct_type => {
-                const size = return_type.abiSize(zcu);
+                const size = ret_ty.abiSize(zcu);
                 assert((std.math.divCeil(u64, size, 8) catch unreachable) == types_index);
                 if (size % 8 > 0) {
                     types_buffer[types_index - 1] = try o.builder.intType(@intCast(size % 8 * 8));
@@ -7238,9 +7126,9 @@ fn lowerSystemVFnRetTy(o: *Object, fn_info: InternPool.Key.FuncType) Allocator.E
             },
             else => {},
         }
-        if (types_index == 1) return types_buffer[0];
+        if (types_index == 1) return .{ .mem_cast = types_buffer[0] };
     }
-    return o.builder.structType(.normal, types_buffer[0..types_index]);
+    return .{ .mem_cast = try o.builder.structType(.normal, types_buffer[0..types_index]) };
 }
 
 /// This function deliberately does not handle `_BitInt` because it typically
@@ -7380,7 +7268,7 @@ pub fn isByRef(ty: Type, zcu: *const Zcu) bool {
         },
         .@"union" => switch (ty.containerLayout(zcu)) {
             .@"packed" => false,
-            else => ty.hasRuntimeBits(zcu) and !ty.unionHasAllZeroBitFieldTypes(zcu),
+            else => ty.hasRuntimeBits(zcu),
         },
     };
 }
@@ -7411,7 +7299,7 @@ fn getAtomicAbiType(fg: *const FuncGen, ty: Type, is_rmw_xchg: bool) Allocator.E
 fn ptraddConst(fg: *FuncGen, ptr: Builder.Value, offset: u64) Allocator.Error!Builder.Value {
     if (offset == 0) return ptr;
     const o = fg.object;
-    const llvm_usize_ty = try o.lowerType(.usize);
+    const llvm_usize_ty = try o.lowerType(.usize, .by_value);
     const offset_val = try o.builder.intValue(llvm_usize_ty, offset);
     return fg.wip.gep(.inbounds, .i8, ptr, &.{offset_val}, "");
 }
diff --git a/src/codegen/mips/abi.zig b/src/codegen/mips/abi.zig
@@ -18,24 +18,23 @@ pub fn classifyType(ty: Type, zcu: *Zcu, ctx: Context) Class {
     const max_direct_size = target.ptrBitWidth() * 2;
     switch (ty.zigTypeTag(zcu)) {
         .@"struct" => {
-            const bit_size = ty.bitSize(zcu);
             if (ty.containerLayout(zcu) == .@"packed") {
-                if (bit_size > max_direct_size) return .memory;
+                if (ty.bitSize(zcu) > max_direct_size) return .memory;
                 return .byval;
             }
+            const bit_size = ty.abiSize(zcu) * 8;
             if (bit_size > max_direct_size) return .memory;
             // TODO: for bit_size <= 32 using byval is more correct, but that needs inreg argument attribute
             const count = @as(u8, @intCast(std.mem.alignForward(u64, bit_size, 32) / 32));
             return .{ .i32_array = count };
         },
         .@"union" => {
-            const bit_size = ty.bitSize(zcu);
             if (ty.containerLayout(zcu) == .@"packed") {
-                if (bit_size > max_direct_size) return .memory;
+                if (ty.bitSize(zcu) > max_direct_size) return .memory;
                 return .byval;
             }
+            const bit_size = ty.abiSize(zcu) * 8;
             if (bit_size > max_direct_size) return .memory;
-
             return .byval;
         },
         .bool => return .byval,
diff --git a/src/codegen/riscv64/CodeGen.zig b/src/codegen/riscv64/CodeGen.zig
@@ -51,7 +51,7 @@ const InnerError = codegen.Error || error{OutOfRegisters};
 
 pub fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features {
     return comptime &.initMany(&.{
-        .expand_intcast_safe,
+        .expand_int_cast_safe,
         .expand_int_from_float_safe,
         .expand_int_from_float_optimized_safe,
         .expand_add_safe,
@@ -1453,7 +1453,7 @@ fn genBody(func: *Func, body: []const Air.Inst.Index) InnerError!void {
             .add_safe,
             .sub_safe,
             .mul_safe,
-            .intcast_safe,
+            .int_cast_safe,
             .int_from_float_safe,
             .int_from_float_optimized_safe,
             => return func.fail("TODO implement safety_checked_instructions", .{}),
@@ -1479,7 +1479,14 @@ fn genBody(func: *Func, body: []const Air.Inst.Index) InnerError!void {
             .ret_ptr         => try func.airRetPtr(inst),
             .arg             => try func.airArg(inst),
             .assembly        => try func.airAsm(inst),
-            .bitcast         => try func.airBitCast(inst),
+            .bit_cast        => try func.airBitCast(inst),
+            .ptr_cast        => try func.airBitCast(inst),
+            .ptr_from_int    => try func.airBitCast(inst),
+            .int_from_ptr    => try func.airBitCast(inst),
+            .error_cast      => try func.airBitCast(inst),
+            .error_from_int  => try func.airBitCast(inst),
+            .int_from_error  => try func.airBitCast(inst),
+            .union_from_enum => try func.airBitCast(inst),
             .block           => try func.airBlock(inst),
             .br              => try func.airBr(inst),
             .repeat          => try func.airRepeat(inst),
@@ -1493,7 +1500,7 @@ fn genBody(func: *Func, body: []const Air.Inst.Index) InnerError!void {
             .dbg_empty_stmt  => func.finishAirBookkeeping(),
             .fptrunc         => try func.airFptrunc(inst),
             .fpext           => try func.airFpext(inst),
-            .intcast         => try func.airIntCast(inst),
+            .int_cast        => try func.airIntCast(inst),
             .trunc           => try func.airTrunc(inst),
             .is_non_null     => try func.airIsNonNull(inst),
             .is_non_null_ptr => try func.airIsNonNullPtr(inst),
@@ -3953,9 +3960,7 @@ fn airPtrElemPtr(func: *Func, inst: Air.Inst.Index) !void {
         const elem_ptr_ty = func.typeOfIndex(inst);
         const base_ptr_ty = func.typeOf(extra.lhs);
 
-        if (elem_ptr_ty.ptrInfo(zcu).flags.vector_index != .none) {
-            @panic("audit");
-        }
+        assert(elem_ptr_ty.ptrInfo(zcu).flags.vector_index == .none);
 
         const base_ptr_mcv = try func.resolveInst(extra.lhs);
         const base_ptr_lock: ?RegisterLock = switch (base_ptr_mcv) {
diff --git a/src/codegen/riscv64/abi.zig b/src/codegen/riscv64/abi.zig
@@ -16,9 +16,8 @@ pub fn classifyType(ty: Type, zcu: *Zcu) Class {
     const max_byval_size = target.ptrBitWidth() * 2;
     switch (ty.zigTypeTag(zcu)) {
         .@"struct" => {
-            const bit_size = ty.bitSize(zcu);
             if (ty.containerLayout(zcu) == .@"packed") {
-                if (bit_size > max_byval_size) return .memory;
+                if (ty.bitSize(zcu) > max_byval_size) return .memory;
                 return .byval;
             }
 
@@ -40,17 +39,18 @@ pub fn classifyType(ty: Type, zcu: *Zcu) Class {
             }
 
             // TODO this doesn't exactly match what clang produces but its better than nothing
+            const bit_size = ty.abiSize(zcu) * 8;
             if (bit_size > max_byval_size) return .memory;
             if (bit_size > max_byval_size / 2) return .double_integer;
             return .integer;
         },
         .@"union" => {
-            const bit_size = ty.bitSize(zcu);
             if (ty.containerLayout(zcu) == .@"packed") {
-                if (bit_size > max_byval_size) return .memory;
+                if (ty.bitSize(zcu) > max_byval_size) return .memory;
                 return .byval;
             }
             // TODO this doesn't exactly match what clang produces but its better than nothing
+            const bit_size = ty.abiSize(zcu) * 8;
             if (bit_size > max_byval_size) return .memory;
             if (bit_size > max_byval_size / 2) return .double_integer;
             return .integer;
@@ -153,13 +153,12 @@ pub fn classifySystem(ty: Type, zcu: *Zcu) [8]SystemClass {
         },
         .error_union => {
             const payload_ty = ty.errorUnionPayload(zcu);
-            const payload_bits = payload_ty.bitSize(zcu);
 
             // the error union itself
             result[0] = .integer;
 
             // anyerror!void can fit into one register
-            if (payload_bits == 0) return result;
+            if (!payload_ty.hasRuntimeBits(zcu)) return result;
 
             return memory_class;
         },
diff --git a/src/codegen/sparc64/CodeGen.zig b/src/codegen/sparc64/CodeGen.zig
@@ -538,7 +538,14 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
             .ret_ptr         => try self.airRetPtr(inst),
             .arg             => try self.airArg(inst),
             .assembly        => try self.airAsm(inst),
-            .bitcast         => try self.airBitCast(inst),
+            .bit_cast        => try self.airBitCast(inst),
+            .ptr_cast        => try self.airBitCast(inst),
+            .ptr_from_int    => try self.airBitCast(inst),
+            .int_from_ptr    => try self.airBitCast(inst),
+            .error_cast      => try self.airBitCast(inst),
+            .error_from_int  => try self.airBitCast(inst),
+            .int_from_error  => try self.airBitCast(inst),
+            .union_from_enum => try self.airBitCast(inst),
             .block           => try self.airBlock(inst),
             .br              => try self.airBr(inst),
             .repeat          => return self.fail("TODO implement `repeat`", .{}),
@@ -550,7 +557,7 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
             .cond_br         => try self.airCondBr(inst),
             .fptrunc         => @panic("TODO try self.airFptrunc(inst)"),
             .fpext           => @panic("TODO try self.airFpext(inst)"),
-            .intcast         => try self.airIntCast(inst),
+            .int_cast        => try self.airIntCast(inst),
             .trunc           => try self.airTrunc(inst),
             .is_non_null     => try self.airIsNonNull(inst),
             .is_non_null_ptr => @panic("TODO try self.airIsNonNullPtr(inst)"),
@@ -689,7 +696,7 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
             .add_safe,
             .sub_safe,
             .mul_safe,
-            .intcast_safe,
+            .int_cast_safe,
             .int_from_float_safe,
             .int_from_float_optimized_safe,
             => @panic("TODO implement safety_checked_instructions"),
@@ -1659,7 +1666,7 @@ fn airIntCast(self: *Self, inst: Air.Inst.Index) !void {
     const info_a = operand_ty.intInfo(zcu);
     const info_b = self.typeOfIndex(inst).intInfo(zcu);
     if (info_a.signedness != info_b.signedness)
-        return self.fail("TODO gen intcast sign safety in semantic analysis", .{});
+        return self.fail("TODO gen int_cast sign safety in semantic analysis", .{});
 
     if (info_a.bits == info_b.bits)
         return self.finishAir(inst, operand, .{ ty_op.operand, .none, .none });
diff --git a/src/codegen/spirv/CodeGen.zig b/src/codegen/spirv/CodeGen.zig
@@ -34,7 +34,7 @@ const CodeGen = @This();
 
 pub fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features {
     return comptime &.initMany(&.{
-        .expand_intcast_safe,
+        .expand_int_cast_safe,
         .expand_int_from_float_safe,
         .expand_int_from_float_optimized_safe,
         .expand_add_safe,
@@ -1848,7 +1848,17 @@ fn resolveType(cg: *CodeGen, ty: Type, repr: Repr) Error!Id {
         .pointer => {
             const ptr_info = ty.ptrInfo(zcu);
 
-            const child_ty: Type = .fromInterned(ptr_info.child);
+            const child_ty: Type = switch (ptr_info.packed_offset.host_size) {
+                0 => .fromInterned(ptr_info.child),
+                else => switch (ptr_info.flags.vector_index) {
+                    // Accepted proposal https://github.com/ziglang/zig/issues/24061 will eliminate these usages of `pt`.
+                    .none => try pt.intType(.unsigned, ptr_info.packed_offset.host_size * 8),
+                    else => try pt.vectorType(.{
+                        .child = ptr_info.child,
+                        .len = ptr_info.packed_offset.host_size,
+                    }),
+                },
+            };
             const child_ty_id = try cg.resolveType(child_ty, .indirect);
             const storage_class = cg.module.storageClass(ptr_info.flags.address_space);
             const ptr_ty_id = try cg.module.ptrType(child_ty_id, storage_class);
@@ -3847,12 +3857,19 @@ fn genInst(cg: *CodeGen, inst: Air.Inst.Index) Error!void {
             .min => try cg.airMinMax(inst, .min),
             .max => try cg.airMinMax(inst, .max),
 
-            .bitcast         => try cg.airBitCast(inst),
-            .intcast, .trunc => try cg.airIntCast(inst),
-            .float_from_int  => try cg.airFloatFromInt(inst),
-            .int_from_float  => try cg.airIntFromFloat(inst),
-            .fpext, .fptrunc => try cg.airFloatCast(inst),
-            .not             => try cg.airNot(inst),
+            .bit_cast         => try cg.airBitCast(inst),
+            .ptr_cast         => try cg.airBitCast(inst),
+            .ptr_from_int     => try cg.airBitCast(inst),
+            .int_from_ptr     => try cg.airBitCast(inst),
+            .error_cast       => try cg.airBitCast(inst),
+            .error_from_int   => try cg.airBitCast(inst),
+            .int_from_error   => try cg.airBitCast(inst),
+            .union_from_enum  => try cg.airBitCast(inst),
+            .int_cast, .trunc => try cg.airIntCast(inst),
+            .float_from_int   => try cg.airFloatFromInt(inst),
+            .int_from_float   => try cg.airIntFromFloat(inst),
+            .fpext, .fptrunc  => try cg.airFloatCast(inst),
+            .not              => try cg.airNot(inst),
 
             .array_to_slice => try cg.airArrayToSlice(inst),
             .slice          => try cg.airSlice(inst),
@@ -6913,13 +6930,15 @@ fn airLoad(cg: *CodeGen, inst: Air.Inst.Index) !?Id {
     const zcu = cg.module.zcu;
     const pt = cg.pt;
     const ty_op = cg.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
-    const ptr_ty = cg.typeOf(ty_op.operand);
-    const ptr_info = ptr_ty.ptrInfo(zcu);
+
+    const ptr_info = cg.typeOf(ty_op.operand).ptrInfo(zcu);
+
     const elem_ty = cg.typeOfIndex(inst);
-    const operand = try cg.resolve(ty_op.operand);
-    if (!ptr_ty.isVolatilePtr(zcu) and cg.liveness.isUnused(inst)) return null;
+    const operand_ptr_id = try cg.resolve(ty_op.operand);
 
-    if (cg.virtual_allocas.get(operand)) |stored| return stored.?;
+    assert(ptr_info.child == elem_ty.toIntern());
+
+    if (cg.virtual_allocas.get(operand_ptr_id)) |stored| return stored.?;
 
     if (ptr_info.packed_offset.host_size != 0 and
         ptr_info.flags.vector_index == .none)
@@ -6927,7 +6946,7 @@ fn airLoad(cg: *CodeGen, inst: Air.Inst.Index) !?Id {
         const host_bits: u16 = ptr_info.packed_offset.host_size * 8;
         const elem_bit_size: u16 = @intCast(elem_ty.bitSize(zcu));
         const host_int_ty = try pt.intType(.unsigned, host_bits);
-        const host_val = try cg.load(host_int_ty, operand, .{ .is_volatile = ptr_ty.isVolatilePtr(zcu) });
+        const host_val = try cg.load(host_int_ty, operand_ptr_id, .{ .is_volatile = ptr_info.flags.is_volatile });
         const signedness: Signedness = if (elem_ty.isInt(zcu)) elem_ty.intInfo(zcu).signedness else .unsigned;
         const field_int_ty = try pt.intType(signedness, elem_bit_size);
         const narrowed = if (ptr_info.packed_offset.bit_offset > 0) blk: {
@@ -6946,21 +6965,30 @@ fn airLoad(cg: *CodeGen, inst: Air.Inst.Index) !?Id {
         return try cg.bitCast(elem_ty, field_int_ty, result_id);
     }
 
-    return try cg.load(elem_ty, operand, .{ .is_volatile = ptr_ty.isVolatilePtr(zcu) });
+    const ptr_id = switch (ptr_info.flags.vector_index) {
+        .none => operand_ptr_id,
+        else => |index| ptr_id: {
+            const elem_ptr_ty_id = try cg.module.ptrType(
+                try cg.resolveType(elem_ty, .indirect),
+                cg.module.storageClass(ptr_info.flags.address_space),
+            );
+            break :ptr_id try cg.accessChain(elem_ptr_ty_id, operand_ptr_id, &.{@intFromEnum(index)});
+        },
+    };
+    return try cg.load(elem_ty, ptr_id, .{ .is_volatile = ptr_info.flags.is_volatile });
 }
 
 fn airStore(cg: *CodeGen, inst: Air.Inst.Index) !void {
     const zcu = cg.module.zcu;
     const pt = cg.pt;
     const bin_op = cg.air.instructions.items(.data)[@intFromEnum(inst)].bin_op;
-    const ptr_ty = cg.typeOf(bin_op.lhs);
-    const ptr_info = ptr_ty.ptrInfo(zcu);
-    const elem_ty = ptr_ty.childType(zcu);
-    const ptr = try cg.resolve(bin_op.lhs);
-    const value = try cg.resolve(bin_op.rhs);
+    const ptr_info = cg.typeOf(bin_op.lhs).ptrInfo(zcu);
+    const elem_ty: Type = .fromInterned(ptr_info.child);
+    const operand_ptr_id = try cg.resolve(bin_op.lhs);
+    const value_id = try cg.resolve(bin_op.rhs);
 
-    if (cg.virtual_allocas.getPtr(ptr)) |slot| {
-        slot.* = value;
+    if (cg.virtual_allocas.getPtr(operand_ptr_id)) |slot| {
+        slot.* = value_id;
         return;
     }
 
@@ -6969,19 +6997,19 @@ fn airStore(cg: *CodeGen, inst: Air.Inst.Index) !void {
     {
         const host_bits: u16 = ptr_info.packed_offset.host_size * 8;
         const host_int_ty = try pt.intType(.unsigned, host_bits);
-        const host_val = try cg.load(host_int_ty, ptr, .{ .is_volatile = ptr_ty.isVolatilePtr(zcu) });
+        const host_val = try cg.load(host_int_ty, operand_ptr_id, .{ .is_volatile = ptr_info.flags.is_volatile });
         const elem_bit_size: u16 = @intCast(elem_ty.bitSize(zcu));
         const signedness: Signedness = if (elem_ty.isInt(zcu)) elem_ty.intInfo(zcu).signedness else .unsigned;
         const field_int_ty = try pt.intType(signedness, elem_bit_size);
 
         var value_as_int: Id = undefined;
         if (elem_ty.ip_index == .bool_type) {
-            value_as_int = try cg.convertToIndirect(.bool, value);
+            value_as_int = try cg.convertToIndirect(.bool, value_id);
             value_as_int = try cg.bitCast(field_int_ty, .u1, value_as_int);
         } else if (elem_ty.isInt(zcu)) {
-            value_as_int = value;
+            value_as_int = value_id;
         } else {
-            value_as_int = try cg.bitCast(field_int_ty, elem_ty, value);
+            value_as_int = try cg.bitCast(field_int_ty, elem_ty, value_id);
         }
 
         const extended = blk: {
@@ -7002,11 +7030,22 @@ fn airStore(cg: *CodeGen, inst: Air.Inst.Index) !void {
         const combined = try cg.buildBinary(.OpBitwiseOr, cleared, shifted_val);
         const combined_id = try combined.materialize(cg);
 
-        try cg.store(host_int_ty, ptr, combined_id, .{ .is_volatile = ptr_ty.isVolatilePtr(zcu) });
+        try cg.store(host_int_ty, operand_ptr_id, combined_id, .{ .is_volatile = ptr_info.flags.is_volatile });
         return;
     }
 
-    try cg.store(elem_ty, ptr, value, .{ .is_volatile = ptr_ty.isVolatilePtr(zcu) });
+    const ptr_id = switch (ptr_info.flags.vector_index) {
+        .none => operand_ptr_id,
+        else => |index| ptr_id: {
+            const elem_ptr_ty_id = try cg.module.ptrType(
+                try cg.resolveType(elem_ty, .indirect),
+                cg.module.storageClass(ptr_info.flags.address_space),
+            );
+            break :ptr_id try cg.accessChain(elem_ptr_ty_id, operand_ptr_id, &.{@intFromEnum(index)});
+        },
+    };
+
+    try cg.store(elem_ty, ptr_id, value_id, .{ .is_volatile = ptr_info.flags.is_volatile });
 }
 
 fn airRet(cg: *CodeGen, inst: Air.Inst.Index) !void {
diff --git a/src/codegen/wasm/CodeGen.zig b/src/codegen/wasm/CodeGen.zig
@@ -32,7 +32,7 @@ const compilerRtIntAbbrev = target_util.compilerRtIntAbbrev;
 
 pub fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features {
     return comptime &.initMany(&.{
-        .expand_intcast_safe,
+        .expand_int_cast_safe,
         .expand_int_from_float_safe,
         .expand_int_from_float_optimized_safe,
         .expand_add_safe,
@@ -108,7 +108,10 @@ pub fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features {
         .scalarize_cmp_vector_optimized,
         .scalarize_fptrunc,
         .scalarize_fpext,
-        .scalarize_intcast,
+        .scalarize_int_cast,
+        .scalarize_ptr_cast,
+        .scalarize_ptr_from_int,
+        .scalarize_int_from_ptr,
         .scalarize_trunc,
         .scalarize_int_from_float,
         .scalarize_int_from_float_optimized,
@@ -120,7 +123,7 @@ pub fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features {
         .scalarize_select,
         .scalarize_mul_add,
 
-        .scalarize_bitcast_padded_elems,
+        .scalarize_bit_cast_padded_elems,
     });
 }
 
@@ -1551,9 +1554,17 @@ fn genInst(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
             try cg.finishAir(inst, result, &.{ty_op.operand});
         },
 
-        .bitcast => cg.airBitcast(inst),
+        .ptr_cast => cg.airNopCast(inst),
+        .error_cast => cg.airNopCast(inst),
+        .error_from_int => cg.airNopCast(inst),
+        .int_from_error => cg.airNopCast(inst),
+        .ptr_from_int => cg.airNopCast(inst),
+        .int_from_ptr => cg.airIntFromPtr(inst),
 
-        .intcast => {
+        .bit_cast => cg.airBitcast(inst),
+        .union_from_enum => cg.airBitcast(inst),
+
+        .int_cast => {
             const ty_op = cg.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
 
             const dest_ty = ty_op.ty.toType();
@@ -1561,7 +1572,7 @@ fn genInst(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
             const src_ty = cg.typeOf(ty_op.operand);
 
             if (dest_ty.zigTypeTag(zcu) == .vector) {
-                return cg.fail("TODO: implement AIR op: intcast for vectors", .{});
+                return cg.fail("TODO: implement AIR op: int_cast for vectors", .{});
             }
 
             const src_int_ty: IntType = .fromType(cg, src_ty);
@@ -1876,7 +1887,7 @@ fn genInst(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
         .add_safe,
         .sub_safe,
         .mul_safe,
-        .intcast_safe,
+        .int_cast_safe,
         .int_from_float_safe,
         .int_from_float_optimized_safe,
         => return cg.fail("TODO implement safety_checked_instructions", .{}),
@@ -2100,16 +2111,20 @@ fn airStore(cg: *CodeGen, inst: Air.Inst.Index, safety: bool) InnerError!void {
     const rhs = try cg.resolveInst(bin_op.rhs);
     const ptr_ty = cg.typeOf(bin_op.lhs);
     const ptr_info = ptr_ty.ptrInfo(zcu);
-    const ty = ptr_ty.childType(zcu);
+    const elem_ty = ptr_ty.childType(zcu);
 
     if (!safety and bin_op.rhs == .undef) {
         return cg.finishAir(inst, .none, &.{ bin_op.lhs, bin_op.rhs });
     }
 
-    assert(!(ptr_info.packed_offset.host_size > 0 and ptr_info.flags.vector_index == .none)); // legalize .expand_packed_store
-
-    try cg.store(lhs, rhs, ty, 0);
-
+    const offset: u32 = switch (ptr_info.flags.vector_index) {
+        .none => offset: {
+            assert(ptr_info.packed_offset.host_size == 0); // legalize .expand_packed_store
+            break :offset 0;
+        },
+        else => |index| @intCast(@intFromEnum(index) * elem_ty.abiSize(zcu)),
+    };
+    try cg.store(lhs, rhs, elem_ty, offset);
     return cg.finishAir(inst, .none, &.{ bin_op.lhs, bin_op.rhs });
 }
 
@@ -2122,7 +2137,16 @@ fn store(cg: *CodeGen, lhs: WValue, rhs: WValue, ty: Type, offset: u32) InnerErr
     if (!ty.hasRuntimeBits(zcu)) return;
 
     if (isByRef(ty, zcu, cg.target)) {
-        return cg.memcpy(lhs, rhs, .{ .imm32 = @intCast(abi_size) });
+        const offset_ptr: WValue = switch (offset + lhs.offset()) {
+            0 => lhs,
+            else => |total_offset| ptr: {
+                try cg.emitWValue(lhs);
+                try cg.addImm32(total_offset);
+                try cg.addTag(.i32_add);
+                break :ptr .stack;
+            },
+        };
+        return cg.memcpy(offset_ptr, rhs, .{ .imm32 = @intCast(abi_size) });
     }
 
     if (ty.zigTypeTag(zcu) == .vector) {
@@ -2134,7 +2158,7 @@ fn store(cg: *CodeGen, lhs: WValue, rhs: WValue, ty: Type, offset: u32) InnerErr
         try cg.mir_extra.appendSlice(cg.gpa, &[_]u32{
             @intFromEnum(std.wasm.SimdOpcode.v128_store),
             offset + lhs.offset(),
-            @intCast(ty.abiAlignment(zcu).toByteUnits() orelse 0),
+            @intCast(ty.abiAlignment(zcu).toByteUnits().?),
         });
         return cg.addInst(.{ .tag = .simd_prefix, .data = .{ .payload = extra_index } });
     }
@@ -2175,15 +2199,20 @@ fn airLoad(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
     const zcu = pt.zcu;
     const ty_op = cg.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const operand = try cg.resolveInst(ty_op.operand);
-    const ty = ty_op.ty.toType();
+    const elem_ty = ty_op.ty.toType();
     const ptr_ty = cg.typeOf(ty_op.operand);
     const ptr_info = ptr_ty.ptrInfo(zcu);
 
-    if (!ty.hasRuntimeBits(zcu)) return cg.finishAir(inst, .none, &.{ty_op.operand});
-
-    assert(!(ptr_info.packed_offset.host_size > 0 and ptr_info.flags.vector_index == .none)); // legalize .expand_packed_load
+    assert(elem_ty.hasRuntimeBits(zcu));
 
-    const result = try cg.load(operand, ty, 0);
+    const offset: u32 = switch (ptr_info.flags.vector_index) {
+        .none => offset: {
+            assert(ptr_info.packed_offset.host_size == 0); // legalize .expand_packed_load
+            break :offset 0;
+        },
+        else => |index| @intCast(@intFromEnum(index) * elem_ty.abiSize(zcu)),
+    };
+    const result = try cg.load(operand, elem_ty, offset);
     return cg.finishAir(inst, result, &.{ty_op.operand});
 }
 
@@ -2192,9 +2221,19 @@ fn airLoad(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
 fn load(cg: *CodeGen, operand: WValue, ty: Type, offset: u32) InnerError!WValue {
     const zcu = cg.pt.zcu;
     if (isByRef(ty, zcu, cg.target)) {
-        const val = try cg.allocStack(ty);
-        try cg.store(val, try operand.toLocal(cg, .usize), ty, 0);
-        return val;
+        const src_ptr_maybe_stack: WValue = switch (offset + operand.offset()) {
+            0 => operand,
+            else => |total_offset| ptr: {
+                try cg.emitWValue(operand);
+                try cg.addImm32(total_offset);
+                try cg.addTag(.i32_add);
+                break :ptr .stack;
+            },
+        };
+        const src_ptr = try src_ptr_maybe_stack.toLocal(cg, .usize);
+        const new_ptr = try cg.allocStack(ty);
+        try cg.store(new_ptr, src_ptr, ty, 0);
+        return new_ptr;
     }
 
     // load local's value from memory by its stack position
@@ -5236,6 +5275,39 @@ fn airUnreachable(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
     return cg.finishAir(inst, .none, &.{});
 }
 
+fn airNopCast(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
+    const zcu = cg.pt.zcu;
+    const ty_op = cg.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
+
+    const operand_ty = cg.typeOf(ty_op.operand);
+    const dest_ty = cg.typeOfIndex(inst);
+    assert(isByRef(operand_ty, zcu, cg.target) == isByRef(dest_ty, zcu, cg.target));
+    assert(operand_ty.abiSize(zcu) == dest_ty.abiSize(zcu));
+    assert(operand_ty.abiAlignment(zcu) == dest_ty.abiAlignment(zcu));
+
+    const operand = try cg.resolveInst(ty_op.operand);
+    const result = cg.reuseOperand(ty_op.operand, operand);
+    return cg.finishAir(inst, result, &.{ty_op.operand});
+}
+
+fn airIntFromPtr(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
+    const zcu = cg.pt.zcu;
+    const ty_op = cg.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
+
+    const operand_ty = cg.typeOf(ty_op.operand);
+    const dest_ty = cg.typeOfIndex(inst);
+    assert(isByRef(operand_ty, zcu, cg.target) == isByRef(dest_ty, zcu, cg.target));
+    assert(operand_ty.abiSize(zcu) == dest_ty.abiSize(zcu));
+    assert(operand_ty.abiAlignment(zcu) == dest_ty.abiAlignment(zcu));
+
+    const operand = try cg.resolveInst(ty_op.operand);
+    const result = switch (operand) {
+        .stack_offset => try cg.buildPointerOffset(operand, 0, .new),
+        else => cg.reuseOperand(ty_op.operand, operand),
+    };
+    return cg.finishAir(inst, result, &.{ty_op.operand});
+}
+
 fn airBitcast(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
     const ty_op = cg.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const operand = try cg.resolveInst(ty_op.operand);
@@ -6341,12 +6413,11 @@ fn airArrayElemVal(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
         }
     }
 
-    const elem_result = if (isByRef(elem_ty, zcu, cg.target))
+    const result = if (isByRef(elem_ty, zcu, cg.target))
         .stack
     else
         try cg.load(.stack, elem_ty, 0);
-
-    return cg.finishAir(inst, elem_result, &.{ bin_op.lhs, bin_op.rhs });
+    return cg.finishAir(inst, result, &.{ bin_op.lhs, bin_op.rhs });
 }
 
 fn airSplat(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
diff --git a/src/codegen/x86_64/CodeGen.zig b/src/codegen/x86_64/CodeGen.zig
@@ -57,13 +57,13 @@ pub fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features {
         .scalarize_shuffle_two,
         .scalarize_select,
 
-        .scalarize_bitcast_padded_elems,
+        .scalarize_bit_cast_padded_elems,
 
         //.unsplat_shift_rhs,
-        .reduce_one_elem_to_bitcast,
-        .splat_one_elem_to_bitcast,
+        .reduce_one_elem_to_bit_cast,
+        .splat_one_elem_to_bit_cast,
 
-        .expand_intcast_safe,
+        .expand_int_cast_safe,
         .expand_int_from_float_safe,
         .expand_int_from_float_optimized_safe,
         .expand_add_safe,
@@ -67434,7 +67434,15 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 };
                 try res[0].finish(inst, &.{ty_op.operand}, &ops, cg);
             },
-            .bitcast => try cg.airBitCast(inst),
+            .bit_cast,
+            .ptr_cast,
+            .ptr_from_int,
+            .int_from_ptr,
+            .error_cast,
+            .error_from_int,
+            .int_from_error,
+            .union_from_enum,
+            => try cg.airBitCast(inst),
             .block => {
                 const block = cg.air.unwrapBlock(inst);
                 if (!cg.mod.strip) try cg.asmPseudo(.pseudo_dbg_enter_block_none);
@@ -93375,7 +93383,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 };
                 try res[0].finish(inst, &.{ty_op.operand}, &ops, cg);
             },
-            .intcast => |air_tag| {
+            .int_cast => |air_tag| {
                 const ty_op = air_datas[@intFromEnum(inst)].ty_op;
                 const dst_ty = ty_op.ty.toType();
                 const src_ty = cg.typeOf(ty_op.operand);
@@ -98133,7 +98141,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 };
                 try res[0].finish(inst, &.{ty_op.operand}, &ops, cg);
             },
-            .intcast_safe => unreachable,
+            .int_cast_safe => unreachable,
             .trunc => |air_tag| {
                 const ty_op = air_datas[@intFromEnum(inst)].ty_op;
                 var ops = try cg.tempsFromOperands(inst, .{ty_op.operand});
@@ -104351,7 +104359,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 var ops = try cg.tempsFromOperands(inst, .{ bin_op.lhs, bin_op.rhs });
                 try ops[0].toSlicePtr(cg);
                 const dst_ty = ty_pl.ty.toType();
-                if (dst_ty.ptrInfo(zcu).flags.vector_index == .none) zero_offset: {
+                zero_offset: {
                     const elem_size = dst_ty.childType(zcu).abiSize(zcu);
                     if (hack_around_sema_opv_bugs and elem_size == 0) break :zero_offset;
                     while (true) for (&ops) |*op| {
@@ -179054,8 +179062,11 @@ fn genSetReg(
     const zcu = pt.zcu;
     const abi_size: u32 = @intCast(ty.abiSize(zcu));
     const dst_alias = registerAlias(dst_reg, @intCast(cg.unalignedSize(ty)));
-    if (ty.bitSize(zcu) > dst_alias.size().bitSize(cg.target))
-        return cg.fail("genSetReg called with a value larger than dst_reg", .{});
+    {
+        const ty_bit_size = if (ty.hasBitRepresentation(zcu)) ty.bitSize(zcu) else 8 * abi_size;
+        if (ty_bit_size > dst_alias.size().bitSize(cg.target))
+            return cg.fail("genSetReg called with a value larger than dst_reg", .{});
+    }
     switch (src_mcv) {
         .none,
         .unreach,
@@ -180128,14 +180139,19 @@ fn airBitCast(self: *CodeGen, inst: Air.Inst.Index) !void {
             break :dst dst_mcv;
         };
 
-        if (dst_ty.isRuntimeFloat()) break :result dst_mcv;
+        switch (dst_ty.zigTypeTag(zcu)) {
+            .float, .error_union, .error_set, .vector => break :result dst_mcv,
+            .@"struct", .@"union" => if (dst_ty.containerLayout(zcu) != .@"packed") break :result dst_mcv,
+            .optional, .pointer => if (!dst_ty.isPtrAtRuntime(zcu)) break :result dst_mcv,
+            else => {},
+        }
 
         if (dst_ty.isAbiInt(zcu) and src_ty.isAbiInt(zcu) and src_ty.zigTypeTag(zcu) != .@"struct" and
             dst_ty.intInfo(zcu).signedness == src_ty.intInfo(zcu).signedness) break :result dst_mcv;
 
         const abi_size = dst_ty.abiSize(zcu);
         const bit_size = dst_ty.bitSize(zcu);
-        if (abi_size * 8 <= bit_size or dst_ty.isVector(zcu)) break :result dst_mcv;
+        if (abi_size * 8 <= bit_size) break :result dst_mcv;
 
         const dst_limbs_len = std.math.divCeil(u31, @intCast(bit_size), 64) catch unreachable;
         const high_mcv: MCValue = switch (dst_mcv) {
@@ -182411,7 +182427,7 @@ fn truncateRegister(self: *CodeGen, ty: Type, reg: Register) !void {
     const zcu = pt.zcu;
     const int_info: InternPool.Key.IntType = if (ty.isAbiInt(zcu)) ty.intInfo(zcu) else .{
         .signedness = .unsigned,
-        .bits = @intCast(ty.bitSize(zcu)),
+        .bits = @intCast(if (ty.hasBitRepresentation(zcu)) ty.bitSize(zcu) else ty.abiSize(zcu) * 8),
     };
     const shift = std.math.cast(u6, 64 - int_info.bits % 64) orelse return;
     try self.spillEflagsIfOccupied();
@@ -182451,10 +182467,6 @@ fn regBitSize(self: *CodeGen, ty: Type) u64 {
     };
 }
 
-fn regExtraBits(self: *CodeGen, ty: Type) u64 {
-    return self.regBitSize(ty) - ty.bitSize(self.pt.zcu);
-}
-
 fn hasFeature(cg: *CodeGen, feature: std.Target.x86.Feature) bool {
     return switch (feature) {
         .@"64bit" => switch (cg.target.cpu.arch) {
@@ -182570,7 +182582,7 @@ fn nonBoolScalarBitSize(cg: *CodeGen, ty: Type) u32 {
             .bool_type => vector_type.len,
             else => @intCast(Type.fromInterned(vector_type.child).bitSize(zcu)),
         },
-        else => @intCast(ty.bitSize(zcu)),
+        else => if (ty.hasBitRepresentation(zcu) or ty.isAbiInt(zcu)) @intCast(ty.bitSize(zcu)) else @intCast(ty.abiSize(zcu) * 8),
     };
 }
 
@@ -192243,7 +192255,8 @@ const Select = struct {
                 .src0_bit_size => @intCast(s.cg.nonBoolScalarBitSize(Select.Operand.Ref.src0.typeOf(s))),
                 .@"8_size_sub_bit_size" => {
                     const ty = op.flags.base.ref.typeOf(s);
-                    break :lhs @intCast(8 * ty.abiSize(s.cg.pt.zcu) - ty.bitSize(s.cg.pt.zcu));
+                    const bit_size = s.cg.intInfo(ty).?.bits;
+                    break :lhs @intCast(8 * ty.abiSize(s.cg.pt.zcu) - bit_size);
                 },
                 .len => @intCast(op.flags.base.ref.typeOf(s).vectorLen(s.cg.pt.zcu)),
                 .elem_limbs => @intCast(@divExact(

	zig fork of https://codeberg.org/ziglang/zig
	Log \| Files \| Refs \| README \| LICENSE

M	CMakeLists.txt	\|	2	+-
M	src/Air.zig	\|	79	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M	src/Air/Legalize.zig	\|	207	+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M	src/Air/Liveness.zig	\|	13	++++++++++---
M	src/Air/Liveness/Verify.zig	\|	13	++++++++++---
A	src/Air/Verify.zig	\|	465	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/Air/print.zig	\|	13	++++++++++---
M	src/Sema.zig	\|	854	+++++++++++++++++++++++++++++++++++--------------------------------------------
M	src/Sema/LowerZon.zig	\|	15	++++-----------
D	src/Sema/bitcast.zig	\|	774	-------------------------------------------------------------------------------
M	src/Sema/comptime_ptr_access.zig	\|	321	+++++++++++++++++++++++++++++++++++++++++++------------------------------------
A	src/Sema/reinterpret.zig	\|	576	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/Type.zig	\|	153	++++++++++++++++++++++++-------------------------------------------------------
M	src/Value.zig	\|	269	++++++++++++++++++++++++++++++++++++++++---------------------------------------
M	src/Zcu/PerThread.zig	\|	4	++++
M	src/codegen/aarch64/Select.zig	\|	30	+++++++++++++++++++++++-------
M	src/codegen/aarch64/abi.zig	\|	4	++--
M	src/codegen/arm/abi.zig	\|	12	++++++------
M	src/codegen/c.zig	\|	374	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M	src/codegen/c/type.zig	\|	34	+++++++++++++++-------------------
M	src/codegen/llvm.zig	\|	253	+++++++++++++++++++++++++++++++++++++++++++------------------------------------
M	src/codegen/llvm/FuncGen.zig	\|	1046	+++++++++++++++++++++++++++++++++++--------------------------------------------
M	src/codegen/mips/abi.zig	\|	9	++++-----
M	src/codegen/riscv64/CodeGen.zig	\|	19	++++++++++++-------
M	src/codegen/riscv64/abi.zig	\|	11	+++++------
M	src/codegen/sparc64/CodeGen.zig	\|	15	+++++++++++----
M	src/codegen/spirv/CodeGen.zig	\|	95	++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
M	src/codegen/wasm/CodeGen.zig	\|	121	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
M	src/codegen/x86_64/CodeGen.zig	\|	51	++++++++++++++++++++++++++++++++-------------------