diff --git a/src-self-hosted/codegen.zig b/src-self-hosted/codegen.zig
index 36bebe1ca5..ca6d9d800b 100644
--- a/src-self-hosted/codegen.zig
+++ b/src-self-hosted/codegen.zig
@@ -14,6 +14,7 @@ const Allocator = mem.Allocator;
 const trace = @import("tracy.zig").trace;
 const DW = std.dwarf;
 const leb128 = std.debug.leb;
+const log = std.log.scoped(.codegen);
 
 // TODO Turn back on zig fmt when https://github.com/ziglang/zig/issues/5948 is implemented.
 // zig fmt: off
@@ -344,6 +345,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
 
         const Branch = struct {
             inst_table: std.AutoHashMapUnmanaged(*ir.Inst, MCValue) = .{},
+            /// The key must be a canonical register.
            registers: std.AutoHashMapUnmanaged(Register, RegisterAllocation) = .{},
             free_registers: FreeRegInt = math.maxInt(FreeRegInt),
 
@@ -381,9 +383,19 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                 self.free_registers &= ~(@as(FreeRegInt, 1) << free_index);
                 const reg = callee_preserved_regs[free_index];
                 self.registers.putAssumeCapacityNoClobber(reg, .{ .inst = inst });
+                log.debug("alloc {} => {*}", .{reg, inst});
                 return reg;
             }
 
+            /// Does not track the register.
+            fn findUnusedReg(self: *Branch) ?Register {
+                const free_index = @ctz(FreeRegInt, self.free_registers);
+                if (free_index >= callee_preserved_regs.len) {
+                    return null;
+                }
+                return callee_preserved_regs[free_index];
+            }
+
             fn deinit(self: *Branch, gpa: *Allocator) void {
                 self.inst_table.deinit(gpa);
                 self.registers.deinit(gpa);
@@ -570,8 +582,10 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         const branch = &self.branch_stack.items[self.branch_stack.items.len - 1];
         const inst_table = &branch.inst_table;
         for (body.instructions) |inst| {
-            const new_inst = try self.genFuncInst(inst);
-            try inst_table.putNoClobber(self.gpa, inst, new_inst);
+            const mcv = try self.genFuncInst(inst);
+            log.debug("{*} => {}", .{inst, mcv});
+            // TODO don't put void or dead things in here
+            try inst_table.putNoClobber(self.gpa, inst, mcv);
 
             var i: ir.Inst.DeathsBitIndex = 0;
             while (inst.getOperand(i)) |operand| : (i += 1) {
@@ -714,7 +728,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         return self.allocMem(inst, abi_size, abi_align);
     }
 
-    fn allocRegOrMem(self: *Self, inst: *ir.Inst) !MCValue {
+    fn allocRegOrMem(self: *Self, inst: *ir.Inst, reg_ok: bool) !MCValue {
         const elem_ty = inst.ty;
         const abi_size = math.cast(u32, elem_ty.abiSize(self.target.*)) catch {
             return self.fail(inst.src, "type '{}' too big to fit into stack frame", .{elem_ty});
         };
@@ -724,30 +738,73 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
             self.stack_align = abi_align;
 
         const branch = &self.branch_stack.items[self.branch_stack.items.len - 1];
-        // Make sure the type can fit in a register before we try to allocate one.
-        const ptr_bits = arch.ptrBitWidth();
-        const ptr_bytes: u64 = @divExact(ptr_bits, 8);
-        if (abi_size <= ptr_bytes) {
-            try branch.registers.ensureCapacity(self.gpa, branch.registers.items().len + 1);
-            if (branch.allocReg(inst)) |reg| {
-                return MCValue{ .register = registerAlias(reg, abi_size) };
+        if (reg_ok) {
+            // Make sure the type can fit in a register before we try to allocate one.
+            const ptr_bits = arch.ptrBitWidth();
+            const ptr_bytes: u64 = @divExact(ptr_bits, 8);
+            if (abi_size <= ptr_bytes) {
+                try branch.registers.ensureCapacity(self.gpa, branch.registers.items().len + 1);
+                if (branch.allocReg(inst)) |reg| {
+                    return MCValue{ .register = registerAlias(reg, abi_size) };
+                }
             }
         }
         const stack_offset = try self.allocMem(inst, abi_size, abi_align);
         return MCValue{ .stack_offset = stack_offset };
     }
 
-    /// Does not "move" the instruction.
-    fn copyToNewRegister(self: *Self, inst: *ir.Inst) !MCValue {
+    /// Copies a value to a register without tracking the register. The register is not considered
+    /// allocated. A second call to `copyToTmpRegister` may return the same register.
+    /// This can have a side effect of spilling instructions to the stack to free up a register.
+    fn copyToTmpRegister(self: *Self, src: usize, mcv: MCValue) !Register {
+        const branch = &self.branch_stack.items[self.branch_stack.items.len - 1];
+
+        const reg = branch.findUnusedReg() orelse b: {
+            // We'll take over the first register. Move the instruction that was previously
+            // there to a stack allocation.
+            const reg = callee_preserved_regs[0];
+            const regs_entry = branch.registers.remove(reg).?;
+            const spilled_inst = regs_entry.value.inst;
+
+            const stack_mcv = try self.allocRegOrMem(spilled_inst, false);
+            const inst_entry = branch.inst_table.getEntry(spilled_inst).?;
+            const reg_mcv = inst_entry.value;
+            assert(reg == toCanonicalReg(reg_mcv.register));
+            inst_entry.value = stack_mcv;
+            try self.genSetStack(src, spilled_inst.ty, stack_mcv.stack_offset, reg_mcv);
+
+            break :b reg;
+        };
+        try self.genSetReg(src, reg, mcv);
+        return reg;
+    }
+
+    /// Allocates a new register and copies `mcv` into it.
+    /// `reg_owner` is the instruction that gets associated with the register in the register table.
+    /// This can have a side effect of spilling instructions to the stack to free up a register.
+    fn copyToNewRegister(self: *Self, reg_owner: *ir.Inst, mcv: MCValue) !MCValue {
         const branch = &self.branch_stack.items[self.branch_stack.items.len - 1];
         try branch.registers.ensureCapacity(self.gpa, branch.registers.items().len + 1);
 
-        const reg = branch.allocReg(inst) orelse
-            return self.fail(inst.src, "TODO implement spilling register to stack", .{});
-        const old_mcv = branch.inst_table.get(inst).?;
-        const new_mcv: MCValue = .{ .register = reg };
-        try self.genSetReg(inst.src, reg, old_mcv);
-        return new_mcv;
+        const reg = branch.allocReg(reg_owner) orelse b: {
+            // We'll take over the first register. Move the instruction that was previously
+            // there to a stack allocation.
+            const reg = callee_preserved_regs[0];
+            const regs_entry = branch.registers.getEntry(reg).?;
+            const spilled_inst = regs_entry.value.inst;
+            regs_entry.value = .{ .inst = reg_owner };
+
+            const stack_mcv = try self.allocRegOrMem(spilled_inst, false);
+            const inst_entry = branch.inst_table.getEntry(spilled_inst).?;
+            const reg_mcv = inst_entry.value;
+            assert(reg == toCanonicalReg(reg_mcv.register));
+            inst_entry.value = stack_mcv;
+            try self.genSetStack(reg_owner.src, spilled_inst.ty, stack_mcv.stack_offset, reg_mcv);
+
+            break :b reg;
+        };
+        try self.genSetReg(reg_owner.src, reg, mcv);
+        return MCValue{ .register = reg };
     }
 
     fn genAlloc(self: *Self, inst: *ir.Inst.NoOp) !MCValue {
@@ -868,13 +925,29 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         }
     }
 
-    fn reuseOperand(inst: *ir.Inst, op_index: ir.Inst.DeathsBitIndex, mcv: MCValue) bool {
-        if (!inst.operandDies(op_index) or !mcv.isMutable())
+    fn reuseOperand(self: *Self, inst: *ir.Inst, op_index: ir.Inst.DeathsBitIndex, mcv: MCValue) bool {
+        if (!inst.operandDies(op_index))
             return false;
 
-        // OK we're going to do it, but we need to clear the operand death bit so that
-        // it stays allocated.
+        switch (mcv) {
+            .register => |reg| {
+                // If it's in the registers table, we need to associate the register with the
+                // new instruction.
+                const branch = &self.branch_stack.items[self.branch_stack.items.len - 1];
+                const entry = branch.registers.getEntry(toCanonicalReg(reg)).?;
+                entry.value = .{ .inst = inst };
+                log.debug("reusing {} => {*}", .{reg, inst});
+            },
+            .stack_offset => |off| {
+                log.debug("reusing stack offset {} => {*}", .{off, inst});
+                return true;
+            },
+            else => return false,
+        }
+
+        // Prevent the operand deaths processing code from deallocating it.
         inst.clearOperandDeath(op_index);
+
         return true;
     }
 
@@ -887,11 +960,11 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         if (inst.base.isUnused() and !is_volatile)
             return MCValue.dead;
         const dst_mcv: MCValue = blk: {
-            if (reuseOperand(&inst.base, 0, ptr)) {
+            if (self.reuseOperand(&inst.base, 0, ptr)) {
                 // The MCValue that holds the pointer can be re-used as the value.
                 break :blk ptr;
             } else {
-                break :blk try self.allocRegOrMem(&inst.base);
+                break :blk try self.allocRegOrMem(&inst.base, true);
             }
         };
         switch (ptr) {
@@ -985,23 +1058,23 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         var dst_mcv: MCValue = undefined;
         var src_mcv: MCValue = undefined;
         var src_inst: *ir.Inst = undefined;
-        if (reuseOperand(inst, 0, lhs)) {
+        if (self.reuseOperand(inst, 0, lhs)) {
             // LHS dies; use it as the destination.
             // Both operands cannot be memory.
             src_inst = op_rhs;
             if (lhs.isMemory() and rhs.isMemory()) {
-                dst_mcv = try self.copyToNewRegister(op_lhs);
+                dst_mcv = try self.copyToNewRegister(inst, lhs);
                 src_mcv = rhs;
             } else {
                 dst_mcv = lhs;
                 src_mcv = rhs;
             }
-        } else if (reuseOperand(inst, 1, rhs)) {
+        } else if (self.reuseOperand(inst, 1, rhs)) {
             // RHS dies; use it as the destination.
             // Both operands cannot be memory.
             src_inst = op_lhs;
             if (lhs.isMemory() and rhs.isMemory()) {
-                dst_mcv = try self.copyToNewRegister(op_rhs);
+                dst_mcv = try self.copyToNewRegister(inst, rhs);
                 src_mcv = lhs;
             } else {
                 dst_mcv = rhs;
@@ -1009,11 +1082,11 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
             }
         } else {
             if (lhs.isMemory()) {
-                dst_mcv = try self.copyToNewRegister(op_lhs);
+                dst_mcv = try self.copyToNewRegister(inst, lhs);
                 src_mcv = rhs;
                 src_inst = op_rhs;
             } else {
-                dst_mcv = try self.copyToNewRegister(op_rhs);
+                dst_mcv = try self.copyToNewRegister(inst, rhs);
                 src_mcv = lhs;
                 src_inst = op_lhs;
             }
@@ -1026,18 +1099,26 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         switch (src_mcv) {
            .immediate => |imm| {
                 if (imm > math.maxInt(u31)) {
-                    src_mcv = try self.copyToNewRegister(src_inst);
+                    src_mcv = MCValue{ .register = try self.copyToTmpRegister(src_inst.src, src_mcv) };
                 }
             },
             else => {},
         }
-        try self.genX8664BinMathCode(inst.src, dst_mcv, src_mcv, opx, mr);
+        try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, opx, mr);
 
         return dst_mcv;
     }
 
-    fn genX8664BinMathCode(self: *Self, src: usize, dst_mcv: MCValue, src_mcv: MCValue, opx: u8, mr: u8) !void {
+    fn genX8664BinMathCode(
+        self: *Self,
+        src: usize,
+        dst_ty: Type,
+        dst_mcv: MCValue,
+        src_mcv: MCValue,
+        opx: u8,
+        mr: u8,
+    ) !void {
         switch (dst_mcv) {
             .none => unreachable,
             .undef => unreachable,
@@ -1087,12 +1168,60 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                     },
                 }
             },
-            .embedded_in_code, .memory, .stack_offset => {
+            .stack_offset => |off| {
+                switch (src_mcv) {
+                    .none => unreachable,
+                    .undef => return self.genSetStack(src, dst_ty, off, .undef),
+                    .dead, .unreach => unreachable,
+                    .ptr_stack_offset => unreachable,
+                    .ptr_embedded_in_code => unreachable,
+                    .register => |src_reg| {
+                        try self.genX8664ModRMRegToStack(src, dst_ty, off, src_reg, mr + 0x1);
+                    },
+                    .immediate => |imm| {
+                        return self.fail(src, "TODO implement x86 ADD/SUB/CMP source immediate", .{});
+                    },
+                    .embedded_in_code, .memory, .stack_offset => {
+                        return self.fail(src, "TODO implement x86 ADD/SUB/CMP source memory", .{});
+                    },
+                    .compare_flags_unsigned => {
+                        return self.fail(src, "TODO implement x86 ADD/SUB/CMP source compare flag (unsigned)", .{});
+                    },
+                    .compare_flags_signed => {
+                        return self.fail(src, "TODO implement x86 ADD/SUB/CMP source compare flag (signed)", .{});
+                    },
+                }
+            },
+            .embedded_in_code, .memory => {
                 return self.fail(src, "TODO implement x86 ADD/SUB/CMP destination memory", .{});
             },
         }
     }
 
+    fn genX8664ModRMRegToStack(self: *Self, src: usize, ty: Type, off: u32, reg: Register, opcode: u8) !void {
+        const abi_size = ty.abiSize(self.target.*);
+        const adj_off = off + abi_size;
+        try self.code.ensureCapacity(self.code.items.len + 7);
+        self.rex(.{ .w = reg.size() == 64, .r = reg.isExtended() });
+        const reg_id: u8 = @truncate(u3, reg.id());
+        if (adj_off <= 128) {
+            // example: 48 89 55 7f  mov QWORD PTR [rbp+0x7f],rdx
+            const RM = @as(u8, 0b01_000_101) | (reg_id << 3);
+            const negative_offset = @intCast(i8, -@intCast(i32, adj_off));
+            const twos_comp = @bitCast(u8, negative_offset);
+            self.code.appendSliceAssumeCapacity(&[_]u8{ opcode, RM, twos_comp });
+        } else if (adj_off <= 2147483648) {
+            // example: 48 89 95 80 00 00 00  mov QWORD PTR [rbp+0x80],rdx
+            const RM = @as(u8, 0b10_000_101) | (reg_id << 3);
+            const negative_offset = @intCast(i32, -@intCast(i33, adj_off));
+            const twos_comp = @bitCast(u32, negative_offset);
+            self.code.appendSliceAssumeCapacity(&[_]u8{ opcode, RM });
+            mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), twos_comp);
+        } else {
+            return self.fail(src, "stack offset too large", .{});
+        }
+    }
+
     fn genArg(self: *Self, inst: *ir.Inst.Arg) !MCValue {
         if (FreeRegInt == u0) {
             return self.fail(inst.base.src, "TODO implement Register enum for {}", .{self.target.cpu.arch});
@@ -1109,7 +1238,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         const name_with_null = inst.name[0..mem.lenZ(inst.name) + 1];
         switch (result) {
             .register => |reg| {
-                branch.registers.putAssumeCapacityNoClobber(reg, .{ .inst = &inst.base });
+                branch.registers.putAssumeCapacityNoClobber(toCanonicalReg(reg), .{ .inst = &inst.base });
                 branch.markRegUsed(reg);
 
                 try self.dbg_info.ensureCapacity(self.dbg_info.items.len + 8 + name_with_null.len);
@@ -1304,13 +1433,13 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         // Either one, but not both, can be a memory operand.
        // Source operand can be an immediate, 8 bits or 32 bits.
         const dst_mcv = if (lhs.isImmediate() or (lhs.isMemory() and rhs.isMemory()))
-            try self.copyToNewRegister(inst.lhs)
+            try self.copyToNewRegister(&inst.base, lhs)
         else
             lhs;
         // This instruction supports only signed 32-bit immediates at most.
         const src_mcv = try self.limitImmediateType(inst.rhs, i32);
-        try self.genX8664BinMathCode(inst.base.src, dst_mcv, src_mcv, 7, 0x38);
+        try self.genX8664BinMathCode(inst.base.src, inst.base.ty, dst_mcv, src_mcv, 7, 0x38);
 
         const info = inst.lhs.ty.intInfo(self.target.*);
         if (info.signed) {
             return MCValue{ .compare_flags_signed = op };
@@ -1584,6 +1713,10 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
     /// resulting REX is meaningful, but will remain the same if it is not.
     /// * Deliberately inserting a "meaningless REX" requires explicit usage of
     ///   0x40, and cannot be done via this function.
+    /// W => 64-bit operand size
+    /// R => extension to the MODRM.reg field
+    /// X => extension to the SIB.index field
+    /// B => extension to the MODRM.rm field or the SIB.base field
     fn rex(self: *Self, arg: struct { b: bool = false, w: bool = false, x: bool = false, r: bool = false }) void {
         // From section 2.2.1.2 of the manual, REX is encoded as b0100WRXB.
         var value: u8 = 0x40;
@@ -1681,27 +1814,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                     return self.fail(src, "TODO implement set stack variable from embedded_in_code", .{});
                 },
                 .register => |reg| {
-                    const abi_size = ty.abiSize(self.target.*);
-                    const adj_off = stack_offset + abi_size;
-                    try self.code.ensureCapacity(self.code.items.len + 7);
-                    self.rex(.{ .w = reg.size() == 64, .b = reg.isExtended() });
-                    const reg_id: u8 = @truncate(u3, reg.id());
-                    if (adj_off <= 128) {
-                        // example: 48 89 55 7f  mov QWORD PTR [rbp+0x7f],rdx
-                        const RM = @as(u8, 0b01_000_101) | (reg_id << 3);
-                        const negative_offset = @intCast(i8, -@intCast(i32, adj_off));
-                        const twos_comp = @bitCast(u8, negative_offset);
-                        self.code.appendSliceAssumeCapacity(&[_]u8{ 0x89, RM, twos_comp });
-                    } else if (adj_off <= 2147483648) {
-                        // example: 48 89 95 80 00 00 00  mov QWORD PTR [rbp+0x80],rdx
-                        const RM = @as(u8, 0b10_000_101) | (reg_id << 3);
-                        const negative_offset = @intCast(i32, -@intCast(i33, adj_off));
-                        const twos_comp = @bitCast(u32, negative_offset);
-                        self.code.appendSliceAssumeCapacity(&[_]u8{ 0x89, RM });
-                        mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), twos_comp);
-                    } else {
-                        return self.fail(src, "stack offset too large", .{});
-                    }
+                    try self.genX8664ModRMRegToStack(src, ty, stack_offset, reg, 0x89);
                 },
                 .memory => |vaddr| {
                     return self.fail(src, "TODO implement set stack variable from memory vaddr", .{});
@@ -1709,7 +1822,9 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                .stack_offset => |off| {
                     if (stack_offset == off)
                         return; // Copy stack variable to itself; nothing to do.
-                    return self.fail(src, "TODO implement copy stack variable to stack variable", .{});
+
+                    const reg = try self.copyToTmpRegister(src, mcv);
+                    return self.genSetStack(src, ty, stack_offset, MCValue{ .register = reg });
                 },
             },
             else => return self.fail(src, "TODO implement getSetStack for {}", .{self.target.cpu.arch}),
@@ -2027,7 +2142,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                         },
                     });
                     if (imm >= math.maxInt(U)) {
-                        return self.copyToNewRegister(inst);
+                        return MCValue{ .register = try self.copyToTmpRegister(inst.src, mcv) };
                     }
                 },
                 else => {},
diff --git a/test/stage2/test.zig b/test/stage2/test.zig
index 11b1713181..791a073393 100644
--- a/test/stage2/test.zig
+++ b/test/stage2/test.zig
@@ -600,6 +600,58 @@ pub fn addCases(ctx: *TestContext) !void {
             "",
         );
 
+        // Spilling registers to the stack.
+        case.addCompareOutput(
+            \\export fn _start() noreturn {
+            \\    assert(add(3, 4) == 791);
+            \\
+            \\    exit();
+            \\}
+            \\
+            \\fn add(a: u32, b: u32) u32 {
+            \\    const x: u32 = blk: {
+            \\        const c = a + b; // 7
+            \\        const d = a + c; // 10
+            \\        const e = d + b; // 14
+            \\        const f = d + e; // 24
+            \\        const g = e + f; // 38
+            \\        const h = f + g; // 62
+            \\        const i = g + h; // 100
+            \\        const j = i + d; // 110
+            \\        const k = i + j; // 210
+            \\        const l = k + c; // 217
+            \\        const m = l + d; // 227
+            \\        const n = m + e; // 241
+            \\        const o = n + f; // 265
+            \\        const p = o + g; // 303
+            \\        const q = p + h; // 365
+            \\        const r = q + i; // 465
+            \\        const s = r + j; // 575
+            \\        const t = s + k; // 785
+            \\        break :blk t;
+            \\    };
+            \\    const y = x + a; // 788
+            \\    const z = y + a; // 791
+            \\    return z;
+            \\}
+            \\
+            \\pub fn assert(ok: bool) void {
+            \\    if (!ok) unreachable; // assertion failure
+            \\}
+            \\
+            \\fn exit() noreturn {
+            \\    asm volatile ("syscall"
+            \\        :
+            \\        : [number] "{rax}" (231),
+            \\          [arg1] "{rdi}" (0)
+            \\        : "rcx", "r11", "memory"
+            \\    );
+            \\    unreachable;
+            \\}
+        ,
+            "",
+        );
+
         // Character literals and multiline strings.
         case.addCompareOutput(
             \\export fn _start() noreturn {
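
The patch's register bookkeeping rests on a bitmask plus a table: a callee-preserved register is either free (its bit is set in free_registers) or owned by the instruction recorded in Branch.registers, and copyToNewRegister/copyToTmpRegister differ only in whether the chosen register gets tracked. Below is a minimal standalone sketch of the bitmask side, written against the same 0.6-era builtins the patch uses (two-argument @ctz); the RegPool name and the five register ids are illustrative assumptions, not part of the patch.

    const std = @import("std");
    const assert = std.debug.assert;

    // Illustrative stand-ins: five callee-preserved register ids, one mask bit each.
    const callee_preserved_regs = [_]u8{ 3, 12, 13, 14, 15 };
    const FreeRegInt = u5; // one bit per entry of callee_preserved_regs

    const RegPool = struct {
        /// Bit N set means callee_preserved_regs[N] is free.
        free_registers: FreeRegInt = std.math.maxInt(FreeRegInt),

        /// Like Branch.allocReg: claim the lowest free register, or return null
        /// so the caller knows it must spill something to the stack.
        fn allocReg(self: *RegPool) ?u8 {
            const free_index = @ctz(FreeRegInt, self.free_registers);
            if (free_index >= callee_preserved_regs.len) return null;
            self.free_registers &= ~(@as(FreeRegInt, 1) << free_index);
            return callee_preserved_regs[free_index];
        }

        /// Like Branch.findUnusedReg: peek without claiming, so two consecutive
        /// calls may hand out the same register (the copyToTmpRegister contract).
        fn findUnusedReg(self: *RegPool) ?u8 {
            const free_index = @ctz(FreeRegInt, self.free_registers);
            if (free_index >= callee_preserved_regs.len) return null;
            return callee_preserved_regs[free_index];
        }
    };

    test "exhausting the pool is what triggers the spill path" {
        var pool = RegPool{};
        var i: usize = 0;
        while (i < callee_preserved_regs.len) : (i += 1) {
            assert(pool.allocReg() != null);
        }
        // No bits left: this is the point where copyToNewRegister evicts the
        // instruction in callee_preserved_regs[0] via allocRegOrMem(inst, false).
        assert(pool.allocReg() == null);
        assert(pool.findUnusedReg() == null);
    }

Keeping one bit per register means both "is anything free?" and "which register?" fall out of a single @ctz, which is why allocReg and findUnusedReg share their first two lines.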
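
The new genX8664ModRMRegToStack helper picks between the two [rbp+disp] ModRM forms: mod=01 with a sign-extended 8-bit displacement when -adj_off fits in an i8 (adj_off <= 128), otherwise mod=10 with a 32-bit displacement; rm=101 selects rbp as the base in both. Here is a self-contained sketch of just that decision, hard-coding the 0x89 MOV opcode and REX.W for a non-extended 64-bit register; the encodeMovRegToRbpSlot helper is a hypothetical stand-in for illustration, not the patch's API.

    const std = @import("std");
    const mem = std.mem;
    const assert = std.debug.assert;

    /// Encodes `mov [rbp - adj_off], reg64` for a non-extended register id,
    /// mirroring the mod=01 (disp8) / mod=10 (disp32) split in the patch.
    fn encodeMovRegToRbpSlot(buf: *[7]u8, reg_id: u3, adj_off: u32) []u8 {
        buf[0] = 0x48; // REX.W = 1: 64-bit operand size
        buf[1] = 0x89; // MOV r/m64, r64
        if (adj_off <= 128) {
            // mod=01 rm=101: [rbp + disp8]; the disp byte holds the negative offset
            buf[2] = @as(u8, 0b01_000_101) | (@as(u8, reg_id) << 3);
            buf[3] = @bitCast(u8, @intCast(i8, -@intCast(i32, adj_off)));
            return buf[0..4];
        }
        assert(adj_off <= 2147483648); // -adj_off must still fit in an i32
        // mod=10 rm=101: [rbp + disp32]
        buf[2] = @as(u8, 0b10_000_101) | (@as(u8, reg_id) << 3);
        mem.writeIntLittle(u32, buf[3..7], @bitCast(u32, @intCast(i32, -@intCast(i33, adj_off))));
        return buf[0..7];
    }

    test "disp8 form at a small offset" {
        var buf: [7]u8 = undefined;
        // 48 89 55 f9    mov QWORD PTR [rbp-0x7],rdx   (rdx is register id 2)
        const bytes = encodeMovRegToRbpSlot(&buf, 2, 7);
        assert(mem.eql(u8, bytes, &[_]u8{ 0x48, 0x89, 0x55, 0xf9 }));
    }

The 2147483648 bound is deliberate: the encoded displacement is -adj_off, and -2147483648 is exactly the most negative value an i32 can hold.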