x86_64: implement mul, div, and mod of large integers

This enables the last compiler-rt test that was still disabled for the x86_64 backend.
Author: Jacob Young
Date:   2024-02-05 05:31:57 +01:00
Parent: c3eb592a34
Commit: 6235762c09

6 changed files with 375 additions and 17 deletions
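In short: multiplication of wide integers (beyond the existing 128-bit paths) is now lowered inline as a limb-by-limb "schoolbook" loop over 64-bit limbs, while unsigned division and modulus are lowered to calls into compiler-rt's __udivei4/__umodei4, which is why their test can be re-enabled. As orientation for the CodeGen diff below, here is a minimal Zig sketch, not part of the commit, of what the emitted multiply loop computes; the helper name and test are illustrative only:

```zig
const std = @import("std");

/// Sketch of the "schoolbook" multiply the generated loop performs on
/// little-endian u64 limbs; the product is truncated to dst.len limbs,
/// matching mul/mul_wrap semantics.
fn mulLimbs(dst: []u64, lhs: []const u64, rhs: []const u64) void {
    std.debug.assert(dst.len == lhs.len and dst.len == rhs.len);
    @memset(dst, 0); // the generated code zeroes dst via genInlineMemset
    for (rhs, 0..) |r, i| {
        if (r == 0) continue; // matches the emitted test/jz that skips zero limbs
        var carry: u64 = 0; // the machine code splits this across rdx and cl/ch
        for (lhs[0 .. lhs.len - i], i..) |l, k| {
            const wide = @as(u128, l) * r + carry + dst[k];
            dst[k] = @truncate(wide);
            carry = @intCast(wide >> 64);
        }
        // Limbs past dst.len are dropped here (wrapping); the
        // airMulWithOverflow variant instead inspects them to set the
        // overflow bit.
    }
}

test mulLimbs {
    var dst: [4]u64 = undefined;
    // (1 << 64) * (1 << 64) == 1 << 128, i.e. limb 2 becomes 1.
    mulLimbs(&dst, &.{ 0, 1, 0, 0 }, &.{ 0, 1, 0, 0 });
    try std.testing.expectEqualSlices(u64, &.{ 0, 0, 1, 0 }, &dst);
}
```

Because the emitted loop parks its two carry bits in cl and ch between adc chains, rcx now has to be spilled and locked alongside rax and rdx, which is what the airMulDivBinOp, airMulSat, and airMulWithOverflow hunks below change.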


@@ -130,7 +130,6 @@ pub fn __umodei4(r_p: [*]u32, u_p: [*]const u32, v_p: [*]const u32, bits: usize)
test "__udivei4/__umodei4" {
if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
- if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
const RndGen = std.Random.DefaultPrng;
var rnd = RndGen.init(42);


@@ -3008,7 +3008,7 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
try self.genCopy(dst_ty, dst_mcv, src_mcv);
break :dst dst_mcv;
} else return self.fail("TODO implement trunc from {} to {}", .{ src_ty.fmt(mod), dst_ty.fmt(mod) });
} else try self.allocRegOrMem(inst, true);
if (dst_ty.zigTypeTag(mod) == .Vector) {
assert(src_ty.zigTypeTag(mod) == .Vector and dst_ty.vectorLen(mod) == src_ty.vectorLen(mod));
@@ -3429,7 +3429,10 @@ fn airMulDivBinOp(self: *Self, inst: Air.Inst.Index) !void {
};
try self.spillEflagsIfOccupied();
- try self.spillRegisters(&.{ .rax, .rdx });
+ try self.spillRegisters(&.{ .rax, .rcx, .rdx });
+ const reg_locks = self.register_manager.lockRegsAssumeUnused(3, .{ .rax, .rcx, .rdx });
+ defer for (reg_locks) |lock| self.register_manager.unlockReg(lock);
const lhs_mcv = try self.resolveInst(bin_op.lhs);
const rhs_mcv = try self.resolveInst(bin_op.rhs);
break :result try self.genMulDivBinOp(tag, inst, dst_ty, src_ty, lhs_mcv, rhs_mcv);
@@ -3685,9 +3688,9 @@ fn airMulSat(self: *Self, inst: Air.Inst.Index) !void {
.{ty.fmt(mod)},
);
- try self.spillRegisters(&.{ .rax, .rdx });
- const reg_locks = self.register_manager.lockRegs(2, .{ .rax, .rdx });
- defer for (reg_locks) |reg_lock| if (reg_lock) |lock| self.register_manager.unlockReg(lock);
+ try self.spillRegisters(&.{ .rax, .rcx, .rdx });
+ const reg_locks = self.register_manager.lockRegsAssumeUnused(3, .{ .rax, .rcx, .rdx });
+ defer for (reg_locks) |lock| self.register_manager.unlockReg(lock);
const lhs_mcv = try self.resolveInst(bin_op.lhs);
const lhs_lock = switch (lhs_mcv) {
@@ -3950,11 +3953,154 @@ fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
.Vector => return self.fail("TODO implement airMulWithOverflow for {}", .{dst_ty.fmt(mod)}),
.Int => result: {
const dst_info = dst_ty.intInfo(mod);
if (dst_info.bits > 128 and dst_info.signedness == .unsigned) {
const slow_inc = self.hasFeature(.slow_incdec);
const abi_size: u32 = @intCast(dst_ty.abiSize(mod));
const limb_len = std.math.divCeil(u32, abi_size, 8) catch unreachable;
try self.spillRegisters(&.{ .rax, .rcx, .rdx });
const reg_locks = self.register_manager.lockRegsAssumeUnused(3, .{ .rax, .rcx, .rdx });
defer for (reg_locks) |lock| self.register_manager.unlockReg(lock);
const dst_mcv = try self.allocRegOrMem(inst, false);
try self.genInlineMemset(
dst_mcv.address(),
.{ .immediate = 0 },
.{ .immediate = tuple_ty.abiSize(mod) },
);
const lhs_mcv = try self.resolveInst(bin_op.lhs);
const rhs_mcv = try self.resolveInst(bin_op.rhs);
const temp_regs = try self.register_manager.allocRegs(
4,
.{ null, null, null, null },
abi.RegisterClass.gp,
);
const temp_locks = self.register_manager.lockRegsAssumeUnused(4, temp_regs);
defer for (temp_locks) |lock| self.register_manager.unlockReg(lock);
try self.asmRegisterRegister(.{ ._, .xor }, temp_regs[0].to32(), temp_regs[0].to32());
const outer_loop: Mir.Inst.Index = @intCast(self.mir_instructions.len);
try self.asmRegisterMemory(.{ ._, .mov }, temp_regs[1].to64(), .{
.base = .{ .frame = rhs_mcv.load_frame.index },
.mod = .{ .rm = .{
.size = .qword,
.index = temp_regs[0].to64(),
.scale = .@"8",
} },
});
try self.asmRegisterRegister(.{ ._, .@"test" }, temp_regs[1].to64(), temp_regs[1].to64());
const skip_inner = try self.asmJccReloc(.z, undefined);
try self.asmRegisterRegister(.{ ._, .xor }, temp_regs[2].to32(), temp_regs[2].to32());
try self.asmRegisterRegister(.{ ._, .mov }, temp_regs[3].to32(), temp_regs[0].to32());
try self.asmRegisterRegister(.{ ._, .xor }, .ecx, .ecx);
try self.asmRegisterRegister(.{ ._, .xor }, .edx, .edx);
const inner_loop: Mir.Inst.Index = @intCast(self.mir_instructions.len);
try self.asmRegisterImmediate(.{ ._r, .sh }, .cl, Immediate.u(1));
try self.asmMemoryRegister(.{ ._, .adc }, .{
.base = .{ .frame = dst_mcv.load_frame.index },
.mod = .{ .rm = .{
.size = .qword,
.index = temp_regs[3].to64(),
.scale = .@"8",
.disp = @intCast(tuple_ty.structFieldOffset(0, mod)),
} },
}, .rdx);
try self.asmSetccRegister(.c, .cl);
try self.asmRegisterMemory(.{ ._, .mov }, .rax, .{
.base = .{ .frame = lhs_mcv.load_frame.index },
.mod = .{ .rm = .{
.size = .qword,
.index = temp_regs[2].to64(),
.scale = .@"8",
} },
});
try self.asmRegister(.{ ._, .mul }, temp_regs[1].to64());
try self.asmRegisterImmediate(.{ ._r, .sh }, .ch, Immediate.u(1));
try self.asmMemoryRegister(.{ ._, .adc }, .{
.base = .{ .frame = dst_mcv.load_frame.index },
.mod = .{ .rm = .{
.size = .qword,
.index = temp_regs[3].to64(),
.scale = .@"8",
.disp = @intCast(tuple_ty.structFieldOffset(0, mod)),
} },
}, .rax);
try self.asmSetccRegister(.c, .ch);
if (slow_inc) {
try self.asmRegisterImmediate(.{ ._, .add }, temp_regs[2].to32(), Immediate.u(1));
try self.asmRegisterImmediate(.{ ._, .add }, temp_regs[3].to32(), Immediate.u(1));
} else {
try self.asmRegister(.{ ._, .inc }, temp_regs[2].to32());
try self.asmRegister(.{ ._, .inc }, temp_regs[3].to32());
}
try self.asmRegisterImmediate(
.{ ._, .cmp },
temp_regs[3].to32(),
Immediate.u(limb_len),
);
_ = try self.asmJccReloc(.b, inner_loop);
try self.asmRegisterRegister(.{ ._, .@"or" }, .rdx, .rcx);
const overflow = try self.asmJccReloc(.nz, undefined);
const overflow_loop: Mir.Inst.Index = @intCast(self.mir_instructions.len);
try self.asmRegisterImmediate(
.{ ._, .cmp },
temp_regs[2].to32(),
Immediate.u(limb_len),
);
const no_overflow = try self.asmJccReloc(.nb, undefined);
if (slow_inc) {
try self.asmRegisterImmediate(.{ ._, .add }, temp_regs[2].to32(), Immediate.u(1));
} else {
try self.asmRegister(.{ ._, .inc }, temp_regs[2].to32());
}
try self.asmMemoryImmediate(.{ ._, .cmp }, .{
.base = .{ .frame = lhs_mcv.load_frame.index },
.mod = .{ .rm = .{
.size = .qword,
.index = temp_regs[2].to64(),
.scale = .@"8",
.disp = -8,
} },
}, Immediate.u(0));
_ = try self.asmJccReloc(.z, overflow_loop);
try self.performReloc(overflow);
try self.asmMemoryImmediate(.{ ._, .mov }, .{
.base = .{ .frame = dst_mcv.load_frame.index },
.mod = .{ .rm = .{
.size = .byte,
.disp = @intCast(tuple_ty.structFieldOffset(1, mod)),
} },
}, Immediate.u(1));
try self.performReloc(no_overflow);
try self.performReloc(skip_inner);
if (slow_inc) {
try self.asmRegisterImmediate(.{ ._, .add }, temp_regs[0].to32(), Immediate.u(1));
} else {
try self.asmRegister(.{ ._, .inc }, temp_regs[0].to32());
}
try self.asmRegisterImmediate(
.{ ._, .cmp },
temp_regs[0].to32(),
Immediate.u(limb_len),
);
_ = try self.asmJccReloc(.b, outer_loop);
break :result dst_mcv;
}
const lhs_active_bits = self.activeIntBits(bin_op.lhs);
const rhs_active_bits = self.activeIntBits(bin_op.rhs);
const src_bits = @max(lhs_active_bits, rhs_active_bits, dst_info.bits / 2);
const src_ty = try mod.intType(dst_info.signedness, src_bits);
if (src_bits > 64 and src_bits <= 128 and
dst_info.bits > 64 and dst_info.bits <= 128) switch (dst_info.signedness) {
.signed => {
@@ -4110,7 +4256,9 @@ fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
};
try self.spillEflagsIfOccupied();
- try self.spillRegisters(&.{ .rax, .rdx });
+ try self.spillRegisters(&.{ .rax, .rcx, .rdx });
+ const reg_locks = self.register_manager.lockRegsAssumeUnused(3, .{ .rax, .rcx, .rdx });
+ defer for (reg_locks) |lock| self.register_manager.unlockReg(lock);
const cc: Condition = switch (dst_info.signedness) {
.unsigned => .c,
@@ -8053,13 +8201,14 @@ fn genMulDivBinOp(
const src_abi_size: u32 = @intCast(src_ty.abiSize(mod));
assert(self.register_manager.isRegFree(.rax));
+ assert(self.register_manager.isRegFree(.rcx));
assert(self.register_manager.isRegFree(.rdx));
assert(self.eflags_inst == null);
if (dst_abi_size == 16 and src_abi_size == 16) {
assert(tag == .mul or tag == .mul_wrap);
- const reg_locks = self.register_manager.lockRegsAssumeUnused(2, .{ .rax, .rdx });
- defer for (reg_locks) |lock| self.register_manager.unlockReg(lock);
+ const reg_locks = self.register_manager.lockRegs(2, .{ .rax, .rdx });
+ defer for (reg_locks) |reg_lock| if (reg_lock) |lock| self.register_manager.unlockReg(lock);
const mat_lhs_mcv = switch (lhs_mcv) {
.load_symbol => mat_lhs_mcv: {
@@ -8124,10 +8273,171 @@ fn genMulDivBinOp(
else => unreachable,
.mul, .mul_wrap => dst_abi_size != src_abi_size and dst_abi_size != src_abi_size * 2,
.div_trunc, .div_floor, .div_exact, .rem, .mod => dst_abi_size != src_abi_size,
- } or src_abi_size > 8) return self.fail(
- "TODO implement genMulDivBinOp for {s} from {} to {}",
- .{ @tagName(tag), src_ty.fmt(mod), dst_ty.fmt(mod) },
- );
} or src_abi_size > 8) {
const src_info = src_ty.intInfo(mod);
switch (tag) {
.mul, .mul_wrap => {
const slow_inc = self.hasFeature(.slow_incdec);
const limb_len = std.math.divCeil(u32, src_abi_size, 8) catch unreachable;
try self.spillRegisters(&.{ .rax, .rcx, .rdx });
const reg_locks = self.register_manager.lockRegs(3, .{ .rax, .rcx, .rdx });
defer for (reg_locks) |reg_lock| if (reg_lock) |lock|
self.register_manager.unlockReg(lock);
const dst_mcv = try self.allocRegOrMemAdvanced(dst_ty, maybe_inst, false);
try self.genInlineMemset(
dst_mcv.address(),
.{ .immediate = 0 },
.{ .immediate = src_abi_size },
);
const temp_regs = try self.register_manager.allocRegs(
4,
.{ null, null, null, null },
abi.RegisterClass.gp,
);
const temp_locks = self.register_manager.lockRegs(4, temp_regs);
defer for (temp_locks) |temp_lock| if (temp_lock) |lock|
self.register_manager.unlockReg(lock);
try self.asmRegisterRegister(.{ ._, .xor }, temp_regs[0].to32(), temp_regs[0].to32());
const outer_loop: Mir.Inst.Index = @intCast(self.mir_instructions.len);
try self.asmRegisterMemory(.{ ._, .mov }, temp_regs[1].to64(), .{
.base = .{ .frame = rhs_mcv.load_frame.index },
.mod = .{ .rm = .{
.size = .qword,
.index = temp_regs[0].to64(),
.scale = .@"8",
} },
});
try self.asmRegisterRegister(.{ ._, .@"test" }, temp_regs[1].to64(), temp_regs[1].to64());
const skip_inner = try self.asmJccReloc(.z, undefined);
try self.asmRegisterRegister(.{ ._, .xor }, temp_regs[2].to32(), temp_regs[2].to32());
try self.asmRegisterRegister(.{ ._, .mov }, temp_regs[3].to32(), temp_regs[0].to32());
try self.asmRegisterRegister(.{ ._, .xor }, .ecx, .ecx);
try self.asmRegisterRegister(.{ ._, .xor }, .edx, .edx);
const inner_loop: Mir.Inst.Index = @intCast(self.mir_instructions.len);
try self.asmRegisterImmediate(.{ ._r, .sh }, .cl, Immediate.u(1));
try self.asmMemoryRegister(.{ ._, .adc }, .{
.base = .{ .frame = dst_mcv.load_frame.index },
.mod = .{ .rm = .{
.size = .qword,
.index = temp_regs[3].to64(),
.scale = .@"8",
} },
}, .rdx);
try self.asmSetccRegister(.c, .cl);
try self.asmRegisterMemory(.{ ._, .mov }, .rax, .{
.base = .{ .frame = lhs_mcv.load_frame.index },
.mod = .{ .rm = .{
.size = .qword,
.index = temp_regs[2].to64(),
.scale = .@"8",
} },
});
try self.asmRegister(.{ ._, .mul }, temp_regs[1].to64());
try self.asmRegisterImmediate(.{ ._r, .sh }, .ch, Immediate.u(1));
try self.asmMemoryRegister(.{ ._, .adc }, .{
.base = .{ .frame = dst_mcv.load_frame.index },
.mod = .{ .rm = .{
.size = .qword,
.index = temp_regs[3].to64(),
.scale = .@"8",
} },
}, .rax);
try self.asmSetccRegister(.c, .ch);
if (slow_inc) {
try self.asmRegisterImmediate(.{ ._, .add }, temp_regs[2].to32(), Immediate.u(1));
try self.asmRegisterImmediate(.{ ._, .add }, temp_regs[3].to32(), Immediate.u(1));
} else {
try self.asmRegister(.{ ._, .inc }, temp_regs[2].to32());
try self.asmRegister(.{ ._, .inc }, temp_regs[3].to32());
}
try self.asmRegisterImmediate(
.{ ._, .cmp },
temp_regs[3].to32(),
Immediate.u(limb_len),
);
_ = try self.asmJccReloc(.b, inner_loop);
try self.performReloc(skip_inner);
if (slow_inc) {
try self.asmRegisterImmediate(.{ ._, .add }, temp_regs[0].to32(), Immediate.u(1));
} else {
try self.asmRegister(.{ ._, .inc }, temp_regs[0].to32());
}
try self.asmRegisterImmediate(
.{ ._, .cmp },
temp_regs[0].to32(),
Immediate.u(limb_len),
);
_ = try self.asmJccReloc(.b, outer_loop);
return dst_mcv;
},
.div_trunc, .div_floor, .div_exact, .rem, .mod => switch (src_info.signedness) {
.signed => {},
.unsigned => {
const dst_mcv = try self.allocRegOrMemAdvanced(dst_ty, maybe_inst, false);
const manyptr_u32_ty = try mod.ptrType(.{
.child = .u32_type,
.flags = .{
.size = .Many,
},
});
const manyptr_const_u32_ty = try mod.ptrType(.{
.child = .u32_type,
.flags = .{
.size = .Many,
.is_const = true,
},
});
_ = try self.genCall(.{ .lib = .{
.return_type = .void_type,
.param_types = &.{
manyptr_u32_ty.toIntern(),
manyptr_const_u32_ty.toIntern(),
manyptr_const_u32_ty.toIntern(),
.usize_type,
},
.callee = switch (tag) {
.div_trunc,
.div_floor,
.div_exact,
=> "__udivei4",
.rem,
.mod,
=> "__umodei4",
else => unreachable,
},
} }, &.{
manyptr_u32_ty,
manyptr_const_u32_ty,
manyptr_const_u32_ty,
Type.usize,
}, &.{
dst_mcv.address(),
lhs_mcv.address(),
rhs_mcv.address(),
.{ .immediate = src_info.bits },
});
return dst_mcv;
},
},
else => {},
}
return self.fail(
"TODO implement genMulDivBinOp for {s} from {} to {}",
.{ @tagName(tag), src_ty.fmt(mod), dst_ty.fmt(mod) },
);
}
const ty = if (dst_abi_size <= 8) dst_ty else src_ty;
const abi_size = if (dst_abi_size <= 8) dst_abi_size else src_abi_size;
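The unsigned division path above does not emit a division loop at all; it builds a call into compiler-rt, passing the operands as arrays of u32 limbs plus the bit width, with div_trunc/div_floor/div_exact all mapping to __udivei4 (these coincide for unsigned operands) and rem/mod to __umodei4. A sketch of the equivalent call at the Zig level, using the parameter types visible in the genCall above (the wrapper function is illustrative, not part of the commit):

```zig
// Signatures as in compiler-rt's udivmodei4 routines; the result is
// written through the first pointer.
extern fn __udivei4(r: [*]u32, u: [*]const u32, v: [*]const u32, bits: usize) void;
extern fn __umodei4(r: [*]u32, u: [*]const u32, v: [*]const u32, bits: usize) void;

fn divTrunc256(a: *const u256, b: *const u256) u256 {
    var q: u256 = undefined;
    // The backend passes dst_mcv.address(), lhs_mcv.address(),
    // rhs_mcv.address(), and src_info.bits, just like this.
    __udivei4(@ptrCast(&q), @ptrCast(a), @ptrCast(b), 256);
    return q;
}
```

Note that the signed case (`.signed => {}`) deliberately falls through to the TODO failure for now; only unsigned wide division and modulus are handled.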


@@ -232,8 +232,7 @@ pub const Mnemonic = enum {
cmps, cmpsb, cmpsd, cmpsq, cmpsw,
cmpxchg, cmpxchg8b, cmpxchg16b,
cpuid, cqo, cwd, cwde,
- div,
- idiv, imul, int3,
+ dec, div, idiv, imul, inc, int3,
ja, jae, jb, jbe, jc, jrcxz, je, jg, jge, jl, jle, jna, jnae, jnb, jnbe,
jnc, jne, jng, jnge, jnl, jnle, jno, jnp, jns, jnz, jo, jp, jpe, jpo, js, jz,
jmp,


@@ -325,6 +325,8 @@ pub const Inst = struct {
cwd,
/// Convert word to doubleword
cwde,
/// Decrement by 1
dec,
/// Unsigned division
/// Signed division
/// Divide packed single-precision floating-point values
@@ -332,7 +334,9 @@ pub const Inst = struct {
/// Divide packed double-precision floating-point values
/// Divide scalar double-precision floating-point values
div,
/// Increment by 1
inc,
/// Call to interrupt procedure
int3,
/// Conditional jump
j,


@@ -269,6 +269,12 @@ pub const table = [_]Entry{
.{ .cpuid, .zo, &.{}, &.{ 0x0f, 0xa2 }, 0, .none, .none },
.{ .dec, .m, &.{ .rm8 }, &.{ 0xfe }, 1, .none, .none },
.{ .dec, .m, &.{ .rm8 }, &.{ 0xfe }, 1, .rex, .none },
.{ .dec, .m, &.{ .rm16 }, &.{ 0xff }, 1, .short, .none },
.{ .dec, .m, &.{ .rm32 }, &.{ 0xff }, 1, .none, .none },
.{ .dec, .m, &.{ .rm64 }, &.{ 0xff }, 1, .long, .none },
.{ .div, .m, &.{ .rm8 }, &.{ 0xf6 }, 6, .none, .none },
.{ .div, .m, &.{ .rm8 }, &.{ 0xf6 }, 6, .rex, .none },
.{ .div, .m, &.{ .rm16 }, &.{ 0xf7 }, 6, .short, .none },
@@ -296,6 +302,12 @@ pub const table = [_]Entry{
.{ .imul, .rmi, &.{ .r32, .rm32, .imm32 }, &.{ 0x69 }, 0, .none, .none },
.{ .imul, .rmi, &.{ .r64, .rm64, .imm32 }, &.{ 0x69 }, 0, .long, .none },
.{ .inc, .m, &.{ .rm8 }, &.{ 0xfe }, 0, .none, .none },
.{ .inc, .m, &.{ .rm8 }, &.{ 0xfe }, 0, .rex, .none },
.{ .inc, .m, &.{ .rm16 }, &.{ 0xff }, 0, .short, .none },
.{ .inc, .m, &.{ .rm32 }, &.{ 0xff }, 0, .none, .none },
.{ .inc, .m, &.{ .rm64 }, &.{ 0xff }, 0, .long, .none },
.{ .int3, .zo, &.{}, &.{ 0xcc }, 0, .none, .none },
.{ .ja, .d, &.{ .rel32 }, &.{ 0x0f, 0x87 }, 0, .none, .none },
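The new dec and inc rows reuse the 0xFE/0xFF opcode group, with the digit column (1 for dec, 0 for inc, third field from the end of each entry) being the ModRM.reg opcode extension that selects the operation within the group; the one-byte 0x40+r/0x48+r inc/dec encodings are unavailable in 64-bit mode, where those bytes became REX prefixes, so only the .m forms exist here. A hypothetical byte-level illustration, not part of the commit:

```zig
// ModRM = mod(0b11) << 6 | ext << 3 | rm(rax = 0b000), with REX.W = 0x48
// selecting the 64-bit operand size.
const inc_rax: [3]u8 = .{ 0x48, 0xff, 0xc0 }; // inc rax -> 0xFF /0
const dec_rax: [3]u8 = .{ 0x48, 0xff, 0xc8 }; // dec rax -> 0xFF /1
```

CodeGen still avoids these instructions on CPUs with the slow_incdec feature, falling back to add/sub with an immediate 1, as the slow_inc branches in the diff above show.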


@@ -1059,6 +1059,40 @@ test "@mulWithOverflow bitsize > 32" {
}
}
test "@mulWithOverflow u256" {
if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
{
const const_lhs: u256 = 8035709466408580321693645878924206181189;
const const_rhs: u256 = 343954217539185679456797259115612849079;
const const_result = @mulWithOverflow(const_lhs, const_rhs);
comptime assert(const_result[0] == 100698109432518020450541558444080472799095368135495022414802684874680804056403);
comptime assert(const_result[1] == 1);
var var_lhs = const_lhs;
var var_rhs = const_rhs;
_ = .{ &var_lhs, &var_rhs };
const var_result = @mulWithOverflow(var_lhs, var_rhs);
try std.testing.expect(var_result[0] == const_result[0]);
try std.testing.expect(var_result[1] == const_result[1]);
}
{
const const_lhs: u256 = 100477140835310762407466294984162740292250605075409128262608;
const const_rhs: u256 = 406310585934439581231;
const const_result = @mulWithOverflow(const_lhs, const_rhs);
comptime assert(const_result[0] == 66110554277021146912650321519727251744526528332039438002889524600764482652976);
comptime assert(const_result[1] == 1);
var var_lhs = const_lhs;
var var_rhs = const_rhs;
_ = .{ &var_lhs, &var_rhs };
const var_result = @mulWithOverflow(var_lhs, var_rhs);
try std.testing.expect(var_result[0] == const_result[0]);
try std.testing.expect(var_result[1] == const_result[1]);
}
}
test "@subWithOverflow" {
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
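For reference, the tuple contract the new u256 test relies on, shown on a small type (an illustrative test, not part of the commit): @mulWithOverflow returns the wrapped product in element 0 and a u1 overflow bit in element 1, which are exactly the two tuple fields the CodeGen diff above populates via structFieldOffset(0, ...) and structFieldOffset(1, ...).

```zig
const std = @import("std");

test "@mulWithOverflow tuple contract on u8" {
    const r = @mulWithOverflow(@as(u8, 200), @as(u8, 2));
    try std.testing.expect(r[0] == 144); // 400 wraps to 400 - 256
    try std.testing.expect(r[1] == 1); // the product did not fit in u8
}
```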