x86_64: implement clz, ctz, and popCount

This commit is contained in:
Jacob Young
2023-03-18 08:02:17 -04:00
committed by Jakub Konka
parent edd63f9aba
commit 53ec2a955e
5 changed files with 169 additions and 17 deletions

View File

@@ -2597,28 +2597,143 @@ fn airGetUnionTag(self: *Self, inst: Air.Inst.Index) !void {
fn airClz(self: *Self, inst: Air.Inst.Index) !void {
const ty_op = self.air.instructions.items(.data)[inst].ty_op;
const result: MCValue = if (self.liveness.isUnused(inst))
.dead
else
return self.fail("TODO implement airClz for {}", .{self.target.cpu.arch});
const result = result: {
if (self.liveness.isUnused(inst)) break :result .dead;
const dst_ty = self.air.typeOfIndex(inst);
const src_ty = self.air.typeOf(ty_op.operand);
const src_bits = src_ty.bitSize(self.target.*);
const src_mcv = try self.resolveInst(ty_op.operand);
const mat_src_mcv = switch (src_mcv) {
.immediate => MCValue{ .register = try self.copyToTmpRegister(src_ty, src_mcv) },
else => src_mcv,
};
const mat_src_lock = switch (mat_src_mcv) {
.register => |reg| self.register_manager.lockReg(reg),
else => null,
};
defer if (mat_src_lock) |lock| self.register_manager.unlockReg(lock);
const dst_reg = try self.register_manager.allocReg(inst, gp);
const dst_mcv = MCValue{ .register = dst_reg };
const dst_lock = self.register_manager.lockReg(dst_reg);
defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
if (Target.x86.featureSetHas(self.target.cpu.features, .lzcnt)) {
try self.genBinOpMir(.lzcnt, src_ty, dst_mcv, mat_src_mcv);
const src_abi_size = @intCast(u32, src_ty.abiSize(self.target.*));
const extra_bits = registerAlias(dst_reg, src_abi_size).bitSize() - src_bits;
if (extra_bits > 0) {
try self.genBinOpMir(.sub, dst_ty, dst_mcv, .{ .immediate = extra_bits });
}
break :result dst_mcv;
}
const width_reg = try self.copyToTmpRegister(dst_ty, .{ .immediate = src_bits });
const width_mcv = MCValue{ .register = width_reg };
try self.genBinOpMir(.bsr, src_ty, dst_mcv, mat_src_mcv);
const dst_abi_size = @intCast(u32, @max(dst_ty.abiSize(self.target.*), 2));
try self.asmCmovccRegisterRegister(
registerAlias(dst_reg, dst_abi_size),
registerAlias(width_reg, dst_abi_size),
.z,
);
try self.genBinOpMir(.sub, dst_ty, width_mcv, dst_mcv);
break :result width_mcv;
};
return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
}
fn airCtz(self: *Self, inst: Air.Inst.Index) !void {
const ty_op = self.air.instructions.items(.data)[inst].ty_op;
const result: MCValue = if (self.liveness.isUnused(inst))
.dead
else
return self.fail("TODO implement airCtz for {}", .{self.target.cpu.arch});
const result = result: {
if (self.liveness.isUnused(inst)) break :result .dead;
const dst_ty = self.air.typeOfIndex(inst);
const src_ty = self.air.typeOf(ty_op.operand);
const src_bits = src_ty.bitSize(self.target.*);
const src_mcv = try self.resolveInst(ty_op.operand);
const mat_src_mcv = switch (src_mcv) {
.immediate => MCValue{ .register = try self.copyToTmpRegister(src_ty, src_mcv) },
else => src_mcv,
};
const mat_src_lock = switch (mat_src_mcv) {
.register => |reg| self.register_manager.lockReg(reg),
else => null,
};
defer if (mat_src_lock) |lock| self.register_manager.unlockReg(lock);
const dst_reg = try self.register_manager.allocReg(inst, gp);
const dst_mcv = MCValue{ .register = dst_reg };
const dst_lock = self.register_manager.lockReg(dst_reg);
defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
if (Target.x86.featureSetHas(self.target.cpu.features, .bmi)) {
const src_abi_size = @intCast(u32, src_ty.abiSize(self.target.*));
const extra_bits = registerAlias(dst_reg, src_abi_size).bitSize() - src_bits;
const masked_mcv = if (extra_bits > 0) masked: {
const mask_mcv = MCValue{
.immediate = ((@as(u64, 1) << @intCast(u6, extra_bits)) - 1) << @intCast(u6, src_bits),
};
const tmp_mcv = tmp: {
if (src_mcv.isImmediate() or self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) {
break :tmp src_mcv;
}
try self.genSetReg(src_ty, dst_reg, src_mcv);
break :tmp dst_mcv;
};
try self.genBinOpMir(.@"or", src_ty, tmp_mcv, mask_mcv);
break :masked tmp_mcv;
} else mat_src_mcv;
try self.genBinOpMir(.tzcnt, src_ty, dst_mcv, masked_mcv);
break :result dst_mcv;
}
const width_reg = try self.copyToTmpRegister(dst_ty, .{ .immediate = src_bits });
try self.genBinOpMir(.bsf, src_ty, dst_mcv, mat_src_mcv);
const abi_size = @max(@intCast(u32, dst_ty.abiSize(self.target.*)), 2);
try self.asmCmovccRegisterRegister(
registerAlias(dst_reg, abi_size),
registerAlias(width_reg, abi_size),
.z,
);
break :result dst_mcv;
};
return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
}
fn airPopcount(self: *Self, inst: Air.Inst.Index) !void {
const ty_op = self.air.instructions.items(.data)[inst].ty_op;
const result: MCValue = if (self.liveness.isUnused(inst))
.dead
else
return self.fail("TODO implement airPopcount for {}", .{self.target.cpu.arch});
const result = result: {
if (self.liveness.isUnused(inst)) break :result .dead;
const op_ty = self.air.typeOf(ty_op.operand);
if (Target.x86.featureSetHas(self.target.cpu.features, .popcnt)) {
const op_mcv = try self.resolveInst(ty_op.operand);
const mat_op_mcv = switch (op_mcv) {
.immediate => MCValue{ .register = try self.copyToTmpRegister(op_ty, op_mcv) },
else => op_mcv,
};
const mat_op_lock = switch (mat_op_mcv) {
.register => |reg| self.register_manager.lockReg(reg),
else => null,
};
defer if (mat_op_lock) |lock| self.register_manager.unlockReg(lock);
const dst_mcv = MCValue{ .register = try self.register_manager.allocReg(inst, gp) };
try self.genBinOpMir(.popcnt, op_ty, dst_mcv, mat_op_mcv);
break :result dst_mcv;
}
return self.fail("TODO implement airPopcount for {}", .{op_ty.fmt(self.bin_file.options.module.?)});
};
return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
}
@@ -3491,7 +3606,7 @@ fn genBinOp(
if (lhs.isRegister() and self.reuseOperand(inst, lhs_air, 0, lhs)) {
break :blk lhs;
}
if (rhs.isRegister() and is_commutative and self.reuseOperand(inst, rhs_air, 1, rhs)) {
if (is_commutative and rhs.isRegister() and self.reuseOperand(inst, rhs_air, 1, rhs)) {
flipped = true;
break :blk rhs;
}

View File

@@ -73,6 +73,8 @@ pub fn lowerMir(emit: *Emit) InnerError!void {
.adc,
.add,
.@"and",
.bsf,
.bsr,
.call,
.cbw,
.cwde,
@@ -89,12 +91,14 @@ pub fn lowerMir(emit: *Emit) InnerError!void {
.int3,
.jmp,
.lea,
.lzcnt,
.mov,
.movzx,
.mul,
.nop,
.@"or",
.pop,
.popcnt,
.push,
.ret,
.sal,
@@ -105,6 +109,7 @@ pub fn lowerMir(emit: *Emit) InnerError!void {
.sub,
.syscall,
.@"test",
.tzcnt,
.ud2,
.xor,

View File

@@ -307,6 +307,7 @@ pub const Mnemonic = enum {
// zig fmt: off
// General-purpose
adc, add, @"and",
bsf, bsr,
call, cbw, cdq, cdqe,
cmova, cmovae, cmovb, cmovbe, cmovc, cmove, cmovg, cmovge, cmovl, cmovle, cmovna,
cmovnae, cmovnb, cmovnbe, cmovnc, cmovne, cmovng, cmovnge, cmovnl, cmovnle, cmovno,
@@ -322,12 +323,13 @@ pub const Mnemonic = enum {
jmp,
lea,
lods, lodsb, lodsd, lodsq, lodsw,
lzcnt,
mov,
movs, movsb, movsd, movsq, movsw,
movsx, movsxd, movzx, mul,
nop,
@"or",
pop, push,
pop, popcnt, push,
ret,
sal, sar, sbb,
scas, scasb, scasd, scasq, scasw,
@@ -336,7 +338,7 @@ pub const Mnemonic = enum {
setnb, setnbe, setnc, setne, setng, setnge, setnl, setnle, setno, setnp, setns,
setnz, seto, setp, setpe, setpo, sets, setz,
stos, stosb, stosd, stosq, stosw,
@"test",
@"test", tzcnt,
ud2,
xor,
// SSE

View File

@@ -38,6 +38,10 @@ pub const Inst = struct {
add,
/// Logical and
@"and",
/// Bit scan forward
bsf,
/// Bit scan reverse
bsr,
/// Call
call,
/// Convert byte to word
@@ -70,6 +74,8 @@ pub const Inst = struct {
jmp,
/// Load effective address
lea,
/// Count the number of leading zero bits
lzcnt,
/// Move
mov,
/// Move with sign extension
@@ -84,6 +90,8 @@ pub const Inst = struct {
@"or",
/// Pop
pop,
/// Return the count of number of bits set to 1
popcnt,
/// Push
push,
/// Return
@@ -104,6 +112,8 @@ pub const Inst = struct {
syscall,
/// Test condition
@"test",
/// Count the number of trailing zero bits
tzcnt,
/// Undefined instruction
ud2,
/// Logical exclusive-or

View File

@@ -81,6 +81,14 @@ pub const table = &[_]Entry{
.{ .@"and", .rm, .r32, .rm32, .none, .none, &.{ 0x23 }, 0, .none },
.{ .@"and", .rm, .r64, .rm64, .none, .none, &.{ 0x23 }, 0, .long },
.{ .bsf, .rm, .r16, .rm16, .none, .none, &.{ 0x0f, 0xbc }, 0, .none },
.{ .bsf, .rm, .r32, .rm32, .none, .none, &.{ 0x0f, 0xbc }, 0, .none },
.{ .bsf, .rm, .r64, .rm64, .none, .none, &.{ 0x0f, 0xbc }, 0, .long },
.{ .bsr, .rm, .r16, .rm16, .none, .none, &.{ 0x0f, 0xbd }, 0, .none },
.{ .bsr, .rm, .r32, .rm32, .none, .none, &.{ 0x0f, 0xbd }, 0, .none },
.{ .bsr, .rm, .r64, .rm64, .none, .none, &.{ 0x0f, 0xbd }, 0, .long },
// This is M encoding according to Intel, but D makes more sense here.
.{ .call, .d, .rel32, .none, .none, .none, &.{ 0xe8 }, 0, .none },
.{ .call, .m, .rm64, .none, .none, .none, &.{ 0xff }, 2, .none },
@@ -301,6 +309,10 @@ pub const table = &[_]Entry{
.{ .lodsd, .np, .none, .none, .none, .none, &.{ 0xad }, 0, .none },
.{ .lodsq, .np, .none, .none, .none, .none, &.{ 0xad }, 0, .long },
.{ .lzcnt, .rm, .r16, .rm16, .none, .none, &.{ 0xf3, 0x0f, 0xbd }, 0, .none },
.{ .lzcnt, .rm, .r32, .rm32, .none, .none, &.{ 0xf3, 0x0f, 0xbd }, 0, .none },
.{ .lzcnt, .rm, .r64, .rm64, .none, .none, &.{ 0xf3, 0x0f, 0xbd }, 0, .long },
.{ .mov, .mr, .rm8, .r8, .none, .none, &.{ 0x88 }, 0, .none },
.{ .mov, .mr, .rm8, .r8, .none, .none, &.{ 0x88 }, 0, .rex },
.{ .mov, .mr, .rm16, .r16, .none, .none, &.{ 0x89 }, 0, .none },
@@ -397,6 +409,10 @@ pub const table = &[_]Entry{
.{ .pop, .m, .rm16, .none, .none, .none, &.{ 0x8f }, 0, .none },
.{ .pop, .m, .rm64, .none, .none, .none, &.{ 0x8f }, 0, .none },
.{ .popcnt, .rm, .r16, .rm16, .none, .none, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none },
.{ .popcnt, .rm, .r32, .rm32, .none, .none, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none },
.{ .popcnt, .rm, .r64, .rm64, .none, .none, &.{ 0xf3, 0x0f, 0xb8 }, 0, .long },
.{ .push, .o, .r16, .none, .none, .none, &.{ 0x50 }, 0, .none },
.{ .push, .o, .r64, .none, .none, .none, &.{ 0x50 }, 0, .none },
.{ .push, .m, .rm16, .none, .none, .none, &.{ 0xff }, 6, .none },
@@ -596,8 +612,8 @@ pub const table = &[_]Entry{
.{ .sub, .rm, .r32, .rm32, .none, .none, &.{ 0x2b }, 0, .none },
.{ .sub, .rm, .r64, .rm64, .none, .none, &.{ 0x2b }, 0, .long },
.{ .syscall, .np, .none, .none, .none, .none, &.{ 0x0f, 0x05 }, 0, .none },
.{ .syscall, .np, .none, .none, .none, .none, &.{ 0x0f, 0x05 }, 0, .none }
,
.{ .@"test", .zi, .al, .imm8, .none, .none, &.{ 0xa8 }, 0, .none },
.{ .@"test", .zi, .ax, .imm16, .none, .none, &.{ 0xa9 }, 0, .none },
.{ .@"test", .zi, .eax, .imm32, .none, .none, &.{ 0xa9 }, 0, .none },
@@ -613,6 +629,10 @@ pub const table = &[_]Entry{
.{ .@"test", .mr, .rm32, .r32, .none, .none, &.{ 0x85 }, 0, .none },
.{ .@"test", .mr, .rm64, .r64, .none, .none, &.{ 0x85 }, 0, .long },
.{ .tzcnt, .rm, .r16, .rm16, .none, .none, &.{ 0xf3, 0x0f, 0xbc }, 0, .none },
.{ .tzcnt, .rm, .r32, .rm32, .none, .none, &.{ 0xf3, 0x0f, 0xbc }, 0, .none },
.{ .tzcnt, .rm, .r64, .rm64, .none, .none, &.{ 0xf3, 0x0f, 0xbc }, 0, .long },
.{ .ud2, .np, .none, .none, .none, .none, &.{ 0x0f, 0x0b }, 0, .none },
.{ .xor, .zi, .al, .imm8, .none, .none, &.{ 0x34 }, 0, .none },