diff --git a/src/Module.zig b/src/Module.zig index 96b490e2a1..90e1a71bd2 100644 --- a/src/Module.zig +++ b/src/Module.zig @@ -4330,6 +4330,33 @@ pub fn intSub(allocator: *Allocator, lhs: Value, rhs: Value) !Value { } } +pub fn intMul(allocator: *Allocator, lhs: Value, rhs: Value) !Value { + // TODO is this a performance issue? maybe we should try the operation without + // resorting to BigInt first. + var lhs_space: Value.BigIntSpace = undefined; + var rhs_space: Value.BigIntSpace = undefined; + const lhs_bigint = lhs.toBigInt(&lhs_space); + const rhs_bigint = rhs.toBigInt(&rhs_space); + const limbs = try allocator.alloc( + std.math.big.Limb, + lhs_bigint.limbs.len + rhs_bigint.limbs.len + 1, + ); + var result_bigint = BigIntMutable{ .limbs = limbs, .positive = undefined, .len = undefined }; + var limbs_buffer = try allocator.alloc( + std.math.big.Limb, + std.math.big.int.calcMulLimbsBufferLen(lhs_bigint.limbs.len, rhs_bigint.limbs.len, 1), + ); + defer allocator.free(limbs_buffer); + result_bigint.mul(lhs_bigint, rhs_bigint, limbs_buffer, allocator); + const result_limbs = result_bigint.limbs[0..result_bigint.len]; + + if (result_bigint.positive) { + return Value.Tag.int_big_positive.create(allocator, result_limbs); + } else { + return Value.Tag.int_big_negative.create(allocator, result_limbs); + } +} + pub fn floatAdd( arena: *Allocator, float_type: Type, @@ -4396,6 +4423,39 @@ pub fn floatSub( } } +pub fn floatMul( + arena: *Allocator, + float_type: Type, + src: LazySrcLoc, + lhs: Value, + rhs: Value, +) !Value { + switch (float_type.tag()) { + .f16 => { + @panic("TODO add __trunctfhf2 to compiler-rt"); + //const lhs_val = lhs.toFloat(f16); + //const rhs_val = rhs.toFloat(f16); + //return Value.Tag.float_16.create(arena, lhs_val * rhs_val); + }, + .f32 => { + const lhs_val = lhs.toFloat(f32); + const rhs_val = rhs.toFloat(f32); + return Value.Tag.float_32.create(arena, lhs_val * rhs_val); + }, + .f64 => { + const lhs_val = lhs.toFloat(f64); + const rhs_val = 
rhs.toFloat(f64); + return Value.Tag.float_64.create(arena, lhs_val * rhs_val); + }, + .f128, .comptime_float, .c_longdouble => { + const lhs_val = lhs.toFloat(f128); + const rhs_val = rhs.toFloat(f128); + return Value.Tag.float_128.create(arena, lhs_val * rhs_val); + }, + else => unreachable, + } +} + pub fn simplePtrType( mod: *Module, arena: *Allocator, diff --git a/src/Sema.zig b/src/Sema.zig index 98bff5bf23..65a196911e 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -3864,10 +3864,15 @@ fn analyzeArithmetic( // incase rhs is 0, simply return lhs without doing any calculations // TODO Once division is implemented we should throw an error when dividing by 0. if (rhs_val.compareWithZero(.eq)) { - return sema.mod.constInst(sema.arena, src, .{ - .ty = scalar_type, - .val = lhs_val, - }); + switch (zir_tag) { + .add, .addwrap, .sub, .subwrap => { + return sema.mod.constInst(sema.arena, src, .{ + .ty = scalar_type, + .val = lhs_val, + }); + }, + else => {}, + } } const value = switch (zir_tag) { @@ -3885,6 +3890,13 @@ fn analyzeArithmetic( try Module.floatSub(sema.arena, scalar_type, src, lhs_val, rhs_val); break :blk val; }, + .mul => blk: { + const val = if (is_int) + try Module.intMul(sema.arena, lhs_val, rhs_val) + else + try Module.floatMul(sema.arena, scalar_type, src, lhs_val, rhs_val); + break :blk val; + }, else => return sema.mod.fail(&block.base, src, "TODO Implement arithmetic operand '{s}'", .{@tagName(zir_tag)}), }; diff --git a/src/codegen.zig b/src/codegen.zig index 2f83bfe6f3..7990c8d1b7 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -20,6 +20,8 @@ const build_options = @import("build_options"); const LazySrcLoc = Module.LazySrcLoc; const RegisterManager = @import("register_manager.zig").RegisterManager; +const X8664Encoder = @import("codegen/x86_64.zig").Encoder; + /// The codegen-related data that is stored in `ir.Inst.Block` instructions. 
pub const BlockData = struct { relocs: std.ArrayListUnmanaged(Reloc) = undefined, @@ -1038,7 +1040,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { }, .val = Value.initTag(.bool_true), }; - return try self.genX8664BinMath(&inst.base, inst.operand, &imm.base, 6, 0x30); + return try self.genX8664BinMath(&inst.base, inst.operand, &imm.base); }, .arm, .armeb => { var imm = ir.Inst.Constant{ @@ -1062,7 +1064,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { return MCValue.dead; switch (arch) { .x86_64 => { - return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs, 0, 0x00); + return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs); }, .arm, .armeb => return try self.genArmBinOp(&inst.base, inst.lhs, inst.rhs, .add), else => return self.fail(inst.base.src, "TODO implement add for {}", .{self.target.cpu.arch}), @@ -1083,6 +1085,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { if (inst.base.isUnused()) return MCValue.dead; switch (arch) { + .x86_64 => return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs), .arm, .armeb => return try self.genArmMul(&inst.base, inst.lhs, inst.rhs), else => return self.fail(inst.base.src, "TODO implement mul for {}", .{self.target.cpu.arch}), } @@ -1361,7 +1364,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { return MCValue.dead; switch (arch) { .x86_64 => { - return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs, 5, 0x28); + return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs); }, .arm, .armeb => return try self.genArmBinOp(&inst.base, inst.lhs, inst.rhs, .sub), else => return self.fail(inst.base.src, "TODO implement sub for {}", .{self.target.cpu.arch}), @@ -1506,8 +1509,20 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { return dst_mcv; } + /// Perform "binary" operators, excluding comparisons. 
+ /// Currently, the following ops are supported: /// ADD, SUB, XOR, OR, AND - fn genX8664BinMath(self: *Self, inst: *ir.Inst, op_lhs: *ir.Inst, op_rhs: *ir.Inst, opx: u8, mr: u8) !MCValue { + fn genX8664BinMath(self: *Self, inst: *ir.Inst, op_lhs: *ir.Inst, op_rhs: *ir.Inst) !MCValue { + // We'll handle these ops in two steps. + // 1) Prepare an output location (register or memory) + // This location will be the location of the operand that dies (if one exists) + // or just a temporary register (if one doesn't exist) + // 2) Perform the op with the other argument + // 3) Sometimes, the output location is memory but the op doesn't support it. + // In this case, copy that location to a register, then perform the op to that register instead. + // + // TODO: make this algorithm less bad + try self.code.ensureCapacity(self.code.items.len + 8); const lhs = try self.resolveInst(op_lhs); @@ -1568,18 +1583,109 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { else => {}, } - try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, opx, mr); + // Now for step 2, we perform the actual op + switch (inst.tag) { + // TODO: Generate wrapping and non-wrapping versions separately + .add, .addwrap => try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, 0, 0x00), + .bool_or, .bit_or => try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, 1, 0x08), + .bool_and, .bit_and => try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, 4, 0x20), + .sub, .subwrap => try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, 5, 0x28), + .xor, .not => try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, 6, 0x30), + + .mul, .mulwrap => try self.genX8664Imul(inst.src, inst.ty, dst_mcv, src_mcv), + else => unreachable, + } return dst_mcv; } + /// Wrap over Instruction.encodeInto to translate errors + fn encodeX8664Instruction( + self: *Self, + src: LazySrcLoc, + inst: Instruction, + ) !void { + inst.encodeInto(self.code) 
catch |err| { + if (err == error.OutOfMemory) + return error.OutOfMemory + else + return self.fail(src, "Instruction.encodeInto failed because {s}", .{@errorName(err)}); + }; + } + + /// This function encodes a binary operation for x86_64 + /// intended for use with the following opcode ranges + /// because they share the same structure. + /// + /// Thus not all binary operations can be used here + /// -- multiplication needs to be done with imul, + /// which doesn't have as convenient an interface. + /// + /// "opx"-style instructions use the opcode extension field to indicate which instruction to execute: + /// + /// opx = /0: add + /// opx = /1: or + /// opx = /2: adc + /// opx = /3: sbb + /// opx = /4: and + /// opx = /5: sub + /// opx = /6: xor + /// opx = /7: cmp + /// + /// opcode | operand shape + /// --------+---------------------- + /// 80 /opx | *r/m8*, imm8 + /// 81 /opx | *r/m16/32/64*, imm16/32 + /// 83 /opx | *r/m16/32/64*, imm8 + /// + /// "mr"-style instructions use the low bits of opcode to indicate shape of instruction: + /// + /// mr = 00: add + /// mr = 08: or + /// mr = 10: adc + /// mr = 18: sbb + /// mr = 20: and + /// mr = 28: sub + /// mr = 30: xor + /// mr = 38: cmp + /// + /// opcode | operand shape + /// -------+------------------------- + /// mr + 0 | *r/m8*, r8 + /// mr + 1 | *r/m16/32/64*, r16/32/64 + /// mr + 2 | *r8*, r/m8 + /// mr + 3 | *r16/32/64*, r/m16/32/64 + /// mr + 4 | *AL*, imm8 + /// mr + 5 | *rAX*, imm16/32 + /// + /// TODO: rotates and shifts share the same structure, so we can potentially implement them + /// at a later date with very similar code. + /// They have "opx"-style instructions, but no "mr"-style instructions. 
+ /// + /// opx = /0: rol, + /// opx = /1: ror, + /// opx = /2: rcl, + /// opx = /3: rcr, + /// opx = /4: shl sal, + /// opx = /5: shr, + /// opx = /6: sal shl, + /// opx = /7: sar, + /// + /// opcode | operand shape + /// --------+------------------ + /// c0 /opx | *r/m8*, imm8 + /// c1 /opx | *r/m16/32/64*, imm8 + /// d0 /opx | *r/m8*, 1 + /// d1 /opx | *r/m16/32/64*, 1 + /// d2 /opx | *r/m8*, CL (for context, CL is register 1) + /// d3 /opx | *r/m16/32/64*, CL (for context, CL is register 1) fn genX8664BinMathCode( self: *Self, src: LazySrcLoc, dst_ty: Type, dst_mcv: MCValue, src_mcv: MCValue, - opx: u8, + opx: u3, mr: u8, ) !void { switch (dst_mcv) { @@ -1598,31 +1704,85 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { .ptr_stack_offset => unreachable, .ptr_embedded_in_code => unreachable, .register => |src_reg| { - self.rex(.{ .b = dst_reg.isExtended(), .r = src_reg.isExtended(), .w = dst_reg.size() == 64 }); - self.code.appendSliceAssumeCapacity(&[_]u8{ mr + 0x1, 0xC0 | (@as(u8, src_reg.id() & 0b111) << 3) | @as(u8, dst_reg.id() & 0b111) }); + // for register, register use mr + 1 + // addressing mode: *r/m16/32/64*, r16/32/64 + const abi_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 3); + encoder.rex(.{ + .w = abi_size == 8, + .r = src_reg.isExtended(), + .b = dst_reg.isExtended(), + }); + encoder.opcode_1byte(mr + 1); + encoder.modRm_direct( + src_reg.low_id(), + dst_reg.low_id(), + ); }, .immediate => |imm| { - const imm32 = @intCast(u31, imm); // This case must be handled before calling genX8664BinMathCode. 
- // 81 /opx id - if (imm32 <= math.maxInt(u7)) { - self.rex(.{ .b = dst_reg.isExtended(), .w = dst_reg.size() == 64 }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0x83, - 0xC0 | (opx << 3) | @truncate(u3, dst_reg.id()), - @intCast(u8, imm32), + // register, immediate use opx = 81 or 83 addressing modes: + // opx = 81: r/m16/32/64, imm16/32 + // opx = 83: r/m16/32/64, imm8 + const imm32 = @intCast(i32, imm); // This case must be handled before calling genX8664BinMathCode. + if (imm32 <= math.maxInt(i8)) { + const abi_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 4); + encoder.rex(.{ + .w = abi_size == 8, + .b = dst_reg.isExtended(), }); + encoder.opcode_1byte(0x83); + encoder.modRm_direct( + opx, + dst_reg.low_id(), + ); + encoder.imm8(@intCast(i8, imm32)); } else { - self.rex(.{ .r = dst_reg.isExtended(), .w = dst_reg.size() == 64 }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0x81, - 0xC0 | (opx << 3) | @truncate(u3, dst_reg.id()), + const abi_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 7); + encoder.rex(.{ + .w = abi_size == 8, + .b = dst_reg.isExtended(), }); - std.mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), imm32); + encoder.opcode_1byte(0x81); + encoder.modRm_direct( + opx, + dst_reg.low_id(), + ); + encoder.imm32(@intCast(i32, imm32)); } }, - .embedded_in_code, .memory, .stack_offset => { + .embedded_in_code, .memory => { return self.fail(src, "TODO implement x86 ADD/SUB/CMP source memory", .{}); }, + .stack_offset => |off| { + // register, indirect use mr + 3 + // addressing mode: *r16/32/64*, r/m16/32/64 + const abi_size = dst_ty.abiSize(self.target.*); + const adj_off = off + abi_size; + if (off > math.maxInt(i32)) { + return self.fail(src, "stack offset too large", .{}); + } + const encoder = try X8664Encoder.init(self.code, 7); + encoder.rex(.{ + .w = abi_size == 8, + .r = dst_reg.isExtended(), + }); + encoder.opcode_1byte(mr + 3); + 
if (adj_off <= std.math.maxInt(i8)) { + encoder.modRm_indirectDisp8( + dst_reg.low_id(), + Register.ebp.low_id(), + ); + encoder.disp8(-@intCast(i8, adj_off)); + } else { + encoder.modRm_indirectDisp32( + dst_reg.low_id(), + Register.ebp.low_id(), + ); + encoder.disp32(-@intCast(i32, adj_off)); + } + }, .compare_flags_unsigned => { return self.fail(src, "TODO implement x86 ADD/SUB/CMP source compare flag (unsigned)", .{}); }, @@ -1661,28 +1821,184 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } } + /// Performs integer multiplication between dst_mcv and src_mcv, storing the result in dst_mcv. + fn genX8664Imul( + self: *Self, + src: LazySrcLoc, + dst_ty: Type, + dst_mcv: MCValue, + src_mcv: MCValue, + ) !void { + switch (dst_mcv) { + .none => unreachable, + .undef => unreachable, + .dead, .unreach, .immediate => unreachable, + .compare_flags_unsigned => unreachable, + .compare_flags_signed => unreachable, + .ptr_stack_offset => unreachable, + .ptr_embedded_in_code => unreachable, + .register => |dst_reg| { + switch (src_mcv) { + .none => unreachable, + .undef => try self.genSetReg(src, dst_ty, dst_reg, .undef), + .dead, .unreach => unreachable, + .ptr_stack_offset => unreachable, + .ptr_embedded_in_code => unreachable, + .register => |src_reg| { + // register, register + // + // Use the following imul opcode + // 0F AF /r: IMUL r32/64, r/m32/64 + const abi_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 4); + encoder.rex(.{ + .w = abi_size == 8, + .r = dst_reg.isExtended(), + .b = src_reg.isExtended(), + }); + encoder.opcode_2byte(0x0f, 0xaf); + encoder.modRm_direct( + dst_reg.low_id(), + src_reg.low_id(), + ); + }, + .immediate => |imm| { + // register, immediate: + // depends on size of immediate. 
+ // + // immediate fits in i8: + // 6B /r ib: IMUL r32/64, r/m32/64, imm8 + // + // immediate fits in i32: + // 69 /r id: IMUL r32/64, r/m32/64, imm32 + // + // immediate is huge: + // split into 2 instructions + // 1) copy the 64 bit immediate into a tmp register + // 2) perform register,register mul + // 0F AF /r: IMUL r32/64, r/m32/64 + if (math.minInt(i8) <= imm and imm <= math.maxInt(i8)) { + const abi_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 4); + encoder.rex(.{ + .w = abi_size == 8, + .r = dst_reg.isExtended(), + .b = dst_reg.isExtended(), + }); + encoder.opcode_1byte(0x6B); + encoder.modRm_direct( + dst_reg.low_id(), + dst_reg.low_id(), + ); + encoder.imm8(@intCast(i8, imm)); + } else if (math.minInt(i32) <= imm and imm <= math.maxInt(i32)) { + const abi_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 7); + encoder.rex(.{ + .w = abi_size == 8, + .r = dst_reg.isExtended(), + .b = dst_reg.isExtended(), + }); + encoder.opcode_1byte(0x69); + encoder.modRm_direct( + dst_reg.low_id(), + dst_reg.low_id(), + ); + encoder.imm32(@intCast(i32, imm)); + } else { + const src_reg = try self.copyToTmpRegister(src, dst_ty, src_mcv); + return self.genX8664Imul(src, dst_ty, dst_mcv, MCValue{ .register = src_reg }); + } + }, + .embedded_in_code, .memory, .stack_offset => { + return self.fail(src, "TODO implement x86 multiply source memory", .{}); + }, + .compare_flags_unsigned => { + return self.fail(src, "TODO implement x86 multiply source compare flag (unsigned)", .{}); + }, + .compare_flags_signed => { + return self.fail(src, "TODO implement x86 multiply source compare flag (signed)", .{}); + }, + } + }, + .stack_offset => |off| { + switch (src_mcv) { + .none => unreachable, + .undef => return self.genSetStack(src, dst_ty, off, .undef), + .dead, .unreach => unreachable, + .ptr_stack_offset => unreachable, + .ptr_embedded_in_code => unreachable, + .register => |src_reg| { + // copy dst 
to a register + const dst_reg = try self.copyToTmpRegister(src, dst_ty, dst_mcv); + // multiply into dst_reg + // register, register + // Use the following imul opcode + // 0F AF /r: IMUL r32/64, r/m32/64 + const abi_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 4); + encoder.rex(.{ + .w = abi_size == 8, + .r = dst_reg.isExtended(), + .b = src_reg.isExtended(), + }); + encoder.opcode_2byte(0x0f, 0xaf); + encoder.modRm_direct( + dst_reg.low_id(), + src_reg.low_id(), + ); + // copy dst_reg back out + return self.genSetStack(src, dst_ty, off, MCValue{ .register = dst_reg }); + }, + .immediate => |imm| { + return self.fail(src, "TODO implement x86 multiply source immediate", .{}); + }, + .embedded_in_code, .memory, .stack_offset => { + return self.fail(src, "TODO implement x86 multiply source memory", .{}); + }, + .compare_flags_unsigned => { + return self.fail(src, "TODO implement x86 multiply source compare flag (unsigned)", .{}); + }, + .compare_flags_signed => { + return self.fail(src, "TODO implement x86 multiply source compare flag (signed)", .{}); + }, + } + }, + .embedded_in_code, .memory => { + return self.fail(src, "TODO implement x86 multiply destination memory", .{}); + }, + } + } + fn genX8664ModRMRegToStack(self: *Self, src: LazySrcLoc, ty: Type, off: u32, reg: Register, opcode: u8) !void { const abi_size = ty.abiSize(self.target.*); const adj_off = off + abi_size; - try self.code.ensureCapacity(self.code.items.len + 7); - self.rex(.{ .w = reg.size() == 64, .r = reg.isExtended() }); - const reg_id: u8 = @truncate(u3, reg.id()); - if (adj_off <= 128) { - // example: 48 89 55 7f mov QWORD PTR [rbp+0x7f],rdx - const RM = @as(u8, 0b01_000_101) | (reg_id << 3); - const negative_offset = @intCast(i8, -@intCast(i32, adj_off)); - const twos_comp = @bitCast(u8, negative_offset); - self.code.appendSliceAssumeCapacity(&[_]u8{ opcode, RM, twos_comp }); - } else if (adj_off <= 2147483648) { - // example: 48 89 95 80 00 00 00 
mov QWORD PTR [rbp+0x80],rdx - const RM = @as(u8, 0b10_000_101) | (reg_id << 3); - const negative_offset = @intCast(i32, -@intCast(i33, adj_off)); - const twos_comp = @bitCast(u32, negative_offset); - self.code.appendSliceAssumeCapacity(&[_]u8{ opcode, RM }); - mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), twos_comp); - } else { + if (off > math.maxInt(i32)) { return self.fail(src, "stack offset too large", .{}); } + + const i_adj_off = -@intCast(i32, adj_off); + const encoder = try X8664Encoder.init(self.code, 7); + encoder.rex(.{ + .w = abi_size == 8, + .r = reg.isExtended(), + }); + encoder.opcode_1byte(opcode); + if (i_adj_off >= std.math.minInt(i8)) { + // example: 48 89 55 7f mov QWORD PTR [rbp+0x7f],rdx + encoder.modRm_indirectDisp8( + reg.low_id(), + Register.ebp.low_id(), + ); + encoder.disp8(@intCast(i8, i_adj_off)); + } else { + // example: 48 89 95 80 00 00 00 mov QWORD PTR [rbp+0x80],rdx + encoder.modRm_indirectDisp32( + reg.low_id(), + Register.ebp.low_id(), + ); + encoder.disp32(i_adj_off); + } } fn genArgDbgInfo(self: *Self, inst: *ir.Inst.Arg, mcv: MCValue) !void { @@ -2126,12 +2442,13 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { log.debug("got_addr = 0x{x}", .{got_addr}); switch (arch) { .x86_64 => { - try self.genSetReg(inst.base.src, Type.initTag(.u32), .rax, .{ .memory = got_addr }); + try self.genSetReg(inst.base.src, Type.initTag(.u64), .rax, .{ .memory = got_addr }); // callq *%rax + try self.code.ensureCapacity(self.code.items.len + 2); self.code.appendSliceAssumeCapacity(&[2]u8{ 0xff, 0xd0 }); }, .aarch64 => { - try self.genSetReg(inst.base.src, Type.initTag(.u32), .x30, .{ .memory = got_addr }); + try self.genSetReg(inst.base.src, Type.initTag(.u64), .x30, .{ .memory = got_addr }); // blr x30 writeInt(u32, try self.code.addManyAsArray(4), Instruction.blr(.x30).toU32()); }, @@ -2355,15 +2672,19 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { .register => |reg| blk: { // test reg, 1 // TODO detect 
al, ax, eax - try self.code.ensureCapacity(self.code.items.len + 4); - // TODO audit this codegen: we force w = true here to make - // the value affect the big register - self.rex(.{ .b = reg.isExtended(), .w = true }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0xf6, - @as(u8, 0xC0) | (0 << 3) | @truncate(u3, reg.id()), - 0x01, + const encoder = try X8664Encoder.init(self.code, 4); + encoder.rex(.{ + // TODO audit this codegen: we force w = true here to make + // the value affect the big register + .w = true, + .b = reg.isExtended(), }); + encoder.opcode_1byte(0xf6); + encoder.modRm_direct( + 0, + reg.low_id(), + ); + encoder.disp8(1); break :blk 0x84; }, else => return self.fail(inst.base.src, "TODO implement condbr {s} when condition is {s}", .{ self.target.cpu.arch, @tagName(cond) }), @@ -2673,9 +2994,9 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { switch (arch) { .x86_64 => switch (inst.base.tag) { // lhs AND rhs - .bool_and => return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs, 4, 0x20), + .bool_and => return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs), // lhs OR rhs - .bool_or => return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs, 1, 0x08), + .bool_or => return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs), else => unreachable, // Not a boolean operation }, .arm, .armeb => switch (inst.base.tag) { @@ -2882,39 +3203,6 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } } - /// Encodes a REX prefix as specified, and appends it to the instruction - /// stream. This only modifies the instruction stream if at least one bit - /// is set true, which has a few implications: - /// - /// * The length of the instruction buffer will be modified *if* the - /// resulting REX is meaningful, but will remain the same if it is not. - /// * Deliberately inserting a "meaningless REX" requires explicit usage of - /// 0x40, and cannot be done via this function. 
- /// W => 64 bit mode - /// R => extension to the MODRM.reg field - /// X => extension to the SIB.index field - /// B => extension to the MODRM.rm field or the SIB.base field - fn rex(self: *Self, arg: struct { b: bool = false, w: bool = false, x: bool = false, r: bool = false }) void { - comptime assert(arch == .x86_64); - // From section 2.2.1.2 of the manual, REX is encoded as b0100WRXB. - var value: u8 = 0x40; - if (arg.b) { - value |= 0x1; - } - if (arg.x) { - value |= 0x2; - } - if (arg.r) { - value |= 0x4; - } - if (arg.w) { - value |= 0x8; - } - if (value != 0x40) { - self.code.appendAssumeCapacity(value); - } - } - /// Sets the value without any modifications to register allocation metadata or stack allocation metadata. fn setRegOrMem(self: *Self, src: LazySrcLoc, ty: Type, loc: MCValue, val: MCValue) !void { switch (loc) { @@ -3462,20 +3750,25 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } }, .compare_flags_unsigned => |op| { - try self.code.ensureCapacity(self.code.items.len + 3); + const encoder = try X8664Encoder.init(self.code, 7); // TODO audit this codegen: we force w = true here to make // the value affect the big register - self.rex(.{ .b = reg.isExtended(), .w = true }); - const opcode: u8 = switch (op) { + encoder.rex(.{ + .w = true, + .b = reg.isExtended(), + }); + encoder.opcode_2byte(0x0f, switch (op) { .gte => 0x93, .gt => 0x97, .neq => 0x95, .lt => 0x92, .lte => 0x96, .eq => 0x94, - }; - const id = @as(u8, reg.id() & 0b111); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x0f, opcode, 0xC0 | id }); + }); + encoder.modRm_direct( + 0, + reg.low_id(), + ); }, .compare_flags_signed => |op| { return self.fail(src, "TODO set register with compare flags value (signed)", .{}); @@ -3485,40 +3778,43 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // register is the fastest way to zero a register. if (x == 0) { // The encoding for `xor r32, r32` is `0x31 /r`. 
- // Section 3.1.1.1 of the Intel x64 Manual states that "/r indicates that the - // ModR/M byte of the instruction contains a register operand and an r/m operand." - // - // R/M bytes are composed of two bits for the mode, then three bits for the register, - // then three bits for the operand. Since we're zeroing a register, the two three-bit - // values will be identical, and the mode is three (the raw register value). - // + const encoder = try X8664Encoder.init(self.code, 3); + // If we're accessing e.g. r8d, we need to use a REX prefix before the actual operation. Since // this is a 32-bit operation, the W flag is set to zero. X is also zero, as we're not using a SIB. // Both R and B are set, as we're extending, in effect, the register bits *and* the operand. - try self.code.ensureCapacity(self.code.items.len + 3); - self.rex(.{ .r = reg.isExtended(), .b = reg.isExtended() }); - const id = @as(u8, reg.id() & 0b111); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x31, 0xC0 | id << 3 | id }); + encoder.rex(.{ + .r = reg.isExtended(), + .b = reg.isExtended(), + }); + encoder.opcode_1byte(0x31); + // Section 3.1.1.1 of the Intel x64 Manual states that "/r indicates that the + // ModR/M byte of the instruction contains a register operand and an r/m operand." + encoder.modRm_direct( + reg.low_id(), + reg.low_id(), + ); + return; } - if (x <= math.maxInt(u32)) { + if (x <= math.maxInt(i32)) { // Next best case: if we set the lower four bytes, the upper four will be zeroed. // // The encoding for `mov IMM32 -> REG` is (0xB8 + R) IMM. - if (reg.isExtended()) { - // Just as with XORing, we need a REX prefix. This time though, we only - // need the B bit set, as we're extending the opcode's register field, - // and there is no Mod R/M byte. - // - // Thus, we need b01000001, or 0x41. 
- try self.code.resize(self.code.items.len + 6); - self.code.items[self.code.items.len - 6] = 0x41; - } else { - try self.code.resize(self.code.items.len + 5); - } - self.code.items[self.code.items.len - 5] = 0xB8 | @as(u8, reg.id() & 0b111); - const imm_ptr = self.code.items[self.code.items.len - 4 ..][0..4]; - mem.writeIntLittle(u32, imm_ptr, @intCast(u32, x)); + + const encoder = try X8664Encoder.init(self.code, 6); + // Just as with XORing, we need a REX prefix. This time though, we only + // need the B bit set, as we're extending the opcode's register field, + // and there is no Mod R/M byte. + encoder.rex(.{ + .b = reg.isExtended(), + }); + encoder.opcode_withReg(0xB8, reg.low_id()); + + // no ModR/M byte + + // IMM + encoder.imm32(@intCast(i32, x)); return; } // Worst case: we need to load the 64-bit register with the IMM. GNU's assemblers calls @@ -3528,79 +3824,98 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // This encoding is, in fact, the *same* as the one used for 32-bit loads. The only // difference is that we set REX.W before the instruction, which extends the load to // 64-bit and uses the full bit-width of the register. - // - // Since we always need a REX here, let's just check if we also need to set REX.B. - // - // In this case, the encoding of the REX byte is 0b0100100B - try self.code.ensureCapacity(self.code.items.len + 10); - self.rex(.{ .w = reg.size() == 64, .b = reg.isExtended() }); - self.code.items.len += 9; - self.code.items[self.code.items.len - 9] = 0xB8 | @as(u8, reg.id() & 0b111); - const imm_ptr = self.code.items[self.code.items.len - 8 ..][0..8]; - mem.writeIntLittle(u64, imm_ptr, x); + { + const encoder = try X8664Encoder.init(self.code, 10); + encoder.rex(.{ + .w = true, + .b = reg.isExtended(), + }); + encoder.opcode_withReg(0xB8, reg.low_id()); + encoder.imm64(x); + } }, .embedded_in_code => |code_offset| { // We need the offset from RIP in a signed i32 twos complement. 
// The instruction is 7 bytes long and RIP points to the next instruction. - try self.code.ensureCapacity(self.code.items.len + 7); - // 64-bit LEA is encoded as REX.W 8D /r. If the register is extended, the REX byte is modified, - // but the operation size is unchanged. Since we're using a disp32, we want mode 0 and lower three - // bits as five. - // REX 0x8D 0b00RRR101, where RRR is the lower three bits of the id. - self.rex(.{ .w = reg.size() == 64, .b = reg.isExtended() }); - self.code.items.len += 6; - const rip = self.code.items.len; + + // 64-bit LEA is encoded as REX.W 8D /r. + const rip = self.code.items.len + 7; const big_offset = @intCast(i64, code_offset) - @intCast(i64, rip); const offset = @intCast(i32, big_offset); - self.code.items[self.code.items.len - 6] = 0x8D; - self.code.items[self.code.items.len - 5] = 0b101 | (@as(u8, reg.id() & 0b111) << 3); - const imm_ptr = self.code.items[self.code.items.len - 4 ..][0..4]; - mem.writeIntLittle(i32, imm_ptr, offset); + const encoder = try X8664Encoder.init(self.code, 7); + + // byte 1, always exists because w = true + encoder.rex(.{ + .w = true, + .r = reg.isExtended(), + }); + // byte 2 + encoder.opcode_1byte(0x8D); + // byte 3 + encoder.modRm_RIPDisp32(reg.low_id()); + // byte 4-7 + encoder.disp32(offset); + + // Double check that we haven't done any math errors + assert(rip == self.code.items.len); }, .register => |src_reg| { // If the registers are the same, nothing to do. if (src_reg.id() == reg.id()) return; - // This is a variant of 8B /r. Since we're using 64-bit moves, we require a REX. - // This is thus three bytes: REX 0x8B R/M. - // If the destination is extended, the R field must be 1. - // If the *source* is extended, the B field must be 1. - // Since the register is being accessed directly, the R/M mode is three. The reg field (the middle - // three bits) contain the destination, and the R/M field (the lower three bits) contain the source. 
- try self.code.ensureCapacity(self.code.items.len + 3); - self.rex(.{ .w = reg.size() == 64, .r = reg.isExtended(), .b = src_reg.isExtended() }); - const R = 0xC0 | (@as(u8, reg.id() & 0b111) << 3) | @as(u8, src_reg.id() & 0b111); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8B, R }); + // This is a variant of 8B /r. + const abi_size = ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 3); + encoder.rex(.{ + .w = abi_size == 8, + .r = reg.isExtended(), + .b = src_reg.isExtended(), + }); + encoder.opcode_1byte(0x8B); + encoder.modRm_direct(reg.low_id(), src_reg.low_id()); }, .memory => |x| { if (self.bin_file.options.pie) { // RIP-relative displacement to the entry in the GOT table. + const abi_size = ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 10); + + // LEA reg, [] + + // We encode the instruction FIRST because prefixes may or may not appear. + // After we encode the instruction, we will know that the displacement bytes + // for [] will be at self.code.items.len - 4. + encoder.rex(.{ + .w = true, // force 64 bit because loading an address (to the GOT) + .r = reg.isExtended(), + }); + encoder.opcode_1byte(0x8D); + encoder.modRm_RIPDisp32(reg.low_id()); + encoder.disp32(0); + // TODO we should come up with our own, backend independent relocation types // which each backend (Elf, MachO, etc.) would then translate into an actual // fixup when linking. 
if (self.bin_file.cast(link.File.MachO)) |macho_file| { try macho_file.pie_fixups.append(self.bin_file.allocator, .{ .target_addr = x, - .offset = self.code.items.len + 3, + .offset = self.code.items.len - 4, .size = 4, }); } else { return self.fail(src, "TODO implement genSetReg for PIE GOT indirection on this platform", .{}); } - try self.code.ensureCapacity(self.code.items.len + 7); - self.rex(.{ .w = reg.size() == 64, .r = reg.isExtended() }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0x8D, - 0x05 | (@as(u8, reg.id() & 0b111) << 3), - }); - mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), 0); - try self.code.ensureCapacity(self.code.items.len + 3); - self.rex(.{ .w = reg.size() == 64, .b = reg.isExtended(), .r = reg.isExtended() }); - const RM = (@as(u8, reg.id() & 0b111) << 3) | @truncate(u3, reg.id()); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8B, RM }); - } else if (x <= math.maxInt(u32)) { + // MOV reg, [reg] + encoder.rex(.{ + .w = abi_size == 8, + .r = reg.isExtended(), + .b = reg.isExtended(), + }); + encoder.opcode_1byte(0x8B); + encoder.modRm_indirectDisp0(reg.low_id(), reg.low_id()); + } else if (x <= math.maxInt(i32)) { // Moving from memory to a register is a variant of `8B /r`. // Since we're using 64-bit moves, we require a REX. // This variant also requires a SIB, as it would otherwise be RIP-relative. @@ -3608,14 +3923,18 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // The SIB must be 0x25, to indicate a disp32 with no scaled index. // 0b00RRR100, where RRR is the lower three bits of the register ID. // The instruction is thus eight bytes; REX 0x8B 0b00RRR100 0x25 followed by a four-byte disp32. 
- try self.code.ensureCapacity(self.code.items.len + 8); - self.rex(.{ .w = reg.size() == 64, .b = reg.isExtended() }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0x8B, - 0x04 | (@as(u8, reg.id() & 0b111) << 3), // R - 0x25, + const abi_size = ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 8); + encoder.rex(.{ + .w = abi_size == 8, + .r = reg.isExtended(), }); - mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), @intCast(u32, x)); + encoder.opcode_1byte(0x8B); + // effective address = [SIB] + encoder.modRm_SIBDisp0(reg.low_id()); + // SIB = disp32 + encoder.sib_disp32(); + encoder.disp32(@intCast(i32, x)); } else { // If this is RAX, we can use a direct load; otherwise, we need to load the address, then indirectly load // the value. @@ -3623,12 +3942,13 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // REX.W 0xA1 moffs64* // moffs64* is a 64-bit offset "relative to segment base", which really just means the // absolute address for all practical purposes. - try self.code.resize(self.code.items.len + 10); - // REX.W == 0x48 - self.code.items[self.code.items.len - 10] = 0x48; - self.code.items[self.code.items.len - 9] = 0xA1; - const imm_ptr = self.code.items[self.code.items.len - 8 ..][0..8]; - mem.writeIntLittle(u64, imm_ptr, x); + + const encoder = try X8664Encoder.init(self.code, 10); + encoder.rex(.{ + .w = true, + }); + encoder.opcode_1byte(0xA1); + encoder.writeIntLittle(u64, x); } else { // This requires two instructions; a move imm as used above, followed by an indirect load using the register // as the address and the register as the destination. @@ -3645,41 +3965,42 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // Now, the register contains the address of the value to load into it // Currently, we're only allowing 64-bit registers, so we need the `REX.W 8B /r` variant. // TODO: determine whether to allow other sized registers, and if so, handle them properly. 
- // This operation requires three bytes: REX 0x8B R/M - try self.code.ensureCapacity(self.code.items.len + 3); - // For this operation, we want R/M mode *zero* (use register indirectly), and the two register - // values must match. Thus, it's 00ABCABC where ABC is the lower three bits of the register ID. - // - // Furthermore, if this is an extended register, both B and R must be set in the REX byte, as *both* - // register operands need to be marked as extended. - self.rex(.{ .w = reg.size() == 64, .b = reg.isExtended(), .r = reg.isExtended() }); - const RM = (@as(u8, reg.id() & 0b111) << 3) | @truncate(u3, reg.id()); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8B, RM }); + + // mov reg, [reg] + const abi_size = ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 3); + encoder.rex(.{ + .w = abi_size == 8, + .r = reg.isExtended(), + .b = reg.isExtended(), + }); + encoder.opcode_1byte(0x8B); + encoder.modRm_indirectDisp0(reg.low_id(), reg.low_id()); } } }, .stack_offset => |unadjusted_off| { - try self.code.ensureCapacity(self.code.items.len + 7); - const size_bytes = @divExact(reg.size(), 8); - const off = unadjusted_off + size_bytes; - self.rex(.{ .w = reg.size() == 64, .r = reg.isExtended() }); - const reg_id: u8 = @truncate(u3, reg.id()); - if (off <= 128) { - // Example: 48 8b 4d 7f mov rcx,QWORD PTR [rbp+0x7f] - const RM = @as(u8, 0b01_000_101) | (reg_id << 3); - const negative_offset = @intCast(i8, -@intCast(i32, off)); - const twos_comp = @bitCast(u8, negative_offset); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8b, RM, twos_comp }); - } else if (off <= 2147483648) { - // Example: 48 8b 8d 80 00 00 00 mov rcx,QWORD PTR [rbp+0x80] - const RM = @as(u8, 0b10_000_101) | (reg_id << 3); - const negative_offset = @intCast(i32, -@intCast(i33, off)); - const twos_comp = @bitCast(u32, negative_offset); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8b, RM }); - mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), 
twos_comp); - } else { + const abi_size = ty.abiSize(self.target.*); + const off = unadjusted_off + abi_size; + if (off < std.math.minInt(i32) or off > std.math.maxInt(i32)) { return self.fail(src, "stack offset too large", .{}); } + const ioff = -@intCast(i32, off); + const encoder = try X8664Encoder.init(self.code, 3); + encoder.rex(.{ + .w = abi_size == 8, + .r = reg.isExtended(), + }); + encoder.opcode_1byte(0x8B); + if (std.math.minInt(i8) <= ioff and ioff <= std.math.maxInt(i8)) { + // Example: 48 8b 4d 7f mov rcx,QWORD PTR [rbp+0x7f] + encoder.modRm_indirectDisp8(reg.low_id(), Register.ebp.low_id()); + encoder.disp8(@intCast(i8, ioff)); + } else { + // Example: 48 8b 8d 80 00 00 00 mov rcx,QWORD PTR [rbp+0x80] + encoder.modRm_indirectDisp32(reg.low_id(), Register.ebp.low_id()); + encoder.disp32(ioff); + } }, }, else => return self.fail(src, "TODO implement getSetReg for {}", .{self.target.cpu.arch}), diff --git a/src/codegen/x86_64.zig b/src/codegen/x86_64.zig index dea39f82cd..dd0b74d46a 100644 --- a/src/codegen/x86_64.zig +++ b/src/codegen/x86_64.zig @@ -1,4 +1,9 @@ const std = @import("std"); +const testing = std.testing; +const mem = std.mem; +const assert = std.debug.assert; +const ArrayList = std.ArrayList; +const Allocator = std.mem.Allocator; const Type = @import("../Type.zig"); const DW = std.dwarf; @@ -68,6 +73,11 @@ pub const Register = enum(u8) { return @truncate(u4, @enumToInt(self)); } + /// Like id, but only returns the lower 3 bits. + pub fn low_id(self: Register) u3 { + return @truncate(u3, @enumToInt(self)); + } + /// Returns the index into `callee_preserved_regs`. 
pub fn allocIndex(self: Register) ?u4 { return switch (self) { @@ -136,6 +146,493 @@ pub const callee_preserved_regs = [_]Register{ .rax, .rcx, .rdx, .rsi, .rdi, .r8 pub const c_abi_int_param_regs = [_]Register{ .rdi, .rsi, .rdx, .rcx, .r8, .r9 }; pub const c_abi_int_return_regs = [_]Register{ .rax, .rdx }; +/// Encoding helper functions for x86_64 instructions +/// +/// Many of these helpers do very little, but they can help make things +/// slightly more readable with more descriptive field names / function names. +/// +/// Some of them also have asserts to ensure that we aren't doing dumb things. +/// For example, trying to use register 4 (esp) in an indirect modr/m byte is illegal, +/// you need to encode it with an SIB byte. +/// +/// Note that ALL of these helper functions will assume capacity, +/// so ensure that the `code` has sufficient capacity before using them. +/// The `init` method is the recommended way to ensure capacity. +pub const Encoder = struct { + /// Non-owning reference to the code array + code: *ArrayList(u8), + + const Self = @This(); + + /// Wrap `code` in Encoder to make it easier to call these helper functions + /// + /// maximum_inst_size should contain the maximum number of bytes + /// that the encoded instruction will take. + /// This is because the helper functions will assume capacity + /// in order to avoid bounds checking. 
+ pub fn init(code: *ArrayList(u8), maximum_inst_size: u8) !Self { + try code.ensureCapacity(code.items.len + maximum_inst_size); + return Self{ .code = code }; + } + + /// Directly write a number to the code array with big endianness + pub fn writeIntBig(self: Self, comptime T: type, value: T) void { + mem.writeIntBig( + T, + self.code.addManyAsArrayAssumeCapacity(@divExact(@typeInfo(T).Int.bits, 8)), + value, + ); + } + + /// Directly write a number to the code array with little endianness + pub fn writeIntLittle(self: Self, comptime T: type, value: T) void { + mem.writeIntLittle( + T, + self.code.addManyAsArrayAssumeCapacity(@divExact(@typeInfo(T).Int.bits, 8)), + value, + ); + } + + // -------- + // Prefixes + // -------- + + pub const LegacyPrefixes = packed struct { + /// LOCK + prefix_f0: bool = false, + /// REPNZ, REPNE, REP, Scalar Double-precision + prefix_f2: bool = false, + /// REPZ, REPE, REP, Scalar Single-precision + prefix_f3: bool = false, + + /// CS segment override or Branch not taken + prefix_2e: bool = false, + /// DS segment override + prefix_36: bool = false, + /// ES segment override + prefix_26: bool = false, + /// FS segment override + prefix_64: bool = false, + /// GS segment override + prefix_65: bool = false, + + /// Branch taken + prefix_3e: bool = false, + + /// Operand size override (enables 16 bit operation) + prefix_66: bool = false, + + /// Address size override (enables 16 bit address size) + prefix_67: bool = false, + + padding: u5 = 0, + }; + + /// Encodes legacy prefixes + pub fn legacyPrefixes(self: Self, prefixes: LegacyPrefixes) void { + if (@bitCast(u16, prefixes) != 0) { + // Hopefully this path isn't taken very often, so we'll do it the slow way for now + + // LOCK + if (prefixes.prefix_f0) self.code.appendAssumeCapacity(0xf0); + // REPNZ, REPNE, REP, Scalar Double-precision + if (prefixes.prefix_f2) self.code.appendAssumeCapacity(0xf2); + // REPZ, REPE, REP, Scalar Single-precision + if (prefixes.prefix_f3) 
self.code.appendAssumeCapacity(0xf3); + + // CS segment override or Branch not taken + if (prefixes.prefix_2e) self.code.appendAssumeCapacity(0x2e); + // DS segment override + if (prefixes.prefix_36) self.code.appendAssumeCapacity(0x36); + // ES segment override + if (prefixes.prefix_26) self.code.appendAssumeCapacity(0x26); + // FS segment override + if (prefixes.prefix_64) self.code.appendAssumeCapacity(0x64); + // GS segment override + if (prefixes.prefix_65) self.code.appendAssumeCapacity(0x65); + + // Branch taken + if (prefixes.prefix_3e) self.code.appendAssumeCapacity(0x3e); + + // Operand size override + if (prefixes.prefix_66) self.code.appendAssumeCapacity(0x66); + + // Address size override + if (prefixes.prefix_67) self.code.appendAssumeCapacity(0x67); + } + } + + /// Use 16 bit operand size + /// + /// Note that this flag is overridden by REX.W, if both are present. + pub fn prefix16BitMode(self: Self) void { + self.code.appendAssumeCapacity(0x66); + } + + /// From section 2.2.1.2 of the manual, REX is encoded as b0100WRXB + pub const Rex = struct { + /// Wide, enables 64-bit operation + w: bool = false, + /// Extends the reg field in the ModR/M byte + r: bool = false, + /// Extends the index field in the SIB byte + x: bool = false, + /// Extends the r/m field in the ModR/M byte, + /// or the base field in the SIB byte, + /// or the reg field in the Opcode byte + b: bool = false, + }; + + /// Encodes a REX prefix byte given all the fields + /// + /// Use this byte whenever you need 64 bit operation, + /// or one of reg, index, r/m, base, or opcode-reg might be extended. + /// + /// See struct `Rex` for a description of each field. + /// + /// Does not add a prefix byte if none of the fields are set! 
+ pub fn rex(self: Self, byte: Rex) void { + var value: u8 = 0b0100_0000; + + if (byte.w) value |= 0b1000; + if (byte.r) value |= 0b0100; + if (byte.x) value |= 0b0010; + if (byte.b) value |= 0b0001; + + if (value != 0b0100_0000) { + self.code.appendAssumeCapacity(value); + } + } + + // ------ + // Opcode + // ------ + + /// Encodes a 1 byte opcode + pub fn opcode_1byte(self: Self, opcode: u8) void { + self.code.appendAssumeCapacity(opcode); + } + + /// Encodes a 2 byte opcode + /// + /// e.g. IMUL has the opcode 0x0f 0xaf, so you use + /// + /// encoder.opcode_2byte(0x0f, 0xaf); + pub fn opcode_2byte(self: Self, prefix: u8, opcode: u8) void { + self.code.appendAssumeCapacity(prefix); + self.code.appendAssumeCapacity(opcode); + } + + /// Encodes a 1 byte opcode with a reg field + /// + /// Remember to add a REX prefix byte if reg is extended! + pub fn opcode_withReg(self: Self, opcode: u8, reg: u3) void { + assert(opcode & 0b111 == 0); + self.code.appendAssumeCapacity(opcode | reg); + } + + // ------ + // ModR/M + // ------ + + /// Construct a ModR/M byte given all the fields + /// + /// Remember to add a REX prefix byte if reg or rm are extended! + pub fn modRm(self: Self, mod: u2, reg_or_opx: u3, rm: u3) void { + self.code.appendAssumeCapacity( + @as(u8, mod) << 6 | @as(u8, reg_or_opx) << 3 | rm, + ); + } + + /// Construct a ModR/M byte using direct r/m addressing + /// r/m effective address: r/m + /// + /// Note reg's effective address is always just reg for the ModR/M byte. + /// Remember to add a REX prefix byte if reg or rm are extended! + pub fn modRm_direct(self: Self, reg_or_opx: u3, rm: u3) void { + self.modRm(0b11, reg_or_opx, rm); + } + + /// Construct a ModR/M byte using indirect r/m addressing + /// r/m effective address: [r/m] + /// + /// Note reg's effective address is always just reg for the ModR/M byte. + /// Remember to add a REX prefix byte if reg or rm are extended! 
+ pub fn modRm_indirectDisp0(self: Self, reg_or_opx: u3, rm: u3) void { + assert(rm != 4 and rm != 5); + self.modRm(0b00, reg_or_opx, rm); + } + + /// Construct a ModR/M byte using indirect SIB addressing + /// r/m effective address: [SIB] + /// + /// Note reg's effective address is always just reg for the ModR/M byte. + /// Remember to add a REX prefix byte if reg or rm are extended! + pub fn modRm_SIBDisp0(self: Self, reg_or_opx: u3) void { + self.modRm(0b00, reg_or_opx, 0b100); + } + + /// Construct a ModR/M byte using RIP-relative addressing + /// r/m effective address: [RIP + disp32] + /// + /// Note reg's effective address is always just reg for the ModR/M byte. + /// Remember to add a REX prefix byte if reg or rm are extended! + pub fn modRm_RIPDisp32(self: Self, reg_or_opx: u3) void { + self.modRm(0b00, reg_or_opx, 0b101); + } + + /// Construct a ModR/M byte using indirect r/m with a 8bit displacement + /// r/m effective address: [r/m + disp8] + /// + /// Note reg's effective address is always just reg for the ModR/M byte. + /// Remember to add a REX prefix byte if reg or rm are extended! + pub fn modRm_indirectDisp8(self: Self, reg_or_opx: u3, rm: u3) void { + assert(rm != 4); + self.modRm(0b01, reg_or_opx, rm); + } + + /// Construct a ModR/M byte using indirect SIB with a 8bit displacement + /// r/m effective address: [SIB + disp8] + /// + /// Note reg's effective address is always just reg for the ModR/M byte. + /// Remember to add a REX prefix byte if reg or rm are extended! + pub fn modRm_SIBDisp8(self: Self, reg_or_opx: u3) void { + self.modRm(0b01, reg_or_opx, 0b100); + } + + /// Construct a ModR/M byte using indirect r/m with a 32bit displacement + /// r/m effective address: [r/m + disp32] + /// + /// Note reg's effective address is always just reg for the ModR/M byte. + /// Remember to add a REX prefix byte if reg or rm are extended! 
+ pub fn modRm_indirectDisp32(self: Self, reg_or_opx: u3, rm: u3) void { + assert(rm != 4); + self.modRm(0b10, reg_or_opx, rm); + } + + /// Construct a ModR/M byte using indirect SIB with a 32bit displacement + /// r/m effective address: [SIB + disp32] + /// + /// Note reg's effective address is always just reg for the ModR/M byte. + /// Remember to add a REX prefix byte if reg or rm are extended! + pub fn modRm_SIBDisp32(self: Self, reg_or_opx: u3) void { + self.modRm(0b10, reg_or_opx, 0b100); + } + + // --- + // SIB + // --- + + /// Construct a SIB byte given all the fields + /// + /// Remember to add a REX prefix byte if index or base are extended! + pub fn sib(self: Self, scale: u2, index: u3, base: u3) void { + self.code.appendAssumeCapacity( + @as(u8, scale) << 6 | @as(u8, index) << 3 | base, + ); + } + + /// Construct a SIB byte with scale * index + base, no frills. + /// r/m effective address: [base + scale * index] + /// + /// Remember to add a REX prefix byte if index or base are extended! + pub fn sib_scaleIndexBase(self: Self, scale: u2, index: u3, base: u3) void { + assert(base != 5); + + self.sib(scale, index, base); + } + + /// Construct a SIB byte with scale * index + disp32 + /// r/m effective address: [scale * index + disp32] + /// + /// Remember to add a REX prefix byte if index or base are extended! + pub fn sib_scaleIndexDisp32(self: Self, scale: u2, index: u3) void { + assert(index != 4); + + // scale is actually ignored + // index = 4 means no index + // base = 5 means no base, if mod == 0. + self.sib(scale, index, 5); + } + + /// Construct a SIB byte with just base + /// r/m effective address: [base] + /// + /// Remember to add a REX prefix byte if index or base are extended! 
+ pub fn sib_base(self: Self, base: u3) void { + assert(base != 5); + + // scale is actually ignored + // index = 4 means no index + self.sib(0, 4, base); + } + + /// Construct a SIB byte with just disp32 + /// r/m effective address: [disp32] + /// + /// Remember to add a REX prefix byte if index or base are extended! + pub fn sib_disp32(self: Self) void { + // scale is actually ignored + // index = 4 means no index + // base = 5 means no base, if mod == 0. + self.sib(0, 4, 5); + } + + /// Construct a SIB byte with scale * index + base + disp8 + /// r/m effective address: [base + scale * index + disp8] + /// + /// Remember to add a REX prefix byte if index or base are extended! + pub fn sib_scaleIndexBaseDisp8(self: Self, scale: u2, index: u3, base: u3) void { + self.sib(scale, index, base); + } + + /// Construct a SIB byte with base + disp8, no index + /// r/m effective address: [base + disp8] + /// + /// Remember to add a REX prefix byte if index or base are extended! + pub fn sib_baseDisp8(self: Self, base: u3) void { + // scale is ignored + // index = 4 means no index + self.sib(0, 4, base); + } + + /// Construct a SIB byte with scale * index + base + disp32 + /// r/m effective address: [base + scale * index + disp32] + /// + /// Remember to add a REX prefix byte if index or base are extended! + pub fn sib_scaleIndexBaseDisp32(self: Self, scale: u2, index: u3, base: u3) void { + self.sib(scale, index, base); + } + + /// Construct a SIB byte with base + disp32, no index + /// r/m effective address: [base + disp32] + /// + /// Remember to add a REX prefix byte if index or base are extended! + pub fn sib_baseDisp32(self: Self, base: u3) void { + // scale is ignored + // index = 4 means no index + self.sib(0, 4, base); + } + + // ------------------------- + // Trivial (no bit fiddling) + // ------------------------- + + /// Encode an 8 bit immediate + /// + /// It is sign-extended to 64 bits by the cpu. 
+    pub fn imm8(self: Self, imm: i8) void {
+        self.code.appendAssumeCapacity(@bitCast(u8, imm));
+    }
+
+    /// Encode an 8 bit displacement
+    ///
+    /// It is sign-extended to 64 bits by the cpu.
+    pub fn disp8(self: Self, disp: i8) void {
+        self.code.appendAssumeCapacity(@bitCast(u8, disp));
+    }
+
+    /// Encode a 16 bit immediate
+    ///
+    /// Note: not extended to 64 bits; a 16 bit immediate is used with 16 bit operand sizes.
+    pub fn imm16(self: Self, imm: i16) void {
+        self.writeIntLittle(i16, imm);
+    }
+
+    /// Encode a 32 bit immediate
+    ///
+    /// It is sign-extended to 64 bits by the cpu.
+    pub fn imm32(self: Self, imm: i32) void {
+        self.writeIntLittle(i32, imm);
+    }
+
+    /// Encode a 32 bit displacement
+    ///
+    /// It is sign-extended to 64 bits by the cpu.
+    pub fn disp32(self: Self, disp: i32) void {
+        self.writeIntLittle(i32, disp);
+    }
+
+    /// Encode a 64 bit immediate
+    ///
+    /// Note: all 64 bits are encoded as-is; no sign-extension is performed.
+    pub fn imm64(self: Self, imm: u64) void {
+        self.writeIntLittle(u64, imm);
+    }
+};
+
+test "x86_64 Encoder helpers" {
+    var code = ArrayList(u8).init(testing.allocator);
+    defer code.deinit();
+
+    // simple integer multiplication
+
+    // imul eax,edi
+    // 0faf c7
+    {
+        try code.resize(0);
+        const encoder = try Encoder.init(&code, 4);
+        encoder.rex(.{
+            .r = Register.eax.isExtended(),
+            .b = Register.edi.isExtended(),
+        });
+        encoder.opcode_2byte(0x0f, 0xaf);
+        encoder.modRm_direct(
+            Register.eax.low_id(),
+            Register.edi.low_id(),
+        );
+
+        testing.expectEqualSlices(u8, &[_]u8{ 0x0f, 0xaf, 0xc7 }, code.items);
+    }
+
+    // simple mov
+
+    // mov eax,edi
+    // 89 f8
+    {
+        try code.resize(0);
+        const encoder = try Encoder.init(&code, 3);
+        encoder.rex(.{
+            .r = Register.edi.isExtended(),
+            .b = Register.eax.isExtended(),
+        });
+        encoder.opcode_1byte(0x89);
+        encoder.modRm_direct(
+            Register.edi.low_id(),
+            Register.eax.low_id(),
+        );
+
+        testing.expectEqualSlices(u8, &[_]u8{ 0x89, 0xf8 }, code.items);
+    }
+
+    // signed integer addition of 32-bit sign extended immediate to 64 bit 
register + + // add rcx, 2147483647 + // + // Using the following opcode: REX.W + 81 /0 id, we expect the following encoding + // + // 48 : REX.W set for 64 bit operand (*r*cx) + // 81 : opcode for " with immediate" + // c1 : id = rcx, + // : c1 = 11 <-- mod = 11 indicates r/m is register (rcx) + // : 000 <-- opcode_extension = 0 because opcode extension is /0. /0 specifies ADD + // : 001 <-- 001 is rcx + // ffffff7f : 2147483647 + { + try code.resize(0); + const encoder = try Encoder.init(&code, 7); + encoder.rex(.{ .w = true }); // use 64 bit operation + encoder.opcode_1byte(0x81); + encoder.modRm_direct( + 0, + Register.rcx.low_id(), + ); + encoder.imm32(2147483647); + + testing.expectEqualSlices(u8, &[_]u8{ 0x48, 0x81, 0xc1, 0xff, 0xff, 0xff, 0x7f }, code.items); + } +} + // TODO add these registers to the enum and populate dwarfLocOp // // Return Address register. This is stored in `0(%rsp, "")` and is not a physical register. // RA = (16, "RA"), diff --git a/test/stage2/test.zig b/test/stage2/test.zig index c8b9b0cc96..3f5063ba3b 100644 --- a/test/stage2/test.zig +++ b/test/stage2/test.zig @@ -318,6 +318,81 @@ pub fn addCases(ctx: *TestContext) !void { , &[_][]const u8{":2:15: error: incompatible types: 'bool' and 'comptime_int'"}); } + { + var case = ctx.exe("multiplying numbers at runtime and comptime", linux_x64); + case.addCompareOutput( + \\export fn _start() noreturn { + \\ mul(3, 4); + \\ + \\ exit(); + \\} + \\ + \\fn mul(a: u32, b: u32) void { + \\ if (a * b != 12) unreachable; + \\} + \\ + \\fn exit() noreturn { + \\ asm volatile ("syscall" + \\ : + \\ : [number] "{rax}" (231), + \\ [arg1] "{rdi}" (0) + \\ : "rcx", "r11", "memory" + \\ ); + \\ unreachable; + \\} + , + "", + ); + // comptime function call + case.addCompareOutput( + \\export fn _start() noreturn { + \\ exit(); + \\} + \\ + \\fn mul(a: u32, b: u32) u32 { + \\ return a * b; + \\} + \\ + \\const x = mul(3, 4); + \\ + \\fn exit() noreturn { + \\ asm volatile ("syscall" + \\ : + \\ : 
[number] "{rax}" (231), + \\ [arg1] "{rdi}" (x - 12) + \\ : "rcx", "r11", "memory" + \\ ); + \\ unreachable; + \\} + , + "", + ); + // Inline function call + case.addCompareOutput( + \\export fn _start() noreturn { + \\ var x: usize = 5; + \\ const y = mul(2, 3, x); + \\ exit(y - 30); + \\} + \\ + \\fn mul(a: usize, b: usize, c: usize) callconv(.Inline) usize { + \\ return a * b * c; + \\} + \\ + \\fn exit(code: usize) noreturn { + \\ asm volatile ("syscall" + \\ : + \\ : [number] "{rax}" (231), + \\ [arg1] "{rdi}" (code) + \\ : "rcx", "r11", "memory" + \\ ); + \\ unreachable; + \\} + , + "", + ); + } + { var case = ctx.exe("assert function", linux_x64); case.addCompareOutput( @@ -700,7 +775,8 @@ pub fn addCases(ctx: *TestContext) !void { // Spilling registers to the stack. case.addCompareOutput( \\export fn _start() noreturn { - \\ assert(add(3, 4) == 791); + \\ assert(add(3, 4) == 1221); + \\ assert(mul(3, 4) == 21609); \\ \\ exit(); \\} @@ -716,19 +792,47 @@ pub fn addCases(ctx: *TestContext) !void { \\ const i = g + h; // 100 \\ const j = i + d; // 110 \\ const k = i + j; // 210 - \\ const l = k + c; // 217 - \\ const m = l + d; // 227 - \\ const n = m + e; // 241 - \\ const o = n + f; // 265 - \\ const p = o + g; // 303 - \\ const q = p + h; // 365 - \\ const r = q + i; // 465 - \\ const s = r + j; // 575 - \\ const t = s + k; // 785 - \\ break :blk t; + \\ const l = j + k; // 320 + \\ const m = l + c; // 327 + \\ const n = m + d; // 337 + \\ const o = n + e; // 351 + \\ const p = o + f; // 375 + \\ const q = p + g; // 413 + \\ const r = q + h; // 475 + \\ const s = r + i; // 575 + \\ const t = s + j; // 685 + \\ const u = t + k; // 895 + \\ const v = u + l; // 1215 + \\ break :blk v; \\ }; - \\ const y = x + a; // 788 - \\ const z = y + a; // 791 + \\ const y = x + a; // 1218 + \\ const z = y + a; // 1221 + \\ return z; + \\} + \\ + \\fn mul(a: u32, b: u32) u32 { + \\ const x: u32 = blk: { + \\ const c = a * a * a * a; // 81 + \\ const d = a * a * a * b; // 
108 + \\ const e = a * a * b * a; // 108 + \\ const f = a * a * b * b; // 144 + \\ const g = a * b * a * a; // 108 + \\ const h = a * b * a * b; // 144 + \\ const i = a * b * b * a; // 144 + \\ const j = a * b * b * b; // 192 + \\ const k = b * a * a * a; // 108 + \\ const l = b * a * a * b; // 144 + \\ const m = b * a * b * a; // 144 + \\ const n = b * a * b * b; // 192 + \\ const o = b * b * a * a; // 144 + \\ const p = b * b * a * b; // 192 + \\ const q = b * b * b * a; // 192 + \\ const r = b * b * b * b; // 256 + \\ const s = c + d + e + f + g + h + i + j + k + l + m + n + o + p + q + r; // 2401 + \\ break :blk s; + \\ }; + \\ const y = x * a; // 7203 + \\ const z = y * a; // 21609 \\ return z; \\} \\