From cea9ac772a518ff249d47fc2cb7b2776c786ac07 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Mon, 15 May 2023 02:55:41 -0400 Subject: [PATCH] x86_64: implement integer vector min/max --- src/arch/x86_64/CodeGen.zig | 104 ++++++++++++++++++++++++++++++++++ src/arch/x86_64/Encoding.zig | 4 ++ src/arch/x86_64/Mir.zig | 8 +++ src/arch/x86_64/encodings.zig | 58 ++++++++++++++++++++ 4 files changed, 174 insertions(+) diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index ed2c596f8f..2cd5721258 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -6534,6 +6534,34 @@ fn genBinOp( .bit_and => if (self.hasFeature(.avx)) .{ .vp_, .@"and" } else .{ .p_, .@"and" }, .bit_or => if (self.hasFeature(.avx)) .{ .vp_, .@"or" } else .{ .p_, .@"or" }, .xor => if (self.hasFeature(.avx)) .{ .vp_, .xor } else .{ .p_, .xor }, + .min => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx)) + .{ .vp_b, .mins } + else if (self.hasFeature(.sse4_1)) + .{ .p_b, .mins } + else + null, + .unsigned => if (self.hasFeature(.avx)) + .{ .vp_b, .minu } + else if (self.hasFeature(.sse4_1)) + .{ .p_b, .minu } + else + null, + }, + .max => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx)) + .{ .vp_b, .maxs } + else if (self.hasFeature(.sse4_1)) + .{ .p_b, .maxs } + else + null, + .unsigned => if (self.hasFeature(.avx)) + .{ .vp_b, .maxu } + else if (self.hasFeature(.sse4_1)) + .{ .p_b, .maxu } + else + null, + }, else => null, }, 17...32 => switch (air_tag) { @@ -6546,6 +6574,14 @@ fn genBinOp( .bit_and => if (self.hasFeature(.avx2)) .{ .vp_, .@"and" } else null, .bit_or => if (self.hasFeature(.avx2)) .{ .vp_, .@"or" } else null, .xor => if (self.hasFeature(.avx2)) .{ .vp_, .xor } else null, + .min => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx2)) .{ .vp_b, .mins } else null, + .unsigned => if
(self.hasFeature(.avx2)) .{ .vp_b, .minu } else null, + }, + .max => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx2)) .{ .vp_b, .maxs } else null, + .unsigned => if (self.hasFeature(.avx2)) .{ .vp_b, .maxu } else null, + }, else => null, }, else => null, @@ -6564,6 +6600,30 @@ fn genBinOp( .bit_and => if (self.hasFeature(.avx)) .{ .vp_, .@"and" } else .{ .p_, .@"and" }, .bit_or => if (self.hasFeature(.avx)) .{ .vp_, .@"or" } else .{ .p_, .@"or" }, .xor => if (self.hasFeature(.avx)) .{ .vp_, .xor } else .{ .p_, .xor }, + .min => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx)) + .{ .vp_w, .mins } + else + .{ .p_w, .mins }, + .unsigned => if (self.hasFeature(.avx)) + .{ .vp_w, .minu } + else if (self.hasFeature(.sse4_1)) + .{ .p_w, .minu } + else + null, + }, + .max => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx)) + .{ .vp_w, .maxs } + else + .{ .p_w, .maxs }, + .unsigned => if (self.hasFeature(.avx)) + .{ .vp_w, .maxu } + else if (self.hasFeature(.sse4_1)) + .{ .p_w, .maxu } + else + null, + }, else => null, }, 9...16 => switch (air_tag) { @@ -6579,6 +6639,14 @@ fn genBinOp( .bit_and => if (self.hasFeature(.avx2)) .{ .vp_, .@"and" } else null, .bit_or => if (self.hasFeature(.avx2)) .{ .vp_, .@"or" } else null, .xor => if (self.hasFeature(.avx2)) .{ .vp_, .xor } else null, + .min => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx2)) .{ .vp_w, .mins } else null, + .unsigned => if (self.hasFeature(.avx2)) .{ .vp_w, .minu } else null, + }, + .max => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx2)) .{ .vp_w, .maxs } else null, + .unsigned => if (self.hasFeature(.avx2)) .{ .vp_w, .maxu } else null, + }, else => null, }, else => null, @@ -6602,6 +6670,34 @@ fn genBinOp( .bit_and => if (self.hasFeature(.avx)) .{ .vp_, .@"and" } else .{ .p_, .@"and" }, .bit_or => if (self.hasFeature(.avx))
.{ .vp_, .@"or" } else .{ .p_, .@"or" }, .xor => if (self.hasFeature(.avx)) .{ .vp_, .xor } else .{ .p_, .xor }, + .min => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx)) + .{ .vp_d, .mins } + else if (self.hasFeature(.sse4_1)) + .{ .p_d, .mins } + else + null, + .unsigned => if (self.hasFeature(.avx)) + .{ .vp_d, .minu } + else if (self.hasFeature(.sse4_1)) + .{ .p_d, .minu } + else + null, + }, + .max => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx)) + .{ .vp_d, .maxs } + else if (self.hasFeature(.sse4_1)) + .{ .p_d, .maxs } + else + null, + .unsigned => if (self.hasFeature(.avx)) + .{ .vp_d, .maxu } + else if (self.hasFeature(.sse4_1)) + .{ .p_d, .maxu } + else + null, + }, else => null, }, 5...8 => switch (air_tag) { @@ -6617,6 +6713,14 @@ fn genBinOp( .bit_and => if (self.hasFeature(.avx2)) .{ .vp_, .@"and" } else null, .bit_or => if (self.hasFeature(.avx2)) .{ .vp_, .@"or" } else null, .xor => if (self.hasFeature(.avx2)) .{ .vp_, .xor } else null, + .min => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx2)) .{ .vp_d, .mins } else null, + .unsigned => if (self.hasFeature(.avx2)) .{ .vp_d, .minu } else null, + }, + .max => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx2)) .{ .vp_d, .maxs } else null, + .unsigned => if (self.hasFeature(.avx2)) .{ .vp_d, .maxu } else null, + }, else => null, }, else => null, diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 7b029cdb4f..52d010880e 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -280,6 +280,7 @@ pub const Mnemonic = enum { mulps, mulss, orps, pextrw, pinsrw, + pmaxsw, pmaxub, pminsw, pminub, shufps, sqrtps, sqrtss, subps, subss, @@ -318,6 +319,7 @@ pub const Mnemonic = enum { insertps, pextrb, pextrd, pextrq, pinsrb, pinsrd, pinsrq, +
pmaxsb, pmaxsd, pmaxud, pmaxuw, pminsb, pminsd, pminud, pminuw, pmulld, roundpd, roundps, roundsd, roundss, // AVX @@ -349,6 +351,8 @@ pub const Mnemonic = enum { vpand, vpandn, vpextrb, vpextrd, vpextrq, vpextrw, vpinsrb, vpinsrd, vpinsrq, vpinsrw, + vpmaxsb, vpmaxsd, vpmaxsw, vpmaxub, vpmaxud, vpmaxuw, + vpminsb, vpminsd, vpminsw, vpminub, vpminud, vpminuw, vpmulhw, vpmulld, vpmullw, vpor, vpshufhw, vpshuflw, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index a18792e6aa..4483de858e 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -453,6 +453,14 @@ pub const Inst = struct { /// Bitwise logical and not of packed single-precision floating-point values /// Bitwise logical and not of packed double-precision floating-point values andn, + /// Maximum of packed signed integers + maxs, + /// Maximum of packed unsigned integers + maxu, + /// Minimum of packed signed integers + mins, + /// Minimum of packed unsigned integers + minu, /// Multiply packed signed integers and store low result mull, /// Multiply packed signed integers and store high result diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 86a79596cd..c326f4230a 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -1011,6 +1011,14 @@ pub const table = [_]Entry{ .{ .pinsrw, .rmi, &.{ .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .none, .sse2 }, + .{ .pmaxsw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xee }, 0, .none, .sse2 }, + + .{ .pmaxub, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xde }, 0, .none, .sse2 }, + + .{ .pminsw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xea }, 0, .none, .sse2 }, + + .{ .pminub, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xda }, 0, .none, .sse2 }, + .{ .pmulhw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .none, .sse2 }, .{ .pmullw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd5 }, 0, .none, .sse2 }, @@ -1091,6 +1099,20 @@ pub const table = [_]Entry{ .{ .pinsrd,
.rmi, &.{ .xmm, .rm32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .none, .sse4_1 }, .{ .pinsrq, .rmi, &.{ .xmm, .rm64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .long, .sse4_1 }, + .{ .pmaxsb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3c }, 0, .none, .sse4_1 }, + .{ .pmaxsd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3d }, 0, .none, .sse4_1 }, + + .{ .pmaxuw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3e }, 0, .none, .sse4_1 }, + + .{ .pmaxud, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3f }, 0, .none, .sse4_1 }, + + .{ .pminsb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x38 }, 0, .none, .sse4_1 }, + .{ .pminsd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x39 }, 0, .none, .sse4_1 }, + + .{ .pminuw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3a }, 0, .none, .sse4_1 }, + + .{ .pminud, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3b }, 0, .none, .sse4_1 }, + .{ .pmulld, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .none, .sse4_1 }, .{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 }, @@ -1318,6 +1340,24 @@ pub const table = [_]Entry{ .{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_wig, .avx }, + .{ .vpmaxsb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3c }, 0, .vex_128_wig, .avx }, + .{ .vpmaxsw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xee }, 0, .vex_128_wig, .avx }, + .{ .vpmaxsd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3d }, 0, .vex_128_wig, .avx }, + + .{ .vpmaxub, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xde }, 0, .vex_128_wig, .avx }, + .{ .vpmaxuw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3e }, 0, .vex_128_wig, .avx }, + + .{ .vpmaxud, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3f }, 0, .vex_128_wig, .avx }, + + .{ .vpminsb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x38 }, 0,
.vex_128_wig, .avx }, + .{ .vpminsw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xea }, 0, .vex_128_wig, .avx }, + .{ .vpminsd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x39 }, 0, .vex_128_wig, .avx }, + + .{ .vpminub, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xda }, 0, .vex_128_wig, .avx }, + .{ .vpminuw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3a }, 0, .vex_128_wig, .avx }, + + .{ .vpminud, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3b }, 0, .vex_128_wig, .avx }, + .{ .vpmulhw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .vex_128_wig, .avx }, .{ .vpmulld, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_128_wig, .avx }, @@ -1449,6 +1489,24 @@ pub const table = [_]Entry{ .{ .vpandn, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_256_wig, .avx2 }, + .{ .vpmaxsb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3c }, 0, .vex_256_wig, .avx2 }, + .{ .vpmaxsw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xee }, 0, .vex_256_wig, .avx2 }, + .{ .vpmaxsd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3d }, 0, .vex_256_wig, .avx2 }, + + .{ .vpmaxub, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xde }, 0, .vex_256_wig, .avx2 }, + .{ .vpmaxuw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3e }, 0, .vex_256_wig, .avx2 }, + + .{ .vpmaxud, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3f }, 0, .vex_256_wig, .avx2 }, + + .{ .vpminsb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x38 }, 0, .vex_256_wig, .avx2 }, + .{ .vpminsw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xea }, 0, .vex_256_wig, .avx2 }, + .{ .vpminsd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x39 }, 0, .vex_256_wig, .avx2 }, + + .{ .vpminub, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xda }, 0, .vex_256_wig, .avx2 }, + .{ .vpminuw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3a
}, 0, .vex_256_wig, .avx2 }, + + .{ .vpminud, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3b }, 0, .vex_256_wig, .avx2 }, + .{ .vpmulhw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .vex_256_wig, .avx }, .{ .vpmulld, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_256_wig, .avx },