compiler_rt: Implement floatXiYf/fixXfYi, incl f80
This change:
- Adds generic implementation of the float -> integer conversion
functions floatXiYf, including support for f80
- Updates the existing implementation of integer -> float conversion
fixXiYf to support f16 and f80
- Fixes the handling of the explicit integer bit in `__trunctfxf2`
- Combines the test cases for fixXfYi/floatXiYf into a single file
- Renames `fmodl` to `fmodq`, since it operates on 128-bit floats
The new implementation for floatXiYf has been benchmarked, and generally
provides equal or better performance versus the current implementations:
Throughput (MiB/s) - Before
| u32 | i32 | u64 | i64 | u128 | i128 |
-----|----------|----------|----------|----------|----------|----------|
f16 | none | none | none | none | none | none |
f32 | 2231.67 | 2001.19 | 1745.66 | 1405.77 | 2173.99 | 1874.63 |
f64 | 1407.17 | 1055.83 | 2911.68 | 2437.21 | 1676.05 | 1476.67 |
f80 | none | none | none | none | none | none |
f128 | 327.56 | 321.25 | 645.92 | 654.52 | 1153.56 | 1096.27 |
Throughput (MiB/s) - After
| u32 | i32 | u64 | i64 | u128 | i128 |
-----|----------|----------|----------|----------|----------|----------|
f16 | 1407.61 | 1637.25 | 3555.03 | 2594.56 | 3680.60 | 3063.34 |
f32 | 2101.36 | 2122.62 | 3225.46 | 3123.86 | 2860.05 | 1985.21 |
f64 | 1395.57 | 1314.87 | 2409.24 | 2196.30 | 2384.95 | 1908.15 |
f80 | 475.53 | 457.92 | 884.50 | 812.12 | 1475.27 | 1382.16 |
f128 | 359.60 | 350.91 | 723.08 | 706.80 | 1296.42 | 1198.87 |
This commit is contained in:
126
lib/std/special/compiler_rt/floatfmodq.zig
Normal file
126
lib/std/special/compiler_rt/floatfmodq.zig
Normal file
@@ -0,0 +1,126 @@
|
||||
const builtin = @import("builtin");
|
||||
const std = @import("std");
|
||||
|
||||
// fmodq - floating modulo large, returns the remainder of division for f128 types
|
||||
// Logic and flow heavily inspired by MUSL fmodl for 113 mantissa digits
|
||||
pub fn fmodq(a: f128, b: f128) callconv(.C) f128 {
|
||||
@setRuntimeSafety(builtin.is_test);
|
||||
var amod = a;
|
||||
var bmod = b;
|
||||
const aPtr_u64 = @ptrCast([*]u64, &amod);
|
||||
const bPtr_u64 = @ptrCast([*]u64, &bmod);
|
||||
const aPtr_u16 = @ptrCast([*]u16, &amod);
|
||||
const bPtr_u16 = @ptrCast([*]u16, &bmod);
|
||||
|
||||
const exp_and_sign_index = comptime switch (builtin.target.cpu.arch.endian()) {
|
||||
.Little => 7,
|
||||
.Big => 0,
|
||||
};
|
||||
const low_index = comptime switch (builtin.target.cpu.arch.endian()) {
|
||||
.Little => 0,
|
||||
.Big => 1,
|
||||
};
|
||||
const high_index = comptime switch (builtin.target.cpu.arch.endian()) {
|
||||
.Little => 1,
|
||||
.Big => 0,
|
||||
};
|
||||
|
||||
const signA = aPtr_u16[exp_and_sign_index] & 0x8000;
|
||||
var expA = @intCast(i32, (aPtr_u16[exp_and_sign_index] & 0x7fff));
|
||||
var expB = bPtr_u16[exp_and_sign_index] & 0x7fff;
|
||||
|
||||
// There are 3 cases where the answer is undefined, check for:
|
||||
// - fmodq(val, 0)
|
||||
// - fmodq(val, NaN)
|
||||
// - fmodq(inf, val)
|
||||
// The sign on checked values does not matter.
|
||||
// Doing (a * b) / (a * b) procudes undefined results
|
||||
// because the three cases always produce undefined calculations:
|
||||
// - 0 / 0
|
||||
// - val * NaN
|
||||
// - inf / inf
|
||||
if (b == 0 or std.math.isNan(b) or expA == 0x7fff) {
|
||||
return (a * b) / (a * b);
|
||||
}
|
||||
|
||||
// Remove the sign from both
|
||||
aPtr_u16[exp_and_sign_index] = @bitCast(u16, @intCast(i16, expA));
|
||||
bPtr_u16[exp_and_sign_index] = @bitCast(u16, @intCast(i16, expB));
|
||||
if (amod <= bmod) {
|
||||
if (amod == bmod) {
|
||||
return 0 * a;
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
if (expA == 0) {
|
||||
amod *= 0x1p120;
|
||||
expA = aPtr_u16[exp_and_sign_index] -% 120;
|
||||
}
|
||||
|
||||
if (expB == 0) {
|
||||
bmod *= 0x1p120;
|
||||
expB = bPtr_u16[exp_and_sign_index] -% 120;
|
||||
}
|
||||
|
||||
// OR in extra non-stored mantissa digit
|
||||
var highA: u64 = (aPtr_u64[high_index] & (std.math.maxInt(u64) >> 16)) | 1 << 48;
|
||||
var highB: u64 = (bPtr_u64[high_index] & (std.math.maxInt(u64) >> 16)) | 1 << 48;
|
||||
var lowA: u64 = aPtr_u64[low_index];
|
||||
var lowB: u64 = bPtr_u64[low_index];
|
||||
|
||||
while (expA > expB) : (expA -= 1) {
|
||||
var high = highA -% highB;
|
||||
var low = lowA -% lowB;
|
||||
if (lowA < lowB) {
|
||||
high = highA -% 1;
|
||||
}
|
||||
if (high >> 63 == 0) {
|
||||
if ((high | low) == 0) {
|
||||
return 0 * a;
|
||||
}
|
||||
highA = 2 *% high + (low >> 63);
|
||||
lowA = 2 *% low;
|
||||
} else {
|
||||
highA = 2 *% highA + (lowA >> 63);
|
||||
lowA = 2 *% lowA;
|
||||
}
|
||||
}
|
||||
|
||||
var high = highA -% highB;
|
||||
var low = lowA -% lowB;
|
||||
if (lowA < lowB) {
|
||||
high -= 1;
|
||||
}
|
||||
if (high >> 63 == 0) {
|
||||
if ((high | low) == 0) {
|
||||
return 0 * a;
|
||||
}
|
||||
highA = high;
|
||||
lowA = low;
|
||||
}
|
||||
|
||||
while (highA >> 48 == 0) {
|
||||
highA = 2 *% highA + (lowA >> 63);
|
||||
lowA = 2 *% lowA;
|
||||
expA = expA - 1;
|
||||
}
|
||||
|
||||
// Overwrite the current amod with the values in highA and lowA
|
||||
aPtr_u64[high_index] = highA;
|
||||
aPtr_u64[low_index] = lowA;
|
||||
|
||||
// Combine the exponent with the sign, normalize if happend to be denormalized
|
||||
if (expA <= 0) {
|
||||
aPtr_u16[exp_and_sign_index] = @truncate(u16, @bitCast(u32, (expA +% 120))) | signA;
|
||||
amod *= 0x1p-120;
|
||||
} else {
|
||||
aPtr_u16[exp_and_sign_index] = @truncate(u16, @bitCast(u32, expA)) | signA;
|
||||
}
|
||||
|
||||
return amod;
|
||||
}
|
||||
|
||||
test {
|
||||
_ = @import("floatfmodq_test.zig");
|
||||
}
|
||||
Reference in New Issue
Block a user