divxf3.zig (8669B) - Raw
1 const std = @import("std"); 2 const builtin = @import("builtin"); 3 const arch = builtin.cpu.arch; 4 5 const common = @import("common.zig"); 6 const normalize = common.normalize; 7 const wideMultiply = common.wideMultiply; 8 9 pub const panic = common.panic; 10 11 comptime { 12 @export(&__divxf3, .{ .name = "__divxf3", .linkage = common.linkage, .visibility = common.visibility }); 13 } 14 15 pub fn __divxf3(a: f80, b: f80) callconv(.c) f80 { 16 const T = f80; 17 const Z = std.meta.Int(.unsigned, @bitSizeOf(T)); 18 19 const significandBits = std.math.floatMantissaBits(T); 20 const fractionalBits = std.math.floatFractionalBits(T); 21 const exponentBits = std.math.floatExponentBits(T); 22 23 const signBit = (@as(Z, 1) << (significandBits + exponentBits)); 24 const maxExponent = ((1 << exponentBits) - 1); 25 const exponentBias = (maxExponent >> 1); 26 27 const integerBit = (@as(Z, 1) << fractionalBits); 28 const quietBit = integerBit >> 1; 29 const significandMask = (@as(Z, 1) << significandBits) - 1; 30 31 const absMask = signBit - 1; 32 const qnanRep = @as(Z, @bitCast(std.math.nan(T))) | quietBit; 33 const infRep: Z = @bitCast(std.math.inf(T)); 34 35 const aExponent: u32 = @truncate((@as(Z, @bitCast(a)) >> significandBits) & maxExponent); 36 const bExponent: u32 = @truncate((@as(Z, @bitCast(b)) >> significandBits) & maxExponent); 37 const quotientSign: Z = (@as(Z, @bitCast(a)) ^ @as(Z, @bitCast(b))) & signBit; 38 39 var aSignificand: Z = @as(Z, @bitCast(a)) & significandMask; 40 var bSignificand: Z = @as(Z, @bitCast(b)) & significandMask; 41 var scale: i32 = 0; 42 43 // Detect if a or b is zero, denormal, infinity, or NaN. 44 if (aExponent -% 1 >= maxExponent - 1 or bExponent -% 1 >= maxExponent - 1) { 45 const aAbs: Z = @as(Z, @bitCast(a)) & absMask; 46 const bAbs: Z = @as(Z, @bitCast(b)) & absMask; 47 48 // NaN / anything = qNaN 49 if (aAbs > infRep) return @bitCast(@as(Z, @bitCast(a)) | quietBit); 50 // anything / NaN = qNaN 51 if (bAbs > infRep) return @bitCast(@as(Z, @bitCast(b)) | quietBit); 52 53 if (aAbs == infRep) { 54 // infinity / infinity = NaN 55 if (bAbs == infRep) { 56 return @bitCast(qnanRep); 57 } 58 // infinity / anything else = +/- infinity 59 else { 60 return @bitCast(aAbs | quotientSign); 61 } 62 } 63 64 // anything else / infinity = +/- 0 65 if (bAbs == infRep) return @bitCast(quotientSign); 66 67 if (aAbs == 0) { 68 // zero / zero = NaN 69 if (bAbs == 0) { 70 return @bitCast(qnanRep); 71 } 72 // zero / anything else = +/- zero 73 else { 74 return @bitCast(quotientSign); 75 } 76 } 77 // anything else / zero = +/- infinity 78 if (bAbs == 0) return @bitCast(infRep | quotientSign); 79 80 // one or both of a or b is denormal, the other (if applicable) is a 81 // normal number. Renormalize one or both of a and b, and set scale to 82 // include the necessary exponent adjustment. 83 if (aAbs < integerBit) scale +%= normalize(T, &aSignificand); 84 if (bAbs < integerBit) scale -%= normalize(T, &bSignificand); 85 } 86 var quotientExponent: i32 = @as(i32, @bitCast(aExponent -% bExponent)) +% scale; 87 88 // Align the significand of b as a Q63 fixed-point number in the range 89 // [1, 2.0) and get a Q64 approximate reciprocal using a small minimax 90 // polynomial approximation: reciprocal = 3/4 + 1/sqrt(2) - b/2. This 91 // is accurate to about 3.5 binary digits. 92 const q63b: u64 = @intCast(bSignificand); 93 var recip64 = @as(u64, 0x7504f333F9DE6484) -% q63b; 94 // 0x7504f333F9DE6484 / 2^64 + 1 = 3/4 + 1/sqrt(2) 95 96 // Now refine the reciprocal estimate using a Newton-Raphson iteration: 97 // 98 // x1 = x0 * (2 - x0 * b) 99 // 100 // This doubles the number of correct binary digits in the approximation 101 // with each iteration. 102 var correction64: u64 = undefined; 103 correction64 = @truncate(~(@as(u128, recip64) *% q63b >> 64) +% 1); 104 recip64 = @truncate(@as(u128, recip64) *% correction64 >> 63); 105 correction64 = @truncate(~(@as(u128, recip64) *% q63b >> 64) +% 1); 106 recip64 = @truncate(@as(u128, recip64) *% correction64 >> 63); 107 correction64 = @truncate(~(@as(u128, recip64) *% q63b >> 64) +% 1); 108 recip64 = @truncate(@as(u128, recip64) *% correction64 >> 63); 109 correction64 = @truncate(~(@as(u128, recip64) *% q63b >> 64) +% 1); 110 recip64 = @truncate(@as(u128, recip64) *% correction64 >> 63); 111 correction64 = @truncate(~(@as(u128, recip64) *% q63b >> 64) +% 1); 112 recip64 = @truncate(@as(u128, recip64) *% correction64 >> 63); 113 114 // The reciprocal may have overflowed to zero if the upper half of b is 115 // exactly 1.0. This would sabatoge the full-width final stage of the 116 // computation that follows, so we adjust the reciprocal down by one bit. 117 recip64 -%= 1; 118 119 // We need to perform one more iteration to get us to 112 binary digits; 120 // The last iteration needs to happen with extra precision. 121 122 // NOTE: This operation is equivalent to __multi3, which is not implemented 123 // in some architechures 124 var reciprocal: u128 = undefined; 125 var correction: u128 = undefined; 126 var dummy: u128 = undefined; 127 wideMultiply(u128, recip64, q63b, &dummy, &correction); 128 129 correction = -%correction; 130 131 const cHi: u64 = @truncate(correction >> 64); 132 const cLo: u64 = @truncate(correction); 133 134 var r64cH: u128 = undefined; 135 var r64cL: u128 = undefined; 136 wideMultiply(u128, recip64, cHi, &dummy, &r64cH); 137 wideMultiply(u128, recip64, cLo, &dummy, &r64cL); 138 139 reciprocal = r64cH + (r64cL >> 64); 140 141 // Adjust the final 128-bit reciprocal estimate downward to ensure that it 142 // is strictly smaller than the infinitely precise exact reciprocal. Because 143 // the computation of the Newton-Raphson step is truncating at every step, 144 // this adjustment is small; most of the work is already done. 145 reciprocal -%= 2; 146 147 // The numerical reciprocal is accurate to within 2^-112, lies in the 148 // interval [0.5, 1.0), and is strictly smaller than the true reciprocal 149 // of b. Multiplying a by this reciprocal thus gives a numerical q = a/b 150 // in Q127 with the following properties: 151 // 152 // 1. q < a/b 153 // 2. q is in the interval [0.5, 2.0) 154 // 3. The error in q is bounded away from 2^-63 (actually, we have 155 // many bits to spare, but this is all we need). 156 157 // We need a 128 x 128 multiply high to compute q. 158 var quotient128: u128 = undefined; 159 var quotientLo: u128 = undefined; 160 wideMultiply(u128, aSignificand << 2, reciprocal, "ient128, "ientLo); 161 162 // Two cases: quotient is in [0.5, 1.0) or quotient is in [1.0, 2.0). 163 // Right shift the quotient if it falls in the [1,2) range and adjust the 164 // exponent accordingly. 165 const quotient: u64 = if (quotient128 < (integerBit << 1)) b: { 166 quotientExponent -= 1; 167 break :b @intCast(quotient128); 168 } else @intCast(quotient128 >> 1); 169 170 // We are going to compute a residual of the form 171 // 172 // r = a - q*b 173 // 174 // We know from the construction of q that r satisfies: 175 // 176 // 0 <= r < ulp(q)*b 177 // 178 // If r is greater than 1/2 ulp(q)*b, then q rounds up. Otherwise, we 179 // already have the correct result. The exact halfway case cannot occur. 180 const residual: u64 = -%(quotient *% q63b); 181 182 const writtenExponent = quotientExponent + exponentBias; 183 if (writtenExponent >= maxExponent) { 184 // If we have overflowed the exponent, return infinity. 185 return @bitCast(infRep | quotientSign); 186 } else if (writtenExponent < 1) { 187 if (writtenExponent == 0) { 188 // Check whether the rounded result is normal. 189 if (residual > (bSignificand >> 1)) { // round 190 if (quotient == (integerBit - 1)) // If the rounded result is normal, return it 191 return @bitCast(@as(Z, @bitCast(std.math.floatMin(T))) | quotientSign); 192 } 193 } 194 // Flush denormals to zero. In the future, it would be nice to add 195 // code to round them correctly. 196 return @bitCast(quotientSign); 197 } else { 198 const round = @intFromBool(residual > (bSignificand >> 1)); 199 // Insert the exponent 200 var absResult = quotient | (@as(Z, @intCast(writtenExponent)) << significandBits); 201 // Round 202 absResult +%= round; 203 // Insert the sign and return 204 return @bitCast(absResult | quotientSign | integerBit); 205 } 206 } 207 208 test { 209 _ = @import("divxf3_test.zig"); 210 }