divtf3.zig - zig - fork of https://codeberg.org/ziglang/zig

divtf3.zig (9925B) - Raw
      1 const std = @import("std");
      2 const builtin = @import("builtin");
      3 
      4 const common = @import("common.zig");
      5 const normalize = common.normalize;
      6 const wideMultiply = common.wideMultiply;
      7 
      8 pub const panic = common.panic;
      9 
     10 comptime {
     11     if (common.want_ppc_abi) {
     12         @export(&__divtf3, .{ .name = "__divkf3", .linkage = common.linkage, .visibility = common.visibility });
     13     } else if (common.want_sparc_abi) {
     14         @export(&_Qp_div, .{ .name = "_Qp_div", .linkage = common.linkage, .visibility = common.visibility });
     15     }
     16     @export(&__divtf3, .{ .name = "__divtf3", .linkage = common.linkage, .visibility = common.visibility });
     17 }
     18 
     19 pub fn __divtf3(a: f128, b: f128) callconv(.c) f128 {
     20     return div(a, b);
     21 }
     22 
     23 fn _Qp_div(c: *f128, a: *const f128, b: *const f128) callconv(.c) void {
     24     c.* = div(a.*, b.*);
     25 }
     26 
     27 inline fn div(a: f128, b: f128) f128 {
     28     const Z = std.meta.Int(.unsigned, 128);
     29 
     30     const significandBits = std.math.floatMantissaBits(f128);
     31     const exponentBits = std.math.floatExponentBits(f128);
     32 
     33     const signBit = (@as(Z, 1) << (significandBits + exponentBits));
     34     const maxExponent = ((1 << exponentBits) - 1);
     35     const exponentBias = (maxExponent >> 1);
     36 
     37     const implicitBit = (@as(Z, 1) << significandBits);
     38     const quietBit = implicitBit >> 1;
     39     const significandMask = implicitBit - 1;
     40 
     41     const absMask = signBit - 1;
     42     const exponentMask = absMask ^ significandMask;
     43     const qnanRep = exponentMask | quietBit;
     44     const infRep: Z = @bitCast(std.math.inf(f128));
     45 
     46     const aExponent: u32 = @truncate((@as(Z, @bitCast(a)) >> significandBits) & maxExponent);
     47     const bExponent: u32 = @truncate((@as(Z, @bitCast(b)) >> significandBits) & maxExponent);
     48     const quotientSign: Z = (@as(Z, @bitCast(a)) ^ @as(Z, @bitCast(b))) & signBit;
     49 
     50     var aSignificand: Z = @as(Z, @bitCast(a)) & significandMask;
     51     var bSignificand: Z = @as(Z, @bitCast(b)) & significandMask;
     52     var scale: i32 = 0;
     53 
     54     // Detect if a or b is zero, denormal, infinity, or NaN.
     55     if (aExponent -% 1 >= maxExponent - 1 or bExponent -% 1 >= maxExponent - 1) {
     56         const aAbs: Z = @as(Z, @bitCast(a)) & absMask;
     57         const bAbs: Z = @as(Z, @bitCast(b)) & absMask;
     58 
     59         // NaN / anything = qNaN
     60         if (aAbs > infRep) return @bitCast(@as(Z, @bitCast(a)) | quietBit);
     61         // anything / NaN = qNaN
     62         if (bAbs > infRep) return @bitCast(@as(Z, @bitCast(b)) | quietBit);
     63 
     64         if (aAbs == infRep) {
     65             // infinity / infinity = NaN
     66             if (bAbs == infRep) {
     67                 return @bitCast(qnanRep);
     68             }
     69             // infinity / anything else = +/- infinity
     70             else {
     71                 return @bitCast(aAbs | quotientSign);
     72             }
     73         }
     74 
     75         // anything else / infinity = +/- 0
     76         if (bAbs == infRep) return @bitCast(quotientSign);
     77 
     78         if (aAbs == 0) {
     79             // zero / zero = NaN
     80             if (bAbs == 0) {
     81                 return @bitCast(qnanRep);
     82             }
     83             // zero / anything else = +/- zero
     84             else {
     85                 return @bitCast(quotientSign);
     86             }
     87         }
     88         // anything else / zero = +/- infinity
     89         if (bAbs == 0) return @bitCast(infRep | quotientSign);
     90 
     91         // one or both of a or b is denormal, the other (if applicable) is a
     92         // normal number.  Renormalize one or both of a and b, and set scale to
     93         // include the necessary exponent adjustment.
     94         if (aAbs < implicitBit) scale +%= normalize(f128, &aSignificand);
     95         if (bAbs < implicitBit) scale -%= normalize(f128, &bSignificand);
     96     }
     97 
     98     // Set the implicit significand bit.  If we fell through from the
     99     // denormal path it was already set by normalize( ), but setting it twice
    100     // won't hurt anything.
    101     aSignificand |= implicitBit;
    102     bSignificand |= implicitBit;
    103     var quotientExponent: i32 = @as(i32, @bitCast(aExponent -% bExponent)) +% scale;
    104 
    105     // Align the significand of b as a Q63 fixed-point number in the range
    106     // [1, 2.0) and get a Q64 approximate reciprocal using a small minimax
    107     // polynomial approximation: reciprocal = 3/4 + 1/sqrt(2) - b/2.  This
    108     // is accurate to about 3.5 binary digits.
    109     const q63b: u64 = @truncate(bSignificand >> 49);
    110     var recip64 = @as(u64, 0x7504f333F9DE6484) -% q63b;
    111     // 0x7504f333F9DE6484 / 2^64 + 1 = 3/4 + 1/sqrt(2)
    112 
    113     // Now refine the reciprocal estimate using a Newton-Raphson iteration:
    114     //
    115     //     x1 = x0 * (2 - x0 * b)
    116     //
    117     // This doubles the number of correct binary digits in the approximation
    118     // with each iteration.
    119     var correction64: u64 = undefined;
    120     correction64 = @truncate(~(@as(u128, recip64) *% q63b >> 64) +% 1);
    121     recip64 = @truncate(@as(u128, recip64) *% correction64 >> 63);
    122     correction64 = @truncate(~(@as(u128, recip64) *% q63b >> 64) +% 1);
    123     recip64 = @truncate(@as(u128, recip64) *% correction64 >> 63);
    124     correction64 = @truncate(~(@as(u128, recip64) *% q63b >> 64) +% 1);
    125     recip64 = @truncate(@as(u128, recip64) *% correction64 >> 63);
    126     correction64 = @truncate(~(@as(u128, recip64) *% q63b >> 64) +% 1);
    127     recip64 = @truncate(@as(u128, recip64) *% correction64 >> 63);
    128     correction64 = @truncate(~(@as(u128, recip64) *% q63b >> 64) +% 1);
    129     recip64 = @truncate(@as(u128, recip64) *% correction64 >> 63);
    130 
    131     // The reciprocal may have overflowed to zero if the upper half of b is
    132     // exactly 1.0.  This would sabatoge the full-width final stage of the
    133     // computation that follows, so we adjust the reciprocal down by one bit.
    134     recip64 -%= 1;
    135 
    136     // We need to perform one more iteration to get us to 112 binary digits;
    137     // The last iteration needs to happen with extra precision.
    138     const q127blo: u64 = @truncate(bSignificand << 15);
    139     var correction: u128 = undefined;
    140     var reciprocal: u128 = undefined;
    141 
    142     // NOTE: This operation is equivalent to __multi3, which is not implemented
    143     //       in some architecture
    144     var r64q63: u128 = undefined;
    145     var r64q127: u128 = undefined;
    146     var r64cH: u128 = undefined;
    147     var r64cL: u128 = undefined;
    148     var dummy: u128 = undefined;
    149     wideMultiply(u128, recip64, q63b, &dummy, &r64q63);
    150     wideMultiply(u128, recip64, q127blo, &dummy, &r64q127);
    151 
    152     correction = -%(r64q63 + (r64q127 >> 64));
    153 
    154     const cHi: u64 = @truncate(correction >> 64);
    155     const cLo: u64 = @truncate(correction);
    156 
    157     wideMultiply(u128, recip64, cHi, &dummy, &r64cH);
    158     wideMultiply(u128, recip64, cLo, &dummy, &r64cL);
    159 
    160     reciprocal = r64cH + (r64cL >> 64);
    161 
    162     // Adjust the final 128-bit reciprocal estimate downward to ensure that it
    163     // is strictly smaller than the infinitely precise exact reciprocal. Because
    164     // the computation of the Newton-Raphson step is truncating at every step,
    165     // this adjustment is small; most of the work is already done.
    166     reciprocal -%= 2;
    167 
    168     // The numerical reciprocal is accurate to within 2^-112, lies in the
    169     // interval [0.5, 1.0), and is strictly smaller than the true reciprocal
    170     // of b.  Multiplying a by this reciprocal thus gives a numerical q = a/b
    171     // in Q127 with the following properties:
    172     //
    173     //    1. q < a/b
    174     //    2. q is in the interval [0.5, 2.0)
    175     //    3. The error in q is bounded away from 2^-113 (actually, we have a
    176     //       couple of bits to spare, but this is all we need).
    177 
    178     // We need a 128 x 128 multiply high to compute q.
    179     var quotient: u128 = undefined;
    180     var quotientLo: u128 = undefined;
    181     wideMultiply(u128, aSignificand << 2, reciprocal, &quotient, &quotientLo);
    182 
    183     // Two cases: quotient is in [0.5, 1.0) or quotient is in [1.0, 2.0).
    184     // In either case, we are going to compute a residual of the form
    185     //
    186     //     r = a - q*b
    187     //
    188     // We know from the construction of q that r satisfies:
    189     //
    190     //     0 <= r < ulp(q)*b
    191     //
    192     // If r is greater than 1/2 ulp(q)*b, then q rounds up.  Otherwise, we
    193     // already have the correct result.  The exact halfway case cannot occur.
    194     // We also take this time to right shift quotient if it falls in the [1,2)
    195     // range and adjust the exponent accordingly.
    196     var residual: u128 = undefined;
    197     var qb: u128 = undefined;
    198 
    199     if (quotient < (implicitBit << 1)) {
    200         wideMultiply(u128, quotient, bSignificand, &dummy, &qb);
    201         residual = (aSignificand << 113) -% qb;
    202         quotientExponent -%= 1;
    203     } else {
    204         quotient >>= 1;
    205         wideMultiply(u128, quotient, bSignificand, &dummy, &qb);
    206         residual = (aSignificand << 112) -% qb;
    207     }
    208 
    209     const writtenExponent = quotientExponent +% exponentBias;
    210 
    211     if (writtenExponent >= maxExponent) {
    212         // If we have overflowed the exponent, return infinity.
    213         return @bitCast(infRep | quotientSign);
    214     } else if (writtenExponent < 1) {
    215         if (writtenExponent == 0) {
    216             // Check whether the rounded result is normal.
    217             const round = @intFromBool((residual << 1) > bSignificand);
    218             // Clear the implicit bit.
    219             var absResult = quotient & significandMask;
    220             // Round.
    221             absResult += round;
    222             if ((absResult & ~significandMask) > 0) {
    223                 // The rounded result is normal; return it.
    224                 return @bitCast(absResult | quotientSign);
    225             }
    226         }
    227         // Flush denormals to zero.  In the future, it would be nice to add
    228         // code to round them correctly.
    229         return @bitCast(quotientSign);
    230     } else {
    231         const round = @intFromBool((residual << 1) >= bSignificand);
    232         // Clear the implicit bit
    233         var absResult = quotient & significandMask;
    234         // Insert the exponent
    235         absResult |= @as(Z, @intCast(writtenExponent)) << significandBits;
    236         // Round
    237         absResult +%= round;
    238         // Insert the sign and return
    239         return @bitCast(absResult | quotientSign);
    240     }
    241 }
    242 
    243 test {
    244     _ = @import("divtf3_test.zig");
    245 }
	zig fork of https://codeberg.org/ziglang/zig
	Log \| Files \| Refs \| README \| LICENSE