udivmod.zig (4285B) - Raw
1 const std = @import("std"); 2 const builtin = @import("builtin"); 3 const Log2Int = std.math.Log2Int; 4 const common = @import("common.zig"); 5 const HalveInt = common.HalveInt; 6 7 const lo = switch (builtin.cpu.arch.endian()) { 8 .big => 1, 9 .little => 0, 10 }; 11 const hi = 1 - lo; 12 13 // Let _u1 and _u0 be the high and low limbs of U respectively. 14 // Returns U / v_ and sets r = U % v_. 15 fn divwide_generic(comptime T: type, _u1: T, _u0: T, v_: T, r: *T) T { 16 const HalfT = HalveInt(T, false).HalfT; 17 @setRuntimeSafety(common.test_safety); 18 var v = v_; 19 20 const b = @as(T, 1) << (@bitSizeOf(T) / 2); 21 var un64: T = undefined; 22 var un10: T = undefined; 23 24 const s: Log2Int(T) = @intCast(@clz(v)); 25 if (s > 0) { 26 // Normalize divisor 27 v <<= s; 28 un64 = (_u1 << s) | (_u0 >> @intCast((@bitSizeOf(T) - @as(T, @intCast(s))))); 29 un10 = _u0 << s; 30 } else { 31 // Avoid undefined behavior of (u0 >> @bitSizeOf(T)) 32 un64 = _u1; 33 un10 = _u0; 34 } 35 36 // Break divisor up into two 32-bit digits 37 const vn1 = v >> (@bitSizeOf(T) / 2); 38 const vn0 = v & std.math.maxInt(HalfT); 39 40 // Break right half of dividend into two digits 41 const un1 = un10 >> (@bitSizeOf(T) / 2); 42 const un0 = un10 & std.math.maxInt(HalfT); 43 44 // Compute the first quotient digit, q1 45 var q1 = un64 / vn1; 46 var rhat = un64 -% q1 *% vn1; 47 48 // q1 has at most error 2. No more than 2 iterations 49 while (q1 >= b or q1 * vn0 > b * rhat + un1) { 50 q1 -= 1; 51 rhat += vn1; 52 if (rhat >= b) break; 53 } 54 55 const un21 = un64 *% b +% un1 -% q1 *% v; 56 57 // Compute the second quotient digit 58 var q0 = un21 / vn1; 59 rhat = un21 -% q0 *% vn1; 60 61 // q0 has at most error 2. No more than 2 iterations. 62 while (q0 >= b or q0 * vn0 > b * rhat + un0) { 63 q0 -= 1; 64 rhat += vn1; 65 if (rhat >= b) break; 66 } 67 68 r.* = (un21 *% b +% un0 -% q0 *% v) >> s; 69 return q1 *% b +% q0; 70 } 71 72 fn divwide(comptime T: type, _u1: T, _u0: T, v: T, r: *T) T { 73 @setRuntimeSafety(common.test_safety); 74 if (T == u64 and builtin.target.cpu.arch == .x86_64 and builtin.target.os.tag != .windows) { 75 var rem: T = undefined; 76 const quo = asm ( 77 \\divq %[v] 78 : [_] "={rax}" (-> T), 79 [_] "={rdx}" (rem), 80 : [v] "r" (v), 81 [_] "{rax}" (_u0), 82 [_] "{rdx}" (_u1), 83 ); 84 r.* = rem; 85 return quo; 86 } else { 87 return divwide_generic(T, _u1, _u0, v, r); 88 } 89 } 90 91 // Returns a_ / b_ and sets maybe_rem = a_ % b. 92 pub fn udivmod(comptime T: type, a_: T, b_: T, maybe_rem: ?*T) T { 93 @setRuntimeSafety(common.test_safety); 94 const HalfT = HalveInt(T, false).HalfT; 95 const SignedT = std.meta.Int(.signed, @bitSizeOf(T)); 96 97 if (b_ > a_) { 98 if (maybe_rem) |rem| { 99 rem.* = a_; 100 } 101 return 0; 102 } 103 104 const a: [2]HalfT = @bitCast(a_); 105 const b: [2]HalfT = @bitCast(b_); 106 var q: [2]HalfT = undefined; 107 var r: [2]HalfT = undefined; 108 109 // When the divisor fits in 64 bits, we can use an optimized path 110 if (b[hi] == 0) { 111 r[hi] = 0; 112 if (a[hi] < b[lo]) { 113 // The result fits in 64 bits 114 q[hi] = 0; 115 q[lo] = divwide(HalfT, a[hi], a[lo], b[lo], &r[lo]); 116 } else { 117 // First, divide with the high part to get the remainder. After that a_hi < b_lo. 118 q[hi] = a[hi] / b[lo]; 119 q[lo] = divwide(HalfT, a[hi] % b[lo], a[lo], b[lo], &r[lo]); 120 } 121 if (maybe_rem) |rem| { 122 rem.* = @bitCast(r); 123 } 124 return @bitCast(q); 125 } 126 127 // 0 <= shift <= 63 128 const shift: Log2Int(T) = @clz(b[hi]) - @clz(a[hi]); 129 var af: T = @bitCast(a); 130 var bf = @as(T, @bitCast(b)) << shift; 131 q = @bitCast(@as(T, 0)); 132 133 for (0..shift + 1) |_| { 134 q[lo] <<= 1; 135 // Branchless version of: 136 // if (af >= bf) { 137 // af -= bf; 138 // q[lo] |= 1; 139 // } 140 const s = @as(SignedT, @bitCast(bf -% af -% 1)) >> (@bitSizeOf(T) - 1); 141 q[lo] |= @intCast(s & 1); 142 af -= bf & @as(T, @bitCast(s)); 143 bf >>= 1; 144 } 145 if (maybe_rem) |rem| { 146 rem.* = @bitCast(af); 147 } 148 return @bitCast(q); 149 }