// trig.zig (11743B) - Raw
// Ported from musl, which is licensed under the MIT license:
// https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
//
// https://git.musl-libc.org/cgit/musl/tree/src/math/__cos.c
// https://git.musl-libc.org/cgit/musl/tree/src/math/__cosdf.c
// https://git.musl-libc.org/cgit/musl/tree/src/math/__sin.c
// https://git.musl-libc.org/cgit/musl/tree/src/math/__sindf.c
// https://git.musl-libc.org/cgit/musl/tree/src/math/__tan.c
// https://git.musl-libc.org/cgit/musl/tree/src/math/__tandf.c

/// Kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164.
/// Input x is assumed to be bounded by ~pi/4 in magnitude.
/// Input y is the tail of x.
///
/// Algorithm
///   1. Since cos(-x) = cos(x), we need only to consider positive x.
///   2. If x < 2^-27 (hx < 0x3e400000), cos(x) is 1 with inexact if x != 0.
///      (This kernel contains no such shortcut; it always evaluates the
///      polynomial.)
///   3. cos(x) is approximated by a polynomial of degree 14 on [0, pi/4]:
///
///          cos(x) ~ 1 - x^2/2 + C1*x^4 + ... + C6*x^14
///
///      where the remez error is
///
///          |cos(x) - (1 - .5*x^2 + C1*x^4 + C2*x^6 + C3*x^8
///                       + C4*x^10 + C5*x^12 + C6*x^14)| <= 2^-58
///
///   4. Let r = C1*x^4 + C2*x^6 + C3*x^8 + C4*x^10 + C5*x^12 + C6*x^14, then
///          cos(x) ~ 1 - x*x/2 + r
///      Since
///          cos(x+y) ~ cos(x) - sin(x)*y
///                   ~ cos(x) - x*y,
///      a correction term is necessary in cos(x) and hence
///          cos(x+y) = 1 - (x*x/2 - (r - x*y))
///      For better accuracy, rearrange to
///          cos(x+y) ~ w + (tmp + (r - x*y))
///      where w = 1 - x*x/2 and tmp is a tiny correction term
///      (1 - x*x/2 == w + tmp exactly in infinite precision).
///      The exactness of w + tmp in infinite precision depends on w
///      and tmp having the same precision as x. If they have extra
///      precision due to compiler bugs, then the extra precision is
///      only good provided it is retained in all terms of the final
///      expression for cos(). Retention happens in all cases tested
///      under FreeBSD, so don't pessimize things by forcibly clipping
///      any extra precision in w.
///
/// In the code below the local `r` holds the series divided by x^2, so the
/// `r` of step 4 is `z * r`.
pub fn __cos(x: f64, y: f64) f64 {
    const C1 = 4.16666666666666019037e-02; // 0x3FA55555, 0x5555554C
    const C2 = -1.38888888888741095749e-03; // 0xBF56C16C, 0x16C15177
    const C3 = 2.48015872894767294178e-05; // 0x3EFA01A0, 0x19CB1590
    const C4 = -2.75573143513906633035e-07; // 0xBE927E4F, 0x809C52AD
    const C5 = 2.08757232129817482790e-09; // 0x3E21EE9E, 0xBDB4B1C4
    const C6 = -1.13596475577881948265e-11; // 0xBDA8FAE9, 0xBE8838D4

    const z = x * x;
    const zs = z * z;
    const r = z * (C1 + z * (C2 + z * C3)) + zs * zs * (C4 + z * (C5 + z * C6));
    const hz = 0.5 * z;
    const w = 1.0 - hz;
    // (1.0 - w) - hz is the rounding error of w ("tmp" in the description above).
    return w + (((1.0 - w) - hz) + (z * r - x * y));
}

/// Single-precision kernel cos: evaluates an even polynomial of degree 8 in
/// x using f64 arithmetic and narrows the result to f32.
/// NOTE(review): presumably x is expected to be bounded by ~pi/4 in magnitude,
/// like `__cos` — confirm against the callers.
pub fn __cosdf(x: f64) f32 {
    // |cos(x) - c(x)| < 2**-34.1 (~[-5.37e-11, 5.295e-11]).
    const C0 = -0x1ffffffd0c5e81.0p-54; // -0.499999997251031003120
    const C1 = 0x155553e1053a42.0p-57; // 0.0416666233237390631894
    const C2 = -0x16c087e80f1e27.0p-62; // -0.00138867637746099294692
    const C3 = 0x199342e0ee5069.0p-68; // 0.0000243904487962774090654

    // Try to optimize for parallel evaluation as in __tandf.c.
    const z = x * x;
    const w = z * z;
    const r = C2 + z * C3;
    return @floatCast(((1.0 + z * C0) + w * C1) + (w * z) * r);
}

/// Kernel sin function on ~[-pi/4, pi/4] (except on -0), pi/4 ~ 0.7854.
/// Input x is assumed to be bounded by ~pi/4 in magnitude.
/// Input y is the tail of x.
/// Input iy indicates whether y is 0 (if iy == 0, y is assumed to be 0 and
/// is not read).
///
/// Algorithm
///   1. Since sin(-x) = -sin(x), we need only to consider positive x.
///   2. Callers must return sin(-0) = -0 without calling here since our
///      odd polynomial is not evaluated in a way that preserves -0.
///      Callers may do the optimization sin(x) ~ x for tiny x.
///   3. sin(x) is approximated by a polynomial of degree 13 on [0, pi/4]:
///
///          sin(x) ~ x + S1*x^3 + ... + S6*x^13
///
///      where
///
///          |sin(x)/x - (1 + S1*x^2 + S2*x^4 + S3*x^6 + S4*x^8
///                         + S5*x^10 + S6*x^12)| <= 2^-58
///
///   4. sin(x+y) = sin(x) + sin'(x')*y
///               ~ sin(x) + (1 - x*x/2)*y
///      For better accuracy, let
///          r = x^3*(S2 + x^2*(S3 + x^2*(S4 + x^2*(S5 + x^2*S6))))
///      then
///          sin(x) = x + (S1*x^3 + (x^2*(r - y/2) + y))
///
/// In the code below the x^3 factor is kept separately in `v`, so the `r`
/// of step 4 is `v * r`.
pub fn __sin(x: f64, y: f64, iy: i32) f64 {
    const S1 = -1.66666666666666324348e-01; // 0xBFC55555, 0x55555549
    const S2 = 8.33333333332248946124e-03; // 0x3F811111, 0x1110F8A6
    const S3 = -1.98412698298579493134e-04; // 0xBF2A01A0, 0x19C161D5
    const S4 = 2.75573137070700676789e-06; // 0x3EC71DE3, 0x57B1FE7D
    const S5 = -2.50507602534068634195e-08; // 0xBE5AE5E6, 0x8A2B9CEB
    const S6 = 1.58969099521155010221e-10; // 0x3DE5D93A, 0x5ACFD57C

    const z = x * x;
    const w = z * z;
    const r = S2 + z * (S3 + z * S4) + z * w * (S5 + z * S6);
    const v = z * x;
    if (iy == 0) {
        return x + v * (S1 + z * r);
    } else {
        return x - ((z * (0.5 * y - v * r) - y) - v * S1);
    }
}

/// Single-precision kernel sin: evaluates an odd polynomial of degree 9 in
/// x using f64 arithmetic and narrows the result to f32.
/// NOTE(review): presumably x is expected to be bounded by ~pi/4 in magnitude,
/// like `__sin` — confirm against the callers.
pub fn __sindf(x: f64) f32 {
    // |sin(x)/x - s(x)| < 2**-37.5 (~[-4.89e-12, 4.824e-12]).
    const S1 = -0x15555554cbac77.0p-55; // -0.166666666416265235595
    const S2 = 0x111110896efbb2.0p-59; // 0.0083333293858894631756
    const S3 = -0x1a00f9e2cae774.0p-65; // -0.000198393348360966317347
    const S4 = 0x16cd878c3b46a7.0p-71; // 0.0000027183114939898219064

    // Try to optimize for parallel evaluation as in __tandf.c.
    const z = x * x;
    const w = z * z;
    const r = S3 + z * S4;
    const s = z * x;
    return @floatCast((x + s * (S1 + z * S2)) + s * w * r);
}

/// Kernel tan function on ~[-pi/4, pi/4] (except on -0), pi/4 ~ 0.7854.
/// Input x is assumed to be bounded by ~pi/4 in magnitude.
/// Input y is the tail of x.
/// Input odd indicates whether tan (if odd is false) or -1/tan (if odd is
/// true) is returned.
///
/// Algorithm
///   1. Since tan(-x) = -tan(x), we need only to consider positive x.
///   2. Callers must return tan(-0) = -0 without calling here since our
///      odd polynomial is not evaluated in a way that preserves -0.
///      Callers may do the optimization tan(x) ~ x for tiny x.
///   3. tan(x) is approximated by an odd polynomial of degree 27 on
///      [0, 0.67434]:
///
///          tan(x) ~ x + T1*x^3 + ... + T13*x^27
///
///      where
///
///          |tan(x)/x - (1 + T1*x^2 + T2*x^4 + ... + T13*x^26)| <= 2^-59.2
///
///      (T1..T13 are stored as T[0]..T[12] in the code.)
///
///      Note: tan(x+y) = tan(x) + tan'(x)*y
///                     ~ tan(x) + (1 + x*x)*y
///      Therefore, for better accuracy in computing tan(x+y), let
///          r = x^3*(T2 + x^2*(T3 + x^2*(... + x^2*(T12 + x^2*T13))))
///      then
///          tan(x+y) = x + (T1*x^3 + (x^2*(r + y) + y))
///
///   4. For x in [0.67434, pi/4], let y = pi/4 - x, then
///          tan(x) = tan(pi/4 - y) = (1 - tan(y))/(1 + tan(y))
///                 = 1 - 2*(tan(y) - (tan(y)^2)/(1 + tan(y)))
pub fn __tan(x_: f64, y_: f64, odd: bool) f64 {
    const T = [_]f64{
        3.33333333333334091986e-01, // 3FD55555, 55555563
        1.33333333333201242699e-01, // 3FC11111, 1110FE7A
        5.39682539762260521377e-02, // 3FABA1BA, 1BB341FE
        2.18694882948595424599e-02, // 3F9664F4, 8406D637
        8.86323982359930005737e-03, // 3F8226E3, E96E8493
        3.59207910759131235356e-03, // 3F6D6D22, C9560328
        1.45620945432529025516e-03, // 3F57DBC8, FEE08315
        5.88041240820264096874e-04, // 3F4344D8, F2F26501
        2.46463134818469906812e-04, // 3F3026F7, 1A8D1068
        7.81794442939557092300e-05, // 3F147E88, A03792A6
        7.14072491382608190305e-05, // 3F12B80F, 32F0A7E9
        -1.85586374855275456654e-05, // BEF375CB, DB605373
        2.59073051863633712884e-05, // 3EFB2A70, 74BF7AD4
    };
    const pio4 = 7.85398163397448278999e-01; // 3FE921FB, 54442D18
    const pio4lo = 3.06161699786838301793e-17; // 3C81A626, 33145C07

    var x = x_;
    var y = y_;

    // High 32 bits of x: sign, exponent and the top of the mantissa.
    const hx: u32 = @intCast(@as(u64, @bitCast(x)) >> 32);
    const big = (hx & 0x7fffffff) >= 0x3FE59428; // |x| >= 0.6744
    const sign = (hx >> 31) != 0;
    if (big) {
        // Step 4: reduce to pi/4 - |x|; the tail is folded in, so y becomes 0.
        if (sign) {
            x = -x;
            y = -y;
        }
        x = (pio4 - x) + (pio4lo - y);
        y = 0.0;
    }
    const z = x * x;
    const z2 = z * z;

    // Break x^5*(T[1]+x^2*T[2]+...) into
    // x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
    // x^5(x^2*(T[2]+x^4*T[4]+...+x^22*T[12]))
    const p_lo = T[1] + z2 * (T[3] + z2 * (T[5] + z2 * (T[7] + z2 * (T[9] + z2 * T[11]))));
    const p_hi = z * (T[2] + z2 * (T[4] + z2 * (T[6] + z2 * (T[8] + z2 * (T[10] + z2 * T[12])))));
    const x3 = z * x;
    const r = y + z * (x3 * (p_lo + p_hi) + y) + x3 * T[0];
    const t = x + r; // tan of the (possibly reduced) argument
    if (big) {
        // Undo the pi/4 reduction; k = +1 selects tan, k = -1 selects -1/tan.
        const k: f64 = if (odd) -1.0 else 1.0;
        const res = k - 2.0 * (x + (r - t * t / (t + k)));
        return if (sign) -res else res;
    }
    if (!odd) {
        return t;
    }
    // -1.0/(x+r) has up to 2ulp error, so compute it accurately:
    // split t and a into a high part (low 32 bits cleared) plus a remainder.
    const t_hi: f64 = @bitCast(@as(u64, @bitCast(t)) & 0xffffffff00000000);
    const t_lo = r - (t_hi - x); // t_hi + t_lo = r + x
    const a = -1.0 / t;
    const a_hi: f64 = @bitCast(@as(u64, @bitCast(a)) & 0xffffffff00000000);
    return a_hi + a * (1.0 + a_hi * t_hi + a_hi * t_lo);
}

/// Single-precision kernel tan: evaluates an odd polynomial of degree 13 in
/// x using f64 arithmetic and narrows the result to f32.
/// Returns the polynomial value if odd is false, or -1 divided by it if odd
/// is true.
/// NOTE(review): presumably x is expected to be bounded by ~pi/4 in magnitude,
/// like `__tan` — confirm against the callers.
pub fn __tandf(x: f64, odd: bool) f32 {
    // |tan(x)/x - t(x)| < 2**-25.5 (~[-2e-08, 2e-08]).
    const T = [_]f64{
        0x15554d3418c99f.0p-54, // 0.333331395030791399758
        0x1112fd38999f72.0p-55, // 0.133392002712976742718
        0x1b54c91d865afe.0p-57, // 0.0533812378445670393523
        0x191df3908c33ce.0p-58, // 0.0245283181166547278873
        0x185dadfcecf44e.0p-61, // 0.00297435743359967304927
        0x1362b9bf971bcd.0p-59, // 0.00946564784943673166728
    };

    const z = x * x;
    // Split up the polynomial into small independent terms to give
    // opportunities for parallel evaluation. The chosen splitting is
    // micro-optimized for Athlons (XP, X64). It costs 2 multiplications
    // relative to Horner's method on sequential machines.
    //
    // We add the small terms from lowest degree up for efficiency on
    // non-sequential machines (the lowest degree terms tend to be ready
    // earlier). Apart from this, we don't care about order of
    // operations, and don't need to care since we have precision to
    // spare. However, the chosen splitting is good for accuracy too,
    // and would give results as accurate as Horner's method if the
    // small terms were added from highest degree down.
    const r = T[4] + z * T[5];
    const t = T[2] + z * T[3];
    const w = z * z;
    const s = z * x;
    const u = T[0] + z * T[1];
    const r0 = (x + s * u) + (s * w) * (t + w * r);
    return @floatCast(if (odd) -1.0 / r0 else r0);
}