ref: 218258e4ac2940303be597272245cb09662fd759
parent: fb2a06c91f60a948687939741aa884b17219f715
author: S. Gilles <sgilles@math.umd.edu>
date: Wed May 1 11:57:06 EDT 2019
Add 2sum in math util

Fast2sum is only valid when the relative magnitudes of its arguments
are known (it requires |a| >= |b|). 2sum (here called "slow2sum")
drops that precondition, at the cost of speed. It ends up being
necessary in the forthcoming log-overkill function.
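
For illustration (not part of the patch; the call sites below are
hypothetical, and the values assume flt64 round-to-nearest), this is
the failure mode that slow2sum avoids:

	var s, t

	/* fast2sum's precondition |a| >= |b| is violated: a = 1.0, b = 2^60 */
	(s, t) = fast2sum(1.0, 1152921504606846976.0)
	/* s = 2^60, t = 0.0: the 1.0 is rounded away and lost */

	(s, t) = slow2sum(1.0, 1152921504606846976.0)
	/* s = 2^60, t = 1.0: s + t = a + b exactly */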
--- a/lib/math/atan-impl.myr
+++ b/lib/math/atan-impl.myr
@@ -427,7 +427,7 @@
du = 0.0
else
var t1, t2
- (t1, t2) = two_by_two(u, x)
+ (t1, t2) = two_by_two64(u, x)
du = ((y - t1) - t2)/x
;;
--- a/lib/math/sin-impl.myr
+++ b/lib/math/sin-impl.myr
@@ -508,8 +508,8 @@
in the polynomial approximations.
*/
var delta1, delta2, deltat
- (delta1, deltat) = fast2sum(-std.flt64frombits(xi), x1)
- (delta2, _) = fast2sum(deltat, x2)
+ (delta1, deltat) = slow2sum(-std.flt64frombits(xi), x1)
+ (delta2, _) = slow2sum(deltat, x2)
/*
sin(delta); the degree 2 coefficient is near 0, so delta_2
@@ -528,16 +528,16 @@
var q : flt64[4]
if (want_sin && !swap_sin_cos) || (want_cos && swap_sin_cos)
- (q[0], q[1]) = two_by_two(std.flt64frombits(sin_xi), cos_delta)
- (q[2], q[3]) = two_by_two(std.flt64frombits(cos_xi), sin_delta)
- std.sort(q[:], fltabscmp)
+ (q[0], q[1]) = two_by_two64(std.flt64frombits(sin_xi), cos_delta)
+ (q[2], q[3]) = two_by_two64(std.flt64frombits(cos_xi), sin_delta)
+ std.sort(q[:], mag_cmp64)
(sin_ret, sin_ret2) = double_compensated_sum(q[:])
;;
if (want_cos && !swap_sin_cos) || (want_sin && swap_sin_cos)
- (q[0], q[1]) = two_by_two(std.flt64frombits(cos_xi), cos_delta)
- (q[2], q[3]) = two_by_two(std.flt64frombits(sin_xi), sin_delta)
+ (q[0], q[1]) = two_by_two64(std.flt64frombits(cos_xi), cos_delta)
+ (q[2], q[3]) = two_by_two64(std.flt64frombits(sin_xi), sin_delta)
q[2] = -q[2]
q[3] = -q[3]
@@ -620,16 +620,16 @@
total_N = rn(x1 * std.flt64frombits(two_over_pi))
Nf = (-total_N : flt64)
(q[0], q[ 4], q[5]) = (x1, x2, x3)
- (q[1], q[ 3]) = two_by_two(Nf, pi_o_2_0)
- (q[2], q[ 6]) = two_by_two(Nf, pi_o_2_1)
- (q[7], q[ 8]) = two_by_two(Nf, pi_o_2_2)
- (q[9], q[10]) = two_by_two(Nf, pi_o_2_3)
+ (q[1], q[ 3]) = two_by_two64(Nf, pi_o_2_0)
+ (q[2], q[ 6]) = two_by_two64(Nf, pi_o_2_1)
+ (q[7], q[ 8]) = two_by_two64(Nf, pi_o_2_2)
+ (q[9], q[10]) = two_by_two64(Nf, pi_o_2_3)
/*
Sorting is very slow, but it's only the top five or so
that are in question
*/
- std.sort(q[0:5], fltabscmp)
+ std.sort(q[0:5], mag_cmp64)
(x1, x2) = double_compensated_sum(q[:])
while !(le_22(x1, x2, pi_o_4_0, pi_o_4_1) && le_22(-pi_o_4_0, -pi_o_4_1, x1, x2))
@@ -641,10 +641,10 @@
cancelled by Nf * pi_o_2_0, so line those up.
*/
(q[0], q[2]) = (x1, x2)
- (q[1], q[3]) = two_by_two(Nf, pi_o_2_0)
- (q[4], q[5]) = two_by_two(Nf, pi_o_2_1)
- (q[6], q[7]) = two_by_two(Nf, pi_o_2_2)
- (q[8], q[9]) = two_by_two(Nf, pi_o_2_3)
+ (q[1], q[3]) = two_by_two64(Nf, pi_o_2_0)
+ (q[4], q[5]) = two_by_two64(Nf, pi_o_2_1)
+ (q[6], q[7]) = two_by_two64(Nf, pi_o_2_2)
+ (q[8], q[9]) = two_by_two64(Nf, pi_o_2_3)
(x1, x2) = double_compensated_sum(q[0:10])
total_N += (N % 4)
;;
@@ -731,15 +731,15 @@
since xc is so small.
*/
var q : flt64[18]
- (q[ 0], q[ 1]) = two_by_two(xa, a1)
- (q[ 2], q[ 3]) = two_by_two(xa, a2)
- (q[ 4], q[ 5]) = two_by_two(xa, a3)
- (q[ 6], q[ 7]) = two_by_two(xb, b1)
- (q[ 8], q[ 9]) = two_by_two(xb, b2)
- (q[10], q[11]) = two_by_two(xb, b3)
- (q[12], q[13]) = two_by_two(xc, c1)
- (q[14], q[15]) = two_by_two(xc, c2)
- (q[16], q[17]) = two_by_two(xc, c3)
+ (q[ 0], q[ 1]) = two_by_two64(xa, a1)
+ (q[ 2], q[ 3]) = two_by_two64(xa, a2)
+ (q[ 4], q[ 5]) = two_by_two64(xa, a3)
+ (q[ 6], q[ 7]) = two_by_two64(xb, b1)
+ (q[ 8], q[ 9]) = two_by_two64(xb, b2)
+ (q[10], q[11]) = two_by_two64(xb, b3)
+ (q[12], q[13]) = two_by_two64(xc, c1)
+ (q[14], q[15]) = two_by_two64(xc, c2)
+ (q[16], q[17]) = two_by_two64(xc, c3)
-> triple_compensated_sum(q[:])
}
@@ -762,37 +762,6 @@
;;
-> (xi, cos_xi, sin_xi)
-}
-
-const triple_compensated_sum = {q : flt64[:]
- /* TODO: verify, with GAPPA or something, that this is correct. */
- std.sort(q, fltabscmp)
- var s1 : flt64, s2 : flt64, s3
- var t1 : flt64, t2 : flt64, t3 : flt64, t4 : flt64, t5 : flt64, t6
- s1 = q[0]
- s2 = 0.0
- s3 = 0.0
- for qq : q[1:]
- (t5, t6) = fast2sum(s3, qq)
- (t3, t4) = fast2sum(s2, t5)
- (t1, t2) = fast2sum(s1, t3)
- s1 = t1
- (s2, s3) = fast2sum(t2, t4 + t6)
- ;;
-
- -> (s1, s2, s3)
-}
-
-const fltabscmp = {x : flt64, y : flt64
- var xb = std.flt64bits(x) & ~(1 << 63)
- var yb = std.flt64bits(y) & ~(1 << 63)
- if xb == yb
- -> `std.Equal
- elif xb > yb
- -> `std.Before
- else
- -> `std.After
- ;;
}
/*
--- a/lib/math/sum-impl.myr
+++ b/lib/math/sum-impl.myr
@@ -1,5 +1,7 @@
use std
+use "util"
+
/* For references, see [Mul+10] section 6.3 */
pkg math =
pkglocal const kahan_sum32 : (l : flt32[:] -> flt32)
@@ -67,12 +69,6 @@
-> s
}
-const mag_cmp32 = {f : flt32, g : flt32
- var u = std.flt32bits(f) & ~(1 << 31)
- var v = std.flt32bits(g) & ~(1 << 31)
- -> std.numcmp(v, u)
-}
-
pkglocal const priest_sum64 = {l : flt64[:]
var l2 = std.sldup(l)
std.sort(l, mag_cmp64)
@@ -80,12 +76,6 @@
var s, c
(s, c) = double_compensated_sum(l2)
-> s
-}
-
-const mag_cmp64 = {f : flt64, g : flt64
- var u = std.flt64bits(f) & ~(1 << 63)
- var v = std.flt64bits(g) & ~(1 << 63)
- -> std.numcmp(v, u)
}
generic double_compensated_sum = {l : @f[:] :: numeric,floating @f
--- a/lib/math/tan-impl.myr
+++ b/lib/math/tan-impl.myr
@@ -430,8 +430,8 @@
;;
var delta1, delta2, deltat
- (delta1, deltat) = fast2sum(-std.flt64frombits(xi), x1)
- (delta2, _) = fast2sum(deltat, x2)
+ (delta1, deltat) = slow2sum(-std.flt64frombits(xi), x1)
+ (delta2, _) = slow2sum(deltat, x2)
var p1, p2
/*
@@ -459,7 +459,7 @@
var u1 = std.flt64frombits(std.flt64bits(u0) & split_mask)
var u2 = u0 - u1
- (ret1, ret2) = fast2sum(u0 - u0*u0*g, u0*((1.0 - u1*f) - u2*f))
+ (ret1, ret2) = slow2sum(u0 - u0*u0*g, u0*((1.0 - u1*f) - u2*f))
goto have_result
;;
@@ -499,7 +499,7 @@
var s : flt64 = x1 * x1
var p : flt64 = horner_polyu(s, tan_coeff[:])
var r1, r2
- (r1, r2) = two_by_two(p, x1)
+ (r1, r2) = two_by_two64(p, x1)
-> fast2sum(r1, r2 + x2)
}
@@ -506,5 +506,5 @@
const pcot = {x1 : flt64, x2 : flt64
var s : flt64 = x1 * x1
var p : flt64 = horner_polyu(s, cot_coeff[:])
- -> fast2sum(p/x1, std.flt64frombits(0x3fd5555555555555)*x2)
+ -> slow2sum(p/x1, std.flt64frombits(0x3fd5555555555555)*x2)
}
--- a/lib/math/util.myr
+++ b/lib/math/util.myr
@@ -16,11 +16,20 @@
const need_round_away : (h : uint64, l : uint64, bitpos_last : int64 -> bool)
/* Multiply x * y to z1 + z2 */
- const two_by_two : (x : flt64, y : flt64 -> (flt64, flt64))
+ const two_by_two64 : (x : flt64, y : flt64 -> (flt64, flt64))
+ const two_by_two32 : (x : flt32, y : flt32 -> (flt32, flt32))
+ /* Compare by magnitude */
+ const mag_cmp32 : (f : flt32, g : flt32 -> std.order)
+ const mag_cmp64 : (f : flt64, g : flt64 -> std.order)
+
/* Return (s, t) such that s + t = a + b, with s = rn(a + b). */
generic fast2sum : (x : @f, y : @f -> (@f, @f)) :: floating, numeric @f
+ generic slow2sum : (x : @f, y : @f -> (@f, @f)) :: floating, numeric @f
+	/* Return (s1, s2, s3) whose sum is a high-precision approximation of the sum of q */
+ const triple_compensated_sum : (q : flt64[:] -> (flt64, flt64, flt64))
+
/* Rounds a + b (as flt64s) to a flt32. */
const round_down : (a : flt64, b : flt64 -> flt32)
;;
@@ -188,7 +197,7 @@
/*
Perform high-prec multiplication: x * y = z1 + z2.
*/
-const two_by_two = {x : flt64, y : flt64
+const two_by_two64 = {x : flt64, y : flt64
var xh : flt64 = std.flt64frombits(std.flt64bits(x) & twentysix_bits_mask)
var xl : flt64 = x - xh
var yh : flt64 = std.flt64frombits(std.flt64bits(y) & twentysix_bits_mask)
@@ -213,13 +222,13 @@
(s, c) = fast2sum(s, a2)
/* a3 */
- (yy, u) = fast2sum(c, a3)
+ (yy, u) = slow2sum(c, a3)
(t, v) = fast2sum(s, yy)
z = u + v
(s, c) = fast2sum(t, z)
/* a4 */
- (yy, u) = fast2sum(c, a4)
+ (yy, u) = slow2sum(c, a4)
(t, v) = fast2sum(s, yy)
z = u + v
(s, c) = fast2sum(t, z)
@@ -227,12 +236,69 @@
-> (s, c)
}
+/*
+   The same, for flt32s: the product of two flt32s is exact in flt64
+   (24 + 24 < 53 significant bits), so multiply in flt64 and split.
+ */
+const two_by_two32 = {x : flt32, y : flt32
+ var xL : flt64 = (x : flt64)
+ var yL : flt64 = (y : flt64)
+ var zL : flt64 = xL * yL
+ var s : flt32 = (zL : flt32)
+ var sL : flt64 = (s : flt64)
+ var cL : flt64 = zL - sL
+ var c : flt32 = (cL : flt32)
+
+ -> (s, c)
+}
+
+const mag_cmp32 = {f : flt32, g : flt32
+ var u = std.flt32bits(f) & ~(1 << 31)
+ var v = std.flt32bits(g) & ~(1 << 31)
+ -> std.numcmp(v, u)
+}
+
+const mag_cmp64 = {f : flt64, g : flt64
+ var u = std.flt64bits(f) & ~(1 << 63)
+ var v = std.flt64bits(g) & ~(1 << 63)
+ -> std.numcmp(v, u)
+}
+
/* Return (s, t) such that s + t = a + b, with s = rn(a + b). */
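+/*
+   This is the branchless 2Sum of Møller and Knuth (cf. [Mul+10]):
+   six flops instead of fast2sum's three, but with no assumption
+   about the relative magnitudes of a and b.
+ */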
+generic slow2sum = {a : @f, b : @f :: floating, numeric @f
+ var s = a + b
+ var aa = s - b
+ var bb = s - aa
+ var da = a - aa
+ var db = b - bb
+ var t = da + db
+ -> (s, t)
+}
+
+/* Return (s, t) such that s + t = a + b, with s = rn(a + b), when you KNOW |a| >= |b|. */
generic fast2sum = {a : @f, b : @f :: floating, numeric @f
var s = a + b
var z = s - a
var t = b - z
-> (s, t)
+}
+
+const triple_compensated_sum = {q : flt64[:]
+ /* TODO: verify, with GAPPA or something, that this is correct. */
+ std.sort(q, mag_cmp64)
+ var s1 : flt64, s2 : flt64, s3
+ var t1 : flt64, t2 : flt64, t3 : flt64, t4 : flt64, t5 : flt64, t6
+ s1 = q[0]
+ s2 = 0.0
+ s3 = 0.0
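+	/*
+	   Cascade: add each qq into the lowest accumulator s3, propagate
+	   the sums up through s2 and s1, and fold the rounding errors
+	   back down into s2 and s3.
+	 */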
+ for qq : q[1:]
+ (t5, t6) = slow2sum(s3, qq)
+ (t3, t4) = slow2sum(s2, t5)
+ (t1, t2) = slow2sum(s1, t3)
+ s1 = t1
+ (s2, s3) = slow2sum(t2, t4 + t6)
+ ;;
+
+ -> (s1, s2, s3)
}
/*