ref: 0dd6c9d5124707e4c69116745469e21f7e076429
parent: e406ac08a6f68623f0454f2983b88575e69ca62b
author: S. Gilles <sgilles@math.umd.edu>
date: Fri Apr 20 10:33:46 EDT 2018
Implement most of expm1. Errors in huge numbers. [Tan92] finishes off calculation of expm1(f) by range expansion, which involves multiplying a number by 2^M, where M grows roughly linearly with f. If f is very large (say, 0x0x42b17200 in single-precision), M may grow to be as larger than the maximal exponent (in this case, M = 128). If 2^M * P is calculated in the normal way, the intermediate value 2^M is not representable, causing mangled output. However, the result may still be computed: the value P is of magnitude < 1. Therefore, a scalb function will be necessary to perform this multiplication without creating unrepresentable intermediate values={
--- a/lib/math/exp-impl.myr
+++ b/lib/math/exp-impl.myr
@@ -1,11 +1,20 @@
use std
use "fpmath"
+use "util"
-/* See [Mul16] (6.2.1), and in particular [Tan89] (which this notation follows) */
+/*
+ See [Mul16] (6.2.1), [Tan89], and [Tan92]. While the exp and
+ expm1 functions are similar enough to be in the same file (Tang's
+ algorithms use the same S constants), they are not quite similar
+ enough to be in the same function.
+ */
pkg math =
pkglocal const exp32 : (f : flt32 -> flt32)
pkglocal const exp64 : (f : flt64 -> flt64)
+
+ pkglocal const expm132 : (f : flt32 -> flt32)
+ pkglocal const expm164 : (f : flt64 -> flt64)
;;
extern const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32)
@@ -22,37 +31,63 @@
tobits : (f : @f -> @u)
frombits : (u : @u -> @f)
inf : @u
+ nan : @u
+
+ /* For exp */
thresh_1_min : @u
thresh_1_max : @u
thresh_2 : @u
+ Ai : @u[:]
+
+ /* For expm1 */
+ thresh_tiny : @u
+ thresh_huge_neg : @u
+ Log3_4 : @u
+ Log5_4 : @u
+ Bi : @u[:]
+ exactness_thresh : @u
+
+ /* For both */
+ nabs : @u
inv_L : @u
L1 : @u
L2 : @u
- nabs : @u
- Ai : @u[:]
S : (@u, @u)[32]
+
;;
-/* Coefficients for p(t), locally approximating exp1m */
-const Ai32 : uint32[:] = [
- 0x00000000,
- 0x00000000,
+const desc32 : fltdesc(flt32, uint32, int32) = [
+ .explode = std.flt32explode,
+ .assem = std.flt32assem,
+ .fma = fma32,
+ .horner = horner_polyu32,
+ .sgnmask = (1 << 31),
+ .tobits = std.flt32bits,
+ .frombits = std.flt32frombits,
+ .inf = 0x7f800000,
+ .nan = 0x7fc0000,
+ .thresh_1_min = 0xc2cff1b4, /* ln(2^(-127 - 23)) ~= -103.9720770839917 */
+ .thresh_1_max = 0x42b17218, /* ln([2 - 2^(-24)] * 2^127) ~= 88.72283905206 */
+ .inv_L = 0x4238aa3b, /* L = log(2) / 32, this is 1/L in working precision */
+ .L1 = 0x3cb17200, /* L1 and L2 sum to give L in high precision, */
+ .L2 = 0x333fbe8e, /* and L1 has some trailing zeroes. */
+ .nabs = 9, /* L1 has 9 trailing zeroes in binary */
+ .Ai = [ /* Coefficients for approximating expm1 in a tiny range */
0x3f000044,
0x3e2aaaec,
- ][:]
-
-const Ai64 : uint64[:] = [
- 0x0000000000000000,
- 0x0000000000000000,
- 0x3fe0000000000000,
- 0x3fc5555555548f7c,
- 0x3fa5555555545d4e,
- 0x3f811115b7aa905e,
- 0x3f56c1728d739765,
- ][:]
-
-/* Double-precise expansions of 2^(J/32) */
-const S32 : (uint32, uint32)[32] = [
+ ][:],
+ .Log3_4 = 0xbe934b11, /* log(1 - 1/4) < x < log(1 + 1/4) */
+ .Log5_4 = 0x3e647fbf, /* triggers special expm1 case */
+ .thresh_tiny = 0x33000000, /* similar to thresh_1_{min,max}, but for expm1 */
+ .thresh_huge_neg = 0xc18aa122,
+ .Bi = [ /* Coefficients for approximating expm1 between log(3/4) and log(5/4) */
+ 0x3e2aaaaa,
+ 0x3d2aaaa0,
+ 0x3c0889ff,
+ 0x3ab64de5,
+ 0x394ab327,
+ ][:],
+ .S = [ /* Double-precise expansions of 2^(J/32) */
(0x3f800000, 0x00000000),
(0x3f82cd80, 0x35531585),
(0x3f85aac0, 0x34d9f312),
@@ -85,9 +120,49 @@
(0x3fefe480, 0x36e66f73),
(0x3ff52540, 0x36f45492),
(0x3ffa8380, 0x36cb6dc9),
- ]
+ ],
+ .exactness_thresh = 24, /* threshold to prevent underflow in a S[high] + 2^-m */
+]
-const S64 : (uint64, uint64)[32] = [
+const desc64 : fltdesc(flt64, uint64, int64) = [
+ .explode = std.flt64explode,
+ .assem = std.flt64assem,
+ .fma = fma64,
+ .horner = horner_polyu64,
+ .sgnmask = (1 << 63),
+ .tobits = std.flt64bits,
+ .frombits = std.flt64frombits,
+ .inf = 0x7ff0000000000000,
+ .nan = 0x7ff8000000000000,
+ .thresh_1_min = 0xc0874910d52d3052, /* ln(2^(-1023 - 52)) ~= -745.1332191019 */
+ .thresh_1_max = 0x40862e42fefa39ef, /* ln([2 - 2^(-53)] * 2^1023) ~= 709.78271289 */
+ .inv_L = 0x40471547652b82fe, /* below as in exp32 */
+ .L1 = 0x3f962e42fef00000,
+ .L2 = 0x3d8473de6af278ed,
+ .nabs = 20,
+ .Ai = [
+ 0x3fe0000000000000,
+ 0x3fc5555555548f7c,
+ 0x3fa5555555545d4e,
+ 0x3f811115b7aa905e,
+ 0x3f56c1728d739765,
+ ][:],
+ .Log3_4 = 0xbfd269621134db93,
+ .Log5_4 = 0x3fcc8ff7c79a9a22,
+ .thresh_tiny = 0x3c90000000000000,
+ .thresh_huge_neg = 0xc042b708872320e1,
+ .Bi = [
+ 0x3fc5555555555549,
+ 0x3fa55555555554b6,
+ 0x3f8111111111a9f3,
+ 0x3f56c16c16ce14c6,
+ 0x3f2a01a01159dd2d,
+ 0x3efa019f635825c4,
+ 0x3ec71e14bfe3db59,
+ 0x3e928295484734ea,
+ 0x3e5a2836aa646b96,
+ ][:],
+ .S = [
(0x3ff0000000000000, 0x0000000000000000),
(0x3ff059b0d3158540, 0x3d0a1d73e2a475b4),
(0x3ff0b5586cf98900, 0x3ceec5317256e308),
@@ -120,52 +195,16 @@
(0x3ffdfc97337b9b40, 0x3cfeb968cac39ed3),
(0x3ffea4afa2a490c0, 0x3cf9858f73a18f5e),
(0x3fff50765b6e4540, 0x3c99d3e12dd8a18b),
- ]
+ ],
+ .exactness_thresh = 53,
+]
const exp32 = {f : flt32
- const d : fltdesc(flt32, uint32, int32) = [
- .explode = std.flt32explode,
- .assem = std.flt32assem,
- .fma = fma32,
- .horner = horner_polyu32,
- .sgnmask = (1 << 31),
- .tobits = std.flt32bits,
- .frombits = std.flt32frombits,
- .inf = 0x7f800000,
- .thresh_1_min = 0xc2cff1b4, /* ln(2^(-127 - 23)) ~= -103.9720770839917 */
- .thresh_1_max = 0x42b17218, /* ln([2 - 2^(-24)] * 2^127) ~= 88.72283905206 */
- .inv_L = 0x4238aa3b, /* L = log(2) / 32, this is 1/L in working precision */
- .L1 = 0x3cb17200, /* L1 and L2 sum to give L in high precision, and L1 */
- .L2 = 0x333fbe8e, /* has some trailing zeroes. */
- .nabs = 9, /* L1 has 9 trailing zeroes in binary */
- .Ai = Ai32,
- .S = S32,
- ]
-
- -> expgen(f, d)
+ -> expgen(f, desc32)
}
const exp64 = {f : flt64
- const d : fltdesc(flt64, uint64, int64) = [
- .explode = std.flt64explode,
- .assem = std.flt64assem,
- .fma = fma64,
- .horner = horner_polyu64,
- .sgnmask = (1 << 63),
- .tobits = std.flt64bits,
- .frombits = std.flt64frombits,
- .inf = 0x7ff0000000000000,
- .thresh_1_min = 0xc0874910d52d3052, /* ln(2^(-1023 - 52)) ~= -745.1332191019 */
- .thresh_1_max = 0x40862e42fefa39ef, /* ln([2 - 2^(-53)] * 2^1023) ~= 709.78271289 */
- .inv_L = 0x40471547652b82fe, /* as in exp32 */
- .L1 = 0x3f962e42fef00000,
- .L2 = 0x3d8473de6af278ed,
- .nabs = 20,
- .Ai = Ai64,
- .S = S64,
- ]
-
- -> expgen(f, d)
+ -> expgen(f, desc64)
}
generic expgen = {f : @f, d : fltdesc(@f, @u, @i) :: numeric,floating,std.equatable @f, numeric,integral @u, numeric,integral @i, roundable @f -> @i
@@ -180,7 +219,7 @@
*/
if !n && b >= d.thresh_1_max
-> d.frombits(d.inf)
- elif n && b >= d.thresh_1_min
+ elif n && b > d.thresh_1_min
-> (0.0 : @f)
;;
@@ -191,9 +230,8 @@
/* Argument reduction to [ -ln(2)/64, ln(2)/64 ] */
var inv_L = d.frombits(d.inv_L)
- var finv_L = f * inv_L
- var N = rn(finv_L)
+ var N = rn(f * inv_L)
var N2 = N % (32 : @i)
if N2 < 0
N2 += (32 : @i)
@@ -220,7 +258,7 @@
/* Compute a polynomial approximation of exp1m */
var R = R1 + R2
- var Q = d.horner(R, d.Ai)
+ var Q = R * R * d.horner(R, d.Ai)
var P = R1 + (R2 + Q)
/* Expand out from reduced range */
@@ -234,6 +272,127 @@
var nu, eu, su
(nu, eu, su) = d.explode(unscaled)
var exp = d.assem(nu, eu + M, su)
+ if (d.tobits(exp) == 0)
+ /* Make sure we don't quite return 0 */
+ -> d.frombits(1)
+ ;;
-> exp
+}
+
+const expm132 = {f : flt32
+ -> expm1gen(f, desc32)
+}
+
+const expm164 = {f : flt64
+ -> expm1gen(f, desc64)
+}
+
+generic expm1gen = {f : @f, d : fltdesc(@f, @u, @i) :: numeric,floating,std.equatable @f, numeric,integral @u, numeric,integral @i, roundable @f -> @i
+ var b = d.tobits(f)
+ var n, e, s
+ (n, e, s) = d.explode(f)
+
+ /* Special cases: +/- 0, inf, NaN, tiny, and huge */
+ if (b & ~d.sgnmask == 0)
+ -> f
+ elif n && (b & ~d.sgnmask == d.inf)
+ -> (-1.0 : @f)
+ elif (b & ~d.sgnmask == d.inf)
+ -> f
+ elif std.isnan(f)
+ -> d.frombits(d.nan)
+ elif (b & ~d.sgnmask) <= d.thresh_tiny
+ var two_to_large = d.assem(false, 100, 0)
+ var two_to_small = d.assem(false, -100, 0)
+ var abs_f = d.assem(false, e, s)
+ -> (two_to_large * f + abs_f) * two_to_small
+ elif !n && b >= d.thresh_1_max /* exp(x) = oo <=> expm1(x) = oo, as it turns out */
+ -> d.frombits(d.inf)
+ elif n && b >= d.thresh_huge_neg
+ -> (-1.0 : @f)
+ ;;
+
+ if ((n && b < d.Log3_4) || (!n && b < d.Log5_4))
+ /* Procedure 2 */
+
+ /* compute x^2 / 2 with extra precision */
+ var u = round(f, d)
+ var v = f - u
+ var y = u * u * (0.5 : @f)
+ var z = v * (f + u) * (0.5 : @f)
+ var q = f * f * f * d.horner(f, d.Bi)
+
+ var yn, ye, ys
+ (yn, ye, ys) = d.explode(y)
+ if (ye >= -7)
+ -> (u + y) + (q + (v + z))
+ else
+ -> f + (y + (q + z))
+ ;;
+ ;;
+
+ /* Procedure 1 */
+ var inv_L = d.frombits(d.inv_L)
+
+ var N = rn(f * inv_L)
+ var N2 = N % (32 : @i)
+ if N2 < 0
+ N2 += (32 : @i)
+ ;;
+ var N1 = N - N2
+
+ var R1 : @f, R2 : @f
+
+ /*
+ As in the exp case, R1 + R2 very well approximates f
+ reduced into [ -ln(2)/64, ln(2)/64 ]
+ */
+ if std.abs(N) >= (1 << d.nabs)
+ R1 = (f - (N1 : @f) * d.frombits(d.L1)) - ((N2 : @f) * d.frombits(d.L1))
+ else
+ R1 = f - (N : @f) * d.frombits(d.L1)
+ ;;
+ R2 = -1.0 * (N : @f) * d.frombits(d.L2)
+
+ var M = (N1 : @i) / 32
+ var J = (N2 : std.size)
+
+ /* Compute a polynomial approximation of exp1m */
+ var R = R1 + R2
+ var Q = R * R * d.horner(R, d.Ai)
+ var P = R1 + (R2 + Q)
+
+ /* Expand out */
+ var Su_hi, Su_lo
+ (Su_hi, Su_lo) = d.S[J]
+ var S_hi = d.frombits(Su_hi)
+ var S_lo = d.frombits(Su_lo)
+ var S = S_hi + S_lo
+
+ /*
+ Note that [Tan92] includes cases to avoid tripping the
+ underflow flag when not technically required. We currently
+ do not care about such flags, so that logic is omitted.
+ */
+ var two_m = d.assem(false, M, 0)
+ var two_neg_m = d.assem(false, -M, 0)
+ if M >= d.exactness_thresh
+ -> two_m * (S_hi + (S * P + (S_lo - two_neg_m)))
+ elif M <= -8
+ -> two_m * (S_hi + (S * P + S_lo)) - (1.0 : @f)
+ ;;
+
+ -> two_m * ((S_hi - two_neg_m) + (S_hi * P + S_lo * (P + (1.0 : @f))))
+}
+
+generic round = {f : @f, d : fltdesc(@f, @u, @i) :: numeric,floating,std.equatable @f, numeric,integral @u, numeric,integral @i, roundable @f -> @i
+ var b = d.tobits(f)
+ var n, e, s
+ (n, e, s) = d.explode(f)
+ var mask = ~((1 << d.nabs) - 1)
+ if need_round_away(0, ((s & mask) : uint64), (d.nabs : int64) + 1)
+ -> d.assem(n, e, 1 + s & mask)
+ ;;
+ -> d.assem(n, e, s & mask)
}
--- a/lib/math/fpmath.myr
+++ b/lib/math/fpmath.myr
@@ -5,6 +5,7 @@
/* exp-impl */
exp : (f : @f -> @f)
+ expm1 : (f : @f -> @f)
/* fma-impl */
fma : (x : @f, y : @f, z : @f -> @f)
@@ -67,6 +68,7 @@
fma = {x, y, z; -> fma32(x, y, z)}
exp = {f; -> exp32(f)}
+ expm1 = {f; -> expm132(f)}
horner_poly = {f, a; -> horner_poly32(f, a)}
horner_polyu = {f, a; -> horner_polyu32(f, a)}
@@ -86,6 +88,7 @@
fma = {x, y, z; -> fma64(x, y, z)}
exp = {f; -> exp64(f)}
+ expm1 = {f; -> expm164(f)}
horner_poly = {f, a; -> horner_poly64(f, a)}
horner_polyu = {f, a; -> horner_polyu64(f, a)}
@@ -105,6 +108,9 @@
extern const exp32 : (x : flt32 -> flt32)
extern const exp64 : (x : flt64 -> flt64)
+
+extern const expm132 : (x : flt32 -> flt32)
+extern const expm164 : (x : flt64 -> flt64)
extern const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32)
extern const fma64 : (x : flt64, y : flt64, z : flt64 -> flt64)
--- a/lib/math/references
+++ b/lib/math/references
@@ -19,3 +19,9 @@
Function in IEEE Floating-point Arithmetic”. In: ACM Trans. Math.
Softw. 15.2 (June 1989), pp. 144–157. issn: 0098-3500. doi:
10.1145/63522.214389. url: http://doi.acm.org/10.1145/63522.214389.
+
+[Tan92]
+Ping Tak Peter Tang. “Table-driven Implementation of the Expm1
+Function in IEEE Floating- point Arithmetic”. In: ACM Trans. Math.
+Softw. 18.2 (June 1992), pp. 211–222. issn: 0098-3500. doi:
+10.1145/146847.146928. url: http://doi.acm.org/10.1145/146847.146928.
--- a/lib/math/test/exp-impl.myr
+++ b/lib/math/test/exp-impl.myr
@@ -8,6 +8,7 @@
[.name="exp-02", .fn = exp02],
[.name="exp-03", .fn = exp03],
[.name="exp-04", .fn = exp04],
+ [.name="expm1-01", .fn = expm101],
][:])
}
@@ -19,6 +20,15 @@
(0x42000000, 0x568fa1fe),
(0xc2b00000, 0x0041edc4),
(0xc2b20000, 0x001840fc),
+ (0x7f7fffff, 0x7f800000),
+ (0x7f800000, 0x7f800000),
+ (0x7f800001, 0x7fc00000),
+ (0xc2cff1b3, 0x00000001),
+ (0xc2cff1b4, 0x00000001),
+ (0xc2cff1b5, 0000000000),
+ (0x42b17217, 0x7f7fff84),
+ (0x42b17218, 0x7f800000),
+ (0x42b17219, 0x7f800000),
][:]
for (x, y) : inputs
@@ -99,3 +109,28 @@
;;
;;
}
+
+const expm101 = {c
+ var inputs : (uint32, uint32)[:] = [
+ (0x00000000, 0x00000000),
+ (0x3f000000, 0x3f261299),
+ (0x34000000, 0x34000000),
+ (0x3c000000, 0x3c008056),
+ (0x42000000, 0x568fa1fe),
+ (0xc2b00000, 0xbf800000),
+ (0xc2b20000, 0xbf800000),
+ (0x01000000, 0x01000000),
+ (0x40000000, 0x40cc7326),
+ (0x42b17200, 0x7f7d7740),
+ ][:]
+
+ for (x, y) : inputs
+ var xf : flt32 = std.flt32frombits(x)
+ var yf : flt32 = std.flt32frombits(y)
+ var rf = math.expm1(xf)
+ testr.check(c, rf == yf,
+ "expm1(0x{b=16,w=8,p=0}) should be 0x{b=16,w=8,p=0}, was 0x{b=16,w=8,p=0}",
+ x, y, std.flt32bits(rf))
+ ;;
+}
+