ref: 5de1231c0605526b22779dd827cc96461de40f77
parent: d27469804f73bf0874250eda520532875ecf5645
author: S. Gilles <sgilles@math.umd.edu>
date: Thu Mar 22 07:10:21 EDT 2018
Drop "fpmath-" prefix; it's redundant
--- a/lib/math/bld.sub
+++ b/lib/math/bld.sub
@@ -2,15 +2,15 @@
fpmath.myr
# trunc, floor, ceil
- fpmath-trunc-impl+posixy-x64-sse4.s
- fpmath-trunc-impl.myr
+ trunc-impl+posixy-x64-sse4.s
+ trunc-impl.myr
# summation
- fpmath-sum-impl.myr
+ sum-impl.myr
# fused-multiply-add
- fpmath-fma-impl+posixy-x64-fma.s
- fpmath-fma-impl.myr
+ fma-impl+posixy-x64-fma.s
+ fma-impl.myr
lib ../std:std
;;
--- /dev/null
+++ b/lib/math/fma-impl+posixy-x64-fma.s
@@ -1,0 +1,13 @@
+.globl math$fma32
+.globl math$_fma32
+math$fma32:
+math$_fma32:
+ vfmadd132ss %xmm1, %xmm2, %xmm0
+ ret
+
+.globl math$fma64
+.globl math$_fma64
+math$fma64:
+math$_fma64:
+ vfmadd132sd %xmm1, %xmm2, %xmm0
+ ret
--- /dev/null
+++ b/lib/math/fma-impl.myr
@@ -1,0 +1,551 @@
+use std
+
+pkg math =
+ pkglocal const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32)
+ pkglocal const fma64 : (x : flt64, y : flt64, z : flt64 -> flt64)
+;;
+
+const exp_mask32 : uint32 = 0xff << 23
+const exp_mask64 : uint64 = 0x7ff << 52
+
+pkglocal const fma32 = {x : flt32, y : flt32, z : flt32
+ var xn, yn
+ (xn, _, _) = std.flt32explode(x)
+ (yn, _, _) = std.flt32explode(y)
+ var xd : flt64 = flt64fromflt32(x)
+ var yd : flt64 = flt64fromflt32(y)
+ var zd : flt64 = flt64fromflt32(z)
+ var prod : flt64 = xd * yd
+ var pn, pe, ps
+ (pn, pe, ps) = std.flt64explode(prod)
+ if pe == -1023
+ pe = -1022
+ ;;
+ if pn != (xn != yn)
+ /* In case of NaNs, sign might not have been preserved */
+ pn = (xn != yn)
+ prod = std.flt64assem(pn, pe, ps)
+ ;;
+
+ var r : flt64 = prod + zd
+ var rn, re, rs
+ (rn, re, rs) = std.flt64explode(r)
+
+ /*
+ At this point, r is probably the correct answer. The
+ only issue is that if truncating r to a flt32 causes
+	   rounding-to-even, and r was itself obtained by rounding in
+	   that same direction, then we'd be over-rounding. The
+ only way this could possibly be a problem is if the
+ product step rounded to the halfway point (a 100...0,
+ with the 1 just outside truncation range).
+ */
+ if re == 1024 || rs & 0x1fffffff != 0x10000000
+ -> flt32fromflt64(r)
+ ;;
+
+ /*
+ At this point, there's definitely about to be a rounding
+ error. To figure out what to do, compute prod + z with
+ round-to-zero. If we get r again, then it's okay to round
+ r upwards, because it hasn't been rounded away from zero
+ yet and we allow ourselves one such rounding.
+ */
+ var zn, ze, zs
+ (zn, ze, zs) = std.flt64explode(zd)
+ if ze == -1023
+ ze = -1022
+ ;;
+ var rtn, rte, rts
+ if pe >= ze && pn == zn
+ (rtn, rte, rts) = (pn, pe, ps)
+ rts += shr(zs, pe - ze)
+ elif pe > ze || (pe == ze && ps > zs)
+ (rtn, rte, rts) = (pn, pe, ps)
+ rts -= shr(zs, pe - ze)
+ if shr((-1 : uint64), 64 - std.min(64, (pe - ze))) & zs != 0
+ rts--
+ ;;
+ elif pe < ze && pn == zn
+ (rtn, rte, rts) = (zn, ze, zs)
+ rts += shr(ps, ze - pe)
+ else
+ (rtn, rte, rts) = (zn, ze, zs)
+ rts -= shr(ps, ze - pe)
+ if shr((-1 : uint64), 64 - std.min(64, (ze - pe))) & ps != 0
+ rts--
+ ;;
+ ;;
+
+ if rts & (1 << 53) != 0
+ rts >>= 1
+ rte++
+ ;;
+
+ if rn == rtn && rs == rts && re == rte
+ rts++
+ if rts & (1 << 53) != 0
+ rts >>= 1
+ rte++
+ ;;
+ ;;
+ -> flt32fromflt64(std.flt64assem(rtn, rte, rts))
+}
+
+const flt64fromflt32 = {f : flt32
+ var n, e, s
+ (n, e, s) = std.flt32explode(f)
+ var xs : uint64 = (s : uint64)
+ var xe : int64 = (e : int64)
+
+ if e == 128
+ -> std.flt64assem(n, 1024, xs)
+ elif e == -127
+ /*
+ All subnormals in single precision (except 0.0s)
+ can be upgraded to double precision, since the
+ exponent range is so much wider.
+ */
+ var first1 = find_first1_64(xs, 23)
+ if first1 < 0
+ -> std.flt64assem(n, -1023, 0)
+ ;;
+ xs = xs << (52 - (first1 : uint64))
+ xe = -126 - (23 - first1)
+ -> std.flt64assem(n, xe, xs)
+ ;;
+
+ -> std.flt64assem(n, xe, xs << (52 - 23))
+}
+
+const flt32fromflt64 = {f : flt64
+ var n : bool, e : int64, s : uint64
+ (n, e, s) = std.flt64explode(f)
+ var ts : uint32
+ var te : int32 = (e : int32)
+
+ if e >= 128
+		if e == 1024 && s & 0xfffffffffffff != 0
+ /* NaN */
+ -> std.flt32assem(n, 128, 1)
+ else
+ /* infinity */
+ -> std.flt32assem(n, 128, 0)
+ ;;
+ ;;
+
+ if e >= -127
+ /* normal */
+ ts = ((s >> (52 - 23)) : uint32)
+ if need_round_away(0, s, 52 - 23)
+ ts++
+ if ts & (1 << 24) != 0
+ ts >>= 1
+ te++
+ ;;
+ ;;
+ -> std.flt32assem(n, te, ts)
+ ;;
+
+ /* subnormal already, will have to go to 0 */
+ if e == -1023
+ -> std.flt32assem(n, -127, 0)
+ ;;
+
+ /* subnormal (at least, it will be) */
+ te = -127
+ var shift : int64 = (52 - 23) + (-126 - e)
+ var ts1 = shr(s, shift)
+ ts = (ts1 : uint32)
+ if need_round_away(0, s, shift)
+ ts++
+ ;;
+ -> std.flt32assem(n, te, ts)
+}
+
+pkglocal const fma64 = {x : flt64, y : flt64, z : flt64
+ var xn : bool, yn : bool, zn : bool
+ var xe : int64, ye : int64, ze : int64
+ var xs : uint64, ys : uint64, zs : uint64
+
+ var xb : uint64 = std.flt64bits(x)
+ var yb : uint64 = std.flt64bits(y)
+ var zb : uint64 = std.flt64bits(z)
+
+ /* check for both NaNs and infinities */
+ if xb & exp_mask64 == exp_mask64 || \
+ yb & exp_mask64 == exp_mask64
+ -> x * y + z
+ elif z == 0.0 || z == -0.0 || x * y == 0.0 || x * y == -0.0
+ -> x * y + z
+ elif zb & exp_mask64 == exp_mask64
+ -> z
+ ;;
+
+ (xn, xe, xs) = std.flt64explode(x)
+ (yn, ye, ys) = std.flt64explode(y)
+ (zn, ze, zs) = std.flt64explode(z)
+ if xe == -1023
+ xe = -1022
+ ;;
+ if ye == -1023
+ ye = -1022
+ ;;
+ if ze == -1023
+ ze = -1022
+ ;;
+
+ /* Keep product in high/low uint64s */
+ var xs_h : uint64 = xs >> 32
+ var ys_h : uint64 = ys >> 32
+ var xs_l : uint64 = xs & 0xffffffff
+ var ys_l : uint64 = ys & 0xffffffff
+
+ var t_l : uint64 = xs_l * ys_l
+ var t_m : uint64 = xs_l * ys_h + xs_h * ys_l
+ var t_h : uint64 = xs_h * ys_h
+
+ var prod_l : uint64 = t_l + (t_m << 32)
+ var prod_h : uint64 = t_h + (t_m >> 32)
+ if t_l > prod_l
+ prod_h++
+ ;;
+
+ var prod_n = xn != yn
+ var prod_lastbit_e = (xe - 52) + (ye - 52)
+ var prod_first1 = find_first1_64_hl(prod_h, prod_l, 105)
+ var prod_firstbit_e = prod_lastbit_e + prod_first1
+
+ var z_firstbit_e = ze
+ var z_lastbit_e = ze - 52
+ var z_first1 = 52
+
+ /* subnormals could throw firstbit_e calculations out of whack */
+ if (zb & exp_mask64 == 0)
+ z_first1 = find_first1_64(zs, z_first1)
+ z_firstbit_e = z_lastbit_e + z_first1
+ ;;
+
+ var res_n
+ var res_h = 0
+ var res_l = 0
+ var res_first1
+ var res_lastbit_e
+ var res_firstbit_e
+
+ if prod_n == zn
+ res_n = prod_n
+
+ /*
+ Align prod and z so that the top bit of the
+ result is either 53 or 54, then add.
+ */
+ if prod_firstbit_e >= z_firstbit_e
+ /*
+ [ prod_h ][ prod_l ]
+ [ z...
+ */
+ res_lastbit_e = prod_lastbit_e
+ (res_h, res_l) = (prod_h, prod_l)
+ (res_h, res_l) = add_shifted(res_h, res_l, zs, z_lastbit_e - prod_lastbit_e)
+ else
+ /*
+ [ prod_h ][ prod_l ]
+ [ z...
+ */
+ res_lastbit_e = z_lastbit_e - 64
+ res_h = zs
+ res_l = 0
+ if prod_lastbit_e >= res_lastbit_e + 64
+ /* In this situation, prod must be extremely subnormal */
+ res_h += shl(prod_l, prod_lastbit_e - res_lastbit_e - 64)
+ elif prod_lastbit_e >= res_lastbit_e
+ res_h += shl(prod_h, prod_lastbit_e - res_lastbit_e)
+ res_h += shr(prod_l, res_lastbit_e + 64 - prod_lastbit_e)
+ res_l += shl(prod_l, prod_lastbit_e - res_lastbit_e)
+ elif prod_lastbit_e + 64 >= res_lastbit_e
+ res_h += shr(prod_h, res_lastbit_e - prod_lastbit_e)
+ var l1 = shl(prod_h, prod_lastbit_e + 64 - res_lastbit_e)
+ var l2 = shr(prod_l, res_lastbit_e - prod_lastbit_e)
+ res_l = l1 + l2
+ if res_l < l1
+ res_h++
+ ;;
+ elif prod_lastbit_e + 128 >= res_lastbit_e
+ res_l += shr(prod_h, res_lastbit_e - prod_lastbit_e - 64)
+ ;;
+ ;;
+ else
+ match compare_hl_z(prod_h, prod_l, prod_firstbit_e, prod_lastbit_e, zs, z_firstbit_e, z_lastbit_e)
+ | `std.Equal: -> 0.0
+ | `std.Before:
+ /* prod > z */
+ res_n = prod_n
+ res_lastbit_e = prod_lastbit_e
+ (res_h, res_l) = sub_shifted(prod_h, prod_l, zs, z_lastbit_e - prod_lastbit_e)
+ | `std.After:
+ /* z > prod */
+ res_n = zn
+ res_lastbit_e = z_lastbit_e - 64
+ (res_h, res_l) = sub_shifted(zs, 0, prod_h, prod_lastbit_e + 64 - (z_lastbit_e - 64))
+ (res_h, res_l) = sub_shifted(res_h, res_l, prod_l, prod_lastbit_e - (z_lastbit_e - 64))
+ ;;
+ ;;
+
+ res_first1 = 64 + find_first1_64(res_h, 55)
+ if res_first1 == 63
+ res_first1 = find_first1_64(res_l, 63)
+ ;;
+ res_firstbit_e = res_first1 + res_lastbit_e
+
+ /*
+ Finally, res_h and res_l are the high and low bits of
+ the result. They now need to be assembled into a flt64.
+ Subnormals and infinities could be a problem.
+ */
+ var res_s = 0
+ if res_firstbit_e <= -1023
+ /* Subnormal case */
+ if res_lastbit_e + 128 < 12 - 1022
+ res_s = shr(res_h, 12 - 1022 - (res_lastbit_e + 128))
+ res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64))
+ elif res_lastbit_e + 64 < 12 - 1022
+ res_s = shl(res_h, -12 + (res_lastbit_e + 128) - (-1022))
+ res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64))
+ else
+ res_s = shl(res_h, -12 + (res_lastbit_e + 128) - (-1022))
+ res_s |= shl(res_l, -12 + (res_lastbit_e + 64) - (-1022))
+ ;;
+
+ if need_round_away(res_h, res_l, res_first1 + (-1074 - res_firstbit_e))
+ res_s++
+ ;;
+
+ /* No need for exponents, they are all zero */
+ var res = res_s
+ if res_n
+ res |= (1 << 63)
+ ;;
+ -> std.flt64frombits(res)
+ ;;
+
+ if res_firstbit_e >= 1024
+ /* Infinity case */
+ if res_n
+ -> std.flt64frombits(0xfff0000000000000)
+ else
+ -> std.flt64frombits(0x7ff0000000000000)
+ ;;
+ ;;
+
+ if res_first1 - 52 >= 64
+ res_s = shr(res_h, (res_first1 : int64) - 64 - 52)
+ if need_round_away(res_h, res_l, res_first1 - 52)
+ res_s++
+ ;;
+ elif res_first1 - 52 >= 0
+ res_s = shl(res_h, 64 - (res_first1 - 52))
+ res_s |= shr(res_l, res_first1 - 52)
+ if need_round_away(res_h, res_l, res_first1 - 52)
+ res_s++
+ ;;
+ else
+		res_s = shl(res_l, 52 - res_first1)
+ ;;
+
+ /* The res_s++s might have messed everything up */
+ if res_s & (1 << 53) != 0
+		res_s >>= 1
+ res_firstbit_e++
+ if res_firstbit_e >= 1024
+ if res_n
+ -> std.flt64frombits(0xfff0000000000000)
+ else
+ -> std.flt64frombits(0x7ff0000000000000)
+ ;;
+ ;;
+ ;;
+
+ -> std.flt64assem(res_n, res_firstbit_e, res_s)
+}
+
+/* >> and <<, but without wrapping when the shift is >= 64 */
+const shr : (u : uint64, s : int64 -> uint64) = {u : uint64, s : int64
+ if (s : uint64) >= 64
+ -> 0
+ else
+ -> u >> (s : uint64)
+ ;;
+}
+
+const shl : (u : uint64, s : int64 -> uint64) = {u : uint64, s : int64
+ if (s : uint64) >= 64
+ -> 0
+ else
+ -> u << (s : uint64)
+ ;;
+}
+
+/*
+ Add (a << s) to [ h ][ l ], where if s < 0 then a corresponding
+ right-shift is used. This is aligned such that if s == 0, then
+ the result is [ h ][ l + a ]
+ */
+const add_shifted = {h : uint64, l : uint64, a : uint64, s : int64
+ if s >= 64
+ -> (h + shl(a, s - 64), l)
+ elif s >= 0
+ var new_h = h + shr(a, 64 - s)
+ var sa = shl(a, s)
+ var new_l = l + sa
+ if new_l < l
+ new_h++
+ ;;
+ -> (new_h, new_l)
+ else
+ var new_h = h
+ var sa = shr(a, -s)
+ var new_l = l + sa
+ if new_l < l
+ new_h++
+ ;;
+ -> (new_h, new_l)
+ ;;
+}
+
+/* As above, but subtract (a << s) */
+const sub_shifted = {h : uint64, l : uint64, a : uint64, s : int64
+ if s >= 64
+ -> (h - shl(a, s - 64), l)
+ elif s >= 0
+ var new_h = h - shr(a, 64 - s)
+ var sa = shl(a, s)
+ var new_l = l - sa
+ if sa > l
+ new_h--
+ ;;
+ -> (new_h, new_l)
+ else
+ var new_h = h
+ var sa = shr(a, -s)
+ var new_l = l - sa
+ if sa > l
+ new_h--
+ ;;
+ -> (new_h, new_l)
+ ;;
+}
+
+const compare_hl_z = {h : uint64, l : uint64, hl_firstbit_e : int64, hl_lastbit_e : int64, z : uint64, z_firstbit_e : int64, z_lastbit_e : int64
+ if hl_firstbit_e > z_firstbit_e
+ -> `std.Before
+ elif hl_firstbit_e < z_firstbit_e
+ -> `std.After
+ ;;
+
+ var h_k : int64 = (hl_firstbit_e - hl_lastbit_e - 64)
+ var z_k : int64 = (z_firstbit_e - z_lastbit_e)
+ while h_k >= 0 && z_k >= 0
+ var h1 = h & shl(1, h_k) != 0
+ var z1 = z & shl(1, z_k) != 0
+ if h1 && !z1
+ -> `std.Before
+ elif !h1 && z1
+ -> `std.After
+ ;;
+ h_k--
+ z_k--
+ ;;
+
+ if z_k < 0
+ if (h & shr((-1 : uint64), 64 - h_k) != 0) || (l != 0)
+ -> `std.Before
+ else
+ -> `std.Equal
+ ;;
+ ;;
+
+ var l_k : int64 = 63
+ while l_k >= 0 && z_k >= 0
+ var l1 = l & shl(1, l_k) != 0
+ var z1 = z & shl(1, z_k) != 0
+ if l1 && !z1
+ -> `std.Before
+ elif !l1 && z1
+ -> `std.After
+ ;;
+ l_k--
+ z_k--
+ ;;
+
+ if (z_k < 0) && (l & shr((-1 : uint64), 64 - l_k) != 0)
+ -> `std.Before
+ elif (l_k < 0) && (z & shr((-1 : uint64), 64 - z_k) != 0)
+ -> `std.After
+ ;;
+
+ -> `std.Equal
+}
+
+/* Find the first 1 bit in a bitstring */
+const find_first1_64 : (b : uint64, start : int64 -> int64) = {b : uint64, start : int64
+ for var j = start; j >= 0; --j
+ var m = shl(1, j)
+ if b & m != 0
+ -> j
+ ;;
+ ;;
+
+ -> -1
+}
+
+const find_first1_64_hl = {h, l, start
+ var first1_h = find_first1_64(h, start - 64)
+ if first1_h >= 0
+ -> first1_h + 64
+ ;;
+
+ -> find_first1_64(l, 63)
+}
+
+/*
+ For [ h ][ l ], where bitpos_last is the position of the last
+ bit that was included in the truncated result (l's last bit has
+ position 0), decide whether rounding up/away is needed. This is
+ true if
+
+ - following bitpos_last is a 1, then a non-zero sequence, or
+
+   - following bitpos_last is a 1, then a zero sequence (a tie), and
+     the retained low bit is odd, so rounding away gives the even result
+ */
+const need_round_away = {h : uint64, l : uint64, bitpos_last : int64
+ var first_omitted_is_1 = false
+ var nonzero_beyond = false
+ if bitpos_last > 64
+ first_omitted_is_1 = h & shl(1, bitpos_last - 1 - 64) != 0
+		nonzero_beyond = nonzero_beyond || h & shr((-1 : uint64), 1 + 64 - (bitpos_last - 64)) != 0
+ nonzero_beyond = nonzero_beyond || (l != 0)
+ else
+ first_omitted_is_1 = l & shl(1, bitpos_last - 1) != 0
+ nonzero_beyond = nonzero_beyond || l & shr((-1 : uint64), 1 + 64 - bitpos_last) != 0
+ ;;
+
+ if !first_omitted_is_1
+ -> false
+ ;;
+
+ if nonzero_beyond
+ -> true
+ ;;
+
+ var hl_is_odd = false
+
+ if bitpos_last >= 64
+ hl_is_odd = h & shl(1, bitpos_last - 64) != 0
+ else
+ hl_is_odd = l & shl(1, bitpos_last) != 0
+ ;;
+
+ -> hl_is_odd
+}
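
The halfway check in fma32 above (rs & 0x1fffffff != 0x10000000) guards against
double rounding: prod + zd is itself a rounded flt64, and truncating it to flt32
rounds a second time. A small worked example with toy precisions (round to 4
fraction bits, then to 2, nearest-even at each step; purely illustrative, not
taken from the code):

	v                        = 1.01011111...b
	rounded once to 2 bits   = 1.01b    (discarded 011111... < 1/2: round down)
	rounded to 4 bits first  = 1.0110b  (discarded 111... > 1/2: round up, onto a halfway pattern)
	that rounded to 2 bits   = 1.10b    (exact tie; 1.01 is odd, so round to even, i.e. away)

The two-step result ends up one ulp above the correctly rounded one, which is why
fma32 watches for the halfway pattern and re-derives the low-order decision from
the round-to-zero sum of prod and z.
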
--- a/lib/math/fpmath-fma-impl+posixy-x64-fma.s
+++ /dev/null
@@ -1,13 +1,0 @@
-.globl math$fma32
-.globl math$_fma32
-math$fma32:
-math$_fma32:
- vfmadd132ss %xmm1, %xmm2, %xmm0
- ret
-
-.globl math$fma64
-.globl math$_fma64
-math$fma64:
-math$_fma64:
- vfmadd132sd %xmm1, %xmm2, %xmm0
- ret
--- a/lib/math/fpmath-fma-impl.myr
+++ /dev/null
@@ -1,551 +1,0 @@
-use std
-
-pkg math =
- pkglocal const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32)
- pkglocal const fma64 : (x : flt64, y : flt64, z : flt64 -> flt64)
-;;
-
-const exp_mask32 : uint32 = 0xff << 23
-const exp_mask64 : uint64 = 0x7ff << 52
-
-pkglocal const fma32 = {x : flt32, y : flt32, z : flt32
- var xn, yn
- (xn, _, _) = std.flt32explode(x)
- (yn, _, _) = std.flt32explode(y)
- var xd : flt64 = flt64fromflt32(x)
- var yd : flt64 = flt64fromflt32(y)
- var zd : flt64 = flt64fromflt32(z)
- var prod : flt64 = xd * yd
- var pn, pe, ps
- (pn, pe, ps) = std.flt64explode(prod)
- if pe == -1023
- pe = -1022
- ;;
- if pn != (xn != yn)
- /* In case of NaNs, sign might not have been preserved */
- pn = (xn != yn)
- prod = std.flt64assem(pn, pe, ps)
- ;;
-
- var r : flt64 = prod + zd
- var rn, re, rs
- (rn, re, rs) = std.flt64explode(r)
-
- /*
- At this point, r is probably the correct answer. The
- only issue is that if truncating r to a flt32 causes
- rounding-to-even, and it was obtained by rounding in the
- first place, direction, then we'd be over-rounding. The
- only way this could possibly be a problem is if the
- product step rounded to the halfway point (a 100...0,
- with the 1 just outside truncation range).
- */
- if re == 1024 || rs & 0x1fffffff != 0x10000000
- -> flt32fromflt64(r)
- ;;
-
- /*
- At this point, there's definitely about to be a rounding
- error. To figure out what to do, compute prod + z with
- round-to-zero. If we get r again, then it's okay to round
- r upwards, because it hasn't been rounded away from zero
- yet and we allow ourselves one such rounding.
- */
- var zn, ze, zs
- (zn, ze, zs) = std.flt64explode(zd)
- if ze == -1023
- ze = -1022
- ;;
- var rtn, rte, rts
- if pe >= ze && pn == zn
- (rtn, rte, rts) = (pn, pe, ps)
- rts += shr(zs, pe - ze)
- elif pe > ze || (pe == ze && ps > zs)
- (rtn, rte, rts) = (pn, pe, ps)
- rts -= shr(zs, pe - ze)
- if shr((-1 : uint64), 64 - std.min(64, (pe - ze))) & zs != 0
- rts--
- ;;
- elif pe < ze && pn == zn
- (rtn, rte, rts) = (zn, ze, zs)
- rts += shr(ps, ze - pe)
- else
- (rtn, rte, rts) = (zn, ze, zs)
- rts -= shr(ps, ze - pe)
- if shr((-1 : uint64), 64 - std.min(64, (ze - pe))) & ps != 0
- rts--
- ;;
- ;;
-
- if rts & (1 << 53) != 0
- rts >>= 1
- rte++
- ;;
-
- if rn == rtn && rs == rts && re == rte
- rts++
- if rts & (1 << 53) != 0
- rts >>= 1
- rte++
- ;;
- ;;
- -> flt32fromflt64(std.flt64assem(rtn, rte, rts))
-}
-
-const flt64fromflt32 = {f : flt32
- var n, e, s
- (n, e, s) = std.flt32explode(f)
- var xs : uint64 = (s : uint64)
- var xe : int64 = (e : int64)
-
- if e == 128
- -> std.flt64assem(n, 1024, xs)
- elif e == -127
- /*
- All subnormals in single precision (except 0.0s)
- can be upgraded to double precision, since the
- exponent range is so much wider.
- */
- var first1 = find_first1_64(xs, 23)
- if first1 < 0
- -> std.flt64assem(n, -1023, 0)
- ;;
- xs = xs << (52 - (first1 : uint64))
- xe = -126 - (23 - first1)
- -> std.flt64assem(n, xe, xs)
- ;;
-
- -> std.flt64assem(n, xe, xs << (52 - 23))
-}
-
-const flt32fromflt64 = {f : flt64
- var n : bool, e : int64, s : uint64
- (n, e, s) = std.flt64explode(f)
- var ts : uint32
- var te : int32 = (e : int32)
-
- if e >= 128
- if e == 1023 && s != 0
- /* NaN */
- -> std.flt32assem(n, 128, 1)
- else
- /* infinity */
- -> std.flt32assem(n, 128, 0)
- ;;
- ;;
-
- if e >= -127
- /* normal */
- ts = ((s >> (52 - 23)) : uint32)
- if need_round_away(0, s, 52 - 23)
- ts++
- if ts & (1 << 24) != 0
- ts >>= 1
- te++
- ;;
- ;;
- -> std.flt32assem(n, te, ts)
- ;;
-
- /* subnormal already, will have to go to 0 */
- if e == -1023
- -> std.flt32assem(n, -127, 0)
- ;;
-
- /* subnormal (at least, it will be) */
- te = -127
- var shift : int64 = (52 - 23) + (-126 - e)
- var ts1 = shr(s, shift)
- ts = (ts1 : uint32)
- if need_round_away(0, s, shift)
- ts++
- ;;
- -> std.flt32assem(n, te, ts)
-}
-
-pkglocal const fma64 = {x : flt64, y : flt64, z : flt64
- var xn : bool, yn : bool, zn : bool
- var xe : int64, ye : int64, ze : int64
- var xs : uint64, ys : uint64, zs : uint64
-
- var xb : uint64 = std.flt64bits(x)
- var yb : uint64 = std.flt64bits(y)
- var zb : uint64 = std.flt64bits(z)
-
- /* check for both NaNs and infinities */
- if xb & exp_mask64 == exp_mask64 || \
- yb & exp_mask64 == exp_mask64
- -> x * y + z
- elif z == 0.0 || z == -0.0 || x * y == 0.0 || x * y == -0.0
- -> x * y + z
- elif zb & exp_mask64 == exp_mask64
- -> z
- ;;
-
- (xn, xe, xs) = std.flt64explode(x)
- (yn, ye, ys) = std.flt64explode(y)
- (zn, ze, zs) = std.flt64explode(z)
- if xe == -1023
- xe = -1022
- ;;
- if ye == -1023
- ye = -1022
- ;;
- if ze == -1023
- ze = -1022
- ;;
-
- /* Keep product in high/low uint64s */
- var xs_h : uint64 = xs >> 32
- var ys_h : uint64 = ys >> 32
- var xs_l : uint64 = xs & 0xffffffff
- var ys_l : uint64 = ys & 0xffffffff
-
- var t_l : uint64 = xs_l * ys_l
- var t_m : uint64 = xs_l * ys_h + xs_h * ys_l
- var t_h : uint64 = xs_h * ys_h
-
- var prod_l : uint64 = t_l + (t_m << 32)
- var prod_h : uint64 = t_h + (t_m >> 32)
- if t_l > prod_l
- prod_h++
- ;;
-
- var prod_n = xn != yn
- var prod_lastbit_e = (xe - 52) + (ye - 52)
- var prod_first1 = find_first1_64_hl(prod_h, prod_l, 105)
- var prod_firstbit_e = prod_lastbit_e + prod_first1
-
- var z_firstbit_e = ze
- var z_lastbit_e = ze - 52
- var z_first1 = 52
-
- /* subnormals could throw firstbit_e calculations out of whack */
- if (zb & exp_mask64 == 0)
- z_first1 = find_first1_64(zs, z_first1)
- z_firstbit_e = z_lastbit_e + z_first1
- ;;
-
- var res_n
- var res_h = 0
- var res_l = 0
- var res_first1
- var res_lastbit_e
- var res_firstbit_e
-
- if prod_n == zn
- res_n = prod_n
-
- /*
- Align prod and z so that the top bit of the
- result is either 53 or 54, then add.
- */
- if prod_firstbit_e >= z_firstbit_e
- /*
- [ prod_h ][ prod_l ]
- [ z...
- */
- res_lastbit_e = prod_lastbit_e
- (res_h, res_l) = (prod_h, prod_l)
- (res_h, res_l) = add_shifted(res_h, res_l, zs, z_lastbit_e - prod_lastbit_e)
- else
- /*
- [ prod_h ][ prod_l ]
- [ z...
- */
- res_lastbit_e = z_lastbit_e - 64
- res_h = zs
- res_l = 0
- if prod_lastbit_e >= res_lastbit_e + 64
- /* In this situation, prod must be extremely subnormal */
- res_h += shl(prod_l, prod_lastbit_e - res_lastbit_e - 64)
- elif prod_lastbit_e >= res_lastbit_e
- res_h += shl(prod_h, prod_lastbit_e - res_lastbit_e)
- res_h += shr(prod_l, res_lastbit_e + 64 - prod_lastbit_e)
- res_l += shl(prod_l, prod_lastbit_e - res_lastbit_e)
- elif prod_lastbit_e + 64 >= res_lastbit_e
- res_h += shr(prod_h, res_lastbit_e - prod_lastbit_e)
- var l1 = shl(prod_h, prod_lastbit_e + 64 - res_lastbit_e)
- var l2 = shr(prod_l, res_lastbit_e - prod_lastbit_e)
- res_l = l1 + l2
- if res_l < l1
- res_h++
- ;;
- elif prod_lastbit_e + 128 >= res_lastbit_e
- res_l += shr(prod_h, res_lastbit_e - prod_lastbit_e - 64)
- ;;
- ;;
- else
- match compare_hl_z(prod_h, prod_l, prod_firstbit_e, prod_lastbit_e, zs, z_firstbit_e, z_lastbit_e)
- | `std.Equal: -> 0.0
- | `std.Before:
- /* prod > z */
- res_n = prod_n
- res_lastbit_e = prod_lastbit_e
- (res_h, res_l) = sub_shifted(prod_h, prod_l, zs, z_lastbit_e - prod_lastbit_e)
- | `std.After:
- /* z > prod */
- res_n = zn
- res_lastbit_e = z_lastbit_e - 64
- (res_h, res_l) = sub_shifted(zs, 0, prod_h, prod_lastbit_e + 64 - (z_lastbit_e - 64))
- (res_h, res_l) = sub_shifted(res_h, res_l, prod_l, prod_lastbit_e - (z_lastbit_e - 64))
- ;;
- ;;
-
- res_first1 = 64 + find_first1_64(res_h, 55)
- if res_first1 == 63
- res_first1 = find_first1_64(res_l, 63)
- ;;
- res_firstbit_e = res_first1 + res_lastbit_e
-
- /*
- Finally, res_h and res_l are the high and low bits of
- the result. They now need to be assembled into a flt64.
- Subnormals and infinities could be a problem.
- */
- var res_s = 0
- if res_firstbit_e <= -1023
- /* Subnormal case */
- if res_lastbit_e + 128 < 12 - 1022
- res_s = shr(res_h, 12 - 1022 - (res_lastbit_e + 128))
- res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64))
- elif res_lastbit_e + 64 < 12 - 1022
- res_s = shl(res_h, -12 + (res_lastbit_e + 128) - (-1022))
- res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64))
- else
- res_s = shl(res_h, -12 + (res_lastbit_e + 128) - (-1022))
- res_s |= shl(res_l, -12 + (res_lastbit_e + 64) - (-1022))
- ;;
-
- if need_round_away(res_h, res_l, res_first1 + (-1074 - res_firstbit_e))
- res_s++
- ;;
-
- /* No need for exponents, they are all zero */
- var res = res_s
- if res_n
- res |= (1 << 63)
- ;;
- -> std.flt64frombits(res)
- ;;
-
- if res_firstbit_e >= 1024
- /* Infinity case */
- if res_n
- -> std.flt64frombits(0xfff0000000000000)
- else
- -> std.flt64frombits(0x7ff0000000000000)
- ;;
- ;;
-
- if res_first1 - 52 >= 64
- res_s = shr(res_h, (res_first1 : int64) - 64 - 52)
- if need_round_away(res_h, res_l, res_first1 - 52)
- res_s++
- ;;
- elif res_first1 - 52 >= 0
- res_s = shl(res_h, 64 - (res_first1 - 52))
- res_s |= shr(res_l, res_first1 - 52)
- if need_round_away(res_h, res_l, res_first1 - 52)
- res_s++
- ;;
- else
- res_s = shl(res_h, res_first1 - 52)
- ;;
-
- /* The res_s++s might have messed everything up */
- if res_s & (1 << 53) != 0
- res_s >= 1
- res_firstbit_e++
- if res_firstbit_e >= 1024
- if res_n
- -> std.flt64frombits(0xfff0000000000000)
- else
- -> std.flt64frombits(0x7ff0000000000000)
- ;;
- ;;
- ;;
-
- -> std.flt64assem(res_n, res_firstbit_e, res_s)
-}
-
-/* >> and <<, but without wrapping when the shift is >= 64 */
-const shr : (u : uint64, s : int64 -> uint64) = {u : uint64, s : int64
- if (s : uint64) >= 64
- -> 0
- else
- -> u >> (s : uint64)
- ;;
-}
-
-const shl : (u : uint64, s : int64 -> uint64) = {u : uint64, s : int64
- if (s : uint64) >= 64
- -> 0
- else
- -> u << (s : uint64)
- ;;
-}
-
-/*
- Add (a << s) to [ h ][ l ], where if s < 0 then a corresponding
- right-shift is used. This is aligned such that if s == 0, then
- the result is [ h ][ l + a ]
- */
-const add_shifted = {h : uint64, l : uint64, a : uint64, s : int64
- if s >= 64
- -> (h + shl(a, s - 64), l)
- elif s >= 0
- var new_h = h + shr(a, 64 - s)
- var sa = shl(a, s)
- var new_l = l + sa
- if new_l < l
- new_h++
- ;;
- -> (new_h, new_l)
- else
- var new_h = h
- var sa = shr(a, -s)
- var new_l = l + sa
- if new_l < l
- new_h++
- ;;
- -> (new_h, new_l)
- ;;
-}
-
-/* As above, but subtract (a << s) */
-const sub_shifted = {h : uint64, l : uint64, a : uint64, s : int64
- if s >= 64
- -> (h - shl(a, s - 64), l)
- elif s >= 0
- var new_h = h - shr(a, 64 - s)
- var sa = shl(a, s)
- var new_l = l - sa
- if sa > l
- new_h--
- ;;
- -> (new_h, new_l)
- else
- var new_h = h
- var sa = shr(a, -s)
- var new_l = l - sa
- if sa > l
- new_h--
- ;;
- -> (new_h, new_l)
- ;;
-}
-
-const compare_hl_z = {h : uint64, l : uint64, hl_firstbit_e : int64, hl_lastbit_e : int64, z : uint64, z_firstbit_e : int64, z_lastbit_e : int64
- if hl_firstbit_e > z_firstbit_e
- -> `std.Before
- elif hl_firstbit_e < z_firstbit_e
- -> `std.After
- ;;
-
- var h_k : int64 = (hl_firstbit_e - hl_lastbit_e - 64)
- var z_k : int64 = (z_firstbit_e - z_lastbit_e)
- while h_k >= 0 && z_k >= 0
- var h1 = h & shl(1, h_k) != 0
- var z1 = z & shl(1, z_k) != 0
- if h1 && !z1
- -> `std.Before
- elif !h1 && z1
- -> `std.After
- ;;
- h_k--
- z_k--
- ;;
-
- if z_k < 0
- if (h & shr((-1 : uint64), 64 - h_k) != 0) || (l != 0)
- -> `std.Before
- else
- -> `std.Equal
- ;;
- ;;
-
- var l_k : int64 = 63
- while l_k >= 0 && z_k >= 0
- var l1 = l & shl(1, l_k) != 0
- var z1 = z & shl(1, z_k) != 0
- if l1 && !z1
- -> `std.Before
- elif !l1 && z1
- -> `std.After
- ;;
- l_k--
- z_k--
- ;;
-
- if (z_k < 0) && (l & shr((-1 : uint64), 64 - l_k) != 0)
- -> `std.Before
- elif (l_k < 0) && (z & shr((-1 : uint64), 64 - z_k) != 0)
- -> `std.After
- ;;
-
- -> `std.Equal
-}
-
-/* Find the first 1 bit in a bitstring */
-const find_first1_64 : (b : uint64, start : int64 -> int64) = {b : uint64, start : int64
- for var j = start; j >= 0; --j
- var m = shl(1, j)
- if b & m != 0
- -> j
- ;;
- ;;
-
- -> -1
-}
-
-const find_first1_64_hl = {h, l, start
- var first1_h = find_first1_64(h, start - 64)
- if first1_h >= 0
- -> first1_h + 64
- ;;
-
- -> find_first1_64(l, 63)
-}
-
-/*
- For [ h ][ l ], where bitpos_last is the position of the last
- bit that was included in the truncated result (l's last bit has
- position 0), decide whether rounding up/away is needed. This is
- true if
-
- - following bitpos_last is a 1, then a non-zero sequence, or
-
- - following bitpos_last is a 1, then a zero sequence, and the
- round would be to even
- */
-const need_round_away = {h : uint64, l : uint64, bitpos_last : int64
- var first_omitted_is_1 = false
- var nonzero_beyond = false
- if bitpos_last > 64
- first_omitted_is_1 = h & shl(1, bitpos_last - 1 - 64) != 0
- nonzero_beyond = nonzero_beyond || h & shr((-1 : uint64), 2 + 64 - (bitpos_last - 64)) != 0
- nonzero_beyond = nonzero_beyond || (l != 0)
- else
- first_omitted_is_1 = l & shl(1, bitpos_last - 1) != 0
- nonzero_beyond = nonzero_beyond || l & shr((-1 : uint64), 1 + 64 - bitpos_last) != 0
- ;;
-
- if !first_omitted_is_1
- -> false
- ;;
-
- if nonzero_beyond
- -> true
- ;;
-
- var hl_is_odd = false
-
- if bitpos_last >= 64
- hl_is_odd = h & shl(1, bitpos_last - 64) != 0
- else
- hl_is_odd = l & shl(1, bitpos_last) != 0
- ;;
-
- -> hl_is_odd
-}
--- a/lib/math/fpmath-sum-impl.myr
+++ /dev/null
@@ -1,104 +1,0 @@
-use std
-
-pkg math =
- pkglocal const kahan_sum32 : (l : flt32[:] -> flt32)
- pkglocal const priest_sum32 : (l : flt32[:] -> flt32)
-
- pkglocal const kahan_sum64: (l : flt64[:] -> flt64)
- pkglocal const priest_sum64 : (l : flt64[:] -> flt64)
-;;
-
-type doomed_flt32_arr = flt32[:]
-type doomed_flt64_arr = flt64[:]
-
-impl disposable doomed_flt32_arr =
- __dispose__ = {a : doomed_flt32_arr; std.slfree((a : flt32[:])) }
-;;
-
-impl disposable doomed_flt64_arr =
- __dispose__ = {a : doomed_flt64_arr; std.slfree((a : flt64[:])) }
-;;
-
-/*
- Kahan's compensated summation. Fast and reasonably accurate,
- although cancellation can cause relative error blowup. For
- something slower, but more accurate, use something like Priest's
- doubly compensated sums.
- */
-pkglocal const kahan_sum32 = {l; -> kahan_sum_gen(l, (0.0 : flt32))}
-pkglocal const kahan_sum64 = {l; -> kahan_sum_gen(l, (0.0 : flt64))}
-
-generic kahan_sum_gen = {l : @f[:], zero : @f :: numeric,floating @f
- if l.len == 0
- -> zero
- ;;
-
- var s = zero
- var c = zero
- var y = zero
- var t = zero
-
- for x : l
- y = x - c
- t = s + y
- c = (t - s) - y
- s = t
- ;;
-
- -> s
-}
-
-/*
- Priest's doubly compensated summation. Extremely accurate, but
- relatively slow. For situations in which cancellation is not
- expected, something like Kahan's compensated summation may be
- more useful.
- */
-pkglocal const priest_sum32 = {l : flt32[:]
- var l2 = std.sldup(l)
- std.sort(l2, mag_cmp32)
- auto (l2 : doomed_flt32_arr)
- -> priest_sum_gen(l2, (0.0 : flt32))
-}
-
-const mag_cmp32 = {f : flt32, g : flt32
- var u = std.flt32bits(f) & ~(1 << 31)
- var v = std.flt32bits(g) & ~(1 << 31)
- -> std.numcmp(v, u)
-}
-
-pkglocal const priest_sum64 = {l : flt64[:]
- var l2 = std.sldup(l)
- std.sort(l, mag_cmp64)
- auto (l2 : doomed_flt64_arr)
- -> priest_sum_gen(l2, (0.0 : flt64))
-}
-
-const mag_cmp64 = {f : flt64, g : flt64
- var u = std.flt64bits(f) & ~(1 << 63)
- var v = std.flt64bits(g) & ~(1 << 63)
- -> std.numcmp(v, u)
-}
-
-generic priest_sum_gen = {l : @f[:], zero : @f :: numeric,floating @f
- /* l should be sorted in descending order */
- if l.len == 0
- -> zero
- ;;
-
- var s = zero
- var c = zero
-
- for x : l
- var y = c + x
- var u = x - (y - c)
- var t = (y + s)
- var v = (y - (t - s))
- var z = u + v
- s = t + z
- c = z - (s - t)
- ;;
-
- -> s
-}
-
--- a/lib/math/fpmath-trunc-impl+posixy-x64-sse4.s
+++ /dev/null
@@ -1,41 +1,0 @@
-.globl math$trunc32
-.globl math$_trunc32
-math$trunc32:
-math$_trunc32:
- roundss $0x03, %xmm0, %xmm0
- ret
-
-.globl math$floor32
-.globl math$_floor32
-math$floor32:
-math$_floor32:
- roundss $0x01, %xmm0, %xmm0
- ret
-
-.globl math$ceil32
-.globl math$_ceil32
-math$ceil32:
-math$_ceil32:
- roundss $0x02, %xmm0, %xmm0
- ret
-
-.globl math$trunc64
-.globl math$_trunc64
-math$trunc64:
-math$_trunc64:
- roundsd $0x03, %xmm0, %xmm0
- ret
-
-.globl math$floor64
-.globl math$_floor64
-math$floor64:
-math$_floor64:
- roundsd $0x01, %xmm0, %xmm0
- ret
-
-.globl math$ceil64
-.globl math$_ceil64
-math$ceil64:
-math$_ceil64:
- roundsd $0x02, %xmm0, %xmm0
- ret
--- a/lib/math/fpmath-trunc-impl.myr
+++ /dev/null
@@ -1,103 +1,0 @@
-use std
-
-pkg math =
- pkglocal const trunc32 : (f : flt32 -> flt32)
- pkglocal const floor32 : (f : flt32 -> flt32)
- pkglocal const ceil32 : (f : flt32 -> flt32)
- pkglocal const trunc64 : (f : flt64 -> flt64)
- pkglocal const floor64 : (f : flt64 -> flt64)
- pkglocal const ceil64 : (f : flt64 -> flt64)
-;;
-
-const Flt32NegMask : uint32 = (1 << 31)
-const Flt32SigMask : uint32 = (1 << 23) - 1
-
-const Flt64NegMask : uint64 = (1 << 63)
-const Flt64SigMask : uint64 = (1 << 52) - 1
-
-pkglocal const floor32 = {f : flt32
- var n, e, s
- (n, e, s) = std.flt32explode(f)
-
- /* Many special cases */
- if e >= 23 || f == -0.0
- -> f
- elif e < 0
- if n
- -> -1.0
- else
- -> 0.0
- ;;
- ;;
-
- if n
- var fractional_mask = Flt32SigMask >> (e : uint32)
- if s & fractional_mask == 0
- -> f
- else
- /* Turns out the packing of exp and sig is useful */
- var u : uint32 = std.flt32bits(f) & ~fractional_mask
- u += ((1 << 23) >> (e : uint32))
- -> std.flt32frombits(u)
- ;;
- ;;
-
- var m : uint32 = (Flt32SigMask >> (e : uint32))
- -> std.flt32assem(n, e, s & ~m)
-}
-
-pkglocal const trunc32 = {f : flt32
- if std.flt32bits(f) & Flt32NegMask != 0
- -> -floor32(-f)
- else
- -> floor32(f)
- ;;
-}
-
-pkglocal const ceil32 = {f : flt32
- -> -floor32(-f)
-}
-
-pkglocal const floor64 = {f : flt64
- var n, e, s
- (n, e, s) = std.flt64explode(f)
-
- /* Many special cases */
- if e >= 52 || f == -0.0
- -> f
- elif e < 0
- if n
- -> -1.0
- else
- -> 0.0
- ;;
- ;;
-
- if n
- var fractional_mask = Flt64SigMask >> (e : uint64)
- if s & fractional_mask == 0
- -> f
- else
- /* Turns out the packing of exp and sig is useful */
- var u : uint64 = std.flt64bits(f) & ~fractional_mask
- u += ((1 << 52) >> (e : uint64))
- -> std.flt64frombits(u)
- ;;
- ;;
-
- var m : uint64 = (Flt64SigMask >> (e : uint64))
- -> std.flt64assem(n, e, s & ~m)
-}
-
-pkglocal const trunc64 = {f : flt64
- if std.flt64bits(f) & Flt64NegMask != 0
- -> -floor64(-f)
- else
- -> floor64(f)
- ;;
-}
-
-pkglocal const ceil64 = {f : flt64
- -> -floor64(-f)
-}
-
--- /dev/null
+++ b/lib/math/sum-impl.myr
@@ -1,0 +1,104 @@
+use std
+
+pkg math =
+ pkglocal const kahan_sum32 : (l : flt32[:] -> flt32)
+ pkglocal const priest_sum32 : (l : flt32[:] -> flt32)
+
+	pkglocal const kahan_sum64 : (l : flt64[:] -> flt64)
+ pkglocal const priest_sum64 : (l : flt64[:] -> flt64)
+;;
+
+type doomed_flt32_arr = flt32[:]
+type doomed_flt64_arr = flt64[:]
+
+impl disposable doomed_flt32_arr =
+ __dispose__ = {a : doomed_flt32_arr; std.slfree((a : flt32[:])) }
+;;
+
+impl disposable doomed_flt64_arr =
+ __dispose__ = {a : doomed_flt64_arr; std.slfree((a : flt64[:])) }
+;;
+
+/*
+ Kahan's compensated summation. Fast and reasonably accurate,
+ although cancellation can cause relative error blowup. For
+ something slower, but more accurate, use something like Priest's
+ doubly compensated sums.
+ */
+pkglocal const kahan_sum32 = {l; -> kahan_sum_gen(l, (0.0 : flt32))}
+pkglocal const kahan_sum64 = {l; -> kahan_sum_gen(l, (0.0 : flt64))}
+
+generic kahan_sum_gen = {l : @f[:], zero : @f :: numeric,floating @f
+ if l.len == 0
+ -> zero
+ ;;
+
+ var s = zero
+ var c = zero
+ var y = zero
+ var t = zero
+
+ for x : l
+ y = x - c
+ t = s + y
+ c = (t - s) - y
+ s = t
+ ;;
+
+ -> s
+}
+
+/*
+ Priest's doubly compensated summation. Extremely accurate, but
+ relatively slow. For situations in which cancellation is not
+ expected, something like Kahan's compensated summation may be
+ more useful.
+ */
+pkglocal const priest_sum32 = {l : flt32[:]
+ var l2 = std.sldup(l)
+ std.sort(l2, mag_cmp32)
+ auto (l2 : doomed_flt32_arr)
+ -> priest_sum_gen(l2, (0.0 : flt32))
+}
+
+const mag_cmp32 = {f : flt32, g : flt32
+ var u = std.flt32bits(f) & ~(1 << 31)
+ var v = std.flt32bits(g) & ~(1 << 31)
+ -> std.numcmp(v, u)
+}
+
+pkglocal const priest_sum64 = {l : flt64[:]
+ var l2 = std.sldup(l)
+	std.sort(l2, mag_cmp64)
+ auto (l2 : doomed_flt64_arr)
+ -> priest_sum_gen(l2, (0.0 : flt64))
+}
+
+const mag_cmp64 = {f : flt64, g : flt64
+ var u = std.flt64bits(f) & ~(1 << 63)
+ var v = std.flt64bits(g) & ~(1 << 63)
+ -> std.numcmp(v, u)
+}
+
+generic priest_sum_gen = {l : @f[:], zero : @f :: numeric,floating @f
+ /* l should be sorted in descending order */
+ if l.len == 0
+ -> zero
+ ;;
+
+ var s = zero
+ var c = zero
+
+ for x : l
+ var y = c + x
+ var u = x - (y - c)
+ var t = (y + s)
+ var v = (y - (t - s))
+ var z = u + v
+ s = t + z
+ c = z - (s - t)
+ ;;
+
+ -> s
+}
+
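
As a usage sketch of the compensated sums (assuming the math.kahan_sum and
math.priest_sum wrappers dispatched from fpmath.myr, as exercised by the tests
below, and assuming std.put for output; the input is the cancellation-prone
list from test/sum-impl.myr):

	use std
	use math

	const main = {
		var l : flt32[:] = [33554432.0, 33554430.0, -16777215.0, -16777215.0, -16777215.0, -16777215.0][:]

		var naive : flt32 = 0.0
		for x : l
			naive += x
		;;

		/*
		   naive left-to-right addition gives 1.0; per the tests below,
		   kahan_sum gives 3.0 and priest_sum the exact 2.0
		*/
		std.put("naive={}, kahan={}, priest={}\n", naive, math.kahan_sum(l), math.priest_sum(l))
	}
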
--- /dev/null
+++ b/lib/math/test/fma-impl.myr
@@ -1,0 +1,97 @@
+use std
+use math
+use testr
+
+const main = {
+ testr.run([
+ [.name="fma-01", .fn = fma01],
+ [.name="fma-02", .fn = fma02],
+ ][:])
+}
+
+const fma01 = {c
+ var inputs : (uint32, uint32, uint32, uint32)[:] = [
+ /*
+ These are mostly obtained by running fpmath-consensus
+ with seed 1234. Each (mostly) covers a different
+ corner case.
+ */
+ (0x000009a4, 0x00000000, 0x00000002, 0x00000002),
+ (0x69000000, 0x90008002, 0x68348026, 0x68348026),
+ (0x334802ab, 0x49113e8d, 0x90aea62e, 0x3ce2f4c3),
+ (0x5c35d8c1, 0x12dcb6e2, 0x6c1a8cc2, 0x6c1a8cc2),
+ (0xf6266d83, 0x2b3e04e8, 0x62f99bda, 0x62bbd79e),
+ (0x7278e907, 0x75f6c0f1, 0xf6f9b8e0, 0x7f800000),
+ (0xd7748eeb, 0x6737b23e, 0x68e3bbc7, 0xff2f7c71),
+ (0x7f373de4, 0x3dcf90f0, 0xd22ac17c, 0x7d9492ca),
+ (0xb50fce04, 0x00cd486d, 0x03800000, 0x03800000),
+ (0xbb600000, 0x43b7161a, 0x8684d442, 0xbfa03357),
+ (0xf26f8a00, 0x4bfac000, 0xc74ba9fc, 0xfeeaa06c),
+ (0x55d1fa60, 0x32f20000, 0x1b1fea3d, 0x49467eaf),
+ (0x29e26c00, 0x62352000, 0xa0e845af, 0x4ca032a9),
+ (0x287650f8, 0x7cd00000, 0x94e85d5e, 0x65c821c9),
+ (0x7689f580, 0x91418000, 0xaa2822ae, 0xc8508e21),
+ (0xbd813cc0, 0x421f0000, 0x9f098e17, 0xc0208977),
+ (0x3745461a, 0x4db9b736, 0xb6d7deff, 0x458f1cd8),
+ (0xa3ccfd37, 0x7f800000, 0xed328e70, 0xff800000),
+ (0xa3790205, 0x5033a3e6, 0xa001fd11, 0xb42ebbd5),
+ (0xa4932927, 0xc565bc34, 0x316887af, 0x31688bcf),
+ (0x83dd6ede, 0x31ddf8e6, 0x01fea4c8, 0x01fea4c7),
+ (0xa4988128, 0x099a41ad, 0x00800000, 0x00800000),
+ (0x1e0479cd, 0x91d5fcb4, 0x00800000, 0x00800000),
+ (0x2f413021, 0x0a3f5a4e, 0x80800483, 0x80800000),
+ (0x144dcd10, 0x12f4aba0, 0x80800000, 0x80800000),
+ (0x0d580b86, 0x435768a8, 0x966c8d6f, 0x966c5ffd),
+ (0xa19e9a6f, 0xb49af3e3, 0xa2468b59, 0xa2468b57),
+ (0xd119e996, 0x8e5ad0e3, 0x247e0028, 0x247e83b7),
+ ][:]
+
+ for (x, y, z, r) : inputs
+ var xf : flt32 = std.flt32frombits(x)
+ var yf : flt32 = std.flt32frombits(y)
+ var zf : flt32 = std.flt32frombits(z)
+ var rf = math.fma(xf, yf, zf)
+ testr.check(c, rf == std.flt32frombits(r),
+ "0x{b=16,w=8,p=0} * 0x{b=16,w=8,p=0} + 0x{b=16,w=8,p=0} should be 0x{b=16,w=8,p=0}, was 0x{b=16,w=8,p=0}",
+ x, y, z, r, std.flt32bits(rf))
+ ;;
+}
+
+const fma02 = {c
+ var inputs : (uint64, uint64, uint64, uint64)[:] = [
+ /*
+ These are mostly obtained by running fpmath-consensus
+ with seed 1234. Each (mostly) covers a different
+ corner case.
+ */
+ (0x0000000000000000, 0x0000000000000000, 0x0100000000000000, 0x0100000000000000),
+ (0x0000000000000000, 0x0000000000000000, 0x0200000000000000, 0x0200000000000000),
+ (0x00000000000009a4, 0x6900000000000002, 0x6834802690008002, 0x6834802690008002),
+ (0x49113e8d334802ab, 0x5c35d8c190aea62e, 0x6c1a8cc212dcb6e2, 0x6c1a8cc212dcb6e2),
+ (0x2b3e04e8f6266d83, 0xae84e20f62f99bda, 0xc9115a1ccea6ce27, 0xc9115a1ccea6ce27),
+ (0xa03ea9e9b09d932c, 0xded7bc19edcbf0c7, 0xbbc4c1f83b3f8f2e, 0x3f26be5f0c7b48e3),
+ (0xa5ec2141c1e6f339, 0xa2d80fc217f57b61, 0x00b3484b473ef1b8, 0x08d526cb86ee748d),
+ (0xccc6600ee88bb67c, 0xc692eeec9b51cf0f, 0xbf5f1ae3486401b0, 0x536a7a30857129db),
+ (0x5f9b9e449db17602, 0xbef22ae5b6a2b1c5, 0x6133e925e6bf8a12, 0x6133e925e6bf823b),
+ (0x7f851249841b6278, 0x3773388e53a375f4, 0x761c27fc2ffa57be, 0x7709506b0e99dc30),
+ (0x7c7cb20f3ca8af93, 0x800fd7f5cfd5baae, 0x14e4c09c9bb1e17e, 0xbc9c6a3fd0e58682),
+ (0xb5e8db2107f4463f, 0x614af740c0d7eb3b, 0xd7e3d25c4daa81e0, 0xd7e3d798d3ccdffb),
+ (0xae62c8be4cb45168, 0x90cc5236f3516c90, 0x0007f8b14f684558, 0x0007f9364eb1a815),
+ (0x5809f53e32a7e1ba, 0xcc647611ccaa5bf4, 0xdfbdb5c345ce7a56, 0xe480990da5526103),
+ (0xbb889d7f826438e1, 0x03bdaff82129696d, 0x000000dacab276ae, 0x8000009296c962f8),
+ (0x003d95525e2b057a, 0xbef738ea5717d89a, 0x800000089763d88c, 0x800000b456ed1a9c),
+ (0x0be868cb5a7180c8, 0x3357a30707ed947c, 0x80000050d6b86ac6, 0x000000cfa41cb229),
+ (0xbe535f4f8a7498af, 0x00d24adee12217b8, 0x0000005729e93fb0, 0x800000016d975af3),
+ (0x39d1968eb883f088, 0x856f286e3b268f0e, 0x800000d7cdd0ed70, 0x800001e9cf01a0ae),
+ ][:]
+
+ for (x, y, z, r) : inputs
+ var xf : flt64 = std.flt64frombits(x)
+ var yf : flt64 = std.flt64frombits(y)
+ var zf : flt64 = std.flt64frombits(z)
+ var rf = math.fma(xf, yf, zf)
+ testr.check(c, rf == std.flt64frombits(r),
+ "0x{b=16,w=16,p=0} * 0x{b=16,w=16,p=0} + 0x{b=16,w=16,p=0} should be 0x{b=16,w=16,p=0}, was 0x{b=16,w=16,p=0}",
+ x, y, z, r, std.flt64bits(rf))
+ ;;
+}
--- a/lib/math/test/fpmath-fma-impl.myr
+++ /dev/null
@@ -1,97 +1,0 @@
-use std
-use math
-use testr
-
-const main = {
- testr.run([
- [.name="fma-01", .fn = fma01],
- [.name="fma-02", .fn = fma02],
- ][:])
-}
-
-const fma01 = {c
- var inputs : (uint32, uint32, uint32, uint32)[:] = [
- /*
- These are mostly obtained by running fpmath-consensus
- with seed 1234. Each (mostly) covers a different
- corner case.
- */
- (0x000009a4, 0x00000000, 0x00000002, 0x00000002),
- (0x69000000, 0x90008002, 0x68348026, 0x68348026),
- (0x334802ab, 0x49113e8d, 0x90aea62e, 0x3ce2f4c3),
- (0x5c35d8c1, 0x12dcb6e2, 0x6c1a8cc2, 0x6c1a8cc2),
- (0xf6266d83, 0x2b3e04e8, 0x62f99bda, 0x62bbd79e),
- (0x7278e907, 0x75f6c0f1, 0xf6f9b8e0, 0x7f800000),
- (0xd7748eeb, 0x6737b23e, 0x68e3bbc7, 0xff2f7c71),
- (0x7f373de4, 0x3dcf90f0, 0xd22ac17c, 0x7d9492ca),
- (0xb50fce04, 0x00cd486d, 0x03800000, 0x03800000),
- (0xbb600000, 0x43b7161a, 0x8684d442, 0xbfa03357),
- (0xf26f8a00, 0x4bfac000, 0xc74ba9fc, 0xfeeaa06c),
- (0x55d1fa60, 0x32f20000, 0x1b1fea3d, 0x49467eaf),
- (0x29e26c00, 0x62352000, 0xa0e845af, 0x4ca032a9),
- (0x287650f8, 0x7cd00000, 0x94e85d5e, 0x65c821c9),
- (0x7689f580, 0x91418000, 0xaa2822ae, 0xc8508e21),
- (0xbd813cc0, 0x421f0000, 0x9f098e17, 0xc0208977),
- (0x3745461a, 0x4db9b736, 0xb6d7deff, 0x458f1cd8),
- (0xa3ccfd37, 0x7f800000, 0xed328e70, 0xff800000),
- (0xa3790205, 0x5033a3e6, 0xa001fd11, 0xb42ebbd5),
- (0xa4932927, 0xc565bc34, 0x316887af, 0x31688bcf),
- (0x83dd6ede, 0x31ddf8e6, 0x01fea4c8, 0x01fea4c7),
- (0xa4988128, 0x099a41ad, 0x00800000, 0x00800000),
- (0x1e0479cd, 0x91d5fcb4, 0x00800000, 0x00800000),
- (0x2f413021, 0x0a3f5a4e, 0x80800483, 0x80800000),
- (0x144dcd10, 0x12f4aba0, 0x80800000, 0x80800000),
- (0x0d580b86, 0x435768a8, 0x966c8d6f, 0x966c5ffd),
- (0xa19e9a6f, 0xb49af3e3, 0xa2468b59, 0xa2468b57),
- (0xd119e996, 0x8e5ad0e3, 0x247e0028, 0x247e83b7),
- ][:]
-
- for (x, y, z, r) : inputs
- var xf : flt32 = std.flt32frombits(x)
- var yf : flt32 = std.flt32frombits(y)
- var zf : flt32 = std.flt32frombits(z)
- var rf = math.fma(xf, yf, zf)
- testr.check(c, rf == std.flt32frombits(r),
- "0x{b=16,w=8,p=0} * 0x{b=16,w=8,p=0} + 0x{b=16,w=8,p=0} should be 0x{b=16,w=8,p=0}, was 0x{b=16,w=8,p=0}",
- x, y, z, r, std.flt32bits(rf))
- ;;
-}
-
-const fma02 = {c
- var inputs : (uint64, uint64, uint64, uint64)[:] = [
- /*
- These are mostly obtained by running fpmath-consensus
- with seed 1234. Each (mostly) covers a different
- corner case.
- */
- (0x0000000000000000, 0x0000000000000000, 0x0100000000000000, 0x0100000000000000),
- (0x0000000000000000, 0x0000000000000000, 0x0200000000000000, 0x0200000000000000),
- (0x00000000000009a4, 0x6900000000000002, 0x6834802690008002, 0x6834802690008002),
- (0x49113e8d334802ab, 0x5c35d8c190aea62e, 0x6c1a8cc212dcb6e2, 0x6c1a8cc212dcb6e2),
- (0x2b3e04e8f6266d83, 0xae84e20f62f99bda, 0xc9115a1ccea6ce27, 0xc9115a1ccea6ce27),
- (0xa03ea9e9b09d932c, 0xded7bc19edcbf0c7, 0xbbc4c1f83b3f8f2e, 0x3f26be5f0c7b48e3),
- (0xa5ec2141c1e6f339, 0xa2d80fc217f57b61, 0x00b3484b473ef1b8, 0x08d526cb86ee748d),
- (0xccc6600ee88bb67c, 0xc692eeec9b51cf0f, 0xbf5f1ae3486401b0, 0x536a7a30857129db),
- (0x5f9b9e449db17602, 0xbef22ae5b6a2b1c5, 0x6133e925e6bf8a12, 0x6133e925e6bf823b),
- (0x7f851249841b6278, 0x3773388e53a375f4, 0x761c27fc2ffa57be, 0x7709506b0e99dc30),
- (0x7c7cb20f3ca8af93, 0x800fd7f5cfd5baae, 0x14e4c09c9bb1e17e, 0xbc9c6a3fd0e58682),
- (0xb5e8db2107f4463f, 0x614af740c0d7eb3b, 0xd7e3d25c4daa81e0, 0xd7e3d798d3ccdffb),
- (0xae62c8be4cb45168, 0x90cc5236f3516c90, 0x0007f8b14f684558, 0x0007f9364eb1a815),
- (0x5809f53e32a7e1ba, 0xcc647611ccaa5bf4, 0xdfbdb5c345ce7a56, 0xe480990da5526103),
- (0xbb889d7f826438e1, 0x03bdaff82129696d, 0x000000dacab276ae, 0x8000009296c962f8),
- (0x003d95525e2b057a, 0xbef738ea5717d89a, 0x800000089763d88c, 0x800000b456ed1a9c),
- (0x0be868cb5a7180c8, 0x3357a30707ed947c, 0x80000050d6b86ac6, 0x000000cfa41cb229),
- (0xbe535f4f8a7498af, 0x00d24adee12217b8, 0x0000005729e93fb0, 0x800000016d975af3),
- (0x39d1968eb883f088, 0x856f286e3b268f0e, 0x800000d7cdd0ed70, 0x800001e9cf01a0ae),
- ][:]
-
- for (x, y, z, r) : inputs
- var xf : flt64 = std.flt64frombits(x)
- var yf : flt64 = std.flt64frombits(y)
- var zf : flt64 = std.flt64frombits(z)
- var rf = math.fma(xf, yf, zf)
- testr.check(c, rf == std.flt64frombits(r),
- "0x{b=16,w=16,p=0} * 0x{b=16,w=16,p=0} + 0x{b=16,w=16,p=0} should be 0x{b=16,w=16,p=0}, was 0x{b=16,w=16,p=0}",
- x, y, z, r, std.flt64bits(rf))
- ;;
-}
--- a/lib/math/test/fpmath-sum-impl.myr
+++ /dev/null
@@ -1,36 +1,0 @@
-use std
-use math
-use testr
-
-const main = {
- testr.run([
- [.name = "sums-kahan-01", .fn = sums_kahan_01],
- [.name = "sums-priest-01", .fn = sums_priest_01],
- ][:])
-}
-
-const sums_kahan_01 = {c
- var sums : (flt32[:], flt32)[:] = [
- ([1.0, 2.0, 3.0][:], 6.0),
-
- /* Naive summing gives 1.0, actual answer is 2.0 */
- ([33554432.0, 33554430.0, -16777215.0, -16777215.0, -16777215.0, -16777215.0][:], 3.0)
- ][:]
-
- for (a, r) : sums
- var s = math.kahan_sum(a)
- testr.eq(c, r, s)
- ;;
-}
-
-const sums_priest_01 = {c
- var sums : (flt32[:], flt32)[:] = [
- ([1.0, 2.0, 3.0][:], 6.0),
- ([33554432.0, 33554430.0, -16777215.0, -16777215.0, -16777215.0, -16777215.0][:], 2.0)
- ][:]
-
- for (a, r) : sums
- var s = math.priest_sum(a)
- testr.eq(c, r, s)
- ;;
-}
--- a/lib/math/test/fpmath-trunc-impl.myr
+++ /dev/null
@@ -1,124 +1,0 @@
-use std
-use math
-use testr
-
-const main = {
- testr.run([
- [.name = "trunc-01", .fn = trunc01],
- [.name = "trunc-02", .fn = trunc02],
- [.name = "floor-01", .fn = floor01],
- [.name = "floor-02", .fn = floor02],
- [.name = "ceil-01", .fn = ceil01],
- [.name = "ceil-02", .fn = ceil02],
- ][:])
-}
-
-const trunc01 = {c
- var flt32s : (flt32, flt32)[:] = [
- (123.4, 123.0),
- (0.0, 0.0),
- (-0.0, -0.0),
- (1.0, 1.0),
- (1.1, 1.0),
- (0.9, 0.0),
- (10664524000000000000.0, 10664524000000000000.0),
- (-3.5, -3.0),
- (101.999, 101.0),
- (std.flt32nan(), std.flt32nan()),
- ][:]
-
- for (f, g) : flt32s
- testr.eq(c, math.trunc(f), g)
- ;;
-}
-
-const trunc02 = {c
- var flt64s : (flt64, flt64)[:] = [
- (0.0, 0.0),
- (-0.0, -0.0),
- (1.0, 1.0),
- (1.1, 1.0),
- (0.9, 0.0),
- (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
- (-3.5, -3.0),
- (101.999, 101.0),
- (std.flt64nan(), std.flt64nan()),
- ][:]
-
- for (f, g) : flt64s
- testr.eq(c, math.trunc(f), g)
- ;;
-}
-
-const floor01 = {c
- var flt32s : (flt32, flt32)[:] = [
- (0.0, 0.0),
- (-0.0, -0.0),
- (0.5, 0.0),
- (1.1, 1.0),
- (10664524000000000000.0, 10664524000000000000.0),
- (-3.5, -4.0),
- (-101.999, -102.0),
- (-126.999, -127.0),
- (-127.999, -128.0),
- (-128.999, -129.0),
- (std.flt32nan(), std.flt32nan()),
- ][:]
-
- for (f, g) : flt32s
- testr.eq(c, math.floor(f), g)
- ;;
-}
-
-const floor02 = {c
- var flt64s : (flt64, flt64)[:] = [
- (0.0, 0.0),
- (-0.0, -0.0),
- (0.5, 0.0),
- (1.1, 1.0),
- (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
- (-3.5, -4.0),
- (-101.999, -102.0),
- (std.flt64nan(), std.flt64nan()),
- ][:]
-
- for (f, g) : flt64s
- testr.eq(c, math.floor(f), g)
- ;;
-}
-
-const ceil01 = {c
- var flt32s : (flt32, flt32)[:] = [
- (0.0, 0.0),
- (-0.0, -0.0),
- (0.5, 1.0),
- (-0.1, -0.0),
- (1.1, 2.0),
- (10664524000000000000.0, 10664524000000000000.0),
- (-3.5, -3.0),
- (-101.999, -101.0),
- (std.flt32nan(), std.flt32nan()),
- ][:]
-
- for (f, g) : flt32s
- testr.eq(c, math.ceil(f), g)
- ;;
-}
-
-const ceil02 = {c
- var flt64s : (flt64, flt64)[:] = [
- (0.0, 0.0),
- (-0.0, -0.0),
- (0.5, 1.0),
- (-0.1, -0.0),
- (1.1, 2.0),
- (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
- (-3.5, -3.0),
- (-101.999, -101.0),
- (std.flt64nan(), std.flt64nan()),
- ][:]
-
- for (f, g) : flt64s
- testr.eq(c, math.ceil(f), g)
- ;;
-}
--- /dev/null
+++ b/lib/math/test/sum-impl.myr
@@ -1,0 +1,36 @@
+use std
+use math
+use testr
+
+const main = {
+ testr.run([
+ [.name = "sums-kahan-01", .fn = sums_kahan_01],
+ [.name = "sums-priest-01", .fn = sums_priest_01],
+ ][:])
+}
+
+const sums_kahan_01 = {c
+ var sums : (flt32[:], flt32)[:] = [
+ ([1.0, 2.0, 3.0][:], 6.0),
+
+ /* Naive summing gives 1.0, actual answer is 2.0 */
+ ([33554432.0, 33554430.0, -16777215.0, -16777215.0, -16777215.0, -16777215.0][:], 3.0)
+ ][:]
+
+ for (a, r) : sums
+ var s = math.kahan_sum(a)
+ testr.eq(c, r, s)
+ ;;
+}
+
+const sums_priest_01 = {c
+ var sums : (flt32[:], flt32)[:] = [
+ ([1.0, 2.0, 3.0][:], 6.0),
+ ([33554432.0, 33554430.0, -16777215.0, -16777215.0, -16777215.0, -16777215.0][:], 2.0)
+ ][:]
+
+ for (a, r) : sums
+ var s = math.priest_sum(a)
+ testr.eq(c, r, s)
+ ;;
+}
--- /dev/null
+++ b/lib/math/test/trunc-impl.myr
@@ -1,0 +1,124 @@
+use std
+use math
+use testr
+
+const main = {
+ testr.run([
+ [.name = "trunc-01", .fn = trunc01],
+ [.name = "trunc-02", .fn = trunc02],
+ [.name = "floor-01", .fn = floor01],
+ [.name = "floor-02", .fn = floor02],
+ [.name = "ceil-01", .fn = ceil01],
+ [.name = "ceil-02", .fn = ceil02],
+ ][:])
+}
+
+const trunc01 = {c
+ var flt32s : (flt32, flt32)[:] = [
+ (123.4, 123.0),
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (1.0, 1.0),
+ (1.1, 1.0),
+ (0.9, 0.0),
+ (10664524000000000000.0, 10664524000000000000.0),
+ (-3.5, -3.0),
+ (101.999, 101.0),
+ (std.flt32nan(), std.flt32nan()),
+ ][:]
+
+ for (f, g) : flt32s
+ testr.eq(c, math.trunc(f), g)
+ ;;
+}
+
+const trunc02 = {c
+ var flt64s : (flt64, flt64)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (1.0, 1.0),
+ (1.1, 1.0),
+ (0.9, 0.0),
+ (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+ (-3.5, -3.0),
+ (101.999, 101.0),
+ (std.flt64nan(), std.flt64nan()),
+ ][:]
+
+ for (f, g) : flt64s
+ testr.eq(c, math.trunc(f), g)
+ ;;
+}
+
+const floor01 = {c
+ var flt32s : (flt32, flt32)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 0.0),
+ (1.1, 1.0),
+ (10664524000000000000.0, 10664524000000000000.0),
+ (-3.5, -4.0),
+ (-101.999, -102.0),
+ (-126.999, -127.0),
+ (-127.999, -128.0),
+ (-128.999, -129.0),
+ (std.flt32nan(), std.flt32nan()),
+ ][:]
+
+ for (f, g) : flt32s
+ testr.eq(c, math.floor(f), g)
+ ;;
+}
+
+const floor02 = {c
+ var flt64s : (flt64, flt64)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 0.0),
+ (1.1, 1.0),
+ (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+ (-3.5, -4.0),
+ (-101.999, -102.0),
+ (std.flt64nan(), std.flt64nan()),
+ ][:]
+
+ for (f, g) : flt64s
+ testr.eq(c, math.floor(f), g)
+ ;;
+}
+
+const ceil01 = {c
+ var flt32s : (flt32, flt32)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 1.0),
+ (-0.1, -0.0),
+ (1.1, 2.0),
+ (10664524000000000000.0, 10664524000000000000.0),
+ (-3.5, -3.0),
+ (-101.999, -101.0),
+ (std.flt32nan(), std.flt32nan()),
+ ][:]
+
+ for (f, g) : flt32s
+ testr.eq(c, math.ceil(f), g)
+ ;;
+}
+
+const ceil02 = {c
+ var flt64s : (flt64, flt64)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 1.0),
+ (-0.1, -0.0),
+ (1.1, 2.0),
+ (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+ (-3.5, -3.0),
+ (-101.999, -101.0),
+ (std.flt64nan(), std.flt64nan()),
+ ][:]
+
+ for (f, g) : flt64s
+ testr.eq(c, math.ceil(f), g)
+ ;;
+}
--- /dev/null
+++ b/lib/math/trunc-impl+posixy-x64-sse4.s
@@ -1,0 +1,41 @@
+.globl math$trunc32
+.globl math$_trunc32
+math$trunc32:
+math$_trunc32:
+ roundss $0x03, %xmm0, %xmm0
+ ret
+
+.globl math$floor32
+.globl math$_floor32
+math$floor32:
+math$_floor32:
+ roundss $0x01, %xmm0, %xmm0
+ ret
+
+.globl math$ceil32
+.globl math$_ceil32
+math$ceil32:
+math$_ceil32:
+ roundss $0x02, %xmm0, %xmm0
+ ret
+
+.globl math$trunc64
+.globl math$_trunc64
+math$trunc64:
+math$_trunc64:
+ roundsd $0x03, %xmm0, %xmm0
+ ret
+
+.globl math$floor64
+.globl math$_floor64
+math$floor64:
+math$_floor64:
+ roundsd $0x01, %xmm0, %xmm0
+ ret
+
+.globl math$ceil64
+.globl math$_ceil64
+math$ceil64:
+math$_ceil64:
+ roundsd $0x02, %xmm0, %xmm0
+ ret
--- /dev/null
+++ b/lib/math/trunc-impl.myr
@@ -1,0 +1,103 @@
+use std
+
+pkg math =
+ pkglocal const trunc32 : (f : flt32 -> flt32)
+ pkglocal const floor32 : (f : flt32 -> flt32)
+ pkglocal const ceil32 : (f : flt32 -> flt32)
+ pkglocal const trunc64 : (f : flt64 -> flt64)
+ pkglocal const floor64 : (f : flt64 -> flt64)
+ pkglocal const ceil64 : (f : flt64 -> flt64)
+;;
+
+const Flt32NegMask : uint32 = (1 << 31)
+const Flt32SigMask : uint32 = (1 << 23) - 1
+
+const Flt64NegMask : uint64 = (1 << 63)
+const Flt64SigMask : uint64 = (1 << 52) - 1
+
+pkglocal const floor32 = {f : flt32
+ var n, e, s
+ (n, e, s) = std.flt32explode(f)
+
+ /* Many special cases */
+ if e >= 23 || f == -0.0
+ -> f
+ elif e < 0
+ if n
+ -> -1.0
+ else
+ -> 0.0
+ ;;
+ ;;
+
+ if n
+ var fractional_mask = Flt32SigMask >> (e : uint32)
+ if s & fractional_mask == 0
+ -> f
+ else
+ /* Turns out the packing of exp and sig is useful */
+ var u : uint32 = std.flt32bits(f) & ~fractional_mask
+ u += ((1 << 23) >> (e : uint32))
+ -> std.flt32frombits(u)
+ ;;
+ ;;
+
+ var m : uint32 = (Flt32SigMask >> (e : uint32))
+ -> std.flt32assem(n, e, s & ~m)
+}
+
+pkglocal const trunc32 = {f : flt32
+ if std.flt32bits(f) & Flt32NegMask != 0
+ -> -floor32(-f)
+ else
+ -> floor32(f)
+ ;;
+}
+
+pkglocal const ceil32 = {f : flt32
+ -> -floor32(-f)
+}
+
+pkglocal const floor64 = {f : flt64
+ var n, e, s
+ (n, e, s) = std.flt64explode(f)
+
+ /* Many special cases */
+ if e >= 52 || f == -0.0
+ -> f
+ elif e < 0
+ if n
+ -> -1.0
+ else
+ -> 0.0
+ ;;
+ ;;
+
+ if n
+ var fractional_mask = Flt64SigMask >> (e : uint64)
+ if s & fractional_mask == 0
+ -> f
+ else
+ /* Turns out the packing of exp and sig is useful */
+ var u : uint64 = std.flt64bits(f) & ~fractional_mask
+ u += ((1 << 52) >> (e : uint64))
+ -> std.flt64frombits(u)
+ ;;
+ ;;
+
+ var m : uint64 = (Flt64SigMask >> (e : uint64))
+ -> std.flt64assem(n, e, s & ~m)
+}
+
+pkglocal const trunc64 = {f : flt64
+ if std.flt64bits(f) & Flt64NegMask != 0
+ -> -floor64(-f)
+ else
+ -> floor64(f)
+ ;;
+}
+
+pkglocal const ceil64 = {f : flt64
+ -> -floor64(-f)
+}
+
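
The negative branch of floor32/floor64 above works directly on the packed bits:
masking off the fractional significand bits truncates toward zero, and adding
((1 << 23) >> e), one unit in the last integer place, then steps to the next
more-negative integer, with any carry flowing naturally into the exponent field.
A worked example for floor32(-3.5), matching the (-3.5, -4.0) case in the floor
tests:

	-3.5                  = 0xc0600000,  e = 1
	fractional_mask       = 0x7fffff >> 1 = 0x003fffff
	bits & mask           = 0x00200000 != 0          (not already an integer)
	u = bits & ~mask      = 0xc0400000               (truncated toward zero: -3.0)
	u + ((1 << 23) >> 1)  = 0xc0400000 + 0x00400000  = 0xc0800000 = -4.0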