shithub: mc


ref: 5de1231c0605526b22779dd827cc96461de40f77
parent: d27469804f73bf0874250eda520532875ecf5645
author: S. Gilles <sgilles@math.umd.edu>
date: Thu Mar 22 07:10:21 EDT 2018

Drop "fpmath-" prefix, it's redundant

--- a/lib/math/bld.sub
+++ b/lib/math/bld.sub
@@ -2,15 +2,15 @@
 	fpmath.myr
 
 	# trunc, floor, ceil
-	fpmath-trunc-impl+posixy-x64-sse4.s
-	fpmath-trunc-impl.myr
+	trunc-impl+posixy-x64-sse4.s
+	trunc-impl.myr
 
 	# summation
-	fpmath-sum-impl.myr
+	sum-impl.myr
 
 	# fused-multiply-add
-	fpmath-fma-impl+posixy-x64-fma.s
-	fpmath-fma-impl.myr
+	fma-impl+posixy-x64-fma.s
+	fma-impl.myr
 
 	lib ../std:std
 ;;
--- /dev/null
+++ b/lib/math/fma-impl+posixy-x64-fma.s
@@ -1,0 +1,13 @@
+.globl math$fma32
+.globl math$_fma32
+math$fma32:
+math$_fma32:
+	vfmadd132ss %xmm1, %xmm2, %xmm0
+	ret
+
+.globl math$fma64
+.globl math$_fma64
+math$fma64:
+math$_fma64:
+	vfmadd132sd %xmm1, %xmm2, %xmm0
+	ret
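
Under the SysV AMD64 calling convention, x, y and z arrive in %xmm0, %xmm1 and %xmm2, and vfmadd132ss/vfmadd132sd compute xmm0 = xmm0*xmm1 + xmm2 with a single rounding, i.e. fma(x, y, z). A minimal C sketch of why that single rounding matters (illustrative only; assumes C99 hex-float literals and a correctly rounded libm fmaf):

#include <math.h>
#include <stdio.h>

int main(void)
{
	float a = 1.0f + 0x1p-23f;	/* 1 + one ulp */
	float b = 1.0f - 0x1p-23f;	/* 1 - one ulp */
	float c = -1.0f;
	/* a*b = 1 - 2^-46 exactly; rounding the product first loses that tail */
	printf("separate: %g\n", a * b + c);	/* 0 */
	printf("fused:    %g\n", fmaf(a, b, c));	/* -2^-46, about -1.42e-14 */
	return 0;
}
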
--- /dev/null
+++ b/lib/math/fma-impl.myr
@@ -1,0 +1,551 @@
+use std
+
+pkg math =
+	pkglocal const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32)
+	pkglocal const fma64 : (x : flt64, y : flt64, z : flt64 -> flt64)
+;;
+
+const exp_mask32 : uint32 = 0xff << 23
+const exp_mask64 : uint64 = 0x7ff << 52
+
+pkglocal const fma32 = {x : flt32, y : flt32, z : flt32
+	var xn, yn
+	(xn, _, _) = std.flt32explode(x)
+	(yn, _, _) = std.flt32explode(y)
+	var xd : flt64 = flt64fromflt32(x)
+	var yd : flt64 = flt64fromflt32(y)
+	var zd : flt64 = flt64fromflt32(z)
+	var prod : flt64 = xd * yd
+	var pn, pe, ps
+	(pn, pe, ps) = std.flt64explode(prod)
+	if pe == -1023
+		pe = -1022
+	;;
+	if pn != (xn != yn)
+		/* In case of NaNs, sign might not have been preserved */
+		pn = (xn != yn)
+		prod = std.flt64assem(pn, pe, ps)
+	;;
+
+	var r : flt64 = prod + zd
+	var rn, re, rs
+	(rn, re, rs) = std.flt64explode(r)
+
+	/*
+	   At this point, r is probably the correct answer. The
+	   only issue is that if truncating r to a flt32 causes
+	   rounding-to-even, and r was already obtained by rounding
+	   in that direction, then we'd be over-rounding. The only
+	   way this could possibly be a problem is if the product
+	   step rounded to the halfway point (a 100...0, with the
+	   1 just outside truncation range).
+	 */
+	if re == 1024 || rs & 0x1fffffff != 0x10000000
+		-> flt32fromflt64(r)
+	;;
+
+	/*
+	   At this point, there's definitely about to be a rounding
+	   error. To figure out what to do, compute prod + z with
+	   round-to-zero. If we get r again, then it's okay to round
+	   r upwards, because it hasn't been rounded away from zero
+	   yet and we allow ourselves one such rounding.
+	 */
+	var zn, ze, zs
+	(zn, ze, zs) = std.flt64explode(zd)
+	if ze == -1023
+		ze = -1022
+	;;
+	var rtn, rte, rts
+	if pe >= ze && pn == zn
+		(rtn, rte, rts) = (pn, pe, ps)
+		rts += shr(zs, pe - ze)
+	elif pe > ze || (pe == ze && ps > zs)
+		(rtn, rte, rts) = (pn, pe, ps)
+		rts -= shr(zs, pe - ze)
+		if shr((-1 : uint64), 64 - std.min(64, (pe - ze))) & zs != 0
+			rts--
+		;;
+	elif pe < ze && pn == zn
+		(rtn, rte, rts) = (zn, ze, zs)
+		rts += shr(ps, ze - pe)
+	else
+		(rtn, rte, rts) = (zn, ze, zs)
+		rts -= shr(ps, ze - pe)
+		if shr((-1 : uint64), 64 - std.min(64, (ze - pe))) & ps != 0
+			rts--
+		;;
+	;;
+
+	if rts & (1 << 53) != 0
+		rts >>= 1
+		rte++
+	;;
+
+	if rn == rtn && rs == rts && re == rte
+		rts++
+		if rts & (1 << 53) != 0
+			rts >>= 1
+			rte++
+		;;
+	;;
+	-> flt32fromflt64(std.flt64assem(rtn, rte, rts))
+}
+
+const flt64fromflt32 = {f : flt32
+	var n, e, s
+	(n, e, s) = std.flt32explode(f)
+	var xs : uint64 = (s : uint64)
+	var xe : int64 = (e : int64)
+
+	if e == 128
+		-> std.flt64assem(n, 1024, xs)
+	elif e == -127
+		/*
+		  All subnormals in single precision (except 0.0s)
+		  can be upgraded to double precision, since the
+		  exponent range is so much wider.
+		 */
+		var first1 = find_first1_64(xs, 23)
+		if first1 < 0
+			-> std.flt64assem(n, -1023, 0)
+		;;
+		xs = xs << (52 - (first1 : uint64))
+		xe = -126 - (23 - first1)
+		-> std.flt64assem(n, xe, xs)
+	;;
+
+	-> std.flt64assem(n, xe, xs << (52 - 23))
+}
+
+const flt32fromflt64 = {f : flt64
+	var n : bool, e : int64, s : uint64
+	(n, e, s) = std.flt64explode(f)
+	var ts : uint32
+	var te : int32 = (e : int32)
+
+	if e >= 128
+		if e == 1024 && s != 0
+			/* NaN */
+			-> std.flt32assem(n, 128, 1)
+		else
+			/* infinity */
+			-> std.flt32assem(n, 128, 0)
+		;;
+	;;
+
+	if e >= -127
+		/* normal */
+		ts = ((s >> (52 - 23)) : uint32)
+		if need_round_away(0, s, 52 - 23)
+			ts++
+			if ts & (1 << 24) != 0
+				ts >>= 1
+				te++
+			;;
+		;;
+		-> std.flt32assem(n, te, ts)
+	;;
+
+	/* subnormal already, will have to go to 0 */
+	if e == -1023
+		-> std.flt32assem(n, -127, 0)
+	;;
+
+	/* subnormal (at least, it will be) */
+	te = -127
+	var shift : int64 = (52 - 23) + (-126 - e)
+	var ts1 = shr(s, shift)
+	ts = (ts1 : uint32)
+	if need_round_away(0, s, shift)
+		ts++
+	;;
+	-> std.flt32assem(n, te, ts)
+}
+
+pkglocal const fma64 = {x : flt64, y : flt64, z : flt64
+	var xn : bool, yn : bool, zn : bool
+	var xe : int64, ye : int64, ze : int64
+	var xs : uint64, ys : uint64, zs : uint64
+
+	var xb : uint64 = std.flt64bits(x)
+	var yb : uint64 = std.flt64bits(y)
+	var zb : uint64 = std.flt64bits(z)
+
+	/* check for both NaNs and infinities */
+	if xb & exp_mask64 == exp_mask64 || \
+	   yb & exp_mask64 == exp_mask64
+		-> x * y + z
+	elif z == 0.0 || z == -0.0 || x * y == 0.0 || x * y == -0.0
+		-> x * y + z
+	elif zb & exp_mask64 == exp_mask64
+		-> z
+	;;
+
+	(xn, xe, xs) = std.flt64explode(x)
+	(yn, ye, ys) = std.flt64explode(y)
+	(zn, ze, zs) = std.flt64explode(z)
+	if xe == -1023
+		xe = -1022
+	;;
+	if ye == -1023
+		ye = -1022
+	;;
+	if ze == -1023
+		ze = -1022
+	;;
+
+	/* Keep product in high/low uint64s */
+	var xs_h : uint64 = xs >> 32
+	var ys_h : uint64 = ys >> 32
+	var xs_l : uint64 = xs & 0xffffffff
+	var ys_l : uint64 = ys & 0xffffffff
+
+	var t_l : uint64 = xs_l * ys_l
+	var t_m : uint64 = xs_l * ys_h + xs_h * ys_l
+	var t_h : uint64 = xs_h * ys_h
+
+	var prod_l : uint64 = t_l + (t_m << 32)
+	var prod_h : uint64 = t_h + (t_m >> 32)
+	if t_l > prod_l
+		prod_h++
+	;;
+
+	var prod_n = xn != yn
+	var prod_lastbit_e = (xe - 52) + (ye - 52)
+	var prod_first1 = find_first1_64_hl(prod_h, prod_l, 105)
+	var prod_firstbit_e = prod_lastbit_e + prod_first1
+
+	var z_firstbit_e = ze
+	var z_lastbit_e = ze - 52
+	var z_first1 = 52
+
+	/* subnormals could throw firstbit_e calculations out of whack */
+	if (zb & exp_mask64 == 0)
+		z_first1 = find_first1_64(zs, z_first1)
+		z_firstbit_e = z_lastbit_e + z_first1
+	;;
+
+	var res_n
+	var res_h = 0
+	var res_l = 0
+	var res_first1
+	var res_lastbit_e
+	var res_firstbit_e
+
+	if prod_n == zn
+		res_n = prod_n
+
+		/*
+		   Align prod and z so that the top bit of the
+		   result is either 53 or 54, then add.
+		 */
+		if prod_firstbit_e >= z_firstbit_e
+			/*
+			    [ prod_h ][ prod_l ]
+			         [ z...
+			 */
+			res_lastbit_e = prod_lastbit_e
+			(res_h, res_l) = (prod_h, prod_l)
+			(res_h, res_l) = add_shifted(res_h, res_l, zs, z_lastbit_e - prod_lastbit_e)
+		else
+			/*
+			        [ prod_h ][ prod_l ]
+			    [ z...
+			 */
+			res_lastbit_e = z_lastbit_e - 64
+			res_h = zs
+			res_l = 0
+			if prod_lastbit_e >= res_lastbit_e + 64
+				/* In this situation, prod must be extremely subnormal */
+				res_h += shl(prod_l, prod_lastbit_e - res_lastbit_e - 64)
+			elif prod_lastbit_e >= res_lastbit_e
+				res_h += shl(prod_h, prod_lastbit_e - res_lastbit_e)
+				res_h += shr(prod_l, res_lastbit_e + 64 - prod_lastbit_e)
+				res_l += shl(prod_l, prod_lastbit_e - res_lastbit_e)
+			elif prod_lastbit_e + 64 >= res_lastbit_e
+				res_h += shr(prod_h, res_lastbit_e - prod_lastbit_e)
+				var l1 = shl(prod_h, prod_lastbit_e + 64 - res_lastbit_e)
+				var l2 = shr(prod_l, res_lastbit_e - prod_lastbit_e)
+				res_l = l1 + l2
+				if res_l < l1
+					res_h++
+				;;
+			elif prod_lastbit_e + 128 >= res_lastbit_e
+				res_l += shr(prod_h, res_lastbit_e - prod_lastbit_e - 64)
+			;;
+		;;
+	else
+		match compare_hl_z(prod_h, prod_l, prod_firstbit_e, prod_lastbit_e, zs, z_firstbit_e, z_lastbit_e)
+		| `std.Equal: -> 0.0
+		| `std.Before:
+			/* prod > z */
+			res_n = prod_n
+			res_lastbit_e = prod_lastbit_e
+			(res_h, res_l) = sub_shifted(prod_h, prod_l, zs, z_lastbit_e - prod_lastbit_e)
+		| `std.After:
+			/* z > prod */
+			res_n = zn
+			res_lastbit_e = z_lastbit_e - 64
+			(res_h, res_l) = sub_shifted(zs, 0, prod_h, prod_lastbit_e + 64 - (z_lastbit_e - 64))
+			(res_h, res_l) = sub_shifted(res_h, res_l, prod_l, prod_lastbit_e - (z_lastbit_e - 64))
+		;;
+	;;
+
+	res_first1 = 64 + find_first1_64(res_h, 55)
+	if res_first1 == 63
+		res_first1 = find_first1_64(res_l, 63)
+	;;
+	res_firstbit_e = res_first1 + res_lastbit_e
+
+	/*
+	   Finally, res_h and res_l are the high and low bits of
+	   the result. They now need to be assembled into a flt64.
+	   Subnormals and infinities could be a problem.
+	 */
+	var res_s = 0
+	if res_firstbit_e <= -1023
+		/* Subnormal case */
+		if res_lastbit_e + 128 < 12 - 1022
+			res_s = shr(res_h, 12 - 1022 - (res_lastbit_e + 128))
+			res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64))
+		elif res_lastbit_e + 64 < 12 - 1022
+			res_s = shl(res_h, -12 + (res_lastbit_e + 128) - (-1022))
+			res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64))
+		else
+			res_s = shl(res_h, -12 + (res_lastbit_e + 128) - (-1022))
+			res_s |= shl(res_l, -12 + (res_lastbit_e + 64) - (-1022))
+		;;
+
+		if need_round_away(res_h, res_l, res_first1 + (-1074 - res_firstbit_e))
+			res_s++
+		;;
+
+		/* No need for exponents, they are all zero */
+		var res = res_s
+		if res_n
+			res |= (1 << 63)
+		;;
+		-> std.flt64frombits(res)
+	;;
+
+	if res_firstbit_e >= 1024
+		/* Infinity case */
+		if res_n
+			-> std.flt64frombits(0xfff0000000000000)
+		else
+			-> std.flt64frombits(0x7ff0000000000000)
+		;;
+	;;
+
+	if res_first1 - 52 >= 64
+		res_s = shr(res_h, (res_first1 : int64) - 64 - 52)
+		if need_round_away(res_h, res_l, res_first1 - 52)
+			res_s++
+		;;
+	elif res_first1 - 52 >= 0
+		res_s = shl(res_h, 64 - (res_first1 - 52))
+		res_s |= shr(res_l, res_first1 - 52)
+		if need_round_away(res_h, res_l, res_first1 - 52)
+			res_s++
+		;;
+	else
+		res_s = shl(res_h, res_first1 - 52)
+	;;
+
+	/* The res_s++s might have messed everything up */
+	if res_s & (1 << 53) != 0
+		res_s >>= 1
+		res_firstbit_e++
+		if res_firstbit_e >= 1024
+			if res_n
+				-> std.flt64frombits(0xfff0000000000000)
+			else
+				-> std.flt64frombits(0x7ff0000000000000)
+			;;
+		;;
+	;;
+
+	-> std.flt64assem(res_n, res_firstbit_e, res_s)
+}
+
+/* >> and <<, but without wrapping when the shift is >= 64 */
+const shr : (u : uint64, s : int64 -> uint64) = {u : uint64, s : int64
+	if (s : uint64) >= 64
+		-> 0
+	else
+		-> u >> (s : uint64)
+	;;
+}
+
+const shl : (u : uint64, s : int64 -> uint64) = {u : uint64, s : int64
+	if (s : uint64) >= 64
+		-> 0
+	else
+		-> u << (s : uint64)
+	;;
+}
+
+/*
+   Add (a << s) to [ h ][ l ], where if s < 0 then a corresponding
+   right-shift is used. This is aligned such that if s == 0, then
+   the result is [ h ][ l + a ]
+ */
+const add_shifted = {h : uint64, l : uint64, a : uint64, s : int64
+	if s >= 64
+		-> (h + shl(a, s - 64), l)
+	elif s >= 0
+		var new_h = h + shr(a, 64 - s)
+		var sa = shl(a, s)
+		var new_l = l + sa
+		if new_l < l
+			new_h++
+		;;
+		-> (new_h, new_l)
+	else
+		var new_h = h
+		var sa = shr(a, -s)
+		var new_l = l + sa
+		if new_l < l
+			new_h++
+		;;
+		-> (new_h, new_l)
+	;;
+}
+
+/* As above, but subtract (a << s) */
+const sub_shifted = {h : uint64, l : uint64, a : uint64, s : int64
+	if s >= 64
+		-> (h - shl(a, s - 64), l)
+	elif s >= 0
+		var new_h = h - shr(a, 64 - s)
+		var sa = shl(a, s)
+		var new_l = l - sa
+		if sa > l
+			new_h--
+		;;
+		-> (new_h, new_l)
+	else
+		var new_h = h
+		var sa = shr(a, -s)
+		var new_l = l - sa
+		if sa > l
+			new_h--
+		;;
+		-> (new_h, new_l)
+	;;
+}
+
+const compare_hl_z = {h : uint64, l : uint64, hl_firstbit_e : int64, hl_lastbit_e : int64, z : uint64, z_firstbit_e : int64, z_lastbit_e : int64
+	if hl_firstbit_e > z_firstbit_e
+		-> `std.Before
+	elif hl_firstbit_e < z_firstbit_e
+		-> `std.After
+	;;
+
+	var h_k : int64 = (hl_firstbit_e - hl_lastbit_e - 64)
+	var z_k : int64 = (z_firstbit_e - z_lastbit_e)
+	while h_k >= 0 && z_k >= 0
+		var h1 = h & shl(1, h_k) != 0
+		var z1 = z & shl(1, z_k) != 0
+		if h1 && !z1
+			-> `std.Before
+		elif !h1 && z1
+			-> `std.After
+		;;
+		h_k--
+		z_k--
+	;;
+
+	if z_k < 0
+		if (h & shr((-1 : uint64), 64 - h_k) != 0) || (l != 0)
+			-> `std.Before
+		else
+			-> `std.Equal
+		;;
+	;;
+
+	var l_k : int64 = 63
+	while l_k >= 0 && z_k >= 0
+		var l1 = l & shl(1, l_k) != 0
+		var z1 = z & shl(1, z_k) != 0
+		if l1 && !z1
+			-> `std.Before
+		elif !l1 && z1
+			-> `std.After
+		;;
+		l_k--
+		z_k--
+	;;
+
+	if (z_k < 0) && (l & shr((-1 : uint64), 64 - l_k) != 0)
+		-> `std.Before
+	elif (l_k < 0) && (z & shr((-1 : uint64), 64 - z_k) != 0)
+		-> `std.After
+	;;
+
+	-> `std.Equal
+}
+
+/* Find the first 1 bit in a bitstring */
+const find_first1_64 : (b : uint64, start : int64 -> int64) = {b : uint64, start : int64
+	for var j = start; j >= 0; --j
+		var m = shl(1, j)
+		if b & m != 0
+			-> j
+		;;
+	;;
+
+	-> -1
+}
+
+const find_first1_64_hl = {h, l, start
+	var first1_h = find_first1_64(h, start - 64)
+	if first1_h >= 0
+		-> first1_h + 64
+	;;
+
+	-> find_first1_64(l, 63)
+}
+
+/*
+   For [ h ][ l ], where bitpos_last is the position of the last
+   bit that was included in the truncated result (l's last bit has
+   position 0), decide whether rounding up/away is needed. This is
+   true if
+
+    - following bitpos_last is a 1, then a non-zero sequence, or
+
+    - following bitpos_last is a 1, then a zero sequence, and the
+      round would be to even
+ */
+const need_round_away = {h : uint64, l : uint64, bitpos_last : int64
+	var first_omitted_is_1 = false
+	var nonzero_beyond = false
+	if bitpos_last > 64
+		first_omitted_is_1 = h & shl(1, bitpos_last - 1 - 64) != 0
+		nonzero_beyond = nonzero_beyond || h & shr((-1 : uint64), 2 + 64 - (bitpos_last - 64)) != 0
+		nonzero_beyond = nonzero_beyond || (l != 0)
+	else
+		first_omitted_is_1 = l & shl(1, bitpos_last - 1) != 0
+		nonzero_beyond = nonzero_beyond || l & shr((-1 : uint64), 1 + 64 - bitpos_last) != 0
+	;;
+
+	if !first_omitted_is_1
+		-> false
+	;;
+
+	if nonzero_beyond
+		-> true
+	;;
+
+	var hl_is_odd = false
+
+	if bitpos_last >= 64
+		hl_is_odd = h & shl(1, bitpos_last - 64) != 0
+	else
+		hl_is_odd = l & shl(1, bitpos_last) != 0
+	;;
+
+	-> hl_is_odd
+}
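
The "keep product in high/low uint64s" step in fma64 above is ordinary double-word multiplication: split each 53-bit significand into 32-bit halves, form the partial products, and propagate the carry from the low word. A C sketch of the same decomposition (illustrative only; the helper name is made up here):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* 64x64 -> 128-bit product, as in fma64: safe because the significands fit
   in 53 bits, so the middle term xl*yh + xh*yl cannot overflow a uint64_t. */
static void mul_wide(uint64_t x, uint64_t y, uint64_t *hi, uint64_t *lo)
{
	uint64_t xh = x >> 32, xl = x & 0xffffffff;
	uint64_t yh = y >> 32, yl = y & 0xffffffff;
	uint64_t tl = xl * yl;
	uint64_t tm = xl * yh + xh * yl;
	uint64_t th = xh * yh;

	*lo = tl + (tm << 32);
	*hi = th + (tm >> 32) + (*lo < tl);	/* carry out of the low word */
}

int main(void)
{
	uint64_t hi, lo;
	mul_wide((1ull << 52) | 1, (1ull << 52) | 1, &hi, &lo);	/* (2^52 + 1)^2 */
	printf("%" PRIx64 " %016" PRIx64 "\n", hi, lo);	/* 10000000000 0020000000000001 */
	return 0;
}
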
--- a/lib/math/fpmath-fma-impl+posixy-x64-fma.s
+++ /dev/null
@@ -1,13 +1,0 @@
-.globl math$fma32
-.globl math$_fma32
-math$fma32:
-math$_fma32:
-	vfmadd132ss %xmm1, %xmm2, %xmm0
-	ret
-
-.globl math$fma64
-.globl math$_fma64
-math$fma64:
-math$_fma64:
-	vfmadd132sd %xmm1, %xmm2, %xmm0
-	ret
--- a/lib/math/fpmath-fma-impl.myr
+++ /dev/null
@@ -1,551 +1,0 @@
-use std
-
-pkg math =
-	pkglocal const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32)
-	pkglocal const fma64 : (x : flt64, y : flt64, z : flt64 -> flt64)
-;;
-
-const exp_mask32 : uint32 = 0xff << 23
-const exp_mask64 : uint64 = 0x7ff << 52
-
-pkglocal const fma32 = {x : flt32, y : flt32, z : flt32
-	var xn, yn
-	(xn, _, _) = std.flt32explode(x)
-	(yn, _, _) = std.flt32explode(y)
-	var xd : flt64 = flt64fromflt32(x)
-	var yd : flt64 = flt64fromflt32(y)
-	var zd : flt64 = flt64fromflt32(z)
-	var prod : flt64 = xd * yd
-	var pn, pe, ps
-	(pn, pe, ps) = std.flt64explode(prod)
-	if pe == -1023
-		pe = -1022
-	;;
-	if pn != (xn != yn)
-		/* In case of NaNs, sign might not have been preserved */
-		pn = (xn != yn)
-		prod = std.flt64assem(pn, pe, ps)
-	;;
-
-	var r : flt64 = prod + zd
-	var rn, re, rs
-	(rn, re, rs) = std.flt64explode(r)
-
-	/*
-	   At this point, r is probably the correct answer. The
-	   only issue is that if truncating r to a flt32 causes
-	   rounding-to-even, and it was obtained by rounding in the
-	   first place, direction, then we'd be over-rounding. The
-	   only way this could possibly be a problem is if the
-	   product step rounded to the halfway point (a 100...0,
-	   with the 1 just outside truncation range).
-         */
-	if re == 1024 || rs & 0x1fffffff != 0x10000000
-		-> flt32fromflt64(r)
-	;;
-
-	/*
-	   At this point, there's definitely about to be a rounding
-	   error. To figure out what to do, compute prod + z with
-	   round-to-zero. If we get r again, then it's okay to round
-	   r upwards, because it hasn't been rounded away from zero
-	   yet and we allow ourselves one such rounding.
-	 */
-	var zn, ze, zs
-	(zn, ze, zs) = std.flt64explode(zd)
-	if ze == -1023
-		ze = -1022
-	;;
-	var rtn, rte, rts
-	if pe >= ze && pn == zn
-		(rtn, rte, rts) = (pn, pe, ps)
-		rts += shr(zs, pe - ze)
-	elif pe > ze || (pe == ze && ps > zs)
-		(rtn, rte, rts) = (pn, pe, ps)
-		rts -= shr(zs, pe - ze)
-		if shr((-1 : uint64), 64 - std.min(64, (pe - ze))) & zs != 0
-			rts--
-		;;
-	elif pe < ze && pn == zn
-		(rtn, rte, rts) = (zn, ze, zs)
-		rts += shr(ps, ze - pe)
-	else
-		(rtn, rte, rts) = (zn, ze, zs)
-		rts -= shr(ps, ze - pe)
-		if shr((-1 : uint64), 64 - std.min(64, (ze - pe))) & ps != 0
-			rts--
-		;;
-	;;
-
-	if rts & (1 << 53) != 0
-		rts >>= 1
-		rte++
-	;;
-
-	if rn == rtn && rs == rts && re == rte
-		rts++
-		if rts & (1 << 53) != 0
-			rts >>= 1
-			rte++
-		;;
-	;;
-	-> flt32fromflt64(std.flt64assem(rtn, rte, rts))
-}
-
-const flt64fromflt32 = {f : flt32
-	var n, e, s
-	(n, e, s) = std.flt32explode(f)
-	var xs : uint64 = (s : uint64)
-	var xe : int64 = (e : int64)
-
-	if e == 128
-		-> std.flt64assem(n, 1024, xs)
-	elif e == -127
-		/*
-		  All subnormals in single precision (except 0.0s)
-		  can be upgraded to double precision, since the
-		  exponent range is so much wider.
-		 */
-		var first1 = find_first1_64(xs, 23)
-		if first1 < 0
-			-> std.flt64assem(n, -1023, 0)
-		;;
-		xs = xs << (52 - (first1 : uint64))
-		xe = -126 - (23 - first1)
-		-> std.flt64assem(n, xe, xs)
-	;;
-
-	-> std.flt64assem(n, xe, xs << (52 - 23))
-}
-
-const flt32fromflt64 = {f : flt64
-	var n : bool, e : int64, s : uint64
-	(n, e, s) = std.flt64explode(f)
-	var ts : uint32
-	var te : int32 = (e : int32)
-
-	if e >= 128
-		if e == 1023 && s != 0
-			/* NaN */
-			-> std.flt32assem(n, 128, 1)
-		else
-			/* infinity */
-			-> std.flt32assem(n, 128, 0)
-		;;
-	;;
-
-	if e >= -127
-		/* normal */
-		ts = ((s >> (52 - 23)) : uint32)
-		if need_round_away(0, s, 52 - 23)
-			ts++
-			if ts & (1 << 24) != 0
-				ts >>= 1
-				te++
-			;;
-		;;
-		-> std.flt32assem(n, te, ts)
-	;;
-
-	/* subnormal already, will have to go to 0 */
-	if e == -1023
-		-> std.flt32assem(n, -127, 0)
-	;;
-
-	/* subnormal (at least, it will be) */
-	te = -127
-	var shift : int64 = (52 - 23) + (-126 - e)
-	var ts1 = shr(s, shift)
-	ts = (ts1 : uint32)
-	if need_round_away(0, s, shift)
-		ts++
-	;;
-	-> std.flt32assem(n, te, ts)
-}
-
-pkglocal const fma64 = {x : flt64, y : flt64, z : flt64
-	var xn : bool, yn : bool, zn : bool
-	var xe : int64, ye : int64, ze : int64
-	var xs : uint64, ys : uint64, zs : uint64
-
-	var xb : uint64 = std.flt64bits(x)
-	var yb : uint64 = std.flt64bits(y)
-	var zb : uint64 = std.flt64bits(z)
-
-	/* check for both NaNs and infinities */
-	if xb & exp_mask64 == exp_mask64 || \
-	   yb & exp_mask64 == exp_mask64
-		-> x * y + z
-	elif z == 0.0 || z == -0.0 || x * y == 0.0 || x * y == -0.0
-		-> x * y + z
-	elif zb & exp_mask64 == exp_mask64
-		-> z
-	;;
-
-	(xn, xe, xs) = std.flt64explode(x)
-	(yn, ye, ys) = std.flt64explode(y)
-	(zn, ze, zs) = std.flt64explode(z)
-	if xe == -1023
-		xe = -1022
-	;;
-	if ye == -1023
-		ye = -1022
-	;;
-	if ze == -1023
-		ze = -1022
-	;;
-
-        /* Keep product in high/low uint64s */
-	var xs_h : uint64 = xs >> 32
-	var ys_h : uint64 = ys >> 32
-	var xs_l : uint64 = xs & 0xffffffff
-	var ys_l : uint64 = ys & 0xffffffff
-
-	var t_l : uint64 = xs_l * ys_l
-	var t_m : uint64 = xs_l * ys_h + xs_h * ys_l
-	var t_h : uint64 = xs_h * ys_h
-
-	var prod_l : uint64 = t_l + (t_m << 32)
-	var prod_h : uint64 = t_h + (t_m >> 32)
-	if t_l > prod_l
-		prod_h++
-	;;
-
-	var prod_n = xn != yn
-	var prod_lastbit_e = (xe - 52) + (ye - 52)
-	var prod_first1 = find_first1_64_hl(prod_h, prod_l, 105)
-	var prod_firstbit_e = prod_lastbit_e + prod_first1
-
-	var z_firstbit_e = ze
-	var z_lastbit_e = ze - 52
-	var z_first1 = 52
-
-	/* subnormals could throw firstbit_e calculations out of whack */
-	if (zb & exp_mask64 == 0)
-		z_first1 = find_first1_64(zs, z_first1)
-		z_firstbit_e = z_lastbit_e + z_first1
-	;;
-
-	var res_n
-	var res_h = 0
-	var res_l = 0
-	var res_first1
-	var res_lastbit_e
-	var res_firstbit_e
-
-	if prod_n == zn
-		res_n = prod_n
-
-		/*
-		   Align prod and z so that the top bit of the
-		   result is either 53 or 54, then add.
-		 */
-		if prod_firstbit_e >= z_firstbit_e
-			/*
-			    [ prod_h ][ prod_l ]
-			         [ z...
-			 */
-			res_lastbit_e = prod_lastbit_e
-			(res_h, res_l) = (prod_h, prod_l)
-			(res_h, res_l) = add_shifted(res_h, res_l, zs, z_lastbit_e - prod_lastbit_e)
-		else
-			/*
-			        [ prod_h ][ prod_l ]
-			    [ z...
-			 */
-			res_lastbit_e = z_lastbit_e - 64
-			res_h = zs
-			res_l = 0
-			if prod_lastbit_e >= res_lastbit_e + 64
-				/* In this situation, prod must be extremely subnormal */
-				res_h += shl(prod_l, prod_lastbit_e - res_lastbit_e - 64)
-			elif prod_lastbit_e >= res_lastbit_e
-				res_h += shl(prod_h, prod_lastbit_e - res_lastbit_e)
-				res_h += shr(prod_l, res_lastbit_e + 64 - prod_lastbit_e)
-				res_l += shl(prod_l, prod_lastbit_e - res_lastbit_e)
-			elif prod_lastbit_e + 64 >= res_lastbit_e
-				res_h += shr(prod_h, res_lastbit_e - prod_lastbit_e)
-				var l1 = shl(prod_h, prod_lastbit_e + 64 - res_lastbit_e)
-				var l2 = shr(prod_l, res_lastbit_e - prod_lastbit_e)
-				res_l = l1 + l2
-				if res_l < l1
-					res_h++
-				;;
-			elif prod_lastbit_e + 128 >= res_lastbit_e
-				res_l += shr(prod_h, res_lastbit_e - prod_lastbit_e - 64)
-			;;
-		;;
-	else
-		match compare_hl_z(prod_h, prod_l, prod_firstbit_e, prod_lastbit_e, zs, z_firstbit_e, z_lastbit_e)
-		| `std.Equal: -> 0.0
-		| `std.Before:
-			/* prod > z */
-			res_n = prod_n
-			res_lastbit_e = prod_lastbit_e
-			(res_h, res_l) = sub_shifted(prod_h, prod_l, zs, z_lastbit_e - prod_lastbit_e)
-		| `std.After:
-			/* z > prod */
-			res_n = zn
-			res_lastbit_e = z_lastbit_e - 64
-			(res_h, res_l) = sub_shifted(zs, 0, prod_h, prod_lastbit_e + 64 - (z_lastbit_e - 64))
-			(res_h, res_l) = sub_shifted(res_h, res_l, prod_l, prod_lastbit_e - (z_lastbit_e - 64))
-		;;
-	;;
-
-	res_first1 = 64 + find_first1_64(res_h, 55)
-	if res_first1 == 63
-		res_first1 = find_first1_64(res_l, 63)
-	;;
-	res_firstbit_e = res_first1 + res_lastbit_e
-
-	/*
-	   Finally, res_h and res_l are the high and low bits of
-	   the result. They now need to be assembled into a flt64.
-	   Subnormals and infinities could be a problem.
-	 */
-	var res_s = 0
-	if res_firstbit_e <= -1023
-		/* Subnormal case */
-		if res_lastbit_e + 128 < 12 - 1022
-			res_s = shr(res_h, 12 - 1022 - (res_lastbit_e + 128))
-			res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64))
-		elif res_lastbit_e + 64 < 12 - 1022
-			res_s = shl(res_h, -12 + (res_lastbit_e + 128) - (-1022))
-			res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64))
-		else
-			res_s = shl(res_h, -12 + (res_lastbit_e + 128) - (-1022))
-			res_s |= shl(res_l, -12 + (res_lastbit_e + 64) - (-1022))
-		;;
-
-		if need_round_away(res_h, res_l, res_first1 + (-1074 - res_firstbit_e))
-			res_s++
-		;;
-
-		/* No need for exponents, they are all zero */
-		var res = res_s
-		if res_n
-			res |= (1 << 63)
-		;;
-		-> std.flt64frombits(res)
-	;;
-
-	if res_firstbit_e >= 1024
-		/* Infinity case */
-		if res_n
-			-> std.flt64frombits(0xfff0000000000000)
-		else
-			-> std.flt64frombits(0x7ff0000000000000)
-		;;
-	;;
-
-	if res_first1 - 52 >= 64
-		res_s = shr(res_h, (res_first1 : int64) - 64 - 52)
-		if need_round_away(res_h, res_l, res_first1 - 52)
-			res_s++
-		;;
-	elif res_first1 - 52 >= 0
-		res_s = shl(res_h, 64 - (res_first1 - 52))
-		res_s |= shr(res_l, res_first1 - 52)
-		if need_round_away(res_h, res_l, res_first1 - 52)
-			res_s++
-		;;
-	else
-		res_s = shl(res_h, res_first1 - 52)
-	;;
-
-	/* The res_s++s might have messed everything up */
-	if res_s & (1 << 53) != 0
-		res_s >= 1
-		res_firstbit_e++
-		if res_firstbit_e >= 1024
-			if res_n
-				-> std.flt64frombits(0xfff0000000000000)
-			else
-				-> std.flt64frombits(0x7ff0000000000000)
-			;;
-		;;
-	;;
-
-	-> std.flt64assem(res_n, res_firstbit_e, res_s)
-}
-
-/* >> and <<, but without wrapping when the shift is >= 64 */
-const shr : (u : uint64, s : int64 -> uint64) = {u : uint64, s : int64
-	if (s : uint64) >= 64
-		-> 0
-	else
-		-> u >> (s : uint64)
-	;;
-}
-
-const shl : (u : uint64, s : int64 -> uint64) = {u : uint64, s : int64
-	if (s : uint64) >= 64
-		-> 0
-	else
-		-> u << (s : uint64)
-	;;
-}
-
-/*
-   Add (a << s) to [ h ][ l ], where if s < 0 then a corresponding
-   right-shift is used. This is aligned such that if s == 0, then
-   the result is [ h ][ l + a ]
- */
-const add_shifted = {h : uint64, l : uint64, a : uint64, s : int64
-	if s >= 64
-		-> (h + shl(a, s - 64), l)
-	elif s >= 0
-		var new_h = h + shr(a, 64 - s)
-		var sa = shl(a, s)
-		var new_l = l + sa
-		if new_l < l
-			new_h++
-		;;
-		-> (new_h, new_l)
-	else
-		var new_h = h
-		var sa = shr(a, -s)
-		var new_l = l + sa
-		if new_l < l
-			new_h++
-		;;
-		-> (new_h, new_l)
-	;;
-}
-
-/* As above, but subtract (a << s) */
-const sub_shifted = {h : uint64, l : uint64, a : uint64, s : int64
-	if s >= 64
-		-> (h - shl(a, s - 64), l)
-	elif s >= 0
-		var new_h = h - shr(a, 64 - s)
-		var sa = shl(a, s)
-		var new_l = l - sa
-		if sa > l
-			new_h--
-		;;
-		-> (new_h, new_l)
-	else
-		var new_h = h
-		var sa = shr(a, -s)
-		var new_l = l - sa
-		if sa > l
-			new_h--
-		;;
-		-> (new_h, new_l)
-	;;
-}
-
-const compare_hl_z = {h : uint64, l : uint64, hl_firstbit_e : int64, hl_lastbit_e : int64, z : uint64, z_firstbit_e : int64, z_lastbit_e : int64
-	if hl_firstbit_e > z_firstbit_e
-		-> `std.Before
-	elif hl_firstbit_e < z_firstbit_e
-		-> `std.After
-	;;
-
-	var h_k : int64 = (hl_firstbit_e - hl_lastbit_e - 64)
-	var z_k : int64 = (z_firstbit_e - z_lastbit_e)
-	while h_k >= 0 && z_k >= 0
-		var h1 = h & shl(1, h_k) != 0
-		var z1 = z & shl(1, z_k) != 0
-		if h1 && !z1
-			-> `std.Before
-		elif !h1 && z1
-			-> `std.After
-		;;
-		h_k--
-		z_k--
-	;;
-
-	if z_k < 0
-		if (h & shr((-1 : uint64), 64 - h_k) != 0) || (l != 0)
-			-> `std.Before
-		else
-			-> `std.Equal
-		;;
-	;;
-
-	var l_k : int64 = 63
-	while l_k >= 0 && z_k >= 0
-		var l1 = l & shl(1, l_k) != 0
-		var z1 = z & shl(1, z_k) != 0
-		if l1 && !z1
-			-> `std.Before
-		elif !l1 && z1
-			-> `std.After
-		;;
-		l_k--
-		z_k--
-	;;
-
-	if (z_k < 0) && (l & shr((-1 : uint64), 64 - l_k) != 0)
-		-> `std.Before
-	elif (l_k < 0) && (z & shr((-1 : uint64), 64 - z_k) != 0)
-		-> `std.After
-	;;
-
-	-> `std.Equal
-}
-
-/* Find the first 1 bit in a bitstring */
-const find_first1_64 : (b : uint64, start : int64 -> int64) = {b : uint64, start : int64
-	for var j = start; j >= 0; --j
-		var m = shl(1, j)
-		if b & m != 0
-			-> j
-		;;
-	;;
-
-	-> -1
-}
-
-const find_first1_64_hl = {h, l, start
-	var first1_h = find_first1_64(h, start - 64)
-	if first1_h >= 0
-		-> first1_h + 64
-	;;
-
-	-> find_first1_64(l, 63)
-}
-
-/*
-   For [ h ][ l ], where bitpos_last is the position of the last
-   bit that was included in the truncated result (l's last bit has
-   position 0), decide whether rounding up/away is needed. This is
-   true if
-
-    - following bitpos_last is a 1, then a non-zero sequence, or
-
-    - following bitpos_last is a 1, then a zero sequence, and the
-      round would be to even
- */
-const need_round_away = {h : uint64, l : uint64, bitpos_last : int64
-	var first_omitted_is_1 = false
-	var nonzero_beyond = false
-	if bitpos_last > 64
-		first_omitted_is_1 = h & shl(1, bitpos_last - 1 - 64) != 0
-		nonzero_beyond = nonzero_beyond || h & shr((-1 : uint64), 2 + 64 - (bitpos_last - 64)) != 0
-		nonzero_beyond = nonzero_beyond || (l != 0)
-	else
-		first_omitted_is_1 = l & shl(1, bitpos_last - 1) != 0
-		nonzero_beyond = nonzero_beyond || l & shr((-1 : uint64), 1 + 64 - bitpos_last) != 0
-	;;
-
-	if !first_omitted_is_1
-		-> false
-	;;
-
-	if nonzero_beyond
-		-> true
-	;;
-
-	var hl_is_odd = false
-
-	if bitpos_last >= 64
-		hl_is_odd = h & shl(1, bitpos_last - 64) != 0
-	else
-		hl_is_odd = l & shl(1, bitpos_last) != 0
-	;;
-
-	-> hl_is_odd
-}
--- a/lib/math/fpmath-sum-impl.myr
+++ /dev/null
@@ -1,104 +1,0 @@
-use std
-
-pkg math =
-	pkglocal const kahan_sum32 : (l : flt32[:] -> flt32)
-	pkglocal const priest_sum32 : (l : flt32[:] -> flt32)
-
-	pkglocal const kahan_sum64: (l : flt64[:] -> flt64)
-	pkglocal const priest_sum64 : (l : flt64[:] -> flt64)
-;;
-
-type doomed_flt32_arr = flt32[:]
-type doomed_flt64_arr = flt64[:]
-
-impl disposable doomed_flt32_arr =
-	__dispose__ = {a : doomed_flt32_arr; std.slfree((a : flt32[:])) }
-;;
-
-impl disposable doomed_flt64_arr =
-	__dispose__ = {a : doomed_flt64_arr; std.slfree((a : flt64[:])) }
-;;
-
-/*
-   Kahan's compensated summation. Fast and reasonably accurate,
-   although cancellation can cause relative error blowup. For
-   something slower, but more accurate, use something like Priest's
-   doubly compensated sums.
- */
-pkglocal const kahan_sum32 = {l; -> kahan_sum_gen(l, (0.0 : flt32))}
-pkglocal const kahan_sum64 = {l; -> kahan_sum_gen(l, (0.0 : flt64))}
-
-generic kahan_sum_gen = {l : @f[:], zero : @f :: numeric,floating @f
-	if l.len == 0
-		-> zero
-	;;
-
-	var s = zero
-	var c = zero
-	var y = zero
-	var t = zero
-
-	for x : l
-		y = x - c
-		t = s + y
-		c = (t - s) - y
-		s = t
-	;;
-
-	-> s
-}
-
-/*
-   Priest's doubly compensated summation. Extremely accurate, but
-   relatively slow. For situations in which cancellation is not
-   expected, something like Kahan's compensated summation may be
-   more useful.
- */
-pkglocal const priest_sum32 = {l : flt32[:]
-	var l2 = std.sldup(l)
-	std.sort(l2, mag_cmp32)
-	auto (l2 : doomed_flt32_arr)
-	-> priest_sum_gen(l2, (0.0 : flt32))
-}
-
-const mag_cmp32 = {f : flt32, g : flt32
-	var u = std.flt32bits(f) & ~(1 << 31)
-	var v = std.flt32bits(g) & ~(1 << 31)
-	-> std.numcmp(v, u)
-}
-
-pkglocal const priest_sum64 = {l : flt64[:]
-	var l2 = std.sldup(l)
-	std.sort(l, mag_cmp64)
-	auto (l2 : doomed_flt64_arr)
-	-> priest_sum_gen(l2, (0.0 : flt64))
-}
-
-const mag_cmp64 = {f : flt64, g : flt64
-	var u = std.flt64bits(f) & ~(1 << 63)
-	var v = std.flt64bits(g) & ~(1 << 63)
-	-> std.numcmp(v, u)
-}
-
-generic priest_sum_gen = {l : @f[:], zero : @f :: numeric,floating @f
-	/* l should be sorted in descending order */
-	if l.len == 0
-		-> zero
-	;;
-
-	var s = zero
-	var c = zero
-
-	for x : l
-		var y = c + x
-		var u = x - (y - c)
-		var t = (y + s)
-		var v = (y - (t - s))
-		var z = u + v
-		s = t + z
-		c = z - (s - t)
-	;;
-
-	-> s
-}
-
--- a/lib/math/fpmath-trunc-impl+posixy-x64-sse4.s
+++ /dev/null
@@ -1,41 +1,0 @@
-.globl math$trunc32
-.globl math$_trunc32
-math$trunc32:
-math$_trunc32:
-	roundss $0x03, %xmm0, %xmm0
-	ret
-
-.globl math$floor32
-.globl math$_floor32
-math$floor32:
-math$_floor32:
-	roundss $0x01, %xmm0, %xmm0
-	ret
-
-.globl math$ceil32
-.globl math$_ceil32
-math$ceil32:
-math$_ceil32:
-	roundss $0x02, %xmm0, %xmm0
-	ret
-
-.globl math$trunc64
-.globl math$_trunc64
-math$trunc64:
-math$_trunc64:
-	roundsd $0x03, %xmm0, %xmm0
-	ret
-
-.globl math$floor64
-.globl math$_floor64
-math$floor64:
-math$_floor64:
-	roundsd $0x01, %xmm0, %xmm0
-	ret
-
-.globl math$ceil64
-.globl math$_ceil64
-math$ceil64:
-math$_ceil64:
-	roundsd $0x02, %xmm0, %xmm0
-	ret
--- a/lib/math/fpmath-trunc-impl.myr
+++ /dev/null
@@ -1,103 +1,0 @@
-use std
-
-pkg math =
-	pkglocal const trunc32 : (f : flt32 -> flt32)
-	pkglocal const floor32 : (f : flt32 -> flt32)
-	pkglocal const ceil32  : (f : flt32 -> flt32)
-	pkglocal const trunc64 : (f : flt64 -> flt64)
-	pkglocal const floor64 : (f : flt64 -> flt64)
-	pkglocal const ceil64  : (f : flt64 -> flt64)
-;;
-
-const Flt32NegMask : uint32 = (1 << 31)
-const Flt32SigMask : uint32 = (1 << 23) - 1
-
-const Flt64NegMask : uint64 = (1 << 63)
-const Flt64SigMask : uint64 = (1 << 52) - 1
-
-pkglocal const floor32 = {f : flt32
-	var n, e, s
-	(n, e, s) = std.flt32explode(f)
-
-	/* Many special cases */
-	if e >= 23 || f == -0.0
-		-> f
-	elif e < 0
-		if n
-			-> -1.0
-		else
-			-> 0.0
-		;;
-	;;
-
-	if n
-		var fractional_mask = Flt32SigMask >> (e : uint32)
-		if s & fractional_mask == 0
-			-> f
-		else
-			/* Turns out the packing of exp and sig is useful */
-			var u : uint32 = std.flt32bits(f) & ~fractional_mask
-			u += ((1 << 23) >> (e : uint32))
-			-> std.flt32frombits(u)
-		;;
-	;;
-
-	var m : uint32 = (Flt32SigMask >> (e : uint32))
-	-> std.flt32assem(n, e, s & ~m)
-}
-
-pkglocal const trunc32 = {f : flt32
-	if std.flt32bits(f) & Flt32NegMask != 0
-		-> -floor32(-f)
-	else
-		-> floor32(f)
-	;;
-}
-
-pkglocal const ceil32 = {f : flt32
-	-> -floor32(-f)
-}
-
-pkglocal const floor64 = {f : flt64
-	var n, e, s
-	(n, e, s) = std.flt64explode(f)
-
-	/* Many special cases */
-	if e >= 52 || f == -0.0
-		-> f
-	elif e < 0
-		if n
-			-> -1.0
-		else
-			-> 0.0
-		;;
-	;;
-
-	if n
-		var fractional_mask = Flt64SigMask >> (e : uint64)
-		if s & fractional_mask == 0
-			-> f
-		else
-			/* Turns out the packing of exp and sig is useful */
-			var u : uint64 = std.flt64bits(f) & ~fractional_mask
-			u += ((1 << 52) >> (e : uint64))
-			-> std.flt64frombits(u)
-		;;
-	;;
-
-	var m : uint64 = (Flt64SigMask >> (e : uint64))
-	-> std.flt64assem(n, e, s & ~m)
-}
-
-pkglocal const trunc64 = {f : flt64
-	if std.flt64bits(f) & Flt64NegMask != 0
-		-> -floor64(-f)
-	else
-		-> floor64(f)
-	;;
-}
-
-pkglocal const ceil64 = {f : flt64
-	-> -floor64(-f)
-}
-
--- /dev/null
+++ b/lib/math/sum-impl.myr
@@ -1,0 +1,104 @@
+use std
+
+pkg math =
+	pkglocal const kahan_sum32 : (l : flt32[:] -> flt32)
+	pkglocal const priest_sum32 : (l : flt32[:] -> flt32)
+
+	pkglocal const kahan_sum64: (l : flt64[:] -> flt64)
+	pkglocal const priest_sum64 : (l : flt64[:] -> flt64)
+;;
+
+type doomed_flt32_arr = flt32[:]
+type doomed_flt64_arr = flt64[:]
+
+impl disposable doomed_flt32_arr =
+	__dispose__ = {a : doomed_flt32_arr; std.slfree((a : flt32[:])) }
+;;
+
+impl disposable doomed_flt64_arr =
+	__dispose__ = {a : doomed_flt64_arr; std.slfree((a : flt64[:])) }
+;;
+
+/*
+   Kahan's compensated summation. Fast and reasonably accurate,
+   although cancellation can cause relative error blowup. For
+   something slower, but more accurate, use something like Priest's
+   doubly compensated sums.
+ */
+pkglocal const kahan_sum32 = {l; -> kahan_sum_gen(l, (0.0 : flt32))}
+pkglocal const kahan_sum64 = {l; -> kahan_sum_gen(l, (0.0 : flt64))}
+
+generic kahan_sum_gen = {l : @f[:], zero : @f :: numeric,floating @f
+	if l.len == 0
+		-> zero
+	;;
+
+	var s = zero
+	var c = zero
+	var y = zero
+	var t = zero
+
+	for x : l
+		y = x - c
+		t = s + y
+		c = (t - s) - y
+		s = t
+	;;
+
+	-> s
+}
+
+/*
+   Priest's doubly compensated summation. Extremely accurate, but
+   relatively slow. For situations in which cancellation is not
+   expected, something like Kahan's compensated summation may be
+   more useful.
+ */
+pkglocal const priest_sum32 = {l : flt32[:]
+	var l2 = std.sldup(l)
+	std.sort(l2, mag_cmp32)
+	auto (l2 : doomed_flt32_arr)
+	-> priest_sum_gen(l2, (0.0 : flt32))
+}
+
+const mag_cmp32 = {f : flt32, g : flt32
+	var u = std.flt32bits(f) & ~(1 << 31)
+	var v = std.flt32bits(g) & ~(1 << 31)
+	-> std.numcmp(v, u)
+}
+
+pkglocal const priest_sum64 = {l : flt64[:]
+	var l2 = std.sldup(l)
+	std.sort(l2, mag_cmp64)
+	auto (l2 : doomed_flt64_arr)
+	-> priest_sum_gen(l2, (0.0 : flt64))
+}
+
+const mag_cmp64 = {f : flt64, g : flt64
+	var u = std.flt64bits(f) & ~(1 << 63)
+	var v = std.flt64bits(g) & ~(1 << 63)
+	-> std.numcmp(v, u)
+}
+
+generic priest_sum_gen = {l : @f[:], zero : @f :: numeric,floating @f
+	/* l should be sorted in descending order */
+	if l.len == 0
+		-> zero
+	;;
+
+	var s = zero
+	var c = zero
+
+	for x : l
+		var y = c + x
+		var u = x - (y - c)
+		var t = (y + s)
+		var v = (y - (t - s))
+		var z = u + v
+		s = t + z
+		c = z - (s - t)
+	;;
+
+	-> s
+}
+
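
Kahan's compensation keeps a running correction c holding the low-order bits that the previous addition dropped. A C sketch of the same loop on the flt32 vector used by the tests later in this patch (the exact sum is 2.0; a naive float loop drifts to 1.0, and the test below expects 3.0 from kahan_sum):

#include <stdio.h>

int main(void)
{
	float l[] = { 33554432.0f, 33554430.0f, -16777215.0f,
	              -16777215.0f, -16777215.0f, -16777215.0f };
	float naive = 0.0f, s = 0.0f, c = 0.0f;

	for (int i = 0; i < 6; i++) {
		naive += l[i];
		float y = l[i] - c;	/* fold in the error from the previous step */
		float t = s + y;
		c = (t - s) - y;	/* rounding error of this addition */
		s = t;
	}
	printf("naive %g, kahan %g\n", naive, s);
	return 0;
}
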
--- /dev/null
+++ b/lib/math/test/fma-impl.myr
@@ -1,0 +1,97 @@
+use std
+use math
+use testr
+
+const main = {
+	testr.run([
+		[.name="fma-01", .fn = fma01],
+		[.name="fma-02", .fn = fma02],
+	][:])
+}
+
+const fma01 = {c
+	var inputs : (uint32, uint32, uint32, uint32)[:] = [
+		/*
+		   These are mostly obtained by running fpmath-consensus
+		   with seed 1234. Each (mostly) covers a different
+		   corner case.
+		 */
+		(0x000009a4, 0x00000000, 0x00000002, 0x00000002),
+		(0x69000000, 0x90008002, 0x68348026, 0x68348026),
+		(0x334802ab, 0x49113e8d, 0x90aea62e, 0x3ce2f4c3),
+		(0x5c35d8c1, 0x12dcb6e2, 0x6c1a8cc2, 0x6c1a8cc2),
+		(0xf6266d83, 0x2b3e04e8, 0x62f99bda, 0x62bbd79e),
+		(0x7278e907, 0x75f6c0f1, 0xf6f9b8e0, 0x7f800000),
+		(0xd7748eeb, 0x6737b23e, 0x68e3bbc7, 0xff2f7c71),
+		(0x7f373de4, 0x3dcf90f0, 0xd22ac17c, 0x7d9492ca),
+		(0xb50fce04, 0x00cd486d, 0x03800000, 0x03800000),
+		(0xbb600000, 0x43b7161a, 0x8684d442, 0xbfa03357),
+		(0xf26f8a00, 0x4bfac000, 0xc74ba9fc, 0xfeeaa06c),
+		(0x55d1fa60, 0x32f20000, 0x1b1fea3d, 0x49467eaf),
+		(0x29e26c00, 0x62352000, 0xa0e845af, 0x4ca032a9),
+		(0x287650f8, 0x7cd00000, 0x94e85d5e, 0x65c821c9),
+		(0x7689f580, 0x91418000, 0xaa2822ae, 0xc8508e21),
+		(0xbd813cc0, 0x421f0000, 0x9f098e17, 0xc0208977),
+		(0x3745461a, 0x4db9b736, 0xb6d7deff, 0x458f1cd8),
+		(0xa3ccfd37, 0x7f800000, 0xed328e70, 0xff800000),
+		(0xa3790205, 0x5033a3e6, 0xa001fd11, 0xb42ebbd5),
+		(0xa4932927, 0xc565bc34, 0x316887af, 0x31688bcf),
+		(0x83dd6ede, 0x31ddf8e6, 0x01fea4c8, 0x01fea4c7),
+		(0xa4988128, 0x099a41ad, 0x00800000, 0x00800000),
+		(0x1e0479cd, 0x91d5fcb4, 0x00800000, 0x00800000),
+		(0x2f413021, 0x0a3f5a4e, 0x80800483, 0x80800000),
+		(0x144dcd10, 0x12f4aba0, 0x80800000, 0x80800000),
+		(0x0d580b86, 0x435768a8, 0x966c8d6f, 0x966c5ffd),
+		(0xa19e9a6f, 0xb49af3e3, 0xa2468b59, 0xa2468b57),
+		(0xd119e996, 0x8e5ad0e3, 0x247e0028, 0x247e83b7),
+	][:]
+
+	for (x, y, z, r) : inputs
+		var xf : flt32 = std.flt32frombits(x)
+		var yf : flt32 = std.flt32frombits(y)
+		var zf : flt32 = std.flt32frombits(z)
+		var rf = math.fma(xf, yf, zf)
+		testr.check(c, rf == std.flt32frombits(r),
+			"0x{b=16,w=8,p=0} * 0x{b=16,w=8,p=0} + 0x{b=16,w=8,p=0} should be 0x{b=16,w=8,p=0}, was 0x{b=16,w=8,p=0}",
+			x, y, z, r, std.flt32bits(rf))
+	;;
+}
+
+const fma02 = {c
+	var inputs : (uint64, uint64, uint64, uint64)[:] = [
+		/*
+		   These are mostly obtained by running fpmath-consensus
+		   with seed 1234. Each (mostly) covers a different
+		   corner case.
+		 */
+		(0x0000000000000000, 0x0000000000000000, 0x0100000000000000, 0x0100000000000000),
+		(0x0000000000000000, 0x0000000000000000, 0x0200000000000000, 0x0200000000000000),
+		(0x00000000000009a4, 0x6900000000000002, 0x6834802690008002, 0x6834802690008002),
+		(0x49113e8d334802ab, 0x5c35d8c190aea62e, 0x6c1a8cc212dcb6e2, 0x6c1a8cc212dcb6e2),
+		(0x2b3e04e8f6266d83, 0xae84e20f62f99bda, 0xc9115a1ccea6ce27, 0xc9115a1ccea6ce27),
+		(0xa03ea9e9b09d932c, 0xded7bc19edcbf0c7, 0xbbc4c1f83b3f8f2e, 0x3f26be5f0c7b48e3),
+		(0xa5ec2141c1e6f339, 0xa2d80fc217f57b61, 0x00b3484b473ef1b8, 0x08d526cb86ee748d),
+		(0xccc6600ee88bb67c, 0xc692eeec9b51cf0f, 0xbf5f1ae3486401b0, 0x536a7a30857129db),
+		(0x5f9b9e449db17602, 0xbef22ae5b6a2b1c5, 0x6133e925e6bf8a12, 0x6133e925e6bf823b),
+		(0x7f851249841b6278, 0x3773388e53a375f4, 0x761c27fc2ffa57be, 0x7709506b0e99dc30),
+		(0x7c7cb20f3ca8af93, 0x800fd7f5cfd5baae, 0x14e4c09c9bb1e17e, 0xbc9c6a3fd0e58682),
+		(0xb5e8db2107f4463f, 0x614af740c0d7eb3b, 0xd7e3d25c4daa81e0, 0xd7e3d798d3ccdffb),
+		(0xae62c8be4cb45168, 0x90cc5236f3516c90, 0x0007f8b14f684558, 0x0007f9364eb1a815),
+		(0x5809f53e32a7e1ba, 0xcc647611ccaa5bf4, 0xdfbdb5c345ce7a56, 0xe480990da5526103),
+		(0xbb889d7f826438e1, 0x03bdaff82129696d, 0x000000dacab276ae, 0x8000009296c962f8),
+		(0x003d95525e2b057a, 0xbef738ea5717d89a, 0x800000089763d88c, 0x800000b456ed1a9c),
+		(0x0be868cb5a7180c8, 0x3357a30707ed947c, 0x80000050d6b86ac6, 0x000000cfa41cb229),
+		(0xbe535f4f8a7498af, 0x00d24adee12217b8, 0x0000005729e93fb0, 0x800000016d975af3),
+		(0x39d1968eb883f088, 0x856f286e3b268f0e, 0x800000d7cdd0ed70, 0x800001e9cf01a0ae),
+	][:]
+
+	for (x, y, z, r) : inputs
+		var xf : flt64 = std.flt64frombits(x)
+		var yf : flt64 = std.flt64frombits(y)
+		var zf : flt64 = std.flt64frombits(z)
+		var rf = math.fma(xf, yf, zf)
+		testr.check(c, rf == std.flt64frombits(r),
+			"0x{b=16,w=16,p=0} * 0x{b=16,w=16,p=0} + 0x{b=16,w=16,p=0} should be 0x{b=16,w=16,p=0}, was 0x{b=16,w=16,p=0}",
+			x, y, z, r, std.flt64bits(rf))
+	;;
+}
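
These vectors can also be spot-checked against a host libm, since fma/fmaf round once by specification. A C sketch checking the third flt32 vector above (illustrative; assumes IEEE-754 binary32 floats and a correctly rounded fmaf):

#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float f32frombits(uint32_t b) { float f; memcpy(&f, &b, sizeof f); return f; }
static uint32_t f32bits(float f) { uint32_t b; memcpy(&b, &f, sizeof b); return b; }

int main(void)
{
	float x = f32frombits(0x334802ab);
	float y = f32frombits(0x49113e8d);
	float z = f32frombits(0x90aea62e);
	printf("got 0x%08" PRIx32 ", want 0x3ce2f4c3\n", f32bits(fmaf(x, y, z)));
	return 0;
}
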
--- a/lib/math/test/fpmath-fma-impl.myr
+++ /dev/null
@@ -1,97 +1,0 @@
-use std
-use math
-use testr
-
-const main = {
-	testr.run([
-		[.name="fma-01", .fn = fma01],
-		[.name="fma-02", .fn = fma02],
-	][:])
-}
-
-const fma01 = {c
-	var inputs : (uint32, uint32, uint32, uint32)[:] = [
-		/*
-		   These are mostly obtained by running fpmath-consensus
-		   with seed 1234. Each (mostly) covers a different
-		   corner case.
-		 */
-		(0x000009a4, 0x00000000, 0x00000002, 0x00000002),
-		(0x69000000, 0x90008002, 0x68348026, 0x68348026),
-		(0x334802ab, 0x49113e8d, 0x90aea62e, 0x3ce2f4c3),
-		(0x5c35d8c1, 0x12dcb6e2, 0x6c1a8cc2, 0x6c1a8cc2),
-		(0xf6266d83, 0x2b3e04e8, 0x62f99bda, 0x62bbd79e),
-		(0x7278e907, 0x75f6c0f1, 0xf6f9b8e0, 0x7f800000),
-		(0xd7748eeb, 0x6737b23e, 0x68e3bbc7, 0xff2f7c71),
-		(0x7f373de4, 0x3dcf90f0, 0xd22ac17c, 0x7d9492ca),
-		(0xb50fce04, 0x00cd486d, 0x03800000, 0x03800000),
-		(0xbb600000, 0x43b7161a, 0x8684d442, 0xbfa03357),
-		(0xf26f8a00, 0x4bfac000, 0xc74ba9fc, 0xfeeaa06c),
-		(0x55d1fa60, 0x32f20000, 0x1b1fea3d, 0x49467eaf),
-		(0x29e26c00, 0x62352000, 0xa0e845af, 0x4ca032a9),
-		(0x287650f8, 0x7cd00000, 0x94e85d5e, 0x65c821c9),
-		(0x7689f580, 0x91418000, 0xaa2822ae, 0xc8508e21),
-		(0xbd813cc0, 0x421f0000, 0x9f098e17, 0xc0208977),
-		(0x3745461a, 0x4db9b736, 0xb6d7deff, 0x458f1cd8),
-		(0xa3ccfd37, 0x7f800000, 0xed328e70, 0xff800000),
-		(0xa3790205, 0x5033a3e6, 0xa001fd11, 0xb42ebbd5),
-		(0xa4932927, 0xc565bc34, 0x316887af, 0x31688bcf),
-		(0x83dd6ede, 0x31ddf8e6, 0x01fea4c8, 0x01fea4c7),
-		(0xa4988128, 0x099a41ad, 0x00800000, 0x00800000),
-		(0x1e0479cd, 0x91d5fcb4, 0x00800000, 0x00800000),
-		(0x2f413021, 0x0a3f5a4e, 0x80800483, 0x80800000),
-		(0x144dcd10, 0x12f4aba0, 0x80800000, 0x80800000),
-		(0x0d580b86, 0x435768a8, 0x966c8d6f, 0x966c5ffd),
-		(0xa19e9a6f, 0xb49af3e3, 0xa2468b59, 0xa2468b57),
-		(0xd119e996, 0x8e5ad0e3, 0x247e0028, 0x247e83b7),
-	][:]
-
-	for (x, y, z, r) : inputs
-		var xf : flt32 = std.flt32frombits(x)
-		var yf : flt32 = std.flt32frombits(y)
-		var zf : flt32 = std.flt32frombits(z)
-		var rf = math.fma(xf, yf, zf)
-		testr.check(c, rf == std.flt32frombits(r),
-			"0x{b=16,w=8,p=0} * 0x{b=16,w=8,p=0} + 0x{b=16,w=8,p=0} should be 0x{b=16,w=8,p=0}, was 0x{b=16,w=8,p=0}",
-			x, y, z, r, std.flt32bits(rf))
-	;;
-}
-
-const fma02 = {c
-	var inputs : (uint64, uint64, uint64, uint64)[:] = [
-		/*
-		   These are mostly obtained by running fpmath-consensus
-		   with seed 1234. Each (mostly) covers a different
-		   corner case.
-		 */
-		(0x0000000000000000, 0x0000000000000000, 0x0100000000000000, 0x0100000000000000),
-		(0x0000000000000000, 0x0000000000000000, 0x0200000000000000, 0x0200000000000000),
-		(0x00000000000009a4, 0x6900000000000002, 0x6834802690008002, 0x6834802690008002),
-		(0x49113e8d334802ab, 0x5c35d8c190aea62e, 0x6c1a8cc212dcb6e2, 0x6c1a8cc212dcb6e2),
-		(0x2b3e04e8f6266d83, 0xae84e20f62f99bda, 0xc9115a1ccea6ce27, 0xc9115a1ccea6ce27),
-		(0xa03ea9e9b09d932c, 0xded7bc19edcbf0c7, 0xbbc4c1f83b3f8f2e, 0x3f26be5f0c7b48e3),
-		(0xa5ec2141c1e6f339, 0xa2d80fc217f57b61, 0x00b3484b473ef1b8, 0x08d526cb86ee748d),
-		(0xccc6600ee88bb67c, 0xc692eeec9b51cf0f, 0xbf5f1ae3486401b0, 0x536a7a30857129db),
-		(0x5f9b9e449db17602, 0xbef22ae5b6a2b1c5, 0x6133e925e6bf8a12, 0x6133e925e6bf823b),
-		(0x7f851249841b6278, 0x3773388e53a375f4, 0x761c27fc2ffa57be, 0x7709506b0e99dc30),
-		(0x7c7cb20f3ca8af93, 0x800fd7f5cfd5baae, 0x14e4c09c9bb1e17e, 0xbc9c6a3fd0e58682),
-		(0xb5e8db2107f4463f, 0x614af740c0d7eb3b, 0xd7e3d25c4daa81e0, 0xd7e3d798d3ccdffb),
-		(0xae62c8be4cb45168, 0x90cc5236f3516c90, 0x0007f8b14f684558, 0x0007f9364eb1a815),
-		(0x5809f53e32a7e1ba, 0xcc647611ccaa5bf4, 0xdfbdb5c345ce7a56, 0xe480990da5526103),
-		(0xbb889d7f826438e1, 0x03bdaff82129696d, 0x000000dacab276ae, 0x8000009296c962f8),
-		(0x003d95525e2b057a, 0xbef738ea5717d89a, 0x800000089763d88c, 0x800000b456ed1a9c),
-		(0x0be868cb5a7180c8, 0x3357a30707ed947c, 0x80000050d6b86ac6, 0x000000cfa41cb229),
-		(0xbe535f4f8a7498af, 0x00d24adee12217b8, 0x0000005729e93fb0, 0x800000016d975af3),
-		(0x39d1968eb883f088, 0x856f286e3b268f0e, 0x800000d7cdd0ed70, 0x800001e9cf01a0ae),
-	][:]
-
-	for (x, y, z, r) : inputs
-		var xf : flt64 = std.flt64frombits(x)
-		var yf : flt64 = std.flt64frombits(y)
-		var zf : flt64 = std.flt64frombits(z)
-		var rf = math.fma(xf, yf, zf)
-		testr.check(c, rf == std.flt64frombits(r),
-			"0x{b=16,w=16,p=0} * 0x{b=16,w=16,p=0} + 0x{b=16,w=16,p=0} should be 0x{b=16,w=16,p=0}, was 0x{b=16,w=16,p=0}",
-			x, y, z, r, std.flt64bits(rf))
-	;;
-}
--- a/lib/math/test/fpmath-sum-impl.myr
+++ /dev/null
@@ -1,36 +1,0 @@
-use std
-use math
-use testr
-
-const main = {
-	testr.run([
-		[.name = "sums-kahan-01", .fn = sums_kahan_01],
-		[.name = "sums-priest-01", .fn = sums_priest_01],
-	][:])
-}
-
-const sums_kahan_01 = {c
-	var sums : (flt32[:], flt32)[:] = [
-		([1.0, 2.0, 3.0][:], 6.0),
-
-		/* Naive summing gives 1.0, actual answer is 2.0 */
-		([33554432.0, 33554430.0, -16777215.0, -16777215.0, -16777215.0, -16777215.0][:], 3.0)
-	][:]
-
-	for (a, r) : sums
-		var s = math.kahan_sum(a)
-		testr.eq(c, r, s)
-	;;
-}
-
-const sums_priest_01 = {c
-	var sums : (flt32[:], flt32)[:] = [
-		([1.0, 2.0, 3.0][:], 6.0),
-		([33554432.0, 33554430.0, -16777215.0, -16777215.0, -16777215.0, -16777215.0][:], 2.0)
-	][:]
-
-	for (a, r) : sums
-		var s = math.priest_sum(a)
-		testr.eq(c, r, s)
-	;;
-}
--- a/lib/math/test/fpmath-trunc-impl.myr
+++ /dev/null
@@ -1,124 +1,0 @@
-use std
-use math
-use testr
-
-const main = {
-	testr.run([
-		[.name = "trunc-01",    .fn = trunc01],
-		[.name = "trunc-02",    .fn = trunc02],
-		[.name = "floor-01",    .fn = floor01],
-		[.name = "floor-02",    .fn = floor02],
-		[.name = "ceil-01",     .fn = ceil01],
-		[.name = "ceil-02",     .fn = ceil02],
-	][:])
-}
-
-const trunc01 = {c
-	var flt32s : (flt32, flt32)[:] = [
-		(123.4, 123.0),
-		(0.0, 0.0),
-		(-0.0, -0.0),
-		(1.0, 1.0),
-		(1.1, 1.0),
-		(0.9, 0.0),
-		(10664524000000000000.0, 10664524000000000000.0),
-		(-3.5, -3.0),
-		(101.999, 101.0),
-		(std.flt32nan(), std.flt32nan()),
-	][:]
-
-	for (f, g) : flt32s
-		testr.eq(c, math.trunc(f), g)
-	;;
-}
-
-const trunc02 = {c
-	var flt64s : (flt64, flt64)[:] = [
-		(0.0, 0.0),
-		(-0.0, -0.0),
-		(1.0, 1.0),
-		(1.1, 1.0),
-		(0.9, 0.0),
-		(13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
-		(-3.5, -3.0),
-		(101.999, 101.0),
-		(std.flt64nan(), std.flt64nan()),
-	][:]
-
-	for (f, g) : flt64s
-		testr.eq(c, math.trunc(f), g)
-	;;
-}
-
-const floor01 = {c
-	var flt32s : (flt32, flt32)[:] = [
-		(0.0, 0.0),
-		(-0.0, -0.0),
-		(0.5, 0.0),
-		(1.1, 1.0),
-		(10664524000000000000.0, 10664524000000000000.0),
-		(-3.5, -4.0),
-		(-101.999, -102.0),
-		(-126.999, -127.0),
-		(-127.999, -128.0),
-		(-128.999, -129.0),
-		(std.flt32nan(), std.flt32nan()),
-	][:]
-
-	for (f, g) : flt32s
-		testr.eq(c, math.floor(f), g)
-	;;
-}
-
-const floor02 = {c
-	var flt64s : (flt64, flt64)[:] = [
-		(0.0, 0.0),
-		(-0.0, -0.0),
-		(0.5, 0.0),
-		(1.1, 1.0),
-		(13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
-		(-3.5, -4.0),
-		(-101.999, -102.0),
-		(std.flt64nan(), std.flt64nan()),
-	][:]
-
-	for (f, g) : flt64s
-		testr.eq(c, math.floor(f), g)
-	;;
-}
-
-const ceil01 = {c
-	var flt32s : (flt32, flt32)[:] = [
-		(0.0, 0.0),
-		(-0.0, -0.0),
-		(0.5, 1.0),
-		(-0.1, -0.0),
-		(1.1, 2.0),
-		(10664524000000000000.0, 10664524000000000000.0),
-		(-3.5, -3.0),
-		(-101.999, -101.0),
-		(std.flt32nan(), std.flt32nan()),
-	][:]
-
-	for (f, g) : flt32s
-		testr.eq(c, math.ceil(f), g)
-	;;
-}
-
-const ceil02 = {c
-	var flt64s : (flt64, flt64)[:] = [
-		(0.0, 0.0),
-		(-0.0, -0.0),
-		(0.5, 1.0),
-		(-0.1, -0.0),
-		(1.1, 2.0),
-		(13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
-		(-3.5, -3.0),
-		(-101.999, -101.0),
-		(std.flt64nan(), std.flt64nan()),
-	][:]
-
-	for (f, g) : flt64s
-		testr.eq(c, math.ceil(f), g)
-	;;
-}
--- /dev/null
+++ b/lib/math/test/sum-impl.myr
@@ -1,0 +1,36 @@
+use std
+use math
+use testr
+
+const main = {
+	testr.run([
+		[.name = "sums-kahan-01", .fn = sums_kahan_01],
+		[.name = "sums-priest-01", .fn = sums_priest_01],
+	][:])
+}
+
+const sums_kahan_01 = {c
+	var sums : (flt32[:], flt32)[:] = [
+		([1.0, 2.0, 3.0][:], 6.0),
+
+		/* Naive summing gives 1.0, actual answer is 2.0 */
+		([33554432.0, 33554430.0, -16777215.0, -16777215.0, -16777215.0, -16777215.0][:], 3.0)
+	][:]
+
+	for (a, r) : sums
+		var s = math.kahan_sum(a)
+		testr.eq(c, r, s)
+	;;
+}
+
+const sums_priest_01 = {c
+	var sums : (flt32[:], flt32)[:] = [
+		([1.0, 2.0, 3.0][:], 6.0),
+		([33554432.0, 33554430.0, -16777215.0, -16777215.0, -16777215.0, -16777215.0][:], 2.0)
+	][:]
+
+	for (a, r) : sums
+		var s = math.priest_sum(a)
+		testr.eq(c, r, s)
+	;;
+}
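
For comparison, a C sketch of the doubly compensated (Priest-style) loop from sum-impl.myr, run on the same vector after sorting by descending magnitude; the test above expects it to recover the exact sum, 2.0 (illustrative only):

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

static int magcmp(const void *a, const void *b)
{
	float x = fabsf(*(const float *)a), y = fabsf(*(const float *)b);
	return (y > x) - (y < x);	/* descending magnitude */
}

int main(void)
{
	float l[] = { 33554432.0f, 33554430.0f, -16777215.0f,
	              -16777215.0f, -16777215.0f, -16777215.0f };
	qsort(l, 6, sizeof(l[0]), magcmp);

	float s = 0.0f, c = 0.0f;
	for (int i = 0; i < 6; i++) {
		float y = c + l[i];
		float u = l[i] - (y - c);
		float t = y + s;
		float v = y - (t - s);
		float z = u + v;
		s = t + z;
		c = z - (s - t);
	}
	printf("%g\n", s);	/* 2 */
	return 0;
}
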
--- /dev/null
+++ b/lib/math/test/trunc-impl.myr
@@ -1,0 +1,124 @@
+use std
+use math
+use testr
+
+const main = {
+	testr.run([
+		[.name = "trunc-01",    .fn = trunc01],
+		[.name = "trunc-02",    .fn = trunc02],
+		[.name = "floor-01",    .fn = floor01],
+		[.name = "floor-02",    .fn = floor02],
+		[.name = "ceil-01",     .fn = ceil01],
+		[.name = "ceil-02",     .fn = ceil02],
+	][:])
+}
+
+const trunc01 = {c
+	var flt32s : (flt32, flt32)[:] = [
+		(123.4, 123.0),
+		(0.0, 0.0),
+		(-0.0, -0.0),
+		(1.0, 1.0),
+		(1.1, 1.0),
+		(0.9, 0.0),
+		(10664524000000000000.0, 10664524000000000000.0),
+		(-3.5, -3.0),
+		(101.999, 101.0),
+		(std.flt32nan(), std.flt32nan()),
+	][:]
+
+	for (f, g) : flt32s
+		testr.eq(c, math.trunc(f), g)
+	;;
+}
+
+const trunc02 = {c
+	var flt64s : (flt64, flt64)[:] = [
+		(0.0, 0.0),
+		(-0.0, -0.0),
+		(1.0, 1.0),
+		(1.1, 1.0),
+		(0.9, 0.0),
+		(13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+		(-3.5, -3.0),
+		(101.999, 101.0),
+		(std.flt64nan(), std.flt64nan()),
+	][:]
+
+	for (f, g) : flt64s
+		testr.eq(c, math.trunc(f), g)
+	;;
+}
+
+const floor01 = {c
+	var flt32s : (flt32, flt32)[:] = [
+		(0.0, 0.0),
+		(-0.0, -0.0),
+		(0.5, 0.0),
+		(1.1, 1.0),
+		(10664524000000000000.0, 10664524000000000000.0),
+		(-3.5, -4.0),
+		(-101.999, -102.0),
+		(-126.999, -127.0),
+		(-127.999, -128.0),
+		(-128.999, -129.0),
+		(std.flt32nan(), std.flt32nan()),
+	][:]
+
+	for (f, g) : flt32s
+		testr.eq(c, math.floor(f), g)
+	;;
+}
+
+const floor02 = {c
+	var flt64s : (flt64, flt64)[:] = [
+		(0.0, 0.0),
+		(-0.0, -0.0),
+		(0.5, 0.0),
+		(1.1, 1.0),
+		(13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+		(-3.5, -4.0),
+		(-101.999, -102.0),
+		(std.flt64nan(), std.flt64nan()),
+	][:]
+
+	for (f, g) : flt64s
+		testr.eq(c, math.floor(f), g)
+	;;
+}
+
+const ceil01 = {c
+	var flt32s : (flt32, flt32)[:] = [
+		(0.0, 0.0),
+		(-0.0, -0.0),
+		(0.5, 1.0),
+		(-0.1, -0.0),
+		(1.1, 2.0),
+		(10664524000000000000.0, 10664524000000000000.0),
+		(-3.5, -3.0),
+		(-101.999, -101.0),
+		(std.flt32nan(), std.flt32nan()),
+	][:]
+
+	for (f, g) : flt32s
+		testr.eq(c, math.ceil(f), g)
+	;;
+}
+
+const ceil02 = {c
+	var flt64s : (flt64, flt64)[:] = [
+		(0.0, 0.0),
+		(-0.0, -0.0),
+		(0.5, 1.0),
+		(-0.1, -0.0),
+		(1.1, 2.0),
+		(13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+		(-3.5, -3.0),
+		(-101.999, -101.0),
+		(std.flt64nan(), std.flt64nan()),
+	][:]
+
+	for (f, g) : flt64s
+		testr.eq(c, math.ceil(f), g)
+	;;
+}
--- /dev/null
+++ b/lib/math/trunc-impl+posixy-x64-sse4.s
@@ -1,0 +1,41 @@
+.globl math$trunc32
+.globl math$_trunc32
+math$trunc32:
+math$_trunc32:
+	roundss $0x03, %xmm0, %xmm0
+	ret
+
+.globl math$floor32
+.globl math$_floor32
+math$floor32:
+math$_floor32:
+	roundss $0x01, %xmm0, %xmm0
+	ret
+
+.globl math$ceil32
+.globl math$_ceil32
+math$ceil32:
+math$_ceil32:
+	roundss $0x02, %xmm0, %xmm0
+	ret
+
+.globl math$trunc64
+.globl math$_trunc64
+math$trunc64:
+math$_trunc64:
+	roundsd $0x03, %xmm0, %xmm0
+	ret
+
+.globl math$floor64
+.globl math$_floor64
+math$floor64:
+math$_floor64:
+	roundsd $0x01, %xmm0, %xmm0
+	ret
+
+.globl math$ceil64
+.globl math$_ceil64
+math$ceil64:
+math$_ceil64:
+	roundsd $0x02, %xmm0, %xmm0
+	ret
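
The roundss/roundsd immediates select the rounding mode directly: 0x01 rounds toward -inf (floor), 0x02 toward +inf (ceil), 0x03 toward zero (trunc). The SSE4.1 intrinsics expose the same immediates, which makes for an easy cross-check (illustrative C; assumes a compiler targeting SSE4.1, e.g. gcc -msse4.1):

#include <smmintrin.h>
#include <stdio.h>

int main(void)
{
	__m128d x = _mm_set_sd(-3.5);
	/* _MM_FROUND_FLOOR, _MM_FROUND_CEIL and _MM_FROUND_TRUNC are exactly
	   the 0x01, 0x02 and 0x03 immediates used above */
	printf("%g %g %g\n",
	    _mm_cvtsd_f64(_mm_round_pd(x, _MM_FROUND_FLOOR)),
	    _mm_cvtsd_f64(_mm_round_pd(x, _MM_FROUND_CEIL)),
	    _mm_cvtsd_f64(_mm_round_pd(x, _MM_FROUND_TRUNC)));
	/* -4 -3 -3, matching floor64(-3.5), ceil64(-3.5), trunc64(-3.5) */
	return 0;
}
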
--- /dev/null
+++ b/lib/math/trunc-impl.myr
@@ -1,0 +1,103 @@
+use std
+
+pkg math =
+	pkglocal const trunc32 : (f : flt32 -> flt32)
+	pkglocal const floor32 : (f : flt32 -> flt32)
+	pkglocal const ceil32  : (f : flt32 -> flt32)
+	pkglocal const trunc64 : (f : flt64 -> flt64)
+	pkglocal const floor64 : (f : flt64 -> flt64)
+	pkglocal const ceil64  : (f : flt64 -> flt64)
+;;
+
+const Flt32NegMask : uint32 = (1 << 31)
+const Flt32SigMask : uint32 = (1 << 23) - 1
+
+const Flt64NegMask : uint64 = (1 << 63)
+const Flt64SigMask : uint64 = (1 << 52) - 1
+
+pkglocal const floor32 = {f : flt32
+	var n, e, s
+	(n, e, s) = std.flt32explode(f)
+
+	/* Many special cases */
+	if e >= 23 || f == -0.0
+		-> f
+	elif e < 0
+		if n
+			-> -1.0
+		else
+			-> 0.0
+		;;
+	;;
+
+	if n
+		var fractional_mask = Flt32SigMask >> (e : uint32)
+		if s & fractional_mask == 0
+			-> f
+		else
+			/* Turns out the packing of exp and sig is useful */
+			var u : uint32 = std.flt32bits(f) & ~fractional_mask
+			u += ((1 << 23) >> (e : uint32))
+			-> std.flt32frombits(u)
+		;;
+	;;
+
+	var m : uint32 = (Flt32SigMask >> (e : uint32))
+	-> std.flt32assem(n, e, s & ~m)
+}
+
+pkglocal const trunc32 = {f : flt32
+	if std.flt32bits(f) & Flt32NegMask != 0
+		-> -floor32(-f)
+	else
+		-> floor32(f)
+	;;
+}
+
+pkglocal const ceil32 = {f : flt32
+	-> -floor32(-f)
+}
+
+pkglocal const floor64 = {f : flt64
+	var n, e, s
+	(n, e, s) = std.flt64explode(f)
+
+	/* Many special cases */
+	if e >= 52 || f == -0.0
+		-> f
+	elif e < 0
+		if n
+			-> -1.0
+		else
+			-> 0.0
+		;;
+	;;
+
+	if n
+		var fractional_mask = Flt64SigMask >> (e : uint64)
+		if s & fractional_mask == 0
+			-> f
+		else
+			/* Turns out the packing of exp and sig is useful */
+			var u : uint64 = std.flt64bits(f) & ~fractional_mask
+			u += ((1 << 52) >> (e : uint64))
+			-> std.flt64frombits(u)
+		;;
+	;;
+
+	var m : uint64 = (Flt64SigMask >> (e : uint64))
+	-> std.flt64assem(n, e, s & ~m)
+}
+
+pkglocal const trunc64 = {f : flt64
+	if std.flt64bits(f) & Flt64NegMask != 0
+		-> -floor64(-f)
+	else
+		-> floor64(f)
+	;;
+}
+
+pkglocal const ceil64 = {f : flt64
+	-> -floor64(-f)
+}
+
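
The "packing of exp and sig is useful" trick in floor32/floor64 above works because, for a negative input with a nonzero fraction, clearing the fractional significand bits truncates toward zero, and then adding one unit in the last integer place to the packed bits steps down to the floor, carrying into the exponent field if the significand overflows. A C sketch of the flt32 case for exponents 0 <= e < 23 (illustrative; assumes IEEE-754 binary32; the other ranges are handled separately in the code above):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float floor32_sketch(float f)
{
	uint32_t u;
	memcpy(&u, &f, sizeof u);
	int e = (int)((u >> 23) & 0xff) - 127;	/* unbiased exponent */
	if (e < 0 || e >= 23)
		return f;	/* handled by the special cases in the real code */
	uint32_t frac = (uint32_t)0x7fffff >> e;	/* fractional significand bits */
	if ((u & frac) == 0)
		return f;	/* already an integer */
	u &= ~frac;	/* truncate toward zero */
	if (u >> 31)
		u += (uint32_t)(1 << 23) >> e;	/* negative: step one integer down */
	memcpy(&f, &u, sizeof f);
	return f;
}

int main(void)
{
	printf("%g %g\n", floor32_sketch(-101.999f), floor32_sketch(101.999f));	/* -102 101 */
	return 0;
}
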