ref: 5de1231c0605526b22779dd827cc96461de40f77
dir: /lib/math/fma-impl.myr/
use std pkg math = pkglocal const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32) pkglocal const fma64 : (x : flt64, y : flt64, z : flt64 -> flt64) ;; const exp_mask32 : uint32 = 0xff << 23 const exp_mask64 : uint64 = 0x7ff << 52 pkglocal const fma32 = {x : flt32, y : flt32, z : flt32 var xn, yn (xn, _, _) = std.flt32explode(x) (yn, _, _) = std.flt32explode(y) var xd : flt64 = flt64fromflt32(x) var yd : flt64 = flt64fromflt32(y) var zd : flt64 = flt64fromflt32(z) var prod : flt64 = xd * yd var pn, pe, ps (pn, pe, ps) = std.flt64explode(prod) if pe == -1023 pe = -1022 ;; if pn != (xn != yn) /* In case of NaNs, sign might not have been preserved */ pn = (xn != yn) prod = std.flt64assem(pn, pe, ps) ;; var r : flt64 = prod + zd var rn, re, rs (rn, re, rs) = std.flt64explode(r) /* At this point, r is probably the correct answer. The only issue is that if truncating r to a flt32 causes rounding-to-even, and it was obtained by rounding in the first place, direction, then we'd be over-rounding. The only way this could possibly be a problem is if the product step rounded to the halfway point (a 100...0, with the 1 just outside truncation range). */ if re == 1024 || rs & 0x1fffffff != 0x10000000 -> flt32fromflt64(r) ;; /* At this point, there's definitely about to be a rounding error. To figure out what to do, compute prod + z with round-to-zero. If we get r again, then it's okay to round r upwards, because it hasn't been rounded away from zero yet and we allow ourselves one such rounding. */ var zn, ze, zs (zn, ze, zs) = std.flt64explode(zd) if ze == -1023 ze = -1022 ;; var rtn, rte, rts if pe >= ze && pn == zn (rtn, rte, rts) = (pn, pe, ps) rts += shr(zs, pe - ze) elif pe > ze || (pe == ze && ps > zs) (rtn, rte, rts) = (pn, pe, ps) rts -= shr(zs, pe - ze) if shr((-1 : uint64), 64 - std.min(64, (pe - ze))) & zs != 0 rts-- ;; elif pe < ze && pn == zn (rtn, rte, rts) = (zn, ze, zs) rts += shr(ps, ze - pe) else (rtn, rte, rts) = (zn, ze, zs) rts -= shr(ps, ze - pe) if shr((-1 : uint64), 64 - std.min(64, (ze - pe))) & ps != 0 rts-- ;; ;; if rts & (1 << 53) != 0 rts >>= 1 rte++ ;; if rn == rtn && rs == rts && re == rte rts++ if rts & (1 << 53) != 0 rts >>= 1 rte++ ;; ;; -> flt32fromflt64(std.flt64assem(rtn, rte, rts)) } const flt64fromflt32 = {f : flt32 var n, e, s (n, e, s) = std.flt32explode(f) var xs : uint64 = (s : uint64) var xe : int64 = (e : int64) if e == 128 -> std.flt64assem(n, 1024, xs) elif e == -127 /* All subnormals in single precision (except 0.0s) can be upgraded to double precision, since the exponent range is so much wider. */ var first1 = find_first1_64(xs, 23) if first1 < 0 -> std.flt64assem(n, -1023, 0) ;; xs = xs << (52 - (first1 : uint64)) xe = -126 - (23 - first1) -> std.flt64assem(n, xe, xs) ;; -> std.flt64assem(n, xe, xs << (52 - 23)) } const flt32fromflt64 = {f : flt64 var n : bool, e : int64, s : uint64 (n, e, s) = std.flt64explode(f) var ts : uint32 var te : int32 = (e : int32) if e >= 128 if e == 1023 && s != 0 /* NaN */ -> std.flt32assem(n, 128, 1) else /* infinity */ -> std.flt32assem(n, 128, 0) ;; ;; if e >= -127 /* normal */ ts = ((s >> (52 - 23)) : uint32) if need_round_away(0, s, 52 - 23) ts++ if ts & (1 << 24) != 0 ts >>= 1 te++ ;; ;; -> std.flt32assem(n, te, ts) ;; /* subnormal already, will have to go to 0 */ if e == -1023 -> std.flt32assem(n, -127, 0) ;; /* subnormal (at least, it will be) */ te = -127 var shift : int64 = (52 - 23) + (-126 - e) var ts1 = shr(s, shift) ts = (ts1 : uint32) if need_round_away(0, s, shift) ts++ ;; -> std.flt32assem(n, te, ts) } pkglocal const fma64 = {x : flt64, y : flt64, z : flt64 var xn : bool, yn : bool, zn : bool var xe : int64, ye : int64, ze : int64 var xs : uint64, ys : uint64, zs : uint64 var xb : uint64 = std.flt64bits(x) var yb : uint64 = std.flt64bits(y) var zb : uint64 = std.flt64bits(z) /* check for both NaNs and infinities */ if xb & exp_mask64 == exp_mask64 || \ yb & exp_mask64 == exp_mask64 -> x * y + z elif z == 0.0 || z == -0.0 || x * y == 0.0 || x * y == -0.0 -> x * y + z elif zb & exp_mask64 == exp_mask64 -> z ;; (xn, xe, xs) = std.flt64explode(x) (yn, ye, ys) = std.flt64explode(y) (zn, ze, zs) = std.flt64explode(z) if xe == -1023 xe = -1022 ;; if ye == -1023 ye = -1022 ;; if ze == -1023 ze = -1022 ;; /* Keep product in high/low uint64s */ var xs_h : uint64 = xs >> 32 var ys_h : uint64 = ys >> 32 var xs_l : uint64 = xs & 0xffffffff var ys_l : uint64 = ys & 0xffffffff var t_l : uint64 = xs_l * ys_l var t_m : uint64 = xs_l * ys_h + xs_h * ys_l var t_h : uint64 = xs_h * ys_h var prod_l : uint64 = t_l + (t_m << 32) var prod_h : uint64 = t_h + (t_m >> 32) if t_l > prod_l prod_h++ ;; var prod_n = xn != yn var prod_lastbit_e = (xe - 52) + (ye - 52) var prod_first1 = find_first1_64_hl(prod_h, prod_l, 105) var prod_firstbit_e = prod_lastbit_e + prod_first1 var z_firstbit_e = ze var z_lastbit_e = ze - 52 var z_first1 = 52 /* subnormals could throw firstbit_e calculations out of whack */ if (zb & exp_mask64 == 0) z_first1 = find_first1_64(zs, z_first1) z_firstbit_e = z_lastbit_e + z_first1 ;; var res_n var res_h = 0 var res_l = 0 var res_first1 var res_lastbit_e var res_firstbit_e if prod_n == zn res_n = prod_n /* Align prod and z so that the top bit of the result is either 53 or 54, then add. */ if prod_firstbit_e >= z_firstbit_e /* [ prod_h ][ prod_l ] [ z... */ res_lastbit_e = prod_lastbit_e (res_h, res_l) = (prod_h, prod_l) (res_h, res_l) = add_shifted(res_h, res_l, zs, z_lastbit_e - prod_lastbit_e) else /* [ prod_h ][ prod_l ] [ z... */ res_lastbit_e = z_lastbit_e - 64 res_h = zs res_l = 0 if prod_lastbit_e >= res_lastbit_e + 64 /* In this situation, prod must be extremely subnormal */ res_h += shl(prod_l, prod_lastbit_e - res_lastbit_e - 64) elif prod_lastbit_e >= res_lastbit_e res_h += shl(prod_h, prod_lastbit_e - res_lastbit_e) res_h += shr(prod_l, res_lastbit_e + 64 - prod_lastbit_e) res_l += shl(prod_l, prod_lastbit_e - res_lastbit_e) elif prod_lastbit_e + 64 >= res_lastbit_e res_h += shr(prod_h, res_lastbit_e - prod_lastbit_e) var l1 = shl(prod_h, prod_lastbit_e + 64 - res_lastbit_e) var l2 = shr(prod_l, res_lastbit_e - prod_lastbit_e) res_l = l1 + l2 if res_l < l1 res_h++ ;; elif prod_lastbit_e + 128 >= res_lastbit_e res_l += shr(prod_h, res_lastbit_e - prod_lastbit_e - 64) ;; ;; else match compare_hl_z(prod_h, prod_l, prod_firstbit_e, prod_lastbit_e, zs, z_firstbit_e, z_lastbit_e) | `std.Equal: -> 0.0 | `std.Before: /* prod > z */ res_n = prod_n res_lastbit_e = prod_lastbit_e (res_h, res_l) = sub_shifted(prod_h, prod_l, zs, z_lastbit_e - prod_lastbit_e) | `std.After: /* z > prod */ res_n = zn res_lastbit_e = z_lastbit_e - 64 (res_h, res_l) = sub_shifted(zs, 0, prod_h, prod_lastbit_e + 64 - (z_lastbit_e - 64)) (res_h, res_l) = sub_shifted(res_h, res_l, prod_l, prod_lastbit_e - (z_lastbit_e - 64)) ;; ;; res_first1 = 64 + find_first1_64(res_h, 55) if res_first1 == 63 res_first1 = find_first1_64(res_l, 63) ;; res_firstbit_e = res_first1 + res_lastbit_e /* Finally, res_h and res_l are the high and low bits of the result. They now need to be assembled into a flt64. Subnormals and infinities could be a problem. */ var res_s = 0 if res_firstbit_e <= -1023 /* Subnormal case */ if res_lastbit_e + 128 < 12 - 1022 res_s = shr(res_h, 12 - 1022 - (res_lastbit_e + 128)) res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64)) elif res_lastbit_e + 64 < 12 - 1022 res_s = shl(res_h, -12 + (res_lastbit_e + 128) - (-1022)) res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64)) else res_s = shl(res_h, -12 + (res_lastbit_e + 128) - (-1022)) res_s |= shl(res_l, -12 + (res_lastbit_e + 64) - (-1022)) ;; if need_round_away(res_h, res_l, res_first1 + (-1074 - res_firstbit_e)) res_s++ ;; /* No need for exponents, they are all zero */ var res = res_s if res_n res |= (1 << 63) ;; -> std.flt64frombits(res) ;; if res_firstbit_e >= 1024 /* Infinity case */ if res_n -> std.flt64frombits(0xfff0000000000000) else -> std.flt64frombits(0x7ff0000000000000) ;; ;; if res_first1 - 52 >= 64 res_s = shr(res_h, (res_first1 : int64) - 64 - 52) if need_round_away(res_h, res_l, res_first1 - 52) res_s++ ;; elif res_first1 - 52 >= 0 res_s = shl(res_h, 64 - (res_first1 - 52)) res_s |= shr(res_l, res_first1 - 52) if need_round_away(res_h, res_l, res_first1 - 52) res_s++ ;; else res_s = shl(res_h, res_first1 - 52) ;; /* The res_s++s might have messed everything up */ if res_s & (1 << 53) != 0 res_s >= 1 res_firstbit_e++ if res_firstbit_e >= 1024 if res_n -> std.flt64frombits(0xfff0000000000000) else -> std.flt64frombits(0x7ff0000000000000) ;; ;; ;; -> std.flt64assem(res_n, res_firstbit_e, res_s) } /* >> and <<, but without wrapping when the shift is >= 64 */ const shr : (u : uint64, s : int64 -> uint64) = {u : uint64, s : int64 if (s : uint64) >= 64 -> 0 else -> u >> (s : uint64) ;; } const shl : (u : uint64, s : int64 -> uint64) = {u : uint64, s : int64 if (s : uint64) >= 64 -> 0 else -> u << (s : uint64) ;; } /* Add (a << s) to [ h ][ l ], where if s < 0 then a corresponding right-shift is used. This is aligned such that if s == 0, then the result is [ h ][ l + a ] */ const add_shifted = {h : uint64, l : uint64, a : uint64, s : int64 if s >= 64 -> (h + shl(a, s - 64), l) elif s >= 0 var new_h = h + shr(a, 64 - s) var sa = shl(a, s) var new_l = l + sa if new_l < l new_h++ ;; -> (new_h, new_l) else var new_h = h var sa = shr(a, -s) var new_l = l + sa if new_l < l new_h++ ;; -> (new_h, new_l) ;; } /* As above, but subtract (a << s) */ const sub_shifted = {h : uint64, l : uint64, a : uint64, s : int64 if s >= 64 -> (h - shl(a, s - 64), l) elif s >= 0 var new_h = h - shr(a, 64 - s) var sa = shl(a, s) var new_l = l - sa if sa > l new_h-- ;; -> (new_h, new_l) else var new_h = h var sa = shr(a, -s) var new_l = l - sa if sa > l new_h-- ;; -> (new_h, new_l) ;; } const compare_hl_z = {h : uint64, l : uint64, hl_firstbit_e : int64, hl_lastbit_e : int64, z : uint64, z_firstbit_e : int64, z_lastbit_e : int64 if hl_firstbit_e > z_firstbit_e -> `std.Before elif hl_firstbit_e < z_firstbit_e -> `std.After ;; var h_k : int64 = (hl_firstbit_e - hl_lastbit_e - 64) var z_k : int64 = (z_firstbit_e - z_lastbit_e) while h_k >= 0 && z_k >= 0 var h1 = h & shl(1, h_k) != 0 var z1 = z & shl(1, z_k) != 0 if h1 && !z1 -> `std.Before elif !h1 && z1 -> `std.After ;; h_k-- z_k-- ;; if z_k < 0 if (h & shr((-1 : uint64), 64 - h_k) != 0) || (l != 0) -> `std.Before else -> `std.Equal ;; ;; var l_k : int64 = 63 while l_k >= 0 && z_k >= 0 var l1 = l & shl(1, l_k) != 0 var z1 = z & shl(1, z_k) != 0 if l1 && !z1 -> `std.Before elif !l1 && z1 -> `std.After ;; l_k-- z_k-- ;; if (z_k < 0) && (l & shr((-1 : uint64), 64 - l_k) != 0) -> `std.Before elif (l_k < 0) && (z & shr((-1 : uint64), 64 - z_k) != 0) -> `std.After ;; -> `std.Equal } /* Find the first 1 bit in a bitstring */ const find_first1_64 : (b : uint64, start : int64 -> int64) = {b : uint64, start : int64 for var j = start; j >= 0; --j var m = shl(1, j) if b & m != 0 -> j ;; ;; -> -1 } const find_first1_64_hl = {h, l, start var first1_h = find_first1_64(h, start - 64) if first1_h >= 0 -> first1_h + 64 ;; -> find_first1_64(l, 63) } /* For [ h ][ l ], where bitpos_last is the position of the last bit that was included in the truncated result (l's last bit has position 0), decide whether rounding up/away is needed. This is true if - following bitpos_last is a 1, then a non-zero sequence, or - following bitpos_last is a 1, then a zero sequence, and the round would be to even */ const need_round_away = {h : uint64, l : uint64, bitpos_last : int64 var first_omitted_is_1 = false var nonzero_beyond = false if bitpos_last > 64 first_omitted_is_1 = h & shl(1, bitpos_last - 1 - 64) != 0 nonzero_beyond = nonzero_beyond || h & shr((-1 : uint64), 2 + 64 - (bitpos_last - 64)) != 0 nonzero_beyond = nonzero_beyond || (l != 0) else first_omitted_is_1 = l & shl(1, bitpos_last - 1) != 0 nonzero_beyond = nonzero_beyond || l & shr((-1 : uint64), 1 + 64 - bitpos_last) != 0 ;; if !first_omitted_is_1 -> false ;; if nonzero_beyond -> true ;; var hl_is_odd = false if bitpos_last >= 64 hl_is_odd = h & shl(1, bitpos_last - 64) != 0 else hl_is_odd = l & shl(1, bitpos_last) != 0 ;; -> hl_is_odd }