shithub: mc

--- a/lib/math/bld.sub

+++ b/lib/math/bld.sub

@@ -24,6 +24,7 @@

 	# x^n

 	log-overkill.myr

+	pown-impl.myr

 	# x^y

 	powr-impl.myr

--- a/lib/math/fpmath.myr

+++ b/lib/math/fpmath.myr

@@ -23,6 +23,9 @@

 		horner_poly : (x : @f, a : @f[:] -> @f)

 		horner_polyu : (x : @f, a : @u[:] -> @f)

+		/* pown-impl */

+		pown : (x : @f, n : @i -> @f)

 		/* powr-impl */

 		powr : (x : @f, y : @f -> @f)

@@ -103,6 +106,8 @@

 	horner_poly = {x, a; -> horner_poly32(x, a)}

 	horner_polyu = {x, a; -> horner_polyu32(x, a)}

+	pown = {x, n; -> pown32(x, n)}

 	powr = {x, y; -> powr32(x, y)}

 	scale2 = {x, m; -> scale232(x, m)}

@@ -139,6 +144,8 @@

 	horner_poly = {x, a; -> horner_poly64(x, a)}

 	horner_polyu = {x, a; -> horner_polyu64(x, a)}

+	pown = {x, n; -> pown64(x, n)}

 	powr = {x, y; -> powr64(x, y)}

--- a/lib/math/impls.myr

+++ b/lib/math/impls.myr

@@ -29,6 +29,9 @@

 	pkglocal extern const horner_polyu32 : (x : flt32, a : uint32[:] -> flt32)

 	pkglocal extern const horner_polyu64 : (x : flt64, a : uint64[:] -> flt64)

+	pkglocal extern const pown32 : (x : flt32, n : int32 -> flt32)

+	pkglocal extern const pown64 : (x : flt64, n : int64 -> flt64)

 	pkglocal extern const powr32 : (x : flt32, y : flt32 -> flt32)

 	pkglocal extern const powr64 : (x : flt64, y : flt64 -> flt64)

--- /dev/null

+++ b/lib/math/pown-impl.myr

@@ -1,0 +1,219 @@

+use std

+use "fpmath"

+use "log-impl"

+use "log-overkill"

+use "sum-impl"

+use "util"

+/*

+   This is an implementation of pown: computing x^n where n is an

+   integer. We sort of follow [PEB04], but without their high-radix

+   log_2.

+ */

+pkg math =

+	pkglocal const pown32 : (x : flt32, n : int32 -> flt32)

+	pkglocal const pown64 : (x : flt64, n : int64 -> flt64)

+;;

+type fltdesc(@f, @u, @i) = struct

+	explode : (f : @f -> (bool, @i, @u))

+	assem : (n : bool, e : @i, s : @u -> @f)

+	tobits : (f : @f -> @u)

+	frombits : (u : @u -> @f)

+	C : (@u, @u)[:]

+	one_over_ln2_hi : @u

+	one_over_ln2_lo : @u

+	nan : @u

+	inf : @u

+	neginf : @u

+	magcmp : (f : @f, g : @f -> std.order)

+	two_by_two : (x : @f, y : @f -> (@f, @f))

+	log_overkill : (x : @f -> (@f, @f))

+	emin : @i

+	emax : @i

+	imax : @i

+	imin : @i

+;;

+const desc32 : fltdesc(flt32, uint32, int32) =  [

+	.explode = std.flt32explode,

+	.assem = std.flt32assem,

+	.tobits = std.flt32bits,

+	.frombits = std.flt32frombits,

+	.C = accurate_logs32[0:130], /* See log-impl.myr */

+	.one_over_ln2_hi = 0x3fb8aa3b, /* 1/ln(2), top part */

+	.one_over_ln2_lo = 0x32a57060, /* 1/ln(2), bottom part */

+	.nan = 0x7fc00000,

+	.inf = 0x7f800000,

+	.neginf = 0xff800000,

+	.magcmp = mag_cmp32,

+	.two_by_two = two_by_two32,

+	.log_overkill = logoverkill32,

+	.emin = -126,

+	.emax = 127,

+	.imax = 2147483647, /* For detecting overflow in final exponent */

+	.imin = -2147483648,

+]

+const desc64 : fltdesc(flt64, uint64, int64) =  [

+	.explode = std.flt64explode,

+	.assem = std.flt64assem,

+	.tobits = std.flt64bits,

+	.frombits = std.flt64frombits,

+	.C = accurate_logs64[0:130], /* See log-impl.myr */

+	.one_over_ln2_hi = 0x3ff71547652b82fe,

+	.one_over_ln2_lo = 0x3c7777d0ffda0d24,

+	.nan = 0x7ff8000000000000,

+	.inf = 0x7ff0000000000000,

+	.neginf = 0xfff0000000000000,

+	.magcmp = mag_cmp64,

+	.two_by_two = two_by_two64,

+	.log_overkill = logoverkill64,

+	.emin = -1022,

+	.emax = 1023,

+	.imax = 9223372036854775807,

+	.imin = -9223372036854775808,

+]

+const pown32 = {x : flt32, n : int32

+	-> powngen(x, n, desc32)

+}

+const pown64 = {x : flt64, n : int64

+	-> powngen(x, n, desc64)

+}

+generic powngen = {x : @f, n : @i, d : fltdesc(@f, @u, @i) :: numeric,floating,std.equatable @f, numeric,integral @u, numeric,integral @i

+	var xb

+	xb = d.tobits(x)

+	var xn : bool, xe : @i, xs : @u

+	(xn, xe, xs) = d.explode(x)

+	var nf : @f = (n : @f)

+	/*

+	   Special cases. Note we do not follow IEEE exceptions.

+	 */

+	if n == 0

+		/*

+		   Anything^0 is 1. We're taking the view that x is a tiny range of reals,

+		   so a dense subset of them are 1, even if x is 0.0.

+		 */

+		-> 1.0

+	elif std.isnan(x)

+		/* Propagate NaN (why doesn't this come first? Ask IEEE.) */

+		-> d.frombits(d.nan)

+	elif (x == 0.0)

+		if n < 0 && (n % 2 == 1) && xn

+			/* (+/- 0)^n = +/- oo */

+			-> d.frombits(d.neginf)

+		elif n < 0

+			-> d.frombits(d.inf)

+		elif n % 2 == 1

+			/* (+/- 0)^n = +/- 0 (n odd) */

+			-> d.assem(xn, d.emin - 1, 0)

+		else

+			-> 0.0

+		;;

+	elif n == 1

+		/* Anything^1 is itself */

+		-> x

+	;;

+	/* (-f)^n = (-1)^n * (f)^n. Figure this out now, then pretend f >= 0.0 */

+	var ult_sgn = 1.0

+	if xn && (n % 2 == 1 || n % 2 == -1)

+		ult_sgn = -1.0

+	;;

+	/*

+	   Compute (with x = xs * 2^e)

+	     x^n  = 2^(n*log2(xs)) * 2^(n*e)

+	          = 2^(I + F) * 2^(n*e)

+	          = 2^(F) * 2^(I+n*e)

+	   Since n and e, and I are all integers, we can get the last part from

+	   scale2. The hard part is computing I and F, and then computing 2^F.

+	 */

+	var ln_xs_hi, ln_xs_lo

+	(ln_xs_hi, ln_xs_lo) = d.log_overkill(d.assem(false, 0, xs))

+	/* Now x^n = 2^(n * [ ln_xs / ln(2) ]) * 2^(n + e) */

+	var ls1 : @f[8]

+	(ls1[0], ls1[1]) = d.two_by_two(ln_xs_hi, d.frombits(d.one_over_ln2_hi))

+	(ls1[2], ls1[3]) = d.two_by_two(ln_xs_hi, d.frombits(d.one_over_ln2_lo))

+	(ls1[4], ls1[5]) = d.two_by_two(ln_xs_lo, d.frombits(d.one_over_ln2_hi))

+	(ls1[6], ls1[7]) = d.two_by_two(ln_xs_lo, d.frombits(d.one_over_ln2_lo))

+	/*

+	   Now log2(xs) = Sum(ls1), so

+	     x^n = 2^(n * Sum(ls1)) * 2^(n * e)

+	 */

+	var E1, E2

+	(E1, E2) = double_compensated_sum(ls1[0:8])

+	var ls2 : @f[5]

+	var ls2s : @f[5]

+	var I = 0

+	(ls2[0], ls2[1]) = d.two_by_two(E1, nf)

+	(ls2[2], ls2[3]) = d.two_by_two(E2, nf)

+	ls2[4] = 0.0

+	/* Now x^n = 2^(Sum(ls2)) * 2^(n + e) */

+	for var j = 0; j < 5; ++j

+		var i = rn(ls2[j])

+		I += i

+		ls2[j] -= (i : @f)

+	;;

+	var F1, F2

+	std.slcp(ls2s[0:5], ls2[0:5])

+	std.sort(ls2s[0:5], d.magcmp)

+	(F1, F2) = double_compensated_sum(ls2s[0:5])

+	if (F1 < 0.0 || F1 > 1.0)

+		var i = rn(F1)

+		I += i

+		ls2[4] -= (i : @f)

+		std.slcp(ls2s[0:5], ls2[0:5])

+		std.sort(ls2s[0:5], d.magcmp)

+		(F1, F2) = double_compensated_sum(ls2s[0:5])

+	;;

+	/* Now, x^n = 2^(F1 + F2) * 2^(I + n*e). */

+	var ls3 : @f[6]

+	var log2_hi, log2_lo

+	(log2_hi, log2_lo) = d.C[128]

+	(ls3[0], ls3[1]) = d.two_by_two(F1, d.frombits(log2_hi))

+	(ls3[2], ls3[3]) = d.two_by_two(F1, d.frombits(log2_lo))

+	(ls3[4], ls3[5]) = d.two_by_two(F2, d.frombits(log2_hi))

+	var G1, G2

+	(G1, G2) = double_compensated_sum(ls3[0:6])

+	var base1 = exp(G1)

+	var base2 = exp(G2)

+	var pow_xen = xe * n

+	var pow = pow_xen + I

+	if pow_xen / n != xe || (I > 0 && d.imax - I < pow_xen) || (I < 0 && d.imin - I > pow_xen)

+		/*

+		   The exponent overflowed. There's no way this is representable. We need

+		   to at least recover the correct sign. If the overflow was from the

+		   multiplication, then the sign we want is the sign that pow_xen should

+		   have been. If the overflow was from the addition, then we still want

+		   the sign that pow_xen should have had.

+		 */

+		if (xe > 0) == (n > 0)

+			pow = 2 * d.emax

+		else

+			pow = 2 * d.emin

+		;;

+	;;

+	-> ult_sgn * scale2(base1 * base2, pow)

+}

--- /dev/null

+++ b/lib/math/test/pown-impl.myr

@@ -1,0 +1,119 @@

+use std

+use math

+use testr

+const main = {

+	math.fptrap(false)

+	testr.run([

+		[.name="pown-01", .fn = pown01],

+		[.name="pown-02", .fn = pown02],

+		[.name="pown-03", .fn = pown03],

+	][:])

+}

+const int32fromuint32 = {b; -> (&b : int32#)#}

+const int64fromuint64 = {b; -> (&b : int64#)#}

+/* Mostly obtained from fpmath-consensus */

+const pown01 = {c

+	var inputs : (uint32, uint32, uint32)[:] = [

+		(0x000000f6, 0x00000000, 0x3f800000),

+		(0x00000000, 0x3d800000, 0x00000000),

+		(0x946fc13b, 0x3b21efc7, 0x80000000),

+		(0xb76e98b6, 0xdbeb6637, 0xff800000),

+		(0xc04825b7, 0x53cdd772, 0x7f800000),

+		(0x3f7ff8c0, 0x000a53ba, 0x097c15ec),

+		(0xbefb4dd0, 0xffffffc6, 0x5d3b4cbc),

+		(0xbf77a88b, 0x0000038f, 0xa9b0342c),

+		(0xbd5cdf23, 0xffffffeb, 0xebb17f33),

+		(0x3fb66246, 0x000000e0, 0x78ac13ae),

+		(0x3bfec3ab, 0x00000005, 0x2df9e189),

+		(0x3f7762db, 0x000002b8, 0x2e466343),

+		(0x3e245431, 0xfffffff7, 0x4b582b71),

+		(0xbf73903f, 0x00000328, 0x2276ffa9),

+		(0x3f6a088a, 0xfffffe4d, 0x5b9dcb2e),

+		(0xc06650d6, 0x00000019, 0xd691b004),

+		(0x3f751050, 0x00000731, 0x0583b3e2),

+		(0x3f8124aa, 0x00001e6d, 0x7171d834),

+		(0xbf6c06b0, 0x000001b2, 0x260cb0e7),

+		(0x38307acf, 0x00000003, 0x29a7bd3a),

+		(0x3f7fafa1, 0x00005c97, 0x2a835867),

+		(0xbf80fb78, 0xfffffdcb, 0xbc5a0a5b),

+		(0xbff532bd, 0xffffffe7, 0xb3bc09eb),

+		(0xbf804fe0, 0x00002cbb, 0xd39529b4),

+		(0xbf7eaac3, 0x000026c1, 0x9a1b5779),

+		(0xb6ecbef2, 0xfffffff9, 0xfb5d43d7),

+		(0x40c1d332, 0xffffffef, 0x29628a12),

+	][:]

+	for (x, y, z) : inputs

+		var xf : flt32 = std.flt32frombits(x)

+		var yi : int32 = int32fromuint32(y)

+		var zf : flt32 = std.flt32frombits(z)

+		var rf = math.pown(xf, yi)

+		testr.check(c, rf == zf,

+			"pown(0x{b=16,w=8,p=0}, {}) should be 0x{b=16,w=8,p=0}, was 0x{b=16,w=8,p=0}",

+			x, yi, z, std.flt32bits(rf))

+	;;

+}

+/* Mostly obtained from fpmath-consensus */

+const pown02 = {c

+	var inputs : (uint64, uint64, uint64)[:] = [

+		(0x212c65442137286a, 0x000000007f47df7c, 0x0000000000000000),

+		(0x9add65abd325b6eb, 0xfc2b1173d2f9d7c5, 0xfff0000000000000),

+		(0xc00616fb023b43b5, 0x57ce36258314fd54, 0x7ff0000000000000),

+		(0xc12300696c144c98, 0x0000000000000032, 0x7c152603516d90a8),

+		(0xbfd3dfdfc81e60e0, 0xfffffffffffffe8a, 0x675fe457fecdc77b),

+		(0xbff03687f3c6d148, 0xffffffffffffa5ce, 0x2465ab45a663c6e7),

+		(0x3feae36a83f91b30, 0xfffffffffffff2a9, 0x75864016b705ed6e),

+		(0x3feb9a4d0abc8192, 0xffffffffffffff7f, 0x41a6cb5da02a95f7),

+		(0xbfbf733faaeccb42, 0xffffffffffffffc8, 0x4a851d5e1c674b7c),

+		(0xbff0a5cfa8cc163b, 0xfffffffffffffc2c, 0x3c6dbc034c342e8e),

+		(0xbfee9beac6a3caf0, 0xffffffffffffef92, 0x50c94fc31f862949),

+		(0x4cba94fc5ab93856, 0xfffffffffffffffe, 0x26572fe2438caeb6),

+		(0x3feccc63016da0b8, 0x00000000000010a3, 0x17735a1a2b8d1115),

+		(0x3ff06cc390fe6413, 0x0000000000006183, 0x7aec68e2525e0756),

+		(0xbfef7da61a3fc45e, 0x0000000000003c20, 0x29ac311e57d77bb2),

+		(0xbd17d3142424b4ce, 0x000000000000000d, 0x9b061d29ecc312d7),

+		(0xbd9b46e7dcf5ddd7, 0xffffffffffffffee, 0x69d1b75168c2d601),

+		(0x3fb885ed105fdf42, 0x000000000000004b, 0x301272ca384d27ab),

+		(0xbfedd75714c016db, 0x00000000000004a5, 0xb8723796de107a57),

+		(0x3ff25198b971bb27, 0x00000000000012f3, 0x7b21bc435390825e),

+		(0x3fef791958603e0e, 0xffffffffffff85b6, 0x6ecebfe2dc493669),

+		(0xc017043172d0152b, 0x00000000000000e9, 0xe4b2c1666379afdc),

+		(0xc0325800cfeffb8e, 0x00000000000000d8, 0x78983c24a5e29e19),

+		(0xbfee2ae3cd3208ec, 0x00000000000006b7, 0xb6cb06585f39893d),

+	][:]

+	for (x, y, z) : inputs

+		var xf : flt64 = std.flt64frombits(x)

+		var yi : int64 = int64fromuint64(y)

+		var zf : flt64 = std.flt64frombits(z)

+		var rf = math.pown(xf, yi)

+		testr.check(c, rf == zf,

+			"pown(0x{b=16,w=16,p=0}, {}) should be 0x{b=16,w=16,p=0}, was 0x{b=16,w=16,p=0}",

+			x, yi, z, std.flt64bits(rf))

+	;;

+}

+/* Here we quarantine off some known-bad results */

+const pown03 = {c

+	var inputs : (uint32, uint32, uint32, uint32)[:] = [

+		(0x3f80040a, 0xfff6d0a0, 0x09f93f64, 0x09f93f63),

+		(0x409d7f5a, 0xffffffd5, 0x0e0c8fa0, 0x0e0c8f9f),

+		(0xbfb588b4, 0x0000008c, 0x62be78b9, 0x62be78ba),

+		(0xb5e0d8c0, 0x00000002, 0x2c457c08, 0x2c457c07),

+	][:]

+	for (x, y, z_perfect, z_accepted) : inputs

+		var xf : flt32 = std.flt32frombits(x)

+		var yi : int32 = int32fromuint32(y)

+		var zf_perfect : flt32 = std.flt32frombits(z_perfect)

+		var zf_accepted : flt32 = std.flt32frombits(z_accepted)

+		var rf = math.pown(xf, yi)

+		testr.check(c, rf == zf_perfect || rf == zf_accepted,

+			"pown(0x{b=16,w=8,p=0}, {}) should be 0x{b=16,w=8,p=0}, will also accept 0x{b=16,w=8,p=0}, was 0x{b=16,w=8,p=0}",

+			x, yi, z_perfect, z_accepted, std.flt32bits(rf))

+	;;

+}