shithub: mc

--- a/lib/math/pown-impl.myr

+++ b/lib/math/pown-impl.myr

@@ -1,8 +1,9 @@

 use std

 use "fpmath"

-use "log-impl"

+use "impls"

 use "log-overkill"

+use "log-impl"

 use "sum-impl"

 use "util"

@@ -32,7 +33,11 @@

 	neginf : @u

 	magcmp : (f : @f, g : @f -> std.order)

 	two_by_two : (x : @f, y : @f -> (@f, @f))

+	split_add : (x_h : @f, x_l : @f, y_h : @f, y_l : @f -> (@f, @f))

+	split_mul : (x_h : @f, x_l : @f, y_h : @f, y_l : @f -> (@f, @f))

+	floor : (x : @f -> @f)

 	log_overkill : (x : @f -> (@f, @f))

+	precision : @i

 	emin : @i

 	emax : @i

 	imax : @i

@@ -52,7 +57,11 @@

 	.neginf = 0xff800000,

 	.magcmp = mag_cmp32,

 	.two_by_two = two_by_two32,

+	.split_add = split_add32,

+	.split_mul = split_mul32,

+	.floor = floor32,

 	.log_overkill = logoverkill32,

+	.precision = 24,

 	.emin = -126,

 	.emax = 127,

 	.imax = 2147483647, /* For detecting overflow in final exponent */

@@ -72,7 +81,11 @@

 	.neginf = 0xfff0000000000000,

 	.magcmp = mag_cmp64,

 	.two_by_two = two_by_two64,

+	.split_add = hl_add,

+	.split_mul = hl_mult,

+	.floor = floor64,

 	.log_overkill = logoverkill64,

+	.precision = 53,

 	.emin = -1022,

 	.emax = 1023,

 	.imax = 9223372036854775807,

@@ -79,6 +92,24 @@

 	.imin = -9223372036854775808,

+const split_add32 = {x_h : flt32, x_l : flt32, y_h : flt32, y_l : flt32

+	var x : flt64 = (x_h : flt64) + (x_l : flt64)

+	var y : flt64 = (y_h : flt64) + (y_l : flt64)

+	var z = x + y

+	var z_h : flt32 = (z : flt32)

+	var z_l : flt32 = ((z - (z_h : flt64)) : flt32)

+	-> (z_h, z_l)

+}

+const split_mul32 = {x_h : flt32, x_l : flt32, y_h : flt32, y_l : flt32

+	var x : flt64 = (x_h : flt64) + (x_l : flt64)

+	var y : flt64 = (y_h : flt64) + (y_l : flt64)

+	var z = x * y

+	var z_h : flt32 = (z : flt32)

+	var z_l : flt32 = ((z - (z_h : flt64)) : flt32)

+	-> (z_h, z_l)

+}

 const pown32 = {x : flt32, n : int32

 	-> powngen(x, n, desc32)

@@ -123,6 +154,9 @@

 	elif n == 1

 		/* Anything^1 is itself */

 		-> x

+	elif n == -1

+		/* The CPU is probably better at division than we are at pow(). */

+		-> 1.0/x

;;

 	/* (-f)^n = (-1)^n * (f)^n. Figure this out now, then pretend f >= 0.0 */

@@ -142,62 +176,55 @@

 	   Since n and e, and I are all integers, we can get the last part from

 	   scale2. The hard part is computing I and F, and then computing 2^F.

*/

+	if xe > 0

+		/*

+		   But first: do some rough calculations: if we can show n*log(xs) has the

+		   same sign as n*e, and n*e would cause overflow, then we might as well

+		   return right now.

+		 */

+		var exp_rough_estimate = n * xe

+		if n > 0 && (exp_rough_estimate > d.emax + 1 || (exp_rough_estimate / n != xe))

+			-> ult_sgn * d.frombits(d.inf)

+		elif n < 0 && (exp_rough_estimate < d.emin - d.precision - 1 || (exp_rough_estimate / n != xe))

+			-> ult_sgn * 0.0

+		;;

+	elif xe < 0

+		/*

+		   Also, if consider xs/2 and xe + 1, we can analyze the case in which

+		   n*log(xs) has a different sign from n*e.

+	         */

+		var exp_rough_estimate = n * (xe + 1)

+		if n > 0 && (exp_rough_estimate < d.emin - d.precision - 1 || (exp_rough_estimate / n != (xe + 1)))

+			-> ult_sgn * 0.0

+		elif n < 0 && (exp_rough_estimate > d.emax + 1 || (exp_rough_estimate / n != (xe + 1)))

+			-> ult_sgn * d.frombits(d.inf)

+		;;

+	;;

 	var ln_xs_hi, ln_xs_lo

 	(ln_xs_hi, ln_xs_lo) = d.log_overkill(d.assem(false, 0, xs))

 	/* Now x^n = 2^(n * [ ln_xs / ln(2) ]) * 2^(n + e) */

+	var E1, E2

+	(E1, E2) = d.split_mul(ln_xs_hi, ln_xs_lo, d.frombits(d.one_over_ln2_hi), d.frombits(d.one_over_ln2_lo))

-	var ls1 : @f[8]

-	(ls1[0], ls1[1]) = d.two_by_two(ln_xs_hi, d.frombits(d.one_over_ln2_hi))

-	(ls1[2], ls1[3]) = d.two_by_two(ln_xs_hi, d.frombits(d.one_over_ln2_lo))

-	(ls1[4], ls1[5]) = d.two_by_two(ln_xs_lo, d.frombits(d.one_over_ln2_hi))

-	(ls1[6], ls1[7]) = d.two_by_two(ln_xs_lo, d.frombits(d.one_over_ln2_lo))

/*

-	   Now log2(xs) = Sum(ls1), so

+	   Now log2(xs) = E1 + E2, so

-	     x^n = 2^(n * Sum(ls1)) * 2^(n * e)

+	     x^n = 2^(n * E1 + E2) * 2^(n * e)

*/

-	var E1, E2

-	(E1, E2) = double_compensated_sum(ls1[0:8])

-	var ls2 : @f[5]

-	var ls2s : @f[5]

-	var I = 0

-	(ls2[0], ls2[1]) = d.two_by_two(E1, nf)

-	(ls2[2], ls2[3]) = d.two_by_two(E2, nf)

-	ls2[4] = 0.0

-	/* Now x^n = 2^(Sum(ls2)) * 2^(n + e) */

-	for var j = 0; j < 5; ++j

-		var i = rn(ls2[j])

-		I += i

-		ls2[j] -= (i : @f)

-	;;

 	var F1, F2

-	std.slcp(ls2s[0:5], ls2[0:5])

-	std.sort(ls2s[0:5], d.magcmp)

-	(F1, F2) = double_compensated_sum(ls2s[0:5])

+	(F1, F2) = d.split_mul(E1, E2, nf, 0.0)

-	if (F1 < 0.0 || F1 > 1.0)

-		var i = rn(F1)

-		I += i

-		ls2[4] -= (i : @f)

-		std.slcp(ls2s[0:5], ls2[0:5])

-		std.sort(ls2s[0:5], d.magcmp)

-		(F1, F2) = double_compensated_sum(ls2s[0:5])

-	;;

+	var I = rn(F1)

+	(F1, F2) = d.split_add(-1.0 * (I : @f), 0.0, F1, F2)

 	/* Now, x^n = 2^(F1 + F2) * 2^(I + n*e). */

-	var ls3 : @f[6]

 	var log2_hi, log2_lo

 	(log2_hi, log2_lo) = d.C[128]

-	(ls3[0], ls3[1]) = d.two_by_two(F1, d.frombits(log2_hi))

-	(ls3[2], ls3[3]) = d.two_by_two(F1, d.frombits(log2_lo))

-	(ls3[4], ls3[5]) = d.two_by_two(F2, d.frombits(log2_hi))

 	var G1, G2

-	(G1, G2) = double_compensated_sum(ls3[0:6])

+	(G1, G2) = d.split_mul(F1, F2, d.frombits(log2_hi), d.frombits(log2_lo))

 	var base = exp(G1) + G2

 	var pow_xen = xe * n

--- a/lib/math/powr-impl.myr

+++ b/lib/math/powr-impl.myr

@@ -230,7 +230,7 @@

/*

 	   y could actually be above integer infinity, in which

-	   case x^y is most certainly infinity of 0. More importantly,

+	   case x^y is most certainly infinity or 0. More importantly,

 	   we can't safely compute M (below).

*/

 	if x > (1.0 : @f)

--- a/lib/math/test/pown-impl.myr

+++ b/lib/math/test/pown-impl.myr

@@ -100,6 +100,14 @@

 		(0xc017043172d0152b, 0x00000000000000e9, 0xe4b2c1666379afdc),

 		(0xc0325800cfeffb8e, 0x00000000000000d8, 0x78983c24a5e29e19),

 		(0xbfee2ae3cd3208ec, 0x00000000000006b7, 0xb6cb06585f39893d),

+		(0x3f7dd2994731f21f, 0x0000000000000097, 0x0000000000000003),

+		(0x61696e53830d02af, 0xfffffffffffffffe, 0x0000000000000006),

+		(0xc0e60abfce171c2e, 0xffffffffffffffbb, 0x800000000000008a),

+		(0x32dbf16a23293407, 0x0000000000000005, 0x00000000103f2cd6),

+		(0xb95741e695eb8ab2, 0x000000000000000a, 0x00000000000a873c),

+		(0x000aa88b5c2dd078, 0xffffffffffffffff, 0x7fd804c764025003),

+		(0x800cd2d56c4a4074, 0xffffffffffffffff, 0xffd3f696f65f6596),

+		(0x8000d6838a5a8463, 0xffffffffffffffff, 0xfff0000000000000),

 	][:]

 	for (x, y, z) : inputs

--

⑨