shithub: mc

Download patch

ref: 34f2230c4a505f3b94bc33ed07f0839fe66a0e93
parent: 895c8ae58de3ece766a1f9d70094f61adb864f71
author: S. Gilles <sgilles@math.umd.edu>
date: Tue May 21 04:11:06 EDT 2019

Rewrite log-overkill.myr

Instead of performing the arbitrary-precision calculation, we can
instead generalize [Tan90] slightly to use multiple levels of tables.
This isn't quite as wasteful.

--- a/lib/math/ancillary/generate-minimax-by-Remez.gp
+++ b/lib/math/ancillary/generate-minimax-by-Remez.gp
@@ -140,6 +140,10 @@
         return(if(x == 1,1,log(x)/(x-1))/log(2));
 }
 
+{ log1p(x) =
+        return(log(1 + x));
+}
+
 print("\n");
 print("Minimaxing sin(x) / x, degree 6, on [-Pi/(4 * 256), Pi/(4 * 256)]:");
 find_minimax(sinoverx, 6, -Pi/1024, Pi/1024)
@@ -182,6 +186,12 @@
 print("---\n");
 print("Minmimaxing log_2(x) / (x - 1), degree 7, on [1, 2^(1/8)]:");
 find_minimax(log2xoverx, 7, 1, 2^(1/8))
+print("\n");
+/* print("(You'll need to add a 0x0 at the beginning to make a degree 13...\n"); */
+/* print("\n"); */
+print("---\n");
+print("Minmimaxing log(1 + x), degree 5, on [0, 2^-20) [it's just going to give the Taylor expansion]:");
+find_minimax(log1p, 5, 0, 2^-20)
 print("\n");
 /* print("(You'll need to add a 0x0 at the beginning to make a degree 13...\n"); */
 /* print("\n"); */
--- a/lib/math/ancillary/log-overkill-constants.c
+++ b/lib/math/ancillary/log-overkill-constants.c
@@ -8,80 +8,60 @@
 #define FLT64_TO_UINT64(f) (*((uint64_t *) ((char *) &f)))
 #define UINT64_TO_FLT64(u) (*((double *) ((char *) &u)))
 
-#define FLT32_TO_UINT32(f) (*((uint32_t *) ((char *) &f)))
-#define UINT32_TO_FLT32(u) (*((double *) ((char *) &u)))
-
-int main(void)
+int
+main(void)
 {
-        mpfr_t one;
-        mpfr_t two_to_the_minus_k;
-        mpfr_t one_plus_two_to_the_minus_k;
-        mpfr_t ln_one_plus_two_to_the_minus_k;
-        mpfr_t one_minus_two_to_the_minus_k;
-        mpfr_t ln_one_minus_two_to_the_minus_k;
         mpfr_t t1;
         mpfr_t t2;
+        mpfr_t t3;
         double d = 0;
         uint64_t u = 0;
+        size_t j = 0;
+        long int k = 0;
 
-        mpfr_init2(one, 10000);
-        mpfr_init2(two_to_the_minus_k, 10000);
-        mpfr_init2(one_plus_two_to_the_minus_k, 10000);
-        mpfr_init2(ln_one_plus_two_to_the_minus_k, 10000);
-        mpfr_init2(one_minus_two_to_the_minus_k, 10000);
-        mpfr_init2(ln_one_minus_two_to_the_minus_k, 10000);
         mpfr_init2(t1, 10000);
         mpfr_init2(t2, 10000);
+        mpfr_init2(t3, 10000);
 
-        printf("/* C_plus */\n");
-        printf("(0x0000000000000000, 0x0000000000000000, 0x0000000000000000), /* dummy */\n");
+        /* C1 */
+        for (k = -5; k >= -20; k -= 5) {
+                printf(
+                        "const C%ld : (uint64, uint64, uint64, uint64)[32] = [\n",
+                        (k /
+                         (
+                                 -5)));
 
-        for (size_t k = 1; k <= 27; ++k) {
-                mpfr_set_si(one, 1, MPFR_RNDN);
-                mpfr_mul_2si(two_to_the_minus_k, one, -k, MPFR_RNDN);
-                mpfr_add(one_plus_two_to_the_minus_k, one, two_to_the_minus_k, MPFR_RNDN);
-                mpfr_log(ln_one_plus_two_to_the_minus_k, one_plus_two_to_the_minus_k, MPFR_RNDN);
+                for (j = 0; j < 32; ++j) {
+                        mpfr_set_si(t1, 1, MPFR_RNDN);
+                        mpfr_set_si(t2, k, MPFR_RNDN);
+                        mpfr_exp2(t2, t2, MPFR_RNDN);
+                        mpfr_mul_si(t2, t2, j, MPFR_RNDN);
+                        mpfr_add(t1, t1, t2, MPFR_RNDN);
 
-                mpfr_set(t1, ln_one_plus_two_to_the_minus_k, MPFR_RNDN);
-                d = mpfr_get_d(t1, MPFR_RNDN);
-                u = FLT64_TO_UINT64(d);
-                printf("(%#018lx, ", u);
-                mpfr_set_d(t2, d, MPFR_RNDN);
-                mpfr_sub(t1, t1, t2, MPFR_RNDN);
-                d = mpfr_get_d(t1, MPFR_RNDN);
-                u = FLT64_TO_UINT64(d);
-                printf("%#018lx, ", u);
-                mpfr_set_d(t2, d, MPFR_RNDN);
-                mpfr_sub(t1, t1, t2, MPFR_RNDN);
-                d = mpfr_get_d(t1, MPFR_RNDN);
-                u = FLT64_TO_UINT64(d);
-                printf("%#018lx), /* k = %zu */\n", u, k);
-        }
+                        /* first, log(1 + ...) */
+                        mpfr_log(t2, t1, MPFR_RNDN);
+                        d = mpfr_get_d(t2, MPFR_RNDN);
+                        u = FLT64_TO_UINT64(d);
+                        printf("	(%#018lx, ", u);
+                        mpfr_set_d(t3, d, MPFR_RNDN);
+                        mpfr_sub(t2, t2, t3, MPFR_RNDN);
+                        d = mpfr_get_d(t2, MPFR_RNDN);
+                        u = FLT64_TO_UINT64(d);
+                        printf("%#018lx, ", u);
 
-        printf("\n");
-        printf("/* C_minus */\n");
-        printf("(0x0000000000000000, 0x0000000000000000, 0x0000000000000000), /* dummy */\n");
+                        /* now, 1/(1 + ...) */
+                        mpfr_pow_si(t2, t1, -1, MPFR_RNDN);
+                        d = mpfr_get_d(t2, MPFR_RNDN);
+                        u = FLT64_TO_UINT64(d);
+                        printf("    %#018lx, ", u);
+                        mpfr_set_d(t3, d, MPFR_RNDN);
+                        mpfr_sub(t2, t2, t3, MPFR_RNDN);
+                        d = mpfr_get_d(t2, MPFR_RNDN);
+                        u = FLT64_TO_UINT64(d);
+                        printf("%#018lx), /* j = %zu */\n", u, j);
+                }
 
-        for (size_t k = 1; k <= 27; ++k) {
-                mpfr_set_si(one, 1, MPFR_RNDN);
-                mpfr_mul_2si(two_to_the_minus_k, one, -k, MPFR_RNDN);
-                mpfr_sub(one_minus_two_to_the_minus_k, one, two_to_the_minus_k, MPFR_RNDN);
-                mpfr_log(ln_one_minus_two_to_the_minus_k, one_minus_two_to_the_minus_k, MPFR_RNDN);
-
-                mpfr_set(t1, ln_one_minus_two_to_the_minus_k, MPFR_RNDN);
-                d = mpfr_get_d(t1, MPFR_RNDN);
-                u = FLT64_TO_UINT64(d);
-                printf("(%#018lx, ", u);
-                mpfr_set_d(t2, d, MPFR_RNDN);
-                mpfr_sub(t1, t1, t2, MPFR_RNDN);
-                d = mpfr_get_d(t1, MPFR_RNDN);
-                u = FLT64_TO_UINT64(d);
-                printf("%#018lx, ", u);
-                mpfr_set_d(t2, d, MPFR_RNDN);
-                mpfr_sub(t1, t1, t2, MPFR_RNDN);
-                d = mpfr_get_d(t1, MPFR_RNDN);
-                u = FLT64_TO_UINT64(d);
-                printf("%#018lx), /* k = %zu */\n", u, k);
+                printf("]\n\n");
         }
 
         return 0;
--- a/lib/math/bld.sub
+++ b/lib/math/bld.sub
@@ -23,9 +23,7 @@
 	poly-impl.myr
 
 	# x^n and x^1/q
-	log-overkill+posixy-x64-sse2.s
 	log-overkill.myr
-	pown-impl+posixy-x64-sse2.s
 	pown-impl.myr
 
 	# x^y
--- a/lib/math/log-overkill.myr
+++ b/lib/math/log-overkill.myr
@@ -1,48 +1,39 @@
 use std
 
 use "fpmath"
-use "log-impl"
-use "exp-impl"
-use "sum-impl"
+use "impls"
 use "util"
 
 /*
-   This is an implementation of log following [Mul16] 8.3.2, returning
-   far too much precision. These are slower than the
-   table-and-low-degree-polynomial implementations of exp-impl.myr and
-   log-impl.myr, but are needed to handle the powr, pown, and rootn
-   functions.
+   This is an implementation of log(x) using the following idea, based on [Tan90]
 
-   This is only for flt64, because if you want this for flt32 you should
-   just cast to flt64, use the fast functions, and then split back.
+     First, reduce to 2^e · xs, with xs ∈ [1, 2).
 
-   Note that the notation of [Mul16] 8.3.2 is confusing, to say the
-   least. [NM96] is, perhaps, a clearer presentation.
+     xs = F1 + f1, with
+       F1 = 1 + j1/2^5
+       j1 ∈ {1, 2, …, 2^5 - 1}
+       f1 ∈ [0, 2^-5)
 
-   To recap, we use an iteration, starting with t_1 = 0, L_1 = x, and
-   iterate as
+     log(xs) = log(F1) + log(1 + f1/F1)
 
-       t_{n+1} = t_{n} - ln(1 + d_n 2^{-n})
-       L_{n+1} = L_n (1 + d_n 2^{-n})
+     1 + f1/F1 = F2 + f2, with
+       F2 = 1 + j1/2^10
+       j2 ∈ {1, 2, …, 2^5 - 1}
+       f2 ∈ [0, 2^-10)
 
-   where d_n is in {-1, 0, 1}, chosen so that as n -> oo, L_n approaches
-   1, then t_n approaches ln(x).
+     log(xs) = log(F1) + log(F2) + log(1 + f2/F2)
 
-   If we let l_n = L_n - 1, we initialize l_1 = x - 1 and iterate as
+     …
 
-       l_{n+1} = l_n (1 + d_n 2^{-n}) + d_n 2^{-n}
+     log(xs) = log(F1) + log(F2) + log(F3) + log(F4) + log(1 + f4/F4)
 
-   If we further consider l'_n = 2^n l_n, we initialize l'_1 = x - 1,
-   and iterate as
+     And f4/F4 < 2^-20, so we can get 100 bits of precision using a
+     degree 5 polynomial.
 
-       l'_{n + 1} = 2 l'_{n} (1 + d_n 2^{-n}) + 2 d_n
-
-   The nice thing about this is that we can pick d_n easily based on
-   comparing l'_n to some easy constants:
-
-             { +1  if [l'_n] <= -1/2
-       d_n = {  0  if [l'_n] = 0 or 1/2
-             { -1  if [l'_n] >= 1
+     The specific choice of using 4 tables, each with 2^5 entries, may
+     be improvable. It's a trade-off between storage for the tables and
+     the number of floating point operations to chain the results
+     together.
  */
 pkg math =
 	pkglocal const logoverkill32 : (x : flt32 -> (flt32, flt32))
@@ -49,84 +40,151 @@
 	pkglocal const logoverkill64 : (x : flt64 -> (flt64, flt64))
 ;;
 
-/*
-   Tables of 1 +/- 2^-k, for k = 0 to 159, with k = 0 a dummy row. 159
-   is chosen as the first k such that the error between 2^(-53 * 2) and
-   2^(-53 * 2) + log(1 + 2^(-k)) is less than 1 ulp, therefore we'll
-   have a full 53 * 2 bits of precision available with these tables. The
-   layout for C_plus is
 
-        ( ln(1 + 2^-k)[hi],  ln(1 + 2^-k)[lo],    ln(1 + 2^-k)[very lo]) ,
 
-   and for C_minus it is similar, but handling ln(1 - 2^-k).
+/*
+   Ci is a table such that, for Ci[j] = (L1, L2, I1, I2),
+     L1, L2 are log(1 + j·2^-(5i))
+     I1, I2 are 1/(1 + j·2^-(5i))
+ */
+const C1 : (uint64, uint64, uint64, uint64)[32] = [
+	(000000000000000000, 000000000000000000,     0x3ff0000000000000, 000000000000000000), /* j = 0 */
+	(0x3f9f829b0e783300, 0x3c333e3f04f1ef23,     0x3fef07c1f07c1f08, 0xbc7f07c1f07c1f08), /* j = 1 */
+	(0x3faf0a30c01162a6, 0x3c485f325c5bbacd,     0x3fee1e1e1e1e1e1e, 0x3c6e1e1e1e1e1e1e), /* j = 2 */
+	(0x3fb6f0d28ae56b4c, 0xbc5906d99184b992,     0x3fed41d41d41d41d, 0x3c80750750750750), /* j = 3 */
+	(0x3fbe27076e2af2e6, 0xbc361578001e0162,     0x3fec71c71c71c71c, 0x3c8c71c71c71c71c), /* j = 4 */
+	(0x3fc29552f81ff523, 0x3c6301771c407dbf,     0x3febacf914c1bad0, 0xbc8bacf914c1bad0), /* j = 5 */
+	(0x3fc5ff3070a793d4, 0xbc5bc60efafc6f6e,     0x3feaf286bca1af28, 0x3c8af286bca1af28), /* j = 6 */
+	(0x3fc9525a9cf456b4, 0x3c6d904c1d4e2e26,     0x3fea41a41a41a41a, 0x3c80690690690690), /* j = 7 */
+	(0x3fcc8ff7c79a9a22, 0xbc64f689f8434012,     0x3fe999999999999a, 0xbc8999999999999a), /* j = 8 */
+	(0x3fcfb9186d5e3e2b, 0xbc6caaae64f21acb,     0x3fe8f9c18f9c18fa, 0xbc7f3831f3831f38), /* j = 9 */
+	(0x3fd1675cababa60e, 0x3c2ce63eab883717,     0x3fe8618618618618, 0x3c88618618618618), /* j = 10 */
+	(0x3fd2e8e2bae11d31, 0xbc78f4cdb95ebdf9,     0x3fe7d05f417d05f4, 0x3c67d05f417d05f4), /* j = 11 */
+	(0x3fd4618bc21c5ec2, 0x3c7f42decdeccf1d,     0x3fe745d1745d1746, 0xbc7745d1745d1746), /* j = 12 */
+	(0x3fd5d1bdbf5809ca, 0x3c74236383dc7fe1,     0x3fe6c16c16c16c17, 0xbc7f49f49f49f49f), /* j = 13 */
+	(0x3fd739d7f6bbd007, 0xbc78c76ceb014b04,     0x3fe642c8590b2164, 0x3c7642c8590b2164), /* j = 14 */
+	(0x3fd89a3386c1425b, 0xbc729639dfbbf0fb,     0x3fe5c9882b931057, 0x3c7310572620ae4c), /* j = 15 */
+	(0x3fd9f323ecbf984c, 0xbc4a92e513217f5c,     0x3fe5555555555555, 0x3c85555555555555), /* j = 16 */
+	(0x3fdb44f77bcc8f63, 0xbc7cd04495459c78,     0x3fe4e5e0a72f0539, 0x3c8e0a72f0539783), /* j = 17 */
+	(0x3fdc8ff7c79a9a22, 0xbc74f689f8434012,     0x3fe47ae147ae147b, 0xbc6eb851eb851eb8), /* j = 18 */
+	(0x3fddd46a04c1c4a1, 0xbc70467656d8b892,     0x3fe4141414141414, 0x3c64141414141414), /* j = 19 */
+	(0x3fdf128f5faf06ed, 0xbc7328df13bb38c3,     0x3fe3b13b13b13b14, 0xbc83b13b13b13b14), /* j = 20 */
+	(0x3fe02552a5a5d0ff, 0xbc7cb1cb51408c00,     0x3fe3521cfb2b78c1, 0x3c7a90e7d95bc60a), /* j = 21 */
+	(0x3fe0be72e4252a83, 0xbc8259da11330801,     0x3fe2f684bda12f68, 0x3c82f684bda12f68), /* j = 22 */
+	(0x3fe154c3d2f4d5ea, 0xbc859c33171a6876,     0x3fe29e4129e4129e, 0x3c804a7904a7904a), /* j = 23 */
+	(0x3fe1e85f5e7040d0, 0x3c7ef62cd2f9f1e3,     0x3fe2492492492492, 0x3c82492492492492), /* j = 24 */
+	(0x3fe2795e1289b11b, 0xbc6487c0c246978e,     0x3fe1f7047dc11f70, 0x3c81f7047dc11f70), /* j = 25 */
+	(0x3fe307d7334f10be, 0x3c6fb590a1f566da,     0x3fe1a7b9611a7b96, 0x3c61a7b9611a7b96), /* j = 26 */
+	(0x3fe393e0d3562a1a, 0xbc858eef67f2483a,     0x3fe15b1e5f75270d, 0x3c415b1e5f75270d), /* j = 27 */
+	(0x3fe41d8fe84672ae, 0x3c89192f30bd1806,     0x3fe1111111111111, 0x3c61111111111111), /* j = 28 */
+	(0x3fe4a4f85db03ebb, 0x3c313dfa3d3761b6,     0x3fe0c9714fbcda3b, 0xbc7f79b47582192e), /* j = 29 */
+	(0x3fe52a2d265bc5ab, 0xbc61883750ea4d0a,     0x3fe0842108421084, 0x3c70842108421084), /* j = 30 */
+	(0x3fe5ad404c359f2d, 0xbc435955683f7196,     0x3fe0410410410410, 0x3c80410410410410), /* j = 31 */
+]
 
-   They are generated by ancillary/log-overkill-constants.c. 
+const C2 : (uint64, uint64, uint64, uint64)[32] = [
+	(000000000000000000, 000000000000000000,     0x3ff0000000000000, 000000000000000000), /* j = 0 */
+	(0x3f4ffc00aa8ab110, 0xbbe0fecbeb9b6cdb,     0x3feff801ff801ff8, 0x3c2ff801ff801ff8), /* j = 1 */
+	(0x3f5ff802a9ab10e6, 0x3bfe29e3a153e3b2,     0x3feff007fc01ff00, 0x3c8ff007fc01ff00), /* j = 2 */
+	(0x3f67f7047d7983da, 0x3c0a275a19204e80,     0x3fefe811f28a186e, 0xbc849093915301bf), /* j = 3 */
+	(0x3f6ff00aa2b10bc0, 0x3c02821ad5a6d353,     0x3fefe01fe01fe020, 0xbc6fe01fe01fe020), /* j = 4 */
+	(0x3f73f38a60f06489, 0x3c1693c937494046,     0x3fefd831c1cdbed1, 0x3c8e89d3b75ace7e), /* j = 5 */
+	(0x3f77ee11ebd82e94, 0xbc161e96e2fc5d90,     0x3fefd04794a10e6a, 0x3c881bd63ea20ced), /* j = 6 */
+	(0x3f7be79c70058ec9, 0xbbd964fefef02b62,     0x3fefc86155aa1659, 0xbc6b8fc468497f61), /* j = 7 */
+	(0x3f7fe02a6b106789, 0xbbce44b7e3711ebf,     0x3fefc07f01fc07f0, 0x3c6fc07f01fc07f0), /* j = 8 */
+	(0x3f81ebde2d1997e6, 0xbbfffa46e1b2ec81,     0x3fefb8a096acfacc, 0xbc82962e18495af3), /* j = 9 */
+	(0x3f83e7295d25a7d9, 0xbbeff29a11443a06,     0x3fefb0c610d5e939, 0xbc5cb8337f41db5c), /* j = 10 */
+	(0x3f85e1f703ecbe50, 0x3c23eb0bb43693b9,     0x3fefa8ef6d92aca5, 0x3c7cd0c1eaba7f22), /* j = 11 */
+	(0x3f87dc475f810a77, 0xbc116d7687d3df21,     0x3fefa11caa01fa12, 0xbc7aaff02f71aaff), /* j = 12 */
+	(0x3f89d61aadc6bd8d, 0xbc239b097b525947,     0x3fef994dc3455e8d, 0xbc82546d9bc5bd59), /* j = 13 */
+	(0x3f8bcf712c74384c, 0xbc1f6842688f499a,     0x3fef9182b6813baf, 0x3c6b210c54d70f4a), /* j = 14 */
+	(0x3f8dc84b19123815, 0xbc25f0e2d267d821,     0x3fef89bb80dcc421, 0xbc8e7da8c7156f9d), /* j = 15 */
+	(0x3f8fc0a8b0fc03e4, 0xbc183092c59642a1,     0x3fef81f81f81f820, 0xbc8f81f81f81f820), /* j = 16 */
+	(0x3f90dc4518afcc88, 0xbc379c0189fdfe78,     0x3fef7a388f9da20f, 0x3c7f99b2c82d3fb1), /* j = 17 */
+	(0x3f91d7f7eb9eebe7, 0xbc2d41fe63d2dbf9,     0x3fef727cce5f530a, 0x3c84643cedd1cfd9), /* j = 18 */
+	(0x3f92d36cefb557c3, 0xbc132fa3e4f20cf7,     0x3fef6ac4d8f95f7a, 0x3c8e8ed9770a8dde), /* j = 19 */
+	(0x3f93cea44346a575, 0xbc10cb5a902b3a1c,     0x3fef6310aca0dbb5, 0x3c8d2e19807d8c43), /* j = 20 */
+	(0x3f94c99e04901ded, 0xbc2cd10505ada0d6,     0x3fef5b60468d989f, 0xbc805a26b4cad717), /* j = 21 */
+	(0x3f95c45a51b8d389, 0xbc3b10b6c3ec21b4,     0x3fef53b3a3fa204e, 0x3c8450467c5430f3), /* j = 22 */
+	(0x3f96bed948d1b7d1, 0xbc1058290fde6de1,     0x3fef4c0ac223b2bc, 0x3c815c2df7afcd24), /* j = 23 */
+	(0x3f97b91b07d5b11b, 0xbc35b602ace3a510,     0x3fef44659e4a4271, 0x3c85fc17734c36b8), /* j = 24 */
+	(0x3f98b31faca9b00e, 0x3c1d5a46da6f6772,     0x3fef3cc435b0713c, 0x3c81d0a7e69ea094), /* j = 25 */
+	(0x3f99ace7551cc514, 0x3c33409c1df8167f,     0x3fef3526859b8cec, 0x3c2f3526859b8cec), /* j = 26 */
+	(0x3f9aa6721ee835aa, 0xbc24a3a50b6c5621,     0x3fef2d8c8b538c0f, 0xbc88a987ac35964a), /* j = 27 */
+	(0x3f9b9fc027af9198, 0xbbf0ae69229dc868,     0x3fef25f644230ab5, 0x3c594ed8175c78b3), /* j = 28 */
+	(0x3f9c98d18d00c814, 0xbc250589df0f25bf,     0x3fef1e63ad57473c, 0xbc8bf54d8dbc6a00), /* j = 29 */
+	(0x3f9d91a66c543cc4, 0xbc1d34e608cbdaab,     0x3fef16d4c4401f17, 0xbc759ddff074959e), /* j = 30 */
+	(0x3f9e8a3ee30cdcac, 0x3c07086b1c00b395,     0x3fef0f4986300ba6, 0xbc811b6b7ee8766a), /* j = 31 */
+]
 
-   Conveniently, for k > 27, we can calculate the entry exactly using a
-   few terms of the Taylor expansion for ln(1 + x), with the x^{2+}
-   terms vanishing past k = 53. This is possible since we only care
-   about two flt64s worth of precision.
- */
-const C_plus : (uint64, uint64, uint64)[28] = [
-	(0x0000000000000000, 0x0000000000000000, 0x0000000000000000), /* dummy */
-	(0x3fd9f323ecbf984c, 0xbc4a92e513217f5c, 0x38e0c0cfa41ff669), /* k = 1 */
-	(0x3fcc8ff7c79a9a22, 0xbc64f689f8434012, 0x390a24ae3b2f53a1), /* k = 2 */
-	(0x3fbe27076e2af2e6, 0xbc361578001e0162, 0x38b55db94ebc4018), /* k = 3 */
-	(0x3faf0a30c01162a6, 0x3c485f325c5bbacd, 0xb8e0ece597165991), /* k = 4 */
-	(0x3f9f829b0e783300, 0x3c333e3f04f1ef23, 0xb8d814544147acc9), /* k = 5 */
-	(0x3f8fc0a8b0fc03e4, 0xbc183092c59642a1, 0xb8b52414fc416fc2), /* k = 6 */
-	(0x3f7fe02a6b106789, 0xbbce44b7e3711ebf, 0x386a567b6587df34), /* k = 7 */
-	(0x3f6ff00aa2b10bc0, 0x3c02821ad5a6d353, 0xb8912dcccb588a4a), /* k = 8 */
-	(0x3f5ff802a9ab10e6, 0x3bfe29e3a153e3b2, 0xb89538d49c4f745e), /* k = 9 */
-	(0x3f4ffc00aa8ab110, 0xbbe0fecbeb9b6cdb, 0xb877171cf29e89d1), /* k = 10 */
-	(0x3f3ffe002aa6ab11, 0x3b999e2b62cc632d, 0xb81eae851c58687c), /* k = 11 */
-	(0x3f2fff000aaa2ab1, 0x3ba0bbc04dc4e3dc, 0x38152723342e000b), /* k = 12 */
-	(0x3f1fff8002aa9aab, 0x3b910e6678af0afc, 0x382ed521af29bc8d), /* k = 13 */
-	(0x3f0fffc000aaa8ab, 0xbba3bbc110fec82c, 0xb84f79185e42fbaf), /* k = 14 */
-	(0x3effffe0002aaa6b, 0xbb953bbbe6661d42, 0xb835071791df7d3e), /* k = 15 */
-	(0x3eeffff0000aaaa3, 0xbb8553bbbd110fec, 0xb82ff1fae6cea01a), /* k = 16 */
-	(0x3edffff80002aaaa, 0xbb75553bbbc66662, 0x3805f05f166325ff), /* k = 17 */
-	(0x3ecffffc0000aaab, 0xbb6d5553bbbc1111, 0x37a380f8138f70f4), /* k = 18 */
-	(0x3ebffffe00002aab, 0xbb5655553bbbbe66, 0xb7f987507707503c), /* k = 19 */
-	(0x3eafffff00000aab, 0xbb45755553bbbbd1, 0xb7c10fec7ed7ec7e), /* k = 20 */
-	(0x3e9fffff800002ab, 0xbb355955553bbbbc, 0xb7d9999875075875), /* k = 21 */
-	(0x3e8fffffc00000ab, 0xbb2555d55553bbbc, 0x37bf7777809c09a1), /* k = 22 */
-	(0x3e7fffffe000002b, 0xbb15556555553bbc, 0x37b106666678af8b), /* k = 23 */
-	(0x3e6ffffff000000b, 0xbb055557555553bc, 0x37a110bbbbbc04e0), /* k = 24 */
-	(0x3e5ffffff8000003, 0xbaf555559555553c, 0x3791110e6666678b), /* k = 25 */
-	(0x3e4ffffffc000001, 0xbae555555d555554, 0x37811110fbbbbbc0), /* k = 26 */
-	(0x3e3ffffffe000000, 0x3ac5555553555556, 0xb76ddddddf333333), /* k = 27 */
+const C3 : (uint64, uint64, uint64, uint64)[32] = [
+	(000000000000000000, 000000000000000000,     0x3ff0000000000000, 000000000000000000), /* j = 0 */
+	(0x3effffe0002aaa6b, 0xbb953bbbe6661d42,     0x3fefffc0007fff00, 0x3c2fffc0007fff00), /* j = 1 */
+	(0x3f0fffc000aaa8ab, 0xbba3bbc110fec82c,     0x3fefff8001fff800, 0x3c6fff8001fff800), /* j = 2 */
+	(0x3f17ffb8011ffaf0, 0x3b984c534f3d9b6a,     0x3fefff40047fe501, 0xbc8780f2fa4e222b), /* j = 3 */
+	(0x3f1fff8002aa9aab, 0x3b910e6678af0afc,     0x3fefff0007ffc002, 0xbbdfff0007ffc002), /* j = 4 */
+	(0x3f23ff9c029a9723, 0x3bc1b965303b23b1,     0x3feffec00c7f8305, 0xbc6e30d217cb1211), /* j = 5 */
+	(0x3f27ff70047fd782, 0xbbced098a5c0aff0,     0x3feffe8011ff280a, 0x3c6f8685b1bbab34), /* j = 6 */
+	(0x3f2bff3c07250a51, 0xbbc89dd6d6bad8c1,     0x3feffe40187ea913, 0xbc7f8346d2208239), /* j = 7 */
+	(0x3f2fff000aaa2ab1, 0x3ba0bbc04dc4e3dc,     0x3feffe001ffe0020, 0xbc2ffe001ffe0020), /* j = 8 */
+	(0x3f31ff5e07979982, 0xbbce0e704817ebcd,     0x3feffdc0287d2733, 0x3c7f32ce6d7c4d43), /* j = 9 */
+	(0x3f33ff380a6a0e74, 0x3bdb81fcb95bc1fe,     0x3feffd8031fc184e, 0x3c69e5fa087756ad), /* j = 10 */
+	(0x3f35ff0e0ddc70a1, 0x3bacf6f3d97a3c05,     0x3feffd403c7acd72, 0x3c860b1b0bacff22), /* j = 11 */
+	(0x3f37fee011febc18, 0x3bd2b9bcf5d3f323,     0x3feffd0047f940a2, 0xbc5e5d274451985a), /* j = 12 */
+	(0x3f39feae16e0ec8b, 0xbbb6137aceeb34b1,     0x3feffcc054776bdf, 0x3c56b1b1f3ed39e8), /* j = 13 */
+	(0x3f3bfe781c92fd4a, 0xbbc4ed10713cc126,     0x3feffc8061f5492c, 0xbc19fd284f974b74), /* j = 14 */
+	(0x3f3dfe3e2324e946, 0x3bc0916462dd5deb,     0x3feffc407072d28b, 0x3c84eb0c748a57ca), /* j = 15 */
+	(0x3f3ffe002aa6ab11, 0x3b999e2b62cc632d,     0x3feffc007ff00200, 0xbc7ffc007ff00200), /* j = 16 */
+	(0x3f40fedf19941e6e, 0xbbb194c2e0aa6338,     0x3feffbc0906cd18c, 0x3c75b11e79f3cd9f), /* j = 17 */
+	(0x3f41febc1e5ccc3c, 0x3bdc657d895d3592,     0x3feffb80a1e93b34, 0xbc84d11299626e29), /* j = 18 */
+	(0x3f42fe9723b55bac, 0x3bd6cfb73e538464,     0x3feffb40b46538fa, 0xbc8d42a81b0bfc39), /* j = 19 */
+	(0x3f43fe7029a5c947, 0xbbd4d578bf46e36a,     0x3feffb00c7e0c4e1, 0x3c7e673fde054f2c), /* j = 20 */
+	(0x3f44fe4730361165, 0x3be400d77e93f2fd,     0x3feffac0dc5bd8ee, 0x3c8a38b2b2aeaf57), /* j = 21 */
+	(0x3f45fe1c376e3031, 0xbbd524eb8a5ae7f6,     0x3feffa80f1d66f25, 0xbc6a5778f73582ce), /* j = 22 */
+	(0x3f46fdef3f5621a3, 0xbbdf09d734886d52,     0x3feffa4108508189, 0xbc81a45478d24a37), /* j = 23 */
+	(0x3f47fdc047f5e185, 0xbbebfa5c57d202d3,     0x3feffa011fca0a1e, 0x3c6a5b0eed338657), /* j = 24 */
+	(0x3f48fd8f51556b70, 0xbbd0f4f2e08fd201,     0x3feff9c1384302e9, 0x3c8b9a1be68cf877), /* j = 25 */
+	(0x3f49fd5c5b7cbace, 0x3beb6ed49f17d42d,     0x3feff98151bb65ef, 0x3c82d92be315df8f), /* j = 26 */
+	(0x3f4afd276673cada, 0x3bd3222545da594f,     0x3feff9416c332d34, 0x3c8dbbba66ae573a), /* j = 27 */
+	(0x3f4bfcf07242969d, 0x3bc5db4d2b3efe1c,     0x3feff90187aa52be, 0xbc698a69b8df8f19), /* j = 28 */
+	(0x3f4cfcb77ef118f1, 0x3becc55406f300fb,     0x3feff8c1a420d091, 0xbc8032d47bdbf02c), /* j = 29 */
+	(0x3f4dfc7c8c874c82, 0xbbe863e9d57a176f,     0x3feff881c196a0b2, 0x3c858cf2f70e18b2), /* j = 30 */
+	(0x3f4efc3f9b0d2bc8, 0x3bd1e8e8a5f5b8b7,     0x3feff841e00bbd28, 0x3c782227ba60dc8b), /* j = 31 */
 ]
 
-const C_minus : (uint64, uint64, uint64)[28] = [
-	(0x0000000000000000, 0x0000000000000000, 0x0000000000000000), /* dummy */
-	(0xbfe62e42fefa39ef, 0xbc7abc9e3b39803f, 0xb907b57a079a1934), /* k = 1 */
-	(0xbfd269621134db92, 0xbc7e0efadd9db02b, 0x39163d5cf0b6f233), /* k = 2 */
-	(0xbfc1178e8227e47c, 0x3c50e63a5f01c691, 0xb8f03c776a3fb0f1), /* k = 3 */
-	(0xbfb08598b59e3a07, 0x3c5dd7009902bf32, 0x38ea7da07274e01d), /* k = 4 */
-	(0xbfa0415d89e74444, 0xbc4c05cf1d753622, 0xb8d3bc1c184cef0a), /* k = 5 */
-	(0xbf90205658935847, 0xbc327c8e8416e71f, 0x38b19642aac1310f), /* k = 6 */
-	(0xbf8010157588de71, 0xbc146662d417ced0, 0xb87e91702f8418af), /* k = 7 */
-	(0xbf70080559588b35, 0xbc1f96638cf63677, 0x38a90badb5e868b4), /* k = 8 */
-	(0xbf60040155d5889e, 0x3be8f98e1113f403, 0x38601ac2204fbf4b), /* k = 9 */
-	(0xbf50020055655889, 0xbbe9abe6bf0fa436, 0x3867c7d335b216f3), /* k = 10 */
-	(0xbf40010015575589, 0x3bec8863f23ef222, 0x38852c36a3d20146), /* k = 11 */
-	(0xbf30008005559559, 0x3bddd332a0e20e2f, 0x385c8b6b9ff05329), /* k = 12 */
-	(0xbf20004001555d56, 0x3bcddd88863f53f6, 0xb859332cbe6e6ac5), /* k = 13 */
-	(0xbf10002000555655, 0xbbb62224ccd5f17f, 0xb8366327cc029156), /* k = 14 */
-	(0xbf00001000155575, 0xbba5622237779c0a, 0xb7d38f7110a9391d), /* k = 15 */
-	(0xbef0000800055559, 0xbb95562222cccd5f, 0xb816715f87b8e1ee), /* k = 16 */
-	(0xbee0000400015556, 0x3b75553bbbb1110c, 0x381fb17b1f791778), /* k = 17 */
-	(0xbed0000200005555, 0xbb79555622224ccd, 0x3805074f75071791), /* k = 18 */
-	(0xbec0000100001555, 0xbb65d55562222377, 0xb80de7027127028d), /* k = 19 */
-	(0xbeb0000080000555, 0xbb5565555622222d, 0x37e9995075035075), /* k = 20 */
-	(0xbea0000040000155, 0xbb45575555622222, 0xb7edddde70270670), /* k = 21 */
-	(0xbe90000020000055, 0xbb35559555562222, 0xb7c266666af8af9b), /* k = 22 */
-	(0xbe80000010000015, 0xbb25555d55556222, 0xb7b11bbbbbce04e0), /* k = 23 */
-	(0xbe70000008000005, 0xbb15555655555622, 0xb7a111666666af8b), /* k = 24 */
-	(0xbe60000004000001, 0xbb05555575555562, 0xb7911113bbbbbce0), /* k = 25 */
-	(0xbe50000002000000, 0xbaf5555559555556, 0xb78111112666666b), /* k = 26 */
-	(0xbe40000001000000, 0xbac5555557555556, 0x376ddddddc888888), /* k = 27 */
+const C4 : (uint64, uint64, uint64, uint64)[32] = [
+	(000000000000000000, 000000000000000000,     0x3ff0000000000000, 000000000000000000), /* j = 0 */
+	(0x3eafffff00000aab, 0xbb45755553bbbbd1,     0x3feffffe00002000, 0xbc2ffffe00002000), /* j = 1 */
+	(0x3ebffffe00002aab, 0xbb5655553bbbbe66,     0x3feffffc00008000, 0xbc5ffffc00008000), /* j = 2 */
+	(0x3ec7fffdc0004800, 0xbb343ffcf666dfe6,     0x3feffffa00012000, 0xbc7afffaf000f300), /* j = 3 */
+	(0x3ecffffc0000aaab, 0xbb6d5553bbbc1111,     0x3feffff800020000, 0xbc8ffff800020000), /* j = 4 */
+	(0x3ed3fffce000a6ab, 0xbb7f1952e455f818,     0x3feffff600031fff, 0x3c4801387f9e581f), /* j = 5 */
+	(0x3ed7fffb80012000, 0xbb743ff9ecceb2cc,     0x3feffff400047ffe, 0x3c8400287ff0d006), /* j = 6 */
+	(0x3edbfff9e001c955, 0xbb702e9d89490dc5,     0x3feffff200061ffd, 0x3c84804b07df2c8e), /* j = 7 */
+	(0x3edffff80002aaaa, 0xbb75553bbbc66662,     0x3feffff00007fffc, 0x3baffff00007fffc), /* j = 8 */
+	(0x3ee1fffaf001e5ff, 0x3b797c2e21b72cff,     0x3fefffee000a1ffa, 0x3c8380cd078cabc1), /* j = 9 */
+	(0x3ee3fff9c0029aa9, 0x3b8c8ad1ba965260,     0x3fefffec000c7ff8, 0x3c780270fe7960f4), /* j = 10 */
+	(0x3ee5fff870037754, 0xbb8d0c6bc1b51bdd,     0x3fefffea000f1ff6, 0xbc897e36793a8ca8), /* j = 11 */
+	(0x3ee7fff700047ffd, 0x3b8e006132f6735a,     0x3fefffe80011fff3, 0xbc8ffd7801e5fe94), /* j = 12 */
+	(0x3ee9fff57005b8a7, 0x3b771277672a8835,     0x3fefffe600151fef, 0xbc74f906f5aa5866), /* j = 13 */
+	(0x3eebfff3c0072551, 0xbb86c9d894dd7427,     0x3fefffe400187feb, 0xbc8bfb4f841a6c69), /* j = 14 */
+	(0x3eedfff1f008c9fa, 0xbb7701aebecf7ae4,     0x3fefffe2001c1fe6, 0xbc8779d1fdcb2212), /* j = 15 */
+	(0x3eeffff0000aaaa3, 0xbb8553bbbd110fec,     0x3fefffe0001fffe0, 0x3befffe0001fffe0), /* j = 16 */
+	(0x3ef0fff6f80665a6, 0xbb9b95400571451d,     0x3fefffde00241fda, 0xbc8875ce02d51cfe), /* j = 17 */
+	(0x3ef1fff5e00797fa, 0xbb9a0e8ef2f395cc,     0x3fefffdc00287fd2, 0x3c8c0cd071958038), /* j = 18 */
+	(0x3ef2fff4b808ee4d, 0x3b984638f069f71f,     0x3fefffda002d1fca, 0x3c8a8fe8751bf4ef), /* j = 19 */
+	(0x3ef3fff3800a6aa1, 0xbb794b915f81751a,     0x3fefffd80031ffc2, 0xbc8fec781869e17c), /* j = 20 */
+	(0x3ef4fff2380c0ef4, 0x3b80a43b5348b6b9,     0x3fefffd600371fb8, 0xbc8668429728999b), /* j = 21 */
+	(0x3ef5fff0e00ddd47, 0x3b624a1f136cc09c,     0x3fefffd4003c7fad, 0xbc77c6cf4ea2f3e0), /* j = 22 */
+	(0x3ef6ffef780fd79a, 0xbb9a716c4210cdd0,     0x3fefffd200421fa1, 0xbc5aeeb948d5a74d), /* j = 23 */
+	(0x3ef7ffee0011ffec, 0xbb8ff3d9a8c98613,     0x3fefffd00047ff94, 0x3c143fe1a02d8fbc), /* j = 24 */
+	(0x3ef8ffec7814583d, 0x3b9f7bc8a4e1db73,     0x3fefffce004e1f86, 0xbc6141450a04205a), /* j = 25 */
+	(0x3ef9ffeae016e28f, 0xbb8cb889999dd735,     0x3fefffcc00547f77, 0xbc83c837daa53cb3), /* j = 26 */
+	(0x3efaffe93819a0e0, 0xbb9be60d8a0b5ba8,     0x3fefffca005b1f66, 0x3c7d81be350f0677), /* j = 27 */
+	(0x3efbffe7801c9530, 0xbb873b12aed4646c,     0x3fefffc80061ff55, 0xbc8fb4f8834d1a39), /* j = 28 */
+	(0x3efcffe5b81fc17f, 0x3b9fe950a87b2785,     0x3fefffc600691f41, 0x3c8dd655eb844520), /* j = 29 */
+	(0x3efdffe3e02327cf, 0xbb9bfd7604f796f2,     0x3fefffc400707f2d, 0x3c618b7f1a71ae6b), /* j = 30 */
+	(0x3efeffe1f826ca1d, 0xbb60ae99630e566e,     0x3fefffc200781f17, 0x3c80f0bb2d9557af), /* j = 31 */
 ]
 
 const logoverkill32 = {x : flt32
@@ -140,7 +198,13 @@
 const logoverkill64 = {x : flt64
 	var xn, xe, xs
 	(xn, xe, xs) = std.flt64explode(x)
+	var xsf = std.flt64assem(false, 0, xs)
+	var xb = std.flt64bits(x)
+	var tn, te, ts
+	var t1, t2
 
+var DEBUG=(xb==0x3fe26c01523f67a8)
+
 	/* Special cases */
 	if std.isnan(x)
 		-> (std.flt64frombits(0x7ff8000000000000), std.flt64frombits(0x7ff8000000000000))
@@ -154,190 +218,280 @@
 		-> (std.flt64frombits(0x7ff8000000000000), std.flt64frombits(0x7ff8000000000000))
 	;;
 
-	/*
-	   Deal with 2^xe up front: multiply xe by a high-precision log(2). We'll
-	   add them back in to the giant mess of tis later.
-	 */
-	var xef : flt64 = (xe : flt64)
-	var log_2_hi, log_2_lo
-	(log_2_hi, log_2_lo) = accurate_logs64[128] /* See log-impl.myr */
-	var lxe_1, lxe_2, lxe_3, lxe_4
-	(lxe_1, lxe_2) = two_by_two64(xef, std.flt64frombits(log_2_hi))
-	(lxe_3, lxe_4) = two_by_two64(xef, std.flt64frombits(log_2_lo))
+if DEBUG; std.put("x = flt64fromuint64(0x{w=16,p=0,x});\n", xb) ;;
+if DEBUG; std.put("xe = {};\n", xe) ;;
+if DEBUG; std.put("xs = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(std.flt64assem(false, 0, xs))) ;;
 
-	/*
-	   We split t into three parts, so that we can gradually build up two
-	   flt64s worth of information
-	 */
-	var t1 = 0.0
-	var t2 = 0.0
-	var t3 = 0.0
+	var shift = 0
+	var then_invert = false
+	var j1, F1, f1, logF1_hi, logF1_lo, F1_inv_hi, F1_inv_lo, fF1_hi, fF1_lo
 
-	/*
-	   We also split lprime. 
-	 */
-	var lprime1 
-	var lprime2 
-	(lprime1, lprime2) = slow2sum(std.flt64assem(false, 1, xs), -2.0)
-	var lprime3 = 0.0
-
-	for var k = 1; k <= 107; ++k
-		/* Calculate d_k and some quanitites for iteration */
-		var d = 0.0
-		var ln_hi : flt64, ln_mi : flt64, ln_lo : flt64
-
+	/* F1 */
+	if xe == -1 && xs > 0x001d1ff05e41cfba
 		/*
-		   Note the truncation method for [Mul16] is for signed-digit systems,
-		   which we don't have. This comparison follows from the remark following
-		   (8.23), though.
+		   If we reduced to [1, 2) unconditionally, then values of x like 0.999… =
+		   2^-1 · 1.999… would cause subtractive cancellation; we'd compute
+		   log(1.999…), then subtract out log(2) at the end. They'd agree on the
+		   first n bits, and we'd lose n bits of precision.
+
+		   This is only a problem for exponent -1, and for xs large enough;
+		   outside that, the numbers are so different that we won't lose precision
+		   by cancelling. But here, we compute 1/x, proceed (with exponent 0), and
+		   flip all the signs at the end.
 		 */
-		if lprime1 <= -0.5
-			d = 1.0
-			(ln_hi, ln_mi, ln_lo) = get_C_plus(k)
-		elif lprime1 < 1.0
-			d = 0.0
+if DEBUG; std.put("/* inversion case */\n") ;;
+		xe = 0
+if DEBUG; std.put("xe = {};\n", xe) ;;
+		var xinv_hi = 1.0 / x
+		var xinv_lo = fma64(-1.0 * xinv_hi, x, 1.0) / x
+if DEBUG; std.put("xinv_hi = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(xinv_hi)) ;;
+if DEBUG; std.put("xinv_lo = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(xinv_lo)) ;;
+		(tn, te, ts) = std.flt64explode(xinv_hi)
+		shift = ((47 >= te) : uint64) * ((47 - te) : uint64) + ((47 < te) : uint64) * 64
+if DEBUG; std.put("shift = {};\n", shift) ;;
+if DEBUG; std.put("ts = 0b{w=64,p=0,b=2};\n", ts) ;;
+		j1 = (ts >> shift) & 0x1f
+if DEBUG; std.put("j1 = {};\n", j1) ;;
+		var F1m1 = scale2((j1 : flt64), -5)
+		F1 = 1.0 + F1m1
+if DEBUG; std.put("F1 = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(F1)) ;;
+		var f1_hi, f1_lo
+		(f1_hi, f1_lo) = fast2sum(xinv_hi - F1, xinv_lo)
+if DEBUG; std.put("f1    = 0.0;\n") ;;
+if DEBUG; std.put("f1_hi = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(f1_hi)) ;;
+if DEBUG; std.put("f1_lo = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(f1_lo)) ;;
+		(logF1_hi, logF1_lo, F1_inv_hi, F1_inv_lo) = C1[j1]
+if DEBUG; std.put("logF1_hi = flt64fromuint64(0x{w=16,p=0,x});\n", logF1_hi) ;;
+if DEBUG; std.put("logF1_lo = flt64fromuint64(0x{w=16,p=0,x});\n", logF1_lo) ;;
 
-			/* In this case, t_n is unchanged, and we just scale lprime by 2 */
-			lprime1 = lprime1 * 2.0
-			lprime2 = lprime2 * 2.0
-			lprime3 = lprime3 * 2.0
+		/* Compute 1 + f1/F1 */
+		(fF1_hi, fF1_lo) = two_by_two64(f1_hi, std.flt64frombits(F1_inv_hi))
+		(fF1_lo, t1) = slow2sum(fF1_lo, f1_lo * std.flt64frombits(F1_inv_hi))
+		(fF1_lo, t2) = slow2sum(fF1_lo, f1_hi * std.flt64frombits(F1_inv_lo))
+		(fF1_hi, fF1_lo) = fast2sum(fF1_hi, fF1_lo + (t1 + t2))
+		then_invert = true
+	else
+if DEBUG; std.put("/* normal case */\n") ;;
+		j1 = (xs & 0x000f800000000000) >> 47
+if DEBUG; std.put("j1 = {};\n", j1) ;;
+		F1 = std.flt64assem(false, 0, xs & 0xffff800000000000)
+if DEBUG; std.put("F1 = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(F1)) ;;
+		f1 = xsf - F1
+if DEBUG; std.put("f1     = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(f1)) ;;
+if DEBUG; std.put("f1_hi = 0.0;\n") ;;
+if DEBUG; std.put("f1_lo = 0.0;\n") ;;
+		(logF1_hi, logF1_lo, F1_inv_hi, F1_inv_lo) = C1[j1]
+if DEBUG; std.put("logF1_hi = flt64fromuint64(0x{w=16,p=0,x});\n", logF1_hi) ;;
+if DEBUG; std.put("logF1_lo = flt64fromuint64(0x{w=16,p=0,x});\n", logF1_lo) ;;
+if DEBUG; std.put("F1_inv_hi = flt64fromuint64(0x{w=16,p=0,x});\n", F1_inv_hi) ;;
+if DEBUG; std.put("F1_inv_lo = flt64fromuint64(0x{w=16,p=0,x});\n", F1_inv_lo) ;;
 
-			/*
-			   If you're looking for a way to speed this up, calculate how many k we
-			   can skip here, preferably by a lookup table.
-			 */
-			continue
-		else
-			d = -1.0
-			(ln_hi, ln_mi, ln_lo) = get_C_minus(k)
-		;;
+		/* Compute 1 + f1/F1 */
+		(fF1_hi, fF1_lo) = two_by_two64(f1, std.flt64frombits(F1_inv_hi))
+		(fF1_lo, t1) = slow2sum(fF1_lo, f1 * std.flt64frombits(F1_inv_lo))
+		(fF1_hi, fF1_lo) = fast2sum(fF1_hi, fF1_lo)
+		fF1_lo += t1
+if DEBUG; std.put("fF1_hi = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(fF1_hi)) ;;
+if DEBUG; std.put("fF1_lo = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(fF1_lo)) ;;
+	;;
 
-		/* t_{n + 1} */
-		(t1, t2, t3) = foursum(t1, t2, t3, -1.0 * ln_hi)
-		(t1, t2, t3) = foursum(t1, t2, t3, -1.0 * ln_mi)
-		(t1, t2, t3) = foursum(t1, t2, t3, -1.0 * ln_lo)
+	/* F2 */
+	(tn, te, ts) = std.flt64explode(fF1_hi)
+if DEBUG; std.put("te = {};\n", te) ;;
+if DEBUG; std.put("ts = 0b{w=62,p=0,b=2};\n", ts) ;;
+	shift = ((42 >= te) : uint64) * ((42 - te) : uint64) + ((42 < te) : uint64) * 64
+	var j2 = (ts >> shift) & 0x1f
+if DEBUG; std.put("j2 = {};\n", j2) ;;
+	var F2m1 = scale2((j2 : flt64), -10)
+	var F2 = 1.0 + F2m1
+if DEBUG; std.put("F2 = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(F2)) ;;
+	var f2_hi, f2_lo
+	(f2_hi, f2_lo) = fast2sum(fF1_hi - F2m1, fF1_lo)
+if DEBUG; std.put("f2_hi = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(f2_hi)) ;;
+if DEBUG; std.put("f2_lo = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(f2_lo)) ;;
+	var logF2_hi, logF2_lo, F2_inv_hi, F2_inv_lo
+	(logF2_hi, logF2_lo, F2_inv_hi, F2_inv_lo) = C2[j2]
+if DEBUG; std.put("logF2_hi = flt64fromuint64(0x{w=16,p=0,x});\n", logF2_hi) ;;
+if DEBUG; std.put("logF2_lo = flt64fromuint64(0x{w=16,p=0,x});\n", logF2_lo) ;;
 
-		/* lprime_{n + 1} */
-		lprime1 *= 2.0
-		lprime2 *= 2.0
-		lprime3 *= 2.0
+	/* Compute 1 + f2/F2 */
+	var fF2_hi, fF2_lo
+	(fF2_hi, fF2_lo) = two_by_two64(f2_hi, std.flt64frombits(F2_inv_hi))
+	(fF2_lo, t1) = slow2sum(fF2_lo, f2_lo * std.flt64frombits(F2_inv_hi))
+	(fF2_lo, t2) = slow2sum(fF2_lo, f2_hi * std.flt64frombits(F2_inv_lo))
+	(fF2_hi, fF2_lo) = fast2sum(fF2_hi, fF2_lo + (t1 + t2))
+if DEBUG; std.put("fF2_hi = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(fF2_hi)) ;;
+if DEBUG; std.put("fF2_lo = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(fF2_lo)) ;;
 
-		var lp1m = d * scale2(lprime1, -k)
-		var lp2m = d * scale2(lprime2, -k)
-		var lp3m = d * scale2(lprime3, -k)
-		(lprime1, lprime2, lprime3) = foursum(lprime1, lprime2, lprime3, lp1m)
-		(lprime1, lprime2, lprime3) = foursum(lprime1, lprime2, lprime3, lp2m)
-		(lprime1, lprime2, lprime3) = foursum(lprime1, lprime2, lprime3, lp3m)
-		(lprime1, lprime2, lprime3) = foursum(lprime1, lprime2, lprime3, 2.0 * d)
-	;;
+	/* F3 (just like F2) */
+	(tn, te, ts) = std.flt64explode(fF2_hi)
+	shift = ((37 >= te) : uint64) * ((37 - te) : uint64) + ((37 < te) : uint64) * 64
+	var j3 = (ts >> shift) & 0x1f
+	var F3m1 = scale2((j3 : flt64), -15)
+	var F3 = 1.0 + F3m1
+if DEBUG; std.put("F3 = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(F3)) ;;
+	var f3_hi, f3_lo
+	(f3_hi, f3_lo) = fast2sum(fF2_hi - F3m1, fF2_lo)
+if DEBUG; std.put("f3_hi = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(f3_hi)) ;;
+if DEBUG; std.put("f3_lo = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(f3_lo)) ;;
+	var logF3_hi, logF3_lo, F3_inv_hi, F3_inv_lo
+	(logF3_hi, logF3_lo, F3_inv_hi, F3_inv_lo) = C3[j3]
+if DEBUG; std.put("logF3_hi = flt64fromuint64(0x{w=16,p=0,x});\n", logF3_hi) ;;
+if DEBUG; std.put("logF3_lo = flt64fromuint64(0x{w=16,p=0,x});\n", logF3_lo) ;;
 
-	var l : flt64[:] = [t1, t2, t3, lxe_1, lxe_2, lxe_3, lxe_4][:]
-	std.sort(l, mag_cmp64)
-	-> double_compensated_sum(l)
-}
+	/* Compute 1 + f3/F3 */
+	var fF3_hi, fF3_lo
+	(fF3_hi, fF3_lo) = two_by_two64(f3_hi, std.flt64frombits(F3_inv_hi))
+	(fF3_lo, t1) = slow2sum(fF3_lo, f3_lo * std.flt64frombits(F3_inv_hi))
+	(fF3_lo, t2) = slow2sum(fF3_lo, f3_hi * std.flt64frombits(F3_inv_lo))
+	(fF3_hi, fF3_lo) = fast2sum(fF3_hi, fF3_lo + (t1 + t2))
+if DEBUG; std.put("fF3_hi = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(fF3_hi)) ;;
+if DEBUG; std.put("fF3_lo = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(fF3_lo)) ;;
 
-/* significand for 1/3 (if you reconstruct without compensating, you get 4/3) */
-const one_third_sig = 0x0015555555555555
+	/* F4 (just like F2) */
+	(tn, te, ts) = std.flt64explode(fF3_hi)
+	shift = ((32 >= te) : uint64) * ((32 - te) : uint64) + ((32 < te) : uint64) * 64
+	var j4 = (ts >> shift) & 0x1f
+	var F4m1 = scale2((j4 : flt64), -20)
+	var F4 = 1.0 + F4m1
+if DEBUG; std.put("F4 = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(F4)) ;;
+	var f4_hi, f4_lo
+	(f4_hi, f4_lo) = fast2sum(fF3_hi - F4m1, fF3_lo)
+if DEBUG; std.put("f4_hi = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(f4_hi)) ;;
+if DEBUG; std.put("f4_lo = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(f4_lo)) ;;
+	var logF4_hi, logF4_lo, F4_inv_hi, F4_inv_lo
+	(logF4_hi, logF4_lo, F4_inv_hi, F4_inv_lo) = C4[j4]
+if DEBUG; std.put("logF4_hi = flt64fromuint64(0x{w=16,p=0,x});\n", logF4_hi) ;;
+if DEBUG; std.put("logF4_lo = flt64fromuint64(0x{w=16,p=0,x});\n", logF4_lo) ;;
 
-/* and for 1/5 (if you reconstruct, you get 8/5) */
-const one_fifth_sig = 0x001999999999999a
+	/* Compute 1 + f4/F4 */
+	var fF4_hi, fF4_lo
+	(fF4_hi, fF4_lo) = two_by_two64(f4_hi, std.flt64frombits(F4_inv_hi))
+	(fF4_lo, t1) = slow2sum(fF4_lo, f4_lo * std.flt64frombits(F4_inv_hi))
+	(fF4_lo, t2) = slow2sum(fF4_lo, f4_hi * std.flt64frombits(F4_inv_lo))
+	(fF4_hi, fF4_lo) = fast2sum(fF4_hi, fF4_lo + (t1 + t2))
+if DEBUG; std.put("fF4_hi = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(fF4_hi)) ;;
+if DEBUG; std.put("fF4_lo = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(fF4_lo)) ;;
 
-/*
-   These calculations are incredibly slow. Somebody should speed them up.
- */
-const get_C_plus = {k : int64
-	if k < 0
-		-> (0.0, 0.0, 0.0)
-	elif k < 28
-		var t1, t2, t3
-		(t1, t2, t3) = C_plus[k]
-		-> (std.flt64frombits(t1), std.flt64frombits(t2), std.flt64frombits(t3))
-	elif k < 36
-		var t1 = std.flt64assem(false, -k, 1 << 53)             /* x [ = 2^-k ] */
-		var t2 = std.flt64assem(true, -2*k - 1, 1 << 53)        /* -x^2 / 2     */
-		var t3 = std.flt64assem(false, -3*k - 2, one_third_sig) /*  x^3 / 3     */
-		var t4 = std.flt64assem(true, -4*k - 2, 1 << 53)        /* -x^4 / 4     */
-		var t5 = std.flt64assem(false, -5*k - 3, one_fifth_sig) /*  x^5 / 5     */
-		-> fast_fivesum(t1, t2, t3, t4, t5)
-	elif k < 54
-		var t1 = std.flt64assem(false, -k, 1 << 53)             /* x [ = 2^-k ] */
-		var t2 = std.flt64assem(true, -2*k - 1, 1 << 53)        /* -x^2 / 2     */
-		var t3 = std.flt64assem(false, -3*k - 2, one_third_sig) /*  x^3 / 3     */
-		var t4 = std.flt64assem(true, -4*k - 2, 1 << 53)        /* -x^4 / 4     */
-		-> fast_foursum(t1, t2, t3, t4)
-	else
-		var t1 = std.flt64assem(false, -k, 1 << 53)             /* x [ = 2^-k ] */
-		var t2 = std.flt64assem(true, -2*k - 1, 1 << 53)        /* -x^2 / 2     */
-		var t3 = std.flt64assem(false, -3*k - 2, one_third_sig) /*  x^3 / 3     */
-		-> (t1, t2, t3)
-	;;
-}
+	/*
+	   L = log(1 + f4/F4); we'd like to use horner_polyu, but since we have
+	   _hi and _lo, it becomes more complicated.
+	 */
+	var L_hi, L_lo
+	/* r = (1/5) · x */
+	(L_hi, L_lo) = hl_mult(std.flt64frombits(0x3fc999999999999a), std.flt64frombits(0xbc6999999999999a), fF4_hi, fF4_lo)
 
-const get_C_minus = {k : int64
-	if k < 0
-		-> (0.0, 0.0, 0.0)
-	elif k < 28
-		var t1, t2, t3
-		(t1, t2, t3) = C_minus[k]
-		-> (std.flt64frombits(t1), std.flt64frombits(t2), std.flt64frombits(t3))
-	elif k < 36
-		var t1 = std.flt64assem(true, -k, 1 << 53)              /* x [ = 2^-k ] */
-		var t2 = std.flt64assem(true, -2*k - 1, 1 << 53)        /* -x^2 / 2     */
-		var t3 = std.flt64assem(true, -3*k - 2, one_third_sig)  /*  x^3 / 3     */
-		var t4 = std.flt64assem(true, -4*k - 2, 1 << 53)        /* -x^4 / 4     */
-		var t5 = std.flt64assem(true, -5*k - 3, one_fifth_sig)  /*  x^5 / 5     */
-		-> fast_fivesum(t1, t2, t3, t4, t5)
-	elif k < 54
-		var t1 = std.flt64assem(true, -k, 1 << 53)              /* x [ = 2^-k ] */
-		var t2 = std.flt64assem(true, -2*k - 1, 1 << 53)        /* -x^2 / 2     */
-		var t3 = std.flt64assem(true, -3*k - 2, one_third_sig)  /*  x^3 / 3     */
-		var t4 = std.flt64assem(true, -4*k - 2, 1 << 53)        /* -x^4 / 4     */
-		-> fast_foursum(t1, t2, t3, t4)
-	else
-		var t1 = std.flt64assem(true, -k, 1 << 53)              /* x [ = 2^-k ] */
-		var t2 = std.flt64assem(true, -2*k - 1, 1 << 53)        /* -x^2 / 2     */
-		var t3 = std.flt64assem(true, -3*k - 2, one_third_sig)  /*  x^3 / 3     */
-		-> (t1, t2, t3)
-	;;
-}
+	/* r = r - 1/4 */
+	(t1, t2) = fast2sum(std.flt64frombits(0xbfd0000000000000), L_lo)
+	(L_hi, L_lo) = fast2sum(t1, L_hi)
+	L_lo += t2
 
-const foursum = {a1 : flt64, a2 : flt64, a3 : flt64, x : flt64
-	var t1, t2, t3, t4, t5, t6, s1, s2, s3, s4
+	/* r = r · x */
+	(L_hi, L_lo) = hl_mult(L_hi, L_lo, fF4_hi, fF4_lo)
 
-	(t5, t6) = slow2sum(a3, x)
-	(t3, t4) = slow2sum(a2, t5)
-	(t1, t2) = slow2sum(a1, t3)
-	(s3, s4) = slow2sum(t4, t6)
-	(s1, s2) = slow2sum(t2, s3)
+	/* r = r + 1/3 */
+	(L_hi, L_lo) = hl_add(std.flt64frombits(0x3fd5555555555555), std.flt64frombits(0x3c75555555555555), L_hi, L_lo)
 
-	-> (t1, s1, s2 + s4)
-}
+	/* r = r · x */
+	(L_hi, L_lo) = hl_mult(L_hi, L_lo, fF4_hi, fF4_lo)
 
-/*
-   Specifically for use in get_C_{plus,minus}, in which we know the
-   magnitude orders of the ais.
- */
-const fast_foursum = {a1 : flt64, a2 : flt64, a3 : flt64, a4 : flt64
-	(a3, a4) = fast2sum(a3, a4)
-	(a2, a3) = fast2sum(a2, a3)
-	(a1, a2) = fast2sum(a1, a2)
+	/* r = r - 1/2 */
+	(t1, t2) = fast2sum(std.flt64frombits(0xbfe0000000000000), L_lo)
+	(L_hi, L_lo) = fast2sum(t1, L_hi)
+	L_lo += t2
+
+	/* r = r · x */
+	(L_hi, L_lo) = hl_mult(L_hi, L_lo, fF4_hi, fF4_lo)
+
+	/* r = r + 1 */
+	(t1, t2) = fast2sum(1.0, L_lo)
+	(L_hi, L_lo) = fast2sum(t1, L_hi)
+	L_lo += t2
+	/* r = r · x */
+	(L_hi, L_lo) = hl_mult(L_hi, L_lo, fF4_hi, fF4_lo)
+if DEBUG; std.put("L_hi = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(L_hi)) ;;
+if DEBUG; std.put("L_lo = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(L_lo)) ;;
+
+	/*
+	   Finally, compute log(F1) + log(F2) + log(F3) + log(F4) + L. We may
+	   assume F1 > F2 > F3 > F4 > F5, since the only way this is disrupted is
+	   if some Fi == 1.0, in which case the log is 0 and the fast2sum works
+	   out either way. We can also assume each F1,2,3 > L. 
+	 */
+	var lsig_hi, lsig_lo
+	
+	/* log(F4) + L, slow because they're the same order of magnitude */
+	(t1, t2) = slow2sum(std.flt64frombits(logF4_lo), L_lo)
+	(lsig_lo, t1) = slow2sum(L_hi, t1)
+	(lsig_hi, lsig_lo) = slow2sum(std.flt64frombits(logF4_hi), lsig_lo)
 
-	(a3, a4) = slow2sum(a3, a4)
-	(a2, a3) = slow2sum(a2, a3)
+	(lsig_hi, lsig_lo) = hl_add(std.flt64frombits(logF3_hi), std.flt64frombits(logF3_lo), lsig_hi, lsig_lo + (t1 + t2))
+	(lsig_hi, lsig_lo) = hl_add(std.flt64frombits(logF2_hi), std.flt64frombits(logF2_lo), lsig_hi, lsig_lo)
+	(lsig_hi, lsig_lo) = hl_add(std.flt64frombits(logF1_hi), std.flt64frombits(logF1_lo), lsig_hi, lsig_lo)
+if DEBUG; std.put("lsig_hi = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(lsig_hi)) ;;
+if DEBUG; std.put("lsig_lo = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(lsig_lo)) ;;
 
-	-> (a1, a2, a3 + a4)
+	/* Oh yeah, and we need xe * log(2) */
+	var xel2_hi, xel2_lo, lx_hi, lx_lo
+	(xel2_hi, xel2_lo) = hl_mult((xe : flt64), 0.0, std.flt64frombits(0x3fe62e42fefa39ef), std.flt64frombits(0x3c7abc9e3b39803f))
+if DEBUG; std.put("xel2_hi = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(xel2_hi)) ;;
+if DEBUG; std.put("xel2_lo = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(xel2_lo)) ;;
+
+
+	(t1, t2) = slow2sum(xel2_lo, lsig_lo)
+	(lx_lo, t1) = slow2sum(lsig_hi, t1)
+	(lx_hi, lx_lo) = slow2sum(xel2_hi, lx_lo)
+	(lx_hi, lx_lo) = slow2sum(lx_hi, lx_lo + (t1 + t2))
+if DEBUG; std.put("lx_hi = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(lx_hi)) ;;
+if DEBUG; std.put("lx_lo = flt64fromuint64(0x{w=16,p=0,x});\n", std.flt64bits(lx_lo)) ;;
+
+	if then_invert
+		-> (-1.0 * lx_hi, -1.0 * lx_lo)
+	;;
+
+	-> (lx_hi, lx_lo)
 }
 
-const fast_fivesum = {a1 : flt64, a2 : flt64, a3 : flt64, a4 : flt64, a5 : flt64
-	(a4, a5) = fast2sum(a4, a5)
-	(a3, a4) = fast2sum(a3, a4)
-	(a2, a3) = fast2sum(a2, a3)
-	(a1, a2) = fast2sum(a1, a2)
+const hl_mult = {a_h : flt64, a_l : flt64, b_h : flt64, b_l : flt64
+	/*
+	       [     a_h    ][     a_l    ] * [     b_h    ][     b_l    ]
+	         =
+	   (A) [          a_h*b_h         ]
+	   (B)   +           [          a_h*b_l         ]
+	   (C)   +           [          a_l*b_h         ]
+	   (D)   +                         [          a_l*b_l         ]
 
-	(a4, a5) = slow2sum(a4, a5)
-	(a3, a4) = slow2sum(a3, a4)
-	(a2, a3) = slow2sum(a2, a3)
+	   We therefore want to keep all of A, and the top halves of the two
+	   smaller products B and C.
 
-	(a4, a5) = slow2sum(a4, a5)
-	-> (a1, a2, a3 + a4)
+	   To be pedantic, *_l could be much smaller than pictured above; *_h and
+	   *_l need not butt up right against each other. But that won't cause
+	   any problems; there's no way we'll ignore important information.
+	 */
+	var Ah, Al
+	(Ah, Al) = two_by_two64(a_h, b_h)
+	var Bh = a_h * b_l
+	var Ch = a_l * b_h
+	var resh, resl, t1, t2
+	(resh, resl) = slow2sum(Bh, Ch)
+	(resl, t1) = fast2sum(Al, resl)
+	(resh, t2) = slow2sum(resh, resl)
+	(resh, resl) = slow2sum(Ah, resh)
+	-> (resh, resl + (t1 + t2))
+}
+
+const hl_add = {a_h : flt64, a_l : flt64, b_h : flt64, b_l : flt64
+	/*
+	  Not terribly clever, we just chain a couple of 2sums together. We are
+	  free to impose the requirement that |a_h| > |b_h|, because we'll only
+	  be using this for a = 1/5, 1/3, and the log(Fi)s from the C tables.
+	  However, we can't guarantee that a_l > b_l. For example, compare C1[10]
+	  to C2[18].
+	 */
+
+	var resh, resl, t1, t2
+	(t1, t2) = slow2sum(a_l, b_l)
+	(resl, t1) = slow2sum(b_h, t1)
+	(resh, resl) = fast2sum(a_h, resl)
+	-> (resh, resl + (t1 + t2))
 }
--- a/lib/math/test/log-overkill.myr
+++ b/lib/math/test/log-overkill.myr
@@ -142,7 +142,10 @@
 		(0x3f974b5311aeae57, 0xc00e4420e231d7f0, 0xbc9d614ed9b94484),
 		(0x3fe28aed659dab73, 0xbfe1760d162fed7e, 0xbc64a0ff30250148),
 		(0x403273d9892e62d3, 0x40075255633e0533, 0xbc91eb9834046d7b),
+
+		/* This one catches naive catastrophic cancellation */
 		(0x3fee1d239d2061d7, 0xbfaf1ad3961ab8ba, 0xbbc9bff82ae3fde7),
+
 		(0x3fbc0666ebc60265, 0xc001b257198142d0, 0xbca1cf93360a27f6),
 		(0x3f53267a24ceab6a, 0xc01b01c8ad09c3c1, 0xbca0d85af74df975),
 		(0x3fd2005446cb268e, 0xbff44b879f2ec561, 0x3c66e8eff64f40a1),