shithub: mc

ref: 90d9a10c0234f5868c2e86882aae72fc931d53fd
dir: /lib/math/sin-impl.myr/

View raw version
use std

use "fpmath"
use "fma-impl"
use "scale2-impl"
use "sum-impl"
use "util"

/*
   This implementation of sin and cos uses the "Highly Accurate
   Tables" method of [GB91]. It's not as fast as it could be (the
   sorting and excessive summation really takes a toll), however
   we seem to be consistently within +/-1 ulp of the correct result
   in flt64 case. In flt32 case we are correctly rounded.

   The idea is to tabulate [0, pi/4] at 256 slightly irregular
   intervals xi, with the lucky property that the infinite binary
   expansions of significands of sin(xi) and cos(xi) look like

      / 53 bits \  / many '0's \  / noise \
     1.bbbbbb...bb00000000000...0???????????...

   This allows us to use storage only for a single floating point
   number, but get more than 53 bits of precision.

   Using that, we express x as (N * pi/4) + (xi) + (h), where h is
   tiny. Using identities

      sin(u+v) = sin(u)cos(v) + cos(u)sin(v),
      cos(u+v) = cos(u)cos(v) - sin(u)sin(v),

   we arrive at a sum where every term is known with greater than
   53 bits of precision except for sin(h) and cos(h), which we can
   approximate well.

   As a note, everything is performed in double-precision. Storing
   a second set of tables for single precision constants would
   occupy another 3KiB of memory for a not-very-significant speed
   gain.

   See files pi-constants.c, generate-triples-for-GB91.c, and
   generate-minimax-by-Remez.gp for where the constants come from.

   Note that in a departure from [GB91], we use the smaller degree
   polynomials over a smaller range for the input. This requires
   that we store more C-values for smaller xi values, but makes
   the implementation a bit simpler to read.
 */
pkg math =
	pkglocal const sin32 : (x : flt32 -> flt32)
	pkglocal const sin64 : (x : flt64 -> flt64)

	pkglocal const cos32 : (x : flt32 -> flt32)
	pkglocal const cos64 : (x : flt64 -> flt64)

	pkglocal const sincos32 : (x : flt32 -> (flt32, flt32))
	pkglocal const sincos64 : (x : flt64 -> (flt64, flt64))

	pkglocal const trig_reduce : (x : flt64 -> (int64, flt64, flt64))
	pkglocal const trig_table_approx : (x1 : flt64, C : (uint64, uint64, uint64)[257] -> (uint64, uint64, uint64))
;;

/* Pi/2 in lots of detail, for range reducing sin(2^18) or so */
const pi_over_2 : uint64[4] = [
	0x3ff921fb54442d18,
	0x3c91a62633145c07,
	0xb91f1976b7ed8fbc,
	0x35b4cf98e804177d,
]

/* Pi/4 in lots of detail */
const pi_over_4 : uint64[4] = [
	0x3fe921fb54442d18,
	0x3c81a62633145c07,
	0xb90f1976b7ed8fbc,
	0x35a4cf98e804177d,
]

/* Pre-computed inverses */
const two_over_pi : uint64 = 0x3fe45f306dc9c883 /* 1/(pi/2) */
const oneohtwofour_over_pi : uint64 = 0x40745f306dc9c883 /* 1/(pi/(4 * 256)) */

/*
   Coefficients for minimax, degree 7 polynomials approximating sin
   and cos on [-Pi/1024, Pi/1024] (generated by a Remez algorithm).
 */
const sin_coeff : uint64[8] = [
	0x0000000000000000,
	0x3ff0000000000000,
	0xb8c7400000000000,
	0xbfc5555555555555,
	0x39d0000000000000,
	0x3f81111111111061,
	0xbacc000000000000,
	0xbf2a019fa7ee0417,
]

const cos_coeff : uint64[8] = [
	0x3ff0000000000000,
	0x38c0800000000000,
	0xbfe0000000000000,
	0x39a0000000000000,
	0x3fa55555555553ee,
	0x0000000000000000,
	0xbf56c16b9bfd9fd6,
	0xbbec000000000000,
]

/*
   The Highly Accurate Tables for use in a [GB91]-type algorithm;
   generated by ancillary/generate-triples-for-GB91.c.
 */
const C : (uint64, uint64, uint64)[257] = [
	/*       xi               cos(xi)            sin(xi)      */
	(0x0000000000000000, 0x3ff0000000000000, 0x0000000000000000),
	(0x3f6921fb42e71072, 0x3feffff621622aa5, 0x3f6921f8ad6f8d6f),
	(0x3f7921fb576a8e70, 0x3fefffd8858e80ad, 0x3f7921f1018d5de6),
	(0x3f82d97c6d961293, 0x3fefffa72c98324f, 0x3f82d96afcb3b8a8),
	(0x3f8921fba95a4ba5, 0x3fefff62169765a8, 0x3f8921d251f34230),
	(0x3f8f6a79afcf7cc4, 0x3fefff0943ccb09c, 0x3f8f6a28f137852f),
	(0x3f92d97d6168427b, 0x3feffe9cb42a0249, 0x3f92d9379e0e6011),
	(0x3f95fdbbdcaafdf1, 0x3feffe1c68730a0e, 0x3f95fd4d14eace13),
	(0x3f9921fb47134298, 0x3feffd886087640a, 0x3f992155ea73805c),
	(0x3f9c463ab6204953, 0x3feffce09ce490e2, 0x3f9c454f4439aa60),
	(0x3f9f6a7a230622cf, 0x3feffc251df3604f, 0x3f9f69372b837c03),
	(0x3fa1475c20c40678, 0x3feffb55e4815141, 0x3fa1468532309197),
	(0x3fa2d97c761e7dc7, 0x3feffa72f0045061, 0x3fa2d8656c814503),
	(0x3fa46b9c4167ee58, 0x3feff97c42007eca, 0x3fa46a397ce648ce),
	(0x3fa5fdbbb3fec154, 0x3feff871db006d07, 0x3fa5fc009ce09729),
	(0x3fa78fdba698a573, 0x3feff753bb15f9f8, 0x3fa78dbaad1df29e),
	(0x3fa921fb569e20a0, 0x3feff621e37794e9, 0x3fa91f65f36711fd),
	(0x3faab41b20e054f3, 0x3feff4dc549e4649, 0x3faab101d4af47e4),
	(0x3fac463aa9c96aef, 0x3feff3830f9fe5ee, 0x3fac428cfdc5c35c),
	(0x3fadd85a67cbe296, 0x3feff21614eca1d2, 0x3fadd406ed40d193),
	(0x3faf6a7a3eff7872, 0x3feff09565793013, 0x3faf656e8f97f0f9),
	(0x3fb07e4ceacc9a97, 0x3fefef01028ba740, 0x3fb07b6149c87151),
	(0x3fb1475c76962b85, 0x3fefed58ed6a54be, 0x3fb14400e1aeeb44),
	(0x3fb2106c98abe6cd, 0x3fefeb9d254b1740, 0x3fb20c96690fbe9f),
	(0x3fb2d97c6bdba26e, 0x3fefe9cdad2f1061, 0x3fb2d5207f840524),
	(0x3fb3a28c18e279a9, 0x3fefe7ea85e76da6, 0x3fb39d9ed203bab6),
	(0x3fb46b9c42a3d8e4, 0x3fefe5f3af0a1548, 0x3fb4661187480b43),
	(0x3fb534abcd8d6c89, 0x3fefe3e92c9764ff, 0x3fb52e7708fabcdf),
	(0x3fb5fdbbf6d380b0, 0x3fefe1cafc99687d, 0x3fb5f6d017a62147),
	(0x3fb6c6cbcc2f7248, 0x3fefdf9922e0f7ad, 0x3fb6bf1b46436fc7),
	(0x3fb78fdb8cff3236, 0x3fefdd53a026e6fa, 0x3fb78758587024ce),
	(0x3fb858eb79d953d8, 0x3fefdafa7513ab8e, 0x3fb84f8712f838d4),
	(0x3fb921fb5df13728, 0x3fefd88da3b2cbcb, 0x3fb917a6c5cad0c3),
	(0x3fb9eb0b15ad6e7f, 0x3fefd60d2df8efac, 0x3fb9dfb6d20cd89d),
	(0x3fbab41b0d8e04c2, 0x3fefd3791414a510, 0x3fbaa7b72849583a),
	(0x3fbb7d2b02ec004d, 0x3fefd0d1586ee673, 0x3fbb6fa70acd0c55),
	(0x3fbc463a9c1cda98, 0x3fefce15fde7feac, 0x3fbc3785a525059d),
	(0x3fbd0f4a92881eaf, 0x3fefcb4703aa4711, 0x3fbcff5334552718),
	(0x3fbdd85a7ea6b644, 0x3fefc8646cd750e6, 0x3fbdc70ed631fba7),
	(0x3fbea16a4da0e792, 0x3fefc56e3b81b243, 0x3fbe8eb7fcd470b2),
	(0x3fbf6a7a2082960a, 0x3fefc26471042f37, 0x3fbf564e4de7cd23),
	(0x3fc019c4f4362172, 0x3fefbf470f79208c, 0x3fc00ee89fc60798),
	(0x3fc07e4cb4549583, 0x3fefbc1619c9367a, 0x3fc072a00d3f81a5),
	(0x3fc0e2d4df1f9ef0, 0x3fefb8d18d51928e, 0x3fc0d64dbf2ea47f),
	(0x3fc1475c06879885, 0x3fefb5797828e1da, 0x3fc139f00d3ac360),
	(0x3fc1abe531137149, 0x3fefb20dc250d36d, 0x3fc19d89b968e759),
	(0x3fc2106c256f2705, 0x3fefae8e92bf458c, 0x3fc20116570e32c3),
	(0x3fc274f5eafb17bd, 0x3fefaafbbea74df5, 0x3fc2649aa381628e),
	(0x3fc2d97c80aa0f5b, 0x3fefa7557efae43c, 0x3fc2c81070013fe8),
	(0x3fc33e0462663cd9, 0x3fefa39bacdb1026, 0x3fc32b7bef4456d4),
	(0x3fc3a28c534e14b9, 0x3fef9fce55ed894d, 0x3fc38edbaa59fe3f),
	(0x3fc40714a18f49e2, 0x3fef9bed79763508, 0x3fc3f22fb1298233),
	(0x3fc46b9c38ac6ee7, 0x3fef97f9249e436c, 0x3fc45576b5509b55),
	(0x3fc4d02432e80a59, 0x3fef93f14ed442ec, 0x3fc4b8b1905fbd16),
	(0x3fc534ac0298d5e2, 0x3fef8fd600323753, 0x3fc51bdf7942e899),
	(0x3fc5993416dbe628, 0x3fef8ba736af1693, 0x3fc57f00a065f728),
	(0x3fc5fdbbf1d87ff8, 0x3fef8764fa1887ba, 0x3fc5e2144c8984d0),
	(0x3fc66243dc0ae9e8, 0x3fef830f4a092d65, 0x3fc6451a8808363c),
	(0x3fc6c6cbc286e3f6, 0x3fef7ea629fb13d9, 0x3fc6a8130326d53f),
	(0x3fc72b53b7c96348, 0x3fef7a299bd3df70, 0x3fc70afd9309c784),
	(0x3fc78fdba21869ed, 0x3fef7599a37cdcb3, 0x3fc76dd9e15bdb4a),
	(0x3fc7f4642a07f677, 0x3fef70f63bf58052, 0x3fc7d0a856c8ccd4),
	(0x3fc858eb5b649af0, 0x3fef6c3f7f63a632, 0x3fc83366caea942f),
	(0x3fc8bd736951ad00, 0x3fef6775566b1c57, 0x3fc896172a175d2d),
	(0x3fc921fb399259ea, 0x3fef6297d144aa53, 0x3fc8f8b8223b223a),
	(0x3fc986835ce1c644, 0x3fef5da6ebe94da7, 0x3fc95b4a04783401),
	(0x3fc9eb0b24569abc, 0x3fef58a2b2008d0c, 0x3fc9bdcbe883cc5f),
	(0x3fca4f9323461762, 0x3fef538b1f52fbe6, 0x3fca203e2200e122),
	(0x3fcab41b09c5898b, 0x3fef4e603b080549, 0x3fca82a025ebcacb),
	(0x3fcb18a2e6d4292f, 0x3fef492207941b17, 0x3fcae4f1c649efbf),
	(0x3fcb7d2ae7b9bf9f, 0x3fef43d085cf0736, 0x3fcb4732f2b7a6f7),
	(0x3fcbe1b2cb7b8481, 0x3fef3e6bbc6eba27, 0x3fcba9632f158ca6),
	(0x3fcc463ab6f78496, 0x3fef38f3acd2bb22, 0x3fcc0b8262d9d9f9),
	(0x3fccaac29faeedb5, 0x3fef33685aeb67ba, 0x3fcc6d90473e1ca0),
	(0x3fcd0f4ab0d5a9e9, 0x3fef2dc9c7b77e6a, 0x3fcccf8cc9dfcdca),
	(0x3fcd73d299801a67, 0x3fef2817fb345701, 0x3fcd31775f6e7021),
	(0x3fcdd85a5f03c93d, 0x3fef2252f8ad886d, 0x3fcd934fd0c9eb90),
	(0x3fce3ce2615c01b0, 0x3fef1c7abe28a12c, 0x3fcdf5163efb2fd2),
	(0x3fcea16a34d49aca, 0x3fef168f557f8c38, 0x3fce56ca04ee54fd),
	(0x3fcf05f2396af361, 0x3fef1090bcb17909, 0x3fceb86b43a82046),
	(0x3fcf6a7a34c8a42b, 0x3fef0a7efae01f19, 0x3fcf19f9863cf10b),
	(0x3fcfcf02154123ee, 0x3fef045a14e56969, 0x3fcf7b747f630b74),
	(0x3fd019c51e84ab67, 0x3feefe22087e60d4, 0x3fcfdcdc522606ad),
	(0x3fd04c08e989dfc7, 0x3feef7d6e70399a4, 0x3fd01f17f81c5502),
	(0x3fd07e4ceb775576, 0x3feef178a4618400, 0x3fd04fb80a82fe62),
	(0x3fd0b090f60ca615, 0x3feeeb074a3d9810, 0x3fd0804e15777f94),
	(0x3fd0e2d4c54314ef, 0x3feee482e5663962, 0x3fd0b0d9b95007d9),
	(0x3fd11518d2106a76, 0x3feeddeb6a30657d, 0x3fd0e15b4cec5711),
	(0x3fd1475cc633879f, 0x3feed740e7e7b002, 0x3fd111d25f193a47),
	(0x3fd179a0ce17673e, 0x3feed0835cc79369, 0x3fd1423efcc68ac8),
	(0x3fd1abe4bace84c5, 0x3feec9b2d347a043, 0x3fd172a0dae268af),
	(0x3fd1de28a262f5c2, 0x3feec2cf4cb15d4b, 0x3fd1a2f7f0d5a412),
	(0x3fd2106cb1425d8c, 0x3feebbd8c71a89f1, 0x3fd1d3444b7da503),
	(0x3fd242b0905f83c0, 0x3feeb4cf52e27d33, 0x3fd20385796d7260),
	(0x3fd274f4947838ba, 0x3feeadb2e8897f5d, 0x3fd233bbae3e8ad0),
	(0x3fd2a7386b632300, 0x3feea6839816a5b1, 0x3fd263e67d68a111),
	(0x3fd2d97c9e0be433, 0x3fee9f41524c0663, 0x3fd294064c5a5940),
	(0x3fd30bc07884186a, 0x3fee97ec359da93b, 0x3fd2c41a511fcae1),
	(0x3fd33e046121eaf8, 0x3fee908437cd8074, 0x3fd2f422d00c7726),
	(0x3fd3704855d0fd6b, 0x3fee89095dacc360, 0x3fd3241fa9798465),
	(0x3fd3a28c5798cbdd, 0x3fee817baba342be, 0x3fd35410c0bfc504),
	(0x3fd3d4d04553249a, 0x3fee79db2b58ee3d, 0x3fd383f5d8b1345d),
	(0x3fd407145ea979a2, 0x3fee7227d7cc183e, 0x3fd3b3cf1062e88d),
	(0x3fd43958386293b4, 0x3fee6a61c6350b11, 0x3fd3e39be4473723),
	(0x3fd46b9c38cc0bc5, 0x3fee6288eb9aff42, 0x3fd4135c98341878),
	(0x3fd49de029090fc7, 0x3fee5a9d555912a2, 0x3fd44310da8ddcf9),
	(0x3fd4d024217e697d, 0x3fee529f047e7434, 0x3fd472b8a510e608),
	(0x3fd50267f89f138c, 0x3fee4a8e04a2f573, 0x3fd4a253b2fc94cc),
	(0x3fd534ac160681bb, 0x3fee426a4a0b5efa, 0x3fd4d1e249057326),
	(0x3fd566eff496a5e0, 0x3fee3a33ef471d66, 0x3fd50163cbe183bf),
	(0x3fd59933ebfe75ec, 0x3fee31eaeb28cfda, 0x3fd530d871303f3b),
	(0x3fd5cb77fe42d304, 0x3fee298f425adc95, 0x3fd560401d7fa792),
	(0x3fd5fdbbd011e31d, 0x3fee212109492bba, 0x3fd58f9a5d81663b),
	(0x3fd62ffff34dbbed, 0x3fee18a02ca5e0f9, 0x3fd5bee79d6734e6),
	(0x3fd66243bdc0c9c2, 0x3fee100cce7f06b6, 0x3fd5ee271fdac6fd),
	(0x3fd69487ca38ebb6, 0x3fee0766d9c23a82, 0x3fd61d595943641b),
	(0x3fd6c6cbc58a707e, 0x3fedfeae61f95db0, 0x3fd64c7dde58f888),
	(0x3fd6f90fbf3b4b9a, 0x3fedf5e369de7ac8, 0x3fd67b94a09dba36),
	(0x3fd72b53a84c8c13, 0x3feded05f987af46, 0x3fd6aa9d7500e60d),
	(0x3fd75d97a7d7826a, 0x3fede4160f84614d, 0x3fd6d99863129ab7),
	(0x3fd78fdba723c71a, 0x3feddb13b555c5d8, 0x3fd7088538928031),
	(0x3fd7c21faf2481ab, 0x3fedd1feeeeb3f77, 0x3fd73763e0e5bb7e),
	(0x3fd7f46380443ef6, 0x3fedc8d7cd74eb35, 0x3fd7663403ecedb8),
	(0x3fd826a78fee86a2, 0x3fedbf9e41325ad2, 0x3fd794f5f21f812c),
	(0x3fd858eb6fc6c00e, 0x3fedb652640d194e, 0x3fd7c3a927f6e7be),
	(0x3fd88b2f6091a911, 0x3fedacf42fd7881c, 0x3fd7f24dc4deb30b),
	(0x3fd8bd73747fa82a, 0x3feda383a6d8b415, 0x3fd820e3bcdb4516),
	(0x3fd8efb76ad3bcc6, 0x3fed9a00db088521, 0x3fd84f6ab72e0455),
	(0x3fd921fb52f99571, 0x3fed906bcf71ced7, 0x3fd87de2a57d3bef),
	(0x3fd9543f4c295528, 0x3fed86c484089c2c, 0x3fd8ac4b87fa1376),
	(0x3fd98683387c2ef2, 0x3fed7d0b047d2a2d, 0x3fd8daa526666236),
	(0x3fd9b8c74c734cd7, 0x3fed733f4c983086, 0x3fd908ef9488c638),
	(0x3fd9eb0b316e257b, 0x3fed6961734a4759, 0x3fd9372a66100418),
	(0x3fda1d4f1c5b13c6, 0x3fed5f71745bef9a, 0x3fd96555af393979),
	(0x3fda4f93134bbffd, 0x3fed556f54b18a49, 0x3fd99371591464cc),
	(0x3fda81d7079868bb, 0x3fed4b5b1d5d78c0, 0x3fd9c17d39ba9f53),
	(0x3fdab41b223bd70a, 0x3fed4134cc4c88b2, 0x3fd9ef795a3dd82c),
	(0x3fdae65f0bd2deeb, 0x3fed36fc796c6d1f, 0x3fda1d654e53c0c4),
	(0x3fdb18a2faa54a8e, 0x3fed2cb220194f43, 0x3fda4b412b549e3d),
	(0x3fdb4ae6e27ee94a, 0x3fed2255c92cb15e, 0x3fda790cc9d54d8e),
	(0x3fdb7d2afabcbd96, 0x3fed17e76f8b1bff, 0x3fdaa6c83ff29436),
	(0x3fdbaf6edd63f9e6, 0x3fed0d672ed022eb, 0x3fdad47314b13a5e),
	(0x3fdbe1b2ee09849b, 0x3fed02d4f8ac5a8e, 0x3fdb020d86625c4a),
	(0x3fdc13f6d9f827b3, 0x3fecf830e504ee27, 0x3fdb2f972dd6dd44),
	(0x3fdc463ac85c4fc4, 0x3feced7af23188c8, 0x3fdb5d101285fa87),
	(0x3fdc787e99315d1d, 0x3fece2b32dae84df, 0x3fdb8a77fb79b6c9),
	(0x3fdcaac2c391b562, 0x3fecd7d984771033, 0x3fdbb7cf38286324),
	(0x3fdcdd06bf1547a8, 0x3fecccee1a978b83, 0x3fdbe515317a2767),
	(0x3fdd0f4a93c25d1a, 0x3fecc1f0f53b85ed, 0x3fdc1249d2e7d3d7),
	(0x3fdd418e7cb2c1ac, 0x3fecb6e20e487888, 0x3fdc3f6d35bf5483),
	(0x3fdd73d292000a16, 0x3fecabc16721278b, 0x3fdc6c7f53ab8367),
	(0x3fdda616817e6870, 0x3feca08f18d00ef9, 0x3fdc997fc72e0f8c),
	(0x3fddd85a5a1d76bb, 0x3fec954b270990c5, 0x3fdcc66e8203a2b0),
	(0x3fde0a9e6cc874c6, 0x3fec89f5868b6c72, 0x3fdcf34bb0b7c500),
	(0x3fde3ce26dc62595, 0x3fec7e8e4f51729a, 0x3fdd2016f3f2781b),
	(0x3fde6f264279a0bc, 0x3fec73158e8e71da, 0x3fdd4cd0187c011d),
	(0x3fdea16a5680fb27, 0x3fec678b32bc65ff, 0x3fdd797762744bde),
	(0x3fded3ae3d42764c, 0x3fec5bef5bdf2d0e, 0x3fdda60c55ccc8d6),
	(0x3fdf05f24a4f648b, 0x3fec5041fdd6d9d1, 0x3fddd28f21277b30),
	(0x3fdf383651e91662, 0x3fec448329efd6a1, 0x3fddfeff8240fbd5),
	(0x3fdf6a7a42ac94c9, 0x3fec38b2eb87ae38, 0x3fde2b5d4e604dad),
	(0x3fdf9cbe1d3b3429, 0x3fec2cd149d960c0, 0x3fde57a86acebf99),
	(0x3fdfcf020d397ddb, 0x3fec20de41e8a738, 0x3fde83e0e2af7102),
	(0x3fe000a313205931, 0x3fec14d9d64b5fbe, 0x3fdeb006abd63bf5),
	(0x3fe019c4f9037b82, 0x3fec08c42ac58524, 0x3fdedc194338385b),
	(0x3fe032e70860dbf0, 0x3febfc9d20441910, 0x3fdf08191a1b3b78),
	(0x3fe04c0906e9f028, 0x3febf064da5e176a, 0x3fdf3405af2bebc0),
	(0x3fe0652b03fdfd24, 0x3febe41b5936a8f3, 0x3fdf5fdf024485e7),
	(0x3fe07e4d0b91a0db, 0x3febd7c09e80889b, 0x3fdf8ba50d2a0c48),
	(0x3fe0976ef1f9c5ae, 0x3febcb54c769186a, 0x3fdfb75768e7fa9b),
	(0x3fe0b090e7d18f51, 0x3febbed7c3a63454, 0x3fdfe2f64f20fede),
	(0x3fe0c9b2e9a39b2d, 0x3febb2499c87d6e9, 0x3fe00740cf66097a),
	(0x3fe0e2d4ddcaea14, 0x3feba5aa669df22e, 0x3fe01cfc88506502),
	(0x3fe0fbf6e1078c8e, 0x3feb98fa1b3ff547, 0x3fe032ae5dc3499b),
	(0x3fe11518d98a4a52, 0x3feb8c38cf44e150, 0x3fe048562c12ec1d),
	(0x3fe12e3add8e0dcc, 0x3feb7f667f41f3ed, 0x3fe05df3f90aa7ce),
	(0x3fe1475ce227685c, 0x3feb728338a5b7b8, 0x3fe07387ade96093),
	(0x3fe1607ec66ef221, 0x3feb658f1462d09f, 0x3fe0891121335eab),
	(0x3fe179a0be834cc7, 0x3feb5889ffa66df4, 0x3fe09e9072510f34),
	(0x3fe192c2b8568d3a, 0x3feb4b740bbd2fb5, 0x3fe0b405848141dd),
	(0x3fe1abe4b59244a1, 0x3feb3e4d3fd6bd5a, 0x3fe0c9704befbcad),
	(0x3fe1c506b2c3bd2e, 0x3feb3115a5da6c4f, 0x3fe0ded0b8742978),
	(0x3fe1de289173a977, 0x3feb23cd5613980e, 0x3fe0f426a30829d9),
	(0x3fe1f74a8f60e6e9, 0x3feb167438110e6e, 0x3fe1097232ed0851),
	(0x3fe2106ca03e5fd0, 0x3feb090a5a650d01, 0x3fe11eb350745afe),
	(0x3fe2298e9db917c4, 0x3feafb8fd9ca55cd, 0x3fe133e9ce192ca0),
	(0x3fe242b08f9ff90f, 0x3feaee04ba8026f6, 0x3fe14915a5706056),
	(0x3fe25bd28f640cf6, 0x3feae068f72931d9, 0x3fe15e36ded7bb02),
	(0x3fe274f47c0699ab, 0x3fead2bcaa0d2e11, 0x3fe1734d518c9922),
	(0x3fe28e1685bcfd71, 0x3feac4ffc15749f9, 0x3fe1885918f9ac8e),
	(0x3fe2a736fe1e1494, 0x3feab7333233d9b8, 0x3fe19d58c0a87f59),
	(0x3fe2c05a9f4eb35a, 0x3feaa9546b006d3c, 0x3fe1b2502def806a),
	(0x3fe2d97c6af42411, 0x3fea9b66344e259f, 0x3fe1c73b28d8e8d8),
	(0x3fe2f29e6e1ad75e, 0x3fea8d6775437b8d, 0x3fe1dc1b5a8d2e11),
	(0x3fe30bc06e6776fd, 0x3fea7f5856cdc155, 0x3fe1f0f0859072c6),
	(0x3fe324e1d749c5d3, 0x3fea7139354a6417, 0x3fe205ba2249d8d8),
	(0x3fe33e045ca06605, 0x3fea63092400435f, 0x3fe21a798c19a7e7),
	(0x3fe357267800e833, 0x3fea54c9076262f5, 0x3fe22f2d7377bfc7),
	(0x3fe370485e5be5d6, 0x3fea4678cad14810, 0x3fe243d5f7a46407),
	(0x3fe3896a4d91d9d2, 0x3fea3818540e8f5e, 0x3fe258733edb7371),
	(0x3fe3a28c599d4a38, 0x3fea29a7a0666245, 0x3fe26d054caf4fb0),
	(0x3fe3bbae6b7bfd61, 0x3fea1b26c5d7e6f3, 0x3fe2818c01832d38),
	(0x3fe3d4d03ed37a30, 0x3fea0c95f4fd994c, 0x3fe29607190147e2),
	(0x3fe3edf268105227, 0x3fe9fdf4e0b7d0c8, 0x3fe2aa76ff6bb39c),
	(0x3fe4071445cf2ff6, 0x3fe9ef43eff2b264, 0x3fe2bedb24e4da22),
	(0x3fe420363dd59357, 0x3fe9e082f06f9ebe, 0x3fe2d333cf8d19db),
	(0x3fe4395843ba2fd5, 0x3fe9d1b1f26b4f3c, 0x3fe2e780e8af8d40),
	(0x3fe4527a412b9192, 0x3fe9c2d10c2c8325, 0x3fe2fbc251bb901b),
	(0x3fe46b9c27a00cd7, 0x3fe9b3e04f99cbd5, 0x3fe30ff7f2910508),
	(0x3fe484be309a8d17, 0x3fe9a4dfa3af458c, 0x3fe32421ecef726e),
	(0x3fe49de033475607, 0x3fe995cf29f2517a, 0x3fe33840139184b1),
	(0x3fe4b7021f53f06e, 0x3fe986aef5919af2, 0x3fe34c524d121d49),
	(0x3fe4d0240f052501, 0x3fe9777f00244ce3, 0x3fe36058a217b268),
	(0x3fe4e9463716a53c, 0x3fe9683f32f2ae77, 0x3fe3745330215f46),
	(0x3fe502681e9a6016, 0x3fe958efe0cb9eeb, 0x3fe388418ac15fff),
	(0x3fe51b8a16255181, 0x3fe94990e237aa2e, 0x3fe39c23e5b6b244),
	(0x3fe534ac09d21709, 0x3fe93a224cd1d5d1, 0x3fe3affa24f6eadd),
	(0x3fe54dcdf3b3627e, 0x3fe92aa42dcf60f8, 0x3fe3c3c437a1dace),
	(0x3fe566effef25aad, 0x3fe91b16740dcf8f, 0x3fe3d782336d7ad4),
	(0x3fe5801215d3e4d3, 0x3fe90b79366e5e53, 0x3fe3eb33faf99080),
	(0x3fe59933ff695620, 0x3fe8fbcca2107806, 0x3fe3fed9559bf7b0),
	(0x3fe5b255ee2800cd, 0x3fe8ec10a14c3bbf, 0x3fe412725ec52f70),
	(0x3fe5cb77fdabb08b, 0x3fe8dc452c6aa905, 0x3fe425ff1fc9c896),
	(0x3fe5e499ef7e111b, 0x3fe8cc6a746824f6, 0x3fe4397f5c023a11),
	(0x3fe5fdbbe1bc4e34, 0x3fe8bc807027ea41, 0x3fe44cf31eda7ea0),
	(0x3fe616ddfb59ae0d, 0x3fe8ac8710ace7bf, 0x3fe4605a7a5aaeff),
	(0x3fe62fffdd1c090a, 0x3fe89c7e9c66cac4, 0x3fe473b51912497a),
	(0x3fe64921cfa4dad5, 0x3fe88c66ef074f3a, 0x3fe48703271ce28c),
	(0x3fe66243d696fbd4, 0x3fe87c401005fc0b, 0x3fe49a449b40f2d3),
	(0x3fe67b65d9d2cf92, 0x3fe86c0a18cb21f8, 0x3fe4ad795715a3e0),
	(0x3fe69487ba5a4421, 0x3fe85bc52776b9fc, 0x3fe4c0a1373040c0),
	(0x3fe6ada9c7509d39, 0x3fe84b7112ce79ff, 0x3fe4d3bc6c09e271),
	(0x3fe6c6cbcacfb7d4, 0x3fe83b0e07c9e659, 0x3fe4e6cac0c53e06),
	(0x3fe6dfedb933bfad, 0x3fe82a9c1836e690, 0x3fe4f9cc20e5450e),
	(0x3fe6f90fb67efcd3, 0x3fe81a1b36b01830, 0x3fe50cc09bf06f81),
	(0x3fe71231b7343738, 0x3fe8098b74dea97c, 0x3fe51fa81d7d14ee),
	(0x3fe72b53a85aa5f7, 0x3fe7f8ece98512ef, 0x3fe532828ba63ede),
	(0x3fe74475af974694, 0x3fe7e83f85f965d1, 0x3fe5454ff702d29b),
	(0x3fe75d9799aac134, 0x3fe7d783768ce953, 0x3fe558102da86994),
	(0x3fe776b9903c81f2, 0x3fe7c6b8a9e499af, 0x3fe56ac34326d2d0),
	(0x3fe78fdba082343f, 0x3fe7b5df2167453d, 0x3fe57d6935ab2542),
	(0x3fe7a8fdabde74d9, 0x3fe7a4f6fbedea5e, 0x3fe59001e2ecf285),
	(0x3fe7c21f8108af2e, 0x3fe794006540f1ec, 0x3fe5a28d1b2b3ab1),
	(0x3fe7db4178eb2ca1, 0x3fe782fb2be4619a, 0x3fe5b50b14a054d2),
	(0x3fe7f4637bfce457, 0x3fe771e76a206b4b, 0x3fe5c77bb26e7244),
	(0x3fe80d856a5e5235, 0x3fe760c5402e27c0, 0x3fe5d9ded1dab0d8),
	(0x3fe826a77aed6885, 0x3fe74f94932c2aa4, 0x3fe5ec348fa6b9f0),
	(0x3fe83fc98cbf9b15, 0x3fe73e55841a0699, 0x3fe5fe7cc8635d47),
	(0x3fe858eb8885faf1, 0x3fe72d082dab83d0, 0x3fe610b75fe5f52f),
	(0x3fe8720d606e4a1c, 0x3fe71baca44228b1, 0x3fe622e441189d58),
	(0x3fe88b2f5ac151d8, 0x3fe70a42c20978bf, 0x3fe63503939a8c88),
	(0x3fe8a4516df43f11, 0x3fe6f8ca982975b6, 0x3fe64715452c2eb4),
	(0x3fe8bd735c5fca22, 0x3fe6e7445c4d6347, 0x3fe659191e5ec1a0),
	(0x3fe8d69563ed711a, 0x3fe6d5afee23005e, 0x3fe66b0f407fe153),
	(0x3fe8efb7598d0461, 0x3fe6c40d769b68c4, 0x3fe67cf781aec585),
	(0x3fe908d9720f2113, 0x3fe6b25cdb7a4ca4, 0x3fe68ed1fc7319a2),
	(0x3fe921fb5b6a3d21, 0x3fe6a09e61712f13, 0x3fe6a09e6b8d4885),
]

/*
   The huge reduction tables. R[j] = (l1, l2, l3), such that l1+l2+l3
   is less than 2pi, and 2^(50j + 25) is approximately l1+l2+l3
   (mod 2pi).

   The stepping of 50 was chosen because it is a nice, round number,
   close to 52.  [GB91] make reference to a difficult-to-reduce
   number, Xhard, which is in the range (-2^27, 2^27). By using an
   offset of 25, we can ensure that huge_reduce returns results in
   about that range.  This allows us to reuse calculations in [GB91]
   showing we have probably stored enough bits of pi (and of these
   numbers).

   TODO: There is always the chance that, for some x = x1*2^j +
   x2*2^k, there is catastrophic cancellation which makes the
   remainder sum x1*r[50j+25] + x2*r[50k+25] imprecise. That needs
   to be checked; it shouldn't be too hard.
 */
const R : (uint64, uint64, uint64)[20] = [
	(0x4011fb222e13e839, 0xbcbeecfb7d19df11, 0x3955f9708d867b5b), /* 2^25 mod 2pi */
	(0x3ffee6639887604d, 0x3c88b437ad3f55e0, 0x39218522303457a8), /* 2^75 mod 2pi */
	(0x3fc589bfb31f1687, 0x3c31994c1edb7977, 0xb8dd15afd80892a0), /* 2^125 mod 2pi */
	(0x400663e27d2e0b47, 0xbca00a74acd21bf1, 0x39257592b34b8c25), /* 2^175 mod 2pi */
	(0x40177ff13d560e79, 0xbc8403f354e865f6, 0x38e6e69984c4338e), /* 2^225 mod 2pi */
	(0x4015a4a4671a3606, 0x3ca6fc3b1a2bddd5, 0xb9270923530d0279), /* 2^275 mod 2pi */
	(0x3ff3bf65f73a0474, 0x3c67deda97eb4131, 0x38fa32118f8f578f), /* 2^325 mod 2pi */
	(0x3ffa8e506685f311, 0x3c698a6391d9d31b, 0x38db58d212d28d0a), /* 2^375 mod 2pi */
	(0x400d38d7ecc58385, 0x3c98b076242f0ed3, 0x38e04e83f1274a16), /* 2^425 mod 2pi */
	(0x4017023cb671ed43, 0xbcbeb418af32f189, 0xb944d3aea8efaa5f), /* 2^475 mod 2pi */
	(0x401655d13f672fe9, 0x3ca8e56fc8ad533e, 0x393d464a045591bf), /* 2^525 mod 2pi */
	(0x3feec8dd916fa3b6, 0x3c82c834374ed2af, 0x39225586c285a0d8), /* 2^575 mod 2pi */
	(0x400ed2900b47ba63, 0x3c697a0d5776cc4d, 0x390c831068ee85b1), /* 2^625 mod 2pi */
	(0x4008563b8e99caac, 0xbc5ee03ee870ab9d, 0x38fb980dd9756063), /* 2^675 mod 2pi */
	(0x4012d961cf7cd57d, 0xbcbdde0a1d27628e, 0xb95a8840599f8246), /* 2^725 mod 2pi */
	(0x4016919aac4a18f4, 0xbc93552937a28b88, 0xb9145e70b28c0cbb), /* 2^775 mod 2pi */
	(0x400c37d195196372, 0xbca948065398479d, 0xb94cadd4f6e80c1b), /* 2^825 mod 2pi */
	(0x400d77a34a63ebae, 0x3ca8ca5d7ccc2874, 0xb91611b1e4b65369), /* 2^875 mod 2pi */
	(0x3ff4cb08f62cb38b, 0xbc90f42df8fc967a, 0xb8ed7d71202d2f45), /* 2^925 mod 2pi */
	(0x401848860f8742d6, 0x3c90337738c287b4, 0xb92f24fc614ec2f7), /* 2^975 mod 2pi */
]

const sin32 = {x : flt32
	var s, s2
	(s, s2, _, _) = w64((x : flt64), true, false)
	-> round_down(s, s2)
}

const sin64 = {x : flt64
	var s
	(s, _, _, _) = w64(x, true, false)
	-> s
}

const cos32 = {x : flt32
	var c, c2
	(_, _, c, c2) = w64((x : flt64), false, true)
	-> round_down(c, c2)
}

const cos64 = {x : flt64
	var c
	(_, _, c, _) = w64(x, false, true)
	-> c
}

const sincos32 = {x : flt32
	var s : flt64, s2 : flt64, c : flt64, c2 : flt64
	(s, s2, c, c2) = w64((x : flt64), true, true)
	-> (round_down(s, s2), round_down(c, c2))
}

const sincos64 = {x : flt64
	var s, c
	(s, _, c, _) = w64(x, true, true)
	-> (s, c)
}

/* Calculate sin and/or cos */
const w64 = {x : flt64, want_sin : bool, want_cos : bool
	var sin_ret : flt64 = 0.0
	var cos_ret : flt64 = 0.0
	var sin_ret2 : flt64 = 0.0
	var cos_ret2 : flt64 = 0.0

	var e : int64
	(_, e, _) = std.flt64explode(x)

	if e == 1024
		-> (std.flt64nan(), 0.0, std.flt64nan(), 0.0)
	;;

	var N : int64
	var x1 : flt64, x2 : flt64
	(N, x1, x2) = trig_reduce(x)

	/* Handle multiples of pi/2 */
	var swap_sin_cos : bool = false
	var then_negate_sin : bool = false
	var then_negate_cos : bool = false
	match N
	| 1:
		/* sin(x + pi/2) = cos(x), cos(x + pi/2) = -sin(x) */
		swap_sin_cos = true
		then_negate_cos = true
	| 2:
		/* sin(x + pi) = -sin(x), cos(x + pi) = -cos(x) */
		then_negate_cos = true
		then_negate_sin = true
	| 3:
		/* sin(x + 3pi/2) = -cos(x), cos(x + 3pi/2) = sin(x) */
		swap_sin_cos = true
		then_negate_sin = true
	| _:
	;;

	var first_negate_sin : bool = false
	if x1 < 0.0
		x1 = -x1
		x2 = -x2
		first_negate_sin = true
	;;

	/* Figure out where in the C table x lies */
	var xi : uint64, sin_xi : uint64, cos_xi : uint64, sin_delta, cos_delta
	(xi, cos_xi, sin_xi) = trig_table_approx(x1, C)

	/*
	   Compute x - xi with compensated summation. Because xi
	   and delta both lie in the same interval of width (pi/4)/256,
	   which is less than 1/256, we can use that |delta1| <
	   2^-8: we need a polynomial approximation of at least
	   degree 7.

	   This also gives that |delta2| < 2^(-60), vanishing quickly
	   in the polynomial approximations.
	 */
	var delta1, delta2, deltat
	(delta1, deltat) = fast2sum(-std.flt64frombits(xi), x1)
	(delta2, _) = fast2sum(deltat, x2)

	/*
	   sin(delta); the degree 2 coefficient is near 0, so delta_2
	   only shows up in deg 1
	 */
	sin_delta = horner_polyu(delta1, sin_coeff[:])
	sin_delta += delta2 * std.flt64frombits(sin_coeff[1])

	/*
	   cos(delta); delta_2 shows up in deg 1 and 2; the term
	   we care about is a1*d2 + 2*a2*d1*d2
	 */
	cos_delta = horner_polyu(delta1, cos_coeff[:])
	cos_delta += delta2 * fma64(delta1, 2.0*std.flt64frombits(cos_coeff[2]), std.flt64frombits(cos_coeff[1]))

	var q : flt64[4]

	if (want_sin && !swap_sin_cos) || (want_cos && swap_sin_cos)
		(q[0], q[1]) = two_by_two(std.flt64frombits(sin_xi), cos_delta)
		(q[2], q[3]) = two_by_two(std.flt64frombits(cos_xi), sin_delta)
		std.sort(q[:], fltabscmp)

		(sin_ret, sin_ret2) = double_compensated_sum(q[:])
	;;

	if (want_cos && !swap_sin_cos) || (want_sin && swap_sin_cos)
		(q[0], q[1]) = two_by_two(std.flt64frombits(cos_xi), cos_delta)
		(q[2], q[3]) = two_by_two(std.flt64frombits(sin_xi), sin_delta)
		q[2] = -q[2]
		q[3] = -q[3]

		/*
		   No need to sort; cos_xi and sin_xi are in [0,1],
		   cos_delta is close to 1, sin_delta is close to
		   0.
		*/
		std.swap(&q[1], &q[2])

		(cos_ret, cos_ret2) = double_compensated_sum(q[:])
	;;

	if first_negate_sin
		sin_ret = -sin_ret
		sin_ret2 = -sin_ret2
	;;

	if swap_sin_cos
		std.swap(&sin_ret, &cos_ret)
		std.swap(&sin_ret2, &cos_ret2)
	;;

	if then_negate_sin
		sin_ret = -sin_ret
		sin_ret2 = -sin_ret2
	;;

	if then_negate_cos
		cos_ret = -cos_ret
		cos_ret2 = -cos_ret2
	;;

	-> (sin_ret, sin_ret2, cos_ret, cos_ret2)
}

/* Reduce x to N*(pi/2) + x', with x' in [-pi/4, pi/4] */
const trig_reduce = {x : flt64
	var N : int64 = 0
	var Nf : flt64

	/*
	   We actually only care about N mod 4. If x is very large,
	   the ultimate N that we end up using might not be
	   representable (either as an int64 or flt64), so we instead
	   just keep track of the mod 4 part exactly.
	 */

	/*
	   If we want to store pi in a form to properly reduce
	   2^1000 or so, it turns out that there's some complication
	   with the final digits: they trail off beyond subnormality
	   and can't be represented, even though they are the digits
	   we need for the reduction. Therefore, for extremely high
	   values of x, we pre-compute the reduction and return it
	   here.

	   We get: x1 about in range 2^25, and that (x1 + x2 + x3)
	   approximates x (mod 2pi) very well. This is good enough
	   to ensure that N = x/pi is representable (int64 and
	   flt64).  We do need to worry about catastrophic cancellation,
	   however.
	 */
	var x1, x2, x3
	(x1, x2, x3) = huge_reduce_2pi(x)

	var pi_o_2_0 : flt64 = std.flt64frombits(pi_over_2[0])
	var pi_o_2_1 : flt64 = std.flt64frombits(pi_over_2[1])
	var pi_o_2_2 : flt64 = std.flt64frombits(pi_over_2[2])
	var pi_o_2_3 : flt64 = std.flt64frombits(pi_over_2[3])
	var pi_o_4_0 : flt64 = std.flt64frombits(pi_over_4[0])
	var pi_o_4_1 : flt64 = std.flt64frombits(pi_over_4[1])
	var pi_o_4_2 : flt64 = std.flt64frombits(pi_over_4[2])
	var pi_o_4_3 : flt64 = std.flt64frombits(pi_over_4[3])

	var total_N = 0
	var q : flt64[11]

	/* Compute initial reduction -- this might not be sufficient. */
	total_N = rn(x1 * std.flt64frombits(two_over_pi))
	Nf = (-total_N : flt64)
	(q[0], q[ 4], q[5]) = (x1, x2, x3)
	(q[1], q[ 3]) = two_by_two(Nf, pi_o_2_0)
	(q[2], q[ 6]) = two_by_two(Nf, pi_o_2_1)
	(q[7], q[ 8]) = two_by_two(Nf, pi_o_2_2)
	(q[9], q[10]) = two_by_two(Nf, pi_o_2_3)

	/*
	  Sorting is very slow, but it's only the top five or so
	  that are in question
	 */
	std.sort(q[0:5], fltabscmp)
	(x1, x2) = double_compensated_sum(q[:])

	while !(le_22(x1, x2, pi_o_4_0, pi_o_4_1) && le_22(-pi_o_4_0, -pi_o_4_1, x1, x2))
		N = rn(x1 * std.flt64frombits(two_over_pi))
		Nf = (-N : flt64)

		/*
		   Sorting is slow. We know that x1 is roughly
		   cancelled by Nf * pi_o_2_0, so line those up.
		 */
		(q[0], q[2]) = (x1, x2)
		(q[1], q[3]) = two_by_two(Nf, pi_o_2_0)
		(q[4], q[5]) = two_by_two(Nf, pi_o_2_1)
		(q[6], q[7]) = two_by_two(Nf, pi_o_2_2)
		(q[8], q[9]) = two_by_two(Nf, pi_o_2_3)
		(x1, x2) = double_compensated_sum(q[0:10])
		total_N += (N % 4)
	;;

	-> (((total_N % 4) + 4) % 4, x1, x2)
}

const huge_reduce_2pi = {x : flt64
	var e : int64
	var b : uint64 = std.flt64bits(x)
	(_, e, _) = std.flt64explode(x)

	if e < 25
		-> (x, 0.0, 0.0)
	;;

	/*
	   Since the stepping of R is 50, and x has 53 significant
	   bits, we get a splitting of x into two components. We
	   want

	       x = [ xa * 2^(50j + 25) ] + [ xb * 2^(50(j-1) + 25) ]

	   and {ai}, {bi} such that

	      a1 + a2 + a3 === ( 2^(50j + 25) ) mod 2pi

	      b1 + b2 + b3 === ( 2^(50(j-1) + 25) ) mod 2pi

	   If j is small enough, we can slack off a bit. We really
	   just want xa(a1+a2+a3) + xb(b1+b2+b3) === x (mod 2pi)
	   with a heck of a lot of precision.
	 */
	var j : uint64 = (e - 25 : uint64) / 50
	var xa : flt64 = 0.0, xb : flt64 = 0.0, xc : flt64 = 0.0
	var a1 : flt64 = 0.0, a2 : flt64 = 0.0, a3 : flt64 = 0.0
	var b1 : flt64 = 0.0, b2 : flt64 = 0.0, b3 : flt64 = 0.0
	var c1 : flt64 = 0.0, c2 : flt64 = 0.0, c3 : flt64 = 0.0
	var u1 : uint64, u2 : uint64, u3 : uint64
	var xcur = x

	var e1 : int64 = 50*(j : int64) + 25
	var e2 : int64 = e1 - 50
	var e3 : int64 = e2 - 50
	xa = trunc(scale2(x,-e1))
	xb = trunc(scale2(x,-e2))
	xc = trunc(scale2(x,-e3))
	xc -= scale2(xb, 50)
	xb -= scale2(xa, 50)

	(u1, u2, u3) = R[j]
	a1 = std.flt64frombits(u1)
	a2 = std.flt64frombits(u2)
	a3 = std.flt64frombits(u3)

	if j == 0
		(b1, b2, b3) = (x - scale2(xa, e1), 0.0, 0.0)
		xb = 1.0
		(c1, c2, c3) = (0.0, 0.0, 0.0)
		xc = 0.0
	else
		(u1, u2, u3) = R[j - 1]
		b1 = std.flt64frombits(u1)
		b2 = std.flt64frombits(u2)
		b3 = std.flt64frombits(u3)

		if j == 1
			(c1, c2, c3) = (x - scale2(xa, e1) - scale2(xb, e2), 0.0, 0.0)
			xc = 1.0
		else
			(u1, u2, u3) = R[j - 2]
			c1 = std.flt64frombits(u1)
			c2 = std.flt64frombits(u2)
			c3 = std.flt64frombits(u3)
		;;
	;;

	/*
	   Now we need to combine all this. Even worse, when we
	   multiply the two together, we need to keep full precision
	   (at least for the high-ish bits), so we need some help.

	   TODO: The c-type calculations can probably be optimized,
	   since xc is so small.
	 */
	var q : flt64[18]
	(q[ 0], q[ 1]) = two_by_two(xa, a1)
	(q[ 2], q[ 3]) = two_by_two(xa, a2)
	(q[ 4], q[ 5]) = two_by_two(xa, a3)
	(q[ 6], q[ 7]) = two_by_two(xb, b1)
	(q[ 8], q[ 9]) = two_by_two(xb, b2)
	(q[10], q[11]) = two_by_two(xb, b3)
	(q[12], q[13]) = two_by_two(xc, c1)
	(q[14], q[15]) = two_by_two(xc, c2)
	(q[16], q[17]) = two_by_two(xc, c3)

	-> triple_compensated_sum(q[:])
}

const trig_table_approx = {x1 : flt64, C : (uint64, uint64, uint64)[257]
	var j = (rn(x1 * std.flt64frombits(oneohtwofour_over_pi)) : uint64)
	var x1u = std.flt64bits(x1)
	var xi, sin_xi, cos_xi, test_cos_xi, test_sin_xi

	/* We either want j or j - 1. */
	j = std.max(std.min(j, 256), 0)

	(xi, cos_xi, sin_xi) = C[j]
	if j > 0
		var test_xi
		(test_xi, test_cos_xi, test_sin_xi) = C[j - 1]
		if std.abs(x1u - test_xi) < std.abs(x1u - xi)
			-> (test_xi, test_cos_xi, test_sin_xi)
		;;
	;;

	-> (xi, cos_xi, sin_xi)
}

const triple_compensated_sum = {q : flt64[:]
	/* TODO: verify, with GAPPA or something, that this is correct. */
	std.sort(q, fltabscmp)
	var s1 : flt64, s2 : flt64, s3
	var t1 : flt64, t2 : flt64, t3 : flt64, t4 : flt64, t5 : flt64, t6
	s1 = q[0]
	s2 = 0.0
	s3 = 0.0
	for qq : q[1:]
		(t5, t6) = fast2sum(s3, qq)
		(t3, t4) = fast2sum(s2, t5)
		(t1, t2) = fast2sum(s1, t3)
		s1 = t1
		(s2, s3) = fast2sum(t2, t4 + t6)
	;;

	-> (s1, s2, s3)
}

const fltabscmp = {x : flt64, y : flt64
	var xb = std.flt64bits(x) & ~(1 << 63)
	var yb = std.flt64bits(y) & ~(1 << 63)
	if xb == yb
		-> `std.Equal
	elif xb > yb
		-> `std.Before
	else
		-> `std.After
	;;
}

/*
   Return true iff (a1 + a2) <= (b1 + b2)
 */
const le_22 = { a1, a2, b1, b2
	if a1 < b1
		-> true
	elif a1 > b1
		-> false
	;;

	if a2 > b2
		-> false
	;;

	-> true
}