ref: f762f96f07cf2dfc7b3bb745d6a767dccda0a01a
dir: /lib/math/sin-impl.myr/
use std use "fpmath" use "fma-impl" use "scale2-impl" use "sum-impl" use "util" /* This implementation of sin and cos uses the "Highly Accurate Tables" method of [GB91]. It's not as fast as it could be (the sorting and excessive summation really takes a toll), however we seem to be consistently within +/-1 ulp of the correct result in flt64 case. In flt32 case we are correctly rounded. The idea is to tabulate [0, pi/4] at 256 slightly irregular intervals xi, with the lucky property that the infinite binary expansions of significands of sin(xi) and cos(xi) look like / 53 bits \ / many '0's \ / noise \ 1.bbbbbb...bb00000000000...0???????????... This allows us to use storage only for a single floating point number, but get more than 53 bits of precision. Using that, we express x as (N * pi/4) + (xi) + (h), where h is tiny. Using identities sin(u+v) = sin(u)cos(v) + cos(u)sin(v), cos(u+v) = cos(u)cos(v) - sin(u)sin(v), we arrive at a sum where every term is known with greater than 53 bits of precision except for sin(h) and cos(h), which we can approximate well. As a note, everything is performed in double-precision. Storing a second set of tables for single precision constants would occupy another 3KiB of memory for a not-very-significant speed gain. See files pi-constants.c, generate-triples-for-GB91.c, and generate-minimax-by-Remez.gp for where the constants come from. Note that in a departure from [GB91], we use the smaller degree polynomials over a smaller range for the input. This requires that we store more C-values for smaller xi values, but makes the implementation a bit simpler to read. */ pkg math = pkglocal const sin32 : (x : flt32 -> flt32) pkglocal const sin64 : (x : flt64 -> flt64) pkglocal const cos32 : (x : flt32 -> flt32) pkglocal const cos64 : (x : flt64 -> flt64) pkglocal const sincos32 : (x : flt32 -> (flt32, flt32)) pkglocal const sincos64 : (x : flt64 -> (flt64, flt64)) pkglocal const trig_reduce : (x : flt64 -> (int64, flt64, flt64)) pkglocal const trig_table_approx : (x1 : flt64, C : (uint64, uint64, uint64)[257] -> (uint64, uint64, uint64)) pkglocal const pi_over_2 : uint64[4] ;; /* Pi/2 in lots of detail, for range reducing sin(2^18) or so */ const pi_over_2 = [ 0x3ff921fb54442d18, 0x3c91a62633145c07, 0xb91f1976b7ed8fbc, 0x35b4cf98e804177d, ] /* Pi/4 in lots of detail */ const pi_over_4 : uint64[4] = [ 0x3fe921fb54442d18, 0x3c81a62633145c07, 0xb90f1976b7ed8fbc, 0x35a4cf98e804177d, ] /* Pre-computed inverses */ const two_over_pi : uint64 = 0x3fe45f306dc9c883 /* 1/(pi/2) */ const oneohtwofour_over_pi : uint64 = 0x40745f306dc9c883 /* 1/(pi/(4 * 256)) */ /* Coefficients for minimax, degree 7 polynomials approximating sin and cos on [-Pi/1024, Pi/1024] (generated by a Remez algorithm). */ const sin_coeff : uint64[8] = [ 0x0000000000000000, 0x3ff0000000000000, 0xb8c7400000000000, 0xbfc5555555555555, 0x39d0000000000000, 0x3f81111111111061, 0xbacc000000000000, 0xbf2a019fa7ee0417, ] const cos_coeff : uint64[8] = [ 0x3ff0000000000000, 0x38c0800000000000, 0xbfe0000000000000, 0x39a0000000000000, 0x3fa55555555553ee, 0x0000000000000000, 0xbf56c16b9bfd9fd6, 0xbbec000000000000, ] /* The Highly Accurate Tables for use in a [GB91]-type algorithm; generated by ancillary/generate-triples-for-GB91.c. */ const C : (uint64, uint64, uint64)[257] = [ /* xi cos(xi) sin(xi) */ (0x0000000000000000, 0x3ff0000000000000, 0x0000000000000000), (0x3f6921fb42e71072, 0x3feffff621622aa5, 0x3f6921f8ad6f8d6f), (0x3f7921fb576a8e70, 0x3fefffd8858e80ad, 0x3f7921f1018d5de6), (0x3f82d97c6d961293, 0x3fefffa72c98324f, 0x3f82d96afcb3b8a8), (0x3f8921fba95a4ba5, 0x3fefff62169765a8, 0x3f8921d251f34230), (0x3f8f6a79afcf7cc4, 0x3fefff0943ccb09c, 0x3f8f6a28f137852f), (0x3f92d97d6168427b, 0x3feffe9cb42a0249, 0x3f92d9379e0e6011), (0x3f95fdbbdcaafdf1, 0x3feffe1c68730a0e, 0x3f95fd4d14eace13), (0x3f9921fb47134298, 0x3feffd886087640a, 0x3f992155ea73805c), (0x3f9c463ab6204953, 0x3feffce09ce490e2, 0x3f9c454f4439aa60), (0x3f9f6a7a230622cf, 0x3feffc251df3604f, 0x3f9f69372b837c03), (0x3fa1475c20c40678, 0x3feffb55e4815141, 0x3fa1468532309197), (0x3fa2d97c761e7dc7, 0x3feffa72f0045061, 0x3fa2d8656c814503), (0x3fa46b9c4167ee58, 0x3feff97c42007eca, 0x3fa46a397ce648ce), (0x3fa5fdbbb3fec154, 0x3feff871db006d07, 0x3fa5fc009ce09729), (0x3fa78fdba698a573, 0x3feff753bb15f9f8, 0x3fa78dbaad1df29e), (0x3fa921fb569e20a0, 0x3feff621e37794e9, 0x3fa91f65f36711fd), (0x3faab41b20e054f3, 0x3feff4dc549e4649, 0x3faab101d4af47e4), (0x3fac463aa9c96aef, 0x3feff3830f9fe5ee, 0x3fac428cfdc5c35c), (0x3fadd85a67cbe296, 0x3feff21614eca1d2, 0x3fadd406ed40d193), (0x3faf6a7a3eff7872, 0x3feff09565793013, 0x3faf656e8f97f0f9), (0x3fb07e4ceacc9a97, 0x3fefef01028ba740, 0x3fb07b6149c87151), (0x3fb1475c76962b85, 0x3fefed58ed6a54be, 0x3fb14400e1aeeb44), (0x3fb2106c98abe6cd, 0x3fefeb9d254b1740, 0x3fb20c96690fbe9f), (0x3fb2d97c6bdba26e, 0x3fefe9cdad2f1061, 0x3fb2d5207f840524), (0x3fb3a28c18e279a9, 0x3fefe7ea85e76da6, 0x3fb39d9ed203bab6), (0x3fb46b9c42a3d8e4, 0x3fefe5f3af0a1548, 0x3fb4661187480b43), (0x3fb534abcd8d6c89, 0x3fefe3e92c9764ff, 0x3fb52e7708fabcdf), (0x3fb5fdbbf6d380b0, 0x3fefe1cafc99687d, 0x3fb5f6d017a62147), (0x3fb6c6cbcc2f7248, 0x3fefdf9922e0f7ad, 0x3fb6bf1b46436fc7), (0x3fb78fdb8cff3236, 0x3fefdd53a026e6fa, 0x3fb78758587024ce), (0x3fb858eb79d953d8, 0x3fefdafa7513ab8e, 0x3fb84f8712f838d4), (0x3fb921fb5df13728, 0x3fefd88da3b2cbcb, 0x3fb917a6c5cad0c3), (0x3fb9eb0b15ad6e7f, 0x3fefd60d2df8efac, 0x3fb9dfb6d20cd89d), (0x3fbab41b0d8e04c2, 0x3fefd3791414a510, 0x3fbaa7b72849583a), (0x3fbb7d2b02ec004d, 0x3fefd0d1586ee673, 0x3fbb6fa70acd0c55), (0x3fbc463a9c1cda98, 0x3fefce15fde7feac, 0x3fbc3785a525059d), (0x3fbd0f4a92881eaf, 0x3fefcb4703aa4711, 0x3fbcff5334552718), (0x3fbdd85a7ea6b644, 0x3fefc8646cd750e6, 0x3fbdc70ed631fba7), (0x3fbea16a4da0e792, 0x3fefc56e3b81b243, 0x3fbe8eb7fcd470b2), (0x3fbf6a7a2082960a, 0x3fefc26471042f37, 0x3fbf564e4de7cd23), (0x3fc019c4f4362172, 0x3fefbf470f79208c, 0x3fc00ee89fc60798), (0x3fc07e4cb4549583, 0x3fefbc1619c9367a, 0x3fc072a00d3f81a5), (0x3fc0e2d4df1f9ef0, 0x3fefb8d18d51928e, 0x3fc0d64dbf2ea47f), (0x3fc1475c06879885, 0x3fefb5797828e1da, 0x3fc139f00d3ac360), (0x3fc1abe531137149, 0x3fefb20dc250d36d, 0x3fc19d89b968e759), (0x3fc2106c256f2705, 0x3fefae8e92bf458c, 0x3fc20116570e32c3), (0x3fc274f5eafb17bd, 0x3fefaafbbea74df5, 0x3fc2649aa381628e), (0x3fc2d97c80aa0f5b, 0x3fefa7557efae43c, 0x3fc2c81070013fe8), (0x3fc33e0462663cd9, 0x3fefa39bacdb1026, 0x3fc32b7bef4456d4), (0x3fc3a28c534e14b9, 0x3fef9fce55ed894d, 0x3fc38edbaa59fe3f), (0x3fc40714a18f49e2, 0x3fef9bed79763508, 0x3fc3f22fb1298233), (0x3fc46b9c38ac6ee7, 0x3fef97f9249e436c, 0x3fc45576b5509b55), (0x3fc4d02432e80a59, 0x3fef93f14ed442ec, 0x3fc4b8b1905fbd16), (0x3fc534ac0298d5e2, 0x3fef8fd600323753, 0x3fc51bdf7942e899), (0x3fc5993416dbe628, 0x3fef8ba736af1693, 0x3fc57f00a065f728), (0x3fc5fdbbf1d87ff8, 0x3fef8764fa1887ba, 0x3fc5e2144c8984d0), (0x3fc66243dc0ae9e8, 0x3fef830f4a092d65, 0x3fc6451a8808363c), (0x3fc6c6cbc286e3f6, 0x3fef7ea629fb13d9, 0x3fc6a8130326d53f), (0x3fc72b53b7c96348, 0x3fef7a299bd3df70, 0x3fc70afd9309c784), (0x3fc78fdba21869ed, 0x3fef7599a37cdcb3, 0x3fc76dd9e15bdb4a), (0x3fc7f4642a07f677, 0x3fef70f63bf58052, 0x3fc7d0a856c8ccd4), (0x3fc858eb5b649af0, 0x3fef6c3f7f63a632, 0x3fc83366caea942f), (0x3fc8bd736951ad00, 0x3fef6775566b1c57, 0x3fc896172a175d2d), (0x3fc921fb399259ea, 0x3fef6297d144aa53, 0x3fc8f8b8223b223a), (0x3fc986835ce1c644, 0x3fef5da6ebe94da7, 0x3fc95b4a04783401), (0x3fc9eb0b24569abc, 0x3fef58a2b2008d0c, 0x3fc9bdcbe883cc5f), (0x3fca4f9323461762, 0x3fef538b1f52fbe6, 0x3fca203e2200e122), (0x3fcab41b09c5898b, 0x3fef4e603b080549, 0x3fca82a025ebcacb), (0x3fcb18a2e6d4292f, 0x3fef492207941b17, 0x3fcae4f1c649efbf), (0x3fcb7d2ae7b9bf9f, 0x3fef43d085cf0736, 0x3fcb4732f2b7a6f7), (0x3fcbe1b2cb7b8481, 0x3fef3e6bbc6eba27, 0x3fcba9632f158ca6), (0x3fcc463ab6f78496, 0x3fef38f3acd2bb22, 0x3fcc0b8262d9d9f9), (0x3fccaac29faeedb5, 0x3fef33685aeb67ba, 0x3fcc6d90473e1ca0), (0x3fcd0f4ab0d5a9e9, 0x3fef2dc9c7b77e6a, 0x3fcccf8cc9dfcdca), (0x3fcd73d299801a67, 0x3fef2817fb345701, 0x3fcd31775f6e7021), (0x3fcdd85a5f03c93d, 0x3fef2252f8ad886d, 0x3fcd934fd0c9eb90), (0x3fce3ce2615c01b0, 0x3fef1c7abe28a12c, 0x3fcdf5163efb2fd2), (0x3fcea16a34d49aca, 0x3fef168f557f8c38, 0x3fce56ca04ee54fd), (0x3fcf05f2396af361, 0x3fef1090bcb17909, 0x3fceb86b43a82046), (0x3fcf6a7a34c8a42b, 0x3fef0a7efae01f19, 0x3fcf19f9863cf10b), (0x3fcfcf02154123ee, 0x3fef045a14e56969, 0x3fcf7b747f630b74), (0x3fd019c51e84ab67, 0x3feefe22087e60d4, 0x3fcfdcdc522606ad), (0x3fd04c08e989dfc7, 0x3feef7d6e70399a4, 0x3fd01f17f81c5502), (0x3fd07e4ceb775576, 0x3feef178a4618400, 0x3fd04fb80a82fe62), (0x3fd0b090f60ca615, 0x3feeeb074a3d9810, 0x3fd0804e15777f94), (0x3fd0e2d4c54314ef, 0x3feee482e5663962, 0x3fd0b0d9b95007d9), (0x3fd11518d2106a76, 0x3feeddeb6a30657d, 0x3fd0e15b4cec5711), (0x3fd1475cc633879f, 0x3feed740e7e7b002, 0x3fd111d25f193a47), (0x3fd179a0ce17673e, 0x3feed0835cc79369, 0x3fd1423efcc68ac8), (0x3fd1abe4bace84c5, 0x3feec9b2d347a043, 0x3fd172a0dae268af), (0x3fd1de28a262f5c2, 0x3feec2cf4cb15d4b, 0x3fd1a2f7f0d5a412), (0x3fd2106cb1425d8c, 0x3feebbd8c71a89f1, 0x3fd1d3444b7da503), (0x3fd242b0905f83c0, 0x3feeb4cf52e27d33, 0x3fd20385796d7260), (0x3fd274f4947838ba, 0x3feeadb2e8897f5d, 0x3fd233bbae3e8ad0), (0x3fd2a7386b632300, 0x3feea6839816a5b1, 0x3fd263e67d68a111), (0x3fd2d97c9e0be433, 0x3fee9f41524c0663, 0x3fd294064c5a5940), (0x3fd30bc07884186a, 0x3fee97ec359da93b, 0x3fd2c41a511fcae1), (0x3fd33e046121eaf8, 0x3fee908437cd8074, 0x3fd2f422d00c7726), (0x3fd3704855d0fd6b, 0x3fee89095dacc360, 0x3fd3241fa9798465), (0x3fd3a28c5798cbdd, 0x3fee817baba342be, 0x3fd35410c0bfc504), (0x3fd3d4d04553249a, 0x3fee79db2b58ee3d, 0x3fd383f5d8b1345d), (0x3fd407145ea979a2, 0x3fee7227d7cc183e, 0x3fd3b3cf1062e88d), (0x3fd43958386293b4, 0x3fee6a61c6350b11, 0x3fd3e39be4473723), (0x3fd46b9c38cc0bc5, 0x3fee6288eb9aff42, 0x3fd4135c98341878), (0x3fd49de029090fc7, 0x3fee5a9d555912a2, 0x3fd44310da8ddcf9), (0x3fd4d024217e697d, 0x3fee529f047e7434, 0x3fd472b8a510e608), (0x3fd50267f89f138c, 0x3fee4a8e04a2f573, 0x3fd4a253b2fc94cc), (0x3fd534ac160681bb, 0x3fee426a4a0b5efa, 0x3fd4d1e249057326), (0x3fd566eff496a5e0, 0x3fee3a33ef471d66, 0x3fd50163cbe183bf), (0x3fd59933ebfe75ec, 0x3fee31eaeb28cfda, 0x3fd530d871303f3b), (0x3fd5cb77fe42d304, 0x3fee298f425adc95, 0x3fd560401d7fa792), (0x3fd5fdbbd011e31d, 0x3fee212109492bba, 0x3fd58f9a5d81663b), (0x3fd62ffff34dbbed, 0x3fee18a02ca5e0f9, 0x3fd5bee79d6734e6), (0x3fd66243bdc0c9c2, 0x3fee100cce7f06b6, 0x3fd5ee271fdac6fd), (0x3fd69487ca38ebb6, 0x3fee0766d9c23a82, 0x3fd61d595943641b), (0x3fd6c6cbc58a707e, 0x3fedfeae61f95db0, 0x3fd64c7dde58f888), (0x3fd6f90fbf3b4b9a, 0x3fedf5e369de7ac8, 0x3fd67b94a09dba36), (0x3fd72b53a84c8c13, 0x3feded05f987af46, 0x3fd6aa9d7500e60d), (0x3fd75d97a7d7826a, 0x3fede4160f84614d, 0x3fd6d99863129ab7), (0x3fd78fdba723c71a, 0x3feddb13b555c5d8, 0x3fd7088538928031), (0x3fd7c21faf2481ab, 0x3fedd1feeeeb3f77, 0x3fd73763e0e5bb7e), (0x3fd7f46380443ef6, 0x3fedc8d7cd74eb35, 0x3fd7663403ecedb8), (0x3fd826a78fee86a2, 0x3fedbf9e41325ad2, 0x3fd794f5f21f812c), (0x3fd858eb6fc6c00e, 0x3fedb652640d194e, 0x3fd7c3a927f6e7be), (0x3fd88b2f6091a911, 0x3fedacf42fd7881c, 0x3fd7f24dc4deb30b), (0x3fd8bd73747fa82a, 0x3feda383a6d8b415, 0x3fd820e3bcdb4516), (0x3fd8efb76ad3bcc6, 0x3fed9a00db088521, 0x3fd84f6ab72e0455), (0x3fd921fb52f99571, 0x3fed906bcf71ced7, 0x3fd87de2a57d3bef), (0x3fd9543f4c295528, 0x3fed86c484089c2c, 0x3fd8ac4b87fa1376), (0x3fd98683387c2ef2, 0x3fed7d0b047d2a2d, 0x3fd8daa526666236), (0x3fd9b8c74c734cd7, 0x3fed733f4c983086, 0x3fd908ef9488c638), (0x3fd9eb0b316e257b, 0x3fed6961734a4759, 0x3fd9372a66100418), (0x3fda1d4f1c5b13c6, 0x3fed5f71745bef9a, 0x3fd96555af393979), (0x3fda4f93134bbffd, 0x3fed556f54b18a49, 0x3fd99371591464cc), (0x3fda81d7079868bb, 0x3fed4b5b1d5d78c0, 0x3fd9c17d39ba9f53), (0x3fdab41b223bd70a, 0x3fed4134cc4c88b2, 0x3fd9ef795a3dd82c), (0x3fdae65f0bd2deeb, 0x3fed36fc796c6d1f, 0x3fda1d654e53c0c4), (0x3fdb18a2faa54a8e, 0x3fed2cb220194f43, 0x3fda4b412b549e3d), (0x3fdb4ae6e27ee94a, 0x3fed2255c92cb15e, 0x3fda790cc9d54d8e), (0x3fdb7d2afabcbd96, 0x3fed17e76f8b1bff, 0x3fdaa6c83ff29436), (0x3fdbaf6edd63f9e6, 0x3fed0d672ed022eb, 0x3fdad47314b13a5e), (0x3fdbe1b2ee09849b, 0x3fed02d4f8ac5a8e, 0x3fdb020d86625c4a), (0x3fdc13f6d9f827b3, 0x3fecf830e504ee27, 0x3fdb2f972dd6dd44), (0x3fdc463ac85c4fc4, 0x3feced7af23188c8, 0x3fdb5d101285fa87), (0x3fdc787e99315d1d, 0x3fece2b32dae84df, 0x3fdb8a77fb79b6c9), (0x3fdcaac2c391b562, 0x3fecd7d984771033, 0x3fdbb7cf38286324), (0x3fdcdd06bf1547a8, 0x3fecccee1a978b83, 0x3fdbe515317a2767), (0x3fdd0f4a93c25d1a, 0x3fecc1f0f53b85ed, 0x3fdc1249d2e7d3d7), (0x3fdd418e7cb2c1ac, 0x3fecb6e20e487888, 0x3fdc3f6d35bf5483), (0x3fdd73d292000a16, 0x3fecabc16721278b, 0x3fdc6c7f53ab8367), (0x3fdda616817e6870, 0x3feca08f18d00ef9, 0x3fdc997fc72e0f8c), (0x3fddd85a5a1d76bb, 0x3fec954b270990c5, 0x3fdcc66e8203a2b0), (0x3fde0a9e6cc874c6, 0x3fec89f5868b6c72, 0x3fdcf34bb0b7c500), (0x3fde3ce26dc62595, 0x3fec7e8e4f51729a, 0x3fdd2016f3f2781b), (0x3fde6f264279a0bc, 0x3fec73158e8e71da, 0x3fdd4cd0187c011d), (0x3fdea16a5680fb27, 0x3fec678b32bc65ff, 0x3fdd797762744bde), (0x3fded3ae3d42764c, 0x3fec5bef5bdf2d0e, 0x3fdda60c55ccc8d6), (0x3fdf05f24a4f648b, 0x3fec5041fdd6d9d1, 0x3fddd28f21277b30), (0x3fdf383651e91662, 0x3fec448329efd6a1, 0x3fddfeff8240fbd5), (0x3fdf6a7a42ac94c9, 0x3fec38b2eb87ae38, 0x3fde2b5d4e604dad), (0x3fdf9cbe1d3b3429, 0x3fec2cd149d960c0, 0x3fde57a86acebf99), (0x3fdfcf020d397ddb, 0x3fec20de41e8a738, 0x3fde83e0e2af7102), (0x3fe000a313205931, 0x3fec14d9d64b5fbe, 0x3fdeb006abd63bf5), (0x3fe019c4f9037b82, 0x3fec08c42ac58524, 0x3fdedc194338385b), (0x3fe032e70860dbf0, 0x3febfc9d20441910, 0x3fdf08191a1b3b78), (0x3fe04c0906e9f028, 0x3febf064da5e176a, 0x3fdf3405af2bebc0), (0x3fe0652b03fdfd24, 0x3febe41b5936a8f3, 0x3fdf5fdf024485e7), (0x3fe07e4d0b91a0db, 0x3febd7c09e80889b, 0x3fdf8ba50d2a0c48), (0x3fe0976ef1f9c5ae, 0x3febcb54c769186a, 0x3fdfb75768e7fa9b), (0x3fe0b090e7d18f51, 0x3febbed7c3a63454, 0x3fdfe2f64f20fede), (0x3fe0c9b2e9a39b2d, 0x3febb2499c87d6e9, 0x3fe00740cf66097a), (0x3fe0e2d4ddcaea14, 0x3feba5aa669df22e, 0x3fe01cfc88506502), (0x3fe0fbf6e1078c8e, 0x3feb98fa1b3ff547, 0x3fe032ae5dc3499b), (0x3fe11518d98a4a52, 0x3feb8c38cf44e150, 0x3fe048562c12ec1d), (0x3fe12e3add8e0dcc, 0x3feb7f667f41f3ed, 0x3fe05df3f90aa7ce), (0x3fe1475ce227685c, 0x3feb728338a5b7b8, 0x3fe07387ade96093), (0x3fe1607ec66ef221, 0x3feb658f1462d09f, 0x3fe0891121335eab), (0x3fe179a0be834cc7, 0x3feb5889ffa66df4, 0x3fe09e9072510f34), (0x3fe192c2b8568d3a, 0x3feb4b740bbd2fb5, 0x3fe0b405848141dd), (0x3fe1abe4b59244a1, 0x3feb3e4d3fd6bd5a, 0x3fe0c9704befbcad), (0x3fe1c506b2c3bd2e, 0x3feb3115a5da6c4f, 0x3fe0ded0b8742978), (0x3fe1de289173a977, 0x3feb23cd5613980e, 0x3fe0f426a30829d9), (0x3fe1f74a8f60e6e9, 0x3feb167438110e6e, 0x3fe1097232ed0851), (0x3fe2106ca03e5fd0, 0x3feb090a5a650d01, 0x3fe11eb350745afe), (0x3fe2298e9db917c4, 0x3feafb8fd9ca55cd, 0x3fe133e9ce192ca0), (0x3fe242b08f9ff90f, 0x3feaee04ba8026f6, 0x3fe14915a5706056), (0x3fe25bd28f640cf6, 0x3feae068f72931d9, 0x3fe15e36ded7bb02), (0x3fe274f47c0699ab, 0x3fead2bcaa0d2e11, 0x3fe1734d518c9922), (0x3fe28e1685bcfd71, 0x3feac4ffc15749f9, 0x3fe1885918f9ac8e), (0x3fe2a736fe1e1494, 0x3feab7333233d9b8, 0x3fe19d58c0a87f59), (0x3fe2c05a9f4eb35a, 0x3feaa9546b006d3c, 0x3fe1b2502def806a), (0x3fe2d97c6af42411, 0x3fea9b66344e259f, 0x3fe1c73b28d8e8d8), (0x3fe2f29e6e1ad75e, 0x3fea8d6775437b8d, 0x3fe1dc1b5a8d2e11), (0x3fe30bc06e6776fd, 0x3fea7f5856cdc155, 0x3fe1f0f0859072c6), (0x3fe324e1d749c5d3, 0x3fea7139354a6417, 0x3fe205ba2249d8d8), (0x3fe33e045ca06605, 0x3fea63092400435f, 0x3fe21a798c19a7e7), (0x3fe357267800e833, 0x3fea54c9076262f5, 0x3fe22f2d7377bfc7), (0x3fe370485e5be5d6, 0x3fea4678cad14810, 0x3fe243d5f7a46407), (0x3fe3896a4d91d9d2, 0x3fea3818540e8f5e, 0x3fe258733edb7371), (0x3fe3a28c599d4a38, 0x3fea29a7a0666245, 0x3fe26d054caf4fb0), (0x3fe3bbae6b7bfd61, 0x3fea1b26c5d7e6f3, 0x3fe2818c01832d38), (0x3fe3d4d03ed37a30, 0x3fea0c95f4fd994c, 0x3fe29607190147e2), (0x3fe3edf268105227, 0x3fe9fdf4e0b7d0c8, 0x3fe2aa76ff6bb39c), (0x3fe4071445cf2ff6, 0x3fe9ef43eff2b264, 0x3fe2bedb24e4da22), (0x3fe420363dd59357, 0x3fe9e082f06f9ebe, 0x3fe2d333cf8d19db), (0x3fe4395843ba2fd5, 0x3fe9d1b1f26b4f3c, 0x3fe2e780e8af8d40), (0x3fe4527a412b9192, 0x3fe9c2d10c2c8325, 0x3fe2fbc251bb901b), (0x3fe46b9c27a00cd7, 0x3fe9b3e04f99cbd5, 0x3fe30ff7f2910508), (0x3fe484be309a8d17, 0x3fe9a4dfa3af458c, 0x3fe32421ecef726e), (0x3fe49de033475607, 0x3fe995cf29f2517a, 0x3fe33840139184b1), (0x3fe4b7021f53f06e, 0x3fe986aef5919af2, 0x3fe34c524d121d49), (0x3fe4d0240f052501, 0x3fe9777f00244ce3, 0x3fe36058a217b268), (0x3fe4e9463716a53c, 0x3fe9683f32f2ae77, 0x3fe3745330215f46), (0x3fe502681e9a6016, 0x3fe958efe0cb9eeb, 0x3fe388418ac15fff), (0x3fe51b8a16255181, 0x3fe94990e237aa2e, 0x3fe39c23e5b6b244), (0x3fe534ac09d21709, 0x3fe93a224cd1d5d1, 0x3fe3affa24f6eadd), (0x3fe54dcdf3b3627e, 0x3fe92aa42dcf60f8, 0x3fe3c3c437a1dace), (0x3fe566effef25aad, 0x3fe91b16740dcf8f, 0x3fe3d782336d7ad4), (0x3fe5801215d3e4d3, 0x3fe90b79366e5e53, 0x3fe3eb33faf99080), (0x3fe59933ff695620, 0x3fe8fbcca2107806, 0x3fe3fed9559bf7b0), (0x3fe5b255ee2800cd, 0x3fe8ec10a14c3bbf, 0x3fe412725ec52f70), (0x3fe5cb77fdabb08b, 0x3fe8dc452c6aa905, 0x3fe425ff1fc9c896), (0x3fe5e499ef7e111b, 0x3fe8cc6a746824f6, 0x3fe4397f5c023a11), (0x3fe5fdbbe1bc4e34, 0x3fe8bc807027ea41, 0x3fe44cf31eda7ea0), (0x3fe616ddfb59ae0d, 0x3fe8ac8710ace7bf, 0x3fe4605a7a5aaeff), (0x3fe62fffdd1c090a, 0x3fe89c7e9c66cac4, 0x3fe473b51912497a), (0x3fe64921cfa4dad5, 0x3fe88c66ef074f3a, 0x3fe48703271ce28c), (0x3fe66243d696fbd4, 0x3fe87c401005fc0b, 0x3fe49a449b40f2d3), (0x3fe67b65d9d2cf92, 0x3fe86c0a18cb21f8, 0x3fe4ad795715a3e0), (0x3fe69487ba5a4421, 0x3fe85bc52776b9fc, 0x3fe4c0a1373040c0), (0x3fe6ada9c7509d39, 0x3fe84b7112ce79ff, 0x3fe4d3bc6c09e271), (0x3fe6c6cbcacfb7d4, 0x3fe83b0e07c9e659, 0x3fe4e6cac0c53e06), (0x3fe6dfedb933bfad, 0x3fe82a9c1836e690, 0x3fe4f9cc20e5450e), (0x3fe6f90fb67efcd3, 0x3fe81a1b36b01830, 0x3fe50cc09bf06f81), (0x3fe71231b7343738, 0x3fe8098b74dea97c, 0x3fe51fa81d7d14ee), (0x3fe72b53a85aa5f7, 0x3fe7f8ece98512ef, 0x3fe532828ba63ede), (0x3fe74475af974694, 0x3fe7e83f85f965d1, 0x3fe5454ff702d29b), (0x3fe75d9799aac134, 0x3fe7d783768ce953, 0x3fe558102da86994), (0x3fe776b9903c81f2, 0x3fe7c6b8a9e499af, 0x3fe56ac34326d2d0), (0x3fe78fdba082343f, 0x3fe7b5df2167453d, 0x3fe57d6935ab2542), (0x3fe7a8fdabde74d9, 0x3fe7a4f6fbedea5e, 0x3fe59001e2ecf285), (0x3fe7c21f8108af2e, 0x3fe794006540f1ec, 0x3fe5a28d1b2b3ab1), (0x3fe7db4178eb2ca1, 0x3fe782fb2be4619a, 0x3fe5b50b14a054d2), (0x3fe7f4637bfce457, 0x3fe771e76a206b4b, 0x3fe5c77bb26e7244), (0x3fe80d856a5e5235, 0x3fe760c5402e27c0, 0x3fe5d9ded1dab0d8), (0x3fe826a77aed6885, 0x3fe74f94932c2aa4, 0x3fe5ec348fa6b9f0), (0x3fe83fc98cbf9b15, 0x3fe73e55841a0699, 0x3fe5fe7cc8635d47), (0x3fe858eb8885faf1, 0x3fe72d082dab83d0, 0x3fe610b75fe5f52f), (0x3fe8720d606e4a1c, 0x3fe71baca44228b1, 0x3fe622e441189d58), (0x3fe88b2f5ac151d8, 0x3fe70a42c20978bf, 0x3fe63503939a8c88), (0x3fe8a4516df43f11, 0x3fe6f8ca982975b6, 0x3fe64715452c2eb4), (0x3fe8bd735c5fca22, 0x3fe6e7445c4d6347, 0x3fe659191e5ec1a0), (0x3fe8d69563ed711a, 0x3fe6d5afee23005e, 0x3fe66b0f407fe153), (0x3fe8efb7598d0461, 0x3fe6c40d769b68c4, 0x3fe67cf781aec585), (0x3fe908d9720f2113, 0x3fe6b25cdb7a4ca4, 0x3fe68ed1fc7319a2), (0x3fe921fb5b6a3d21, 0x3fe6a09e61712f13, 0x3fe6a09e6b8d4885), ] /* The huge reduction tables. R[j] = (l1, l2, l3), such that l1+l2+l3 is less than 2pi, and 2^(50j + 25) is approximately l1+l2+l3 (mod 2pi). The stepping of 50 was chosen because it is a nice, round number, close to 52. [GB91] make reference to a difficult-to-reduce number, Xhard, which is in the range (-2^27, 2^27). By using an offset of 25, we can ensure that huge_reduce returns results in about that range. This allows us to reuse calculations in [GB91] showing we have probably stored enough bits of pi (and of these numbers). TODO: There is always the chance that, for some x = x1*2^j + x2*2^k, there is catastrophic cancellation which makes the remainder sum x1*r[50j+25] + x2*r[50k+25] imprecise. That needs to be checked; it shouldn't be too hard. */ const R : (uint64, uint64, uint64)[20] = [ (0x4011fb222e13e839, 0xbcbeecfb7d19df11, 0x3955f9708d867b5b), /* 2^25 mod 2pi */ (0x3ffee6639887604d, 0x3c88b437ad3f55e0, 0x39218522303457a8), /* 2^75 mod 2pi */ (0x3fc589bfb31f1687, 0x3c31994c1edb7977, 0xb8dd15afd80892a0), /* 2^125 mod 2pi */ (0x400663e27d2e0b47, 0xbca00a74acd21bf1, 0x39257592b34b8c25), /* 2^175 mod 2pi */ (0x40177ff13d560e79, 0xbc8403f354e865f6, 0x38e6e69984c4338e), /* 2^225 mod 2pi */ (0x4015a4a4671a3606, 0x3ca6fc3b1a2bddd5, 0xb9270923530d0279), /* 2^275 mod 2pi */ (0x3ff3bf65f73a0474, 0x3c67deda97eb4131, 0x38fa32118f8f578f), /* 2^325 mod 2pi */ (0x3ffa8e506685f311, 0x3c698a6391d9d31b, 0x38db58d212d28d0a), /* 2^375 mod 2pi */ (0x400d38d7ecc58385, 0x3c98b076242f0ed3, 0x38e04e83f1274a16), /* 2^425 mod 2pi */ (0x4017023cb671ed43, 0xbcbeb418af32f189, 0xb944d3aea8efaa5f), /* 2^475 mod 2pi */ (0x401655d13f672fe9, 0x3ca8e56fc8ad533e, 0x393d464a045591bf), /* 2^525 mod 2pi */ (0x3feec8dd916fa3b6, 0x3c82c834374ed2af, 0x39225586c285a0d8), /* 2^575 mod 2pi */ (0x400ed2900b47ba63, 0x3c697a0d5776cc4d, 0x390c831068ee85b1), /* 2^625 mod 2pi */ (0x4008563b8e99caac, 0xbc5ee03ee870ab9d, 0x38fb980dd9756063), /* 2^675 mod 2pi */ (0x4012d961cf7cd57d, 0xbcbdde0a1d27628e, 0xb95a8840599f8246), /* 2^725 mod 2pi */ (0x4016919aac4a18f4, 0xbc93552937a28b88, 0xb9145e70b28c0cbb), /* 2^775 mod 2pi */ (0x400c37d195196372, 0xbca948065398479d, 0xb94cadd4f6e80c1b), /* 2^825 mod 2pi */ (0x400d77a34a63ebae, 0x3ca8ca5d7ccc2874, 0xb91611b1e4b65369), /* 2^875 mod 2pi */ (0x3ff4cb08f62cb38b, 0xbc90f42df8fc967a, 0xb8ed7d71202d2f45), /* 2^925 mod 2pi */ (0x401848860f8742d6, 0x3c90337738c287b4, 0xb92f24fc614ec2f7), /* 2^975 mod 2pi */ ] const sin32 = {x : flt32 var s, s2 (s, s2, _, _) = w64((x : flt64), true, false) -> round_down(s, s2) } const sin64 = {x : flt64 var s (s, _, _, _) = w64(x, true, false) -> s } const cos32 = {x : flt32 var c, c2 (_, _, c, c2) = w64((x : flt64), false, true) -> round_down(c, c2) } const cos64 = {x : flt64 var c (_, _, c, _) = w64(x, false, true) -> c } const sincos32 = {x : flt32 var s : flt64, s2 : flt64, c : flt64, c2 : flt64 (s, s2, c, c2) = w64((x : flt64), true, true) -> (round_down(s, s2), round_down(c, c2)) } const sincos64 = {x : flt64 var s, c (s, _, c, _) = w64(x, true, true) -> (s, c) } /* Calculate sin and/or cos */ const w64 = {x : flt64, want_sin : bool, want_cos : bool var sin_ret : flt64 = 0.0 var cos_ret : flt64 = 0.0 var sin_ret2 : flt64 = 0.0 var cos_ret2 : flt64 = 0.0 var e : int64 (_, e, _) = std.flt64explode(x) if e == 1024 -> (std.flt64nan(), 0.0, std.flt64nan(), 0.0) ;; var N : int64 var x1 : flt64, x2 : flt64 (N, x1, x2) = trig_reduce(x) /* Handle multiples of pi/2 */ var swap_sin_cos : bool = false var then_negate_sin : bool = false var then_negate_cos : bool = false match N | 1: /* sin(x + pi/2) = cos(x), cos(x + pi/2) = -sin(x) */ swap_sin_cos = true then_negate_cos = true | 2: /* sin(x + pi) = -sin(x), cos(x + pi) = -cos(x) */ then_negate_cos = true then_negate_sin = true | 3: /* sin(x + 3pi/2) = -cos(x), cos(x + 3pi/2) = sin(x) */ swap_sin_cos = true then_negate_sin = true | _: ;; var first_negate_sin : bool = false if x1 < 0.0 x1 = -x1 x2 = -x2 first_negate_sin = true ;; /* Figure out where in the C table x lies */ var xi : uint64, sin_xi : uint64, cos_xi : uint64, sin_delta, cos_delta (xi, cos_xi, sin_xi) = trig_table_approx(x1, C) /* Compute x - xi with compensated summation. Because xi and delta both lie in the same interval of width (pi/4)/256, which is less than 1/256, we can use that |delta1| < 2^-8: we need a polynomial approximation of at least degree 7. This also gives that |delta2| < 2^(-60), vanishing quickly in the polynomial approximations. */ var delta1, delta2, deltat (delta1, deltat) = fast2sum(-std.flt64frombits(xi), x1) (delta2, _) = fast2sum(deltat, x2) /* sin(delta); the degree 2 coefficient is near 0, so delta_2 only shows up in deg 1 */ sin_delta = horner_polyu(delta1, sin_coeff[:]) sin_delta += delta2 * std.flt64frombits(sin_coeff[1]) /* cos(delta); delta_2 shows up in deg 1 and 2; the term we care about is a1*d2 + 2*a2*d1*d2 */ cos_delta = horner_polyu(delta1, cos_coeff[:]) cos_delta += delta2 * fma64(delta1, 2.0*std.flt64frombits(cos_coeff[2]), std.flt64frombits(cos_coeff[1])) var q : flt64[4] if (want_sin && !swap_sin_cos) || (want_cos && swap_sin_cos) (q[0], q[1]) = two_by_two(std.flt64frombits(sin_xi), cos_delta) (q[2], q[3]) = two_by_two(std.flt64frombits(cos_xi), sin_delta) std.sort(q[:], fltabscmp) (sin_ret, sin_ret2) = double_compensated_sum(q[:]) ;; if (want_cos && !swap_sin_cos) || (want_sin && swap_sin_cos) (q[0], q[1]) = two_by_two(std.flt64frombits(cos_xi), cos_delta) (q[2], q[3]) = two_by_two(std.flt64frombits(sin_xi), sin_delta) q[2] = -q[2] q[3] = -q[3] /* No need to sort; cos_xi and sin_xi are in [0,1], cos_delta is close to 1, sin_delta is close to 0. */ std.swap(&q[1], &q[2]) (cos_ret, cos_ret2) = double_compensated_sum(q[:]) ;; if first_negate_sin sin_ret = -sin_ret sin_ret2 = -sin_ret2 ;; if swap_sin_cos std.swap(&sin_ret, &cos_ret) std.swap(&sin_ret2, &cos_ret2) ;; if then_negate_sin sin_ret = -sin_ret sin_ret2 = -sin_ret2 ;; if then_negate_cos cos_ret = -cos_ret cos_ret2 = -cos_ret2 ;; -> (sin_ret, sin_ret2, cos_ret, cos_ret2) } /* Reduce x to N*(pi/2) + x', with x' in [-pi/4, pi/4] */ const trig_reduce = {x : flt64 var N : int64 = 0 var Nf : flt64 /* We actually only care about N mod 4. If x is very large, the ultimate N that we end up using might not be representable (either as an int64 or flt64), so we instead just keep track of the mod 4 part exactly. */ /* If we want to store pi in a form to properly reduce 2^1000 or so, it turns out that there's some complication with the final digits: they trail off beyond subnormality and can't be represented, even though they are the digits we need for the reduction. Therefore, for extremely high values of x, we pre-compute the reduction and return it here. We get: x1 about in range 2^25, and that (x1 + x2 + x3) approximates x (mod 2pi) very well. This is good enough to ensure that N = x/pi is representable (int64 and flt64). We do need to worry about catastrophic cancellation, however. */ var x1, x2, x3 (x1, x2, x3) = huge_reduce_2pi(x) var pi_o_2_0 : flt64 = std.flt64frombits(pi_over_2[0]) var pi_o_2_1 : flt64 = std.flt64frombits(pi_over_2[1]) var pi_o_2_2 : flt64 = std.flt64frombits(pi_over_2[2]) var pi_o_2_3 : flt64 = std.flt64frombits(pi_over_2[3]) var pi_o_4_0 : flt64 = std.flt64frombits(pi_over_4[0]) var pi_o_4_1 : flt64 = std.flt64frombits(pi_over_4[1]) var pi_o_4_2 : flt64 = std.flt64frombits(pi_over_4[2]) var pi_o_4_3 : flt64 = std.flt64frombits(pi_over_4[3]) var total_N = 0 var q : flt64[11] /* Compute initial reduction -- this might not be sufficient. */ total_N = rn(x1 * std.flt64frombits(two_over_pi)) Nf = (-total_N : flt64) (q[0], q[ 4], q[5]) = (x1, x2, x3) (q[1], q[ 3]) = two_by_two(Nf, pi_o_2_0) (q[2], q[ 6]) = two_by_two(Nf, pi_o_2_1) (q[7], q[ 8]) = two_by_two(Nf, pi_o_2_2) (q[9], q[10]) = two_by_two(Nf, pi_o_2_3) /* Sorting is very slow, but it's only the top five or so that are in question */ std.sort(q[0:5], fltabscmp) (x1, x2) = double_compensated_sum(q[:]) while !(le_22(x1, x2, pi_o_4_0, pi_o_4_1) && le_22(-pi_o_4_0, -pi_o_4_1, x1, x2)) N = rn(x1 * std.flt64frombits(two_over_pi)) Nf = (-N : flt64) /* Sorting is slow. We know that x1 is roughly cancelled by Nf * pi_o_2_0, so line those up. */ (q[0], q[2]) = (x1, x2) (q[1], q[3]) = two_by_two(Nf, pi_o_2_0) (q[4], q[5]) = two_by_two(Nf, pi_o_2_1) (q[6], q[7]) = two_by_two(Nf, pi_o_2_2) (q[8], q[9]) = two_by_two(Nf, pi_o_2_3) (x1, x2) = double_compensated_sum(q[0:10]) total_N += (N % 4) ;; -> (((total_N % 4) + 4) % 4, x1, x2) } const huge_reduce_2pi = {x : flt64 var e : int64 var b : uint64 = std.flt64bits(x) (_, e, _) = std.flt64explode(x) if e < 25 -> (x, 0.0, 0.0) ;; /* Since the stepping of R is 50, and x has 53 significant bits, we get a splitting of x into two components. We want x = [ xa * 2^(50j + 25) ] + [ xb * 2^(50(j-1) + 25) ] and {ai}, {bi} such that a1 + a2 + a3 === ( 2^(50j + 25) ) mod 2pi b1 + b2 + b3 === ( 2^(50(j-1) + 25) ) mod 2pi If j is small enough, we can slack off a bit. We really just want xa(a1+a2+a3) + xb(b1+b2+b3) === x (mod 2pi) with a heck of a lot of precision. */ var j : uint64 = (e - 25 : uint64) / 50 var xa : flt64 = 0.0, xb : flt64 = 0.0, xc : flt64 = 0.0 var a1 : flt64 = 0.0, a2 : flt64 = 0.0, a3 : flt64 = 0.0 var b1 : flt64 = 0.0, b2 : flt64 = 0.0, b3 : flt64 = 0.0 var c1 : flt64 = 0.0, c2 : flt64 = 0.0, c3 : flt64 = 0.0 var u1 : uint64, u2 : uint64, u3 : uint64 var xcur = x var e1 : int64 = 50*(j : int64) + 25 var e2 : int64 = e1 - 50 var e3 : int64 = e2 - 50 xa = trunc(scale2(x,-e1)) xb = trunc(scale2(x,-e2)) xc = trunc(scale2(x,-e3)) xc -= scale2(xb, 50) xb -= scale2(xa, 50) (u1, u2, u3) = R[j] a1 = std.flt64frombits(u1) a2 = std.flt64frombits(u2) a3 = std.flt64frombits(u3) if j == 0 (b1, b2, b3) = (x - scale2(xa, e1), 0.0, 0.0) xb = 1.0 (c1, c2, c3) = (0.0, 0.0, 0.0) xc = 0.0 else (u1, u2, u3) = R[j - 1] b1 = std.flt64frombits(u1) b2 = std.flt64frombits(u2) b3 = std.flt64frombits(u3) if j == 1 (c1, c2, c3) = (x - scale2(xa, e1) - scale2(xb, e2), 0.0, 0.0) xc = 1.0 else (u1, u2, u3) = R[j - 2] c1 = std.flt64frombits(u1) c2 = std.flt64frombits(u2) c3 = std.flt64frombits(u3) ;; ;; /* Now we need to combine all this. Even worse, when we multiply the two together, we need to keep full precision (at least for the high-ish bits), so we need some help. TODO: The c-type calculations can probably be optimized, since xc is so small. */ var q : flt64[18] (q[ 0], q[ 1]) = two_by_two(xa, a1) (q[ 2], q[ 3]) = two_by_two(xa, a2) (q[ 4], q[ 5]) = two_by_two(xa, a3) (q[ 6], q[ 7]) = two_by_two(xb, b1) (q[ 8], q[ 9]) = two_by_two(xb, b2) (q[10], q[11]) = two_by_two(xb, b3) (q[12], q[13]) = two_by_two(xc, c1) (q[14], q[15]) = two_by_two(xc, c2) (q[16], q[17]) = two_by_two(xc, c3) -> triple_compensated_sum(q[:]) } const trig_table_approx = {x1 : flt64, C : (uint64, uint64, uint64)[257] var j = (rn(x1 * std.flt64frombits(oneohtwofour_over_pi)) : uint64) var x1u = std.flt64bits(x1) var xi, sin_xi, cos_xi, test_cos_xi, test_sin_xi /* We either want j or j - 1. */ j = std.max(std.min(j, 256), 0) (xi, cos_xi, sin_xi) = C[j] if j > 0 var test_xi (test_xi, test_cos_xi, test_sin_xi) = C[j - 1] if std.abs(x1u - test_xi) < std.abs(x1u - xi) -> (test_xi, test_cos_xi, test_sin_xi) ;; ;; -> (xi, cos_xi, sin_xi) } const triple_compensated_sum = {q : flt64[:] /* TODO: verify, with GAPPA or something, that this is correct. */ std.sort(q, fltabscmp) var s1 : flt64, s2 : flt64, s3 var t1 : flt64, t2 : flt64, t3 : flt64, t4 : flt64, t5 : flt64, t6 s1 = q[0] s2 = 0.0 s3 = 0.0 for qq : q[1:] (t5, t6) = fast2sum(s3, qq) (t3, t4) = fast2sum(s2, t5) (t1, t2) = fast2sum(s1, t3) s1 = t1 (s2, s3) = fast2sum(t2, t4 + t6) ;; -> (s1, s2, s3) } const fltabscmp = {x : flt64, y : flt64 var xb = std.flt64bits(x) & ~(1 << 63) var yb = std.flt64bits(y) & ~(1 << 63) if xb == yb -> `std.Equal elif xb > yb -> `std.Before else -> `std.After ;; } /* Return true iff (a1 + a2) <= (b1 + b2) */ const le_22 = { a1, a2, b1, b2 if a1 < b1 -> true elif a1 > b1 -> false ;; if a2 > b2 -> false ;; -> true }