ref: d27469804f73bf0874250eda520532875ecf5645
parent: 31356a321f79afc2dd2cc5a6b7c9c72f77cf3394
author: S. Gilles <sgilles@math.umd.edu>
date: Wed Mar 21 19:21:23 EDT 2018
Implement assembly version of fused multiply-add. The feature flag for fma includes OSXSave and AVX detection, as these are prerequisites for using the xmm/ymm registers. We do not, however, check the result of XGETBV (as recommended in the Intel software developer manual, vol 1, 14.5.3) because this seems to be a check that should be performed at runtime (see the note in section 14.3). This is slightly uncomfortable -- perhaps libmath should come with an __init__ that bails if the user is trying to use AVX stuff when the OS isn't using XSAVE.
--- a/lib/math/bld.sub
+++ b/lib/math/bld.sub
@@ -9,6 +9,7 @@
fpmath-sum-impl.myr
# fused-multiply-add
+ fpmath-fma-impl+posixy-x64-fma.s
fpmath-fma-impl.myr
lib ../std:std
--- /dev/null
+++ b/lib/math/fpmath-fma-impl+posixy-x64-fma.s
@@ -1,0 +1,13 @@
+.globl math$fma32 /* scalar single-precision fused multiply-add */
+.globl math$_fma32 /* NOTE(review): second exported alias for the same code; if an underscore-prefixed symbol is intended it would be spelled _math$fma32 -- confirm */
+math$fma32:
+math$_fma32:
+ vfmadd132ss %xmm1, %xmm2, %xmm0 /* AT&T operand order: xmm0 = xmm0 * xmm1 + xmm2, with a single rounding; presumably the three arguments arrive in xmm0, xmm1, xmm2 -- confirm against the ABI */
+ ret /* the result is left in xmm0 */
+
+.globl math$fma64 /* scalar double-precision fused multiply-add */
+.globl math$_fma64 /* NOTE(review): second exported alias for the same code; if an underscore-prefixed symbol is intended it would be spelled _math$fma64 -- confirm */
+math$fma64:
+math$_fma64:
+ vfmadd132sd %xmm1, %xmm2, %xmm0 /* AT&T operand order: xmm0 = xmm0 * xmm1 + xmm2, with a single rounding; presumably the three arguments arrive in xmm0, xmm1, xmm2 -- confirm against the ABI */
+ ret /* the result is left in xmm0 */
--- a/mbld/opts.myr
+++ b/mbld/opts.myr
@@ -35,6 +35,13 @@
/* not exactly portable, but good enough for now */
const CpuidSSE4 : uint64= 0x180000
+
+ /*
+ The Intel manuals (vol 1, 14.5.3) say that AVX and OSXSAVE
+ must also be present. For full portability, XGETBV needs to
+ be checked as well, though it isn't right now.
+ */
+ const CpuidFMA : uint64= 0x18001000
extern const cpufeatures : (-> uint64)
;;
--- a/mbld/syssel.myr
+++ b/mbld/syssel.myr
@@ -165,6 +165,9 @@
if opt_cpufeatures & CpuidSSE4 == CpuidSSE4
tag(b, "sse4")
;;
+ if opt_cpufeatures & CpuidFMA == CpuidFMA
+ tag(b, "fma")
+ ;;
| unknown:
std.fatal("unknown architecture {}\n", unknown)
;;