shithub: mc

Download patch

ref: d27469804f73bf0874250eda520532875ecf5645
parent: 31356a321f79afc2dd2cc5a6b7c9c72f77cf3394
author: S. Gilles <sgilles@math.umd.edu>
date: Wed Mar 21 19:21:23 EDT 2018

Implement assembly version of fused multiply-add

The feature flag for fma includes OSXSAVE and AVX detection, as
these are prerequisites for using the xmm/ymm registers. We do not,
however, check the result of XGETBV (as recommended in the Intel
software developer manual, vol 1, 14.5.3) because this seems to be
a check that should be performed at runtime (see the note in section
14.3).

This is slightly uncomfortable -- perhaps libmath should come with
an __init__ that bails if the user is trying to use AVX stuff when
the OS isn't using XSAVE.

--- a/lib/math/bld.sub
+++ b/lib/math/bld.sub
@@ -9,6 +9,7 @@
 	fpmath-sum-impl.myr
 
 	# fused-multiply-add
+	fpmath-fma-impl+posixy-x64-fma.s
 	fpmath-fma-impl.myr
 
 	lib ../std:std
--- /dev/null
+++ b/lib/math/fpmath-fma-impl+posixy-x64-fma.s
@@ -1,0 +1,13 @@
+.globl math$fma32	/* 32-bit fused multiply-add, exported under two names */
+.globl math$_fma32	/* NOTE(review): presumably for an underscore-prefixed caller or platform symbol -- confirm */
+math$fma32:
+math$_fma32:	/* both labels resolve to the same instruction below */
+	vfmadd132ss %xmm1, %xmm2, %xmm0	/* AT&T order: xmm0 = xmm0 * xmm1 + xmm2 on the low single, rounded once */
+	ret	/* NOTE(review): assumes args arrive in xmm0..xmm2 and the result is returned in xmm0 -- confirm against the ABI */
+
+.globl math$fma64	/* 64-bit fused multiply-add, exported under two names */
+.globl math$_fma64	/* NOTE(review): presumably for an underscore-prefixed caller or platform symbol -- confirm */
+math$fma64:
+math$_fma64:	/* both labels resolve to the same instruction below */
+	vfmadd132sd %xmm1, %xmm2, %xmm0	/* AT&T order: xmm0 = xmm0 * xmm1 + xmm2 on the low double, rounded once */
+	ret	/* NOTE(review): assumes args arrive in xmm0..xmm2 and the result is returned in xmm0 -- confirm against the ABI */
--- a/mbld/opts.myr
+++ b/mbld/opts.myr
@@ -35,6 +35,13 @@
 
 	/* not exactly portable, but good enough for now */
 	const CpuidSSE4 : uint64= 0x180000
+
+	/*
+	   FMA (bit 12) plus OSXSAVE (bit 27) and AVX (bit 28), which the
+	   Intel manuals (vol 1, 14.5.3) also require. Full detection
+	   should check XGETBV as well; that is not done right now.
+	 */
+	const CpuidFMA  : uint64= 0x18001000
 	extern const cpufeatures : (-> uint64)
 ;;
 
--- a/mbld/syssel.myr
+++ b/mbld/syssel.myr
@@ -165,6 +165,9 @@
 			if opt_cpufeatures & CpuidSSE4 == CpuidSSE4
 				tag(b, "sse4")
 			;;
+			if opt_cpufeatures & CpuidFMA == CpuidFMA
+				tag(b, "fma")
+			;;
 		| unknown:
 			std.fatal("unknown architecture {}\n", unknown)
 		;;