ref: 96ccc85d6a58b7ebfcb69d6d470ff209668f6187
author: rodri <rgl@antares-labs.eu>
date: Thu Nov 23 06:43:50 EST 2023
initial SIMD trials of some libgeometry functions.
--- /dev/null
+++ b/dppd.s
@@ -1,0 +1,71 @@
+#include "sse.h"
+
+DATA one(SB)/8,$1.0
+GLOBL one(SB), $8
+
+TEXT dppd(SB), 1, $0
+ MOVQ SP, AX
+ MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */
+ MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */
+ MOVLPD(32, rAX, rX1) /* MOVLPD b+0(FP), X1 */
+ MOVHPD(40, rAX, rX1) /* MOVHPD b+8(FP), X1*/
+ DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
+ RET
+
+TEXT dppd3(SB), 1, $0
+ MOVQ SP, AX
+ MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */
+ MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */
+ MOVLPD(40, rAX, rX1) /* MOVLPD b+0(FP), X1 */
+ MOVHPD(48, rAX, rX1) /* MOVHPD b+8(FP), X1 */
+ DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
+ MOVSD one(SB), X1
+ MOVHPD(24, rAX, rX0) /* MOVHPD a+16(FP), X0 */
+ MOVHPD(56, rAX, rX1) /* MOVHPD b+16(FP), X1 */
+ DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
+ RET
+
+TEXT Pt2b(SB), 1, $0
+ MOVQ BP, DI
+ MOVSD x+8(FP), X0
+ MOVSD X0, 0(DI)
+ MOVSD y+16(FP), X0
+ MOVSD X0, 8(DI)
+ MOVSD w+24(FP), X0
+ MOVSD X0, 16(DI)
+ RET
+
+TEXT hsubpd(SB), 1, $0
+ MOVQ SP, AX
+ MOVLPD(8, rAX, rX0)
+ MOVHPD(16, rAX, rX0)
+ HSUBPD(rX0, rX0)
+ RET
+
+TEXT xvec3(SB), 1, $0
+ MOVQ SP, AX
+ ADDQ $8, AX
+ MOVLPD(40, rAX, rX0)
+ MOVHPD(8, rAX, rX0)
+ MOVLPD(16, rAX, rX1)
+ MOVHPD(48, rAX, rX1)
+ MOVLPD(56, rAX, rX2)
+ MOVHPD(24, rAX, rX2)
+ MOVAPD X1, X3
+ MULPD X2, X3
+ HSUBPD(rX3, rX3) /* x */
+ MOVAPD X2, X4
+ SHUFPD $0x1, X4, X4
+ MULPD X0, X4
+ HSUBPD(rX4, rX4) /* y */
+ MOVAPD X0, X5
+ MULPD X1, X5
+ SHUFPD $0x1, X5, X5
+ HSUBPD(rX5, rX5) /* z */
+ MOVQ BP, DI
+ MOVSD X3, 0(DI)
+ MOVSD X4, 8(DI)
+ MOVSD X5, 16(DI)
+ XORPD X0, X0
+ MOVSD X0, 24(DI)
+ RET
--- /dev/null
+++ b/main.c
@@ -1,0 +1,82 @@
+#include <u.h>
+#include <libc.h>
+#include <geometry.h>
+
+uvlong nanosec(void);
+double min(double, double);
+double dppd(Point2, Point2);
+double dppd3(Point3, Point3);
+Point2 Pt2b(double, double, double);
+Point3 xvec3(Point3, Point3);
+double hsubpd(double, double);
+
+double
+fmin(double a, double b)
+{
+ return a<b? a: b;
+}
+
+void
+main(int argc, char *argv[])
+{
+ uvlong t0, t1;
+ double a, b, r;
+ Point2 p0, p1;
+ Point3 p0t, p1t, pr;
+
+ GEOMfmtinstall();
+ ARGBEGIN{default:sysfatal("shit");}ARGEND
+ if(argc != 2)
+ sysfatal("shit");
+ a = strtod(argv[0], nil);
+ b = strtod(argv[1], nil);
+
+ t0 = nanosec();
+ r = fmin(a, b);
+ t1 = nanosec();
+ print("fmin(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+ t0 = nanosec();
+ r = min(a, b);
+ t1 = nanosec();
+ print("min(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+
+ p0 = Pt2b(a, 1, 1);
+ p1 = Pt2b(b, 3, 1);
+ t0 = nanosec();
+ r = dppd(p0, p1);
+ t1 = nanosec();
+ print("dppd(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
+ t0 = nanosec();
+ r = dotvec2(p0, p1);
+ t1 = nanosec();
+ print("dotvec2(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
+
+ p0t = Pt3(a, 1, 9, 1);
+ p1t = Pt3(b, 3, 4, 1);
+ t0 = nanosec();
+ r = dppd3(p0t, p1t);
+ t1 = nanosec();
+ print("dppd3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
+ t0 = nanosec();
+ r = dotvec3(p0t, p1t);
+ t1 = nanosec();
+ print("dotvec3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
+
+ t0 = nanosec();
+ r = hsubpd(a, b);
+ t1 = nanosec();
+ print("hsubpd(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+
+ p0t = Pt3(a, 1, 9, 1);
+ p1t = Pt3(b, 3, 4, 1);
+ t0 = nanosec();
+ pr = xvec3(p0t, p1t);
+ t1 = nanosec();
+ print("xvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0);
+ t0 = nanosec();
+ pr = crossvec3(p0t, p1t);
+ t1 = nanosec();
+ print("crossvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0);
+
+ exits(nil);
+}
--- /dev/null
+++ b/min.s
@@ -1,0 +1,4 @@
+TEXT min(SB), 1, $0
+ MOVSD a+0(FP), X0
+ MINSD b+8(FP), X0
+ RET
--- /dev/null
+++ b/mkfile
@@ -1,0 +1,14 @@
+</$objtype/mkfile
+
+BIN=/$objtype/bin
+TARG=stuff
+OFILES=\
+ main.$O\
+ min.$O\
+ dppd.$O\
+ nanosec.$O\
+
+HFILES=\
+ sse.h\
+
+</sys/src/cmd/mkone
--- /dev/null
+++ b/nanosec.c
@@ -1,0 +1,109 @@
+#include <u.h>
+#include <libc.h>
+#include <tos.h>
+
+/*
+ * This code is a mixture of cpuid(1) and the nanosec() found in vmx,
+ * in order to force the use of nsec(2) in case we are running in a
+ * virtualized environment where the clock is mis-bhyve-ing.
+ */
+
+typedef struct Res {
+ ulong ax, bx, cx, dx;
+} Res;
+
+static uchar _cpuid[] = {
+ 0x5E, /* POP SI (PC) */
+ 0x5D, /* POP BP (Res&) */
+ 0x58, /* POP AX */
+ 0x59, /* POP CX */
+
+ 0x51, /* PUSH CX */
+ 0x50, /* PUSH AX */
+ 0x55, /* PUSH BP */
+ 0x56, /* PUSH SI */
+
+ 0x31, 0xDB, /* XOR BX, BX */
+ 0x31, 0xD2, /* XOR DX, DX */
+
+ 0x0F, 0xA2, /* CPUID */
+
+ 0x89, 0x45, 0x00, /* MOV AX, 0(BP) */
+ 0x89, 0x5d, 0x04, /* MOV BX, 4(BP) */
+ 0x89, 0x4d, 0x08, /* MOV CX, 8(BP) */
+ 0x89, 0x55, 0x0C, /* MOV DX, 12(BP) */
+ 0xC3, /* RET */
+};
+
+static Res (*cpuid)(ulong ax, ulong cx) = (Res(*)(ulong, ulong)) _cpuid;
+
+/*
+ * nsec() is wallclock and can be adjusted by timesync
+ * so need to use cycles() instead, but fall back to
+ * nsec() in case we can't
+ */
+uvlong
+nanosec(void)
+{
+ static uvlong fasthz, xstart;
+ char buf[13], path[128];
+ ulong w;
+ uvlong x, div;
+ int fd;
+ Res r;
+
+ if(fasthz == ~0ULL)
+ return nsec() - xstart;
+
+ if(fasthz == 0){
+ /* first long in a.out header */
+ snprint(path, sizeof path, "/proc/%d/text", getpid());
+ fd = open(path, OREAD);
+ if(fd < 0)
+ goto Wallclock;
+ if(read(fd, buf, 4) != 4){
+ close(fd);
+ goto Wallclock;
+ }
+ close(fd);
+
+ w = ((ulong *) buf)[0];
+
+ switch(w){
+ default:
+ goto Wallclock;
+ case 0x978a0000: /* amd64 */
+ /* patch out POP BP -> POP AX */
+ _cpuid[1] = 0x58;
+ case 0xeb010000: /* 386 */
+ break;
+ }
+ segflush(_cpuid, sizeof(_cpuid));
+
+ r = cpuid(0x40000000, 0);
+ ((ulong *) buf)[0] = r.bx;
+ ((ulong *) buf)[1] = r.cx;
+ ((ulong *) buf)[2] = r.dx;
+ buf[12] = 0;
+
+ if(strstr(buf, "bhyve") != nil)
+ goto Wallclock;
+
+ if(_tos->cyclefreq){
+ fasthz = _tos->cyclefreq;
+ cycles(&xstart);
+ } else {
+Wallclock:
+ fasthz = ~0ULL;
+ xstart = nsec();
+ }
+ return 0;
+ }
+ cycles(&x);
+ x -= xstart;
+
+ /* this is ugly */
+ for(div = 1000000000ULL; x < 0x1999999999999999ULL && div > 1 ; div /= 10ULL, x *= 10ULL);
+
+ return x / (fasthz / div);
+}
--- /dev/null
+++ b/sse.h
@@ -1,0 +1,50 @@
+#define rAX 0
+#define rCX 1
+#define rDX 2
+#define rBX 3
+#define rSP 4
+#define rBP 5
+#define rSI 6
+#define rDI 7
+
+#define rX0 0
+#define rX1 1
+#define rX2 2
+#define rX3 3
+#define rX4 4
+#define rX5 5
+#define rX6 6
+
+#define OP(o, m, ro, rm) WORD $0x0F66; /* op + modr/m byte */ \
+ BYTE $(o); \
+ BYTE $(((m)<<6)|((ro)<<3)|(rm))
+#define OPi(o, m, ro, rm, i) OP((o), (m), (ro), (rm)); \
+ BYTE $(i)
+#define OP4(o, m, ro, rm) WORD $0x0F66; \
+ WORD $(o); \
+ BYTE $(((m)<<6)|((ro)<<3)|(rm))
+#define OP4i(o, m, ro, rm, i) OP4((o), (m), (ro), (rm)); \
+ BYTE $(i)
+
+/* MOVLPD */
+//opcode = 660F12
+//modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1]
+//disp8 = 8 / 32
+#define MOVLPD(off, s, d) OPi(0x12, 0x1, (d), (s), (off))
+
+/* MOVHPD */
+//opcode = 660F16
+//modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1]
+//disp8 = 16 / 40
+#define MOVHPD(off, s, d) OPi(0x16, 0x1, (d), (s), (off))
+
+/* HSUBPD */
+//opcode = 660F7D = 01100110 00001111 01111101
+//modrm = 11 000 000 [X0 → X0]
+#define HSUBPD(s, d) OP(0x7D, 0x3, (d), (s))
+
+/* DPPD */
+//opcode = 660F3A41 = 01100110 00001111 00111010 01000001
+//modrm = 11 000 001 [X1 → X0]
+//imm8 = 0011 0001
+#define DPPD(s, d) OP4i(0x413A, 0x3, (d), (s), 0x31)