shithub: amd64-simd

Download patch

ref: a0b600a89c2e6e636579fe727235d036c08c7a9d
parent: 092bcb0cb43b4a1ca351a3085c512bf6afa89989
author: rodri <rgl@antares-labs.eu>
date: Wed Nov 29 16:19:16 EST 2023

Remove the hand-encoded macros for instructions recently added to 9front's assembler, and implement some tests.

--- a/avx.h
+++ b/avx.h
@@ -55,7 +55,7 @@
 #define VMOVDQA_256rm(s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66);		\
 			VOP(0x7F, 0x3, (s), (d))
 
-/* VMODQU */
+/* VMOVDQU */
 #define VMOVDQU_128mr(off, s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_F3);	\
 				VOPi(0x6F, 0x1, (d), (s), (off))
 #define VMOVDQU_128rm(s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_F3);		\
--- a/bench/main.c
+++ b/bench/main.c
@@ -5,6 +5,7 @@
 #include "../bench9/b.h"
 
 double min(double, double);
+double dotvec2_sse(Point2, Point2);
 double dotvec2_sse4(Point2, Point2);
 double dotvec2_avx(Point2, Point2);
 double dotvec3_sse4(Point3, Point3);
@@ -64,14 +65,15 @@
 bdotvec2(int fd)
 {
 	Bgr g;
-	B *b0, *b1, *b2;
+	B *b0, *b1, *b2, *b3;
 	Point2 a, b;
 	int i;
 
 	benchinitgr(&g, "2d dot product");
 	b0 = benchadd(&g, "dotvec2");
-	b1 = benchadd(&g, "dotvec2_sse4");
-	b2 = benchadd(&g, "dotvec2_avx");
+	b1 = benchadd(&g, "dotvec2_sse");
+	b2 = benchadd(&g, "dotvec2_sse4");
+	b3 = benchadd(&g, "dotvec2_avx");
 
 	while(b0->n > 0 || b1->n > 0){
 		a = Vec2(truerand()*frand(), truerand()*frand());
@@ -84,13 +86,18 @@
 
 		benchin(b1);
 		for(i = 0; i < 1e6; i++)
-			dotvec2_sse4(a, b);
+			dotvec2_sse(a, b);
 		benchout(b1);
 
 		benchin(b2);
 		for(i = 0; i < 1e6; i++)
-			dotvec2_avx(a, b);
+			dotvec2_sse4(a, b);
 		benchout(b2);
+
+		benchin(b3);
+		for(i = 0; i < 1e6; i++)
+			dotvec2_avx(a, b);
+		benchout(b3);
 	}
 
 	benchprintgr(&g, fd);
--- a/dppd.s
+++ b/dppd.s
@@ -5,10 +5,30 @@
 DATA one(SB)/8,$1.0
 GLOBL one(SB), $8
 
+TEXT round(SB), 1, $0		/* round(a): applies ROUNDSD to the double at a+0(FP) */
+	MOVSD a+0(FP), X0	/* X0 := a */
+	ROUNDSD $0x4, X0, X0	/* imm8 bit 2 set: presumably rounds per the current MXCSR mode; confirm against the Intel SDM */
+	RET			/* the rounded value is in X0 at return */
+
+TEXT addsub_sse(SB), 1, $0	/* addsub_sse(a, b): ADDSUBPD on two pairs of doubles; the result is written through b */
+	MOVQ b+8(FP), DX	/* DX := b */
+	MOVUPD 0(BP), X1	/* X1 := 16 bytes at 0(BP); presumably BP holds the first argument a on entry; confirm */
+	MOVUPD 0(DX), X0	/* X0 := 16 bytes at b */
+	ADDSUBPD X1, X0		/* intended to match addsub() in main.c: b[0]-a[0], b[1]+a[1] */
+	MOVUPD X0, 0(DX)	/* store X0 back over b */
+	RET
+
+TEXT dotvec2_sse(SB), 1, $0	/* dotvec2_sse(a, b): 2d dot product using MULPD + HADDPD */
+	MOVUPD a+0(FP), X0	/* X0 := the two doubles starting at a+0(FP) */
+	MOVUPD b+24(FP), X1	/* X1 := the two doubles starting at b+24(FP) */
+	MULPD X1, X0		/* X0 := element-wise products */
+	HADDPD X0, X0		/* horizontal add: X0 now holds the sum of the two products */
+	RET
+
 TEXT dotvec2_sse4(SB), 1, $0
 	MOVUPD a+0(FP), X0
 	MOVUPD b+24(FP), X1
-	DPPD(rX1, rX0)		/* DPPD $0x31, X1, X0 */
+	DPPD $0x31, X1, X0	/* imm8 = 0011 0001; presumably multiplies both lanes and puts the sum in the low lane of X0; confirm against the Intel SDM */
 	RET
 
 TEXT dotvec2_avx(SB), 1, $0
@@ -24,7 +44,7 @@
 TEXT dotvec3_sse4(SB), 1, $0
 	MOVUPD a+0(FP), X0
 	MOVUPD b+32(FP), X1
-	DPPD(rX1, rX0)		/* DPPD $0x31, X1, X0 */
+	DPPD $0x31, X1, X0
 	MOVSD a+16(FP), X1
 	MULSD b+48(FP), X1
 	ADDSD X1, X0
@@ -56,7 +76,7 @@
 TEXT hsubpd(SB), 1, $0
 	MOVLPD a+0(FP), X0
 	MOVHPD b+8(FP), X0
-	HSUBPD(rX0, rX0)	/* HSUBPD X0, X0 */
+	HSUBPD X0, X0		/* horizontal subtract within X0; presumably low lane := a - b; confirm against the Intel SDM */
 	RET
 
 TEXT crossvec3_sse(SB), 1, $0
@@ -68,15 +88,15 @@
 	MOVHPD a+24(FP), X2	/* X2 := [a.z][b.z] */
 	MOVAPD X1, X3
 	MULPD X2, X3
-	HSUBPD(rX3, rX3)	/* x */
+	HSUBPD X3, X3		/* x */
 	MOVAPD X2, X4
 	SHUFPD $0x1, X4, X4
 	MULPD X0, X4
-	HSUBPD(rX4, rX4)	/* y */
+	HSUBPD X4, X4		/* y */
 	MOVAPD X0, X5
 	MULPD X1, X5
 	SHUFPD $0x1, X5, X5
-	HSUBPD(rX5, rX5)	/* z */
+	HSUBPD X5, X5		/* z */
 	MOVQ BP, DI
 	MOVSD X3, 0(DI)
 	MOVSD X4, 8(DI)
--- a/main.c
+++ b/main.c
@@ -3,6 +3,7 @@
 #include <geometry.h>
 
 double min(double, double);
+double dotvec2_sse(Point2, Point2);
 double dotvec2_sse4(Point2, Point2);
 double dotvec2_avx(Point2, Point2);
 double dotvec3_sse4(Point3, Point3);
@@ -14,7 +15,16 @@
 Point2 addpt2_sse(Point2, Point2);
 Point2 addpt2_avx(Point2, Point2);
 Point3 addpt3_avx(Point3, Point3);
+void addsub_sse(double*,double*);
+double round(double);
 
+void
+addsub(double *a, double *b)	/* scalar counterpart of addsub_sse: combines a[0..1] into b[0..1] in place */
+{
+	b[0] = b[0]-a[0];	/* first element: subtract */
+	b[1] = b[1]+a[1];	/* second element: add */
+}
+
 double
 fmin(double a, double b)
 {
@@ -31,6 +41,7 @@
 main(int argc, char *argv[])
 {
 	double a, b, r;
+	double va[2], vb[2];
 	Point2 p0, p1, pr;
 	Point3 p0t, p1t, prt;
 
@@ -56,6 +67,9 @@
 	r = dotvec2(p0, p1);
 	print("dotvec2(%v, %v) = %g\n", p0, p1, r);
 	r = 0;
+	r = dotvec2_sse(p0, p1);
+	print("dotvec2_sse(%v, %v) = %g\n", p0, p1, r);
+	r = 0;
 	r = dotvec2_sse4(p0, p1);
 	print("dotvec2_sse4(%v, %v) = %g\n", p0, p1, r);
 	r = 0;
@@ -126,6 +140,26 @@
 	prt = Vec3(0,0,0);
 	prt = addpt3_avx(p0t, p1t);
 	print("addpt3_avx(%V, %V) = %V\n", p0t, p1t, prt);
+
+	print("\n");
+
+	va[0] = va[1] = a;
+	vb[0] = vb[1] = b;
+	print("addsub([%g %g], [%g %g]) = ", va[0], va[1], vb[0], vb[1]);
+	addsub(va, vb);
+	print("[%g %g]\n", vb[0], vb[1]);
+
+	va[0] = va[1] = a;
+	vb[0] = vb[1] = b;
+	print("addsub_sse([%g %g], [%g %g]) = ", va[0], va[1], vb[0], vb[1]);
+	addsub_sse(va, vb);
+	print("[%g %g]\n", vb[0], vb[1]);
+
+	print("\n");
+
+	r = 0;
+	r = round(a);
+	print("round(%g) = %g\n", a, r);
 
 	exits(nil);
 }
--- a/sse.h
+++ b/sse.h
@@ -18,26 +18,3 @@
-/* MODQU */
+/* MOVDQU */
 #define MOVDQU_mr(off, s, d) F3OPi(0x6F, 0x1, (d), (s), (off))
 #define MOVDQU_rm(off, s, d) F3OPi(0x7F, 0x1, (s), (d), (off))
-
-/* MOVLPD */
-//opcode = 660F12
-//modrm  = 01 000 000 [AX → X0] / 01 001 000 [AX → X1]
-//disp8 = 8 / 32
-//#define MOVLPD(off, s, d) OPi(0x12, 0x1, (d), (s), (off))
-
-/* MOVHPD */
-//opcode = 660F16
-//modrm  = 01 000 000 [AX → X0] / 01 001 000 [AX → X1]
-//disp8 = 16 / 40
-//#define MOVHPD(off, s, d) OPi(0x16, 0x1, (d), (s), (off))
-
-/* HSUBPD */
-//opcode = 660F7D = 01100110 00001111 01111101
-//modrm = 11 000 000 [X0 → X0]
-#define HSUBPD(s, d) OP(0x7D, 0x3, (d), (s))
-
-/* DPPD */
-//opcode = 660F3A41 = 01100110 00001111 00111010 01000001
-//modrm  = 11 000 001 [X1 → X0]
-//imm8   = 0011 0001
-#define DPPD(s, d) OP4i(0x413A, 0x3, (d), (s), 0x31)