ref: 7cf4634e668730749aa8b7fa9ff16cf4234958fa
parent: d850c3b7f47e58556c160f9d03ea20aa52452020
author: rodri <rgl@antares-labs.eu>
date: Fri Nov 24 11:48:14 EST 2023
clean and organize things up. implement VZEROUPPER.
--- /dev/null
+++ b/avx.h
@@ -1,0 +1,40 @@
+#define VEX_m_0F (1) /* m-mmmm field: opcode map 0F */
+#define VEX_m_0F38 (2) /* m-mmmm field: opcode map 0F 38 */
+#define VEX_m_0F3A (3) /* m-mmmm field: opcode map 0F 3A */
+#define VEX_L_128 (0) /* L bit: 128-bit vector length */
+#define VEX_L_256 (1) /* L bit: 256-bit vector length */
+#define VEX_p_NO (0) /* pp field: no implied prefix */
+#define VEX_p_66 (1) /* pp field: implied 66 prefix */
+#define VEX_p_F3 (2) /* pp field: implied F3 prefix */
+#define VEX_p_F2 (3) /* pp field: implied F2 prefix */
+/* VEX3: 3-byte VEX prefix = C4, ~R:~X:~B:m-mmmm, W:~vvvv:L:pp. r, x, b (1 bit each) and v (4 bits) are passed uninverted; m is a VEX_m_*, l a VEX_L_*, p a VEX_p_* value. Every field is masked to its width so a complemented value cannot spill into W, ~R or ~X. */
+#define VEX3(r, x, b, m, w, v, l, p) BYTE $0xC4; \
+ BYTE $((((~(r))&1)<<7)|(((~(x))&1)<<6)|(((~(b))&1)<<5)|((m)&0x1F)); \
+ BYTE $((((w)&1)<<7)|(((~(v))&0xF)<<3)|(((l)&1)<<2)|((p)&3))
+#define VEX2(r, v, l, p) BYTE $0xC5; \
+ BYTE $((((~(r))&1)<<7)|(((~(v))&0xF)<<3)|(((l)&1)<<2)|((p)&3)) /* 2-byte VEX prefix = C5, ~R:~vvvv:L:pp; r (1 bit) and v (4 bits) are passed uninverted, l is a VEX_L_*, p a VEX_p_* value */
+#define VOP(o, m, ro, rm) BYTE $(o); \
+ BYTE $(((m)<<6)|((ro)<<3)|(rm)) /* opcode byte o, then ModRM = mod(m):reg(ro):rm; fields are not masked, so ro and rm must be 0-7 */
+#define VOPi(o, m, ro, rm, i) VOP((o), (m), (ro), (rm)); \
+ BYTE $(i) /* VOP followed by one extra byte i (used below as an imm8 or as a disp8) */
+
+
+/* VZEROUPPER: VEX.128.0F prefix followed by opcode 77; no ModRM byte and no operands */
+#define VZEROUPPER VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_NO); BYTE $0x77
+
+/* VMOVUPD_128mr: opcode 10 with mod=01, d in ModRM.reg, s in ModRM.rm as base register, off emitted as a single disp8 byte (NOTE(review): s=rSP would require a SIB byte, which is not emitted). VMOVAPD_128rr: opcode 28 with mod=11, register to register, d in ModRM.reg and s in ModRM.rm. */
+#define VMOVUPD_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+ VOPi(0x10, 0x1, (d), (s), (off))
+#define VMOVAPD_128rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+ VOP(0x28, 0x3, (d), (s))
+/* VDPPD: VEX.128.66.0F3A prefix, opcode 41, register form (mod=11); s0 goes in VEX.vvvv, s1 in ModRM.rm, d in ModRM.reg; the imm8 is fixed at 0x31 */
+#define VDPPD(s0, s1, d) VEX3(0,0,0,VEX_m_0F3A,0,(s0),VEX_L_128,VEX_p_66); \
+ VOPi(0x41, 0x3, (d), (s1), 0x31)
+
+/* VFMADD231SD (scalar double, XMM registers): VEX.128.66.0F38 prefix with W=1, opcode B9, register form (mod=11); s0 goes in VEX.vvvv, s1 in ModRM.rm, d in ModRM.reg */
+#define VFMADD231SD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66); \
+ VOP(0xB9, 0x3, (d), (s1))
+
+/* VFMADD231PD (packed double, 128 bit): VEX.128.66.0F38 prefix with W=1, opcode B8, register form (mod=11); s0 goes in VEX.vvvv, s1 in ModRM.rm, d in ModRM.reg */
+#define VFMADD231PD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66); \
+ VOP(0xB8, 0x3, (d), (s1))
--- a/dppd.s
+++ b/dppd.s
@@ -1,4 +1,6 @@
+#include "regs.h"
#include "sse.h"
+#include "avx.h"
DATA one(SB)/8,$1.0
GLOBL one(SB), $8
--- a/mkfile
+++ b/mkfile
@@ -9,6 +9,8 @@
nanosec.$O\
HFILES=\
+ regs.h\
sse.h\
+ avx.h\
</sys/src/cmd/mkone
--- /dev/null
+++ b/regs.h
@@ -1,0 +1,19 @@
+/* GPRs: register numbers 0-7, sized to fit a 3-bit ModRM reg or rm field */
+#define rAX 0
+#define rCX 1
+#define rDX 2
+#define rBX 3
+#define rSP 4
+#define rBP 5
+#define rSI 6
+#define rDI 7
+
+/* SSE and AVX register numbers 0-7 (represent [XYZ]MM); the trailing comment presumably names the register selected when the matching extension bit is set -- TODO confirm */
+#define rX0 0 /* X8 */
+#define rX1 1 /* X9 */
+#define rX2 2 /* X10 */
+#define rX3 3 /* X11 */
+#define rX4 4 /* X12 */
+#define rX5 5 /* X13 */
+#define rX6 6 /* X14 */
+#define rX7 7 /* X15 */
--- a/sse.h
+++ b/sse.h
@@ -1,30 +1,3 @@
-#define rAX 0
-#define rCX 1
-#define rDX 2
-#define rBX 3
-#define rSP 4
-#define rBP 5
-#define rSI 6
-#define rDI 7
-
-#define rX0 0
-#define rX1 1
-#define rX2 2
-#define rX3 3
-#define rX4 4
-#define rX5 5
-#define rX6 6
-
-#define VEX_m_0F (1)
-#define VEX_m_0F38 (2)
-#define VEX_m_0F3A (3)
-#define VEX_L_128 (0)
-#define VEX_L_256 (1)
-#define VEX_p_NO (0)
-#define VEX_p_66 (1)
-#define VEX_p_F3 (2)
-#define VEX_p_F2 (3)
-
#define OP(o, m, ro, rm) WORD $0x0F66; BYTE $(o); \
BYTE $(((m)<<6)|((ro)<<3)|(rm))
#define OPi(o, m, ro, rm, i) OP((o), (m), (ro), (rm)); \
@@ -34,15 +7,6 @@
#define OP4i(o, m, ro, rm, i) OP4((o), (m), (ro), (rm)); \
BYTE $(i)
-#define VEX3(r, x, b, m, w, v, l, p) BYTE $0xC4; \
- BYTE $(((~r)<<7)|((~x)<<6)|((~b)<<5)|(m)); \
- BYTE $(((w)<<7)|((~v)<<3)|((l)<<2)|(p))
-#define VEX2(r, b, l, p) BYTE $0xC5; \
- BYTE $(((~r)<<7)|((~v)<<3)|((l)<<2)|(p))
-#define VOP(o, m, ro, rm) BYTE $(o); \
- BYTE $(((m)<<6)|((ro)<<3)|(rm))
-#define VOPi(o, m, ro, rm, i) VOP((o), (m), (ro), (rm)); \
- BYTE $(i)
/* MOVLPD */
//opcode = 660F12
@@ -66,20 +30,3 @@
//modrm = 11 000 001 [X1 → X0]
//imm8 = 0011 0001
#define DPPD(s, d) OP4i(0x413A, 0x3, (d), (s), 0x31)
-
-/* VMOVAPD */
-#define VMOVUPD_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
- VOPi(0x10, 0x1, (d), (s), (off))
-#define VMOVAPD_128rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
- VOP(0x28, 0x3, (d), (s))
-/* VDPPD */
-#define VDPPD(s0, s1, d) VEX3(0,0,0,VEX_m_0F3A,0,(s0),VEX_L_128,VEX_p_66); \
- VOPi(0x41, 0x3, (d), (s1), 0x31)
-
-/* VFMADD231SD (128 bit) */
-#define VFMADD231SD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66); \
- VOP(0xB9, 0x3, (d), (s1))
-
-/* VFMADD231PD (128 bit) */
-#define VFMADD231PD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66); \
- VOP(0xB8, 0x3, (d), (s1))