shithub: riscv

ref: ed9e9f98e9cc502c72b27c68612e9e187ec11e10
parent: d4fb753c9c90e0ca745a1b3708ad3ec4ca523e71
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Sat Feb 1 05:31:41 EST 2014

libc and ape support for amd64

--- /dev/null
+++ b/amd64/include/ape/float.h
@@ -1,0 +1,80 @@
+#ifndef __FLOAT
+#define __FLOAT
+/* IEEE, default rounding */
+
+#define FLT_ROUNDS	1
+#define FLT_RADIX	2
+
+#define FLT_DIG		6
+#define FLT_EPSILON	1.19209290e-07
+#define FLT_MANT_DIG	24
+#define FLT_MAX		3.40282347e+38
+#define FLT_MAX_10_EXP	38
+#define FLT_MAX_EXP	128
+#define FLT_MIN		1.17549435e-38
+#define FLT_MIN_10_EXP	-37
+#define FLT_MIN_EXP	-125
+
+#define DBL_DIG		15
+#define DBL_EPSILON	2.2204460492503131e-16
+#define DBL_MANT_DIG	53
+#define DBL_MAX		1.797693134862315708145e+308
+#define DBL_MAX_10_EXP	308
+#define DBL_MAX_EXP	1024
+#define DBL_MIN		2.225073858507201383090233e-308
+#define DBL_MIN_10_EXP	-307
+#define DBL_MIN_EXP	-1021
+#define LDBL_MANT_DIG	DBL_MANT_DIG
+#define LDBL_EPSILON	DBL_EPSILON
+#define LDBL_DIG	DBL_DIG
+#define LDBL_MIN_EXP	DBL_MIN_EXP
+#define LDBL_MIN	DBL_MIN
+#define LDBL_MIN_10_EXP	DBL_MIN_10_EXP
+#define LDBL_MAX_EXP	DBL_MAX_EXP
+#define LDBL_MAX	DBL_MAX
+#define LDBL_MAX_10_EXP	DBL_MAX_10_EXP
+
+typedef 	union FPdbleword FPdbleword;
+union FPdbleword
+{
+	double	x;
+	struct {	/* little endian */
+		long lo;
+		long hi;
+	};
+};
+
+#ifdef _RESEARCH_SOURCE
+/* define stuff needed for floating conversion */
+#define IEEE_8087	1
+#define Sudden_Underflow 1
+#endif
+#ifdef _PLAN9_SOURCE
+/* MXCSR */
+/* fcr */
+#define	FPFTZ	(1<<15)	/* amd64 */
+#define	FPINEX	(1<<12)
+#define	FPUNFL	(1<<11)
+#define	FPOVFL	(1<<10)
+#define	FPZDIV	(1<<9)
+#define	FPDNRM	(1<<8)	/* amd64 */
+#define	FPINVAL	(1<<7)
+#define	FPDAZ	(1<<6)	/* amd64 */
+#define	FPRNR	(0<<13)
+#define	FPRZ	(3<<13)
+#define	FPRPINF	(2<<13)
+#define	FPRNINF	(1<<13)
+#define	FPRMASK	(3<<13)
+#define	FPPEXT	0
+#define	FPPSGL	0
+#define	FPPDBL	0
+#define	FPPMASK	0
+/* fsr */
+#define	FPAINEX	(1<<5)
+#define	FPAUNFL	(1<<4)
+#define	FPAOVFL	(1<<3)
+#define	FPAZDIV	(1<<2)
+#define	FPADNRM	(1<<1)	/* not in plan 9 */
+#define	FPAINVAL	(1<<0)
+#endif
+#endif /* __FLOAT */
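
Note on FPdbleword: the union exposes the two 32-bit halves of an IEEE double on this little-endian machine. A minimal sketch of the kind of use it gets (the helper name dexp is illustrative, not part of the patch):

	#include <float.h>

	/* return the biased exponent field of an IEEE double */
	int
	dexp(double d)
	{
		FPdbleword w;

		w.x = d;
		return (w.hi >> 20) & 0x7FF;	/* sign is bit 31, exponent bits 30..20 */
	}
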
--- /dev/null
+++ b/amd64/include/ape/inttypes.h
@@ -1,0 +1,21 @@
+#ifndef _SUSV2_SOURCE
+#error "inttypes.h is SUSV2"
+#endif
+
+#ifndef _INTTYPES_H_
+#define _INTTYPES_H_ 1
+
+
+typedef char int8_t;
+typedef short int16_t;
+typedef int int32_t;
+typedef long long int64_t;
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+typedef long long intptr_t;
+typedef unsigned long long uintptr_t;
+
+#endif
--- /dev/null
+++ b/amd64/include/ape/math.h
@@ -1,0 +1,78 @@
+#ifndef __MATH
+#define __MATH
+#pragma lib "/$M/lib/ape/libap.a"
+
+/* a HUGE_VAL appropriate for IEEE double-precision */
+/* the correct value, 1.797693134862316e+308, overflows in ken's compiler */
+#define HUGE_VAL 1.79769313486231e+308
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern double acos(double);
+extern double asin(double);
+extern double atan(double);
+extern double atan2(double, double);
+extern double cos(double);
+extern double hypot(double, double);
+extern double sin(double);
+extern double tan(double);
+extern double cosh(double);
+extern double sinh(double);
+extern double tanh(double);
+extern double exp(double);
+extern double frexp(double, int *);
+extern double ldexp(double, int);
+extern double log(double);
+extern double log10(double);
+extern double modf(double, double *);
+extern double pow(double, double);
+extern double sqrt(double);
+extern double ceil(double);
+extern double fabs(double);
+extern double floor(double);
+extern double fmod(double, double);
+extern double NaN(void);
+extern int isNaN(double);
+extern double Inf(int);
+extern int isInf(double, int);
+
+#ifdef _RESEARCH_SOURCE
+/* does >> treat left operand as unsigned ? */
+#define Unsigned_Shifts 1
+#define	M_E		2.7182818284590452354	/* e */
+#define	M_LOG2E		1.4426950408889634074	/* log 2e */
+#define	M_LOG10E	0.43429448190325182765	/* log 10e */
+#define	M_LN2		0.69314718055994530942	/* log e2 */
+#define	M_LN10		2.30258509299404568402	/* log e10 */
+#define	M_PI		3.14159265358979323846	/* pi */
+#define	M_PI_2		1.57079632679489661923	/* pi/2 */
+#define	M_PI_4		0.78539816339744830962	/* pi/4 */
+#define	M_1_PI		0.31830988618379067154	/* 1/pi */
+#define	M_2_PI		0.63661977236758134308	/* 2/pi */
+#define	M_2_SQRTPI	1.12837916709551257390	/* 2/sqrt(pi) */
+#define	M_SQRT2		1.41421356237309504880	/* sqrt(2) */
+#define	M_SQRT1_2	0.70710678118654752440	/* 1/sqrt(2) */
+
+extern double hypot(double, double);
+extern double erf(double);
+extern double erfc(double);
+extern double j0(double);
+extern double y0(double);
+extern double j1(double);
+extern double y1(double);
+extern double jn(int, double);
+extern double yn(int, double);
+
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#define isnan(x) isNaN(x)
+#define isinf(x) isInf(x, 0)
+
+#endif /* __MATH */
--- /dev/null
+++ b/amd64/include/ape/stdarg.h
@@ -1,0 +1,18 @@
+#ifndef __STDARG
+#define __STDARG
+
+typedef char *va_list;
+
+#define va_start(list, start) list = (sizeof(start)<8 ? (char *)((long long *)&(start)+1) : \
+(char *)(&(start)+1))
+#define va_end(list)
+#define va_arg(list, mode)\
+	((sizeof(mode) == 1)?\
+		((mode*)(list += 8))[-8]:\
+	(sizeof(mode) == 2)?\
+		((mode*)(list += 8))[-4]:\
+	(sizeof(mode) == 4)?\
+		((mode*)(list += 8))[-2]:\
+		((mode*)(list += sizeof(mode)))[-1])
+
+#endif /* __STDARG */
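
Note on the layout these macros assume: every argument slot on the stack is 8 bytes wide, so va_arg always advances the list pointer by a full quadword and then indexes back to the start of the slot, where a little-endian value of any smaller size begins. A minimal usage sketch (standard C on top of the macros above; sumn is an illustrative name):

	#include <stdarg.h>

	/* sum n ints passed as variadic arguments */
	int
	sumn(int n, ...)
	{
		va_list ap;
		int i, s;

		s = 0;
		va_start(ap, n);
		for(i = 0; i < n; i++)
			s += va_arg(ap, int);
		va_end(ap);
		return s;
	}
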
--- /dev/null
+++ b/amd64/include/ape/ureg.h
@@ -1,0 +1,38 @@
+#ifndef __UREG_H
+#define __UREG_H
+#if !defined(_PLAN9_SOURCE)
+    This header file is an extension to ANSI/POSIX
+#endif
+
+struct Ureg {
+	unsigned long long	ax;
+	unsigned long long	bx;
+	unsigned long long	cx;
+	unsigned long long	dx;
+	unsigned long long	si;
+	unsigned long long	di;
+	unsigned long long	bp;
+	unsigned long long	r8;
+	unsigned long long	r9;
+	unsigned long long	r10;
+	unsigned long long	r11;
+	unsigned long long	r12;
+	unsigned long long	r13;
+	unsigned long long	r14;
+	unsigned long long	r15;
+
+	unsigned short		ds;
+	unsigned short		es;
+	unsigned short		fs;
+	unsigned short		gs;
+
+	unsigned long long	type;
+	unsigned long long	error;		/* error code (or zero) */
+	unsigned long long	pc;		/* pc */
+	unsigned long long	cs;		/* old context */
+	unsigned long long	flags;		/* old flags */
+	unsigned long long	sp;		/* sp */
+	unsigned long long	ss;		/* old stack segment */
+};
+
+#endif
--- /dev/null
+++ b/sys/src/ape/lib/9/amd64/getcallerpc.s
@@ -1,0 +1,3 @@
+TEXT getcallerpc(SB), 1, $0
+	MOVQ	-8(RARG), AX
+	RET
--- /dev/null
+++ b/sys/src/ape/lib/9/amd64/getfcr.s
@@ -1,0 +1,38 @@
+
+TEXT	setfcr(SB), $4
+	XORL	$(0x3F<<7),RARG	/* bits are cleared in csr to enable them */
+	ANDL	$0xFFC0, RARG	/* just the fcr bits */
+	WAIT	/* is this needed? */
+	STMXCSR	0(SP)
+	MOVL	0(SP), AX
+	ANDL	$~0x3F, AX
+	ORL	RARG, AX
+	MOVL	AX, 0(SP)
+	LDMXCSR	0(SP)
+	RET
+
+TEXT	getfcr(SB), $4
+	WAIT
+	STMXCSR	0(SP)
+	MOVWLZX	0(SP), AX
+	ANDL	$0xFFC0, AX
+	XORL	$(0x3F<<7),AX
+	RET
+
+TEXT	getfsr(SB), $4
+	WAIT
+	STMXCSR	0(SP)
+	MOVL	0(SP), AX
+	ANDL	$0x3F, AX
+	RET
+
+TEXT	setfsr(SB), $4
+	ANDL	$0x3F, RARG
+	WAIT
+	STMXCSR	0(SP)
+	MOVL	0(SP), AX
+	ANDL	$~0x3F, AX
+	ORL	RARG, AX
+	MOVL	AX, 0(SP)
+	LDMXCSR	0(SP)
+	RET
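
Note on the XORL against 0x3F<<7 at both ends: in MXCSR the six exception-mask bits *disable* traps when set, so getfcr/setfcr invert them to present the usual set-a-bit-to-enable view, matching the FP* constants in the float.h above. A sketch of typical use from C (assuming the FP* bits are visible, as they are natively via /$objtype/include/u.h):

	#include <u.h>
	#include <libc.h>

	void
	enablefptraps(void)
	{
		/* trap on invalid operation, divide by zero and overflow */
		setfcr(getfcr()|FPINVAL|FPZDIV|FPOVFL);
	}
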
--- /dev/null
+++ b/sys/src/ape/lib/ap/amd64/_seek.c
@@ -1,0 +1,11 @@
+extern long __SEEK(long long*, int, long long, int);
+
+long long
+_SEEK(int fd, long long o, int p)
+{
+	long long l;
+
+	if(__SEEK(&l, fd, o, p) < 0)
+		l = -1;
+	return l;
+}
--- /dev/null
+++ b/sys/src/ape/lib/ap/amd64/cycles.s
@@ -1,0 +1,5 @@
+TEXT _cycles(SB),1,$0				/* time stamp counter; cycles since power up */
+	RDTSC
+	MOVL	AX, 0(RARG)			/* lo */
+	MOVL	DX, 4(RARG)			/* hi */
+	RET
--- /dev/null
+++ b/sys/src/ape/lib/ap/amd64/lock.c
@@ -1,0 +1,26 @@
+#define _LOCK_EXTENSION
+#include "../plan9/sys9.h"
+#include <lock.h>
+
+int	tas(int*);
+
+void
+lock(Lock *lk)
+{
+	while(tas(&lk->val))
+		_SLEEP(0);
+}
+
+int
+canlock(Lock *lk)
+{
+	if(tas(&lk->val))
+		return 0;
+	return 1;
+}
+
+void
+unlock(Lock *lk)
+{
+	lk->val = 0;
+}
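
Note: this is the classic test-and-set spinlock. tas swaps a nonzero marker into lk->val and returns the old contents, so lock spins (yielding via _SLEEP(0) on each miss) until it observes a zero. A minimal usage sketch under APE's lock extension (bump is an illustrative name):

	#define _LOCK_EXTENSION
	#include <lock.h>

	static Lock countlock;	/* zero value means unlocked */
	static int count;

	void
	bump(void)
	{
		lock(&countlock);
		count++;
		unlock(&countlock);
	}
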
--- /dev/null
+++ b/sys/src/ape/lib/ap/amd64/main9.s
@@ -1,0 +1,26 @@
+#define NPRIVATES	16
+
+GLOBL	_tos(SB), $8
+GLOBL	_errnoloc(SB), $8
+GLOBL	_privates(SB), $8
+GLOBL	_nprivates(SB), $8
+
+TEXT	_main(SB), 1, $(32+NPRIVATES*8)
+
+	/* _tos = arg */
+	MOVQ	AX, _tos(SB)
+	LEAQ	24(SP), AX
+	MOVQ	AX, _errnoloc(SB)
+	LEAQ	32(SP), AX
+	MOVQ	AX, _privates(SB)
+	MOVQ	$NPRIVATES, _nprivates(SB)
+	CALL	_envsetup(SB)
+	MOVL	inargc-8(FP), RARG
+	LEAQ	inargv+0(FP), AX
+	MOVQ	AX, 8(SP)
+	MOVQ	environ(SB), AX
+	MOVQ	AX, 16(SP)
+	CALL	main(SB)
+	MOVQ	AX, RARG
+	CALL	exit(SB)
+	RET
--- /dev/null
+++ b/sys/src/ape/lib/ap/amd64/main9p.s
@@ -1,0 +1,45 @@
+#define NPRIVATES	16
+
+GLOBL	_tos(SB), $8
+GLOBL	_privates(SB), $8
+GLOBL	_nprivates(SB), $8
+
+TEXT	_mainp(SB), 1, $(3*8+NPRIVATES*8)
+
+	/* _tos = arg */
+	MOVQ	AX, _tos(SB)
+	LEAQ	8(SP), AX
+	MOVQ	AX, _privates(SB)
+	MOVQ	$NPRIVATES, _nprivates(SB)
+
+	/* _profmain(); */
+	CALL	_profmain(SB)
+
+	/* _tos->prof.pp = _tos->prof.next; */
+	MOVQ	_tos+0(SB),DX
+	MOVQ	4(DX),CX
+	MOVQ	CX,(DX)
+
+	CALL	_envsetup(SB)
+
+	/* main(argc, argv, environ); */
+	MOVL	inargc-8(FP), RARG
+	LEAQ	inargv+0(FP), AX
+	MOVQ	AX, 8(SP)
+	MOVQ	environ(SB), AX
+	MOVQ	AX, 16(SP)
+	CALL	main(SB)
+
+loop:
+	MOVL	AX, RARG
+	CALL	exit(SB)
+	MOVQ	$_profin(SB), AX	/* force loading of profile */
+	MOVL	$0, AX
+	JMP	loop
+
+TEXT	_savearg(SB), 1, $0
+	RET
+
+TEXT	_callpc(SB), 1, $0
+	MOVQ	8(RARG), AX
+	RET
--- /dev/null
+++ b/sys/src/ape/lib/ap/amd64/mkfile
@@ -1,0 +1,20 @@
+APE=/sys/src/ape
+objtype=amd64
+<$APE/config
+LIB=/$objtype/lib/ape/libap.a
+OFILES=\
+	_seek.$O\
+	cycles.$O\
+	lock.$O\
+	main9.$O\
+	main9p.$O\
+	notetramp.$O\
+	setjmp.$O\
+	strchr.$O\
+	strlen.$O\
+	tas.$O\
+
+</sys/src/cmd/mksyslib
+
+CFLAGS=-c -D_POSIX_SOURCE -D_PLAN9_SOURCE
+
--- /dev/null
+++ b/sys/src/ape/lib/ap/amd64/notetramp.c
@@ -1,0 +1,81 @@
+#include "../plan9/lib.h"
+#include "../plan9/sys9.h"
+#include <signal.h>
+#include <setjmp.h>
+
+/* A stack to hold pcs when signals nest */
+#define MAXSIGSTACK 20
+typedef struct Pcstack Pcstack;
+static struct Pcstack {
+	int sig;
+	void (*hdlr)(int, char*, Ureg*);
+	unsigned long long restorepc;
+	Ureg *u;
+} pcstack[MAXSIGSTACK];
+static int nstack = 0;
+
+static void notecont(Ureg*, char*);
+
+void
+_notetramp(int sig, void (*hdlr)(int, char*, Ureg*), Ureg *u)
+{
+	Pcstack *p;
+
+	if(nstack >= MAXSIGSTACK)
+		_NOTED(1);	/* nesting too deep; just do system default */
+	p = &pcstack[nstack];
+	p->restorepc = u->pc;
+	p->sig = sig;
+	p->hdlr = hdlr;
+	p->u = u;
+	nstack++;
+	u->pc = (unsigned long long) notecont;
+	_NOTED(2);	/* NSAVE: clear note but hold state */
+}
+
+static void
+notecont(Ureg *u, char *s)
+{
+	Pcstack *p;
+	void(*f)(int, char*, Ureg*);
+
+	p = &pcstack[nstack-1];
+	f = p->hdlr;
+	u->pc = p->restorepc;
+	nstack--;
+	(*f)(p->sig, s, u);
+	_NOTED(3);	/* NRSTR */
+}
+
+#define JMPBUFPC 1
+#define JMPBUFSP 0
+
+extern sigset_t	_psigblocked;
+
+typedef struct {
+	sigset_t set;
+	sigset_t blocked;
+	unsigned long long jmpbuf[2];
+} sigjmp_buf_amd64;
+
+void
+siglongjmp(sigjmp_buf j, int ret)
+{
+	struct Ureg *u;
+	sigjmp_buf_amd64 *jb;
+
+	jb = (sigjmp_buf_amd64*)j;
+
+	if(jb->set)
+		_psigblocked = jb->blocked;
+	if(nstack == 0 || pcstack[nstack-1].u->sp > jb->jmpbuf[JMPBUFSP])
+		longjmp((void*)jb->jmpbuf, ret);
+	u = pcstack[nstack-1].u;
+	nstack--;
+	u->ax = ret;
+	if(ret == 0)
+		u->ax = 1;
+	u->pc = jb->jmpbuf[JMPBUFPC];
+	u->sp = jb->jmpbuf[JMPBUFSP] + 8;
+	_NOTED(3);	/* NRSTR */
+}
--- /dev/null
+++ b/sys/src/ape/lib/ap/amd64/setjmp.s
@@ -1,0 +1,27 @@
+TEXT	longjmp(SB), $0
+	MOVL	r+8(FP), AX
+	CMPL	AX, $0
+	JNE	ok		/* ansi: "longjmp(0) => longjmp(1)" */
+	MOVL	$1, AX		/* bless their pointed heads */
+ok:
+	MOVQ	0(RARG), SP	/* restore sp */
+	MOVQ	8(RARG), BX	/* put return pc on the stack */
+	MOVQ	BX, 0(SP)
+	RET
+
+TEXT	setjmp(SB), $0
+	MOVQ	SP, 0(RARG)	/* store sp */
+	MOVQ	0(SP), BX	/* store return pc */
+	MOVQ	BX, 8(RARG)
+	MOVL	$0, AX		/* return 0 */
+	RET
+
+TEXT	sigsetjmp(SB), $0
+	MOVL	savemask+8(FP), BX
+	MOVL	BX, 0(RARG)
+	MOVL	$_psigblocked(SB), 4(RARG)
+	MOVQ	SP, 8(RARG)	/* store sp */
+	MOVQ	0(SP), BX	/* store return pc */
+	MOVQ	BX, 16(RARG)
+	MOVL	$0, AX	/* return 0 */
+	RET
--- /dev/null
+++ b/sys/src/ape/lib/ap/amd64/strchr.s
@@ -1,0 +1,38 @@
+	TEXT	strchr(SB), $0
+
+	MOVQ	RARG, DI
+	MOVB	c+8(FP), AX
+	CMPB	AX, $0
+	JEQ	l2	/**/
+
+/*
+ * char is not null
+ */
+l1:
+	MOVB	(DI), BX
+	CMPB	BX, $0
+	JEQ	ret0
+	ADDQ	$1, DI
+	CMPB	AX, BX
+	JNE	l1
+
+	MOVQ	DI, AX
+	SUBQ	$1, AX
+	RET
+
+/*
+ * char is null
+ */
+l2:
+	MOVQ	$-1, CX
+	CLD
+
+	REPN;	SCASB
+
+	MOVQ	DI, AX
+	SUBQ	$1, AX
+	RET
+
+ret0:
+	MOVQ	$0, AX
+	RET
--- /dev/null
+++ b/sys/src/ape/lib/ap/amd64/strlen.s
@@ -1,0 +1,16 @@
+	TEXT	strlen(SB),$0
+
+	MOVL	$0, AX
+	MOVQ	$-1, CX
+	CLD
+/*
+ * look for end of string
+ */
+
+	MOVQ	RARG, DI
+	REPN;	SCASB
+
+	MOVQ	DI, AX
+	SUBQ	RARG, AX
+	SUBQ	$1, AX
+	RET
--- /dev/null
+++ b/sys/src/ape/lib/ap/amd64/tas.s
@@ -1,0 +1,5 @@
+TEXT	tas(SB),$0
+
+	MOVL	$0xdeaddead,AX
+	XCHGL	AX,(RARG)
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/_seek.c
@@ -1,0 +1,14 @@
+#include <u.h>
+#include <libc.h>
+
+extern int _seek(vlong*, int, vlong, int);
+
+vlong
+seek(int fd, vlong o, int p)
+{
+	vlong l;
+
+	if(_seek(&l, fd, o, p) < 0)
+		l = -1LL;
+	return l;
+}
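
Note: seek returns the new file offset, or -1 on error, so the common idioms fall out directly. Sketch (filesize is an illustrative helper; it moves the file offset as a side effect, so callers that care must seek back):

	#include <u.h>
	#include <libc.h>

	vlong
	filesize(int fd)
	{
		return seek(fd, 0, 2);	/* offset 0 from the end of the file */
	}
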
--- /dev/null
+++ b/sys/src/libc/amd64/argv0.s
@@ -1,0 +1,4 @@
+GLOBL	argv0(SB), $8
+GLOBL	_tos(SB), $8
+GLOBL	_privates(SB), $8
+GLOBL	_nprivates(SB), $4
--- /dev/null
+++ b/sys/src/libc/amd64/atom.s
@@ -1,0 +1,69 @@
+TEXT ainc(SB), 1, $0	/* int ainc(int *); */
+ainclp:
+	MOVL	(RARG), AX	/* exp */
+	MOVL	AX, BX
+	INCL	BX		/* new */
+	LOCK; CMPXCHGL BX, (RARG)
+	JNZ	ainclp
+	MOVL	BX, AX
+	RET
+
+TEXT adec(SB), 1, $0	/* int adec(int*); */
+adeclp:
+	MOVL	(RARG), AX
+	MOVL	AX, BX
+	DECL	BX
+	LOCK; CMPXCHGL BX, (RARG)
+	JNZ	adeclp
+	MOVL	BX, AX
+	RET
+
+/*
+ * int cas32(u32int *p, u32int ov, u32int nv);
+ * int cas(uint *p, int ov, int nv);
+ * int casul(ulong *p, ulong ov, ulong nv);  (casl is a back-compat alias)
+ */
+
+TEXT cas32(SB), 1, $0
+TEXT cas(SB), 1, $0
+TEXT casul(SB), 1, $0
+TEXT casl(SB), 1, $0			/* back compat */
+	MOVL	exp+8(FP), AX
+	MOVL	new+16(FP), BX
+	LOCK; CMPXCHGL BX, (RARG)
+	MOVL	$1, AX				/* use CMOVLEQ etc. here? */
+	JNZ	_cas32r0
+_cas32r1:
+	RET
+_cas32r0:
+	DECL	AX
+	RET
+
+/*
+ * int cas64(u64int *p, u64int ov, u64int nv);
+ * int casp(void **p, void *ov, void *nv);
+ */
+
+TEXT cas64(SB), 1, $0
+TEXT casp(SB), 1, $0
+	MOVQ	exp+8(FP), AX
+	MOVQ	new+16(FP), BX
+	LOCK; CMPXCHGQ BX, (RARG)
+	MOVL	$1, AX				/* use CMOVLEQ etc. here? */
+	JNZ	_cas64r0
+_cas64r1:
+	RET
+_cas64r0:
+	DECL	AX
+	RET
+
+TEXT fas64(SB), 1, $-4
+TEXT fasp(SB), 1, $-4
+	MOVQ	p+8(FP), AX
+	LOCK; XCHGQ	AX, (RARG)			/*  */
+	RET
+
+TEXT fas32(SB), 1, $-4
+	MOVL	p+8(FP), AX
+	LOCK; XCHGL	AX, (RARG)			/*  */
+	RET
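
Note: ainc/adec return the updated value, and the cas family returns 1 on success and 0 on failure, as the fall-through and DECL paths above show. A hedged sketch of the usual compare-and-swap retry loop (aadd is an illustrative name, not part of the patch):

	#include <u.h>
	#include <libc.h>

	/* atomically add delta to *p without a lock; returns the new value */
	int
	aadd(int *p, int delta)
	{
		int old;

		do
			old = *p;
		while(!cas((uint*)p, old, old+delta));
		return old+delta;
	}
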
--- /dev/null
+++ b/sys/src/libc/amd64/cycles.s
@@ -1,0 +1,5 @@
+TEXT cycles(SB),1,$0				/* time stamp counter; cycles since power up */
+	RDTSC
+	MOVL	AX, 0(RARG)			/* lo */
+	MOVL	DX, 4(RARG)			/* hi */
+	RET
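
Note: RDTSC leaves the low half of the counter in AX and the high half in DX; the two MOVLs reassemble them into the caller's little-endian uvlong. Sketch of timing a stretch of code with it (timeit is an illustrative name):

	#include <u.h>
	#include <libc.h>

	void
	timeit(void (*f)(void))
	{
		uvlong t0, t1;

		cycles(&t0);
		(*f)();
		cycles(&t1);
		print("%llud cycles\n", t1 - t0);
	}
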
--- /dev/null
+++ b/sys/src/libc/amd64/getcallerpc.s
@@ -1,0 +1,3 @@
+TEXT getcallerpc(SB), 1, $0
+	MOVQ	-8(RARG), AX
+	RET
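
Note: with the first argument passed in RARG, the address of that argument sits 8 bytes above the saved return pc, which is what MOVQ -8(RARG) fetches; callers therefore pass the address of their own first argument. Illustrative sketch (tracedmalloc is not part of the patch):

	#include <u.h>
	#include <libc.h>

	void*
	tracedmalloc(ulong size)
	{
		void *v;

		v = malloc(size);
		if(v == nil)
			fprint(2, "malloc failed, called from %#p\n", getcallerpc(&size));
		return v;
	}
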
--- /dev/null
+++ b/sys/src/libc/amd64/getfcr.s
@@ -1,0 +1,38 @@
+
+TEXT	setfcr(SB), $4
+	XORL	$(0x3F<<7),RARG	/* bits are cleared in csr to enable them */
+	ANDL	$0xFFC0, RARG	/* just the fcr bits */
+	WAIT	/* is this needed? */
+	STMXCSR	0(SP)
+	MOVL	0(SP), AX
+	ANDL	$~0x3F, AX
+	ORL	RARG, AX
+	MOVL	AX, 0(SP)
+	LDMXCSR	0(SP)
+	RET
+
+TEXT	getfcr(SB), $4
+	WAIT
+	STMXCSR	0(SP)
+	MOVWLZX	0(SP), AX
+	ANDL	$0xFFC0, AX
+	XORL	$(0x3F<<7),AX
+	RET
+
+TEXT	getfsr(SB), $4
+	WAIT
+	STMXCSR	0(SP)
+	MOVL	0(SP), AX
+	ANDL	$0x3F, AX
+	RET
+
+TEXT	setfsr(SB), $4
+	ANDL	$0x3F, RARG
+	WAIT
+	STMXCSR	0(SP)
+	MOVL	0(SP), AX
+	ANDL	$~0x3F, AX
+	ORL	RARG, AX
+	MOVL	AX, 0(SP)
+	LDMXCSR	0(SP)
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/main9.s
@@ -1,0 +1,19 @@
+#define NPRIVATES	16
+
+TEXT	_main(SB), 1, $(2*8+NPRIVATES*8)
+	MOVQ	AX, _tos(SB)
+	LEAQ	16(SP), AX
+	MOVQ	AX, _privates(SB)
+	MOVL	$NPRIVATES, _nprivates(SB)
+	MOVL	inargc-8(FP), RARG
+	LEAQ	inargv+0(FP), AX
+	MOVQ	AX, 8(SP)
+	CALL	main(SB)
+
+loop:
+	MOVQ	$_exits<>(SB), RARG
+	CALL	exits(SB)
+	JMP	loop
+
+DATA	_exits<>+0(SB)/4, $"main"
+GLOBL	_exits<>+0(SB), $5
--- /dev/null
+++ b/sys/src/libc/amd64/main9p.s
@@ -1,0 +1,41 @@
+#define NPRIVATES	16
+
+TEXT _mainp(SB), 1, $(2*8+NPRIVATES*8)
+	MOVQ	AX, _tos(SB)		/* _tos = arg */
+	LEAQ	16(SP), AX
+	MOVQ	AX, _privates(SB)
+	MOVL	$NPRIVATES, _nprivates(SB)
+
+	CALL	_profmain(SB)		/* _profmain(); */
+
+	MOVQ	_tos+0(SB), DX		/* _tos->prof.pp = _tos->prof.next; */
+	MOVQ	8(DX), CX
+	MOVQ	CX, (DX)
+
+	MOVL	inargc-8(FP), RARG	/* main(argc, argv); */
+	LEAQ	inargv+0(FP), AX
+	MOVQ	AX, 8(SP)
+	CALL	main(SB)
+
+loop:
+	MOVQ	$_exits<>(SB), RARG
+	CALL	exits(SB)
+	MOVQ	$_profin(SB), AX	/* force loading of profile */
+	JMP	loop
+
+TEXT	_savearg(SB), 1, $0
+	MOVQ	RARG, AX
+	RET
+
+TEXT	_saveret(SB), 1, $0
+	RET
+
+TEXT	_restorearg(SB), 1, $0
+	RET				/* we want RARG in RARG */
+
+TEXT	_callpc(SB), 1, $0
+	MOVQ	8(RARG), AX
+	RET
+
+DATA	_exits<>+0(SB)/4, $"main"
+GLOBL	_exits<>+0(SB), $5
--- /dev/null
+++ b/sys/src/libc/amd64/memccpy.s
@@ -1,0 +1,58 @@
+	TEXT	memccpy(SB),$0
+
+	MOVL	n+24(FP), CX
+	CMPL	CX, $0
+	JEQ	none
+	MOVQ	p2+8(FP), DI
+	MOVBLZX	c+16(FP), AX
+	CLD
+/*
+ * find the character in the second string
+ */
+
+	REPN;	SCASB
+	JEQ	found
+
+/*
+ * if not found, set count to 'n'
+ */
+none:
+	MOVL	$0, AX
+	MOVL	n+24(FP), BX
+	JMP	memcpy
+
+/*
+ * if found, set count to bytes thru character
+ */
+found:
+	MOVQ	DI, AX
+	SUBQ	p2+8(FP), AX
+	MOVQ	AX, BX
+	ADDQ	RARG, AX
+
+/*
+ * copy the memory
+ */
+
+memcpy:
+	MOVQ	RARG, DI
+	MOVQ	p2+8(FP), SI
+/*
+ * copy whole longs, if aligned
+ */
+	MOVQ	DI, DX
+	ORQ	SI, DX
+	ANDL	$3, DX
+	JNE	c3
+	MOVL	BX, CX
+	SHRQ	$2, CX
+	REP;	MOVSL
+/*
+ * copy the rest, by bytes
+ */
+	ANDL	$3, BX
+c3:
+	MOVL	BX, CX
+	REP;	MOVSB
+
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/memchr.s
@@ -1,0 +1,23 @@
+	TEXT	memchr(SB),$0
+
+	MOVL	n+16(FP), CX
+	CMPL	CX, $0
+	JEQ	none
+	MOVQ	RARG, DI
+	MOVBLZX	c+8(FP), AX
+	CLD
+/*
+ * SCASB is memchr instruction
+ */
+
+	REPN;	SCASB
+	JEQ	found
+
+none:
+	MOVL	$0, AX
+	RET
+
+found:
+	MOVQ	DI, AX
+	SUBQ	$1, AX
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/memcmp.s
@@ -1,0 +1,52 @@
+	TEXT	memcmp(SB),$0
+
+	MOVL	n+16(FP), BX
+	CMPL	BX, $0
+	JEQ	none
+	MOVQ	RARG, DI
+	MOVQ	p2+8(FP), SI
+	CLD
+	MOVQ	DI, CX
+	ORQ	SI, CX
+	ANDL	$3, CX
+	JNE	c3
+/*
+ * first by longs
+ */
+
+	MOVL	BX, CX
+	SHRQ	$2, CX
+
+	REP;	CMPSL
+	JNE	found
+
+/*
+ * then by bytes
+ */
+	ANDL	$3, BX
+c3:
+	MOVL	BX, CX
+	REP;	CMPSB
+	JNE	found1
+
+none:
+	MOVQ	$0, AX
+	RET
+
+/*
+ * if long found,
+ * back up and look by bytes
+ */
+found:
+	MOVL	$4, CX
+	SUBQ	CX, DI
+	SUBQ	CX, SI
+	REP;	CMPSB
+
+found1:
+	JLS	lt
+	MOVQ	$-1, AX
+	RET
+lt:
+	MOVQ	$1, AX
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/memcpy.s
@@ -1,0 +1,81 @@
+TEXT memcpy(SB), $0
+	MOVQ	RARG, DI
+	MOVQ	DI, AX			/* return value */
+	MOVQ	p2+8(FP), SI
+	MOVL	n+16(FP), BX
+	CMPL	BX, $0
+	JGT	_ok
+	JEQ	_return			/* nothing to do if n == 0 */
+	MOVL	$0, SI			/* fault if n < 0 */
+
+/*
+ * check and set for backwards:
+ *	(p2 < p1) && ((p2+n) > p1)
+ */
+_ok:
+	CMPQ	SI, DI
+	JGT	_forward
+	JEQ	_return			/* nothing to do if p2 == p1 */
+	MOVQ	SI, DX
+	ADDQ	BX, DX
+	CMPQ	DX, DI
+	JGT	_back
+
+/*
+ * copy whole longs if aligned
+ */
+_forward:
+	CLD
+	MOVQ	SI, DX
+	ORQ	DI, DX
+	ANDL	$3, DX
+	JNE	c3f
+	MOVQ	BX, CX
+	SHRQ	$2, CX
+	ANDL	$3, BX
+	REP;	MOVSL
+
+/*
+ * copy the rest, by bytes
+ */
+	JEQ	_return			/* flags set by above ANDL */
+c3f:
+	MOVL	BX, CX
+	REP;	MOVSB
+
+	RET
+
+/*
+ * whole thing backwards has
+ * adjusted addresses
+ */
+_back:
+	ADDQ	BX, DI
+	ADDQ	BX, SI
+	STD
+	SUBQ	$4, DI
+	SUBQ	$4, SI
+/*
+ * copy whole longs, if aligned
+ */
+	MOVQ	DI, DX
+	ORQ	SI, DX
+	ANDL	$3, DX
+	JNE	c3b
+	MOVL	BX, CX
+	SHRQ	$2, CX
+	ANDL	$3, BX
+	REP;	MOVSL
+/*
+ * copy the rest, by bytes
+ */
+	JEQ	_return			/* flags set by above ANDL */
+
+c3b:
+	ADDQ	$3, DI
+	ADDQ	$3, SI
+	MOVL	BX, CX
+	REP;	MOVSB
+
+_return:
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/memmove.s
@@ -1,0 +1,81 @@
+TEXT memmove(SB), $0
+	MOVQ	RARG, DI
+	MOVQ	DI, AX			/* return value */
+	MOVQ	p2+8(FP), SI
+	MOVL	n+16(FP), BX
+	CMPL	BX, $0
+	JGT	_ok
+	JEQ	_return			/* nothing to do if n == 0 */
+	MOVL	$0, SI			/* fault if n < 0 */
+
+/*
+ * check and set for backwards:
+ *	(p2 < p1) && ((p2+n) > p1)
+ */
+_ok:
+	CMPQ	SI, DI
+	JGT	_forward
+	JEQ	_return			/* nothing to do if p2 == p1 */
+	MOVQ	SI, DX
+	ADDQ	BX, DX
+	CMPQ	DX, DI
+	JGT	_back
+
+/*
+ * copy whole longs if aligned
+ */
+_forward:
+	CLD
+	MOVQ	SI, DX
+	ORQ	DI, DX
+	ANDL	$3, DX
+	JNE	c3f
+	MOVQ	BX, CX
+	SHRQ	$2, CX
+	ANDL	$3, BX
+	REP;	MOVSL
+
+/*
+ * copy the rest, by bytes
+ */
+	JEQ	_return			/* flags set by above ANDL */
+c3f:
+	MOVL	BX, CX
+	REP;	MOVSB
+
+	RET
+
+/*
+ * whole thing backwards has
+ * adjusted addresses
+ */
+_back:
+	ADDQ	BX, DI
+	ADDQ	BX, SI
+	STD
+	SUBQ	$4, DI
+	SUBQ	$4, SI
+/*
+ * copy whole longs, if aligned
+ */
+	MOVQ	DI, DX
+	ORQ	SI, DX
+	ANDL	$3, DX
+	JNE	c3b
+	MOVL	BX, CX
+	SHRQ	$2, CX
+	ANDL	$3, BX
+	REP;	MOVSL
+/*
+ * copy the rest, by bytes
+ */
+	JEQ	_return			/* flags set by above ANDL */
+
+c3b:
+	ADDQ	$3, DI
+	ADDQ	$3, SI
+	MOVL	BX, CX
+	REP;	MOVSB
+
+_return:
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/memset.s
@@ -1,0 +1,41 @@
+	TEXT	memset(SB),$0
+
+	CLD
+	MOVQ	RARG, DI
+	MOVBLZX	c+8(FP), AX
+	MOVL	n+16(FP), BX
+/*
+ * if not enough bytes, just set bytes
+ */
+	CMPL	BX, $9
+	JLS	c3
+/*
+ * if not aligned, just set bytes
+ */
+	MOVQ	RARG, CX
+	ANDL	$3,CX
+	JNE	c3
+/*
+ * build word in AX
+ */
+	MOVB	AL, AH
+	MOVL	AX, CX
+	SHLL	$16, CX
+	ORL	CX, AX
+/*
+ * set whole longs
+ */
+c1:
+	MOVQ	BX, CX
+	SHRQ	$2, CX
+	ANDL	$3, BX
+	REP;	STOSL
+/*
+ * set the rest, by bytes
+ */
+c3:
+	MOVL	BX, CX
+	REP;	STOSB
+ret:
+	MOVQ	RARG,AX
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/mkfile
@@ -1,0 +1,41 @@
+objtype=amd64
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libc.a
+SFILES=\
+	argv0.s\
+	atom.s\
+	cycles.s\
+	getcallerpc.s\
+	getfcr.s\
+	main9.s\
+	main9p.s\
+	memccpy.s\
+	memchr.s\
+	memcmp.s\
+	memcpy.s\
+	memmove.s\
+	memset.s\
+	muldiv.s\
+	setjmp.s\
+	sqrt.s\
+	strcat.s\
+	strchr.s\
+	strcpy.s\
+	strlen.s\
+	tas.s\
+
+CFILES=\
+	_seek.c\
+	notejmp.c\
+
+HFILES=/sys/include/libc.h
+
+OFILES=${CFILES:%.c=%.$O} ${SFILES:%.s=%.$O}
+
+UPDATE=mkfile\
+	$HFILES\
+	$CFILES\
+	$SFILES\
+
+</sys/src/cmd/mksyslib
--- /dev/null
+++ b/sys/src/libc/amd64/muldiv.s
@@ -1,0 +1,12 @@
+TEXT	umuldiv(SB), $0
+	MOVL	RARG, AX
+	MULL	b+8(FP)
+	DIVL	c+16(FP)
+	RET
+
+TEXT	muldiv(SB), $0
+	MOVL	RARG, AX
+	IMULL	b+8(FP)
+	IDIVL	c+16(FP)
+	RET
+	END
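
Note: both entry points widen to a 64-bit intermediate product (MULL/DIVL are the unsigned pair, IMULL/IDIVL the signed one), so a*b/c is exact whenever the final quotient fits in 32 bits. The unsigned variant is equivalent to this C reference (illustrative name, not part of the patch):

	#include <u.h>
	#include <libc.h>

	ulong
	refumuldiv(ulong a, ulong b, ulong c)
	{
		return (uvlong)a*b / c;	/* 64-bit intermediate, as MULL/DIVL do */
	}
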
--- /dev/null
+++ b/sys/src/libc/amd64/notejmp.c
@@ -1,0 +1,16 @@
+#include <u.h>
+#include <libc.h>
+#include <ureg.h>
+
+void
+notejmp(void *vr, jmp_buf j, int ret)
+{
+	struct Ureg *r = vr;
+
+	r->ax = ret;
+	if(ret == 0)
+		r->ax = 1;
+	r->pc = j[JMPBUFPC];
+	r->sp = j[JMPBUFSP] + 8;
+	noted(NCONT);
+}
--- /dev/null
+++ b/sys/src/libc/amd64/setjmp.s
@@ -1,0 +1,17 @@
+TEXT	longjmp(SB), $0
+	MOVL	r+8(FP), AX
+	CMPL	AX, $0
+	JNE	ok		/* ansi: "longjmp(0) => longjmp(1)" */
+	MOVL	$1, AX		/* bless their pointed heads */
+ok:
+	MOVQ	0(RARG), SP	/* restore sp */
+	MOVQ	8(RARG), BX	/* put return pc on the stack */
+	MOVQ	BX, 0(SP)
+	RET
+
+TEXT	setjmp(SB), $0
+	MOVQ	SP, 0(RARG)	/* store sp */
+	MOVQ	0(SP), BX	/* store return pc */
+	MOVQ	BX, 8(RARG)
+	MOVL	$0, AX		/* return 0 */
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/sqrt.s
@@ -1,0 +1,4 @@
+TEXT	sqrt(SB), $0
+	MOVSD	a+0(FP), X0
+	SQRTSD	X0, X0
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/strcat.s
@@ -1,0 +1,48 @@
+	TEXT	strcat(SB),$0
+
+	MOVL	$0, AX
+	MOVQ	$-1, CX
+	CLD
+
+/*
+ * find length of second string
+ */
+
+	MOVQ	p2+8(FP), DI
+	REPN;	SCASB
+
+	MOVQ	DI, BX
+	SUBQ	p2+8(FP), BX
+
+/*
+ * find end of first string
+ */
+
+	MOVQ	RARG, DI
+	REPN;	SCASB
+
+/*
+ * copy the memory
+ */
+	SUBQ	$1, DI
+	MOVQ	p2+8(FP), SI
+/*
+ * copy whole longs, if aligned
+ */
+	MOVQ	DI, CX
+	ORQ	SI, CX
+	ANDL	$3, CX
+	JNE	c3
+	MOVQ	BX, CX
+	SHRQ	$2, CX
+	REP;	MOVSL
+/*
+ * copy the rest, by bytes
+ */
+	ANDL	$3, BX
+c3:
+	MOVQ	BX, CX
+	REP;	MOVSB
+
+	MOVQ	RARG, AX
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/strchr.s
@@ -1,0 +1,38 @@
+	TEXT	strchr(SB), $0
+
+	MOVQ	RARG, DI
+	MOVB	c+8(FP), AX
+	CMPB	AX, $0
+	JEQ	l2	/**/
+
+/*
+ * char is not null
+ */
+l1:
+	MOVB	(DI), BX
+	CMPB	BX, $0
+	JEQ	ret0
+	ADDQ	$1, DI
+	CMPB	AX, BX
+	JNE	l1
+
+	MOVQ	DI, AX
+	SUBQ	$1, AX
+	RET
+
+/*
+ * char is null
+ */
+l2:
+	MOVQ	$-1, CX
+	CLD
+
+	REPN;	SCASB
+
+	MOVQ	DI, AX
+	SUBQ	$1, AX
+	RET
+
+ret0:
+	MOVQ	$0, AX
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/strcpy.s
@@ -1,0 +1,40 @@
+	TEXT	strcpy(SB),$0
+
+	MOVL	$0, AX
+	MOVQ	$-1, CX
+	CLD
+/*
+ * find end of second string
+ */
+
+	MOVQ	p2+8(FP), DI
+	REPN;	SCASB
+
+	MOVQ	DI, BX
+	SUBQ	p2+8(FP), BX
+
+/*
+ * copy the memory
+ */
+	MOVQ	RARG, DI
+	MOVQ	p2+8(FP), SI
+/*
+ * copy whole longs, if aligned
+ */
+	MOVQ	DI, CX
+	ORQ		SI, CX
+	ANDL	$3, CX
+	JNE	c3
+	MOVQ	BX, CX
+	SHRQ	$2, CX
+	REP;	MOVSL
+/*
+ * copy the rest, by bytes
+ */
+	ANDL	$3, BX
+c3:
+	MOVL	BX, CX
+	REP;	MOVSB
+
+	MOVQ	RARG, AX
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/strlen.s
@@ -1,0 +1,16 @@
+	TEXT	strlen(SB),$0
+
+	MOVL	$0, AX
+	MOVQ	$-1, CX
+	CLD
+/*
+ * look for end of string
+ */
+
+	MOVQ	RARG, DI
+	REPN;	SCASB
+
+	MOVQ	DI, AX
+	SUBQ	RARG, AX
+	SUBQ	$1, AX
+	RET
--- /dev/null
+++ b/sys/src/libc/amd64/tas.s
@@ -1,0 +1,5 @@
+TEXT	_tas(SB), 1, $0
+
+	MOVL	$0xdeaddead,AX
+	XCHGL	AX,(RARG)
+	RET
--- /dev/null
+++ b/sys/src/libmp/amd64/mkfile
@@ -1,0 +1,20 @@
+objtype=amd64
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libmp.a
+SFILES=\
+	mpdigdiv.s\
+	mpvecadd.s\
+	mpvecdigmuladd.s\
+	mpvecdigmulsub.s\
+	mpvecsub.s\
+
+HFILES=/$objtype/include/u.h /sys/include/mp.h ../port/dat.h
+
+OFILES=${SFILES:%.s=%.$O}
+
+UPDATE=mkfile\
+	$HFILES\
+	$SFILES\
+
+</sys/src/cmd/mksyslib
--- /dev/null
+++ b/sys/src/libmp/amd64/mpdigdiv.s
@@ -1,0 +1,21 @@
+TEXT	mpdigdiv(SB),$0
+
+/*	MOVL	dividend+0(FP),BX */
+	MOVL	0(RARG),AX
+	MOVL	4(RARG),DX
+	MOVL	divisor+8(FP),BX
+	MOVQ	quotient+16(FP),DI
+	XORL	CX,CX
+	CMPL	DX,BX		/* dividend >= 2^32 * divisor */
+	JHS	_divovfl
+	CMPL	BX,CX		/* divisor == 0 */
+	JE	_divovfl
+	DIVL	BX		/* AX = DX:AX/BX */
+	MOVL	AX,0(DI)
+	RET
+
+	/* return all 1's */
+_divovfl:
+	NOTL	CX
+	MOVL	CX,0(DI)
+	RET
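
Note: this divides the two-digit (64-bit, little-endian) dividend by a single 32-bit digit, storing one quotient digit and returning all 1s when the quotient would not fit (divisor zero, or high digit >= divisor). A hedged C rendering of the same contract (refmpdigdiv is illustrative):

	#include <u.h>
	#include <libc.h>
	#include <mp.h>

	void
	refmpdigdiv(mpdigit *dividend, mpdigit divisor, mpdigit *quotient)
	{
		uvlong n;

		n = ((uvlong)dividend[1]<<32) | dividend[0];
		if(divisor == 0 || dividend[1] >= divisor)
			*quotient = ~(mpdigit)0;	/* quotient overflows a digit */
		else
			*quotient = n / divisor;
	}
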
--- /dev/null
+++ b/sys/src/libmp/amd64/mpvecadd.s
@@ -1,0 +1,54 @@
+/*
+ *	mpvecadd(mpdigit *a, int alen, mpdigit *b, int blen, mpdigit *sum)
+ *
+ *		sum[0:alen] = a[0:alen-1] + b[0:blen-1]
+ *
+ *	prereq: alen >= blen, sum has room for alen+1 digits
+ */
+TEXT	mpvecadd(SB),$0
+
+	MOVL	alen+8(FP),DX
+	MOVL	blen+24(FP),CX
+/*	MOVL	a+0(FP),SI */
+	MOVQ	RARG, SI
+	MOVQ	b+16(FP),BX
+	SUBL	CX,DX
+	MOVQ	sum+32(FP),DI
+	XORL	BP,BP			/* this also sets carry to 0 */
+
+	/* skip addition if b is zero */
+	TESTL	CX,CX
+	JZ	_add1
+
+	/* sum[0:blen-1],carry = a[0:blen-1] + b[0:blen-1] */
+_addloop1:
+	MOVL	(SI)(BP*4), AX
+	ADCL	(BX)(BP*4), AX
+	MOVL	AX,(DI)(BP*4)
+	INCL	BP
+	LOOP	_addloop1
+
+_add1:
+	/* jump if alen > blen */
+	INCL	DX
+	MOVL	DX,CX
+	LOOP	_addloop2
+
+	/* sum[alen] = carry */
+_addend:
+	JC	_addcarry
+	MOVL	$0,(DI)(BP*4)
+	RET
+_addcarry:
+	MOVL	$1,(DI)(BP*4)
+	RET
+
+	/* sum[blen:alen-1],carry = a[blen:alen-1] + 0 */
+_addloop2:
+	MOVL	(SI)(BP*4),AX
+	ADCL	$0,AX
+	MOVL	AX,(DI)(BP*4)
+	INCL	BP
+	LOOP	_addloop2
+	JMP	_addend
+
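Note: the two LOOP bodies form a ripple-carry add; ADCL folds the carry out of the previous digit into the next, and sum[alen] receives whatever carry is left. A hedged C equivalent of the same contract (refmpvecadd is illustrative):

	#include <u.h>
	#include <libc.h>
	#include <mp.h>

	void
	refmpvecadd(mpdigit *a, int alen, mpdigit *b, int blen, mpdigit *sum)
	{
		int i;
		uvlong carry;

		carry = 0;
		for(i = 0; i < blen; i++){
			carry += (uvlong)a[i] + b[i];
			sum[i] = carry;		/* low 32 bits */
			carry >>= 32;
		}
		for(; i < alen; i++){
			carry += a[i];
			sum[i] = carry;
			carry >>= 32;
		}
		sum[alen] = carry;		/* leftover carry, 0 or 1 */
	}
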
--- /dev/null
+++ b/sys/src/libmp/amd64/mpvecdigmuladd.s
@@ -1,0 +1,53 @@
+/*
+ *	mpvecdigmuladd(mpdigit *b, int n, mpdigit m, mpdigit *p)
+ *
+ *	p += b*m
+ *
+ *	each step looks like:
+ *		hi,lo = m*b[i]
+ *		lo += oldhi + carry
+ *		hi += carry
+ *		p[i] += lo
+ *		oldhi = hi
+ *
+ *	the registers are:
+ *		hi = DX		- constrained by hardware
+ *		lo = AX		- constrained by hardware
+ *		b+n = SI	- can't be BP
+ *		p+n = DI	- can't be BP
+ *		i-n = BP
+ *		m = BX
+ *		oldhi = CX
+ *		
+ */
+TEXT	mpvecdigmuladd(SB),$0
+
+/*	MOVQ	b+0(FP),SI	*/
+	MOVQ	RARG,SI
+	MOVL	n+8(FP),CX
+	MOVL	m+16(FP),BX
+	MOVQ	p+24(FP),DI
+	MOVL	CX,BP
+	NEGQ	BP		/* BP = -n */
+	SHLL	$2,CX
+	ADDQ	CX,SI		/* SI = b + n */
+	ADDQ	CX,DI		/* DI = p + n */
+	XORL	CX,CX
+_muladdloop:
+	MOVL	(SI)(BP*4),AX	/* lo = b[i] */
+	MULL	BX		/* hi, lo = b[i] * m */
+	ADDL	CX,AX		/* lo += oldhi */
+	JCC	_muladdnocarry1
+	INCL	DX		/* hi += carry */
+_muladdnocarry1:
+	ADDL	AX,(DI)(BP*4)	/* p[i] += lo */
+	JCC	_muladdnocarry2
+	INCL	DX		/* hi += carry */
+_muladdnocarry2:
+	MOVL	DX,CX		/* oldhi = hi */
+	INCQ	BP		/* i++ */
+	JNZ	_muladdloop
+	XORL	AX,AX
+	ADDL	CX,(DI)(BP*4)	/* p[n] += oldhi */
+	ADCL	AX,AX		/* return carry out of p[n] */
+	RET
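
Note: per the register plan in the header comment, each iteration forms the 64-bit product b[i]*m, folds in the previous high half plus both carries, and adds the low half into p[i]; the carry out of p[n] comes back in AX. A hedged C equivalent (refmpvecdigmuladd is illustrative):

	#include <u.h>
	#include <libc.h>
	#include <mp.h>

	int
	refmpvecdigmuladd(mpdigit *b, int n, mpdigit m, mpdigit *p)
	{
		int i;
		uvlong carry;

		carry = 0;
		for(i = 0; i < n; i++){
			carry += (uvlong)b[i]*m + p[i];
			p[i] = carry;		/* low 32 bits */
			carry >>= 32;
		}
		carry += p[n];
		p[n] = carry;
		return carry >> 32;		/* carry out of p[n], 0 or 1 */
	}
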
--- /dev/null
+++ b/sys/src/libmp/amd64/mpvecdigmulsub.s
@@ -1,0 +1,53 @@
+/*
+ *	mpvecdigmulsub(mpdigit *b, int n, mpdigit m, mpdigit *p)
+ *
+ *	p -= b*m
+ *
+ *	each step looks like:
+ *		hi,lo = m*b[i]
+ *		lo += oldhi + carry
+ *		hi += carry
+ *		p[i] += lo
+ *		oldhi = hi
+ *
+ *	the registers are:
+ *		hi = DX		- constrained by hardware
+ *		lo = AX		- constrained by hardware
+ *		b = SI		- can't be BP
+ *		p = DI		- can't be BP
+ *		i = BP
+ *		n = CX		- constrained by LOOP instr
+ *		m = BX
+ *		oldhi = R8
+ *		
+ */
+TEXT	mpvecdigmulsub(SB),$0
+
+/*	MOVL	b+0(FP),SI	*/
+	MOVQ	RARG,SI
+	MOVL	n+8(FP),CX
+	MOVL	m+16(FP),BX
+	MOVQ	p+24(FP),DI
+	XORL	BP,BP
+	MOVL	BP,R8
+_mulsubloop:
+	MOVL	(SI)(BP*4),AX		/* lo = b[i] */
+	MULL	BX			/* hi, lo = b[i] * m */
+	ADDL	R8,AX		/* lo += oldhi */
+	JCC	_mulsubnocarry1
+	INCL	DX			/* hi += carry */
+_mulsubnocarry1:
+	SUBL	AX,(DI)(BP*4)
+	JCC	_mulsubnocarry2
+	INCL	DX			/* hi += carry */
+_mulsubnocarry2:
+	MOVL	DX,R8
+	INCL	BP
+	LOOP	_mulsubloop
+	SUBL	R8,(DI)(BP*4)
+	JCC	_mulsubnocarry3
+	MOVQ	$-1,AX
+	RET
+_mulsubnocarry3:
+	MOVQ	$1,AX
+	RET
--- /dev/null
+++ b/sys/src/libmp/amd64/mpvecsub.s
@@ -1,0 +1,45 @@
+/*
+ *	mpvecsub(mpdigit *a, int alen, mpdigit *b, int blen, mpdigit *diff)
+ *
+ *		diff[0:alen-1] = a[0:alen-1] - b[0:blen-1]
+ *
+ *	prereq: alen >= blen, diff has room for alen digits
+ */
+TEXT	mpvecsub(SB),$0
+
+/*	MOVQ	a+0(FP),SI */
+	MOVQ	RARG, SI
+	MOVQ	b+16(FP),BX
+	MOVL	alen+8(FP),DX
+	MOVL	blen+24(FP),CX
+	MOVQ	diff+32(FP),DI
+	SUBL	CX,DX
+	XORL	BP,BP			/* this also sets carry to 0 */
+
+	/* skip subtraction if b is zero */
+	TESTL	CX,CX
+	JZ	_sub1
+
+	/* diff[0:blen-1],borrow = a[0:blen-1] - b[0:blen-1] */
+_subloop1:
+	MOVL	(SI)(BP*4),AX
+	SBBL	(BX)(BP*4),AX
+	MOVL	AX,(DI)(BP*4)
+	INCL	BP
+	LOOP	_subloop1
+
+_sub1:
+	INCL	DX
+	MOVL	DX,CX
+	LOOP	_subloop2
+	RET
+
+	/* diff[blen:alen-1] = a[blen:alen-1] - 0 */
+_subloop2:
+	MOVL	(SI)(BP*4),AX
+	SBBL	$0,AX
+	MOVL	AX,(DI)(BP*4)
+	INCL	BP
+	LOOP	_subloop2
+	RET
+
--- /dev/null
+++ b/sys/src/libsec/amd64/md5block.s
@@ -1,0 +1,212 @@
+/*
+ *  rfc1321 requires that I include this.  The code is new.  The constants
+ *  all come from the rfc (hence the copyright).  We trade a table for the
+ *  macros in the rfc.  The total size is a lot less. -- presotto
+ *
+ *	Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+ *	rights reserved.
+ *
+ *	License to copy and use this software is granted provided that it
+ *	is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ *	Algorithm" in all material mentioning or referencing this software
+ *	or this function.
+ *
+ *	License is also granted to make and use derivative works provided
+ *	that such works are identified as "derived from the RSA Data
+ *	Security, Inc. MD5 Message-Digest Algorithm" in all material
+ *	mentioning or referencing the derived work.
+ *
+ *	RSA Data Security, Inc. makes no representations concerning either
+ *	the merchantability of this software or the suitability of this
+ *	software for any particular purpose. It is provided "as is"
+ *	without express or implied warranty of any kind.
+ *	These notices must be retained in any copies of any part of this
+ *	documentation and/or software.
+ */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+
+/*
+ * SI is data
+ *	a += FN(B,C,D);
+ *	a += x[off] + t[i];
+ *	a = (a << SH) | (a >> (32 - SH));
+ *	a += b;
+ */
+
+#define BODY1(off,V,FN,SH,A,B,C,D)\
+	FN(B,C,D)\
+	LEAL V(A)(DI*1),A;\
+	ADDL (off)(BP),A;\
+	ROLL $SH,A;\
+	ADDL B,A;\
+
+#define BODY(off,V,FN,SH,A,B,C,D)\
+	FN(B,C,D)\
+	LEAL V(A)(DI*1),A;\
+	ADDL (off)(BP),A;\
+	ROLL $SH,A;\
+	ADDL B,A;\
+
+/*
+ * fn1 = ((c ^ d) & b) ^ d
+ */
+#define FN1(B,C,D)\
+	MOVL C,DI;\
+	XORL D,DI;\
+	ANDL B,DI;\
+	XORL D,DI;\
+
+/*
+ * fn2 = ((b ^ c) & d) ^ c;
+ */
+#define FN2(B,C,D)\
+	MOVL B,DI;\
+	XORL C,DI;\
+	ANDL D,DI;\
+	XORL C,DI;\
+
+/*
+ * fn3 = b ^ c ^ d;
+ */
+#define FN3(B,C,D)\
+	MOVL B,DI;\
+	XORL C,DI;\
+	XORL D,DI;\
+
+/*
+ * fn4 = c ^ (b | ~d);
+ */
+#define FN4(B,C,D)\
+	MOVL D,DI;\
+	XORL $-1,DI;\
+	ORL B,DI;\
+	XORL C,DI;\
+
+#define	LEN	8
+#define	STATE	16
+
+TEXT	_md5block+0(SB), $0
+
+	MOVQ	RARG,R8
+	MOVLQZX len+LEN(FP),BX
+	ADDQ	BX,R8
+
+mainloop:
+	MOVQ state+STATE(FP),SI
+	MOVL (SI),AX
+	MOVL 4(SI),BX
+	MOVL 8(SI),CX
+	MOVL 12(SI),DX
+
+	BODY1( 0*4,0xd76aa478,FN1,S11,AX,BX,CX,DX)
+	BODY1( 1*4,0xe8c7b756,FN1,S12,DX,AX,BX,CX)
+	BODY1( 2*4,0x242070db,FN1,S13,CX,DX,AX,BX)
+	BODY1( 3*4,0xc1bdceee,FN1,S14,BX,CX,DX,AX)
+
+	BODY1( 4*4,0xf57c0faf,FN1,S11,AX,BX,CX,DX)
+	BODY1( 5*4,0x4787c62a,FN1,S12,DX,AX,BX,CX)
+	BODY1( 6*4,0xa8304613,FN1,S13,CX,DX,AX,BX)
+	BODY1( 7*4,0xfd469501,FN1,S14,BX,CX,DX,AX)
+
+	BODY1( 8*4,0x698098d8,FN1,S11,AX,BX,CX,DX)
+	BODY1( 9*4,0x8b44f7af,FN1,S12,DX,AX,BX,CX)
+	BODY1(10*4,0xffff5bb1,FN1,S13,CX,DX,AX,BX)
+	BODY1(11*4,0x895cd7be,FN1,S14,BX,CX,DX,AX)
+
+	BODY1(12*4,0x6b901122,FN1,S11,AX,BX,CX,DX)
+	BODY1(13*4,0xfd987193,FN1,S12,DX,AX,BX,CX)
+	BODY1(14*4,0xa679438e,FN1,S13,CX,DX,AX,BX)
+	BODY1(15*4,0x49b40821,FN1,S14,BX,CX,DX,AX)
+
+
+	BODY( 1*4,0xf61e2562,FN2,S21,AX,BX,CX,DX)
+	BODY( 6*4,0xc040b340,FN2,S22,DX,AX,BX,CX)
+	BODY(11*4,0x265e5a51,FN2,S23,CX,DX,AX,BX)
+	BODY( 0*4,0xe9b6c7aa,FN2,S24,BX,CX,DX,AX)
+
+	BODY( 5*4,0xd62f105d,FN2,S21,AX,BX,CX,DX)
+	BODY(10*4,0x02441453,FN2,S22,DX,AX,BX,CX)
+	BODY(15*4,0xd8a1e681,FN2,S23,CX,DX,AX,BX)
+	BODY( 4*4,0xe7d3fbc8,FN2,S24,BX,CX,DX,AX)
+
+	BODY( 9*4,0x21e1cde6,FN2,S21,AX,BX,CX,DX)
+	BODY(14*4,0xc33707d6,FN2,S22,DX,AX,BX,CX)
+	BODY( 3*4,0xf4d50d87,FN2,S23,CX,DX,AX,BX)
+	BODY( 8*4,0x455a14ed,FN2,S24,BX,CX,DX,AX)
+
+	BODY(13*4,0xa9e3e905,FN2,S21,AX,BX,CX,DX)
+	BODY( 2*4,0xfcefa3f8,FN2,S22,DX,AX,BX,CX)
+	BODY( 7*4,0x676f02d9,FN2,S23,CX,DX,AX,BX)
+	BODY(12*4,0x8d2a4c8a,FN2,S24,BX,CX,DX,AX)
+
+
+	BODY( 5*4,0xfffa3942,FN3,S31,AX,BX,CX,DX)
+	BODY( 8*4,0x8771f681,FN3,S32,DX,AX,BX,CX)
+	BODY(11*4,0x6d9d6122,FN3,S33,CX,DX,AX,BX)
+	BODY(14*4,0xfde5380c,FN3,S34,BX,CX,DX,AX)
+
+	BODY( 1*4,0xa4beea44,FN3,S31,AX,BX,CX,DX)
+	BODY( 4*4,0x4bdecfa9,FN3,S32,DX,AX,BX,CX)
+	BODY( 7*4,0xf6bb4b60,FN3,S33,CX,DX,AX,BX)
+	BODY(10*4,0xbebfbc70,FN3,S34,BX,CX,DX,AX)
+
+	BODY(13*4,0x289b7ec6,FN3,S31,AX,BX,CX,DX)
+	BODY( 0*4,0xeaa127fa,FN3,S32,DX,AX,BX,CX)
+	BODY( 3*4,0xd4ef3085,FN3,S33,CX,DX,AX,BX)
+	BODY( 6*4,0x04881d05,FN3,S34,BX,CX,DX,AX)
+
+	BODY( 9*4,0xd9d4d039,FN3,S31,AX,BX,CX,DX)
+	BODY(12*4,0xe6db99e5,FN3,S32,DX,AX,BX,CX)
+	BODY(15*4,0x1fa27cf8,FN3,S33,CX,DX,AX,BX)
+	BODY( 2*4,0xc4ac5665,FN3,S34,BX,CX,DX,AX)
+
+
+	BODY( 0*4,0xf4292244,FN4,S41,AX,BX,CX,DX)
+	BODY( 7*4,0x432aff97,FN4,S42,DX,AX,BX,CX)
+	BODY(14*4,0xab9423a7,FN4,S43,CX,DX,AX,BX)
+	BODY( 5*4,0xfc93a039,FN4,S44,BX,CX,DX,AX)
+
+	BODY(12*4,0x655b59c3,FN4,S41,AX,BX,CX,DX)
+	BODY( 3*4,0x8f0ccc92,FN4,S42,DX,AX,BX,CX)
+	BODY(10*4,0xffeff47d,FN4,S43,CX,DX,AX,BX)
+	BODY( 1*4,0x85845dd1,FN4,S44,BX,CX,DX,AX)
+
+	BODY( 8*4,0x6fa87e4f,FN4,S41,AX,BX,CX,DX)
+	BODY(15*4,0xfe2ce6e0,FN4,S42,DX,AX,BX,CX)
+	BODY( 6*4,0xa3014314,FN4,S43,CX,DX,AX,BX)
+	BODY(13*4,0x4e0811a1,FN4,S44,BX,CX,DX,AX)
+
+	BODY( 4*4,0xf7537e82,FN4,S41,AX,BX,CX,DX)
+	BODY(11*4,0xbd3af235,FN4,S42,DX,AX,BX,CX)
+	BODY( 2*4,0x2ad7d2bb,FN4,S43,CX,DX,AX,BX)
+	BODY( 9*4,0xeb86d391,FN4,S44,BX,CX,DX,AX)
+
+	ADDQ $(16*4),BP
+	MOVQ state+STATE(FP),DI
+	ADDL AX,0(DI)
+	ADDL BX,4(DI)
+	ADDL CX,8(DI)
+	ADDL DX,12(DI)
+
+	CMPQ BP,R8
+	JCS mainloop
+
+	RET
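
Note: each BODY/BODY1 expansion is one MD5 step. With FN1, a first-round step computes the following (reference per rfc1321; md5step1 and ROL are illustrative names, not part of the patch):

	#include <u.h>
	#include <libc.h>

	#define ROL(v, s)	(((v)<<(s)) | ((v)>>(32-(s))))

	/* one first-round MD5 step, as the BODY1 macro computes it */
	ulong
	md5step1(ulong a, ulong b, ulong c, ulong d, ulong x, ulong t, int s)
	{
		a += (((c ^ d) & b) ^ d) + x + t;	/* FN1, message word, constant */
		return ROL(a, s) + b;
	}
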
--- /dev/null
+++ b/sys/src/libsec/amd64/mkfile
@@ -1,0 +1,19 @@
+objtype=amd64
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libsec.a
+FILES=\
+	md5block\
+	sha1block\
+
+HFILES=/sys/include/libsec.h
+
+SFILES=${FILES:%=%.s}
+
+OFILES=${FILES:%=%.$O}
+
+UPDATE=mkfile\
+	$HFILES\
+	$SFILES\
+
+</sys/src/cmd/mksyslib
--- /dev/null
+++ b/sys/src/libsec/amd64/sha1block.s
@@ -1,0 +1,197 @@
+/* x = (wp[off-3] ^ wp[off-8] ^ wp[off-14] ^ wp[off-16]) <<< 1;
+ * wp[off] = x;
+ * x += A <<< 5;
+ * E += 0xca62c1d6 + x;
+ * x = FN(B,C,D);
+ * E += x;
+ * B >>> 2
+ */
+#define BSWAPDI	BYTE $0x0f; BYTE $0xcf;
+
+#define BODY(off,FN,V,A,B,C,D,E)\
+	MOVL (off-64)(BP),DI;\
+	XORL (off-56)(BP),DI;\
+	XORL (off-32)(BP),DI;\
+	XORL (off-12)(BP),DI;\
+	ROLL $1,DI;\
+	MOVL DI,off(BP);\
+	LEAL V(DI)(E*1),E;\
+	MOVL A,DI;\
+	ROLL $5,DI;\
+	ADDL DI,E;\
+	FN(B,C,D)\
+	ADDL DI,E;\
+	RORL $2,B;\
+
+#define BODY0(off,FN,V,A,B,C,D,E)\
+	MOVLQZX off(BX),DI;\
+	BSWAPDI;\
+	MOVL DI,off(BP);\
+	LEAL V(DI)(E*1),E;\
+	MOVL A,DI;\
+	ROLL $5,DI;\
+	ADDL DI,E;\
+	FN(B,C,D)\
+	ADDL DI,E;\
+	RORL $2,B;\
+
+/*
+ * fn1 = (((C^D)&B)^D);
+ */
+#define FN1(B,C,D)\
+	MOVL C,DI;\
+	XORL D,DI;\
+	ANDL B,DI;\
+	XORL D,DI;\
+
+/*
+ * fn24 = B ^ C ^ D
+ */
+#define FN24(B,C,D)\
+	MOVL B,DI;\
+	XORL C,DI;\
+	XORL D,DI;\
+
+/*
+ * fn3 = ((B ^ C) & (D ^= B)) ^ B
+ * D ^= B to restore D
+ */
+#define FN3(B,C,D)\
+	MOVL B,DI;\
+	XORL C,DI;\
+	XORL B,D;\
+	ANDL D,DI;\
+	XORL B,DI;\
+	XORL B,D;\
+
+/*
+ * stack offsets
+ * void sha1block(uchar *DATA, int LEN, ulong *STATE)
+ */
+#define	DATA	0
+#define	LEN	8
+#define	STATE	16
+
+/*
+ * stack offsets for locals
+ * ulong w[80];
+ * uchar *edata;
+ * ulong *w15, *w40, *w60, *w80;
+ * register local
+ * ulong *wp = BP
+ * ulong a = eax, b = ebx, c = ecx, d = edx, e = esi
+ * ulong tmp = edi
+ */
+#define	Rpdata	R8
+#define WARRAY	(-8-(80*4))
+#define TMP1	(-16-(80*4))
+#define TMP2	(-24-(80*4))
+#define W15	(-32-(80*4))
+#define W40	(-40-(80*4))
+#define W60	(-48-(80*4))
+#define W80	(-56-(80*4))
+#define EDATA	(-64-(80*4))
+
+TEXT	_sha1block+0(SB),$384
+
+	MOVQ RARG, Rpdata
+	MOVLQZX len+LEN(FP),BX
+	ADDQ BX, RARG
+	MOVQ RARG,edata+EDATA(SP)
+
+	LEAQ aw15+(WARRAY+15*4)(SP),DI
+	MOVQ DI,w15+W15(SP)
+	LEAQ aw40+(WARRAY+40*4)(SP),DX
+	MOVQ DX,w40+W40(SP)
+	LEAQ aw60+(WARRAY+60*4)(SP),CX
+	MOVQ CX,w60+W60(SP)
+	LEAQ aw80+(WARRAY+80*4)(SP),DI
+	MOVQ DI,w80+W80(SP)
+
+mainloop:
+	LEAQ warray+WARRAY(SP),BP
+
+	MOVQ state+STATE(FP),DI
+	MOVL (DI),AX
+	MOVL 4(DI),BX
+	MOVL BX,tmp1+TMP1(SP)
+	MOVL 8(DI),CX
+	MOVL 12(DI),DX
+	MOVL 16(DI),SI
+
+	MOVQ Rpdata,BX
+
+loop1:
+	BODY0(0,FN1,0x5a827999,AX,tmp1+TMP1(SP),CX,DX,SI)
+	MOVL SI,tmp2+TMP2(SP)
+	BODY0(4,FN1,0x5a827999,SI,AX,tmp1+TMP1(SP),CX,DX)
+	MOVL tmp1+TMP1(SP),SI
+	BODY0(8,FN1,0x5a827999,DX,tmp2+TMP2(SP),AX,SI,CX)
+	BODY0(12,FN1,0x5a827999,CX,DX,tmp2+TMP2(SP),AX,SI)
+	MOVL SI,tmp1+TMP1(SP)
+	BODY0(16,FN1,0x5a827999,SI,CX,DX,tmp2+TMP2(SP),AX)
+	MOVL tmp2+TMP2(SP),SI
+
+	ADDQ $20,BX
+	ADDQ $20,BP
+	CMPQ BP,w15+W15(SP)
+	JCS loop1
+
+	BODY0(0,FN1,0x5a827999,AX,tmp1+TMP1(SP),CX,DX,SI)
+	ADDQ $4,BX
+	MOVQ BX,R8
+	MOVQ tmp1+TMP1(SP),BX
+
+	BODY(4,FN1,0x5a827999,SI,AX,BX,CX,DX)
+	BODY(8,FN1,0x5a827999,DX,SI,AX,BX,CX)
+	BODY(12,FN1,0x5a827999,CX,DX,SI,AX,BX)
+	BODY(16,FN1,0x5a827999,BX,CX,DX,SI,AX)
+
+	ADDQ $20,BP
+
+loop2:
+	BODY(0,FN24,0x6ed9eba1,AX,BX,CX,DX,SI)
+	BODY(4,FN24,0x6ed9eba1,SI,AX,BX,CX,DX)
+	BODY(8,FN24,0x6ed9eba1,DX,SI,AX,BX,CX)
+	BODY(12,FN24,0x6ed9eba1,CX,DX,SI,AX,BX)
+	BODY(16,FN24,0x6ed9eba1,BX,CX,DX,SI,AX)
+
+	ADDQ $20,BP
+	CMPQ BP,w40+W40(SP)
+	JCS loop2
+
+loop3:
+	BODY(0,FN3,0x8f1bbcdc,AX,BX,CX,DX,SI)
+	BODY(4,FN3,0x8f1bbcdc,SI,AX,BX,CX,DX)
+	BODY(8,FN3,0x8f1bbcdc,DX,SI,AX,BX,CX)
+	BODY(12,FN3,0x8f1bbcdc,CX,DX,SI,AX,BX)
+	BODY(16,FN3,0x8f1bbcdc,BX,CX,DX,SI,AX)
+
+	ADDQ $20,BP
+	CMPQ BP,w60+W60(SP)
+	JCS loop3
+
+loop4:
+	BODY(0,FN24,0xca62c1d6,AX,BX,CX,DX,SI)
+	BODY(4,FN24,0xca62c1d6,SI,AX,BX,CX,DX)
+	BODY(8,FN24,0xca62c1d6,DX,SI,AX,BX,CX)
+	BODY(12,FN24,0xca62c1d6,CX,DX,SI,AX,BX)
+	BODY(16,FN24,0xca62c1d6,BX,CX,DX,SI,AX)
+
+	ADDQ $20,BP
+	CMPQ BP,w80+W80(SP)
+	JCS loop4
+
+	MOVQ state+STATE(FP),DI
+	ADDL AX,0(DI)
+	ADDL BX,4(DI)
+	ADDL CX,8(DI)
+	ADDL DX,12(DI)
+	ADDL SI,16(DI)
+
+	MOVQ edata+EDATA(SP),DI
+	CMPQ Rpdata,DI
+	JCS mainloop
+
+	RET
+	END
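
Note: each BODY expansion is one SHA-1 step, including the <<<1 word-schedule expansion described at the top of the file (BODY0 instead byte-swaps fresh input words). In C the same step reads (sha1step and ROL are illustrative names, not part of the patch):

	#include <u.h>
	#include <libc.h>

	#define ROL(v, s)	(((v)<<(s)) | ((v)>>(32-(s))))

	/* one expanded-schedule SHA-1 step (FN1 round shown); returns the new e */
	ulong
	sha1step(ulong *w, int i, ulong a, ulong *b, ulong c, ulong d, ulong e, ulong k)
	{
		ulong x;

		x = ROL(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
		w[i] = x;
		e += x + k + ROL(a, 5) + (((c ^ d) & *b) ^ d);
		*b = ROL(*b, 30);	/* the RORL $2,B */
		return e;
	}
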
--