shithub: mc


ref: b2d29ed0eebd0e3e2c162b5e83d6d0e8b6ddee57
parent: d21343766fc75075e9246b386b60e264bfd1860e
parent: 920e16ca93f648b551af2b688de6a1d5fe1cb802
author: Ori Bernstein <ori@eigenstate.org>
date: Thu May 5 18:59:19 EDT 2016

Merge ../integrate-libthread

--- /dev/null
+++ b/lib/thread/atomic-impl+plan9-x64.s
@@ -1,0 +1,71 @@
+TEXT thread$xget32+0(SB),1,$0
+	MOVL	(DI), AX
+	RET
+TEXT thread$xget64+0(SB),1,$0
+	MOVQ	(DI), AX
+	RET
+TEXT thread$xgetp+0(SB),1,$0
+	MOVQ	(DI), AX
+	RET
+
+TEXT thread$xset32+0(SB),1,$0
+	MOVL	SI, (DI)
+	RET
+TEXT thread$xset64+0(SB),1,$0
+	MOVQ	SI, (DI)
+	RET
+TEXT thread$xsetp+0(SB),1,$0
+	MOVQ	SI, (DI)
+	RET
+
+TEXT thread$xadd32+0(SB),1,$0
+	LOCK; XADDL	SI, (DI)
+	MOVL	SI, AX
+	RET
+TEXT thread$xadd64+0(SB),1,$0
+	LOCK; XADDQ	SI, (DI)
+	MOVQ	SI, AX
+	RET
+TEXT thread$xaddp+0(SB),1,$0
+	LOCK; XADDQ	SI, (DI)
+	MOVQ	SI, AX
+	RET
+
+TEXT thread$xsub32+0(SB),1,$0
+	NEGL	SI
+	LOCK; XADDL	SI, (DI)
+	MOVL	SI, AX
+	RET
+TEXT thread$xsub64+0(SB),1,$0
+	NEGQ	SI
+	LOCK; XADDQ	SI, (DI)
+	MOVQ	SI, AX
+	RET
+TEXT thread$xsubp+0(SB),1,$0
+	NEGQ	SI
+	LOCK; XADDQ	SI, (DI)
+	MOVQ	SI, AX
+	RET
+
+TEXT thread$xcas32+0(SB),1,$0
+	MOVL	SI, AX
+	LOCK; CMPXCHGL	DX, (DI)
+	RET
+TEXT thread$xcas64+0(SB),1,$0
+	MOVQ	SI, AX
+	LOCK; CMPXCHGQ	DX, (DI)
+	RET
+TEXT thread$xcasp+0(SB),1,$0
+	MOVQ	SI, AX
+	LOCK; CMPXCHGQ	DX, (DI)
+	RET
+
+TEXT thread$xchg32+0(SB),1,$0
+	MOVL	SI, AX
+	LOCK; XCHGL	(DI), AX
+	RET
+TEXT thread$xchg64+0(SB),1,$0
+	MOVQ	SI, AX
+	LOCK; XCHGQ	(DI), AX
+	RET
+TEXT thread$xchgp+0(SB),1,$0
+	MOVQ	SI, AX
+	LOCK; XCHGQ	(DI), AX
+	RET
--- /dev/null
+++ b/lib/thread/atomic-impl+x64.s
@@ -1,0 +1,90 @@
+.globl thread$xget32
+.globl _thread$xget32
+thread$xget32:
+_thread$xget32:
+	movl	(%rdi), %eax
+	ret
+.globl thread$xget64
+.globl thread$xgetp
+.globl _thread$xget64
+.globl _thread$xgetp
+thread$xget64:
+thread$xgetp:
+_thread$xget64:
+_thread$xgetp:
+	movq	(%rdi), %rax
+	ret
+
+.globl thread$xset32
+.globl _thread$xset32
+thread$xset32:
+_thread$xset32:
+	movl	%esi, (%rdi)
+	ret
+.globl thread$xset64
+.globl thread$xsetp
+.globl _thread$xset64
+.globl _thread$xsetp
+thread$xset64:
+thread$xsetp:
+_thread$xset64:
+_thread$xsetp:
+	movq	%rsi, (%rdi)
+	ret
+
+.globl thread$xadd32
+.globl _thread$xadd32
+thread$xadd32:
+_thread$xadd32:
+	lock xaddl	%esi, (%rdi)
+	movl %esi,%eax
+	ret
+.globl thread$xadd64
+.globl thread$xaddp
+.globl _thread$xadd64
+.globl _thread$xaddp
+thread$xadd64:
+thread$xaddp:
+_thread$xadd64:
+_thread$xaddp:
+	lock xaddq	%rsi, (%rdi)
+	movq %rsi,%rax
+	ret
+
+.globl thread$xcas32
+.globl _thread$xcas32
+thread$xcas32:
+_thread$xcas32:
+	movl	%esi, %eax
+	lock cmpxchgl	%edx, (%rdi)
+	ret
+.globl thread$xcas64
+.globl thread$xcasp
+.globl _thread$xcas64
+.globl _thread$xcasp
+thread$xcas64:
+thread$xcasp:
+_thread$xcas64:
+_thread$xcasp:
+	movq	%rsi, %rax
+	lock cmpxchgq	%rdx, (%rdi)
+	ret
+
+.globl thread$xchg32
+.globl _thread$xchg32
+thread$xchg32:
+_thread$xchg32:
+	movl	%esi, %eax
+	lock xchgl	(%rdi), %eax
+	ret
+.globl thread$xchg64
+.globl thread$xchgp
+.globl _thread$xchg64
+.globl _thread$xchgp
+thread$xchg64:
+thread$xchgp:
+_thread$xchg64:
+_thread$xchgp:
+	movq	%rsi, %rax
+	lock xchgq	(%rdi), %rax
+	ret
--- /dev/null
+++ b/lib/thread/atomic.myr
@@ -1,0 +1,78 @@
+use std
+
+pkg thread =
+	trait atomic @a::(integral,numeric) =
+		xget	: (p : @a# -> @a)
+		xset	: (p : @a#, v : @a -> void)
+		xadd	: (p : @a#, v : @a -> @a)
+		xcas	: (p : @a#, old : @a, new : @a -> @a)
+		xchg	: (p : @a#, new : @a -> @a)
+	;;
+
+	impl atomic int32
+	impl atomic int64
+	impl atomic uint32
+	impl atomic uint64
+;;
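+
+/*
+Usage sketch (illustrative only, not part of this change). The trait
+dispatches on the operand type, so a caller might write:
+
+	var refs : uint32 = 0
+	thread.xadd(&refs, 1)
+	if thread.xcas(&refs, 1, 0) == 1
+		std.put("dropped last ref\n")
+	;;
+
+xadd and xcas both return the value that was in memory before the
+operation.
+*/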
+
+impl atomic int32 =
+	xget	= {p; -> xget32(p castto(uint32#)) castto(int32)}
+	xset	= {p, v; xset32(p castto(uint32#), v castto(uint32))}
+	xadd	= {p, v; -> xadd32(p castto(uint32#), v castto(uint32)) castto(int32)}
+	xcas	= {p, old, new; -> xcas32(p castto(uint32#), old castto(uint32), new castto(uint32)) castto(int32)}
+	xchg	= {p, v; -> xchg32(p castto(uint32#), v castto(uint32)) castto(int32)}
+;;
+
+
+impl atomic int64 =
+	xget	= {p; -> xget64(p castto(uint64#)) castto(int64)}
+	xset	= {p, v; xset64(p castto(uint64#), v castto(uint64))}
+	xadd	= {p, v; -> xadd64(p castto(uint64#), v castto(uint64)) castto(int64)}
+	xcas	= {p, old, new; -> xcas64(p castto(uint64#), old castto(uint64), new castto(uint64)) castto(int64)}
+	xchg	= {p, v; -> xchg64(p castto(uint64#), v castto(uint64)) castto(int64)}
+;;
+
+impl atomic uint32 =
+	xget	= {p; -> xget32(p)}
+	xset	= {p, v; xset32(p, v)}
+	xadd	= {p, v; -> xadd32(p, v)}
+	xcas	= {p, old, new; -> xcas32(p, old, new)}
+	xchg	= {p, v; -> xchg32(p, v)}
+;;
+
+
+impl atomic uint64 =
+	xget	= {p; -> xget64(p)}
+	xset	= {p, v; xset64(p, v)}
+	xadd	= {p, v; -> xadd64(p, v)}
+	xcas	= {p, old, new; -> xcas64(p, old, new)}
+	xchg	= {p, v; -> xchg64(p, v)}
+;;
+
+impl atomic std.intptr =
+	xget	= {p; -> xgetp(p)}
+	xset	= {p, v; xsetp(p, v)}
+	xadd	= {p, v; -> xaddp(p, v)}
+	xcas	= {p, old, new; -> xcasp(p, old, new)}
+	xchg	= {p, v; -> xchgp(p, v)}
+;;
+
+extern const xget32	: (p : uint32# -> uint32)
+extern const xget64	: (p : uint64# -> uint64)
+extern const xgetp	: (p : std.intptr# -> std.intptr)
+
+extern const xset32	: (p : uint32#, v : uint32 -> void)
+extern const xset64	: (p : uint64#, v : uint64 -> void)
+extern const xsetp	: (p : std.intptr#, v : std.intptr -> void)
+
+extern const xadd32	: (p : uint32#, v : uint32 -> uint32)
+extern const xadd64	: (p : uint64#, v : uint64 -> uint64)
+extern const xaddp	: (p : std.intptr#, v : std.intptr -> std.intptr)
+
+extern const xcas32	: (p : uint32#, old: uint32, new : uint32 -> uint32)
+extern const xcas64	: (p : uint64#, old: uint64, new : uint64 -> uint64)
+extern const xcasp	: (p : std.intptr#, old: std.intptr, new : std.intptr -> std.intptr)
+
+extern const xchg32	: (p : uint32#, v : uint32 -> uint32)
+extern const xchg64	: (p : uint64#, v : uint64 -> uint64)
+extern const xchgp	: (p : std.intptr#, v : std.intptr -> std.intptr)
--- /dev/null
+++ b/lib/thread/bld.proj
@@ -1,0 +1,31 @@
+lib thread =
+	common.myr
+
+	# linux impl of basic thread primitives
+	condvar+linux.myr
+	mutex+linux.myr
+	spawn+linux.myr
+	exit+linux-x64.s
+
+	# freebsd impl of thread primitives
+	condvar+freebsd.myr
+	mutex+freebsd.myr
+	spawn+freebsd.myr
+	exit+freebsd-x64.s
+
+	# osx impl of thread primitives
+	#condvar+osx.myr
+	mutex+osx.myr
+	spawn+osx.myr
+	start+osx-x64.s
+
+	# 9front impl of thread primitives
+	#condvar+plan9.myr
+	mutex+plan9.myr
+	spawn+plan9.myr
+	atomic-impl+plan9-x64.s
+
+	atomic-impl+x64.s
+	atomic.myr
+;;
+
--- /dev/null
+++ b/lib/thread/common.myr
@@ -1,0 +1,5 @@
+use std
+
+pkg thread = 
+	generic Zptr = 0 castto(@a#)
+;;
--- /dev/null
+++ b/lib/thread/condvar+freebsd.myr
@@ -1,0 +1,59 @@
+use std
+use sys
+
+use "atomic.use"
+use "common.use"
+use "mutex.use"
+
+pkg thread =
+	type cond = struct
+		_mtx	: mutex#
+		_seq	: uint32
+	;;
+
+	const mkcond	: (mtx : mutex# -> cond)
+	const condwait	: (cond : cond# -> void)
+	const condsignal	: (cond : cond# -> void)
+	const condbroadcast	: (cond : cond# -> void)
+;;
+
+const mkcond = {mtx
+	-> [._mtx = mtx, ._seq = 0]
+}
+
+const condwait = {cond
+	var seq
+	var mtx
+
+	mtx = cond._mtx
+	seq = cond._seq
+
+	mtxunlock(mtx)
+	sys.umtx_op(&cond._seq castto(void#), \
+		sys.Umtxwaituintpriv, \
+		seq castto(uint64), \
+		Zptr, Zptr)
+
+	/*
+	We need to atomically set the mutex to Contended. This passes the
+	responsibility for waking any remaining waiters on to whoever
+	unlocks the mutex.
+	*/
+	while xchg(&mtx._state, Contended) != Unlocked
+		sys.umtx_op(&mtx._state castto(void#), \
+			sys.Umtxwaituintpriv, \
+			Contended castto(uint64), \
+			Zptr, Zptr)
+	;;
+}
+
+const condsignal = {cond : cond#
+	xadd(&cond._seq, 1)
+	sys.umtx_op(&cond._seq castto(void#), sys.Umtxwakepriv, 1, Zptr, Zptr)
+}
+
+const condbroadcast = {cond : cond#
+	xadd(&cond._seq, 1)
+	sys.umtx_op(&cond._seq castto(void#), sys.Umtxwakepriv, 0x7fffffff, Zptr, Zptr)
+}
+
--- /dev/null
+++ b/lib/thread/condvar+linux.myr
@@ -1,0 +1,61 @@
+use std
+use sys
+
+use "atomic.use"
+use "common.use"
+use "mutex.use"
+
+pkg thread =
+	type cond = struct
+		_mtx	: mutex#
+		_seq	: int32
+	;;
+
+	const mkcond	: (mtx : mutex# -> cond)
+	const condwait	: (cond : cond# -> void)
+	const condsignal	: (cond : cond# -> void)
+	const condbroadcast	: (cond : cond# -> void)
+;;
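+
+/*
+The condition variable is a sequence number guarded by the caller's
+mutex: condwait() snapshots _seq, releases the mutex, and sleeps in
+futex() until the sequence changes; signal and broadcast bump _seq and
+wake waiters. A woken waiter reacquires the mutex as Contended so the
+next unlocker knows to keep waking.
+*/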
+
+const mkcond = {mtx
+	-> [._mtx = mtx, ._seq = 0]
+}
+
+const condwait = {cond
+	var seq
+	var mtx
+
+	mtx = cond._mtx
+	seq = cond._seq
+
+	mtxunlock(mtx)
+	sys.futex(&cond._seq, sys.Futexwait | sys.Futexpriv, seq, Zptr, Zptr, 0)
+
+	/*
+	We need to atomically set the mutex to Contended. This passes the
+	responsibility for waking any remaining waiters on to whoever
+	unlocks the mutex.
+	*/
+	while xchg(&mtx._state, Contended) != Unlocked
+		sys.futex(&mtx._state, sys.Futexwait | sys.Futexpriv, \
+			Contended, Zptr, Zptr, 0)
+	;;
+}
+
+const condsignal = {cond : cond#
+	xadd(&cond._seq, 1)
+	sys.futex(&cond._seq, sys.Futexwake | sys.Futexpriv, 1, Zptr, Zptr, 0)
+}
+
+const condbroadcast = {cond : cond#
+	xadd(&cond._seq, 1)
+	/*
+	For Futexcmprequeue, the kernel reuses the timeout argument as val2,
+	the maximum number of waiters to requeue onto the second futex, so
+	we pass the requeue cap there rather than a timespec.
+	*/
+	sys.futex(&cond._seq, sys.Futexcmprequeue | sys.Futexpriv, \
+		1, 0x7fffffff castto(sys.timespec#), \
+		&cond._mtx._state, cond._seq)
+}
+
--- /dev/null
+++ b/lib/thread/exit+freebsd-x64.s
@@ -1,0 +1,23 @@
+/*
+const thread.exit	: (stacksz : std.size -> void)
+NOTE: must be called from the bottom of the stack, since
+we assume that %rbp is in the top 4k of the stack.
+*/
+.globl thread$exit
+thread$exit:
+	/* find top of stack */
+	movq	%rbp,%rdi	/* addr */
+	andq	$~0xfff,%rdi	/* align it */
+	addq	$0x1000,%rdi
+
+	/* munmap(base, size) */
+	movq	$73,%rax	/* munmap */
+	movq	-8(%rdi),%rsi	/* size */
+	subq	%rsi,%rdi	/* move to base ptr */
+	syscall
+
+	/* thr_exit(null) */
+	movq	$431,%rax	/* exit */
+	xorq	%rdi,%rdi	/* 0 */
+	syscall
+	
--- /dev/null
+++ b/lib/thread/exit+linux-x64.s
@@ -1,0 +1,23 @@
+/*
+const thread.exit	: (stacksz : std.size -> void)
+NOTE: must be called from the bottom of the stack, since
+we assume that %rbp is in the top 4k of the stack.
+*/
+.globl thread$exit
+thread$exit:
+	/* find top of stack */
+	movq	%rbp,%rdi	/* addr */
+	andq	$~0xfff,%rdi	/* align it */
+	addq	$0x1000,%rdi
+
+	/* munmap(base, size) */
+	movq	$11,%rax	/* munmap */
+	movq	-8(%rdi),%rsi	/* size */
+	subq	%rsi,%rdi	/* move to base ptr */
+	syscall
+
+	/* thread_exit(0) */
+	movq	$60,%rax	/* exit */
+	xorq	%rdi,%rdi	/* 0 */
+	syscall
+	
--- /dev/null
+++ b/lib/thread/future.myr
@@ -1,0 +1,63 @@
+use std
+
+use "mutex.use"
+
+pkg thread =
+	type future(@a) = struct
+		mtx	: mutex
+		set	: bool
+		val	: @a
+	;;
+
+	generic mkfut	: (-> future(@a))
+	generic futset	: (fut : future(@a)#, val : @a -> bool)
+	generic futget	: (fut : future(@a)# -> @a)
+	generic futtryget	: (fut : future(@a)# -> std.option(@a))
+	generic futclear	: (fut : future(@a)# -> void)
+;;
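+
+/*
+The mutex here is used as a gate rather than a lock: mkfut() returns the
+future with the gate already held, futset() publishes the value and
+releases it, and futget() blocks by acquiring it, so readers sleep until
+a value has been set.
+*/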
+
+const Unset = 0
+const Waiting = 1
+const Set = 2
+
+generic mkfut = {
+	var fut
+
+	fut = [.mtx = mkmtx() ]
+	mtxlock(&fut.mtx)
+	-> fut
+}
+
+generic futset = {fut, val
+	if fut.set
+		-> false
+	;;
+	/* the compiler does not reorder these stores */
+	fut.val = val
+	fut.set = true
+	mtxunlock(&fut.mtx)
+	-> true
+}
+
+generic futtryget = {fut
+	var val
+
+	if !fut.set
+		-> `std.None
+	;;
+	mtxlock(&fut.mtx)
+	val = fut.val
+	mtxunlock(&fut.mtx)
+	-> `std.Some val
+}
+
+generic futget = {fut
+	var val
+
+	mtxlock(&fut.mtx)
+	val = fut.val
+	mtxunlock(&fut.mtx)
+	-> val
+}
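+
+/*
+futclear is declared in the pkg block above but not defined in this
+patch; a minimal sketch of the assumed semantics (make the future
+reusable) would be to re-take the gate and drop the value:
+*/
+generic futclear = {fut
+	mtxlock(&fut.mtx)
+	fut.set = false
+}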
+
+
--- /dev/null
+++ b/lib/thread/mutex+freebsd.myr
@@ -1,0 +1,80 @@
+use std
+use sys
+
+use "atomic.use"
+use "common.use"
+
+pkg thread =
+	type mutex = struct
+		_state	: uint32
+	;;	
+
+	const mkmtx	: (-> mutex)
+	const mtxlock	: (mtx : mutex# -> void)
+	const mtxtrylock	: (mtx : mutex# -> bool)
+	const mtxunlock	: (mtx : mutex# -> void)
+
+	pkglocal const Unlocked = 0
+	pkglocal const Locked = 1
+	pkglocal const Contended = 2
+;;
+
+var nspin = 10	/* FIXME: pick a sane number, based on CPU count */
+
+const mkmtx = {
+	-> [._state = Unlocked]
+}
+
+const mtxlock = {mtx
+	var c
+
+	/* 
+	Uncontended case: we get an unlocked mutex, and we lock it.
+	*/
+	c = Locked
+	for var i = 0; i < nspin; i++
+		c = xcas(&mtx._state, Unlocked, Locked)
+		if c == Unlocked
+			-> void
+		;;
+	;;
+
+	/*
+	Contended case: we set the lock state to Contended. This indicates
+	that the lock is held and that there may be threads waiting on it,
+	so the unlocker will need to wake them.
+	*/
+	if c == Locked
+		c = xchg(&mtx._state, Contended)
+	;;
+
+	while c != Unlocked
+		sys.umtx_op( \
+			&mtx._state castto(void#), \
+			sys.Umtxwaituintpriv, \
+			Contended castto(uint64), \
+			Zptr, Zptr)
+		c = xchg(&mtx._state, Contended)
+	;;
+}
+
+const mtxtrylock = {mtx
+	-> xcas(&mtx._state, Unlocked, Locked) == Unlocked
+}
+
+const mtxunlock = {mtx
+	/*
+	Uncontended case: if the state was not Contended, and the xchg()
+	still sees plain Locked, then nobody was waiting on us and it is
+	safe to simply return without waking anyone.
+	*/
+	if mtx._state == Contended
+		mtx._state = Unlocked
+	elif xchg(&mtx._state, Unlocked) == Locked
+		->
+	;;
+
+	/* wake one waiter */
+	sys.umtx_op(&mtx._state castto(void#), sys.Umtxwakepriv, 1, Zptr, Zptr)
+}
+
--- /dev/null
+++ b/lib/thread/mutex+linux.myr
@@ -1,0 +1,76 @@
+use std
+use sys
+
+use "atomic.use"
+use "common.use"
+
+pkg thread =
+	type mutex = struct
+		_state	: int32
+	;;	
+
+	const mkmtx	: (-> mutex)
+	const mtxlock	: (mtx : mutex# -> void)
+	const mtxtrylock	: (mtx : mutex# -> bool)
+	const mtxunlock	: (mtx : mutex# -> void)
+
+	pkglocal const Unlocked = 0
+	pkglocal const Locked = 1
+	pkglocal const Contended = 2
+;;
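+
+/*
+This is the usual three-state futex mutex: Unlocked, Locked (held with
+no waiters), and Contended (held, possibly with waiters). Lockers spin
+briefly, then publish Contended and sleep in futex(); an unlocker only
+issues a wake if it cannot prove the state was plain Locked.
+*/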
+
+var nspin = 10	/* FIXME: pick a sane number, based on CPU count */
+
+const mkmtx = {
+	-> [._state = Unlocked]
+}
+
+const mtxlock = {mtx
+	var c
+
+	/* 
+	Uncontended case: we get an unlocked mutex, and we lock it.
+	*/
+	c = Locked
+	for var i = 0; i < nspin; i++
+		c = xcas(&mtx._state, Unlocked, Locked)
+		if c == Unlocked
+			-> void
+		;;
+	;;
+
+	/*
+	Contended case: we set the lock state to Contended. This indicates
+	that the lock is held and that there may be threads waiting on it,
+	so the unlocker will need to wake them.
+	*/
+	if c == Locked
+		c = xchg(&mtx._state, Contended)
+	;;
+
+	while c != Unlocked
+		sys.futex(&mtx._state, sys.Futexwait | sys.Futexpriv, Contended, Zptr, Zptr, 0)
+		c = xchg(&mtx._state, Contended)
+	;;
+}
+
+const mtxtrylock = {mtx
+	-> xcas(&mtx._state, Unlocked, Locked) == Unlocked
+}
+
+const mtxunlock = {mtx
+	/*
+	Uncontended case: if the state was not Contended, and the xchg()
+	still sees plain Locked, then nobody was waiting on us and it is
+	safe to simply return without waking anyone.
+	*/
+	if mtx._state == Contended
+		mtx._state = Unlocked
+	elif xchg(&mtx._state, Unlocked) == Locked
+		-> void
+	;;
+
+	/* wake one thread */
+	sys.futex(&mtx._state, sys.Futexwake | sys.Futexpriv, 1, Zptr, Zptr, 0)
+}
+
--- /dev/null
+++ b/lib/thread/mutex+osx.myr
@@ -1,0 +1,65 @@
+use std
+use sys
+
+
+use "atomic.use"
+use "common.use"
+
+pkg thread =
+	type mutex = struct
+		_state	: uint32
+	;;	
+
+	const mkmtx	: (-> mutex)
+	const mtxlock	: (mtx : mutex# -> void)
+	const mtxtrylock	: (mtx : mutex# -> bool)
+	const mtxunlock	: (mtx : mutex# -> void)
+;;
+
+const mkmtx = {
+	-> [._state = 0]
+}
+
+/* a shitty spinlock */
+const mtxlock = {mtx
+	/* first fast */
+	for var i = 0; i < 1000; i++
+		if xcas(&mtx._state, 0, 1) == 0
+			-> void
+		;;
+		std.nanosleep(0)
+	;;
+	
+	/* then slower */
+	for var i = 0; i < 1000; i++
+		if xcas(&mtx._state, 0, 1) == 0
+			-> void
+		;;
+		std.nanosleep(10_000) /* 10 us */
+	;;
+
+	/* even slower */
+	for var i = 0; i < 1000; i++
+		if xcas(&mtx._state, 0, 1) == 0
+			-> void
+		;;
+		std.nanosleep(100_000) /* 100 us */
+	;;
+
+	/* I'm rip van winkle! */
+	while true
+		if xcas(&mtx._state, 0, 1) == 0
+			-> void
+		;;
+		std.nanosleep(1_000_000) /* 1 ms */
+	;;
+}
+
+const mtxtrylock = {mtx
+	-> xcas(&mtx._state, 0, 1) == 0
+}
+
+	
+const mtxunlock = {mtx
+	xset(&mtx._state, 0)
+}
--- /dev/null
+++ b/lib/thread/mutex+plan9.myr
@@ -1,0 +1,47 @@
+use std
+use sys
+
+
+use "atomic.use"
+use "common.use"
+
+pkg thread =
+	type mutex = struct
+		_state	: uint32
+		_sem	: uint32
+	;;	
+
+	const mkmtx	: (-> mutex)
+	const mtxlock	: (mtx : mutex# -> void)
+	const mtxtrylock	: (mtx : mutex# -> bool)
+	const mtxunlock	: (mtx : mutex# -> void)
+;;
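+
+/*
+Plan 9 variant: _state counts the holder plus any waiters and _sem is a
+kernel semaphore. mtxlock() bumps the count and only sleeps in
+semacquire() if someone already held the lock; mtxunlock() drops the
+count and posts the semaphore only if waiters remain.
+*/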
+
+const mkmtx = {
+	-> [._state = 0, ._sem=0]
+}
+
+const mtxlock = {mtx
+	/* if the old value was 0, we aren't contended */
+	if xadd(&mtx._state, 1) == 0
+		-> void
+	;;
+	
+	while sys.semacquire(&mtx._sem, 1) < 0
+		/* interrupted; retry */
+	;;
+}
+
+const mtxtrylock = {mtx
+	-> xcas(&mtx._state, 0, 1) == 0
+}
+
+	
+const mtxunlock = {mtx
+	/* if the old count was 1, we held the lock alone; there are no waiters to wake */
+	if xadd(&mtx._state, -1) == 1
+		-> void
+	;;
+
+	sys.semrelease(&mtx._sem, 1)
+}
--- /dev/null
+++ b/lib/thread/spawn+freebsd.myr
@@ -1,0 +1,74 @@
+use sys
+use std
+
+pkg thread =
+	type tid = uint64
+
+	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
+;;
+
+
+const Stacksz = 8*std.MiB
+extern const exit : (-> void)
+
+const spawn = {fn
+	-> spawnstk(fn, Stacksz)
+}
+
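+/*
+Stack layout set up below, at the top of the new thread's mapping:
+	[mapping size]	read back by thread$exit to munmap the stack
+	[fn]		the function to run; startthread is pointed at this slot
+*/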
+const spawnstk = {fn, sz
+	var stk : byte#, tid, ctid, ret
+	var szp, fp, tos
+
+	stk = getstk(sz)
+	if stk == sys.Mapbad
+		-> `std.Fail "couldn't get stack"
+	;;
+	tid = -1
+	/* find top of stack */
+	tos = (stk castto(std.intptr)) + (sz castto(std.intptr))
+
+	/* store the stack size */
+	tos -= sizeof(sys.size)
+	sz -= sizeof(sys.size)
+	szp = tos castto(sys.size#)
+	szp# = Stacksz
+
+	/* store the function we call */
+	tos -= sizeof((->void))
+	sz -= sizeof((->void))
+	fp = tos castto((->void)#)
+	fp# = fn
+
+	ret = sys.thr_new(&[
+		.startfn = startthread castto(void#),
+		.arg = tos castto(void#),
+		.stkbase = stk castto(byte#),
+		.stksz = sz,
+		.tid = &ctid,
+		.ptid = &tid,
+		.flags = 2,
+		.rtp = 0 castto(sys.rtprio#)
+	], sizeof(sys.thrparam))
+
+	if ret < 0
+		-> `std.Fail "couldn't spawn thread"
+	;;
+	-> `std.Ok tid castto(tid)
+}
+
+const getstk = {sz
+	var p, m
+
+	p = sys.mmap(0 castto(byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
+	if p == sys.Mapbad
+		-> p
+	;;
+	m = p castto(std.intptr)
+	-> m castto(byte#)
+}
+
+const startthread = {fn : (-> void)#
+	fn#()
+	exit()
+}
+
--- /dev/null
+++ b/lib/thread/spawn+linux.myr
@@ -1,0 +1,68 @@
+use sys
+use std
+
+pkg thread =
+	type tid = sys.pid
+
+	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
+;;
+
+extern const exit : (-> void)
+
+/* Holy shit flag mania. */
+const Thrflag = sys.Clonevm | sys.Clonefs | sys.Clonefiles  | \
+	sys.Clonesighand | sys.Clonethread |sys.Clonesysvsem | \
+	sys.Clonesettls | sys.Cloneparentsettid | sys.Clonechildcleartid
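+/*
+These flags share the address space, fs state, fd table, signal
+handlers, thread group and sysv semaphore undo list with the parent,
+set up TLS for the child, report the child tid to the parent, and clear
+(and futex-wake) the child tid word when the thread exits.
+*/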
+
+const Stacksz = 8*std.MiB
+
+const spawn = {fn
+	-> spawnstk(fn, Stacksz)
+}
+
+const spawnstk = {fn, sz
+	var stk : byte#, tid, ctid, ret
+	var szp, fp, tos
+
+	stk = getstk(sz)
+	if stk == sys.Mapbad
+		-> `std.Fail "couldn't get stack"
+	;;
+	tos = stk castto(std.intptr)
+	tos -= sizeof(int64)
+	szp = tos castto(sys.size#)
+	szp# = Stacksz
+	tos -= sizeof((->void))
+	fp = tos castto((->void)#)
+	fp# = fn
+
+	ret = sys.fnclone(Thrflag, \
+		tos castto(byte#),\
+		&tid, 0 castto(byte#), \
+		&ctid, 0 castto(byte#), \
+		startthread castto(void#)) castto(tid)
+	if ret < 0
+		std.put("errno={}\n", -ret)
+		-> `std.Fail "couldn't spawn thread"
+	;;
+	-> `std.Ok ret
+}
+
+const getstk = {sz
+	var p, m
+
+	p = sys.mmap(0 castto(byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
+	if p == sys.Mapbad
+		-> p
+	;;
+	/* stack starts at the top of memory and grows down. */
+	m = p castto(std.intptr)
+	m += sz castto(std.intptr)
+	-> m castto(byte#)
+}
+
+const startthread = {fn : (-> void)
+	fn()
+	exit()
+}
+
--- /dev/null
+++ b/lib/thread/spawn+osx.myr
@@ -1,0 +1,60 @@
+use sys
+use std
+
+pkg thread =
+	type tid = uint64
+
+	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
+;;
+
+
+const Stacksz = 8*std.MiB
+extern const exit : (-> void)
+extern const start : (-> void)
+
+const __init__ = {
+	var ret
+
+	ret = sys.bsdthread_register(\
+		start castto(void#), \	/* start */
+		0 castto(void#), \	/* wqthread */
+		0 castto(uint32), \	/* sz */
+		0 castto(uint32), \	/* dummy */
+		0 castto(void#), \	/* targconc */
+		0 castto(uint32))	/* queueoff */
+	if ret < 0
+		std.fatal("unable to init threads: {}", ret)
+	;;
+}
+
+
+
+const spawn = {fn
+	-> spawnstk(fn, Stacksz)
+}
+
+const spawnstk = {fn, sz
+	var tid : tid, ret
+
+	ret = sys.bsdthread_create( \
+		fn castto(void#), \
+		envptr(&fn), \
+		sz castto(void#), \
+		0 castto(void#), \
+		0)
+
+	if ret == (-1 castto(void#))
+		-> `std.Fail "couldn't spawn thread"
+	;;
+	-> `std.Ok ret castto(tid)
+}
+
+const envptr = {fn
+	var repr : std.intptr[2]
+
+	repr = (fn castto(std.intptr[2]#))#
+	-> repr[0] castto(void#)
+}
+
--- /dev/null
+++ b/lib/thread/spawn+plan9.myr
@@ -1,0 +1,18 @@
+use std
+use sys
+
+pkg thread =
+	type tid = uint64
+
+	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
+;;
+
+const spawn = {fn
+	match sys.rfork(sys.Rfproc | sys.Rfmem | sys.Rfnowait)
+	| 0:
+		fn()
+		std.exit(0)
+	| -1:	-> `std.Fail "unable to spawn thread"
+	| thr:	-> `std.Ok thr castto(tid)
+	;;
+}
\ No newline at end of file
--- /dev/null
+++ b/lib/thread/start+osx-x64.s
@@ -1,0 +1,22 @@
+// The entry point for thread start, registered with bsdthread_register
+//      %rdi: pthread (0, for us)
+//      %rsi: mach thread port (ignored)
+//      %rdx: func
+//      %rcx: env
+//      %r8: stack
+//      %r9: flags (= 0)
+//      %rsp: stack - C_64_REDZONE_LEN (= stack - 128)
+.globl _thread$start
+_thread$start:
+	/* call the function */
+#	movq	%r8, %rsp	/* set up stack */
+	movq	%rcx, %rax	/* set up env */
+	callq	*%rdx		/* call function */
+
+	/* exit the thread */
+	movq	$0x2000169, %rax	/* Sysbsdthread_terminate */
+	movq	%rsp, %rdi	/* stack */
+	movq	$0, %rsi	/* len */
+	movq	$0, %rdx	/* sem */
+	syscall
+	
--- /dev/null
+++ b/lib/thread/test/atomic.myr
@@ -1,0 +1,29 @@
+use std
+use thread
+
+use "test/util.use"
+
+const Nherd = 20
+
+var val : uint64 = 0
+var done : uint32 = 0
+
+const main = {
+	done = 0
+	val = 0
+	mkherd(Nherd, incvar)
+	while thread.xget(&done) != Nherd
+		/* nothing */
+	;;
+	std.assert(val == 2_000_000, "atomics are broken\n")
+}
+
+const incvar = {
+	var i
+
+	for i = 0; i < 100_000; i++
+		thread.xadd(&val, 1)
+	;;
+	thread.xadd(&done, 1)
+}
+
--- /dev/null
+++ b/lib/thread/test/condvar.myr
@@ -1,0 +1,88 @@
+use std
+use thread
+
+use "test/util.use"
+
+const Nwakes = 1000
+
+var cv
+var mtx
+var val
+
+var done : int32
+var nwoken : int32
+var nready : int32
+var locked : int32
+
+const main = {
+	done = 0
+	val = 123
+
+	mtx = thread.mkmtx()
+	cv = thread.mkcond(&mtx)
+	thread.spawn(cvwait)
+	thread.spawn(cvwake)
+	while done == 0
+		/* nothing */
+	;;
+	std.assert(nwoken == Nwakes, "wrong number of wakes")
+	std.assert(val == 123, "wrong val after all are done")
+
+	nwoken = 0
+	nready = 0
+	mkherd(100, cvwaitonce)
+
+	/* wait until the herd is ready */
+	while nready != 100
+		/* nothing */
+	;;
+	while locked == 0
+		/* nothing */
+	;;
+	thread.condbroadcast(&cv)
+	while nwoken != 100
+		/* nothing */
+	;;
+	std.assert(nwoken == 100, "wrong thread count woken")
+
+}
+
+const cvwait = {
+	for var i = 0; i < Nwakes; i++
+		thread.mtxlock(&mtx)
+		thread.condwait(&cv)
+		std.assert(val == 456, "wrong val after signal\n")
+		val = 123
+		thread.mtxunlock(&mtx)
+
+		thread.xadd(&nwoken, 1)
+	;;
+	val = 123
+	thread.xadd(&done, 1)
+
+}
+
+const cvwake = {
+	while true
+		thread.mtxlock(&mtx)
+		val = 456
+		thread.mtxunlock(&mtx)
+
+		thread.condsignal(&cv)
+		if nwoken >= Nwakes
+			break
+		;;
+	;;
+}
+
+const cvwaitonce = {
+	thread.xadd(&nready, 1)
+
+	thread.mtxlock(&mtx)
+	thread.xadd(&locked, 1)
+	thread.condwait(&cv)
+	thread.mtxunlock(&mtx)
+
+	thread.xadd(&nwoken, 1)
+}
+
--- /dev/null
+++ b/lib/thread/test/future.myr
@@ -1,0 +1,50 @@
+use std
+use sys
+use thread
+
+use "test/util.use"
+
+var fut
+var nready : int32
+var ndone : int32
+
+const main = {
+	nready = 0
+	ndone = 0
+	fut = thread.mkfut()
+	/* set after we have some waiters */
+	mkherd(100, getfuture)
+	while nready != 100
+		/* spin */
+	;;
+	std.put("done waiting for ready\n")
+	std.assert(ndone == 0, "thread proceeded too soon\n")
+	thread.futset(&fut, 666)
+	std.assert(thread.futset(&fut, 1) == false, "double set future\n")
+	while ndone != 100
+		/* spin */
+	;;
+	std.put("double set future ok\n")
+	/* start up a few more to make sure we can still read */
+	mkherd(50, getfuture)
+	while ndone != 150
+		/* spin */
+	;;
+
+	
+	/* set ahead of time */
+	ndone = 0
+	fut = thread.mkfut()
+	thread.futset(&fut, 666)
+	std.assert(thread.futset(&fut, 666) == false, "double set future\n")
+	mkherd(100, getfuture)
+	while ndone != 100
+		/* spin */
+	;;
+}
+
+const getfuture = {
+	thread.xadd(&nready, 1)
+	std.assert(thread.futget(&fut) == 666, "wrong value gotten from future")
+	thread.xadd(&ndone, 1)
+}
--- /dev/null
+++ b/lib/thread/test/mutex.myr
@@ -1,0 +1,33 @@
+use std
+use thread
+
+use "test/util.use"
+
+const Nherd = 20
+
+var val : uint64 = 0
+var done : uint32 = 0
+var mtx : thread.mutex
+
+const main = {
+	done = 0
+	val = 0
+
+	mtx = thread.mkmtx()
+	mkherd(Nherd, incvar)
+	while thread.xget(&done) != Nherd
+		/* nothing */
+	;;
+	if val != 10_000 * 20
+		std.fatal("mutexes are broken, got {}\n", val)
+	;;
+}
+
+const incvar = {
+	for var i = 0; i < 10_000; i++
+		thread.mtxlock(&mtx)
+		val++
+		thread.mtxunlock(&mtx)
+	;;
+	thread.xadd(&done, 1)
+}
--- /dev/null
+++ b/lib/thread/test/spawn.myr
@@ -1,0 +1,25 @@
+use std
+use thread
+
+var done : int32
+var capture
+
+const main = {
+	var ptr
+
+	capture = 666
+	ptr = &capture
+	thread.spawn({
+		std.assert(capture==666, "wrong captured value\n")
+		std.assert(ptr#==666, "wrong captured ptr value\n")
+		ptr# = 333
+		thread.xadd(&done, 1)
+	})
+
+	while done == 0
+		/* nothing */
+	;;
+
+	std.assert(capture == 333, "capture wasn't written to correctly\n")
+}
+
--- /dev/null
+++ b/lib/thread/test/util.myr
@@ -1,0 +1,12 @@
+use std
+use thread
+
+pkg =
+	const mkherd : (n : uint32, fn : (-> void) ->void)
+;;
+
+const mkherd = {n, fn
+	for var i = 0; i < n; i++
+		std.try(thread.spawn(fn))
+	;;
+}