shithub: mc

Download patch

ref: 2d9fe7100209acb431e392d58e4cf4eccfde00b7
parent: b17873ed4e6a14786758fd68290ef572b0b88f10
author: iriri <iri@konnichiwastevenspielbergde.su>
date: Sat Oct 13 09:58:52 EDT 2018

Add thread-local storage for POSIX-y platforms.

This patch implements thread-local storage by using a segment register
to store pointers to regions of memory unique to each thread. In spawned
threads, this region starts above the top of the stack, while in the
main thread, this region is initially statically allocated so it can be
used in `__init__` functions and dynamically reallocated if it needs to
grow beyond 8 slots. The API consists of 3 functions: `tlsalloc()`,
which must be called from the main thread, allocates a slot and returns
a key to be used with `tlsset(k, v)` and `tlsget(k)`. Each thread
inherits its tls slots from the thread that spawned it and any slot
added after a given thread is spawned is not available in that thread.

Adding tls regions gives threads an easy way to get their own tids,
allowing us to add some basic correctness checks to the mutex code. A
pointer to the base of the stack and the size of the mapping are also
stored in the tls region, making it easy to support user-specified stack
sizes in the future.

Changes from previous version:
- Fixed size of main tls static allocation.
- More comments, less magic.
- Fixed typo in start+osx-x64.s and exit+{freebsd,linux,openbsd}-x64.s
- `gettlskey` renamed to `tlsalloc`.
- `key` is now `tlskey(@a#)`, improving safety.
- Test is slightly less bad.

--- a/bld.tags
+++ b/bld.tags
@@ -1,3 +1,7 @@
+fsbase: freebsd
+fsbase: linux
+fsbase: netbsd
+fsbase: openbsd
 futex: freebsd
 futex: linux
 futex: openbsd:6.2
--- a/lib/sys/sys+freebsd-x64.myr
+++ b/lib/sys/sys+freebsd-x64.myr
@@ -38,6 +38,7 @@
 	type cpulevel	= int
 	type cpusetid	= int
 	type idtype	= int
+	type sysarchop	= int64
 	
 	type acltype	= int
 	type acltag	= uint32
@@ -802,6 +803,13 @@
 	const Siglwp	: signo = Sigthr
 	const Siglibrt	: signo = 33	/* reserved by real-time library. */
 	
+	/* sysarch ops */
+	const Archamd64getfs   : sysarchop = 128
+	const Archamd64setfs   : sysarchop = 129
+	const Archamd64getgs   : sysarchop = 130
+	const Archamd64setgs   : sysarchop = 131
+	const Archamd64getxfpu : sysarchop = 132
+	
 	extern const syscall : (sc:scno, args:... -> int64)
 	extern var __cenvp : byte##
 	
@@ -1285,7 +1293,7 @@
 	const quotactl			:  (path : byte#, cmd : int, uid : int, arg : void# -> int)
 	const lgetfh			:  (fname : byte#, fhp : fhandle# -> int)
 	const getfh			:  (fname : byte#, fhp : fhandle# -> int)
-	const sysarch			:  (op : int, parms : byte# -> int)
+	const sysarch			:  (op : sysarchop, parms : void## -> int)
 	const rtprio			:  (function : int, pid : pid, rtp : rtprio# -> int)
 	const setfib			:  (fibnum : int -> int)
 	const ntp_adjtime		:  (tp : timex# -> int)
@@ -1969,7 +1977,7 @@
 	 -> (syscall(Sysgetfh, a(fname), a(fhp)) : int)
 }
 const sysarch	= {op, parms
-	 -> (syscall(Syssysarch, a(op), a(parms)) : int)
+	 -> (syscall(Syssysarch, op, a(parms)) : int)
 }
 const rtprio	= {function, pid, rtp
 	 -> (syscall(Sysrtprio, a(function), a(pid), a(rtp)) : int)
--- a/lib/sys/sys+linux-x64.myr
+++ b/lib/sys/sys+linux-x64.myr
@@ -45,6 +45,7 @@
 	type mfdflags	= uint32
 	type aiocontext	= uint64
 	type msg	= void#
+	type arch_prctlop	= uint64
 	
 	
 	type clock = union
@@ -590,6 +591,12 @@
 	
 	/* return value for a failed mapping */
 	const Mapbad	: byte# = (-1 : byte#)
+
+	/* arch_prctl ops */
+	const Archsetgs : arch_prctlop = 0x1001
+	const Archsetfs : arch_prctlop = 0x1002
+	const Archgetfs : arch_prctlop = 0x1003
+	const Archgetgs : arch_prctlop = 0x1004
 	
 	/* signal flags */
 	const Sanocldstop	: sigflags = 0x00000001
@@ -1097,6 +1104,7 @@
 	const Sysmq_notify		: scno = 244
 	const Sysmq_getsetattr		: scno = 245
 	const Sysprctl			: scno = 157
+	const Sysarch_prctl		: scno = 158
 	const Sysswapon			: scno = 167
 	const Sysswapoff		: scno = 168
 	const Sys_sysctl		: scno = 156
@@ -1308,7 +1316,7 @@
 	const settimeofday		:  (tv : timeval#, tz : timezone# -> int64)
 	const adjtimex			:  (txc_p : timex# -> int64)
 	const times			:  (tbuf : tms# -> int64)
-	const gettid			:  ( -> int64)
+	const gettid			:  ( -> pid)
 	const alarm			:  (seconds : uint -> int64)
 	const getppid			:  ( -> int64)
 	const geteuid			:  ( -> int64)
@@ -1484,6 +1492,7 @@
 	const mq_notify			:  (mqdes : int, notification : sigevent# -> int64)
 	const mq_getsetattr		:  (mqdes : int, mqstat : mq_attr#, omqstat : mq_attr# -> int64)
 	const prctl			:  (option : int, arg2 : uint64, arg3 : uint64, arg4 : uint64, arg5 : uint64 -> int64)
+	const arch_prctl		:  (op : arch_prctlop, addr : void# -> int64)
 	const swapon			:  (specialfile : byte#, swap_flags : int -> int64)
 	const swapoff			:  (specialfile : byte# -> int64)
 	const _sysctl			:  (args : sysctl_args# -> int64)
@@ -1782,7 +1791,7 @@
 	 -> (syscall(Systimes, a(tbuf)) : int64)
 }
 const gettid	= {
-	 -> (syscall(Sysgettid) : int64)
+	 -> (syscall(Sysgettid) : pid)
 }
 const alarm	= {seconds
 	 -> (syscall(Sysalarm, a(seconds)) : int64)
@@ -2308,6 +2317,9 @@
 }
 const prctl	= {option, arg2, arg3, arg4, arg5
 	 -> (syscall(Sysprctl, a(option), a(arg2), a(arg3), a(arg4), a(arg5)) : int64)
+}
+const arch_prctl	= {op, addr
+	 -> syscall(Sysarch_prctl, op, addr)
 }
 const swapon	= {specialfile, swap_flags
 	 -> (syscall(Sysswapon, a(specialfile), a(swap_flags)) : int64)
--- a/lib/sys/sys+netbsd-x64.myr
+++ b/lib/sys/sys+netbsd-x64.myr
@@ -18,6 +18,7 @@
 	type umtxop	= int32
 	type signo	= int32
 	type sigflags	= int32
+	type sysarchop	= int64
 
 	type clock = union
 		`Clockrealtime
@@ -344,6 +345,21 @@
 	const Umtxmtxwake2	: umtxop = 22
 	const Umtxmax	: umtxop = 23
 
+	/* sysarch ops */
+	const X8664getldt    : sysarchop = 0
+	const X8664setldt    : sysarchop = 1
+	const X8664iopl      : sysarchop = 2
+	const X8664getioperm : sysarchop = 3
+	const X8664setioperm : sysarchop = 4
+	const X8664oldvm86   : sysarchop = 5
+	const X8664getmtrr   : sysarchop = 11
+	const X8664setmtrr   : sysarchop = 12
+	const X8664vm86      : sysarchop = 13
+	const X8664getgsbase : sysarchop = 14
+	const X8664getfsbase : sysarchop = 15
+	const X8664setgsbase : sysarchop = 16
+	const X8664setfsbase : sysarchop = 17
+
 	/* signal actions */
 	const Saonstack		: sigflags = 0x0001	/* take signal on signal stack */
 	const Sarestart		: sigflags = 0x0002	/* restart system call on signal return */
@@ -908,6 +924,9 @@
 		new : void#, newsz : size# \
 		-> int)
 
+	/* misc */
+	const sysarch	: (op : sysarchop, args : void## -> int)
+
 	extern const cstring	: (str : byte[:] -> byte#)
 	/* filled by start code */
 	extern var __cenvp : byte##
@@ -1100,6 +1119,10 @@
 	/* all args already passed through a() or ar  ptrs */
 	-> (syscall(Sys__sysctl, \
 		(mib : int#), a(mib.len), old, oldsz, new, newsz) : int)
+}
+
+const sysarch = {op, args
+	-> (syscall(Syssysarch, op, args) : int)
 }
 
 const clockid = {clk
--- a/lib/sys/sys+openbsd-x64.myr
+++ b/lib/sys/sys+openbsd-x64.myr
@@ -215,7 +215,7 @@
 	const Mfixed	: mopt = 0x10
 	const Mfile	: mopt = 0x0
 	const Manon	: mopt = 0x1000
-	const Mstack	: mopt = 0x4000
+	const Mstack	: mopt = 0x0
 	const Mnoreplace	: mopt = 0x0800
 
 	/* file types */
--- a/lib/sys/sys+openbsd:6.1-x64.myr
+++ b/lib/sys/sys+openbsd:6.1-x64.myr
@@ -1031,7 +1031,7 @@
 	const symlinkat			:  (path : byte#, fd : int, link : byte# -> int)
 	const unlinkat			:  (fd : int, path : byte#, flag : int -> int)
 	const __set_tcb			:  (tcb : void# -> void)
-	const __get_tcb			:  ( -> void)
+	const __get_tcb			:  ( -> void#)
 ;;
 
 	/* start manual overrides { */
@@ -1750,5 +1750,5 @@
 	 -> (syscall(Sys__set_tcb, a(tcb)) : void)
 }
 const __get_tcb	= {
-	 -> (syscall(Sys__get_tcb) : void)
+	 -> (syscall(Sys__get_tcb) : void#)
 }
--- a/lib/sys/sys+openbsd:6.2-x64.myr
+++ b/lib/sys/sys+openbsd:6.2-x64.myr
@@ -348,7 +348,7 @@
 	const Mfixed	: mopt = 0x10
 	const Mfile	: mopt = 0x0
 	const Manon	: mopt = 0x1000
-	const Mstack	: mopt = 0x4000
+	const Mstack	: mopt = 0x0
 	const Mnoreplace	: mopt = 0x0800
 	
 	/* file types */
@@ -1037,7 +1037,7 @@
 	const symlinkat			:  (path : byte#, fd : int, link : byte# -> int)
 	const unlinkat			:  (fd : int, path : byte#, flag : int -> int)
 	const __set_tcb			:  (tcb : void# -> void)
-	const __get_tcb			:  ( -> void)
+	const __get_tcb			:  ( -> void#)
 ;;
 
 	/* start manual overrides { */
@@ -1759,5 +1759,5 @@
 	 -> (syscall(Sys__set_tcb, a(tcb)) : void)
 }
 const __get_tcb	= {
-	 -> (syscall(Sys__get_tcb) : void)
+	 -> (syscall(Sys__get_tcb) : void#)
 }
--- a/lib/sys/sys+openbsd:6.3-x64.myr
+++ b/lib/sys/sys+openbsd:6.3-x64.myr
@@ -1036,7 +1036,7 @@
 	const symlinkat			:  (path : byte#, fd : int, link : byte# -> int)
 	const unlinkat			:  (fd : int, path : byte#, flag : int -> int)
 	const __set_tcb			:  (tcb : void# -> void)
-	const __get_tcb			:  ( -> void)
+	const __get_tcb			:  ( -> void#)
 ;;
 
 	/* start manual overrides { */
@@ -1755,5 +1755,5 @@
 	 -> (syscall(Sys__set_tcb, a(tcb)) : void)
 }
 const __get_tcb	= {
-	 -> (syscall(Sys__get_tcb) : void)
+	 -> (syscall(Sys__get_tcb) : void#)
 }
--- a/lib/thread/bld.sub
+++ b/lib/thread/bld.sub
@@ -14,6 +14,11 @@
 	sem.myr
 	waitgrp.myr
 
+	# fsbase-based impls
+	tls+fsbase.myr
+	tls-impl+fsbase-x64.s
+	types+fsbase.myr
+
 	# futex-based impls
 	mutex+futex.myr
 	rwlock+futex.myr
@@ -23,6 +28,7 @@
 	# linux impl of basic thread primitives
 	condvar+linux.myr
 	exit+linux-x64.s
+	fsbase+linux.myr
 	futex+linux.myr
 	ncpu+linux.myr
 	spawn+linux.myr
@@ -30,6 +36,7 @@
 	# freebsd impl of thread primitives
 	condvar+freebsd.myr
 	exit+freebsd-x64.s
+	fsbase+freebsd.myr
 	futex+freebsd.myr
 	ncpu+freebsd.myr
 	spawn+freebsd.myr
@@ -37,6 +44,7 @@
 	# netbsd impl of thread primitives
 	#condvar+netbsd.myr
 	#mutex+netbsd.myr
+	fsbase+netbsd.myr
 	spawn+netbsd.myr
 	#ncpu+netbsd.myr
 	#exit+netbsd-x64.s
@@ -46,6 +54,9 @@
 	futex+osx.myr
 	spawn+osx.myr
 	start+osx-x64.s
+	tls+osx.myr
+	tls-impl+osx-x64.s
+	types+osx.myr
 
 	# 9front impl of thread primitives
 	#condvar+plan9.myr
@@ -58,6 +69,7 @@
 	# openbsd impl of thread primitives
 	condvar+openbsd:6.2.myr
 	exit+openbsd-x64.s
+	fsbase+openbsd.myr
 	futex+openbsd:6.2.myr
 	ncpu+openbsd.myr
 	spawn+openbsd.myr
--- a/lib/thread/common.myr
+++ b/lib/thread/common.myr
@@ -1,5 +1,3 @@
-use std
-
-pkg thread = 
+pkg thread =
 	pkglocal generic Zptr : @a#  = (0 : @a#)
 ;;
--- a/lib/thread/exit+freebsd-x64.s
+++ b/lib/thread/exit+freebsd-x64.s
@@ -1,19 +1,12 @@
 /*
 const thread.exit	: (stacksz : std.size -> void)
-NOTE: must be called from the bottom of the stack, since
-we assume that %rbp is in the top 4k of the stack.
 */
 .globl thread$exit
 thread$exit:
-	/* find top of stack */
-	movq	%rbp,%rdi	/* addr */
-	andq	$~0xfff,%rdi	/* align it */
-	addq	$0x1000,%rdi
-
 	/* munmap(base, size) */
 	movq	$73,%rax	/* munmap */
-	movq	-8(%rdi),%rsi	/* size */
-	subq	%rsi,%rdi	/* move to base ptr */
+	movq	%fs:0x08,%rdi	/* base */
+	movq	%fs:0x10,%rsi	/* stksz */
 	syscall
 
 	/* thr_exit(null) */
--- a/lib/thread/exit+linux-x64.s
+++ b/lib/thread/exit+linux-x64.s
@@ -1,19 +1,12 @@
 /*
-const thread.exit	: (stacksz : std.size -> void)
-NOTE: must be called from the bottom of the stack, since
-we assume that %rbp is in the top 4k of the stack.
+const thread.exit : (-> void)
 */
 .globl thread$exit
 thread$exit:
-	/* find top of stack */
-	movq	%rbp,%rdi	/* addr */
-	andq	$~0xfff,%rdi	/* align it */
-	addq	$0x1000,%rdi
-
 	/* munmap(base, size) */
 	movq	$11,%rax	/* munmap */
-	movq	-8(%rdi),%rsi	/* size */
-	subq	%rsi,%rdi	/* move to base ptr */
+	movq	%fs:0x08,%rdi	/* base */
+	movq	%fs:0x10,%rsi	/* stksz */
 	syscall
 
 	/* thread_exit(0) */
--- a/lib/thread/exit+openbsd-x64.s
+++ b/lib/thread/exit+openbsd-x64.s
@@ -1,15 +1,8 @@
 /*
 const thread.exit	: (stacksz : std.size -> void)
-NOTE: must be called from the bottom of the stack, since
-we assume that %rbp is in the top 4k of the stack.
 */
 .globl thread$exit
 thread$exit:
-	/* find top of stack */
-	movq	%rbp,%rdi	/* addr */
-	andq	$~0xfff,%rdi	/* align it */
-	addq	$0x1000,%rdi
-
 	/* 
 	  Because OpenBSD wants a valid stack whenever
 	  we enter the kernel, we need to toss a preallocated
@@ -19,8 +12,8 @@
 
 	/* munmap(base, size) */
 	movq	$73,%rax	/* munmap */
-	movq	-8(%rdi),%rsi	/* size */
-	subq	%rsi,%rdi	/* move to base ptr */
+	movq	%fs:0x08,%rdi	/* base */
+	movq	%fs:0x10,%rsi	/* stksz */
 	syscall
 
 	/* __threxit(0) */
--- /dev/null
+++ b/lib/thread/fsbase+freebsd.myr
@@ -1,0 +1,28 @@
+use std
+use sys
+
+use "types"
+
+pkg thread =
+	pkglocal const setfsbase : (h : tlshdr# -> void)
+	pkglocal const getfsbase : (-> tlshdr#)
+;;
+
+const setfsbase = {h
+	match sys.sysarch(sys.Archamd64setfs, &(h : void#))
+	| 0:
+	| err:
+		std.fput(std.Err, "error: sysarch returned {}\n", err)
+		std.suicide()
+	;;
+}
+
+const getfsbase = {
+	var h
+	match sys.sysarch(sys.Archamd64getfs, &h)
+	| 0: -> (h : tlshdr#)
+	| err:
+		std.fput(std.Err, "error: sysarch returned {}\n", err)
+		std.suicide()
+	;;
+}
--- /dev/null
+++ b/lib/thread/fsbase+linux.myr
@@ -1,0 +1,28 @@
+use std
+use sys
+
+use "types"
+
+pkg thread =
+	pkglocal const setfsbase : (h : tlshdr# -> void)
+	pkglocal const getfsbase : (-> tlshdr#)
+;;
+
+const setfsbase = {h
+	match sys.arch_prctl(sys.Archsetfs, (h : void#))
+	| 0:
+	| err:
+		std.fput(std.Err, "error: arch_prctl returned {}\n", err)
+		std.suicide()
+	;;
+}
+
+const getfsbase = {
+	var h : tlshdr#
+	match sys.arch_prctl(sys.Archgetfs, (&h : void#))
+	| 0: -> h
+	| err:
+		std.fput(std.Err, "error: arch_prctl returned {}\n", err)
+		std.suicide()
+	;;
+}
--- /dev/null
+++ b/lib/thread/fsbase+netbsd.myr
@@ -1,0 +1,28 @@
+use std
+use sys
+
+use "types"
+
+pkg thread =
+	pkglocal const setfsbase : (h : tlshdr# -> void)
+	pkglocal const getfsbase : (-> tlshdr#)
+;;
+
+const setfsbase = {h
+	match sys.sysarch(sys.X8664setfsbase, &(h : void#))
+	| 0:
+	| err:
+		std.fput(std.Err, "error: sysarch returned: {}\n", err)
+		std.suicide()
+	;;
+}
+
+const getfsbase = {
+	var h
+	match sys.sysarch(sys.X8664getfsbase, &h)
+	| 0: -> (h : tlshdr#)
+	| err:
+		std.fput(std.Err, "error: sysarch returned: {}\n", err)
+		std.suicide()
+	;;
+}
--- /dev/null
+++ b/lib/thread/fsbase+openbsd.myr
@@ -1,0 +1,16 @@
+use sys
+
+use "types"
+
+pkg thread =
+	pkglocal const setfsbase : (h : tlshdr# -> void)
+	pkglocal const getfsbase : (-> tlshdr#)
+;;
+
+const setfsbase = {h
+	sys.__set_tcb((h : void#))
+}
+
+const getfsbase = {
+	-> (sys.__get_tcb() : tlshdr#)
+}
--- a/lib/thread/mutex+futex.myr
+++ b/lib/thread/mutex+futex.myr
@@ -1,9 +1,14 @@
+use std
+
 use "atomic"
 use "futex"
+use "tls"
+use "types"
 
 pkg thread =
 	type mutex = struct
 		_state	: ftxtag
+		_owner	: tid
 	;;	
 
 	const mkmtx	: (-> mutex)
@@ -21,12 +26,19 @@
 var nspin = 10	/* FIXME: pick a sane number, based on CPU count */
 
 const mkmtx = {
-	-> [._state = Unlocked]
+	-> [._state = Unlocked, ._owner = -1]
 }
 
 const mtxlock = {mtx
 	var c
 
+	if mtx._owner == tid()
+		std.fput(std.Err,
+			"error: thread {} attempted to relock a mutex it already holds\n",
+			tid())
+		std.suicide()
+	;;
+
 	/*
 	Uncontended case: we get an unlocked mutex, and we lock it.
 	*/
@@ -34,6 +46,7 @@
 	for var i = 0; i < nspin; i++
 		c = xcas(&mtx._state, Unlocked, Locked)
 		if c == Unlocked
+			mtx._owner = tid()
 			-> void
 		;;
 	;;
@@ -51,14 +64,32 @@
 		ftxwait(&mtx._state, Contended, -1)
 		c = xchg(&mtx._state, Contended)
 	;;
+	mtx._owner = tid()
 }
 
 const mtxtrylock = {mtx
-	-> xcas(&mtx._state, Unlocked, Locked) == Unlocked
+	if xcas(&mtx._state, Unlocked, Locked) == Unlocked
+		mtx._owner = tid()
+		-> true
+	;;
+	-> false
 }
 
 const mtxunlock = {mtx
 	/*
+	Nonatomically loading mtx._owner may produce false negatives on
+	weakly-ordered architectures but having to atomically store and load
+	mtx._owner doesn't seem worth it.
+	*/
+	if mtx._owner != tid()
+		std.fput(std.Err,
+			"error: thread {} attempted to unlock a mutex last held by {}\n",
+			tid(), mtx._owner)
+		std.suicide()
+	;;
+	mtx._owner = -1
+
+	/*
 	Either the lock is contended or it's uncontended. Any other
 	state is a bug.
 
@@ -72,7 +103,15 @@
 }
 
 const mtxcontended = {mtx
+	if mtx._owner == tid()
+		std.fput(std.Err,
+			"error: thread {} attempted to relock a mutex it already holds\n",
+			tid())
+		std.suicide()
+	;;
+
 	while xchg(&mtx._state, Contended) != Unlocked
 		ftxwait(&mtx._state, Contended, -1)
 	;;
+	mtx._owner = tid()
 }
--- a/lib/thread/mutex.myr
+++ b/lib/thread/mutex.myr
@@ -1,5 +1,4 @@
 use std
-use sys
 
 use "atomic"
 
--- a/lib/thread/rwlock+futex.myr
+++ b/lib/thread/rwlock+futex.myr
@@ -2,6 +2,8 @@
 
 use "atomic"
 use "futex"
+use "tls"
+use "types"
 
 pkg thread =
 	/*
@@ -13,6 +15,7 @@
 	*/
 	type rwlock = struct
 		_state : ftxtag
+		_owner : tid
 	;;
 
 	const mkrwlock  : (-> rwlock)
@@ -28,7 +31,7 @@
 const Waitbit = 0x80000000
 
 const mkrwlock = {
-	-> [._state = 0]
+	-> [._state = 0, ._owner = -1]
 }
 
 const rdlock = {rw
@@ -61,6 +64,13 @@
 
 const wrlock = {rw
 	for ; ;
+		if rw._owner == tid()
+			std.fput(std.Err,
+				"error: thread {} attempted to relock an rwlock it already holds\n",
+				tid())
+			std.suicide()
+		;;
+
 		/*
 		`_state` must be 0 for a writer to acquire the lock. Anything
 		else means the lock is either held or in the process of being
@@ -68,6 +78,7 @@
 		 */
 		var s = xcas(&rw._state, 0, Nrmask)
 		if s == 0
+			rw._owner = tid()
 			-> void
 		;;
 
@@ -98,7 +109,11 @@
 }
 
 const trywrlock = {rw
-	-> xcas(&rw._state, 0, Nrmask) == 0
+	if xcas(&rw._state, 0, Nrmask) == 0
+		rw._owner = tid()
+		-> true
+	;;
+	-> false
 }
 
 const rdunlock = {rw
@@ -122,6 +137,14 @@
 }
 
 const wrunlock = {rw
+	if rw._owner != tid()
+		std.fput(std.Err,
+			"error: thread {} attempted to unlock an rwlock last held by {}\n",
+			tid(), rw._owner)
+		std.suicide()
+	;;
+	rw._owner = -1
+
 	/*
 	If the wait bit was set then there are one or more waiting readers,
 	writers, or both. In the first and third cases, we need to wake
--- a/lib/thread/spawn+freebsd.myr
+++ b/lib/thread/spawn+freebsd.myr
@@ -1,9 +1,12 @@
 use sys
 use std
 
-pkg thread =
-	type tid = uint64
+use "common"
+use "fsbase"
+use "tls"
+use "types"
 
+pkg thread =
 	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
 ;;
 
@@ -16,60 +19,63 @@
 }
 
 const spawnstk = {fn, sz
-	var stk : byte#, tid, ctid, ret
-	var szp, f, tos, env, envsz
+	var stk, tos, stksz, hdr, tid = -1, ret
 
-	stk = getstk(sz)
+	stk = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
 	if stk == sys.Mapbad
 		-> `std.Err "couldn't get stack"
 	;;
-	tid = -1
-	/* find top of stack */
-	tos = (stk : std.intptr) + (sz : std.intptr)
+	(tos, stksz, hdr) = initstk(stk, fn, sz)
 
-	/* store the stack size */
-	tos -= sizeof(sys.size)
-	sz -= sizeof(sys.size)
-	szp = (tos : sys.size#)
-	szp# = Stacksz
-
-	/* store the function we call */
-	envsz = std.fnenvsz(fn)
-	tos -= (envsz : std.intptr)
-	sz -= (envsz : sys.size)
-	env = tos
-	tos -= sizeof((->void))
-	sz -= sizeof((->void))
-	f = (tos : (->void)#)
-	f# = std.fnbdup(fn, (env : byte#)[:envsz])
-	var repr = (&fn : int64[2]#)#
-
 	ret = sys.thr_new(&[
 		.startfn = (startthread : void#),
 		.arg = (tos : void#),
 		.stkbase = (stk : byte#),
-		.stksz = sz,
-		.tid = &ctid,
+		.stksz = stksz,
+		.tid = (&hdr.tid : uint64#),
 		.ptid = &tid,
 		.flags = 2,
-		.rtp = (0 : sys.rtprio#)
+		.rtp = Zptr,
 	], sizeof(sys.thrparam))
 
 	if ret < 0
+		sys.munmap(stk, sz)
 		-> `std.Err "couldn't spawn thread"
 	;;
 	-> `std.Ok (tid : tid)
 }
 
-const getstk = {sz
-	var p, m
+const initstk = {stk, fn, sz
+	var stksz, len, tos, hdr, fp, env, envsz
 
-	p = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
-	if p == sys.Mapbad
-		-> p
-	;;
-	m = (p : std.intptr)
-	-> (m : byte#)
+	stksz = sz
+	len = tlslen()
+	stksz -= (sizeof(tlshdr) + ((len : sys.size) * sizeof(void#)) + 0xf) & ~0xf
+	tos = (stk : std.intptr) + (stksz : std.intptr)
+	hdr = (tos : tlshdr#)
+	hdr.base = stk
+	hdr.stksz = sz
+
+	var fn1 = {
+		/*
+		We write `hdr.len` here because it follows `hdr.tid` so it gets
+		overwritten by the kernel in `thr_new`. Even though `sys.pid`
+		is 32 bits, `thr_param.tid` is a `uint64#` for legacy reasons.
+		*/
+		hdr.len = len
+		setfsbase(hdr)
+		fn()
+	}
+
+	envsz = std.fnenvsz(fn1)
+	tos -= (envsz : std.intptr)
+	stksz -= (envsz : sys.size)
+	env = tos
+	tos -= sizeof((->void))
+	stksz -= sizeof((->void))
+	fp = (tos : (->void)#)
+	fp# = std.fnbdup(fn1, (env : byte#)[:envsz])
+	-> ((tos : byte#), stksz, hdr)
 }
 
 const startthread = {f : (-> void)#
--- a/lib/thread/spawn+linux.myr
+++ b/lib/thread/spawn+linux.myr
@@ -1,72 +1,67 @@
 use sys
 use std
 
-pkg thread =
-	type tid = sys.pid
+use "common"
+use "tls"
+use "types"
 
+pkg thread =
 	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
 ;;
 
+const Stacksz = 8*std.MiB
 extern const exit : (-> void)
 
 /* Holy shit flag mania. */
-const Thrflag = sys.Clonevm | sys.Clonefs | sys.Clonefiles  | \
-	sys.Clonesighand | sys.Clonethread |sys.Clonesysvsem | \
-	sys.Clonesettls | sys.Cloneparentsettid | sys.Clonechildcleartid
+const Thrflag = sys.Clonevm | sys.Clonefs | sys.Clonefiles | \
+	sys.Clonesighand | sys.Clonethread | sys.Clonesettls | \
+	sys.Clonechildsettid
 
-const Stacksz = 8*std.MiB
-
 const spawn = {fn
 	-> spawnstk(fn, Stacksz)
 }
 
 const spawnstk = {fn, sz
-	var stk : byte#, tid, ctid, ret
+	var stk, tos, hdr, ret
 
-	stk = getstk(sz)
+	stk = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
 	if stk == sys.Mapbad
 		-> `std.Err "couldn't get stack"
 	;;
-	stk = initstack(stk, fn, Stacksz)
+	(tos, hdr) = initstk(stk, fn, sz)
 
-	ret = sys.fnclone(Thrflag, \
-		(stk : byte#),\
-		&tid, (0 : byte#), \
-		&ctid, (0 : byte#), \
+	ret = sys.fnclone(Thrflag,
+		tos,
+		Zptr,
+		(hdr : byte#),
+		(&hdr.tid : sys.pid#),
+		Zptr,
 		(startthread : void#))
 	if ret < 0
+		sys.munmap(stk, sz)
 		-> `std.Err "couldn't spawn thread"
 	;;
 	-> `std.Ok (ret : tid)
 }
 
-const initstack = {stk, fn, sz
-	var tos, szp, fp, env, envsz
+const initstk = {stk, fn, sz
+	var len, tos, hdr, fp, env, envsz
 
+	len = tlslen()
+	tos = (stk : std.intptr) + (sz : std.intptr)
+	tos -= (sizeof(tlshdr) + ((len : std.intptr) * sizeof(void#)) + 0xf) & ~0xf
+	hdr = (tos : tlshdr#)
+	hdr.len = len
+	hdr.base = stk
+	hdr.stksz = sz
+
 	envsz = std.fnenvsz(fn)
-	tos = (stk : std.intptr)
-	tos -= sizeof(int64)
-	szp = (tos : sys.size#)
-	szp# = sz
 	tos -= (envsz : std.intptr)
 	env = tos
 	tos -= sizeof((->void))
 	fp = (tos : (->void)#)
 	fp# = std.fnbdup(fn, (env : byte#)[:envsz])
-	-> (tos : byte#)
-}
-
-const getstk = {sz
-	var p, m
-
-	p = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
-	if p == sys.Mapbad
-		-> p
-	;;
-	/* stack starts at the top of memory and grows down. */
-	m = (p : std.intptr)
-	m += (sz : std.intptr)
-	-> (m : byte#)
+	-> ((tos : byte#), hdr)
 }
 
 const startthread = {fn : (-> void)
--- a/lib/thread/spawn+openbsd.myr
+++ b/lib/thread/spawn+openbsd.myr
@@ -1,9 +1,11 @@
 use std
 use sys
 
-pkg thread =
-	type tid = uint64
+use "common"
+use "tls"
+use "types"
 
+pkg thread =
 	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
 	pkglocal var exitstk : byte#
 ;;
@@ -18,6 +20,7 @@
 	  time to swap to before we invalidate a stack.
 	 */
 	exitstk = getstk(16)
+	std.assert(exitstk != sys.Mapbad, "error: failed to mmap exitstk\n")
 }
 
 const spawn = {fn;
@@ -25,30 +28,17 @@
 }
 
 const spawnstk = {fn, sz
-	var stk, szp, fp, tos, tfp, env, envsz
-	var ret
+	var stk, tos, hdr, tfp, ret
 
 	stk = getstk(sz)
 	if stk == sys.Mapbad
 		-> `std.Err "couldn't get stack"
 	;;
-	/* store size */
-	tos = (stk : std.intptr)
-	tos -= sizeof(int64)
-	szp = (tos : sys.size#)
-	szp# = Stacksz
+	(tos, hdr) = initstk(stk, fn, sz)
 
-	/* store func */
-	envsz = std.fnenvsz(fn)
-	tos -= (envsz : std.intptr)
-	env = tos
-	tos -= sizeof((->void))
-	fp = (tos : (->void)#)
-	fp# = std.fnbdup(fn, (env : byte#)[:envsz])
-
 	tfp = [
-		.tcb = (0 : void#),
-		.tid = &ret,
+		.tcb = (hdr : void#),
+		.tid = (&hdr.tid : sys.pid#),
 		.stk = (tos : byte#),
 	]
 	ret = sys.__tfork_thread(&tfp,
@@ -56,22 +46,34 @@
 		(startthread : void#),
 		(0 : void#))
 	if ret < 0
+		sys.munmap(stk, sz)
 		-> `std.Err "couldn't spawn thread"
 	;;
 	-> `std.Ok (ret : tid)
 }
 
-const getstk = {sz
-	var p, m
+const initstk = {stk, fn, sz
+	var len, tos, hdr, fp, env, envsz
 
-	p = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon | sys.Mstack, -1, 0)
-	if p == sys.Mapbad
-		-> p
-	;;
-	/* stack starts at the top of memory and grows down. */
-	m = (p : std.intptr)
-	m += (sz : std.intptr)
-	-> (m : byte#)
+	len = tlslen()
+	tos = (stk : std.intptr) + (sz : std.intptr)
+	tos -= (sizeof(tlshdr) + ((len : std.intptr) * sizeof(void#)) + 0xf) & ~0xf
+	hdr = (tos : tlshdr#)
+	hdr.len = len
+	hdr.base = stk
+	hdr.stksz = sz
+
+	envsz = std.fnenvsz(fn)
+	tos -= (envsz : std.intptr)
+	env = tos
+	tos -= sizeof((->void))
+	fp = (tos : (->void)#)
+	fp# = std.fnbdup(fn, (env : byte#)[:envsz])
+	-> ((tos : byte#), hdr)
+}
+
+const getstk = {sz
+	-> sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
 }
 
 const startthread = {fn : (-> void)
--- a/lib/thread/spawn+osx.myr
+++ b/lib/thread/spawn+osx.myr
@@ -1,9 +1,10 @@
 use sys
 use std
 
-pkg thread =
-	type tid = uint64
+use "tls"
+use "types"
 
+pkg thread =
 	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
 ;;
 
@@ -34,35 +35,14 @@
 }
 
 const spawnstk = {fn, sz
-	var stk : byte#, tid, ret
-	var szp, f, tos, env, envsz
+	var stk, tos, ret
 
-	stk = getstk(sz)
+	stk = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
 	if stk == sys.Mapbad
 		-> `std.Err "couldn't get stack"
 	;;
-	tid = -1
+	tos = initstk(stk, fn, sz)
 
-	/* find top of stack */
-	tos = (stk : std.intptr) + (sz : std.intptr)
-
-	/* store the stack size */
-	tos -= sizeof(sys.size)
-	sz -= sizeof(sys.size)
-	szp = (tos : sys.size#)
-	szp# = Stacksz
-
-	/* store the function we call */
-	envsz = std.fnenvsz(fn)
-	tos -= (envsz : std.intptr)
-	sz -= (envsz : sys.size)
-	env = tos
-	tos -= sizeof((->void))
-	sz -= sizeof((->void))
-	f = (tos : (->void)#)
-	f# = std.fnbdup(fn, (env : byte#)[:envsz])
-	var repr = (&fn : int64[2]#)#
-
 	ret = sys.bsdthread_create( \
 		(tramp	: void#), \	/* start */
 		(tos	: void#), \		/* arg */
@@ -70,21 +50,37 @@
 		(0	: void#), \		/* pthread struct */
 		0x01000000)			/* flags (PTHREAD_START_CUSTOM): don't alloc stack in kernel */
 
-	if ret == (-1 : void#)
+	if (ret : std.size) < 0
+		sys.munmap(stk, sz)
 		-> `std.Err "couldn't spawn thread"
 	;;
-	-> `std.Ok (ret : tid)
+	-> `std.Ok (stk : tid)
 }
 
-const getstk = {sz
-	var p, m
+const initstk = {stk, fn, sz
+	var len, tos, hdr, fp, env, envsz
 
-	p = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
-	if p == sys.Mapbad
-		-> p
-	;;
-	m = (p : std.intptr)
-	-> (m : byte#)
+	len = tlslen()
+	tos = (stk : std.intptr) + (sz : std.intptr)
+	tos -= (sizeof(tlshdr) + ((len : std.intptr) * sizeof(void#)) + 0xf) & ~0xf
+	hdr = (tos : tlshdr#)
+	hdr.tid = (stk : tid)
+	hdr.len = len
+	hdr.base = stk
+	hdr.stksz = sz
+
+	var fn1 = {
+		setgsbase(hdr)
+		fn()
+	}
+
+	envsz = std.fnenvsz(fn1)
+	tos -= (envsz : std.intptr)
+	env = tos
+	tos -= sizeof((->void))
+	fp = (tos : (->void)#)
+	fp# = std.fnbdup(fn1, (env : byte#)[:envsz])
+	-> (tos : byte#)
 }
 
 /*
--- a/lib/thread/start+osx-x64.s
+++ b/lib/thread/start+osx-x64.s
@@ -15,20 +15,13 @@
 	
 /*
 const thread.exit	: (stacksz : std.size -> void)
-NOTE: must be called from the bottom of the stack, since
-we assume that %rbp is in the top 4k of the stack.
 */
 .globl _thread$exit
 _thread$exit:
-	/* find top of stack */
-	movq	%rbp,%rdi	/* addr */
-	andq	$~0xfff,%rdi	/* align it */
-	addq	$0x1000,%rdi
-
 	/* munmap(base, size) */
 	movq	$0x2000049,%rax	/* munmap */
-	movq	-8(%rdi),%rsi	/* size */
-	subq	%rsi,%rdi	/* move to base ptr */
+	movq	%gs:0x08,%rdi	/* base */
+	movq	%gs:0x10,%rsi	/* stksz */
 	syscall
 
 	/* exit the thread */
--- /dev/null
+++ b/lib/thread/test/die.myr
@@ -1,0 +1,8 @@
+use thread
+
+const main = {
+	var m = thread.mkmtx()
+	thread.mtxlock(&m)
+	thread.mtxunlock(&m)
+	thread.mtxunlock(&m)
+}
--- /dev/null
+++ b/lib/thread/test/tls.myr
@@ -1,0 +1,49 @@
+use std
+use sys
+use thread
+
+const Nelt = 100
+const Nthr = 100
+
+var elts : thread.tid[Nelt]
+var start
+var wg
+
+const setget = {
+	var tid = thread.tid()
+	var localelts : thread.tid[Nelt]
+	for var i = 0; i < Nelt; i++
+		localelts[i] = elts[i] + tid	/* per-thread values: shared base plus this thread's tid */
+	;;
+
+	var k = start	/* keys are walked upward from `start` with k++ */
+	for var i = 0; i < Nelt; i++
+		thread.tlsset(k, &localelts[i])
+		k++
+	;;
+	k = start
+	for var i = 0; i < Nelt; i++
+		std.assert(thread.tlsget(k)# == localelts[i], "tls is broken\n")	/* value read back through the slot must match this thread's element */
+		k++
+	;;
+	thread.wgpost(&wg)	/* signal completion on the wait group that main waits on */
+}
+
+const main = {
+	for var i = 0; i < Nelt; i++	/* random base values; setget adds each thread's tid to them */
+		elts[i] = std.randnum()
+	;;
+
+	start = thread.tlsalloc()	/* first key; setget steps through Nelt keys starting here */
+	for var i = 1; i < Nelt; i++
+		var k : thread.tlskey(thread.tid#) = thread.tlsalloc()
+	;;
+
+	wg = thread.mkwg(Nthr)
+	for var i = 1; i < Nthr; i++	/* Nthr - 1 spawned threads; the setget() call below is the Nthr-th poster */
+		thread.spawn(setget)
+	;;
+	setget()
+
+	thread.wgwait(&wg)
+}
--- /dev/null
+++ b/lib/thread/tls+fsbase.myr
@@ -1,0 +1,59 @@
+use std
+
+use "common"
+use "fsbase"
+use "types"
+
+pkg thread =
+	generic      tlsalloc : (-> tlskey(@a#))
+	generic      tlsset   : (k : tlskey(@a#), v : @a# -> void)
+	generic      tlsget   : (k : tlskey(@a#) -> @a#)
+	extern const tid      : (-> tid)
+
+	pkglocal const        tlsoob : (k : tlskey(void) -> void)
+	pkglocal extern const tlslen : (-> tlskey(void))
+;;
+
+const Staticcap = 8
+
+var _hdr
+var _cap = Staticcap
+
+generic tlsalloc = {
+	std.assert(tid() == 0, "error: tlsalloc must be called from main thread\n")
+	if _hdr == Zptr
+		/* `_hdr` is lazily initialized here since we can't set it in start.s */
+		_hdr = getfsbase()
+	;;
+
+	if _hdr.len++ == _cap	/* post-increment: compares the old length, then claims the slot */
+		std.assert(_cap < 0x8000_0000, "error: max tls slots exceeded\n")
+		var l = sizeof(tlshdr) + ((_cap : std.size) * sizeof(void#))	/* byte size of the current region */
+		var h = std.bytealloc(sizeof(tlshdr) + ((_cap *= 2 : std.size) * sizeof(void#)))	/* note: doubles `_cap` as a side effect */
+
+		std.memblit(h, (_hdr : byte#), l)	/* carries over the header (with the bumped len) and the existing slots */
+		setfsbase((h : tlshdr#))
+		/* this is ugly... the initial tls region is statically allocated */
+		if _cap != Staticcap * 2	/* i.e. the old region was not the initial Staticcap-sized one */
+			std.bytefree((_hdr : byte#), l)
+		;;
+		_hdr = (h : tlshdr#)
+	;;
+	-> (_hdr.len - 1 : tlskey(@a#))	/* the key is the index of the slot just claimed */
+}
+
+generic tlsset = {k, v
+	_tlsset((k : tlskey(void)), (v : void#))
+}
+
+generic tlsget = {k
+	-> (_tlsget((k : tlskey(void))) : @a#)
+}
+
+const tlsoob = {k
+	std.fput(std.Err, "error: tlskey {} out of bounds {}\n", k, tlslen())
+	std.suicide()
+}
+
+extern const _tlsset : (k : tlskey(void), v : void# -> void)
+extern const _tlsget : (k : tlskey(void) -> void#)
--- /dev/null
+++ b/lib/thread/tls+osx.myr
@@ -1,0 +1,70 @@
+use std
+
+use "common"
+use "types"
+
+pkg thread =
+	generic      tlsalloc : (-> tlskey(@a#))
+	generic      tlsset   : (k : tlskey(@a#), v : @a# -> void)
+	generic      tlsget   : (k : tlskey(@a#) -> @a#)
+	extern const tid      : (-> tid)
+
+	pkglocal const        tlsoob    : (k : tlskey(void) -> void)
+	pkglocal extern const tlslen    : (-> tlskey(void))
+	pkglocal const        setgsbase : (h : tlshdr# -> void)
+	pkglocal extern const getgsbase : (-> tlshdr#)
+;;
+
+const Staticcap = 8
+
+var _hdr
+var _cap = Staticcap
+
+generic tlsalloc = {
+	std.assert(tid() == 0, "error: tlsalloc must be called from main thread\n")
+	if _hdr == Zptr
+		/* `_hdr` is lazily initialized here since we can't set it in start.s */
+		_hdr = getgsbase()
+	;;
+
+	if _hdr.len++ == _cap
+		std.assert(_cap < 0x8000_0000, "error: max tls slots exceeded\n")
+		var l = sizeof(tlshdr) + ((_cap : std.size) * sizeof(void#))
+		var h = std.bytealloc(sizeof(tlshdr) + ((_cap *= 2 : std.size) * sizeof(void#)))
+
+		std.memblit(h, (_hdr : byte#), l)
+		setgsbase((h : tlshdr#))
+		/* this is ugly... the initial tls region is statically allocated */
+		if _cap != Staticcap * 2
+			std.bytefree((_hdr : byte#), l)
+		;;
+		_hdr = (h : tlshdr#)
+	;;
+	-> (_hdr.len - 1 : tlskey(@a#))
+}
+
+generic tlsset = {k, v
+	_tlsset((k : tlskey(void)), (v : void#))
+}
+
+generic tlsget = {k
+	-> (_tlsget((k : tlskey(void))) : @a#)
+}
+
+const tlsoob = {k
+	std.fput(std.Err, "error: tlskey {} out of bounds {}\n", k, tlslen())
+	std.suicide()
+}
+
+const setgsbase = {h
+	match _setgsbase(h)
+	| 0xf: /* yes, this indicates success; no, it's not documented */
+	| err:
+		std.fput(std.Err, "error: setgsbase returned {}\n", err)
+		std.suicide()
+	;;
+}
+
+extern const _tlsset    : (k : tlskey(void), v : void# -> void)
+extern const _tlsget    : (k : tlskey(void) -> void#)
+extern const _setgsbase : (h : tlshdr# -> int64)
--- /dev/null
+++ b/lib/thread/tls-impl+fsbase-x64.s
@@ -1,0 +1,48 @@
+.set tid,	0x00	/* offset of tlshdr.tid (see types+fsbase.myr) */
+.set len,	0x04	/* offset of tlshdr.len */
+.set slots,	0x18	/* offset of tlshdr.slots, i.e. the size of the fixed header */
+
+/* const tid : (-> tid) -- returns the 32-bit tid field of the tls header at %fs */
+.globl thread$tid
+.globl _thread$tid
+thread$tid:
+_thread$tid:
+	movl	%fs:tid, %eax
+	ret
+
+/* const _tlsset : (k : tlskey(void), v : void# -> void) -- slots[k] = v, k in %edi, v in %rsi */
+.globl thread$_tlsset
+.globl _thread$_tlsset
+thread$_tlsset:
+_thread$_tlsset:
+	cmpl	%fs:len, %edi	/* compare k against the slot count */
+	jnb	oob		/* unsigned k >= len: take the failure path */
+
+	movslq	%edi, %rdi	/* widen k to 64 bits for use as an index */
+	movq	$slots, %r10	/* %r10 = offset of slots[0] within the region */
+	movq	%rsi, %fs:(%r10, %rdi, 0x8)	/* store v at %fs:slots + k*8 */
+	ret
+
+/* const _tlsget : (k : tlskey(void) -> void#) -- returns slots[k], k in %edi */
+.globl thread$_tlsget
+.globl _thread$_tlsget
+thread$_tlsget:
+_thread$_tlsget:
+	cmpl	%fs:len, %edi	/* compare k against the slot count */
+	jnb	oob		/* unsigned k >= len: take the failure path */
+
+	movslq	%edi, %rdi	/* widen k to 64 bits for use as an index */
+	movq	$slots, %r10	/* %r10 = offset of slots[0] within the region */
+	movq	%fs:(%r10, %rdi, 0x8), %rax	/* load the value at %fs:slots + k*8 */
+	ret
+
+/* shared failure path for _tlsset/_tlsget; %edi has not been modified since entry, so it still holds the rejected key */
+/* NOTE(review): there is no ret after the call, so this relies on tlsoob never returning -- confirm */
+oob:
+	call	thread$tlsoob
+
+/* const tlslen : (-> tlskey(void)) -- returns the 32-bit len field of the tls header at %fs */
+.globl thread$tlslen
+.globl _thread$tlslen
+thread$tlslen:
+_thread$tlslen:
+	movl	%fs:len, %eax
+	ret
--- /dev/null
+++ b/lib/thread/tls-impl+osx-x64.s
@@ -1,0 +1,64 @@
+.set tid,	0x00	/* offset of tlshdr.tid (see types+osx.myr) */
+.set len,	0x08	/* offset of tlshdr.len */
+.set self,	0x20	/* offset of tlshdr.self, the region's own address */
+.set slots,	0x28	/* offset of tlshdr.slots, i.e. the size of the fixed header */
+
+/* const tid : (-> tid) -- returns the 64-bit tid field of the tls header at %gs */
+.globl thread$tid
+.globl _thread$tid
+thread$tid:
+_thread$tid:
+	movq	%gs:tid, %rax
+	ret
+
+/* const _tlsset : (k : tlskey(void), v : void# -> void) -- slots[k] = v, k in %rdi, v in %rsi */
+.globl thread$_tlsset
+.globl _thread$_tlsset
+thread$_tlsset:
+_thread$_tlsset:
+	cmpq	%gs:len, %rdi	/* compare k against the slot count */
+	jnb	oob		/* unsigned k >= len: take the failure path */
+
+	movq	$slots, %r10	/* %r10 = offset of slots[0] within the region */
+	movq	%rsi, %gs:(%r10, %rdi, 0x8)	/* store v at %gs:slots + k*8 */
+	ret
+
+/* const _tlsget : (k : tlskey(void) -> void#) -- returns slots[k], k in %rdi */
+.globl thread$_tlsget
+.globl _thread$_tlsget
+thread$_tlsget:
+_thread$_tlsget:
+	cmpq	%gs:len, %rdi	/* compare k against the slot count */
+	jnb	oob		/* unsigned k >= len: take the failure path */
+
+	movq	$slots, %r10	/* %r10 = offset of slots[0] within the region */
+	movq	%gs:(%r10, %rdi, 0x8), %rax	/* load the value at %gs:slots + k*8 */
+	ret
+
+/* shared failure path for _tlsset/_tlsget; %rdi has not been modified since entry, so it still holds the rejected key */
+/* there is no ret after the call: tlsoob (tls+osx.myr) ends in std.suicide(), so control is not expected to come back */
+oob:
+	call	_thread$tlsoob
+
+/* const tlslen : (-> tlskey(void)) -- returns the 64-bit len field of the tls header at %gs */
+.globl thread$tlslen
+.globl _thread$tlslen
+thread$tlslen:
+_thread$tlslen:
+	movq	%gs:len, %rax
+	ret
+
+/* const _setgsbase : (h : tlshdr# -> int64) -- raw syscall wrapper; h arrives in %rdi and the kernel's result is returned in %rax */
+/* the Myrddin-side setgsbase treats a result of 0xf as success */
+.globl thread$_setgsbase
+.globl _thread$_setgsbase
+thread$_setgsbase:
+_thread$_setgsbase:
+	movq	$0x3000003, %rax /* undocumented syscall; sets %gs to %rdi */
+	syscall
+	ret
+
+/* const getgsbase : (-> tlshdr#) -- returns the pointer stored in tlshdr.self of the region at %gs */
+/* the result is only the real gs base if whoever installed the region also stored its address in `self` */
+.globl thread$getgsbase
+.globl _thread$getgsbase
+thread$getgsbase:
+_thread$getgsbase:
+	movq	%gs:self, %rax
+	ret
--- /dev/null
+++ b/lib/thread/types+fsbase.myr
@@ -1,0 +1,19 @@
+use sys
+
+pkg thread =
+	type tid        = sys.pid /* 32 bits on all of the fsbase platforms */
+	type tlskey(@a) = uint32 /* slot index; the type parameter only tags what the slot holds */
+
+	/*
+	XXX: Be sure to update tls-impl+fsbase-x64.s and
+	rt/start-{freebsd,linux,netbsd,openbsd}.s if any changes are made to
+	the size of this struct and/or the offsets of any of its members.
+	 */
+	pkglocal type tlshdr = struct
+		tid   : tid		/* offset 0x00 */
+		len   : tlskey(void)	/* offset 0x04; keys are bounds-checked against this */
+		base  : byte#		/* presumably the base of the thread's stack mapping -- confirm in spawn */
+		stksz : sys.size	/* presumably the size of that mapping -- confirm in spawn */
+		slots : void#[...]	/* offset 0x18; one pointer-sized cell per key */
+	;;
+;;
--- /dev/null
+++ b/lib/thread/types+osx.myr
@@ -1,0 +1,20 @@
+use sys
+
+pkg thread =
+	type tid        = sys.pid /* 64 bits */
+	type tlskey(@a) = uint64 /* slot index; the type parameter only tags what the slot holds */
+
+	/*
+	XXX: Be sure to update tls-impl+osx-x64.s and rt/start-osx.s if any
+	changes are made to the size of this struct and/or the offsets of any
+	of its members.
+	 */
+	pkglocal type tlshdr = struct
+		tid   : tid		/* offset 0x00 */
+		len   : tlskey(void)	/* offset 0x08; keys are bounds-checked against this */
+		base  : byte#		/* presumably the base of the thread's stack mapping -- confirm in spawn */
+		stksz : sys.size	/* presumably the size of that mapping -- confirm in spawn */
+		self  : tlshdr#		/* offset 0x20; the region's own address, read back by getgsbase */
+		slots : void#[...]	/* offset 0x28; one pointer-sized cell per key */
+	;;
+;;
--- a/mk/bootstrap/bootstrap+Darwin-x86_64.sh
+++ b/mk/bootstrap/bootstrap+Darwin-x86_64.sh
@@ -7,6 +7,7 @@
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/start.o lib/thread/start+osx-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+osx-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -120,16 +121,18 @@
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+osx.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+osx.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+osx.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+osx.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/futex+osx.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/start.o lib/thread/futex.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/start.o lib/thread/futex.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
--- a/mk/bootstrap/bootstrap+FreeBSD-amd64.sh
+++ b/mk/bootstrap/bootstrap+FreeBSD-amd64.sh
@@ -7,6 +7,7 @@
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/exit.o lib/thread/exit+freebsd-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+fsbase-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -120,16 +121,19 @@
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+freebsd.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+freebsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/fsbase+freebsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+freebsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+freebsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/futex+freebsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/fsbase.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/fsbase.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
--- a/mk/bootstrap/bootstrap+Linux-x86_64.sh
+++ b/mk/bootstrap/bootstrap+Linux-x86_64.sh
@@ -7,6 +7,7 @@
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/exit.o lib/thread/exit+linux-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+fsbase-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -120,16 +121,19 @@
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+linux.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+linux.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/fsbase+linux.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+linux.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+linux.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/futex+linux.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/fsbase.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/fsbase.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
--- a/mk/bootstrap/bootstrap+NetBSD-amd64.sh
+++ b/mk/bootstrap/bootstrap+NetBSD-amd64.sh
@@ -6,6 +6,7 @@
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/config.myr
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+fsbase-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -119,15 +120,18 @@
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+netbsd.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/fsbase+netbsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+netbsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/fsbase.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/fsbase.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
--- a/mk/bootstrap/bootstrap+OpenBSD-amd64.sh
+++ b/mk/bootstrap/bootstrap+OpenBSD-amd64.sh
@@ -7,6 +7,7 @@
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/exit.o lib/thread/exit+openbsd-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+fsbase-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -120,16 +121,19 @@
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+openbsd.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+openbsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/fsbase+openbsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+openbsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+openbsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/futex+openbsd:6.2.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/fsbase.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/fsbase.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
--- a/rt/start-freebsd.s
+++ b/rt/start-freebsd.s
@@ -4,6 +4,10 @@
 sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 88 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 24 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -11,6 +15,7 @@
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl _start
@@ -34,6 +39,16 @@
 	pushq	%rax
 	pushq	%rcx
 	call	cvt
+
+	/* set up the initial tls region for the main thread */
+	subq	$0x10,%rsp
+	movq	$165,%rax		/* sysarch */
+	movq	$129,%rdi		/* Archamd64setfs */
+	leaq	thread$__tls(%rip),%rsi
+	movq	%rsi,(%rsp)
+	movq	%rsp,%rsi
+	syscall
+	addq	$0x10,%rsp
 
 	xorq %rbp,%rbp
 	/* call pre-main initializers */
--- a/rt/start-linux.s
+++ b/rt/start-linux.s
@@ -4,6 +4,10 @@
 sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 88 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 24 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -11,6 +15,7 @@
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl _start
@@ -35,6 +40,12 @@
 	pushq	%rax
 	pushq	%rcx
 	call	cvt
+
+	/* set up the initial tls region for the main thread */
+	movq	$158,%rax		/* arch_prctl */
+	movq	$0x1002,%rdi		/* Archsetfs */
+	leaq	thread$__tls(%rip),%rsi
+	syscall
 
 	xorq %rbp,%rbp
 	/* call pre-main initializers */
--- a/rt/start-netbsd.s
+++ b/rt/start-netbsd.s
@@ -12,6 +12,10 @@
 sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 88 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 24 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -19,6 +23,7 @@
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl _start
@@ -43,6 +48,16 @@
 	pushq	%rax
 	pushq	%rcx
 	call	cvt
+
+	/* set up the initial tls region for the main thread */
+	subq	$0x10,%rsp
+	movq	$165,%rax		/* sysarch */
+	movq	$15,%rdi		/* X8664setfsbase */
+	leaq	thread$__tls(%rip),%rsi
+	movq	%rsi,(%rsp)
+	movq	%rsp,%rsi
+	syscall
+	addq	$0x10,%rsp
 
 	xorq %rbp,%rbp
 	/* call pre-main initializers */
--- a/rt/start-openbsd.s
+++ b/rt/start-openbsd.s
@@ -13,6 +13,10 @@
 sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 88 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 24 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -20,6 +24,7 @@
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl _start
@@ -44,6 +49,11 @@
 	pushq	%rax
 	pushq	%rcx
 	call	cvt
+
+	/* set up the initial tls region for the main thread */
+	movq	$329,%rax		/* Sys__set_tcb */
+	leaq	thread$__tls(%rip),%rdi
+	syscall
 
 	xorq %rbp,%rbp
 	/*
--- a/rt/start-osx.s
+++ b/rt/start-osx.s
@@ -4,6 +4,10 @@
 _sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 104 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 40 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -11,6 +15,7 @@
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl start
@@ -35,6 +40,12 @@
 	pushq	%rax
 	pushq	%rcx
 	call	cvt
+
+	/* set up the initial tls region for the main thread */
+	movq	$0x3000003,%rax		/* undocumented setgsbase syscall */
+	leaq	thread$__tls(%rip),%rdi
+	movq	%rdi,0x20(%rdi)		/* also store a copy in __tls.self */
+	syscall
 
 	xorq %rbp,%rbp
 	call	___init__
--- a/support/syscall-gen/types+freebsd-x64.frag
+++ b/support/syscall-gen/types+freebsd-x64.frag
@@ -32,6 +32,7 @@
 type cpulevel	= int
 type cpusetid	= int
 type idtype	= int
+type sysarchop	= int64	/* op selector for sysarch; same width as in lib/sys/sys+freebsd-x64.myr */
 
 type acltype	= int
 type acltag	= uint32
@@ -795,6 +796,13 @@
 const Sigthr	: signo = 32	/* reserved by thread library. */
 const Siglwp	: signo = Sigthr
 const Siglibrt	: signo = 33	/* reserved by real-time library. */
+
+/* sysarch ops (amd64); rt/start-freebsd.s hardcodes the Archamd64setfs value; each op has its own number */
+const Archamd64getfs   : sysarchop = 128
+const Archamd64setfs   : sysarchop = 129
+const Archamd64getgs   : sysarchop = 130
+const Archamd64setgs   : sysarchop = 131
+const Archamd64getxfpu : sysarchop = 132
 
 extern const syscall : (sc:scno, args:... -> int64)
 extern var __cenvp : byte##
--- a/support/syscall-gen/types+linux-x64.frag
+++ b/support/syscall-gen/types+linux-x64.frag
@@ -38,6 +38,7 @@
 type mfdflags	= uint32
 type aiocontext	= uint64
 type msg	= void#
+type arch_prctlop	= uint64
 
 
 type clock = union
@@ -583,6 +584,12 @@
 
 /* return value for a failed mapping */
 const Mapbad	: byte# = (-1 : byte#)
+
+/* arch_prctl ops; rt/start-linux.s hardcodes the Archsetfs value */
+const Archsetgs : arch_prctlop = 0x1001
+const Archsetfs : arch_prctlop = 0x1002	/* used at startup to install the main thread's tls region */
+const Archgetfs : arch_prctlop = 0x1003
+const Archgetgs : arch_prctlop = 0x1004
 
 /* signal flags */
 const Sanocldstop	: sigflags = 0x00000001