shithub: gefix

Download patch

ref: 5eafd0ab6729981323928f2db6b7b2c10fe317c6
author: Ori Bernstein <ori@eigenstate.org>
date: Sat Jul 20 21:15:44 EDT 2024

gefix: hacked gefs to fix ream issue

--- /dev/null
+++ b/atomic-386.s
@@ -1,0 +1,100 @@
+/* get variants: the int, long and pointer entry points share one 32-bit load */
+TEXT ageti+0(SB),1,$0
+TEXT agetl+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+	MOVL	p+0(FP), AX	/* AX = p */
+	MOVL	0(AX), AX	/* AX = *p, the return value */
+	RET
+
+TEXT agetv+0(SB),1,$0	/* 64-bit load: *r = *p */
+	MOVL	r+0(FP), AX	/* AX = r, where the result is written */
+	MOVL	p+4(FP), BX	/* BX = p */
+	FMOVD	(BX), F0	/* fetch all 8 bytes of *p with a single x87 load */
+	FMOVDP	F0, (AX)	/* store them to *r and pop the x87 stack */
+	RET
+
+/* set variants: exchange *p with v; the previous value comes back in AX */
+TEXT aseti+0(SB),1,$0
+TEXT asetl+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+	MOVL		p+0(FP), BX
+	MOVL		v+4(FP), AX
+	LOCK; XCHGL	(BX), AX	/* swap AX with *p */
+	RET
+
+TEXT asetv+0(SB),1,$0	/* 64-bit exchange: *p = nv, previous value stored to *r */
+	MOVL	p+4(FP), DI
+	MOVL	nv+8(FP), BX	/* CX:BX = new value */
+	MOVL	nv+12(FP), CX
+	MOVL	0(DI), AX	/* DX:AX = first guess at the current value */
+	MOVL	4(DI), DX
+loop:
+	LOCK;	CMPXCHG8B (DI)	/* on a mismatch DX:AX is reloaded from *p */
+	JNE	loop
+	MOVL	r+0(FP),DI	/* DI = r, the result slot (same slot agetv and aincv call r) */
+	MOVL	AX, 0(DI)
+	MOVL	DX, 4(DI)
+	RET
+
+/* inc variants: add v to *p; the sum comes back in AX */
+TEXT ainci+0(SB),1,$0
+TEXT aincl+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+	MOVL	p+0(FP), BX
+	MOVL	v+4(FP), CX
+	MOVL	CX, AX
+	LOCK; XADDL AX, (BX)	/* *p += v, AX = value before the add */
+	ADDL	CX, AX	/* AX = value before the add + v */
+	RET
+
+TEXT aincv+0(SB),1,$0	/* 64-bit add: *p += v, sum stored to *r */
+	MOVL	p+4(FP), DI
+retry:
+	MOVL	0(DI), AX	/* DX:AX = value read from *p */
+	MOVL	4(DI), DX
+	MOVL 	AX, BX
+	MOVL	DX, CX
+	ADDL	v+8(FP), BX	/* CX:BX = DX:AX + v, ADCL carries into the high word */
+	ADCL	v+12(FP), CX
+	LOCK; CMPXCHG8B (DI)	/* store CX:BX only if *p still equals DX:AX */
+	JNE	retry	/* *p changed underneath us: read it again */
+	MOVL	r+0(FP), DI
+	MOVL	BX, 0x0(DI)
+	MOVL	CX, 0x4(DI)
+	RET
+
+/* cas variants: if *p equals ov store nv and return 1, otherwise return 0 */
+TEXT acasi+0(SB),1,$0
+TEXT acasl+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+	MOVL	p+0(FP), CX
+	MOVL	ov+4(FP), AX
+	MOVL	nv+8(FP), DX
+	LOCK; CMPXCHGL DX, (CX)	/* compares AX with *p, stores DX on a match */
+	JNE	fail32
+	MOVL	$1,AX
+	RET
+fail32:
+	MOVL	$0,AX
+	RET
+
+TEXT acasv+0(SB),1,$0	/* 64-bit compare and swap, returns 1 or 0 in AX */
+	MOVL	p+0(FP), DI
+	MOVL	ov+4(FP), AX	/* DX:AX = expected value */
+	MOVL	ov+8(FP), DX
+	MOVL	nv+12(FP), BX	/* CX:BX = replacement value */
+	MOVL	nv+16(FP), CX
+	LOCK; CMPXCHG8B (DI)
+	JNE	fail64
+	MOVL	$1,AX
+	RET
+fail64:
+	MOVL	$0,AX
+	RET
+
+/* barriers (do we want to distinguish types?) */
+TEXT coherence+0(SB),1,$0
+	/* this is essentially mfence but that requires sse2 */
+	XORL	AX, AX
+	LOCK; XADDL AX, (SP)	/* locked add of zero to the top of the stack; the word is left unchanged */
+	RET
--- /dev/null
+++ b/atomic-amd64.s
@@ -1,0 +1,59 @@
+/* get variants: the pointer argument is read through RARG */
+TEXT agetl+0(SB),1,$0
+	MOVL	(RARG), AX	/* 32-bit load */
+	RET
+TEXT agetv+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+	MOVQ	(RARG), AX	/* 64-bit load shared by the vlong and pointer forms */
+	RET
+
+/* set variants: exchange *p with v; the previous value comes back in AX */
+TEXT asetl+0(SB),1,$0
+	MOVL		v+8(FP), AX
+	LOCK; XCHGL	(RARG), AX	/* swap the low 32 bits of AX with *p */
+	RET
+
+TEXT asetv+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+	MOVQ		v+8(FP), AX
+	LOCK; XCHGQ	(RARG), AX	/* 64-bit swap */
+	RET
+
+/* inc variants: add v to *p; the sum comes back in AX */
+TEXT aincl+0(SB),1,$0
+	MOVQ		v+8(FP), BX	/* loads 8 bytes; only the low 32 bits take part in the XADDL */
+	MOVQ		BX, AX
+	LOCK; XADDL	AX, (RARG)	/* *p += v, AX = value before the add */
+	ADDQ		BX, AX	/* AX = value before the add + v */
+	RET
+
+TEXT aincv+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+	MOVQ		v+8(FP), BX
+	MOVQ		BX, AX
+	LOCK; XADDQ	AX, (RARG)	/* 64-bit form of the same sequence */
+	ADDQ		BX, AX
+	RET
+
+/* cas variants: if *p equals c store v and return 1, otherwise return 0 */
+TEXT acasl+0(SB),1,$0
+	MOVL	c+8(FP), AX
+	MOVL	v+16(FP), BX
+	LOCK; CMPXCHGL	BX, (RARG)	/* compares AX with *p, stores BX on a match */
+	SETEQ	AX	/* low byte of AX = 1 if the swap happened */
+	MOVBLZX	AX, AX	/* zero-extend that byte into the return value */
+	RET
+
+TEXT acasv+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+	MOVQ	c+8(FP), AX
+	MOVQ	v+16(FP), BX
+	LOCK; CMPXCHGQ BX, (RARG)	/* 64-bit form of the same sequence */
+	SETEQ	AX
+	MOVBLZX	AX, AX
+	RET
+
+/* barriers (do we want to distinguish types?): one MFENCE serves every caller */
+TEXT coherence+0(SB),1,$0
+	MFENCE
+	RET
--- /dev/null
+++ b/atomic-arm.c
@@ -1,0 +1,95 @@
+#include <u.h>
+#include <libc.h>
+
+#include "atomic.h"
+
+static Lock locktab[128];
+/* ihash: mix the bits of address p (constants from splitmix32 rng) into an index into locktab. */
+static u32int
+ihash(void *p)
+{
+	uintptr v;
+
+	v = (uintptr)p;
+	v ^= v >> 16;	v *= 0x85ebca6b;
+	v ^= v >> 13;	v *= 0xc2b2ae35;
+	v ^= v >> 16;
+	return v & (nelem(locktab)-1);
+}
+/* GET(T, n): define n(p), which reads *p while holding the lock that p hashes to. */
+#define GET(T, n) \
+	T n(T *p)			\
+	{				\
+		Lock *l;		\
+		T val;			\
+					\
+		l = &locktab[ihash(p)];	\
+		lock(l);		\
+		val = *p;		\
+		unlock(l);		\
+		return val;		\
+	}
+/* SET(T, n): define n(p, v), which stores v in *p under p's lock and returns the value it replaced. */
+#define SET(T, n) \
+	T n(T *p, T v)			\
+	{				\
+		Lock *l;		\
+		T old;			\
+					\
+		l = &locktab[ihash(p)];	\
+		lock(l);		\
+		old = *p;		\
+		*p = v;			\
+		unlock(l);		\
+		return old;		\
+	}
+/* INC(T, n): define n(p, dv), which adds dv to *p under p's lock and returns the updated value. */
+#define INC(T, n) \
+	T n(T *p, T dv)			\
+	{				\
+		Lock *l;		\
+		T sum;			\
+					\
+		l = &locktab[ihash(p)];	\
+		lock(l);		\
+		sum = *p + dv;		\
+		*p = sum;		\
+		unlock(l);		\
+		return sum;		\
+	}
+/* CAS(T, n): define n(p, ov, nv), which stores nv in *p only if *p equals ov; returns 1 on a swap, else 0. */
+#define CAS(T, n) \
+	int n(T *p, T ov, T nv)		\
+	{				\
+		Lock *l;		\
+		int ok;			\
+					\
+		l = &locktab[ihash(p)];	\
+		lock(l);		\
+		/* the compare and the store */	\
+		/* share one critical section */	\
+		ok = *p == ov;		\
+		if(ok)			\
+			*p = nv;	\
+		unlock(l);		\
+		return ok;		\
+	}
+/* Expand the lock-based templates above once per operand type. */
+GET(int, ageti)
+GET(long, agetl)
+GET(vlong, agetv)
+GET(void*, agetp)
+
+SET(int, aseti)
+SET(long, asetl)
+SET(vlong, asetv)
+SET(void*, asetp)
+
+INC(int, ainci)
+INC(long, aincl)
+INC(vlong, aincv)
+
+CAS(int, acasi)
+CAS(long, acasl)
+CAS(vlong, acasv)
+CAS(void*, acasp)
--- /dev/null
+++ b/atomic-arm64.s
@@ -1,0 +1,79 @@
+/* get variants: the pointer arrives in R0 and the loaded value is returned in R0 */
+TEXT agetl+0(SB),1,$0
+	MOVW	(R0), R0	/* 32-bit load */
+	RETURN
+TEXT agetv+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+	MOV	(R0), R0	/* 64-bit load shared by the vlong and pointer forms */
+	RETURN
+
+/* set variants: exchange *p with the argument; the previous value is returned in R0 */
+TEXT asetl+0(SB),1,$0
+	MOV	0x08(FP), R1	/* R1 = new value */
+	MOV	R0, R2	/* R2 = p, freeing R0 for the result */
+_setl:
+	LDAXRW	(R2), R0	/* R0 = previous value, loaded exclusively */
+	STLXRW	R1, (R2), R3	/* try to store R1; R3 receives the status */
+	CBNZW	R3, _setl	/* nonzero status: retry */
+	RETURN
+TEXT asetv+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+	MOV	0x08(FP), R1
+	MOV	R0, R2
+_setp:
+	LDAXR	(R2), R0	/* 64-bit form of the same loop */
+	STLXR	R1, (R2), R3
+	CBNZW	R3, _setp
+	RETURN
+
+/* inc variants: add the argument to *p and return the updated value, matching the x86, power64 and C versions */
+TEXT aincl+0(SB),1,$0
+	MOV	0x08(FP), R1	/* R1 = amount to add */
+	MOV	R0, R2	/* R2 = p, freeing R0 for the result */
+_incl:
+	LDAXRW	(R2), R0
+	ADDW	R1, R0, R0	/* R0 = loaded value + R1: both stored and returned */
+	STLXRW	R0, (R2), R4	/* R4 receives the store status */
+	CBNZW	R4, _incl	/* nonzero status: retry */
+	RETURN
+TEXT aincv+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+	MOV	0x08(FP), R1
+	MOV	R0, R2
+_incp:
+	LDAXR	(R2), R0
+	ADD	R1, R0, R0	/* 64-bit form of the same loop */
+	STLXR	R0, (R2), R4
+	CBNZW	R4, _incp
+	RETURN
+
+/* cas variants: return 1 if *p matched and the new value was stored, 0 otherwise (there is no retry loop) */
+TEXT acasl+0(SB),1,$0
+	MOV	0x08(FP), R1	/* R1 = expected value */
+	MOV	0x10(FP), R2	/* R2 = replacement value */
+	LDAXRW	(R0), R3
+	CMPW	R1, R3
+	BNE	_casl	/* mismatch: flags are NE, so CSETW yields 0 */
+	STLXRW	R2, (R0), R4
+	CMPW	$0, R4	/* EQ only if the exclusive store reported status 0 */
+_casl:
+	CSETW	EQ, R0
+	RETURN
+TEXT acasv+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+	MOV	0x08(FP), R1
+	MOV	0x10(FP), R2
+	LDAXR	(R0), R3	/* 64-bit form of the same sequence */
+	CMP	R1, R3
+	BNE	_casp
+	STLXR	R2, (R0), R4
+	CMPW	$0, R4
+_casp:
+	CSETW	EQ, R0
+	RETURN
+
+/* barriers: coherence issues a single DMB with the option value defined below */
+#define ISH	(2<<2 | 3)
+TEXT coherence+0(SB),1,$0
+	DMB	$ISH
+	RETURN
--- /dev/null
+++ b/atomic-mips.c
@@ -1,0 +1,95 @@
+#include <u.h>
+#include <libc.h>
+
+#include "atomic.h"
+
+static Lock locktab[128];
+/* ihash: mix the bits of address p (constants from splitmix32 rng) into an index into locktab. */
+static u32int
+ihash(void *p)
+{
+	uintptr v;
+
+	v = (uintptr)p;
+	v ^= v >> 16;	v *= 0x85ebca6b;
+	v ^= v >> 13;	v *= 0xc2b2ae35;
+	v ^= v >> 16;
+	return v & (nelem(locktab)-1);
+}
+/* GET(T, n): define n(p), which reads *p while holding the lock that p hashes to. */
+#define GET(T, n) \
+	T n(T *p)			\
+	{				\
+		Lock *l;		\
+		T val;			\
+					\
+		l = &locktab[ihash(p)];	\
+		lock(l);		\
+		val = *p;		\
+		unlock(l);		\
+		return val;		\
+	}
+/* SET(T, n): define n(p, v), which stores v in *p under p's lock and returns the value it replaced. */
+#define SET(T, n) \
+	T n(T *p, T v)			\
+	{				\
+		Lock *l;		\
+		T old;			\
+					\
+		l = &locktab[ihash(p)];	\
+		lock(l);		\
+		old = *p;		\
+		*p = v;			\
+		unlock(l);		\
+		return old;		\
+	}
+/* INC(T, n): define n(p, dv), which adds dv to *p under p's lock and returns the updated value. */
+#define INC(T, n) \
+	T n(T *p, T dv)			\
+	{				\
+		Lock *l;		\
+		T sum;			\
+					\
+		l = &locktab[ihash(p)];	\
+		lock(l);		\
+		sum = *p + dv;		\
+		*p = sum;		\
+		unlock(l);		\
+		return sum;		\
+	}
+/* CAS(T, n): define n(p, ov, nv), which stores nv in *p only if *p equals ov; returns 1 on a swap, else 0. */
+#define CAS(T, n) \
+	int n(T *p, T ov, T nv)		\
+	{				\
+		Lock *l;		\
+		int ok;			\
+					\
+		l = &locktab[ihash(p)];	\
+		lock(l);		\
+		/* the compare and the store */	\
+		/* share one critical section */	\
+		ok = *p == ov;		\
+		if(ok)			\
+			*p = nv;	\
+		unlock(l);		\
+		return ok;		\
+	}
+/* Expand the lock-based templates above once per operand type. */
+GET(int, ageti)
+GET(long, agetl)
+GET(vlong, agetv)
+GET(void*, agetp)
+
+SET(int, aseti)
+SET(long, asetl)
+SET(vlong, asetv)
+SET(void*, asetp)
+
+INC(int, ainci)
+INC(long, aincl)
+INC(vlong, aincv)
+
+CAS(int, acasi)
+CAS(long, acasl)
+CAS(vlong, acasv)
+CAS(void*, acasp)
--- /dev/null
+++ b/atomic-power64.s
@@ -1,0 +1,109 @@
+/*  get variants */
+TEXT agetl+0(SB),1,$0
+	SYNC
+	// See ISA 3.0B section B.2.3, "Safe Fetch"
+	MOVWZ	0(RARG), RARG
+	CMPW	RARG, RARG, CR7
+	BC	4, 30, 1(PC) // bne- cr7,0x4
+	ISYNC
+	RETURN
+
+TEXT agetv+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+	SYNC
+	// See ISA 3.0B section B.2.3, "Safe Fetch"
+	MOVD	0(RARG), RARG
+	CMP	RARG, RARG, CR7
+	BC	4, 30, 1(PC) // bne- cr7,0x4
+	ISYNC
+	RETURN
+
+/*  set variants: exchange *p with val and return the previous value, as the other ports do */
+TEXT asetl+0(SB),1,$0
+	MOVD	RARG, R4	// R4 = p, freeing RARG for the result
+	MOVW	val+8(FP), R5
+	SYNC
+	LWAR	(R4), RARG	// RARG = previous value, loaded with a reservation
+	STWCCC	R5, (R4)	// store val if the reservation still holds
+	BNE	-2(PC)	// reservation lost: back to the LWAR
+	ISYNC
+	RETURN
+
+TEXT asetv+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+	MOVD	RARG, R4	// R4 = p, freeing RARG for the result
+	MOVD	val+8(FP), R5
+	SYNC
+	LDAR	(R4), RARG	// 64-bit form of the same exchange loop
+	STDCCC	R5, (R4)
+	BNE	-2(PC)
+	ISYNC
+	RETURN
+
+/* inc variants: add delta to *p and return the updated value in RARG */
+TEXT aincl+0(SB),1,$0
+	MOVD	RARG, R4	// R4 = p, freeing RARG for the result
+	MOVW	delta+8(FP), R5
+	LWSYNC
+	LWAR	(R4), RARG	// load with a reservation
+	ADD	R5, RARG
+	STWCCC	RARG, (R4)	// conditional store of the sum
+	BNE	-3(PC)	// store failed: back to the LWAR
+	RETURN
+
+TEXT aincv+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+	MOVD	RARG, R4
+	MOVD	delta+8(FP), R5
+	LWSYNC
+	LDAR	(R4), RARG	// 64-bit form of the same loop
+	ADD	R5, RARG
+	STDCCC	RARG, (R4)
+	BNE	-3(PC)
+	RETURN
+
+/* cas variants: if *p equals old store new and return 1, otherwise return 0 */
+TEXT acasl+0(SB),1,$0
+	MOVWZ	old+8(FP), R4
+	MOVWZ	new+16(FP), R5
+	LWSYNC
+casagain:
+	LWAR	(RARG), R6	// load with a reservation
+	CMPW	R6, R4
+	BNE	casfail	// value differs: give up without storing
+	STWCCC	R5, (RARG)
+	BNE	casagain	// reservation lost: try again
+	MOVD	$1, RARG
+	LWSYNC
+	RETURN
+casfail:
+	LWSYNC
+	AND	R0, RARG	// assumes R0 holds zero, making the result 0 -- TODO confirm
+	RETURN
+
+TEXT acasv+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+	MOVD	old+8(FP), R4
+	MOVD	new+16(FP), R5
+	LWSYNC
+cas64again:
+	LDAR	(RARG), R6	// 64-bit form of the same sequence
+	CMP	R6, R4
+	BNE	cas64fail
+	STDCCC	R5, (RARG)
+	BNE	cas64again
+	MOVD	$1, RARG
+	LWSYNC
+	RETURN
+cas64fail:
+	LWSYNC
+	AND	R0, RARG	// assumes R0 holds zero, making the result 0 -- TODO confirm
+	RETURN
+
+/* barriers: coherence is a single LWSYNC */
+TEXT coherence+0(SB),1,$0
+	// LWSYNC is the "export" barrier recommended by Power ISA
+	// v2.07 book II, appendix B.2.2.2.
+	// LWSYNC is a load/load, load/store, and store/store barrier.
+	LWSYNC
+	RETURN
--- /dev/null
+++ b/atomic-spim.c
@@ -1,0 +1,95 @@
+#include <u.h>
+#include <libc.h>
+
+#include "atomic.h"
+
+static Lock locktab[128];
+/* ihash: mix the bits of address p (constants from splitmix32 rng) into an index into locktab. */
+static u32int
+ihash(void *p)
+{
+	uintptr v;
+
+	v = (uintptr)p;
+	v ^= v >> 16;	v *= 0x85ebca6b;
+	v ^= v >> 13;	v *= 0xc2b2ae35;
+	v ^= v >> 16;
+	return v & (nelem(locktab)-1);
+}
+/* GET(T, n): define n(p), which reads *p while holding the lock that p hashes to. */
+#define GET(T, n) \
+	T n(T *p)			\
+	{				\
+		Lock *l;		\
+		T val;			\
+					\
+		l = &locktab[ihash(p)];	\
+		lock(l);		\
+		val = *p;		\
+		unlock(l);		\
+		return val;		\
+	}
+/* SET(T, n): define n(p, v), which stores v in *p under p's lock and returns the value it replaced. */
+#define SET(T, n) \
+	T n(T *p, T v)			\
+	{				\
+		Lock *l;		\
+		T old;			\
+					\
+		l = &locktab[ihash(p)];	\
+		lock(l);		\
+		old = *p;		\
+		*p = v;			\
+		unlock(l);		\
+		return old;		\
+	}
+/* INC(T, n): define n(p, dv), which adds dv to *p under p's lock and returns the updated value. */
+#define INC(T, n) \
+	T n(T *p, T dv)			\
+	{				\
+		Lock *l;		\
+		T sum;			\
+					\
+		l = &locktab[ihash(p)];	\
+		lock(l);		\
+		sum = *p + dv;		\
+		*p = sum;		\
+		unlock(l);		\
+		return sum;		\
+	}
+/* CAS(T, n): define n(p, ov, nv), which stores nv in *p only if *p equals ov; returns 1 on a swap, else 0. */
+#define CAS(T, n) \
+	int n(T *p, T ov, T nv)		\
+	{				\
+		Lock *l;		\
+		int ok;			\
+					\
+		l = &locktab[ihash(p)];	\
+		lock(l);		\
+		/* the compare and the store */	\
+		/* share one critical section */	\
+		ok = *p == ov;		\
+		if(ok)			\
+			*p = nv;	\
+		unlock(l);		\
+		return ok;		\
+	}
+/* Expand the lock-based templates above once per operand type. */
+GET(int, ageti)
+GET(long, agetl)
+GET(vlong, agetv)
+GET(void*, agetp)
+
+SET(int, aseti)
+SET(long, asetl)
+SET(vlong, asetv)
+SET(void*, asetp)
+
+INC(int, ainci)
+INC(long, aincl)
+INC(vlong, aincv)
+
+CAS(int, acasi)
+CAS(long, acasl)
+CAS(vlong, acasv)
+CAS(void*, acasp)
--- /dev/null
+++ b/atomic.h
@@ -1,0 +1,16 @@
+long	agetl(long*);	/* aget*: atomic load of *p */
+vlong	agetv(vlong*);
+void*	agetp(void**);
+
+long	asetl(long*, long);	/* aset*: store the new value; the C fallbacks return the value replaced */
+vlong	asetv(vlong*, vlong);
+void*	asetp(void**, void*);
+
+long	aincl(long*, long);	/* ainc*: add to *p; the C fallbacks return the updated value */
+vlong	aincv(vlong*, vlong);
+
+int	acasl(long*, long, long);	/* acas*(p, old, new): 1 if *p was old and is now new, else 0 */
+int	acasv(vlong*, vlong, vlong);
+int	acasp(void**, void*, void*);
+
+void	coherence(void);	/* memory barrier */
--- /dev/null
+++ b/blk.c
@@ -1,0 +1,1124 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+#include "atomic.h"
+
+static vlong	blkalloc_lk(Arena*, int);
+static vlong	blkalloc(int, uint, int);
+static void	blkdealloc_lk(Arena*, vlong);
+static Blk*	initblk(Blk*, vlong, vlong, int);
+static void	readblk(Blk*, Bptr, int);
+/* checkflag: true when every bit of set is on and every bit of clr is off in b->flag. */
+int
+checkflag(Blk *b, int set, int clr)
+{
+	long mask;
+
+	mask = set|clr;
+	return (agetl(&b->flag) & mask) == set;
+}
+
+void
+setflag(Blk *b, int set, int clr)
+{
+	long old, new;
+
+	/* atomically turn on the bits in set and turn off those in clr; */
+	/* recompute from a fresh read whenever the compare-and-swap loses */
+	do{
+		old = agetl(&b->flag);
+		new = (old & ~clr) | set;
+	}while(!acasl(&b->flag, old, new));
+}
+/* syncblk: write the finalized block b to its disk address, then clear Bdirty; any failed or short write goes to broke(). */
+void
+syncblk(Blk *b)
+{
+	assert(checkflag(b, Bfinal, 0));
+	assert(b->bp.addr >= 0);
+	tracex("syncblk", b->bp, b->type, -1);
+	if(pwrite(fs->fd, b->buf, Blksz, b->bp.addr) != Blksz)	/* a short write must not be marked clean */
+		broke("%B %s: %r", b->bp, Eio);
+	setflag(b, 0, Bdirty);
+}
+/* readblk: read the block at bp.addr into b, decode its header by type, and verify its checksum unless flg has GBnochk. */
+static void
+readblk(Blk *b, Bptr bp, int flg)
+{
+	vlong off, xh, ck, rem, n;
+	char *p;
+
+	off = bp.addr;
+	rem = Blksz;
+	while(rem != 0){
+		n = pread(fs->fd, b->buf + (Blksz - rem), rem, off);	/* continue after the bytes earlier short reads delivered */
+		if(n <= 0)
+			error("%s: %r", Eio);
+		off += n;
+		rem -= n;
+	}
+	b->cnext = nil;
+	b->cprev = nil;
+	b->hnext = nil;
+
+	b->bp.addr = bp.addr;
+	b->bp.hash = -1;
+	b->bp.gen = -1;
+
+	b->nval = 0;
+	b->valsz = 0;
+	b->nbuf = 0;
+	b->bufsz = 0;
+	b->logsz = 0;
+
+	p = b->buf + 2;	/* typed blocks keep a 2-byte type tag at offset 0 */
+	b->type = (flg&GBraw) ? Tdat : UNPACK16(b->buf+0);	/* raw reads are always treated as Tdat */
+	switch(b->type){
+	default:
+		broke("invalid block type %d @%llx", b->type, bp.addr);
+		break;
+	case Tdat:
+	case Tsuper:
+		b->data = b->buf;
+		break;
+	case Tarena:
+		b->data = p;
+		break;
+	case Tdlist:
+	case Tlog:
+		b->logsz = UNPACK16(p);		p += 2;
+		b->logh = UNPACK64(p);		p += 8;
+		b->logp = unpackbp(p, Ptrsz);	p += Ptrsz;
+		assert(p - b->buf == Loghdsz);
+		b->data = p;
+		break;
+	case Tpivot:
+		b->nval = UNPACK16(p);		p += 2;
+		b->valsz = UNPACK16(p);		p += 2;
+		b->nbuf = UNPACK16(p);		p += 2;
+		b->bufsz = UNPACK16(p);		p += 2;
+		assert(p - b->buf == Pivhdsz);
+		b->data = p;
+		break;
+	case Tleaf:
+		b->nval = UNPACK16(p);		p += 2;
+		b->valsz = UNPACK16(p);		p += 2;
+		assert(p - b->buf == Leafhdsz);
+		b->data = p;
+		break;
+	}
+	if(b->type == Tlog || b->type == Tdlist){	/* log blocks carry their own hash of the used log bytes */
+		xh = b->logh;
+		ck = bufhash(b->data, b->logsz);
+	}else{	/* everything else is checked against the hash in the caller's pointer */
+		xh = bp.hash;
+		ck = blkhash(b);
+	}
+	if(!(flg&GBnochk) && ck != xh){	/* ! binds tighter than &, hence the parentheses */
+		if(!(flg&GBsoftchk))
+			broke("%s: %ullx %llux != %llux", Ecorrupt, bp.addr, xh, ck);
+		fprint(2, "%s: %ullx %llux != %llux", Ecorrupt, bp.addr, xh, ck);
+		error(Ecorrupt);
+	}
+	assert(b->magic == Magic);
+}
+/* pickarena: choose the arena to try for an allocation of type ty; tries shifts the choice on each retry. */
+static Arena*
+pickarena(uint ty, uint hint, int tries)
+{
+	uint n, r;
+
+	r = ainc(&fs->roundrobin)/2048;	/* the shared counter is bumped on every call; r changes once per 2048 */
+	if(ty == Tdat)
+		n = hint % (fs->narena - 1) + r + 1;	/* NOTE(review): divides by zero when narena is 1 -- confirm a minimum is enforced elsewhere */
+	else
+		n = r;
+	return &fs->arenas[(n + tries) % fs->narena];
+}
+/* getarena: binary-search fs->arenas for the arena whose address range holds block address b. */
+Arena*
+getarena(vlong b)
+{
+	int hi, lo, mid;
+	vlong alo, ahi;
+	Arena *a;
+
+	lo = 0;
+	hi = fs->narena;
+	if(b == fs->sb0->bp.addr)	/* the two superblock addresses map to the first and last arenas */
+		return &fs->arenas[0];
+	if(b == fs->sb1->bp.addr)
+		return &fs->arenas[hi-1];
+	while(1){	/* NOTE(review): no exit for an address that lies in no arena -- confirm callers never pass one */
+		mid = (hi + lo)/2;
+		a = &fs->arenas[mid];
+		alo = a->h0->bp.addr;
+		ahi = alo + a->size + 2*Blksz;
+		if(b < alo)
+			hi = mid-1;
+		else if(b > ahi)	/* NOTE(review): b == ahi is accepted here, while logappend asserts off < end for the same bound */
+			lo = mid+1;
+		else
+			return a;
+	}
+}
+
+/* freerange: add [off, off+len) to the free tree t and merge it with any range that touches it. */
+static void
+freerange(Avltree *t, vlong off, vlong len)
+{
+	Arange *r, *s;
+
+	assert(len % Blksz == 0);
+	if((r = calloc(1, sizeof(Arange))) == nil)
+		error(Enomem);
+	r->off = off;
+	r->len = len;
+	assert(avllookup(t, r, 0) == nil);	/* the range must not already be in the tree */
+	avlinsert(t, r);
+
+Again:
+	s = (Arange*)avlprev(r);
+	if(s != nil && s->off+s->len == r->off){	/* predecessor ends where r starts: fold r into it */
+		avldelete(t, r);
+		s->len = s->len + r->len;
+		free(r);
+		r = s;
+		goto Again;
+	}
+	s = (Arange*)avlnext(r);
+	if(s != nil && r->off+r->len == s->off){	/* successor starts where r ends: extend it downward over r */
+		avldelete(t, r);
+		s->off = r->off;
+		s->len = s->len + r->len;
+		free(r);
+		r = s;
+		goto Again;
+	}
+}
+/* grabrange: remove [off, off+len) from the free tree t; aborts unless one free range covers it entirely. */
+static void
+grabrange(Avltree *t, vlong off, vlong len)
+{
+	Arange *r, *s, q;
+	vlong l;
+
+	assert(len % Blksz == 0);
+	q.off = off;
+	q.len = len;
+	r = (Arange*)avllookup(t, &q.Avl, -1);	/* presumably the nearest range at or below off -- verify against avl(2) */
+	if(r == nil || off + len > r->off + r->len)
+		abort();
+
+	if(off == r->off){	/* trim from the front */
+		r->off += len;
+		r->len -= len;
+	}else if(off + len == r->off + r->len){	/* trim from the back */
+		r->len -= len;
+	}else if(off > r->off && off+len < r->off + r->len){	/* cut from the middle: r keeps the head, s gets the tail */
+		s = emalloc(sizeof(Arange), 0);
+		l = r->len;
+		s->off = off + len;
+		r->len = off - r->off;
+		s->len = l - r->len - len;
+		avlinsert(t, s);
+	}else
+		abort();
+
+	if(r->len == 0){	/* nothing left of the original range */
+		avldelete(t, r);
+		free(r);
+	}
+}
+/* mklogblk: set up whichever of the arena's two log buffers is not the current tail as an empty Tlog block at address o. */
+static Blk*
+mklogblk(Arena *a, vlong o)
+{
+	Blk *lb;
+
+	lb = a->logbuf[0];
+	if(lb == a->logtl)	/* never reuse the buffer that is still the live tail */
+		lb = a->logbuf[1];
+	assert(lb->ref == 1);
+	lb->flag = Bstatic;	/* plain store: every other flag bit is dropped */
+	initblk(lb, o, -1, Tlog);
+	traceb("logblk" , lb->bp);
+	lb->lasthold0 = lb->lasthold;
+	lb = holdblk(lb);	/* the returned block carries one extra reference */
+	lb->lasthold = getcallerpc(&a);
+	return lb;
+}
+
+/*
+ * Logs an allocation. Must be called
+ * with arena lock held. Duplicates some
+ * of the work in allocblk to prevent
+ * recursion.
+ */
+static void
+logappend(Arena *a, vlong off, vlong len, int op)
+{
+	vlong o, start, end;
+	Blk *lb;
+	char *p;
+
+	assert((off & 0xff) == 0);	/* the low byte of each entry is reserved for the op code */
+	assert(op == LogAlloc || op == LogFree || op == LogSync);
+	if(op != LogSync){
+		start = a->h0->bp.addr;
+		end = start + a->size + 2*Blksz;
+		assert(off >= start);
+		assert(off < end);
+	}
+	lb = a->logtl;
+	assert(lb->ref > 0);
+	assert(lb->type == Tlog);
+	assert(lb->logsz >= 0);
+	dprint("logop %d: %llx+%llx@%x\n", op, off, len, lb->logsz);
+
+	if(checkflag(lb, 0, Bdirty))	/* tail was clean: mark it dirty and no longer final */
+		setflag(lb, Bdirty, Bfinal);
+
+	/*
+	 * move to the next block when we have
+	 * too little room in the log:
+	 * We're appending up to 16 bytes as
+	 * part of the operation, followed by
+	 * 16 bytes of new log entry allocation
+	 * and chaining.
+	 */
+	if(lb->logsz >= Logspc - Logslop){
+		o = blkalloc_lk(a, 0);
+		if(o == -1)
+			error(Efull);
+		p = lb->data + lb->logsz;
+		PACK64(p, o|LogAlloc1);	/* the old tail records the allocation of its successor */
+		lb->logsz += 8;
+		lb->logp = (Bptr){o, -1, -1};
+		lb = mklogblk(a, o);
+	}
+	if(len == Blksz){	/* single-block operations use the one-word entry forms */
+		if(op == LogAlloc)
+			op = LogAlloc1;
+		else if(op == LogFree)
+			op = LogFree1;
+	}
+	off |= op;
+	p = lb->data + lb->logsz;
+	PACK64(p, off);
+	lb->logsz += 8;
+	if(op >= Log2wide){	/* wide entries carry the length in a second word */
+		PACK64(p+8, len);
+		lb->logsz += 8;
+	}
+	if(lb != a->logtl) {	/* a new tail was started above: write the new block, then the old one */
+		finalize(lb);
+		syncblk(lb);
+
+		finalize(a->logtl);
+		syncblk(a->logtl);
+		dropblk(a->logtl);
+		a->logtl = lb;
+		a->nlog++;
+	}
+}
+/* loadlog: replay the allocation log chain starting at bp into a->free and a->used, leaving the last block read as a->logtl. */
+void
+loadlog(Arena *a, Bptr bp)
+{
+	vlong ent, off, len, gen;
+	int op, i, n;
+	char *d;
+	Blk *b;
+
+
+	dprint("loadlog %B\n", bp);
+	traceb("loadlog", bp);
+	b = a->logbuf[0];	/* every block of the chain is read into this one buffer */
+	while(1){
+		assert(checkflag(b, Bstatic, Bcached));
+		holdblk(b);
+		readblk(b, bp, 0);
+		dprint("\tload %B chain %B\n", bp, b->logp);
+		a->nlog++;
+		for(i = 0; i < b->logsz; i += n){
+			d = b->data + i;
+			ent = UNPACK64(d);
+			op = ent & 0xff;	/* low byte: operation */
+			off = ent & ~0xff;	/* remaining bits: offset, or the generation for LogSync */
+			n = (op >= Log2wide) ? 16 : 8;	/* wide entries are followed by a length word */
+			switch(op){
+			case LogSync:
+				gen = ent >> 8;
+				dprint("\tlog@%x: sync %lld\n", i, gen);
+				if(gen >= fs->qgen){	/* stop replaying at a barrier from generation qgen or later */
+					if(a->logtl == nil){
+						b->logsz = i;	/* truncate the log at the barrier */
+						a->logtl = b;
+						cachedel(b->bp.addr);
+						setflag(b, Bdirty, 0);
+						return;
+					}
+					dropblk(b);
+					return;
+				}
+				break;
+	
+			case LogAlloc:
+			case LogAlloc1:
+				len = (op >= Log2wide) ? UNPACK64(d+8) : Blksz;
+				dprint("\tlog@%x alloc: %llx+%llx\n", i, off, len);
+				grabrange(a->free, off & ~0xff, len);
+				a->used += len;
+				break;
+			case LogFree:
+			case LogFree1:
+				len = (op >= Log2wide) ? UNPACK64(d+8) : Blksz;
+				dprint("\tlog@%x free: %llx+%llx\n", i, off, len);
+				freerange(a->free, off & ~0xff, len);
+				a->used -= len;
+				break;
+			default:
+				dprint("\tlog@%x: log op %d\n", i, op);
+				abort();
+				break;
+			}
+		}
+		if(b->logp.addr == -1){	/* end of chain: this block is the tail */
+			a->logtl = b;
+			return;
+		}
+		bp = b->logp;
+		dropblk(b);
+	}
+}
+/* flushlog: finalize and write the arena's log tail, unless it has neither Bdirty nor Bstatic set. */
+void
+flushlog(Arena *a)
+{
+	if(checkflag(a->logtl, 0, Bdirty|Bstatic))
+		return;
+	finalize(a->logtl);
+	syncblk(a->logtl);
+}
+/* compresslog: replace the arena's log with a fresh chain holding one LogFree entry per range currently in a->free. */
+void
+compresslog(Arena *a)
+{
+	int i, nr, nblks, nlog;
+	vlong sz, *blks;
+	Blk *b;
+	Arange *r;
+	char *p;
+
+	flushlog(a);
+	/*
+	 * Prepare what we're writing back.
+	 * Arenas must be sized so that we can
+	 * keep the merged log in memory for
+	 * a rewrite.
+	 */
+	sz = 0;
+	nr = 0;
+	nlog = 0;
+	for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
+		sz += 16;	/* each range becomes one 16-byte entry */
+		nr++;
+	}
+
+	/*
+	 * Make a pessimistic estimate of the number of blocks
+	 * needed to store the ranges, as well as the blocks
+	 * used to store the range allocations.
+	 *
+	 * This does modify the tree, but it's safe because
+	 * we can only be removing entries from the tree, not
+	 * splitting or inserting new ones.
+	 */
+	nblks = (sz+Logspc)/(Logspc - Logslop) + 16*nr/(Logspc-Logslop) + 1;
+	if((blks = calloc(nblks, sizeof(vlong))) == nil)
+		error(Enomem);
+	if(waserror()){
+		free(blks);
+		nexterror();
+	}
+	for(i = 0; i < nblks; i++){
+		blks[i] = blkalloc_lk(a, 1);	/* seq=1: take blocks from the lowest free range */
+		if(blks[i] == -1)
+			error(Efull);
+	}
+
+	/* fill up the log with the ranges from the tree */
+	i = 0;
+	b = mklogblk(a, blks[i++]);
+	for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
+		if(b->logsz >= Logspc - Logslop){	/* block full: chain to the next reserved block and write this one */
+			b->logp = (Bptr){blks[i], -1, -1};
+			finalize(b);
+			syncblk(b);
+			dropblk(b);
+			nlog++;
+			b = mklogblk(a, blks[i++]);
+		}
+		p = b->data + b->logsz;
+		PACK64(p+0, r->off|LogFree);
+		PACK64(p+8, r->len);
+		b->logsz += 16;
+	}
+
+	/*
+	 * now we have a valid freelist, and we can start
+	 * appending stuff to it. Clean up the eagerly
+	 * allocated extra blocks.
+	 *
+	 * Note that we need to drop the reference to the
+	 * old logtl before we free the old blocks, because
+	 * deallocating a block may require another block.
+	 */
+	dropblk(a->logtl);
+	a->loghd = (Bptr){blks[0], -1, -1};
+	a->logtl = b;	/* written back by sync() later */
+	a->nlog = nlog;
+	a->lastlogsz = nlog;
+
+	/* May add blocks to new log */
+	for(; i < nblks; i++)
+		blkdealloc_lk(a, blks[i]);
+	poperror();
+	free(blks);
+}
+/* logbarrier: append a LogSync entry carrying generation gen to the arena log; always returns 0. */
+int
+logbarrier(Arena *a, vlong gen)
+{
+	logappend(a, gen<<8, 0, LogSync);	/* the shift keeps the low byte free for the op code */
+	return 0;
+}
+
+/*
+ * Allocate from an arena, with lock
+ * held. May be called multiple times
+ * per operation, to alloc space for
+ * the alloc log. Returns the block
+ * address, or -1 at the reserve limit.
+ */
+static vlong
+blkalloc_lk(Arena *a, int seq)
+{
+	Arange *r;
+	vlong b;
+
+	if(seq)	/* seq: take from the lowest free range; otherwise from the highest */
+		r = (Arange*)avlmin(a->free);
+	else
+		r = (Arange*)avlmax(a->free);
+	if(!usereserve && a->size - a->used <= a->reserve)
+		return -1;
+	if(r == nil)
+		broke(Estuffed);
+	/*
+	 * A bit of sleight of hand here:
+	 * while we're changing the sorting
+	 * key, but we know it won't change
+	 * the sort order because the tree
+	 * covers disjoint ranges
+	 */
+	if(seq){	/* carve the first block of the range */
+		b = r->off;
+		r->len -= Blksz;
+		r->off += Blksz;
+	}else{	/* carve the last block of the range */
+		r->len -= Blksz;
+		b = r->off + r->len;
+	}
+	if(r->len == 0){
+		avldelete(a->free, r);
+		free(r);
+	}
+	a->used += Blksz;
+	return b;
+}
+/* blkdealloc_lk: drop block b from the cache, log the free, and return the block to the arena's free tree. */
+static void
+blkdealloc_lk(Arena *a, vlong b)
+{
+	cachedel(b);
+	logappend(a, b, Blksz, LogFree);
+	freerange(a->free, b, Blksz);
+	a->used -= Blksz;
+}
+/* blkalloc: allocate one block for type ty and log it; raises Efull after 2*narena arena attempts. */
+static vlong
+blkalloc(int ty, uint hint, int seq)
+{
+	Arena *a;
+	vlong b;
+	int tries;
+
+	tries = 0;
+Again:
+	a = pickarena(ty, hint, tries);
+	/*
+	 * Loop through the arena up to 2 times.
+	 * The first pass tries to find an arena
+	 * that has space and is not in use, the
+	 * second waits until an arena is free.
+	 */
+	if(tries == 2*fs->narena)
+		error(Efull);
+	tries++;
+	if(tries < fs->narena){
+		if(canqlock(a) == 0)	/* first pass: skip arenas that are locked */
+			goto Again;
+	}else
+		qlock(a);
+	if(waserror()){	/* release the arena lock if anything below raises */
+		qunlock(a);
+		nexterror();
+	}
+	b = blkalloc_lk(a, seq);
+	if(b == -1){	/* this arena is at its reserve: move on to another */
+		qunlock(a);
+		poperror();
+		goto Again;
+	}
+	logappend(a, b, Blksz, LogAlloc);
+	qunlock(a);
+	poperror();
+	return b;
+}
+/* initblk: make buffer b a fresh, dirty block of type ty at address bp with generation gen; returns b. */
+static Blk*
+initblk(Blk *b, vlong bp, vlong gen, int ty)
+{
+	Blk *ob;
+
+	ob = cacheget(bp);
+	if(ob != nil)	/* an address that is already cached is treated as a double allocation */
+		fatal("double alloc: %#p %B %#p %B", b, b->bp, ob, ob->bp);
+	b->type = ty;
+	b->bp.addr = bp;
+	b->bp.hash = -1;
+	b->bp.gen = gen;
+	switch(ty){	/* data starts after the per-type header */
+	case Tdat:
+		b->data = b->buf;
+		break;
+	case Tarena:
+		b->data = b->buf+2;
+		break;
+	case Tdlist:
+	case Tlog:
+		b->logsz = 0;
+		b->logp = (Bptr){-1, -1, -1};
+		b->data = b->buf + Loghdsz;
+		break;
+	case Tpivot:
+		b->data = b->buf + Pivhdsz;
+		break;
+	case Tleaf:
+		b->data = b->buf + Leafhdsz;
+		break;
+	}
+	setflag(b, Bdirty, 0);
+	b->nval = 0;
+	b->valsz = 0;
+	b->nbuf = 0;
+	b->bufsz = 0;
+	b->logsz = 0;	/* repeats the Tlog/Tdlist case above for all types */
+	b->alloced = getcallerpc(&b);
+
+	return b;
+}
+/* newdblk: allocate a data block (hint and seq go to blkalloc) and bind a cache buffer to it with t's memgen. */
+Blk*
+newdblk(Tree *t, vlong hint, int seq)
+{
+	vlong bp;
+	Blk *b;
+
+	bp = blkalloc(Tdat, hint, seq);
+	b = cachepluck();
+	initblk(b, bp, t->memgen, Tdat);
+	b->alloced = getcallerpc(&t);	/* overrides the pc initblk recorded */
+	tracex("newblk" , b->bp, Tdat, -1);
+	return b;
+
+}
+
+/* newblk: allocate a block of type ty, bind a cache buffer to it, and stamp it with t's memgen. */
+Blk*
+newblk(Tree *t, int ty)
+{
+	vlong bp;
+	Blk *b;
+
+	bp = blkalloc(ty, 0, 0);
+	b = cachepluck();
+	initblk(b, bp, t->memgen, ty);
+	b->alloced = getcallerpc(&t);	/* overrides the pc initblk recorded */
+	tracex("newblk" , b->bp, ty, -1);
+	return b;
+}
+/* dupblk: allocate a new block of b's type in tree t and copy b's counters and whole buffer into it. */
+Blk*
+dupblk(Tree *t, Blk *b)
+{
+	Blk *r;
+
+	if((r = newblk(t, b->type)) == nil)
+		return nil;
+
+	tracex("dup" , b->bp, b->type, t->gen);
+	r->bp.hash = -1;
+	r->nval = b->nval;
+	r->valsz = b->valsz;
+	r->nbuf = b->nbuf;
+	r->bufsz = b->bufsz;
+	r->logsz = b->logsz;
+	r->alloced = getcallerpc(&t);
+	memcpy(r->buf, b->buf, sizeof(r->buf));
+	return r;
+}
+/* finalize: pack b's in-memory header fields into its buffer, compute its hash, and set Bdirty|Bfinal. */
+void
+finalize(Blk *b)
+{
+	if(b->type != Tdat)	/* data blocks carry no type tag */
+		PACK16(b->buf, b->type);
+
+	switch(b->type){
+	default:
+		abort();
+		break;
+	case Tpivot:
+		PACK16(b->buf+2, b->nval);
+		PACK16(b->buf+4, b->valsz);
+		PACK16(b->buf+6, b->nbuf);
+		PACK16(b->buf+8, b->bufsz);
+		break;
+	case Tleaf:
+		PACK16(b->buf+2, b->nval);
+		PACK16(b->buf+4, b->valsz);
+		break;
+	case Tdlist:
+	case Tlog:
+		b->logh = bufhash(b->data, b->logsz);	/* hash covers only the used part of the log */
+		PACK16(b->buf+2, b->logsz);
+		PACK64(b->buf+4, b->logh);
+		packbp(b->buf+12, Ptrsz, &b->logp);
+		break;
+	case Tdat:
+	case Tarena:
+	case Tsuper:
+		break;
+	}
+
+	b->bp.hash = blkhash(b);
+	setflag(b, Bdirty|Bfinal, 0);
+}
+/* getblk: return the block at bp from the cache, or read it from disk and insert it; flg is passed on to readblk. */
+Blk*
+getblk(Bptr bp, int flg)
+{
+	Blk *b;
+	int i;
+
+	i = ihash(bp.addr) % nelem(fs->blklk);	/* one lock per address bucket */
+	qlock(&fs->blklk[i]);
+	if(waserror()){
+		qunlock(&fs->blklk[i]);
+		nexterror();
+	}
+	if((b = cacheget(bp.addr)) != nil){
+		assert(checkflag(b, 0, Bfreed));
+		b->lasthold = getcallerpc(&bp);
+		qunlock(&fs->blklk[i]);
+		poperror();
+		return b;
+	}
+	/* NOTE(review): if readblk raises, b is not handed back here -- confirm cachepluck's contract */
+	b = cachepluck();
+	b->alloced = getcallerpc(&bp);
+	readblk(b, bp, flg);
+	b->bp.gen = bp.gen;	/* readblk resets gen to -1; restore the caller's */
+	b->lasthold = getcallerpc(&bp);
+	cacheins(b);
+	qunlock(&fs->blklk[i]);
+	poperror();
+
+	return b;
+}
+
+/* holdblk: take one more reference on b, record the caller's pc, and return b. */
+Blk*
+holdblk(Blk *b)
+{
+	ainc(&b->ref);
+	b->lasthold = getcallerpc(&b);
+	return b;
+}
+/* dropblk: release one reference on b (nil is ignored); when the count reaches zero, put b on the LRU list. */
+void
+dropblk(Blk *b)
+{
+	if(b == nil)
+		return;
+	b->lastdrop = getcallerpc(&b);
+	if(adec(&b->ref) != 0)
+		return;
+	/*
+	 * freed blocks go to the LRU bottom
+	 * for early reuse.
+	 */
+	if(checkflag(b, Bfreed, 0))
+		lrubot(b);
+	else
+		lrutop(b);
+}
+/* blkfill: bytes in use in a pivot or leaf block (2 per entry plus the payload sizes); aborts on any other type. */
+ushort
+blkfill(Blk *b)
+{
+	switch(b->type){
+	case Tpivot:
+		return 2*b->nbuf + b->bufsz +  2*b->nval + b->valsz;
+	case Tleaf:
+		return 2*b->nval + b->valsz;
+	default:
+		fprint(2, "invalid block @%lld\n", b->bp.addr);
+		abort();
+	}
+}
+/* limbo: tag l with op and push it onto the limbo list of the current epoch, retrying until the CAS succeeds. */
+void
+limbo(int op, Limbo *l)
+{
+	Limbo *p;
+	ulong ge;
+
+	l->op = op;
+	while(1){
+		ge = agetl(&fs->epoch);
+		p = agetp(&fs->limbo[ge]);
+		l->next = p;	/* link before publishing so the list is always walkable */
+		if(acasp(&fs->limbo[ge], p, l)){
+			aincl(&fs->nlimbo, 1);
+			break;
+		}
+	}
+}
+/* freeblk: free block b; snapshot-tree blocks and blocks older than t->memgen go to killblk, the rest go onto the limbo list. */
+void
+freeblk(Tree *t, Blk *b)
+{
+	if(t == &fs->snap || (t != nil && b->bp.gen < t->memgen)){
+		tracex("killb", b->bp, getcallerpc(&t), -1);
+		killblk(t, b->bp);
+		return;
+	}
+	b->freed = getcallerpc(&t);
+	tracex("freeb", b->bp, getcallerpc(&t), -1);
+	setflag(b, Blimbo, 0);
+	holdblk(b);	/* the limbo list keeps its own reference; epochclean drops it */
+	assert(b->ref > 1);
+	limbo(DFblk, b);
+}
+/* freebp: like freeblk, but for a block known only by pointer; the deferred free is carried by a Bfree record from fs->bfree. */
+void
+freebp(Tree *t, Bptr bp)
+{
+	Bfree *f;
+
+	if(t == &fs->snap || (t != nil && bp.gen < t->memgen)){
+		tracex("killb", bp, getcallerpc(&t), -1);
+		killblk(t, bp);
+		return;
+	}
+	tracex("freeb", bp, getcallerpc(&t), -1);
+
+	qlock(&fs->bfreelk);
+	while(fs->bfree == nil)	/* wait until epochclean returns a record to the pool */
+		rsleep(&fs->bfreerz);
+	f = fs->bfree;
+	fs->bfree = (Bfree*)f->next;
+	qunlock(&fs->bfreelk);
+
+	f->bp = bp;
+	limbo(DFbp, f);
+}
+/* epochstart: publish that worker tid is active in the current global epoch. */
+void
+epochstart(int tid)
+{
+	ulong ge;
+
+	ge = agetl(&fs->epoch);
+	asetl(&fs->lepoch[tid], ge | Eactive);
+}
+/* epochend: clear the Eactive bit in worker tid's local epoch word. */
+void
+epochend(int tid)
+{
+	ulong le;
+
+	le = agetl(&fs->lepoch[tid]);
+	asetl(&fs->lepoch[tid], le &~ Eactive);
+}
+/* epochwait: sleep and rescan until no worker is active in an epoch other than the current global one. */
+void
+epochwait(void)
+{
+	int i, delay;
+	ulong e, ge;
+
+	delay = 0;
+Again:
+	ge = agetl(&fs->epoch);
+	for(i = 0; i < fs->nworker; i++){
+		e = agetl(&fs->lepoch[i]);
+		if((e & Eactive) && e != (ge | Eactive)){	/* worker i is still active in another epoch */
+			if(delay < 1000)	/* back off one step per retry; once at the cap, report the stall */
+				delay++;
+			else
+				fprint(2, "stalled epoch %lx [worker %d]\n", e, i);
+			sleep(delay);
+			goto Again;
+		}
+	}
+}
+/* epochclean: advance the global epoch and release every entry on the limbo list of the epoch being entered. */
+void
+epochclean(void)
+{
+	ulong c, e, ge;
+	Limbo *p, *n;
+	Blk *b;
+	Bfree *f;
+	Arena *a;
+	Qent qe;
+	int i;
+
+	c = agetl(&fs->nlimbo);
+	ge = agetl(&fs->epoch);
+	for(i = 0; i < fs->nworker; i++){
+		e = agetl(&fs->lepoch[i]);
+		if((e & Eactive) && e != (ge | Eactive)){
+			if(c < fs->cmax/4)	/* little is pending: skip this round rather than wait */
+				return;
+			epochwait();
+		}
+	}
+	epochwait();
+	p = asetp(&fs->limbo[(ge+1)%3], nil);	/* detach the whole list; relies on asetp returning the old head */
+	asetl(&fs->epoch, (ge+1)%3);
+
+	for(; p != nil; p = n){
+		n = p->next;	/* read the link first: p may be freed or reused below */
+		switch(p->op){
+		case DFtree:
+			free(p);
+			break;
+		case DFmnt:
+			free(p);
+			break;
+		case DFbp:
+			f = (Bfree*)p;
+			a = getarena(f->bp.addr);
+			if((b = cacheget(f->bp.addr)) != nil){
+				setflag(b, Bfreed, Bdirty|Blimbo);
+				dropblk(b);
+			}
+			qe.op = Qfree;
+			qe.bp = f->bp;
+			qe.b = nil;
+			qput(a->sync, qe);
+			qlock(&fs->bfreelk);	/* hand the record back to the pool and wake a waiter in freebp */
+			f->next = fs->bfree;
+			fs->bfree = f;
+			rwakeup(&fs->bfreerz);
+			qunlock(&fs->bfreelk);
+			break;
+		case DFblk:
+			b = (Blk*)p;
+			qe.op = Qfree;
+			qe.bp = b->bp;
+			qe.b = nil;
+			setflag(b, Bfreed, Bdirty|Blimbo);
+			a = getarena(b->bp.addr);
+			dropblk(b);	/* releases the reference freeblk took */
+			qput(a->sync, qe);
+			break;
+		default:
+			abort();
+		}
+		aincl(&fs->nlimbo, -1);
+	}
+}
+/* enqueue: finalize dirty block b, make sure it is cached, and queue it for writing on its arena's sync queue. */
+void
+enqueue(Blk *b)
+{
+	Arena *a;
+	Qent qe;
+
+	assert(checkflag(b, Bdirty, Bqueued|Bstatic));
+	assert(b->bp.addr >= 0);
+	finalize(b);
+	if(checkflag(b, 0, Bcached)){
+		cacheins(b);
+		b->cached = getcallerpc(&b);
+	}
+	holdblk(b);	/* the queue entry owns this reference; runsync drops it after the write */
+
+	b->enqueued = getcallerpc(&b);
+	traceb("queueb", b->bp);
+	a = getarena(b->bp.addr);
+	qe.op = Qwrite;
+	qe.bp = b->bp;
+	qe.b = b;
+	qput(a->sync, qe);
+}
+/* qinit: initialise sync queue q with an empty heap of fs->cmax entries; both rendezvous share q->lk. */
+void
+qinit(Syncq *q)
+{
+	q->fullrz.l = &q->lk;
+	q->emptyrz.l = &q->lk;
+	q->nheap = 0;
+	q->heapsz = fs->cmax;
+	q->heap = emalloc(q->heapsz*sizeof(Qent), 1);
+}
+/* qcmp: order queue entries by qgen, then op, then block address; returns -1, 0 or 1. */
+static int
+qcmp(Qent *a, Qent *b)
+{
+	if(a->qgen < b->qgen)	return -1;
+	if(a->qgen > b->qgen)	return 1;
+	if(a->op < b->op)	return -1;
+	if(a->op > b->op)	return 1;
+	if(a->bp.addr < b->bp.addr)	return -1;
+	if(a->bp.addr > b->bp.addr)	return 1;
+	return 0;
+}
+/* qput: stamp qe with the current fs->qgen and insert it into q's min-heap, sleeping while the heap is full. */
+void
+qput(Syncq *q, Qent qe)
+{
+	int i;
+
+	if(qe.op == Qfree || qe.op == Qwrite)
+		assert((qe.bp.addr & (Blksz-1)) == 0);
+	else if(qe.op == Qfence)
+		assert(fs->syncing > 0);
+	else
+		abort();
+	if(qe.b != nil)
+		assert(qe.b->ref > 0);
+	qlock(&q->lk);
+	qe.qgen = agetv(&fs->qgen);
+	while(q->nheap == q->heapsz)
+		rsleep(&q->fullrz);
+	for(i = q->nheap; i > 0; i = (i-1)/2){	/* sift up: move parents down until qe sorts after one */
+		if(qcmp(&qe, &q->heap[(i-1)/2]) == 1)
+			break;
+		q->heap[i] = q->heap[(i-1)/2];
+	}
+	q->heap[i] = qe;
+	q->nheap++;
+	rwakeup(&q->emptyrz);
+	qunlock(&q->lk);
+}
+/* qpop: remove and return the smallest entry of q's heap, sleeping while the heap is empty. */
+static Qent
+qpop(Syncq *q)
+{
+	int i, l, r, m;
+	Qent e, t;
+
+	qlock(&q->lk);
+	while(q->nheap == 0)
+		rsleep(&q->emptyrz);
+	e = q->heap[0];
+	if(--q->nheap == 0)
+		goto Out;
+
+	i = 0;
+	q->heap[0] = q->heap[q->nheap];	/* move the last entry to the root, then sift it down */
+	while(1){
+		m = i;
+		l = 2*i+1;
+		r = 2*i+2;
+		if(l < q->nheap && qcmp(&q->heap[m], &q->heap[l]) == 1)
+			m = l;
+		if(r < q->nheap && qcmp(&q->heap[m], &q->heap[r]) == 1)
+			m = r;
+		if(m == i)	/* neither child sorts before entry i: done */
+			break;
+		t = q->heap[m];
+		q->heap[m] = q->heap[i];
+		q->heap[i] = t;
+		i = m;
+	}
+Out:
+	rwakeup(&q->fullrz);
+	qunlock(&q->lk);
+	if(e.b != nil){
+		setflag(e.b, 0, Bqueued);
+		e.b->queued = 0;
+	}
+	return e;
+}
+/* runsync: sync worker loop for queue p; pops entries forever, and on any error bumps fs->rdonly and returns. */
+void
+runsync(int, void *p)
+{
+	Arena *a;
+	Syncq *q;
+	Qent qe;
+
+	q = p;
+	if(waserror()){
+		aincl(&fs->rdonly, 1);
+		fprint(2, "error syncing: %s\n", errmsg());
+		return;
+	}
+	while(1){
+		qe = qpop(q);
+		switch(qe.op){
+		case Qfree:
+			tracex("qfreeb", qe.bp, qe.qgen, -1);
+			/*
+			 * we shouldn't have a block in a free op,
+			 * the frees go into the queue just to ensure
+			 * write/reuse ordering.
+			 */
+			assert(qe.b == nil);
+			a = getarena(qe.bp.addr);
+			qlock(a);
+			blkdealloc_lk(a, qe.bp.addr);
+			qunlock(a);
+			break;
+		case Qfence:
+			tracev("qfence", qe.qgen);
+			qlock(&fs->synclk);
+			if(--fs->syncing == 0)	/* last outstanding fence: wake everyone waiting on syncrz */
+				rwakeupall(&fs->syncrz);
+			qunlock(&fs->synclk);
+			break;
+		case Qwrite:
+			tracex("qsyncb", qe.bp, qe.qgen, -1);
+			if(checkflag(qe.b, Bfreed, Bstatic) == 0)	/* skip the write only for freed, non-static blocks */
+				syncblk(qe.b);
+			dropblk(qe.b);	/* releases the reference enqueue took */
+			break;
+		default:
+			abort();
+		}
+		assert(estacksz() == 1);
+	}
+}
--- /dev/null
+++ b/cache.c
@@ -1,0 +1,190 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Unlinks b from the LRU list: repairs the list head and
+ * tail when b sits at either end, splices its neighbours
+ * together, and clears b's own link pointers.
+ */
+static void
+lrudel(Blk *b)
+{
+	Blk *nx, *pv;
+
+	nx = b->cnext;
+	pv = b->cprev;
+	if(fs->chead == b)
+		fs->chead = nx;
+	if(fs->ctail == b)
+		fs->ctail = pv;
+	if(nx != nil)
+		nx->cprev = pv;
+	if(pv != nil)
+		pv->cnext = nx;
+	b->cnext = nil;
+	b->cprev = nil;
+}
+
+/*
+ * Places b at the head of the LRU list and wakes one
+ * sleeper on lrurz. Does nothing if b has references.
+ */
+void
+lrutop(Blk *b)
+{
+	qlock(&fs->lrulk);
+	assert(b->magic == Magic);
+	assert(checkflag(b, 0, Bstatic));
+	/*
+	 * If someone got in first and did a cache
+	 * lookup, the block is in use again and no
+	 * longer belongs in the LRU.
+	 */
+	if(b->ref == 0){
+		lrudel(b);
+		if(fs->chead != nil)
+			fs->chead->cprev = b;
+		if(fs->ctail == nil)
+			fs->ctail = b;
+		b->cnext = fs->chead;
+		fs->chead = b;
+		rwakeup(&fs->lrurz);
+	}
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Places b at the tail of the LRU list and wakes one
+ * sleeper on lrurz. Does nothing if b has references.
+ */
+void
+lrubot(Blk *b)
+{
+	qlock(&fs->lrulk);
+	assert(b->magic == Magic);
+	assert(checkflag(b, 0, Bstatic));
+	/*
+	 * If someone got in first and did a cache
+	 * lookup, the block is in use again and no
+	 * longer belongs in the LRU.
+	 */
+	if(b->ref == 0){
+		lrudel(b);
+		if(fs->ctail != nil)
+			fs->ctail->cnext = b;
+		if(fs->chead == nil)
+			fs->chead = b;
+		b->cprev = fs->ctail;
+		fs->ctail = b;
+		rwakeup(&fs->lrurz);
+	}
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Adds b to the head of its hash bucket in the block cache
+ * and sets its Bcached flag. Asserts that b is not already
+ * chained into a bucket and that the bucket holds no other
+ * block with the same address.
+ */
+void
+cacheins(Blk *b)
+{
+	Bucket *bkt;
+	Blk *bb;
+	u32int h;
+
+	assert(b->magic == Magic);
+	h = ihash(b->bp.addr);
+	bkt = &fs->bcache[h % fs->cmax];
+	qlock(&fs->lrulk);
+	traceb("cache", b->bp);
+	assert(checkflag(b, 0, Bstatic|Bcached));
+	setflag(b, Bcached, 0);
+	assert(b->hnext == nil);
+	bb = bkt->b;
+	while(bb != nil){
+		assert(b != bb && b->bp.addr != bb->bp.addr);
+		bb = bb->hnext;
+	}
+	/* debug aid: record the pc of whoever cached the block */
+	b->cached = getcallerpc(&b);
+	b->hnext = bkt->b;
+	bkt->b = b;
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Unchains the block with address addr from its hash bucket,
+ * if one is there, and clears its Bcached flag. An address
+ * of -1 is ignored. Both callers in this file hold fs->lrulk
+ * around the call.
+ */
+static void
+cachedel_lk(vlong addr)
+{
+	Bptr bp = {addr, -1, -1};
+	Bucket *bkt;
+	Blk *b, **pp;
+	u32int h;
+
+	if(addr == -1)
+		return;
+
+	tracex("uncache", bp, -1, getcallerpc(&addr));
+	h = ihash(addr);
+	bkt = &fs->bcache[h % fs->cmax];
+	/* pp always points at the link that leads to b */
+	for(pp = &bkt->b; (b = *pp) != nil; pp = &b->hnext){
+		if(b->bp.addr != addr)
+			continue;
+		/* FIXME: Until we clean up snap.c, we can have dirty blocks in cache */
+		assert(checkflag(b, Bcached, Bstatic)); //Bdirty));
+		*pp = b->hnext;
+		b->uncached = getcallerpc(&addr);
+		b->hnext = nil;
+		setflag(b, 0, Bcached);
+		break;
+	}
+}
+/*
+ * Locked wrapper around cachedel_lk: takes fs->lrulk,
+ * traces the request, and drops addr from the cache.
+ */
+void
+cachedel(vlong addr)
+{
+	Bptr bp = {addr, -1, -1};
+
+	qlock(&fs->lrulk);
+	tracex("uncachelk", bp, -1, getcallerpc(&addr));
+	cachedel_lk(addr);
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Looks up the cached block with address addr. On a hit the
+ * block is held, taken off the LRU list, and returned; on a
+ * miss nil is returned.
+ */
+Blk*
+cacheget(vlong addr)
+{
+	Bucket *bkt;
+	u32int h;
+	Blk *b;
+
+	h = ihash(addr);
+	bkt = &fs->bcache[h % fs->cmax];
+	qlock(&fs->lrulk);
+	b = bkt->b;
+	while(b != nil && b->bp.addr != addr)
+		b = b->hnext;
+	if(b != nil){
+		holdblk(b);
+		lrudel(b);
+		/* debug aid: record the pc of the most recent holder */
+		b->lasthold = getcallerpc(&addr);
+	}
+	qunlock(&fs->lrulk);
+
+	return b;
+}
+
+/*
+ * Pulls the block from the bottom of the LRU for reuse.
+ * Sleeps on fs->lrurz while the LRU is empty. The block is
+ * dropped from the hash table if it was cached, has its flags
+ * and debug fields reset, and is returned held.
+ */
+Blk*
+cachepluck(void)
+{
+	Blk *b;
+
+	qlock(&fs->lrulk);
+	while(fs->ctail == nil)
+		rsleep(&fs->lrurz);
+
+	b = fs->ctail;
+	assert(b->magic == Magic);
+	assert(b->ref == 0);
+	if(checkflag(b, Bcached, 0))
+		cachedel_lk(b->bp.addr);
+	/* still flagged after the delete: print debug state before the assert fires */
+	if(checkflag(b, Bcached, 0))
+		fprint(2, "%B cached %#p freed %#p\n", b->bp, b->cached, b->freed);
+	assert(checkflag(b, 0, Bcached));
+	lrudel(b);
+	b->flag = 0;
+	b->lasthold = 0;
+	b->lastdrop = 0;
+	b->freed = 0;
+	b->hnext = nil;
+	qunlock(&fs->lrulk);
+
+	return  holdblk(b);
+}
--- /dev/null
+++ b/check.c
@@ -1,0 +1,306 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+#include <atomic.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Reports whether block address bp lies inside a range on
+ * its arena's free tree: looks up the range that avllookup
+ * returns for direction -1 and tests bp against its end.
+ * Returns 1 if bp is covered, 0 otherwise.
+ */
+static int
+isfree(vlong bp)
+{
+	Arange *hit, key;
+
+	key.off = bp;
+	key.len = Blksz;
+	hit = (Arange*)avllookup(getarena(bp)->free, &key, -1);
+	if(hit != nil && bp < hit->off + hit->len)
+		return 1;
+	return 0;
+}
+
+/*
+ * Recursively checks the subtree rooted at b, printing a line
+ * on fd for each problem found.
+ *
+ *	h	number of levels expected below b; a leaf must be at 0
+ *	lo	if non-nil, the first key in b must not sort before it
+ *	hi	if non-nil, keys and messages in b must sort before it
+ *
+ * Returns nonzero if any problem was found in b or below it.
+ */
+static int
+checktree(int fd, Blk *b, int h, Kvp *lo, Kvp *hi)
+{
+	Kvp x, y;
+	Msg mx, my;
+	int i, r, fill;
+	Blk *c;
+	int fail;
+	Bptr bp;
+
+	fail = 0;
+	if(h < 0){
+		fprint(fd, "node too deep (loop?)\n");
+		fail++;
+		return fail;
+	}
+	if(b->type == Tleaf){
+		if(h != 0){
+			fprint(fd, "unbalanced leaf\n");
+			fail++;
+		}
+		if(h != 0 && b->nval < 2){
+			fprint(fd, "warning: underfilled leaf %B\n", b->bp);
+			fail++;
+		}
+	}
+	if(b->type == Tpivot && b->nval < 2)
+		fprint(fd, "warning: underfilled pivot %B\n", b->bp);
+	getval(b, 0, &x);
+	if(lo && keycmp(lo, &x) > 0){
+		fprint(fd, "out of range keys %P != %P\n", lo, &x);
+		showblk(fd, b, "out of range", 1);
+		fail++;
+	}
+	/* x trails y by one slot; in a pivot, x's child covers [x, y) */
+	for(i = 1; i < b->nval; i++){
+		getval(b, i, &y);
+		if(hi && keycmp(&y, hi) >= 0){
+			fprint(fd, "out of range keys %P >= %P\n", &y, hi);
+			fail++;
+		}
+		if(b->type == Tpivot){
+			bp = getptr(&x, &fill);
+			if(isfree(bp.addr)){
+				fprint(fd, "freed block in use: %llx\n", bp.addr);
+				fail++;
+			}
+			/*
+			 * An unreadable child skips only the child checks;
+			 * the key ordering check and the x = y step below
+			 * must still run, or every later iteration would
+			 * keep looking at this same slot.
+			 */
+			if((c = getblk(bp, 0)) == nil){
+				fprint(fd, "corrupt block: %B\n", bp);
+				fail++;
+			}else{
+				if(blkfill(c) != fill){
+					fprint(fd, "mismatched block fill\n");
+					fail++;
+				}
+				if(checktree(fd, c, h - 1, &x, &y))
+					fail++;
+				dropblk(c);
+			}
+		}
+		r = keycmp(&x, &y);
+		switch(r){
+		case -1:
+			break;
+		case 0:
+			fprint(fd, "duplicate keys %P, %P\n", &x, &y);
+			fail++;
+			break;
+		case 1:
+			fprint(fd, "misordered keys %P, %P\n", &x, &y);
+			fail++;
+			break;
+		}
+		x = y;
+	}
+	if(b->type == Tpivot){
+		/* the last child has no upper bound of its own */
+		getval(b, b->nval-1, &y);
+		bp = getptr(&x, &fill);
+		if((c = getblk(bp, 0)) == nil){
+			fprint(fd, "corrupt block: %B\n", bp);
+			fail++;
+		}
+		if(c != nil && checktree(fd, c, h - 1, &y, nil))
+			fail++;
+		dropblk(c);
+		if(b->nbuf > 0){
+			getmsg(b, 0, &mx);
+			if(hi && keycmp(&mx, hi) >= 0){
+				fprint(fd, "out of range messages %P != %M\n", hi, &mx);
+				fail++;
+			}
+		}
+		/* mx trails my by one slot, as x trails y above */
+		for(i = 1; i < b->nbuf; i++){
+			getmsg(b, i, &my);
+			switch(my.op){
+			case Owstat:		/* kvp dirent */
+				if((my.v[0] & ~(Owsize|Owmode|Owmtime|Owatime|Owuid|Owgid|Owmuid)) != 0){
+					fprint(fd, "invalid stat op %x\n", my.v[0]);
+					fail++;
+				}
+				break;
+			default:
+				if(my.op <= 0 || my.op >= Nmsgtype){
+					fprint(fd, "invalid message op %d\n", my.op);
+					fail++;
+				}
+				break;
+			}
+			/* check the message just fetched, not the last child key */
+			if(hi && keycmp(&my, hi) >= 0){
+				fprint(fd, "out of range messages %P != %M\n", hi, &my);
+				fail++;
+			}
+			if(keycmp(&mx, &my) == 1){
+				fprint(fd, "misordered messages %M, %M\n", &mx, &my);
+				fail++;
+				break;
+			}
+			mx = my;
+		}
+
+	}
+	return fail;
+}
+
+/*
+ * Walks the chain of log blocks starting at hd, following
+ * each block's logp link until an address of -1 is reached.
+ * Returns 1 if every block could be loaded; if loading one
+ * raises an error, reports it on fd and returns 0.
+ */
+static int
+checklog(int fd, Bptr hd)
+{
+	Bptr cur, next;
+	Blk *b;
+
+	cur = hd;
+	while(cur.addr != -1){
+		if(waserror()){
+			fprint(fd, "error loading %B\n", cur);
+			return 0;
+		}
+		traceb("chklg", cur);
+		b = getblk(cur, 0);
+		next = b->logp;
+		dropblk(b);
+		poperror();
+		cur = next;
+	}
+	return 1;
+}
+
+/*
+ * Checks every arena's free tree and allocation log, with
+ * the arena locked. Neighbouring free ranges must be in
+ * strictly increasing order and must not touch or overlap,
+ * and every block of the log chain must load.
+ * Problems are reported on fd; returns the number found.
+ */
+static int
+checkfree(int fd)
+{
+	Arena *a;
+	Arange *r, *n;
+	int i, fail;
+
+	fail = 0;
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		qlock(a);
+		r = (Arange*)avlmin(a->free);
+		/* an arena with no free ranges has no neighbours to compare */
+		n = nil;
+		if(r != nil)
+			n = (Arange*)avlnext(r);
+		for(; n != nil; n = (Arange*)avlnext(n)){
+			if(r->off >= n->off){
+				fprint(fd, "misordered length %llx >= %llx\n", r->off, n->off);
+				fail++;
+			}
+			if(r->off+r->len >= n->off){
+				fprint(fd, "overlaping range %llx+%llx >= %llx\n", r->off, r->len, n->off);
+				fail++;
+			}
+			r = n;
+		}
+		/* a log that cannot be walked is a failure, not just a message */
+		if(!checklog(fd, a->loghd)){
+			fprint(fd, "arena %d: broken freelist\n", i);
+			fail++;
+		}
+		qunlock(a);
+	}
+	return fail;
+}
+
+/*
+ * Checks that the log chain of the snapshot tree's own
+ * deadlist, and of every Kdlist entry in the snapshot tree,
+ * can be walked. Problems are reported on fd.
+ * Returns the number of unreadable deadlists.
+ */
+static int
+checkdlist(int fd)
+{
+	char pfx[1];
+	Dlist dl;
+	Scan s;
+	int fail;
+
+	fail = 0;
+	/* checklog has already printed the block that failed to load */
+	if(!checklog(fd, fs->snapdl.hd))
+		fail++;
+	pfx[0] = Kdlist;
+	btnewscan(&s, pfx, 1);
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		kv2dlist(&s.kv, &dl);
+		if(!checklog(fd, dl.hd)){
+			fprint(fd, "bad dlist %P: %s\n", &s.kv, errmsg());
+			fail++;
+		}
+	}
+	btexit(&s);
+	return fail;
+}
+
+/*
+ * Scans tree t for keys with the one-byte prefix Klabel,
+ * unpacks a block pointer from each value, and loads that
+ * block raw. Raises an error if a pointer refers to a block
+ * on the free list. Returns 0 when the scan completes.
+ */
+static int
+checkdata(int, Tree *t)
+{
+	char pfx[1];
+	Bptr bp;
+	Scan s;
+
+	pfx[0] = Klabel;
+	btnewscan(&s, pfx, 1);
+	btenter(t, &s);
+	while(btnext(&s, &s.kv)){
+		bp = unpackbp(s.kv.v, s.kv.nv);
+		if(isfree(bp.addr)){
+			fprint(2, "free block in use: %B\n", bp);
+			error("free block in use");
+		}
+		/* loading the block is the check; it is released at once */
+		dropblk(getblk(bp, GBraw));
+	}
+	btexit(&s);
+	return 0;
+}
+
+/*
+ * Runs the consistency checks: free lists, deadlists, the
+ * snapshot tree, and then the tree and data of every labelled
+ * snapshot. fs->rdonly is raised for the duration of the check
+ * and lowered again on every exit path. Progress and problems
+ * are reported on fd.
+ * Returns 1 if nothing was found wrong, 0 otherwise.
+ */
+int
+checkfs(int fd)
+{
+	int ok, height;
+	char pfx[1], name[Keymax+1];
+	Tree *t;
+	Scan s;
+	Blk *b;
+
+	ok = 1;
+	aincl(&fs->rdonly, 1);
+	epochwait();
+	if(waserror()){
+		fprint(fd, "error checking %s\n", errmsg());
+		/* undo the increment above, as the normal exit does */
+		aincl(&fs->rdonly, -1);
+		return 0;
+	}
+	fprint(fd, "checking freelist\n");
+	if(checkfree(fd))
+		ok = 0;
+	fprint(fd, "checking deadlist\n");
+	if(checkdlist(fd))
+		ok = 0;
+	fprint(fd, "checking snap tree: %B\n", fs->snap.bp);
+	if((b = getroot(&fs->snap, &height)) != nil){
+		if(checktree(fd, b, height-1, nil, 0))
+			ok = 0;
+		dropblk(b);
+	}
+	pfx[0] = Klabel;
+	btnewscan(&s, pfx, 1);
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		if(waserror()){
+			/* a snapshot that cannot be checked is not a clean result */
+			fprint(fd, "moving on: %s\n", errmsg());
+			ok = 0;
+			continue;
+		}
+		memcpy(name, s.kv.k+1, s.kv.nk-1);
+		name[s.kv.nk-1] = 0;
+		if((t = opensnap(name, nil)) == nil){
+			fprint(2, "invalid snap label %s\n", name);
+			ok = 0;
+			/* pop this iteration's label before leaving the loop */
+			poperror();
+			break;
+		}
+		/* NOTE(review): t is never handed to closesnap here, unlike showtree -- confirm */
+		fprint(fd, "checking snap %s: %B\n", name, t->bp);
+		b = getroot(t, &height);
+		if(checktree(fd, b, height-1, nil, 0))
+			ok = 0;
+		if(checkdata(fd, t))
+			ok = 0;
+		dropblk(b);
+		poperror();
+	}
+	btexit(&s);
+	aincl(&fs->rdonly, -1);
+	poperror();
+	return ok;
+}
--- /dev/null
+++ b/cons.c
@@ -1,0 +1,465 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+#include <bio.h>
+
+#include "dat.h"
+#include "fns.h"
+
+typedef struct Cmd	Cmd;
+
+/*
+ * One console command, as matched by runcons.
+ */
+struct Cmd {
+	char	*name;		/* first word of the command; nil ends cmdtab */
+	char	*sub;		/* required second word, or nil if there is none */
+	int	minarg;		/* fewest arguments allowed after name and sub */
+	int	maxarg;		/* most arguments allowed after name and sub */
+	int	epoch;		/* nonzero: runcons wraps the call in epochstart/epochend */
+	void	(*fn)(int, char**, int);	/* handler: console fd, arguments, argument count */
+};
+
+/*
+ * Console "debug": with one argument, sets debug to its
+ * integer value; with none, flips debug between zero and
+ * nonzero. Prints the new value on fd.
+ */
+static void
+setdbg(int fd, char **ap, int na)
+{
+	if(na == 1)
+		debug = atoi(ap[0]);
+	else
+		debug = !debug;
+	fprint(fd, "debug → %d\n", debug);
+}
+
+/*
+ * Allocates an AOsync message carrying the halt flag and the
+ * console fd, and sends it on fs->admchan. If the allocation
+ * fails, the failure is reported on fd and nothing is sent.
+ */
+static void
+sendsync(int fd, int halt)
+{
+	Amsg *m;
+
+	if((m = mallocz(sizeof(Amsg), 1)) == nil){
+		fprint(fd, "alloc sync msg: %r\n");
+		return;
+	}
+	m->op = AOsync;
+	m->halt = halt;
+	m->fd = fd;
+	chsend(fs->admchan, m);
+}
+
+/*
+ * Console "sync": sends a sync request without the halt flag.
+ * NOTE(review): "synced" is printed as soon as sendsync
+ * returns; nothing visible here waits for the sync to finish.
+ */
+static void
+syncfs(int fd, char **, int)
+{
+	sendsync(fd, 0);
+	fprint(fd, "synced\n");
+}
+
+/*
+ * Console "halt": sends a sync request with the halt flag
+ * set, then announces the shutdown on fd.
+ */
+static void
+haltfs(int fd, char **, int)
+{
+	sendsync(fd, 1);
+	fprint(fd, "gefs: ending...\n");
+}
+
+/*
+ * Prints one line on fd for each Klabel entry in the snapshot
+ * tree: the label's name, followed by the names of the flags
+ * set on it (in brackets) when any flag bit is set.
+ */
+static void
+listsnap(int fd)
+{
+	char pfx[Snapsz];
+	Scan s;
+	uint flg;
+
+	pfx[0] = Klabel;
+	btnewscan(&s, pfx, 1);
+	btenter(&fs->snap, &s);
+	while(btnext(&s, &s.kv)){
+		/* the flag word follows a 1-byte and an 8-byte field in the value */
+		flg = UNPACK32(s.kv.v+1+8);
+		fprint(fd, "snap %.*s", s.kv.nk-1, s.kv.k+1);
+		if(flg != 0)
+			fprint(fd, " [");
+		if(flg & Lmut)
+			fprint(fd, " mutable");
+		if(flg & Lauto)
+			fprint(fd, " auto");
+		if(flg & Ltsnap)
+			fprint(fd, " tsnap");
+		if(flg != 0)
+			fprint(fd, " ]");
+		fprint(fd, "\n");
+	}
+	btexit(&s);
+}
+
+/*
+ * Console "snap": parses the option words and sends an AOsnap
+ * request on fs->admchan, preceded by a sync request.
+ *
+ *	-S	clear Ltsnap, which is otherwise set on the request
+ *	-m	set Lmut on the request
+ *	-d	delete; exactly one name must follow
+ *	-l	list the existing snapshots and return
+ *
+ * Without -d exactly two names (old, new) must follow.
+ * The message is freed here on every path that does not send it.
+ */
+static void
+snapfs(int fd, char **ap, int na)
+{
+	Amsg *a;
+	int i;
+
+	if((a = mallocz(sizeof(Amsg), 1)) == nil){
+		fprint(fd, "alloc sync msg: %r\n");
+		return;
+	}
+	a->op = AOsnap;
+	a->fd = fd;
+	a->flag = Ltsnap;
+	/*
+	 * Stop when the arguments run out: ap[na] is not a
+	 * valid string, so "snap -d" alone must not read it.
+	 */
+	while(na > 0 && ap[0][0] == '-'){
+		for(i = 1; ap[0][i]; i++){
+			switch(ap[0][i]){
+			case 'S':	a->flag &= ~Ltsnap;	break;
+			case 'm':	a->flag |= Lmut;	break;
+			case 'd':	a->delete++;		break;
+			case 'l':
+				listsnap(fd);
+				free(a);
+				return;
+			default:
+				fprint(fd, "usage: snap -[Smdl] [old [new]]\n");
+				free(a);
+				return;
+			}
+		}
+		na--;
+		ap++;
+	}
+	if(a->delete && na != 1 || !a->delete && na != 2){
+		fprint(fd, "usage: snap -[md] old [new]\n");
+		free(a);
+		return;
+	}
+	if(na >= 1)
+		strecpy(a->old, a->old+sizeof(a->old), ap[0]);
+	if(na >= 2)
+		strecpy(a->new, a->new+sizeof(a->new), ap[1]);
+	sendsync(fd, 0);
+	chsend(fs->admchan, a);
+}
+
+/*
+ * Console "check": runs checkfs and prints "ok" or "broken"
+ * on fd according to its result.
+ */
+static void
+fsckfs(int fd, char**, int)
+{
+	if(checkfs(fd))
+		fprint(fd, "ok\n");
+	else
+		fprint(fd, "broken\n");
+}
+
+/*
+ * Console "users": reloads the user table from the root of
+ * the "adm" mount. Failures are reported on fd; the mount
+ * reference is released on every path.
+ */
+static void
+refreshusers(int fd, char **, int)
+{
+	Mount *mnt;
+
+	if((mnt = getmount("adm")) == nil){
+		fprint(fd, "load users: missing 'adm'\n");
+		return;
+	}
+	if(waserror()){
+		fprint(fd, "load users: %s\n", errmsg());
+		clunkmount(mnt);
+		return;
+	}
+	loadusers(fd, mnt->root);
+	fprint(fd, "refreshed users\n");
+	/*
+	 * Pop the label pushed above before returning, so the
+	 * caller's own poperror() pops the caller's label and
+	 * not one that refers to this dead stack frame.
+	 */
+	poperror();
+	clunkmount(mnt);
+}
+
+/*
+ * Console "show bstate": prints one line on fd for each of the
+ * fs->cmax blocks in blkbuf, giving its type, flags, pointer,
+ * reference count, and the recorded debug values.
+ * Flag letters: d dirty, f final, F freed, c cached,
+ * q queued, L limbo.
+ */
+static void
+showbstate(int fd, char**, int)
+{
+	static int bits[] = {Bdirty, Bfinal, Bfreed, Bcached, Bqueued, Blimbo};
+	static char tags[] = "dfFcqL";
+	char fbuf[8];
+	Blk *b;
+	int i, n;
+
+	for(b = blkbuf; b != blkbuf+fs->cmax; b++){
+		n = 0;
+		for(i = 0; i < nelem(bits); i++)
+			if(b->flag & bits[i])
+				fbuf[n++] = tags[i];
+		fbuf[n] = 0;
+		fprint(fd, "blk %#p type %d flag %s bp %B ref %ld alloc %#p queued %#p, hold %#p drop %#p cached %#p\n",
+			b, b->type, fbuf, b->bp, b->ref, b->alloced, b->queued, b->lasthold, b->lastdrop, b->cached);
+	}
+}
+
+/*
+ * Console "show users": prints the user table on fd, one
+ * "id:name:leader:member,member,..." line per user, holding
+ * fs->userlk for reading. Ids that uid2user cannot resolve
+ * are printed as "???".
+ */
+static void
+showusers(int fd, char**, int)
+{
+	User *u, *who;
+	char *sep;
+	int m;
+
+	rlock(&fs->userlk);
+	for(u = fs->users; u != fs->users + fs->nusers; u++){
+		fprint(fd, "%d:%s:", u->id, u->name);
+		who = uid2user(u->lead);
+		if(who != nil)
+			fprint(fd, "%s:", who->name);
+		else
+			fprint(fd, "???:");
+		sep = "";
+		for(m = 0; m < u->nmemb; m++){
+			who = uid2user(u->memb[m]);
+			if(who != nil)
+				fprint(fd, "%s%s", sep, who->name);
+			else
+				fprint(fd, "%s???", sep);
+			sep = ",";
+		}
+		fprint(fd, "\n");
+	}
+	runlock(&fs->userlk);
+}
+
+/*
+ * Console "df": prints the usage of each arena, then the
+ * totals over all arenas as a fill percentage and as used,
+ * size, and free byte counts, each with a scaled value.
+ */
+static void
+showdf(int fd, char**, int)
+{
+	char *units[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", nil};
+	vlong size, used, avail, asize, aused;
+	double hsize, hused, hfree;
+	double pct;
+	Arena *a;
+	int i, us, uu, uf;
+
+	size = 0;
+	used = 0;
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		/*
+		 * Copy the counters while the arena is locked, so the
+		 * line printed below agrees with what goes into the totals.
+		 */
+		qlock(a);
+		asize = a->size;
+		aused = a->used;
+		qunlock(a);
+		size += asize;
+		used += aused;
+		fprint(fd, "arena %d: %llx/%llx (%.2f%%)\n", i, aused, asize, 100*(double)aused/(double)asize);
+	}
+	avail = size - used;
+	hsize = size;
+	hused = used;
+	hfree = avail;
+	/* scale each figure down by 1024 until it drops below 500 */
+	for(us = 0; us < nelem(units)-1 && hsize >= 500 ; us++)
+		hsize /= 1024;
+	for(uu = 0; uu < nelem(units)-1 && hused >= 500 ; uu++)
+		hused /= 1024;
+	for(uf = 0; uf < nelem(units)-1 && hfree >= 500 ; uf++)
+		hfree /= 1024;
+	pct = 100.0*(double)used/(double)size;
+	fprint(fd, "fill:\t%.2f%%\n", pct);
+	fprint(fd, "used:\t%lld (%.2f %s)\n", used, hused, units[uu]);
+	fprint(fd, "size:\t%lld (%.2f %s)\n", size, hsize, units[us]);
+	fprint(fd, "free:\t%lld (%.2f %s)\n", avail, hfree, units[uf]);
+}
+
+/*
+ * Console "show fid": for every connection on fs->conns,
+ * prints each fid in the connection's fid hash table on fd.
+ * Each hash slot is locked while its chain is printed, and
+ * each fid's dent is read-locked while its fields are read.
+ * NOTE(review): the fs->conns list itself is walked without
+ * taking fs->connlk -- confirm that this is safe.
+ */
+void
+showfid(int fd, char**, int)
+{
+	int i;
+	Fid *f;
+	Conn *c;
+
+	for(c = fs->conns; c != nil; c = c->next){
+		fprint(fd, "-- conn %p: fids --\n", c);
+		for(i = 0; i < Nfidtab; i++){
+			lock(&c->fidtablk[i]);
+			for(f = c->fidtab[i]; f != nil; f = f->next){
+				rlock(f->dent);
+				fprint(fd, "\tfid[%d] from %#zx: %d [refs=%ld, k=%K, qid=%Q m=%d, dmode:%d duid: %d, dgid: %d]\n",
+					i, getmalloctag(f), f->fid, f->dent->ref, &f->dent->Key, f->dent->qid,
+					f->mode, f->dmode, f->duid, f->dgid);
+				runlock(f->dent);
+			}
+			unlock(&c->fidtablk[i]);
+		}
+	}
+}
+
+/*
+ * Console "show tree": dumps the root block of a tree on fd.
+ * The tree is the snapshot named by the argument, "main" when
+ * no argument is given; the name "snap" selects the snapshot
+ * tree itself. A snapshot opened here is closed again before
+ * returning.
+ */
+void
+showtree(int fd, char **ap, int na)
+{
+	char *name;
+	Tree *t;
+	Blk *b;
+	int h;
+
+	name = (na == 1) ? ap[0] : "main";
+	t = nil;
+	if(strcmp(name, "snap") == 0)
+		t = &fs->snap;
+	else{
+		t = opensnap(name, nil);
+		if(t == nil){
+			fprint(fd, "open %s: %r\n", name);
+			return;
+		}
+	}
+	b = getroot(t, &h);
+	fprint(fd, "=== [%s] %B @%d\n", name, t->bp, t->ht);
+	showblk(fd, b, "contents", 1);
+	dropblk(b);
+	if(t != &fs->snap)
+		closesnap(t);
+}
+
+/*
+ * Console "permit": "on" sets permissive, "off" clears it.
+ * Any other word is reported on fd 2 and changes nothing.
+ * The transition printed on fd shows the value permissive
+ * really had before the command, followed by its value now.
+ */
+static void
+permflip(int fd, char **ap, int)
+{
+	int old;
+
+	old = permissive;
+	if(strcmp(ap[0], "on") == 0)
+		permissive = 1;
+	else if(strcmp(ap[0], "off") == 0)
+		permissive = 0;
+	else
+		fprint(2, "unknown permissive %s\n", ap[0]);
+	fprint(fd, "permissive: %d → %d\n", old, permissive);
+}
+
+/*
+ * Console "save trace": writes the trace ring to the named
+ * file, or to the console when no name is given. The fs->ntrace
+ * slots are visited starting at fs->traceidx; slots whose msg
+ * is empty are skipped, and bp, v0 and v1 are printed only
+ * when they are not -1.
+ */
+static void
+savetrace(int fd, char **ap, int na)
+{
+	Biobuf *bfd;
+	Trace *t;
+	int i;
+
+	/* the console fd is dup'ed so that Bterm does not close fd itself */
+	if(na == 0)
+		bfd = Bfdopen(dup(fd, -1), OWRITE);
+	else
+		bfd = Bopen(ap[0], OWRITE);
+	if(bfd == nil){
+		/* end the line and give the reason, so the next prompt starts clean */
+		fprint(fd, "error opening output: %r\n");
+		return;
+	}
+	for(i = 0; i < fs->ntrace; i++){
+		t = &fs->trace[(fs->traceidx + i) % fs->ntrace];
+		if(t->msg[0] == 0)
+			continue;
+		Bprint(bfd, "[%d@%d] %s", t->tid, t->qgen, t->msg);
+		if(t->bp.addr != -1)
+			Bprint(bfd, " %B", t->bp);
+		if(t->v0 != -1)
+			Bprint(bfd, " %llx", t->v0);
+		if(t->v1 != -1)
+			Bprint(bfd, " %llx", t->v1);
+		Bprint(bfd, "\n");
+	}
+	Bterm(bfd);
+	fprint(fd, "saved\n");
+}
+
+/*
+ * Console "show free": prints on fd, for each arena, its
+ * header address and size followed by every range on its
+ * free tree in order, holding the arena lock while printing.
+ */
+static void
+showfree(int fd, char **, int)
+{
+	Arange *r;
+	Arena *a;
+	int i;
+
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		qlock(a);
+		fprint(fd, "arena %d %llx+%llx{\n", i, a->h0->bp.addr, a->size);
+		r = (Arange*)avlmin(a->free);
+		while(r != nil){
+			fprint(fd, "\t%llx..%llx (%llx)\n", r->off, r->off+r->len, r->len);
+			r = (Arange*)avlnext(r);
+		}
+		fprint(fd, "}\n");
+		qunlock(a);
+	}
+}
+
+/*
+ * Console "reserve": "on" sets usereserve to 0 and "off" sets
+ * it to 1; with no argument the current value is printed.
+ * Any other word is reported on fd 2 and changes nothing.
+ * NOTE(review): the on/off sense is the reverse of what the
+ * help text ("enable block reserves") suggests -- confirm
+ * which of the two is intended.
+ */
+static void
+unreserve(int fd, char **ap, int na)
+{
+	int old;
+
+	/* the command table admits zero arguments; ap[0] is not valid then */
+	if(na < 1){
+		fprint(fd, "reserve: %d\n", usereserve);
+		return;
+	}
+	old = usereserve;
+	if(strcmp(ap[0], "on") == 0)
+		usereserve = 0;
+	else if(strcmp(ap[0], "off") == 0)
+		usereserve = 1;
+	else
+		fprint(2, "unknown reserve %s\n", ap[0]);
+	/* report the variable this command changes, not permissive */
+	fprint(fd, "reserve: %d → %d\n", old, usereserve);
+}
+
+/*
+ * Console "help": prints a summary of the console commands
+ * on fd. The list is kept in step with cmdtab.
+ */
+static void
+help(int fd, char**, int)
+{
+	char *msg =
+		"help -- show this help\n"
+		"check -- check for consistency\n"
+		"debug [n] -- set the debug level, or toggle it\n"
+		"df -- show disk usage\n"
+		"halt -- stop all writers, sync, and go read-only\n"
+		"permit [on|off] -- switch to/from permissive mode\n"
+		"reserve [on|off] -- enable block reserves\n"
+		"snap -[Smdl] [old [new]] -- manage snapshots\n"
+		"sync -- flush all pending writes to disk\n"
+		"users -- reload user table from adm snapshot\n"
+		"save trace [name] -- save a trace of recent activity\n"
+		"show -- debug dumps\n"
+		"	tree [name]\n"
+		"	fid\n"
+		"	users\n"
+		"	bstate\n"
+		"	free\n";
+	fprint(fd, "%s", msg);
+}
+
+/*
+ * Console command table, searched in order by runcons. An
+ * entry matches when the first word equals name, the second
+ * word equals sub (if sub is set), and the count of remaining
+ * words lies in [minarg, maxarg]. A handler that reads ap[0]
+ * therefore relies on minarg >= 1, or must check na itself.
+ * Entries with epoch set are run between epochstart and
+ * epochend. The table ends at the entry whose name is nil.
+ */
+Cmd cmdtab[] = {
+	/* admin */
+	{.name="check",		.sub=nil,	.minarg=0, .maxarg=0, .fn=fsckfs, .epoch=1},
+	{.name="df",		.sub=nil, 	.minarg=0, .maxarg=0, .fn=showdf},
+	{.name="halt",		.sub=nil,	.minarg=0, .maxarg=0, .fn=haltfs},
+	{.name="help",		.sub=nil,	.minarg=0, .maxarg=0, .fn=help},
+	{.name="permit",	.sub=nil,	.minarg=1, .maxarg=1, .fn=permflip},
+	{.name="snap",		.sub=nil,	.minarg=1, .maxarg=3, .fn=snapfs},
+	{.name="sync",		.sub=nil,	.minarg=0, .maxarg=0, .fn=syncfs},
+	{.name="reserve",	.sub=nil,	.minarg=0, .maxarg=1, .fn=unreserve},
+	{.name="users",		.sub=nil,	.minarg=0, .maxarg=1, .fn=refreshusers},
+
+	/* debugging */
+	{.name="show",		.sub="fid",	.minarg=0, .maxarg=0, .fn=showfid},
+	{.name="show",		.sub="tree",	.minarg=0, .maxarg=1, .fn=showtree, .epoch=1},
+	{.name="show",		.sub="users",	.minarg=0, .maxarg=0, .fn=showusers},
+	{.name="show",		.sub="bstate",	.minarg=0, .maxarg=0, .fn=showbstate, .epoch=1},
+	{.name="show",		.sub="free",	.minarg=0, .maxarg=0, .fn=showfree},
+	{.name="debug",		.sub=nil,	.minarg=0, .maxarg=1, .fn=setdbg},
+	{.name="save",		.sub="trace",	.minarg=0, .maxarg=1, .fn=savetrace},
+	{.name=nil, .sub=nil},
+};
+
+/*
+ * Console loop: prompts on the fd passed in pfd, reads a
+ * line, splits it into at most nelem(f) words, and runs the
+ * first cmdtab entry whose name, sub-command and argument
+ * count all match. An error raised by a handler is caught and
+ * reported on the console. Input that matches no entry is
+ * echoed back as an unknown command. The loop ends when the
+ * read fails or reports end of file.
+ */
+void
+runcons(int tid, void *pfd)
+{
+	char buf[256], *f[4], **ap;
+	int i, n, nf, na, fd;
+	Cmd *c;
+
+	fd = (uintptr)pfd;
+	while(1){
+		fprint(fd, "gefs# ");
+		/*
+		 * Treat a 0-byte read (end of file) like an error.
+		 * It would otherwise parse as an empty line, and a
+		 * closed console would spin here printing prompts.
+		 */
+		if((n = read(fd, buf, sizeof(buf)-1)) <= 0)
+			break;
+		buf[n] = 0;
+		nf = tokenize(buf, f, nelem(f));
+		if(nf == 0 || strlen(f[0]) == 0)
+			continue;
+		for(c = cmdtab; c->name != nil; c++){
+			ap = f;
+			na = nf;
+			if(strcmp(c->name, *ap) != 0)
+				continue;
+			ap++;
+			na--;
+			if(c->sub != nil){
+				if(na == 0 || strcmp(c->sub, *ap) != 0)
+					continue;
+				ap++;
+				na--;
+			}
+			if(na < c->minarg || na > c->maxarg)
+				continue;
+			if(c->epoch)
+				epochstart(tid);
+			if(!waserror()){
+				c->fn(fd, ap, na);
+				poperror();
+			}else
+				fprint(fd, "%s: %s\n", f[0], errmsg());
+			if(c->epoch)
+				epochend(tid);
+			break;
+		}
+		/* fell off the end of the table: nothing matched */
+		if(c->name == nil){
+			fprint(fd, "unknown command '%s", f[0]);
+			for(i = 1; i < nf; i++)
+				fprint(fd, " %s", f[i]);
+			fprint(fd, "'\n");
+		}
+	}
+}
--- /dev/null
+++ b/dat.h
@@ -1,0 +1,776 @@
+typedef struct Blk	Blk;
+typedef struct Amsg	Amsg;
+typedef struct Gefs	Gefs;
+typedef struct Errctx	Errctx;
+typedef struct Fmsg	Fmsg;
+typedef struct Fid	Fid;
+typedef struct Msg	Msg;
+typedef struct Key	Key;
+typedef struct Val	Val;
+typedef struct Kvp	Kvp;
+typedef struct Xdir	Xdir;
+typedef struct Bptr	Bptr;
+typedef struct Limbo	Limbo;
+typedef struct Bfree	Bfree;
+typedef struct Scan	Scan;
+typedef struct Dent	Dent;
+typedef struct Scanp	Scanp;
+typedef struct Arena	Arena;
+typedef struct Arange	Arange;
+typedef struct Bucket	Bucket;
+typedef struct Chan	Chan;
+typedef struct Syncq	Syncq;
+typedef struct Qent	Qent;
+typedef struct Trace	Trace;
+typedef struct Tree	Tree;
+typedef struct Dlist	Dlist;
+typedef struct Mount	Mount;
+typedef struct User	User;
+typedef struct Conn	Conn;
+
+/*
+ * Size units, table sizes, and the sizes of the packed
+ * on-disk structures. The later constants are derived from
+ * Blksz and from the header sizes defined just before them.
+ */
+enum {
+	KiB	= 1024ULL,
+	MiB	= 1024ULL*KiB,
+	GiB	= 1024ULL*MiB,
+	TiB	= 1024ULL*GiB,
+
+	Lgblk	= 14,
+	Blksz	= (1ULL<<Lgblk),
+
+	Nrefbuf	= 1024,			/* number of ref incs before syncing */
+	Nfidtab	= 1024,			/* number of fid hash entries */
+	Nflushtab = 1024,		/* flush table size */
+	Ndtab	= 1024,			/* number of dir tab entries */
+	Max9p	= 32*KiB,		/* biggest message size we're willing to negotiate */
+	Nsec	= 1000LL*1000*1000,	/* nanoseconds to the second */
+	Maxent	= 256,			/* maximum size of ent key, with terminator */
+	Maxname	= Maxent-1-9-1,		/* maximum size of a name element */
+	Maxuname= 64,			/* maximum length of a username */
+	Maxtag	= 1<<16,		/* maximum tag in 9p */
+
+	/*
+	 * Kpmax must be no more than 1/4 of pivspc, or
+	 * there is no way to get a valid split of a
+	 * maximally filled tree.
+	 */
+	Keymax	= Maxent,		/* key data limit */
+	Inlmax	= 512,			/* inline data limit */
+	Ptrsz	= 24,			/* off, hash, gen */
+	Pptrsz	= 26,			/* off, hash, gen, fill */
+	Fillsz	= 2,			/* block fill count */
+	Offksz	= 17,			/* type, qid, off */
+	Snapsz	= 9,			/* tag, snapid */
+	Dpfxsz	= 9,			/* directory prefix */
+	Upksz	= 9,			/* directory prefix */
+	Dlksz	= 1+8+8,		/* tag, death, birth */
+	Dlvsz	= Ptrsz+Ptrsz,		/* hd,tl of deadlist */
+	Dlkvpsz	= Dlksz+Dlvsz,		/* full size of dlist kvp */
+	Treesz	= 4+4+4+4		/* ref, ht, flg, gen, pred, succ, base, root */
+		  +8+8+8+8+Ptrsz,
+	Kvmax	= Keymax + Inlmax,	/* Key and value */
+	Kpmax	= Keymax + Ptrsz,	/* Key and pointer */
+	Wstatmax = 4+8+8+8,		/* mode, size, atime, mtime */
+	Arenasz	= 8+8+8+8,		/* loghd, loghash, size, used */
+	
+	Pivhdsz		= 10,
+	Leafhdsz	= 6,
+	Loghdsz		= 2+2+8+Ptrsz,			/* type, len, hash, chain */
+	Rootsz		= 4+Ptrsz,			/* root pointer */
+	Pivsz		= Blksz - Pivhdsz,
+	Bufspc		= (Blksz - Pivhdsz)/2,		/* pivot room */
+	Pivspc		= Blksz - Pivhdsz - Bufspc,
+	Logspc		= Blksz - Loghdsz,
+	Logslop		= 16+16+8,			/* val, nextb, chain */
+	Leafspc 	= Blksz - Leafhdsz,
+	Msgmax  	= 1 + (Kvmax > Kpmax ? Kvmax : Kpmax),
+	Estacksz	= 64,				/* number of jmp_buf slots in Errctx.errlab */
+};
+
+enum {
+	Eactive	= 1UL<<30,	/* epoch active flag */
+};
+
+/*
+ * Key types. The type is stored in the first byte of every
+ * key; the comments give the layout of the rest of the key
+ * and of the value it maps to.
+ */
+enum {
+	/*
+	 * dent: pqid[8] qid[8] -- a directory entry key.
+	 * ptr:  off[8] hash[8] gen[8] -- a key for a Dir block.
+	 * dir:  serialized Xdir
+	 */
+
+	/* fs keys */
+	Kdat,	/* qid[8] off[8] => ptr:		pointer to data page */
+	Kent,	/* pqid[8] name[n] => dir[n]:		serialized Dir */
+	Kup,	/* qid[8] => Kent:			parent dir */
+
+	/* snapshot keys */
+	Klabel,	/* name[] => snapid[]:			snapshot label */
+	Ksnap,	/* sid[8] => ref[8], tree[52]:		snapshot root */
+	Kdlist,	/* snap[8] gen[8] => hd[ptr],tl[ptr]	deadlist  */
+};
+
+/*
+ * Bits of Blk.flag, manipulated with setflag() and tested
+ * with checkflag().
+ */
+enum {
+	Bdirty	= 1 << 0,
+	Bfinal	= 1 << 1,
+	Bfreed	= 1 << 2,
+	Bcached	= 1 << 3,	/* block is chained into the cache hash table */
+	Bqueued	= 1 << 4,	/* cleared when the block's entry is popped off a sync queue */
+	Blimbo	= 1 << 5,
+	Bstatic	= 1 << 6,	/* the LRU and cache code assert that this is clear */
+};
+
+enum {
+	Lmut	= 1 << 0,	/* can we modify snaps via this label */
+	Lauto	= 1 << 1,	/* was this label generated automatically */
+	Ltsnap	= 1 << 2,	/* should we skip the timed snapshots */
+};
+
+enum {
+	Qdump = 1ULL << 63,
+};
+
+#define Zb (Bptr){-1, -1, -1}
+
+/* internal errors */
+//#define Efs	(abort(), "fs broke")
+extern char Efs[];
+extern char Ecorrupt[];
+extern char Efsvers[];
+extern char Eimpl[];
+extern char Ebotch[];
+extern char Eio[];
+extern char Enofid[];
+extern char Efid[];
+extern char Etype[];
+extern char Edscan[];
+extern char Esrch[];
+extern char Eexist[];
+extern char Emode[];
+extern char Efull[];
+extern char Estuffed[];
+extern char Eauth[];
+extern char Elength[];
+extern char Eperm[];
+extern char Einuse[];
+extern char Ebadf[];
+extern char Ename[];
+extern char Enomem[];
+extern char Eattach[];
+extern char Enosnap[];
+extern char Esnap[];
+extern char Edir[];
+extern char Esyntax[];
+extern char Enouser[];
+extern char Enogrp[];
+extern char Efsize[];
+extern char Ebadu[];
+extern char Erdonly[];
+extern char Elocked[];
+extern char Eauthp[];
+extern char Eauthd[];
+extern char Eauthph[];
+extern char Ephase[];
+extern char Enone[];
+extern char Enoauth[];
+
+extern char Ewstatb[];
+extern char Ewstatd[];
+extern char Ewstatg[];
+extern char Ewstatl[];
+extern char Ewstatm[];
+extern char Ewstato[];
+extern char Ewstatp[];
+extern char Ewstatq[];
+extern char Ewstatu[];
+extern char Ewstatv[];
+extern char Enempty[];
+
+/*
+ * All metadata blocks share a common header:
+ * 
+ *	type[2]
+ *
+ * The None type is reserved for file data blocks
+ * and refcount blocks.
+ *
+ * The superblock has this layout:
+ *	version[8]	always "gefsNNNNN"
+ *	blksz[4]	block size in bytes
+ *	bufsz[4]	portion of leaf nodes
+ *			allocated to buffers,
+ *			in bytes
+ *	height[4]	tree height of root node
+ *	rootb[8]	address of root in last
+ *			snapshot.
+ *	rooth[8]	hash of root node
+ *	narena[4]	number of arenas in tree
+ *	flag[8]	feature flag
+ *	gen[8]		The flush generation
+ *
+ * The arena zone blocks have this layout, and
+ * are overwritten in place:
+ *
+ *	log[8]		The head of the alloc log
+ *	logh[8]		The hash of the alloc log
+ *
+ * The log blocks have this layout, and are one of
+ * two types of blocks that get overwritten in place:
+ *
+ *	hash[8]		The hash of the previous log block
+ *
+ *	The remainder of the block is filled with log
+ *	entries. Each log entry has at least 8 bytes
+ *	of entry. Some are longer. The opcode is or'ed
+ *	into the low order bits of the first vlong.
+ *	These ops take the following form:
+ *
+ *	Alloc, Free:
+ *		off[8] len[8]
+ *	Alloc1, Free1:
+ *		off[8]
+ *	Ref:
+ *		off[8]
+ *	Flush:	
+ *		gen[8]
+ *
+ * Pivots have the following layout:
+ *
+ *	nval[2]
+ *	valsz[2]
+ *	nbuf[2]
+ *	bufsz[2]
+ *
+ * Leaves have the following layout:
+ *
+ *	nval[2]
+ *	valsz[2]
+ *	pad[4]
+ *
+ * Within these nodes, pointers have the following
+ * layout:
+ *
+ *	off[8] hash[8] fill[2]
+ */
+enum {
+	Tdat,
+	Tpivot,
+	Tleaf,
+	Tlog,
+	Tdlist,
+	Tarena,
+	Tsuper = 0x6765,	/* 'ge' bigendian */
+};
+
+enum {
+	Vinl,	/* Inline value */
+	Vref,	/* Block pointer */
+};
+
+enum {
+	GBraw	= 1<<0,
+	GBwrite	= 1<<1,
+	GBnochk	= 1<<2,
+	GBsoftchk = 1<<3,
+};
+
+enum {
+	Onop,		/* nothing */
+	Oinsert,	/* new kvp */
+	Odelete,	/* delete kvp */
+	Oclearb,	/* free block ptr if exists */
+	Oclobber,	/* remove file if it exists */
+	Owstat,		/* update kvp dirent */
+	Orelink,	/* rechain forwards */
+	Oreprev,	/* rechain backwards */
+	Nmsgtype,	/* maximum message type */
+};
+
+enum {
+	Magic = 0x979b929e98969c8c,
+};
+
+/*
+ * Wstat ops come with associated data, in the order
+ * of the bit flag.
+ */
+/*
+ * Bits of the first value byte of an Owstat message. Each
+ * comment gives the size and meaning of the data the bit
+ * announces.
+ */
+enum{
+	/* wstat flag */
+	Owsize	= 1<<0,	/* [8]fsize: update file size */
+	Owmode	= 1<<1,	/* [4]mode: update file mode */
+	Owmtime	= 1<<2, /* [8]mtime: update mtime, in nsec */
+	Owatime	= 1<<3, /* [8]atime: update atime, in nsec */
+	Owuid	= 1<<4,	/* [4]uid: set uid */
+	Owgid	= 1<<5,	/* [4]gid: set gid */
+	Owmuid	= 1<<6,	/* [4]muid: set muid */
+};
+
+/*
+ * Operations for the allocation log.
+ */
+enum {
+	LogNop,		/* unused */
+	/* 1-wide entries */
+	LogAlloc1,	/* alloc a block */
+	LogFree1,	/* free a block */
+	LogSync,	/* sync barrier for replay */
+
+	/* 2-wide entries */
+#define	Log2wide	LogAlloc
+	LogAlloc,	/* alloc a range */
+	LogFree,	/* free a range */
+};
+
+enum {
+	AOnone,
+	AOsnap,
+	AOsync,
+	AOclear,
+	AOrclose,
+};
+
+enum {
+	DFblk,
+	DFbp,
+	DFmnt,
+	DFtree,
+};
+
+struct Limbo {
+	Limbo	*next;
+	int	op;
+};
+
+struct Bptr {
+	vlong	addr;
+	uvlong	hash;
+	vlong	gen;
+};
+
+struct Key{
+	char	*k;
+	int	nk;
+};
+
+struct Val {
+	short	nv;
+	char	*v;
+};
+
+struct Kvp {
+	Key;
+	Val;
+};
+
+struct Msg {
+	char	op;
+	Kvp;
+};
+
+struct Dlist {
+	Dlist	*cnext;	/* cache next entry */
+	Dlist	*cprev;	/* cache prev entry */
+	Dlist	*chain;	/* hash table chain */
+	Blk	*ins;	/* loaded head */
+
+	vlong	gen;	/* deadlist gen */
+	vlong	bgen;	/* birth gen */
+	Bptr	hd;	/* deadlist head */
+	Bptr	tl;	/* deadlist tail */
+};
+
+struct Errctx {
+	long	tid;
+	char	err[128];
+	jmp_buf	errlab[Estacksz];
+	int	nerrlab;
+};
+
+struct Arange {
+	Avl;
+	vlong	off;
+	vlong	len;
+};
+
+struct Bucket {
+	Blk	*b;
+};
+
+/*
+ * A request sent over fs->admchan. op is one of the AO*
+ * constants and selects which arm of the union is in use.
+ */
+struct Amsg {
+	int	op;
+	int	fd;	/* console fd of the requester, as set by the console commands */
+	union {
+		struct {	/* AOsnap */
+			char	old[128];	/* name of the existing snapshot */
+			char	new[128];	/* name for the new one; unused for a delete */
+			int	flag;		/* label flags (Lmut, Ltsnap, ...) */
+			char	delete;		/* nonzero to delete old */
+
+		};
+		struct {	/* AOsync */
+			int	halt;	/* nonzero when sent by the console "halt" command */
+		};
+		struct {	/* AOclear, AOrclose */
+			Mount	*mnt;
+			Dent	*dent;
+			vlong	qpath;
+			vlong	off;
+			vlong	end;
+		};
+	};
+};
+
+struct Fmsg {
+	Fcall;
+	Conn	*conn;
+	int	sz;	/* the size of the message buf */
+	uchar	buf[];
+};
+
+struct Tree {
+	Limbo;
+
+	/* in-memory */
+	Lock	lk;
+	long	memref;	/* number of in-memory references to this */
+	vlong	memgen;	/* wip next generation */
+	int	dirty;
+
+	/* on-disk */
+	int	nref;	/* number snapshots forked/after us */
+	int	nlbl;	/* number of labels referring to us */
+	int	ht;	/* height of the tree */
+	uint	flag;	/* flag set */
+	Bptr	bp;	/* block pointer of root */
+	vlong	gen;	/* generation */
+	vlong	pred;	/* previous snapshot */
+	vlong	succ;	/* next snapshot */
+	vlong	base;	/* base snapshot */
+};
+
+struct Bfree {
+	Limbo;
+	Bptr bp;
+};
+
+struct User {
+	int	id;
+	int	lead;
+	int	*memb;
+	int	nmemb;
+	char	name[128];
+};
+
+enum {
+	/* in priority order */
+	Qnone,
+	Qfence,
+	Qwrite,
+	Qfree,
+};
+
+/*
+ * One entry on a sync queue.
+ */
+struct Qent {
+	vlong	qgen;	/* fs->qgen, as read when the entry was queued */
+	Bptr	bp;	/* address of the block being written or freed */
+	Blk	*b;	/* the block to write; nil for Qfree entries */
+	int	op;	/* Qfence, Qwrite or Qfree */
+};
+
+/*
+ * A bounded priority queue of Qents, kept as a binary heap
+ * and consumed by a sync worker.
+ */
+struct Syncq {
+	QLock	lk;		/* guards heap and nheap */
+	Rendez	fullrz;		/* producers sleep here while nheap == heapsz */
+	Rendez	emptyrz;	/* the consumer sleeps here while nheap == 0 */
+	Qent	*heap;		/* heap storage; entry 0 is popped next */
+	int	nheap;		/* number of entries in use */
+	int	heapsz;		/* capacity of heap */
+};
+
+struct Trace {
+	int	tid;
+	int	qgen;
+	char	msg[16];
+	Bptr	bp;
+	vlong	v0;
+	vlong	v1;
+};
+
+/*
+ * Overall state of the file system.
+ * Shadows the superblock contents.
+ */
+struct Gefs {
+	int	blksz;
+	int	bufspc;
+	Tree	snap;
+	Dlist	snapdl;
+	int	narena;
+	vlong	flag;
+	vlong	nextqid;
+	vlong	nextgen;
+	vlong	qgen;
+	Bptr	*arenabp;
+
+	/* superblocks */
+	Blk	*sb0;	/* primary */
+	Blk	*sb1;	/* backup */
+
+	/* arena allocation */
+	Arena	*arenas;
+	long	roundrobin;
+	long	syncing;
+	long	nsyncers;
+	long	nreaders;
+
+	QLock	synclk;
+	Rendez	syncrz;
+
+	QLock	mountlk;
+	Mount	*mounts;
+	Mount	*snapmnt;
+	Lock	connlk;
+	Conn	*conns;
+
+	Chan	*wrchan;
+	Chan	*admchan;
+	Chan	**rdchan;
+
+	QLock	mutlk;
+	long	nworker;
+	long	epoch;
+	long	lepoch[32];
+	Limbo	*limbo[3];
+	long	nlimbo;
+
+	Syncq	syncq[32];
+
+	int	fd;
+	long	rdonly;
+	int	noauth;
+
+	/* user list */
+	RWLock	userlk;
+	User	*users;
+	int	nusers;
+
+	/* slow block io */
+	QLock	blklk[32];
+	
+	/* deadlist cache */
+	Dlist	**dlcache;
+	Dlist	*dlhead;
+	Dlist	*dltail;
+	int	dlcount;
+	int	dlcmax;
+
+	/* block lru */
+	QLock	lrulk;
+	Rendez	lrurz;
+	Bucket	*bcache;
+	Blk	*chead;
+	Blk	*ctail;
+	usize	ccount;
+	usize	cmax;
+
+	/* preallocated deferred frees */
+	QLock	bfreelk;
+	Rendez	bfreerz;
+	Bfree	*bfree;
+
+	RWLock	flushq[Nflushtab];
+	int	flushop[Nflushtab];
+
+	Trace	*trace;
+	long	traceidx;
+	long	ntrace;
+};
+
+struct Arena {
+	QLock;
+	Avltree *free;
+	Blk	**queue;
+	int	nqueue;
+	Blk	*logbuf[2];	/* preallocated log pages */
+	Blk	*h0;		/* arena header */
+	Blk	*h1;		/* arena footer */
+	Blk	**q;		/* write queue */
+	vlong	nq;
+	vlong	size;
+	vlong	used;
+	vlong	reserve;
+	/* allocation log */
+	vlong	lastlogsz;	/* size after last compression */
+	vlong	nlog;		/* number of blocks in log */
+	Bptr	loghd;		/* allocation log */
+	Blk	*logtl;		/* end of the log, open for writing */
+	Syncq	*sync;
+};
+
+struct Xdir {
+	/* file data */
+	uvlong	flag;	/* storage flag */
+	Qid	qid;	/* unique id from server */
+	ulong	mode;	/* permissions */
+	vlong	atime;	/* last read time: nsec */
+	vlong	mtime;	/* last write time: nsec */
+	uvlong	length;	/* file length */
+	int	uid;	/* owner name */
+	int	gid;	/* group name */
+	int	muid;	/* last modifier name */
+	char	*name;	/* last element of path */
+};
+
+struct Dent {
+	RWLock;
+	Key;
+	Xdir;
+	Dent	*next;
+	QLock	trunclk;
+	Rendez	truncrz;
+	vlong	up;
+	long	ref;
+	char	gone;
+	char	trunc;
+
+	char	buf[Maxent];
+};
+
+struct Mount {
+	Limbo;
+	Lock;
+	Mount	*next;
+	long	ref;
+	vlong	gen;
+	char	name[64];
+	Tree	*root;	/* EBR protected */
+
+	int	flag;
+
+	/* open directory entries */
+	Lock	dtablk;
+	Dent	*dtab[Ndtab];
+
+	/* snapshot history */
+	char	minutely[60][128];
+	char	hourly[24][128];
+};
+
+struct Conn {
+	Conn	*next;
+	QLock	wrlk;
+	int	rfd;
+	int	wfd;
+	int	iounit;
+	int	versioned;
+
+	/* fid hash table */
+	Lock	fidtablk[Nfidtab];
+	Fid	*fidtab[Nfidtab];
+};
+
+/*
+ * State of one 9p fid. Fids are chained through next in the
+ * owning connection's fid hash table.
+ */
+struct Fid {
+	Lock;
+	Fid	*next;
+	/*
+	 * if opened with OEXEC, we want to use a snapshot,
+	 * instead of the most recent root, to prevent
+	 * paging in the wrong executable.
+	 */
+	Mount	*mnt;
+	Scan	*scan;	/* in-progress scan */
+	Dent	*dent;	/* (pqid, name) ref, modified on rename */
+	Dent	*dir;
+	Amsg	*rclose;	
+	void	*auth;
+
+	u32int	fid;	/* the fid number */
+	vlong	qpath;
+	vlong	pqpath;
+	long	ref;
+	int	mode;
+	int	iounit;
+
+	int	uid;
+	int	duid;
+	int	dgid;
+	int	dmode;
+
+	char	permit;
+	char	fromdump;
+};
+
+enum {
+	POmod,
+	POrot,
+	POsplit,
+	POmerge,
+};
+
+struct Scanp {
+	int	bi;
+	int	vi;
+	Blk	*b;
+};
+
+struct Scan {
+	vlong	offset;	/* last read offset */
+	char	first;
+	char	donescan;
+	char	overflow;
+	char	present;
+	int	ht;
+	Kvp	kv;
+	Key	pfx;
+	char	kvbuf[Kvmax];
+	char	pfxbuf[Keymax];
+	Scanp	*path;
+};
+
+struct Blk {
+	Limbo;
+	/* cache entry */
+	Blk	*cnext;
+	Blk	*cprev;
+	Blk	*hnext;
+
+	/* serialized to disk in header */
+	short	type;	/* @0, for all */
+	union {
+		struct {
+			short	nval;	/* @2, for Leaf, Pivot: data[0:2] */
+			short	valsz;	/* @4, for Leaf, Pivot: data[2:4] */
+			short   nbuf;	/* @6, for Pivot */
+			short   bufsz;	/* @8, for Pivot */
+		};
+		struct {
+			int	logsz;	/* @2 for allocation log */
+			uvlong	logh;	/* @4 for log body hash */
+			Bptr	logp;	/* @12 next deadlist chain */
+		};
+	};
+
+	/* debug */
+	uintptr queued;
+	uintptr lasthold;
+	uintptr lasthold0;
+	uintptr lastdrop;
+	uintptr	enqueued;
+	uintptr cached;
+	uintptr uncached;
+	uintptr	alloced;
+	uintptr	freed;
+
+	Bptr	bp;
+	long	ref;
+	long	flag;
+	char	*data;
+	char	buf[Blksz];
+	vlong	magic;
+};
+
+/*
+ * A fixed-size queue of pointers, used to pass messages
+ * between procs (for example fs->admchan).
+ */
+struct Chan {
+	int	size;	/* size of queue */
+	long	count;	/* how many in queue (semaphore) */
+	long	avail;	/* how many available to send (semaphore) */
+	Lock	rl, wl;	/* circular pointers */
+	void	**rp;
+	void	**wp;
+	void*	args[];	/* list of saved pointers, [->size] */
+};
--- /dev/null
+++ b/dump.c
@@ -1,0 +1,366 @@
+#include <u.h>
+#include <libc.h>
+#include <avl.h>
+#include <fcall.h>
+#include <ctype.h>
+
+#include "dat.h"
+#include "fns.h"
+
+char	spc[128];	/* indentation template for rshowblk; filled in by initshow */
+
+/*
+ * Print a readable rendering of key k, decoded according to
+ * its type byte k->k[0].  An empty key prints as "".
+ * Shared by Kconv, Pconv and Mconv, which pass their argument
+ * straight through.  A key type that is not listed below is
+ * hex dumped.
+ * Returns the result of the fmtprint call that was made.
+ */
+static int
+showkey(Fmt *fmt, Key *k)
+{
+	/*
+	 * dent: pqid[8] qid[8] -- a directory entry key.
+	 * ptr:  off[8] hash[8] -- a key for an Dir block.
+	 * dir:  fixed statbuf header, user ids
+	 */
+	if(k->nk == 0)
+		return fmtprint(fmt, "\"\"");
+	switch(k->k[0]){
+	case Kdat:	/* qid[8] off[8] => ptr[16]:	pointer to data page */
+		return fmtprint(fmt, "dat qid:%llx off:%llx",
+			UNPACK64(k->k+1), UNPACK64(k->k+9));
+	case Kent:	/* pqid[8] name[n] => dir[n]:	serialized Dir */
+		return fmtprint(fmt, "ent dir:%llx, name:\"%.*s\"",
+			UNPACK64(k->k+1), k->nk-11, k->k+11);
+	case Klabel:	/* name[n] => tree[24]:	snapshot ref */
+		return fmtprint(fmt, "label name:\"%.*s\"", k->nk-1, k->k+1);
+	case Ksnap:	/* name[n] => tree[24]:	snapshot root */
+		return fmtprint(fmt, "snap id:%lld", UNPACK64(k->k+1));
+	case Kup:	/* qid[8] => pqid[8]:		parent dir */
+		return fmtprint(fmt, "up dir:%llx", UNPACK64(k->k+1));
+	case Kdlist:
+		return fmtprint(fmt, "dlist gen:%lld, bgen:%lld",
+			UNPACK64(k->k+1), UNPACK64(k->k+9));
+	default:
+		break;
+	}
+	/* unknown key type: hex dump the raw bytes */
+	return fmtprint(fmt, "??? %.*H", k->nk, k->k);
+}
+
+/* Print the value half of v, decoded by key type v->k[0] and message op; with flg set, print the "(Bptr,n)" pointer form instead. */
+static int
+showval(Fmt *fmt, Kvp *v, int op, int flg)
+{
+	int n, ws;
+	char *p;
+	Tree t;
+	Xdir d;
+
+	n = 0;
+	if(flg){
+		assert(v->nv == Ptrsz+2);
+		n = fmtprint(fmt, "(%B,%d)", unpackbp(v->v, v->nv), UNPACK16(v->v+Ptrsz));
+		return n;
+	}
+	if(op == Odelete || op == Oclearb){	/* deletions carry no value worth decoding */
+		n = fmtprint(fmt, "delete");
+		return n;
+	}
+	switch(v->k[0]){
+	case Kdat:	/* qid[8] off[8] => ptr[16]:	pointer to data page */
+		switch(op){
+		case Odelete:	/* not reached: both delete ops returned above */
+		case Oclearb:
+			n = 0;
+			break;
+		case Onop:
+		case Oinsert:
+			if(v->nv == Ptrsz)
+				n = fmtprint(fmt, "ptr:%B", unpackbp(v->v, v->nv));
+			else
+				n = fmtprint(fmt, "BROKEN ptr %.*H", v->nk, v->k);
+			break;
+		}
+		break;
+	case Kent:	/* pqid[8] name[n] => dir[n]:	serialized Dir */
+		switch(op){
+		case Onop:
+		case Oinsert:
+			kv2dir(v, &d);
+			n = fmtprint(fmt, "[qid=(%llux,%lud,%d), p=%luo, f=%llux, t=%lld,%lld, l=%lld, o=%d, g=%d m=%d]",
+				d.qid.path, d.qid.vers, d.qid.type, d.mode,
+				d.flag, d.atime, d.mtime, d.length,
+				d.uid, d.gid, d.muid);
+			break;
+		case Odelete:	/* not reached: returned above */
+			n = fmtprint(fmt, "delete");
+			break;
+		case Owstat:
+			p = v->v;
+			ws = *p++;	/* leading byte: Ow* bits naming the fields that follow, in this order */
+			if(ws & Owsize){
+				n += fmtprint(fmt, "size:%llx ", UNPACK64(p));
+				p += 8;
+			}
+			if(ws & Owmode){
+				n += fmtprint(fmt, "mode:%uo ", UNPACK32(p));
+				p += 4;
+			}
+			if(ws & Owmtime){
+				n += fmtprint(fmt, "mtime:%llx ", UNPACK64(p));
+				p += 8;
+			}
+			if(ws & Owatime){
+				n += fmtprint(fmt, "atime:%llx ", UNPACK64(p));	/* was mislabelled "mtime:" */
+				p += 8;
+			}
+			if(ws & Owuid){
+				n += fmtprint(fmt, "uid:%d ", UNPACK32(p));
+				p += 4;
+			}
+			if(ws & Owgid){
+				n += fmtprint(fmt, "gid:%d ", UNPACK32(p));
+				p += 4;
+			}
+			if(ws & Owmuid){
+				n += fmtprint(fmt, "muid:%d ", UNPACK32(p));
+				p += 4;
+			}
+			if(p != v->v + v->nv){	/* the decoded fields must consume the whole value */
+				fprint(2, "v->nv: %d, sz=%d\n", v->nv, (int)(p - v->v));
+				abort();
+			}
+			break;
+		}
+		break;
+	case Ksnap:	/* name[n] => dent[16] ptr[16]:	snapshot root */
+		switch(op){
+		case Orelink:
+		case Oreprev:
+			n = fmtprint(fmt, "gen: %lld, dlbl: %d, dref: %d",
+				UNPACK64(v->v), v->v[8], v->v[9]);
+			break;
+		case Onop:
+		case Oinsert:
+			if(unpacktree(&t, v->v, v->nv) == nil)
+				n = fmtprint(fmt, "corrupt tree");
+			else
+				n = fmtprint(fmt, "<tree %B [pred=%lld, succ=%lld, nref=%d, nlbl=%d]>",
+					t.bp, t.pred, t.succ, t.nref, t.nlbl);
+			break;
+		default:
+			n = fmtprint(fmt, "?? unknown op %d", op);
+		}
+		break;
+	case Klabel:
+		n = fmtprint(fmt, "snap id:%lld", UNPACK64(v->v+1));
+		break;
+	case Kup:	/* qid[8] => pqid[8]:		parent dir */
+		/* the callers close the surrounding parenthesis themselves, so none is printed here */
+		n = fmtprint(fmt, "super dir:%llx, name:\"%.*s\"",
+			UNPACK64(v->v+1), v->nv-11, v->v+11);
+		break;
+	case Kdlist:
+		n = fmtprint(fmt, "hd:%B, tl:%B",
+			unpackbp(v->v, v->nv),
+			unpackbp(v->v+Ptrsz, v->nv-Ptrsz));
+		break;
+	default:
+		n = fmtprint(fmt, "??? %.*H", v->nk, v->k);
+		break;
+	}
+	return n;
+}
+
+/* Fmt handler for a Bptr passed by value: prints "(addr,hash,gen)" in hex. */
+int
+Bconv(Fmt *fmt)
+{
+	Bptr ptr = va_arg(fmt->args, Bptr);
+
+	return fmtprint(fmt, "(%llx,%.16llux,%llx)", ptr.addr, ptr.hash, ptr.gen);
+}
+/* Fmt handler for a Msg*: prints "Msg(op, key) => (value)"; the '#' flag selects showval's pointer form. */
+int
+Mconv(Fmt *fmt)
+{
+	char *opname[Nmsgtype] = {
+	[Oinsert]	"Oinsert",
+	[Odelete]	"Odelete",
+	[Oclearb]	"Oclearb",
+	[Oclobber]	"Oclobber",
+	[Owstat]	"Owstat",
+	[Orelink]	"Orelink",
+	[Oreprev]	"Oreprev",
+	};
+	int total, sharp;
+	Msg *msg;
+
+	sharp = (fmt->flags & FmtSharp) != 0;	/* read before the nested fmtprint calls below */
+	msg = va_arg(fmt->args, Msg*);
+	if(msg == nil)
+		return fmtprint(fmt, "Msg{nil}");
+	total = fmtprint(fmt, "Msg(%s, ", opname[msg->op]);
+	total += showkey(fmt, msg);
+	total += fmtprint(fmt, ") => (");
+	total += showval(fmt, msg, msg->op, sharp);
+	total += fmtprint(fmt, ")");
+	return total;
+}
+/* Fmt handler for a Kvp*: prints "Kvp(key) => (value)"; the '#' flag selects showval's pointer form. */
+int
+Pconv(Fmt *fmt)
+{
+	int total, sharp;
+	Kvp *pair;
+
+	sharp = (fmt->flags & FmtSharp) != 0;	/* read before the nested fmtprint calls below */
+	pair = va_arg(fmt->args, Kvp*);
+	if(pair == nil)
+		return fmtprint(fmt, "Kvp{nil}");
+	total = fmtprint(fmt, "Kvp(");
+	total += showkey(fmt, pair);
+	total += fmtprint(fmt, ") => (");
+	total += showval(fmt, pair, Onop, sharp);
+	total += fmtprint(fmt, ")");
+	return total;
+}
+/* Fmt handler for a Key*: prints "Key(...)" using showkey, or "Key{nil}" for a nil pointer. */
+int
+Kconv(Fmt *fmt)
+{
+	int total;
+	Key *key;
+
+	key = va_arg(fmt->args, Key*);
+	if(key == nil)
+		return fmtprint(fmt, "Key{nil}");
+	total = fmtprint(fmt, "Key(");
+	total += showkey(fmt, key);
+	total += fmtprint(fmt, ")");
+	return total;
+}
+
+/* Fmt handler for an Arange*: prints "Arange(off+len)", or "<Arange:nil>" for a nil pointer. */
+int
+Rconv(Fmt *fmt)
+{
+	Arange *rng;
+
+	rng = va_arg(fmt->args, Arange*);
+	if(rng != nil)
+		return fmtprint(fmt, "Arange(%lld+%lld)", rng->off, rng->len);
+	return fmtprint(fmt, "<Arange:nil>");
+}
+
+/* Fmt handler for a Qid passed by value: prints "(path vers type)". */
+int
+Qconv(Fmt *fmt)
+{
+	Qid qid = va_arg(fmt->args, Qid);
+
+	return fmtprint(fmt, "(%llx %ld %d)", qid.path, qid.vers, qid.type);
+}
+/* Dump block b to fd at the given indent depth; with recurse set, also dump the children of a pivot. */
+static void
+rshowblk(int fd, Blk *b, int indent, int recurse)
+{
+	Blk *c;
+	int i;
+	Bptr bp;
+	Kvp kv;
+	Msg m;
+
+	if(indent > sizeof(spc)/4)	/* clamp so that 4*indent never exceeds the spc template */
+		indent = sizeof(spc)/4;
+	if(b == nil){
+		fprint(fd, "NIL\n");
+		return;
+	}
+	fprint(fd, "%.*s[BLK]|{%B}\n", 4*indent, spc, b->bp);
+	switch(b->type){
+	case Tpivot:
+		for(i = 0; i < b->nbuf; i++){
+			getmsg(b, i, &m);
+			fprint(fd, "%.*s[%03d]|%M\n", 4*indent, spc, i, &m);
+		}
+		/* wet floor: fall through to print the pivot's values as well */
+	case Tleaf:
+		for(i = 0; i < b->nval; i++){
+			getval(b, i, &kv);
+			if(b->type == Tpivot){
+				fprint(fd, "%.*s[%03d]|%#P\n", 4*indent, spc, i, &kv);
+				bp = unpackbp(kv.v, kv.nv);
+				c = getblk(bp, 0);	/* fetched even when not recursing, as before */
+				if(recurse)
+					rshowblk(fd, c, indent + 1, 1);
+				dropblk(c);
+			}else{
+				fprint(fd, "%.*s[%03d]|%P\n", 4*indent, spc, i, &kv);
+			}
+		}
+		break;
+	case Tarena:
+		fprint(fd, "arena -- ");
+		goto Show;
+	case Tlog:
+		fprint(fd, "log -- ");
+		goto Show;
+	case Tdlist:
+		fprint(fd, "dlist -- ");
+		goto Show;
+	case Tdat:
+		fprint(fd, "dat -- ");
+	Show:
+		for(i = 0; i < 32; i++){
+			/* two digits per byte so the groups of four stay unambiguous */
+			fprint(fd, "%02x", b->buf[i] & 0xff);
+			if(i % 4 == 3)
+				fprint(fd, " ");
+		}
+		fprint(fd, "\n");
+		break;
+	}
+}
+/* Print the banner "=== m" to fd, then dump b starting at indent 0. */
+void
+showblk(int fd, Blk *b, char *m, int recurse)
+{
+	fprint(fd, "=== %s\n", m);
+	rshowblk(fd, b, 0, recurse);
+}
+/* Fetch the block at bp with the GBnochk flag, dump it to fd, and drop it again. */
+void
+showbp(int fd, Bptr bp, int recurse)
+{
+	Blk *b;
+
+	b = getblk(bp, GBnochk);
+	rshowblk(fd, b, 0, recurse);
+	dropblk(b);
+}
+/* Print the header fields of tree t to fd, one tab-indented "name value" line each. */
+void
+showtreeroot(int fd, Tree *t)
+{
+	fprint(fd, "\tflag\t0x%x\n", t->flag);
+	fprint(fd, "\tgen:\t%lld\n", t->gen);
+	fprint(fd, "\tbase\t%lld\n", t->base);
+	fprint(fd, "\tpred:\t%lld\n", t->pred);
+	fprint(fd, "\tsucc:\t%lld\n", t->succ);
+	fprint(fd, "\tnref:\t%d\n", t->nref);
+	fprint(fd, "\tnlbl:\t%d\n", t->nlbl);
+	fprint(fd, "\tht:\t%d\n", t->ht);
+	fprint(fd, "\tbp:\t%B\n", t->bp);	/* %B prints a Bptr (see Bconv) */
+}
+
+/* Fill spc with the repeating four-column pattern "|   " that rshowblk prints as indentation. */
+void
+initshow(void)
+{
+	int col;
+
+	for(col = 0; col < sizeof(spc); col++)
+		spc[col] = (col % 4 == 0) ? '|' : ' ';
+}
--- /dev/null
+++ b/error.c
@@ -1,0 +1,78 @@
+#include <u.h>
+#include <libc.h>
+#include <avl.h>
+#include <fcall.h>
+#include "dat.h"
+/* error strings: raised with error(), sent with rerror(), or returned by validators such as okname */
+char Efs[]	= "internal error";
+char Ecorrupt[] = "block contents corrupted";
+char Efsvers[]	= "unknown fs version";
+char Eimpl[]	= "not implemented";
+char Ebotch[]	= "protocol botch";
+char Eio[]	= "i/o error";
+char Enofid[]	= "unknown fid";
+char Efid[]	= "fid in use";
+char Etype[]	= "invalid fid type";
+char Edscan[]	= "invalid dir scan offset";
+char Esrch[]	= "directory entry not found";
+char Eexist[]	= "create/wstat -- file exists";
+char Emode[]	= "open/create -- unknown mode";
+char Efull[]	= "file system full";
+char Estuffed[]	= "emergency blocks exhausted";
+char Eauth[]	= "authentication failed";
+char Elength[]	= "name too long";
+char Eperm[]	= "permission denied";
+char Einuse[]	= "resource in use";
+char Ebadf[]	= "invalid file";
+char Ename[]	= "create/wstat -- bad character in file name";
+char Enomem[]	= "out of memory";
+char Eattach[]	= "attach required";
+char Enosnap[]	= "attach -- bad specifier";
+char Edir[]	= "invalid directory";
+char Esyntax[]	= "syntax error";
+char Enouser[]	= "user does not exist";
+char Enogrp[]	= "group does not exist";
+char Efsize[]	= "file too big";
+char Ebadu[]	= "attach -- unknown user or failed authentication";
+char Erdonly[]	= "file system read only";
+char Elocked[]	= "open/create -- file is locked";
+char Eauthp[]	= "authread -- auth protocol not finished";
+char Eauthd[]	= "authread -- not enough data";
+char Eauthph[]	= "auth phase error";
+char Enone[]	= "auth -- user 'none' requires no authentication";
+char Enoauth[]	= "auth -- authentication disabled";
+char Ephase[]	= "phase error -- use after remove";
+
+char Ewstatb[]	= "wstat -- unknown bits in qid.type/mode";
+char Ewstatd[]	= "wstat -- attempt to change directory";
+char Ewstatg[]	= "wstat -- not in group";
+char Ewstatl[]	= "wstat -- attempt to make length negative";
+char Ewstatm[]	= "wstat -- attempt to change muid";
+char Ewstato[]	= "wstat -- not owner or group leader";
+char Ewstatp[]	= "wstat -- attempt to change qid.path";
+char Ewstatq[]	= "wstat -- qid.type/dir.mode mismatch";
+char Ewstatu[]	= "wstat -- not owner";
+char Ewstatv[]	= "wstat -- attempt to change qid.vers";
+char Enempty[]	= "directory is not empty";
+
+//char Echar[]		= "bad character in directory name";
+//char Eopen[]		= "read/write -- on non open fid";
+//char Ecount[]		= "read/write -- count too big";
+//char Ealloc[]		= "phase error -- directory entry not allocated";
+//char Eqid[]		= "phase error -- qid does not match";
+//char Eaccess[]	= "access permission denied";
+//char Eentry[]		= "directory entry not found";
+//char Edir1[]		= "walk -- in a non-directory";
+//char Edir2[]		= "create -- in a non-directory";
+//char Edot[]		= "create/wstat -- . and .. illegal names";
+//char Ewalk[]		= "walk -- too many (system wide)";
+//char Eoffset[]	= "read/write -- offset negative";
+//char Ebroken[]	= "read/write -- lock is broken";
+//char Eauth[]		= "attach -- authentication failed";
+//char Eauth2[]		= "read/write -- authentication unimplemented";
+//char Etoolong[]	= "name too long";
+//char Efidinuse[]	= "fid in use";
+//char Eversion[]	= "version conversion";
+//char Eauthnone[]	= "auth -- user 'none' requires no authentication";
+//char Eauthdisabled[]	= "auth -- authentication disabled";	/* development */
+//char Eauthfile[]	= "auth -- out of auth files";
--- /dev/null
+++ b/fns.h
@@ -1,0 +1,213 @@
+#pragma varargck type "M"	Msg*
+#pragma varargck type "P"	Kvp*
+#pragma varargck type "K"	Key*
+#pragma varargck type "V"	Val*
+#pragma varargck type "B"	Bptr
+#pragma varargck type "R"	Arange*
+#pragma varargck type "X"	char*
+#pragma varargck type "Q"	Qid
+
+extern Gefs*	fs;
+extern int	debug;
+extern int	permissive;
+extern int	usereserve;
+extern char*	reamuser;
+extern Errctx**	errctx;
+extern Blk*	blkbuf;
+extern int	noneid;
+extern int	nogroupid;
+extern int	admid;
+/* big-endian decoders. NOTE(review): each 32-bit half is an int expression before the u64int cast -- confirm it cannot sign-extend when bit 31 is set */
+#define	UNPACK8(p)	(((uchar*)(p))[0])
+#define	UNPACK16(p)	((((uchar*)(p))[0]<<8)|(((uchar*)(p))[1]))
+#define	UNPACK32(p)	((((uchar*)(p))[0]<<24)|(((uchar*)(p))[1]<<16)|\
+				(((uchar*)(p))[2]<<8)|(((uchar*)(p))[3]))
+#define	UNPACK64(p)	(((u64int)((((uchar*)(p))[0]<<24)|(((uchar*)(p))[1]<<16)|\
+				(((uchar*)(p))[2]<<8)|(((uchar*)(p))[3])))<<32 |\
+			((u64int)((((uchar*)(p))[4]<<24)|(((uchar*)(p))[5]<<16)|\
+				(((uchar*)(p))[6]<<8)|(((uchar*)(p))[7]))))
+
+#define	PACK8(p,v)	do{(p)[0]=(v);}while(0)
+#define	PACK16(p,v)	do{(p)[0]=(v)>>8;(p)[1]=(v);}while(0)
+#define	PACK32(p,v)	do{(p)[0]=(v)>>24;(p)[1]=(v)>>16;(p)[2]=(v)>>8;(p)[3]=(v);}while(0)
+#define	PACK64(p,v)	do{(p)[0]=(v)>>56;(p)[1]=(v)>>48;(p)[2]=(v)>>40;(p)[3]=(v)>>32;\
+			   (p)[4]=(v)>>24;(p)[5]=(v)>>16;(p)[6]=(v)>>8;(p)[7]=(v);}while(0)
+
+void*	emalloc(usize, int);
+
+Blk*	newdblk(Tree*, vlong, int);
+Blk*	newblk(Tree*, int);
+Blk*	dupblk(Tree*, Blk*);
+Blk*	getroot(Tree*, int*);
+Blk*	getblk(Bptr, int);
+Blk*	holdblk(Blk*);
+void	dropblk(Blk*);
+
+void	lrutop(Blk*);
+void	lrubot(Blk*);
+void	cacheins(Blk*);
+void	cachedel(vlong);
+Blk*	cacheget(vlong);
+Blk*	cachepluck(void);
+
+void	qinit(Syncq*);
+void	qput(Syncq*, Qent);
+
+Arena*	getarena(vlong);
+void	syncblk(Blk*);
+void	enqueue(Blk*);
+void	epochstart(int);
+void	epochend(int);
+void	epochwait(void);
+void	epochclean(void);
+void	limbo(int op, Limbo*);
+void	freeblk(Tree*, Blk*);
+void	freebp(Tree*, Bptr);
+int	logbarrier(Arena *, vlong);
+void	dlappend(Dlist *dl, Bptr);
+void	killblk(Tree*, Bptr);
+ushort	blkfill(Blk*);
+uvlong	blkhash(Blk*);
+uvlong	bufhash(void*, usize);
+u32int	ihash(uvlong);
+void	finalize(Blk*);
+
+Mount*	getmount(char*);
+void	clunkmount(Mount*);
+
+void	updatesnap(Tree**, Tree*, char*, int);
+void	tagsnap(Tree*, char*, int);
+void	delsnap(Tree*, vlong, char*);
+void	freedl(Dlist*, int);
+Tree*	opensnap(char*, int*);
+
+void	closesnap(Tree*);
+void	reamfs(char*);
+void	growfs(char*);
+void	loadarena(Arena*, Bptr);
+void	loadfs(char*);
+void	loadlog(Arena*, Bptr);
+void	flushlog(Arena*);
+int	scandead(Dlist*, int, void(*)(Bptr, void*), void*);
+int	endfs(void);
+void	compresslog(Arena*);
+void	dlsync(void);
+void	setval(Blk*, Kvp*);
+
+Conn*	newconn(int, int);
+
+int	walk1(Tree*, vlong, char*, Qid*, vlong*);
+void	loadusers(int, Tree*);
+User*	uid2user(int);
+User*	name2user(char*);
+
+void	btupsert(Tree*, Msg*, int);
+int	btlookup(Tree*, Key*, Kvp*, char*, int);
+void	btnewscan(Scan*, char*, int);
+void	btenter(Tree*, Scan*);
+int	btnext(Scan*, Kvp*);
+void	btexit(Scan*);
+
+int	checkflag(Blk *b, int, int);
+void	setflag(Blk *b, int, int);
+
+char*	estrdup(char*);
+
+int	keycmp(Key *, Key *);
+void	cpkey(Key*, Key*, char*, int);
+void	cpkvp(Kvp*, Kvp*, char*, int);
+
+/* for dumping */
+void	getval(Blk*, int, Kvp*);
+void	getmsg(Blk*, int, Msg*);
+Bptr	getptr(Kvp*, int*);
+
+void	initshow(void);
+void	showblk(int, Blk*, char*, int);
+void	showbp(int, Bptr, int);
+void	showtreeroot(int, Tree*);
+int	checkfs(int);
+
+#define dprint(...) \
+	do{ \
+		if(debug) fprint(2, __VA_ARGS__); \
+	}while(0)
+
+#define fatal(...) \
+	do{ \
+		fprint(2, __VA_ARGS__); \
+		abort(); \
+	}while(0)
+
+#define tracex(msg, bp, v0, v1) \
+	do{ \
+		if(fs->trace != nil) \
+			_trace(msg, bp, v0, v1); \
+	} while(0)
+
+#define traceb(msg, bp)	tracex(msg, bp, -1, -1)
+#define tracev(msg, v0)	tracex(msg, Zb, v0, -1)
+#define tracem(msg)	tracex(msg, Zb, -1, -1)
+/* error-label stack: waserror() arms a handler via setjmp, poperror() discards it; poperror's decrement lives inside assert() */
+jmp_buf*	_waserror(void);
+_Noreturn void	error(char*, ...);
+_Noreturn void	broke(char*, ...);
+_Noreturn void	nexterror(void);
+#define waserror()	(setjmp(*_waserror()))
+#define errmsg()	((*errctx)->err)
+#define	poperror()	assert((*errctx)->nerrlab-- > 0)
+#define estacksz()	((*errctx)->nerrlab)
+void	_trace(char*, Bptr, vlong, vlong);
+char*	packstr(char*, char*, char*);
+
+void	dir2kv(vlong, Xdir*, Kvp*, char*, int);
+int	dir2statbuf(Xdir*, char*, int);
+void	dlist2kv(Dlist*, Kvp*, char*, int);
+void	lbl2kv(char*, vlong, uint, Kvp*, char*, int);
+void	link2kv(vlong, vlong, Kvp*, char*, int);
+void	retag2kv(vlong, vlong, int, int, Kvp*, char*, int);
+void	tree2kv(Tree*, Kvp*, char*, int);
+
+void	kv2dir(Kvp*, Xdir*);
+void	kv2dlist(Kvp*, Dlist*);
+void	kv2link(Kvp*, vlong*, vlong*);
+void	kv2qid(Kvp*, Qid*);
+int	kv2statbuf(Kvp*, char*, int);
+
+char*	packarena(char*, int, Arena*);
+char*	packbp(char*, int, Bptr*);
+char*	packdkey(char*, int, vlong, char*);
+char*	packdval(char*, int, Xdir*);
+char*	packlbl(char*, int, char*);
+char*	packsnap(char*, int, vlong);
+char*	packsuper(char*, int, vlong);
+char*	packtree(char*, int, Tree*);
+char*	packsb(char*, int, Gefs*);
+
+char*	unpackarena(Arena*, char*, int);
+Bptr	unpackbp(char*, int);
+char*	unpackdkey(char*, int, vlong*);
+Tree*	unpacktree(Tree*, char*, int);
+char*	unpacksb(Gefs*, char*, int);
+char*	unpackstr(char*, char*, char**);
+
+/* fmt */
+int	Bconv(Fmt*);
+int	Mconv(Fmt*);
+int	Pconv(Fmt*);
+int	Rconv(Fmt*);
+int	Kconv(Fmt*);
+int	Qconv(Fmt*);
+
+Chan*	mkchan(int);
+void*	chrecv(Chan*);
+void	chsend(Chan*, void*);
+void	runfs(int, void*);
+void	runmutate(int, void*);
+void	runread(int, void*);
+void	runcons(int, void*);
+void	runtasks(int, void*);
+void	runsync(int, void*);
+void	runsweep(int, void*);
+void	runsweep(int, void*);
+void	fixfs(void);
--- /dev/null
+++ b/fs.c
@@ -1,0 +1,2796 @@
+#include <u.h>
+#include <libc.h>
+#include <auth.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+#include "atomic.h"
+
+static void	respond(Fmsg*, Fcall*);
+static void	rerror(Fmsg*, char*, ...);
+static void	clunkfid(Conn*, Fid*, Amsg**);
+
+/* Look up name in directory up of tree t, storing its qid and length.
+ * Returns 0 on success, -1 if the key cannot be packed or is not found. */
+int
+walk1(Tree *t, vlong up, char *name, Qid *qid, vlong *len)
+{
+	char *p, kbuf[Keymax], rbuf[Kvmax];
+	Xdir d;
+	Kvp kv;
+	Key k;
+
+	/* same nil check getdent applies to packdkey; without it nk would be garbage */
+	if((p = packdkey(kbuf, sizeof(kbuf), up, name)) == nil)
+		return -1;
+	k.k = kbuf;
+	k.nk = p - kbuf;
+	if(!btlookup(t, &k, &kv, rbuf, sizeof(rbuf)))
+		return -1;
+	kv2dir(&kv, &d);
+	*qid = d.qid;
+	*len = d.length;
+	return 0;
+}
+/* Bump de's qid version under its write lock and fill msg as an Owstat on de's key whose value is a single zero byte. */
+static void
+touch(Dent *de, Msg *msg)
+{
+	wlock(de);
+	de->qid.vers++;
+	msg->op = Owstat;
+	msg->k = de->k;
+	msg->nk = de->nk;
+	msg->v = "\0";	/* the one byte is the Ow* field mask: no fields follow */
+	msg->nv = 1;
+	wunlock(de);
+}
+/* Trace the current fs->qgen and then advance it by one. */
+static void
+wrbarrier(void)
+{
+	tracev("barrier", fs->qgen);
+	aincv(&fs->qgen, 1);
+}
+/* Queue a Qfence entry on every sync queue and sleep until fs->syncing has dropped to zero. */
+static void
+wrwait(void)
+{
+	Qent qe;
+	int i;
+
+	tracev("wrwait", fs->qgen);
+	aincv(&fs->qgen, 1);
+	fs->syncing = fs->nsyncers;	/* one outstanding fence per syncer */
+	for(i = 0; i < fs->nsyncers; i++){
+		qe.op = Qfence;
+		qe.bp.addr = 0;
+		qe.bp.hash = -1;
+		qe.bp.gen = -1;
+		qe.b = nil;
+		qput(&fs->syncq[i], qe);
+	}
+	aincv(&fs->qgen, 1);
+	while(fs->syncing != 0)	/* NOTE(review): presumably syncrz is tied to a lock the caller holds -- confirm */
+		rsleep(&fs->syncrz);
+	tracev("flushed", fs->qgen);
+}
+/* Write a checkpoint: update snapshots, then sync arena headers, superblocks and arena footers, then free the old snap deadlist. */
+static void
+sync(void)
+{
+	Mount *mnt;
+	Arena *a;
+	Dlist dl;
+	int i;
+
+	qlock(&fs->synclk);
+	if(waserror()){	/* NOTE(review): only synclk is released here, not mutlk or an arena lock held when the error was raised */
+		fprint(2, "failed to sync: %s\n", errmsg());
+		qunlock(&fs->synclk);
+		nexterror();
+	}
+
+	/*
+	 * Wait for data that we're syncing to hit disk
+	 */
+	tracem("flush1");
+	wrbarrier();
+	/*
+	 * pass 0: Update all open snapshots, and
+	 *  pack the blocks we want to sync. Snap
+	 *  while holding the write lock, and then
+	 *  wait until all the blocks they point at
+	 *  have hit disk; once they're on disk, we
+	 *  can take a consistent snapshot.
+	 */
+	qlock(&fs->mutlk);
+	tracem("packb");
+	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next)
+		updatesnap(&mnt->root, mnt->root, mnt->name, mnt->flag);
+	/*
+	 * Now that we've updated the snaps, we can sync the
+	 * dlist; the snap tree will not change from here.
+	 */
+	dlsync();
+	dl = fs->snapdl;	/* keep the old deadlist by value; it is freed in pass 4 */
+	fs->snapdl.hd = Zb;
+	fs->snapdl.tl = Zb;
+	fs->snapdl.ins = nil;
+	traceb("syncdl.dl", dl.hd);
+	traceb("syncdl.rb", fs->snap.bp);
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		qlock(a);
+		/*
+		 * because the log uses preallocated
+		 * blocks, we need to write the log
+		 * block out synchronously, or it may
+		 * get reused.
+		 */
+		logbarrier(a, fs->qgen);
+		flushlog(a);
+
+		packarena(a->h0->data, Blksz, a);
+		packarena(a->h1->data, Blksz, a);
+		finalize(a->h0);
+		finalize(a->h1);
+		fs->arenabp[i] = a->h0->bp;
+		qunlock(a);
+	}
+	assert(fs->snapdl.hd.addr == -1);
+	traceb("packsb.rb", fs->snap.bp);
+	packsb(fs->sb0->buf, Blksz, fs);
+	packsb(fs->sb1->buf, Blksz, fs);
+	finalize(fs->sb0);
+	finalize(fs->sb1);
+	fs->snap.dirty = 0;
+	qunlock(&fs->mutlk);
+
+	/*
+	 * pass 1: sync block headers; if we crash here,
+	 *  the block footers are consistent, and we can
+	 *  use them.
+	 */
+	tracem("arenas0");
+	for(i = 0; i < fs->narena; i++)
+		enqueue(fs->arenas[i].h0);
+	wrbarrier();
+
+	/*
+	 * pass 2: sync superblock; we have a consistent
+	 * set of block headers, so if we crash, we can
+	 * use the loaded block headers; the footers will
+	 * get synced after so that we can use them next
+	 * time around.
+	 */
+	tracem("supers");
+	enqueue(fs->sb0);
+	enqueue(fs->sb1);
+	wrbarrier();
+
+	/*
+	 * pass 3: sync block footers; if we crash here,
+	 *  the block headers are consistent, and we can
+	 *  use them.
+	 */
+	tracem("arenas1");
+	for(i = 0; i < fs->narena; i++)
+		enqueue(fs->arenas[i].h1);
+
+	/*
+	 * pass 4: clean up the old snap tree's deadlist.
+	 * we need to wait for all the new data to hit disk
+	 * before we can free anything, otherwise it gets
+	 * clobbered.
+	 */
+	tracem("snapdl");
+	wrwait();
+	freedl(&dl, 1);
+	qunlock(&fs->synclk);
+	tracem("synced");
+	poperror();
+}
+/* Label or fork a->old as a->new, or delete a->old; *tp is set to the deleted tree only when it had one label, at most one ref and no successor. */
+static void
+snapfs(Amsg *a, Tree **tp)
+{
+	Tree *t, *s;
+	Mount *mnt;
+
+	if(waserror()){	/* NOTE(review): a t acquired below is not released on this path */
+		*tp = nil;
+		nexterror();
+	}
+	*tp = t = nil;
+	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next)
+		if(strcmp(a->old, mnt->name) == 0){
+			updatesnap(&mnt->root, mnt->root, mnt->name, mnt->flag);
+			t = agetp(&mnt->root);
+			ainc(&t->memref);
+			break;
+		}
+	if(t == nil && (t = opensnap(a->old, nil)) == nil){
+		if(a->fd != -1)
+			fprint(a->fd, "snap: open '%s': does not exist\n", a->old);
+		poperror();
+		return;
+	}
+	if(a->delete){
+		if(mnt != nil) {
+			if(a->fd != -1)
+				fprint(a->fd, "snap: snap is mounted: '%s'\n", a->old);
+			closesnap(t);	/* drop the memref taken in the loop above */
+			poperror();
+			return;
+		}
+		if(t->nlbl == 1 && t->nref <= 1 && t->succ == -1){
+			aincl(&t->memref, 1);	/* extra reference for the tree handed back through *tp */
+			*tp = t;
+		}
+		delsnap(t, t->succ, a->old);
+	}else{
+		if((s = opensnap(a->new, nil)) != nil){
+			if(a->fd != -1)
+				fprint(a->fd, "snap: already exists '%s'\n", a->new);
+			closesnap(s);
+			closesnap(t);	/* the source snapshot must be released as well */
+			poperror();
+			return;
+		}
+		tagsnap(t, a->new, a->flag);
+	}
+	closesnap(t);
+	poperror();
+	if(a->fd != -1){
+		if(a->delete)
+			fprint(a->fd, "deleted: %s\n", a->old);
+		else if(a->flag & Lmut)
+			fprint(a->fd, "forked: %s from %s\n", a->new, a->old);
+		else
+			fprint(a->fd, "labeled: %s from %s\n", a->new, a->old);
+	}
+}
+
+/* Fill d with a synthetic read-only directory entry named "/" whose qid path
+ * is Qdump and whose qid version is the current fs->nextgen. */
+static void
+filldumpdir(Xdir *d)
+{
+	memset(d, 0, sizeof(*d));
+	d->name = "/";
+	d->mode = DMDIR|0555;
+	d->qid.type = QTDIR;
+	d->qid.path = Qdump;
+	d->qid.vers = fs->nextgen;
+	d->length = 0;
+	d->atime = 0;
+	d->mtime = 0;
+	d->uid = d->gid = d->muid = -1;
+}
+
+/* Validate a file name: returns nil if usable, Ename if empty, ".", ".." or containing '/' or a byte below 0x20, Elength if too long. */
+static char*
+okname(char *name)
+{
+	char *p;
+
+	if(name[0] == 0 || strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
+		return Ename;
+	for(p = name; p < name + Maxname; p++){
+		if(*p == 0)
+			return nil;
+		if((*p & 0xff) < 0x20 || *p == '/')
+			return Ename;
+	}
+	/* no terminator within the first Maxname bytes */
+	return Elength;
+}
+
+/* Allocate an empty channel that buffers up to size pointers; calls sysfatal if the allocation fails. */
+Chan*
+mkchan(int size)
+{
+	Chan *ch;
+
+	ch = mallocz(sizeof(Chan) + size*sizeof(void*), 1);
+	if(ch == nil)
+		sysfatal("create channel");
+	ch->size = size;
+	ch->avail = size;
+	ch->count = 0;
+	ch->rp = ch->wp = ch->args;
+	return ch;
+}
+/* Take the next pointer out of c, acquiring the count semaphore first and releasing avail afterwards. */
+void*
+chrecv(Chan *c)
+{
+	void *a;
+	long v;
+
+	v = agetl(&c->count);
+	if(v == 0 || !acasl(&c->count, v, v-1))	/* try to claim an item with one CAS, else use the semaphore */
+		semacquire(&c->count, 1);
+	lock(&c->rl);
+	a = *c->rp;
+	if(++c->rp >= &c->args[c->size])	/* wrap the read pointer */
+		c->rp = c->args;
+	unlock(&c->rl);
+	semrelease(&c->avail, 1);	/* one more slot for senders */
+	return a;
+}
+/* Put pointer m into c, acquiring the avail semaphore first and releasing count afterwards. */
+void
+chsend(Chan *c, void *m)
+{
+	long v;
+
+	v = agetl(&c->avail);
+	if(v == 0 || !acasl(&c->avail, v, v-1))	/* try to claim a slot with one CAS, else use the semaphore */
+		semacquire(&c->avail, 1);
+	lock(&c->wl);
+	*c->wp = m;
+	if(++c->wp >= &c->args[c->size])	/* wrap the write pointer */
+		c->wp = c->args;
+	unlock(&c->wl);
+	semrelease(&c->count, 1);	/* one more item for receivers */
+}
+/* Log the formatted reason, close both connection descriptors, and clunk every fid still in c's fid table. */
+static void
+fshangup(Conn *c, char *fmt, ...)
+{
+	char buf[ERRMAX];
+	va_list ap;
+	Amsg *a;
+	Fid *f;
+	int i;
+
+	va_start(ap, fmt);
+	vsnprint(buf, sizeof(buf), fmt, ap);
+	va_end(ap);
+	fprint(2, "hangup: %s\n", buf);
+	close(c->rfd);
+	close(c->wfd);
+	for(i = 0; i < Nfidtab; i++){
+		lock(&c->fidtablk[i]);
+		for(f = c->fidtab[i]; f != nil; f = f->next){
+			lock(f);
+			if(waserror()){
+				unlock(f);
+				continue;
+			}
+			a = nil;
+			clunkfid(c, f, &a);	/* NOTE(review): clunkfid locks f's fidtablk bucket, which is already held here -- confirm this cannot self-deadlock */
+			unlock(f);
+			if(a != nil)
+				chsend(fs->admchan, a);
+			poperror();	/* success: discard the handler; nexterror() here re-entered it and unlocked f twice */
+		}
+		unlock(&c->fidtablk[i]);
+	}
+}
+/* Marshal reply r for request m, write it to the connection, release m's flush-queue lock and free m. */
+static void
+respond(Fmsg *m, Fcall *r)
+{
+	RWLock *lk;
+	uchar buf[Max9p+IOHDRSZ];
+	int w, n;
+
+	r->tag = m->tag;
+	dprint("→ %F\n", r);
+	assert(m->type+1 == r->type || r->type == Rerror);	/* reply type is request type + 1, or Rerror */
+	if((n = convS2M(r, buf, sizeof(buf))) == 0)
+		abort();
+	qlock(&m->conn->wrlk);	/* serialize writers on this connection */
+	w = write(m->conn->wfd, buf, n);
+	qunlock(&m->conn->wrlk);
+	if(w != n)
+		fshangup(m->conn, Eio);
+	if(m->type == Tflush){	/* a flush released with wunlock, keyed by oldtag; others with runlock, keyed by tag */
+		lk = &fs->flushq[ihash(m->oldtag) % Nflushtab];
+		wunlock(lk);
+	}else{
+		lk = &fs->flushq[ihash(m->tag) % Nflushtab];
+		runlock(lk);
+	}
+	free(m);
+}
+/* Format an error string and send it as the Rerror reply to m; respond() frees m. */
+static void
+rerror(Fmsg *m, char *fmt, ...)
+{
+	char buf[128];	/* messages longer than this are truncated by vsnprint */
+	va_list ap;
+	Fcall r;
+
+	va_start(ap, fmt);
+	vsnprint(buf, sizeof(buf), fmt, ap);
+	va_end(ap);
+	r.type = Rerror;
+	r.ename = buf;
+	respond(m, &r);
+}
+
+/* Apply the nm messages in m to mnt's root tree; raises Erdonly unless the mount has Lmut set. */
+static void
+upsert(Mount *mnt, Msg *m, int nm)
+{
+	if(!(mnt->flag & Lmut))
+		error(Erdonly);
+	if(mnt->root->nlbl != 1 || mnt->root->nref != 0)	/* root has other labels or refs: refresh the snapshot first */
+		updatesnap(&mnt->root, mnt->root, mnt->name, mnt->flag);
+	btupsert(mnt->root, m, nm);
+}
+
+/*
+ * When truncating a file, mutations need
+ * to wait for the sweeper to finish; this
+ * means the mutator needs to release the
+ * mutation lock, exit the epoch, and
+ * allow the sweeper to finish its job
+ * before resuming.
+ */
+static void
+truncwait(Dent *de, int id)
+{
+	epochend(id);
+	qunlock(&fs->mutlk);	/* entered with mutlk held; it is retaken before returning */
+	qlock(&de->trunclk);
+	while(de->trunc)	/* sleep on truncrz until the trunc flag is cleared */
+		rsleep(&de->truncrz);
+	qunlock(&de->trunclk);
+	qlock(&fs->mutlk);
+	epochstart(id);
+}
+/* Copy up to n bytes at offset o of f's file (tree t) into d without crossing a block; returns the count, 0 when o >= sz. */
+static int
+readb(Tree *t, Fid *f, char *d, vlong o, vlong n, vlong sz)
+{
+	char buf[Offksz], kvbuf[Offksz+32];
+	vlong fb, fo;
+	Bptr bp;
+	Blk *b;
+	Key k;
+	Kvp kv;
+
+	if(o >= sz)
+		return 0;
+
+	fb = o & ~(Blksz-1);	/* offset of the containing block */
+	fo = o & (Blksz-1);	/* offset within that block */
+	if(fo+n > Blksz)	/* NOTE(review): n is clamped to the block only, not to sz -- confirm the caller bounds it */
+		n = Blksz-fo;
+
+	k.k = buf;
+	k.nk = sizeof(buf);
+	k.k[0] = Kdat;
+	PACK64(k.k+1, f->qpath);
+	PACK64(k.k+9, fb);
+
+	if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf))){	/* no data block recorded: read as zeros */
+		memset(d, 0, n);
+		return n;
+	}
+
+	bp = unpackbp(kv.v, kv.nv);
+	b = getblk(bp, GBraw);
+	memcpy(d, b->buf+fo, n);
+	dropblk(b);
+	return n;
+}
+/* Write up to n bytes of s at offset o into a new data block, pack its pointer into m's value and *ret; returns the count used. */
+static int
+writeb(Fid *f, Msg *m, Bptr *ret, char *s, vlong o, vlong n, vlong sz)
+{
+	char buf[Kvmax];
+	vlong fb, fo;
+	Blk *b, *t;
+	int seq;
+	Tree *r;
+	Bptr bp;
+	Kvp kv;
+
+	fb = o & ~(Blksz-1);	/* offset of the containing block */
+	fo = o & (Blksz-1);	/* offset within that block */
+
+	m->k[0] = Kdat;
+	PACK64(m->k+1, f->qpath);
+	PACK64(m->k+9, fb);
+
+	if(fo+n >= Blksz)	/* the write reaches the end of this block */
+		seq = 1;
+	else
+		seq = 0;
+	b = newdblk(f->mnt->root, f->qpath, seq);
+	t = nil;
+	r = f->mnt->root;
+	if(btlookup(r, m, &kv, buf, sizeof(buf))){
+		bp = unpackbp(kv.v, kv.nv);
+		if(fb < sz && (fo != 0 || n != Blksz)){	/* partial overwrite inside the file: start from the old contents */
+			t = getblk(bp, GBraw);
+			memcpy(b->buf, t->buf, Blksz);
+			dropblk(t);
+		}
+	}
+	if(fo+n > Blksz)
+		n = Blksz-fo;
+	memcpy(b->buf+fo, s, n);
+	if(t == nil){	/* nothing was copied from an old block: zero the bytes around the new data */
+		if(fo > 0)
+			memset(b->buf, 0, fo);
+		if(fo+n < Blksz)
+			memset(b->buf+fo+n, 0, Blksz-fo-n);
+	}
+	enqueue(b);
+
+	packbp(m->v, m->nv, &b->bp);
+	*ret = b->bp;
+	dropblk(b);
+	return n;
+}
+/* Return a referenced Dent for d in mnt, reusing a table entry with the same qid path or creating one; nil if the key cannot be packed. */
+static Dent*
+getdent(Mount *mnt, vlong pqid, Xdir *d)
+{
+	Dent *de;
+	char *e;
+	u32int h;
+
+	h = ihash(d->qid.path) % Ndtab;
+	lock(&mnt->dtablk);
+	for(de = mnt->dtab[h]; de != nil; de = de->next){
+		if(de->qid.path == d->qid.path){
+			ainc(&de->ref);
+			goto Out;
+		}
+	}
+
+	de = emalloc(sizeof(Dent), 1);
+	de->Xdir = *d;
+	de->ref = 1;
+	de->up = pqid;
+	de->qid = d->qid;
+	de->length = d->length;
+	de->truncrz.l = &de->trunclk;	/* truncwait sleeps on truncrz with trunclk held */
+
+	if((e = packdkey(de->buf, sizeof(de->buf), pqid, d->name)) == nil){
+		free(de);
+		de = nil;
+		goto Out;
+	}
+	de->k = de->buf;
+	de->nk = e - de->buf;
+	de->name = de->buf + 11;	/* the name starts at byte 11 of the packed key, as showkey prints it */
+	de->next = mnt->dtab[h];
+	mnt->dtab[h] = de;
+
+Out:
+	unlock(&mnt->dtablk);
+	return de;
+}
+
+/*
+ * Fill mnt->minutely and mnt->hourly with the names of the labels
+ * "<mnt->name>@minute.*" and "<mnt->name>@hour.*" in fs->snap that
+ * carry the Lauto flag.  Slots are used in scan order and wrap
+ * around after 60 and 24 entries respectively.
+ */
+static void
+loadautos(Mount *mnt)
+{
+	char pfx[128];
+	int slot, n;
+	Scan s;
+
+	slot = 0;
+	pfx[0] = Klabel;
+	n = snprint(pfx+1, sizeof(pfx)-1, "%s@minute.", mnt->name);
+	btnewscan(&s, pfx, n+1);
+	btenter(&fs->snap, &s);
+	while(btnext(&s, &s.kv)){
+		if((UNPACK32(s.kv.v+1+8) & Lauto) == 0)
+			continue;
+		/* the key is the Klabel byte followed by the label name; copy the name only */
+		memcpy(mnt->minutely[slot], s.kv.k+1, s.kv.nk-1);
+		mnt->minutely[slot][s.kv.nk-1] = 0;
+		slot = (slot+1)%60;
+	}
+	btexit(&s);
+
+	slot = 0;
+	pfx[0] = Klabel;
+	n = snprint(pfx+1, sizeof(pfx)-1, "%s@hour.", mnt->name);
+	btnewscan(&s, pfx, n+1);
+	btenter(&fs->snap, &s);
+	while(btnext(&s, &s.kv)){
+		if((UNPACK32(s.kv.v+1+8) & Lauto) == 0)
+			continue;
+		/* same layout as above */
+		memcpy(mnt->hourly[slot], s.kv.k+1, s.kv.nk-1);
+		mnt->hourly[slot][s.kv.nk-1] = 0;
+		slot = (slot+1)%24;
+	}
+	btexit(&s);
+}
+/* Return a referenced Mount for snapshot name, creating and linking one if needed; raises Enomem or Enosnap with mountlk released. */
+Mount *
+getmount(char *name)
+{
+	Mount *mnt;
+	Tree *t;
+	int flg;
+
+	if(strcmp(name, "dump") == 0){
+		ainc(&fs->snapmnt->ref);
+		return fs->snapmnt;
+	}
+
+	qlock(&fs->mountlk);
+	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next)
+		if(strcmp(name, mnt->name) == 0){
+			ainc(&mnt->ref);
+			goto Out;
+		}
+
+	mnt = mallocz(sizeof(*mnt), 1);
+	if(waserror()){
+		qunlock(&fs->mountlk);
+		free(mnt);
+		nexterror();
+	}
+	if(mnt == nil)	/* raised only now, so the handler above drops mountlk */
+		error(Enomem);
+	mnt->ref = 1;
+	snprint(mnt->name, sizeof(mnt->name), "%s", name);
+	if((t = opensnap(name, &flg)) == nil)
+		error(Enosnap);
+	loadautos(mnt);
+	mnt->flag = flg;
+	mnt->root = t;
+	mnt->next = fs->mounts;
+	asetp(&fs->mounts, mnt);
+	poperror();
+
+Out:
+	qunlock(&fs->mountlk);
+	return mnt;
+}
+/* Drop one reference to mnt; on the last one, unlink it from fs->mounts and hand it to limbo(). A nil mnt is ignored. */
+void
+clunkmount(Mount *mnt)
+{
+	Mount *me, **p;
+
+	if(mnt == nil)
+		return;
+	if(adec(&mnt->ref) == 0){	/* NOTE(review): the decrement happens before mountlk is taken -- confirm getmount cannot revive mnt in between */
+		qlock(&fs->mountlk);
+		for(p = &fs->mounts; (me = *p) != nil; p = &me->next){
+			if(me == mnt)
+				break;
+		}
+		assert(me != nil);
+		*p = me->next;
+		limbo(DFmnt, me);
+		qunlock(&fs->mountlk);
+	}
+}
+
+/* Drop a reference to de; an auth dent is decremented exactly once and freed at zero,
+ * any other dent is unlinked from mnt->dtab under dtablk when its last ref goes. */
+static void
+clunkdent(Mount *mnt, Dent *de)
+{
+	Dent *e, **pe;
+	u32int h;
+
+	if(de == nil)
+		return;
+	if(de->qid.type & QTAUTH){
+		if(adec(&de->ref) == 0)
+			free(de);
+		return;
+	}
+	lock(&mnt->dtablk);
+	if(adec(&de->ref) != 0)
+		goto Out;
+	h = ihash(de->qid.path) % Ndtab;
+	for(pe = &mnt->dtab[h]; (e = *pe) != nil; pe = &e->next)
+		if(e == de)
+			break;
+	assert(e != nil);
+	*pe = e->next;
+	free(de);
+Out:
+	unlock(&mnt->dtablk);
+}
+/* Look up fid in c's fid table; returns it with its ref count incremented, or nil if it is not present. */
+static Fid*
+getfid(Conn *c, u32int fid)
+{
+	u32int h;
+	Fid *f;
+
+	h = ihash(fid) % Nfidtab;
+	lock(&c->fidtablk[h]);
+	for(f = c->fidtab[h]; f != nil; f = f->next)
+		if(f->fid == fid){
+			ainc(&f->ref);
+			break;
+		}
+	unlock(&c->fidtablk[h]);
+	return f;	/* nil when the loop ran off the end of the chain */
+}
+/* Drop one reference to f; on the last one release its dent, dir and mount references and free it. */
+static void
+putfid(Fid *f)
+{
+	if(adec(&f->ref) != 0)
+		return;
+	clunkdent(f->mnt, f->dent);
+	clunkdent(f->mnt, f->dir);
+	clunkmount(f->mnt);
+	free(f);
+}
+/* Clone f under fid number new and insert it into c's fid table; returns nil on allocation failure or if new is already in use. */
+static Fid*
+dupfid(Conn *c, u32int new, Fid *f)
+{
+	Fid *n, *o;
+	u32int h;
+
+	h = ihash(new) % Nfidtab;
+	if((n = malloc(sizeof(Fid))) == nil)
+		return nil;
+
+	*n = *f;	/* NOTE(review): this also copies the embedded Lock and the scan and rclose pointers -- confirm that is intended */
+	n->fid = new;
+	n->ref = 2; /* one for dup, one for clunk */
+	n->mode = -1;
+	n->next = nil;
+
+	lock(&c->fidtablk[h]);
+	for(o = c->fidtab[h]; o != nil; o = o->next)
+		if(o->fid == new)
+			break;
+	if(o == nil){	/* not in use: link the copy at the head of the bucket */
+		n->next = c->fidtab[h];
+		c->fidtab[h] = n;
+	}
+	unlock(&c->fidtablk[h]);
+
+	if(o != nil){
+		fprint(2, "fid in use: %d == %d\n", o->fid, new);
+		free(n);
+		return nil;
+	}
+	if(n->mnt != nil)
+		ainc(&n->mnt->ref);
+	ainc(&n->dent->ref);	/* NOTE(review): dent and dir are dereferenced without a nil check, unlike mnt */
+	ainc(&n->dir->ref);
+	setmalloctag(n, getcallerpc(&c));
+	return n;
+}
+/* Unlink fid from c's fid table and drop the table's reference; if it has an rclose message, fill it in and return it through *ao. */
+static void
+clunkfid(Conn *c, Fid *fid, Amsg **ao)
+{
+	Fid *f, **pf;
+	u32int h;
+
+	h = ihash(fid->fid) % Nfidtab;
+	lock(&c->fidtablk[h]);
+	pf = &c->fidtab[h];
+	for(f = c->fidtab[h]; f != nil; f = f->next){
+		if(f == fid){
+			assert(adec(&f->ref) != 0);	/* the decrement is the side effect; the caller must still hold a reference */
+			*pf = f->next;
+			break;
+		}
+		pf = &f->next;
+	}
+	assert(f != nil);
+	if(f->scan != nil){
+		free(f->scan);
+		f->scan = nil;
+	}
+	if(f->rclose != nil){
+		*ao = f->rclose;
+
+		qlock(&f->dent->trunclk);
+		f->dent->trunc = 1;	/* truncwait sleeps while this flag is set */
+		qunlock(&f->dent->trunclk);
+
+		wlock(f->dent);
+		f->dent->gone = 1;
+		wunlock(f->dent);
+
+		aincl(&f->dent->ref, 1);	/* the returned message holds its own dent and mount references */
+		aincl(&f->mnt->ref, 1);
+		(*ao)->op = AOrclose;
+		(*ao)->mnt = f->mnt;
+		(*ao)->qpath = f->qpath;
+		(*ao)->off = 0;
+		(*ao)->end = f->dent->length;
+		(*ao)->dent = f->dent;
+	}
+	unlock(&c->fidtablk[h]);
+}
+
+/*
+ * Read one length-prefixed message from c->rfd.
+ *
+ * On success returns 0 and stores a freshly allocated Fmsg in *pm;
+ * the message buffer holds the complete message, including the
+ * 4-byte size field. If the initial read returns <= 0 (end of file
+ * or read error), *pm is set to nil and that value is returned.
+ * On any other failure -1 is returned with the error string set.
+ *
+ * The size field comes straight from the peer, so it is validated
+ * in both directions: it must at least cover the size field itself
+ * (otherwise sz-4 below would be negative) and must not exceed the
+ * connection's negotiated iounit.
+ */
+static int
+readmsg(Conn *c, Fmsg **pm)
+{
+	char szbuf[4];
+	int sz, n;
+	Fmsg *m;
+
+	n = readn(c->rfd, szbuf, 4);
+	if(n <= 0){
+		*pm = nil;
+		return n;
+	}
+	if(n != 4){
+		werrstr("short read: %r");
+		return -1;
+	}
+	sz = GBIT32(szbuf);
+	/* sz is signed: this also rejects values with the top bit set */
+	if(sz < 4){
+		werrstr("message size too small");
+		return -1;
+	}
+	if(sz > c->iounit){
+		werrstr("message size too large");
+		return -1;
+	}
+	if((m = malloc(sizeof(Fmsg)+sz)) == nil)
+		return -1;
+	if(readn(c->rfd, m->buf+4, sz-4) != sz-4){
+		werrstr("short read: %r");
+		free(m);
+		return -1;
+	}
+	m->conn = c;
+	m->sz = sz;
+	/* put the size field back so buf holds the whole message */
+	PBIT32(m->buf, sz);
+	*pm = m;
+	return 0;
+}
+
+/*
+ * Handle Tversion. Anything after the first '.' in the requested
+ * version string is cut off (in place) before comparison. For
+ * "9P2000" the reply msize is the smaller of the client's msize
+ * and Max9p+IOHDRSZ, which also becomes the connection's iounit,
+ * and the connection is marked versioned. Any other version is
+ * answered with "unknown" and the connection is marked unversioned.
+ */
+static void
+fsversion(Fmsg *m)
+{
+	Fcall r;
+	char *dot;
+	Conn *c;
+
+	c = m->conn;
+	memset(&r, 0, sizeof(r));
+	r.type = Rversion;
+	r.msize = Max9p + IOHDRSZ;
+
+	dot = strchr(m->version, '.');
+	if(dot != nil)
+		*dot = '\0';
+
+	if(strcmp(m->version, "9P2000") != 0){
+		r.version = "unknown";
+		c->versioned = 0;
+	}else{
+		if(m->msize < r.msize)
+			r.msize = m->msize;
+		r.version = "9P2000";
+		c->versioned = 1;
+		c->iounit = r.msize;
+	}
+	respond(m, &r);
+}
+
+/*
+ * Close the file descriptor behind an AuthRpc and free the rpc.
+ * A nil argument is ignored.
+ */
+void
+authfree(AuthRpc *auth)
+{
+	if(auth == nil)
+		return;
+	close(auth->afd);
+	auth_freerpc(auth);
+}
+
+/*
+ * Open a new rpc conversation on /mnt/factotum/rpc and start it
+ * with the server-side p9any key spec. If /mnt/factotum is not
+ * accessible, /srv/factotum is first mounted on /mnt (the result
+ * of that mount is not checked). Returns the rpc handle, or nil
+ * if the rpc file cannot be opened, the rpc cannot be allocated,
+ * or the "start" request is not answered with ARok.
+ */
+AuthRpc*
+authnew(void)
+{
+	static char *keyspec = "proto=p9any role=server";
+	AuthRpc *rpc;
+	int fd;
+
+	if(access("/mnt/factotum", 0) < 0){
+		fd = open("/srv/factotum", ORDWR);
+		if(fd >= 0)
+			mount(fd, -1, "/mnt", MBEFORE, "");
+	}
+	fd = open("/mnt/factotum/rpc", ORDWR);
+	if(fd < 0)
+		return nil;
+	rpc = auth_allocrpc(fd);
+	if(rpc == nil){
+		close(fd);
+		return nil;
+	}
+	if(auth_rpc(rpc, "start", keyspec, strlen(keyspec)) == ARok)
+		return rpc;
+	authfree(rpc);
+	return nil;
+}
+
+/*
+ * Service a read on an auth fid by issuing a "read" rpc.
+ *
+ *  - ARok:   the rpc's reply is copied into data and r->count is
+ *            set to its length; Eauthd is raised if count is too
+ *            small to hold it.
+ *  - ARdone: the authenticated user is looked up by name and its
+ *            id stored in f->uid; Enouser is raised for an unknown
+ *            name, Eauthph if no auth info is available.
+ *  - ARphase raises Eauthph; any other reply raises Eauthp.
+ *
+ * Raises Etype if the fid has no rpc attached. Errors are raised
+ * with error(), so the caller needs an error frame.
+ */
+static void
+authread(Fid *f, Fcall *r, void *data, vlong count)
+{
+	AuthInfo *info;
+	AuthRpc *rpc;
+	User *usr;
+
+	rpc = f->auth;
+	if(rpc == nil)
+		error(Etype);
+
+	switch(auth_rpc(rpc, "read", nil, 0)){
+	default:
+		error(Eauthp);
+	case ARdone:
+		info = auth_getinfo(rpc);
+		if(info == nil)
+			error(Eauthph);
+		rlock(&fs->userlk);
+		usr = name2user(info->cuid);
+		auth_freeAI(info);
+		if(usr == nil){
+			runlock(&fs->userlk);
+			error(Enouser);
+		}
+		f->uid = usr->id;
+		runlock(&fs->userlk);
+		return;
+	case ARok:
+		if(count < rpc->narg)
+			error(Eauthd);
+		memmove(data, rpc->arg, rpc->narg);
+		r->count = rpc->narg;
+		return;
+	case ARphase:
+		error(Eauthph);
+	}
+}
+
+/*
+ * Service a write on an auth fid by forwarding the data in a
+ * "write" rpc. On success r is filled in as an Rwrite for the full
+ * count. Raises Etype if the fid has no rpc attached and Ebotch if
+ * the rpc is not answered with ARok.
+ */
+static void
+authwrite(Fid *f, Fcall *r, void *data, vlong count)
+{
+	AuthRpc *rpc;
+
+	rpc = f->auth;
+	if(rpc == nil)
+		error(Etype);
+	if(auth_rpc(rpc, "write", data, count) != ARok)
+		error(Ebotch);
+	r->count = count;
+	r->type = Rwrite;
+}
+
+/*
+ * Handle Tauth: create an auth fid numbered m->afid.
+ *
+ * The fid is backed by a private QTAUTH dent with a fresh qid path
+ * (the dent is not entered into any mount's dent table and the fid
+ * has a nil mount) and by a new factotum rpc conversation.
+ * Fails with Eauth when authentication is disabled, Enone for user
+ * "none", Enomem if the dent cannot be allocated and Efid if the
+ * afid is already in use.
+ */
+static void
+fsauth(Fmsg *m)
+{
+	Dent *de;
+	Fcall r;
+	Fid f;
+
+	if(fs->noauth){
+		rerror(m, Eauth);
+		return;
+	}
+	if(strcmp(m->uname, "none") == 0){
+		rerror(m, Enone);
+		return;
+	}
+	/* mallocz with clr=1 zeroes the dent; only non-zero fields are set */
+	if((de = mallocz(sizeof(Dent), 1)) == nil){
+		rerror(m, Enomem);
+		return;
+	}
+	de->qid.type = QTAUTH;
+	de->qid.path = aincv(&fs->nextqid, 1);
+
+	memset(&f, 0, sizeof(Fid));
+	f.fid = NOFID;
+	f.mnt = nil;
+	f.qpath = de->qid.path;
+	f.pqpath = de->qid.path;
+	f.mode = -1;
+	f.iounit = m->conn->iounit;
+	f.dent = de;
+	f.dir = de;
+	f.uid = -1;
+	f.duid = -1;
+	f.dgid = -1;
+	f.dmode = 0600;
+	/*
+	 * authnew may return nil; authread and authwrite then fail
+	 * with Etype on this fid.
+	 */
+	f.auth = authnew();
+	if(dupfid(m->conn, m->afid, &f) == nil){
+		rerror(m, Efid);
+		/* nothing else refers to the rpc or the dent yet */
+		authfree(f.auth);
+		free(de);
+		return;
+	}
+	/*
+	 * NOTE(review): dupfid returns the fid with a reference for
+	 * the caller which is not dropped here, so an auth fid is
+	 * never freed after clunk. Dropping it is only safe once
+	 * clunkdent copes with QTAUTH dents that still have references.
+	 */
+	r.type = Rauth;
+	r.aqid = de->qid;
+	respond(m, &r);
+}
+
+/*
+ * Report whether user uid belongs to group gid: true when both
+ * ids resolve and either they name the same entry or uid appears
+ * in the group's member list. The user table is read-locked for
+ * the duration of the check.
+ */
+static int
+ingroup(int uid, int gid)
+{
+	User *usr, *grp;
+	int k, member;
+
+	member = 0;
+	rlock(&fs->userlk);
+	usr = uid2user(uid);
+	grp = uid2user(gid);
+	if(usr != nil && grp != nil){
+		if(usr->id == grp->id)
+			member = 1;
+		else{
+			for(k = 0; k < grp->nmemb && !member; k++)
+				if(grp->memb[k] == usr->id)
+					member = 1;
+		}
+	}
+	runlock(&fs->userlk);
+	return member;
+}
+
+/*
+ * Report whether uid leads group gid. A group whose lead field is
+ * zero is led by each of its members; otherwise only the user
+ * whose id equals the lead field is the leader. Unknown groups
+ * have no leader.
+ */
+static int
+groupleader(int uid, int gid)
+{
+	User *grp;
+	int k, isleader;
+
+	isleader = 0;
+	rlock(&fs->userlk);
+	grp = uid2user(gid);
+	if(grp != nil){
+		if(grp->lead != 0){
+			if(grp->lead == uid)
+				isleader = 1;
+		}else{
+			for(k = 0; k < grp->nmemb; k++){
+				if(grp->memb[k] == uid){
+					isleader = 1;
+					break;
+				}
+			}
+		}
+	}
+	runlock(&fs->userlk);
+	return isleader;
+}
+
+/*
+ * Translate a 9P open mode into the DM* permission bits it needs.
+ * The low four bits select the access kind (OEXEC needs read and
+ * exec); OTRUNC additionally requires write permission. Unknown
+ * access kinds contribute no bits.
+ */
+static int
+mode2bits(int req)
+{
+	int bits;
+
+	switch(req & 0xf){
+	case OREAD:
+		bits = DMREAD;
+		break;
+	case OWRITE:
+		bits = DMWRITE;
+		break;
+	case ORDWR:
+		bits = DMREAD|DMWRITE;
+		break;
+	case OEXEC:
+		bits = DMREAD|DMEXEC;
+		break;
+	default:
+		bits = 0;
+		break;
+	}
+	if(req & OTRUNC)
+		bits |= DMWRITE;
+	return bits;
+}
+
+/*
+ * Check whether fid f may access a file with mode fmode, owner
+ * fuid and group fgid using the permission bits in m.
+ * Returns 0 if access is allowed and -1 otherwise.
+ *
+ * Fids with the permit flag bypass all checks. Owner and group
+ * bits are consulted only when the fid's user is not "none". The
+ * "other" bits grant access when they cover m and either this is
+ * an exec-only request on a directory or the user is not a member
+ * of the nogroup group.
+ */
+static int
+fsaccess(Fid *f, ulong fmode, int fuid, int fgid, int m)
+{
+	if(f->permit)
+		return 0;
+	/* uid none gets only other permissions */
+	if(f->uid != noneid){
+		if(f->uid == fuid && (m & (fmode>>6)) == m)
+			return 0;
+		if(ingroup(f->uid, fgid) && (m & (fmode>>3)) == m)
+			return 0;
+	}
+	if((m & fmode) != m)
+		return -1;
+	if((fmode & DMDIR) && m == DMEXEC)
+		return 0;
+	if(!ingroup(f->uid, nogroupid))
+		return 0;
+	return -1;
+}
+
+/*
+ * Handle Tattach: bind m->fid to the root of the snapshot named by
+ * m->aname ("main" when empty). A leading '%' on the name requests
+ * a permissive attach, allowed only to members of the adm group
+ * unless the server itself runs permissive.
+ *
+ * When an afid is supplied, the auth conversation must have
+ * completed for the same user; without one, only user "none" (or
+ * any user when authentication is disabled) may attach.
+ *
+ * Fixes relative to the previous version:
+ *  - the auth fid's uid is read before its reference is dropped,
+ *    not after;
+ *  - the auth fid's reference is dropped when authread raises;
+ *  - the caller's reference on the fid returned by dupfid is
+ *    dropped once the reply is sent, as fswalk does, so attached
+ *    fids can be freed after they are clunked.
+ */
+static void
+fsattach(Fmsg *m)
+{
+	char dbuf[Kvmax], kvbuf[Kvmax];
+	char *p, *n, *aname;
+	Mount *mnt;
+	Dent *de;
+	Tree *t;
+	User *u;
+	Fcall r;
+	Xdir d;
+	Kvp kv;
+	Key dk;
+	Fid f, *af, *nf;
+	int uid, afuid;
+
+	de = nil;
+	mnt = nil;
+	if(waserror()){
+		rerror(m, errmsg());
+		goto Err;
+	}
+	aname = m->aname;
+	if(aname[0] == '%')
+		aname++;
+	if(aname[0] == '\0')
+		aname = "main";
+	if((mnt = getmount(aname)) == nil)
+		error(Enosnap);
+
+	rlock(&fs->userlk);
+	n = m->uname;
+	/*
+	 * to allow people to add themselves to the user file,
+	 * we need to force the user id to one that exists.
+	 */
+	if(permissive && strcmp(aname, "adm") == 0)
+		n = "adm";
+	if((u = name2user(n)) == nil){
+		runlock(&fs->userlk);
+		error(Enouser);
+	}
+	uid = u->id;
+	runlock(&fs->userlk);
+
+	if(m->afid != NOFID){
+		r.data = nil;
+		r.count = 0;
+		if((af = getfid(m->conn, m->afid)) == nil)
+			error(Enofid);
+		/* make sure the afid reference is dropped if authread raises */
+		if(waserror()){
+			putfid(af);
+			nexterror();
+		}
+		authread(af, &r, nil, 0);
+		/* af must not be touched once our reference is gone */
+		afuid = af->uid;
+		poperror();
+		putfid(af);
+		if(afuid != uid)
+			error(Ebadu);
+	}else if(!fs->noauth && strcmp(m->uname, "none") != 0)
+		error(Ebadu);
+
+	/*
+	 * NOTE(review): this compares the raw m->aname, while the
+	 * fromdump check below compares the '%'-stripped aname;
+	 * confirm whether "%dump" is meant to behave differently.
+	 */
+	if(strcmp(m->aname, "dump") == 0){
+		memset(&d, 0, sizeof(d));
+		filldumpdir(&d);
+	}else{
+		if((p = packdkey(dbuf, sizeof(dbuf), -1ULL, "")) == nil)
+			error(Elength);
+		dk.k = dbuf;
+		dk.nk = p - dbuf;
+		t = agetp(&mnt->root);
+		if(!btlookup(t, &dk, &kv, kvbuf, sizeof(kvbuf)))
+			error(Enosnap);
+		kv2dir(&kv, &d);
+	}
+	de = getdent(mnt, -1, &d);
+	memset(&f, 0, sizeof(Fid));
+	f.fid = NOFID;
+	f.mnt = mnt;
+	f.qpath = d.qid.path;
+	f.pqpath = d.qid.path;
+	f.mode = -1;
+	f.iounit = m->conn->iounit;
+	f.dent = de;
+	f.dir = de;
+	f.uid = uid;
+	f.duid = d.uid;
+	f.dgid = d.gid;
+	f.dmode = d.mode;
+	if(m->aname[0] == '%'){
+		if(!permissive && !ingroup(uid, admid))
+			error(Eperm);
+		f.permit = 1;
+	}
+	if(strcmp(aname, "dump") == 0)
+		f.fromdump = 1;
+	if((nf = dupfid(m->conn, m->fid, &f)) == nil)
+		error(Efid);
+
+	r.type = Rattach;
+	r.qid = d.qid;
+	respond(m, &r);
+	poperror();
+	/* drop the caller's reference; the fid table keeps its own */
+	putfid(nf);
+
+	/* the new fid took its own references on mnt and de in dupfid */
+Err:	clunkdent(mnt, de);
+	clunkmount(mnt);
+}
+
+/*
+ * Look up the super (parent pointer) entry of directory `up` in
+ * tree t. On success *name is set to the directory's name and
+ * *qpath to the qid path unpacked from that entry; the name is
+ * unpacked from the value stored in buf. Raises Esrch if the
+ * entry is missing. Always returns 1.
+ */
+static int
+findparent(Tree *t, vlong up, vlong *qpath, char **name, char *buf, int nbuf)
+{
+	char kbuf[Keymax], *end;
+	Key key;
+	Kvp kv;
+
+	end = packsuper(kbuf, sizeof(kbuf), up);
+	key.k = kbuf;
+	key.nk = end - kbuf;
+	if(!btlookup(t, &key, &kv, buf, nbuf))
+		error(Esrch);
+	*name = unpackdkey(kv.v, kv.nv, qpath);
+	return 1;
+}
+
+/*
+ * Build the directory-entry key for `name` under parent `up` into
+ * buf and point k at it. Raises Elength if the key cannot be
+ * packed into nbuf bytes, the same way fsattach treats a nil
+ * result from packdkey; without the check k->nk would be computed
+ * from a nil pointer.
+ */
+static void
+dkey(Key *k, vlong up, char *name, char *buf, int nbuf)
+{
+	char *p;
+
+	if((p = packdkey(buf, nbuf, up, name)) == nil)
+		error(Elength);
+	k->k = buf;
+	k->nk = p - buf;
+}
+
+/*
+ * Handle Twalk: walk m->nwname path elements starting from m->fid.
+ *
+ * Each element requires exec permission on the directory being
+ * walked from. The walk stops at the first element that cannot be
+ * looked up or accessed; Esrch is raised only when the very first
+ * element fails. When every element succeeds, the result is stored
+ * in newfid (a fresh copy of fid when the two numbers differ, the
+ * fid itself otherwise), whose dent, dir, mount and cached parent
+ * ownership/mode are updated to the destination.
+ */
+static void
+fswalk(Fmsg *m)
+{
+	char *name, kbuf[Maxent], kvbuf[Kvmax];
+	int duid, dgid, dmode;
+	vlong up, upup, prev;
+	Dent *dent, *dir;
+	Fid *o, *f;
+	Mount *mnt;
+	Amsg *ao;
+	Tree *t;
+	Fcall r;
+	Xdir d;
+	Kvp kv;
+	Key k;
+	int i;
+
+	if((o = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	if(waserror()){
+		rerror(m, errmsg());
+		putfid(o);
+		return;
+	}
+	if(o->mode != -1)
+		error(Einuse);
+	/*
+	 * NOTE(review): fids created by fsauth have a nil mnt, which is
+	 * dereferenced here; also, other handlers read the root with
+	 * agetp(&mnt->root) rather than directly.
+	 */
+	t = o->mnt->root;
+	mnt = o->mnt;
+	up = o->pqpath;
+	prev = o->qpath;
+	/* snapshot the starting directory entry under its read lock */
+	rlock(o->dent);
+	d = *o->dent;
+	runlock(o->dent);
+	duid = d.uid;
+	dgid = d.gid;
+	dmode = d.mode;
+	r.type = Rwalk;
+	for(i = 0; i < m->nwname; i++){
+		name = m->wname[i];
+		if(strlen(name) > Maxname)
+			error(Elength);
+		if(fsaccess(o, d.mode, d.uid, d.gid, DMEXEC) != 0)
+			break;
+		if(strcmp(name, "..") == 0){
+			/* ".." from the root of a dump attach lands in the dump directory */
+			if(up == -1 && o->fromdump){
+				mnt = fs->snapmnt;
+				filldumpdir(&d);
+				prev = -1ULL;
+				up = -1ULL;
+				r.wqid[i] = d.qid;
+				continue;
+			}
+			findparent(t, up, &prev, &name, kbuf, sizeof(kbuf));
+		}else if(d.qid.path == Qdump){
+			/*
+			 * walking out of the dump directory: the element names a
+			 * snapshot, whose root is looked up with the empty name.
+			 * NOTE(review): the getmount result is not checked for nil
+			 * (fsattach treats nil as "no such snapshot"), and no
+			 * matching clunkmount for this lookup is visible here.
+			 */
+			mnt = getmount(m->wname[i]);
+			name = "";
+			prev = -1ULL;
+			t = mnt->root;
+		}
+		up = prev;
+		/* remember the ownership/mode of the directory being walked from */
+		duid = d.uid;
+		dgid = d.gid;
+		dmode = d.mode;
+		dkey(&k, prev, name, kbuf, sizeof(kbuf));
+		if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf)))
+			break;
+		kv2dir(&kv, &d);
+		prev = d.qid.path;
+		r.wqid[i] = d.qid;
+	}
+	r.nwqid = i;
+	if(i == 0 && m->nwname != 0)
+		error(Esrch);
+	f = o;
+	if(m->fid != m->newfid && i == m->nwname){
+		if((f = dupfid(m->conn, m->newfid, o)) == nil)
+			error(Efid);
+		/*
+		 * NOTE(review): o's reference is dropped here, but the outer
+		 * error handler above still calls putfid(o) if anything below
+		 * raises, and never putfid(f).
+		 */
+		putfid(o);
+	}
+	if(i > 0 && i == m->nwname){
+		lock(f);
+		ao = nil;
+		if(waserror()){
+			/* undo the table entry made for a freshly duplicated fid */
+			if(f != o)
+				clunkfid(m->conn, f, &ao);
+			assert(ao == nil);
+			unlock(f);
+			nexterror();
+		}
+		if(up == -1ULL){
+			/* the root contains itself, I guess */
+			dent = getdent(mnt, up, &d);
+			dir = getdent(mnt, up, &d);
+		}else{
+			dent = getdent(mnt, up, &d);
+			/* re-read the parent directory's own entry to get its dent */
+			findparent(t, up, &upup, &name, kbuf, sizeof(kbuf));
+			dkey(&k, upup, name, kbuf, sizeof(kbuf));
+			if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf)))
+				broke("missing parent");
+			kv2dir(&kv, &d);
+			dir = getdent(mnt, upup, &d);
+		}
+		/* swap the fid's old dent/dir (and mount, if it changed) for the new ones */
+		clunkdent(f->mnt, f->dent);
+		clunkdent(f->mnt, f->dir);
+		if(mnt != f->mnt){
+			clunkmount(f->mnt);
+			ainc(&mnt->ref);
+			f->mnt = mnt;
+		}
+		f->qpath = r.wqid[i-1].path;
+		f->pqpath = up;
+		f->dent = dent;
+		f->dir = dir;
+		f->duid = duid;
+		f->dgid = dgid;
+		f->dmode = dmode;
+		poperror();
+		unlock(f);
+	}
+	respond(m, &r);
+	poperror();
+	putfid(f);
+}
+
+/*
+ * Handle Tstat: marshal the fid's directory entry into a stat
+ * buffer and return it. Fails with Enofid for an unknown fid and
+ * Efs if the entry cannot be marshalled.
+ *
+ * The dent's read lock is released before the error is raised;
+ * previously a marshalling failure left the dent read-locked.
+ */
+static void
+fsstat(Fmsg *m)
+{
+	char buf[STATMAX];
+	Fcall r;
+	Fid *f;
+	int n;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	if(waserror()){
+		rerror(m, errmsg());
+		putfid(f);
+		return;
+	}
+	rlock(f->dent);
+	n = dir2statbuf(f->dent, buf, sizeof(buf));
+	runlock(f->dent);
+	if(n == -1)
+		error(Efs);
+	r.type = Rstat;
+	r.stat = (uchar*)buf;
+	r.nstat = n;
+	respond(m, &r);
+	poperror();
+	putfid(f);
+}
+
+/*
+ * Handle Twstat: validate the requested field changes against the
+ * current directory entry, check permissions, and apply them.
+ *
+ * A rename is written as a clobber of the old key plus an insert of
+ * the new one (and, for directories, a new super entry), followed
+ * by a touch of the parent directory; every other change is sent
+ * as a single Owstat message whose value is an op byte followed by
+ * the packed fields. Shrinking a file additionally hands an
+ * AOclear message back through *ao for the truncated range.
+ * *ao is nil on return unless such a message was produced.
+ */
+static void
+fswstat(Fmsg *m, int id, Amsg **ao)
+{
+	char rnbuf[Kvmax], opbuf[Kvmax], upbuf[Upksz];
+	char *p, *e, strs[65535];
+	int op, nm, rename;
+	vlong oldlen;
+	Qid old;
+	Fcall r;
+	Dent *de;
+	Msg mb[4];
+	Xdir n;
+	Dir d;
+	Tree *t;
+	Fid *f;
+	Key k;
+	User *u;
+
+	*ao = nil;
+	rename = 0;
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	de = f->dent;
+	truncwait(de, id);
+	wlock(de);
+	if(waserror()){
+		/*
+		 * NOTE(review): if the error is raised after the AOclear
+		 * setup below, the message is freed here but the extra
+		 * dent/mount references and de->trunc are left as set.
+		 */
+		rerror(m, errmsg());
+		free(*ao);
+		*ao = nil;
+		goto Err;
+	}
+	if(de->gone)
+		error(Ephase);
+	if((de->qid.type & QTAUTH) || (de->qid.path & Qdump))
+		error(Emode);
+	if(convM2D(m->stat, m->nstat, &d, strs) <= BIT16SZ)
+		error(Edir);
+
+	t = agetp(&f->mnt->root);
+	/* n is the updated copy of the entry; p builds the Owstat value after the op byte */
+	n = de->Xdir;
+	n.qid.vers++;
+	p = opbuf+1;
+	op = 0;
+
+	/* check validity of updated fields and construct Owstat message */
+	if(d.qid.path != ~0 || d.qid.vers != ~0){
+		if(d.qid.path != de->qid.path)
+			error(Ewstatp);
+		if(d.qid.vers != de->qid.vers)
+			error(Ewstatv);
+	}
+	if(*d.name != '\0'){
+		if(strlen(d.name) > Maxname)
+			error(Elength);
+		if(strcmp(d.name, de->name) != 0){
+			rename = 1;
+			if((e = okname(d.name)) != nil)
+				error(e);
+			/* the new name must not already exist in the same directory */
+			if(walk1(t, f->dent->up, d.name, &old, &oldlen) == 0)
+				error(Eexist);
+			/* NOTE(review): d.name points into the local strs buffer */
+			n.name = d.name;
+		}
+	}
+	if(d.length != ~0){
+		if(d.length < 0)
+			error(Ewstatl);
+		if(d.length != de->length){
+			if(d.length < de->length){
+				/* shrinking: queue a clear of the range being cut off */
+				if((*ao = malloc(sizeof(Amsg))) == nil)
+					error(Enomem);
+				qlock(&de->trunclk);
+				de->trunc = 1;
+				qunlock(&de->trunclk);
+				aincl(&de->ref, 1);
+				aincl(&f->mnt->ref, 1);
+				(*ao)->op = AOclear;
+				(*ao)->mnt = f->mnt;
+				(*ao)->qpath = f->qpath;
+				(*ao)->off = d.length;
+				(*ao)->end = f->dent->length;
+				(*ao)->dent = de;
+			}
+			/*
+			 * NOTE(review): the in-memory length is updated here,
+			 * before the permission checks further down.
+			 */
+			de->length = d.length;
+			n.length = d.length;
+			op |= Owsize;
+			PACK64(p, n.length);
+			p += 8;
+		}
+	}
+	if(d.mode != ~0){
+		if((d.mode^de->mode) & DMDIR)
+			error(Ewstatd);
+		if(d.mode & ~(DMDIR|DMAPPEND|DMEXCL|DMTMP|0777))
+			error(Ewstatb);
+		if(d.mode != de->mode){
+			n.mode = d.mode;
+			/* the qid type mirrors the top byte of the mode */
+			n.qid.type = d.mode>>24;
+			op |= Owmode;
+			PACK32(p, n.mode);
+			p += 4;
+		}
+	}
+	if(d.mtime != ~0){
+		n.mtime = d.mtime*Nsec;
+		if(n.mtime != de->mtime){
+			op |= Owmtime;
+			PACK64(p, n.mtime);
+			p += 8;
+		}
+	}
+	if(*d.uid != '\0'){
+		if(strlen(d.uid) > Maxuname)
+			error(Elength);
+		rlock(&fs->userlk);
+		u = name2user(d.uid);
+		if(u == nil){
+			runlock(&fs->userlk);
+			error(Enouser);
+		}
+		n.uid = u->id;
+		runlock(&fs->userlk);
+		if(n.uid != de->uid){
+			op |= Owuid;
+			PACK32(p, n.uid);
+			p += 4;
+		}
+	}
+	if(*d.gid != '\0'){
+		if(strlen(d.gid) > Maxuname)
+			error(Elength);
+		rlock(&fs->userlk);
+		u = name2user(d.gid);
+		if(u == nil){
+			runlock(&fs->userlk);
+			error(Enogrp);
+		}
+		n.gid = u->id;
+		runlock(&fs->userlk);
+		if(n.gid != de->gid){
+			op |= Owgid;
+			PACK32(p, n.gid);
+			p += 4;
+		}
+	}
+	/* the modifying user is always recorded */
+	op |= Owmuid;
+	n.muid = f->uid;
+	PACK32(p, n.muid);
+	p += 4;
+
+	/* check permissions */
+	if(rename)
+		if(fsaccess(f, f->dmode, f->duid, f->dgid, DMWRITE) == -1)
+			error(Eperm);
+	if(op & Owsize)
+		if(fsaccess(f, de->mode, de->uid, de->gid, DMWRITE) == -1)
+			error(Eperm);
+	if(op & (Owmode|Owmtime))
+		if(!f->permit && f->uid != de->uid && !groupleader(f->uid, de->gid))
+			error(Ewstato);
+	if(op & Owuid)
+		if(!f->permit)
+			error(Ewstatu);
+	if(op & Owgid)
+		if(!f->permit
+		&& !(f->uid == de->uid && ingroup(f->uid, n.gid))
+		&& !(groupleader(f->uid, de->gid) && groupleader(f->uid, n.gid)))
+			error(Ewstatg);
+
+	/* update directory entry */
+	nm = 0;
+	if(rename && !de->gone){
+		mb[nm].op = Oclobber;
+		mb[nm].Key = de->Key;
+		mb[nm].v = nil;
+		mb[nm].nv = 0;
+		nm++;
+	
+		mb[nm].op = Oinsert;
+		dir2kv(f->pqpath, &n, &mb[nm], rnbuf, sizeof(rnbuf));
+		k = mb[nm].Key;
+		nm++;
+
+		if(de->qid.type & QTDIR){
+			/* a renamed directory's super entry must point at its new key */
+			packsuper(upbuf, sizeof(upbuf), f->qpath);
+			mb[nm].op = Oinsert;
+			mb[nm].k = upbuf;
+			mb[nm].nk = Upksz;
+			mb[nm].v = mb[nm-1].k;
+			mb[nm].nv = mb[nm-1].nk;
+			nm++;
+		}
+		touch(f->dir, &mb[nm++]);
+	}else{
+		opbuf[0] = op;
+		mb[nm].op = Owstat;
+		mb[nm].Key = de->Key;
+		mb[nm].v = opbuf;
+		mb[nm].nv = p - opbuf;
+		nm++;
+	}
+	assert(nm <= nelem(mb));
+	upsert(f->mnt, mb, nm);
+
+	/* commit the new fields (and, for a rename, the new key) to the in-memory dent */
+	de->Xdir = n;
+	if(rename)
+		cpkey(de, &k, de->buf, sizeof(de->buf));
+
+	r.type = Rwstat;
+	respond(m, &r);
+	poperror();
+
+Err:	wunlock(de);
+	putfid(f);
+}
+
+
+/*
+ * Handle Tclunk: remove the fid from the connection's table and
+ * reply. Any remove-on-close work produced by clunkfid is passed
+ * back to the caller through *ao. The Fid is released when the
+ * last reference is dropped.
+ */
+static void
+fsclunk(Fmsg *m, Amsg **ao)
+{
+	Fcall r;
+	Fid *f;
+
+	f = getfid(m->conn, m->fid);
+	if(f == nil){
+		rerror(m, Enofid);
+		return;
+	}
+
+	lock(f);
+	clunkfid(m->conn, f, ao);
+	unlock(f);
+
+	r.type = Rclunk;
+	respond(m, &r);
+	putfid(f);
+}
+
+/*
+ * Handle Tcreate: create file or directory m->name inside the
+ * directory that m->fid refers to, then turn the fid into an open
+ * fid on the new file.
+ *
+ * The new entry gets a fresh qid path, the creating user as owner,
+ * the parent directory's group, and permissions masked by the
+ * parent's. A directory additionally gets a super entry mapping it
+ * back to its parent. Fails with Einuse if the fid is already
+ * open, Eexist if the name exists and Eperm without write
+ * permission on the directory.
+ */
+static void
+fscreate(Fmsg *m)
+{
+	char *p, *e, buf[Kvmax], upkbuf[Keymax], upvbuf[Inlmax];
+	int nm, duid, dgid, dmode;
+	Dent *de;
+	vlong oldlen;
+	Qid old;
+	Fcall r;
+	Msg mb[3];
+	Fid *f;
+	Xdir d;
+
+	if((e = okname(m->name)) != nil){
+		rerror(m, e);
+		return;
+	}
+	if(m->perm & (DMMOUNT|DMAUTH)){
+		rerror(m, Ebotch);
+		return;
+	}
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	lock(f);
+
+	if(waserror()){
+		rerror(m, errmsg());
+		goto Err;
+		
+	}
+	/* explicit failures below reply themselves and leave through Out to pop the frame */
+	if(f->mode != -1){
+		rerror(m, Einuse);
+		goto Out;
+	}
+	de = f->dent;
+	if(walk1(f->mnt->root, f->qpath, m->name, &old, &oldlen) == 0){
+		rerror(m, Eexist);
+		goto Out;
+	}
+
+	rlock(de);
+	if(fsaccess(f, de->mode, de->uid, de->gid, DMWRITE) == -1){
+		rerror(m, Eperm);
+		runlock(de);
+		goto Out;
+	}
+	/* the directory's ownership/mode become the fid's cached parent attributes */
+	duid = de->uid;
+	dgid = de->gid;
+	dmode = de->mode;
+	runlock(de);
+
+	nm = 0;
+	d.qid.type = 0;
+	if(m->perm & DMDIR)
+		d.qid.type |= QTDIR;
+	if(m->perm & DMAPPEND)
+		d.qid.type |= QTAPPEND;
+	if(m->perm & DMEXCL)
+		d.qid.type |= QTEXCL;
+	if(m->perm & DMTMP)
+		d.qid.type |= QTTMP;
+	d.qid.path = aincv(&fs->nextqid, 1);
+	d.qid.vers = 0;
+	d.mode = m->perm;
+	/*
+	 * mask the requested permissions with the directory's.
+	 * NOTE(review): de->mode is read here after the read lock was
+	 * dropped; dmode holds the value captured under the lock.
+	 */
+	if(m->perm & DMDIR)
+		d.mode &= ~0777 | de->mode & 0777;
+	else
+		d.mode &= ~0666 | de->mode & 0666;
+	d.name = m->name;
+	d.atime = nsec();
+	d.mtime = d.atime;
+	d.length = 0;
+	d.uid = f->uid;
+	d.gid = dgid;
+	d.muid = f->uid;
+
+	mb[nm].op = Oinsert;
+	dir2kv(f->qpath, &d, &mb[nm], buf, sizeof(buf));
+	nm++;
+
+	if(m->perm & DMDIR){
+		/* super entry: new directory's qid path -> its key in the parent */
+		mb[nm].op = Oinsert;
+		if((p = packsuper(upkbuf, sizeof(upkbuf), d.qid.path)) == nil)
+			sysfatal("ream: pack super");
+		mb[nm].k = upkbuf;
+		mb[nm].nk = p - upkbuf;
+		if((p = packdkey(upvbuf, sizeof(upvbuf), f->qpath, d.name)) == nil)
+			sysfatal("ream: pack super");
+		mb[nm].v = upvbuf;
+		mb[nm].nv = p - upvbuf;
+		nm++;
+	}
+	touch(f->dent, &mb[nm++]);
+	assert(nm <= nelem(mb));
+	upsert(f->mnt, mb, nm);
+
+	/*
+	 * repoint the fid at the newly created file.
+	 * NOTE(review): f->dir is left unchanged although the fid's
+	 * parent is now the directory it used to refer to; confirm
+	 * whether later touch(f->dir, ...) calls hit the right dent.
+	 */
+	de = getdent(f->mnt, f->qpath, &d);
+	clunkdent(f->mnt, f->dent);
+	f->mode = mode2bits(m->mode);
+	f->pqpath = f->qpath;
+	f->qpath = d.qid.path;
+	f->dent = de;
+	f->duid = duid;
+	f->dgid = dgid;
+	f->dmode = dmode;
+	/* preallocate the message clunkfid hands out for remove-on-close */
+	if(m->mode & ORCLOSE)
+		f->rclose = emalloc(sizeof(Amsg), 1);
+
+	r.type = Rcreate;
+	r.qid = d.qid;
+	r.iounit = f->iounit;
+	respond(m, &r);
+Out:	poperror();
+Err:	unlock(f);
+	putfid(f);
+	return;
+}
+
+/*
+ * Decide whether the file behind f may be deleted. Plain files
+ * always may; a directory may only if a scan over its entry
+ * prefix yields nothing. Returns nil when deletion is allowed and
+ * Enempty when the directory still has an entry.
+ */
+static char*
+candelete(Fid *f)
+{
+	char pfx[Dpfxsz], *err;
+	Tree *t;
+	Scan s;
+
+	if((f->dent->qid.type & QTDIR) == 0)
+		return nil;
+
+	t = agetp(&f->mnt->root);
+	packdkey(pfx, sizeof(pfx), f->qpath, nil);
+	btnewscan(&s, pfx, sizeof(pfx));
+	btenter(t, &s);
+	/* a single hit is enough to know the directory is not empty */
+	err = nil;
+	if(btnext(&s, &s.kv))
+		err = Enempty;
+	btexit(&s);
+	return err;
+}
+
+/*
+ * Handle Tremove: clunk the fid and delete the file it refers to.
+ *
+ * The fid is removed from the table first, so it is gone whether or
+ * not the delete succeeds. The delete itself requires that the
+ * entry still exists in the tree, that a directory is empty, and
+ * write permission on the parent directory. For a plain file an
+ * AOclear message covering the file's whole extent is returned
+ * through *ao; for a directory its super entry is clobbered
+ * instead. On failure *ao is freed and reset to nil.
+ */
+static void
+fsremove(Fmsg *m, int id, Amsg **ao)
+{
+	char *e, buf[Kvmax];
+	Fcall r;
+	int nm;
+	Msg mb[3];
+	Tree *t;
+	Kvp kv;
+	Fid *f;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	/* NOTE(review): read directly; other handlers use agetp(&f->mnt->root) */
+	t = f->mnt->root;
+	nm = 0;
+	*ao = nil;
+	lock(f);
+	clunkfid(m->conn, f, ao);
+	/* rclose files are getting removed here anyways */
+	if(*ao != nil)
+		f->rclose = nil;
+	unlock(f);
+
+	truncwait(f->dent, id);
+	wlock(f->dent);
+	if(waserror()){
+		rerror(m, errmsg());
+		free(*ao);
+		*ao = nil;
+		goto Err;
+	}
+	if(f->dent->gone)
+		error(Ephase);
+	/*
+	 * we need a double check that the file is in the tree
+	 * here, because the walk to the fid is done in a reader
+	 * proc that can look it up in a stale version of the
+	 * tree, while we clunk the dent in the mutator proc.
+	 *
+	 * this means we can theoretically get some deletions
+	 * of files that are already gone.
+	 */
+	if(!btlookup(t, &f->dent->Key, &kv, buf, sizeof(buf)))
+		error(Ephase);
+	if((e = candelete(f)) != nil)
+		error(e);
+	if(fsaccess(f, f->dmode, f->duid, f->dgid, DMWRITE) == -1)
+		error(Eperm);
+	lock(f);
+	mb[nm].op = Odelete;
+	mb[nm].k = f->dent->k;
+	mb[nm].nk = f->dent->nk;
+	mb[nm].v = "\0";
+	mb[nm].nv = 1;
+	nm++;
+	unlock(f);
+
+	if(f->dent->qid.type & QTDIR){
+		/* directories also lose their super (parent pointer) entry */
+		packsuper(buf, sizeof(buf), f->qpath);
+		mb[nm].op = Oclobber;
+		mb[nm].k = buf;
+		mb[nm].nk = Upksz;
+		mb[nm].nv = 0;
+		nm++;
+	}else{
+		/* reuse the rclose message if clunkfid produced one */
+		if(*ao == nil)
+			*ao = emalloc(sizeof(Amsg), 1);
+		aincl(&f->mnt->ref, 1);
+		(*ao)->op = AOclear;
+		(*ao)->mnt = f->mnt;
+		(*ao)->qpath = f->qpath;
+		(*ao)->off = 0;
+		(*ao)->end = f->dent->length;
+		(*ao)->dent = nil;
+	}
+	touch(f->dir, &mb[nm++]);
+	assert(nm <= nelem(mb));
+	upsert(f->mnt, mb, nm);
+	f->dent->gone = 1;
+	r.type = Rremove;
+	respond(m, &r);
+	poperror();
+Err:
+	wunlock(f->dent);
+	putfid(f);
+	return;
+}
+
+/*
+ * Handle Topen: check that the fid may be opened with m->mode and
+ * mark it open.
+ *
+ * The file's current entry is re-read from the tree (or synthesized
+ * for the dump directory) and used for the permission check. With
+ * OTRUNC on a non-append file the length is reset to zero via an
+ * Owstat message and an AOclear message for the old extent is
+ * returned through *ao. With ORCLOSE the fid gets a preallocated
+ * message that clunkfid later hands out.
+ *
+ * runread calls this with ao == nil; runfs routes opens with
+ * OTRUNC or ORCLOSE to the mutator, which passes a real pointer.
+ */
+static void
+fsopen(Fmsg *m, int id, Amsg **ao)
+{
+	char *p, *e, buf[Kvmax];
+	int mbits;
+	Tree *t;
+	Fcall r;
+	Xdir d;
+	Fid *f;
+	Kvp kv;
+	Msg mb;
+
+	mbits = mode2bits(m->mode);
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	if(waserror()){
+		rerror(m, errmsg());
+		putfid(f);
+		return;
+	}
+	if(m->mode & OTRUNC)
+		truncwait(f->dent, id);
+	t = agetp(&f->mnt->root);
+	if((f->qpath & Qdump) != 0){
+		filldumpdir(&d);
+	}else{
+		if(!btlookup(t, f->dent, &kv, buf, sizeof(buf)))
+			error(Esrch);
+		kv2dir(&kv, &d);
+	}
+	wlock(f->dent);
+	if(waserror()){
+		wunlock(f->dent);
+		nexterror();
+	}
+	if(f->dent->gone)
+		error(Ephase);
+	/* an exclusive-use file can be opened only while this fid holds the sole dent reference */
+	if(f->dent->qid.type & QTEXCL)
+	if(f->dent->ref != 1)
+		error(Elocked);
+	if(m->mode & ORCLOSE)
+		if((e = candelete(f)) != nil)
+			error(e);
+	if(fsaccess(f, d.mode, d.uid, d.gid, mbits) == -1)
+		error(Eperm);
+	/* refresh the cached length from the entry just read */
+	f->dent->length = d.length;
+	poperror();
+	wunlock(f->dent);
+	r.type = Ropen;
+	r.qid = d.qid;
+	r.iounit = f->iounit;
+
+	lock(f);
+	if(f->mode != -1){
+		unlock(f);
+		error(Einuse);
+	}
+	if((m->mode & OTRUNC) && !(f->dent->mode & DMAPPEND)){
+		wlock(f->dent);
+
+		/*
+		 * NOTE(review): this handler releases the dent lock but not
+		 * the fid lock taken above before re-raising; the outer
+		 * handler does not unlock f either.
+		 */
+		if(waserror()){
+			wunlock(f->dent);
+			free(*ao);
+			*ao = nil;
+			nexterror();
+		}
+		*ao = emalloc(sizeof(Amsg), 1);
+		qlock(&f->dent->trunclk);
+		f->dent->trunc = 1;
+		qunlock(&f->dent->trunclk);
+		/* references owned by the Amsg handed to the caller */
+		aincl(&f->dent->ref, 1);
+		aincl(&f->mnt->ref, 1);
+		(*ao)->op = AOclear;
+		(*ao)->mnt = f->mnt;
+		(*ao)->qpath = f->qpath;
+		(*ao)->off = 0;
+		(*ao)->end = f->dent->length;
+		(*ao)->dent = f->dent;
+
+		f->dent->muid = f->uid;
+		f->dent->qid.vers++;
+		f->dent->length = 0;
+
+		/* Owstat value: op byte, new size (0), modifying uid */
+		mb.op = Owstat;
+		p = buf;
+		p[0] = Owsize|Owmuid;	p += 1;
+		PACK64(p, 0);		p += 8;
+		PACK32(p, f->uid);	p += 4;
+		mb.k = f->dent->k;
+		mb.nk = f->dent->nk;
+		mb.v = buf;
+		mb.nv = p - buf;
+
+		upsert(f->mnt, &mb, 1);
+		wunlock(f->dent);
+		poperror();
+	}
+	f->mode = mode2bits(m->mode);
+	if(m->mode & ORCLOSE)
+		f->rclose = emalloc(sizeof(Amsg), 1);
+	unlock(f);
+	poperror();
+	respond(m, &r);
+	putfid(f);
+}
+
+/*
+ * Read the dump directory: emit one stat entry per snapshot label
+ * found by scanning the snapshot tree for keys with the Klabel
+ * prefix. The scan state lives on the fid between reads; a read at
+ * offset 0 restarts it, and a read at any offset other than the
+ * scan's recorded one raises Edscan. An entry that does not fit in
+ * the remaining space is remembered (overflow) and emitted first
+ * by the next read. r->count is set to the bytes produced.
+ */
+static void
+readsnap(Fmsg *m, Fid *f, Fcall *r)
+{
+	char pfx[1], *out;
+	int left, sz;
+	Scan *s;
+	Xdir d;
+
+	s = f->scan;
+	if(s != nil && s->offset != 0 && s->offset != m->offset)
+		error(Edscan);
+	if(s == nil || m->offset == 0){
+		/* start a fresh scan, replacing any previous one */
+		s = emalloc(sizeof(Scan), 1);
+		pfx[0] = Klabel;
+		btnewscan(s, pfx, 1);
+		lock(f);
+		if(f->scan != nil)
+			free(f->scan);
+		f->scan = s;
+		unlock(f);
+	}
+	if(s->donescan){
+		r->count = 0;
+		return;
+	}
+	out = r->data;
+	left = m->count;
+	filldumpdir(&d);
+	if(s->overflow){
+		/* emit the entry that did not fit last time */
+		memcpy(d.name, s->kv.k+1, s->kv.nk-1);
+		d.name[s->kv.nk-1] = 0;
+		d.qid.path = UNPACK64(s->kv.v + 1);
+		sz = dir2statbuf(&d, out, left);
+		if(sz == -1){
+			r->count = 0;
+			return;
+		}
+		s->overflow = 0;
+		out += sz;
+		left -= sz;
+	}
+	btenter(&fs->snap, s);
+	while(btnext(s, &s->kv)){
+		memcpy(d.name, s->kv.k+1, s->kv.nk-1);
+		d.name[s->kv.nk-1] = 0;
+		d.qid.path = UNPACK64(s->kv.v + 1);
+		sz = dir2statbuf(&d, out, left);
+		if(sz == -1){
+			s->overflow = 1;
+			break;
+		}
+		out += sz;
+		left -= sz;
+	}
+	btexit(s);
+	r->count = out - r->data;
+}
+
+/*
+ * Read a directory: emit stat entries for the keys under the
+ * directory's entry prefix in the fid's mount tree. The scan state
+ * lives on the fid between reads; a read at offset 0 restarts it,
+ * and a read at any offset other than the scan's recorded one
+ * raises Edscan. An entry that does not fit in the remaining space
+ * is remembered (overflow) and emitted first by the next read.
+ * r->count is set to the bytes produced.
+ */
+static void
+readdir(Fmsg *m, Fid *f, Fcall *r)
+{
+	char pfx[Dpfxsz], *out;
+	int left, sz;
+	Tree *t;
+	Scan *s;
+
+	s = f->scan;
+	t = agetp(&f->mnt->root);
+	if(s != nil && s->offset != 0 && s->offset != m->offset)
+		error(Edscan);
+	if(s == nil || m->offset == 0){
+		/* start a fresh scan, replacing any previous one */
+		s = emalloc(sizeof(Scan), 1);
+		packdkey(pfx, sizeof(pfx), f->qpath, nil);
+		btnewscan(s, pfx, sizeof(pfx));
+		lock(f);
+		if(f->scan != nil)
+			free(f->scan);
+		f->scan = s;
+		unlock(f);
+	}
+	if(s->donescan){
+		r->count = 0;
+		return;
+	}
+	out = r->data;
+	left = m->count;
+	if(s->overflow){
+		/* emit the entry that did not fit last time */
+		sz = kv2statbuf(&s->kv, out, left);
+		if(sz == -1){
+			r->count = 0;
+			return;
+		}
+		s->overflow = 0;
+		out += sz;
+		left -= sz;
+	}
+	btenter(t, s);
+	while(btnext(s, &s->kv)){
+		sz = kv2statbuf(&s->kv, out, left);
+		if(sz == -1){
+			s->overflow = 1;
+			break;
+		}
+		out += sz;
+		left -= sz;
+	}
+	btexit(s);
+	r->count = out - r->data;
+}
+
+/*
+ * Read file data for a Tread into r->data, adding the number of
+ * bytes read to r->count. The request is clipped to the file's
+ * current length, and a read starting beyond the end produces
+ * nothing. The dent is read-locked while data is fetched.
+ */
+static void
+readfile(Fmsg *m, Fid *f, Fcall *r)
+{
+	vlong got, left, off;
+	char *out;
+	Dent *de;
+	Tree *t;
+
+	de = f->dent;
+	rlock(de);
+	if(m->offset <= de->length){
+		out = r->data;
+		left = m->count;
+		off = m->offset;
+		t = agetp(&f->mnt->root);
+		if(m->offset + m->count > de->length)
+			left = de->length - m->offset;
+		while(left != 0){
+			got = readb(t, f, out, off, left, de->length);
+			r->count += got;
+			if(got == 0)
+				break;
+			out += got;
+			off += got;
+			left -= got;
+		}
+	}
+	runlock(de);
+}
+
+/*
+ * Handle Tread: allocate a reply buffer of m->count bytes and
+ * dispatch on what the fid refers to -- an auth file, the dump
+ * directory, an ordinary directory, or a plain file. The buffer
+ * is freed after the reply is sent, or by the error handler.
+ */
+static void
+fsread(Fmsg *m)
+{
+	Fcall r;
+	Fid *f;
+
+	f = getfid(m->conn, m->fid);
+	if(f == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	/* r.data must be valid before the handler below can run */
+	r.data = nil;
+	r.count = 0;
+	r.type = Rread;
+	if(waserror()){
+		rerror(m, errmsg());
+		free(r.data);
+		putfid(f);
+		return;
+	}
+	r.data = emalloc(m->count, 0);
+	if(f->dent->qid.type & QTAUTH){
+		authread(f, &r, r.data, m->count);
+	}else if(f->dent->qid.path == Qdump){
+		readsnap(m, f, &r);
+	}else if(f->dent->qid.type & QTDIR){
+		readdir(m, f, &r);
+	}else{
+		readfile(m, f, &r);
+	}
+	respond(m, &r);
+	free(r.data);
+	poperror();
+	putfid(f);
+}
+
+/*
+ * Handle Twrite. Writes to auth fids are forwarded to authwrite.
+ * For ordinary files the data is written out block by block, each
+ * block producing one Oinsert message, followed by a single Owstat
+ * message that updates mtime, muid and -- if the file grew -- the
+ * size. All messages are applied with one upsert. Append-only
+ * files are always written at their current end, whatever offset
+ * was requested.
+ *
+ * Fix: the in-memory length is now set to `o`, the end of the data
+ * actually written and the value packed into the Owsize field.
+ * It was previously m->offset+m->count, which equals `o` for
+ * ordinary files but is wrong for append-only files, where the
+ * write starts at the old length rather than at m->offset.
+ *
+ * NOTE(review): an unopened fid has mode -1, which passes the
+ * DMWRITE test below. Auth fids are written in that state, so the
+ * check cannot simply be tightened; confirm whether ordinary
+ * unopened fids should be rejected here.
+ */
+static void
+fswrite(Fmsg *m, int id)
+{
+	char sbuf[Wstatmax], kbuf[Max9p/Blksz+2][Offksz], vbuf[Max9p/Blksz+2][Ptrsz];
+	Bptr bp[Max9p/Blksz + 2];
+	Msg kv[Max9p/Blksz + 2];
+	vlong n, o, c, w;
+	int i, j;
+	char *p;
+	Fcall r;
+	Tree *t;
+	Fid *f;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	if(!(f->mode & DMWRITE)){
+		rerror(m, Einuse);
+		putfid(f);
+		return;
+	}
+	truncwait(f->dent, id);
+	wlock(f->dent);
+	if(waserror()){
+		rerror(m, errmsg());
+		wunlock(f->dent);
+		putfid(f);
+		return;
+	}
+	if(f->dent->gone)
+		error(Ephase);
+	if(f->dent->qid.type & QTAUTH){
+		authwrite(f, &r, m->data, m->count);
+		goto Out;
+	}
+
+	w = 0;
+	p = m->data;
+	o = m->offset;
+	c = m->count;
+	if(f->dent->mode & DMAPPEND)
+		o = f->dent->length;
+	t = agetp(&f->mnt->root);
+	for(i = 0; c != 0; i++){
+		assert(i < nelem(kv));
+		/* only the first block may start at an unaligned offset */
+		assert(i == 0 || o%Blksz == 0);
+		kv[i].op = Oinsert;
+		kv[i].k = kbuf[i];
+		kv[i].nk = sizeof(kbuf[i]);
+		kv[i].v = vbuf[i];
+		kv[i].nv = sizeof(vbuf[i]);
+		if(waserror()){
+			/* give back the blocks already written for this request */
+			if(!fs->rdonly)
+				for(j = 0; j < i; j++)
+					freebp(t, bp[j]);
+			nexterror();
+		}
+		n = writeb(f, &kv[i], &bp[i], p, o, c, f->dent->length);
+		poperror();
+		w += n;
+		p += n;
+		o += n;
+		c -= n;
+	}
+
+	/* Owstat value: op byte, then the fields in the order packed below */
+	p = sbuf;
+	kv[i].op = Owstat;
+	kv[i].k = f->dent->k;
+	kv[i].nk = f->dent->nk;
+	*p++ = 0;
+	if(o > f->dent->length){
+		sbuf[0] |= Owsize;
+		PACK64(p, o);
+		p += 8;
+		/* keep the cached length equal to what is written to the tree */
+		f->dent->length = o;
+	}
+	sbuf[0] |= Owmtime;
+	f->dent->mtime = nsec();
+	PACK64(p, f->dent->mtime);
+	p += 8;
+	sbuf[0] |= Owmuid;
+	PACK32(p, f->uid);
+	p += 4;
+
+	kv[i].v = sbuf;
+	kv[i].nv = p - sbuf;
+	upsert(f->mnt, kv, i+1);
+
+	r.type = Rwrite;
+	r.count = w;
+Out:
+	poperror();
+	respond(m, &r);
+	wunlock(f->dent);
+	putfid(f);
+}
+
+/*
+ * Handle Tflush: nothing is cancelled here, the request is simply
+ * acknowledged with an Rflush.
+ */
+void
+fsflush(Fmsg *m)
+{
+	Fcall rflush;
+
+	rflush.type = Rflush;
+	respond(m, &rflush);
+}
+
+/*
+ * Allocate a connection for the given read/write descriptors and
+ * push it onto the file system's connection list. The iounit
+ * starts at Max9p until a Tversion negotiates it. Returns nil if
+ * allocation fails.
+ *
+ * Fix: the list head is now read while connlk is held. It used to
+ * be read before the lock was taken, so two concurrent callers
+ * could both link to the same old head and one connection would
+ * be lost from the list.
+ */
+Conn *
+newconn(int rfd, int wfd)
+{
+	Conn *c;
+
+	if((c = mallocz(sizeof(*c), 1)) == nil)
+		return nil;
+	c->rfd = rfd;
+	c->wfd = wfd;
+	c->iounit = Max9p;
+	lock(&fs->connlk);
+	c->next = fs->conns;
+	fs->conns = c;
+	unlock(&fs->connlk);
+	return c;
+}
+
+/*
+ * Per-connection service loop: read 9P messages from the
+ * connection passed in pc, parse them, and dispatch each one.
+ * Version, auth, flush and clunk are handled inline; mutating
+ * requests go to the writer channel; read-side requests go to the
+ * reader channel picked by hashing the fid. The loop ends on end
+ * of file; a read error, an unparsable message, or a non-Tversion
+ * message on an unversioned connection hangs up the connection.
+ */
+void
+runfs(int, void *pc)
+{
+	char err[128];
+	RWLock *lk;
+	Amsg *a;
+	Conn *c;
+	Fcall r;
+	Fmsg *m;
+	u32int h;
+
+	c = pc;
+	while(1){
+		if(readmsg(c, &m) < 0){
+			fshangup(c, "read message: %r");
+			return;
+		}
+		/* readmsg returns a nil message on end of file */
+		if(m == nil)
+			break;
+		/* NOTE(review): m is not freed on the two hangup paths below */
+		if(convM2S(m->buf, m->sz, m) == 0){
+			fshangup(c, "invalid message: %r");
+			return;
+		}
+		if(m->type != Tversion && !c->versioned){
+			fshangup(c, "version required");
+			return;
+		}
+		dprint("← %F\n", &m->Fcall);
+
+		/*
+		 * a flush write-locks the bucket of the tag it targets; every
+		 * other request read-locks the bucket of its own tag.
+		 * NOTE(review): the lock is not released in this function --
+		 * presumably it is dropped when the request is answered; confirm.
+		 */
+		if(m->type == Tflush){
+			lk = &fs->flushq[ihash(m->oldtag) % Nflushtab];
+			wlock(lk);
+		}else{
+			lk = &fs->flushq[ihash(m->tag) % Nflushtab];
+			rlock(lk);
+		}
+
+		a = nil;
+		/* all requests on one fid go to the same reader */
+		h = ihash(m->fid) % fs->nreaders;
+		switch(m->type){
+		/* sync setup, must not access tree */
+		case Tversion:	fsversion(m);	break;
+		case Tauth:	fsauth(m);	break;
+		case Tflush:	fsflush(m);	break;
+		case Tclunk:	fsclunk(m, &a);	break;
+
+		/* mutators */
+		case Tcreate:	chsend(fs->wrchan, m);	break;
+		case Twrite:	chsend(fs->wrchan, m);	break;
+		case Twstat:	chsend(fs->wrchan, m);	break;
+		case Tremove:	chsend(fs->wrchan, m);	break;
+
+		/* reads */
+		case Tattach:	chsend(fs->rdchan[h], m);	break;
+		case Twalk:	chsend(fs->rdchan[h], m);	break;
+		case Tread:	chsend(fs->rdchan[h], m);	break;
+		case Tstat:	chsend(fs->rdchan[h], m);	break;
+
+		/* both */
+		case Topen:
+			/* opens that truncate or set remove-on-close modify state */
+			if((m->mode & OTRUNC) || (m->mode & ORCLOSE) != 0)
+				chsend(fs->wrchan, m);
+			else
+				chsend(fs->rdchan[h], m);
+			break;
+
+		default:
+			fprint(2, "unknown message %F\n", &m->Fcall);
+			snprint(err, sizeof(err), "unknown message: %F", &m->Fcall);
+			r.type = Rerror;
+			r.ename = err;
+			respond(m, &r);
+			break;
+		}
+		assert(estacksz() == 0);
+		/* forward any remove-on-close work produced by a clunk */
+		if(a != nil)
+			chsend(fs->admchan, a);
+	}
+}
+
+/*
+ * Mutator loop: receive modifying requests from the writer channel
+ * and run them one at a time under mutlk, each inside an epoch.
+ * Any Amsg a handler produces is forwarded to the admin channel
+ * after the lock is released. When the file system is read-only,
+ * every request is refused with Erdonly.
+ */
+void
+runmutate(int id, void *)
+{
+	Fmsg *m;
+	Amsg *a;
+	Fid *f;
+
+	while(1){
+		a = nil;
+		m = chrecv(fs->wrchan);
+		if(fs->rdonly){
+			/*
+			 * special case: even if Tremove fails, we need
+			 * to clunk the fid.
+			 */
+			if(m->type == Tremove){
+				if((f = getfid(m->conn, m->fid)) == nil){
+					rerror(m, Enofid);
+					continue;
+				}
+				/* NOTE(review): unlike fsclunk/fsremove, f is not locked around clunkfid here */
+				clunkfid(m->conn, f, &a);
+				/* read only: ignore rclose */
+				f->rclose = nil;
+				free(a);
+				putfid(f);
+			}
+			rerror(m, Erdonly);
+			continue;
+ 		}
+
+		qlock(&fs->mutlk);
+		epochstart(id);
+		fs->snap.dirty = 1;
+		switch(m->type){
+		case Tcreate:	fscreate(m);		break;
+		case Twrite:	fswrite(m, id);		break;
+		case Twstat:	fswstat(m, id, &a);	break;
+		case Tremove:	fsremove(m, id, &a);	break;
+		case Topen:	fsopen(m, id, &a);	break;
+		default:	abort();		break;
+		}
+		/* every handler must have unwound its error frames */
+		assert(estacksz() == 0);
+		epochend(id);
+		qunlock(&fs->mutlk);
+		epochclean();
+
+		if(a != nil)
+			chsend(fs->admchan, a);
+	}
+}
+
+/*
+ * Reader loop: receive read-side requests from channel ch and run
+ * each one inside an epoch. Opens dispatched here carry no Amsg
+ * pointer. Message types other than those listed are ignored.
+ */
+void
+runread(int id, void *ch)
+{
+	Fmsg *m;
+
+	for(;;){
+		m = chrecv(ch);
+		epochstart(id);
+		switch(m->type){
+		case Tattach:
+			fsattach(m);
+			break;
+		case Twalk:
+			fswalk(m);
+			break;
+		case Tread:
+			fsread(m);
+			break;
+		case Tstat:
+			fsstat(m);
+			break;
+		case Topen:
+			fsopen(m, id, nil);
+			break;
+		default:
+			break;
+		}
+		/* every handler must have unwound its error frames */
+		assert(estacksz() == 0);
+		epochend(id);
+	}
+}
+
+/*
+ * Recursively free the tree rooted at block pointer rb. Children
+ * of pivot blocks are visited first; a block is freed only if its
+ * generation is newer than pred. After each child, mutlk is taken
+ * and released and the epoch limbo list is cleaned.
+ */
+void
+freetree(Bptr rb, vlong pred)
+{
+	Blk *blk;
+	Bptr child;
+	Kvp kv;
+	int i;
+
+	blk = getblk(rb, 0);
+	if(blk->type == Tpivot){
+		i = 0;
+		while(i < blk->nval){
+			getval(blk, i, &kv);
+			child = unpackbp(kv.v, kv.nv);
+			freetree(child, pred);
+			/* lock/unlock pair with nothing in between, kept as in the original */
+			qlock(&fs->mutlk);
+			qunlock(&fs->mutlk);
+			epochclean();
+			i++;
+		}
+	}
+	if(rb.gen > pred)
+		freebp(nil, rb);
+	dropblk(blk);
+}
+
+/*
+ * Here, we clean epochs frequently, but we run outside of
+ * an epoch; this is because the caller of this function
+ * has already waited for an epoch to tick over, there's
+ * nobody that can be accessing the tree other than us,
+ * and we just need to keep the limbo list short.
+ *
+ * Because this is the last reference to the tree, we don't
+ * need to hold the mutlk, other than when we free or kill
+ * blocks via epochclean.
+ */
+void
+sweeptree(Tree *t)
+{
+	char pfx[1];
+	Bptr datbp;
+	Scan s;
+
+	/* pass 1: free the data blocks newer than the tree's predecessor */
+	pfx[0] = Kdat;
+	btnewscan(&s, pfx, 1);
+	btenter(t, &s);
+	while(btnext(&s, &s.kv)){
+		datbp = unpackbp(s.kv.v, s.kv.nv);
+		if(datbp.gen > t->pred)
+			freebp(nil, datbp);
+		qlock(&fs->mutlk);
+		qunlock(&fs->mutlk);
+		epochclean();
+	}
+	btexit(&s);
+	/* pass 2: free the tree's own blocks */
+	freetree(t->bp, t->pred);
+}
+
+/*
+ * Background worker for administrative requests.  Loops
+ * forever receiving Amsg requests from fs->admchan and
+ * acting on am->op:
+ *
+ *	AOsync		compress arena logs, sync, free old log blocks,
+ *			and halt the process group if am->halt is set
+ *	AOsnap		run snapfs, sync, and sweep the tree it hands back
+ *	AOrclose	delete the directory entry, then fall into AOclear
+ *	AOclear		clear the data blocks in [am->off, am->end)
+ *
+ * An error raised while handling a request is printed and
+ * fs->rdonly is incremented.  The request is freed at Next.
+ */
+void
+runsweep(int id, void*)
+{
+	char buf[Kvmax];
+	Msg mb[Kvmax/Offksz];
+	Bptr bp, nb, *oldhd;
+	int i, nm;
+	vlong off;
+	Tree *t;
+	Arena *a;
+	Amsg *am;
+	Blk *b;
+
+	/* one saved log head per arena, reused by every AOsync */
+	if((oldhd = calloc(fs->narena, sizeof(Bptr))) == nil)
+		sysfatal("malloc log heads");
+	while(1){
+		am = chrecv(fs->admchan);
+		switch(am->op){
+		case AOsync:
+			tracem("syncreq");
+			/* nothing changed and no halt requested: nothing to do */
+			if(!fs->snap.dirty && !am->halt)
+				goto Next;
+			if(agetl(&fs->rdonly))
+				goto Justhalt;
+			if(waserror()){
+				fprint(2, "sync error: %s\n", errmsg());
+				ainc(&fs->rdonly);
+				break;
+			}
+
+			if(am->halt)
+				ainc(&fs->rdonly);
+			for(i = 0; i < fs->narena; i++){
+				a = &fs->arenas[i];
+				/* addr -1 marks "no old log chain to free" for this arena */
+				oldhd[i].addr = -1;
+				oldhd[i].hash = -1;
+				oldhd[i].gen = -1;
+				qlock(a);
+				/*
+				 * arbitrary heuristic -- try compressing
+				 * when the log doubles in size.
+				 */
+				if(a->nlog >= 2*a->lastlogsz){
+					/* remember the old head so its chain can be freed after sync */
+					oldhd[i] = a->loghd;
+					epochstart(id);
+					if(waserror()){
+						epochend(id);
+						qunlock(a);
+						nexterror();
+					}
+					compresslog(a);
+					epochend(id);
+					poperror();
+				}
+				qunlock(a);
+				epochclean();
+			}
+			sync();
+
+			/* walk each saved log chain through logp, freeing block by block */
+			for(i = 0; i < fs->narena; i++){
+				for(bp = oldhd[i]; bp.addr != -1; bp = nb){
+					qlock(&fs->mutlk);
+					epochstart(id);
+					b = getblk(bp, 0);
+					nb = b->logp;
+					freeblk(nil, b);
+					dropblk(b);
+					epochend(id);
+					qunlock(&fs->mutlk);
+					epochclean();
+				}
+			}
+
+Justhalt:
+			if(am->halt){
+				assert(fs->snapdl.hd.addr == -1);
+				assert(fs->snapdl.tl.addr == -1);
+				postnote(PNGROUP, getpid(), "halted");
+				exits(nil);
+			}
+			/*
+			 * NOTE(review): the goto Justhalt above is taken before
+			 * waserror() has been called, yet that path also ends up
+			 * at this poperror() when am->halt is clear -- confirm
+			 * that the error stack stays balanced in that case.
+			 */
+			poperror();
+			break;
+
+		case AOsnap:
+			tracem("snapreq");
+			if(agetl(&fs->rdonly)){
+				fprint(2, "snap on read only fs");
+				goto Next;
+			}
+			if(waserror()){
+				fprint(2, "taking snap: %s\n", errmsg());
+				ainc(&fs->rdonly);
+				break;
+			}
+
+			qlock(&fs->mutlk);
+			if(waserror()){
+				qunlock(&fs->mutlk);
+				nexterror();
+			}
+			epochstart(id);
+			snapfs(am, &t);
+			epochend(id);
+			poperror();
+			qunlock(&fs->mutlk);
+
+			sync();
+
+			/* when snapfs hands back a tree, wait out the epoch, then sweep and close it */
+			if(t != nil){
+				epochwait();
+				sweeptree(t);
+				closesnap(t);
+			}
+			poperror();
+			break;
+
+		case AOrclose:
+			if(agetl(&fs->rdonly)){
+				fprint(2, "rclose on read only fs");
+				goto Next;
+			}
+			/* delete the entry; for a directory also clobber its packsuper key */
+			nm = 0;
+			mb[nm].op = Odelete;
+			mb[nm].k = am->dent->k;
+			mb[nm].nk = am->dent->nk;
+			mb[nm].nv = 0;
+			nm++;
+			if(am->dent->qid.type & QTDIR){
+				packsuper(buf, sizeof(buf), am->qpath);
+				mb[nm].op = Oclobber;
+				mb[nm].k = buf;
+				mb[nm].nk = Upksz;
+				mb[nm].nv = 0;
+				nm++;
+			}
+			/*
+			 * NOTE(review): unlike the batched upsert below, this one
+			 * runs without a waserror guard and outside of an epoch --
+			 * confirm that this is intended.
+			 */
+			qlock(&fs->mutlk);
+			upsert(am->mnt, mb, nm);
+			qunlock(&fs->mutlk);
+			/* fallthrough */
+		case AOclear:
+			if(agetl(&fs->rdonly)){
+				fprint(2, "clear on read only fs");
+				goto Next;
+			}
+			tracem("bgclear");
+			if(waserror()){
+				fprint(2, "clear file %llx: %s\n", am->qpath, errmsg());
+				ainc(&fs->rdonly);
+				break;
+			}
+			if(am->dent != nil)
+				qlock(&am->dent->trunclk);
+			fs->snap.dirty = 1;
+			nm = 0;
+			/*
+			 * One Oclearb message per Blksz step of the range; the
+			 * keys are packed back to back into buf.  A batch is
+			 * flushed when mb is full or the range is exhausted.
+			 */
+			for(off = am->off; off < am->end; off += Blksz){
+				mb[nm].op = Oclearb;
+				mb[nm].k = buf + Offksz * nm;
+				mb[nm].nk = Offksz;
+				mb[nm].k[0] = Kdat;
+				PACK64(mb[nm].k+1, am->qpath);
+				PACK64(mb[nm].k+9, off);
+				mb[nm].v = nil;
+				mb[nm].nv = 0;
+				if(++nm >= nelem(mb) || off + Blksz >= am->end){
+					qlock(&fs->mutlk);
+					if(waserror()){
+						qunlock(&fs->mutlk);
+						nexterror();
+					}
+					epochstart(id);
+					upsert(am->mnt, mb, nm);
+					epochend(id);
+					qunlock(&fs->mutlk);
+					epochclean();
+					poperror();
+					nm = 0;
+				}
+			}
+			/* the clear is done: reset trunc, wake waiters on truncrz, drop the dent */
+			if(am->dent != nil){
+				am->dent->trunc = 0;
+				rwakeup(&am->dent->truncrz);
+				qunlock(&am->dent->trunclk);
+				clunkdent(am->mnt, am->dent);
+			}
+			clunkmount(am->mnt);
+			poperror();
+			break;
+		}
+Next:
+		/* every path must arrive here with an empty error stack */
+		assert(estacksz() == 0);
+		free(am);
+	}
+}
+
+/*
+ * Queues an AOsnap request on fs->admchan.  The name
+ * old is always copied into the request.  When new is
+ * nil the request is marked as a delete; otherwise new
+ * is copied in as well.  flg is stored in the request's
+ * flag field.
+ */
+void
+snapmsg(char *old, char *new, int flg)
+{
+	Amsg *req;
+
+	req = emalloc(sizeof(Amsg), 1);
+	req->op = AOsnap;
+	req->flag = flg;
+	req->fd = -1;
+	strecpy(req->old, req->old+sizeof(req->old), old);
+	if(new != nil)
+		strecpy(req->new, req->new+sizeof(req->new), new);
+	else
+		req->delete = 1;
+	chsend(fs->admchan, req);
+}
+
+/*
+ * Periodic housekeeping.  Every five seconds, unless
+ * the file system is read only, this posts an AOsync
+ * request and then walks the mount list.  For each
+ * mount flagged Ltsnap it requests a snapshot whenever
+ * the day, hour or minute differs from the one seen on
+ * the previous pass.  Hourly and minutely names are
+ * kept in per-mount slots that cycle modulo 24 and 60;
+ * a name still occupying a slot is deleted before the
+ * slot is reused.
+ */
+void
+runtasks(int, void *)
+{
+	char name[128];
+	Tm now, then;
+	Mount *mnt;
+	int mi, hi;
+	Amsg *req;
+
+	mi = 0;
+	hi = 0;
+	tmnow(&then, nil);
+	tmnow(&now, nil);
+	for(;;){
+		sleep(5000);
+		if(fs->rdonly)
+			continue;
+		if(waserror()){
+			fprint(2, "task error: %s\n", errmsg());
+			continue;
+		}
+		req = emalloc(sizeof(Amsg), 1);
+		req->op = AOsync;
+		req->halt = 0;
+		req->fd = -1;
+		chsend(fs->admchan, req);
+
+		tmnow(&now, nil);
+		for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
+			if((mnt->flag & Ltsnap) == 0)
+				continue;
+			/* daily snapshots are never expired here */
+			if(now.yday != then.yday){
+				snprint(name, sizeof(name),
+					"%s@day.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
+				snapmsg("main", name, Lauto);
+			}
+			if(now.hour != then.hour){
+				if(mnt->hourly[hi][0] != 0)
+					snapmsg(mnt->hourly[hi], nil, 0);
+				snprint(mnt->hourly[hi], sizeof(mnt->hourly[hi]),
+					"%s@hour.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
+				snapmsg("main", mnt->hourly[hi], Lauto);
+			}
+			if(now.min != then.min){
+				if(mnt->minutely[mi][0] != 0)
+					snapmsg(mnt->minutely[mi], nil, 0);
+				snprint(mnt->minutely[mi], sizeof(mnt->minutely[mi]),
+					"%s@minute.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
+				snapmsg("main", mnt->minutely[mi], Lauto);
+			}
+		}
+		/* advance the slots once per pass, not once per mount */
+		if(now.hour != then.hour)
+			hi = (hi+1)%24;
+		if(now.min != then.min)
+			mi = (mi+1)%60;
+		then = now;
+		poperror();
+	}
+}
+
+/*
+ * One-shot repair of the adm tree.  Checks that a
+ * packsuper key for 0 is present, then with a single
+ * upsert inserts a packsuper key for 1 whose value is
+ * the directory key (-1, "") and deletes the key for 0.
+ * The change is synced and the process exits; this
+ * function does not return.
+ */
+void
+fixfs(void)
+{
+	char *e, kbuf[2][Keymax], vbuf[Inlmax], kvbuf[Msgmax];
+	Mount *adm;
+	Tree *t;
+	Msg mb[2];
+	Kvp kv;
+	Key bad;
+
+	fprint(2, "getting adm mount...\n");
+	adm = getmount("adm");
+	if(adm == nil)
+		sysfatal("failed to get adm mount");
+	t = adm->root;
+
+	/* the bad key serves both the lookup and the delete */
+	e = packsuper(kbuf[1], sizeof(kbuf[1]), 0);
+	bad.k = kbuf[1];
+	bad.nk = e - kbuf[1];
+	fprint(2, "checking for valid adm root backlink...\n");
+	if(!btlookup(t, &bad, &kv, kvbuf, sizeof(kvbuf)))
+		sysfatal("no mis-reamed key");
+
+	e = packsuper(kbuf[0], sizeof(kbuf[0]), 1);
+	mb[0].op = Oinsert;
+	mb[0].k = kbuf[0];
+	mb[0].nk = e - kbuf[0];
+	e = packdkey(vbuf, sizeof(vbuf), -1, "");
+	mb[0].v = vbuf;
+	mb[0].nv = e - vbuf;
+
+	mb[1].op = Odelete;
+	mb[1].k = bad.k;
+	mb[1].nk = bad.nk;
+	mb[1].v = nil;
+	mb[1].nv = 0;
+
+	fprint(2, "repairing adm root backlink...\n");
+	qlock(&fs->mutlk);
+	btupsert(t, mb, 2);
+	qunlock(&fs->mutlk);
+	fprint(2, "syncing changes...\n");
+	sync();
+	fprint(2, "done\n");
+	exits(nil);
+}
--- /dev/null
+++ b/hash.c
@@ -1,0 +1,153 @@
+// metrohash64.cpp
+//
+// The MIT License (MIT)
+//
+// Copyright (c) 2015 J. Andrew Rogers
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Helper macros for the hash below.  Every argument is
+ * parenthesized in the expansion so that passing an
+ * expression (e.g. rotate_right(a + b, 33)) can not be
+ * regrouped by operator precedence.  Arguments are still
+ * evaluated more than once: do not pass expressions with
+ * side effects.
+ */
+#define _le64toh(x) \
+	GBIT64((char*)&(x))
+
+
+#define ROTATE(x, b) (u64int)( ((x) << (b)) | ( (x) >> (64 - (b))) )
+
+#define HALF_ROUND(a,b,c,d,s,t)			\
+	a += b; c += d;				\
+	b = ROTATE(b, s) ^ a;			\
+	d = ROTATE(d, t) ^ c;			\
+	a = ROTATE(a, 32);
+
+#define DOUBLE_ROUND(v0,v1,v2,v3)		\
+	HALF_ROUND(v0,v1,v2,v3,13,16);		\
+	HALF_ROUND(v2,v1,v0,v3,17,21);		\
+	HALF_ROUND(v0,v1,v2,v3,13,16);		\
+	HALF_ROUND(v2,v1,v0,v3,17,21);
+
+/* rotate the 64 bit value v right by k bits, 0 < k < 64 */
+#define rotate_right(v, k)\
+	(((v) >> (k)) | ((v) << (64 - (k))))
+/* load a 64, 32, 16 or 8 bit value straight from memory at ptr */
+#define read_u64(ptr) \
+	(*(u64int*)(ptr))
+#define read_u32(ptr) \
+	(*(u32int*)(ptr))
+#define read_u16(ptr) \
+	(*(u16int*)(ptr))
+#define read_u8(ptr) \
+	(*(u8int*)(ptr))
+
+/*
+ * Computes a 64 bit hash over the len bytes at key,
+ * mixed with seed.  Input is consumed in 32 byte rounds
+ * while at least 32 bytes remain, then in at most one
+ * step each of 16, 8, 4, 2 and 1 bytes, followed by a
+ * final mixing step.
+ *
+ * NOTE(review): words are loaded through pointer casts
+ * (read_u64 and friends), so the result presumably
+ * depends on host byte order and the loads may be
+ * unaligned -- confirm against the supported targets.
+ */
+uvlong
+metrohash64_1(void * key, u64int len, u32int seed)
+{
+	static const u64int k0 = 0xC83A91E1;
+	static const u64int k1 = 0x8648DBDB;
+	static const u64int k2 = 0x7BDEC03B;
+	static const u64int k3 = 0x2F5870A5;
+
+	const uchar * ptr = key;
+	const uchar * const end = ptr + len;
+	
+	/* the initial state depends on both the seed and the length */
+	u64int hash = ((((u64int) seed) + k2) * k0) + len;
+	
+	if(len >= 32){
+		/* four lanes, each fed 8 bytes per round */
+		u64int v[4];
+		v[0] = hash;
+		v[1] = hash;
+		v[2] = hash;
+		v[3] = hash;
+		
+		do{
+			v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2];
+			v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3];
+			v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0];
+			v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1];
+		}
+		while(ptr <= (end - 32));
+
+		/* fold the four lanes back into the running hash */
+		v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1;
+		v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0;
+		v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1;
+		v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0;
+		hash += v[0] ^ v[1];
+	}
+	
+	/* tail: each step runs at most once, largest chunk first */
+	if((end - ptr) >= 16){
+		u64int v0 = hash + (read_u64(ptr) * k0); ptr += 8; v0 = rotate_right(v0,33) * k1;
+		u64int v1 = hash + (read_u64(ptr) * k1); ptr += 8; v1 = rotate_right(v1,33) * k2;
+		v0 ^= rotate_right(v0 * k0, 35) + v1;
+		v1 ^= rotate_right(v1 * k3, 35) + v0;
+		hash += v1;
+	}
+	
+	if((end - ptr) >= 8){
+		hash += read_u64(ptr) * k3; ptr += 8;
+		hash ^= rotate_right(hash, 33) * k1;
+		
+	}
+	
+	if((end - ptr) >= 4){
+		hash += read_u32(ptr) * k3; ptr += 4;
+		hash ^= rotate_right(hash, 15) * k1;
+	}
+	
+	if((end - ptr) >= 2){
+		hash += read_u16(ptr) * k3; ptr += 2;
+		hash ^= rotate_right(hash, 13) * k1;
+	}
+	
+	if((end - ptr) >= 1){
+		hash += read_u8 (ptr) * k3;
+		hash ^= rotate_right(hash, 25) * k1;
+	}
+	
+	/* final mix */
+	hash ^= rotate_right(hash, 33);
+	hash *= k0;
+	hash ^= rotate_right(hash, 33);
+
+	return hash;
+}
+
+/* Hashes len bytes at src with the fixed seed 0x6765. */
+uvlong
+bufhash(void *src, usize len)
+{
+	uvlong h;
+
+	h = metrohash64_1(src, len, 0x6765);
+	return h;
+}
+
+/* Hashes the Blksz bytes of b->buf, with the same seed as bufhash. */
+uvlong
+blkhash(Blk *b)
+{
+	return bufhash(b->buf, Blksz);
+}
+
+/*
+ * Mixes the bits of a 64 bit integer with three
+ * xor-shift steps and two multiplies, and returns the
+ * result converted to 32 bits.
+ */
+u32int
+ihash(uvlong x)
+{
+	x ^= x >> 30;
+	x *= 0xbf58476d1ce4e5b9ULL;
+	x ^= x >> 27;
+	x *= 0x94d049bb133111ebULL;
+	x ^= x >> 31;
+	return x;
+}
--- /dev/null
+++ b/load.c
@@ -1,0 +1,144 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Avl comparison callback: orders Arange nodes by their
+ * off field.  Returns -1, 0 or 1.
+ */
+static int
+rangecmp(Avl *a, Avl *b)
+{
+	Arange *ra, *rb;
+
+	ra = (Arange*)a;
+	rb = (Arange*)b;
+	if(ra->off < rb->off)
+		return -1;
+	return ra->off > rb->off;
+}
+
+/*
+ * Loads arena a from its two header blocks: the one at
+ * hd and the one Blksz bytes after it.  Each header is
+ * first read with GBsoftchk; the arena is unpacked from
+ * the first header that loaded that way, and error(Efs)
+ * is raised if neither did.  A header that failed the
+ * checked read is then read again with GBnochk so that
+ * a->h0 and a->h1 are both set.  The free tree and the
+ * two log buffers are set up here as well.
+ */
+void
+loadarena(Arena *a, Bptr hd)
+{
+	Blk *h0, *h1, *b;
+	Bptr bp;
+
+	/* try to load block pointers with consistency check */
+	bp = hd;
+	h0 = nil;
+	h1 = nil;
+	if(!waserror()){
+		h0 = getblk(bp, GBsoftchk);
+		poperror();
+	}else
+		print("loading arena primary header: %s\n", errmsg());
+	bp.addr += Blksz;
+	if(!waserror()){
+		h1 = getblk(bp, GBsoftchk);
+		poperror();
+	}else
+		print("loading arena backup header: %s\n", errmsg());
+
+	/* if neither head nor tail is consistent, we're hosed */
+	b = (h0 != nil) ? h0 : h1;
+	if(b == nil)
+		error(Efs);
+
+	/* otherwise, we could have crashed mid-pass, just load the blocks */
+	bp = hd;
+	if(h0 == nil)
+		h0 = getblk(bp, GBnochk);
+	bp.addr += Blksz;
+	if(h1 == nil)
+		h1 = getblk(bp, GBnochk);
+
+	/* unpackarena zeroes *a, so every other field is filled in after it */
+	unpackarena(a, b->data, Arenasz);
+	if((a->free = avlcreate(rangecmp)) == nil)
+		error(Enomem);
+	/* two log buffers plucked from the cache, not yet tied to a disk address */
+	a->logbuf[0] = cachepluck();
+	a->logbuf[1] = cachepluck();
+	a->logbuf[0]->bp = (Bptr){-1, -1, -1};
+	a->logbuf[1]->bp = (Bptr){-1, -1, -1};
+	setflag(a->logbuf[0], Bstatic, 0);
+	setflag(a->logbuf[1], Bstatic, 0);
+	a->h0 = h0;
+	a->h1 = h1;
+	/* NOTE(review): this overrides the used count just unpacked -- presumably recomputed when the log is loaded; confirm */
+	a->used = a->size;
+}
+
+/*
+ * Opens the device dev and loads the file system state
+ * from it into the global fs: the superblock (the copy
+ * in the first block, or failing that the copy in the
+ * last whole block), every arena and its log, and the
+ * users from the "adm" snapshot.  Also installs the
+ * "dump" mount on top of the snapshot tree.  A summary
+ * is printed on fd 2.  Any failure is fatal.
+ */
+void
+loadfs(char *dev)
+{
+	Bptr bhd, btl;
+	Mount *dump;
+	Arena *a;
+	Tree *t;
+	Dir *d;
+	int i;
+	vlong eb;
+
+	if((dump = mallocz(sizeof(*dump), 1)) == nil)
+		sysfatal("malloc: %r");
+	if(waserror())
+		sysfatal("load fs: %s", errmsg());
+	snprint(dump->name, sizeof(dump->name), "dump");
+	dump->ref = 1;
+	dump->gen = -1;
+	dump->root = &fs->snap;
+
+	fs->snapmnt = dump;
+	fs->narena = 1;
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("stat %s: %r", dev);
+	eb = d->length;
+	/* dirfstat allocates its result; only the length is needed */
+	free(d);
+	/* address of the last whole block on the device */
+	eb = eb - (eb%Blksz) - Blksz;
+	bhd = (Bptr){0, -1, -1};
+	btl = (Bptr){eb, -1, -1};
+	fs->sb0 = getblk(bhd, GBnochk);
+	fs->sb1 = getblk(btl, GBnochk);
+	if(!waserror()){
+		unpacksb(fs, fs->sb0->buf, Blksz);
+		poperror();
+	}else{
+		fprint(2, "unable to load primary superblock: %s\n", errmsg());
+		if(waserror()){
+			fprint(2, "unable to load backup superblock: %s\n", errmsg());
+			exits("corrupt");
+		}
+		unpacksb(fs, fs->sb1->buf, Blksz);
+		poperror();
+	}
+
+	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
+		sysfatal("malloc: %r");
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadarena(a, fs->arenabp[i]);
+		/* reserve 1/1024 of the arena, clamped to [512 KiB, 8 MiB] */
+		a->reserve = a->size / 1024;
+		if(a->reserve < 512*KiB)
+			a->reserve = 512*KiB;
+		if(a->reserve > 8*MiB)
+			a->reserve = 8*MiB;
+	}
+	/* logs are loaded only once every arena header is in */
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadlog(a, a->loghd);
+	}
+
+	if((t = opensnap("adm", nil)) == nil)
+		sysfatal("load users: no adm label");
+	loadusers(2, t);
+	poperror();
+
+	fprint(2, "load %s:\n", dev);
+	fprint(2, "\tsnaptree:\t%B\n", fs->snap.bp);
+	fprint(2, "\tnarenas:\t%d\n", fs->narena);
+	fprint(2, "\tfeatures:\t%lld\n", fs->flag);
+	fprint(2, "\tnextqid:\t%lld\n", fs->nextqid);
+	fprint(2, "\tlastqgen:\t%lld\n", fs->qgen);
+	fprint(2, "\tnextgen:\t%lld\n", fs->nextgen);
+	fprint(2, "\tblocksize:\t%lld\n", Blksz);
+	fprint(2, "\tcachesz:\t%lld MiB\n", fs->cmax*Blksz/MiB);
+	closesnap(t);
+}
--- /dev/null
+++ b/main.c
@@ -1,0 +1,353 @@
+#include <u.h>
+#include <libc.h>
+#include <avl.h>
+#include <fcall.h>
+#include <bio.h>
+
+#include "dat.h"
+#include "fns.h"
+#include "atomic.h"
+
+/* the one file system instance; allocated by initfs */
+Gefs *fs;
+
+/*
+ * Program-wide settings and state.  In this program main
+ * only sets debug (-d) and dev (-f) from the command line,
+ * and overrides cachesz before parsing options.
+ */
+int	ream;
+int	grow;
+int	debug;
+int	stdio;
+int	noauth;
+int	nproc;
+int	permissive;
+int	usereserve;
+int	checkonly;
+char	*reamuser;
+char	*dev;
+vlong	tracesz		= 16*MiB;	/* bytes of trace buffer allocated by initfs; 0 disables it */
+vlong	cachesz 	= 512*MiB;
+char	*srvname 	= "gefs";
+int	noneid		= 0;
+int	nogroupid	= 9999;
+int	admid		= -1;
+Blk	*blkbuf;	/* block cache entries, obtained with sbrk in initfs */
+Bfree	*bfbuf;		/* Bfree entries, obtained with sbrk in initfs */
+Errctx	**errctx;	/* per-process error context slot, from privalloc in main */
+
+/*
+ * Records one entry in the trace ring fs->trace.  The
+ * slot is claimed by atomically incrementing
+ * fs->traceidx and wraps modulo fs->ntrace.  msg is
+ * copied (truncated to fit); bp, v0 and v1 are stored
+ * as given, along with the caller's tid and the current
+ * qgen.
+ */
+void
+_trace(char *msg, Bptr bp, vlong v0, vlong v1)
+{
+	ulong n;
+	Trace *ent;
+
+	n = aincl(&fs->traceidx, 1);
+	n--;
+	ent = &fs->trace[n % fs->ntrace];
+	strecpy(ent->msg, ent->msg+sizeof(ent->msg), msg);
+	ent->tid = (*errctx)->tid;
+	ent->qgen = agetv(&fs->qgen);
+	ent->bp = bp;
+	ent->v0 = v0;
+	ent->v1 = v1;
+}
+
+/*
+ * Writes "noswap" to the calling process's ctl file.
+ * Failures are reported on fd 2 but are not fatal.
+ *
+ * The ctl fd is closed before returning.  launch()
+ * forks with RFMEM and without RFFDG, so a descriptor
+ * left open here would pile up in the shared fd table,
+ * one per launched proc.
+ */
+static void
+nokill(void)
+{
+	char buf[128];
+	int fd;
+
+	snprint(buf, sizeof(buf), "/proc/%d/ctl", getpid());
+	if((fd = open(buf, OWRITE)) == -1){
+		fprint(2, "nokill: open %s: %r", buf);
+		return;
+	}
+	if(fprint(fd, "noswap\n") == -1)
+		fprint(2, "nokill: write %s: %r", buf);
+	close(fd);
+}
+
+/*
+ * Returns the number in front of the "memory" line of
+ * /dev/swap, or 512 MiB when the file can not be opened
+ * or holds no such line.
+ */
+static uvlong
+memsize(void)
+{
+	char *ln, *f[2];
+	vlong mem;
+	Biobuf *bp;
+	int found;
+
+	mem = 512*MiB;
+	if((bp = Bopen("/dev/swap", OREAD)) == nil)
+		return mem;
+	while((ln = Brdstr(bp, '\n', 1)) != nil){
+		found = tokenize(ln, f, nelem(f)) == 2 && strcmp(f[1], "memory") == 0;
+		if(found)
+			mem = strtoll(f[0], 0, 0);
+		/* Brdstr allocates each line: free it on every path, not only on a match */
+		free(ln);
+		if(found)
+			break;
+	}
+	Bterm(bp);
+	return mem;
+}
+
+/*
+ * Pushes a new label onto the calling process's error
+ * stack and returns a pointer to it.  Asserts that the
+ * stack has not overflowed.
+ */
+jmp_buf*
+_waserror(void)
+{
+	Errctx *ctx;
+
+	ctx = *errctx;
+	ctx->nerrlab++;
+	assert(ctx->nerrlab > 0 && ctx->nerrlab < Estacksz);
+	return &ctx->errlab[ctx->nerrlab-1];
+}
+
+/*
+ * Common tail of error() and broke(): formats the
+ * message into the per-process error buffer.  With
+ * broke set the message is printed and the process
+ * aborts; otherwise the topmost error label is popped
+ * and jumped to.  Never returns.
+ */
+_Noreturn static void
+errorv(char *fmt, va_list ap, int broke)
+{
+	Errctx *ctx;
+
+	ctx = *errctx;
+	vsnprint(ctx->err, sizeof(ctx->err), fmt, ap);
+	if(broke){
+		fprint(2, "%s\n", ctx->err);
+		abort();
+	}
+	assert(ctx->nerrlab > 0 && ctx->nerrlab < Estacksz);
+	ctx->nerrlab--;
+	longjmp(ctx->errlab[ctx->nerrlab], -1);
+}
+
+/*
+ * Reports an unrecoverable inconsistency: increments
+ * fs->rdonly, then prints the formatted message and
+ * aborts through errorv.  Never returns.
+ */
+_Noreturn void
+broke(char *fmt, ...)
+{
+	va_list args;
+
+	aincl(&fs->rdonly, 1);
+	va_start(args, fmt);
+	errorv(fmt, args, 1);
+}
+
+/*
+ * Raises an error: formats the message and jumps to the
+ * most recent waserror label.  Never returns.
+ */
+_Noreturn void
+error(char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	errorv(fmt, args, 0);
+}
+
+/*
+ * Passes the current error on: pops the topmost error
+ * label and jumps to it, leaving the stored message
+ * untouched.  Never returns.
+ */
+_Noreturn void
+nexterror(void)
+{
+	Errctx *ctx;
+
+	ctx = *errctx;
+	assert(ctx->nerrlab > 0 && ctx->nerrlab < Estacksz);
+	ctx->nerrlab--;
+	longjmp(ctx->errlab[ctx->nerrlab], -1);
+}
+
+/*
+ * mallocz that raises error(Enomem) instead of
+ * returning nil.  The allocation is tagged with the
+ * caller's pc.
+ */
+void*
+emalloc(usize sz, int zero)
+{
+	void *mem;
+
+	mem = mallocz(sz, zero);
+	if(mem == nil)
+		error(Enomem);
+	setmalloctag(mem, getcallerpc(&sz));
+	return mem;
+}
+
+/*
+ * Allocates the global fs and the in-memory structures
+ * that do not depend on what is on disk: the trace
+ * buffer, the block cache buckets, the dlist cache, the
+ * Bfree list and the cache blocks themselves.  cachesz
+ * is the cache size in bytes.  Any allocation failure
+ * is fatal.
+ */
+static void
+initfs(vlong cachesz)
+{
+	Bfree *f, *g;
+	Blk *b;
+
+	if((fs = mallocz(sizeof(Gefs), 1)) == nil)
+		sysfatal("malloc: %r");
+
+	if(tracesz != 0){
+		/*
+		 * Not emalloc: it reports failure through error(),
+		 * which needs *errctx, and main only sets that up
+		 * after initfs has returned.
+		 */
+		if((fs->trace = mallocz(tracesz, 1)) == nil)
+			sysfatal("malloc: %r");
+		fs->ntrace = tracesz/sizeof(Trace);
+	}
+	fs->lrurz.l = &fs->lrulk;
+	fs->syncrz.l = &fs->synclk;
+	fs->bfreerz.l = &fs->bfreelk;
+	fs->noauth = noauth;
+	fs->cmax = cachesz/Blksz;
+	if(fs->cmax > (1<<30))
+		sysfatal("cache too big");
+	if((fs->bcache = mallocz(fs->cmax*sizeof(Bucket), 1)) == nil)
+		sysfatal("malloc: %r");
+	/* dlist cache: a tenth of the block cache, clamped to [4, 512] */
+	fs->dlcmax = fs->cmax/10;
+	if(fs->dlcmax < 4)
+		fs->dlcmax = 4;
+	if(fs->dlcmax > 512)
+		fs->dlcmax = 512;
+	if((fs->dlcache = mallocz(fs->dlcmax*sizeof(Dlist*), 1)) == nil)
+		sysfatal("malloc: %r");
+
+	bfbuf = sbrk(fs->cmax * sizeof(Bfree));
+	if(bfbuf == (void*)-1)
+		sysfatal("sbrk: %r");
+
+	/* chain every Bfree entry onto a singly linked list */
+	g = nil;
+	for(f = bfbuf; f != bfbuf+fs->cmax; f++){
+		f->bp = Zb;
+		f->next = g;
+		g = f;
+	}
+	fs->bfree = g;
+
+	blkbuf = sbrk(fs->cmax * sizeof(Blk));
+	if(blkbuf == (void*)-1)
+		sysfatal("sbrk: %r");
+	/* stamp every cache block and hand it to the lru */
+	for(b = blkbuf; b != blkbuf+fs->cmax; b++){
+		b->bp = Zb;
+		b->magic = Magic;
+		lrutop(b);
+	}
+}
+
+/*
+ * Starts f(id, arg) in a new process that shares memory
+ * with the caller.  The child takes the next worker id
+ * from fs->nworker, gets its own error context, and is
+ * named "text.id".  The parent returns right after the
+ * fork; a failed fork is fatal.
+ */
+static void
+launch(void (*f)(int, void *), void *arg, char *text)
+{
+	long pid, id;
+
+	assert(fs->nworker < nelem(fs->lepoch));
+	pid = rfork(RFPROC|RFMEM|RFNOWAIT);
+	if(pid < 0)
+		sysfatal("can't fork: %r");
+	if(pid != 0)
+		return;
+
+	/* child */
+	nokill();
+	id = aincl(&fs->nworker, 1);
+	*errctx = mallocz(sizeof(Errctx), 1);
+	if(*errctx == nil)
+		sysfatal("malloc: %r");
+	(*errctx)->tid = id;
+	procsetname("%s.%ld", text, id);
+	(*f)(id, arg);
+	exits("child returned");
+}
+
+/*
+ * Creates a pipe and posts one end as /srv/<name><suff>
+ * with permissions mode.  Returns the other end.  The
+ * posted end is closed locally once its number has been
+ * written; the /srv file is created ORCLOSE and its fd
+ * is left open.  Any failure is fatal.
+ */
+static int
+postfd(char *name, char *suff, int mode)
+{
+	char path[80];
+	int p[2];
+	int srvfd;
+
+	if(pipe(p) < 0)
+		sysfatal("can't make a pipe");
+	snprint(path, sizeof path, "/srv/%s%s", name, suff);
+	srvfd = create(path, OWRITE|ORCLOSE|OCEXEC, mode);
+	if(srvfd == -1)
+		sysfatal("create %s: %r", path);
+	if(fprint(srvfd, "%d", p[0]) == -1)
+		sysfatal("write %s: %r", path);
+	close(p[0]);
+	return p[1];
+}
+
+/*
+ * Network listener.  Announces the address passed in
+ * arg, then accepts connections one at a time and
+ * launches a "netio" proc running runfs for each.  A
+ * failed accept or newconn skips that connection; a
+ * failed listen ends the loop.
+ */
+static void
+runannounce(int, void *arg)
+{
+	char *addr, adir[40], ldir[40];
+	int actl, lctl, fd;
+	Conn *conn;
+
+	addr = arg;
+	actl = announce(addr, adir);
+	if(actl < 0)
+		sysfatal("announce %s: %r", addr);
+	for(;;){
+		lctl = listen(adir, ldir);
+		if(lctl < 0){
+			fprint(2, "listen %s: %r", adir);
+			break;
+		}
+		fd = accept(lctl, ldir);
+		close(lctl);
+		if(fd < 0){
+			fprint(2, "accept %s: %r", ldir);
+			continue;
+		}
+		conn = newconn(fd, fd);
+		if(conn == nil){
+			close(fd);
+			fprint(2, "%r");
+			continue;
+		}
+
+		launch(runfs, conn, "netio");
+	}
+	close(actl);
+}
+
+/*
+ * Prints the command line synopsis and exits.  Only the
+ * options that main actually parses are listed.
+ */
+static void
+usage(void)
+{
+	fprint(2, "usage: %s [-d] -f dev\n", argv0);
+	exits("usage");
+}
+
+/*
+ * Entry point of the repair tool.  Parses -d and -f dev,
+ * sets up the in-memory state and the format verbs,
+ * loads the file system from dev, starts the sync
+ * worker, and then runs fixfs.
+ */
+void
+main(int argc, char **argv)
+{
+	int i;
+
+	/* replaces the larger initializer on the cachesz global */
+	cachesz = 64*MiB;
+	ARGBEGIN{
+	case 'd':
+		debug++;
+		break;
+	case 'f':
+		dev = EARGF(usage());
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND;
+	if(dev == nil)
+		usage();
+
+	/*
+	 * sanity checks -- I've tuned these to stupid
+	 * values in the past.
+	 */
+	assert(4*Kpmax < Pivspc);
+	assert(2*Msgmax < Bufspc);
+	assert(Treesz < Inlmax);
+
+	initfs(cachesz);
+	initshow();
+	/* the error context only exists from here on */
+	errctx = privalloc();
+	if((*errctx = mallocz(sizeof(Errctx), 1)) == nil)
+		sysfatal("malloc: %r");
+	tmfmtinstall();
+	fmtinstall('H', encodefmt);
+	fmtinstall('B', Bconv);
+	fmtinstall('M', Mconv);
+	fmtinstall('P', Pconv);
+	fmtinstall('K', Kconv);
+	fmtinstall('R', Rconv);
+	fmtinstall('F', fcallfmt);
+	fmtinstall('Q', Qconv);
+
+	nproc = 2;
+
+	rfork(RFNOTEG);
+	loadfs(dev);
+	/*
+	 * for spinning disks, parallel sync tanks performance
+	 * for ssds, it doesn't help much.
+	 */
+	fs->nsyncers = 1;
+	if(fs->nsyncers > fs->narena)
+		fs->nsyncers = fs->narena;
+	for(i = 0; i < fs->nsyncers; i++)
+		qinit(&fs->syncq[i]);
+	/* NOTE(review): nothing in this function assigns fs->nreaders -- presumably still 0 here; confirm */
+	if((fs->rdchan = malloc(fs->nreaders*sizeof(Chan*))) == nil)
+		sysfatal("malloc: %r");
+	for(i = 0; i < fs->nreaders; i++)
+		fs->rdchan[i] = mkchan(32);
+	/* arenas are spread over the sync queues round robin */
+	for(i = 0; i < fs->narena; i++)
+		fs->arenas[i].sync = &fs->syncq[i%fs->nsyncers];
+	for(i = 0; i < fs->nsyncers; i++)
+		launch(runsync, &fs->syncq[i], "syncio");
+	/* fixfs ends in exits(nil); the abort only triggers should it ever return */
+	fixfs();
+	abort();
+}
--- /dev/null
+++ b/mkfile
@@ -1,0 +1,39 @@
+# Builds the single binary "fix" from the objects below.
+</$objtype/mkfile
+
+TARG=fix
+BIN=/$objtype/bin
+# the last entry selects the atomics written for the target architecture
+OFILES=\
+	blk.$O\
+	cache.$O\
+	check.$O\
+	cons.$O\
+	dump.$O\
+	error.$O\
+	fs.$O\
+	hash.$O\
+	load.$O\
+	fix.$O\
+	pack.$O\
+	ream.$O\
+	snap.$O\
+	tree.$O\
+	user.$O\
+	\
+	atomic-$objtype.$O
+
+HFILES=\
+	dat.h\
+	fns.h\
+	atomic.h
+
+</sys/src/cmd/mkone
+</sys/doc/fonts
+
+# documentation: typeset an ms source to PostScript, then convert to PDF
+%.ps: %.ms
+	{ echo $FONTS; cat $stem.ms } | pic | tbl | eqn | troff -ms | lp -dstdout > $target
+%.pdf: %.ps
+	ps2pdf $stem.ps $stem.pdf
+
+# copies the manual pages into sections 4 and 8
+man.install: gefs.4.man gefs.8.man
+	cp gefs.4.man /sys/man/4/gefs
+	cp gefs.8.man /sys/man/8/gefs
--- /dev/null
+++ b/pack.c
@@ -1,0 +1,510 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Decodes a packed string at p, bounded by e: a 16 bit
+ * length, the bytes, and a terminating NUL.  *s is
+ * pointed at the bytes in place, which works as a C
+ * string thanks to the NUL.  Returns the address just
+ * past the NUL.  A string that overruns e or lacks its
+ * NUL is reported through broke(Efs).
+ */
+char*
+unpackstr(char *p, char *e, char **s)
+{
+	int len;
+
+	assert(e - p >= 3);
+	len = UNPACK16(p);
+	if(e - p < len + 3)
+		broke(Efs);
+	if(p[len+2] != 0)
+		broke(Efs);
+	*s = p+2;
+	return p+3+len;
+}
+
+/*
+ * Encodes s at p as a 16 bit length, the bytes, and a
+ * terminating NUL so that the packed form can be used
+ * directly as a C string.  Asserts that the result fits
+ * before e and returns the address just past the NUL.
+ */
+char*
+packstr(char *p, char *e, char *s)
+{
+	int len;
+	char *q;
+
+	len = strlen(s);
+	assert(e - p >= len+3);
+	PACK16(p, len);
+	q = p + 2;
+	memmove(q, s, len);
+	q[len] = 0;
+	return q + len + 1;
+}
+		
+/*
+ * Packs directory entry d, with parent up, into buf:
+ * the key first and the value immediately after it.
+ * kv is pointed at the two pieces inside buf.
+ */
+void
+dir2kv(vlong up, Xdir *d, Kvp *kv, char *buf, int nbuf)
+{
+	char *kend, *vend;
+
+	kend = packdkey(buf, nbuf, up, d->name);
+	kv->k = buf;
+	kv->nk = kend - buf;
+	/* the value gets whatever room the key left over */
+	vend = packdval(kend, (buf + nbuf) - kend, d);
+	kv->v = kend;
+	kv->nv = vend - kend;
+}
+
+/*
+ * Packs a directory entry key at p: the tag Kent, the
+ * 64 bit parent up, and, unless name is nil, the packed
+ * name.  Returns the address just past the key.
+ */
+char*
+packdkey(char *p, int sz, vlong up, char *name)
+{
+	char *lim;
+
+	lim = p + sz;
+	PACK8(p, Kent);
+	p += 1;
+	PACK64(p, up);
+	p += 8;
+	if(name == nil)
+		return p;
+	return packstr(p, lim, name);
+}
+
+/*
+ * Decodes a directory entry key of sz bytes at p.  The
+ * parent is stored in *up and a pointer to the name,
+ * still inside the key, is returned.  Asserts that the
+ * key carries the Kent tag.
+ */
+char*
+unpackdkey(char *p, int sz, vlong *up)
+{
+	char tag, *lim, *name;
+
+	lim = p + sz;
+	assert(sz > 9);
+	tag = UNPACK8(p);
+	p += 1;
+	*up = UNPACK64(p);
+	p += 8;
+	assert(tag == Kent);
+	p = unpackstr(p, lim, &name);
+	assert(p <= lim);
+	return name;
+}
+
+/*
+ * Packs a Kup key for up at p: the tag followed by the
+ * 64 bit value.  Returns the address just past the key.
+ */
+char*
+packsuper(char *p, int sz, vlong up)
+{
+	char *lim;
+
+	lim = p+sz;
+	PACK8(p, Kup);
+	p += 1;
+	PACK64(p, up);
+	p += 8;
+	assert(p <= lim);
+	return p;
+}
+
+/*
+ * Packs the value half of directory entry d at p, in
+ * the field order that kv2dir reads back.  Returns the
+ * address just past the value.
+ */
+char*
+packdval(char *p, int sz, Xdir *d)
+{
+	char *lim;
+
+	lim = p + sz;
+	PACK64(p, d->flag);
+	p += 8;
+	PACK64(p, d->qid.path);
+	p += 8;
+	PACK32(p, d->qid.vers);
+	p += 4;
+	PACK8(p, d->qid.type);
+	p += 1;
+	PACK32(p, d->mode);
+	p += 4;
+	PACK64(p, d->atime);
+	p += 8;
+	PACK64(p, d->mtime);
+	p += 8;
+	PACK64(p, d->length);
+	p += 8;
+	PACK32(p, d->uid);
+	p += 4;
+	PACK32(p, d->gid);
+	p += 4;
+	PACK32(p, d->muid);
+	p += 4;
+	assert(p <= lim);
+	return p;
+}
+
+/*
+ * Decodes a directory entry key/value pair into d.  The
+ * name is left pointing into the key.  Both the key and
+ * the value must be consumed exactly; anything else is
+ * reported through broke(Efs).
+ */
+void
+kv2dir(Kvp *kv, Xdir *d)
+{
+	char *kp, *kend, *vp, *vend;
+
+	memset(d, 0, sizeof(Xdir));
+
+	/* the name follows the 9 byte key prefix */
+	kend = kv->k + kv->nk;
+	kp = unpackstr(kv->k + 9, kend, &d->name);
+
+	vp = kv->v;
+	vend = vp + kv->nv;
+	d->flag = UNPACK64(vp);
+	vp += 8;
+	d->qid.path = UNPACK64(vp);
+	vp += 8;
+	d->qid.vers = UNPACK32(vp);
+	vp += 4;
+	d->qid.type = UNPACK8(vp);
+	vp += 1;
+	d->mode = UNPACK32(vp);
+	vp += 4;
+	d->atime = UNPACK64(vp);
+	vp += 8;
+	d->mtime = UNPACK64(vp);
+	vp += 8;
+	d->length = UNPACK64(vp);
+	vp += 8;
+	d->uid = UNPACK32(vp);
+	vp += 4;
+	d->gid = UNPACK32(vp);
+	vp += 4;
+	d->muid = UNPACK32(vp);
+	vp += 4;
+	assert(vp <= vend);
+	if(kp != kend)
+		broke(Efs);
+	if(vp != vend)
+		broke(Efs);
+}
+
+/*
+ * Marshals directory entry d into buf as a 9P stat
+ * structure.  Owner, group and last modifier are mapped
+ * to names under fs->userlk; an unknown uid or muid
+ * falls back to noneid and an unknown gid to nogroupid.
+ * Times are rounded from nanoseconds to seconds.
+ *
+ * Returns the number of bytes written, or -1 when the
+ * result does not fit in nbuf.  Raises error(Eperm)
+ * when even the fallback users are missing; the lock is
+ * released on every path out.
+ */
+int
+dir2statbuf(Xdir *d, char *buf, int nbuf)
+{
+	int sz, nn, nu, ng, nm;
+	vlong atime, mtime;
+	User *u, *g, *m;
+	char *p;
+
+	rlock(&fs->userlk);
+	if((u = uid2user(d->uid)) == nil)
+		u = uid2user(noneid);
+	/* the group fallback belongs in g, not in u */
+	if((g = uid2user(d->gid)) == nil)
+		g = uid2user(nogroupid);
+	if((m = uid2user(d->muid)) == nil)
+		m = uid2user(noneid);
+	if(u == nil || g == nil || m == nil){
+		/* error() does not return: drop the lock first */
+		runlock(&fs->userlk);
+		error(Eperm);
+	}
+
+	p = buf;
+	nn = strlen(d->name);
+	nu = strlen(u->name);
+	ng = strlen(g->name);
+	nm = strlen(m->name);
+	atime = (d->atime+Nsec/2)/Nsec;
+	mtime = (d->mtime+Nsec/2)/Nsec;
+	sz = STATFIXLEN + nn + nu + ng + nm;
+	if(sz > nbuf){
+		runlock(&fs->userlk);
+		return -1;
+	}
+	
+	/* the leading size field does not count itself */
+	PBIT16(p, sz-2);		p += 2;
+	PBIT16(p, -1 /*type*/);		p += 2;
+	PBIT32(p, -1 /*dev*/);		p += 4;
+	PBIT8(p, d->qid.type);		p += 1;
+	PBIT32(p, d->qid.vers);		p += 4;
+	PBIT64(p, d->qid.path);		p += 8;
+	PBIT32(p, d->mode);		p += 4;
+	PBIT32(p, atime);		p += 4;
+	PBIT32(p, mtime);		p += 4;
+	PBIT64(p, d->length);		p += 8;
+
+	PBIT16(p, nn);			p += 2;
+	memcpy(p, d->name, nn);		p += nn;
+	PBIT16(p, nu);			p += 2;
+	memcpy(p, u->name, nu);		p += nu;
+	PBIT16(p, ng);			p += 2;
+	memcpy(p, g->name, ng);		p += ng;
+	PBIT16(p, nm);			p += 2;
+	memcpy(p, m->name, nm);		p += nm;
+	assert(p - buf == sz);
+	runlock(&fs->userlk);
+	return sz;
+}
+
+/*
+ * Decodes a directory entry key/value pair and marshals
+ * it into buf; the return value is that of dir2statbuf.
+ */
+int
+kv2statbuf(Kvp *kv, char *buf, int nbuf)
+{
+	Xdir dir;
+
+	kv2dir(kv, &dir);
+	return dir2statbuf(&dir, buf, nbuf);
+}
+
+/*
+ * Fills q->path and q->vers from the first two 64 bit
+ * words of the value of kv.  q->type is not touched.
+ *
+ * NOTE(review): this layout does not match the one
+ * written by packdval (flag, path, 32 bit vers, type)
+ * -- confirm which kind of value this is meant for.
+ */
+void
+kv2qid(Kvp *kv, Qid *q)
+{
+	char *p, *lim;
+
+	p = kv->v;
+	lim = p + kv->nv;
+	q->path = UNPACK64(p);
+	p += 8;
+	q->vers = UNPACK64(p);
+	p += 8;
+	assert(p <= lim);
+}
+
+/*
+ * Decodes a deadlist key/value pair into dl.  The key
+ * holds a one byte tag, gen and bgen; the value holds
+ * the head and tail block pointers.
+ */
+void
+kv2dlist(Kvp *kv, Dlist *dl)
+{
+	char *p, *lim;
+
+	/* key: skip the tag byte */
+	p = kv->k;
+	lim = p + kv->nk;
+	p += 1;
+	dl->gen = UNPACK64(p);
+	p += 8;
+	dl->bgen = UNPACK64(p);
+	p += 8;
+	assert(p <= lim);
+
+	/* value: two block pointers back to back */
+	p = kv->v;
+	lim = p + kv->nv;
+	dl->hd = unpackbp(p, lim-p);
+	p += Ptrsz;
+	dl->tl = unpackbp(p, lim-p);
+	p += Ptrsz;
+	assert(p <= lim);
+}
+
+/*
+ * Packs deadlist dl into buf as a key (Kdlist, gen,
+ * bgen) followed directly by a value (head and tail
+ * block pointers), and points kv at the two pieces.
+ * buf must hold at least Dlkvpsz bytes.
+ */
+void
+dlist2kv(Dlist *dl, Kvp *kv, char *buf, int nbuf)
+{
+	char *p, *lim;
+
+	assert(nbuf >= Dlkvpsz);
+	lim = buf+nbuf;
+	p = buf;
+
+	kv->k = p;
+	p[0] = Kdlist;
+	p += 1;
+	PACK64(p, dl->gen);
+	p += 8;
+	PACK64(p, dl->bgen);
+	p += 8;
+	kv->nk = (p - kv->k);
+
+	kv->v = p;
+	p = packbp(p, lim-p, &dl->hd);
+	p = packbp(p, lim-p, &dl->tl);
+	kv->nv = (p - kv->v);
+}
+
+/*
+ * Packs tree t into buf as a snap key for t->gen
+ * followed directly by the packed tree, and points kv
+ * at the two pieces.  Aborts if either packer returns
+ * nil.
+ */
+void
+tree2kv(Tree *t, Kvp *kv, char *buf, int nbuf)
+{
+	char *p, *lim;
+
+	lim = buf+nbuf;
+
+	kv->k = buf;
+	p = packsnap(buf, lim-buf, t->gen);
+	if(p == nil)
+		abort();
+	kv->nk = p - kv->k;
+
+	kv->v = p;
+	p = packtree(p, lim-p, t);
+	if(p == nil)
+		abort();
+	kv->nv = p - kv->v;
+}
+
+/*
+ * Packs a retag message into buf: a snap key for gen
+ * followed directly by a value made of the 64 bit link
+ * and the two one byte deltas dlbl and dref.  kv is
+ * pointed at the two pieces.
+ *
+ * buf must hold the key (1 byte tag + 8 byte gen) as
+ * well as the value (8+1+1 bytes); both are checked up
+ * front so that the value can not run past the buffer.
+ */
+void
+retag2kv(vlong gen, vlong link, int dlbl, int dref, Kvp *kv, char *buf, int nbuf)
+{
+	char *p;
+
+	assert(nbuf >= (1+8) + (8+1+1));
+	kv->k = buf;
+	if((p = packsnap(buf, nbuf, gen)) == nil)
+		abort();
+	kv->nk = p - buf;
+
+	kv->v = p;
+	PACK64(p, link);	p += 8;
+	*p = dlbl;		p += 1;
+	*p = dref;		p += 1;
+	kv->nv = p - kv->v;
+}
+
+/*
+ * Packs a label into buf: the key is Klabel followed by
+ * the bytes of lbl (no terminator); the value is Ksnap,
+ * the 64 bit gen and the 32 bit flg.  kv is pointed at
+ * the two pieces.
+ */
+void
+lbl2kv(char *lbl, vlong gen, uint flg, Kvp *kv, char *buf, int nbuf)
+{
+	char *p;
+	int len;
+
+	len = strlen(lbl);
+	assert(nbuf >= 1+len + 1+8+4);
+
+	p = buf;
+	kv->k = p;
+	*p = Klabel;
+	p += 1;
+	memcpy(p, lbl, len);
+	p += len;
+	kv->nk = p - kv->k;
+
+	kv->v = p;
+	*p = Ksnap;
+	p += 1;
+	PACK64(p, gen);
+	p += 8;
+	PACK32(p, flg);
+	p += 4;
+	kv->nv = p - kv->v;
+}
+
+/*
+ * Packs a label key at p: Klabel followed by the bytes
+ * of name, without a terminator.  Returns the address
+ * just past the key.
+ */
+char*
+packlbl(char *p, int sz, char *name)
+{
+	int len;
+
+	len = strlen(name);
+	assert(sz >= len+1);
+	*p = Klabel;
+	p += 1;
+	memcpy(p, name, len);
+	return p + len;
+}
+
+/*
+ * Packs a snap key at p: Ksnap followed by the 64 bit
+ * id.  Returns the address just past the key.
+ */
+char*
+packsnap(char *p, int sz, vlong id)
+{
+	assert(sz >= Snapsz);
+	*p = Ksnap;
+	p += 1;
+	PACK64(p, id);
+	p += 8;
+	return p;
+}
+
+/*
+ * Packs block pointer bp at p as three 64 bit words:
+ * addr, hash, gen.  Returns the address just past them.
+ */
+char*
+packbp(char *p, int sz, Bptr *bp)
+{
+	assert(sz >= Ptrsz);
+	PACK64(p, bp->addr);
+	p += 8;
+	PACK64(p, bp->hash);
+	p += 8;
+	PACK64(p, bp->gen);
+	p += 8;
+	return p;
+}
+
+/*
+ * Decodes a block pointer from the three 64 bit words
+ * at p: addr, hash, gen.
+ */
+Bptr
+unpackbp(char *p, int sz)
+{
+	Bptr out;
+
+	assert(sz >= Ptrsz);
+	out.addr = UNPACK64(p);
+	p += 8;
+	out.hash = UNPACK64(p);
+	p += 8;
+	out.gen = UNPACK64(p);
+	return out;
+}
+
+/*
+ * Decodes a packed tree of at least Treesz bytes at p
+ * into t.  t is zeroed first, so fields that are not
+ * part of the packed form come out as zero.  Returns t.
+ */
+Tree*
+unpacktree(Tree *t, char *p, int sz)
+{
+	assert(sz >= Treesz);
+	memset(t, 0, sizeof(Tree));
+
+	/* four 32 bit counters and flags */
+	t->nref = UNPACK32(p);
+	p += 4;
+	t->nlbl = UNPACK32(p);
+	p += 4;
+	t->ht = UNPACK32(p);
+	p += 4;
+	t->flag = UNPACK32(p);
+	p += 4;
+
+	/* four 64 bit generation numbers */
+	t->gen = UNPACK64(p);
+	p += 8;
+	t->pred = UNPACK64(p);
+	p += 8;
+	t->succ = UNPACK64(p);
+	p += 8;
+	t->base = UNPACK64(p);
+	p += 8;
+
+	/* the root block pointer */
+	t->bp.addr = UNPACK64(p);
+	p += 8;
+	t->bp.hash = UNPACK64(p);
+	p += 8;
+	t->bp.gen = UNPACK64(p);
+
+	return t;
+}
+
+/*
+ * Packs tree t at p in the field order that unpacktree
+ * reads back.  Returns the address just past the packed
+ * tree.
+ */
+char*
+packtree(char *p, int sz, Tree *t)
+{
+	assert(sz >= Treesz);
+
+	/* four 32 bit counters and flags */
+	PACK32(p, t->nref);
+	p += 4;
+	PACK32(p, t->nlbl);
+	p += 4;
+	PACK32(p, t->ht);
+	p += 4;
+	PACK32(p, t->flag);
+	p += 4;
+
+	/* four 64 bit generation numbers */
+	PACK64(p, t->gen);
+	p += 8;
+	PACK64(p, t->pred);
+	p += 8;
+	PACK64(p, t->succ);
+	p += 8;
+	PACK64(p, t->base);
+	p += 8;
+
+	/* the root block pointer */
+	PACK64(p, t->bp.addr);
+	p += 8;
+	PACK64(p, t->bp.hash);
+	p += 8;
+	PACK64(p, t->bp.gen);
+	p += 8;
+	return p;
+}
+
+/*
+ * Packs the header of arena a at p: address and hash of
+ * the log head, then the arena's size and used count.
+ * Returns the address just past the packed header.
+ */
+char*
+packarena(char *p, int sz, Arena *a)
+{
+	char *lim;
+
+	assert(sz >= Arenasz);
+	lim = p + Arenasz;
+	/* freelist addr */
+	PACK64(p, a->loghd.addr);
+	p += 8;
+	/* freelist hash */
+	PACK64(p, a->loghd.hash);
+	p += 8;
+	/* arena size */
+	PACK64(p, a->size);
+	p += 8;
+	/* arena used */
+	PACK64(p, a->used);
+	p += 8;
+	assert(p <= lim);
+	return p;
+}
+
+/*
+ * Decodes a packed arena header at p into a.  The whole
+ * Arena is zeroed first.  The generation of the log
+ * head is not stored in the header and is set to -1.
+ * Returns the address just past the header.
+ */
+char*
+unpackarena(Arena *a, char *p, int sz)
+{
+	char *lim;
+
+	assert(sz >= Arenasz);
+	memset(a, 0, sizeof(*a));
+
+	lim = p + Arenasz;
+	a->loghd.addr = UNPACK64(p);
+	p += 8;
+	a->loghd.hash = UNPACK64(p);
+	p += 8;
+	a->loghd.gen = -1;
+	a->size = UNPACK64(p);
+	p += 8;
+	a->used = UNPACK64(p);
+	p += 8;
+	a->logtl = nil;
+
+	assert(p <= lim);
+	return p;
+}
+
+/*
+ * Packs the superblock for fi into the Blksz byte
+ * buffer at p0: the version string, the geometry, the
+ * snapshot tree root, the snapshot deadlist, the
+ * counters, one (addr, hash) pair per arena, and last a
+ * hash over everything that precedes it.  Returns the
+ * address just past the hash.
+ */
+char*
+packsb(char *p0, int sz, Gefs *fi)
+{
+	uvlong sum;
+	char *q;
+	int i;
+
+	assert(sz == Blksz);
+	assert(fi->narena < 512);
+	q = p0;
+	memcpy(q, "gefs9.00", 8);
+	q += 8;
+	PACK32(q, Blksz);
+	q += 4;
+	PACK32(q, Bufspc);
+	q += 4;
+	PACK32(q, fi->narena);
+	q += 4;
+	PACK32(q, fi->snap.ht);
+	q += 4;
+	PACK64(q, fi->snap.bp.addr);
+	q += 8;
+	PACK64(q, fi->snap.bp.hash);
+	q += 8;
+	PACK64(q, fi->snapdl.hd.addr);
+	q += 8;
+	PACK64(q, fi->snapdl.hd.hash);
+	q += 8;
+	PACK64(q, fi->snapdl.tl.addr);
+	q += 8;
+	PACK64(q, fi->snapdl.tl.hash);
+	q += 8;
+	PACK64(q, fi->flag);
+	q += 8;
+	PACK64(q, fi->nextqid);
+	q += 8;
+	PACK64(q, fi->nextgen);
+	q += 8;
+	PACK64(q, fi->qgen);
+	q += 8;
+	for(i = 0; i < fi->narena; i++){
+		PACK64(q, fi->arenabp[i].addr);
+		q += 8;
+		PACK64(q, fi->arenabp[i].hash);
+		q += 8;
+	}
+	/* the checksum covers every byte packed so far */
+	sum = bufhash(p0, q - p0);
+	PACK64(q, sum);
+	q += 8;
+	return q;
+}
+
+/*
+ * Decodes the superblock in the Blksz byte buffer at p0
+ * into fi, in the field order written by packsb.
+ * Generations are not stored in the superblock, so the
+ * gen of every block pointer read here (snapshot root,
+ * deadlist head and tail, arena headers) and of the
+ * deadlist itself is set to -1.
+ *
+ * Raises an error when the version string is wrong or
+ * the stored hash does not match the contents.  Returns
+ * the address just past the hash.
+ */
+char*
+unpacksb(Gefs *fi, char *p0, int sz)
+{
+	uvlong dh, xh;
+	char *p;
+	int i;
+
+	assert(sz == Blksz);
+	p = p0;
+	if(memcmp(p, "gefs9.00", 8) != 0)
+		error("%s %.8s", Efsvers, p);
+	p += 8;
+	fi->blksz = UNPACK32(p);		p += 4;
+	fi->bufspc = UNPACK32(p);		p += 4;
+	fi->narena = UNPACK32(p);		p += 4;
+	fi->snap.ht = UNPACK32(p);		p += 4;
+	fi->snap.bp.addr = UNPACK64(p);		p += 8;
+	fi->snap.bp.hash = UNPACK64(p);		p += 8;
+	fi->snap.bp.gen = -1;			p += 0;
+	fi->snapdl.hd.addr = UNPACK64(p);	p += 8;
+	fi->snapdl.hd.hash = UNPACK64(p);	p += 8;
+	fi->snapdl.hd.gen = -1;			p += 0;
+	fi->snapdl.gen = -1;			p += 0;
+	fi->snapdl.tl.addr = UNPACK64(p);	p += 8;
+	fi->snapdl.tl.hash = UNPACK64(p);	p += 8;
+	/* the tail needs its own gen: the head's was already set above */
+	fi->snapdl.tl.gen = -1;			p += 0;
+	fi->flag = UNPACK64(p);			p += 8;
+	fi->nextqid = UNPACK64(p);		p += 8;
+	fi->nextgen = UNPACK64(p);		p += 8;
+	fi->qgen = UNPACK64(p);	p += 8;
+	fi->arenabp = emalloc(fi->narena * sizeof(Bptr), 0);
+	for(i = 0; i < fi->narena; i++){
+		fi->arenabp[i].addr = UNPACK64(p);	p += 8;
+		fi->arenabp[i].hash = UNPACK64(p);	p += 8;
+		fi->arenabp[i].gen = -1;
+	}
+	/* the stored hash covers everything that precedes it */
+	xh = bufhash(p0, p - p0);
+	dh = UNPACK64(p);			p += 8;
+	if(dh != xh)
+		error("corrupt superblock: %llx != %llx", dh, xh);
+	assert(fi->narena < 256);	/* should be more than anyone needs */
+	return p;
+}
--- /dev/null
+++ b/ream.c
@@ -1,0 +1,460 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Qid paths assigned while reaming.  Nreamqid is the
+ * number of them; reamfs stores it in fs->nextqid.
+ */
+enum {
+	Qmainroot,	/* root directory of the main tree (initroot) */
+	Qadmroot,	/* root directory of the adm tree (initadm) */
+	Qadmuser,	/* the "users" file in the adm tree (initadm) */
+	Nreamqid,
+};
+
+/*
+ * Resets d to an all-zero Xdir and then fills in the
+ * qid (path qid, version 0, the given type), mode and
+ * name.  uid and gid are set to -1; times, length and
+ * muid are left at zero.  The name pointer is stored
+ * as-is, not copied.
+ */
+static void
+fillxdir(Xdir *d, vlong qid, char *name, int type, int mode)
+{
+	memset(d, 0, sizeof(*d));
+	d->name = name;
+	d->mode = mode;
+	d->qid = (Qid){qid, 0, type};
+	d->length = 0;
+	d->atime = 0;
+	d->mtime = 0;
+	d->muid = 0;
+	d->gid = -1;
+	d->uid = -1;
+}
+
+/*
+ * Populates the leaf r with the initial contents of the
+ * adm tree: a data pointer to the users block u, the
+ * directory entry for "users" (length nu), the directory
+ * entry for the adm root, and the super key for the adm
+ * root.  setval appends, so the four entries are written
+ * in the order they must appear in the leaf.
+ */
+static void
+initadm(Blk *r, Blk *u, int nu)
+{
+	char *p, kbuf[Keymax], vbuf[Inlmax];
+	Kvp kv;
+	Xdir d;
+
+	/* nb: values must be inserted in key order */
+	/* data key (Kdat, Qadmuser, offset 0) => block pointer of u */
+	kv.k = kbuf;
+	kv.nk = Offksz;
+	kv.v = vbuf;
+	kv.nv = Ptrsz;
+	kbuf[0] = Kdat;
+	PACK64(kbuf+1, (uvlong)Qadmuser);
+	PACK64(kbuf+9, 0ULL);
+	packbp(kv.v, kv.nv, &u->bp);
+	setval(r, &kv);
+
+	/* "users" file, entered under the adm root */
+	fillxdir(&d, Qadmuser, "users", QTFILE, 0664);
+	d.length = nu;
+	dir2kv(Qadmroot, &d, &kv, vbuf, sizeof(vbuf));
+	setval(r, &kv);
+	/* the adm root itself, entered with parent -1 and an empty name */
+	fillxdir(&d, Qadmroot, "", QTDIR, DMDIR|0775);
+	dir2kv(-1, &d, &kv, vbuf, sizeof(vbuf));
+	setval(r, &kv);
+
+	/* super key for the adm root; its value is the (-1, "") dir key */
+	p = packsuper(kbuf, sizeof(kbuf), Qadmroot);
+	kv.k = kbuf;
+	kv.nk = p - kbuf;
+	p = packdkey(vbuf, sizeof(vbuf), -1, "");
+	kv.v = vbuf;
+	kv.nv = p - vbuf;
+	setval(r, &kv);
+}
+
+/*
+ * Populates the leaf r with the initial contents of the
+ * main tree: the directory entry for the root (parent -1,
+ * empty name) followed by the super key for the root.
+ */
+static void
+initroot(Blk *r)
+{
+	char *p, kbuf[Keymax], vbuf[Inlmax];
+	Kvp kv;
+	Xdir d;
+
+	/* nb: values must be inserted in key order */
+	fillxdir(&d, Qmainroot, "", QTDIR, DMDIR|0775);
+	dir2kv(-1, &d, &kv, vbuf, sizeof(vbuf));
+	setval(r, &kv);
+
+	/* super key for the root; its value is the (-1, "") dir key */
+	p = packsuper(kbuf, sizeof(kbuf), Qmainroot);
+	kv.k = kbuf;
+	kv.nk = p - kbuf;
+	p = packdkey(vbuf, sizeof(vbuf), -1, "");
+	kv.v = vbuf;
+	kv.nv = p - vbuf;
+	setval(r, &kv);
+}
+
+/*
+ * Packs the snapshot record for id -- key from packsnap,
+ * value from packtree, back to back in one scratch
+ * buffer -- and appends it to the leaf s.
+ */
+static void
+addsnap(Blk *s, vlong id, Tree *t)
+{
+	char *k, *v, *end, *lim, buf[Kvmax];
+	Kvp kv;
+
+	k = buf;
+	lim = buf + sizeof(buf);
+	v = packsnap(k, lim - k, id);
+	end = packtree(v, lim - v, t);
+	kv.k = k;
+	kv.nk = v - k;
+	kv.v = v;
+	kv.nv = end - v;
+	setval(s, &kv);
+}
+
+/*
+ * Fills the snapshot leaf s with the three labels
+ * ("adm", "empty", "main") followed by the three
+ * snapshot records they name: 0 (empty) and 2 (main)
+ * rooted at r, 1 (adm) rooted at a.  Each record takes
+ * the next generation from fs->nextgen, in that order.
+ */
+static void
+initsnap(Blk *s, Blk *r, Blk *a)
+{
+	char lbuf[Kvmax];
+	Tree t;
+	Kvp kv;
+
+	lbl2kv("adm", 1, Lmut|Ltsnap, &kv, lbuf, sizeof(lbuf));
+	setval(s, &kv);
+	lbl2kv("empty", 0, 0, &kv, lbuf, sizeof(lbuf));
+	setval(s, &kv);
+	lbl2kv("main", 2, Lmut|Ltsnap, &kv, lbuf, sizeof(lbuf));
+	setval(s, &kv);
+
+	/* empty */
+	memset(&t, 0, sizeof(t));
+	t.bp = r->bp;
+	t.ht = 1;
+	t.nlbl = 1;
+	t.nref = 2;
+	t.pred = 0;
+	t.succ = 2;
+	t.gen = fs->nextgen++;
+	addsnap(s, 0, &t);
+
+	/* adm */
+	memset(&t, 0, sizeof(t));
+	t.bp = a->bp;
+	t.ht = 1;
+	t.nlbl = 1;
+	t.nref = 0;
+	t.pred = 0;
+	t.succ = -1;
+	t.gen = fs->nextgen++;
+	addsnap(s, 1, &t);
+
+	/* main */
+	memset(&t, 0, sizeof(t));
+	t.bp = r->bp;
+	t.ht = 1;
+	t.nlbl = 1;
+	t.nref = 0;
+	t.pred = 0;
+	t.succ = -1;
+	t.gen = fs->nextgen++;
+	addsnap(s, 2, &t);
+}
+
+/*
+ * Lays out a fresh arena whose two header blocks start
+ * at hdaddr and which spans asz bytes after them.
+ *
+ * The first block after the headers becomes the initial
+ * allocation log: it records the whole range as free,
+ * then itself as allocated, then a sync barrier.  Both
+ * header copies are then packed from a and written; they
+ * stay referenced through a->h0 and a->h1.
+ */
+static void
+initarena(Arena *a, uvlong hdaddr, vlong asz)
+{
+	Blk *b, *h0, *h1;
+	uvlong addr;
+	char *p;
+
+	b = cachepluck();
+
+	addr = hdaddr+2*Blksz;	/* leave room for arena hdr */
+
+	a->loghd.addr = -1;
+	a->loghd.hash = -1;
+	a->loghd.gen = -1;
+
+	memset(b->buf, 0, sizeof(b->buf));
+	b->type = Tlog;
+	b->bp.addr = addr;
+	b->logsz = 0;
+	b->logp = (Bptr){-1, -1, -1};
+	b->data = b->buf + Loghdsz;
+	setflag(b, Bdirty, 0);
+
+	p = b->buf + Loghdsz;
+	PACK64(p, addr|LogFree);	p += 8;	/* addr */
+	PACK64(p, asz-2*Blksz);		p += 8;	/* len */
+	PACK64(p, b->bp.addr|LogAlloc);	p += 8;	/* addr */
+	PACK64(p, Blksz);		p += 8;	/* len */
+	PACK64(p, (uvlong)LogSync);	p += 8;	/* barrier */
+	b->logsz = p - b->data;
+	finalize(b);
+	syncblk(b);
+
+	/*
+	 * Copy the block pointer out while we still hold our
+	 * reference; b must not be touched after dropblk.
+	 */
+	a->loghd = b->bp;
+	a->loghd.gen = -1;
+	dropblk(b);
+
+	a->size = asz;
+	a->used = Blksz;
+
+	h0 = cachepluck();
+	h1 = cachepluck();
+
+	memset(h0->buf, 0, sizeof(h0->buf));
+	h0->type = Tarena;
+	h0->bp.addr = hdaddr;
+	h0->data = h0->buf+2;
+	packarena(h0->data, Arenasz, a);
+	finalize(h0);
+	syncblk(h0);
+	a->h0 = h0;
+
+	/* second header copy, one block further on */
+	memset(h1->buf, 0, sizeof(h1->buf));
+	h1->type = Tarena;
+	h1->bp.addr = hdaddr+Blksz;
+	h1->data = h1->buf+2;
+	packarena(h1->data, Arenasz, a);
+	finalize(h1);
+	syncblk(h1);
+	a->h1 = h1;
+}
+
+/*
+ * Formats dev as an empty file system.
+ *
+ * The device is split into 8..32 equally sized arenas,
+ * with one block reserved at each end for the primary and
+ * backup superblocks.  The main tree root, the adm tree
+ * (with its initial users file built from reamuser) and
+ * the snapshot tree are then written, the arena headers
+ * are rewritten, and finally both superblocks go out.
+ * Any failure is fatal.
+ */
+void
+reamfs(char *dev)
+{
+	Blk *sb0, *sb1, *tb, *mb, *ab, *ub;
+	vlong sz, asz, off;
+	Mount *mnt, *adm;
+	Arena *a;
+	char *utab;
+	Dir *d;
+	int i, nu;
+
+	if(waserror())
+		sysfatal("ream %s: %s\n", dev, errmsg());
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("ream: %r");
+	sz = d->length;
+	free(d);
+
+	print("reaming %s\n", dev);
+	if(sz < 128*MiB+Blksz)
+		sysfatal("ream: disk too small");
+	/*
+	 * Use emalloc for all four: the results are dereferenced
+	 * immediately, and emalloc's result is used unchecked
+	 * throughout this file (presumably it raises on failure).
+	 */
+	mnt = emalloc(sizeof(Mount), 1);
+	mnt->root = emalloc(sizeof(Tree), 1);
+	adm = emalloc(sizeof(Mount), 1);
+	adm->root = emalloc(sizeof(Tree), 1);
+
+	/* round down to a block, less the two superblocks */
+	sz = sz - sz%Blksz - 2*Blksz;
+	fs->narena = (sz + 4096ULL*GiB - 1) / (4096ULL*GiB);
+	if(fs->narena < 8)
+		fs->narena = 8;
+	if(fs->narena >= 32)
+		fs->narena = 32;
+	fs->arenas = emalloc(fs->narena*sizeof(Arena), 1);
+
+	/* each arena gets asz usable bytes plus two header blocks */
+	off = Blksz;
+	asz = sz/fs->narena;
+	asz = asz - (asz % Blksz) - 2*Blksz;
+
+	sb0 = cachepluck();
+	sb1 = cachepluck();
+	sb0->bp = (Bptr){0, -1, -1};
+	sb1->bp = (Bptr){sz+Blksz, -1, -1};
+
+	fs->arenabp = emalloc(fs->narena * sizeof(Bptr), 1);
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		print("\tarena %d: %lld blocks at %llx\n", i, asz/Blksz, off);
+		initarena(a, off, asz);
+		fs->arenabp[i] = a->h0->bp;
+		off += asz+2*Blksz;
+	}
+
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadarena(a, a->h0->bp);
+		loadlog(a, a->loghd);
+	}
+
+	if((mb = newblk(mnt->root, Tleaf)) == nil)
+		sysfatal("ream: allocate root: %r");
+	holdblk(mb);
+	initroot(mb);
+	finalize(mb);
+	syncblk(mb);
+
+	mnt->root->ht = 1;
+	mnt->root->bp = mb->bp;
+
+	if((ab = newblk(adm->root, Tleaf)) == nil)
+		sysfatal("ream: allocate root: %r");
+	if((ub = newdblk(adm->root, 0, 1)) == nil)
+		sysfatal("ream: allocate root: %r");
+	holdblk(ab);
+	holdblk(ub);
+	utab = smprint(
+		"-1:adm::%s\n"
+		"0:none::\n"
+		"1:%s:%s:\n",
+		reamuser, reamuser, reamuser);
+	if(utab == nil)
+		error(Enomem);
+	nu = strlen(utab);
+	memcpy(ub->data, utab, nu);
+	free(utab);
+	finalize(ub);
+	syncblk(ub);
+	initadm(ab, ub, nu);
+	finalize(ab);
+	syncblk(ab);
+
+	adm->root->ht = 1;
+	adm->root->bp = ab->bp;
+
+	/*
+	 * Now that we have a completely empty fs, give it
+	 * a single snap block that the tree will insert
+	 * into, and take a snapshot as the initial state.
+	 */
+	if((tb = newblk(mnt->root, Tleaf)) == nil)
+		sysfatal("ream: allocate snaps: %r");
+	holdblk(tb);
+	initsnap(tb, mb, ab);
+	finalize(tb);
+	syncblk(tb);
+
+	fs->snap.bp = tb->bp;
+	fs->snap.ht = 1;
+	fs->snapdl.hd.addr = -1;
+	fs->snapdl.hd.hash = -1;
+	fs->snapdl.tl.addr = -1;
+	fs->snapdl.tl.hash = -1;
+	fs->nextqid = Nreamqid;
+
+	/*
+	 * NOTE(review): each of these blocks is dropped twice,
+	 * here and after the arena write-back; presumably one
+	 * drop pairs with newblk and one with holdblk -- confirm.
+	 */
+	dropblk(mb);
+	dropblk(ab);
+	dropblk(ub);
+	dropblk(tb);
+
+	/*
+	 * We need to write back all of the arenas
+	 * with the updated free lists
+	 */
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		finalize(a->logtl);
+		syncblk(a->logtl);
+		packarena(a->h0->data, Blksz, a);
+		finalize(a->h0);
+		syncblk(a->h0);
+		packarena(a->h1->data, Blksz, a);
+		finalize(a->h1);
+		syncblk(a->h1);
+		fs->arenabp[i] = a->h0->bp;
+		dropblk(a->h0);
+		dropblk(a->h1);
+	}
+
+	dropblk(mb);
+	dropblk(ab);
+	dropblk(ub);
+	dropblk(tb);
+
+	/*
+	 * Finally, write back the superblock and backup
+	 * superblock.
+	 */
+	packsb(sb0->buf, Blksz, fs);
+	packsb(sb1->buf, Blksz, fs);
+	finalize(sb0);
+	finalize(sb1);
+	syncblk(sb0);
+	syncblk(sb1);
+	dropblk(sb0);
+	dropblk(sb1);
+	/* the scratch mounts and their trees were only needed for allocation */
+	free(mnt->root);
+	free(mnt);
+	free(adm->root);
+	free(adm);
+	poperror();
+}
+
+/*
+ * Grows the file system on dev into space added after
+ * its last arena.
+ *
+ * The primary superblock and existing arena headers are
+ * loaded, the end of the last arena is computed, and the
+ * remaining space (at least 64 MiB) is split into four
+ * new arenas.  The superblock is then rewritten at block
+ * 0 and again at the last block of the device.  Any
+ * failure is fatal.
+ */
+void
+growfs(char *dev)
+{
+	vlong oldsz, newsz, asz, off, eb;
+	int i, narena;
+	Arena *a;
+	Bptr bp;
+	Dir *d;
+
+	if(waserror())
+		sysfatal("grow %s: %s\n", dev, errmsg());
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("grow: %r");
+
+	bp = (Bptr){0, -1, -1};
+	fs->sb0 = getblk(bp, GBnochk);
+	unpacksb(fs, fs->sb0->buf, Blksz);
+	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
+		sysfatal("malloc: %r");
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadarena(a, fs->arenabp[i]);
+		fs->arenabp[i] = a->h0->bp;
+	}
+	/* the old end of data is the end of the last arena */
+	a = &fs->arenas[fs->narena-1];
+	oldsz = a->h0->bp.addr + a->size + 2*Blksz;
+	newsz = d->length - d->length%Blksz - 2*Blksz;
+	if(newsz - oldsz < 64*MiB)
+		sysfatal("new arenas too small (%lld < %lld), not growing", newsz - oldsz, 64*MiB);
+	asz = (newsz - oldsz)/4;
+	asz = asz - asz % Blksz - 2*Blksz;
+	narena = fs->narena + 4;
+	assert(oldsz % Blksz == 0);
+	if((fs->arenas = realloc(fs->arenas, narena*sizeof(Arena))) == nil)
+		error(Enomem);
+	if((fs->arenabp = realloc(fs->arenabp, narena*sizeof(Bptr))) == nil)
+		error(Enomem);
+	/*
+	 * realloc leaves the added tail uninitialised, unlike
+	 * the zeroed arenas reamfs starts from; clear it before
+	 * the new arenas are set up.
+	 */
+	memset(&fs->arenas[fs->narena], 0, (narena - fs->narena)*sizeof(Arena));
+
+	off = oldsz;
+	for(i = fs->narena; i < narena; i++){
+		a = &fs->arenas[i];
+		print("\tnew arena %d: adding %lld blocks at %llx\n", i, asz/Blksz, off);
+		initarena(a, off, asz);
+		loadarena(a, a->h0->bp);
+		loadlog(a, a->loghd);
+		packarena(a->h0->data, Blksz, a);
+		packarena(a->h1->data, Blksz, a);
+		finalize(a->h0);
+		finalize(a->h1);
+		syncblk(a->h0);
+		syncblk(a->h1);
+
+		fs->arenabp[i] = a->h0->bp;
+		off += asz+2*Blksz;
+	}
+	fs->narena = narena;
+	packsb(fs->sb0->buf, Blksz, fs);
+	finalize(fs->sb0);
+	syncblk(fs->sb0);
+	/*
+	 * We're being a bit tricksy here: because we're on a bigger
+	 * partition, we don't know where the end is; just load the
+	 * first block, and patch the address in to the right place
+	 * when we write it back.
+	 */
+	eb = d->length;
+	eb = eb - (eb%Blksz) - Blksz;
+	fs->sb0->bp = (Bptr){eb, -1, -1};
+	packsb(fs->sb0->buf, Blksz, fs);
+	finalize(fs->sb0);
+	syncblk(fs->sb0);
+	free(d);
+	poperror();
+}
--- /dev/null
+++ b/snap.c
@@ -1,0 +1,613 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "atomic.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Queues the deadlist's current insertion block (if any)
+ * for writing, makes it the list head, and clears
+ * dl->ins.  For every list except the snap deadlist
+ * (gen -1) the updated list record is also upserted into
+ * fs->snap.
+ */
+static void
+dlflush(Dlist *dl)
+{
+	char kvbuf[512];
+	Msg m;
+
+	if(dl->ins == nil)
+		return;
+	traceb("dlflush", dl->ins->bp);
+	enqueue(dl->ins);
+	/* read the block pointer before giving up our reference */
+	dl->hd = dl->ins->bp;
+	dropblk(dl->ins);
+	if(dl->tl.addr == dl->hd.addr)
+		dl->tl = dl->hd;
+	dl->ins = nil;
+	/* special case: the snap dlist has gen -1, skip it */
+	if(dl->gen != -1){
+		m.op = Oinsert;
+		dlist2kv(dl, &m, kvbuf, sizeof(kvbuf));
+		btupsert(&fs->snap, &m, 1);
+	}
+}
+
+/*
+ * Unlinks dl from the cnext/cprev list headed by
+ * fs->dlhead and fs->dltail.  When hdel is set, the
+ * entry with the same (gen, bgen) is also removed from
+ * its fs->dlcache hash chain.
+ */
+static void
+dlcachedel(Dlist *dl, int hdel)
+{
+	Dlist **pp, *next, *prev;
+	uint h;
+
+	h = ihash(dl->gen) ^ ihash(dl->bgen);
+	if(hdel){
+		for(pp = &fs->dlcache[h % fs->dlcmax]; *pp != nil; pp = &(*pp)->chain){
+			if((*pp)->gen == dl->gen && (*pp)->bgen == dl->bgen){
+				*pp = (*pp)->chain;
+				break;
+			}
+		}
+	}
+
+	next = dl->cnext;
+	prev = dl->cprev;
+	if(fs->dlhead == dl)
+		fs->dlhead = next;
+	if(fs->dltail == dl)
+		fs->dltail = prev;
+	if(next != nil)
+		next->cprev = prev;
+	if(prev != nil)
+		prev->cnext = next;
+	dl->cnext = nil;
+	dl->cprev = nil;
+}
+
+/*
+ * Looks (gen, bgen) up in the fs->dlcache hash table.
+ * A hit is unlinked from the cnext/cprev list (but left
+ * in its hash chain) and returned; a miss returns nil.
+ */
+static Dlist*
+dlcacheget(vlong gen, vlong bgen)
+{
+	Dlist *d;
+	uint h;
+
+	h = ihash(gen) ^ ihash(bgen);
+	d = fs->dlcache[h % fs->dlcmax];
+	while(d != nil && (d->gen != gen || d->bgen != bgen))
+		d = d->chain;
+	if(d == nil)
+		return nil;
+	dlcachedel(d, 0);
+	return d;
+}
+
+/*
+ * Returns the deadlist for (gen, bgen).
+ *
+ * A cached entry is returned directly.  Otherwise a new
+ * Dlist is allocated and either loaded from its Kdlist
+ * record in fs->snap or, if no record exists, initialised
+ * empty and inserted into fs->snap.  The new entry is
+ * then pushed onto its fs->dlcache hash chain.  On error
+ * the allocation is freed and the error is re-raised.
+ */
+static Dlist*
+getdl(vlong gen, vlong bgen)
+{
+	char kbuf[Dlksz], kvbuf[Dlkvpsz];
+	Dlist *dl, **p;
+	uint h;
+	Msg m;
+	Kvp kv;
+	Key k;
+
+	if((dl = dlcacheget(gen, bgen)) != nil)
+		return dl;
+	dl = emalloc(sizeof(Dlist), 1);
+	if(waserror()){
+		free(dl);
+		nexterror();
+	}
+	/* key layout: Kdlist, gen, bgen */
+	kbuf[0] = Kdlist;
+	PACK64(kbuf+1, gen);
+	PACK64(kbuf+9, bgen);
+	k.k = kbuf;
+	k.nk = sizeof(kbuf);
+
+	/* load up existing dlist */
+	if(btlookup(&fs->snap, &k, &kv, kvbuf, sizeof(kvbuf))){
+		kv2dlist(&kv, dl);
+		goto Found;
+	}
+
+	/* create a new one if it didn't exist */
+	dl->gen = gen;
+	dl->bgen = bgen;
+	dl->hd.addr = -1;
+	dl->tl.addr = -1;
+	dl->ins = nil;
+
+	m.op = Oinsert;
+	dlist2kv(dl, &m, kvbuf, sizeof(kvbuf));
+	btupsert(&fs->snap, &m, 1);
+Found:
+	poperror();
+	/* push onto the front of the hash chain */
+	h = ihash(gen) ^ ihash(bgen);
+	p = &fs->dlcache[h % fs->dlcmax];
+	dl->chain = *p;
+	*p = dl;
+	return dl;
+}
+
+/*
+ * Releases a deadlist obtained from getdl by moving it to
+ * the head of the cnext/cprev list.  While fs->dlcount is
+ * at or above fs->dlcmax, entries are flushed, removed
+ * from the cache and freed starting at the tail.
+ *
+ * A nil dl and the snap deadlist (gen -1) are ignored.
+ * Error handlers such as the one in mergedl call putdl on
+ * pointers that may never have been assigned.
+ */
+void
+putdl(Dlist *dl)
+{
+	Dlist *dt;
+
+	if(dl == nil || dl->gen == -1)
+		return;
+	dlcachedel(dl, 0);
+	while(fs->dltail != nil && fs->dlcount >= fs->dlcmax){
+		dt = fs->dltail;
+		dlflush(dt);
+		dlcachedel(dt, 1);
+		/*
+		 * NOTE(review): dt->ins is nil here, since dlflush
+		 * clears it; this relies on dropblk accepting nil
+		 * -- confirm.
+		 */
+		dropblk(dt->ins);
+		free(dt);
+	}
+
+	dl->cprev = nil;
+	dl->cnext = fs->dlhead;
+	if(fs->dltail == nil)
+		fs->dltail = dl;
+	if(fs->dlhead != nil)
+		fs->dlhead->cprev = dl;
+	fs->dlhead = dl;
+}
+
+/*
+ * Frees the chain of blocks making up the deadlist dl,
+ * starting at dl->hd and following each block's logp.
+ * Unless dl is the snap deadlist (gen -1), its record is
+ * first deleted from fs->snap.  When docontents is set,
+ * every address recorded in the list is queued to be
+ * freed as well.
+ */
+void
+freedl(Dlist *dl, int docontents)
+{
+	char buf[Kvmax];
+	Arena *a;
+	Qent qe;
+	Bptr bp;
+	Msg m;
+	Blk *b;
+	char *p;
+
+	bp = dl->hd;
+	if(dl->gen != -1){
+		m.op = Odelete;
+		dlist2kv(dl, &m, buf, sizeof(buf));
+		btupsert(&fs->snap, &m, 1);
+	}
+	while(bp.addr != -1){
+		b = getblk(bp, 0);
+		/*
+		 * Because these deadlists are dead-dead at this point,
+		 * they'll never be read from again; we can avoid worrying
+		 * about deferred reclamation, and queue them up to be freed
+		 * directly, which means we don't need to worry about waiting
+		 * for a quiescent state, and the associated out-of-block
+		 * deadlocks that come with it.
+		 */
+		if(docontents){
+			/* the block body is a packed array of 8-byte addresses */
+			for(p = b->data; p != b->data+b->logsz; p += 8){
+				qe.op = Qfree;
+				qe.bp.addr = UNPACK64(p);
+				qe.bp.hash = -1;
+				qe.bp.gen = -1;
+				qe.b = nil;
+				a = getarena(qe.bp.addr);
+				qput(a->sync, qe);
+				traceb("dlclear", qe.bp);
+			}
+		}
+		/* remember the next block, then free this list block itself */
+		bp = b->logp;
+		qe.op = Qfree;
+		qe.bp = b->bp;
+		qe.b = nil;
+		a = getarena(qe.bp.addr);
+		qput(a->sync, qe);
+		traceb("dlfreeb", qe.bp);
+		dropblk(b);
+	}
+}
+
+/*
+ * Merges the deadlist (gen, bgen) into the deadlist
+ * (merge, bgen), then replaces both records in fs->snap
+ * with a single upsert: a delete for the source and an
+ * insert for the destination.
+ */
+static void
+mergedl(vlong merge, vlong gen, vlong bgen)
+{
+	char buf[2][Kvmax];
+	Dlist *d, *m;
+	Msg msg[2];
+	Blk *b;
+
+	d = nil;
+	m = nil;
+	/*
+	 * NOTE(review): if getdl raises, m and/or d are still
+	 * nil when this handler runs; putdl must tolerate nil.
+	 */
+	if(waserror()){
+		putdl(m);
+		putdl(d);
+		nexterror();
+	}
+	d = getdl(merge, bgen);
+	m = getdl(gen, bgen);
+	assert(d != m);
+	/*
+	 * If the dest dlist didn't exist,
+	 * just move the merge dlist over
+	 * and be done with it, otherwise
+	 * chain onto the existing dlist
+	 * tail.
+	 */
+	if(d->hd.addr == -1){
+		assert(d->ins == nil);
+		d->hd = m->hd;
+		d->tl = m->tl;
+		d->ins = m->ins;
+		if(d->ins != nil)
+			holdblk(d->ins);
+	}else{
+		/* push out the source's pending block before linking to it */
+		if(m->ins != nil){
+			enqueue(m->ins);
+			dropblk(m->ins);
+			m->ins = nil;
+		}
+		/*
+		 * NOTE(review): the destination's tail block is
+		 * pointed at m->hd, but d->tl is not advanced to
+		 * m->tl afterwards -- confirm this is intended.
+		 */
+		b = getblk(d->tl, 0);
+		b->logp = m->hd;
+		assert(d->hd.addr != m->hd.addr);
+		finalize(b);
+		syncblk(b);
+		dropblk(b);
+	}
+	msg[0].op = Odelete;
+	dlist2kv(m, &msg[0], buf[0], sizeof(buf[0]));
+	msg[1].op = Oinsert;
+	dlist2kv(d, &msg[1], buf[1], sizeof(buf[1]));
+	btupsert(&fs->snap, msg, 2);
+	putdl(m);
+	putdl(d);
+	poperror();
+}
+
+/*
+ * Disposes of the deadlists of snapshot gen.
+ *
+ * Every deadlist keyed by gen is scanned.  Lists with
+ * bgen <= prev are merged into succ, or into prev when
+ * there is no successor (succ == -1); the rest are freed
+ * along with the blocks they record.  If there is a
+ * successor, its deadlists with bgen > prev are then
+ * freed the same way.
+ */
+static void
+reclaimblocks(vlong gen, vlong succ, vlong prev)
+{
+	char pfx[9];
+	Dlist dl;
+	Scan s;
+
+	/* scan prefix: Kdlist followed by the generation */
+	pfx[0] = Kdlist;
+	PACK64(pfx+1, gen);
+	btnewscan(&s, pfx, sizeof(pfx));
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		kv2dlist(&s.kv, &dl);
+
+		if(succ != -1 && dl.bgen <= prev)
+			mergedl(succ, dl.gen, dl.bgen);
+		else if(dl.bgen <= prev)
+			mergedl(prev, dl.gen, dl.bgen);
+		else
+			freedl(&dl, 1);
+	}
+	btexit(&s);
+	if(succ != -1){
+		/* second pass, over the successor's deadlists */
+		pfx[0] = Kdlist;
+		PACK64(pfx+1, succ);
+		btnewscan(&s, pfx, sizeof(pfx));
+		btenter(&fs->snap, &s);
+		while(1){
+			if(!btnext(&s, &s.kv))
+				break;
+			kv2dlist(&s.kv, &dl);
+			if(dl.bgen > prev)
+				freedl(&dl, 1);
+		}
+		btexit(&s);
+	}
+}
+
+/*
+ * Removes a label from a snapshot, allowing
+ * it to be reclaimed if it is not a direct
+ * predecessor of more than one other snapshot.
+ *
+ * If it has one successor and no label, then
+ * it will be merged with that successor.
+ *
+ * name may be nil, in which case no label is
+ * removed and only the reclaim check is done.
+ * The labels "dump", "empty" and "adm" cannot
+ * be removed; asking to raises Ename.
+ */
+void
+delsnap(Tree *t, vlong succ, char *name)
+{
+	char *p, buf[4][Kvmax];
+	int nm, deltree;
+	Mount *mnt;
+	Msg m[4];
+
+	nm = 0;
+	deltree = 0;
+	if(name != nil){
+		if(strcmp(name, "dump") == 0
+		|| strcmp(name, "empty") == 0
+		|| strcmp(name, "adm") == 0)
+			error(Ename);
+
+		/* delete the label record */
+		m[nm].op = Odelete;
+		m[nm].k = buf[nm];
+		p = packlbl(buf[nm], sizeof(buf[nm]), name);
+		m[nm].nk = p - m[nm].k;
+		m[nm].v = nil;
+		m[nm].nv = 0;
+		t->nlbl--;
+		nm++;
+	}
+ 
+	if(t->nlbl == 0 && t->nref <= 1){
+		deltree = 1;
+		/* point the predecessor's succ at the caller-supplied successor */
+		m[nm].op = Orelink;
+		retag2kv(t->pred, succ, 0, 0, &m[nm], buf[nm], sizeof(buf[nm]));
+		nm++;
+		if(t->succ != -1){
+			/* point the successor's pred at our predecessor */
+			m[nm].op = Oreprev;
+			retag2kv(t->succ, t->pred, 0, 0, &m[nm], buf[nm], sizeof(buf[nm]));
+			nm++;
+		}
+		/* delete the snapshot record itself */
+		m[nm].op = Odelete;
+		m[nm].k = buf[nm];
+		p = packsnap(buf[nm], sizeof(buf[nm]), t->gen);
+		m[nm].nk = p - m[nm].k;
+		m[nm].v = nil;
+		m[nm].nv = 0;
+		nm++;
+	}
+	/* at most 1 label delete + relink + reprev + snap delete */
+	assert(nm <= nelem(m));
+	dlsync();
+	btupsert(&fs->snap, m, nm);
+	if(deltree){
+		reclaimblocks(t->gen, succ, t->pred);
+		/* mirror the relinking in any mounted neighbours */
+		for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
+			if(mnt->root->gen == t->succ)
+				mnt->root->pred = t->pred;
+			if(mnt->root->gen == t->pred)
+				mnt->root->succ = t->succ;
+		}
+	}
+}
+
+/*
+ * Attaches a label to a tree, incrementing
+ * its reference count. This labelled snapshot
+ * will show up in the dump.
+ *
+ * With Lmut set in flg, the label is attached to a
+ * newly generated tree record that takes t as its
+ * predecessor and base, and t gains a reference.
+ * Otherwise the label is attached to t itself.  The
+ * names "dump", "empty" and "adm" raise Ename.
+ */
+void
+tagsnap(Tree *t, char *name, int flg)
+{
+	char buf[3][Kvmax];
+	Msg m[3];
+	Tree *n;
+	int i;
+
+	if(strcmp(name, "dump") == 0
+	|| strcmp(name, "empty") == 0
+	|| strcmp(name, "adm") == 0)
+		error(Ename);
+
+	i = 0;
+	n = nil;
+	/*
+	 * Keep the handler armed until btupsert has returned,
+	 * so the scratch tree is released on every error path.
+	 */
+	if(waserror()){
+		free(n);
+		nexterror();
+	}
+	if(flg & Lmut){
+		/* n only exists to be packed into a record; it is freed below */
+		n = emalloc(sizeof(Tree), 1);
+		n->memref = 1;
+		n->dirty = 0;
+		n->nlbl = 1;
+		n->nref = 0;
+		n->ht = t->ht;
+		n->bp = t->bp;
+		n->succ = -1;
+		n->pred = t->gen;
+		n->base = t->gen;
+		n->gen = aincv(&fs->nextgen, 1);
+		n->memgen = aincv(&fs->nextgen, 1);
+
+		t->nref++;
+		m[i].op = Orelink;
+		retag2kv(t->gen, t->succ, 0, 1, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+		m[i].op = Oinsert;
+		lbl2kv(name, n->gen, flg, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+		m[i].op = Oinsert;
+		tree2kv(n, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+	}else{
+		t->nlbl++;
+		m[i].op = Orelink;
+		retag2kv(t->gen, t->succ, 1, 0, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+
+		m[i].op = Oinsert;
+		/*
+		 * NOTE(review): the relink message above adds one
+		 * label, but t->nlbl is incremented twice in memory,
+		 * and t->pred is pointed at t itself -- confirm both
+		 * are intended.
+		 */
+		t->pred = t->gen;
+		t->nlbl++;
+		lbl2kv(name, t->gen, flg, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+	}
+	btupsert(&fs->snap, m, i);
+	poperror();
+	free(n);
+}
+
+/*
+ * Updates a snapshot; keeps the generation the same if possible,
+ * otherwise moves to a new generation. A snapshot may only stay
+ * at the same generation as long as it is at the tip of a snapshot
+ * list; once it's observable by a derived snapshot it must be
+ * immutable.
+ *
+ * Does nothing unless o is dirty.  On success *r is replaced
+ * with the new tree and o is closed.
+ */
+void
+updatesnap(Tree **r, Tree *o, char *lbl, int flg)
+{
+	char buf[4][Kvmax];
+	Msg m[4];
+	Tree *t;
+	int i;
+
+	if(!o->dirty)
+		return;
+
+	traceb("updatesnap", o->bp);
+	/* update the old kvp */
+	o->nlbl--;
+	o->nref++;
+
+	/* create the new one */
+
+	t = emalloc(sizeof(Tree), 1);
+	if(waserror()){
+		free(t);
+		nexterror();
+	}
+	t->memref = 1;
+	t->dirty = 0;
+
+	t->nlbl = 1;
+	t->nref = 0;
+	t->ht = o->ht;
+	t->bp = o->bp;
+	t->succ = -1;
+	t->base = o->base;
+	/* the new tree takes over o's in-memory generation */
+	t->gen = o->memgen;
+	t->memgen = aincv(&fs->nextgen, 1);
+
+	i = 0;
+	m[i].op = Orelink;
+	if(o->nlbl == 0 && o->nref == 1){
+		/* o is about to be deleted: link its predecessor straight to t */
+		t->pred = o->pred;
+		retag2kv(t->pred, t->gen, 0, 0, &m[i], buf[i], sizeof(buf[i]));
+	}else{
+		/* o stays: it loses the label and gains t as a reference */
+		t->pred = o->gen;
+		retag2kv(t->pred, t->gen, -1, 1, &m[i], buf[i], sizeof(buf[i]));
+	}
+	i++;
+
+	m[i].op = Oinsert;
+	tree2kv(t, &m[i], buf[i], sizeof(buf[i]));
+	i++;
+	m[i].op = Oinsert;
+	lbl2kv(lbl, t->gen, flg, &m[i], buf[i], sizeof(buf[i]));
+	i++;
+	btupsert(&fs->snap, m, i);
+
+	/* only update the dirty status after we sync */
+	o->dirty = 0;
+
+	/* this was the last ref to the snap */
+	if(o->nlbl == 0 && o->nref == 1)
+		delsnap(o, t->gen, nil);
+	closesnap(o);
+	asetp(r, t);
+	poperror();
+}
+
+/*
+ * open snapshot by label, returning a tree.
+ *
+ * Returns nil if the label cannot be packed or does not
+ * exist.  If flg is not nil, the label's flags are stored
+ * in *flg.  A label whose snapshot record is missing is
+ * reported through broke(Efs).  The returned tree has
+ * memref 1 and a fresh memgen.
+ */
+Tree*
+opensnap(char *label, int *flg)
+{
+	char *p, buf[Kvmax];
+	Tree *t;
+	vlong gen;
+	Kvp kv;
+	Key k;
+
+	/* Klabel{"name"} => Ksnap{id} */
+	if((p = packlbl(buf, sizeof(buf), label)) == nil)
+		return nil;
+	k.k = buf;
+	k.nk = p - buf;
+	if(!btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)))
+		return nil;
+	/* label value: one byte, then the 8-byte gen and 4-byte flags */
+	assert(kv.nv == 1+8+4);
+	gen = UNPACK64(kv.v + 1);
+	if(flg != nil)
+		*flg = UNPACK32(kv.v + 1+8);
+
+	/*
+	 * Allocate with emalloc, as tagsnap and updatesnap do;
+	 * the result is handed straight to unpacktree and must
+	 * not be nil.
+	 */
+	t = emalloc(sizeof(Tree), 1);
+	if(waserror()){
+		free(t);
+		nexterror();
+	}
+	p = packsnap(buf, sizeof(buf), gen);
+	k.k = buf;
+	k.nk = p - buf;
+	if(!btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)))
+		broke(Efs);
+	unpacktree(t, kv.v, kv.nv);
+	t->memref = 1;
+	t->memgen = aincv(&fs->nextgen, 1);
+	poperror();
+	return t;
+}
+
+/*
+ * Drops one in-memory reference to t.  When the count
+ * reaches zero the tree is handed to limbo(DFtree, ...).
+ * A nil t is ignored.
+ */
+void
+closesnap(Tree *t)
+{
+	if(t != nil && adec(&t->memref) == 0)
+		limbo(DFtree, t);
+}
+
+/*
+ * Flushes the snap deadlist and then every deadlist on
+ * the list headed by fs->dlhead.
+ */
+void
+dlsync(void)
+{
+	Dlist *cur, *next;
+
+	tracem("dlsync");
+	dlflush(&fs->snapdl);
+	cur = fs->dlhead;
+	while(cur != nil){
+		/* grab the link first, as the original loop did */
+		next = cur->cnext;
+		dlflush(cur);
+		cur = next;
+	}
+}
+
+/*
+ * Marks a block as killed by the tree
+ * t, which means that it will be free
+ * for use after t is reclaimed.
+ *
+ * t must be an active snapshot with
+ * no successors.
+ */
+void
+killblk(Tree *t, Bptr bp)
+{
+	Dlist *dl;
+	Blk *b;
+	char *p;
+
+	/* 
+	 * When we have a forked snap, blocks allocated before the fork
+	 * are the responsibility of the other chain; in this chain, we
+	 * leak it and let the last reference in the other chain clean up
+	 */
+	if(t == &fs->snap)
+		dl = &fs->snapdl;
+	else if(bp.gen > t->base)
+		dl = getdl(t->memgen, bp.gen);
+	else
+		return;
+	if(waserror()){
+		putdl(dl);
+		nexterror();
+	}
+	/* start a new list block if there is none or the current one is nearly full */
+	if(dl->ins == nil || Logspc - dl->ins->logsz < Logslop){
+		b = newblk(&fs->snap, Tdlist);
+		if(dl->ins != nil){
+			enqueue(dl->ins);
+			dropblk(dl->ins);
+		}
+		if(dl->tl.addr == -1)
+			dl->tl = b->bp;
+		/* the new block becomes the head and links to the old head */
+		b->logp = dl->hd;
+		dl->hd = b->bp;
+		dl->ins = b;
+		cacheins(b);
+	}
+	/* append the killed block's address to the insertion block */
+	p = dl->ins->data + dl->ins->logsz;
+	dl->ins->logsz += 8;
+	setflag(dl->ins, Bdirty, 0);
+	PACK64(p, bp.addr);
+	poperror();
+	putdl(dl);
+}
--- /dev/null
+++ b/tree.c
@@ -1,0 +1,1543 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+typedef struct Path	Path;
+
+/*
+ * One level of the root-to-leaf path walked when messages
+ * are flushed down the tree.  The first group of fields
+ * describes the work to be done at this level; the second
+ * group is filled in with the result as the update
+ * propagates back up.
+ */
+struct Path {
+	/* Flowing down for flush */
+	Msg	*ins;	/* inserted values, bounded by lo..hi */
+	Blk	*b;	/* to shadow */
+	int	idx;	/* insert at */
+	int	lo;	/* key range */
+	int	hi;	/* key range */
+	int	sz;	/* size of range */
+
+	/* Flowing up from flush */
+	int	op;	/* change done along path */
+	Blk	*m;	/* node merged against, for post-update free */
+	Blk	*nl;	/* new left */
+	Blk	*nr;	/* new right, if we split or rotated */
+	int	midx;	/* modification index */
+	int	npull;	/* number of messages successfully pulled */
+	int	pullsz;	/* size of pulled messages */
+};
+
+/*
+ * freeblk that ignores nil blocks.  The arguments are
+ * parenthesised so that any expression can be passed;
+ * note that b is still evaluated twice.
+ */
+#define efreeblk(t, b) do { \
+	if((b) != nil) \
+		freeblk((t), (b)); \
+	} while(0)
+
+/*
+ * Sorts the nm messages in m by key using insertion
+ * sort.  Elements are swapped only when strictly out of
+ * order, so messages with equal keys keep their relative
+ * order.
+ */
+static void
+stablesort(Msg *m, int nm)
+{
+	int i, j;
+	Msg tmp;
+
+	for(i = 1; i < nm; i++){
+		j = i;
+		while(j > 0 && keycmp(&m[j-1], &m[j]) > 0){
+			tmp = m[j];
+			m[j] = m[j-1];
+			m[j-1] = tmp;
+			j--;
+		}
+	}
+}
+
+/*
+ * Copies the key bytes of src into buf (which must hold
+ * at least src->nk bytes) and points dst at the copy.
+ */
+void
+cpkey(Key *dst, Key *src, char *buf, int nbuf)
+{
+	int n;
+
+	n = src->nk;
+	assert(n <= nbuf);
+	memmove(buf, src->k, n);
+	dst->nk = n;
+	dst->k = buf;
+}
+
+/*
+ * Copies the key and value of src back to back into buf
+ * (key first) and points dst at the copies.  buf must
+ * hold nk+nv bytes.  dst may be the same object as src:
+ * both lengths are read, and both copies are made, before
+ * dst is written.
+ */
+void
+cpkvp(Kvp *dst, Kvp *src, char *buf, int nbuf)
+{
+	int nk, nv;
+
+	nk = src->nk;
+	nv = src->nv;
+	assert(nk+nv <= nbuf);
+	memmove(buf, src->k, nk);
+	memmove(buf+nk, src->v, nv);
+	dst->k = buf;
+	dst->nk = nk;
+	dst->v = buf+nk;
+	dst->nv = nv;
+}
+
+/*
+ * Orders two keys bytewise; when one key is a prefix of
+ * the other, the shorter sorts first.  Always returns
+ * exactly -1, 0 or 1.
+ */
+int
+keycmp(Key *a, Key *b)
+{
+	int r, len;
+
+	len = a->nk;
+	if(b->nk < len)
+		len = b->nk;
+	r = memcmp(a->k, b->k, len);
+	if(r < 0)
+		return -1;
+	if(r > 0)
+		return 1;
+	if(a->nk == b->nk)
+		return 0;
+	return (a->nk < b->nk) ? -1 : 1;
+}
+
+/*
+ * Bytes a message occupies in a pivot's buffer,
+ * including its 2-byte offset slot.
+ */
+static int
+msgsz(Msg *m)
+{
+	int hdr, key, val;
+
+	hdr = 2 + 1;		/* disp + op */
+	key = 2 + m->nk;	/* klen + key */
+	val = 2 + m->nv;	/* vlen + v */
+	return hdr + key + val;
+}
+
+/*
+ * Bytes a key/value pair occupies in a block, including
+ * its 2-byte offset slot.
+ */
+static int
+valsz(Kvp *kv)
+{
+	int key, val;
+
+	key = 2 + kv->nk;	/* klen + key */
+	val = 2 + kv->nv;	/* vlen + v */
+	return 2 + key + val;
+}
+
+/*
+ * Decodes the i'th key/value pair of b into kv.  The
+ * pointers in kv refer directly into the block's data;
+ * nothing is copied.
+ */
+void
+getval(Blk *b, int i, Kvp *kv)
+{
+	char *ent;
+	int off;
+
+	assert(i >= 0 && i < b->nval);
+	/* slot i of the offset table locates the entry */
+	off = UNPACK16(b->data + 2*i);
+	ent = b->data + off;
+	kv->nk = UNPACK16(ent);
+	kv->k = ent + 2;
+	ent = kv->k + kv->nk;
+	kv->nv = UNPACK16(ent);
+	kv->v = ent + 2;
+}
+
+/*
+ * Decodes the block pointer held in kv's value and
+ * stores the 16-bit fill that follows it in *fill.
+ *
+ * NOTE(review): the assert admits nv == Ptrsz, but the
+ * fill is read from v+Ptrsz unconditionally; for a bare
+ * Ptrsz value that reads two bytes past the value --
+ * confirm no caller passes one.
+ */
+Bptr
+getptr(Kvp *kv, int *fill)
+{
+	assert(kv->nv == Ptrsz || kv->nv == Ptrsz+2);
+	*fill = UNPACK16(kv->v + Ptrsz);
+	return unpackbp(kv->v, kv->nv);
+}
+
+/*
+ * Appends kv to b as its next value.  Entry bodies are
+ * packed downward from the end of the value space while
+ * the 2-byte offset table grows upward from the start,
+ * so callers must append in key order.
+ *
+ * Exported for reaming.
+ */
+void
+setval(Blk *b, Kvp *kv)
+{
+	int lim, off;
+	char *slot, *ent;
+
+	lim = Pivspc;
+	if(b->type == Tleaf)
+		lim = Leafspc;
+	b->valsz += 2 + kv->nk + 2 + kv->nv;
+	off = lim - b->valsz;
+
+	assert(2*(b->nval+1) + b->valsz <= lim);
+	assert(2*(b->nval+1) <= off);
+
+	slot = b->data + 2*b->nval;
+	PACK16(slot, off);
+
+	ent = b->data + off;
+	PACK16(ent, kv->nk);
+	ent += 2;
+	memmove(ent, kv->k, kv->nk);
+	ent += kv->nk;
+	PACK16(ent, kv->nv);
+	ent += 2;
+	memmove(ent, kv->v, kv->nv);
+
+	b->nval++;
+}
+
+/*
+ * Appends a child pointer to b: key k, with a value made
+ * of the packed block pointer followed by the 16-bit fill.
+ */
+static void
+setptr(Blk *b, Key *k, Bptr bp, int fill)
+{
+	char val[Ptrsz+2], *fillp;
+	Kvp kv;
+
+	fillp = packbp(val, sizeof(val), &bp);
+	PACK16(fillp, fill);
+
+	kv.k = k->k;
+	kv.nk = k->nk;
+	kv.v = val;
+	kv.nv = sizeof(val);
+	setval(b, &kv);
+}
+
+/*
+ * Appends message m to the buffer area of pivot b.  The
+ * buffer area starts Pivspc bytes into the block's data
+ * and is laid out like the value area: an offset table
+ * growing up, message bodies packed down from the end.
+ */
+static void
+setmsg(Blk *b, Msg *m)
+{
+	char *slot, *body;
+	int off;
+
+	assert(b->type == Tpivot);
+	/* msgsz counts the offset slot; bufsz does not */
+	b->bufsz += msgsz(m)-2;
+	off = Bufspc - b->bufsz;
+
+	slot = b->data + Pivspc + 2*b->nbuf;
+	PACK16(slot, off);
+
+	body = b->data + Pivspc + off;
+	body[0] = m->op;
+	body += 1;
+	PACK16(body, m->nk);
+	body += 2;
+	memmove(body, m->k, m->nk);
+	body += m->nk;
+	PACK16(body, m->nv);
+	body += 2;
+	memmove(body, m->v, m->nv);
+
+	b->nbuf++;
+}
+
+/*
+ * Decodes the i'th buffered message of pivot b into m.
+ * The key and value pointers refer directly into the
+ * block's data; nothing is copied.
+ */
+void
+getmsg(Blk *b, int i, Msg *m)
+{
+	char *base, *body;
+	int off;
+
+	assert(b->type == Tpivot);
+	assert(i >= 0 && i < b->nbuf);
+	base = b->data + Pivspc;
+	off = UNPACK16(base + 2*i);
+	body = base + off;
+	m->op = body[0];
+	body += 1;
+	m->nk = UNPACK16(body);
+	m->k = body + 2;
+	body = m->k + m->nk;
+	m->nv = UNPACK16(body);
+	m->v = body + 2;
+}
+
+/*
+ * Binary searches the buffered messages of b for key k.
+ *
+ * If k is present, returns the index of the first message
+ * with that key and sets *same to 1.  Otherwise returns
+ * the index of the last message that sorts before k (-1
+ * if there is none) and sets *same to 0.  When m is not
+ * nil and the index is valid, that message is decoded
+ * into m.
+ */
+static int
+bufsearch(Blk *b, Key *k, Msg *m, int *same)
+{
+	int lo, hi, mid, hit, r;
+	Msg cur;
+
+	hit = -1;
+	lo = 0;
+	hi = b->nbuf-1;
+	while(lo <= hi){
+		mid = (hi + lo) / 2;
+		getmsg(b, mid, &cur);
+		r = keycmp(k, &cur);
+		if(r > 0)
+			lo = mid+1;
+		else{
+			/*
+			 * we can have duplicate messages, and we
+			 * want to point to the first of them: on
+			 * a match, remember it and keep searching
+			 * to the left.
+			 */
+			if(r == 0)
+				hit = mid;
+			hi = mid-1;
+		}
+	}
+	*same = (hit != -1);
+	if(hit == -1)
+		hit = lo-1;
+	if(m != nil && hit >= 0)
+		getmsg(b, hit, m);
+	return hit;
+}
+
+/*
+ * Binary searches the values of b for key k.
+ *
+ * If k is present, returns the index of the first value
+ * with that key and sets *same to 1.  Otherwise returns
+ * the index of the last value that sorts before k (-1 if
+ * there is none) and sets *same to 0.  When the index is
+ * valid, that value is decoded into rp.
+ */
+static int
+blksearch(Blk *b, Key *k, Kvp *rp, int *same)
+{
+	int lo, hi, mid, hit, r;
+	Kvp cur;
+
+	hit = -1;
+	lo = 0;
+	hi = b->nval-1;
+	while(lo <= hi){
+		mid = (hi + lo) / 2;
+		getval(b, mid, &cur);
+		r = keycmp(k, &cur);
+		if(r > 0)
+			lo = mid+1;
+		else{
+			/* on a match, remember it and keep looking left */
+			if(r == 0)
+				hit = mid;
+			hi = mid-1;
+		}
+	}
+	*same = (hit != -1);
+	if(hit == -1)
+		hit = lo-1;
+	if(hit >= 0)
+		getval(b, hit, rp);
+	return hit;
+}
+
+/*
+ * Bytes of pivot b's buffer area in use: the offset
+ * table plus the message bodies.
+ */
+static int
+buffill(Blk *b)
+{
+	int slots;
+
+	assert(b->type == Tpivot);
+	slots = 2*b->nbuf;
+	return slots + b->bufsz;
+}
+
+/*
+ * Reports whether adding nmsg more messages, totalling
+ * needed body bytes, would overflow pivot b's buffer area.
+ */
+static int
+filledbuf(Blk *b, int nmsg, int needed)
+{
+	int used;
+
+	assert(b->type == Tpivot);
+	used = 2*(b->nbuf+nmsg) + b->bufsz + needed;
+	return used > Bufspc;
+}
+
+/*
+ * Reports whether adding one more value with needed body
+ * bytes would overflow leaf b.
+ */
+static int
+filledleaf(Blk *b, int needed)
+{
+	int used;
+
+	assert(b->type == Tleaf);
+	used = 2*(b->nval+1) + b->valsz + needed;
+	return used > Leafspc;
+}
+
+/*
+ * Reports whether pivot b lacks room for one more offset
+ * slot plus `reserve` maximum-size entries (Kpmax bytes
+ * each) in its value area.
+ */
+static int
+filledpiv(Blk *b, int reserve)
+{
+	int used;
+
+	/* 
+	 * We need to guarantee there's room for one message
+	 * at all times, so that splits along the whole path
+	 * have somewhere to go as they propagate up.
+	 */
+	assert(b->type == Tpivot);
+	used = 2*(b->nval+1) + b->valsz + reserve*Kpmax;
+	return used > Pivspc;
+}
+
+/*
+ * Appends to n a pointer to the child c, keyed by the
+ * smaller of c's first value key and c's first buffered
+ * message key.  A child with no values is skipped.  If
+ * nbytes is not nil, the size of the new entry is added
+ * to it.
+ */
+static void
+copyupblk(Blk *n, Blk *c, int *nbytes)
+{
+	Kvp kv;
+	Msg m;
+
+	if(c->nval <= 0)
+		return;
+	getval(c, 0, &kv);
+	if(c->nbuf > 0){
+		getmsg(c, 0, &m);
+		if(keycmp(&kv, &m) > 0)
+			kv.Key = m.Key;
+	}
+	setptr(n, &kv, c->bp, blkfill(c));
+	if(nbytes != nil)
+		*nbytes += valsz(&kv);
+}
+
+/*
+ * Copies pointers to the block(s) produced one level
+ * down (pp->nl, and pp->nr if set) into the new parent n.
+ */
+static void
+copyup(Blk *n, Path *pp, int *nbytes)
+{
+	/*
+	 * It's possible for the previous node to have
+	 * been fully cleared out by a large number of
+	 * delete messages, so copyupblk checks if
+	 * there's anything in it to copy up.
+	 */
+	copyupblk(n, pp->nl, nbytes);
+	if(pp->nr != nil)
+		copyupblk(n, pp->nr, nbytes);
+}
+
+/*
+ * Applies the wstat message m to the directory entry
+ * held in kv, rewriting kv's value in place.
+ *
+ * The first byte of m's value is a bitmask of Ow* flags;
+ * the new field values follow in the fixed order tested
+ * below.  The qid version is always bumped.  A message
+ * whose length does not match its flags is fatal.
+ */
+static void
+statupdate(Kvp *kv, Msg *m)
+{
+	int op;
+	char *p;
+	Xdir d;
+
+	p = m->v;
+	op = *p++;
+	kv2dir(kv, &d);
+	/* bump version */
+	d.qid.vers++;
+	if(op & Owsize){
+		d.length = UNPACK64(p);
+		p += 8;
+	}
+	if(op & Owmode){
+		d.mode = UNPACK32(p);
+		/* the qid type is taken from the top byte of the mode */
+		d.qid.type = d.mode>>24;
+		p += 4;
+	}
+	if(op & Owmtime){
+		d.mtime = UNPACK64(p);
+		p += 8;
+	}
+	if(op & Owatime){
+		d.atime = UNPACK64(p);
+		p += 8;
+	}
+	if(op & Owuid){
+		d.uid = UNPACK32(p);
+		p += 4;
+	}
+	if(op & Owgid){
+		d.gid = UNPACK32(p);
+		p += 4;
+	}
+	if(op & Owmuid){
+		d.muid = UNPACK32(p);
+		p += 4;
+	}
+	/* every byte of the message must have been consumed */
+	if(p != m->v + m->nv)
+		fatal("malformed stat: kv=%P, m=%M\n", kv, m);
+	if(packdval(kv->v, kv->nv, &d) == nil)
+		fatal("repacking dir failed\n");
+}
+
+/*
+ * Applies message m to the key/value pair kv.
+ *
+ * Returns 1 if kv holds a value afterwards and 0 if it
+ * does not (updateleaf only stores the pair when the
+ * result is nonzero).  Oinsert replaces kv with a copy of
+ * m made in buf; Owstat, Orelink and Oreprev rewrite
+ * kv's value in place.
+ */
+static int
+apply(Kvp *kv, Msg *m, char *buf, int nbuf)
+{
+	vlong *pv;
+	char *p;
+	Tree t;
+
+	switch(m->op){
+	case Odelete:
+		assert(keycmp(kv, m) == 0);
+		return 0;
+	case Oclearb:
+	case Oclobber:
+		return 0;
+	case Oinsert:
+		cpkvp(kv, m, buf, nbuf);
+		return 1;
+	case Owstat:
+		assert(keycmp(kv, m) == 0);
+		statupdate(kv, m);
+		return 1;
+	case Orelink:
+	case Oreprev:
+		/*
+		 * message value: new succ (Orelink) or pred (Oreprev),
+		 * then one-byte deltas for nlbl and nref
+		 */
+		unpacktree(&t, kv->v, kv->nv);
+		p = m->v;
+		pv = (m->op == Orelink) ? &t.succ : &t.pred;
+		*pv = UNPACK64(p);	p += 8;
+		t.nlbl += *p;		p++;
+		t.nref += *p;		p++;
+		assert(t.nlbl >= 0 && t.nref >= 0);
+		assert(p == m->v + m->nv);
+		packtree(kv->v, kv->nv, &t);
+		return 1;
+	default:
+		fatal("invalid op %d\n", m->op);
+	}
+	return 0;
+}
+
+/*
+ * Fetches message i from path element p into m, taking
+ * it from p->ins when that is set and from the block
+ * p->b otherwise.
+ *
+ * Returns -1 without fetching if i is outside [0, p->hi)
+ * or *full is already set.  If the message does not fit
+ * in spc bytes, sets *full and returns -1.  Otherwise
+ * returns keycmp(v, m), or 0 when v is nil.
+ */
+static int
+pullmsg(Path *p, int i, Kvp *v, Msg *m, int *full, int spc)
+{
+	if(i < 0 || i >= p->hi || *full)
+		return -1;
+
+	if(p->ins == nil)
+		getmsg(p->b, i, m);
+	else
+		*m = p->ins[i];
+	if(msgsz(m) > spc){
+		*full = 1;
+		return -1;
+	}
+	if(v == nil)
+		return 0;
+	return keycmp(v, m);
+}
+
+/*
+ * Creates a new block with the contents of the old
+ * block. When copying the contents, it repacks them
+ * to minimize the space used, and applies the changes
+ * pending from the parent.
+ *
+ * p is the leaf's path element and up its parent's; the
+ * messages to apply are up's entries in [up->lo, up->hi),
+ * taken from up->ins when set and from up->b otherwise.
+ * The old leaf values and the messages are merged in key
+ * order.  On return p->nl is the new leaf, p->npull the
+ * number of messages consumed, and p->pullsz has been
+ * increased as messages were pulled.
+ */
+static void
+updateleaf(Tree *t, Path *up, Path *p)
+{
+	char buf[Msgmax];
+	int i, j, c, ok, full, spc;
+	Blk *b, *n;
+	Bptr bp;
+	Msg m;
+	Kvp v;
+
+	i = 0;
+	j = up->lo;
+	b = p->b;
+	/*
+	 * spc is the amount of room we have
+	 * to copy data down from the parent; it's
+	 * necessarily a bit conservative, because
+	 * deletion messages don't take space -- but
+	 * we don't know what the types of all
+	 * messages are.
+	 */
+	full = 0;
+	spc = Leafspc - blkfill(b);
+	n = newblk(t, b->type);
+	assert(i >= 0 && j >= 0);
+	/* i walks the old leaf's values, j the parent's messages */
+	while(i < b->nval || j < up->hi){
+		if(i >= b->nval)
+			c = 1;
+		else{
+			/* default to copying the value unless a message sorts at or before it */
+			c = -1;
+			getval(p->b, i, &v);
+			if(j < up->hi){
+				if(up->ins != nil)
+					m = up->ins[j];
+				else
+					getmsg(up->b, j, &m);
+				if(msgsz(&m) <= spc)
+					c = keycmp(&v, &m);
+				else
+					full = 1;
+			}
+		}
+		switch(c){
+		/* Value before message: just copy value */
+		case -1:
+			i++;
+			setval(n, &v);
+			break;
+		/* Value merges with message sequence */
+		case 0:
+			i++;
+			j++;
+			/* move v into buf so it can be modified in place */
+			cpkvp(&v, &v, buf, sizeof(buf));
+			/* a data pointer that is being replaced or removed frees its block */
+			if(v.nk > 0 && v.k[0] == Kdat)
+			if(m.op == Oclearb
+			|| m.op == Oinsert
+			|| m.op == Odelete){
+				bp = unpackbp(v.v, v.nv);
+				freebp(t, bp);
+			}
+			ok = apply(&v, &m, buf, sizeof(buf));
+			goto Copyloop;
+		/* Message before value: Insert message sequence */
+		case 1:
+			j++;
+			cpkvp(&v, &m, buf, sizeof(buf));
+			ok = 0;
+			/* clear/clobber messages with nothing to act on produce no value */
+			if(m.op != Oclearb && m.op != Oclobber){
+				spc -= valsz(&m);
+				p->pullsz += msgsz(&m);
+				ok = 1;
+			}
+			goto Copyloop;
+		Copyloop:
+			/* fold in any further messages for the same key */
+			while(j < up->hi){
+				if(pullmsg(up, j, &v, &m, &full, spc) != 0)
+					break;
+				if(ok && v.nk > 0 && v.k[0] == Kdat)
+				if(m.op == Oclearb
+				|| m.op == Oinsert
+				|| m.op == Odelete){
+					bp = unpackbp(v.v, v.nv);
+					freebp(t, bp);
+				}
+				p->pullsz += msgsz(&m);
+				ok = apply(&v, &m, buf, sizeof(buf));
+				j++;
+			}
+			if(ok)
+				setval(n, &v);
+			break;
+		}
+	}
+	p->npull = (j - up->lo);
+	p->nl = n;
+}
+
+/*
+ * Creates the replacement for pivot p->b in a new block.
+ *
+ * The values are copied first. When a child path element
+ * pp is given, the entry at p->midx is replaced by whatever
+ * copyup(n, pp, nil) emits, and the entry after it is
+ * skipped as well if the child was rotated or merged.
+ *
+ * The buffered messages of the old block are then copied,
+ * skipping the pp->npull messages starting at p->lo, and
+ * interleaved with messages pulled from up->lo..up->hi for
+ * as long as pullmsg reports that they fit. The new block
+ * is stored in p->nl and the number of messages taken
+ * from `up` in p->npull.
+ */
+static void
+updatepiv(Tree *t, Path *up, Path *p, Path *pp)
+{
+	char buf[Msgmax];
+	int i, j, sz, full, spc;
+	Blk *b, *n;
+	Msg m, u;
+
+	b = p->b;
+	n = newblk(t, b->type);
+	/* values: substitute the child's result at p->midx */
+	for(i = 0; i < b->nval; i++){
+		if(pp != nil && i == p->midx){
+			copyup(n, pp, nil);
+			/* rotate/merge consumed the neighbouring entry too */
+			if(pp->op == POrot || pp->op == POmerge)
+				i++;
+		}else{
+			getval(b, i, &m);
+			setval(n, &m);
+		}
+	}
+	i = 0;
+	j = up->lo;
+	sz = 0;
+	full = 0;
+	/* room left in the buffer, plus what the child pulled out of it */
+	spc = Bufspc - buffill(b);
+	if(pp != nil)
+		spc += pp->pullsz;
+	while(i < b->nbuf){
+		/* skip the messages the child already consumed */
+		if(i == p->lo)
+			i += pp->npull;
+		if(i == b->nbuf)
+			break;
+		getmsg(b, i, &m);
+		switch(pullmsg(up, j, &m, &u, &full, spc - sz)){
+		/* nothing to pull, or own message sorts first/equal: keep it */
+		case -1:
+		case 0:
+			setmsg(n, &m);
+			i++;
+			break;
+		/* message from `up` sorts first: pull it and its same-key run */
+		case 1:
+			cpkvp(&m, &u, buf, sizeof(buf));
+			while(pullmsg(up, j, &m, &u, &full, spc) == 0){
+				setmsg(n, &u);
+				sz = msgsz(&u);
+				p->pullsz += sz;
+				spc -= sz;
+				j++;
+			}
+		}
+	}
+	/* own messages exhausted: append what still fits from `up` */
+	while(j < up->hi){
+		pullmsg(up, j, nil, &u, &full, spc);
+		if(full)
+			break;
+		setmsg(n, &u);
+		sz = msgsz(&u);
+		p->pullsz += sz;
+		spc -= sz;
+		j++;
+	}
+	p->npull = (j - up->lo);
+	p->nl = n;
+}
+
+/*
+ * Splits leaf p->b into two new blocks, merging in the
+ * messages pending in `up` as the values are copied.
+ *
+ * The left block is filled until roughly half of the
+ * estimated size has been copied, while keeping at least
+ * two values on each side; the first value routed to the
+ * right block is stored in *mid. On return p->op is
+ * POsplit, p->nl/p->nr hold the new blocks and p->npull
+ * the number of messages consumed from `up`. The loop
+ * stops when the old values run out, so messages sorting
+ * after the last value are not pulled here.
+ *
+ * If an error is raised, both new blocks are released
+ * with efreeblk before the error is propagated.
+ */
+static void
+splitleaf(Tree *t, Path *up, Path *p, Kvp *mid)
+{
+	char buf[Msgmax];
+	Blk *b, *d, *l, *r;
+	int full, copied, spc, ok, halfsz;
+	int i, j, c;
+	Bptr bp;
+	Msg m;
+	Kvp v;
+
+	/*
+	 * If the block one entry up the
+	 * p is nil, we're at the root,
+	 * so we want to make a new block.
+	 */
+	b = p->b;
+	l = nil;
+	r = nil;
+	if(waserror()){
+		efreeblk(t, l);
+		efreeblk(t, r);
+		nexterror();
+	}
+	l = newblk(t, b->type);
+	r = newblk(t, b->type);
+
+	d = l;
+	i = 0;
+	j = up->lo;
+	full = 0;
+	copied = 0;
+	/* target size per half: old contents plus pending messages, capped */
+	halfsz = (2*b->nval + b->valsz + up->sz) / 2;
+	if(halfsz > Leafspc/2)
+		halfsz = Leafspc/2;
+	spc = Leafspc - (halfsz + Msgmax);
+	assert(b->nval >= 4);
+	while(i < b->nval){
+		/*
+		 * We're trying to balance size,
+		 * but we need at least 2 nodes
+		 * in each half of the split if
+		 * we want a valid tree.
+		 */
+		if(d == l)
+		if((i == b->nval-2) || (i >= 2 && copied >= halfsz)){
+			/* switch to the right half; its first key is the separator */
+			d = r;
+			spc = Leafspc - (halfsz + Msgmax);
+			getval(b, i, mid);
+		}
+		getval(b, i, &v);
+ 		c = pullmsg(up, j, &v, &m, &full, spc);
+		switch(c){
+		/* value sorts first, or nothing can be pulled: copy value */
+		case -1:
+			i++;
+			setval(d, &v);
+			copied += valsz(&v);
+			break;
+		/* message targets this value: apply the message run to it */
+		case 0:
+			i++;
+			j++;
+			cpkvp(&v, &v, buf, sizeof(buf));
+			copied += valsz(&v);
+			/* a replaced or removed Kdat value gives up its block */
+			if(v.nk > 0 && v.k[0] == Kdat)
+			if(m.op == Oclearb
+			|| m.op == Oinsert
+			|| m.op == Odelete){
+				bp = unpackbp(v.v, v.nv);
+				freebp(t, bp);
+			}
+			ok = apply(&v, &m, buf, sizeof(buf));
+			goto Copyloop;
+		/* message sorts first: it starts a new value */
+		case 1:
+			j++;
+			cpkvp(&v, &m, buf, sizeof(buf));
+			copied += valsz(&v);
+			ok = 0;
+			if(m.op != Oclearb && m.op != Oclobber){
+				spc -= valsz(&m);
+				p->pullsz += msgsz(&m);
+				ok = 1;
+			}
+			goto Copyloop;
+		Copyloop:
+			/* apply every following message with the same key */
+			while(j < up->hi){
+				if(pullmsg(up, j, &v, &m, &full, spc) != 0)
+					break;
+				if(ok && v.nk > 0 && v.k[0] == Kdat)
+				if(m.op == Oclearb
+				|| m.op == Oinsert
+				|| m.op == Odelete){
+					bp = unpackbp(v.v, v.nv);
+					freebp(t, bp);
+				}
+				p->pullsz += msgsz(&m);
+				ok = apply(&v, &m, buf, sizeof(buf));
+				j++;
+			}
+			if(ok)
+				setval(d, &v);
+			break;
+		}
+	}
+	p->npull = (j - up->lo);
+	p->op = POsplit;
+	p->nl = l;
+	p->nr = r;
+	poperror();
+}
+
+/*
+ * Splits pivot p->b into two new blocks.
+ *
+ * Values are distributed first: the left block is filled
+ * until about half of the value bytes have been copied,
+ * keeping at least two entries on each side, and the first
+ * entry routed to the right block is stored in *mid. The
+ * entry at p->idx is replaced by the output of copyup()
+ * for the child path element pp. Buffered messages are
+ * then distributed by comparing their keys against *mid,
+ * skipping the pp->npull messages starting at p->lo.
+ * Messages pending in the parent are not pulled here (the
+ * second parameter is unused).
+ *
+ * On return p->op is POsplit and p->nl/p->nr hold the new
+ * blocks. If an error is raised both new blocks are
+ * released with efreeblk before it is propagated.
+ */
+static void
+splitpiv(Tree *t, Path *, Path *p, Path *pp, Kvp *mid)
+{
+	int i, copied, halfsz;
+	Blk *b, *d, *l, *r;
+	Kvp tk;
+	Msg m;
+
+	/*
+	 * If the block one entry up the
+	 * p is nil, we're at the root,
+	 * so we want to make a new block.
+	 */
+	b = p->b;
+	l = nil;
+	r = nil;
+	if(waserror()){
+		efreeblk(t, l);
+		efreeblk(t, r);
+		nexterror();
+	}
+	l = newblk(t, b->type);
+	r = newblk(t, b->type);
+	d = l;
+	copied = 0;
+	halfsz = (2*b->nval + b->valsz)/2;
+	assert(b->nval >= 4);
+	for(i = 0; i < b->nval; i++){
+		/*
+		 * We're trying to balance size,
+		 * but we need at least 2 nodes
+		 * in each half of the split if
+		 * we want a valid tree.
+		 */
+		if(d == l)
+		if((i == b->nval-2) || (i >= 2 && copied >= halfsz)){
+			d = r;
+			getval(b, i, mid);
+		}
+		/* the modified child is re-emitted from pp, not copied */
+		if(i == p->idx){
+			copyup(d, pp, &copied);
+			continue;
+		}
+		getval(b, i, &tk);
+		setval(d, &tk);
+		copied += valsz(&tk);
+	}
+	d = l;
+	for(i = 0; i < b->nbuf; i++){
+		/* skip the messages the child already consumed */
+		if(i == p->lo)
+			i += pp->npull;
+		if(i == b->nbuf)
+			break;
+		getmsg(b, i, &m);
+		/* messages at or past the separator belong to the right half */
+		if(d == l && keycmp(&m, mid) >= 0)
+			d = r;
+		setmsg(d, &m);
+	}
+	p->op = POsplit;
+	p->nl = l;
+	p->nr = r;
+	poperror();
+}
+
+/*
+ * Concatenates blocks a and b into one new block: all
+ * values of a followed by all values of b and, for
+ * pivots, all buffered messages of a followed by those
+ * of b. The new block is enqueued and recorded in pp as
+ * a POmerge result (pp->nl set, pp->nr cleared); idx is
+ * stored in p->midx.
+ */
+static void
+merge(Tree *t, Path *p, Path *pp, int idx, Blk *a, Blk *b)
+{
+	Blk *src[2], *dst;
+	Msg kv;
+	int s, k;
+
+	src[0] = a;
+	src[1] = b;
+	dst = newblk(t, a->type);
+	for(s = 0; s < 2; s++)
+		for(k = 0; k < src[s]->nval; k++){
+			getval(src[s], k, &kv);
+			setval(dst, &kv);
+		}
+	if(a->type == Tpivot)
+		for(s = 0; s < 2; s++)
+			for(k = 0; k < src[s]->nbuf; k++){
+				getmsg(src[s], k, &kv);
+				setmsg(dst, &kv);
+			}
+	enqueue(dst);
+	p->midx = idx;
+	pp->op = POmerge;
+	pp->nl = dst;
+	pp->nr = nil;
+}
+
+/*
+ * Walks the buffer of b from index *idx looking for the
+ * first message whose key is >= the key of m, adding up
+ * the sizes of the messages passed on top of the current
+ * buffer usage of d.
+ *
+ * Returns 1 as soon as that total exceeds Bufspc (*idx is
+ * left untouched). Otherwise returns 0 with *idx set to
+ * the index of the message found plus o, or to b->nbuf
+ * if the end of the buffer was reached.
+ */
+static int
+spillscan(Blk *d, Blk *b, Msg *m, int *idx, int o)
+{
+	int pos, fill;
+	Msg cur;
+
+	fill = d->bufsz + 2*d->nbuf;
+	pos = *idx;
+	while(pos < b->nbuf){
+		getmsg(b, pos, &cur);
+		if(keycmp(m, &cur) <= 0){
+			*idx = pos + o;
+			return 0;
+		}
+		fill += msgsz(&cur);
+		if(fill > Bufspc)
+			return 1;
+		pos++;
+	}
+	*idx = b->nbuf;
+	return 0;
+}
+
+/*
+ * Reports whether the messages between *idx and the key
+ * of m, scanned first in l and then in r, would overflow
+ * the buffer of d. Leaves have no buffer and never spill.
+ * *idx is advanced by the scans.
+ */
+static int
+spillsbuf(Blk *d, Blk *l, Blk *r, Msg *m, int *idx)
+{
+	if(l->type == Tleaf)
+		return 0;
+
+	if(*idx < l->nbuf)
+		if(spillscan(d, l, m, idx, 0))
+			return 1;
+	/* the scan of l may have moved *idx up to l->nbuf */
+	if(*idx >= l->nbuf)
+		if(spillscan(d, r, m, idx, l->nbuf))
+			return 1;
+	return 0;
+}
+
+/*
+ * Redistributes the contents of sibling blocks a and b
+ * over two new blocks.
+ *
+ * Values are copied to the left block until either
+ * halfpiv bytes of values have been copied or spillsbuf()
+ * reports a buffer overflow; everything after that goes
+ * to the right block. For pivots the buffered messages of
+ * a and then b are copied as well, switching to the right
+ * block after sp messages. Both blocks are enqueued and
+ * recorded in pp as a POrot result; midx is stored in
+ * p->midx. On error both new blocks are released with
+ * efreeblk before the error is propagated.
+ *
+ * NOTE(review): sp is taken from idx, which is only
+ * updated when spillsbuf() is evaluated; when the switch is
+ * triggered by cp >= halfpiv the call is short-circuited
+ * for the current key. Verify that the message split point
+ * matches the value split point before relying on this.
+ */
+static void
+rotate(Tree *t, Path *p, Path *pp, int midx, Blk *a, Blk *b, int halfpiv)
+{
+	int i, o, cp, sp, idx;
+	Blk *d, *l, *r;
+	Msg m;
+
+	l = nil;
+	r = nil;
+	if(waserror()){
+		efreeblk(t, l);
+		efreeblk(t, r);
+		nexterror();
+	}
+	l = newblk(t, a->type);
+	r = newblk(t, a->type);
+	d = l;
+	cp = 0;
+	sp = -1;
+	idx = 0;
+	/* values of a, then of b; d flips from l to r at most once */
+	for(i = 0; i < a->nval; i++){
+		getval(a, i, &m);
+		if(d == l && (cp >= halfpiv || spillsbuf(d, a, b, &m, &idx))){
+			sp = idx;
+			d = r;
+		}
+		setval(d, &m);
+		cp += valsz(&m);
+	}
+	for(i = 0; i < b->nval; i++){
+		getval(b, i, &m);
+		if(d == l && (cp >= halfpiv || spillsbuf(d, a, b, &m, &idx))){
+			sp = idx;
+			d = r;
+		}
+		setval(d, &m);
+		cp += valsz(&m);
+	}
+	if(a->type == Tpivot){
+		d = l;
+		o = 0;
+		/* o counts messages copied since the last switch */
+		for(i = 0; i < a->nbuf; i++){
+			if(o == sp){
+				d = r;
+				o = 0;
+			}
+			getmsg(a, i, &m);
+			setmsg(d, &m);
+			o++;
+		}
+		for(i = 0; i < b->nbuf; i++){
+			if(o == sp){
+				d = r;
+				o = 0;
+			}
+			getmsg(b, i, &m);
+			setmsg(d, &m);
+			o++;
+		}
+	}
+	enqueue(l);
+	enqueue(r);
+	p->midx = midx;
+	pp->op = POrot;
+	pp->nl = l;
+	pp->nr = r;
+	poperror();
+}
+
+/*
+ * Decides how to rebalance sibling blocks a and b, which
+ * must be of the same type: merge them when the combined
+ * values stay below Pivspc - 4*Msgmax and the combined
+ * buffers below Bufspc, otherwise rotate when their value
+ * sizes differ by more than 4*Msgmax. In the remaining
+ * case nothing is done.
+ */
+static void
+rotmerge(Tree *t, Path *p, Path *pp, int idx, Blk *a, Blk *b)
+{
+	int valsa, valsb, bufa, bufb, diff;
+
+	assert(a->type == b->type);
+
+	valsa = a->valsz + 2*a->nval;
+	valsb = b->valsz + 2*b->nval;
+	/* leaves carry no buffer, so their sum is always < Bufspc */
+	bufa = 0;
+	bufb = 0;
+	if(a->type != Tleaf){
+		bufa = a->bufsz + 2*a->nbuf;
+		bufb = b->bufsz + 2*b->nbuf;
+	}
+	diff = (valsa < valsb) ? valsb - valsa : valsa - valsb;
+
+	if(valsa + valsb < (Pivspc - 4*Msgmax) && bufa + bufb < Bufspc){
+		merge(t, p, pp, idx, a, b);
+		return;
+	}
+	if(diff > 4*Msgmax)
+		rotate(t, p, pp, idx, a, b, (valsa + valsb)/2);
+}
+
+/*
+ * Attempts to rebalance the freshly written child pp->nl
+ * against one of its siblings in the parent p->b.
+ *
+ * idx is the index of the child in p->b. The left sibling
+ * (idx-1) is tried first, then the right one (idx+1); a
+ * sibling is only loaded when its recorded fill plus the
+ * fill of the child is below the capacity for the block
+ * type, and rotmerge() then decides between merging,
+ * rotating, or doing nothing. All blocks held here are
+ * dropped on exit, including on the error path.
+ */
+static void
+trybalance(Tree *t, Path *p, Path *pp, int idx)
+{
+	Blk *l, *m, *r;
+	Kvp kl, kr;
+	int spc, fill;
+	Bptr bp;
+
+	if(p->idx == -1 || pp == nil || pp->nl == nil)
+		return;
+	/*
+	 * NOTE(review): this condition is true for every value
+	 * of pp->op (nothing equals both POmod and POmerge), so
+	 * the function always returns here and the code below
+	 * is never reached. `&&` may have been intended --
+	 * confirm before changing, since that would enable the
+	 * merge/rotate paths, which are currently never run.
+	 */
+	if(pp->op != POmod || pp->op != POmerge)
+		return;
+
+	l = nil;
+	r = nil;
+	m = holdblk(pp->nl);
+	if(waserror()){
+		dropblk(m);
+		dropblk(l);
+		dropblk(r);
+		nexterror();
+	}
+	spc = (m->type == Tleaf) ? Leafspc : Pivspc;
+	/* try the left sibling first */
+	if(idx-1 >= 0){
+		getval(p->b, idx-1, &kl);
+		bp = getptr(&kl, &fill);
+		if(fill + blkfill(m) < spc){
+			l = getblk(bp, 0);
+			rotmerge(t, p, pp, idx-1, l, m);
+			goto Done;
+		}
+	}
+	/* then the right sibling */
+	if(idx+1 < p->b->nval){
+		getval(p->b, idx+1, &kr);
+		bp = getptr(&kr, &fill);
+		if(fill + blkfill(m) < spc){
+			r = getblk(bp, 0);
+			rotmerge(t, p, pp, idx, m, r);
+			goto Done;
+		}
+	}
+Done:
+	dropblk(m);
+	dropblk(l);
+	dropblk(r);
+	poperror();
+}
+
+/*
+ * Rewrites the blocks along path[0..npath-1] from the
+ * bottom up, pushing the pending messages of each element
+ * into the block below it.
+ *
+ * A leaf at the bottom is either updated in place (into a
+ * new block) or split, depending on filledleaf(); each
+ * pivot above is updated or split depending on
+ * filledpiv(). Every new block is enqueued. If the topmost
+ * rewritten node ended up as two blocks, a new pivot is
+ * created in path[0] to hold them.
+ *
+ * Returns the path element whose nl is to become the new
+ * root of the tree.
+ */
+static Path*
+flush(Tree *t, Path *path, int npath)
+{
+
+	Path *up, *p, *pp, *rp;
+	Kvp mid;
+
+	/*
+	 * The path must contain at minimum two elements:
+	 * we must have 1 node we're inserting into, and
+	 * an empty element at the top of the path that
+	 * we put the new root into if the root gets split.
+	 */
+	assert(npath >= 2);
+	rp = nil;
+	pp = nil;
+	/* p: node being rewritten; up: its parent; pp: child done last */
+	p = &path[npath - 1];
+	up = &path[npath - 2];
+	if(p->b->type == Tleaf){
+		if(!filledleaf(p->b, up->sz)){
+			updateleaf(t, p-1, p);
+			enqueue(p->nl);
+			rp = p;
+		}else{
+			splitleaf(t, up, p, &mid);
+			enqueue(p->nl);
+			enqueue(p->nr);
+		}
+		p->midx = -1;
+		pp = p;
+		up--;
+		p--;
+	}
+	/* walk up the pivots; path[0] is the placeholder above the root */
+	while(p != path){
+		if(!filledpiv(p->b, 1)){
+			trybalance(t, p, pp, p->idx);
+			/* If we merged the root node, break out. */
+			if(up == path && pp != nil && pp->op == POmerge && p->b->nval == 2){
+				rp = pp;
+				goto Out;
+			}
+			updatepiv(t, up, p, pp);
+			enqueue(p->nl);
+			rp = p;
+		}else{
+			splitpiv(t, up, p, pp, &mid);
+			enqueue(p->nl);
+			enqueue(p->nr);
+		}
+		pp = p;
+		up--;
+		p--;
+	}
+	/* the old root became two blocks: grow a new root above them */
+	if(pp->nl != nil && pp->nr != nil){
+		rp = &path[0];
+		rp->nl = newblk(t, Tpivot);
+		rp->npull = pp->npull;
+		rp->pullsz = pp->pullsz;
+		copyup(rp->nl, pp, nil);
+		enqueue(rp->nl);
+	}
+Out:
+	return rp;
+}
+
+/*
+ * Releases a path built by btupsert: for every element
+ * the shadowed block p->b and, when set, the block p->m
+ * are handed to freeblk(), the references on p->b, p->nl
+ * and p->nr are dropped, and finally the array itself is
+ * freed. Safe to call with path == nil and npath == 0.
+ */
+static void
+freepath(Tree *t, Path *path, int npath)
+{
+	Path *p;
+
+	for(p = path; p != path + npath; p++){
+		if(p->b != nil)
+			freeblk(t, p->b);
+		/* free p->m itself; this used to free p->b a second time */
+		if(p->m != nil)
+			freeblk(t, p->m);
+		dropblk(p->b);
+		dropblk(p->nl);
+		dropblk(p->nr);
+	}
+	free(path);
+}
+
+/*
+ * Picks the child of pivot b that has the largest run of
+ * buffered messages destined for it, and records the
+ * choice in p: the message range in p->lo..p->hi, its
+ * size in p->sz, and the child index in p->idx and
+ * p->midx. p->b is always set to b; the other fields are
+ * only written if some child has a non-empty run.
+ */
+static void
+victim(Blk *b, Path *p)
+{
+	int child, cur, start, best, sz, last;
+	Kvp sep;
+	Msg m;
+
+	p->b = b;
+	best = 0;
+	cur = 0;
+	for(child = 0; child < b->nval; child++){
+		/*
+		 * The run for a child ends at the key of the
+		 * next pivot entry; the last child takes every
+		 * remaining message.
+		 */
+		last = (child + 1 >= b->nval);
+		if(!last)
+			getval(b, child + 1, &sep);
+		start = cur;
+		sz = 0;
+		while(cur < b->nbuf){
+			getmsg(b, cur, &m);
+			if(!last && keycmp(&m, &sep) >= 0)
+				break;
+			/* 2 bytes for offset, plus message size in buffer */
+			sz += msgsz(&m);
+			cur++;
+		}
+		if(sz <= best)
+			continue;
+		best = sz;
+		p->op = POmod;
+		p->lo = start;
+		p->hi = cur;
+		p->sz = best;
+		p->idx = child;
+		p->midx = child;
+		p->npull = 0;
+		p->pullsz = 0;
+	}
+}
+
+/*
+ * Inserts nmsg messages directly into the buffer of root
+ * pivot b, without touching any other block.
+ *
+ * The root is duplicated, the messages are appended with
+ * setmsg(), and the 2-byte offset slot of each appended
+ * message is then moved to its sorted position in the
+ * table at r->data + Pivspc, after any existing messages
+ * with an equal key. The copy is enqueued and installed as
+ * the tree root under t->lk; the old root is freed.
+ */
+static void
+fastupsert(Tree *t, Blk *b, Msg *msg, int nmsg)
+{
+	int i, c, o, ri, lo, hi, mid, nbuf;
+	Msg cmp;
+	char *p;
+	Blk *r;
+
+	if((r = dupblk(t, b)) == nil)
+		error(Enomem);
+
+	/* nbuf: number of messages that were already sorted */
+	nbuf = r->nbuf;
+	for(i = 0; i < nmsg; i++)
+		setmsg(r, &msg[i]);
+
+	for(i = 0; i < nmsg; i++){
+		/* binary search slots 0..nbuf+i-1 for the insertion point */
+		ri = -1;
+		lo = 0;
+		hi = nbuf+i-1;
+		while(lo <= hi){
+			mid = (hi + lo) / 2;
+			getmsg(r, mid, &cmp);
+			c = keycmp(&msg[i], &cmp);
+			switch(c){
+			case -1:
+				hi = mid-1;
+				break;
+			case 0:
+				/* equal key: keep looking right, insert after it */
+				ri = mid+1;
+				lo = mid+1;
+				break;
+			case 1:
+				lo = mid+1;
+				break;
+			}
+		}
+		if(ri == -1)
+			ri = hi+1;
+		/* take the slot of message i out of the tail ... */
+		p = r->data + Pivspc + 2*(nbuf+i);
+		o = UNPACK16(p);
+		/* ... shift slots ri..nbuf+i-1 up by one, and reinsert at ri */
+		p = r->data + Pivspc + 2*ri;
+		memmove(p+2, p, 2*(nbuf+i-ri));
+		PACK16(p, o);
+	}
+	enqueue(r);
+
+	lock(&t->lk);
+	t->bp = r->bp;
+	t->dirty = 1;
+	unlock(&t->lk);
+
+	freeblk(t, b);
+	dropblk(b);
+	dropblk(r);
+}
+	
+
+/*
+ * Applies nmsg messages to tree t. The caller must hold
+ * fs->mutlk. msg is sorted in place with stablesort().
+ *
+ * If the root is a pivot whose buffer has room for all of
+ * the messages, they are inserted there by fastupsert().
+ * Otherwise a path is built from the root downwards,
+ * following the child picked by victim() for as long as
+ * the buffers are full, and flush() rewrites the blocks
+ * along it. The new root is then installed under t->lk
+ * and the tree height adjusted. If flush could not take
+ * every message, the whole procedure is repeated for the
+ * remaining ones.
+ *
+ * On error the path is released with freepath() and the
+ * error is propagated.
+ */
+void
+btupsert(Tree *t, Msg *msg, int nmsg)
+{
+	int i, npath, npull, dh, sz, height;
+	Path *path, *rp;
+	Blk *b, *rb;
+	Kvp sep;
+	Bptr bp;
+
+	assert(!canqlock(&fs->mutlk));
+	sz = 0;
+	stablesort(msg, nmsg);
+	for(i = 0; i < nmsg; i++)
+		sz += msgsz(&msg[i]);
+	npull = 0;
+	path = nil;
+	npath = 0;
+
+Again:
+	if(waserror()){
+		freepath(t, path, npath);
+		nexterror();
+	}
+
+	b = getroot(t, &height);
+	if(npull == 0 && b->type == Tpivot && !filledbuf(b, nmsg, sz)){
+		fastupsert(t, b, msg, nmsg);
+		poperror();
+		return;
+	}
+	/*
+	 * The tree can grow in height by 1 when we
+	 * split, so we allocate room for one extra
+	 * node in the path.
+	 */
+	npath = 0;
+	if((path = calloc((height + 2), sizeof(Path))) == nil)
+		error(Enomem);
+	path[npath].b = nil;
+	path[npath].idx = -1;
+	path[npath].midx = -1;
+	npath++;
+
+	/* path[0] stands in for the caller: its "buffer" is msg[] */
+	path[0].sz = sz;
+	path[0].ins = msg;
+	path[0].lo = npull;
+	path[0].hi = nmsg;
+	while(b->type == Tpivot){
+		if(!filledbuf(b, nmsg, path[npath - 1].sz))
+			break;
+		victim(b, &path[npath]);
+		getval(b, path[npath].idx, &sep);
+		bp = unpackbp(sep.v, sep.nv);
+		b = getblk(bp, 0);
+		npath++;
+	}
+	/* last element: the block that receives the messages */
+	path[npath].b = b;
+	path[npath].idx = -1;
+	path[npath].midx = -1;
+	path[npath].lo = -1;
+	path[npath].hi = -1;
+	path[npath].npull = 0;
+	path[npath].pullsz = 0;
+	npath++;
+
+	rp = flush(t, path, npath);
+	rb = rp->nl;
+
+	/* height change follows from where the new root landed */
+	if(path[0].nl != nil)
+		dh = 1;
+	else if(path[1].nl != nil)
+		dh = 0;
+	else if(npath >2 && path[2].nl != nil)
+		dh = -1;
+	else
+		fatal("broken path change");
+
+	assert(rb->bp.addr != 0);
+
+	lock(&t->lk);
+	traceb("setroot", rb->bp);
+	t->ht += dh;
+	t->bp = rb->bp;
+	t->dirty = 1;
+	unlock(&t->lk);
+
+	npull += rp->npull;
+	freepath(t, path, npath);
+	/*
+	 * Forget the freed path: if we loop, the error handler
+	 * installed at Again must not free it a second time
+	 * when getroot or fastupsert raises an error.
+	 */
+	path = nil;
+	npath = 0;
+	poperror();
+
+	if(npull != nmsg){
+		tracem("short pull");
+		goto Again;
+	}
+}
+
+/*
+ * Snapshots the root pointer and height of t under t->lk
+ * and returns the root block as loaded by getblk(). The
+ * height is stored in *h unless h is nil.
+ */
+Blk*
+getroot(Tree *t, int *h)
+{
+	Bptr root;
+	int ht;
+
+	lock(&t->lk);
+	root = t->bp;
+	ht = t->ht;
+	unlock(&t->lk);
+
+	if(h != nil)
+		*h = ht;
+	return getblk(root, 0);
+}
+
+/*
+ * Looks up key k in tree t.
+ *
+ * The tree is walked from the root to a leaf with
+ * blksearch(), keeping every block on the way. The leaf
+ * result, if any, is copied into buf, and the messages
+ * for k buffered in the pivots are then applied to it
+ * from the lowest pivot up to the root.
+ *
+ * Returns nonzero with *r filled in (backed by buf) when
+ * the key exists after applying the messages, 0
+ * otherwise. All blocks are released before returning,
+ * and also when an error is raised during the walk.
+ */
+int
+btlookup(Tree *t, Key *k, Kvp *r, char *buf, int nbuf)
+{
+	int i, j, h, ok, same;
+	Blk *b, **p;
+	Bptr bp;
+	Msg m;
+
+	b = getroot(t, &h);
+	if((p = calloc(h, sizeof(Blk*))) == nil){
+		dropblk(b);
+		error(Enomem);
+	}
+	ok = 0;
+	p[0] = holdblk(b);
+	/* getblk() below can raise: don't leak the blocks held so far */
+	if(waserror()){
+		for(i = 0; i < h; i++)
+			if(p[i] != nil)
+				dropblk(p[i]);
+		dropblk(b);
+		free(p);
+		nexterror();
+	}
+	for(i = 1; i < h; i++){
+		if(blksearch(p[i-1], k, r, &same) == -1)
+			break;
+		bp = unpackbp(r->v, r->nv);
+		p[i] = getblk(bp, 0);
+	}
+	/* only search the leaf if the walk got all the way down */
+	if(p[h-1] != nil)
+		blksearch(p[h-1], k, r, &ok);
+	if(ok)
+		cpkvp(r, r, buf, nbuf);
+	for(i = h-2; i >= 0; i--){
+		if(p[i] == nil)
+			continue;
+		j = bufsearch(p[i], k, &m, &same);
+		if(j < 0 || !same)
+			continue;
+		if(ok || m.op == Oinsert)
+			ok = apply(r, &m, buf, nbuf);
+		else if(m.op != Oclearb && m.op != Oclobber)
+			fatal("lookup %K << %M missing insert\n", k, &m);
+		/* apply the rest of the messages for the same key */
+		for(j++; j < p[i]->nbuf; j++){
+			getmsg(p[i], j, &m);
+			if(keycmp(k, &m) != 0)
+				break;
+			ok = apply(r, &m, buf, nbuf);
+		}
+	}
+	poperror();
+	for(i = 0; i < h; i++)
+		if(p[i] != nil)
+			dropblk(p[i]);
+	dropblk(b);
+	free(p);
+	return ok;
+}
+
+/*
+ * Initializes scan s for keys starting with the npfx
+ * bytes at pfx. The prefix is copied into s->pfxbuf and
+ * also becomes the initial key of s->kv (stored in
+ * s->kvbuf), with an empty value.
+ */
+void
+btnewscan(Scan *s, char *pfx, int npfx)
+{
+	memset(s, 0, sizeof(*s));
+
+	/* the prefix lives in the scan's own buffer */
+	memmove(s->pfxbuf, pfx, npfx);
+	s->pfx.k = s->pfxbuf;
+	s->pfx.nk = npfx;
+
+	s->first = 1;
+	s->donescan = 0;
+	s->offset = 0;
+
+	/* start key: the prefix itself, with no value */
+	s->kv.nv = 0;
+	s->kv.v = s->kvbuf+npfx;
+	cpkey(&s->kv, &s->pfx, s->kvbuf, sizeof(s->kvbuf));
+}
+
+/*
+ * Positions scan s in tree t at its current key s->kv.
+ *
+ * Allocates s->path with one entry per tree level and
+ * walks from the root down, recording for each level the
+ * block, the value index (vi) and, for pivots, the buffer
+ * index (bi) at which the scan continues. On the first
+ * entry (s->first) an entry equal to the key is kept as
+ * the starting point; otherwise the cursors are moved
+ * past it. Does nothing if the scan is already finished.
+ */
+void
+btenter(Tree *t, Scan *s)
+{
+	int i, same;
+	Scanp *p;
+	Msg m, c;
+	Bptr bp;
+	Blk *b;
+	Kvp v;
+
+	if(s->donescan)
+		return;
+	b = getroot(t, &s->ht);
+	if((s->path = calloc(s->ht, sizeof(Scanp))) == nil){
+		dropblk(b);
+		error(Enomem);
+	}
+	p = s->path;
+	p[0].b = b;
+	for(i = 0; i < s->ht; i++){
+		p[i].vi = blksearch(b, &s->kv, &v, &same);
+		if(b->type == Tpivot){
+			/* key sorts before every entry: descend into the first child */
+			if(p[i].vi == -1)
+				getval(b, ++p[i].vi, &v);
+			p[i].bi = bufsearch(b, &s->kv, &m, &same);
+			if(p[i].bi == -1){
+				p[i].bi++;
+			}else if(!same || !s->first){
+				/* scan past repeated messages */
+				while(p[i].bi < p[i].b->nbuf){
+					getmsg(p[i].b, p[i].bi, &c);
+					if(keycmp(&m, &c) != 0)
+						break;
+					p[i].bi++;
+				}
+			}
+			bp = unpackbp(v.v, v.nv);
+			b = getblk(bp, 0);
+			p[i+1].b = b;
+		}else if(p[i].vi == -1 || !same || !s->first)
+			/* leaf: start after the entry found unless it is the first hit */
+			p[i].vi++;
+	}
+	s->first = 0;
+}
+
+/*
+ * Produces the next key/value pair of scan s in *r.
+ *
+ * The candidate with the smallest key is chosen among the
+ * next leaf value and the next buffered message of every
+ * pivot on the scan path; all buffered messages for that
+ * key are then applied to it. If the result of the last
+ * apply() is zero, the search is repeated with the next
+ * key.
+ *
+ * Returns 1 with *r filled in (backed by s->kvbuf), or 0
+ * once the scan is exhausted or the next key no longer
+ * starts with the scan prefix. If an error is raised the
+ * scan is torn down with btexit() before it is propagated.
+ */
+int
+btnext(Scan *s, Kvp *r)
+{
+	int i, j, h, ok, start, bufsrc;
+	Scanp *p;
+	Msg m, n;
+	Bptr bp;
+	Kvp kv;
+
+Again:
+	p = s->path;
+	h = s->ht;
+	start = h;
+	bufsrc = -1;
+	if(s->donescan)
+		return 0;
+	if(waserror()){
+		btexit(s);
+		nexterror();
+	}
+	/* load up the correct blocks for the scan */
+	for(i = h-1; i >= 0; i--){
+		/* deepest level that still has values or messages left */
+		if(p[i].b != nil
+		&&(p[i].vi < p[i].b->nval || p[i].bi < p[i].b->nbuf))
+			break;
+		if(i == 0){
+			s->donescan = 1;
+			poperror();
+			return 0;
+		}
+		/* level exhausted: release it and advance the parent */
+		if(p[i].b != nil)
+			dropblk(p[i].b);
+		p[i].b = nil;
+		p[i].vi = 0;
+		p[i].bi = 0;
+		p[i-1].vi++;
+		start = i;
+	}
+
+	if(p[start-1].vi < p[start-1].b->nval){
+		/* reload the released levels by following the parent cursors */
+		for(i = start; i < h; i++){
+			getval(p[i-1].b, p[i-1].vi, &kv);
+			bp = unpackbp(kv.v, kv.nv);
+			p[i].b = getblk(bp, 0);
+		}
+	
+		/* find the minimum key along the path up */
+		m.op = Oinsert;
+		getval(p[h-1].b, p[h-1].vi, &m);
+	}else{
+		/* no values left at this level: continue from its buffer */
+		getmsg(p[start-1].b, p[start-1].bi, &m);
+		assert(m.op == Oinsert);
+		bufsrc = start-1;
+	}
+
+	/* a buffered message with a smaller key takes precedence */
+	for(i = h-2; i >= 0; i--){
+		if(p[i].b == nil || p[i].bi == p[i].b->nbuf)
+			continue;
+		getmsg(p[i].b, p[i].bi, &n);
+		if(keycmp(&n, &m) < 0){
+			bufsrc = i;
+			m = n;
+		}
+	}
+	/* left the prefix range: the scan is over */
+	if(m.nk < s->pfx.nk || memcmp(m.k, s->pfx.k, s->pfx.nk) != 0){
+		s->donescan = 1;
+		poperror();
+		return 0;
+	}
+
+	/* scan all messages applying to the message */
+	ok = 1;
+	cpkvp(r, &m, s->kvbuf, sizeof(s->kvbuf));
+	/* step over the candidate in whichever cursor produced it */
+	if(bufsrc == -1)
+		p[h-1].vi++;
+	else
+		p[bufsrc].bi++;
+	for(i = h-2; i >= 0; i--){
+		for(j = p[i].bi; p[i].b != nil && j < p[i].b->nbuf; j++){
+			getmsg(p[i].b, j, &m);
+			if(keycmp(r, &m) != 0)
+				break;
+			ok = apply(r, &m, s->kvbuf, sizeof(s->kvbuf));
+			p[i].bi++;
+		}
+	}
+	poperror();
+	if(!ok)
+		goto Again;
+	return 1;
+}
+
+/*
+ * Releases the blocks held by scan s and frees its path.
+ *
+ * Safe to call on a scan whose path was never allocated
+ * and safe to call more than once: btnext() already calls
+ * this from its error handler, so a caller cleaning up
+ * after that error must not free the path a second time.
+ */
+void
+btexit(Scan *s)
+{
+	int i;
+
+	if(s->path == nil)
+		return;
+	for(i = 0; i < s->ht; i++)
+		dropblk(s->path[i].b);
+	free(s->path);
+	s->path = nil;
+}
--- /dev/null
+++ b/user.c
@@ -1,0 +1,260 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Reads the first len bytes of the file with qid path
+ * `path` from tree t into a freshly allocated,
+ * NUL-terminated buffer, one Blksz-sized data block at a
+ * time. The caller frees the result.
+ *
+ * Raises Enomem if the buffer cannot be allocated and
+ * Esrch if a data block is missing; the buffer is freed
+ * before any error is propagated.
+ */
+static char*
+slurp(Tree *t, vlong path, vlong len)
+{
+	char *ret, buf[Offksz], kvbuf[Offksz + Ptrsz];
+	vlong o;
+	Blk *b;
+	Bptr bp;
+	Key k;
+	Kvp kv;
+
+	if((ret = malloc(len + 1)) == nil)
+		error(Enomem);
+	/* btlookup/getblk can raise: don't leak the buffer */
+	if(waserror()){
+		free(ret);
+		nexterror();
+	}
+	k.k = buf;
+	k.nk = Offksz;
+	for(o = 0; o < len; o += Blksz){
+		/* data key: Kdat, qid path, byte offset */
+		k.k[0] = Kdat;
+		PACK64(k.k+1, path);
+		PACK64(k.k+9, o);
+		if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf)))
+			error(Esrch);
+		bp = unpackbp(kv.v, kv.nv);
+		b = getblk(bp, GBraw);
+		if(len - o >= Blksz)
+			memcpy(ret + o, b->buf, Blksz);
+		else
+			memcpy(ret + o, b->buf, len - o);
+		/* release the reference taken by getblk */
+		dropblk(b);
+	}
+	poperror();
+	ret[len] = 0;
+	return ret;
+}
+
+/*
+ * Copies the next newline-terminated line at *p into buf,
+ * without the newline and truncated to fit nbuf, and
+ * advances *p past the newline. Returns buf, or nil if
+ * no newline is left in the input (a trailing line
+ * without a newline is not returned).
+ */
+static char*
+readline(char **p, char *buf, int nbuf)
+{
+	char *start, *nl;
+	int lim;
+
+	start = *p;
+	nl = strchr(start, '\n');
+	if(nl == nil)
+		return nil;
+
+	/* strecpy stops one short of its end pointer for the NUL */
+	lim = (nl - start) + 1;
+	if(lim > nbuf - 1)
+		lim = nbuf - 1;
+	strecpy(buf, buf + lim, start);
+
+	*p = nl + 1;
+	return buf;
+}
+
+/*
+ * Splits off the next delim-separated field of the string
+ * at *p. The delimiter is overwritten with NUL and *p is
+ * moved to the character after it; when no delimiter is
+ * left, the rest of the string is returned and *p becomes
+ * nil. Returns nil once *p is nil.
+ */
+static char*
+getfield(char **p, char delim)
+{
+	char *field, *sep;
+
+	field = *p;
+	if(field == nil)
+		return nil;
+
+	sep = strchr(field, delim);
+	if(sep == nil){
+		*p = nil;
+		return field;
+	}
+	*sep = '\0';
+	*p = sep + 1;
+	return field;
+}
+
+/*
+ * Returns the entry of fs->users whose name equals
+ * `name`, or nil if there is none.
+ */
+User*
+name2user(char *name)
+{
+	User *u;
+	int i;
+
+	for(i = 0; i < fs->nusers; i++){
+		u = &fs->users[i];
+		if(strcmp(u->name, name) != 0)
+			continue;
+		return u;
+	}
+	return nil;
+}
+
+/*
+ * Returns the entry of fs->users whose id equals `id`,
+ * or nil if there is none.
+ */
+User*
+uid2user(int id)
+{
+	int i;
+
+	i = 0;
+	while(i < fs->nusers){
+		if(fs->users[i].id == id)
+			return fs->users + i;
+		i++;
+	}
+	return nil;
+}
+
+/*
+ * Parses a users file held in udata and installs it as
+ * the file system's user table.
+ *
+ * Each non-empty line not starting with '#' has the form
+ *	id:name:leader:member,member,...
+ * The input is read twice: the first pass collects ids and
+ * names, the second resolves the leader and member names
+ * against them. Diagnostics are printed to fd.
+ *
+ * On success the new table replaces fs->users under
+ * fs->userlk, the old table is freed, and nil is
+ * returned. On failure the partially built table is freed,
+ * fs->users is left alone, and an error string is returned.
+ */
+static char*
+parseusers(int fd, char *udata)
+{
+	char *pu, *p, *f, *m, *err, buf[8192];
+	int i, j, lnum, ngrp, nusers, usersz;
+	User *u, *n, *users;
+	int *g, *grp;
+
+	i = 0;
+	err = nil;
+	nusers = 0;
+	usersz = 8;
+	if((users = calloc(usersz, sizeof(User))) == nil)
+		return Enomem;
+
+	/* pass 1: ids and names */
+	pu = udata;
+	lnum = 0;
+	while((p = readline(&pu, buf, sizeof(buf))) != nil){
+		lnum++;
+		if(p[0] == '#' || p[0] == 0)
+			continue;
+		if(i == usersz){
+			usersz *= 2;
+			n = realloc(users, usersz*sizeof(User));
+			if(n == nil){
+				free(users);
+				return Enomem;
+			}
+			users = n;
+			/*
+			 * realloc leaves the new half uninitialized;
+			 * zero it so fields that are only set
+			 * conditionally (lead) start out the same as
+			 * in the calloc'd entries.
+			 */
+			memset(users + i, 0, (usersz - i)*sizeof(User));
+		}
+		if((f = getfield(&p, ':')) == nil){
+			fprint(fd, "/adm/users:%d: missing ':' after id\n", lnum);
+			err = Esyntax;
+			goto Error;
+		}
+		u = &users[i];
+		u->id = atol(f);
+		if((f = getfield(&p, ':')) == nil){
+			fprint(fd, "/adm/users:%d: missing ':' after name\n", lnum);
+			err = Esyntax;
+			goto Error;
+		}
+		snprint(u->name, sizeof(u->name), "%s", f);
+		u->memb = nil;
+		u->nmemb = 0;
+		i++;
+	}
+	nusers = i;
+
+	/* pass 2: leaders and members, now that every name is known */
+	i = 0;
+	pu = udata;
+	lnum = 0;
+	while((p = readline(&pu, buf, sizeof(buf))) != nil){
+		lnum++;
+		if(buf[0] == '#' || buf[0] == 0)
+			continue;
+		getfield(&p, ':');	/* skip id */
+		getfield(&p, ':');	/* skip name */
+		if((f = getfield(&p, ':')) == nil){
+			fprint(fd, "/adm/users:%d: missing ':' after name\n", lnum);
+			err = Esyntax;
+			goto Error;
+		}
+		if(f[0] != '\0'){
+			u = nil;
+			for(j = 0; j < nusers; j++)
+				if(strcmp(users[j].name, f) == 0)
+					u = &users[j];
+			if(u == nil){
+				fprint(fd, "/adm/users:%d: leader %s does not exist\n", lnum, f);
+				err = Enouser;
+				goto Error;
+			}
+			users[i].lead = u->id;
+		}
+		if((f = getfield(&p, ':')) == nil){
+			err = Esyntax;
+			goto Error;
+		}
+		grp = nil;
+		ngrp = 0;
+		while((m = getfield(&f, ',')) != nil){
+			if(m[0] == '\0')
+				continue;
+			u = nil;
+			for(j = 0; j < nusers; j++)
+				if(strcmp(users[j].name, m) == 0)
+					u = &users[j];
+			if(u == nil){
+				fprint(fd, "/adm/users:%d: user %s does not exist\n", lnum, m);
+				free(grp);
+				err = Enouser;
+				goto Error;
+			}
+			if((g = realloc(grp, (ngrp+1)*sizeof(int))) == nil){
+				free(grp);
+				err = Enomem;
+				goto Error;
+			}
+			grp = g;
+			grp[ngrp++] = u->id;
+		}
+		users[i].memb = grp;
+		users[i].nmemb = ngrp;
+		i++;
+	}
+
+	/* install the new table; the old one is freed below */
+	wlock(&fs->userlk);
+	n = fs->users;
+	i = fs->nusers;
+	fs->users = users;
+	fs->nusers = nusers;
+	wunlock(&fs->userlk);
+	users = n;
+	nusers = i;
+
+Error:
+	/* frees the old table on success, the new one on failure */
+	if(users != nil)
+		for(i = 0; i < nusers; i++)
+			free(users[i].memb);
+	free(users);
+
+	return err;
+}
+
+/*
+ * Loads the user table from the file "users" found under
+ * the root entry "" of tree t. Diagnostics from the
+ * parser go to fd.
+ *
+ * If the file does not parse, the error is raised when a
+ * table is already loaded (the old one is kept) or when
+ * the file system is not permissive; otherwise a built-in
+ * default table is installed instead. Finally noneid,
+ * admid and nogroupid are refreshed from the table.
+ *
+ * Raises Efs, Esrch, Etype or Efsize if the file cannot
+ * be located or is unsuitable.
+ */
+void
+loadusers(int fd, Tree *t)
+{
+	char *s, *e;
+	vlong len;
+	Qid q;
+	User *u;
+
+	if(walk1(t, -1, "", &q, &len) == -1)
+		error(Efs);
+	if(walk1(t, q.path, "users", &q, &len) == -1)
+		error(Esrch);
+	if(q.type & QTDIR)
+		error(Etype);
+	if(len >= 1*MiB)
+		error(Efsize);
+	s = slurp(t, q.path, len);
+	e = parseusers(fd, s);
+	/*
+	 * The parser keeps no pointers into the file contents,
+	 * so release them now; the error paths below would
+	 * otherwise leak the buffer.
+	 */
+	free(s);
+	if(e != nil){
+		if(fs->users != nil){
+			fprint(2, "load users: %s\n", e);
+			fprint(2, "keeping old table\n");
+			error(e);
+		}
+		if(!permissive){
+			fprint(2, "user table broken: %s\n", e);
+			fprint(2, "\tnot permissive: bailing\n");
+			error(e);
+		}
+		fprint(2, "user table broken: %s\n", e);
+		fprint(2, "\tfalling back to default\n");
+		/* without even the default table there is nothing to run with */
+		if((e = parseusers(fd, "-1:adm::\n0:none::\n")) != nil)
+			error(e);
+	}
+	if((u = name2user("none")) != nil)
+		noneid = u->id;
+	if((u = name2user("adm")) != nil)
+		admid = u->id;
+	if((u = name2user("nogroup")) != nil)
+		nogroupid = u->id;
+}