shithub: gefs

ref: ea50e8ab08f8c76c9bed6169cb2bd6ea2696f44e
dir: gefs/fs.c

View raw version
#include <u.h>
#include <libc.h>
#include <auth.h>
#include <fcall.h>
#include <avl.h>

#include "dat.h"
#include "fns.h"
#include "atomic.h"

int
walk1(Tree *t, vlong up, char *name, Qid *qid, vlong *len)
{
	char *p, kbuf[Keymax], rbuf[Kvmax];
	int err;
	Xdir d;
	Kvp kv;
	Key k;

	err = 0;
	p = packdkey(kbuf, sizeof(kbuf), up, name);
	k.k = kbuf;
	k.nk = p - kbuf;
	if(err)
		return -1;
	if(!btlookup(t, &k, &kv, rbuf, sizeof(rbuf)))
		return -1;
	kv2dir(&kv, &d);
	*qid = d.qid;
	*len = d.length;
	return 0;
}

static void
wrbarrier(void)
{
	Qent qe;
	int i;
	
	aincv(&fs->qgen, 1);
	tracev("barrier", fs->qgen);
	fs->syncing = fs->nsyncers;
	for(i = 0; i < fs->nsyncers; i++){
		qe.op = Qfence;
		qe.bp.addr = 0;
		qe.bp.hash = -1;
		qe.bp.gen = -1;
		qe.b = nil;
		qput(&fs->syncq[i], qe);
	}
	aincv(&fs->qgen, 1);
	while(fs->syncing != 0)
		rsleep(&fs->syncrz);
	tracev("flushed", fs->qgen);
}

static void
sync(void)
{
	Mount *mnt;
	Arena *a;
	Dlist dl;
	int i;


	if(fs->rdonly)
		return;
	qlock(&fs->synclk);
	if(waserror()){
		fprint(2, "failed to sync: %s\n", errmsg());
		qunlock(&fs->synclk);
		nexterror();
	}

	/* 
	 * Wait for data that we're syncing to hit disk
	 */
	tracem("flush1");
	wrbarrier();
	/*
	 * pass 0: Update all open snapshots, and
	 *  pack the blocks we want to sync. Snap
	 *  while holding the write lock, and then
	 *  wait until all the blocks they point at
	 *  have hit disk; once they're on disk, we
	 *  can take a consistent snapshot.
         */
	qlock(&fs->mutlk);
	tracem("packb");
	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next)
		updatesnap(&mnt->root, mnt->root, mnt->name);
	/*
	 * Now that we've updated the snaps, we can sync the
	 * dlist; the snap tree will not change from here.
	 */
	dlsync();
	dl = fs->snapdl;
	fs->snapdl.hd = Zb;
	fs->snapdl.tl = Zb;
	fs->snapdl.ins = nil;
	traceb("syncdl.dl", dl.hd);
	traceb("syncdl.rb", fs->snap.bp);
	for(i = 0; i < fs->narena; i++){
		a = &fs->arenas[i];
		qlock(a);
		/*
		 * because the log uses preallocated
		 * blocks, we need to write the log
		 * block out synchronously, or it may
		 * get reused.
		 */
		logbarrier(a, fs->qgen);
		finalize(a->logtl);
		syncblk(a->logtl);

		packarena(a->h0->data, Blksz, a);
		packarena(a->h1->data, Blksz, a);
		finalize(a->h0);
		finalize(a->h1);
		setflag(a->h0, Bdirty);
		setflag(a->h1, Bdirty);
		fs->arenabp[i] = a->h0->bp;
		qunlock(a);
	}
	assert(fs->snapdl.hd.addr == -1);
	traceb("packsb.rb", fs->snap.bp);
	packsb(fs->sb0->buf, Blksz, fs);
	packsb(fs->sb1->buf, Blksz, fs);
	finalize(fs->sb0);
	finalize(fs->sb1);
	fs->snap.dirty = 0;
	qunlock(&fs->mutlk);

	/*
	 * pass 1: sync block headers; if we crash here,
	 *  the block footers are consistent, and we can
	 *  use them.
	 */
	tracem("arenas0");
	for(i = 0; i < fs->narena; i++)
		enqueue(fs->arenas[i].h0);
	wrbarrier();

	/*
	 * pass 2: sync superblock; we have a consistent
	 * set of block headers, so if we crash, we can
	 * use the loaded block headers; the footers will
	 * get synced after so that we can use them next
	 * time around.
         */
	qlock(&fs->mutlk);
	tracem("supers");
	syncblk(fs->sb0);
	syncblk(fs->sb1);

	/*
	 * pass 3: sync block footers; if we crash here,
	 *  the block headers are consistent, and we can
	 *  use them.
         */
	tracem("arenas1");
	for(i = 0; i < fs->narena; i++)
		enqueue(fs->arenas[i].h1);

	/*
	 * Pass 4: clean up the old snap tree's deadlist
	 */
	tracem("snapdl");
	wrbarrier();
	qunlock(&fs->mutlk);
	freedl(&dl, 1);
	qunlock(&fs->synclk);
	tracem("synced");
	poperror();
}

static void
snapfs(Amsg *a, Tree **tp)
{
	Tree *t, *s;
	Mount *mnt;

	if(waserror()){
		*tp = nil;
		nexterror();
	}
	t = nil;
	*tp = nil;
	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
		if(strcmp(a->old, mnt->name) == 0){
			updatesnap(&mnt->root, mnt->root, mnt->name);
			t = agetp(&mnt->root);
			ainc(&t->memref);
			break;
		}
	}
	if(t == nil && (t = opensnap(a->old, nil)) == nil){
		if(a->fd != -1)
			fprint(a->fd, "snap: open '%s': does not exist\n", a->old);
		poperror();
		return;
	}
	if(a->delete){
		if(mnt != nil) {
			if(a->fd != -1)
				fprint(a->fd, "snap: snap is mounted: '%s'\n", a->old);
			poperror();
			return;
		}
		if(t->nlbl == 1 && t->nref <= 1 && t->succ == -1){
			aincl(&t->memref, 1);
			*tp = t;
		}
		delsnap(t, t->succ, a->old);
	}else{
		if((s = opensnap(a->new, nil)) != nil){
			if(a->fd != -1)
				fprint(a->fd, "snap: already exists '%s'\n", a->new);
			closesnap(s);
			poperror();
			return;
		}
		tagsnap(t, a->new, a->flag);
	}
	closesnap(t);
	poperror();
	if(a->fd != -1){
		if(a->delete)
			fprint(a->fd, "deleted: %s\n", a->old);
		else if(a->flag & Lmut)
			fprint(a->fd, "forked: %s from %s\n", a->new, a->old);
		else
			fprint(a->fd, "labeled: %s from %s\n", a->new, a->old);
	}
}

static void
filldumpdir(Xdir *d)
{
	memset(d, 0, sizeof(Xdir));
	d->name = "/";
	d->qid.path = Qdump;
	d->qid.vers = fs->nextgen;
	d->qid.type = QTDIR;
	d->mode = 0555;
	d->atime = 0;
	d->mtime = 0;
	d->length = 0;
	d->uid = -1;
	d->gid = -1;
	d->muid = -1;
}

static int
okname(char *name)
{
	int i;

	if(name[0] == 0)
		return -1;
	if(strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
		return -1;
	for(i = 0; i < Maxname; i++){
		if(name[i] == 0)
			return 0;
		if((name[i]&0xff) < 0x20 || name[i] == '/')
			return -1;
	}
	return -1;
}

Chan*
mkchan(int size)
{
	Chan *c;

	if((c = mallocz(sizeof(Chan) + size*sizeof(void*), 1)) == nil)
		sysfatal("create channel");
	c->size = size;
	c->avail = size;
	c->count = 0;
	c->rp = c->args;
	c->wp = c->args;
	return c;

}

void*
chrecv(Chan *c)
{
	void *a;
	long v;

	v = agetl(&c->count);
	if(v == 0 || !acasl(&c->count, v, v-1))
		semacquire(&c->count, 1);
	lock(&c->rl);
	a = *c->rp;
	if(++c->rp >= &c->args[c->size])
		c->rp = c->args;
	unlock(&c->rl);
	semrelease(&c->avail, 1);
	return a;
}

void
chsend(Chan *c, void *m)
{
	long v;

	v = agetl(&c->avail);
	if(v == 0 || !acasl(&c->avail, v, v-1))
		semacquire(&c->avail, 1);
	lock(&c->wl);
	*c->wp = m;
	if(++c->wp >= &c->args[c->size])
		c->wp = c->args;
	unlock(&c->wl);
	semrelease(&c->count, 1);
}

static void
fshangup(Conn *c, char *fmt, ...)
{
	char buf[ERRMAX];
	va_list ap;

	va_start(ap, fmt);
	vsnprint(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	fprint(2, "%s\n", buf);
	close(c->rfd);
	close(c->wfd);
}

static void
respond(Fmsg *m, Fcall *r)
{
	RWLock *lk;
	uchar buf[Max9p+IOHDRSZ];
	int w, n;

	r->tag = m->tag;
	dprint("→ %F\n", r);
	assert(m->type+1 == r->type || r->type == Rerror);
	if((n = convS2M(r, buf, sizeof(buf))) == 0)
		abort();
	qlock(&m->conn->wrlk);
	w = write(m->conn->wfd, buf, n);
	qunlock(&m->conn->wrlk);
	if(w != n)
		fshangup(m->conn, Eio);
	if(m->type == Tflush){
		lk = &fs->flushq[ihash(m->oldtag) % Nflushtab];
		wunlock(lk);
	}else{
		lk = &fs->flushq[ihash(m->tag) % Nflushtab];
		runlock(lk);
	}
	free(m);
}

static void
rerror(Fmsg *m, char *fmt, ...)
{
	char buf[128];
	va_list ap;
	Fcall r;

	va_start(ap, fmt);
	vsnprint(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	r.type = Rerror;
	r.ename = buf;
	respond(m, &r);
}


static void
upsert(Mount *mnt, Msg *m, int nm)
{
	if(!mnt->mutable)
		error(Erdonly);
	if(mnt->root->nlbl != 1 || mnt->root->nref != 0)
		updatesnap(&mnt->root, mnt->root, mnt->name);
	btupsert(mnt->root, m, nm);
}

/*
 * When truncating a file, mutations need
 * to wait for the sweeper to finish; this
 * means the mutator needs to release the
 * mutation lock, exit the epoch, and
 * allow the sweeper to finish its job
 * before resuming.
 */
static void
truncwait(Dent *de, int id)
{
	epochend(id);
	qunlock(&fs->mutlk);
	qlock(&de->trunclk);
	while(de->trunc)
		rsleep(&de->truncrz);
	qunlock(&de->trunclk);
	qlock(&fs->mutlk);
	epochstart(id);
}

static int
readb(Tree *t, Fid *f, char *d, vlong o, vlong n, vlong sz)
{
	char buf[17], kvbuf[17+32];
	vlong fb, fo;
	Bptr bp;
	Blk *b;
	Key k;
	Kvp kv;

	if(o >= sz)
		return 0;

	fb = o & ~(Blksz-1);
	fo = o & (Blksz-1);
	if(fo+n > Blksz)
		n = Blksz-fo;

	k.k = buf;
	k.nk = sizeof(buf);
	k.k[0] = Kdat;
	PACK64(k.k+1, f->qpath);
	PACK64(k.k+9, fb);

	if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf))){
		memset(d, 0, n);
		return n;
	}

	bp = unpackbp(kv.v, kv.nv);
	b = getblk(bp, GBraw);
	memcpy(d, b->buf+fo, n);
	dropblk(b);
	return n;
}

static int
writeb(Fid *f, Msg *m, Bptr *ret, char *s, vlong o, vlong n, vlong sz)
{
	char buf[Kvmax];
	vlong fb, fo;
	Blk *b, *t;
	Tree *r;
	Bptr bp;
	Kvp kv;

	fb = o & ~(Blksz-1);
	fo = o & (Blksz-1);

	m->k[0] = Kdat;
	PACK64(m->k+1, f->qpath);
	PACK64(m->k+9, fb);

	b = newblk(f->mnt->root, Tdat, f->qpath);
	t = nil;
	r = f->mnt->root;
	if(btlookup(r, m, &kv, buf, sizeof(buf))){
		bp = unpackbp(kv.v, kv.nv);
		if(fb < sz && (fo != 0 || n != Blksz)){
			t = getblk(bp, GBraw);
			memcpy(b->buf, t->buf, Blksz);
			dropblk(t);
		}
	}
	if(fo+n > Blksz)
		n = Blksz-fo;
	memcpy(b->buf+fo, s, n);
	if(t == nil){
		if(fo > 0)
			memset(b->buf, 0, fo);
		if(fo+n < Blksz)
			memset(b->buf+fo+n, 0, Blksz-fo-n);
	}
	enqueue(b);

	packbp(m->v, m->nv, &b->bp);
	*ret = b->bp;
	dropblk(b);
	return n;
}

static Dent*
getdent(vlong pqid, Xdir *d)
{
	Dent *de;
	char *e;
	u32int h;

	h = ihash(d->qid.path) % Ndtab;
	lock(&fs->dtablk);
	for(de = fs->dtab[h]; de != nil; de = de->next){
		if(de->qid.path == d->qid.path){
			ainc(&de->ref);
			goto Out;
		}
	}

	de = emalloc(sizeof(Dent), 1);
	de->Xdir = *d;
	de->ref = 1;
	de->up = pqid;
	de->qid = d->qid;
	de->length = d->length;
	de->truncrz.l = &de->trunclk;

	if((e = packdkey(de->buf, sizeof(de->buf), pqid, d->name)) == nil){
		free(de);
		de = nil;
		goto Out;
	}
	de->k = de->buf;
	de->nk = e - de->buf;
	de->name = de->buf + 11;
	de->next = fs->dtab[h];
	fs->dtab[h] = de;

Out:
	unlock(&fs->dtablk);
	return de;
}

static void
loadautos(Mount *mnt)
{
	char pfx[128];
	int m, h, ns;
	uint flg;
	Scan s;

	m = 0;
	h = 0;
	pfx[0] = Klabel;
	ns = snprint(pfx+1, sizeof(pfx)-1, "%s@minute.", mnt->name);
	btnewscan(&s, pfx, ns+1);
	btenter(&fs->snap, &s);
	while(1){
		if(!btnext(&s, &s.kv))
			break;
		flg = UNPACK32(s.kv.v+1+8);
		if(flg & Lauto){
			memcpy(mnt->minutely[m], s.kv.k+1, s.kv.nk-1);
			mnt->minutely[m][s.kv.nk-1] = 0;
			m = (m+1)%60;
			continue;
		}
	}
	btexit(&s);

	pfx[0] = Klabel;
	ns = snprint(pfx+1, sizeof(pfx)-1, "%s@hour.", mnt->name);
	btnewscan(&s, pfx, ns+1);
	btenter(&fs->snap, &s);
	while(1){
		if(!btnext(&s, &s.kv))
			break;
		flg = UNPACK32(s.kv.v+1+8);
		if(flg & Lauto){
			memcpy(mnt->hourly[h], s.kv.k+1, s.kv.nk-1);
			mnt->hourly[h][s.kv.nk-1] = 0;
			h = (h+1)%24;
			continue;
		}
	}
	btexit(&s);
}

Mount *
getmount(char *name)
{
	Mount *mnt;
	Tree *t;
	int flg;

	if(strcmp(name, "dump") == 0){
		ainc(&fs->snapmnt->ref);
		return fs->snapmnt;
	}

	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
		if(strcmp(name, mnt->name) == 0){
			ainc(&mnt->ref);
			goto Out;
		}
	}

	if((mnt = mallocz(sizeof(*mnt), 1)) == nil)
		error(Enomem);
	if(waserror()){
		free(mnt);
		nexterror();
	}
	mnt->ref = 1;
	snprint(mnt->name, sizeof(mnt->name), "%s", name);
	if((t = opensnap(name, &flg)) == nil)
		error(Enosnap);
	loadautos(mnt);
	mnt->mutable = (flg & Lmut) != 0;
	mnt->noauto = (flg & Lnoauto) != 0;
	mnt->root = t;
	mnt->next = fs->mounts;
	asetp(&fs->mounts, mnt);
	poperror();

Out:
	return mnt;
}

void
clunkmount(Mount *mnt)
{
	Mount *me, **p;
	Bfree *f;

	if(mnt == nil)
		return;
	if(adec(&mnt->ref) == 0){
		for(p = &fs->mounts; (me = *p) != nil; p = &me->next){
			if(me == mnt)
				break;
		}
		assert(me != nil);
		f = emalloc(sizeof(Bfree), 0);
		f->op = DFblk;
		f->m = mnt;
		*p = me->next;
		limbo(f);
	}
}

static void
clunkdent(Dent *de)
{
	Dent *e, **pe;
	u32int h;

	if(de == nil)
		return;
	if(de->qid.type == QTAUTH && adec(&de->ref) == 0){
		free(de);
		return;
	}
	lock(&fs->dtablk);
	if(adec(&de->ref) != 0)
		goto Out;
	h = ihash(de->qid.path) % Ndtab;
	pe = &fs->dtab[h];
	for(e = fs->dtab[h]; e != nil; e = e->next){
		if(e == de)
			break;
		pe = &e->next;
	}
	assert(e != nil);
	*pe = e->next;
	free(de);
Out:
	unlock(&fs->dtablk);
}

static Fid*
getfid(Conn *c, u32int fid)
{
	u32int h;
	Fid *f;

	h = ihash(fid) % Nfidtab;
	lock(&c->fidtablk[h]);
	for(f = c->fidtab[h]; f != nil; f = f->next)
		if(f->fid == fid){
			ainc(&f->ref);
			break;
		}
	unlock(&c->fidtablk[h]);
	return f;
}

static void
putfid(Fid *f)
{
	if(adec(&f->ref) != 0)
		return;
	clunkmount(f->mnt);
	clunkdent(f->dent);
	free(f);
}

static Fid*
dupfid(Conn *c, u32int new, Fid *f)
{
	Fid *n, *o;
	u32int h;

	h = ihash(new) % Nfidtab;
	if((n = malloc(sizeof(Fid))) == nil)
		return nil;

	*n = *f;
	n->fid = new;
	n->ref = 2; /* one for dup, one for clunk */
	n->mode = -1;
	n->next = nil;

	lock(&c->fidtablk[h]);
	for(o = c->fidtab[h]; o != nil; o = o->next)
		if(o->fid == new)
			break;
	if(o == nil){
		n->next = c->fidtab[h];
		c->fidtab[h] = n;
	}
	unlock(&c->fidtablk[h]);

	if(o != nil){
		fprint(2, "fid in use: %d == %d\n", o->fid, new);
		free(n);
		return nil;
	}
	if(n->mnt != nil)
		ainc(&n->mnt->ref);
	ainc(&n->dent->ref);
	setmalloctag(n, getcallerpc(&c));
	return n;
}

static void
clunkfid(Conn *c, Fid *fid)
{
	Fid *f, **pf;
	u32int h;

	h = ihash(fid->fid) % Nfidtab;
	lock(&c->fidtablk[h]);
	pf = &c->fidtab[h];
	for(f = c->fidtab[h]; f != nil; f = f->next){
		if(f == fid){
			assert(adec(&f->ref) != 0);
			*pf = f->next;
			break;
		}
		pf = &f->next;
	}
	assert(f != nil);
	unlock(&c->fidtablk[h]);
}

static int
readmsg(Conn *c, Fmsg **pm)
{
	char szbuf[4];
	int sz, n;
	Fmsg *m;

	n = readn(c->rfd, szbuf, 4);
	if(n <= 0){
		*pm = nil;
		return n;
	}
	if(n != 4){
		werrstr("short read: %r");
		return -1;
	}
	sz = GBIT32(szbuf);
	if(sz > c->iounit){
		werrstr("message size too large");
		return -1;
	}
	if((m = malloc(sizeof(Fmsg)+sz)) == nil)
		return -1;
	if(readn(c->rfd, m->buf+4, sz-4) != sz-4){
		werrstr("short read: %r");
		free(m);
		return -1;
	}
	m->conn = c;
	m->sz = sz;
	PBIT32(m->buf, sz);
	*pm = m;
	return 0;
}

static void
fsversion(Fmsg *m)
{
	Fcall r;
	char *p;

	memset(&r, 0, sizeof(Fcall));
	p = strchr(m->version, '.');
	if(p != nil)
		*p = '\0';
	r.type = Rversion;
	r.msize = Max9p + IOHDRSZ;
	if(strcmp(m->version, "9P2000") == 0){
		if(m->msize < r.msize)
			r.msize = m->msize;
		r.version = "9P2000";
		m->conn->versioned = 1;
		m->conn->iounit = r.msize;
	}else{
		r.version = "unknown";
		m->conn->versioned = 0;
	}
	respond(m, &r);
}

void
authfree(AuthRpc *auth)
{
	AuthRpc *rpc;

	if(rpc = auth){
		close(rpc->afd);
		auth_freerpc(rpc);
	}
}

AuthRpc*
authnew(void)
{
	static char *keyspec = "proto=p9any role=server";
	AuthRpc *rpc;
	int fd;

	if(access("/mnt/factotum", 0) < 0)
		if((fd = open("/srv/factotum", ORDWR)) >= 0)
			mount(fd, -1, "/mnt", MBEFORE, "");
	if((fd = open("/mnt/factotum/rpc", ORDWR)) < 0)
		return nil;
	if((rpc = auth_allocrpc(fd)) == nil){
		close(fd);
		return nil;
	}
	if(auth_rpc(rpc, "start", keyspec, strlen(keyspec)) != ARok){
		authfree(rpc);
		return nil;
	}
	return rpc;
}

static void
authread(Fid *f, Fcall *r, void *data, vlong count)
{
	AuthInfo *ai;
	AuthRpc *rpc;
	User *u;

	if((rpc = f->auth) == nil)
		error(Etype);

	switch(auth_rpc(rpc, "read", nil, 0)){
	default:
		error(Eauthp);
	case ARdone:
		if((ai = auth_getinfo(rpc)) == nil)
			goto Phase;
		rlock(&fs->userlk);
		u = name2user(ai->cuid);
		auth_freeAI(ai);
		if(u == nil){
			runlock(&fs->userlk);
			error(Enouser);
		}
		f->uid = u->id;
		runlock(&fs->userlk);
		return;
	case ARok:
		if(count < rpc->narg)
			error(Eauthd);
		memmove(data, rpc->arg, rpc->narg);
		r->count = rpc->narg;
		return;
	case ARphase:
	Phase:
		error(Eauthph);
	}
}

static void
authwrite(Fid *f, Fcall *r, void *data, vlong count)
{
	AuthRpc *rpc;

	if((rpc = f->auth) == nil)
		error(Etype);
	if(auth_rpc(rpc, "write", data, count) != ARok)
		error(Ebotch);
	r->type = Rwrite;
	r->count = count;

}

static void
fsauth(Fmsg *m)
{
	Dent *de;
	Fcall r;
	Fid f;

	if(fs->noauth){
		rerror(m, Eauth);
		return;
	}
	if(strcmp(m->uname, "none") == 0){
		rerror(m, Enone);
		return;
	}
	if((de = mallocz(sizeof(Dent), 1)) == nil){
		rerror(m, Enomem);
		return;
	}
	memset(de, 0, sizeof(Dent));
	de->ref = 0;
	de->qid.type = QTAUTH;
	de->qid.path = aincv(&fs->nextqid, 1);
	de->qid.vers = 0;
	de->length = 0;
	de->k = nil;
	de->nk = 0;

	memset(&f, 0, sizeof(Fid));
	f.fid = NOFID;
	f.mnt = nil;
	f.qpath = de->qid.path;
	f.pqpath = de->qid.path;
	f.mode = -1;
	f.iounit = m->conn->iounit;
	f.dent = de;
	f.uid = -1;
	f.duid = -1;
	f.dgid = -1;
	f.dmode = 0600;
	f.auth = authnew();
	if(dupfid(m->conn, m->afid, &f) == nil){
		rerror(m, Efid);
		free(de);
		return;
	}
	r.type = Rauth;
	r.aqid = de->qid;
	respond(m, &r);
}

static int
ingroup(int uid, int gid)
{
	User *u, *g;
	int i, in;

	rlock(&fs->userlk);
	in = 0;
	u = uid2user(uid);
	g = uid2user(gid);
	if(u != nil && g != nil)
		if(u->id == g->id)
			in = 1;
		else for(i = 0; i < g->nmemb; i++)
			if(u->id == g->memb[i])
				in = 1;
	runlock(&fs->userlk);
	return in;
}

static int
groupleader(int uid, int gid)
{
	User *g;
	int i, lead;

	lead = 0;
	rlock(&fs->userlk);
	g = uid2user(gid);
	if(g != nil){
		if(g->lead == 0){
			for(i = 0; i < g->nmemb; i++)
				if(g->memb[i] == uid){
					lead = 1;
					break;
				}
		}else if(uid == g->lead)
			lead = 1;
	}
	runlock(&fs->userlk);
	return lead;

}

static int
mode2bits(int req)
{
	int m;

	m = 0;
	switch(req&0xf){
	case OREAD:	m = DMREAD;		break;
	case OWRITE:	m = DMWRITE;		break;
	case ORDWR:	m = DMREAD|DMWRITE;	break;
	case OEXEC:	m = DMREAD|DMEXEC;	break;
	}
	if(req&OTRUNC)
		m |= DMWRITE;
	return m;
}

static int
fsaccess(Fid *f, ulong fmode, int fuid, int fgid, int m)
{
	/* uid none gets only other permissions */
	if(f->permit)
		return 0;
	if(f->uid != noneid) {
		if(f->uid == fuid)
			if((m & (fmode>>6)) == m)
				return 0;
		if(ingroup(f->uid, fgid))
			if((m & (fmode>>3)) == m)
				return 0;
	}
	if(m & fmode) {
		if((fmode & DMDIR) && (m == DMEXEC))
			return 0;
		if(!ingroup(f->uid, nogroupid))
			return 0;
	}
	return -1;
}

static void
fsattach(Fmsg *m)
{
	char dbuf[Kvmax], kvbuf[Kvmax];
	char *p, *n, *aname;
	Mount *mnt;
	Dent *de;
	Tree *t;
	User *u;
	Fcall r;
	Xdir d;
	Kvp kv;
	Key dk;
	Fid f, *af;
	int uid;

	de = nil;
	mnt = nil;
	if(waserror()){
		rerror(m, errmsg());
		goto Err;
	}
	aname = m->aname;
	if(aname[0] == '%')
		aname++;
	if(aname[0] == '\0')
		aname = "main";
	if((mnt = getmount(aname)) == nil)
		error(Enosnap);

	rlock(&fs->userlk);
	n = m->uname;
	/*
	 * to allow people to add themselves to the user file,
	 * we need to force the user id to one that exists.
	 */
	if(permissive && strcmp(aname, "adm") == 0)
		n = "adm";
	if((u = name2user(n)) == nil){
		runlock(&fs->userlk);
		error(Enouser);
	}
	uid = u->id;
	runlock(&fs->userlk);

	if(m->afid != NOFID){
		r.data = nil;
		r.count = 0;
		if((af = getfid(m->conn, m->afid)) == nil)
			error(Enofid);
		authread(af, &r, nil, 0);
		putfid(af);
		if(af->uid != uid)
			error(Ebadu);
	}else if(!fs->noauth && strcmp(m->uname, "none") != 0)
		error(Ebadu);

	if(strcmp(m->aname, "dump") == 0){
		memset(&d, 0, sizeof(d));
		filldumpdir(&d);
	}else{
		if((p = packdkey(dbuf, sizeof(dbuf), -1ULL, "")) == nil)
			error(Elength);
		dk.k = dbuf;
		dk.nk = p - dbuf;
		t = agetp(&mnt->root);
		if(!btlookup(t, &dk, &kv, kvbuf, sizeof(kvbuf)))
			error(Enosnap);
		kv2dir(&kv, &d);
	}
	de = getdent(-1, &d);
	memset(&f, 0, sizeof(Fid));
	f.fid = NOFID;
	f.mnt = mnt;
	f.qpath = d.qid.path;
	f.pqpath = d.qid.path;
	f.mode = -1;
	f.iounit = m->conn->iounit;
	f.dent = de;
	f.uid = uid;
	f.duid = d.uid;
	f.dgid = d.gid;
	f.dmode = d.mode;
	if(m->aname[0] == '%'){
		if(!permissive && !ingroup(uid, admid))
			error(Eperm);
		f.permit = 1;
	}
	if(dupfid(m->conn, m->fid, &f) == nil)
		error(Efid);

	r.type = Rattach;
	r.qid = d.qid;
	respond(m, &r);
	poperror();


Err:	clunkdent(de);
	clunkmount(mnt);
}

static int
findparent(Tree *t, Fid *f, vlong *qpath, char **name, char *buf, int nbuf)
{
	char *p, kbuf[Keymax];
	Kvp kv;
	Key k;

	p = packsuper(kbuf, sizeof(kbuf), f->pqpath);
	k.k = kbuf;
	k.nk = p - kbuf;
	if(!btlookup(t, &k, &kv, buf, nbuf))
		return 0;
	*name = unpackdkey(kv.v, kv.nv, qpath);
	return 1;
}

static void
fswalk(Fmsg *m)
{
	char *p, *name, kbuf[Maxent], kvbuf[Kvmax];
	int duid, dgid, dmode;
	vlong up, prev;
	Fid *o, *f;
	Dent *dent;
	Mount *mnt;
	Tree *t;
	Fcall r;
	Xdir d;
	Kvp kv;
	Key k;
	int i;

	if((o = getfid(m->conn, m->fid)) == nil){
		rerror(m, Enofid);
		return;
	}
	if(waserror()){
		rerror(m, errmsg());
		putfid(o);
		return;
	}
	if(o->mode != -1)
		error(Einuse);
	t = o->mnt->root;
	mnt = o->mnt;
	up = o->qpath;
	prev = o->qpath;
	rlock(o->dent);
	d = *o->dent;
	runlock(o->dent);
	duid = d.uid;
	dgid = d.gid;
	dmode = d.mode;
	r.type = Rwalk;
	for(i = 0; i < m->nwname; i++){
		if(fsaccess(o, d.mode, d.uid, d.gid, DMEXEC) != 0)
			error(Eperm);
		name = m->wname[i];
		if(d.qid.path == Qdump){
			if((mnt = getmount(m->wname[i])) == nil)
				error(Esrch);
			if(waserror()){
				clunkmount(mnt);
				nexterror();
			}
			p = packdkey(kbuf, sizeof(kbuf), -1ULL, "");
			poperror();
		}else{
			if(strcmp(m->wname[i], "..") == 0){
				if(o->pqpath == Qdump){
					mnt = fs->snapmnt;
					filldumpdir(&d);
					duid = d.uid;
					dgid = d.gid;
					dmode = d.mode;
					goto Found;
				}
				if(!findparent(t, o, &prev, &name, kbuf, sizeof(kbuf)))
					error(Esrch);
			}
			p = packdkey(kbuf, sizeof(kbuf), prev, name);
		}
		duid = d.uid;
		dgid = d.gid;
		dmode = d.mode;
		k.k = kbuf;
		k.nk = p - kbuf;
		if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf)))
			break;
		kv2dir(&kv, &d);
Found:
		up = prev;
		prev = d.qid.path;
		r.wqid[i] = d.qid;
	}
	r.nwqid = i;
	if(i == 0 && m->nwname != 0)
		error(Esrch);
	f = o;
	if(m->fid != m->newfid && i == m->nwname){
		if((f = dupfid(m->conn, m->newfid, o)) == nil)
			error(Efid);
		putfid(o);
	}
	if(i > 0 && i == m->nwname){
		lock(f);
		if(waserror()){
			if(f != o)
				clunkfid(m->conn, f);
			unlock(f);
			nexterror();
		}
		if(up == Qdump)
			dent = getdent(-1ULL, &d);
		else
			dent = getdent(up, &d);
		if(mnt != f->mnt){
			clunkmount(f->mnt);
			ainc(&mnt->ref);
			f->mnt = mnt;
		}
		clunkdent(f->dent);
		f->qpath = r.wqid[i-1].path;
		f->pqpath = up;
		f->dent = dent;
		f->duid = duid;
		f->dgid = dgid;
		f->dmode = dmode;
		poperror();
		unlock(f);
	}
	respond(m, &r);
	poperror();
	putfid(f);
}

static void
fsstat(Fmsg *m)
{
	char buf[STATMAX];
	Fcall r;
	Fid *f;
	int n;

	if((f = getfid(m->conn, m->fid)) == nil){
		rerror(m, Enofid);
		return;
	}
	if(waserror()){
		rerror(m, errmsg());
		putfid(f);
		return;
	}
	rlock(f->dent);
	if((n = dir2statbuf(f->dent, buf, sizeof(buf))) == -1)
		error(Efs);
	runlock(f->dent);
	r.type = Rstat;
	r.stat = (uchar*)buf;
	r.nstat = n;
	respond(m, &r);
	poperror();
	putfid(f);
}

static void
fswstat(Fmsg *m, int id, Amsg **ao)
{
	char rnbuf[Kvmax], opbuf[Kvmax], upbuf[Upksz];
	char *p, strs[65535];
	int op, nm, rename;
	vlong oldlen;
	Qid old;
	Fcall r;
	Dent *de;
	Msg mb[3];
	Xdir n;
	Dir d;
	Tree *t;
	Fid *f;
	Key k;
	User *u;

	*ao = nil;
	rename = 0;
	if((f = getfid(m->conn, m->fid)) == nil){
		rerror(m, Enofid);
		return;
	}
	de = f->dent;
	truncwait(de, id);
	wlock(de);
	if(waserror()){
		rerror(m, errmsg());
		free(*ao);
		*ao = nil;
		goto Err;
	}
	if(de->gone)
		error(Ephase);
	if((de->qid.type & QTAUTH) || (de->qid.path & Qdump))
		error(Emode);
	if(convM2D(m->stat, m->nstat, &d, strs) <= BIT16SZ)
		error(Edir);

	t = agetp(&f->mnt->root);
	n = de->Xdir;
	n.qid.vers++;
	p = opbuf+1;
	op = 0;

	/* check validity of updated fields and construct Owstat message */
	if(d.qid.path != ~0 || d.qid.vers != ~0){
		if(d.qid.path != de->qid.path)
			error(Ewstatp);
		if(d.qid.vers != de->qid.vers)
			error(Ewstatv);
	}
	if(*d.name != '\0'){
		if(strcmp(d.name, de->name) != 0){
			rename = 1;
			if(okname(d.name) == -1)
				error(Ename);
			if(walk1(t, f->dent->up, d.name, &old, &oldlen) == 0)
				error(Eexist);
			n.name = d.name;
		}
	}
	if(d.length != ~0){
		if(d.length < 0)
			error(Ewstatl);
		if(d.length != de->length){
			if(d.length < de->length){
				if((*ao = malloc(sizeof(Amsg))) == nil)
					error(Enomem);
				qlock(&de->trunclk);
				de->trunc = 1;
				qunlock(&de->trunclk);
				aincl(&de->ref, 1);
				aincl(&f->mnt->ref, 1);
				(*ao)->op = AOclear;
				(*ao)->mnt = f->mnt;
				(*ao)->qpath = f->qpath;
				(*ao)->off = d.length;
				(*ao)->end = f->dent->length;
				(*ao)->dent = de;
			}
			de->length = d.length;
			n.length = d.length;
			op |= Owsize;
			PACK64(p, n.length);
			p += 8;
		}
	}
	if(d.mode != ~0){
		if((d.mode^de->mode) & DMDIR)
			error(Ewstatd);
		if(d.mode & ~(DMDIR|DMAPPEND|DMEXCL|DMTMP|0777))
			error(Ewstatb);
		if(d.mode != de->mode){
			n.mode = d.mode;
			n.qid.type = d.mode>>24;
			op |= Owmode;
			PACK32(p, n.mode);
			p += 4;
		}
	}
	if(d.mtime != ~0){
		n.mtime = d.mtime*Nsec;
		if(n.mtime != de->mtime){
			op |= Owmtime;
			PACK64(p, n.mtime);
			p += 8;
		}
	}
	if(*d.uid != '\0'){
		rlock(&fs->userlk);
		u = name2user(d.uid);
		if(u == nil){
			runlock(&fs->userlk);
			error(Enouser);
		}
		n.uid = u->id;
		runlock(&fs->userlk);
		if(n.uid != de->uid){
			op |= Owuid;
			PACK32(p, n.uid);
			p += 4;
		}
	}
	if(*d.gid != '\0'){
		rlock(&fs->userlk);
		u = name2user(d.gid);
		if(u == nil){
			runlock(&fs->userlk);
			error(Enogrp);
		}
		n.gid = u->id;
		runlock(&fs->userlk);
		if(n.gid != de->gid){
			op |= Owgid;
			PACK32(p, n.gid);
			p += 4;
		}
	}
	op |= Owmuid;
	n.muid = f->uid;
	PACK32(p, n.muid);
	p += 4;

	/* check permissions */
	if(rename)
		if(fsaccess(f, f->dmode, f->duid, f->dgid, DMWRITE) == -1)
			error(Eperm);
	if(op & Owsize)
		if(fsaccess(f, de->mode, de->uid, de->gid, DMWRITE) == -1)
			error(Eperm);
	if(op & (Owmode|Owmtime))
		if(!f->permit && f->uid != de->uid && !groupleader(f->uid, de->gid))
			error(Ewstato);
	if(op & Owuid)
		if(!f->permit)
			error(Ewstatu);
	if(op & Owgid)
		if(!f->permit
		&& !(f->uid == de->uid && ingroup(f->uid, n.gid))
		&& !(groupleader(f->uid, de->gid) && groupleader(f->uid, n.gid)))
			error(Ewstatg);

	/* update directory entry */
	nm = 0;
	if(rename && !de->gone){
		mb[nm].op = Oclobber;
		mb[nm].Key = de->Key;
		mb[nm].v = nil;
		mb[nm].nv = 0;
		nm++;
	
		mb[nm].op = Oinsert;
		dir2kv(f->pqpath, &n, &mb[nm], rnbuf, sizeof(rnbuf));
		k = mb[nm].Key;
		nm++;

		if(de->qid.type & QTDIR){
			packsuper(upbuf, sizeof(upbuf), f->qpath);
			mb[nm].op = Oinsert;
			mb[nm].k = upbuf;
			mb[nm].nk = Upksz;
			mb[nm].v = mb[nm-1].k;
			mb[nm].nv = mb[nm-1].nk;
			nm++;
		}
	}else{
		opbuf[0] = op;
		mb[nm].op = Owstat;
		mb[nm].Key = de->Key;
		mb[nm].v = opbuf;
		mb[nm].nv = p - opbuf;
		nm++;
	}
	assert(nm <= nelem(mb));
	upsert(f->mnt, mb, nm);

	de->Xdir = n;
	if(rename)
		cpkey(de, &k, de->buf, sizeof(de->buf));

	r.type = Rwstat;
	respond(m, &r);
	poperror();

Err:	wunlock(de);
	putfid(f);
}


static void
fsclunk(Fmsg *m)
{
	Fcall r;
	Fid *f;

	if((f = getfid(m->conn, m->fid)) == nil){
		rerror(m, Enofid);
		return;
	}
	lock(f);
	if(f->scan != nil){
		free(f->scan);
		f->scan = nil;
	}
	clunkfid(m->conn, f);
	unlock(f);
	r.type = Rclunk;
	respond(m, &r);
	putfid(f);
}

static void
fscreate(Fmsg *m)
{
	char *p, buf[Kvmax], upkbuf[Keymax], upvbuf[Inlmax];
	Dent *de;
	vlong oldlen;
	Qid old;
	Fcall r;
	Msg mb[2];
	Fid *f;
	Xdir d;
	int nm;

	if(okname(m->name) == -1){
		rerror(m, Ename);
		return;
	}
	if(m->perm & (DMMOUNT|DMAUTH)){
		rerror(m, Ebotch);
		return;
	}
	if((f = getfid(m->conn, m->fid)) == nil){
		rerror(m, Enofid);
		return;
	}
	lock(f);

	if(waserror()){
		rerror(m, errmsg());
		goto Err;
		
	}
	if(f->mode != -1){
		rerror(m, Einuse);
		goto Out;
	}
	de = f->dent;
	if(walk1(f->mnt->root, f->qpath, m->name, &old, &oldlen) == 0){
		rerror(m, Eexist);
		goto Out;
	}

	rlock(de);
	if(fsaccess(f, de->mode, de->uid, de->gid, DMWRITE) == -1){
		rerror(m, Eperm);
		runlock(de);
		goto Out;
	}

	d.gid = de->gid;
	runlock(de);

	nm = 0;
	d.qid.type = 0;
	if(m->perm & DMDIR)
		d.qid.type |= QTDIR;
	if(m->perm & DMAPPEND)
		d.qid.type |= QTAPPEND;
	if(m->perm & DMEXCL)
		d.qid.type |= QTEXCL;
	if(m->perm & DMTMP)
		d.qid.type |= QTTMP;
	d.qid.path = aincv(&fs->nextqid, 1);
	d.qid.vers = 0;
	d.mode = m->perm;
	if(m->perm & DMDIR)
		d.mode &= ~0777 | de->mode & 0777;
	else
		d.mode &= ~0666 | de->mode & 0666;
	d.name = m->name;
	d.atime = nsec();
	d.mtime = d.atime;
	d.length = 0;
	d.uid = f->uid;
	d.muid = f->uid;

	mb[nm].op = Oinsert;
	dir2kv(f->qpath, &d, &mb[nm], buf, sizeof(buf));
	nm++;

	if(m->perm & DMDIR){
		mb[nm].op = Oinsert;
		if((p = packsuper(upkbuf, sizeof(upkbuf), d.qid.path)) == nil)
			sysfatal("ream: pack super");
		mb[nm].k = upkbuf;
		mb[nm].nk = p - upkbuf;
		if((p = packdkey(upvbuf, sizeof(upvbuf), f->qpath, d.name)) == nil)
			sysfatal("ream: pack super");
		mb[nm].v = upvbuf;
		mb[nm].nv = p - upvbuf;
		nm++;
	}
	upsert(f->mnt, mb, nm);

	de = getdent(f->qpath, &d);
	clunkdent(f->dent);
	f->mode = mode2bits(m->mode);
	f->pqpath = f->qpath;
	f->qpath = d.qid.path;
	f->dent = de;

	r.type = Rcreate;
	r.qid = d.qid;
	r.iounit = f->iounit;
	respond(m, &r);
Out:	poperror();
Err:	unlock(f);
	putfid(f);
	return;
}

static char*
candelete(Fid *f)
{
	char *e, pfx[Dpfxsz];
	Tree *t;
	Scan s;

	if(!(f->dent->qid.type & QTDIR))
		return nil;

	t = agetp(&f->mnt->root);
	packdkey(pfx, sizeof(pfx), f->qpath, nil);
	btnewscan(&s, pfx, sizeof(pfx));
	btenter(t, &s);
	if(btnext(&s, &s.kv))
		e = Enempty;
	else
		e = nil;
	btexit(&s);
	return e;
}

static void
fsremove(Fmsg *m, int id, Amsg **ao)
{
	char *e, upbuf[Upksz];
	Fcall r;
	Msg mb[2];
	Fid *f;

	if((f = getfid(m->conn, m->fid)) == nil){
		rerror(m, Enofid);
		return;
	}
	clunkfid(m->conn, f);

	truncwait(f->dent, id);
	wlock(f->dent);
	*ao = nil;
	if(waserror()){
		rerror(m, errmsg());
		free(*ao);
		*ao = nil;
		goto Err;
	}
	if(f->dent->gone)
		error(Ephase);
	if((e = candelete(f)) != nil)
		error(e);
	if(fsaccess(f, f->dmode, f->duid, f->dgid, DMWRITE) == -1)
		error(Eperm);
	mb[0].op = Odelete;
	mb[0].k = f->dent->k;
	mb[0].nk = f->dent->nk;
	mb[0].nv = 0;

	if(f->dent->qid.type & QTDIR){
		packsuper(upbuf, sizeof(upbuf), f->qpath);
		mb[1].op = Oclobber;
		mb[1].k = upbuf;
		mb[1].nk = Upksz;
		mb[1].nv = 0;
		upsert(f->mnt, mb, 2);
	}else{
		*ao = emalloc(sizeof(Amsg), 1);
		aincl(&f->mnt->ref, 1);
		(*ao)->op = AOclear;
		(*ao)->mnt = f->mnt;
		(*ao)->qpath = f->qpath;
		(*ao)->off = 0;
		(*ao)->end = f->dent->length;
		(*ao)->dent = nil;
		upsert(f->mnt, mb, 1);
	}
	f->dent->gone = 1;
	r.type = Rremove;
	respond(m, &r);
	poperror();
Err:
	wunlock(f->dent);
	putfid(f);
	return;
}

static void
fsopen(Fmsg *m, int id, Amsg **ao)
{
	char *p, buf[Kvmax];
	int mbits;
	Tree *t;
	Fcall r;
	Xdir d;
	Fid *f;
	Kvp kv;
	Msg mb;

	mbits = mode2bits(m->mode);
	if((f = getfid(m->conn, m->fid)) == nil){
		rerror(m, Enofid);
		return;
	}
	if(waserror()){
		rerror(m, errmsg());
		putfid(f);
		return;
	}
	if(m->mode & OTRUNC)
		truncwait(f->dent, id);
	t = agetp(&f->mnt->root);
	if((f->qpath & Qdump) != 0){
		filldumpdir(&d);
	}else{
		if(!btlookup(t, f->dent, &kv, buf, sizeof(buf)))
			error(Esrch);
		kv2dir(&kv, &d);
	}
	wlock(f->dent);
	if(waserror()){
		wunlock(f->dent);
		nexterror();
	}
	if(f->dent->gone)
		error(Ephase);
	if(f->dent->qid.type & QTEXCL)
	if(f->dent->ref != 1)
		error(Elocked);
	if(fsaccess(f, d.mode, d.uid, d.gid, mbits) == -1)
		error(Eperm);
	f->dent->length = d.length;
	poperror();
	wunlock(f->dent);
	r.type = Ropen;
	r.qid = d.qid;
	r.iounit = f->iounit;

	lock(f);
	if(f->mode != -1){
		unlock(f);
		error(Einuse);
	}
	f->mode = mode2bits(m->mode);
	if(m->mode & OTRUNC){
		wlock(f->dent);

		if(waserror()){
			wunlock(f->dent);
			free(*ao);
			*ao = nil;
			nexterror();
		}
		*ao = emalloc(sizeof(Amsg), 1);
		qlock(&f->dent->trunclk);
		f->dent->trunc = 1;
		qunlock(&f->dent->trunclk);
		aincl(&f->dent->ref, 1);
		aincl(&f->mnt->ref, 1);
		(*ao)->op = AOclear;
		(*ao)->mnt = f->mnt;
		(*ao)->qpath = f->qpath;
		(*ao)->off = 0;
		(*ao)->end = f->dent->length;
		(*ao)->dent = f->dent;

		f->dent->muid = f->uid;
		f->dent->qid.vers++;
		f->dent->length = 0;

		mb.op = Owstat;
		p = buf;
		p[0] = Owsize|Owmuid;	p += 1;
		PACK64(p, 0);		p += 8;
		PACK32(p, f->uid);	p += 4;
		mb.k = f->dent->k;
		mb.nk = f->dent->nk;
		mb.v = buf;
		mb.nv = p - buf;

		upsert(f->mnt, &mb, 1);
		wunlock(f->dent);
		poperror();
	}
	unlock(f);
	poperror();
	respond(m, &r);
	putfid(f);
}

static void
readsnap(Fmsg *m, Fid *f, Fcall *r)
{
	char pfx[1], *p;
	int n, ns;
	Scan *s;
	Xdir d;

	s = f->scan;
	if(s != nil && s->offset != 0 && s->offset != m->offset)
		error(Edscan);
	if(s == nil || m->offset == 0){
		s = emalloc(sizeof(Scan), 1);
		pfx[0] = Klabel;
		btnewscan(s, pfx, 1);
		lock(f);
		if(f->scan != nil){
			free(f->scan);
		}
		f->scan = s;
		unlock(f);
	}
	if(s->donescan){
		r->count = 0;
		return;
	}
	p = r->data;
	n = m->count;
	d = f->dent->Xdir;
	if(s->overflow){
		memcpy(d.name, s->kv.k+1, s->kv.nk-1);
		d.name[s->kv.nk-1] = 0;
		d.qid.path = UNPACK64(s->kv.v + 1);
		if((ns = dir2statbuf(&d, p, n)) == -1){
			r->count = 0;
			return;
		}
		s->overflow = 0;
		p += ns;
		n -= ns;
	}
	btenter(&fs->snap, s);
	while(1){
		if(!btnext(s, &s->kv))
			break;
		memcpy(d.name, s->kv.k+1, s->kv.nk-1);
		d.name[s->kv.nk-1] = 0;
		d.qid.path = UNPACK64(s->kv.v + 1);
		if((ns = dir2statbuf(&d, p, n)) == -1){
			s->overflow = 1;
			break;
		}
		p += ns;
		n -= ns;
	}
	btexit(s);
	r->count = p - r->data;
	return;
}

static void
readdir(Fmsg *m, Fid *f, Fcall *r)
{
	char pfx[Dpfxsz], *p;
	int n, ns;
	Tree *t;
	Scan *s;

	s = f->scan;
	t = agetp(&f->mnt->root);
	if(s != nil && s->offset != 0 && s->offset != m->offset)
		error(Edscan);
	if(s == nil || m->offset == 0){
		s = emalloc(sizeof(Scan), 1);
		packdkey(pfx, sizeof(pfx), f->qpath, nil);
		btnewscan(s, pfx, sizeof(pfx));
		lock(f);
		if(f->scan != nil)
			free(f->scan);
		f->scan = s;
		unlock(f);
	}
	if(s->donescan){
		r->count = 0;
		return;
	}
	p = r->data;
	n = m->count;
	if(s->overflow){
		if((ns = kv2statbuf(&s->kv, p, n)) == -1){
			r->count = 0;
			return;
		}
		s->overflow = 0;
		p += ns;
		n -= ns;
	}
	btenter(t, s);
	while(1){
		if(!btnext(s, &s->kv))
			break;
		if((ns = kv2statbuf(&s->kv, p, n)) == -1){
			s->overflow = 1;
			break;
		}
		p += ns;
		n -= ns;
	}
	btexit(s);
	r->count = p - r->data;
}

static void
readfile(Fmsg *m, Fid *f, Fcall *r)
{
	vlong n, c, o;
	char *p;
	Dent *e;
	Tree *t;

	e = f->dent;
	rlock(e);
	if(m->offset > e->length){
		runlock(e);
		return;
	}
	p = r->data;
	c = m->count;
	o = m->offset;
	t = agetp(&f->mnt->root);
	if(m->offset + m->count > e->length)
		c = e->length - m->offset;
	while(c != 0){
		n = readb(t, f, p, o, c, e->length);
		r->count += n;
		if(n == 0)
			break;
		p += n;
		o += n;
		c -= n;
	}
	runlock(e);
}

static void
fsread(Fmsg *m)
{
	Fcall r;
	Fid *f;

	if((f = getfid(m->conn, m->fid)) == nil){
		rerror(m, Enofid);
		return;
	}
	r.type = Rread;
	r.count = 0;
	r.data = nil;
	if(waserror()){
		rerror(m, errmsg());
		free(r.data);
		putfid(f);
		return;
	}	
	r.data = emalloc(m->count, 0);
	if(f->dent->qid.type & QTAUTH)
		authread(f, &r, r.data, m->count);
	else if(f->dent->qid.path == Qdump)
		readsnap(m, f, &r);
	else if(f->dent->qid.type & QTDIR)
		readdir(m, f, &r);
	else
		readfile(m, f, &r);
	respond(m, &r);
	free(r.data);
	poperror();
	putfid(f);
}

static void
fswrite(Fmsg *m, int id)
{
	char sbuf[Wstatmax], kbuf[Max9p/Blksz+2][Offksz], vbuf[Max9p/Blksz+2][Ptrsz];
	Bptr bp[Max9p/Blksz + 2];
	Msg kv[Max9p/Blksz + 2];
	vlong n, o, c, w;
	int i, j;
	char *p;
	Fcall r;
	Tree *t;
	Fid *f;

	if((f = getfid(m->conn, m->fid)) == nil){
		rerror(m, Enofid);
		return;
	}
	if(!(f->mode & DMWRITE)){
		rerror(m, Einuse);
		putfid(f);
		return;
	}
	truncwait(f->dent, id);
	wlock(f->dent);
	if(waserror()){
		rerror(m, errmsg());
		wunlock(f->dent);
		putfid(f);
		return;
	}
	if(f->dent->gone)
		error(Ephase);
	if(f->dent->qid.type == QTAUTH){
		authwrite(f, &r, m->data, m->count);
		goto Out;
	}		

	w = 0;
	p = m->data;
	o = m->offset;
	c = m->count;
	t = agetp(&f->mnt->root);
	for(i = 0; i < nelem(kv)-1 && c != 0; i++){
		assert(i == 0 || o%Blksz == 0);
		kv[i].op = Oinsert;
		kv[i].k = kbuf[i];
		kv[i].nk = sizeof(kbuf[i]);
		kv[i].v = vbuf[i];
		kv[i].nv = sizeof(vbuf[i]);
		if(waserror()){
			if(!fs->rdonly)
				for(j = 0; j < i; j++)
					freeblk(t, nil, bp[j]);
			nexterror();
		}
		n = writeb(f, &kv[i], &bp[i], p, o, c, f->dent->length);
		poperror();
		w += n;
		p += n;
		o += n;
		c -= n;
	}

	p = sbuf;
	kv[i].op = Owstat;
	kv[i].k = f->dent->k;
	kv[i].nk = f->dent->nk;
	n = m->offset+w;
	*p++ = 0;
	if(n > f->dent->length){
		sbuf[0] |= Owsize;
		PACK64(p, n);
		p += 8;
		f->dent->length = m->offset+m->count;
	}
	sbuf[0] |= Owmtime;
	f->dent->mtime = nsec();
	PACK64(p, f->dent->mtime);
	p += 8;
	sbuf[0] |= Owmuid;
	PACK32(p, f->uid);
	p += 4;

	kv[i].v = sbuf;
	kv[i].nv = p - sbuf;
	upsert(f->mnt, kv, i+1);

	r.type = Rwrite;
	r.count = w;
Out:
	poperror();
 	respond(m, &r);
	wunlock(f->dent);
	putfid(f);	
}

void
fsflush(Fmsg *m)
{
	Fcall r;

	r.type = Rflush;
	respond(m, &r);
}

Conn *
newconn(int rfd, int wfd)
{
	Conn *c;

	if((c = mallocz(sizeof(*c), 1)) == nil)
		return nil;
	c->rfd = rfd;
	c->wfd = wfd;
	c->iounit = Max9p;
	c->next = fs->conns;
	lock(&fs->connlk);
	fs->conns = c;
	unlock(&fs->connlk);
	return c;
}

void
runfs(int, void *pc)
{
	char err[128];
	RWLock *lk;
	Conn *c;
	Fcall r;
	Fmsg *m;
	u32int h;

	c = pc;
	while(1){
		if(readmsg(c, &m) < 0){
			fshangup(c, "read message: %r");
			return;
		}
		if(m == nil)
			break;
		if(convM2S(m->buf, m->sz, m) == 0){
			fshangup(c, "invalid message: %r");
			return;
		}
		if(m->type != Tversion && !c->versioned){
			fshangup(c, "version required");
			return;
		}
		dprint("← %F\n", &m->Fcall);

		if(m->type == Tflush){
			lk = &fs->flushq[ihash(m->oldtag) % Nflushtab];
			wlock(lk);
		}else{
			lk = &fs->flushq[ihash(m->tag) % Nflushtab];
			rlock(lk);
		}

		h = ihash(m->fid) % fs->nreaders;
		switch(m->type){
		/* sync setup, must not access tree */
		case Tversion:	fsversion(m);	break;
		case Tauth:	fsauth(m);	break;
		case Tclunk:	fsclunk(m);	break;
		case Tflush:	fsflush(m);	break;

		/* mutators */
		case Tcreate:	chsend(fs->wrchan, m);	break;
		case Twrite:	chsend(fs->wrchan, m);	break;
		case Twstat:	chsend(fs->wrchan, m);	break;
		case Tremove:	chsend(fs->wrchan, m);	break;

		/* reads */
		case Tattach:	chsend(fs->rdchan[h], m);	break;
		case Twalk:	chsend(fs->rdchan[h], m);	break;
		case Tread:	chsend(fs->rdchan[h], m);	break;
		case Tstat:	chsend(fs->rdchan[h], m);	break;

		/* both */
		case Topen:
			if((m->mode & OTRUNC) || (m->mode & 0xf) == OEXEC)
				chsend(fs->wrchan, m);
			else
				chsend(fs->rdchan[h], m);
			break;

		default:
			fprint(2, "unknown message %F\n", &m->Fcall);
			snprint(err, sizeof(err), "unknown message: %F", &m->Fcall);
			r.type = Rerror;
			r.ename = err;
			respond(m, &r);
			break;
		}
		assert(estacksz() == 0);
	}
}

void
runmutate(int id, void *)
{
	Fmsg *m;
	Amsg *a;
	Fid *f;

	while(1){
		a = nil;
		m = chrecv(fs->wrchan);
		if(fs->rdonly){
			/*
			 * special case: even if Tremove fails, we need
			 * to clunk the fid.
			 */
			if(m->type == Tremove){
				if((f = getfid(m->conn, m->fid)) == nil){
					rerror(m, Enofid);
					continue;
				}
				clunkfid(m->conn, f);
				putfid(f);
			}
			rerror(m, Erdonly);
			continue;
 		}

		qlock(&fs->mutlk);
		epochstart(id);
		fs->snap.dirty = 1;
		switch(m->type){
		case Tcreate:	fscreate(m);		break;
		case Twrite:	fswrite(m, id);		break;
		case Twstat:	fswstat(m, id, &a);	break;
		case Tremove:	fsremove(m, id, &a);	break;
		case Topen:	fsopen(m, id, &a);	break;
		default:	abort();		break;
		}
		assert(estacksz() == 0);
		epochend(id);
		epochclean();
		qunlock(&fs->mutlk);

		if(a != nil)
			chsend(fs->admchan, a);
	}
}

void
runread(int id, void *ch)
{
	Fmsg *m;

	while(1){
		m = chrecv(ch);
		epochstart(id);
		switch(m->type){
		case Tattach:	fsattach(m);		break;
		case Twalk:	fswalk(m);		break;
		case Tread:	fsread(m);		break;
		case Tstat:	fsstat(m);		break;
		case Topen:	fsopen(m, id, nil);	break;
		}
		assert(estacksz() == 0);
		epochend(id);
	}
}

void
freetree(Bptr rb, vlong pred)
{
	Bptr bp;
	Blk *b;
	Kvp kv;
	int i;

	b = getblk(rb, 0);
	if(b->type == Tpivot){
		for(i = 0; i < b->nval; i++){
			getval(b, i, &kv);
			bp = unpackbp(kv.v, kv.nv);
			freetree(bp, pred);
			qlock(&fs->mutlk);
			epochclean();
			qunlock(&fs->mutlk);
		}
	}
	if(rb.gen > pred)
		freeblk(nil, nil, rb);
	dropblk(b);
}

/*
 * Here, we clean epochs frequently, but we run outside of
 * an epoch; this is because the caller of this function
 * has already waited for an epoch to tick over, there's
 * nobody that can be accessing the tree other than us,
 * and we just need to keep the limbo list short.
 *
 * Because this is the last reference to the tree, we don't
 * need to hold the mutlk, other than when we free or kill
 * blocks via epochclean.
 */
void
sweeptree(Tree *t)
{
	char pfx[1];
	Scan s;
	Bptr bp;
	pfx[0] = Kdat;
	btnewscan(&s, pfx, 1);
	btenter(t, &s);
	while(1){
		if(!btnext(&s, &s.kv))
			break;
		bp = unpackbp(s.kv.v, s.kv.nv);
		if(bp.gen > t->pred)
			freeblk(nil, nil, bp);
		qlock(&fs->mutlk);
		epochclean();
		qunlock(&fs->mutlk);
	}
	btexit(&s);
	freetree(t->bp, t->pred);
}

void
runsweep(int id, void*)
{
	char buf[Offksz];
	Bptr bp, nb, *oldhd;
	vlong off;
	Tree *t;
	Arena *a;
	Amsg *am;
	Blk *b;
	Msg m;
	int i;

	if((oldhd = calloc(fs->narena, sizeof(Bptr))) == nil)
		sysfatal("malloc log heads");
	while(1){
		am = chrecv(fs->admchan);
		if(agetl(&fs->rdonly)){
			fprint(2, "spurious adm message\n");
			break;
		}
		switch(am->op){
		case AOsync:
			tracem("syncreq");
			if(!fs->snap.dirty && !am->halt)
				continue;
			if(waserror()){
				fprint(2, "sync error: %s\n", errmsg());
				ainc(&fs->rdonly);
				break;
			}

			if(am->halt)
				ainc(&fs->rdonly);
			qlock(&fs->mutlk);
			for(i = 0; i < fs->narena; i++){
				a = &fs->arenas[i];
				qlock(a);
				if(a->nlog < a->reserve/(10*Blksz)){
					oldhd[i].addr = -1;
					oldhd[i].hash = -1;
					oldhd[i].gen = -1;
					qunlock(a);
					continue;
				}
				if(waserror()){
					qunlock(&fs->mutlk);
					qunlock(a);
					nexterror();
				}
				oldhd[i] = a->loghd;
				epochstart(id);
				compresslog(a);
				qunlock(a);
				epochend(id);
				epochclean();
				poperror();
			}
			qunlock(&fs->mutlk);
			sync();

			for(i = 0; i < fs->narena; i++){
				for(bp = oldhd[i]; bp.addr != -1; bp = nb){
					qlock(&fs->mutlk);
					epochstart(id);
					b = getblk(bp, 0);
					nb = b->logp;
					freeblk(nil, b, b->bp);
					dropblk(b);
					epochend(id);
					epochclean();
					qunlock(&fs->mutlk);
				}
			}

			if(am->halt){
				assert(fs->snapdl.hd.addr == -1);
				assert(fs->snapdl.tl.addr == -1);
				postnote(PNGROUP, getpid(), "halted");
				exits(nil);
			}
			poperror();
			break;

		case AOsnap:
			tracem("snapreq");
			if(waserror()){
				fprint(2, "taking snap: %s\n", errmsg());
				ainc(&fs->rdonly);
				break;
			}

			qlock(&fs->mutlk);
			if(waserror()){
				qunlock(&fs->mutlk);
				nexterror();
			}
			epochstart(id);
			snapfs(am, &t);
			epochend(id);
			poperror();
			qunlock(&fs->mutlk);

			sync();

			if(t != nil){
				epochwait();
				sweeptree(t);
				closesnap(t);
			}
			poperror();
			break;

		case AOclear:
			tracem("bgclear");
			if(waserror()){
				fprint(2, "clear file %llx: %s\n", am->qpath, errmsg());
				ainc(&fs->rdonly);
				break;
			}
			if(am->dent != nil)
				qlock(&am->dent->trunclk);
			fs->snap.dirty = 1;
			for(off = am->off; off < am->end; off += Blksz){
				qlock(&fs->mutlk);
				if(waserror()){
					qunlock(&fs->mutlk);
					nexterror();
				}
				epochstart(id);
				m.k = buf;
				m.nk = sizeof(buf);
				m.op = Oclearb;
				m.k[0] = Kdat;
				PACK64(m.k+1, am->qpath);
				PACK64(m.k+9, off);
				m.v = nil;
				m.nv = 0;
				upsert(am->mnt, &m, 1);
				epochend(id);
				epochclean();
				qunlock(&fs->mutlk);
				poperror();
			}
			if(am->dent != nil){
				am->dent->trunc = 0;
				rwakeup(&am->dent->truncrz);
				qunlock(&am->dent->trunclk);
				clunkdent(am->dent);
			}
			clunkmount(am->mnt);
			poperror();
			break;
		}
		assert(estacksz() == 0);
		free(am);
	}
}

void
snapmsg(char *old, char *new, int flg)
{
	Amsg *a;

	a = emalloc(sizeof(Amsg), 1);
	a->op = AOsnap;
	a->fd = -1;
	a->flag = flg;
	strecpy(a->old, a->old+sizeof(a->old), old);
	if(new == nil)
		a->delete = 1;
	else
		strecpy(a->new, a->new+sizeof(a->new), new);
	chsend(fs->admchan, a);
}

void
runtasks(int, void *)
{
	char buf[128];
	Tm now, then;
	Mount *mnt;
	int m, h;
	Amsg *a;

	m = 0;
	h = 0;
	tmnow(&then, nil);
	tmnow(&now, nil);
	while(1){
		sleep(5000);
		if(fs->rdonly)
			continue;
		if(waserror()){
			fprint(2, "task error: %s\n", errmsg());
			continue;
		}
		a = emalloc(sizeof(Amsg), 1);
		a->op = AOsync;
		a->halt = 0;
		a->fd = -1;
		chsend(fs->admchan, a);

		tmnow(&now, nil);
		for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
			if(mnt->noauto)
				continue;
			if(now.yday != then.yday){
				snprint(buf, sizeof(buf),
					"%s@day.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
				snapmsg("main", buf, Lauto);
			}
			if(now.hour != then.hour){
				if(mnt->hourly[h][0] != 0)
					snapmsg(mnt->hourly[h], nil, 0);
				snprint(mnt->hourly[h], sizeof(mnt->hourly[h]),
					"%s@hour.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
				snapmsg("main", mnt->hourly[h], Lauto);
			}
			if(now.min != then.min){
				if(mnt->minutely[m][0] != 0)
					snapmsg(mnt->minutely[m], nil, 0);
				snprint(mnt->minutely[m], sizeof(mnt->minutely[m]),
					"%s@minute.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
				snapmsg("main", mnt->minutely[m], Lauto);
			}
		}
		if(now.hour != then.hour)
			h = (h+1)%24;
		if(now.min != then.min)
			m = (m+1)%60;
		then = now;
		poperror();
	}
}