ref: 5eafd0ab6729981323928f2db6b7b2c10fe317c6
author: Ori Bernstein <ori@eigenstate.org>
date: Sat Jul 20 21:15:44 EDT 2024
gefix: hacked gefs to fix ream issue
--- /dev/null
+++ b/atomic-386.s
@@ -1,0 +1,100 @@
+/* get variants: ageti, agetl and agetp share one body that returns the 32-bit word at p */
+TEXT ageti+0(SB),1,$0
+TEXT agetl+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+ MOVL p+0(FP), AX /* AX = p */
+ MOVL 0(AX), AX /* AX = *p, the return value */
+ RET
+
+TEXT agetv+0(SB),1,$0 /* 64-bit get: copies the 8 bytes at p to the slot r points at */
+ MOVL r+0(FP), AX /* AX = r; presumably the hidden pointer for the vlong result */
+ MOVL p+4(FP), BX /* BX = p */
+ FMOVD (BX), F0 /* one 8-byte x87 load of *p */
+ FMOVDP F0, (AX) /* one 8-byte store to *r; presumably the P form also pops F0 */
+ RET
+
+/* set variants: atomically store the new value at p and hand back the value it replaced */
+TEXT aseti+0(SB),1,$0
+TEXT asetl+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+	MOVL	p+0(FP), BX	/* BX = p */
+	MOVL	v+4(FP), AX	/* AX = new value */
+	LOCK; XCHGL	(BX), AX	/* swap: *p = v, AX = old *p (returned) */
+	RET
+
+TEXT asetv+0(SB),1,$0	/* 64-bit set: arguments follow the result slot r, as in agetv */
+	MOVL	p+4(FP), DI	/* DI = p */
+	MOVL	nv+8(FP), BX	/* CX:BX = new value */
+	MOVL	nv+12(FP), CX
+	MOVL	0(DI), AX	/* DX:AX = first guess at the current value */
+	MOVL	4(DI), DX
+loop:
+	LOCK; CMPXCHG8B (DI)	/* store CX:BX if *p == DX:AX, else DX:AX is reloaded from *p */
+	JNE	loop	/* lost a race: retry with the refreshed DX:AX */
+	MOVL	r+0(FP),DI	/* DI = r, the result slot at offset 0 (p is at offset 4) */
+	MOVL	AX, 0(DI)	/* *r = previous value */
+	MOVL	DX, 4(DI)
+	RET
+
+/* inc variants: atomically add v to *p and return the updated value */
+TEXT ainci+0(SB),1,$0
+TEXT aincl+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+ MOVL p+0(FP), BX /* BX = p */
+ MOVL v+4(FP), CX /* CX = increment */
+ MOVL CX, AX
+ LOCK; XADDL AX, (BX) /* *p += AX; AX = value of *p before the add */
+ ADDL CX, AX /* AX = old + v, the value just stored */
+ RET
+
+TEXT aincv+0(SB),1,$0 /* 64-bit add; the updated value is written through r */
+ MOVL p+4(FP), DI /* DI = p */
+retry:
+ MOVL 0(DI), AX /* DX:AX = current value as last seen */
+ MOVL 4(DI), DX
+ MOVL AX, BX /* CX:BX = DX:AX + v, with carry into the high word */
+ MOVL DX, CX
+ ADDL v+8(FP), BX
+ ADCL v+12(FP), CX
+ LOCK; CMPXCHG8B (DI) /* store CX:BX only if *p still equals DX:AX */
+ JNE retry /* *p changed in between: recompute from a fresh read */
+ MOVL r+0(FP), DI
+ MOVL BX, 0x0(DI) /* *r = updated value */
+ MOVL CX, 0x4(DI)
+ RET
+
+/* cas variants: if *p == ov, store nv and return 1; otherwise return 0 */
+TEXT acasi+0(SB),1,$0
+TEXT acasl+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+ MOVL p+0(FP), CX /* CX = p */
+ MOVL ov+4(FP), AX /* AX = expected value */
+ MOVL nv+8(FP), DX /* DX = replacement */
+ LOCK; CMPXCHGL DX, (CX) /* ZF set and *p = DX when *p == AX */
+ JNE fail32
+ MOVL $1,AX
+ RET
+fail32:
+ MOVL $0,AX
+ RET
+
+TEXT acasv+0(SB),1,$0 /* 64-bit cas: ov and nv each occupy two argument words */
+ MOVL p+0(FP), DI /* DI = p */
+ MOVL ov+4(FP), AX /* DX:AX = expected value */
+ MOVL ov+8(FP), DX
+ MOVL nv+12(FP), BX /* CX:BX = replacement */
+ MOVL nv+16(FP), CX
+ LOCK; CMPXCHG8B (DI) /* ZF set and *p = CX:BX when *p == DX:AX */
+ JNE fail64
+ MOVL $1,AX
+ RET
+fail64:
+ MOVL $0,AX
+ RET
+
+/* barriers (do we want to distinguish types?) */
+TEXT coherence+0(SB),1,$0
+ /* this is essentially mfence but that requires sse2 */
+ XORL AX, AX /* AX = 0, so the add below leaves memory unchanged */
+ LOCK; XADDL AX, (SP) /* locked read-modify-write of the word at SP */
+ RET
--- /dev/null
+++ b/atomic-amd64.s
@@ -1,0 +1,59 @@
+/* get variants: return the value stored at the address in RARG */
+TEXT agetl+0(SB),1,$0
+ MOVL (RARG), AX /* 32-bit load */
+ RET
+TEXT agetv+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+ MOVQ (RARG), AX /* 64-bit load, shared by agetv and agetp */
+ RET
+
+/* set variants: exchange the value at (RARG) with v; the old value is left in AX */
+TEXT asetl+0(SB),1,$0
+ MOVL v+8(FP), AX /* AX = new value */
+ LOCK; XCHGL (RARG), AX /* swap the 32-bit word with AX */
+ RET
+
+TEXT asetv+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+ MOVQ v+8(FP), AX /* AX = new value */
+ LOCK; XCHGQ (RARG), AX /* swap the 64-bit word with AX */
+ RET
+
+/* inc variants: add v to the value at (RARG) and return the updated value */
+TEXT aincl+0(SB),1,$0
+ MOVQ v+8(FP), BX /* NOTE(review): 64-bit load of a long argument; presumably only the low 32 bits matter */
+ MOVQ BX, AX
+ LOCK; XADDL AX, (RARG) /* 32-bit add to memory; AX = value before the add */
+ ADDQ BX, AX /* AX = old + v */
+ RET
+
+TEXT aincv+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+ MOVQ v+8(FP), BX /* BX = increment */
+ MOVQ BX, AX
+ LOCK; XADDQ AX, (RARG) /* 64-bit add to memory; AX = value before the add */
+ ADDQ BX, AX /* AX = old + v */
+ RET
+
+/* cas variants: if the value at (RARG) equals c, store v and return 1; otherwise return 0 */
+TEXT acasl+0(SB),1,$0
+ MOVL c+8(FP), AX /* AX = expected value */
+ MOVL v+16(FP), BX /* BX = replacement */
+ LOCK; CMPXCHGL BX, (RARG) /* stores BX and sets ZF when the word equals AX */
+ SETEQ AX /* low byte of AX = ZF */
+ MOVBLZX AX, AX /* zero-extend that byte into the int result */
+ RET
+
+TEXT acasv+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+ MOVQ c+8(FP), AX /* AX = expected value */
+ MOVQ v+16(FP), BX /* BX = replacement */
+ LOCK; CMPXCHGQ BX, (RARG) /* 64-bit form of the compare-and-store above */
+ SETEQ AX
+ MOVBLZX AX, AX
+ RET
+
+/* barriers (do we want to distinguish types?) */
+TEXT coherence+0(SB),1,$0
+ MFENCE /* the whole barrier is this one fence instruction */
+ RET
--- /dev/null
+++ b/atomic-arm.c
@@ -1,0 +1,95 @@
+#include <u.h>
+#include <libc.h>
+
+#include "atomic.h"
+
+static Lock locktab[128];	/* lock stripes; ihash() picks the stripe that guards an address */
+
+/* Mixes the bits of address p and reduces them to an index into locktab. */
+static u32int
+ihash(void *p)
+{
+	uintptr h = (uintptr)p;
+
+	/* constants from splitmix32 rng */
+	h ^= h >> 16; h *= 0x85ebca6b;
+	h ^= h >> 13; h *= 0xc2b2ae35;
+	return (h ^ (h >> 16)) & (nelem(locktab)-1);
+}
+/* GET(T, n): defines T n(T *p), returning *p as read under the lock that ihash(p) selects. */
+#define GET(T, n) \
+	T n(T *p) \
+	{ \
+		Lock *l; \
+		T cur; \
+		\
+		l = &locktab[ihash(p)]; \
+		lock(l); \
+		cur = *p; \
+		unlock(l); \
+		return cur; \
+	}
+/* SET(T, n): defines T n(T *p, T v), which stores v in *p under the stripe lock and returns the old *p. */
+#define SET(T, n) \
+	T n(T *p, T v) \
+	{ \
+		Lock *l; \
+		T old; \
+		\
+		l = &locktab[ihash(p)]; \
+		lock(l); \
+		old = *p; \
+		*p = v; \
+		unlock(l); \
+		return old; \
+	}
+/* INC(T, n): defines T n(T *p, T dv), which adds dv to *p under the stripe lock and returns the sum. */
+#define INC(T, n) \
+	T n(T *p, T dv) \
+	{ \
+		Lock *l; \
+		T cur; \
+		\
+		l = &locktab[ihash(p)]; \
+		lock(l); \
+		cur = *p + dv; \
+		*p = cur; \
+		unlock(l); \
+		return cur; \
+	}
+/* CAS(T, n): defines int n(T *p, T ov, T nv); under the stripe lock, stores nv and returns 1 iff *p == ov. */
+#define CAS(T, n) \
+	int n(T *p, T ov, T nv) \
+	{ \
+		Lock *l; \
+		int hit; \
+		\
+		l = &locktab[ihash(p)]; \
+		lock(l); \
+		hit = 0; \
+		if(*p == ov){ \
+			*p = nv; \
+			hit = 1; \
+		} \
+		unlock(l); \
+		return hit; \
+	}
+/* Instantiate the accessors: GET, SET and CAS for int, long, vlong and void*; INC for the three integer types only. */
+GET(int, ageti)
+GET(long, agetl)
+GET(vlong, agetv)
+GET(void*, agetp)
+
+SET(int, aseti)
+SET(long, asetl)
+SET(vlong, asetv)
+SET(void*, asetp)
+
+INC(int, ainci)
+INC(long, aincl)
+INC(vlong, aincv)
+
+CAS(int, acasi)
+CAS(long, acasl)
+CAS(vlong, acasv)
+CAS(void*, acasp)
--- /dev/null
+++ b/atomic-arm64.s
@@ -1,0 +1,79 @@
+/* get variants: return the value stored at the address in R0 */
+TEXT agetl+0(SB),1,$0
+ MOVW (R0), R0 /* 32-bit load */
+ RETURN
+TEXT agetv+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+ MOV (R0), R0 /* 64-bit load, shared by agetv and agetp */
+ RETURN
+
+/* set variants: store the new value at p and return the value it replaced */
+TEXT asetl+0(SB),1,$0
+ MOV 0x08(FP), R1 /* R1 = new value */
+ MOV R0, R2 /* R2 = p; R0 is reused for the result */
+_setl:
+ LDAXRW (R2), R0 /* R0 = *p, load-acquire exclusive */
+ STLXRW R1, (R2), R3 /* try *p = R1; R3 = 0 on success */
+ CBNZW R3, _setl /* exclusive store failed: retry */
+ RETURN
+TEXT asetv+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+ MOV 0x08(FP), R1 /* R1 = new value */
+ MOV R0, R2 /* R2 = p */
+_setp:
+ LDAXR (R2), R0 /* 64-bit form of the loop above */
+ STLXR R1, (R2), R3
+ CBNZW R3, _setp
+ RETURN
+
+/* inc variants: atomically add the increment to *p and return the updated value (the sum, as the x86, power64 and lock-based versions do) */
+TEXT aincl+0(SB),1,$0
+	MOV	0x08(FP), R1	/* R1 = increment */
+	MOV	R0, R2	/* R2 = p; R0 becomes the result */
+_incl:
+	LDAXRW	(R2), R0	/* R0 = *p, load-acquire exclusive */
+	ADDW	R1, R0, R0	/* R0 = old + increment: the value stored and returned */
+	STLXRW	R0, (R2), R4	/* try *p = R0; R4 = 0 on success */
+	CBNZW	R4, _incl	/* exclusive store failed: reload and retry */
+	RETURN
+TEXT aincv+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+	MOV	0x08(FP), R1	/* R1 = increment */
+	MOV	R0, R2	/* R2 = p */
+_incp:
+	LDAXR	(R2), R0	/* 64-bit form of the loop above */
+	ADD	R1, R0, R0
+	STLXR	R0, (R2), R4
+	CBNZW	R4, _incp
+	RETURN
+
+/* cas variants: one load-exclusive/store-exclusive attempt with no retry loop; returns 1 only if the store went through */
+TEXT acasl+0(SB),1,$0
+ MOV 0x08(FP), R1 /* R1 = expected value */
+ MOV 0x10(FP), R2 /* R2 = replacement */
+ LDAXRW (R0), R3 /* R3 = *p */
+ CMPW R1, R3
+ BNE _casl /* mismatch: flags are NE, so the CSETW below yields 0 */
+ STLXRW R2, (R0), R4 /* try *p = R2; R4 = 0 on success */
+ CMPW $0, R4 /* EQ exactly when the exclusive store succeeded */
+_casl:
+ CSETW EQ, R0 /* R0 = 1 if EQ, else 0 */
+ RETURN
+TEXT acasv+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+ MOV 0x08(FP), R1 /* R1 = expected value */
+ MOV 0x10(FP), R2 /* R2 = replacement */
+ LDAXR (R0), R3 /* 64-bit form of the sequence above */
+ CMP R1, R3
+ BNE _casp
+ STLXR R2, (R0), R4
+ CMPW $0, R4
+_casp:
+ CSETW EQ, R0
+ RETURN
+
+/* barriers: coherence() is a single DMB; ISH evaluates to 11, presumably the inner-shareable full-barrier option — TODO confirm */
+#define ISH (2<<2 | 3)
+TEXT coherence+0(SB),1,$0
+ DMB $ISH
+ RETURN
--- /dev/null
+++ b/atomic-mips.c
@@ -1,0 +1,95 @@
+#include <u.h>
+#include <libc.h>
+
+#include "atomic.h"
+
+static Lock locktab[128];	/* lock stripes; ihash() picks the stripe that guards an address */
+
+/* Mixes the bits of address p and reduces them to an index into locktab. */
+static u32int
+ihash(void *p)
+{
+	uintptr h = (uintptr)p;
+
+	/* constants from splitmix32 rng */
+	h ^= h >> 16; h *= 0x85ebca6b;
+	h ^= h >> 13; h *= 0xc2b2ae35;
+	return (h ^ (h >> 16)) & (nelem(locktab)-1);
+}
+/* GET(T, n): defines T n(T *p), returning *p as read under the lock that ihash(p) selects. */
+#define GET(T, n) \
+	T n(T *p) \
+	{ \
+		Lock *l; \
+		T cur; \
+		\
+		l = &locktab[ihash(p)]; \
+		lock(l); \
+		cur = *p; \
+		unlock(l); \
+		return cur; \
+	}
+/* SET(T, n): defines T n(T *p, T v), which stores v in *p under the stripe lock and returns the old *p. */
+#define SET(T, n) \
+	T n(T *p, T v) \
+	{ \
+		Lock *l; \
+		T old; \
+		\
+		l = &locktab[ihash(p)]; \
+		lock(l); \
+		old = *p; \
+		*p = v; \
+		unlock(l); \
+		return old; \
+	}
+/* INC(T, n): defines T n(T *p, T dv), which adds dv to *p under the stripe lock and returns the sum. */
+#define INC(T, n) \
+	T n(T *p, T dv) \
+	{ \
+		Lock *l; \
+		T cur; \
+		\
+		l = &locktab[ihash(p)]; \
+		lock(l); \
+		cur = *p + dv; \
+		*p = cur; \
+		unlock(l); \
+		return cur; \
+	}
+/* CAS(T, n): defines int n(T *p, T ov, T nv); under the stripe lock, stores nv and returns 1 iff *p == ov. */
+#define CAS(T, n) \
+	int n(T *p, T ov, T nv) \
+	{ \
+		Lock *l; \
+		int hit; \
+		\
+		l = &locktab[ihash(p)]; \
+		lock(l); \
+		hit = 0; \
+		if(*p == ov){ \
+			*p = nv; \
+			hit = 1; \
+		} \
+		unlock(l); \
+		return hit; \
+	}
+/* Instantiate the accessors: GET, SET and CAS for int, long, vlong and void*; INC for the three integer types only. */
+GET(int, ageti)
+GET(long, agetl)
+GET(vlong, agetv)
+GET(void*, agetp)
+
+SET(int, aseti)
+SET(long, asetl)
+SET(vlong, asetv)
+SET(void*, asetp)
+
+INC(int, ainci)
+INC(long, aincl)
+INC(vlong, aincv)
+
+CAS(int, acasi)
+CAS(long, acasl)
+CAS(vlong, acasv)
+CAS(void*, acasp)
--- /dev/null
+++ b/atomic-power64.s
@@ -1,0 +1,101 @@
+/* get variants: load the value at (RARG) inside the SYNC ... ISYNC sequence the ISA note below refers to */
+TEXT agetl+0(SB),1,$0
+ SYNC
+ // See ISA 3.0B section B.2.3, "Safe Fetch"
+ MOVWZ 0(RARG), RARG /* RARG = 32-bit value at p */
+ CMPW RARG, RARG, CR7 /* compare the loaded value with itself */
+ BC 4, 30, 1(PC) // bne- cr7,0x4
+ ISYNC
+ RETURN
+
+TEXT agetv+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+ SYNC
+ // See ISA 3.0B section B.2.3, "Safe Fetch"
+ MOVD 0(RARG), RARG /* RARG = 64-bit value at p */
+ CMP RARG, RARG, CR7
+ BC 4, 30, 1(PC) // bne- cr7,0x4
+ ISYNC
+ RETURN
+
+/* set variants: atomically exchange *p with val and return the value that was replaced */
+TEXT asetl+0(SB),1,$0	/* a plain store would leave p, not the old value, in RARG */
+	MOVD	RARG, R4; MOVW	val+8(FP), R5	/* R4 = p (RARG becomes the result), R5 = val */
+	SYNC
+	LWAR	(R4), RARG; STWCCC	R5, (R4); BNE	-2(PC)	/* RARG = old *p; back to the LWAR until the store sticks */
+	RETURN
+
+TEXT asetv+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+	MOVD	RARG, R4; MOVD	val+8(FP), R5	/* R4 = p, R5 = val */
+	SYNC
+	LDAR	(R4), RARG; STDCCC	R5, (R4); BNE	-2(PC)	/* 64-bit form of the exchange above */
+	RETURN
+
+/* inc variants: add delta to *p inside a load-reserve/store-conditional loop; returns the updated value */
+TEXT aincl+0(SB),1,$0
+ MOVD RARG, R4 /* R4 = p; RARG is reused for the result */
+ MOVW delta+8(FP), R5
+ LWSYNC
+ LWAR (R4), RARG /* RARG = *p, with reservation */
+ ADD R5, RARG /* RARG = old + delta */
+ STWCCC RARG, (R4) /* conditional store of the sum */
+ BNE -3(PC) /* store failed: back to the LWAR */
+ RETURN
+
+TEXT aincv+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+ MOVD RARG, R4 /* R4 = p */
+ MOVD delta+8(FP), R5
+ LWSYNC
+ LDAR (R4), RARG /* 64-bit form of the loop above */
+ ADD R5, RARG
+ STDCCC RARG, (R4)
+ BNE -3(PC)
+ RETURN
+
+/* cas variants: if *p == old, store new and return 1; otherwise return 0 */
+TEXT acasl+0(SB),1,$0
+ MOVWZ old+8(FP), R4
+ MOVWZ new+16(FP), R5
+ LWSYNC
+casagain:
+ LWAR (RARG), R6 /* R6 = *p, with reservation */
+ CMPW R6, R4
+ BNE casfail /* value differs from old: give up */
+ STWCCC R5, (RARG) /* conditional store of new */
+ BNE casagain /* store failed: reload and compare again */
+ MOVD $1, RARG
+ LWSYNC
+ RETURN
+casfail:
+ LWSYNC
+ AND R0, RARG /* result = RARG & R0; presumably R0 holds zero here — TODO confirm */
+ RETURN
+
+TEXT acasv+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+ MOVD old+8(FP), R4
+ MOVD new+16(FP), R5
+ LWSYNC
+cas64again:
+ LDAR (RARG), R6 /* 64-bit form of the loop above */
+ CMP R6, R4
+ BNE cas64fail
+ STDCCC R5, (RARG)
+ BNE cas64again
+ MOVD $1, RARG
+ LWSYNC
+ RETURN
+cas64fail:
+ LWSYNC
+ AND R0, RARG /* same zeroing idiom as casfail */
+ RETURN
+
+/* barriers: coherence() issues a single LWSYNC (store/load ordering is not among the kinds listed below) */
+TEXT coherence+0(SB),1,$0
+ // LWSYNC is the "export" barrier recommended by Power ISA
+ // v2.07 book II, appendix B.2.2.2.
+ // LWSYNC is a load/load, load/store, and store/store barrier.
+ LWSYNC
+ RETURN
--- /dev/null
+++ b/atomic-spim.c
@@ -1,0 +1,95 @@
+#include <u.h>
+#include <libc.h>
+
+#include "atomic.h"
+
+static Lock locktab[128];	/* lock stripes; ihash() picks the stripe that guards an address */
+
+/* Mixes the bits of address p and reduces them to an index into locktab. */
+static u32int
+ihash(void *p)
+{
+	uintptr h = (uintptr)p;
+
+	/* constants from splitmix32 rng */
+	h ^= h >> 16; h *= 0x85ebca6b;
+	h ^= h >> 13; h *= 0xc2b2ae35;
+	return (h ^ (h >> 16)) & (nelem(locktab)-1);
+}
+/* GET(T, n): defines T n(T *p), returning *p as read under the lock that ihash(p) selects. */
+#define GET(T, n) \
+	T n(T *p) \
+	{ \
+		Lock *l; \
+		T cur; \
+		\
+		l = &locktab[ihash(p)]; \
+		lock(l); \
+		cur = *p; \
+		unlock(l); \
+		return cur; \
+	}
+/* SET(T, n): defines T n(T *p, T v), which stores v in *p under the stripe lock and returns the old *p. */
+#define SET(T, n) \
+	T n(T *p, T v) \
+	{ \
+		Lock *l; \
+		T old; \
+		\
+		l = &locktab[ihash(p)]; \
+		lock(l); \
+		old = *p; \
+		*p = v; \
+		unlock(l); \
+		return old; \
+	}
+/* INC(T, n): defines T n(T *p, T dv), which adds dv to *p under the stripe lock and returns the sum. */
+#define INC(T, n) \
+	T n(T *p, T dv) \
+	{ \
+		Lock *l; \
+		T cur; \
+		\
+		l = &locktab[ihash(p)]; \
+		lock(l); \
+		cur = *p + dv; \
+		*p = cur; \
+		unlock(l); \
+		return cur; \
+	}
+/* CAS(T, n): defines int n(T *p, T ov, T nv); under the stripe lock, stores nv and returns 1 iff *p == ov. */
+#define CAS(T, n) \
+	int n(T *p, T ov, T nv) \
+	{ \
+		Lock *l; \
+		int hit; \
+		\
+		l = &locktab[ihash(p)]; \
+		lock(l); \
+		hit = 0; \
+		if(*p == ov){ \
+			*p = nv; \
+			hit = 1; \
+		} \
+		unlock(l); \
+		return hit; \
+	}
+/* Instantiate the accessors: GET, SET and CAS for int, long, vlong and void*; INC for the three integer types only. */
+GET(int, ageti)
+GET(long, agetl)
+GET(vlong, agetv)
+GET(void*, agetp)
+
+SET(int, aseti)
+SET(long, asetl)
+SET(vlong, asetv)
+SET(void*, asetp)
+
+INC(int, ainci)
+INC(long, aincl)
+INC(vlong, aincv)
+
+CAS(int, acasi)
+CAS(long, acasl)
+CAS(vlong, acasv)
+CAS(void*, acasp)
--- /dev/null
+++ b/atomic.h
@@ -1,0 +1,16 @@
+long agetl(long*); /* get: return the value currently stored at the pointer */
+vlong agetv(vlong*);
+void* agetp(void**);
+
+long asetl(long*, long); /* set: store the second argument; callers such as epochclean() use the result as the previous value */
+vlong asetv(vlong*, vlong);
+void* asetp(void**, void*);
+
+long aincl(long*, long); /* inc: add the second argument to the pointed-to value */
+vlong aincv(vlong*, vlong);
+
+int acasl(long*, long, long); /* cas: store the third argument only if the value equals the second; nonzero on success */
+int acasv(vlong*, vlong, vlong);
+int acasp(void**, void*, void*);
+
+void coherence(void); /* memory barrier */
--- /dev/null
+++ b/blk.c
@@ -1,0 +1,1124 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+#include "atomic.h"
+
+static vlong blkalloc_lk(Arena*, int);
+static vlong blkalloc(int, uint, int);
+static void blkdealloc_lk(Arena*, vlong);
+static Blk* initblk(Blk*, vlong, vlong, int);
+static void readblk(Blk*, Bptr, int);
+/* Returns nonzero when, among the bits named by set|clr, exactly the set bits are on in b->flag. */
+int
+checkflag(Blk *b, int set, int clr)
+{
+	long cur = agetl(&b->flag);
+	int mask = set|clr;
+
+	return (cur & mask) == set;
+}
+/* Atomically rewrites b->flag as (flag & ~clr) | set. */
+void
+setflag(Blk *b, int set, int clr)
+{
+	long seen, want;
+
+	/* loop until the swap lands on the same flag word we computed from */
+	do{
+		seen = agetl(&b->flag);
+		/* drop the clr bits first, then add the set bits */
+		want = (seen & ~clr) | set;
+	}while(!acasl(&b->flag, seen, want));
+}
+/* Writes finalized block b to disk at b->bp.addr and clears Bdirty; a failed or short write is reported through broke(). */
+void
+syncblk(Blk *b)
+{
+	assert(checkflag(b, Bfinal, 0));
+	assert(b->bp.addr >= 0);
+	tracex("syncblk", b->bp, b->type, -1);
+	if(pwrite(fs->fd, b->buf, Blksz, b->bp.addr) != Blksz)	/* anything short of Blksz bytes is a failure */
+		broke("%B %s: %r", b->bp, Eio);
+	setflag(b, 0, Bdirty);
+}
+/* Reads the Blksz-byte block at bp.addr into b, decodes its header by type, and checks its hash unless GBnochk is set in flg. */
+static void
+readblk(Blk *b, Bptr bp, int flg)
+{
+	vlong off, xh, ck, rem, n;
+	char *p;
+
+	off = bp.addr;
+	rem = Blksz;
+	while(rem != 0){
+		n = pread(fs->fd, b->buf + (Blksz - rem), rem, off);	/* land after the bytes already read */
+		if(n <= 0)
+			error("%s: %r", Eio);
+		off += n;
+		rem -= n;
+	}
+	b->cnext = nil;
+	b->cprev = nil;
+	b->hnext = nil;
+
+	b->bp.addr = bp.addr;
+	b->bp.hash = -1;
+	b->bp.gen = -1;
+
+	b->nval = 0;
+	b->valsz = 0;
+	b->nbuf = 0;
+	b->bufsz = 0;
+	b->logsz = 0;
+
+	p = b->buf + 2;	/* first byte after the 16-bit type tag */
+	b->type = (flg&GBraw) ? Tdat : UNPACK16(b->buf+0);	/* raw reads are treated as untagged data */
+	switch(b->type){
+	default:
+		broke("invalid block type %d @%llx", b->type, bp.addr);	/* print the address, not the whole Bptr */
+		break;
+	case Tdat:
+	case Tsuper:
+		b->data = b->buf;
+		break;
+	case Tarena:
+		b->data = p;
+		break;
+	case Tdlist:
+	case Tlog:
+		b->logsz = UNPACK16(p);	p += 2;
+		b->logh = UNPACK64(p);	p += 8;
+		b->logp = unpackbp(p, Ptrsz);	p += Ptrsz;
+		assert(p - b->buf == Loghdsz);
+		b->data = p;
+		break;
+	case Tpivot:
+		b->nval = UNPACK16(p);	p += 2;
+		b->valsz = UNPACK16(p);	p += 2;
+		b->nbuf = UNPACK16(p);	p += 2;
+		b->bufsz = UNPACK16(p);	p += 2;
+		assert(p - b->buf == Pivhdsz);
+		b->data = p;
+		break;
+	case Tleaf:
+		b->nval = UNPACK16(p);	p += 2;
+		b->valsz = UNPACK16(p);	p += 2;
+		assert(p - b->buf == Leafhdsz);
+		b->data = p;
+		break;
+	}
+	if(b->type == Tlog || b->type == Tdlist){
+		xh = b->logh;	/* log blocks carry their own hash over the used log bytes */
+		ck = bufhash(b->data, b->logsz);
+	}else{
+		xh = bp.hash;	/* every other block is checked against the hash in the pointer */
+		ck = blkhash(b);
+	}
+	if(!(flg&GBnochk) && ck != xh){	/* test the GBnochk bit itself; !flg&GBnochk negated flg first */
+		if(!(flg&GBsoftchk))
+			broke("%s: %ullx %llux != %llux", Ecorrupt, bp.addr, xh, ck);
+		fprint(2, "%s: %ullx %llux != %llux", Ecorrupt, bp.addr, xh, ck);
+		error(Ecorrupt);
+	}
+	assert(b->magic == Magic);
+}
+/* Picks an arena: a base slot that moves once per 2048 calls, offset by hint for Tdat blocks and by tries for retries. */
+static Arena*
+pickarena(uint ty, uint hint, int tries)
+{
+	uint idx, rr;
+
+	/* NOTE(review): hint % (narena-1) divides by zero if narena is 1 — confirm that cannot happen */
+	rr = ainc(&fs->roundrobin)/2048;
+	idx = rr;
+	if(ty == Tdat)
+		idx = hint % (fs->narena - 1) + rr + 1;
+	return &fs->arenas[(idx + tries) % fs->narena];
+}
+/* Returns the arena whose byte range contains block address b; the two superblock addresses map to the first and last arena. */
+Arena*
+getarena(vlong b)
+{
+	int hi, lo, mid;
+	vlong alo, ahi;
+	Arena *a;
+
+	lo = 0;
+	hi = fs->narena;
+	if(b == fs->sb0->bp.addr)
+		return &fs->arenas[0];
+	if(b == fs->sb1->bp.addr)
+		return &fs->arenas[hi-1];
+	while(1){
+		mid = (hi + lo)/2;
+		a = &fs->arenas[mid];
+		alo = a->h0->bp.addr;
+		ahi = alo + a->size + 2*Blksz;	/* exclusive end, the same bound logappend() asserts against */
+		if(b < alo)
+			hi = mid-1;
+		else if(b >= ahi)	/* ahi itself is already past this arena */
+			lo = mid+1;
+		else
+			return a;
+	}
+}
+
+/* Adds the range [off, off+len) to tree t, then merges it with any range that touches it on either side. */
+static void
+freerange(Avltree *t, vlong off, vlong len)
+{
+ Arange *r, *s;
+
+ assert(len % Blksz == 0);
+ if((r = calloc(1, sizeof(Arange))) == nil)
+ error(Enomem);
+ r->off = off;
+ r->len = len;
+ assert(avllookup(t, r, 0) == nil); /* the tree must not already hold a matching entry */
+ avlinsert(t, r);
+
+Again:
+ s = (Arange*)avlprev(r);
+ if(s != nil && s->off+s->len == r->off){ /* predecessor ends exactly where r begins */
+ avldelete(t, r);
+ s->len = s->len + r->len; /* grow the predecessor over r */
+ free(r);
+ r = s;
+ goto Again;
+ }
+ s = (Arange*)avlnext(r);
+ if(s != nil && r->off+r->len == s->off){ /* r ends exactly where the successor begins */
+ avldelete(t, r);
+ s->off = r->off; /* extend the successor backwards over r */
+ s->len = s->len + r->len;
+ free(r);
+ r = s;
+ goto Again;
+ }
+}
+/* Removes [off, off+len) from tree t; the span must lie inside one existing range, otherwise we abort. */
+static void
+grabrange(Avltree *t, vlong off, vlong len)
+{
+ Arange *r, *s, q;
+ vlong l;
+
+ assert(len % Blksz == 0);
+ q.off = off;
+ q.len = len;
+ r = (Arange*)avllookup(t, &q.Avl, -1); /* presumably the nearest range at or below off — TODO confirm the -1 lookup mode */
+ if(r == nil || off + len > r->off + r->len)
+ abort();
+
+ if(off == r->off){ /* taken from the front: r keeps the tail */
+ r->off += len;
+ r->len -= len;
+ }else if(off + len == r->off + r->len){ /* taken from the back: r keeps the head */
+ r->len -= len;
+ }else if(off > r->off && off+len < r->off + r->len){ /* taken from the middle: split in two */
+ s = emalloc(sizeof(Arange), 0);
+ l = r->len;
+ s->off = off + len; /* s covers what lies after the hole */
+ r->len = off - r->off; /* r shrinks to what lies before it */
+ s->len = l - r->len - len;
+ avlinsert(t, s);
+ }else
+ abort();
+
+ if(r->len == 0){ /* nothing left of r */
+ avldelete(t, r);
+ free(r);
+ }
+}
+/* Re-initializes whichever of the arena's two log buffers is not the current tail as an empty Tlog block at address o. */
+static Blk*
+mklogblk(Arena *a, vlong o)
+{
+ Blk *lb;
+
+ lb = a->logbuf[0];
+ if(lb == a->logtl) /* buffer 0 is the live tail, so use buffer 1 */
+ lb = a->logbuf[1];
+ assert(lb->ref == 1);
+ lb->flag = Bstatic; /* plain store: every other flag bit is dropped here */
+ initblk(lb, o, -1, Tlog);
+ traceb("logblk" , lb->bp);
+ lb->lasthold0 = lb->lasthold;
+ lb = holdblk(lb); /* returned with one more reference */
+ lb->lasthold = getcallerpc(&a); /* holdblk stored its own caller; record ours instead */
+ return lb;
+}
+
+/*
+ * Appends an alloc, free or sync entry to
+ * the log tail. Must be called with the
+ * arena lock held. Duplicates some of the
+ * work in allocblk to prevent recursion.
+ */
+static void
+logappend(Arena *a, vlong off, vlong len, int op)
+{
+ vlong o, start, end;
+ Blk *lb;
+ char *p;
+
+ assert((off & 0xff) == 0); /* the low byte of an entry is reserved for the op code */
+ assert(op == LogAlloc || op == LogFree || op == LogSync);
+ if(op != LogSync){
+ start = a->h0->bp.addr;
+ end = start + a->size + 2*Blksz;
+ assert(off >= start);
+ assert(off < end);
+ }
+ lb = a->logtl;
+ assert(lb->ref > 0);
+ assert(lb->type == Tlog);
+ assert(lb->logsz >= 0);
+ dprint("logop %d: %llx+%llx@%x\n", op, off, len, lb->logsz);
+
+ if(checkflag(lb, 0, Bdirty)) /* tail was clean: mark it dirty and no longer final */
+ setflag(lb, Bdirty, Bfinal);
+
+ /*
+ * move to the next block when we have
+ * too little room in the log:
+ * We're appending up to 16 bytes as
+ * part of the operation, followed by
+ * 16 bytes of new log entry allocation
+ * and chaining.
+ */
+ if(lb->logsz >= Logspc - Logslop){
+ o = blkalloc_lk(a, 0);
+ if(o == -1)
+ error(Efull);
+ p = lb->data + lb->logsz;
+ PACK64(p, o|LogAlloc1); /* the old tail records the allocation of its successor */
+ lb->logsz += 8;
+ lb->logp = (Bptr){o, -1, -1}; /* and chains to it */
+ lb = mklogblk(a, o);
+ }
+ if(len == Blksz){ /* single-block operations use the one-word entry forms */
+ if(op == LogAlloc)
+ op = LogAlloc1;
+ else if(op == LogFree)
+ op = LogFree1;
+ }
+ off |= op;
+ p = lb->data + lb->logsz;
+ PACK64(p, off);
+ lb->logsz += 8;
+ if(op >= Log2wide){ /* wide entries carry the length in a second word */
+ PACK64(p+8, len);
+ lb->logsz += 8;
+ }
+ if(lb != a->logtl) { /* we moved to a new block above: write both and switch the tail */
+ finalize(lb);
+ syncblk(lb);
+
+ finalize(a->logtl);
+ syncblk(a->logtl);
+ dropblk(a->logtl);
+ a->logtl = lb;
+ a->nlog++;
+ }
+}
+/* Replays the log chain starting at bp into a->free and a->used, leaving the last block read as a->logtl. */
+void
+loadlog(Arena *a, Bptr bp)
+{
+ vlong ent, off, len, gen;
+ int op, i, n;
+ char *d;
+ Blk *b;
+
+
+ dprint("loadlog %B\n", bp);
+ traceb("loadlog", bp);
+ b = a->logbuf[0]; /* the same buffer is reused for every block in the chain */
+ while(1){
+ assert(checkflag(b, Bstatic, Bcached));
+ holdblk(b);
+ readblk(b, bp, 0);
+ dprint("\tload %B chain %B\n", bp, b->logp);
+ a->nlog++;
+ for(i = 0; i < b->logsz; i += n){
+ d = b->data + i;
+ ent = UNPACK64(d);
+ op = ent & 0xff; /* low byte: operation */
+ off = ent & ~0xff; /* remaining bits: offset (or generation for LogSync) */
+ n = (op >= Log2wide) ? 16 : 8; /* wide entries have a second 8-byte word */
+ switch(op){
+ case LogSync:
+ gen = ent >> 8;
+ dprint("\tlog@%x: sync %lld\n", i, gen);
+ if(gen >= fs->qgen){ /* stop replaying at a sync marker from generation qgen or later */
+ if(a->logtl == nil){
+ b->logsz = i; /* truncate the block at this marker */
+ a->logtl = b;
+ cachedel(b->bp.addr);
+ setflag(b, Bdirty, 0);
+ return;
+ }
+ dropblk(b);
+ return;
+ }
+ break;
+
+ case LogAlloc:
+ case LogAlloc1:
+ len = (op >= Log2wide) ? UNPACK64(d+8) : Blksz;
+ dprint("\tlog@%x alloc: %llx+%llx\n", i, off, len);
+ grabrange(a->free, off & ~0xff, len);
+ a->used += len;
+ break;
+ case LogFree:
+ case LogFree1:
+ len = (op >= Log2wide) ? UNPACK64(d+8) : Blksz;
+ dprint("\tlog@%x free: %llx+%llx\n", i, off, len);
+ freerange(a->free, off & ~0xff, len);
+ a->used -= len;
+ break;
+ default:
+ dprint("\tlog@%x: log op %d\n", i, op);
+ abort();
+ break;
+ }
+ }
+ if(b->logp.addr == -1){ /* end of chain: this block becomes the tail */
+ a->logtl = b;
+ return;
+ }
+ bp = b->logp;
+ dropblk(b);
+ }
+}
+/* Finalizes and writes the arena's log tail, unless both Bdirty and Bstatic are clear on it. */
+void
+flushlog(Arena *a)
+{
+	if(!checkflag(a->logtl, 0, Bdirty|Bstatic)){
+		finalize(a->logtl);
+		syncblk(a->logtl);
+	}
+}
+/* Rewrites the arena's log as one LogFree entry per range currently in a->free, then releases the unused preallocated blocks. */
+void
+compresslog(Arena *a)
+{
+ int i, nr, nblks, nlog;
+ vlong sz, *blks;
+ Blk *b;
+ Arange *r;
+ char *p;
+
+ flushlog(a);
+ /*
+ * Prepare what we're writing back.
+ * Arenas must be sized so that we can
+ * keep the merged log in memory for
+ * a rewrite.
+ */
+ sz = 0;
+ nr = 0;
+ nlog = 0;
+ for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
+ sz += 16; /* each range becomes one 16-byte entry */
+ nr++;
+ }
+
+ /*
+ * Make a pessimistic estimate of the number of blocks
+ * needed to store the ranges, as well as the blocks
+ * used to store the range allocations.
+ *
+ * This does modify the tree, but it's safe because
+ * we can only be removing entries from the tree, not
+ * splitting or inserting new ones.
+ */
+ nblks = (sz+Logspc)/(Logspc - Logslop) + 16*nr/(Logspc-Logslop) + 1;
+ if((blks = calloc(nblks, sizeof(vlong))) == nil)
+ error(Enomem);
+ if(waserror()){
+ free(blks); /* only the address array is released on this path */
+ nexterror();
+ }
+ for(i = 0; i < nblks; i++){
+ blks[i] = blkalloc_lk(a, 1);
+ if(blks[i] == -1)
+ error(Efull);
+ }
+
+ /* fill up the log with the ranges from the tree */
+ i = 0;
+ b = mklogblk(a, blks[i++]);
+ for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
+ if(b->logsz >= Logspc - Logslop){ /* current block is full: chain it to the next one and write it */
+ b->logp = (Bptr){blks[i], -1, -1};
+ finalize(b);
+ syncblk(b);
+ dropblk(b);
+ nlog++;
+ b = mklogblk(a, blks[i++]);
+ }
+ p = b->data + b->logsz;
+ PACK64(p+0, r->off|LogFree);
+ PACK64(p+8, r->len);
+ b->logsz += 16;
+ }
+
+ /*
+ * now we have a valid freelist, and we can start
+ * appending stuff to it. Clean up the eagerly
+ * allocated extra blocks.
+ *
+ * Note that we need to drop the reference to the
+ * old logtl before we free the old blocks, because
+ * deallocating a block may require another block.
+ */
+ dropblk(a->logtl);
+ a->loghd = (Bptr){blks[0], -1, -1};
+ a->logtl = b; /* written back by sync() later */
+ a->nlog = nlog;
+ a->lastlogsz = nlog;
+
+ /* May add blocks to new log */
+ for(; i < nblks; i++)
+ blkdealloc_lk(a, blks[i]);
+ poperror();
+ free(blks);
+}
+/* Appends a LogSync entry carrying generation gen to the arena's log; always returns 0. */
+int
+logbarrier(Arena *a, vlong gen)
+{
+ logappend(a, gen<<8, 0, LogSync); /* gen sits above the low byte, which holds the op code */
+ return 0;
+}
+
+/*
+ * Allocate from an arena, with lock
+ * held. May be called multiple times
+ * per operation, to alloc space for
+ * the alloc log.
+ */
+static vlong
+blkalloc_lk(Arena *a, int seq)
+{
+ Arange *r;
+ vlong b;
+
+ if(seq)
+ r = (Arange*)avlmin(a->free); /* sequential: take from the lowest free range */
+ else
+ r = (Arange*)avlmax(a->free); /* otherwise: take from the highest free range */
+ if(!usereserve && a->size - a->used <= a->reserve)
+ return -1; /* only the reserve is left and we may not use it */
+ if(r == nil)
+ broke(Estuffed);
+
+ /*
+ * A bit of sleight of hand here:
+ * we're changing the sorting key,
+ * but we know it won't change
+ * the sort order because the tree
+ * covers disjoint ranges
+ */
+ if(seq){
+ b = r->off; /* first block of the range */
+ r->len -= Blksz;
+ r->off += Blksz;
+ }else{
+ r->len -= Blksz;
+ b = r->off + r->len; /* last block of the range */
+ }
+ if(r->len == 0){
+ avldelete(a->free, r);
+ free(r);
+ }
+ a->used += Blksz;
+ return b;
+}
+/* Frees block address b in arena a: evicts it from the cache, logs the free, returns it to a->free and lowers a->used. */
+static void
+blkdealloc_lk(Arena *a, vlong b)
+{
+ cachedel(b);
+ logappend(a, b, Blksz, LogFree);
+ freerange(a->free, b, Blksz);
+ a->used -= Blksz;
+}
+/* Allocates one block address from some arena and logs the allocation; raises Efull after 2*narena attempts. */
+static vlong
+blkalloc(int ty, uint hint, int seq)
+{
+ Arena *a;
+ vlong b;
+ int tries;
+
+ tries = 0;
+Again:
+ a = pickarena(ty, hint, tries);
+ /*
+ * Loop through the arena up to 2 times.
+ * The first pass tries to find an arena
+ * that has space and is not in use, the
+ * second waits until an arena is free.
+ */
+ if(tries == 2*fs->narena)
+ error(Efull);
+ tries++;
+ if(tries < fs->narena){
+ if(canqlock(a) == 0) /* first pass: skip arenas that are locked */
+ goto Again;
+ }else
+ qlock(a); /* second pass: wait for the lock */
+ if(waserror()){
+ qunlock(a);
+ nexterror();
+ }
+ b = blkalloc_lk(a, seq);
+ if(b == -1){ /* nothing available here: try the next arena */
+ qunlock(a);
+ poperror();
+ goto Again;
+ }
+ logappend(a, b, Blksz, LogAlloc);
+ qunlock(a);
+ poperror();
+ return b;
+}
+/* Sets up b as a fresh, dirty block of type ty at address bp with generation gen; fatal if bp is already cached. Returns b. */
+static Blk*
+initblk(Blk *b, vlong bp, vlong gen, int ty)
+{
+ Blk *ob;
+
+ ob = cacheget(bp);
+ if(ob != nil)
+ fatal("double alloc: %#p %B %#p %B", b, b->bp, ob, ob->bp);
+ b->type = ty;
+ b->bp.addr = bp;
+ b->bp.hash = -1;
+ b->bp.gen = gen;
+ switch(ty){ /* data starts after the type-specific header, if any */
+ case Tdat:
+ b->data = b->buf;
+ break;
+ case Tarena:
+ b->data = b->buf+2;
+ break;
+ case Tdlist:
+ case Tlog:
+ b->logsz = 0;
+ b->logp = (Bptr){-1, -1, -1};
+ b->data = b->buf + Loghdsz;
+ break;
+ case Tpivot:
+ b->data = b->buf + Pivhdsz;
+ break;
+ case Tleaf:
+ b->data = b->buf + Leafhdsz;
+ break;
+ }
+ setflag(b, Bdirty, 0);
+ b->nval = 0;
+ b->valsz = 0;
+ b->nbuf = 0;
+ b->bufsz = 0;
+ b->logsz = 0;
+ b->alloced = getcallerpc(&b);
+
+ return b;
+}
+/* Allocates a Tdat block address using hint and seq, plucks a buffer from the cache and initializes it for tree t. */
+Blk*
+newdblk(Tree *t, vlong hint, int seq)
+{
+	vlong addr;
+	Blk *nb;
+
+	addr = blkalloc(Tdat, hint, seq);
+	nb = cachepluck();
+	initblk(nb, addr, t->memgen, Tdat);
+	/* initblk() stamps ->alloced too; replace it with our caller's pc */
+	nb->alloced = getcallerpc(&t);
+	tracex("newblk" , nb->bp, Tdat, -1);
+	return nb;
+}
+/* Allocates a block address for a block of type ty, plucks a buffer from the cache and initializes it for tree t. */
+Blk*
+newblk(Tree *t, int ty)
+{
+	vlong bp;
+	Blk *b;
+
+	/* no hint, non-sequential allocation; the allocation is recorded by tracex() below */
+	bp = blkalloc(ty, 0, 0);
+	b = cachepluck();
+	initblk(b, bp, t->memgen, ty);
+	b->alloced = getcallerpc(&t);	/* overrides the pc initblk() recorded */
+	tracex("newblk" , b->bp, ty, -1);
+	return b;
+}
+/* Returns a newly allocated block of b's type holding a copy of b's buffer and header counters, or nil if newblk returns nil. */
+Blk*
+dupblk(Tree *t, Blk *b)
+{
+	Blk *cp;
+
+	cp = newblk(t, b->type);
+	if(cp == nil)
+		return nil;
+	tracex("dup" , b->bp, b->type, t->gen);
+	memcpy(cp->buf, b->buf, sizeof(cp->buf));
+	cp->nval = b->nval;
+	cp->valsz = b->valsz;
+	cp->nbuf = b->nbuf;
+	cp->bufsz = b->bufsz;
+	cp->logsz = b->logsz;
+	cp->bp.hash = -1;
+	cp->alloced = getcallerpc(&t);
+	return cp;
+}
+/* Packs b's in-memory header fields into b->buf, recomputes b->bp.hash and sets Bdirty|Bfinal. */
+void
+finalize(Blk *b)
+{
+ if(b->type != Tdat) /* Tdat blocks carry no type tag */
+ PACK16(b->buf, b->type);
+
+ switch(b->type){
+ default:
+ abort();
+ break;
+ case Tpivot:
+ PACK16(b->buf+2, b->nval);
+ PACK16(b->buf+4, b->valsz);
+ PACK16(b->buf+6, b->nbuf);
+ PACK16(b->buf+8, b->bufsz);
+ break;
+ case Tleaf:
+ PACK16(b->buf+2, b->nval);
+ PACK16(b->buf+4, b->valsz);
+ break;
+ case Tdlist:
+ case Tlog:
+ b->logh = bufhash(b->data, b->logsz); /* hash covers only the used part of the log */
+ PACK16(b->buf+2, b->logsz);
+ PACK64(b->buf+4, b->logh);
+ packbp(b->buf+12, Ptrsz, &b->logp);
+ break;
+ case Tdat:
+ case Tarena:
+ case Tsuper:
+ break;
+ }
+
+ b->bp.hash = blkhash(b);
+ setflag(b, Bdirty|Bfinal, 0);
+}
+/* Returns the block for bp: the cached copy if there is one, otherwise a buffer read from disk and inserted into the cache. */
+Blk*
+getblk(Bptr bp, int flg)
+{
+	Blk *b;
+	int i;
+
+	i = ihash(bp.addr) % nelem(fs->blklk);	/* one lock per hash stripe of the address */
+	qlock(&fs->blklk[i]);
+	if(waserror()){
+		qunlock(&fs->blklk[i]);
+		nexterror();
+	}
+	if((b = cacheget(bp.addr)) != nil){
+		assert(checkflag(b, 0, Bfreed));
+		b->lasthold = getcallerpc(&bp);
+		qunlock(&fs->blklk[i]);
+		poperror();
+		return b;
+	}
+	b = cachepluck();
+	b->alloced = getcallerpc(&bp);
+	/* NOTE(review): if readblk() raises, the error handler above only unlocks; confirm the plucked block is reclaimed elsewhere */
+	readblk(b, bp, flg);
+	b->bp.gen = bp.gen;	/* readblk() leaves gen at -1 */
+	b->lasthold = getcallerpc(&bp);
+	cacheins(b);
+	qunlock(&fs->blklk[i]);
+	poperror();
+
+	return b;
+}
+
+/* Takes one more reference on b, records the caller's pc in b->lasthold, and returns b. */
+Blk*
+holdblk(Blk *b)
+{
+ ainc(&b->ref);
+ b->lasthold = getcallerpc(&b);
+ return b;
+}
+/* Releases one reference on b (nil is ignored); when the count reaches zero the block is handed to the LRU list. */
+void
+dropblk(Blk *b)
+{
+	if(b == nil)
+		return;
+	b->lastdrop = getcallerpc(&b);
+	if(adec(&b->ref) == 0){
+		/*
+		 * freed blocks go to the LRU bottom
+		 * for early reuse.
+		 */
+		if(checkflag(b, Bfreed, 0))
+			lrubot(b);
+		else
+			lrutop(b);
+	}
+}
+/* Returns the number of bytes in use in a pivot or leaf block; aborts for any other block type. */
+ushort
+blkfill(Blk *b)
+{
+	/* pivots: two bytes per buf entry and per val entry, plus both payload sizes */
+	if(b->type == Tpivot)
+		return 2*b->nbuf + b->bufsz + 2*b->nval + b->valsz;
+	/* leaves: two bytes per val entry plus the value payload */
+	if(b->type == Tleaf)
+		return 2*b->nval + b->valsz;
+	/* any other type is reported and aborts */
+	fprint(2, "invalid block @%lld\n", b->bp.addr);
+	abort();
+}
+/* Tags l with op and pushes it onto the limbo list of the current epoch, then bumps fs->nlimbo. */
+void
+limbo(int op, Limbo *l)
+{
+	Limbo *head;
+	ulong cur;
+
+	l->op = op;
+	/* link l in front of the current epoch's list; start over if the head moved */
+	do{
+		cur = agetl(&fs->epoch);
+		head = agetp(&fs->limbo[cur]);
+		l->next = head;
+	}while(!acasp(&fs->limbo[cur], head, l));
+
+	/* count the entry only once it is actually linked in */
+	aincl(&fs->nlimbo, 1);
+}
+/* Frees block b on behalf of tree t: via killblk() for the snap tree or an older generation, otherwise by deferring it to limbo. */
+void
+freeblk(Tree *t, Blk *b)
+{
+ if(t == &fs->snap || (t != nil && b->bp.gen < t->memgen)){
+ tracex("killb", b->bp, getcallerpc(&t), -1);
+ killblk(t, b->bp);
+ return;
+ }
+ b->freed = getcallerpc(&t);
+ tracex("freeb", b->bp, getcallerpc(&t), -1);
+ setflag(b, Blimbo, 0);
+ holdblk(b); /* extra reference held while the block sits in limbo; epochclean() drops it */
+ assert(b->ref > 1);
+ limbo(DFblk, b);
+}
+/* Like freeblk(), but for a block known only by pointer: the deferred free is carried by a Bfree record from fs->bfree. */
+void
+freebp(Tree *t, Bptr bp)
+{
+ Bfree *f;
+
+ if(t == &fs->snap || (t != nil && bp.gen < t->memgen)){
+ tracex("killb", bp, getcallerpc(&t), -1);
+ killblk(t, bp);
+ return;
+ }
+ tracex("freeb", bp, getcallerpc(&t), -1);
+
+ qlock(&fs->bfreelk);
+ while(fs->bfree == nil) /* wait until epochclean() returns a record to the list */
+ rsleep(&fs->bfreerz);
+ f = fs->bfree;
+ fs->bfree = (Bfree*)f->next;
+ qunlock(&fs->bfreelk);
+
+ f->bp = bp;
+ limbo(DFbp, f);
+}
+/* Marks worker tid as active in the current global epoch. */
+void
+epochstart(int tid)
+{
+	ulong now = agetl(&fs->epoch);
+
+	/* publish the epoch this worker entered, tagged as active */
+	asetl(&fs->lepoch[tid], now | Eactive);
+}
+/* Marks worker tid as no longer active. */
+void
+epochend(int tid)
+{
+	ulong mine = agetl(&fs->lepoch[tid]);
+
+	/* keep the recorded epoch, drop only the active tag */
+	asetl(&fs->lepoch[tid], mine &~ Eactive);
+}
+/* Blocks until no worker is active in an epoch other than the current global one, sleeping between rescans. */
+void
+epochwait(void)
+{
+ int i, delay;
+ ulong e, ge;
+
+ delay = 0;
+Again:
+ ge = agetl(&fs->epoch);
+ for(i = 0; i < fs->nworker; i++){
+ e = agetl(&fs->lepoch[i]);
+ if((e & Eactive) && e != (ge | Eactive)){ /* worker i is active in some other epoch */
+ if(delay < 1000)
+ delay++; /* back off a little more each time, up to 1000 */
+ else
+ fprint(2, "stalled epoch %lx [worker %d]\n", e, i);
+ sleep(delay);
+ goto Again;
+ }
+ }
+}
+/* Advances the global epoch and processes the limbo list detached from the next epoch's slot, queueing the deferred frees. */
+void
+epochclean(void)
+{
+ ulong c, e, ge;
+ Limbo *p, *n;
+ Blk *b;
+ Bfree *f;
+ Arena *a;
+ Qent qe;
+ int i;
+
+ c = agetl(&fs->nlimbo);
+ ge = agetl(&fs->epoch);
+ for(i = 0; i < fs->nworker; i++){
+ e = agetl(&fs->lepoch[i]);
+ if((e & Eactive) && e != (ge | Eactive)){
+ if(c < fs->cmax/4) /* few entries pending: not worth waiting for a lagging worker */
+ return;
+ epochwait();
+ }
+ }
+ epochwait();
+ p = asetp(&fs->limbo[(ge+1)%3], nil); /* detach the list in the slot about to become current */
+ asetl(&fs->epoch, (ge+1)%3);
+
+ for(; p != nil; p = n){
+ n = p->next; /* saved first: p may be freed or relinked below */
+ switch(p->op){
+ case DFtree:
+ free(p);
+ break;
+ case DFmnt:
+ free(p);
+ break;
+ case DFbp:
+ f = (Bfree*)p;
+ a = getarena(f->bp.addr);
+ if((b = cacheget(f->bp.addr)) != nil){
+ setflag(b, Bfreed, Bdirty|Blimbo);
+ dropblk(b);
+ }
+ qe.op = Qfree;
+ qe.bp = f->bp;
+ qe.b = nil;
+ qput(a->sync, qe);
+ qlock(&fs->bfreelk);
+ f->next = fs->bfree; /* hand the record back to the list freebp() draws from */
+ fs->bfree = f;
+ rwakeup(&fs->bfreerz);
+ qunlock(&fs->bfreelk);
+ break;
+ case DFblk:
+ b = (Blk*)p;
+ qe.op = Qfree;
+ qe.bp = b->bp;
+ qe.b = nil;
+ setflag(b, Bfreed, Bdirty|Blimbo);
+ a = getarena(b->bp.addr);
+ dropblk(b);
+ qput(a->sync, qe);
+ break;
+ default:
+ abort();
+ }
+ aincl(&fs->nlimbo, -1);
+ }
+}
+/* Finalizes dirty block b, inserts it into the cache if it is not there yet, and queues a write on its arena's sync queue. */
+void
+enqueue(Blk *b)
+{
+ Arena *a;
+ Qent qe;
+
+ assert(checkflag(b, Bdirty, Bqueued|Bstatic));
+ assert(b->bp.addr >= 0);
+ finalize(b);
+ if(checkflag(b, 0, Bcached)){
+ cacheins(b);
+ b->cached = getcallerpc(&b);
+ }
+ holdblk(b); /* reference owned by the queue entry; runsync() drops it after the write */
+
+ b->enqueued = getcallerpc(&b);
+ traceb("queueb", b->bp);
+ a = getarena(b->bp.addr);
+ qe.op = Qwrite;
+ qe.bp = b->bp;
+ qe.b = b;
+ qput(a->sync, qe);
+}
+/* Initializes sync queue q as an empty heap with room for fs->cmax entries. */
+void
+qinit(Syncq *q)
+{
+	q->heapsz = fs->cmax;
+	q->heap = emalloc(q->heapsz*sizeof(Qent), 1);
+	q->nheap = 0;
+	q->emptyrz.l = &q->lk;	/* both rendezvous points share the queue lock */
+	q->fullrz.l = &q->lk;
+}
+/* Orders queue entries by qgen, then op, then block address; returns -1, 0 or 1. */
+static int
+qcmp(Qent *a, Qent *b)
+{
+	if(a->qgen != b->qgen)
+		return (a->qgen > b->qgen) ? 1 : -1;
+	if(a->op != b->op)
+		return (a->op > b->op) ? 1 : -1;
+	if(a->bp.addr != b->bp.addr)
+		return (a->bp.addr > b->bp.addr) ? 1 : -1;
+	return 0;
+}
+/* Stamps qe with the current fs->qgen and inserts it into q's min-heap, sleeping while the heap is full. */
+void
+qput(Syncq *q, Qent qe)
+{
+ int i;
+
+ if(qe.op == Qfree || qe.op == Qwrite)
+ assert((qe.bp.addr & (Blksz-1)) == 0); /* block operations must be block-aligned */
+ else if(qe.op == Qfence)
+ assert(fs->syncing > 0);
+ else
+ abort();
+ if(qe.b != nil)
+ assert(qe.b->ref > 0);
+ qlock(&q->lk);
+ qe.qgen = agetv(&fs->qgen);
+ while(q->nheap == q->heapsz)
+ rsleep(&q->fullrz);
+ for(i = q->nheap; i > 0; i = (i-1)/2){ /* sift up: move larger parents down until qe fits */
+ if(qcmp(&qe, &q->heap[(i-1)/2]) == 1)
+ break;
+ q->heap[i] = q->heap[(i-1)/2];
+ }
+ q->heap[i] = qe;
+ q->nheap++;
+ rwakeup(&q->emptyrz);
+ qunlock(&q->lk);
+}
+/* Removes and returns the smallest entry (by qcmp) of q's heap, sleeping while the heap is empty. */
+static Qent
+qpop(Syncq *q)
+{
+ int i, l, r, m;
+ Qent e, t;
+
+ qlock(&q->lk);
+ while(q->nheap == 0)
+ rsleep(&q->emptyrz);
+ e = q->heap[0];
+ if(--q->nheap == 0)
+ goto Out;
+
+ i = 0;
+ q->heap[0] = q->heap[q->nheap]; /* move the last entry to the root, then sift it down */
+ while(1){
+ m = i;
+ l = 2*i+1;
+ r = 2*i+2;
+ if(l < q->nheap && qcmp(&q->heap[m], &q->heap[l]) == 1)
+ m = l;
+ if(r < q->nheap && qcmp(&q->heap[m], &q->heap[r]) == 1)
+ m = r;
+ if(m == i) /* neither child is smaller: heap order restored */
+ break;
+ t = q->heap[m];
+ q->heap[m] = q->heap[i];
+ q->heap[i] = t;
+ i = m;
+ }
+Out:
+ rwakeup(&q->fullrz);
+ qunlock(&q->lk);
+ if(e.b != nil){
+ setflag(e.b, 0, Bqueued);
+ e.b->queued = 0;
+ }
+ return e;
+}
+/* Sync worker loop: pops entries from the queue passed in p and performs frees, fences and block writes until an error is raised. */
+void
+runsync(int, void *p)
+{
+ Arena *a;
+ Syncq *q;
+ Qent qe;
+
+ q = p;
+ if(waserror()){
+ aincl(&fs->rdonly, 1); /* any sync error bumps the rdonly counter and ends this worker */
+ fprint(2, "error syncing: %s\n", errmsg());
+ return;
+ }
+ while(1){
+ qe = qpop(q);
+ switch(qe.op){
+ case Qfree:
+ tracex("qfreeb", qe.bp, qe.qgen, -1);
+ /*
+ * we shouldn't have a block in a free op,
+ * the frees go into the queue just to ensure
+ * write/reuse ordering.
+ */
+ assert(qe.b == nil);
+ a = getarena(qe.bp.addr);
+ qlock(a);
+ blkdealloc_lk(a, qe.bp.addr);
+ qunlock(a);
+ break;
+ case Qfence:
+ tracev("qfence", qe.qgen);
+ qlock(&fs->synclk);
+ if(--fs->syncing == 0) /* last outstanding fence: wake everyone waiting on the sync */
+ rwakeupall(&fs->syncrz);
+ qunlock(&fs->synclk);
+ break;
+ case Qwrite:
+ tracex("qsyncb", qe.bp, qe.qgen, -1);
+ if(checkflag(qe.b, Bfreed, Bstatic) == 0) /* skip the write only when the block is freed and not static */
+ syncblk(qe.b);
+ dropblk(qe.b);
+ break;
+ default:
+ abort();
+ }
+ assert(estacksz() == 1);
+ }
+}
--- /dev/null
+++ b/cache.c
@@ -1,0 +1,190 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Unlinks b from the LRU list headed by fs->chead and
+ * fs->ctail, fixing up head, tail and neighbours, and
+ * clears b's own links. Harmless if b is not linked.
+ */
+static void
+lrudel(Blk *b)
+{
+	Blk *nx, *pv;
+
+	nx = b->cnext;
+	pv = b->cprev;
+	if(fs->chead == b)
+		fs->chead = nx;
+	if(fs->ctail == b)
+		fs->ctail = pv;
+	if(nx != nil)
+		nx->cprev = pv;
+	if(pv != nil)
+		pv->cnext = nx;
+	b->cnext = nil;
+	b->cprev = nil;
+}
+
+/*
+ * Puts b at the head of the LRU list and wakes one
+ * sleeper on fs->lrurz. Does nothing when b has
+ * references.
+ */
+void
+lrutop(Blk *b)
+{
+	qlock(&fs->lrulk);
+	assert(b->magic == Magic);
+	assert(checkflag(b, 0, Bstatic));
+	/*
+	 * If the ref count is nonzero, someone got in
+	 * first and did a cache lookup; we no longer
+	 * want to put this into the LRU, because it's
+	 * now in use.
+	 */
+	if(b->ref == 0){
+		lrudel(b);
+		if(fs->chead != nil)
+			fs->chead->cprev = b;
+		if(fs->ctail == nil)
+			fs->ctail = b;
+		b->cnext = fs->chead;
+		fs->chead = b;
+		rwakeup(&fs->lrurz);
+	}
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Puts b at the tail of the LRU list, the end that
+ * cachepluck takes from, and wakes one sleeper on
+ * fs->lrurz. Does nothing when b has references.
+ */
+void
+lrubot(Blk *b)
+{
+	qlock(&fs->lrulk);
+	assert(b->magic == Magic);
+	assert(checkflag(b, 0, Bstatic));
+	/*
+	 * If the ref count is nonzero, someone got in
+	 * first and did a cache lookup; we no longer
+	 * want to put this into the LRU, because it's
+	 * now in use.
+	 */
+	if(b->ref == 0){
+		lrudel(b);
+		if(fs->ctail != nil)
+			fs->ctail->cnext = b;
+		if(fs->chead == nil)
+			fs->chead = b;
+		b->cprev = fs->ctail;
+		fs->ctail = b;
+		rwakeup(&fs->lrurz);
+	}
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Enters b into the block hash table, keyed by its
+ * address, and sets Bcached. Asserts that b is not
+ * already cached or static and that no block with the
+ * same address is on the chain. The caller's pc is
+ * recorded in b->cached for debugging.
+ */
+void
+cacheins(Blk *b)
+{
+	Bucket *slot;
+	Blk *other;
+	u32int h;
+
+	assert(b->magic == Magic);
+	h = ihash(b->bp.addr);
+	slot = &fs->bcache[h % fs->cmax];
+	qlock(&fs->lrulk);
+	traceb("cache", b->bp);
+	assert(checkflag(b, 0, Bstatic|Bcached));
+	setflag(b, Bcached, 0);
+	assert(b->hnext == nil);
+	other = slot->b;
+	while(other != nil){
+		assert(b != other && b->bp.addr != other->bp.addr);
+		other = other->hnext;
+	}
+	b->cached = getcallerpc(&b);
+	/* push onto the front of the bucket chain */
+	b->hnext = slot->b;
+	slot->b = b;
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Removes the block at addr, if any, from the hash
+ * table and clears its Bcached flag. An addr of -1 is
+ * ignored. The only caller visible here, cachedel,
+ * holds fs->lrulk around the call.
+ */
+static void
+cachedel_lk(vlong addr)
+{
+	Blk *b, **link;
+	Bucket *slot;
+	u32int h;
+	Bptr bp;
+
+	if(addr == -1)
+		return;
+
+	bp = (Bptr){addr, -1, -1};
+	tracex("uncache", bp, -1, getcallerpc(&addr));
+	h = ihash(addr);
+	slot = &fs->bcache[h % fs->cmax];
+	/* link always points at the pointer that refers to b */
+	for(link = &slot->b; (b = *link) != nil; link = &b->hnext){
+		if(b->bp.addr != addr)
+			continue;
+		/* FIXME: Until we clean up snap.c, we can have dirty blocks in cache */
+		assert(checkflag(b, Bcached, Bstatic)); //Bdirty));
+		*link = b->hnext;
+		b->uncached = getcallerpc(&addr);
+		b->hnext = nil;
+		setflag(b, 0, Bcached);
+		break;
+	}
+}
+/*
+ * Locked wrapper around cachedel_lk: takes fs->lrulk,
+ * traces the request, and drops addr from the cache.
+ */
+void
+cachedel(vlong addr)
+{
+	Bptr bp;
+
+	qlock(&fs->lrulk);
+	bp = (Bptr){addr, -1, -1};
+	tracex("uncachelk", bp, -1, getcallerpc(&addr));
+	cachedel_lk(addr);
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Looks addr up in the block hash table. On a hit the
+ * block is held, unlinked from the LRU list and
+ * returned; on a miss the result is nil.
+ */
+Blk*
+cacheget(vlong addr)
+{
+	Bucket *slot;
+	Blk *hit;
+	u32int h;
+
+	h = ihash(addr);
+	slot = &fs->bcache[h % fs->cmax];
+	qlock(&fs->lrulk);
+	hit = slot->b;
+	while(hit != nil && hit->bp.addr != addr)
+		hit = hit->hnext;
+	if(hit != nil){
+		holdblk(hit);
+		lrudel(hit);
+		hit->lasthold = getcallerpc(&addr);
+	}
+	qunlock(&fs->lrulk);
+
+	return hit;
+}
+
+/*
+ * Pulls the block from the bottom of the LRU for reuse.
+ * Sleeps on fs->lrurz until the list has a tail. The
+ * victim is removed from the hash table if it is still
+ * cached, unlinked from the LRU, has its flags and
+ * debug markers reset, and is returned held.
+ */
+Blk*
+cachepluck(void)
+{
+ Blk *b;
+
+ qlock(&fs->lrulk);
+ while(fs->ctail == nil)
+ rsleep(&fs->lrurz);
+
+ b = fs->ctail;
+ assert(b->magic == Magic);
+ assert(b->ref == 0);
+ if(checkflag(b, Bcached, 0))
+ cachedel_lk(b->bp.addr);
+ /* still flagged after the delete: report before the assert below fires */
+ if(checkflag(b, Bcached, 0))
+ fprint(2, "%B cached %#p freed %#p\n", b->bp, b->cached, b->freed);
+ assert(checkflag(b, 0, Bcached));
+ lrudel(b);
+ b->flag = 0;
+ b->lasthold = 0;
+ b->lastdrop = 0;
+ b->freed = 0;
+ b->hnext = nil;
+ qunlock(&fs->lrulk);
+
+ /* the hold is taken after fs->lrulk has been released */
+ return holdblk(b);
+}
--- /dev/null
+++ b/check.c
@@ -1,0 +1,306 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+#include <atomic.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Reports whether the block at address bp lies inside
+ * a range of its arena's free tree. The lookup is made
+ * with direction -1 (presumably "nearest range at or
+ * below bp" -- confirm against avllookup), and bp is
+ * free when it falls before that range's end.
+ */
+static int
+isfree(vlong bp)
+{
+	Arange key, *hit;
+
+	key.off = bp;
+	key.len = Blksz;
+	hit = (Arange*)avllookup(getarena(bp)->free, &key, -1);
+	if(hit != nil && bp < hit->off + hit->len)
+		return 1;
+	return 0;
+}
+
+/*
+ * Recursively validates the subtree rooted at b,
+ * printing each problem to fd.
+ *
+ *	h	levels expected below b; 0 means b must
+ *		be a leaf, negative means the tree is
+ *		deeper than its recorded height.
+ *	lo, hi	bounds for the keys in b; either may be
+ *		nil for "unbounded".
+ *
+ * Checks key ordering and range, child block
+ * readability and fill counts, and for pivots the
+ * ordering, range and opcodes of buffered messages.
+ * Returns the number of problems found at this level;
+ * a failing child counts as one.
+ */
+static int
+checktree(int fd, Blk *b, int h, Kvp *lo, Kvp *hi)
+{
+	Kvp x, y;
+	Msg mx, my;
+	int i, r, fill;
+	Blk *c;
+	int fail;
+	Bptr bp;
+
+	fail = 0;
+	if(h < 0){
+		fprint(fd, "node too deep (loop?)\n");
+		fail++;
+		return fail;
+	}
+	if(b->type == Tleaf){
+		if(h != 0){
+			fprint(fd, "unbalanced leaf\n");
+			fail++;
+		}
+		if(h != 0 && b->nval < 2){
+			fprint(fd, "warning: underfilled leaf %B\n", b->bp);
+			fail++;
+		}
+	}
+	if(b->type == Tpivot && b->nval < 2)
+		fprint(fd, "warning: underfilled pivot %B\n", b->bp);
+	getval(b, 0, &x);
+	if(lo && keycmp(lo, &x) > 0){
+		fprint(fd, "out of range keys %P != %P\n", lo, &x);
+		showblk(fd, b, "out of range", 1);
+		fail++;
+	}
+	for(i = 1; i < b->nval; i++){
+		getval(b, i, &y);
+		if(hi && keycmp(&y, hi) >= 0){
+			fprint(fd, "out of range keys %P >= %P\n", &y, hi);
+			fail++;
+		}
+		if(b->type == Tpivot){
+			/* x is the previous entry; its child covers [x, y) */
+			bp = getptr(&x, &fill);
+			if(isfree(bp.addr)){
+				fprint(fd, "freed block in use: %llx\n", bp.addr);
+				fail++;
+			}
+			/*
+			 * An unreadable child must not skip the
+			 * ordering check or the x = y step below,
+			 * or every later iteration would look at
+			 * the same stale key and pointer.
+			 */
+			if((c = getblk(bp, 0)) == nil){
+				fprint(fd, "corrupt block: %B\n", bp);
+				fail++;
+			}else{
+				if(blkfill(c) != fill){
+					fprint(fd, "mismatched block fill\n");
+					fail++;
+				}
+				if(checktree(fd, c, h - 1, &x, &y))
+					fail++;
+				dropblk(c);
+			}
+		}
+		r = keycmp(&x, &y);
+		switch(r){
+		case -1:
+			break;
+		case 0:
+			fprint(fd, "duplicate keys %P, %P\n", &x, &y);
+			fail++;
+			break;
+		case 1:
+			fprint(fd, "misordered keys %P, %P\n", &x, &y);
+			fail++;
+			break;
+		}
+		x = y;
+	}
+	if(b->type == Tpivot){
+		/* the last child has no upper bound from this node */
+		getval(b, b->nval-1, &y);
+		bp = getptr(&x, &fill);
+		if((c = getblk(bp, 0)) == nil){
+			fprint(fd, "corrupt block: %B\n", bp);
+			fail++;
+		}else{
+			if(checktree(fd, c, h - 1, &y, nil))
+				fail++;
+			dropblk(c);
+		}
+		if(b->nbuf > 0){
+			getmsg(b, 0, &mx);
+			if(hi && keycmp(&mx, hi) >= 0){
+				fprint(fd, "out of range messages %P != %M\n", hi, &mx);
+				fail++;
+			}
+		}
+		for(i = 1; i < b->nbuf; i++){
+			getmsg(b, i, &my);
+			switch(my.op){
+			case Owstat:		/* kvp dirent */
+				if((my.v[0] & ~(Owsize|Owmode|Owmtime|Owatime|Owuid|Owgid|Owmuid)) != 0){
+					fprint(fd, "invalid stat op %x\n", my.v[0]);
+					fail++;
+				}
+				break;
+			default:
+				if(my.op <= 0 || my.op >= Nmsgtype){
+					fprint(fd, "invalid message op %d\n", my.op);
+					fail++;
+				}
+				break;
+			}
+			/* bound the message itself, as is done for message 0 */
+			if(hi && keycmp(&my, hi) >= 0){
+				fprint(fd, "out of range messages %P != %M\n", hi, &my);
+				fail++;
+			}
+			if(keycmp(&mx, &my) == 1){
+				fprint(fd, "misordered messages %M, %M\n", &mx, &my);
+				fail++;
+				break;
+			}
+			mx = my;
+		}
+
+	}
+	return fail;
+}
+
+/*
+ * Follows a chain of log blocks from hd through each
+ * block's logp until an address of -1, loading every
+ * block on the way. Returns 1 when the whole chain
+ * loads; returns 0, after reporting the offending
+ * pointer on fd, when loading raises an error.
+ */
+static int
+checklog(int fd, Bptr hd)
+{
+	Bptr cur, next;
+	Blk *b;
+
+	cur = hd;
+	while(cur.addr != -1){
+		if(waserror()){
+			fprint(fd, "error loading %B\n", cur);
+			return 0;
+		}
+		traceb("chklg", cur);
+		b = getblk(cur, 0);
+		next = b->logp;
+		dropblk(b);
+		poperror();
+		cur = next;
+	}
+	return 1;
+}
+
+/*
+ * Validates every arena's free tree and allocation
+ * log, with the arena locked. Consecutive ranges must
+ * be strictly increasing and must not touch or
+ * overlap, and the log chain from a->loghd must load.
+ * Problems are reported on fd; the return value is
+ * the number found, so 0 means the free lists are
+ * sound.
+ */
+static int
+checkfree(int fd)
+{
+	Arena *a;
+	Arange *r, *n;
+	int i, fail;
+
+	fail = 0;
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		qlock(a);
+		r = (Arange*)avlmin(a->free);
+		for(n = (Arange*)avlnext(r); n != nil; n = (Arange*)avlnext(n)){
+			if(r->off >= n->off){
+				fprint(fd, "misordered length %llx >= %llx\n", r->off, n->off);
+				fail++;
+			}
+			if(r->off+r->len >= n->off){
+				fprint(fd, "overlapping range %llx+%llx >= %llx\n", r->off, r->len, n->off);
+				fail++;
+			}
+			r = n;
+		}
+		/* a log that cannot be walked is a failure, not just a message */
+		if(!checklog(fd, a->loghd)){
+			fprint(fd, "arena %d: broken freelist\n", i);
+			fail++;
+		}
+		qunlock(a);
+	}
+	return fail;
+}
+
+/*
+ * Walks the snapshot deadlist log and the log of every
+ * Kdlist entry in the snap tree. Unreadable chains are
+ * reported on fd. Returns the number of broken lists,
+ * so 0 means all of them could be walked.
+ */
+static int
+checkdlist(int fd)
+{
+	char pfx[1];
+	Dlist dl;
+	Scan s;
+	int fail;
+
+	fail = 0;
+	if(!checklog(fd, fs->snapdl.hd)){
+		fprint(fd, "bad snap dlist: %s\n", errmsg());
+		fail++;
+	}
+	pfx[0] = Kdlist;
+	btnewscan(&s, pfx, 1);
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		kv2dlist(&s.kv, &dl);
+		if(!checklog(fd, dl.hd)){
+			fprint(fd, "bad dlist %P: %s\n", &s.kv, errmsg());
+			fail++;
+		}
+	}
+	btexit(&s);
+	return fail;
+}
+
+/*
+ * Scans every data pointer in tree t, raising an error
+ * if one refers to a block that is on a free list, and
+ * loading each block raw so that unreadable blocks
+ * surface as errors. Returns 0 when the scan finishes.
+ *
+ * The scan prefix is Kdat: that is the key class whose
+ * values are block pointers. The previous prefix,
+ * Klabel, is a snapshot-tree key whose value is a
+ * snapshot id, so the loop had nothing to unpack in a
+ * file tree.
+ * NOTE(review): confirm no caller relied on this
+ * function being a no-op; it now reads every data
+ * block of the snapshot.
+ */
+static int
+checkdata(int, Tree *t)
+{
+	char pfx[1];
+	Bptr bp;
+	Scan s;
+	Blk *b;
+
+	pfx[0] = Kdat;
+	btnewscan(&s, pfx, 1);
+	btenter(t, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		bp = unpackbp(s.kv.v, s.kv.nv);
+		if(isfree(bp.addr)){
+			fprint(2, "free block in use: %B\n", bp);
+			error("free block in use");
+		}
+		b = getblk(bp, GBraw);
+		dropblk(b);
+	}
+	btexit(&s);
+	return 0;
+}
+
+/*
+ * Runs the consistency check, reporting on fd: free
+ * lists, deadlists, the snap tree, and then the tree
+ * and data of every labelled snapshot. The file system
+ * is marked read-only for the duration. Returns 1 if
+ * everything checked out, 0 otherwise.
+ *
+ * NOTE(review): when the outer error handler fires,
+ * fs->rdonly is left incremented; confirm whether
+ * staying read-only after a failed check is intended.
+ */
+int
+checkfs(int fd)
+{
+	int ok, height;
+	char pfx[1], name[Keymax+1];
+	Tree *t;
+	Scan s;
+	Blk *b;
+
+	ok = 1;
+	aincl(&fs->rdonly, 1);
+	epochwait();
+	if(waserror()){
+		fprint(fd, "error checking %s\n", errmsg());
+		return 0;
+	}
+	fprint(fd, "checking freelist\n");
+	if(checkfree(fd))
+		ok = 0;
+	fprint(fd, "checking deadlist\n");
+	if(checkdlist(fd))
+		ok = 0;
+	fprint(fd, "checking snap tree: %B\n", fs->snap.bp);
+	if((b = getroot(&fs->snap, &height)) != nil){
+		if(checktree(fd, b, height-1, nil, 0))
+			ok = 0;
+		dropblk(b);
+	}
+	pfx[0] = Klabel;
+	btnewscan(&s, pfx, 1);
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		if(waserror()){
+			/* a snapshot that cannot be checked is not ok */
+			fprint(fd, "moving on: %s\n", errmsg());
+			ok = 0;
+			continue;
+		}
+		memcpy(name, s.kv.k+1, s.kv.nk-1);
+		name[s.kv.nk-1] = 0;
+		if((t = opensnap(name, nil)) == nil){
+			fprint(fd, "invalid snap label %s\n", name);
+			ok = 0;
+			/* balance the waserror above before leaving the loop */
+			poperror();
+			break;
+		}
+		fprint(fd, "checking snap %s: %B\n", name, t->bp);
+		b = getroot(t, &height);
+		if(checktree(fd, b, height-1, nil, 0))
+			ok = 0;
+		if(checkdata(fd, t))
+			ok = 0;
+		dropblk(b);
+		/* release the reference taken by opensnap */
+		closesnap(t);
+		poperror();
+	}
+	btexit(&s);
+	aincl(&fs->rdonly, -1);
+	poperror();
+	return ok;
+}
--- /dev/null
+++ b/cons.c
@@ -1,0 +1,465 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+#include <bio.h>
+
+#include "dat.h"
+#include "fns.h"
+
+typedef struct Cmd Cmd;
+
+/*
+ * One console command, as matched by runcons.
+ */
+struct Cmd {
+ char *name; /* first word of the command */
+ char *sub; /* required second word, or nil for none */
+ int minarg; /* fewest arguments accepted after name and sub */
+ int maxarg; /* most arguments accepted after name and sub */
+ int epoch; /* nonzero: bracket fn with epochstart/epochend */
+ void (*fn)(int, char**, int); /* handler: console fd, arguments, argument count */
+};
+
+/*
+ * debug [n]: sets the debug level to n, or toggles it
+ * when no argument is given, and reports the result.
+ */
+static void
+setdbg(int fd, char **ap, int na)
+{
+	if(na == 1)
+		debug = atoi(ap[0]);
+	else
+		debug = !debug;
+	fprint(fd, "debug → %d\n", debug);
+}
+
+/*
+ * Queues an AOsync request on the admin channel. halt
+ * is passed along in the message, and fd is recorded
+ * as the descriptor the request came from. An
+ * allocation failure is reported on fd and nothing is
+ * sent.
+ */
+static void
+sendsync(int fd, int halt)
+{
+	Amsg *m;
+
+	if((m = mallocz(sizeof(Amsg), 1)) == nil){
+		fprint(fd, "alloc sync msg: %r\n");
+		return;
+	}
+	m->fd = fd;
+	m->halt = halt;
+	m->op = AOsync;
+	chsend(fs->admchan, m);
+}
+
+/*
+ * sync: queues a sync request without halting.
+ * The "synced" line is printed as soon as sendsync
+ * returns, i.e. once the request has been handed to
+ * the admin channel.
+ */
+static void
+syncfs(int fd, char **, int)
+{
+ sendsync(fd, 0);
+ fprint(fd, "synced\n");
+}
+
+/*
+ * halt: queues a sync request with the halt flag set
+ * and tells the console that the file system is
+ * shutting down.
+ */
+static void
+haltfs(int fd, char **, int)
+{
+ sendsync(fd, 1);
+ fprint(fd, "gefs: ending...\n");
+}
+
+/*
+ * Prints one line per snapshot label in the snap tree:
+ * the label name followed, when any flag is set, by a
+ * bracketed list of the flags. The flag word is read
+ * from byte 9 of the label's value.
+ */
+static void
+listsnap(int fd)
+{
+	char pfx[Snapsz];
+	uint flg;
+	Scan s;
+
+	pfx[0] = Klabel;
+	btnewscan(&s, pfx, 1);
+	btenter(&fs->snap, &s);
+	while(btnext(&s, &s.kv)){
+		flg = UNPACK32(s.kv.v+1+8);
+		/* the key is the Klabel tag byte followed by the name */
+		fprint(fd, "snap %.*s", s.kv.nk-1, s.kv.k+1);
+		if(flg != 0)
+			fprint(fd, " [");
+		if(flg & Lmut)
+			fprint(fd, " mutable");
+		if(flg & Lauto)
+			fprint(fd, " auto");
+		if(flg & Ltsnap)
+			fprint(fd, " tsnap");
+		if(flg != 0)
+			fprint(fd, " ]");
+		fprint(fd, "\n");
+	}
+	btexit(&s);
+}
+
+/*
+ * snap -[Smdl] [old [new]]: builds an AOsnap request
+ * from the arguments and queues it, after a sync, on
+ * the admin channel.
+ *
+ *	-S	clear Ltsnap, which is set by default
+ *	-m	set Lmut
+ *	-d	delete; takes one name instead of two
+ *	-l	list snapshots and return
+ *
+ * The request is freed here on every path that does
+ * not send it.
+ */
+static void
+snapfs(int fd, char **ap, int na)
+{
+	Amsg *a;
+	int i;
+
+	if((a = mallocz(sizeof(Amsg), 1)) == nil){
+		fprint(fd, "alloc snap msg: %r\n");
+		return;
+	}
+	a->op = AOsnap;
+	a->fd = fd;
+	a->flag = Ltsnap;
+	/*
+	 * Stop when the arguments run out: a command made
+	 * only of flags, such as "snap -m", would otherwise
+	 * read a token slot that was never filled in.
+	 */
+	while(na > 0 && ap[0][0] == '-'){
+		for(i = 1; ap[0][i]; i++){
+			switch(ap[0][i]){
+			case 'S':	a->flag &= ~Ltsnap;	break;
+			case 'm':	a->flag |= Lmut;	break;
+			case 'd':	a->delete++;		break;
+			case 'l':
+				listsnap(fd);
+				free(a);
+				return;
+			default:
+				fprint(fd, "usage: snap -[Smdl] [old [new]]\n");
+				free(a);
+				return;
+			}
+		}
+		na--;
+		ap++;
+	}
+	if(a->delete && na != 1 || !a->delete && na != 2){
+		fprint(fd, "usage: snap -[md] old [new]\n");
+		free(a);
+		return;
+	}
+	if(na >= 1)
+		strecpy(a->old, a->old+sizeof(a->old), ap[0]);
+	if(na >= 2)
+		strecpy(a->new, a->new+sizeof(a->new), ap[1]);
+	sendsync(fd, 0);
+	chsend(fs->admchan, a);
+}
+
+/*
+ * check: runs checkfs and prints its verdict.
+ */
+static void
+fsckfs(int fd, char**, int)
+{
+	char *verdict;
+
+	verdict = checkfs(fd) ? "ok" : "broken";
+	fprint(fd, "%s\n", verdict);
+}
+
+/*
+ * users: reloads the user table from the root of the
+ * "adm" mount. Errors raised while loading are caught
+ * and reported on fd; the mount is released on every
+ * path.
+ */
+static void
+refreshusers(int fd, char **, int)
+{
+	Mount *mnt;
+
+	if((mnt = getmount("adm")) == nil){
+		fprint(fd, "load users: missing 'adm'\n");
+		return;
+	}
+	if(waserror()){
+		fprint(fd, "load users: %s\n", errmsg());
+		clunkmount(mnt);
+		return;
+	}
+	loadusers(fd, mnt->root);
+	fprint(fd, "refreshed users\n");
+	/*
+	 * Pop our label on success too; otherwise the
+	 * caller's poperror removes ours instead of its
+	 * own and the error stack grows with every call.
+	 */
+	poperror();
+	clunkmount(mnt);
+}
+
+/*
+ * show bstate: prints one line per block in blkbuf
+ * with its type, a letter per set flag, pointer, ref
+ * count and the debug pc markers.
+ */
+static void
+showbstate(int fd, char**, int)
+{
+	/* flag bits in the order their letters are printed */
+	static struct {
+		long	bit;
+		char	letter;
+	} flagtab[] = {
+		{Bdirty,	'd'},
+		{Bfinal,	'f'},
+		{Bfreed,	'F'},
+		{Bcached,	'c'},
+		{Bqueued,	'q'},
+		{Blimbo,	'L'},
+	};
+	char fbuf[8], *p;
+	Blk *b;
+	int i;
+
+	for(b = blkbuf; b != blkbuf+fs->cmax; b++){
+		p = fbuf;
+		for(i = 0; i < nelem(flagtab); i++)
+			if(b->flag & flagtab[i].bit)
+				*p++ = flagtab[i].letter;
+		*p = 0;
+		fprint(fd, "blk %#p type %d flag %s bp %B ref %ld alloc %#p queued %#p, hold %#p drop %#p cached %#p\n",
+			b, b->type, fbuf, b->bp, b->ref, b->alloced, b->queued, b->lasthold, b->lastdrop, b->cached);
+	}
+}
+
+/*
+ * show users: prints the user table under the read
+ * lock, one "id:name:leader:member,member,..." line
+ * per user. Ids that do not resolve print as "???".
+ */
+static void
+showusers(int fd, char**, int)
+{
+	User *usr, *ref;
+	int ui, mi;
+
+	rlock(&fs->userlk);
+	for(ui = 0; ui < fs->nusers; ui++){
+		usr = &fs->users[ui];
+		fprint(fd, "%d:%s:", usr->id, usr->name);
+		ref = uid2user(usr->lead);
+		fprint(fd, "%s:", ref != nil ? ref->name : "???");
+		for(mi = 0; mi < usr->nmemb; mi++){
+			ref = uid2user(usr->memb[mi]);
+			/* comma before every member but the first */
+			fprint(fd, "%s%s", mi > 0 ? "," : "", ref != nil ? ref->name : "???");
+		}
+		fprint(fd, "\n");
+	}
+	runlock(&fs->userlk);
+}
+
+/*
+ * df: prints used/size for each arena, then the
+ * totals as a fill percentage and as byte counts with
+ * a scaled, human-readable form.
+ */
+static void
+showdf(int fd, char**, int)
+{
+	char *units[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", nil};
+	vlong size, used, avail, asize, aused;
+	double hsize, hused, hfree;
+	double pct;
+	Arena *a;
+	int i, us, uu, uf;
+
+	size = 0;
+	used = 0;
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		/* copy both counters under the lock so the printed pair is consistent */
+		qlock(a);
+		asize = a->size;
+		aused = a->used;
+		qunlock(a);
+		size += asize;
+		used += aused;
+		fprint(fd, "arena %d: %llx/%llx (%.2f%%)\n", i, aused, asize, 100*(double)aused/(double)asize);
+	}
+	avail = size - used;
+	hsize = size;
+	hused = used;
+	hfree = avail;
+	/* step to the next unit while the value is 500 or more */
+	for(us = 0; us < nelem(units)-1 && hsize >= 500 ; us++)
+		hsize /= 1024;
+	for(uu = 0; uu < nelem(units)-1 && hused >= 500 ; uu++)
+		hused /= 1024;
+	for(uf = 0; uf < nelem(units)-1 && hfree >= 500 ; uf++)
+		hfree /= 1024;
+	pct = 100.0*(double)used/(double)size;
+	fprint(fd, "fill:\t%.2f%%\n", pct);
+	fprint(fd, "used:\t%lld (%.2f %s)\n", used, hused, units[uu]);
+	fprint(fd, "size:\t%lld (%.2f %s)\n", size, hsize, units[us]);
+	fprint(fd, "free:\t%lld (%.2f %s)\n", avail, hfree, units[uf]);
+}
+
+/*
+ * show fid: for every connection, prints each fid in
+ * its hash table along with details of the directory
+ * entry it refers to. Each bucket is walked under its
+ * own lock, and each dent is read-locked while its
+ * fields are printed.
+ * NOTE(review): fs->conns is walked without taking
+ * fs->connlk -- confirm that is safe here.
+ */
+void
+showfid(int fd, char**, int)
+{
+ int i;
+ Fid *f;
+ Conn *c;
+
+ for(c = fs->conns; c != nil; c = c->next){
+ fprint(fd, "-- conn %p: fids --\n", c);
+ for(i = 0; i < Nfidtab; i++){
+ lock(&c->fidtablk[i]);
+ for(f = c->fidtab[i]; f != nil; f = f->next){
+ rlock(f->dent);
+ fprint(fd, "\tfid[%d] from %#zx: %d [refs=%ld, k=%K, qid=%Q m=%d, dmode:%d duid: %d, dgid: %d]\n",
+ i, getmalloctag(f), f->fid, f->dent->ref, &f->dent->Key, f->dent->qid,
+ f->mode, f->dmode, f->duid, f->dgid);
+ runlock(f->dent);
+ }
+ unlock(&c->fidtablk[i]);
+ }
+ }
+}
+
+/*
+ * show tree [name]: dumps the root block of the named
+ * snapshot, "main" by default. The name "snap" selects
+ * the snapshot tree itself, which is not opened and so
+ * not closed afterwards.
+ */
+void
+showtree(int fd, char **ap, int na)
+{
+	char *name;
+	Tree *t;
+	Blk *root;
+	int ht;
+
+	name = (na == 1) ? ap[0] : "main";
+	if(strcmp(name, "snap") == 0)
+		t = &fs->snap;
+	else{
+		t = opensnap(name, nil);
+		if(t == nil){
+			fprint(fd, "open %s: %r\n", name);
+			return;
+		}
+	}
+	root = getroot(t, &ht);
+	fprint(fd, "=== [%s] %B @%d\n", name, t->bp, t->ht);
+	showblk(fd, root, "contents", 1);
+	dropblk(root);
+	if(t != &fs->snap)
+		closesnap(t);
+}
+
+/*
+ * permit on|off: sets permissive mode and reports the
+ * transition from the value it actually had before.
+ * An unrecognised argument is reported on fd and
+ * leaves the setting alone.
+ */
+static void
+permflip(int fd, char **ap, int)
+{
+	int old;
+
+	old = permissive;
+	if(strcmp(ap[0], "on") == 0)
+		permissive = 1;
+	else if(strcmp(ap[0], "off") == 0)
+		permissive = 0;
+	else{
+		fprint(fd, "unknown permissive %s\n", ap[0]);
+		return;
+	}
+	fprint(fd, "permissive: %d → %d\n", old, permissive);
+}
+
+/*
+ * save trace [name]: writes the trace ring, starting
+ * at fs->traceidx, to the named file, or to a
+ * duplicate of the console fd when no name is given.
+ * Slots with an empty message are skipped, and the bp,
+ * v0 and v1 fields are printed only when they are not
+ * -1.
+ */
+static void
+savetrace(int fd, char **ap, int na)
+{
+	Biobuf *bfd;
+	Trace *t;
+	int i, dfd;
+
+	if(na != 0)
+		bfd = Bopen(ap[0], OWRITE);
+	else if((dfd = dup(fd, -1)) == -1)
+		bfd = nil;
+	else if((bfd = Bfdopen(dfd, OWRITE)) == nil)
+		close(dfd);	/* nothing owns the duplicate; don't leak it */
+	if(bfd == nil){
+		fprint(fd, "error opening output: %r\n");
+		return;
+	}
+	for(i = 0; i < fs->ntrace; i++){
+		t = &fs->trace[(fs->traceidx + i) % fs->ntrace];
+		if(t->msg[0] == 0)
+			continue;
+		Bprint(bfd, "[%d@%d] %s", t->tid, t->qgen, t->msg);
+		if(t->bp.addr != -1)
+			Bprint(bfd, " %B", t->bp);
+		if(t->v0 != -1)
+			Bprint(bfd, " %llx", t->v0);
+		if(t->v1 != -1)
+			Bprint(bfd, " %llx", t->v1);
+		Bprint(bfd, "\n");
+	}
+	Bterm(bfd);
+	fprint(fd, "saved\n");
+}
+
+/*
+ * show free: for each arena, under its lock, prints
+ * the header block address and size followed by every
+ * free range as start..end (length).
+ */
+static void
+showfree(int fd, char **, int)
+{
+	Arena *arena;
+	Arange *rng;
+	int idx;
+
+	for(idx = 0; idx < fs->narena; idx++){
+		arena = &fs->arenas[idx];
+		qlock(arena);
+		fprint(fd, "arena %d %llx+%llx{\n", idx, arena->h0->bp.addr, arena->size);
+		rng = (Arange*)avlmin(arena->free);
+		while(rng != nil){
+			fprint(fd, "\t%llx..%llx (%llx)\n", rng->off, rng->off+rng->len, rng->len);
+			rng = (Arange*)avlnext(rng);
+		}
+		fprint(fd, "}\n");
+		qunlock(arena);
+	}
+}
+
+/*
+ * reserve [on|off]: changes usereserve and reports the
+ * transition; with no argument, which the command
+ * table allows, only the current value is printed.
+ * An unrecognised argument is reported on fd and
+ * leaves the setting alone.
+ *
+ * NOTE(review): "on" clears usereserve and "off" sets
+ * it, which fits the function's name but reads as
+ * inverted next to the help text ("enable block
+ * reserves"). The mapping is kept as found; confirm
+ * which sense is intended.
+ */
+static void
+unreserve(int fd, char **ap, int na)
+{
+	int old;
+
+	old = usereserve;
+	if(na == 0){
+		fprint(fd, "reserve: %d\n", usereserve);
+		return;
+	}
+	if(strcmp(ap[0], "on") == 0)
+		usereserve = 0;
+	else if(strcmp(ap[0], "off") == 0)
+		usereserve = 1;
+	else{
+		fprint(fd, "unknown reserve %s\n", ap[0]);
+		return;
+	}
+	fprint(fd, "reserve: %d → %d\n", old, usereserve);
+}
+
+/*
+ * help: prints the command summary. Keep this in step
+ * with cmdtab.
+ */
+static void
+help(int fd, char**, int)
+{
+	char *msg =
+		"help -- show this help\n"
+		"check -- check for consistency\n"
+		"df -- show disk usage\n"
+		"halt -- stop all writers, sync, and go read-only\n"
+		"permit [on|off] -- switch to/from permissive mode\n"
+		"reserve [on|off] -- enable block reserves\n"
+		"snap -[Smdl] [old [new]] -- manage snapshots\n"
+		"sync -- flush all pending writes to disk\n"
+		"users -- reload user table from adm snapshot\n"
+		"debug [level] -- set or toggle debug output\n"
+		"save trace [name] -- save a trace of recent activity\n"
+		"show -- debug dumps\n"
+		"	tree [name]\n"
+		"	fid\n"
+		"	users\n"
+		"	bstate\n"
+		"	free\n";
+	fprint(fd, "%s", msg);
+}
+
+/*
+ * Console command table, searched in order by runcons.
+ * An entry matches when name, sub (if set) and the
+ * number of remaining arguments all fit; the entry
+ * with a nil name ends the table.
+ */
+Cmd cmdtab[] = {
+ /* admin */
+ {.name="check", .sub=nil, .minarg=0, .maxarg=0, .fn=fsckfs, .epoch=1},
+ {.name="df", .sub=nil, .minarg=0, .maxarg=0, .fn=showdf},
+ {.name="halt", .sub=nil, .minarg=0, .maxarg=0, .fn=haltfs},
+ {.name="help", .sub=nil, .minarg=0, .maxarg=0, .fn=help},
+ {.name="permit", .sub=nil, .minarg=1, .maxarg=1, .fn=permflip},
+ {.name="snap", .sub=nil, .minarg=1, .maxarg=3, .fn=snapfs},
+ {.name="sync", .sub=nil, .minarg=0, .maxarg=0, .fn=syncfs},
+ /* NOTE(review): minarg is 0 here, so the handler can be called with no argument */
+ {.name="reserve", .sub=nil, .minarg=0, .maxarg=1, .fn=unreserve},
+ {.name="users", .sub=nil, .minarg=0, .maxarg=1, .fn=refreshusers},
+
+ /* debugging */
+ {.name="show", .sub="fid", .minarg=0, .maxarg=0, .fn=showfid},
+ {.name="show", .sub="tree", .minarg=0, .maxarg=1, .fn=showtree, .epoch=1},
+ {.name="show", .sub="users", .minarg=0, .maxarg=0, .fn=showusers},
+ {.name="show", .sub="bstate", .minarg=0, .maxarg=0, .fn=showbstate, .epoch=1},
+ {.name="show", .sub="free", .minarg=0, .maxarg=0, .fn=showfree},
+ {.name="debug", .sub=nil, .minarg=0, .maxarg=1, .fn=setdbg},
+ {.name="save", .sub="trace", .minarg=0, .maxarg=1, .fn=savetrace},
+ {.name=nil, .sub=nil},
+};
+
+/*
+ * Console loop: prompts on the fd passed in pfd, reads
+ * a line, splits it into at most four words and runs
+ * the first cmdtab entry whose name, subcommand and
+ * argument count match. Commands marked epoch run
+ * between epochstart and epochend for tid. Errors
+ * raised by a command are caught and printed. The loop
+ * ends on a read error or end of file.
+ */
+void
+runcons(int tid, void *pfd)
+{
+	char buf[256], *f[4], **ap;
+	int i, n, nf, na, fd;
+	Cmd *c;
+
+	fd = (uintptr)pfd;
+	while(1){
+		fprint(fd, "gefs# ");
+		/*
+		 * A zero-length read means end of file; going
+		 * round again would print the prompt forever.
+		 */
+		if((n = read(fd, buf, sizeof(buf)-1)) <= 0)
+			break;
+		buf[n] = 0;
+		nf = tokenize(buf, f, nelem(f));
+		if(nf == 0 || strlen(f[0]) == 0)
+			continue;
+		for(c = cmdtab; c->name != nil; c++){
+			ap = f;
+			na = nf;
+			if(strcmp(c->name, *ap) != 0)
+				continue;
+			ap++;
+			na--;
+			if(c->sub != nil){
+				if(na == 0 || strcmp(c->sub, *ap) != 0)
+					continue;
+				ap++;
+				na--;
+			}
+			if(na < c->minarg || na > c->maxarg)
+				continue;
+			if(c->epoch)
+				epochstart(tid);
+			if(!waserror()){
+				c->fn(fd, ap, na);
+				poperror();
+			}else
+				fprint(fd, "%s: %s\n", f[0], errmsg());
+			if(c->epoch)
+				epochend(tid);
+			break;
+		}
+		/* the table ran out: no name/argument-count match */
+		if(c->name == nil){
+			fprint(fd, "unknown command '%s", f[0]);
+			for(i = 1; i < nf; i++)
+				fprint(fd, " %s", f[i]);
+			fprint(fd, "'\n");
+		}
+	}
+}
--- /dev/null
+++ b/dat.h
@@ -1,0 +1,776 @@
+typedef struct Blk Blk;
+typedef struct Amsg Amsg;
+typedef struct Gefs Gefs;
+typedef struct Errctx Errctx;
+typedef struct Fmsg Fmsg;
+typedef struct Fid Fid;
+typedef struct Msg Msg;
+typedef struct Key Key;
+typedef struct Val Val;
+typedef struct Kvp Kvp;
+typedef struct Xdir Xdir;
+typedef struct Bptr Bptr;
+typedef struct Limbo Limbo;
+typedef struct Bfree Bfree;
+typedef struct Scan Scan;
+typedef struct Dent Dent;
+typedef struct Scanp Scanp;
+typedef struct Arena Arena;
+typedef struct Arange Arange;
+typedef struct Bucket Bucket;
+typedef struct Chan Chan;
+typedef struct Syncq Syncq;
+typedef struct Qent Qent;
+typedef struct Trace Trace;
+typedef struct Tree Tree;
+typedef struct Dlist Dlist;
+typedef struct Mount Mount;
+typedef struct User User;
+typedef struct Conn Conn;
+
+/*
+ * Size constants: units, block geometry, table sizes,
+ * and the byte sizes of serialized structures.
+ */
+enum {
+ KiB = 1024ULL,
+ MiB = 1024ULL*KiB,
+ GiB = 1024ULL*MiB,
+ TiB = 1024ULL*GiB,
+
+ Lgblk = 14,
+ Blksz = (1ULL<<Lgblk),
+
+ Nrefbuf = 1024, /* number of ref incs before syncing */
+ Nfidtab = 1024, /* number of fid hash entries */
+ Nflushtab = 1024, /* flush table size */
+ Ndtab = 1024, /* number of dir tab entries */
+ Max9p = 32*KiB, /* biggest message size we're willing to negotiate */
+ Nsec = 1000LL*1000*1000, /* nanoseconds to the second */
+ Maxent = 256, /* maximum size of ent key, with terminator */
+ Maxname = Maxent-1-9-1, /* maximum size of a name element */
+ Maxuname= 64, /* maximum length of a username */
+ Maxtag = 1<<16, /* maximum tag in 9p */
+
+ /*
+ * Kpmax must be no more than 1/4 of pivspc, or
+ * there is no way to get a valid split of a
+ * maximally filled tree.
+ */
+ Keymax = Maxent, /* key data limit */
+ Inlmax = 512, /* inline data limit */
+ Ptrsz = 24, /* off, hash, gen */
+ Pptrsz = 26, /* off, hash, gen, fill */
+ Fillsz = 2, /* block fill count */
+ Offksz = 17, /* type, qid, off */
+ Snapsz = 9, /* tag, snapid */
+ Dpfxsz = 9, /* directory prefix */
+ Upksz = 9, /* presumably the size of a Kup key (tag, qid) -- TODO confirm */
+ Dlksz = 1+8+8, /* tag, death, birth */
+ Dlvsz = Ptrsz+Ptrsz, /* hd,tl of deadlist */
+ Dlkvpsz = Dlksz+Dlvsz, /* full size of dlist kvp */
+ Treesz = 4+4+4+4 /* four 4-byte fields, four 8-byte fields, root pointer; */
+ +8+8+8+8+Ptrsz, /* presumably Tree's on-disk members -- TODO confirm */
+ Kvmax = Keymax + Inlmax, /* Key and value */
+ Kpmax = Keymax + Ptrsz, /* Key and pointer */
+ Wstatmax = 4+8+8+8, /* mode, size, atime, mtime */
+ Arenasz = 8+8+8+8, /* loghd, loghash, size, used */
+
+ Pivhdsz = 10,
+ Leafhdsz = 6,
+ Loghdsz = 2+2+8+Ptrsz, /* type, len, hash, chain */
+ Rootsz = 4+Ptrsz, /* root pointer */
+ Pivsz = Blksz - Pivhdsz,
+ Bufspc = (Blksz - Pivhdsz)/2, /* pivot room */
+ Pivspc = Blksz - Pivhdsz - Bufspc,
+ Logspc = Blksz - Loghdsz,
+ Logslop = 16+16+8, /* val, nextb, chain */
+ Leafspc = Blksz - Leafhdsz,
+ Msgmax = 1 + (Kvmax > Kpmax ? Kvmax : Kpmax),
+ Estacksz = 64,
+};
+
+enum {
+ Eactive = 1UL<<30, /* epoch active flag */
+};
+
+/*
+ * Key type tags: the first byte of every key in a
+ * tree selects one of these.
+ */
+enum {
+ /*
+ * dent: pqid[8] qid[8] -- a directory entry key.
+ * ptr: off[8] hash[8] gen[8] -- a key for a Dir block.
+ * dir: serialized Xdir
+ */
+
+ /* fs keys */
+ Kdat, /* qid[8] off[8] => ptr: pointer to data page */
+ Kent, /* pqid[8] name[n] => dir[n]: serialized Dir */
+ Kup, /* qid[8] => Kent: parent dir */
+
+ /* snapshot keys */
+ Klabel, /* name[] => snapid[]: snapshot label */
+ Ksnap, /* sid[8] => ref[8], tree[52]: snapshot root */
+ Kdlist, /* snap[8] gen[8] => hd[ptr],tl[ptr] deadlist */
+};
+
+enum {
+ Bdirty = 1 << 0,
+ Bfinal = 1 << 1,
+ Bfreed = 1 << 2,
+ Bcached = 1 << 3,
+ Bqueued = 1 << 4,
+ Blimbo = 1 << 5,
+ Bstatic = 1 << 6,
+};
+
+enum {
+ Lmut = 1 << 0, /* can we modify snaps via this label */
+ Lauto = 1 << 1, /* was this label generated automatically */
+ Ltsnap = 1 << 2, /* should we skip the timed snapshots */
+};
+
+enum {
+ Qdump = 1ULL << 63,
+};
+
+#define Zb (Bptr){-1, -1, -1}
+
+/* internal errors */
+//#define Efs (abort(), "fs broke")
+extern char Efs[];
+extern char Ecorrupt[];
+extern char Efsvers[];
+extern char Eimpl[];
+extern char Ebotch[];
+extern char Eio[];
+extern char Enofid[];
+extern char Efid[];
+extern char Etype[];
+extern char Edscan[];
+extern char Esrch[];
+extern char Eexist[];
+extern char Emode[];
+extern char Efull[];
+extern char Estuffed[];
+extern char Eauth[];
+extern char Elength[];
+extern char Eperm[];
+extern char Einuse[];
+extern char Ebadf[];
+extern char Ename[];
+extern char Enomem[];
+extern char Eattach[];
+extern char Enosnap[];
+extern char Esnap[];
+extern char Edir[];
+extern char Esyntax[];
+extern char Enouser[];
+extern char Enogrp[];
+extern char Efsize[];
+extern char Ebadu[];
+extern char Erdonly[];
+extern char Elocked[];
+extern char Eauthp[];
+extern char Eauthd[];
+extern char Eauthph[];
+extern char Ephase[];
+extern char Enone[];
+extern char Enoauth[];
+
+extern char Ewstatb[];
+extern char Ewstatd[];
+extern char Ewstatg[];
+extern char Ewstatl[];
+extern char Ewstatm[];
+extern char Ewstato[];
+extern char Ewstatp[];
+extern char Ewstatq[];
+extern char Ewstatu[];
+extern char Ewstatv[];
+extern char Enempty[];
+
+/*
+ * All metadata blocks share a common header:
+ *
+ * type[2]
+ *
+ * The None type is reserved for file data blocks
+ * and refcount blocks.
+ *
+ * The superblock has this layout:
+ * version[8] always "gefsNNNN"
+ * blksz[4] block size in bytes
+ * bufsz[4] portion of pivot nodes
+ * allocated to buffers,
+ * in bytes
+ * height[4] tree height of root node
+ * rootb[8] address of root in last
+ * snapshot.
+ * rooth[8] hash of root node
+ * narena[4] number of arenas in tree
+ * flag[8] feature flag
+ * gen[8] The flush generation
+ *
+ * The arena zone blocks have this layout, and
+ * are overwritten in place:
+ *
+ * log[8] The head of the alloc log
+ * logh[8] The hash of the alloc log
+ *
+ * The log blocks have this layout, and are one of
+ * two types of blocks that get overwritten in place:
+ *
+ * hash[8] The hash of the previous log block
+ *
+ * The remainder of the block is filled with log
+ * entries. Each log entry has at least 8 bytes
+ * of entry. Some are longer. The opcode is or'ed
+ * into the low order bits of the first vlong.
+ * These ops take the following form:
+ *
+ * Alloc, Free:
+ * off[8] len[8]
+ * Alloc1, Free1:
+ * off[8]
+ * Ref:
+ * off[8]
+ * Flush:
+ * gen[8]
+ *
+ * Pivots have the following layout:
+ *
+ * nval[2]
+ * valsz[2]
+ * nbuf[2]
+ * bufsz[2]
+ *
+ * Leaves have the following layout:
+ *
+ * nval[2]
+ * valsz[2]
+ * pad[4]
+ *
+ * Within these nodes, pointers have the following
+ * layout:
+ *
+ * off[8] hash[8] gen[8] fill[2]
+ */
+enum {
+ Tdat,
+ Tpivot,
+ Tleaf,
+ Tlog,
+ Tdlist,
+ Tarena,
+ Tsuper = 0x6765, /* 'ge' bigendian */
+};
+
+enum {
+ Vinl, /* Inline value */
+ Vref, /* Block pointer */
+};
+
+enum {
+ GBraw = 1<<0,
+ GBwrite = 1<<1,
+ GBnochk = 1<<2,
+ GBsoftchk = 1<<3,
+};
+
+enum {
+ Onop, /* nothing */
+ Oinsert, /* new kvp */
+ Odelete, /* delete kvp */
+ Oclearb, /* free block ptr if exists */
+ Oclobber, /* remove file if it exists */
+ Owstat, /* update kvp dirent */
+ Orelink, /* rechain forwards */
+ Oreprev, /* rechain backwards */
+ Nmsgtype, /* maximum message type */
+};
+
+enum {
+ Magic = 0x979b929e98969c8c,
+};
+
+/*
+ * Wstat ops come with associated data, in the order
+ * of the bit flag.
+ */
+enum{
+ /* wstat flag */
+ Owsize = 1<<0, /* [8]fsize: update file size */
+ Owmode = 1<<1, /* [4]mode: update file mode */
+ Owmtime = 1<<2, /* [8]mtime: update mtime, in nsec */
+ Owatime = 1<<3, /* [8]atime: update atime, in nsec */
+ Owuid = 1<<4, /* [4]uid: set uid */
+ Owgid = 1<<5, /* [4]gid: set gid */
+ Owmuid = 1<<6, /* [4]muid: set muid */
+};
+
+/*
+ * Operations for the allocation log.
+ */
+enum {
+ LogNop, /* unused */
+ /* 1-wide entries */
+ LogAlloc1, /* alloc a block */
+ LogFree1, /* free a block */
+ LogSync, /* sync barrier for replay */
+
+ /* 2-wide entries */
+#define Log2wide LogAlloc
+ LogAlloc, /* alloc a range */
+ LogFree, /* free a range */
+};
+
+enum {
+ AOnone,
+ AOsnap,
+ AOsync,
+ AOclear,
+ AOrclose,
+};
+
+enum {
+ DFblk,
+ DFbp,
+ DFmnt,
+ DFtree,
+};
+
+struct Limbo {
+ Limbo *next;
+ int op;
+};
+
+struct Bptr {
+ vlong addr;
+ uvlong hash;
+ vlong gen;
+};
+
+struct Key{
+ char *k;
+ int nk;
+};
+
+struct Val {
+ short nv;
+ char *v;
+};
+
+struct Kvp {
+ Key;
+ Val;
+};
+
+struct Msg {
+ char op;
+ Kvp;
+};
+
+struct Dlist {
+ Dlist *cnext; /* cache next entry */
+ Dlist *cprev; /* cache prev entry */
+ Dlist *chain; /* hash table chain */
+ Blk *ins; /* loaded head */
+
+ vlong gen; /* deadlist gen */
+ vlong bgen; /* birth gen */
+ Bptr hd; /* deadlist head */
+ Bptr tl; /* deadlist tail */
+};
+
+struct Errctx {
+ long tid;
+ char err[128];
+ jmp_buf errlab[Estacksz];
+ int nerrlab;
+};
+
+struct Arange {
+ Avl;
+ vlong off;
+ vlong len;
+};
+
+struct Bucket {
+ Blk *b;
+};
+
+struct Amsg {
+ int op;
+ int fd;
+ union {
+ struct { /* AOsnap */
+ char old[128];
+ char new[128];
+ int flag;
+ char delete;
+
+ };
+ struct { /* AOsync */
+ int halt;
+ };
+ struct { /* AOclear, AOrclose */
+ Mount *mnt;
+ Dent *dent;
+ vlong qpath;
+ vlong off;
+ vlong end;
+ };
+ };
+};
+
+struct Fmsg {
+ Fcall;
+ Conn *conn;
+ int sz; /* the size of the message buf */
+ uchar buf[];
+};
+
+struct Tree {
+ Limbo;
+
+ /* in-memory */
+ Lock lk;
+ long memref; /* number of in-memory references to this */
+ vlong memgen; /* wip next generation */
+ int dirty;
+
+ /* on-disk */
+ int nref; /* number snapshots forked/after us */
+ int nlbl; /* number of labels referring to us */
+ int ht; /* height of the tree */
+ uint flag; /* flag set */
+ Bptr bp; /* block pointer of root */
+ vlong gen; /* generation */
+ vlong pred; /* previous snapshot */
+ vlong succ; /* next snapshot */
+ vlong base; /* base snapshot */
+};
+
+struct Bfree {
+ Limbo;
+ Bptr bp;
+};
+
+struct User {
+ int id;
+ int lead;
+ int *memb;
+ int nmemb;
+ char name[128];
+};
+
+enum {
+ /* in priority order */
+ Qnone,
+ Qfence,
+ Qwrite,
+ Qfree,
+};
+
+/*
+ * One entry in a sync queue.
+ */
+struct Qent {
+ vlong qgen; /* stamped from fs->qgen when the entry is queued */
+ Bptr bp; /* block pointer the op applies to */
+ Blk *b; /* block to write; nil for a Qfree entry */
+ int op; /* Qfence, Qwrite or Qfree */
+};
+
+/*
+ * A bounded queue of pending sync operations, kept as
+ * a binary heap ordered by qcmp.
+ */
+struct Syncq {
+ QLock lk; /* guards the heap and both rendezvous */
+ Rendez fullrz; /* producers sleep here while the heap is full */
+ Rendez emptyrz; /* consumers sleep here while the heap is empty */
+ Qent *heap; /* heap storage */
+ int nheap; /* entries currently in the heap */
+ int heapsz; /* capacity of heap */
+};
+
+struct Trace {
+ int tid;
+ int qgen;
+ char msg[16];
+ Bptr bp;
+ vlong v0;
+ vlong v1;
+};
+
+/*
+ * Overall state of the file system.
+ * Shadows the superblock contents.
+ */
+struct Gefs {
+ int blksz;
+ int bufspc;
+ Tree snap;
+ Dlist snapdl;
+ int narena;
+ vlong flag;
+ vlong nextqid;
+ vlong nextgen;
+ vlong qgen;
+ Bptr *arenabp;
+
+ /* superblocks */
+ Blk *sb0; /* primary */
+ Blk *sb1; /* backup */
+
+ /* arena allocation */
+ Arena *arenas;
+ long roundrobin;
+ long syncing;
+ long nsyncers;
+ long nreaders;
+
+ QLock synclk;
+ Rendez syncrz;
+
+ QLock mountlk;
+ Mount *mounts;
+ Mount *snapmnt;
+ Lock connlk;
+ Conn *conns;
+
+ Chan *wrchan;
+ Chan *admchan;
+ Chan **rdchan;
+
+ QLock mutlk;
+ long nworker;
+ long epoch;
+ long lepoch[32];
+ Limbo *limbo[3];
+ long nlimbo;
+
+ Syncq syncq[32];
+
+ int fd;
+ long rdonly;
+ int noauth;
+
+ /* user list */
+ RWLock userlk;
+ User *users;
+ int nusers;
+
+ /* slow block io */
+ QLock blklk[32];
+
+ /* deadlist cache */
+ Dlist **dlcache;
+ Dlist *dlhead;
+ Dlist *dltail;
+ int dlcount;
+ int dlcmax;
+
+ /* block lru */
+ QLock lrulk;
+ Rendez lrurz;
+ Bucket *bcache;
+ Blk *chead;
+ Blk *ctail;
+ usize ccount;
+ usize cmax;
+
+ /* preallocated deferred frees */
+ QLock bfreelk;
+ Rendez bfreerz;
+ Bfree *bfree;
+
+ RWLock flushq[Nflushtab];
+ int flushop[Nflushtab];
+
+ Trace *trace;
+ long traceidx;
+ long ntrace;
+};
+
+struct Arena {
+ QLock;
+ Avltree *free;
+ Blk **queue;
+ int nqueue;
+ Blk *logbuf[2]; /* preallocated log pages */
+ Blk *h0; /* arena header */
+ Blk *h1; /* arena footer */
+ Blk **q; /* write queue */
+ vlong nq;
+ vlong size;
+ vlong used;
+ vlong reserve;
+ /* allocation log */
+ vlong lastlogsz; /* size after last compression */
+ vlong nlog; /* number of blocks in log */
+ Bptr loghd; /* allocation log */
+ Blk *logtl; /* end of the log, open for writing */
+ Syncq *sync;
+};
+
+struct Xdir {
+ /* file data */
+ uvlong flag; /* storage flag */
+ Qid qid; /* unique id from server */
+ ulong mode; /* permissions */
+ vlong atime; /* last read time: nsec */
+ vlong mtime; /* last write time: nsec */
+ uvlong length; /* file length */
+ int uid; /* owner name */
+ int gid; /* group name */
+ int muid; /* last modifier name */
+ char *name; /* last element of path */
+};
+
+struct Dent {
+ RWLock;
+ Key;
+ Xdir;
+ Dent *next;
+ QLock trunclk;
+ Rendez truncrz;
+ vlong up;
+ long ref;
+ char gone;
+ char trunc;
+
+ char buf[Maxent];
+};
+
+struct Mount {
+ Limbo;
+ Lock;
+ Mount *next;
+ long ref;
+ vlong gen;
+ char name[64];
+ Tree *root; /* EBR protected */
+
+ int flag;
+
+ /* open directory entries */
+ Lock dtablk;
+ Dent *dtab[Ndtab];
+
+ /* snapshot history */
+ char minutely[60][128];
+ char hourly[24][128];
+};
+
+struct Conn {
+ Conn *next;
+ QLock wrlk;
+ int rfd;
+ int wfd;
+ int iounit;
+ int versioned;
+
+ /* fid hash table */
+ Lock fidtablk[Nfidtab];
+ Fid *fidtab[Nfidtab];
+};
+
+/*
+ * Per-connection state for one 9p fid; chained through
+ * next in the connection's fid hash table.
+ */
+struct Fid {
+ Lock;
+ Fid *next;
+ /*
+ * if opened with OEXEC, we want to use a snapshot,
+ * instead of the most recent root, to prevent
+ * paging in the wrong executable.
+ */
+ Mount *mnt;
+ Scan *scan; /* in progress scan */
+ Dent *dent; /* (pqid, name) ref, modified on rename */
+ Dent *dir;
+ Amsg *rclose;
+ void *auth;
+
+ u32int fid;
+ vlong qpath;
+ vlong pqpath;
+ long ref;
+ int mode;
+ int iounit;
+
+ int uid;
+ int duid;
+ int dgid;
+ int dmode;
+
+ char permit;
+ char fromdump;
+};
+
+enum {
+ POmod,
+ POrot,
+ POsplit,
+ POmerge,
+};
+
+struct Scanp {
+ int bi;
+ int vi;
+ Blk *b;
+};
+
+struct Scan {
+ vlong offset; /* last read offset */
+ char first;
+ char donescan;
+ char overflow;
+ char present;
+ int ht;
+ Kvp kv;
+ Key pfx;
+ char kvbuf[Kvmax];
+ char pfxbuf[Keymax];
+ Scanp *path;
+};
+
+struct Blk {
+ Limbo;
+ /* cache entry */
+ Blk *cnext;
+ Blk *cprev;
+ Blk *hnext;
+
+ /* serialized to disk in header */
+ short type; /* @0, for all */
+ union {
+ struct {
+ short nval; /* @2, for Leaf, Pivot: data[0:2] */
+ short valsz; /* @4, for Leaf, Pivot: data[2:4] */
+ short nbuf; /* @6, for Pivot */
+ short bufsz; /* @8, for Pivot */
+ };
+ struct {
+ int logsz; /* @2 for allocation log */
+ uvlong logh; /* @4 for log body hash */
+ Bptr logp; /* @12 next deadlist chain */
+ };
+ };
+
+ /* debug */
+ uintptr queued;
+ uintptr lasthold;
+ uintptr lasthold0;
+ uintptr lastdrop;
+ uintptr enqueued;
+ uintptr cached;
+ uintptr uncached;
+ uintptr alloced;
+ uintptr freed;
+
+ Bptr bp;
+ long ref;
+ long flag;
+ char *data;
+ char buf[Blksz];
+ vlong magic;
+};
+
+struct Chan {
+ int size; /* size of queue */
+ long count; /* how many in queue (semaphore) */
+ long avail; /* how many available to send (semaphore) */
+ Lock rl, wl; /* circular pointers */
+ void **rp;
+ void **wp;
+ void* args[]; /* list of saved pointers, [->size] */
+};
--- /dev/null
+++ b/dump.c
@@ -1,0 +1,366 @@
+#include <u.h>
+#include <libc.h>
+#include <avl.h>
+#include <fcall.h>
+#include <ctype.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/* indent template for rshowblk; initshow fills it with spaces and a '|' in every fourth column */
+char spc[128];
+
+/*
+ * Format key k in human readable form. The first byte of the key
+ * selects how the remaining bytes are interpreted; an empty key is
+ * printed as "". Returns the result of the fmtprint call used.
+ *
+ * dent: pqid[8] qid[8] -- a directory entry key.
+ * ptr:  off[8] hash[8] -- a key for an Dir block.
+ * dir:  fixed statbuf header, user ids
+ */
+static int
+showkey(Fmt *fmt, Key *k)
+{
+	char *p;
+
+	if(k->nk == 0)
+		return fmtprint(fmt, "\"\"");
+	p = k->k;
+	switch(p[0]){
+	case Kdat:	/* qid[8] off[8] => ptr[16]: pointer to data page */
+		return fmtprint(fmt, "dat qid:%llx off:%llx",
+			UNPACK64(p+1), UNPACK64(p+9));
+	case Kent:	/* pqid[8] name[n] => dir[n]: serialized Dir */
+		return fmtprint(fmt, "ent dir:%llx, name:\"%.*s\"",
+			UNPACK64(p+1), k->nk-11, p+11);
+	case Klabel:	/* name[n] => tree[24]: snapshot ref */
+		return fmtprint(fmt, "label name:\"%.*s\"", k->nk-1, p+1);
+	case Ksnap:	/* name[n] => tree[24]: snapshot root */
+		return fmtprint(fmt, "snap id:%lld", UNPACK64(p+1));
+	case Kup:	/* qid[8] => pqid[8]: parent dir */
+		return fmtprint(fmt, "up dir:%llx", UNPACK64(p+1));
+	case Kdlist:
+		return fmtprint(fmt, "dlist gen:%lld, bgen:%lld",
+			UNPACK64(p+1), UNPACK64(p+9));
+	default:
+		return fmtprint(fmt, "??? %.*H", k->nk, k->k);
+	}
+}
+
+/*
+ * Format the value half of key/value pair v.
+ *
+ * op  is the message operation the value belongs to (Onop for a
+ *     plain Kvp); it selects how the value bytes are decoded.
+ * flg when nonzero, the value is printed as a block pointer
+ *     followed by a 16 bit count, and must be exactly Ptrsz+2 bytes.
+ *
+ * Returns the number reported by the fmtprint calls made.
+ */
+static int
+showval(Fmt *fmt, Kvp *v, int op, int flg)
+{
+	int n, ws;
+	char *p;
+	Tree t;
+	Xdir d;
+
+	n = 0;
+	if(flg){
+		assert(v->nv == Ptrsz+2);
+		n = fmtprint(fmt, "(%B,%d)", unpackbp(v->v, v->nv), UNPACK16(v->v+Ptrsz));
+		return n;
+	}
+	if(op == Odelete || op == Oclearb){
+		n = fmtprint(fmt, "delete");
+		return n;
+	}
+	switch(v->k[0]){
+	case Kdat:	/* qid[8] off[8] => ptr[16]: pointer to data page */
+		switch(op){
+		case Odelete:
+		case Oclearb:
+			n = 0;
+			break;
+		case Onop:
+		case Oinsert:
+			if(v->nv == Ptrsz)
+				n = fmtprint(fmt, "ptr:%B", unpackbp(v->v, v->nv));
+			else
+				n = fmtprint(fmt, "BROKEN ptr %.*H", v->nk, v->k);
+			break;
+		}
+		break;
+	case Kent:	/* pqid[8] name[n] => dir[n]: serialized Dir */
+		switch(op){
+		case Onop:
+		case Oinsert:
+			kv2dir(v, &d);
+			n = fmtprint(fmt, "[qid=(%llux,%lud,%d), p=%luo, f=%llux, t=%lld,%lld, l=%lld, o=%d, g=%d m=%d]",
+				d.qid.path, d.qid.vers, d.qid.type, d.mode,
+				d.flag, d.atime, d.mtime, d.length,
+				d.uid, d.gid, d.muid);
+			break;
+		case Odelete:
+			n = fmtprint(fmt, "delete");
+			break;
+		case Owstat:
+			/*
+			 * a wstat value is a flag byte followed by the
+			 * selected fields, in the order tested below.
+			 */
+			p = v->v;
+			ws = *p++;
+			if(ws & Owsize){
+				n += fmtprint(fmt, "size:%llx ", UNPACK64(p));
+				p += 8;
+			}
+			if(ws & Owmode){
+				n += fmtprint(fmt, "mode:%uo ", UNPACK32(p));
+				p += 4;
+			}
+			if(ws & Owmtime){
+				n += fmtprint(fmt, "mtime:%llx ", UNPACK64(p));
+				p += 8;
+			}
+			if(ws & Owatime){
+				/* was mislabelled "mtime:" */
+				n += fmtprint(fmt, "atime:%llx ", UNPACK64(p));
+				p += 8;
+			}
+			if(ws & Owuid){
+				n += fmtprint(fmt, "uid:%d ", UNPACK32(p));
+				p += 4;
+			}
+			if(ws & Owgid){
+				n += fmtprint(fmt, "gid:%d ", UNPACK32(p));
+				p += 4;
+			}
+			if(ws & Owmuid){
+				n += fmtprint(fmt, "muid:%d ", UNPACK32(p));
+				p += 4;
+			}
+			/* the fields consumed must account for the whole value */
+			if(p != v->v + v->nv){
+				fprint(2, "v->nv: %d, sz=%d\n", v->nv, (int)(p - v->v));
+				abort();
+			}
+			break;
+		}
+		break;
+	case Ksnap:	/* name[n] => dent[16] ptr[16]: snapshot root */
+		switch(op){
+		case Orelink:
+		case Oreprev:
+			n = fmtprint(fmt, "gen: %lld, dlbl: %d, dref: %d",
+				UNPACK64(v->v), v->v[8], v->v[9]);
+			break;
+		case Onop:
+		case Oinsert:
+			if(unpacktree(&t, v->v, v->nv) == nil)
+				n = fmtprint(fmt, "corrupt tree");
+			else
+				n = fmtprint(fmt, "<tree %B [pred=%lld, succ=%lld, nref=%d, nlbl=%d]>",
+					t.bp, t.pred, t.succ, t.nref, t.nlbl);
+			break;
+		default:
+			n = fmtprint(fmt, "?? unknown op %d", op);
+		}
+		break;
+	case Klabel:
+		n = fmtprint(fmt, "snap id:%lld", UNPACK64(v->v+1));
+		break;
+	case Kup:	/* qid[8] => pqid[8]: parent dir */
+		n = fmtprint(fmt, "super dir:%llx, name:\"%.*s\")",
+			UNPACK64(v->v+1), v->nv-11, v->v+11);
+		break;
+	case Kdlist:
+		n = fmtprint(fmt, "hd:%B, tl:%B",
+			unpackbp(v->v, v->nv),
+			unpackbp(v->v+Ptrsz, v->nv-Ptrsz));
+		break;
+	default:
+		n = fmtprint(fmt, "??? %.*H", v->nk, v->k);
+		break;
+	}
+	return n;
+
+}
+
+/*
+ * Format verb handler for a Bptr passed by value (declared for the
+ * "B" verb by the varargck pragma in fns.h): prints
+ * (addr,hash,gen) in hexadecimal.
+ */
+int
+Bconv(Fmt *fmt)
+{
+ Bptr bp;
+
+ bp = va_arg(fmt->args, Bptr);
+ return fmtprint(fmt, "(%llx,%.16llux,%llx)", bp.addr, bp.hash, bp.gen);
+}
+
+/*
+ * Format verb handler for a Msg* (the "M" verb): prints the
+ * operation name, the key and the value. With the '#' flag the
+ * value is printed in block pointer form (see showval).
+ *
+ * An op outside the name table is printed as "???" rather than
+ * indexing past the end of the table; messages being dumped may
+ * come from damaged blocks.
+ */
+int
+Mconv(Fmt *fmt)
+{
+	/* static: the table never changes, no need to rebuild it per call */
+	static char *opname[Nmsgtype] = {
+	[Oinsert]	"Oinsert",
+	[Odelete]	"Odelete",
+	[Oclearb]	"Oclearb",
+	[Oclobber]	"Oclobber",
+	[Owstat]	"Owstat",
+	[Orelink]	"Orelink",
+	[Oreprev]	"Oreprev",
+	};
+	char *name;
+	Msg *m;
+	int f, n;
+
+	f = (fmt->flags & FmtSharp) != 0;
+	m = va_arg(fmt->args, Msg*);
+	if(m == nil)
+		return fmtprint(fmt, "Msg{nil}");
+	if((uint)m->op >= Nmsgtype)
+		name = "???";
+	else
+		name = opname[m->op];
+	n = fmtprint(fmt, "Msg(%s, ", name);
+	n += showkey(fmt, m);
+	n += fmtprint(fmt, ") => (");
+	n += showval(fmt, m, m->op, f);
+	n += fmtprint(fmt, ")");
+	return n;
+}
+
+/*
+ * Format verb handler for a Kvp* (the "P" verb): prints
+ * "Kvp(key) => (value)". The value is decoded as for Onop; the
+ * '#' flag requests the block pointer form of the value.
+ */
+int
+Pconv(Fmt *fmt)
+{
+	int sharp, len;
+	Kvp *pair;
+
+	sharp = (fmt->flags & FmtSharp) != 0;
+	pair = va_arg(fmt->args, Kvp*);
+	if(pair == nil)
+		return fmtprint(fmt, "Kvp{nil}");
+
+	len = fmtprint(fmt, "Kvp(");
+	len += showkey(fmt, pair);
+	len += fmtprint(fmt, ") => (");
+	len += showval(fmt, pair, Onop, sharp);
+	len += fmtprint(fmt, ")");
+	return len;
+}
+
+/*
+ * Format verb handler for a Key* (the "K" verb): prints
+ * "Key(...)" around the showkey rendering, or "Key{nil}".
+ */
+int
+Kconv(Fmt *fmt)
+{
+	Key *key;
+	int len;
+
+	key = va_arg(fmt->args, Key*);
+	if(key == nil)
+		return fmtprint(fmt, "Key{nil}");
+
+	len = fmtprint(fmt, "Key(");
+	len += showkey(fmt, key);
+	len += fmtprint(fmt, ")");
+	return len;
+}
+
+/*
+ * Format verb handler for an Arange* (the "R" verb): prints the
+ * range as offset+length, or a marker for a nil pointer.
+ */
+int
+Rconv(Fmt *fmt)
+{
+	Arange *rng;
+
+	rng = va_arg(fmt->args, Arange*);
+	if(rng != nil)
+		return fmtprint(fmt, "Arange(%lld+%lld)", rng->off, rng->len);
+	return fmtprint(fmt, "<Arange:nil>");
+}
+
+/*
+ * Format verb handler for a Qid passed by value (the "Q" verb):
+ * prints "(path vers type)" with the path in hexadecimal.
+ */
+int
+Qconv(Fmt *fmt)
+{
+ Qid q;
+
+ q = va_arg(fmt->args, Qid);
+ return fmtprint(fmt, "(%llx %ld %d)", q.path, q.vers, q.type);
+}
+
+/*
+ * Print block b to fd, indented by indent levels (four columns
+ * each, clamped to what spc can hold).
+ *
+ * Pivot blocks print their buffered messages and then their values;
+ * leaf blocks print their values; arena, log, dlist and dat blocks
+ * print the first 32 bytes of the raw buffer in hex.
+ *
+ * When recurse is nonzero, each child a pivot points at is loaded
+ * and printed one level deeper. Children are only loaded when they
+ * are going to be printed, so a non-recursive dump does no extra
+ * block reads.
+ */
+static void
+rshowblk(int fd, Blk *b, int indent, int recurse)
+{
+	Blk *c;
+	int i;
+	Bptr bp;
+	Kvp kv;
+	Msg m;
+
+	if(indent > sizeof(spc)/4)
+		indent = sizeof(spc)/4;
+	if(b == nil){
+		fprint(fd, "NIL\n");
+		return;
+	}
+	fprint(fd, "%.*s[BLK]|{%B}\n", 4*indent, spc, b->bp);
+	switch(b->type){
+	case Tpivot:
+		for(i = 0; i < b->nbuf; i++){
+			getmsg(b, i, &m);
+			fprint(fd, "%.*s[%03d]|%M\n", 4*indent, spc, i, &m);
+		}
+		/* fallthrough: a pivot's values are printed by the loop below */
+	case Tleaf:
+		for(i = 0; i < b->nval; i++){
+			getval(b, i, &kv);
+			if(b->type != Tpivot){
+				fprint(fd, "%.*s[%03d]|%P\n", 4*indent, spc, i, &kv);
+				continue;
+			}
+			fprint(fd, "%.*s[%03d]|%#P\n", 4*indent, spc, i, &kv);
+			if(!recurse)
+				continue;
+			bp = unpackbp(kv.v, kv.nv);
+			c = getblk(bp, 0);
+			rshowblk(fd, c, indent + 1, 1);
+			dropblk(c);
+		}
+		break;
+	case Tarena:
+		fprint(fd, "arena -- ");
+		goto Show;
+	case Tlog:
+		fprint(fd, "log -- ");
+		goto Show;
+	case Tdlist:
+		fprint(fd, "dlist -- ");
+		goto Show;
+	case Tdat:
+		fprint(fd, "dat -- ");
+	Show:
+		for(i = 0; i < 32; i++){
+			fprint(fd, "%x", b->buf[i] & 0xff);
+			if(i % 4 == 3)
+				fprint(fd, " ");
+		}
+		fprint(fd, "\n");
+		break;
+	}
+}
+
+/*
+ * Print a "=== m" banner to fd followed by block b at indent 0;
+ * recurse is passed through to rshowblk.
+ */
+void
+showblk(int fd, Blk *b, char *m, int recurse)
+{
+ fprint(fd, "=== %s\n", m);
+ rshowblk(fd, b, 0, recurse);
+}
+
+/*
+ * Load the block named by bp with the GBnochk flag, print it to fd
+ * at indent 0 and release it again.
+ */
+void
+showbp(int fd, Bptr bp, int recurse)
+{
+ Blk *b;
+
+ b = getblk(bp, GBnochk);
+ rshowblk(fd, b, 0, recurse);
+ dropblk(b);
+}
+
+/*
+ * Print the fields of tree root t to fd, one tab-indented
+ * "name value" line per field.
+ */
+void
+showtreeroot(int fd, Tree *t)
+{
+ fprint(fd, "\tflag\t0x%x\n", t->flag);
+ fprint(fd, "\tgen:\t%lld\n", t->gen);
+ fprint(fd, "\tbase\t%lld\n", t->base);
+ fprint(fd, "\tpred:\t%lld\n", t->pred);
+ fprint(fd, "\tsucc:\t%lld\n", t->succ);
+ fprint(fd, "\tnref:\t%d\n", t->nref);
+ fprint(fd, "\tnlbl:\t%d\n", t->nlbl);
+ fprint(fd, "\tht:\t%d\n", t->ht);
+ fprint(fd, "\tbp:\t%B\n", t->bp);
+}
+
+/*
+ * Prepare the spc indent template: every column is a space
+ * except each fourth one, starting at column 0, which is '|'.
+ */
+void
+initshow(void)
+{
+	char *p;
+
+	for(p = spc; p < spc + sizeof(spc); p++){
+		if((p - spc) % 4 == 0)
+			*p = '|';
+		else
+			*p = ' ';
+	}
+}
--- /dev/null
+++ b/error.c
@@ -1,0 +1,78 @@
+#include <u.h>
+#include <libc.h>
+#include <avl.h>
+#include <fcall.h>
+#include "dat.h"
+
+/*
+ * Error strings used across the file system; they are passed as
+ * format/message arguments to error() and rerror() (see fs.c).
+ */
+char Efs[] = "internal error";
+char Ecorrupt[] = "block contents corrupted";
+char Efsvers[] = "unknown fs version";
+char Eimpl[] = "not implemented";
+char Ebotch[] = "protocol botch";
+char Eio[] = "i/o error";
+char Enofid[] = "unknown fid";
+char Efid[] = "fid in use";
+char Etype[] = "invalid fid type";
+char Edscan[] = "invalid dir scan offset";
+char Esrch[] = "directory entry not found";
+char Eexist[] = "create/wstat -- file exists";
+char Emode[] = "open/create -- unknown mode";
+char Efull[] = "file system full";
+char Estuffed[] = "emergency blocks exhausted";
+char Eauth[] = "authentication failed";
+char Elength[] = "name too long";
+char Eperm[] = "permission denied";
+char Einuse[] = "resource in use";
+char Ebadf[] = "invalid file";
+char Ename[] = "create/wstat -- bad character in file name";
+char Enomem[] = "out of memory";
+char Eattach[] = "attach required";
+char Enosnap[] = "attach -- bad specifier";
+char Edir[] = "invalid directory";
+char Esyntax[] = "syntax error";
+char Enouser[] = "user does not exist";
+char Enogrp[] = "group does not exist";
+char Efsize[] = "file too big";
+char Ebadu[] = "attach -- unknown user or failed authentication";
+char Erdonly[] = "file system read only";
+char Elocked[] = "open/create -- file is locked";
+char Eauthp[] = "authread -- auth protocol not finished";
+char Eauthd[] = "authread -- not enough data";
+char Eauthph[] = "auth phase error";
+char Enone[] = "auth -- user 'none' requires no authentication";
+char Enoauth[] = "auth -- authentication disabled";
+char Ephase[] = "phase error -- use after remove";
+
+/* wstat failures */
+char Ewstatb[] = "wstat -- unknown bits in qid.type/mode";
+char Ewstatd[] = "wstat -- attempt to change directory";
+char Ewstatg[] = "wstat -- not in group";
+char Ewstatl[] = "wstat -- attempt to make length negative";
+char Ewstatm[] = "wstat -- attempt to change muid";
+char Ewstato[] = "wstat -- not owner or group leader";
+char Ewstatp[] = "wstat -- attempt to change qid.path";
+char Ewstatq[] = "wstat -- qid.type/dir.mode mismatch";
+char Ewstatu[] = "wstat -- not owner";
+char Ewstatv[] = "wstat -- attempt to change qid.vers";
+char Enempty[] = "directory is not empty";
+
+//char Echar[] = "bad character in directory name";
+//char Eopen[] = "read/write -- on non open fid";
+//char Ecount[] = "read/write -- count too big";
+//char Ealloc[] = "phase error -- directory entry not allocated";
+//char Eqid[] = "phase error -- qid does not match";
+//char Eaccess[] = "access permission denied";
+//char Eentry[] = "directory entry not found";
+//char Edir1[] = "walk -- in a non-directory";
+//char Edir2[] = "create -- in a non-directory";
+//char Edot[] = "create/wstat -- . and .. illegal names";
+//char Ewalk[] = "walk -- too many (system wide)";
+//char Eoffset[] = "read/write -- offset negative";
+//char Ebroken[] = "read/write -- lock is broken";
+//char Eauth[] = "attach -- authentication failed";
+//char Eauth2[] = "read/write -- authentication unimplemented";
+//char Etoolong[] = "name too long";
+//char Efidinuse[] = "fid in use";
+//char Eversion[] = "version conversion";
+//char Eauthnone[] = "auth -- user 'none' requires no authentication";
+//char Eauthdisabled[] = "auth -- authentication disabled"; /* development */
+//char Eauthfile[] = "auth -- out of auth files";
--- /dev/null
+++ b/fns.h
@@ -1,0 +1,213 @@
+/*
+ * Argument types for the custom format verbs, so the compiler can
+ * check print arguments. The M, P, K, B, R and Q handlers are
+ * declared further down (Mconv, Pconv, Kconv, Bconv, Rconv, Qconv).
+ */
+#pragma varargck type "M" Msg*
+#pragma varargck type "P" Kvp*
+#pragma varargck type "K" Key*
+#pragma varargck type "V" Val*
+#pragma varargck type "B" Bptr
+#pragma varargck type "R" Arange*
+#pragma varargck type "X" char*
+#pragma varargck type "Q" Qid
+
+/* globals shared between source files */
+extern Gefs* fs;
+extern int debug; /* nonzero enables dprint output */
+extern int permissive;
+extern int usereserve;
+extern char* reamuser;
+extern Errctx** errctx; /* error label stack used by waserror/poperror */
+extern Blk* blkbuf;
+extern int noneid;
+extern int nogroupid;
+extern int admid;
+
+/*
+ * Big-endian (most significant byte first) accessors for packed
+ * on-disk data. UNPACKn(p) reads an n bit value from the bytes at
+ * p; PACKn(p, v) stores the low n bits of v at p. Arguments are
+ * evaluated more than once, so they must be free of side effects.
+ *
+ * UNPACK8, UNPACK16 and UNPACK32 yield int-typed expressions.
+ * In UNPACK64 the low word is passed through u32int before being
+ * widened, so that a byte with its top bit set can not sign-extend
+ * into the high word.
+ */
+#define UNPACK8(p)	(((uchar*)(p))[0])
+#define UNPACK16(p)	((((uchar*)(p))[0]<<8)|(((uchar*)(p))[1]))
+#define UNPACK32(p)	((((uchar*)(p))[0]<<24)|(((uchar*)(p))[1]<<16)|\
+				(((uchar*)(p))[2]<<8)|(((uchar*)(p))[3]))
+#define UNPACK64(p)	(((u64int)((((uchar*)(p))[0]<<24)|(((uchar*)(p))[1]<<16)|\
+				(((uchar*)(p))[2]<<8)|(((uchar*)(p))[3])))<<32 |\
+			((u64int)(u32int)((((uchar*)(p))[4]<<24)|(((uchar*)(p))[5]<<16)|\
+				(((uchar*)(p))[6]<<8)|(((uchar*)(p))[7]))))
+
+#define PACK8(p,v)	do{(p)[0]=(v);}while(0)
+#define PACK16(p,v)	do{(p)[0]=(v)>>8;(p)[1]=(v);}while(0)
+#define PACK32(p,v)	do{(p)[0]=(v)>>24;(p)[1]=(v)>>16;(p)[2]=(v)>>8;(p)[3]=(v);}while(0)
+#define PACK64(p,v)	do{(p)[0]=(v)>>56;(p)[1]=(v)>>48;(p)[2]=(v)>>40;(p)[3]=(v)>>32;\
+			(p)[4]=(v)>>24;(p)[5]=(v)>>16;(p)[6]=(v)>>8;(p)[7]=(v);}while(0)
+
+/*
+ * Function prototypes shared between the file system's source
+ * files. The blank-line grouping is the original author's.
+ */
+void* emalloc(usize, int);
+
+Blk* newdblk(Tree*, vlong, int);
+Blk* newblk(Tree*, int);
+Blk* dupblk(Tree*, Blk*);
+Blk* getroot(Tree*, int*);
+Blk* getblk(Bptr, int);
+Blk* holdblk(Blk*);
+void dropblk(Blk*);
+
+void lrutop(Blk*);
+void lrubot(Blk*);
+void cacheins(Blk*);
+void cachedel(vlong);
+Blk* cacheget(vlong);
+Blk* cachepluck(void);
+
+void qinit(Syncq*);
+void qput(Syncq*, Qent);
+
+Arena* getarena(vlong);
+void syncblk(Blk*);
+void enqueue(Blk*);
+void epochstart(int);
+void epochend(int);
+void epochwait(void);
+void epochclean(void);
+void limbo(int op, Limbo*);
+void freeblk(Tree*, Blk*);
+void freebp(Tree*, Bptr);
+int logbarrier(Arena *, vlong);
+void dlappend(Dlist *dl, Bptr);
+void killblk(Tree*, Bptr);
+ushort blkfill(Blk*);
+uvlong blkhash(Blk*);
+uvlong bufhash(void*, usize);
+u32int ihash(uvlong);
+void finalize(Blk*);
+
+/* mount table, defined in fs.c */
+Mount* getmount(char*);
+void clunkmount(Mount*);
+
+void updatesnap(Tree**, Tree*, char*, int);
+void tagsnap(Tree*, char*, int);
+void delsnap(Tree*, vlong, char*);
+void freedl(Dlist*, int);
+Tree* opensnap(char*, int*);
+
+void closesnap(Tree*);
+void reamfs(char*);
+void growfs(char*);
+void loadarena(Arena*, Bptr);
+void loadfs(char*);
+void loadlog(Arena*, Bptr);
+void flushlog(Arena*);
+int scandead(Dlist*, int, void(*)(Bptr, void*), void*);
+int endfs(void);
+void compresslog(Arena*);
+void dlsync(void);
+void setval(Blk*, Kvp*);
+
+Conn* newconn(int, int);
+
+int walk1(Tree*, vlong, char*, Qid*, vlong*);
+void loadusers(int, Tree*);
+User* uid2user(int);
+User* name2user(char*);
+
+void btupsert(Tree*, Msg*, int);
+int btlookup(Tree*, Key*, Kvp*, char*, int);
+void btnewscan(Scan*, char*, int);
+void btenter(Tree*, Scan*);
+int btnext(Scan*, Kvp*);
+void btexit(Scan*);
+
+int checkflag(Blk *b, int, int);
+void setflag(Blk *b, int, int);
+
+char* estrdup(char*);
+
+int keycmp(Key *, Key *);
+void cpkey(Key*, Key*, char*, int);
+void cpkvp(Kvp*, Kvp*, char*, int);
+
+/* for dumping */
+void getval(Blk*, int, Kvp*);
+void getmsg(Blk*, int, Msg*);
+Bptr getptr(Kvp*, int*);
+
+/* initshow, showblk, showbp and showtreeroot are defined in dump.c */
+void initshow(void);
+void showblk(int, Blk*, char*, int);
+void showbp(int, Bptr, int);
+void showtreeroot(int, Tree*);
+int checkfs(int);
+
+/* print to standard error only when the debug global is set */
+#define dprint(...) \
+ do{ \
+ if(debug) fprint(2, __VA_ARGS__); \
+ }while(0)
+
+/* print to standard error, then abort() */
+#define fatal(...) \
+ do{ \
+ fprint(2, __VA_ARGS__); \
+ abort(); \
+ }while(0)
+
+/* record a trace event, but only when fs->trace is non-nil */
+#define tracex(msg, bp, v0, v1) \
+ do{ \
+ if(fs->trace != nil) \
+ _trace(msg, bp, v0, v1); \
+ } while(0)
+
+/* shorthands: trace a block pointer, a single value, or a bare message */
+#define traceb(msg, bp) tracex(msg, bp, -1, -1)
+#define tracev(msg, v0) tracex(msg, Zb, v0, -1)
+#define tracem(msg) tracex(msg, Zb, -1, -1)
+
+jmp_buf* _waserror(void);
+_Noreturn void error(char*, ...);
+_Noreturn void broke(char*, ...);
+_Noreturn void nexterror(void);
+#define waserror() (setjmp(*_waserror()))
+#define errmsg() ((*errctx)->err)
+#define poperror() assert((*errctx)->nerrlab-- > 0)
+#define estacksz() ((*errctx)->nerrlab)
+void _trace(char*, Bptr, vlong, vlong);
+char* packstr(char*, char*, char*);
+
+/* conversions from in-memory structures to key/value pairs */
+void dir2kv(vlong, Xdir*, Kvp*, char*, int);
+int dir2statbuf(Xdir*, char*, int);
+void dlist2kv(Dlist*, Kvp*, char*, int);
+void lbl2kv(char*, vlong, uint, Kvp*, char*, int);
+void link2kv(vlong, vlong, Kvp*, char*, int);
+void retag2kv(vlong, vlong, int, int, Kvp*, char*, int);
+void tree2kv(Tree*, Kvp*, char*, int);
+
+/* conversions from key/value pairs back to in-memory structures */
+void kv2dir(Kvp*, Xdir*);
+void kv2dlist(Kvp*, Dlist*);
+void kv2link(Kvp*, vlong*, vlong*);
+void kv2qid(Kvp*, Qid*);
+int kv2statbuf(Kvp*, char*, int);
+
+/* serializers: take a buffer and its size */
+char* packarena(char*, int, Arena*);
+char* packbp(char*, int, Bptr*);
+char* packdkey(char*, int, vlong, char*);
+char* packdval(char*, int, Xdir*);
+char* packlbl(char*, int, char*);
+char* packsnap(char*, int, vlong);
+char* packsuper(char*, int, vlong);
+char* packtree(char*, int, Tree*);
+char* packsb(char*, int, Gefs*);
+
+/* deserializers */
+char* unpackarena(Arena*, char*, int);
+Bptr unpackbp(char*, int);
+char* unpackdkey(char*, int, vlong*);
+Tree* unpacktree(Tree*, char*, int);
+char* unpacksb(Gefs*, char*, int);
+char* unpackstr(char*, char*, char**);
+
+/* fmt */
+int Bconv(Fmt*);
+int Mconv(Fmt*);
+int Pconv(Fmt*);
+int Rconv(Fmt*);
+int Kconv(Fmt*);
+int Qconv(Fmt*);
+
+/* pointer channels, defined in fs.c */
+Chan*	mkchan(int);
+void*	chrecv(Chan*);
+void	chsend(Chan*, void*);
+
+/* worker entry points; each takes an int and an opaque argument */
+void	runfs(int, void*);
+void	runmutate(int, void*);
+void	runread(int, void*);
+void	runcons(int, void*);
+void	runtasks(int, void*);
+void	runsync(int, void*);
+void	runsweep(int, void*);
+void	fixfs(void);
--- /dev/null
+++ b/fs.c
@@ -1,0 +1,2796 @@
+#include <u.h>
+#include <libc.h>
+#include <auth.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+#include "atomic.h"
+
+static void respond(Fmsg*, Fcall*);
+static void rerror(Fmsg*, char*, ...);
+static void clunkfid(Conn*, Fid*, Amsg**);
+
+/*
+ * Look up the entry called name in the directory with qid path up,
+ * using tree t. On success the entry's qid and length are stored
+ * through qid and len and 0 is returned; -1 is returned when the
+ * key can not be built or the entry does not exist.
+ */
+int
+walk1(Tree *t, vlong up, char *name, Qid *qid, vlong *len)
+{
+	char *p, kbuf[Keymax], rbuf[Kvmax];
+	Xdir d;
+	Kvp kv;
+	Key k;
+
+	/* same nil check getdent applies to packdkey */
+	if((p = packdkey(kbuf, sizeof(kbuf), up, name)) == nil)
+		return -1;
+	k.k = kbuf;
+	k.nk = p - kbuf;
+	if(!btlookup(t, &k, &kv, rbuf, sizeof(rbuf)))
+		return -1;
+	kv2dir(&kv, &d);
+	*qid = d.qid;
+	*len = d.length;
+	return 0;
+}
+
+/*
+ * Bump the qid version of de and fill msg with an Owstat message
+ * for de's key carrying a single zero flag byte (no fields
+ * selected). Done under de's write lock.
+ */
+static void
+touch(Dent *de, Msg *msg)
+{
+	wlock(de);
+	de->qid.vers++;
+
+	msg->op = Owstat;
+	msg->nk = de->nk;
+	msg->k = de->k;
+	msg->nv = 1;
+	msg->v = "\0";
+	wunlock(de);
+}
+
+/*
+ * Trace the current queue generation, then advance fs->qgen by one
+ * atomically.
+ */
+static void
+wrbarrier(void)
+{
+ tracev("barrier", fs->qgen);
+ aincv(&fs->qgen, 1);
+}
+
+/*
+ * Advance fs->qgen, post one Qfence entry on every syncer queue,
+ * advance fs->qgen again and sleep on fs->syncrz until fs->syncing
+ * has dropped to zero.
+ * NOTE(review): rsleep needs the lock tied to fs->syncrz to be held;
+ * the only caller visible here (sync) holds fs->synclk -- confirm
+ * that is the associated lock.
+ */
+static void
+wrwait(void)
+{
+ Qent qe;
+ int i;
+
+ tracev("wrwait", fs->qgen);
+ aincv(&fs->qgen, 1);
+ fs->syncing = fs->nsyncers;
+ for(i = 0; i < fs->nsyncers; i++){
+ qe.op = Qfence;
+ qe.bp.addr = 0;
+ qe.bp.hash = -1;
+ qe.bp.gen = -1;
+ qe.b = nil;
+ qput(&fs->syncq[i], qe);
+ }
+ aincv(&fs->qgen, 1);
+ while(fs->syncing != 0)
+ rsleep(&fs->syncrz);
+ tracev("flushed", fs->qgen);
+}
+
+/*
+ * sync: write out a consistent checkpoint. Under fs->synclk it
+ * refreshes every mounted snapshot, flushes each arena's log, packs
+ * the arena headers and superblocks, and then enqueues them in
+ * passes separated by write barriers, before freeing the previous
+ * snap deadlist.
+ *
+ * NOTE(review): the waserror handler only releases fs->synclk; an
+ * error raised while fs->mutlk or an arena lock is held would appear
+ * to leave that lock held -- confirm.
+ */
+static void
+sync(void)
+{
+ Mount *mnt;
+ Arena *a;
+ Dlist dl;
+ int i;
+
+ qlock(&fs->synclk);
+ if(waserror()){
+ fprint(2, "failed to sync: %s\n", errmsg());
+ qunlock(&fs->synclk);
+ nexterror();
+ }
+
+ /*
+ * Wait for data that we're syncing to hit disk
+ */
+ tracem("flush1");
+ wrbarrier();
+ /*
+ * pass 0: Update all open snapshots, and
+ * pack the blocks we want to sync. Snap
+ * while holding the write lock, and then
+ * wait until all the blocks they point at
+ * have hit disk; once they're on disk, we
+ * can take a consistent snapshot.
+ */
+ qlock(&fs->mutlk);
+ tracem("packb");
+ for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next)
+ updatesnap(&mnt->root, mnt->root, mnt->name, mnt->flag);
+ /*
+ * Now that we've updated the snaps, we can sync the
+ * dlist; the snap tree will not change from here.
+ */
+ dlsync();
+ /* take over the current snap deadlist and leave an empty one behind */
+ dl = fs->snapdl;
+ fs->snapdl.hd = Zb;
+ fs->snapdl.tl = Zb;
+ fs->snapdl.ins = nil;
+ traceb("syncdl.dl", dl.hd);
+ traceb("syncdl.rb", fs->snap.bp);
+ for(i = 0; i < fs->narena; i++){
+ a = &fs->arenas[i];
+ qlock(a);
+ /*
+ * because the log uses preallocated
+ * blocks, we need to write the log
+ * block out synchronously, or it may
+ * get reused.
+ */
+ logbarrier(a, fs->qgen);
+ flushlog(a);
+
+ packarena(a->h0->data, Blksz, a);
+ packarena(a->h1->data, Blksz, a);
+ finalize(a->h0);
+ finalize(a->h1);
+ fs->arenabp[i] = a->h0->bp;
+ qunlock(a);
+ }
+ assert(fs->snapdl.hd.addr == -1);
+ traceb("packsb.rb", fs->snap.bp);
+ packsb(fs->sb0->buf, Blksz, fs);
+ packsb(fs->sb1->buf, Blksz, fs);
+ finalize(fs->sb0);
+ finalize(fs->sb1);
+ fs->snap.dirty = 0;
+ qunlock(&fs->mutlk);
+
+ /*
+ * pass 1: sync block headers; if we crash here,
+ * the block footers are consistent, and we can
+ * use them.
+ */
+ tracem("arenas0");
+ for(i = 0; i < fs->narena; i++)
+ enqueue(fs->arenas[i].h0);
+ wrbarrier();
+
+ /*
+ * pass 2: sync superblock; we have a consistent
+ * set of block headers, so if we crash, we can
+ * use the loaded block headers; the footers will
+ * get synced after so that we can use them next
+ * time around.
+ */
+ tracem("supers");
+ enqueue(fs->sb0);
+ enqueue(fs->sb1);
+ wrbarrier();
+
+ /*
+ * pass 3: sync block footers; if we crash here,
+ * the block headers are consistent, and we can
+ * use them.
+ */
+ tracem("arenas1");
+ for(i = 0; i < fs->narena; i++)
+ enqueue(fs->arenas[i].h1);
+
+ /*
+ * Pass 4: clean up the old snap tree's deadlist.
+ * we need to wait for all the new data to hit disk
+ * before we can free anything, otherwise it gets
+ * clobbered.
+ */
+ tracem("snapdl");
+ wrwait();
+ freedl(&dl, 1);
+ qunlock(&fs->synclk);
+ tracem("synced");
+ poperror();
+}
+
+/*
+ * Carry out the snapshot request in a: either delete the snapshot
+ * a->old, or create a->new from a->old (as a fork when a->flag has
+ * Lmut set, as a label otherwise). Progress and failure messages
+ * are printed to a->fd unless it is -1.
+ *
+ * *tp is set to nil, except when deleting a snapshot that has a
+ * single label, at most one reference and no successor: then the
+ * tree is returned through *tp with an extra memref taken for the
+ * caller.
+ *
+ * The reference held on the source tree is released on every
+ * return path, including the "snap is mounted" and "already
+ * exists" refusals.
+ */
+static void
+snapfs(Amsg *a, Tree **tp)
+{
+	Tree *t, *s;
+	Mount *mnt;
+
+	if(waserror()){
+		*tp = nil;
+		nexterror();
+	}
+	t = nil;
+	*tp = nil;
+	/* a mounted snapshot is refreshed and used directly */
+	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
+		if(strcmp(a->old, mnt->name) == 0){
+			updatesnap(&mnt->root, mnt->root, mnt->name, mnt->flag);
+			t = agetp(&mnt->root);
+			ainc(&t->memref);
+			break;
+		}
+	}
+	if(t == nil && (t = opensnap(a->old, nil)) == nil){
+		if(a->fd != -1)
+			fprint(a->fd, "snap: open '%s': does not exist\n", a->old);
+		poperror();
+		return;
+	}
+	if(a->delete){
+		/* mnt is only non-nil here when the loop above found a->old mounted */
+		if(mnt != nil) {
+			if(a->fd != -1)
+				fprint(a->fd, "snap: snap is mounted: '%s'\n", a->old);
+			closesnap(t);
+			poperror();
+			return;
+		}
+		if(t->nlbl == 1 && t->nref <= 1 && t->succ == -1){
+			aincl(&t->memref, 1);
+			*tp = t;
+		}
+		delsnap(t, t->succ, a->old);
+	}else{
+		if((s = opensnap(a->new, nil)) != nil){
+			if(a->fd != -1)
+				fprint(a->fd, "snap: already exists '%s'\n", a->new);
+			closesnap(s);
+			closesnap(t);
+			poperror();
+			return;
+		}
+		tagsnap(t, a->new, a->flag);
+	}
+	closesnap(t);
+	poperror();
+	if(a->fd != -1){
+		if(a->delete)
+			fprint(a->fd, "deleted: %s\n", a->old);
+		else if(a->flag & Lmut)
+			fprint(a->fd, "forked: %s from %s\n", a->new, a->old);
+		else
+			fprint(a->fd, "labeled: %s from %s\n", a->new, a->old);
+	}
+}
+
+/*
+ * Fill d with the synthetic entry for the dump root: a read-only
+ * directory named "/" with qid path Qdump, versioned by
+ * fs->nextgen, with zero times and length and all ids set to -1.
+ */
+static void
+filldumpdir(Xdir *d)
+{
+	memset(d, 0, sizeof(Xdir));
+
+	d->qid.type = QTDIR;
+	d->qid.vers = fs->nextgen;
+	d->qid.path = Qdump;
+
+	d->name = "/";
+	d->mode = DMDIR|0555;
+	d->length = 0;
+	d->mtime = 0;
+	d->atime = 0;
+
+	d->muid = -1;
+	d->gid = -1;
+	d->uid = -1;
+}
+
+/*
+ * Validate a file name. Returns nil when the name is acceptable,
+ * Ename when it is empty, is "." or "..", or contains a '/' or a
+ * byte below 0x20, and Elength when no terminator is found within
+ * the first Maxname bytes.
+ */
+static char*
+okname(char *name)
+{
+	char *s;
+	int c, left;
+
+	if(*name == 0 || strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
+		return Ename;
+	s = name;
+	for(left = Maxname; left > 0; left--){
+		c = *s++ & 0xff;
+		if(c == 0)
+			return nil;
+		if(c < 0x20 || c == '/')
+			return Ename;
+	}
+	return Elength;
+}
+
+/*
+ * Allocate a zeroed channel with room for size pointers. All slots
+ * start out available and both cursors point at the first slot.
+ * Allocation failure is fatal.
+ */
+Chan*
+mkchan(int size)
+{
+	Chan *ch;
+
+	ch = mallocz(sizeof(Chan) + size*sizeof(void*), 1);
+	if(ch == nil)
+		sysfatal("create channel");
+	ch->rp = ch->args;
+	ch->wp = ch->args;
+	ch->count = 0;
+	ch->avail = size;
+	ch->size = size;
+	return ch;
+}
+
+/*
+ * Take the next pointer out of channel c, blocking until one is
+ * queued.
+ *
+ * Fast path: when count is nonzero, try to claim an item by
+ * decrementing count with a compare-and-swap. If count is zero or
+ * the swap fails, fall back to semacquire on count. The read
+ * cursor is advanced under rl, and a slot is then released to
+ * senders through avail.
+ */
+void*
+chrecv(Chan *c)
+{
+ void *a;
+ long v;
+
+ v = agetl(&c->count);
+ if(v == 0 || !acasl(&c->count, v, v-1))
+ semacquire(&c->count, 1);
+ lock(&c->rl);
+ a = *c->rp;
+ /* wrap the cursor at the end of the slot array */
+ if(++c->rp >= &c->args[c->size])
+ c->rp = c->args;
+ unlock(&c->rl);
+ semrelease(&c->avail, 1);
+ return a;
+}
+
+/*
+ * Queue pointer m on channel c, blocking while the channel is full.
+ *
+ * Mirror image of chrecv: a free slot is claimed from avail (by
+ * compare-and-swap when possible, by semacquire otherwise), the
+ * write cursor is advanced under wl, and the item is then made
+ * visible to receivers through count.
+ */
+void
+chsend(Chan *c, void *m)
+{
+ long v;
+
+ v = agetl(&c->avail);
+ if(v == 0 || !acasl(&c->avail, v, v-1))
+ semacquire(&c->avail, 1);
+ lock(&c->wl);
+ *c->wp = m;
+ /* wrap the cursor at the end of the slot array */
+ if(++c->wp >= &c->args[c->size])
+ c->wp = c->args;
+ unlock(&c->wl);
+ semrelease(&c->count, 1);
+}
+
+/*
+ * Tear down connection c: log the formatted reason, close both
+ * descriptors and clunk every fid still in the fid table. Any
+ * remove-on-close request a fid produces is forwarded to
+ * fs->admchan. An error raised while clunking one fid is swallowed
+ * and the walk continues with the next fid.
+ *
+ * NOTE(review): clunkfid takes c->fidtablk[h] for the fid's bucket,
+ * which is the lock this loop is already holding, and it asserts
+ * that the caller holds a reference besides the table's. Neither
+ * is addressed here -- confirm how this path is meant to work.
+ */
+static void
+fshangup(Conn *c, char *fmt, ...)
+{
+	char buf[ERRMAX];
+	va_list ap;
+	Amsg *a;
+	Fid *f;
+	int i;
+
+	va_start(ap, fmt);
+	vsnprint(buf, sizeof(buf), fmt, ap);
+	va_end(ap);
+	fprint(2, "hangup: %s\n", buf);
+	close(c->rfd);
+	close(c->wfd);
+	for(i = 0; i < Nfidtab; i++){
+		lock(&c->fidtablk[i]);
+		for(f = c->fidtab[i]; f != nil; f = f->next){
+			lock(f);
+			if(waserror()){
+				unlock(f);
+				continue;
+			}
+			a = nil;
+			clunkfid(c, f, &a);
+			unlock(f);
+			if(a != nil)
+				chsend(fs->admchan, a);
+			/*
+			 * success: just drop the error label. Raising
+			 * here would re-enter the handler above and
+			 * unlock f a second time.
+			 */
+			poperror();
+		}
+		unlock(&c->fidtablk[i]);
+	}
+}
+
+/*
+ * Send reply r for request m and free m. The reply is tagged with
+ * the request's tag, marshalled, and written under the connection's
+ * write lock; a short write hangs up the connection. Finally the
+ * flush queue lock taken for the request is dropped: the write lock
+ * keyed by oldtag for a Tflush, the read lock keyed by tag for
+ * everything else.
+ */
+static void
+respond(Fmsg *m, Fcall *r)
+{
+	uchar buf[Max9p+IOHDRSZ];
+	int len, nsent;
+	Conn *conn;
+
+	r->tag = m->tag;
+	dprint("→ %F\n", r);
+	assert(m->type+1 == r->type || r->type == Rerror);
+	len = convS2M(r, buf, sizeof(buf));
+	if(len == 0)
+		abort();
+
+	conn = m->conn;
+	qlock(&conn->wrlk);
+	nsent = write(conn->wfd, buf, len);
+	qunlock(&conn->wrlk);
+	if(nsent != len)
+		fshangup(conn, Eio);
+
+	if(m->type == Tflush)
+		wunlock(&fs->flushq[ihash(m->oldtag) % Nflushtab]);
+	else
+		runlock(&fs->flushq[ihash(m->tag) % Nflushtab]);
+	free(m);
+}
+
+/*
+ * Answer request m with an Rerror whose message is built from the
+ * printf-style arguments (truncated to fit a 128 byte buffer).
+ */
+static void
+rerror(Fmsg *m, char *fmt, ...)
+{
+	char ename[128];
+	Fcall reply;
+	va_list ap;
+
+	va_start(ap, fmt);
+	vsnprint(ename, sizeof(ename), fmt, ap);
+	va_end(ap);
+
+	reply.type = Rerror;
+	reply.ename = ename;
+	respond(m, &reply);
+}
+
+
+/*
+ * Apply the nm messages in m to the tree mounted at mnt. Raises
+ * Erdonly when the mount is not mutable. The snapshot is refreshed
+ * first unless the root has exactly one label and no references.
+ */
+static void
+upsert(Mount *mnt, Msg *m, int nm)
+{
+	Tree *root;
+
+	if((mnt->flag & Lmut) == 0)
+		error(Erdonly);
+	root = mnt->root;
+	if(root->nlbl != 1 || root->nref != 0)
+		updatesnap(&mnt->root, mnt->root, mnt->name, mnt->flag);
+	/* mnt->root is re-read: updatesnap is handed its address */
+	btupsert(mnt->root, m, nm);
+}
+
+/*
+ * When truncating a file, mutations need
+ * to wait for the sweeper to finish; this
+ * means the mutator needs to release the
+ * mutation lock, exit the epoch, and
+ * allow the sweeper to finish its job
+ * before resuming.
+ *
+ * de is the entry being truncated and id is the epoch id passed
+ * to epochend/epochstart. The caller is expected to hold
+ * fs->mutlk and to be inside epoch id on entry; both are
+ * re-established, in reverse order, before returning.
+ */
+static void
+truncwait(Dent *de, int id)
+{
+ epochend(id);
+ qunlock(&fs->mutlk);
+ qlock(&de->trunclk);
+ /* truncrz is tied to trunclk (see getdent) */
+ while(de->trunc)
+ rsleep(&de->truncrz);
+ qunlock(&de->trunclk);
+ qlock(&fs->mutlk);
+ epochstart(id);
+}
+
+/*
+ * Read up to n bytes of the file behind fid f, starting at offset
+ * o, into d, using tree t. At most the remainder of the block
+ * containing o is read. A block with no data pointer in the tree
+ * reads as zeros. Returns 0 when o is at or beyond the file size
+ * sz, otherwise the number of bytes stored in d.
+ */
+static int
+readb(Tree *t, Fid *f, char *d, vlong o, vlong n, vlong sz)
+{
+	char kbuf[Offksz], vbuf[Offksz+32];
+	vlong blkoff, inoff;
+	Blk *b;
+	Kvp kv;
+	Key k;
+
+	if(o >= sz)
+		return 0;
+
+	/* split the offset into block start and offset within the block */
+	inoff = o & (Blksz-1);
+	blkoff = o & ~(Blksz-1);
+	if(inoff+n > Blksz)
+		n = Blksz-inoff;
+
+	kbuf[0] = Kdat;
+	PACK64(kbuf+1, f->qpath);
+	PACK64(kbuf+9, blkoff);
+	k.k = kbuf;
+	k.nk = sizeof(kbuf);
+
+	if(btlookup(t, &k, &kv, vbuf, sizeof(vbuf))){
+		b = getblk(unpackbp(kv.v, kv.nv), GBraw);
+		memcpy(d, b->buf+inoff, n);
+		dropblk(b);
+	}else
+		memset(d, 0, n);
+	return n;
+}
+
+/*
+ * Write up to n bytes from s at offset o of the file behind fid f
+ * into a freshly allocated data block, touching at most the block
+ * that contains o.
+ *
+ * The key of message m is filled in for that block and its value is
+ * set to the new block's pointer, which is also returned through
+ * ret. When the write covers only part of a block that already
+ * exists below the file size sz, the old contents are copied in
+ * first; otherwise the bytes around the written range are zeroed.
+ * Returns the number of bytes taken from s.
+ */
+static int
+writeb(Fid *f, Msg *m, Bptr *ret, char *s, vlong o, vlong n, vlong sz)
+{
+	char kvbuf[Kvmax];
+	vlong blkoff, inoff;
+	Blk *nb, *old;
+	int merged;
+	Tree *root;
+	Bptr bp;
+	Kvp kv;
+
+	inoff = o & (Blksz-1);
+	blkoff = o & ~(Blksz-1);
+
+	m->k[0] = Kdat;
+	PACK64(m->k+1, f->qpath);
+	PACK64(m->k+9, blkoff);
+
+	/* the last argument is set when the write reaches the end of the block */
+	nb = newdblk(f->mnt->root, f->qpath, (inoff+n >= Blksz) ? 1 : 0);
+	root = f->mnt->root;
+	merged = 0;
+	if(btlookup(root, m, &kv, kvbuf, sizeof(kvbuf))){
+		bp = unpackbp(kv.v, kv.nv);
+		/* partial overwrite of live data: start from the old block */
+		if(blkoff < sz && (inoff != 0 || n != Blksz)){
+			old = getblk(bp, GBraw);
+			memcpy(nb->buf, old->buf, Blksz);
+			dropblk(old);
+			merged = 1;
+		}
+	}
+	if(inoff+n > Blksz)
+		n = Blksz-inoff;
+	memcpy(nb->buf+inoff, s, n);
+	if(!merged){
+		if(inoff > 0)
+			memset(nb->buf, 0, inoff);
+		if(inoff+n < Blksz)
+			memset(nb->buf+inoff+n, 0, Blksz-inoff-n);
+	}
+	enqueue(nb);
+
+	packbp(m->v, m->nv, &nb->bp);
+	*ret = nb->bp;
+	dropblk(nb);
+	return n;
+}
+
+/*
+ * Return the in-memory directory entry for d on mount mnt, taking
+ * a reference. An existing entry with the same qid path is reused;
+ * otherwise a new one is created from d with parent qid pqid, its
+ * key is packed into the entry's own buffer, and it is linked into
+ * the mount's entry table. Returns nil when the key can not be
+ * packed. The table is locked for the duration.
+ */
+static Dent*
+getdent(Mount *mnt, vlong pqid, Xdir *d)
+{
+	Dent *de;
+	char *end;
+	u32int h;
+
+	h = ihash(d->qid.path) % Ndtab;
+	lock(&mnt->dtablk);
+	for(de = mnt->dtab[h]; de != nil; de = de->next)
+		if(de->qid.path == d->qid.path)
+			break;
+
+	if(de != nil){
+		ainc(&de->ref);
+	}else{
+		de = emalloc(sizeof(Dent), 1);
+		de->Xdir = *d;
+		de->ref = 1;
+		de->up = pqid;
+		de->qid = d->qid;
+		de->length = d->length;
+		de->truncrz.l = &de->trunclk;
+
+		end = packdkey(de->buf, sizeof(de->buf), pqid, d->name);
+		if(end == nil){
+			free(de);
+			de = nil;
+		}else{
+			de->k = de->buf;
+			de->nk = end - de->buf;
+			/* the name is stored at offset 11 of the packed key */
+			de->name = de->buf + 11;
+			de->next = mnt->dtab[h];
+			mnt->dtab[h] = de;
+		}
+	}
+	unlock(&mnt->dtablk);
+	return de;
+}
+
+/*
+ * Fill mnt's tables of automatic snapshot names by scanning the
+ * snapshot tree for labels starting with "<name>@minute." and
+ * "<name>@hour.". Only labels with the Lauto flag are recorded;
+ * the tables are filled round-robin over 60 and 24 slots.
+ */
+static void
+loadautos(Mount *mnt)
+{
+	char pfx[128];
+	int mi, hi, npfx;
+	uint flg;
+	Scan s;
+
+	mi = 0;
+	hi = 0;
+
+	pfx[0] = Klabel;
+	npfx = snprint(pfx+1, sizeof(pfx)-1, "%s@minute.", mnt->name);
+	btnewscan(&s, pfx, npfx+1);
+	btenter(&fs->snap, &s);
+	while(btnext(&s, &s.kv)){
+		flg = UNPACK32(s.kv.v+1+8);
+		if((flg & Lauto) == 0)
+			continue;
+		/* store the key without its leading type byte, terminated */
+		memcpy(mnt->minutely[mi], s.kv.k+1, s.kv.nk-1);
+		mnt->minutely[mi][s.kv.nk-1] = 0;
+		mi = (mi+1)%60;
+	}
+	btexit(&s);
+
+	pfx[0] = Klabel;
+	npfx = snprint(pfx+1, sizeof(pfx)-1, "%s@hour.", mnt->name);
+	btnewscan(&s, pfx, npfx+1);
+	btenter(&fs->snap, &s);
+	while(btnext(&s, &s.kv)){
+		flg = UNPACK32(s.kv.v+1+8);
+		if((flg & Lauto) == 0)
+			continue;
+		memcpy(mnt->hourly[hi], s.kv.k+1, s.kv.nk-1);
+		mnt->hourly[hi][s.kv.nk-1] = 0;
+		hi = (hi+1)%24;
+	}
+	btexit(&s);
+}
+
+/*
+ * Return the mount for snapshot name, taking a reference.
+ *
+ * "dump" maps to the shared fs->snapmnt. Otherwise the list of
+ * mounts is searched under fs->mountlk, and when the name is not
+ * mounted yet the snapshot is opened and a new mount is put at the
+ * head of the list.
+ *
+ * Raises Enomem when the mount can not be allocated and Enosnap
+ * when the snapshot does not exist; fs->mountlk is released before
+ * either error propagates.
+ */
+Mount *
+getmount(char *name)
+{
+	Mount *mnt;
+	Tree *t;
+	int flg;
+
+	if(strcmp(name, "dump") == 0){
+		ainc(&fs->snapmnt->ref);
+		return fs->snapmnt;
+	}
+
+	qlock(&fs->mountlk);
+	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
+		if(strcmp(name, mnt->name) == 0){
+			ainc(&mnt->ref);
+			goto Out;
+		}
+	}
+
+	if((mnt = mallocz(sizeof(*mnt), 1)) == nil){
+		/* no handler is installed yet: drop the lock by hand */
+		qunlock(&fs->mountlk);
+		error(Enomem);
+	}
+	if(waserror()){
+		qunlock(&fs->mountlk);
+		free(mnt);
+		nexterror();
+	}
+	mnt->ref = 1;
+	snprint(mnt->name, sizeof(mnt->name), "%s", name);
+	if((t = opensnap(name, &flg)) == nil)
+		error(Enosnap);
+	loadautos(mnt);
+	mnt->flag = flg;
+	mnt->root = t;
+	mnt->next = fs->mounts;
+	asetp(&fs->mounts, mnt);
+	poperror();
+
+Out:
+	qunlock(&fs->mountlk);
+	return mnt;
+}
+
+/*
+ * Drop a reference to mnt (nil is ignored). When the last
+ * reference goes, the mount is unlinked from the list of mounts
+ * under fs->mountlk and handed to limbo for deferred release.
+ */
+void
+clunkmount(Mount *mnt)
+{
+	Mount *me, **link;
+
+	if(mnt == nil || adec(&mnt->ref) != 0)
+		return;
+
+	qlock(&fs->mountlk);
+	link = &fs->mounts;
+	while((me = *link) != nil && me != mnt)
+		link = &me->next;
+	assert(me != nil);
+	*link = me->next;
+	limbo(DFmnt, me);
+	qunlock(&fs->mountlk);
+}
+
+/*
+ * Drop a reference to directory entry de (nil is ignored) and free
+ * it when that was the last one.
+ *
+ * Entries whose qid type has QTAUTH set are handled entirely by the
+ * first branch: they are dereferenced exactly once and neither mnt
+ * nor its entry table is touched. All other entries are
+ * dereferenced under mnt->dtablk and unlinked from the table
+ * before being freed.
+ */
+static void
+clunkdent(Mount *mnt, Dent *de)
+{
+	Dent *e, **pe;
+	u32int h;
+
+	if(de == nil)
+		return;
+	if(de->qid.type & QTAUTH){
+		/*
+		 * return even when references remain; falling through
+		 * would decrement the count a second time.
+		 */
+		if(adec(&de->ref) == 0)
+			free(de);
+		return;
+	}
+	lock(&mnt->dtablk);
+	if(adec(&de->ref) != 0)
+		goto Out;
+	h = ihash(de->qid.path) % Ndtab;
+	pe = &mnt->dtab[h];
+	for(e = mnt->dtab[h]; e != nil; e = e->next){
+		if(e == de)
+			break;
+		pe = &e->next;
+	}
+	assert(e != nil);
+	*pe = e->next;
+	free(de);
+Out:
+	unlock(&mnt->dtablk);
+}
+
+/*
+ * Find fid number fid on connection c. Returns the Fid with an
+ * extra reference taken, or nil when the connection has no such
+ * fid. The hash bucket is locked during the search.
+ */
+static Fid*
+getfid(Conn *c, u32int fid)
+{
+	Lock *lk;
+	u32int h;
+	Fid *f;
+
+	h = ihash(fid) % Nfidtab;
+	lk = &c->fidtablk[h];
+	lock(lk);
+	f = c->fidtab[h];
+	while(f != nil && f->fid != fid)
+		f = f->next;
+	if(f != nil)
+		ainc(&f->ref);
+	unlock(lk);
+	return f;
+}
+
+/*
+ * Drop a reference to f. The last reference releases the fid's
+ * directory entries and mount and frees the fid itself.
+ */
+static void
+putfid(Fid *f)
+{
+	if(adec(&f->ref) == 0){
+		clunkdent(f->mnt, f->dent);
+		clunkdent(f->mnt, f->dir);
+		clunkmount(f->mnt);
+		free(f);
+	}
+}
+
+/*
+ * Create fid number new on connection c as a copy of f.
+ *
+ * The copy starts with two references (see the comment below), an
+ * unopened mode of -1, and is inserted into the fid table unless
+ * that number is already in use. Returns the new Fid, or nil when
+ * allocation fails or the number is taken.
+ *
+ * NOTE(review): the struct copy also duplicates f's embedded Lock
+ * and its scan, rclose and auth pointers; and the mount/dent/dir
+ * references are taken only after the copy is visible in the
+ * table. Whether callers guarantee this is safe is not visible
+ * here -- confirm.
+ */
+static Fid*
+dupfid(Conn *c, u32int new, Fid *f)
+{
+ Fid *n, *o;
+ u32int h;
+
+ h = ihash(new) % Nfidtab;
+ if((n = malloc(sizeof(Fid))) == nil)
+ return nil;
+
+ *n = *f;
+ n->fid = new;
+ n->ref = 2; /* one for dup, one for clunk */
+ n->mode = -1;
+ n->next = nil;
+
+ /* insert only if no fid with this number exists in the bucket */
+ lock(&c->fidtablk[h]);
+ for(o = c->fidtab[h]; o != nil; o = o->next)
+ if(o->fid == new)
+ break;
+ if(o == nil){
+ n->next = c->fidtab[h];
+ c->fidtab[h] = n;
+ }
+ unlock(&c->fidtablk[h]);
+
+ if(o != nil){
+ fprint(2, "fid in use: %d == %d\n", o->fid, new);
+ free(n);
+ return nil;
+ }
+ /* the copy shares f's mount and entries, so it needs its own references */
+ if(n->mnt != nil)
+ ainc(&n->mnt->ref);
+ ainc(&n->dent->ref);
+ ainc(&n->dir->ref);
+ setmalloctag(n, getcallerpc(&c));
+ return n;
+}
+
+/*
+ * Remove fid from connection c's fid table and drop the table's
+ * reference to it. The caller must hold a reference of its own,
+ * so the count may not reach zero here; fid must be in the table.
+ *
+ * Any scan in progress is freed. If the fid carries a
+ * remove-on-close message, the entry is marked as being truncated
+ * and gone, extra references are taken on the entry and the mount,
+ * and the message is filled in as an AOrclose request and returned
+ * through ao for the caller to submit.
+ */
+static void
+clunkfid(Conn *c, Fid *fid, Amsg **ao)
+{
+	Fid *f, **pf;
+	u32int h;
+	long ref;
+
+	h = ihash(fid->fid) % Nfidtab;
+	lock(&c->fidtablk[h]);
+	pf = &c->fidtab[h];
+	for(f = c->fidtab[h]; f != nil; f = f->next){
+		if(f == fid){
+			/* decrement outside of the assert expression */
+			ref = adec(&f->ref);
+			assert(ref != 0);
+			*pf = f->next;
+			break;
+		}
+		pf = &f->next;
+	}
+	assert(f != nil);
+	if(f->scan != nil){
+		free(f->scan);
+		f->scan = nil;
+	}
+	if(f->rclose != nil){
+		*ao = f->rclose;
+
+		qlock(&f->dent->trunclk);
+		f->dent->trunc = 1;
+		qunlock(&f->dent->trunclk);
+
+		wlock(f->dent);
+		f->dent->gone = 1;
+		wunlock(f->dent);
+
+		/* the request keeps the entry and the mount alive */
+		aincl(&f->dent->ref, 1);
+		aincl(&f->mnt->ref, 1);
+		(*ao)->op = AOrclose;
+		(*ao)->mnt = f->mnt;
+		(*ao)->qpath = f->qpath;
+		(*ao)->off = 0;
+		(*ao)->end = f->dent->length;
+		(*ao)->dent = f->dent;
+	}
+	unlock(&c->fidtablk[h]);
+}
+
+/*
+ * Read one size-prefixed 9P message from c->rfd into a freshly
+ * allocated Fmsg and return it through *pm.
+ *
+ * Returns 0 on success. Returns the readn result (0 or negative)
+ * with *pm set to nil when the size prefix cannot be read at all,
+ * and -1 with the error string set on a short read, an out-of-range
+ * size, or allocation failure.
+ */
+static int
+readmsg(Conn *c, Fmsg **pm)
+{
+	/* unsigned, so bytes >= 0x80 are not sign-extended when decoded */
+	uchar szbuf[4];
+	int sz, n;
+	Fmsg *m;
+
+	n = readn(c->rfd, szbuf, 4);
+	if(n <= 0){
+		*pm = nil;
+		return n;
+	}
+	if(n != 4){
+		werrstr("short read: %r");
+		return -1;
+	}
+	sz = GBIT32(szbuf);
+	/*
+	 * the size comes off the wire and counts its own 4 bytes:
+	 * anything smaller (or negative after conversion to int)
+	 * would undersize the buffer and the read below.
+	 */
+	if(sz < 4){
+		werrstr("message size too small");
+		return -1;
+	}
+	if(sz > c->iounit){
+		werrstr("message size too large");
+		return -1;
+	}
+	if((m = malloc(sizeof(Fmsg)+sz)) == nil)
+		return -1;
+	if(readn(c->rfd, m->buf+4, sz-4) != sz-4){
+		werrstr("short read: %r");
+		free(m);
+		return -1;
+	}
+	m->conn = c;
+	m->sz = sz;
+	/* rebuild the size prefix so m->buf holds the whole message */
+	PBIT32(m->buf, sz);
+	*pm = m;
+	return 0;
+}
+
+/*
+ * Answer Tversion. Any ".suffix" on the requested version string is
+ * cut off in place. For "9P2000" the message size is the smaller of
+ * the client's and ours, and the connection is marked versioned with
+ * that size as its iounit; any other version is answered with
+ * "unknown" and the connection is marked unversioned.
+ */
+static void
+fsversion(Fmsg *m)
+{
+	Conn *c;
+	Fcall r;
+	char *dot;
+
+	c = m->conn;
+	memset(&r, 0, sizeof(r));
+	r.type = Rversion;
+	r.msize = Max9p + IOHDRSZ;
+	if((dot = strchr(m->version, '.')) != nil)
+		*dot = '\0';
+	if(strcmp(m->version, "9P2000") != 0){
+		r.version = "unknown";
+		c->versioned = 0;
+	}else{
+		if(m->msize < r.msize)
+			r.msize = m->msize;
+		r.version = "9P2000";
+		c->versioned = 1;
+		c->iounit = r.msize;
+	}
+	respond(m, &r);
+}
+
+/*
+ * Close the rpc file descriptor and free the rpc state.
+ * A nil argument is ignored.
+ */
+void
+authfree(AuthRpc *auth)
+{
+	if(auth == nil)
+		return;
+	close(auth->afd);
+	auth_freerpc(auth);
+}
+
+/*
+ * Open a new factotum rpc conversation and start it as a p9any
+ * server. If /mnt/factotum is not accessible, /srv/factotum is
+ * mounted on /mnt first (best effort). Returns nil on any failure.
+ */
+AuthRpc*
+authnew(void)
+{
+	static char *keyspec = "proto=p9any role=server";
+	AuthRpc *rpc;
+	int fd;
+
+	if(access("/mnt/factotum", 0) < 0){
+		fd = open("/srv/factotum", ORDWR);
+		if(fd >= 0)
+			mount(fd, -1, "/mnt", MBEFORE, "");
+	}
+	fd = open("/mnt/factotum/rpc", ORDWR);
+	if(fd < 0)
+		return nil;
+	rpc = auth_allocrpc(fd);
+	if(rpc == nil){
+		close(fd);
+		return nil;
+	}
+	if(auth_rpc(rpc, "start", keyspec, strlen(keyspec)) == ARok)
+		return rpc;
+	authfree(rpc);
+	return nil;
+}
+
+/*
+ * Advance the factotum conversation attached to an auth fid by one
+ * "read" step.
+ *
+ *  ARok:    the pending protocol data (rpc->narg bytes at rpc->arg)
+ *           is copied into data and r->count is set; Eauthd is
+ *           raised when count is too small to hold it.
+ *  ARdone:  the authenticated user name (ai->cuid) is looked up and
+ *           its id is stored in f->uid; Enouser is raised when the
+ *           user is unknown.
+ *  ARphase, or ARdone without auth info: Eauthph is raised.
+ *  anything else: Eauthp is raised.
+ *
+ * Etype is raised when the fid carries no auth rpc.
+ */
+static void
+authread(Fid *f, Fcall *r, void *data, vlong count)
+{
+ AuthInfo *ai;
+ AuthRpc *rpc;
+ User *u;
+
+ if((rpc = f->auth) == nil)
+ error(Etype);
+
+ switch(auth_rpc(rpc, "read", nil, 0)){
+ default:
+ /* NOTE(review): relies on error() not returning, else this falls into ARdone */
+ error(Eauthp);
+ case ARdone:
+ if((ai = auth_getinfo(rpc)) == nil)
+ goto Phase;
+ rlock(&fs->userlk);
+ u = name2user(ai->cuid);
+ auth_freeAI(ai);
+ if(u == nil){
+ /* drop the user lock before raising */
+ runlock(&fs->userlk);
+ error(Enouser);
+ }
+ f->uid = u->id;
+ runlock(&fs->userlk);
+ return;
+ case ARok:
+ if(count < rpc->narg)
+ error(Eauthd);
+ memmove(data, rpc->arg, rpc->narg);
+ r->count = rpc->narg;
+ return;
+ case ARphase:
+ Phase:
+ error(Eauthph);
+ }
+}
+
+/*
+ * Feed count bytes of client data to the factotum conversation on an
+ * auth fid and fill in the Rwrite reply. Raises Etype when the fid
+ * has no auth rpc and Ebotch when factotum rejects the write.
+ */
+static void
+authwrite(Fid *f, Fcall *r, void *data, vlong count)
+{
+	AuthRpc *rpc;
+
+	rpc = f->auth;
+	if(rpc == nil)
+		error(Etype);
+	if(auth_rpc(rpc, "write", data, count) != ARok)
+		error(Ebotch);
+	r->count = count;
+	r->type = Rwrite;
+}
+
+/*
+ * Answer Tauth: create an auth fid (m->afid) backed by a fresh
+ * QTAUTH dent with its own qid path and a new factotum conversation.
+ * Auth fids have no mount. The request is refused with Eauth when
+ * authentication is disabled, Enone for user "none", Enomem when the
+ * dent cannot be allocated and Efid when the fid cannot be created.
+ */
+static void
+fsauth(Fmsg *m)
+{
+	Dent *de;
+	Fcall r;
+	Fid f;
+
+	if(fs->noauth){
+		rerror(m, Eauth);
+		return;
+	}
+	if(strcmp(m->uname, "none") == 0){
+		rerror(m, Enone);
+		return;
+	}
+	/* mallocz hands back zeroed memory, so no further clearing is needed */
+	if((de = mallocz(sizeof(Dent), 1)) == nil){
+		rerror(m, Enomem);
+		return;
+	}
+	de->ref = 0;
+	de->qid.type = QTAUTH;
+	de->qid.path = aincv(&fs->nextqid, 1);
+	de->qid.vers = 0;
+	de->length = 0;
+	de->k = nil;
+	de->nk = 0;
+
+	memset(&f, 0, sizeof(Fid));
+	f.fid = NOFID;
+	f.mnt = nil;
+	f.qpath = de->qid.path;
+	f.pqpath = de->qid.path;
+	f.mode = -1;
+	f.iounit = m->conn->iounit;
+	f.dent = de;
+	f.dir = de;
+	f.uid = -1;
+	f.duid = -1;
+	f.dgid = -1;
+	f.dmode = 0600;
+	f.auth = authnew();
+	if(dupfid(m->conn, m->afid, &f) == nil){
+		rerror(m, Efid);
+		/* nothing took ownership: release the rpc (nil is fine) and the dent */
+		authfree(f.auth);
+		free(de);
+		return;
+	}
+	r.type = Rauth;
+	r.aqid = de->qid;
+	respond(m, &r);
+}
+
+/*
+ * Report whether user uid belongs to group gid: true when both ids
+ * resolve and either they name the same user or uid is listed among
+ * the group's members. The user table is read-locked for the lookup.
+ */
+static int
+ingroup(int uid, int gid)
+{
+	User *usr, *grp;
+	int i, found;
+
+	found = 0;
+	rlock(&fs->userlk);
+	usr = uid2user(uid);
+	grp = uid2user(gid);
+	if(usr != nil && grp != nil){
+		if(usr->id == grp->id)
+			found = 1;
+		else{
+			for(i = 0; i < grp->nmemb && !found; i++)
+				if(grp->memb[i] == usr->id)
+					found = 1;
+		}
+	}
+	runlock(&fs->userlk);
+	return found;
+}
+
+/*
+ * Report whether uid leads group gid. A group whose lead field is 0
+ * is led by each of its members; otherwise only the user named by
+ * the lead field counts. An unknown gid yields 0.
+ */
+static int
+groupleader(int uid, int gid)
+{
+	User *grp;
+	int i, islead;
+
+	islead = 0;
+	rlock(&fs->userlk);
+	grp = uid2user(gid);
+	if(grp != nil){
+		if(grp->lead != 0){
+			if(grp->lead == uid)
+				islead = 1;
+		}else{
+			for(i = 0; i < grp->nmemb && !islead; i++)
+				if(grp->memb[i] == uid)
+					islead = 1;
+		}
+	}
+	runlock(&fs->userlk);
+	return islead;
+}
+
+/*
+ * Translate a 9P open mode into the DM* permission bits it needs.
+ * The low four bits select the access type (unrecognized values
+ * need nothing); OTRUNC additionally requires write permission.
+ */
+static int
+mode2bits(int req)
+{
+	int acc, bits;
+
+	acc = req & 0xf;
+	if(acc == OREAD)
+		bits = DMREAD;
+	else if(acc == OWRITE)
+		bits = DMWRITE;
+	else if(acc == ORDWR)
+		bits = DMREAD|DMWRITE;
+	else if(acc == OEXEC)
+		bits = DMREAD|DMEXEC;
+	else
+		bits = 0;
+	if(req & OTRUNC)
+		bits |= DMWRITE;
+	return bits;
+}
+
+/*
+ * Check whether fid f may access a file with mode fmode, owner fuid
+ * and group fgid for the permission bits m. Returns 0 when access is
+ * granted and -1 otherwise.
+ *
+ * Fids marked permit always pass. Owner and group bits are only
+ * consulted for users other than none. The "other" bits grant
+ * access for directory search, or when the user is not a member of
+ * nogroupid.
+ */
+static int
+fsaccess(Fid *f, ulong fmode, int fuid, int fgid, int m)
+{
+	if(f->permit)
+		return 0;
+	/* uid none gets only other permissions */
+	if(f->uid != noneid){
+		if(f->uid == fuid && (m & (fmode>>6)) == m)
+			return 0;
+		if(ingroup(f->uid, fgid) && (m & (fmode>>3)) == m)
+			return 0;
+	}
+	if((m & fmode) != m)
+		return -1;
+	if((fmode & DMDIR) && (m == DMEXEC))
+		return 0;
+	if(!ingroup(f->uid, nogroupid))
+		return 0;
+	return -1;
+}
+
+/*
+ * Answer Tattach: resolve the snapshot named by aname (default
+ * "main"; a leading '%' requests a permissive attach), check the
+ * user against the auth fid when one is given, look up the root
+ * directory and create fid m->fid pointing at it.
+ *
+ * Errors are reported to the client with rerror. Both the success
+ * and the error path fall through Err, which drops the mount and
+ * dent references taken by the lookups here; the new fid holds the
+ * references that dupfid took for it.
+ */
+static void
+fsattach(Fmsg *m)
+{
+	char dbuf[Kvmax], kvbuf[Kvmax];
+	char *p, *n, *aname;
+	Mount *mnt;
+	Dent *de;
+	Tree *t;
+	User *u;
+	Fcall r;
+	Xdir d;
+	Kvp kv;
+	Key dk;
+	Fid f, *af;
+	int uid, auid;
+
+	de = nil;
+	mnt = nil;
+	if(waserror()){
+		rerror(m, errmsg());
+		goto Err;
+	}
+	aname = m->aname;
+	if(aname[0] == '%')
+		aname++;
+	if(aname[0] == '\0')
+		aname = "main";
+	if((mnt = getmount(aname)) == nil)
+		error(Enosnap);
+
+	rlock(&fs->userlk);
+	n = m->uname;
+	/*
+	 * to allow people to add themselves to the user file,
+	 * we need to force the user id to one that exists.
+	 */
+	if(permissive && strcmp(aname, "adm") == 0)
+		n = "adm";
+	if((u = name2user(n)) == nil){
+		runlock(&fs->userlk);
+		error(Enouser);
+	}
+	uid = u->id;
+	runlock(&fs->userlk);
+
+	if(m->afid != NOFID){
+		r.data = nil;
+		r.count = 0;
+		if((af = getfid(m->conn, m->afid)) == nil)
+			error(Enofid);
+		/* authread may raise: don't leak our reference to af */
+		if(waserror()){
+			putfid(af);
+			nexterror();
+		}
+		authread(af, &r, nil, 0);
+		poperror();
+		/* read the uid before the reference goes away */
+		auid = af->uid;
+		putfid(af);
+		if(auid != uid)
+			error(Ebadu);
+	}else if(!fs->noauth && strcmp(m->uname, "none") != 0)
+		error(Ebadu);
+
+	/*
+	 * NOTE(review): this compares the unstripped m->aname while the
+	 * fromdump test below uses aname; confirm "%dump" is meant to
+	 * take the tree lookup path.
+	 */
+	if(strcmp(m->aname, "dump") == 0){
+		memset(&d, 0, sizeof(d));
+		filldumpdir(&d);
+	}else{
+		if((p = packdkey(dbuf, sizeof(dbuf), -1ULL, "")) == nil)
+			error(Elength);
+		dk.k = dbuf;
+		dk.nk = p - dbuf;
+		t = agetp(&mnt->root);
+		if(!btlookup(t, &dk, &kv, kvbuf, sizeof(kvbuf)))
+			error(Enosnap);
+		kv2dir(&kv, &d);
+	}
+	de = getdent(mnt, -1, &d);
+	memset(&f, 0, sizeof(Fid));
+	f.fid = NOFID;
+	f.mnt = mnt;
+	f.qpath = d.qid.path;
+	f.pqpath = d.qid.path;
+	f.mode = -1;
+	f.iounit = m->conn->iounit;
+	f.dent = de;
+	f.dir = de;
+	f.uid = uid;
+	f.duid = d.uid;
+	f.dgid = d.gid;
+	f.dmode = d.mode;
+	if(m->aname[0] == '%'){
+		/* permissive attaches are reserved for members of adm */
+		if(!permissive && !ingroup(uid, admid))
+			error(Eperm);
+		f.permit = 1;
+	}
+	if(strcmp(aname, "dump") == 0)
+		f.fromdump = 1;
+	if(dupfid(m->conn, m->fid, &f) == nil)
+		error(Efid);
+
+	r.type = Rattach;
+	r.qid = d.qid;
+	respond(m, &r);
+	poperror();
+
+
+Err:	clunkdent(mnt, de);
+	clunkmount(mnt);
+}
+
+/*
+ * Look up the super entry for directory `up` in tree t and decode
+ * it: *name is set to the name unpacked from the entry's value and
+ * *qpath to the qid path unpacked with it. buf/nbuf hold the looked
+ * up value. Raises Esrch when the entry is missing; returns 1
+ * otherwise.
+ */
+static int
+findparent(Tree *t, vlong up, vlong *qpath, char **name, char *buf, int nbuf)
+{
+	char kbuf[Keymax], *end;
+	Key sk;
+	Kvp kv;
+
+	end = packsuper(kbuf, sizeof(kbuf), up);
+	sk.k = kbuf;
+	sk.nk = end - kbuf;
+	if(btlookup(t, &sk, &kv, buf, nbuf) == 0)
+		error(Esrch);
+	*name = unpackdkey(kv.v, kv.nv, qpath);
+	return 1;
+}
+
+/*
+ * Pack the directory key for `name` under parent `up` into buf and
+ * point k at the packed bytes.
+ */
+static void
+dkey(Key *k, vlong up, char *name, char *buf, int nbuf)
+{
+	char *end;
+
+	end = packdkey(buf, nbuf, up, name);
+	k->nk = end - buf;
+	k->k = buf;
+}
+
+/*
+ * Answer Twalk: walk m->nwname path elements starting at fid m->fid,
+ * recording the qid of every element reached. The walk stops early
+ * at the first element that cannot be searched or found; if not even
+ * the first element can be walked, Esrch is returned. Only a
+ * complete walk creates m->newfid (when it differs from m->fid) and
+ * moves the resulting fid onto the final file.
+ */
+static void
+fswalk(Fmsg *m)
+{
+	char *name, kbuf[Maxent], kvbuf[Kvmax];
+	int duid, dgid, dmode;
+	vlong up, upup, prev;
+	Dent *dent, *dir;
+	Fid *o, *f;
+	Mount *mnt;
+	Amsg *ao;
+	Tree *t;
+	Fcall r;
+	Xdir d;
+	Kvp kv;
+	Key k;
+	int i;
+
+	if((o = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	if(waserror()){
+		rerror(m, errmsg());
+		putfid(o);
+		return;
+	}
+	if(o->mode != -1)
+		error(Einuse);
+	/*
+	 * fids made by fsauth carry no mount: they may be cloned
+	 * (nwname == 0) but there is no tree to walk them in.
+	 */
+	mnt = o->mnt;
+	t = nil;
+	if(mnt != nil)
+		t = mnt->root;
+	else if(m->nwname != 0)
+		error(Esrch);
+	up = o->pqpath;
+	prev = o->qpath;
+	rlock(o->dent);
+	d = *o->dent;
+	runlock(o->dent);
+	duid = d.uid;
+	dgid = d.gid;
+	dmode = d.mode;
+	r.type = Rwalk;
+	for(i = 0; i < m->nwname; i++){
+		name = m->wname[i];
+		if(strlen(name) > Maxname)
+			error(Elength);
+		/* need search permission on the directory being left */
+		if(fsaccess(o, d.mode, d.uid, d.gid, DMEXEC) != 0)
+			break;
+		if(strcmp(name, "..") == 0){
+			if(up == -1 && o->fromdump){
+				/* ".." from a snapshot root leads back to the dump directory */
+				mnt = fs->snapmnt;
+				filldumpdir(&d);
+				prev = -1ULL;
+				up = -1ULL;
+				r.wqid[i] = d.qid;
+				continue;
+			}
+			findparent(t, up, &prev, &name, kbuf, sizeof(kbuf));
+		}else if(d.qid.path == Qdump){
+			/*
+			 * entries of the dump directory are snapshots; an
+			 * unknown name ends the walk like a missing file.
+			 */
+			if((mnt = getmount(m->wname[i])) == nil)
+				break;
+			name = "";
+			prev = -1ULL;
+			t = mnt->root;
+		}
+		up = prev;
+		duid = d.uid;
+		dgid = d.gid;
+		dmode = d.mode;
+		dkey(&k, prev, name, kbuf, sizeof(kbuf));
+		if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf)))
+			break;
+		kv2dir(&kv, &d);
+		prev = d.qid.path;
+		r.wqid[i] = d.qid;
+	}
+	r.nwqid = i;
+	if(i == 0 && m->nwname != 0)
+		error(Esrch);
+	f = o;
+	if(m->fid != m->newfid && i == m->nwname){
+		if((f = dupfid(m->conn, m->newfid, o)) == nil)
+			error(Efid);
+		/*
+		 * NOTE(review): if the block below raises, the outer
+		 * handler calls putfid(o) a second time; confirm.
+		 */
+		putfid(o);
+	}
+	if(i > 0 && i == m->nwname){
+		lock(f);
+		ao = nil;
+		if(waserror()){
+			/* take a freshly created fid back out of the table */
+			if(f != o)
+				clunkfid(m->conn, f, &ao);
+			assert(ao == nil);
+			unlock(f);
+			nexterror();
+		}
+		if(up == -1ULL){
+			/* the root contains itself, I guess */
+			dent = getdent(mnt, up, &d);
+			dir = getdent(mnt, up, &d);
+		}else{
+			dent = getdent(mnt, up, &d);
+			findparent(t, up, &upup, &name, kbuf, sizeof(kbuf));
+			dkey(&k, upup, name, kbuf, sizeof(kbuf));
+			if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf)))
+				broke("missing parent");
+			kv2dir(&kv, &d);
+			dir = getdent(mnt, upup, &d);
+		}
+		/* swap the fid's old dent/dir/mount references for the new ones */
+		clunkdent(f->mnt, f->dent);
+		clunkdent(f->mnt, f->dir);
+		if(mnt != f->mnt){
+			clunkmount(f->mnt);
+			ainc(&mnt->ref);
+			f->mnt = mnt;
+		}
+		f->qpath = r.wqid[i-1].path;
+		f->pqpath = up;
+		f->dent = dent;
+		f->dir = dir;
+		f->duid = duid;
+		f->dgid = dgid;
+		f->dmode = dmode;
+		poperror();
+		unlock(f);
+	}
+	respond(m, &r);
+	poperror();
+	putfid(f);
+}
+
+/*
+ * Answer Tstat: pack the fid's directory entry into a stat buffer
+ * and send it back. Replies Enofid for an unknown fid and Efs when
+ * the entry cannot be packed.
+ */
+static void
+fsstat(Fmsg *m)
+{
+	char buf[STATMAX];
+	Fcall r;
+	Fid *f;
+	int n;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	if(waserror()){
+		rerror(m, errmsg());
+		putfid(f);
+		return;
+	}
+	rlock(f->dent);
+	n = dir2statbuf(f->dent, buf, sizeof(buf));
+	/* release the dent before raising: the handler above does not */
+	runlock(f->dent);
+	if(n == -1)
+		error(Efs);
+	r.type = Rstat;
+	r.stat = (uchar*)buf;
+	r.nstat = n;
+	respond(m, &r);
+	poperror();
+	putfid(f);
+}
+
+/*
+ * Answer Twstat. The requested changes are validated against the
+ * current directory entry, an Owstat payload is built in opbuf,
+ * permissions are checked, and the change is upserted: a rename
+ * replaces the key (Oclobber + Oinsert, plus the super entry for
+ * directories and a touch of the parent), anything else is sent as a
+ * single Owstat message. The in-memory dent is updated afterwards.
+ *
+ * When the file shrinks, *ao is set to an AOclear request covering
+ * the dropped range; otherwise *ao is nil on return. id is passed
+ * through to truncwait.
+ */
+static void
+fswstat(Fmsg *m, int id, Amsg **ao)
+{
+ char rnbuf[Kvmax], opbuf[Kvmax], upbuf[Upksz];
+ char *p, *e, strs[65535];
+ int op, nm, rename;
+ vlong oldlen;
+ Qid old;
+ Fcall r;
+ Dent *de;
+ Msg mb[4];
+ Xdir n;
+ Dir d;
+ Tree *t;
+ Fid *f;
+ Key k;
+ User *u;
+
+ *ao = nil;
+ rename = 0;
+ if((f = getfid(m->conn, m->fid)) == nil){
+ rerror(m, Enofid);
+ return;
+ }
+ de = f->dent;
+ truncwait(de, id);
+ wlock(de);
+ if(waserror()){
+ rerror(m, errmsg());
+ /*
+ * NOTE(review): the shrink path below also sets de->trunc and
+ * takes dent/mount references for *ao; only the message itself
+ * is undone here -- confirm.
+ */
+ free(*ao);
+ *ao = nil;
+ goto Err;
+ }
+ if(de->gone)
+ error(Ephase);
+ /* NOTE(review): Qdump is tested with & here but with == in fswalk and fsread -- confirm */
+ if((de->qid.type & QTAUTH) || (de->qid.path & Qdump))
+ error(Emode);
+ if(convM2D(m->stat, m->nstat, &d, strs) <= BIT16SZ)
+ error(Edir);
+
+ t = agetp(&f->mnt->root);
+ n = de->Xdir;
+ n.qid.vers++;
+ /* opbuf[0] is reserved for the op bitmask, filled in once it is known */
+ p = opbuf+1;
+ op = 0;
+
+ /* check validity of updated fields and construct Owstat message */
+ if(d.qid.path != ~0 || d.qid.vers != ~0){
+ if(d.qid.path != de->qid.path)
+ error(Ewstatp);
+ if(d.qid.vers != de->qid.vers)
+ error(Ewstatv);
+ }
+ if(*d.name != '\0'){
+ if(strlen(d.name) > Maxname)
+ error(Elength);
+ if(strcmp(d.name, de->name) != 0){
+ rename = 1;
+ if((e = okname(d.name)) != nil)
+ error(e);
+ /* the new name must not already exist in the parent */
+ if(walk1(t, f->dent->up, d.name, &old, &oldlen) == 0)
+ error(Eexist);
+ n.name = d.name;
+ }
+ }
+ if(d.length != ~0){
+ if(d.length < 0)
+ error(Ewstatl);
+ if(d.length != de->length){
+ if(d.length < de->length){
+ /* shrinking: queue a background clear of the dropped range */
+ if((*ao = malloc(sizeof(Amsg))) == nil)
+ error(Enomem);
+ qlock(&de->trunclk);
+ de->trunc = 1;
+ qunlock(&de->trunclk);
+ aincl(&de->ref, 1);
+ aincl(&f->mnt->ref, 1);
+ (*ao)->op = AOclear;
+ (*ao)->mnt = f->mnt;
+ (*ao)->qpath = f->qpath;
+ (*ao)->off = d.length;
+ (*ao)->end = f->dent->length;
+ (*ao)->dent = de;
+ }
+ /*
+ * NOTE(review): the in-memory length changes here, before the
+ * permission checks below; if one of them raises, it is not
+ * restored -- confirm.
+ */
+ de->length = d.length;
+ n.length = d.length;
+ op |= Owsize;
+ PACK64(p, n.length);
+ p += 8;
+ }
+ }
+ if(d.mode != ~0){
+ if((d.mode^de->mode) & DMDIR)
+ error(Ewstatd);
+ if(d.mode & ~(DMDIR|DMAPPEND|DMEXCL|DMTMP|0777))
+ error(Ewstatb);
+ if(d.mode != de->mode){
+ n.mode = d.mode;
+ n.qid.type = d.mode>>24;
+ op |= Owmode;
+ PACK32(p, n.mode);
+ p += 4;
+ }
+ }
+ if(d.mtime != ~0){
+ n.mtime = d.mtime*Nsec;
+ if(n.mtime != de->mtime){
+ op |= Owmtime;
+ PACK64(p, n.mtime);
+ p += 8;
+ }
+ }
+ if(*d.uid != '\0'){
+ if(strlen(d.uid) > Maxuname)
+ error(Elength);
+ rlock(&fs->userlk);
+ u = name2user(d.uid);
+ if(u == nil){
+ runlock(&fs->userlk);
+ error(Enouser);
+ }
+ n.uid = u->id;
+ runlock(&fs->userlk);
+ if(n.uid != de->uid){
+ op |= Owuid;
+ PACK32(p, n.uid);
+ p += 4;
+ }
+ }
+ if(*d.gid != '\0'){
+ if(strlen(d.gid) > Maxuname)
+ error(Elength);
+ rlock(&fs->userlk);
+ u = name2user(d.gid);
+ if(u == nil){
+ runlock(&fs->userlk);
+ error(Enogrp);
+ }
+ n.gid = u->id;
+ runlock(&fs->userlk);
+ if(n.gid != de->gid){
+ op |= Owgid;
+ PACK32(p, n.gid);
+ p += 4;
+ }
+ }
+ /* every wstat records the fid's user as the last modifier */
+ op |= Owmuid;
+ n.muid = f->uid;
+ PACK32(p, n.muid);
+ p += 4;
+
+ /* check permissions */
+ if(rename)
+ if(fsaccess(f, f->dmode, f->duid, f->dgid, DMWRITE) == -1)
+ error(Eperm);
+ if(op & Owsize)
+ if(fsaccess(f, de->mode, de->uid, de->gid, DMWRITE) == -1)
+ error(Eperm);
+ if(op & (Owmode|Owmtime))
+ if(!f->permit && f->uid != de->uid && !groupleader(f->uid, de->gid))
+ error(Ewstato);
+ if(op & Owuid)
+ if(!f->permit)
+ error(Ewstatu);
+ if(op & Owgid)
+ if(!f->permit
+ && !(f->uid == de->uid && ingroup(f->uid, n.gid))
+ && !(groupleader(f->uid, de->gid) && groupleader(f->uid, n.gid)))
+ error(Ewstatg);
+
+ /* update directory entry */
+ nm = 0;
+ if(rename && !de->gone){
+ /* a rename changes the key: drop the old entry, insert the new one */
+ mb[nm].op = Oclobber;
+ mb[nm].Key = de->Key;
+ mb[nm].v = nil;
+ mb[nm].nv = 0;
+ nm++;
+
+ mb[nm].op = Oinsert;
+ dir2kv(f->pqpath, &n, &mb[nm], rnbuf, sizeof(rnbuf));
+ k = mb[nm].Key;
+ nm++;
+
+ if(de->qid.type & QTDIR){
+ /* a directory's super entry must point at its new key */
+ packsuper(upbuf, sizeof(upbuf), f->qpath);
+ mb[nm].op = Oinsert;
+ mb[nm].k = upbuf;
+ mb[nm].nk = Upksz;
+ mb[nm].v = mb[nm-1].k;
+ mb[nm].nv = mb[nm-1].nk;
+ nm++;
+ }
+ touch(f->dir, &mb[nm++]);
+ }else{
+ opbuf[0] = op;
+ mb[nm].op = Owstat;
+ mb[nm].Key = de->Key;
+ mb[nm].v = opbuf;
+ mb[nm].nv = p - opbuf;
+ nm++;
+ }
+ assert(nm <= nelem(mb));
+ upsert(f->mnt, mb, nm);
+
+ de->Xdir = n;
+ if(rename)
+ cpkey(de, &k, de->buf, sizeof(de->buf));
+
+ r.type = Rwstat;
+ respond(m, &r);
+ poperror();
+
+Err: wunlock(de);
+ putfid(f);
+}
+
+
+/*
+ * Answer Tclunk: take the fid out of the connection's table and
+ * acknowledge. A pending remove-on-close request, if any, is handed
+ * back through ao by clunkfid.
+ */
+static void
+fsclunk(Fmsg *m, Amsg **ao)
+{
+	Fcall r;
+	Fid *f;
+
+	f = getfid(m->conn, m->fid);
+	if(f == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	r.type = Rclunk;
+	lock(f);
+	clunkfid(m->conn, f, ao);
+	unlock(f);
+	respond(m, &r);
+	putfid(f);
+}
+
+/*
+ * Answer Tcreate: create file m->name in the directory that fid
+ * m->fid refers to, then move the fid onto the new file and mark it
+ * open with the requested mode.
+ *
+ * Refused with the okname error for a bad name, Ebotch for
+ * DMMOUNT/DMAUTH permissions, Enofid for an unknown fid, Einuse when
+ * the fid is already open, Eexist when the name exists and Eperm
+ * when the directory is not writable by the fid's user.
+ */
+static void
+fscreate(Fmsg *m)
+{
+ char *p, *e, buf[Kvmax], upkbuf[Keymax], upvbuf[Inlmax];
+ int nm, duid, dgid, dmode;
+ Dent *de;
+ vlong oldlen;
+ Qid old;
+ Fcall r;
+ Msg mb[3];
+ Fid *f;
+ Xdir d;
+
+ if((e = okname(m->name)) != nil){
+ rerror(m, e);
+ return;
+ }
+ if(m->perm & (DMMOUNT|DMAUTH)){
+ rerror(m, Ebotch);
+ return;
+ }
+ if((f = getfid(m->conn, m->fid)) == nil){
+ rerror(m, Enofid);
+ return;
+ }
+ lock(f);
+
+ /* a raised error has already popped the handler, so it skips Out's poperror */
+ if(waserror()){
+ rerror(m, errmsg());
+ goto Err;
+
+ }
+ if(f->mode != -1){
+ rerror(m, Einuse);
+ goto Out;
+ }
+ de = f->dent;
+ if(walk1(f->mnt->root, f->qpath, m->name, &old, &oldlen) == 0){
+ rerror(m, Eexist);
+ goto Out;
+ }
+
+ rlock(de);
+ if(fsaccess(f, de->mode, de->uid, de->gid, DMWRITE) == -1){
+ rerror(m, Eperm);
+ runlock(de);
+ goto Out;
+ }
+ /* remember the directory's ownership: it becomes the fid's parent info */
+ duid = de->uid;
+ dgid = de->gid;
+ dmode = de->mode;
+ runlock(de);
+
+ nm = 0;
+ d.qid.type = 0;
+ if(m->perm & DMDIR)
+ d.qid.type |= QTDIR;
+ if(m->perm & DMAPPEND)
+ d.qid.type |= QTAPPEND;
+ if(m->perm & DMEXCL)
+ d.qid.type |= QTEXCL;
+ if(m->perm & DMTMP)
+ d.qid.type |= QTTMP;
+ d.qid.path = aincv(&fs->nextqid, 1);
+ d.qid.vers = 0;
+ d.mode = m->perm;
+ /*
+ * the permission bits are masked by the parent directory's.
+ * NOTE(review): de->mode is read here after runlock(de); dmode
+ * holds the value captured under the lock -- confirm.
+ */
+ if(m->perm & DMDIR)
+ d.mode &= ~0777 | de->mode & 0777;
+ else
+ d.mode &= ~0666 | de->mode & 0666;
+ d.name = m->name;
+ d.atime = nsec();
+ d.mtime = d.atime;
+ d.length = 0;
+ d.uid = f->uid;
+ d.gid = dgid;
+ d.muid = f->uid;
+
+ mb[nm].op = Oinsert;
+ dir2kv(f->qpath, &d, &mb[nm], buf, sizeof(buf));
+ nm++;
+
+ if(m->perm & DMDIR){
+ /* a new directory also gets a super entry pointing back at its key */
+ mb[nm].op = Oinsert;
+ if((p = packsuper(upkbuf, sizeof(upkbuf), d.qid.path)) == nil)
+ sysfatal("ream: pack super");
+ mb[nm].k = upkbuf;
+ mb[nm].nk = p - upkbuf;
+ if((p = packdkey(upvbuf, sizeof(upvbuf), f->qpath, d.name)) == nil)
+ sysfatal("ream: pack super");
+ mb[nm].v = upvbuf;
+ mb[nm].nv = p - upvbuf;
+ nm++;
+ }
+ touch(f->dent, &mb[nm++]);
+ assert(nm <= nelem(mb));
+ upsert(f->mnt, mb, nm);
+
+ /* move the fid from the directory onto the new file */
+ de = getdent(f->mnt, f->qpath, &d);
+ clunkdent(f->mnt, f->dent);
+ f->mode = mode2bits(m->mode);
+ f->pqpath = f->qpath;
+ f->qpath = d.qid.path;
+ f->dent = de;
+ f->duid = duid;
+ f->dgid = dgid;
+ f->dmode = dmode;
+ if(m->mode & ORCLOSE)
+ f->rclose = emalloc(sizeof(Amsg), 1);
+
+ r.type = Rcreate;
+ r.qid = d.qid;
+ r.iounit = f->iounit;
+ respond(m, &r);
+Out: poperror();
+Err: unlock(f);
+ putfid(f);
+ return;
+}
+
+/*
+ * Check whether the file behind f may be removed. Non-directories
+ * always may; a directory may only when a scan of its key prefix
+ * yields no entry. Returns nil when removal is allowed and Enempty
+ * otherwise.
+ */
+static char*
+candelete(Fid *f)
+{
+	char pfx[Dpfxsz], *err;
+	Tree *t;
+	Scan s;
+
+	if((f->dent->qid.type & QTDIR) == 0)
+		return nil;
+
+	t = agetp(&f->mnt->root);
+	packdkey(pfx, sizeof(pfx), f->qpath, nil);
+	btnewscan(&s, pfx, sizeof(pfx));
+	btenter(t, &s);
+	err = nil;
+	if(btnext(&s, &s.kv))
+		err = Enempty;
+	btexit(&s);
+	return err;
+}
+
+/*
+ * Answer Tremove. The fid is clunked first, whether or not the
+ * remove then succeeds. The entry is deleted from the tree (with its
+ * super entry for directories) and the parent is touched; for plain
+ * files *ao is set to an AOclear request covering the whole file.
+ * On error *ao is freed and reset to nil. id is passed through to
+ * truncwait.
+ */
+static void
+fsremove(Fmsg *m, int id, Amsg **ao)
+{
+ char *e, buf[Kvmax];
+ Fcall r;
+ int nm;
+ Msg mb[3];
+ Tree *t;
+ Kvp kv;
+ Fid *f;
+
+ if((f = getfid(m->conn, m->fid)) == nil){
+ rerror(m, Enofid);
+ return;
+ }
+ /* NOTE(review): fsauth creates fids with mnt == nil; confirm an afid cannot reach this dereference */
+ t = f->mnt->root;
+ nm = 0;
+ *ao = nil;
+ lock(f);
+ clunkfid(m->conn, f, ao);
+ /* rclose files are getting removed here anyways */
+ if(*ao != nil)
+ f->rclose = nil;
+ unlock(f);
+
+ truncwait(f->dent, id);
+ wlock(f->dent);
+ if(waserror()){
+ rerror(m, errmsg());
+ /*
+ * NOTE(review): when *ao came from clunkfid it already took
+ * dent and mount references and marked the dent gone; only the
+ * message is released here -- confirm.
+ */
+ free(*ao);
+ *ao = nil;
+ goto Err;
+ }
+ if(f->dent->gone)
+ error(Ephase);
+ /*
+ * we need a double check that the file is in the tree
+ * here, because the walk to the fid is done in a reader
+ * proc that can look it up in a stale version of the
+ * tree, while we clunk the dent in the mutator proc.
+ *
+ * this means we can theoretically get some deletions
+ * of files that are already gone.
+ */
+ if(!btlookup(t, &f->dent->Key, &kv, buf, sizeof(buf)))
+ error(Ephase);
+ if((e = candelete(f)) != nil)
+ error(e);
+ if(fsaccess(f, f->dmode, f->duid, f->dgid, DMWRITE) == -1)
+ error(Eperm);
+ lock(f);
+ mb[nm].op = Odelete;
+ mb[nm].k = f->dent->k;
+ mb[nm].nk = f->dent->nk;
+ mb[nm].v = "\0";
+ mb[nm].nv = 1;
+ nm++;
+ unlock(f);
+
+ if(f->dent->qid.type & QTDIR){
+ /* directories also lose their super entry; buf is reused for its key */
+ packsuper(buf, sizeof(buf), f->qpath);
+ mb[nm].op = Oclobber;
+ mb[nm].k = buf;
+ mb[nm].nk = Upksz;
+ mb[nm].nv = 0;
+ nm++;
+ }else{
+ /* plain files have their data blocks cleared in the background */
+ if(*ao == nil)
+ *ao = emalloc(sizeof(Amsg), 1);
+ aincl(&f->mnt->ref, 1);
+ (*ao)->op = AOclear;
+ (*ao)->mnt = f->mnt;
+ (*ao)->qpath = f->qpath;
+ (*ao)->off = 0;
+ (*ao)->end = f->dent->length;
+ (*ao)->dent = nil;
+ }
+ touch(f->dir, &mb[nm++]);
+ assert(nm <= nelem(mb));
+ upsert(f->mnt, mb, nm);
+ f->dent->gone = 1;
+ r.type = Rremove;
+ respond(m, &r);
+ poperror();
+Err:
+ wunlock(f->dent);
+ putfid(f);
+ return;
+}
+
+/*
+ * Answer Topen: re-read the file's directory entry, check that the
+ * fid's user may open it with the requested mode, and mark the fid
+ * open. With OTRUNC (on files that are not append-only) the length
+ * is reset, an Owstat update is upserted and *ao is set to an
+ * AOclear request for the old contents. With ORCLOSE an Amsg is
+ * preallocated on the fid for the eventual remove.
+ *
+ * ao is only dereferenced on the OTRUNC path; runread passes nil
+ * and runfs routes OTRUNC opens to the mutator. id is passed
+ * through to truncwait.
+ */
+static void
+fsopen(Fmsg *m, int id, Amsg **ao)
+{
+ char *p, *e, buf[Kvmax];
+ int mbits;
+ Tree *t;
+ Fcall r;
+ Xdir d;
+ Fid *f;
+ Kvp kv;
+ Msg mb;
+
+ mbits = mode2bits(m->mode);
+ if((f = getfid(m->conn, m->fid)) == nil){
+ rerror(m, Enofid);
+ return;
+ }
+ if(waserror()){
+ rerror(m, errmsg());
+ putfid(f);
+ return;
+ }
+ if(m->mode & OTRUNC)
+ truncwait(f->dent, id);
+ /* NOTE(review): fsauth creates fids with mnt == nil; confirm an afid cannot reach this dereference */
+ t = agetp(&f->mnt->root);
+ /* NOTE(review): Qdump is tested with & here but with == in fswalk and fsread -- confirm */
+ if((f->qpath & Qdump) != 0){
+ filldumpdir(&d);
+ }else{
+ if(!btlookup(t, f->dent, &kv, buf, sizeof(buf)))
+ error(Esrch);
+ kv2dir(&kv, &d);
+ }
+ wlock(f->dent);
+ if(waserror()){
+ wunlock(f->dent);
+ nexterror();
+ }
+ if(f->dent->gone)
+ error(Ephase);
+ /* exclusive-use files can only be opened through the sole reference */
+ if(f->dent->qid.type & QTEXCL)
+ if(f->dent->ref != 1)
+ error(Elocked);
+ if(m->mode & ORCLOSE)
+ if((e = candelete(f)) != nil)
+ error(e);
+ if(fsaccess(f, d.mode, d.uid, d.gid, mbits) == -1)
+ error(Eperm);
+ f->dent->length = d.length;
+ poperror();
+ wunlock(f->dent);
+ r.type = Ropen;
+ r.qid = d.qid;
+ r.iounit = f->iounit;
+
+ lock(f);
+ if(f->mode != -1){
+ unlock(f);
+ error(Einuse);
+ }
+ if((m->mode & OTRUNC) && !(f->dent->mode & DMAPPEND)){
+ wlock(f->dent);
+
+ /*
+ * NOTE(review): this handler releases the dent and *ao but not
+ * lock(f), which is still held here -- confirm.
+ */
+ if(waserror()){
+ wunlock(f->dent);
+ free(*ao);
+ *ao = nil;
+ nexterror();
+ }
+ *ao = emalloc(sizeof(Amsg), 1);
+ qlock(&f->dent->trunclk);
+ f->dent->trunc = 1;
+ qunlock(&f->dent->trunclk);
+ aincl(&f->dent->ref, 1);
+ aincl(&f->mnt->ref, 1);
+ (*ao)->op = AOclear;
+ (*ao)->mnt = f->mnt;
+ (*ao)->qpath = f->qpath;
+ (*ao)->off = 0;
+ (*ao)->end = f->dent->length;
+ (*ao)->dent = f->dent;
+
+ f->dent->muid = f->uid;
+ f->dent->qid.vers++;
+ f->dent->length = 0;
+
+ /* Owstat payload: op byte, new size (0), then the modifying uid */
+ mb.op = Owstat;
+ p = buf;
+ p[0] = Owsize|Owmuid; p += 1;
+ PACK64(p, 0); p += 8;
+ PACK32(p, f->uid); p += 4;
+ mb.k = f->dent->k;
+ mb.nk = f->dent->nk;
+ mb.v = buf;
+ mb.nv = p - buf;
+
+ upsert(f->mnt, &mb, 1);
+ wunlock(f->dent);
+ poperror();
+ }
+ f->mode = mode2bits(m->mode);
+ if(m->mode & ORCLOSE)
+ f->rclose = emalloc(sizeof(Amsg), 1);
+ unlock(f);
+ poperror();
+ respond(m, &r);
+ putfid(f);
+}
+
+/*
+ * Read the dump directory: one stat entry per snapshot label found
+ * in fs->snap. The scan state lives on the fid; it is (re)created
+ * when there is none or the read starts at offset 0. An entry that
+ * did not fit into the previous reply is emitted first. Raises
+ * Edscan when the scan's recorded offset is non-zero and differs
+ * from the requested one.
+ */
+static void
+readsnap(Fmsg *m, Fid *f, Fcall *r)
+{
+	char pfx[1], *out;
+	int room, used;
+	Scan *s;
+	Xdir d;
+
+	s = f->scan;
+	if(s != nil && s->offset != 0 && s->offset != m->offset)
+		error(Edscan);
+	if(s == nil || m->offset == 0){
+		s = emalloc(sizeof(Scan), 1);
+		pfx[0] = Klabel;
+		btnewscan(s, pfx, 1);
+		lock(f);
+		free(f->scan);
+		f->scan = s;
+		unlock(f);
+	}
+	if(s->donescan){
+		r->count = 0;
+		return;
+	}
+	out = r->data;
+	room = m->count;
+	filldumpdir(&d);
+	if(s->overflow){
+		/* flush the entry left over from the previous read */
+		memcpy(d.name, s->kv.k+1, s->kv.nk-1);
+		d.name[s->kv.nk-1] = 0;
+		d.qid.path = UNPACK64(s->kv.v + 1);
+		used = dir2statbuf(&d, out, room);
+		if(used == -1){
+			r->count = 0;
+			return;
+		}
+		s->overflow = 0;
+		out += used;
+		room -= used;
+	}
+	btenter(&fs->snap, s);
+	while(btnext(s, &s->kv)){
+		memcpy(d.name, s->kv.k+1, s->kv.nk-1);
+		d.name[s->kv.nk-1] = 0;
+		d.qid.path = UNPACK64(s->kv.v + 1);
+		used = dir2statbuf(&d, out, room);
+		if(used == -1){
+			/* no room left: keep this entry for the next read */
+			s->overflow = 1;
+			break;
+		}
+		out += used;
+		room -= used;
+	}
+	btexit(s);
+	r->count = out - r->data;
+}
+
+/*
+ * Read directory entries for fid f into r->data. The scan state
+ * lives on the fid; it is (re)created when there is none or the
+ * read starts at offset 0. An entry that did not fit into the
+ * previous reply is emitted first. Raises Edscan when the scan's
+ * recorded offset is non-zero and differs from the requested one.
+ */
+static void
+readdir(Fmsg *m, Fid *f, Fcall *r)
+{
+	char pfx[Dpfxsz], *out;
+	int room, used;
+	Scan *s;
+	Tree *t;
+
+	t = agetp(&f->mnt->root);
+	s = f->scan;
+	if(s != nil && s->offset != 0 && s->offset != m->offset)
+		error(Edscan);
+	if(s == nil || m->offset == 0){
+		s = emalloc(sizeof(Scan), 1);
+		packdkey(pfx, sizeof(pfx), f->qpath, nil);
+		btnewscan(s, pfx, sizeof(pfx));
+		lock(f);
+		free(f->scan);
+		f->scan = s;
+		unlock(f);
+	}
+	if(s->donescan){
+		r->count = 0;
+		return;
+	}
+	out = r->data;
+	room = m->count;
+	if(s->overflow){
+		/* flush the entry left over from the previous read */
+		used = kv2statbuf(&s->kv, out, room);
+		if(used == -1){
+			r->count = 0;
+			return;
+		}
+		s->overflow = 0;
+		out += used;
+		room -= used;
+	}
+	btenter(t, s);
+	while(btnext(s, &s->kv)){
+		used = kv2statbuf(&s->kv, out, room);
+		if(used == -1){
+			/* no room left: keep this entry for the next read */
+			s->overflow = 1;
+			break;
+		}
+		out += used;
+		room -= used;
+	}
+	btexit(s);
+	r->count = out - r->data;
+}
+
+/*
+ * Read file data for fid f into r->data, adding the bytes read to
+ * r->count. The request is clipped to the file length; a read that
+ * starts past the end adds nothing. The dent is read-locked for the
+ * duration.
+ */
+static void
+readfile(Fmsg *m, Fid *f, Fcall *r)
+{
+	vlong got, left, off;
+	char *dst;
+	Dent *de;
+	Tree *t;
+
+	de = f->dent;
+	rlock(de);
+	if(m->offset <= de->length){
+		dst = r->data;
+		off = m->offset;
+		left = m->count;
+		if(m->offset + m->count > de->length)
+			left = de->length - m->offset;
+		t = agetp(&f->mnt->root);
+		for(; left != 0; left -= got){
+			got = readb(t, f, dst, off, left, de->length);
+			if(got == 0)
+				break;
+			r->count += got;
+			dst += got;
+			off += got;
+		}
+	}
+	runlock(de);
+}
+
+/*
+ * Answer Tread. A reply buffer of m->count bytes is allocated and
+ * filled by the reader matching the fid's file: auth conversation,
+ * dump directory, ordinary directory, or plain file. The buffer is
+ * freed on both the success and the error path.
+ */
+static void
+fsread(Fmsg *m)
+{
+	Fcall r;
+	Fid *f;
+
+	f = getfid(m->conn, m->fid);
+	if(f == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	r.type = Rread;
+	r.data = nil;
+	r.count = 0;
+	if(waserror()){
+		rerror(m, errmsg());
+		free(r.data);
+		putfid(f);
+		return;
+	}
+	r.data = emalloc(m->count, 0);
+	if(f->dent->qid.type & QTAUTH){
+		authread(f, &r, r.data, m->count);
+	}else if(f->dent->qid.path == Qdump){
+		readsnap(m, f, &r);
+	}else if(f->dent->qid.type & QTDIR){
+		readdir(m, f, &r);
+	}else{
+		readfile(m, f, &r);
+	}
+	respond(m, &r);
+	free(r.data);
+	poperror();
+	putfid(f);
+}
+
+/*
+ * Answer Twrite. Writes on auth fids are passed to the factotum
+ * conversation. Otherwise the data is written out block by block
+ * (append-only files always write at the current end), and the block
+ * pointers are upserted together with one Owstat message updating
+ * size (when the file grew), mtime and muid. If a block write
+ * raises, the blocks already written for this request are released
+ * again unless the file system went read-only. id is passed through
+ * to truncwait.
+ */
+static void
+fswrite(Fmsg *m, int id)
+{
+	char sbuf[Wstatmax], kbuf[Max9p/Blksz+2][Offksz], vbuf[Max9p/Blksz+2][Ptrsz];
+	Bptr bp[Max9p/Blksz + 2];
+	Msg kv[Max9p/Blksz + 2];
+	vlong n, o, c, w;
+	int i, j;
+	char *p;
+	Fcall r;
+	Tree *t;
+	Fid *f;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	/*
+	 * an unopened fid has mode -1, which has every bit set and
+	 * would slip through a bare DMWRITE test. auth fids are never
+	 * opened and have to stay writable.
+	 */
+	if(!(f->dent->qid.type & QTAUTH))
+	if(f->mode == -1 || !(f->mode & DMWRITE)){
+		rerror(m, Einuse);
+		putfid(f);
+		return;
+	}
+	truncwait(f->dent, id);
+	wlock(f->dent);
+	if(waserror()){
+		rerror(m, errmsg());
+		wunlock(f->dent);
+		putfid(f);
+		return;
+	}
+	if(f->dent->gone)
+		error(Ephase);
+	if(f->dent->qid.type & QTAUTH){
+		authwrite(f, &r, m->data, m->count);
+		goto Out;
+	}
+
+	w = 0;
+	p = m->data;
+	o = m->offset;
+	c = m->count;
+	if(f->dent->mode & DMAPPEND)
+		o = f->dent->length;
+	t = agetp(&f->mnt->root);
+	for(i = 0; c != 0; i++){
+		assert(i < nelem(kv));
+		/* every write after the first starts on a block boundary */
+		assert(i == 0 || o%Blksz == 0);
+		kv[i].op = Oinsert;
+		kv[i].k = kbuf[i];
+		kv[i].nk = sizeof(kbuf[i]);
+		kv[i].v = vbuf[i];
+		kv[i].nv = sizeof(vbuf[i]);
+		if(waserror()){
+			if(!fs->rdonly)
+				for(j = 0; j < i; j++)
+					freebp(t, bp[j]);
+			nexterror();
+		}
+		n = writeb(f, &kv[i], &bp[i], p, o, c, f->dent->length);
+		poperror();
+		w += n;
+		p += n;
+		o += n;
+		c -= n;
+	}
+
+	/* build the Owstat payload; sbuf[0] collects the op bits */
+	p = sbuf;
+	kv[i].op = Owstat;
+	kv[i].k = f->dent->k;
+	kv[i].nk = f->dent->nk;
+	*p++ = 0;
+	if(o > f->dent->length){
+		sbuf[0] |= Owsize;
+		PACK64(p, o);
+		p += 8;
+		/*
+		 * keep the cached length equal to what was just packed:
+		 * for append-only files the write did not start at
+		 * m->offset.
+		 */
+		f->dent->length = o;
+	}
+	sbuf[0] |= Owmtime;
+	f->dent->mtime = nsec();
+	PACK64(p, f->dent->mtime);
+	p += 8;
+	sbuf[0] |= Owmuid;
+	PACK32(p, f->uid);
+	p += 4;
+
+	kv[i].v = sbuf;
+	kv[i].nv = p - sbuf;
+	upsert(f->mnt, kv, i+1);
+
+	r.type = Rwrite;
+	r.count = w;
+Out:
+	poperror();
+	respond(m, &r);
+	wunlock(f->dent);
+	putfid(f);
+}
+
+/*
+ * Answer Tflush with an Rflush reply.
+ */
+void
+fsflush(Fmsg *m)
+{
+	Fcall reply;
+
+	reply.type = Rflush;
+	respond(m, &reply);
+}
+
+/*
+ * Allocate a connection reading from rfd and writing to wfd, with
+ * the default iounit of Max9p, and push it onto the front of the
+ * global connection list. Returns nil when allocation fails.
+ */
+Conn *
+newconn(int rfd, int wfd)
+{
+	Conn *c;
+
+	if((c = mallocz(sizeof(*c), 1)) == nil)
+		return nil;
+	c->rfd = rfd;
+	c->wfd = wfd;
+	c->iounit = Max9p;
+	/* read the old head and install the new one under the same lock */
+	lock(&fs->connlk);
+	c->next = fs->conns;
+	fs->conns = c;
+	unlock(&fs->connlk);
+	return c;
+}
+
+/*
+ * Per-connection service loop: read 9P messages from the connection
+ * passed in pc and dispatch them. Version, auth, flush and clunk are
+ * handled here; mutating requests go to the write channel and
+ * read-only requests to the reader channel picked by hashing the
+ * fid. Any async request produced by a clunk is forwarded to the
+ * admin channel. The loop ends on end of file, or after hanging up
+ * the connection on a read or protocol error.
+ */
+void
+runfs(int, void *pc)
+{
+	char err[128];
+	RWLock *lk;
+	Amsg *a;
+	Conn *c;
+	Fcall r;
+	Fmsg *m;
+	u32int h;
+
+	c = pc;
+	while(1){
+		if(readmsg(c, &m) < 0){
+			fshangup(c, "read message: %r");
+			return;
+		}
+		if(m == nil)
+			break;
+		if(convM2S(m->buf, m->sz, m) == 0){
+			fshangup(c, "invalid message: %r");
+			/* nobody else has seen the message yet */
+			free(m);
+			return;
+		}
+		if(m->type != Tversion && !c->versioned){
+			fshangup(c, "version required");
+			free(m);
+			return;
+		}
+		dprint("← %F\n", &m->Fcall);
+
+		/*
+		 * a flush takes the write side of the slot for the tag
+		 * it flushes, every other request the read side of the
+		 * slot for its own tag. the lock is not released in
+		 * this function.
+		 */
+		if(m->type == Tflush){
+			lk = &fs->flushq[ihash(m->oldtag) % Nflushtab];
+			wlock(lk);
+		}else{
+			lk = &fs->flushq[ihash(m->tag) % Nflushtab];
+			rlock(lk);
+		}
+
+		a = nil;
+		h = ihash(m->fid) % fs->nreaders;
+		switch(m->type){
+		/* sync setup, must not access tree */
+		case Tversion:	fsversion(m);	break;
+		case Tauth:	fsauth(m);	break;
+		case Tflush:	fsflush(m);	break;
+		case Tclunk:	fsclunk(m, &a);	break;
+
+		/* mutators */
+		case Tcreate:	chsend(fs->wrchan, m);	break;
+		case Twrite:	chsend(fs->wrchan, m);	break;
+		case Twstat:	chsend(fs->wrchan, m);	break;
+		case Tremove:	chsend(fs->wrchan, m);	break;
+
+		/* reads */
+		case Tattach:	chsend(fs->rdchan[h], m);	break;
+		case Twalk:	chsend(fs->rdchan[h], m);	break;
+		case Tread:	chsend(fs->rdchan[h], m);	break;
+		case Tstat:	chsend(fs->rdchan[h], m);	break;
+
+		/* both */
+		case Topen:
+			if((m->mode & OTRUNC) || (m->mode & ORCLOSE) != 0)
+				chsend(fs->wrchan, m);
+			else
+				chsend(fs->rdchan[h], m);
+			break;
+
+		default:
+			fprint(2, "unknown message %F\n", &m->Fcall);
+			snprint(err, sizeof(err), "unknown message: %F", &m->Fcall);
+			r.type = Rerror;
+			r.ename = err;
+			respond(m, &r);
+			break;
+		}
+		assert(estacksz() == 0);
+		if(a != nil)
+			chsend(fs->admchan, a);
+	}
+}
+
+/*
+ * Mutator loop: receive requests from the write channel and run
+ * them one at a time under fs->mutlk, inside an epoch. Any async
+ * request a handler produces is forwarded to the admin channel.
+ * While the file system is read-only every request is answered with
+ * Erdonly, except that Tremove still clunks its fid first.
+ */
+void
+runmutate(int id, void *)
+{
+ Fmsg *m;
+ Amsg *a;
+ Fid *f;
+
+ while(1){
+ a = nil;
+ m = chrecv(fs->wrchan);
+ if(fs->rdonly){
+ /*
+ * special case: even if Tremove fails, we need
+ * to clunk the fid.
+ */
+ if(m->type == Tremove){
+ if((f = getfid(m->conn, m->fid)) == nil){
+ rerror(m, Enofid);
+ continue;
+ }
+ /*
+ * NOTE(review): for an rclose fid clunkfid also marks the
+ * dent gone/truncating and takes dent and mount references
+ * for the message that is freed below; it is also called
+ * here without lock(f), unlike fsclunk -- confirm.
+ */
+ clunkfid(m->conn, f, &a);
+ /* read only: ignore rclose */
+ f->rclose = nil;
+ free(a);
+ putfid(f);
+ }
+ rerror(m, Erdonly);
+ continue;
+ }
+
+ qlock(&fs->mutlk);
+ epochstart(id);
+ /* any mutation makes the snapshot tree dirty */
+ fs->snap.dirty = 1;
+ switch(m->type){
+ case Tcreate: fscreate(m); break;
+ case Twrite: fswrite(m, id); break;
+ case Twstat: fswstat(m, id, &a); break;
+ case Tremove: fsremove(m, id, &a); break;
+ case Topen: fsopen(m, id, &a); break;
+ default: abort(); break;
+ }
+ /* every handler must leave the error stack balanced */
+ assert(estacksz() == 0);
+ epochend(id);
+ qunlock(&fs->mutlk);
+ epochclean();
+
+ if(a != nil)
+ chsend(fs->admchan, a);
+ }
+}
+
+/*
+ * Reader loop: receive requests from channel ch and run each one
+ * inside an epoch. Opens arriving here are passed a nil async
+ * message slot. Message types other than those listed are ignored.
+ */
+void
+runread(int id, void *ch)
+{
+	Fmsg *req;
+
+	for(;;){
+		req = chrecv(ch);
+		epochstart(id);
+		switch(req->type){
+		case Tattach:
+			fsattach(req);
+			break;
+		case Twalk:
+			fswalk(req);
+			break;
+		case Tread:
+			fsread(req);
+			break;
+		case Tstat:
+			fsstat(req);
+			break;
+		case Topen:
+			fsopen(req, id, nil);
+			break;
+		}
+		/* every handler must leave the error stack balanced */
+		assert(estacksz() == 0);
+		epochend(id);
+	}
+}
+
+/*
+ * Recursively release the tree blocks below rb. Children of pivot
+ * blocks are visited first; after each child the mutlk is taken and
+ * dropped and epochs are cleaned. A block is freed only when its
+ * generation is newer than pred.
+ */
+void
+freetree(Bptr rb, vlong pred)
+{
+	Blk *blk;
+	Bptr child;
+	Kvp kv;
+	int i;
+
+	blk = getblk(rb, 0);
+	if(blk->type == Tpivot)
+		for(i = 0; i < blk->nval; i++){
+			getval(blk, i, &kv);
+			child = unpackbp(kv.v, kv.nv);
+			freetree(child, pred);
+			qlock(&fs->mutlk);
+			qunlock(&fs->mutlk);
+			epochclean();
+		}
+	if(rb.gen > pred)
+		freebp(nil, rb);
+	dropblk(blk);
+}
+
+/*
+ * Release the blocks of tree t that are newer than its predecessor:
+ * first every data block found by scanning the Kdat keys, then the
+ * tree blocks themselves via freetree.
+ *
+ * This runs outside of an epoch: the caller has already waited for
+ * an epoch to tick over, so nobody but us can be accessing the
+ * tree. Epochs are still cleaned after every entry to keep the limbo
+ * list short. Because this is the last reference to the tree, the
+ * mutlk is not held for the sweep; it is only taken and dropped
+ * before each epochclean.
+ */
+void
+sweeptree(Tree *t)
+{
+	char pfx[1];
+	Bptr dat;
+	Scan s;
+
+	pfx[0] = Kdat;
+	btnewscan(&s, pfx, 1);
+	btenter(t, &s);
+	while(btnext(&s, &s.kv)){
+		dat = unpackbp(s.kv.v, s.kv.nv);
+		if(dat.gen > t->pred)
+			freebp(nil, dat);
+		qlock(&fs->mutlk);
+		qunlock(&fs->mutlk);
+		epochclean();
+	}
+	btexit(&s);
+	freetree(t->bp, t->pred);
+}
+/* runsweep: admin worker loop; receives Amsg requests from fs->admchan and serves AOsync, AOsnap, AOrclose and AOclear. Never returns. */
+void
+runsweep(int id, void*)
+{
+ char buf[Kvmax];
+ Msg mb[Kvmax/Offksz];
+ Bptr bp, nb, *oldhd;
+ int i, nm;
+ vlong off;
+ Tree *t;
+ Arena *a;
+ Amsg *am;
+ Blk *b;
+
+ if((oldhd = calloc(fs->narena, sizeof(Bptr))) == nil) /* one saved log head per arena, reused by every sync */
+ sysfatal("malloc log heads");
+ while(1){
+ am = chrecv(fs->admchan);
+ switch(am->op){
+ case AOsync:
+ tracem("syncreq");
+ if(!fs->snap.dirty && !am->halt) /* nothing dirty and no halt requested: skip */
+ goto Next;
+ if(agetl(&fs->rdonly))
+ goto Justhalt;
+ if(waserror()){
+ fprint(2, "sync error: %s\n", errmsg());
+ ainc(&fs->rdonly); /* a failed sync leaves the fs read-only */
+ break;
+ }
+
+ if(am->halt)
+ ainc(&fs->rdonly);
+ for(i = 0; i < fs->narena; i++){
+ a = &fs->arenas[i];
+ oldhd[i].addr = -1; /* addr -1 means: no old log chain to free for this arena */
+ oldhd[i].hash = -1;
+ oldhd[i].gen = -1;
+ qlock(a);
+ /*
+ * arbitrary heuristic -- try compressing
+ * when the log doubles in size.
+ */
+ if(a->nlog >= 2*a->lastlogsz){
+ oldhd[i] = a->loghd; /* remember the pre-compression head so its blocks can be freed after sync() */
+ epochstart(id);
+ if(waserror()){
+ epochend(id);
+ qunlock(a);
+ nexterror();
+ }
+ compresslog(a);
+ epochend(id);
+ poperror();
+ }
+ qunlock(a);
+ epochclean();
+ }
+ sync();
+
+ for(i = 0; i < fs->narena; i++){
+ for(bp = oldhd[i]; bp.addr != -1; bp = nb){ /* walk the old log chain through each block's logp link */
+ qlock(&fs->mutlk);
+ epochstart(id);
+ b = getblk(bp, 0);
+ nb = b->logp;
+ freeblk(nil, b);
+ dropblk(b);
+ epochend(id);
+ qunlock(&fs->mutlk);
+ epochclean();
+ }
+ }
+
+Justhalt: /* NOTE(review): also reached by goto before waserror() above; when am->halt is 0 the poperror() below then has no matching waserror() -- confirm */
+ if(am->halt){
+ assert(fs->snapdl.hd.addr == -1);
+ assert(fs->snapdl.tl.addr == -1);
+ postnote(PNGROUP, getpid(), "halted");
+ exits(nil);
+ }
+ poperror();
+ break;
+
+ case AOsnap:
+ tracem("snapreq");
+ if(agetl(&fs->rdonly)){
+ fprint(2, "snap on read only fs");
+ goto Next;
+ }
+ if(waserror()){
+ fprint(2, "taking snap: %s\n", errmsg());
+ ainc(&fs->rdonly);
+ break;
+ }
+
+ qlock(&fs->mutlk);
+ if(waserror()){
+ qunlock(&fs->mutlk);
+ nexterror();
+ }
+ epochstart(id);
+ snapfs(am, &t); /* t is only assigned through this call; it is not initialized beforehand */
+ epochend(id);
+ poperror();
+ qunlock(&fs->mutlk);
+
+ sync();
+
+ if(t != nil){ /* a non-nil tree handed back by snapfs is swept and closed here */
+ epochwait();
+ sweeptree(t);
+ closesnap(t);
+ }
+ poperror();
+ break;
+
+ case AOrclose:
+ if(agetl(&fs->rdonly)){
+ fprint(2, "rclose on read only fs");
+ goto Next;
+ }
+ nm = 0;
+ mb[nm].op = Odelete; /* first message: delete the entry keyed by am->dent->k */
+ mb[nm].k = am->dent->k;
+ mb[nm].nk = am->dent->nk;
+ mb[nm].nv = 0;
+ nm++;
+ if(am->dent->qid.type & QTDIR){ /* directories also get their Kup key (packsuper of qpath) clobbered */
+ packsuper(buf, sizeof(buf), am->qpath);
+ mb[nm].op = Oclobber;
+ mb[nm].k = buf;
+ mb[nm].nk = Upksz;
+ mb[nm].nv = 0;
+ nm++;
+ }
+ qlock(&fs->mutlk); /* NOTE(review): no waserror() guards this upsert; if it can raise, mutlk stays held -- confirm */
+ upsert(am->mnt, mb, nm);
+ qunlock(&fs->mutlk);
+ /* fallthrough */
+ case AOclear:
+ if(agetl(&fs->rdonly)){
+ fprint(2, "clear on read only fs");
+ goto Next;
+ }
+ tracem("bgclear");
+ if(waserror()){
+ fprint(2, "clear file %llx: %s\n", am->qpath, errmsg());
+ ainc(&fs->rdonly);
+ break;
+ }
+ if(am->dent != nil)
+ qlock(&am->dent->trunclk);
+ fs->snap.dirty = 1;
+ nm = 0;
+ for(off = am->off; off < am->end; off += Blksz){ /* one Oclearb message per Blksz step in [am->off, am->end) */
+ mb[nm].op = Oclearb;
+ mb[nm].k = buf + Offksz * nm; /* key storage is carved out of buf, Offksz bytes per message */
+ mb[nm].nk = Offksz;
+ mb[nm].k[0] = Kdat;
+ PACK64(mb[nm].k+1, am->qpath);
+ PACK64(mb[nm].k+9, off);
+ mb[nm].v = nil;
+ mb[nm].nv = 0;
+ if(++nm >= nelem(mb) || off + Blksz >= am->end){ /* flush when the batch is full or this was the last step */
+ qlock(&fs->mutlk);
+ if(waserror()){
+ qunlock(&fs->mutlk);
+ nexterror();
+ }
+ epochstart(id);
+ upsert(am->mnt, mb, nm);
+ epochend(id);
+ qunlock(&fs->mutlk);
+ epochclean();
+ poperror();
+ nm = 0;
+ }
+ }
+ if(am->dent != nil){
+ am->dent->trunc = 0; /* clear the trunc flag and wake anything sleeping on truncrz */
+ rwakeup(&am->dent->truncrz);
+ qunlock(&am->dent->trunclk);
+ clunkdent(am->mnt, am->dent);
+ }
+ clunkmount(am->mnt);
+ poperror();
+ break;
+ }
+Next:
+ assert(estacksz() == 0); /* every request must leave the error stack empty */
+ free(am);
+ }
+}
+/* snapmsg: queue an AOsnap request on fs->admchan naming label old and either target label new or, when new is nil, the delete flag. */
+void
+snapmsg(char *old, char *new, int flg)
+{
+	Amsg *msg;
+
+	msg = emalloc(sizeof(*msg), 1);	/* zeroed allocation: fields not set below stay 0 */
+	strecpy(msg->old, msg->old + sizeof(msg->old), old);
+	if(new != nil)
+		strecpy(msg->new, msg->new + sizeof(msg->new), new);
+	else
+		msg->delete = 1;
+	msg->flag = flg;
+	msg->fd = -1;
+	msg->op = AOsnap;
+	chsend(fs->admchan, msg);
+}
+/* runtasks: periodic task loop; every 5 seconds queues an AOsync and, on minute/hour/day rollover, snapshot requests for mounts flagged Ltsnap. Never returns. */
+void
+runtasks(int, void *)
+{
+ char buf[128];
+ Tm now, then;
+ Mount *mnt;
+ int m, h;
+ Amsg *a;
+
+ m = 0; /* m and h index the minutely[] and hourly[] name slots; advanced modulo 60 and 24 below */
+ h = 0;
+ tmnow(&then, nil);
+ tmnow(&now, nil);
+ while(1){
+ sleep(5000);
+ if(fs->rdonly) /* no background work while the fs is read-only */
+ continue;
+ if(waserror()){
+ fprint(2, "task error: %s\n", errmsg());
+ continue;
+ }
+ a = emalloc(sizeof(Amsg), 1);
+ a->op = AOsync;
+ a->halt = 0;
+ a->fd = -1;
+ chsend(fs->admchan, a);
+
+ tmnow(&now, nil);
+ for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
+ if(!(mnt->flag & Ltsnap))
+ continue;
+ if(now.yday != then.yday){ /* NOTE(review): the snapshot source is the literal "main" for every mount, only the new name uses mnt->name -- confirm intended */
+ snprint(buf, sizeof(buf),
+ "%s@day.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
+ snapmsg("main", buf, Lauto);
+ }
+ if(now.hour != then.hour){
+ if(mnt->hourly[h][0] != 0) /* slot already holds a name: request deletion of that snapshot before reuse */
+ snapmsg(mnt->hourly[h], nil, 0);
+ snprint(mnt->hourly[h], sizeof(mnt->hourly[h]),
+ "%s@hour.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
+ snapmsg("main", mnt->hourly[h], Lauto);
+ }
+ if(now.min != then.min){
+ if(mnt->minutely[m][0] != 0) /* same slot-recycling scheme as hourly */
+ snapmsg(mnt->minutely[m], nil, 0);
+ snprint(mnt->minutely[m], sizeof(mnt->minutely[m]),
+ "%s@minute.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
+ snapmsg("main", mnt->minutely[m], Lauto);
+ }
+ }
+ if(now.hour != then.hour) /* slot indexes advance once per rollover, after all mounts were handled */
+ h = (h+1)%24;
+ if(now.min != then.min)
+ m = (m+1)%60;
+ then = now;
+ poperror();
+ }
+}
+/* fixfs: one-shot repair of the adm tree: requires a Kup key for qid 0, inserts a Kup key for qid 1 whose value is the dkey (-1, ""), deletes the qid 0 key, syncs and exits. */
+void
+fixfs(void)
+{
+ char *p, kbuf[2][Keymax], vbuf[Inlmax], kvbuf[Msgmax];
+ Mount *mnt;
+ Tree *t;
+ Msg mb[2];
+ Kvp kv;
+ Key k;
+
+ fprint(2, "getting adm mount...\n");
+ if((mnt = getmount("adm")) == nil){
+ sysfatal("failed to get adm mount");
+ }
+ t = mnt->root;
+ p = packsuper(kbuf[0], sizeof(kbuf[0]), 0); /* lookup key: Kup entry for qid 0 */
+ k.k = kbuf[0];
+ k.nk = p - kbuf[0];
+ fprint(2, "checking for valid adm root backlink...\n");
+ if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf))){ /* no qid 0 key present: nothing to repair, bail out */
+ sysfatal("no mis-reamed key");
+ }
+
+ mb[0].op = Oinsert; /* message 0: insert the Kup key for qid 1; kbuf[0] is reused now that the lookup is done */
+ p = packsuper(kbuf[0], sizeof(kbuf[0]), 1);
+ mb[0].k = kbuf[0];
+ mb[0].nk = p - kbuf[0];
+ p = packdkey(vbuf, sizeof(vbuf), -1, "");
+ mb[0].v = vbuf;
+ mb[0].nv = p - vbuf;
+
+ mb[1].op = Odelete; /* message 1: delete the Kup key for qid 0 */
+ p = packsuper(kbuf[1], sizeof(kbuf[1]), 0);
+ mb[1].k = kbuf[1];
+ mb[1].nk = p - kbuf[1];
+ mb[1].v = nil;
+ mb[1].nv = 0;
+
+ fprint(2, "repairing adm root backlink...\n");
+ qlock(&fs->mutlk);
+ btupsert(t, mb, 2); /* both messages go in one btupsert call under mutlk */
+ qunlock(&fs->mutlk);
+ fprint(2, "syncing changes...\n");
+ sync();
+ fprint(2, "done\n");
+ exits(nil); /* never returns to the caller */
+}
--- /dev/null
+++ b/hash.c
@@ -1,0 +1,153 @@
+// metrohash64.cpp
+//
+// The MIT License (MIT)
+//
+// Copyright (c) 2015 J. Andrew Rogers
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+#define _le64toh(x) \
+ GBIT64((char*)&x)
+/* NOTE: _le64toh, ROTATE, HALF_ROUND and DOUBLE_ROUND are not referenced anywhere else in hash.c. */
+
+#define ROTATE(x, b) (u64int)( ((x) << (b)) | ( (x) >> (64 - (b))) )
+/* HALF_ROUND expands to several statements and modifies a, b and d in place; it is not safe as the body of an unbraced if/else. */
+#define HALF_ROUND(a,b,c,d,s,t) \
+ a += b; c += d; \
+ b = ROTATE(b, s) ^ a; \
+ d = ROTATE(d, t) ^ c; \
+ a = ROTATE(a, 32);
+/* DOUBLE_ROUND is four HALF_ROUNDs alternating the (13,16) and (17,21) rotation pairs. */
+#define DOUBLE_ROUND(v0,v1,v2,v3) \
+ HALF_ROUND(v0,v1,v2,v3,13,16); \
+ HALF_ROUND(v2,v1,v0,v3,17,21); \
+ HALF_ROUND(v0,v1,v2,v3,13,16); \
+ HALF_ROUND(v2,v1,v0,v3,17,21);
+/* Helpers for metrohash64_1. Every macro parameter is parenthesized so that any expression may be passed as an argument. */
+#define rotate_right(v, k)\
+	(((v) >> (k)) | ((v) << (64 - (k))))	/* 64-bit rotate; k must be in 1..63 */
+#define read_u64(ptr) \
+	(*(u64int*)(ptr))	/* raw load through a cast pointer: host byte order, no alignment fixup */
+#define read_u32(ptr) \
+	(*(u32int*)(ptr))
+#define read_u16(ptr) \
+	(*(u16int*)(ptr))
+#define read_u8(ptr) \
+	(*(u8int*)(ptr))
+/* metrohash64_1: hash len bytes at key with the given seed; consumes 32-byte stripes, then 16/8/4/2/1-byte tails, then a final mix. */
+uvlong
+metrohash64_1(void * key, u64int len, u32int seed)
+{
+ static const u64int k0 = 0xC83A91E1;
+ static const u64int k1 = 0x8648DBDB;
+ static const u64int k2 = 0x7BDEC03B;
+ static const u64int k3 = 0x2F5870A5;
+
+ const uchar * ptr = key;
+ const uchar * const end = ptr + len;
+
+ u64int hash = ((((u64int) seed) + k2) * k0) + len; /* initial state folds in both the seed and the length */
+
+ if(len >= 32){
+ u64int v[4];
+ v[0] = hash;
+ v[1] = hash;
+ v[2] = hash;
+ v[3] = hash;
+
+ do{ /* each pass consumes 32 bytes, 8 per lane; the words are read with the raw read_u64 load */
+ v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2];
+ v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3];
+ v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0];
+ v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1];
+ }
+ while(ptr <= (end - 32));
+ /* fold the four lanes back into the running hash */
+ v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1;
+ v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0;
+ v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1;
+ v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0;
+ hash += v[0] ^ v[1];
+ }
+ /* the remaining (fewer than 32) bytes are absorbed in chunks of 16, 8, 4, 2 and 1 */
+ if((end - ptr) >= 16){
+ u64int v0 = hash + (read_u64(ptr) * k0); ptr += 8; v0 = rotate_right(v0,33) * k1;
+ u64int v1 = hash + (read_u64(ptr) * k1); ptr += 8; v1 = rotate_right(v1,33) * k2;
+ v0 ^= rotate_right(v0 * k0, 35) + v1;
+ v1 ^= rotate_right(v1 * k3, 35) + v0;
+ hash += v1;
+ }
+
+ if((end - ptr) >= 8){
+ hash += read_u64(ptr) * k3; ptr += 8;
+ hash ^= rotate_right(hash, 33) * k1;
+
+ }
+
+ if((end - ptr) >= 4){
+ hash += read_u32(ptr) * k3; ptr += 4;
+ hash ^= rotate_right(hash, 15) * k1;
+ }
+
+ if((end - ptr) >= 2){
+ hash += read_u16(ptr) * k3; ptr += 2;
+ hash ^= rotate_right(hash, 13) * k1;
+ }
+
+ if((end - ptr) >= 1){
+ hash += read_u8 (ptr) * k3;
+ hash ^= rotate_right(hash, 25) * k1;
+ }
+ /* final avalanche */
+ hash ^= rotate_right(hash, 33);
+ hash *= k0;
+ hash ^= rotate_right(hash, 33);
+
+ return hash;
+}
+/* bufhash: metrohash64_1 of len bytes at src with the fixed seed 0x6765. */
+uvlong
+bufhash(void *src, usize len)
+{
+	return metrohash64_1(src, (u64int)len, 0x6765);
+}
+/* blkhash: hash of the Blksz bytes in b->buf; same function and seed as bufhash. */
+uvlong
+blkhash(Blk *b)
+{
+	return bufhash(b->buf, Blksz);
+}
+/* ihash: mix a 64-bit value with two xor-shift-multiply rounds and a final xor-shift; returns the low 32 bits. */
+u32int
+ihash(uvlong x)
+{
+	x = 0xbf58476d1ce4e5b9ULL * (x ^ (x >> 30));
+	x = 0x94d049bb133111ebULL * (x ^ (x >> 27));
+	x ^= x >> 31;
+	return (u32int)x;
+}
--- /dev/null
+++ b/load.c
@@ -1,0 +1,144 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+/* rangecmp: Avl comparator ordering Arange nodes by their off field; returns -1, 0 or 1. */
+static int
+rangecmp(Avl *a, Avl *b)
+{
+	Arange *x = (Arange*)a, *y = (Arange*)b;
+
+	if(x->off == y->off)
+		return 0;
+	return x->off < y->off ? -1 : 1;
+}
+/* loadarena: load the two arena header blocks at hd and hd+Blksz, unpack arena a from whichever passed its check, and set up its free tree and log buffers. Raises Efs if neither header checks out. */
+void
+loadarena(Arena *a, Bptr hd)
+{
+ Blk *h0, *h1, *b;
+ Bptr bp;
+
+ /* try to load block pointers with consistency check */
+ bp = hd;
+ h0 = nil;
+ h1 = nil;
+ if(!waserror()){
+ h0 = getblk(bp, GBsoftchk);
+ poperror();
+ }else /* reached when getblk raised; h0 stays nil */
+ print("loading arena primary header: %s\n", errmsg());
+ bp.addr += Blksz; /* the backup header is the block right after the primary */
+ if(!waserror()){
+ h1 = getblk(bp, GBsoftchk);
+ poperror();
+ }else
+ print("loading arena backup header: %s\n", errmsg());
+
+ /* if neither head nor tail is consistent, we're hosed */
+ b = (h0 != nil) ? h0 : h1; /* prefer the primary; fall back to the backup */
+ if(b == nil)
+ error(Efs);
+
+ /* otherwise, we could have crashed mid-pass, just load the blocks */
+ bp = hd;
+ if(h0 == nil)
+ h0 = getblk(bp, GBnochk);
+ bp.addr += Blksz;
+ if(h1 == nil)
+ h1 = getblk(bp, GBnochk);
+
+ unpackarena(a, b->data, Arenasz); /* unpackarena zeroes *a first, so every field below is set after it */
+ if((a->free = avlcreate(rangecmp)) == nil)
+ error(Enomem);
+ a->logbuf[0] = cachepluck();
+ a->logbuf[1] = cachepluck();
+ a->logbuf[0]->bp = (Bptr){-1, -1, -1};
+ a->logbuf[1]->bp = (Bptr){-1, -1, -1};
+ setflag(a->logbuf[0], Bstatic, 0);
+ setflag(a->logbuf[1], Bstatic, 0);
+ a->h0 = h0;
+ a->h1 = h1;
+ a->used = a->size; /* overrides the unpacked value; presumably the later loadlog pass recomputes usage -- confirm */
+}
+/* loadfs: open dev, read the superblock (primary at block 0, else the backup in the last whole block), load every arena and its log, load users from the "adm" snapshot and print a summary. Any failure is fatal. */
+void
+loadfs(char *dev)
+{
+	Bptr bhd, btl;
+	Mount *dump;
+	Arena *a;
+	Tree *t;
+	Dir *d;
+	int i;
+	vlong eb;
+
+	if((dump = mallocz(sizeof(*dump), 1)) == nil)
+		sysfatal("malloc: %r");
+	if(waserror())
+		sysfatal("load fs: %s", errmsg());
+	snprint(dump->name, sizeof(dump->name), "dump");
+	dump->ref = 1;
+	dump->gen = -1;
+	dump->root = &fs->snap;
+
+	fs->snapmnt = dump;
+	fs->narena = 1;
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("stat %s: %r", dev);
+	eb = d->length - (d->length%Blksz) - Blksz;	/* offset of the last whole block on the device */
+	free(d);	/* only the length was needed; the Dir was previously leaked */
+	bhd = (Bptr){0, -1, -1};
+	btl = (Bptr){eb, -1, -1};
+	fs->sb0 = getblk(bhd, GBnochk);
+	fs->sb1 = getblk(btl, GBnochk);
+	if(!waserror()){
+		unpacksb(fs, fs->sb0->buf, Blksz);
+		poperror();
+	}else{
+		fprint(2, "unable to load primary superblock: %s\n", errmsg());
+		if(waserror()){
+			fprint(2, "unable to load backup superblock: %s\n", errmsg());
+			exits("corrupt");
+		}
+		unpacksb(fs, fs->sb1->buf, Blksz);
+		poperror();
+	}
+
+	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
+		sysfatal("malloc: %r");
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadarena(a, fs->arenabp[i]);
+		a->reserve = a->size / 1024;	/* 1/1024 of the arena, clamped to [512 KiB, 8 MiB] */
+		if(a->reserve < 512*KiB)
+			a->reserve = 512*KiB;
+		if(a->reserve > 8*MiB)
+			a->reserve = 8*MiB;
+	}
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadlog(a, a->loghd);
+	}
+
+	if((t = opensnap("adm", nil)) == nil)
+		sysfatal("load users: no adm label");
+	loadusers(2, t);
+	poperror();
+
+	fprint(2, "load %s:\n", dev);
+	fprint(2, "\tsnaptree:\t%B\n", fs->snap.bp);
+	fprint(2, "\tnarenas:\t%d\n", fs->narena);
+	fprint(2, "\tfeatures:\t%lld\n", fs->flag);
+	fprint(2, "\tnextqid:\t%lld\n", fs->nextqid);
+	fprint(2, "\tlastqgen:\t%lld\n", fs->qgen);
+	fprint(2, "\tnextgen:\t%lld\n", fs->nextgen);
+	fprint(2, "\tblocksize:\t%lld\n", Blksz);
+	fprint(2, "\tcachesz:\t%lld MiB\n", fs->cmax*Blksz/MiB);
+	closesnap(t);
+}
--- /dev/null
+++ b/main.c
@@ -1,0 +1,353 @@
+#include <u.h>
+#include <libc.h>
+#include <avl.h>
+#include <fcall.h>
+#include <bio.h>
+
+#include "dat.h"
+#include "fns.h"
+#include "atomic.h"
+
+Gefs *fs; /* global filesystem state; allocated zeroed in initfs */
+
+int ream;
+int grow;
+int debug; /* incremented once per -d flag in main */
+int stdio;
+int noauth; /* copied into fs->noauth by initfs */
+int nproc; /* set to 2 in main */
+int permissive;
+int usereserve;
+int checkonly;
+char *reamuser;
+char *dev; /* device path from -f; required by main */
+vlong tracesz = 16*MiB; /* bytes of trace buffer allocated by initfs; 0 skips the allocation */
+vlong cachesz = 512*MiB; /* initializer only; main assigns 64*MiB before calling initfs */
+char *srvname = "gefs";
+int noneid = 0; /* fallback user id used by dir2statbuf for unknown uid/muid */
+int nogroupid = 9999; /* fallback id used by dir2statbuf for an unknown gid */
+int admid = -1;
+Blk *blkbuf; /* array of fs->cmax Blk obtained with sbrk in initfs */
+Bfree *bfbuf; /* array of fs->cmax Bfree obtained with sbrk in initfs */
+Errctx **errctx; /* slot from privalloc(); *errctx is the current proc's error context */
+/* _trace: claim the next slot of the fs->trace ring and record msg, bp, v0 and v1 along with the caller's tid and the current fs->qgen. */
+void
+_trace(char *msg, Bptr bp, vlong v0, vlong v1)
+{
+	ulong slot;
+	Trace *ent;
+
+	slot = aincl(&fs->traceidx, 1) - 1;	/* aincl yields the incremented value; step back to the slot just claimed */
+	ent = fs->trace + slot % fs->ntrace;
+	ent->bp = bp;
+	ent->v0 = v0;
+	ent->v1 = v1;
+	ent->tid = (*errctx)->tid;
+	ent->qgen = agetv(&fs->qgen);
+	strecpy(ent->msg, ent->msg + sizeof(ent->msg), msg);
+}
+/* nokill: write "noswap" to this process's /proc/<pid>/ctl file; failures are reported on fd 2 and otherwise ignored. */
+static void
+nokill(void)
+{
+	char buf[128];
+	int fd;
+
+	snprint(buf, sizeof(buf), "/proc/%d/ctl", getpid());
+	if((fd = open(buf, OWRITE)) == -1){
+		fprint(2, "nokill: open %s: %r", buf);
+		return;
+	}
+	if(fprint(fd, "noswap\n") == -1)
+		fprint(2, "nokill: write %s: %r", buf);
+	/* always release the descriptor: launch() forks without RFFDG, so a leaked fd piles up once per worker */
+	close(fd);
+}
+/* memsize: return the number on the /dev/swap line whose second field is "memory"; 512 MiB if the file cannot be opened or has no such line. */
+static uvlong
+memsize(void)
+{
+	char *ln, *f[2];
+	vlong mem;
+	Biobuf *bp;
+	int found;
+
+	mem = 512*MiB;
+	if((bp = Bopen("/dev/swap", OREAD)) == nil)
+		return mem;
+	found = 0;
+	while(!found && (ln = Brdstr(bp, '\n', 1)) != nil){
+		/* only lines with exactly two fields are considered */
+		if(tokenize(ln, f, nelem(f)) == 2 && strcmp(f[1], "memory") == 0){
+			mem = strtoll(f[0], 0, 0);
+			found = 1;
+		}
+		free(ln);	/* freed on every path; the old `continue` skipped this and leaked the line */
+	}
+	Bterm(bp);
+	return mem;
+}
+
+jmp_buf*
+_waserror(void)
+{
+ Errctx *c;
+
+ c = *errctx;
+ c->nerrlab++;
+ assert(c->nerrlab > 0 && c->nerrlab < Estacksz);
+ return c->errlab + (c->nerrlab-1);
+}
+
+_Noreturn static void
+errorv(char *fmt, va_list ap, int broke)
+{
+ Errctx *c;
+
+ c = *errctx;
+ vsnprint(c->err, sizeof(c->err), fmt, ap);
+ if(broke){
+ fprint(2, "%s\n", c->err);
+ abort();
+ }
+ assert(c->nerrlab > 0 && c->nerrlab < Estacksz);
+ longjmp(c->errlab[--c->nerrlab], -1);
+}
+/* broke: bump fs->rdonly, then print the formatted message and abort through errorv. Does not return. */
+_Noreturn void
+broke(char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	aincl(&fs->rdonly, 1);
+	errorv(fmt, args, 1);
+}
+
+_Noreturn void
+error(char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ errorv(fmt, ap, 0);
+}
+
+_Noreturn void
+nexterror(void)
+{
+ Errctx *c;
+
+ c = *errctx;
+ assert(c->nerrlab > 0 && c->nerrlab < Estacksz);
+ longjmp(c->errlab[--c->nerrlab], -1);
+}
+
+void*
+emalloc(usize sz, int zero)
+{
+ void *p;
+
+ if((p = mallocz(sz, zero)) == nil)
+ error(Enomem);
+ setmalloctag(p, getcallerpc(&sz));
+ return p;
+}
+/* initfs: allocate the global fs, its optional trace ring, the block cache buckets, the dlist cache and the Bfree/Blk arrays for a cache of cachesz bytes. Any allocation failure is fatal. */
+static void
+initfs(vlong cachesz)
+{
+	Bfree *f, *g;
+	Blk *b;
+
+	if((fs = mallocz(sizeof(Gefs), 1)) == nil)
+		sysfatal("malloc: %r");
+
+	if(tracesz != 0){
+		/* not emalloc: it fails through *errctx, which main only sets up after initfs returns */
+		if((fs->trace = mallocz(tracesz, 1)) == nil)
+			sysfatal("malloc: %r");
+		fs->ntrace = tracesz/sizeof(Trace);
+	}
+	fs->lrurz.l = &fs->lrulk;
+	fs->syncrz.l = &fs->synclk;
+	fs->bfreerz.l = &fs->bfreelk;
+	fs->noauth = noauth;
+	fs->cmax = cachesz/Blksz;
+	if(fs->cmax > (1<<30))
+		sysfatal("cache too big");
+	if((fs->bcache = mallocz(fs->cmax*sizeof(Bucket), 1)) == nil)
+		sysfatal("malloc: %r");
+	fs->dlcmax = fs->cmax/10;	/* a tenth of the block cache, clamped to [4, 512] entries */
+	if(fs->dlcmax < 4)
+		fs->dlcmax = 4;
+	if(fs->dlcmax > 512)
+		fs->dlcmax = 512;
+	if((fs->dlcache = mallocz(fs->dlcmax*sizeof(Dlist*), 1)) == nil)
+		sysfatal("malloc: %r");
+
+	bfbuf = sbrk(fs->cmax * sizeof(Bfree));
+	if(bfbuf == (void*)-1)
+		sysfatal("sbrk: %r");
+	g = nil;	/* thread every Bfree onto a singly linked list headed by fs->bfree */
+	for(f = bfbuf; f != bfbuf+fs->cmax; f++){
+		f->bp = Zb;
+		f->next = g;
+		g = f;
+	}
+	fs->bfree = g;
+	blkbuf = sbrk(fs->cmax * sizeof(Blk));
+	if(blkbuf == (void*)-1)
+		sysfatal("sbrk: %r");
+	for(b = blkbuf; b != blkbuf+fs->cmax; b++){
+		b->bp = Zb;
+		b->magic = Magic;
+		lrutop(b);
+	}
+}
+/* launch: fork a memory-sharing child that runs f(id, arg), where id is the new value of fs->nworker; the parent returns immediately. */
+static void
+launch(void (*f)(int, void *), void *arg, char *text)
+{
+ long pid, id;
+
+ assert(fs->nworker < nelem(fs->lepoch)); /* checked in the parent, before the child increments nworker */
+ pid = rfork(RFPROC|RFMEM|RFNOWAIT);
+ if (pid < 0)
+ sysfatal("can't fork: %r");
+ if (pid == 0) {
+ nokill();
+ id = aincl(&fs->nworker, 1);
+ if((*errctx = mallocz(sizeof(Errctx), 1)) == nil) /* each child installs its own error context in the errctx slot */
+ sysfatal("malloc: %r");
+ (*errctx)->tid = id;
+ procsetname("%s.%ld", text, id);
+ (*f)(id, arg);
+ exits("child returned"); /* only reached if f returns */
+ }
+}
+/* postfd: create /srv/<name><suff> with the given mode, post one end of a new pipe to it and return the other end. Any failure is fatal. */
+static int
+postfd(char *name, char *suff, int mode)
+{
+ char buf[80];
+ int fd[2];
+ int cfd;
+
+ if(pipe(fd) < 0)
+ sysfatal("can't make a pipe");
+ snprint(buf, sizeof buf, "/srv/%s%s", name, suff);
+ if((cfd = create(buf, OWRITE|ORCLOSE|OCEXEC, mode)) == -1)
+ sysfatal("create %s: %r", buf);
+ if(fprint(cfd, "%d", fd[0]) == -1) /* the srv file receives the decimal number of the pipe end being posted */
+ sysfatal("write %s: %r", buf);
+ close(fd[0]); /* NOTE(review): cfd is never closed here; presumably deliberate, since it was opened ORCLOSE -- confirm */
+ return fd[1];
+}
+/* runannounce: announce the address passed in arg and launch a runfs worker ("netio") for every accepted connection; returns only when listen fails. */
+static void
+runannounce(int, void *arg)
+{
+ char *ann, adir[40], ldir[40];
+ int actl, lctl, fd;
+ Conn *c;
+
+ ann = arg;
+ if((actl = announce(ann, adir)) < 0)
+ sysfatal("announce %s: %r", ann);
+ while(1){
+ if((lctl = listen(adir, ldir)) < 0){
+ fprint(2, "listen %s: %r", adir);
+ break;
+ }
+ fd = accept(lctl, ldir);
+ close(lctl); /* the listen ctl fd is closed whether or not accept succeeded */
+ if(fd < 0){
+ fprint(2, "accept %s: %r", ldir);
+ continue;
+ }
+ if(!(c = newconn(fd, fd))){ /* the same fd is passed for both newconn arguments */
+ close(fd);
+ fprint(2, "%r");
+ continue;
+ }
+
+ launch(runfs, c, "netio");
+ }
+ close(actl);
+}
+/* usage: print the flags main actually accepts (-d and -f dev) and exit with status "usage". */
+static void
+usage(void)
+{
+	fprint(2, "usage: %s [-d] -f dev\n", argv0);
+	exits("usage");
+}
+/* main: parse -d and -f dev, initialize the cache and formatters, load the filesystem, start the sync workers and run fixfs. */
+void
+main(int argc, char **argv)
+{
+ int i;
+
+ cachesz = 64*MiB; /* overrides the 512 MiB initializer of the global */
+ ARGBEGIN{
+ case 'd':
+ debug++;
+ break;
+ case 'f':
+ dev = EARGF(usage());
+ break;
+ default:
+ usage();
+ break;
+ }ARGEND;
+ if(dev == nil)
+ usage();
+
+ /*
+ * sanity checks -- I've tuned these to stupid
+ * values in the past.
+ */
+ assert(4*Kpmax < Pivspc);
+ assert(2*Msgmax < Bufspc);
+ assert(Treesz < Inlmax);
+
+ initfs(cachesz); /* runs before errctx exists, so it must not reach error() */
+ initshow();
+ errctx = privalloc();
+ if((*errctx = mallocz(sizeof(Errctx), 1)) == nil)
+ sysfatal("malloc: %r");
+ tmfmtinstall();
+ fmtinstall('H', encodefmt);
+ fmtinstall('B', Bconv);
+ fmtinstall('M', Mconv);
+ fmtinstall('P', Pconv);
+ fmtinstall('K', Kconv);
+ fmtinstall('R', Rconv);
+ fmtinstall('F', fcallfmt);
+ fmtinstall('Q', Qconv);
+
+ nproc = 2;
+
+ rfork(RFNOTEG);
+ loadfs(dev);
+ /*
+ * for spinning disks, parallel sync tanks performance
+ * for ssds, it doesn't help much.
+ */
+ fs->nsyncers = 1;
+ if(fs->nsyncers > fs->narena)
+ fs->nsyncers = fs->narena;
+ for(i = 0; i < fs->nsyncers; i++)
+ qinit(&fs->syncq[i]);
+ if((fs->rdchan = malloc(fs->nreaders*sizeof(Chan*))) == nil) /* NOTE(review): nothing visible here assigns fs->nreaders, so this looks like a zero-byte request -- confirm */
+ sysfatal("malloc: %r");
+ for(i = 0; i < fs->nreaders; i++)
+ fs->rdchan[i] = mkchan(32);
+ for(i = 0; i < fs->narena; i++)
+ fs->arenas[i].sync = &fs->syncq[i%fs->nsyncers]; /* arenas are spread round-robin over the sync queues */
+ for(i = 0; i < fs->nsyncers; i++)
+ launch(runsync, &fs->syncq[i], "syncio");
+ fixfs(); /* fixfs ends in exits(nil) */
+ abort(); /* only reached if fixfs returns */
+}
--- /dev/null
+++ b/mkfile
@@ -1,0 +1,39 @@
+</$objtype/mkfile
+# Builds the single program named by TARG from OFILES using the stock mkone rules; BIN is its install directory.
+TARG=fix
+BIN=/$objtype/bin
+OFILES=\
+ blk.$O\
+ cache.$O\
+ check.$O\
+ cons.$O\
+ dump.$O\
+ error.$O\
+ fs.$O\
+ hash.$O\
+ load.$O\
+ fix.$O\
+ pack.$O\
+ ream.$O\
+ snap.$O\
+ tree.$O\
+ user.$O\
+ \
+ atomic-$objtype.$O
+
+HFILES=\
+ dat.h\
+ fns.h\
+ atomic.h
+
+</sys/src/cmd/mkone
+</sys/doc/fonts
+# Documentation rules: typeset troff -ms sources to PostScript, then convert PostScript to PDF.
+%.ps: %.ms
+ { echo $FONTS; cat $stem.ms } | pic | tbl | eqn | troff -ms | lp -dstdout > $target
+%.pdf: %.ps
+ ps2pdf $stem.ps $stem.pdf
+
+man.install: gefs.4.man gefs.8.man
+ cp gefs.4.man /sys/man/4/gefs
+ cp gefs.8.man /sys/man/8/gefs
--- /dev/null
+++ b/pack.c
@@ -1,0 +1,510 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/* unpackstr: decode a 16-bit length, that many bytes and a NUL at p (limit e); *s points at the bytes in place, so it is usable as a C string. Returns the position after the NUL. */
+char*
+unpackstr(char *p, char *e, char **s)
+{
+	int len;
+
+	assert(e - p >= 3);
+	len = UNPACK16(p);
+	if(len + 3 > e - p || p[len+2] != 0)
+		broke(Efs);
+	*s = &p[2];
+	return &p[len+3];
+}
+
+/* packstr: write strlen(s) as 16 bits, the bytes of s and a terminating NUL at p (limit e); returns the position after the NUL. */
+char*
+packstr(char *p, char *e, char *s)
+{
+	int len;
+
+	len = strlen(s);
+	assert(e - p >= len+3);
+	PACK16(p, len);
+	memmove(p+2, s, len);
+	p[len+2] = 0;
+	return p + len + 3;
+}
+/* dir2kv: pack the directory-entry key (up, d->name) and then the value for d into buf, and point kv at the two pieces. */
+void
+dir2kv(vlong up, Xdir *d, Kvp *kv, char *buf, int nbuf)
+{
+	char *val, *end;
+
+	/* the value is packed directly behind the key in the same buffer */
+	val = packdkey(buf, nbuf, up, d->name);
+	end = packdval(val, (buf + nbuf) - val, d);
+	kv->k = buf;
+	kv->nk = val - buf;
+	kv->v = val;
+	kv->nv = end - val;
+}
+/* packdkey: write a Kent key at p: tag byte, 64-bit parent qid up, then (if name != nil) the packed name. Returns the position after the key. */
+char*
+packdkey(char *p, int sz, vlong up, char *name)
+{
+	char *ep = p + sz;
+
+	assert(sz >= 1+8);	/* room for tag + parent must exist before they are written; packstr checks the name itself */
+	PACK8(p, Kent);	p += 1;
+	PACK64(p, up);	p += 8;
+	if(name != nil)
+		p = packstr(p, ep, name);
+	return p;
+}
+/* unpackdkey: decode a Kent key of sz bytes at p; stores the parent qid in *up and returns the name, which points into the key buffer. */
+char*
+unpackdkey(char *p, int sz, vlong *up)
+{
+	char tag, *end, *name;
+
+	assert(sz > 9);
+	end = p + sz;
+	tag = UNPACK8(p);	p += 1;
+	*up = UNPACK64(p);	p += 8;
+	assert(tag == Kent);
+	p = unpackstr(p, end, &name);
+	assert(p <= end);
+	return name;
+}
+/* packsuper: write a Kup key at p: tag byte followed by the 64-bit qid up. Returns the position after the key. */
+char*
+packsuper(char *p, int sz, vlong up)
+{
+	/*
+	 * Check for room before writing rather than after: same condition as the old p <= ep test, but no overrun first.
+	 */
+	assert(sz >= 1+8);
+	PACK8(p, Kup);	p += 1;
+	PACK64(p, up);	p += 8;
+	return p;
+}
+/* packdval: serialize the fixed-size fields of d (everything but the name) at p; returns the position after the last field. */
+char*
+packdval(char *p, int sz, Xdir *d)
+{
+ char *e;
+
+ e = p + sz;
+ PACK64(p, d->flag); p += 8;
+ PACK64(p, d->qid.path); p += 8;
+ PACK32(p, d->qid.vers); p += 4;
+ PACK8(p, d->qid.type); p += 1;
+ PACK32(p, d->mode); p += 4;
+ PACK64(p, d->atime); p += 8;
+ PACK64(p, d->mtime); p += 8;
+ PACK64(p, d->length); p += 8;
+ PACK32(p, d->uid); p += 4;
+ PACK32(p, d->gid); p += 4;
+ PACK32(p, d->muid); p += 4;
+ assert(p <= e); /* NOTE(review): the size check runs only after all fields were written */
+ return p;
+}
+/* kv2dir: decode a directory-entry key/value pair into *d; d->name points into kv->k. Calls broke(Efs) unless both key and value are consumed exactly. */
+void
+kv2dir(Kvp *kv, Xdir *d)
+{
+ char *k, *ek, *v, *ev;
+
+ memset(d, 0, sizeof(Xdir));
+ k = kv->k + 9; /* skip the 1-byte tag and 8-byte parent qid written by packdkey */
+ ek = kv->k + kv->nk;
+ k = unpackstr(k, ek, &d->name);
+
+ v = kv->v; /* field order and widths mirror packdval */
+ ev = v + kv->nv;
+ d->flag = UNPACK64(v); v += 8;
+ d->qid.path = UNPACK64(v); v += 8;
+ d->qid.vers = UNPACK32(v); v += 4;
+ d->qid.type = UNPACK8(v); v += 1;
+ d->mode = UNPACK32(v); v += 4;
+ d->atime = UNPACK64(v); v += 8;
+ d->mtime = UNPACK64(v); v += 8;
+ d->length = UNPACK64(v); v += 8;
+ d->uid = UNPACK32(v); v += 4;
+ d->gid = UNPACK32(v); v += 4;
+ d->muid = UNPACK32(v); v += 4;
+ assert(v <= ev); /* NOTE(review): a short value is only detected after it has been read past */
+ if(k != ek)
+ broke(Efs);
+ if(v != ev)
+ broke(Efs);
+}
+/* dir2statbuf: marshal d into buf as a stat record (STATFIXLEN plus the four strings). Returns the record size, or -1 if it exceeds nbuf; raises Eperm if a user, group or muid cannot be resolved even via the fallback ids. */
+int
+dir2statbuf(Xdir *d, char *buf, int nbuf)
+{
+	int sz, nn, nu, ng, nm;
+	vlong atime, mtime;
+	User *u, *g, *m;
+	char *p;
+
+	rlock(&fs->userlk);
+	if((u = uid2user(d->uid)) == nil)
+		u = uid2user(noneid);
+	if((g = uid2user(d->gid)) == nil)
+		g = uid2user(nogroupid);	/* was assigned to u: clobbered the owner and left g nil */
+	if((m = uid2user(d->muid)) == nil)
+		m = uid2user(noneid);
+	if(u == nil || g == nil || m == nil){
+		runlock(&fs->userlk);	/* error() does not return, so drop the read lock first */
+		error(Eperm);
+	}
+	p = buf;
+	nn = strlen(d->name);
+	nu = strlen(u->name);
+	ng = strlen(g->name);
+	nm = strlen(m->name);
+	atime = (d->atime+Nsec/2)/Nsec;	/* divide by Nsec, rounding to nearest */
+	mtime = (d->mtime+Nsec/2)/Nsec;
+	sz = STATFIXLEN + nn + nu + ng + nm;
+	if(sz > nbuf){
+		runlock(&fs->userlk);
+		return -1;
+	}
+	PBIT16(p, sz-2);	p += 2;	/* the size field does not count its own two bytes */
+	PBIT16(p, -1 /*type*/);	p += 2;
+	PBIT32(p, -1 /*dev*/);	p += 4;
+	PBIT8(p, d->qid.type);	p += 1;
+	PBIT32(p, d->qid.vers);	p += 4;
+	PBIT64(p, d->qid.path);	p += 8;
+	PBIT32(p, d->mode);	p += 4;
+	PBIT32(p, atime);	p += 4;
+	PBIT32(p, mtime);	p += 4;
+	PBIT64(p, d->length);	p += 8;
+
+	PBIT16(p, nn);	p += 2;
+	memcpy(p, d->name, nn);	p += nn;
+	PBIT16(p, nu);	p += 2;
+	memcpy(p, u->name, nu);	p += nu;
+	PBIT16(p, ng);	p += 2;
+	memcpy(p, g->name, ng);	p += ng;
+	PBIT16(p, nm);	p += 2;
+	memcpy(p, m->name, nm);	p += nm;
+	assert(p - buf == sz);
+	runlock(&fs->userlk);
+	return sz;
+}
+/* kv2statbuf: decode kv with kv2dir and marshal the result with dir2statbuf; returns dir2statbuf's result. */
+int
+kv2statbuf(Kvp *kv, char *buf, int nbuf)
+{
+	Xdir dir;
+
+	kv2dir(kv, &dir);
+	return dir2statbuf(&dir, buf, nbuf);
+}
+/* kv2qid: read two consecutive 64-bit values from the start of kv->v into q->path and q->vers; q->type is not touched. */
+void
+kv2qid(Kvp *kv, Qid *q)
+{
+ char *v, *e;
+
+ v = kv->v;
+ e = v + kv->nv;
+ q->path = UNPACK64(v); v += 8; /* NOTE(review): packdval writes flag first and a 32-bit vers, so this is not the packdval layout -- confirm what values this is used on */
+ q->vers = UNPACK64(v); v += 8;
+ assert(v <= e);
+}
+/* kv2dlist: decode a dlist pair: the key carries gen and bgen after the tag byte, the value carries the hd and tl block pointers. */
+void
+kv2dlist(Kvp *kv, Dlist *dl)
+{
+ char *p, *e;
+
+ p = kv->k;
+ e = p + kv->nk;
+ p++; /* skip the tag byte; it is not checked here */
+ dl->gen = UNPACK64(p); p += 8;
+ dl->bgen = UNPACK64(p); p += 8;
+ assert(p <= e);
+
+ p = kv->v;
+ e = p + kv->nv;
+ dl->hd = unpackbp(p, e-p); p += Ptrsz;
+ dl->tl = unpackbp(p, e-p); p += Ptrsz;
+ assert(p <= e);
+}
+/* dlist2kv: pack dl into buf as key (Kdlist tag, gen, bgen) followed by value (hd and tl block pointers), and point kv at the two pieces. */
+void
+dlist2kv(Dlist *dl, Kvp *kv, char *buf, int nbuf)
+{
+ char *p, *e;
+
+ assert(nbuf >= Dlkvpsz); /* one up-front check covers the whole key+value */
+ p = buf;
+ e = buf+nbuf;
+
+ kv->k = p;
+ *p++ = Kdlist;
+ PACK64(p, dl->gen); p += 8;
+ PACK64(p, dl->bgen); p += 8;
+ kv->nk = (p - kv->k);
+
+ kv->v = p; /* the value starts directly after the key */
+ p = packbp(p, e-p, &dl->hd);
+ p = packbp(p, e-p, &dl->tl);
+ kv->nv = (p - kv->v);
+}
+/* tree2kv: pack tree t into buf as key packsnap(t->gen) followed by value packtree(t), and point kv at the two pieces. */
+void
+tree2kv(Tree *t, Kvp *kv, char *buf, int nbuf)
+{
+ char *p, *e;
+
+ p = buf;
+ e = buf+nbuf;
+
+ kv->k = p;
+ if((p = packsnap(p, e-p, t->gen)) == nil) /* packsnap in this file asserts on short buffers and never returns nil */
+ abort();
+ kv->nk = p - kv->k;
+
+ kv->v = p;
+ if((p = packtree(p, e-p, t)) == nil) /* likewise packtree never returns nil */
+ abort();
+ kv->nv = p - kv->v;
+}
+/* retag2kv: pack into buf the key packsnap(gen) followed by the value (64-bit link, dlbl byte, dref byte), and point kv at the two pieces. */
+void
+retag2kv(vlong gen, vlong link, int dlbl, int dref, Kvp *kv, char *buf, int nbuf)
+{
+	char *p;
+
+	assert(nbuf >= Snapsz + 8+1+1);	/* the old check covered only the value, not the key in front of it */
+	kv->k = buf;
+	if((p = packsnap(buf, nbuf, gen)) == nil)
+		abort();
+	kv->nk = p - buf;
+
+	kv->v = p;
+	PACK64(p, link);	p += 8;
+	*p = dlbl;	p += 1;
+	*p = dref;	p += 1;
+	kv->nv = p - kv->v;
+}
+/* lbl2kv: pack into buf the key (Klabel tag + label bytes, no terminator) and the value (Ksnap tag, 64-bit gen, 32-bit flg), and point kv at them. */
+void
+lbl2kv(char *lbl, vlong gen, uint flg, Kvp *kv, char *buf, int nbuf)
+{
+ char *p;
+ int n;
+
+ n = strlen(lbl);
+ assert(nbuf >= 1+n + 1+8+4); /* key bytes plus value bytes, checked before anything is written */
+
+ p = buf;
+ kv->k = p;
+ p[0] = Klabel; p += 1;
+ memcpy(p, lbl, n); p += n;
+ kv->nk = p - kv->k;
+
+ kv->v = p;
+ p[0] = Ksnap; p += 1;
+ PACK64(p, gen); p += 8;
+ PACK32(p, flg); p += 4;
+ kv->nv = p - kv->v;
+}
+/* packlbl: write a label key at p: Klabel tag followed by the bytes of name, without a terminator. Returns the position after the key. */
+char*
+packlbl(char *p, int sz, char *name)
+{
+	int len;
+
+	len = strlen(name);
+	assert(sz >= len+1);
+	*p++ = Klabel;
+	memcpy(p, name, len);
+	return p + len;
+}
+/* packsnap: write a snapshot key at p: Ksnap tag followed by the 64-bit id. Returns the position after the key. */
+char*
+packsnap(char *p, int sz, vlong id)
+{
+	assert(sz >= Snapsz);
+	*p++ = Ksnap;
+	PACK64(p, id);
+	return p + 8;
+}
+/* packbp: write block pointer *bp at p as three 64-bit fields: addr, hash, gen. Returns the position after them. */
+char*
+packbp(char *p, int sz, Bptr *bp)
+{
+	assert(sz >= Ptrsz);
+	PACK64(p, bp->addr);
+	PACK64(p+8, bp->hash);
+	PACK64(p+16, bp->gen);
+	return p + 24;
+}
+/* unpackbp: decode a block pointer (64-bit addr, hash, gen, in that order) from p and return it by value. */
+Bptr
+unpackbp(char *p, int sz)
+{
+ Bptr bp;
+
+ assert(sz >= Ptrsz);
+ bp.addr = UNPACK64(p); p += 8;
+ bp.hash = UNPACK64(p); p += 8;
+ bp.gen = UNPACK64(p);
+ return bp;
+}
+/* unpacktree: zero *t, then fill it from the packed form at p (same field order as packtree); returns t. */
+Tree*
+unpacktree(Tree *t, char *p, int sz)
+{
+ assert(sz >= Treesz);
+ memset(t, 0, sizeof(Tree)); /* fields that are not part of the packed form start out zero */
+ t->nref = UNPACK32(p); p += 4;
+ t->nlbl = UNPACK32(p); p += 4;
+ t->ht = UNPACK32(p); p += 4;
+ t->flag = UNPACK32(p); p += 4;
+ t->gen = UNPACK64(p); p += 8;
+ t->pred = UNPACK64(p); p += 8;
+ t->succ = UNPACK64(p); p += 8;
+ t->base = UNPACK64(p); p += 8;
+ t->bp.addr = UNPACK64(p); p += 8;
+ t->bp.hash = UNPACK64(p); p += 8;
+ t->bp.gen = UNPACK64(p); //p += 8;
+
+ return t;
+}
+/* packtree: serialize t at p: four 32-bit fields (nref, nlbl, ht, flag), four 64-bit fields (gen, pred, succ, base), then the root block pointer. Returns the position after it. */
+char*
+packtree(char *p, int sz, Tree *t)
+{
+ assert(sz >= Treesz);
+ PACK32(p, t->nref); p += 4;
+ PACK32(p, t->nlbl); p += 4;
+ PACK32(p, t->ht); p += 4;
+ PACK32(p, t->flag); p += 4;
+ PACK64(p, t->gen); p += 8;
+ PACK64(p, t->pred); p += 8;
+ PACK64(p, t->succ); p += 8;
+ PACK64(p, t->base); p += 8;
+ PACK64(p, t->bp.addr); p += 8;
+ PACK64(p, t->bp.hash); p += 8;
+ PACK64(p, t->bp.gen); p += 8;
+ return p;
+}
+/* packarena: serialize the arena header at p: log head addr and hash, size, used. loghd.gen is not stored. Returns the position after the last field. */
+char*
+packarena(char *p, int sz, Arena *a)
+{
+ char *e;
+
+ assert(sz >= Arenasz);
+ e = p + Arenasz;
+ PACK64(p, a->loghd.addr); p += 8; /* freelist addr */
+ PACK64(p, a->loghd.hash); p += 8; /* freelist hash */
+ PACK64(p, a->size); p += 8; /* arena size */
+ PACK64(p, a->used); p += 8; /* arena used */
+ assert(p <= e);
+ return p;
+}
+/* unpackarena: zero *a, then fill loghd, size and used from the packed header at p; returns the position after the last field read. */
+char*
+unpackarena(Arena *a, char *p, int sz)
+{
+ char *e;
+
+ assert(sz >= Arenasz);
+ memset(a, 0, sizeof(*a)); /* wipes the whole Arena, including anything the caller set before */
+
+ e = p + Arenasz;
+ a->loghd.addr = UNPACK64(p); p += 8;
+ a->loghd.hash = UNPACK64(p); p += 8;
+ a->loghd.gen = -1; p += 0; /* gen is not stored on disk; p += 0 only keeps the columns aligned */
+ a->size = UNPACK64(p); p += 8;
+ a->used = UNPACK64(p); p += 8;
+ a->logtl = nil;
+
+ assert(p <= e);
+ return p;
+}
+/* packsb: serialize the superblock for fi into the Blksz-byte buffer p0: magic, geometry, snapshot tree, snapshot dlist, counters, arena pointers, then a bufhash of all preceding bytes. Returns the position after the hash. */
+char*
+packsb(char *p0, int sz, Gefs *fi)
+{
+	uvlong h;
+	char *p;
+	int i;
+
+	assert(sz == Blksz);
+	assert(fi->narena < 256);	/* same limit unpacksb enforces, so nothing unloadable is ever written */
+	p = p0;
+	memcpy(p, "gefs9.00", 8);	p += 8;
+	PACK32(p, Blksz);	p += 4;
+	PACK32(p, Bufspc);	p += 4;
+	PACK32(p, fi->narena);	p += 4;
+	PACK32(p, fi->snap.ht);	p += 4;
+	PACK64(p, fi->snap.bp.addr);	p += 8;
+	PACK64(p, fi->snap.bp.hash);	p += 8;
+	PACK64(p, fi->snapdl.hd.addr);	p += 8;
+	PACK64(p, fi->snapdl.hd.hash);	p += 8;
+	PACK64(p, fi->snapdl.tl.addr);	p += 8;
+	PACK64(p, fi->snapdl.tl.hash);	p += 8;
+	PACK64(p, fi->flag);	p += 8;
+	PACK64(p, fi->nextqid);	p += 8;
+	PACK64(p, fi->nextgen);	p += 8;
+	PACK64(p, fi->qgen);	p += 8;
+	for(i = 0; i < fi->narena; i++){
+		PACK64(p, fi->arenabp[i].addr);	p += 8;
+		PACK64(p, fi->arenabp[i].hash);	p += 8;
+	}
+	h = bufhash(p0, p - p0);	/* checksum covers everything written so far */
+	PACK64(p, h);	p += 8;
+	return p;
+}
+
+char*
+unpacksb(Gefs *fi, char *p0, int sz)
+{
+ uvlong dh, xh;
+ char *p;
+ int i;
+
+ assert(sz == Blksz);
+ p = p0;
+ if(memcmp(p, "gefs9.00", 8) != 0)
+ error("%s %.8s", Efsvers, p);
+ p += 8;
+ fi->blksz = UNPACK32(p); p += 4;
+ fi->bufspc = UNPACK32(p); p += 4;
+ fi->narena = UNPACK32(p); p += 4;
+ fi->snap.ht = UNPACK32(p); p += 4;
+ fi->snap.bp.addr = UNPACK64(p); p += 8;
+ fi->snap.bp.hash = UNPACK64(p); p += 8;
+ fi->snap.bp.gen = -1; p += 0;
+ fi->snapdl.hd.addr = UNPACK64(p); p += 8;
+ fi->snapdl.hd.hash = UNPACK64(p); p += 8;
+ fi->snapdl.hd.gen = -1; p += 0;
+ fi->snapdl.gen = -1; p += 0;
+ fi->snapdl.tl.addr = UNPACK64(p); p += 8;
+ fi->snapdl.tl.hash = UNPACK64(p); p += 8;
+ fi->snapdl.hd.gen = -1; p += 0;
+ fi->snapdl.gen = -1; p += 0;
+ fi->flag = UNPACK64(p); p += 8;
+ fi->nextqid = UNPACK64(p); p += 8;
+ fi->nextgen = UNPACK64(p); p += 8;
+ fi->qgen = UNPACK64(p); p += 8;
+ fi->arenabp = emalloc(fi->narena * sizeof(Bptr), 0);
+ for(i = 0; i < fi->narena; i++){
+ fi->arenabp[i].addr = UNPACK64(p); p += 8;
+ fi->arenabp[i].hash = UNPACK64(p); p += 8;
+ fi->arenabp[i].gen = -1;
+ }
+ xh = bufhash(p0, p - p0);
+ dh = UNPACK64(p); p += 8;
+ if(dh != xh)
+ error("corrupt superblock: %llx != %llx", dh, xh);
+ assert(fi->narena < 256); /* should be more than anyone needs */
+ return p;
+}
--- /dev/null
+++ b/ream.c
@@ -1,0 +1,460 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/* Qid paths handed out to the files that reaming creates. */
+enum {
+ Qmainroot, /* root directory of the main tree (initroot) */
+ Qadmroot, /* root directory of the adm tree (initadm) */
+ Qadmuser, /* the "users" file in the adm tree (initadm) */
+ Nreamqid, /* count of the above; reamfs stores it in fs->nextqid */
+};
+
+/*
+ * Initializes d as a directory entry called name with the given qid
+ * path, qid type and mode.  The qid version, times, length and muid
+ * are zero; uid and gid are -1.  The name pointer is stored, not
+ * copied.
+ */
+static void
+fillxdir(Xdir *d, vlong qid, char *name, int type, int mode)
+{
+	memset(d, 0, sizeof(*d));
+	d->name = name;
+	d->mode = mode;
+	d->qid = (Qid){qid, 0, type};
+	d->length = 0;
+	d->atime = 0;
+	d->mtime = 0;
+	d->muid = 0;
+	d->uid = -1;
+	d->gid = -1;
+}
+
+/*
+ * Fills the leaf block r with the initial contents of the adm tree:
+ * a data pointer to the users file block u, the directory entries
+ * for "users" and for the adm root, and the super entry for the adm
+ * root.  nu is stored as the length of the users file.
+ */
+static void
+initadm(Blk *r, Blk *u, int nu)
+{
+ char *p, kbuf[Keymax], vbuf[Inlmax];
+ Kvp kv;
+ Xdir d;
+
+ /* nb: values must be inserted in key order */
+ /*
+ * Kdat key for Qadmuser with a zero second field (presumably
+ * the file offset); the value is the packed pointer to u.
+ */
+ kv.k = kbuf;
+ kv.nk = Offksz;
+ kv.v = vbuf;
+ kv.nv = Ptrsz;
+ kbuf[0] = Kdat;
+ PACK64(kbuf+1, (uvlong)Qadmuser);
+ PACK64(kbuf+9, 0ULL);
+ packbp(kv.v, kv.nv, &u->bp);
+ setval(r, &kv);
+
+ /* directory entry for "users", filed under Qadmroot */
+ fillxdir(&d, Qadmuser, "users", QTFILE, 0664);
+ d.length = nu;
+ dir2kv(Qadmroot, &d, &kv, vbuf, sizeof(vbuf));
+ setval(r, &kv);
+ /* directory entry for the adm root itself, filed under -1 */
+ fillxdir(&d, Qadmroot, "", QTDIR, DMDIR|0775);
+ dir2kv(-1, &d, &kv, vbuf, sizeof(vbuf));
+ setval(r, &kv);
+
+ /* super key for Qadmroot, whose value is the dkey (-1, "") */
+ p = packsuper(kbuf, sizeof(kbuf), Qadmroot);
+ kv.k = kbuf;
+ kv.nk = p - kbuf;
+ p = packdkey(vbuf, sizeof(vbuf), -1, "");
+ kv.v = vbuf;
+ kv.nv = p - vbuf;
+ setval(r, &kv);
+}
+
+/*
+ * Fills the leaf block r with the initial contents of the main tree:
+ * the directory entry for the root and the super entry for it.
+ */
+static void
+initroot(Blk *r)
+{
+ char *p, kbuf[Keymax], vbuf[Inlmax];
+ Kvp kv;
+ Xdir d;
+
+ /* nb: values must be inserted in key order */
+ /* directory entry for the root, filed under -1 */
+ fillxdir(&d, Qmainroot, "", QTDIR, DMDIR|0775);
+ dir2kv(-1, &d, &kv, vbuf, sizeof(vbuf));
+ setval(r, &kv);
+
+ /* super key for Qmainroot, whose value is the dkey (-1, "") */
+ p = packsuper(kbuf, sizeof(kbuf), Qmainroot);
+ kv.k = kbuf;
+ kv.nk = p - kbuf;
+ p = packdkey(vbuf, sizeof(vbuf), -1, "");
+ kv.v = vbuf;
+ kv.nv = p - vbuf;
+ setval(r, &kv);
+}
+
+/*
+ * Fills the leaf block s with the initial snapshot tree: three labels
+ * ("adm", "empty", "main") followed by three snapshot records with
+ * ids 0, 1 and 2.  r is the main tree's root block and a is the adm
+ * tree's root block.
+ */
+static void
+initsnap(Blk *s, Blk *r, Blk *a)
+{
+ char *p, *e, buf[Kvmax];
+ Tree t;
+ Kvp kv;
+
+ /* labels: adm -> 1, empty -> 0, main -> 2 */
+ lbl2kv("adm", 1, Lmut|Ltsnap, &kv, buf, sizeof(buf));
+ setval(s, &kv);
+ lbl2kv("empty", 0, 0, &kv, buf, sizeof(buf));
+ setval(s, &kv);
+ lbl2kv("main", 2, Lmut|Ltsnap, &kv, buf, sizeof(buf));
+ setval(s, &kv);
+
+ /*
+ * Each record below packs its key and then its value back to
+ * back in buf.  The gens come from fs->nextgen++; presumably
+ * nextgen starts at 0 so that they match the ids passed to
+ * packsnap -- confirm against the caller.
+ */
+ p = buf;
+ e = p + sizeof(buf);
+
+ /* empty */
+ kv.k = p;
+ p = packsnap(buf, e - p, 0);
+ kv.nk = p - kv.k;
+ kv.v = p;
+ memset(&t, 0, sizeof(Tree));
+ t.flag = 0;
+ t.nref = 2;
+ t.nlbl = 1;
+ t.ht = 1;
+ t.gen = fs->nextgen++;
+ t.pred = 0;
+ t.succ = 2;
+ t.bp = r->bp;
+ p = packtree(p, e - p, &t);
+ kv.nv = p - kv.v;
+ setval(s, &kv);
+
+ p = buf;
+ e = p + sizeof(buf);
+
+ /* adm */
+ kv.k = p;
+ p = packsnap(p, e - p, 1);
+ kv.nk = p - kv.k;
+ kv.v = p;
+ memset(&t, 0, sizeof(Tree));
+ t.nref = 0;
+ t.nlbl = 1;
+ t.ht = 1;
+ t.gen = fs->nextgen++;
+ t.pred = 0;
+ t.succ = -1;
+ t.bp = a->bp;
+ p = packtree(p, e - p, &t);
+ kv.nv = p - kv.v;
+ setval(s, &kv);
+
+ p = buf;
+ e = p + sizeof(buf);
+
+ /* main */
+ kv.k = p;
+ p = packsnap(buf, e - p, 2);
+ kv.nk = p - kv.k;
+ kv.v = p;
+ memset(&t, 0, sizeof(Tree));
+ t.nref = 0;
+ t.nlbl = 1;
+ t.ht = 1;
+ t.gen = fs->nextgen++;
+ t.pred = 0;
+ t.succ = -1;
+ t.bp = r->bp;
+ p = packtree(p, e - p, &t);
+ kv.nv = p - kv.v;
+ setval(s, &kv);
+}
+
+/*
+ * Writes the initial on-disk state of an arena and records it in a.
+ *
+ * hdaddr is the address of the arena's first header block and asz is
+ * the arena size in bytes.  The two blocks at hdaddr and
+ * hdaddr+Blksz hold identical copies of the packed arena header.
+ * The block after them holds the first allocation log block, which
+ * says: the range starting there of asz-2*Blksz bytes is free, the
+ * log block itself (Blksz bytes) is allocated, then a sync barrier.
+ *
+ * On return a->loghd, a->size, a->used, a->h0 and a->h1 are set; the
+ * header blocks stay referenced through a->h0 and a->h1.
+ */
+static void
+initarena(Arena *a, uvlong hdaddr, vlong asz)
+{
+	Blk *b, *h0, *h1;
+	uvlong addr;
+	char *p;
+
+	b = cachepluck();
+
+	addr = hdaddr+2*Blksz;	/* leave room for arena hdr */
+
+	a->loghd.addr = -1;
+	a->loghd.hash = -1;
+	a->loghd.gen = -1;
+
+	memset(b->buf, 0, sizeof(b->buf));
+	b->type = Tlog;
+	b->bp.addr = addr;
+	b->logsz = 0;
+	b->logp = (Bptr){-1, -1, -1};
+	b->data = b->buf + Loghdsz;
+	setflag(b, Bdirty, 0);
+
+	p = b->buf + Loghdsz;
+	PACK64(p, addr|LogFree);	p += 8;	/* addr */
+	PACK64(p, asz-2*Blksz);		p += 8;	/* len */
+	PACK64(p, b->bp.addr|LogAlloc);	p += 8;	/* addr */
+	PACK64(p, Blksz);		p += 8;	/* len */
+	PACK64(p, (uvlong)LogSync);	p += 8;	/* barrier */
+	b->logsz = p - b->data;
+	finalize(b);
+	syncblk(b);
+
+	/* copy the log head out before giving up our reference to b */
+	a->loghd = b->bp;
+	a->loghd.gen = -1;
+	a->size = asz;
+	a->used = Blksz;
+	dropblk(b);
+
+	h0 = cachepluck();
+	h1 = cachepluck();
+
+	memset(h0->buf, 0, sizeof(h0->buf));
+	h0->type = Tarena;
+	h0->bp.addr = hdaddr;
+	h0->data = h0->buf+2;
+	packarena(h0->data, Arenasz, a);
+	finalize(h0);
+	syncblk(h0);
+	a->h0 = h0;
+
+	memset(h1->buf, 0, sizeof(h1->buf));
+	h1->type = Tarena;
+	h1->bp.addr = hdaddr+Blksz;
+	h1->data = h1->buf+2;
+	packarena(h1->data, Arenasz, a);
+	finalize(h1);
+	syncblk(h1);
+	a->h1 = h1;
+}
+
+/*
+ * Creates a new, empty file system on dev.
+ *
+ * The device is split into between 8 and 32 equally sized arenas,
+ * with a superblock in the first block and a backup superblock in
+ * the last whole block.  Each arena is initialized and loaded, then
+ * the main tree root, the adm tree root (with a users file naming
+ * reamuser) and the snapshot tree are allocated from them and
+ * written out, followed by the arena headers and both superblocks.
+ *
+ * Any failure is fatal: errors raised by callees are turned into
+ * sysfatal by the handler installed at the top.
+ */
+void
+reamfs(char *dev)
+{
+	Blk *sb0, *sb1, *tb, *mb, *ab, *ub;
+	vlong sz, asz, off;
+	Mount *mnt, *adm;
+	Arena *a;
+	char *utab;
+	Dir *d;
+	int i, nu;
+
+	if(waserror())
+		sysfatal("ream %s: %s\n", dev, errmsg());
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("ream: %r");
+	sz = d->length;
+	free(d);
+
+	print("reaming %s\n", dev);
+	if(sz < 128*MiB+Blksz)
+		sysfatal("ream: disk too small");
+	/* emalloc throughout, so an allocation failure raises instead of crashing */
+	mnt = emalloc(sizeof(Mount), 1);
+	mnt->root = emalloc(sizeof(Tree), 1);
+	adm = emalloc(sizeof(Mount), 1);
+	adm->root = emalloc(sizeof(Tree), 1);
+
+	/* round down to whole blocks and set aside the two superblocks */
+	sz = sz - sz%Blksz - 2*Blksz;
+	fs->narena = (sz + 4096ULL*GiB - 1) / (4096ULL*GiB);
+	if(fs->narena < 8)
+		fs->narena = 8;
+	if(fs->narena >= 32)
+		fs->narena = 32;
+	fs->arenas = emalloc(fs->narena*sizeof(Arena), 1);
+
+	/* arenas start after the primary superblock; each has two header blocks */
+	off = Blksz;
+	asz = sz/fs->narena;
+	asz = asz - (asz % Blksz) - 2*Blksz;
+
+	sb0 = cachepluck();
+	sb1 = cachepluck();
+	sb0->bp = (Bptr){0, -1, -1};
+	sb1->bp = (Bptr){sz+Blksz, -1, -1};
+
+	fs->arenabp = emalloc(fs->narena * sizeof(Bptr), 1);
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		print("\tarena %d: %lld blocks at %llx\n", i, asz/Blksz, off);
+		initarena(a, off, asz);
+		fs->arenabp[i] = a->h0->bp;
+		off += asz+2*Blksz;
+	}
+
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadarena(a, a->h0->bp);
+		loadlog(a, a->loghd);
+	}
+
+	if((mb = newblk(mnt->root, Tleaf)) == nil)
+		sysfatal("ream: allocate root: %r");
+	holdblk(mb);
+	initroot(mb);
+	finalize(mb);
+	syncblk(mb);
+
+	mnt->root->ht = 1;
+	mnt->root->bp = mb->bp;
+
+	if((ab = newblk(adm->root, Tleaf)) == nil)
+		sysfatal("ream: allocate root: %r");
+	if((ub = newdblk(adm->root, 0, 1)) == nil)
+		sysfatal("ream: allocate root: %r");
+	holdblk(ab);
+	holdblk(ub);
+	utab = smprint(
+		"-1:adm::%s\n"
+		"0:none::\n"
+		"1:%s:%s:\n",
+		reamuser, reamuser, reamuser);
+	if(utab == nil)
+		error(Enomem);
+	nu = strlen(utab);
+	memcpy(ub->data, utab, nu);
+	free(utab);
+	finalize(ub);
+	syncblk(ub);
+	initadm(ab, ub, nu);
+	finalize(ab);
+	syncblk(ab);
+
+	adm->root->ht = 1;
+	adm->root->bp = ab->bp;
+
+	/*
+	 * Now that we have a completely empty fs, give it
+	 * a single snap block that the tree will insert
+	 * into, and take a snapshot as the initial state.
+	 */
+	if((tb = newblk(mnt->root, Tleaf)) == nil)
+		sysfatal("ream: allocate snaps: %r");
+	holdblk(tb);
+	initsnap(tb, mb, ab);
+	finalize(tb);
+	syncblk(tb);
+
+	fs->snap.bp = tb->bp;
+	fs->snap.ht = 1;
+	fs->snapdl.hd.addr = -1;
+	fs->snapdl.hd.hash = -1;
+	fs->snapdl.tl.addr = -1;
+	fs->snapdl.tl.hash = -1;
+	fs->nextqid = Nreamqid;
+
+	/*
+	 * Each of these blocks is dropped twice in this function,
+	 * here and again after the arenas are written; presumably
+	 * once for the newblk reference and once for the holdblk
+	 * above -- confirm before changing either set.
+	 */
+	dropblk(mb);
+	dropblk(ab);
+	dropblk(ub);
+	dropblk(tb);
+
+	/*
+	 * We need to write back all of the arenas
+	 * with the updated free lists
+	 */
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		finalize(a->logtl);
+		syncblk(a->logtl);
+		packarena(a->h0->data, Blksz, a);
+		finalize(a->h0);
+		syncblk(a->h0);
+		packarena(a->h1->data, Blksz, a);
+		finalize(a->h1);
+		syncblk(a->h1);
+		fs->arenabp[i] = a->h0->bp;
+		dropblk(a->h0);
+		dropblk(a->h1);
+	}
+
+	dropblk(mb);
+	dropblk(ab);
+	dropblk(ub);
+	dropblk(tb);
+
+	/*
+	 * Finally, write back the superblock and backup
+	 * superblock.
+	 */
+	packsb(sb0->buf, Blksz, fs);
+	packsb(sb1->buf, Blksz, fs);
+	finalize(sb0);
+	finalize(sb1);
+	syncblk(sb0);
+	syncblk(sb1);
+	dropblk(sb0);
+	dropblk(sb1);
+
+	/* the scratch mounts were only needed to allocate blocks */
+	free(adm->root);
+	free(adm);
+	free(mnt->root);
+	free(mnt);
+	poperror();
+}
+
+/*
+ * Extends the file system on dev to cover space added after its
+ * last arena.
+ *
+ * The superblock at address 0 and all existing arenas are loaded,
+ * then the new space (which must be at least 64 MiB) is divided into
+ * four new arenas.  Each new arena is initialized, loaded and has
+ * its headers written.  The updated superblock is then written
+ * twice: at address 0 and at the last whole block of the device.
+ *
+ * Any failure is fatal.
+ */
+void
+growfs(char *dev)
+{
+	vlong oldsz, newsz, asz, off, eb;
+	int i, narena;
+	Arena *a, *arenas;
+	Bptr bp, *arenabp;
+	Dir *d;
+
+	if(waserror())
+		sysfatal("grow %s: %s\n", dev, errmsg());
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("grow: %r");
+
+	bp = (Bptr){0, -1, -1};
+	fs->sb0 = getblk(bp, GBnochk);
+	unpacksb(fs, fs->sb0->buf, Blksz);
+	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
+		sysfatal("malloc: %r");
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadarena(a, fs->arenabp[i]);
+		fs->arenabp[i] = a->h0->bp;
+	}
+	/* the old end of the arenas is where the last one stops */
+	a = &fs->arenas[fs->narena-1];
+	oldsz = a->h0->bp.addr + a->size + 2*Blksz;
+	newsz = d->length - d->length%Blksz - 2*Blksz;
+	if(newsz - oldsz < 64*MiB)
+		sysfatal("new arenas too small (%lld < %lld), not growing", newsz - oldsz, 64*MiB);
+	asz = (newsz - oldsz)/4;
+	asz = asz - asz % Blksz - 2*Blksz;
+	narena = fs->narena + 4;
+	assert(oldsz % Blksz == 0);
+	/* keep the old arrays reachable if realloc fails */
+	if((arenas = realloc(fs->arenas, narena*sizeof(Arena))) == nil)
+		error(Enomem);
+	fs->arenas = arenas;
+	if((arenabp = realloc(fs->arenabp, narena*sizeof(Bptr))) == nil)
+		error(Enomem);
+	fs->arenabp = arenabp;
+	/*
+	 * realloc leaves the added slots uninitialized; zero them so
+	 * the new arenas start from the same state as the ones reamfs
+	 * allocates zeroed.
+	 */
+	memset(&fs->arenas[fs->narena], 0, (narena - fs->narena)*sizeof(Arena));
+
+	off = oldsz;
+	for(i = fs->narena; i < narena; i++){
+		a = &fs->arenas[i];
+		print("\tnew arena %d: adding %lld blocks at %llx\n", i, asz/Blksz, off);
+		initarena(a, off, asz);
+		loadarena(a, a->h0->bp);
+		loadlog(a, a->loghd);
+		packarena(a->h0->data, Blksz, a);
+		packarena(a->h1->data, Blksz, a);
+		finalize(a->h0);
+		finalize(a->h1);
+		syncblk(a->h0);
+		syncblk(a->h1);
+
+		fs->arenabp[i] = a->h0->bp;
+		off += asz+2*Blksz;
+	}
+	fs->narena = narena;
+	packsb(fs->sb0->buf, Blksz, fs);
+	finalize(fs->sb0);
+	syncblk(fs->sb0);
+	/*
+	 * We're being a bit tricksy here: because we're on a bigger
+	 * partition, we don't know where the end is; just load the
+	 * first block, and patch the address in to the right place
+	 * when we write it back.
+	 */
+	eb = d->length;
+	eb = eb - (eb%Blksz) - Blksz;
+	fs->sb0->bp = (Bptr){eb, -1, -1};
+	packsb(fs->sb0->buf, Blksz, fs);
+	finalize(fs->sb0);
+	syncblk(fs->sb0);
+	free(d);
+	poperror();
+}
--- /dev/null
+++ b/snap.c
@@ -1,0 +1,613 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "atomic.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Writes out the block a deadlist is currently appending to, if any,
+ * and records the deadlist in the snapshot tree.  Does nothing when
+ * dl has no insertion block.
+ */
+static void
+dlflush(Dlist *dl)
+{
+ char kvbuf[512];
+ Msg m;
+
+ if(dl->ins == nil)
+ return;
+ traceb("dlflush", dl->ins->bp);
+ enqueue(dl->ins);
+ dropblk(dl->ins);
+ /* the flushed block becomes the head; refresh tl if it is the same block */
+ dl->hd = dl->ins->bp;
+ if(dl->tl.addr == dl->hd.addr)
+ dl->tl = dl->hd;
+ dl->ins = nil;
+ /* special case: the snap dlist has gen -1, skip it */
+ if(dl->gen != -1){
+ m.op = Oinsert;
+ dlist2kv(dl, &m, kvbuf, sizeof(kvbuf));
+ btupsert(&fs->snap, &m, 1);
+ }
+}
+
+/*
+ * Takes dl off the deadlist cache's doubly linked list (dlhead,
+ * dltail, cnext, cprev).  When hdel is nonzero, the entry with dl's
+ * (gen, bgen) is also unlinked from its hash chain first.
+ */
+static void
+dlcachedel(Dlist *dl, int hdel)
+{
+	Dlist *cur, **link;
+	uint hash;
+
+	hash = ihash(dl->gen) ^ ihash(dl->bgen);
+	if(hdel){
+		/* walk the bucket with a pointer to the link to rewrite */
+		link = &fs->dlcache[hash % fs->dlcmax];
+		while((cur = *link) != nil){
+			if(cur->gen == dl->gen && cur->bgen == dl->bgen){
+				*link = cur->chain;
+				break;
+			}
+			link = &cur->chain;
+		}
+	}
+	if(fs->dlhead == dl)
+		fs->dlhead = dl->cnext;
+	if(fs->dltail == dl)
+		fs->dltail = dl->cprev;
+	if(dl->cnext != nil)
+		dl->cnext->cprev = dl->cprev;
+	if(dl->cprev != nil)
+		dl->cprev->cnext = dl->cnext;
+	dl->cprev = nil;
+	dl->cnext = nil;
+}
+
+/*
+ * Looks up the cached deadlist for (gen, bgen).  A hit is taken off
+ * the cache's linked list (but left in its hash chain) and returned;
+ * a miss returns nil.
+ */
+static Dlist*
+dlcacheget(vlong gen, vlong bgen)
+{
+	Dlist *dl;
+	uint h;
+
+	h = ihash(gen) ^ ihash(bgen);
+	dl = fs->dlcache[h % fs->dlcmax];
+	while(dl != nil){
+		if(dl->gen == gen && dl->bgen == bgen){
+			dlcachedel(dl, 0);
+			return dl;
+		}
+		dl = dl->chain;
+	}
+	return nil;
+}
+
+/*
+ * Returns the deadlist for (gen, bgen): from the cache if present,
+ * otherwise loaded from the snapshot tree, otherwise created empty
+ * and inserted into the snapshot tree.  A deadlist that was not
+ * cached is added to its hash chain before being returned.
+ */
+static Dlist*
+getdl(vlong gen, vlong bgen)
+{
+ char kbuf[Dlksz], kvbuf[Dlkvpsz];
+ Dlist *dl, **p;
+ uint h;
+ Msg m;
+ Kvp kv;
+ Key k;
+
+ if((dl = dlcacheget(gen, bgen)) != nil)
+ return dl;
+ dl = emalloc(sizeof(Dlist), 1);
+ /* don't leak dl if the lookup or insert below raises */
+ if(waserror()){
+ free(dl);
+ nexterror();
+ }
+ kbuf[0] = Kdlist;
+ PACK64(kbuf+1, gen);
+ PACK64(kbuf+9, bgen);
+ k.k = kbuf;
+ k.nk = sizeof(kbuf);
+
+ /* load up existing dlist */
+ if(btlookup(&fs->snap, &k, &kv, kvbuf, sizeof(kvbuf))){
+ kv2dlist(&kv, dl);
+ goto Found;
+ }
+
+ /* create a new one if it didn't exist */
+ dl->gen = gen;
+ dl->bgen = bgen;
+ dl->hd.addr = -1;
+ dl->tl.addr = -1;
+ dl->ins = nil;
+
+ m.op = Oinsert;
+ dlist2kv(dl, &m, kvbuf, sizeof(kvbuf));
+ btupsert(&fs->snap, &m, 1);
+Found:
+ poperror();
+ /* push onto the front of the hash chain */
+ h = ihash(gen) ^ ihash(bgen);
+ p = &fs->dlcache[h % fs->dlcmax];
+ dl->chain = *p;
+ *p = dl;
+ return dl;
+}
+
+/*
+ * Returns a deadlist obtained from getdl to the cache, making it the
+ * head of the cache's linked list.  While the cache is over its
+ * limit, entries are flushed and freed from the tail first.
+ *
+ * A nil dl is ignored: mergedl's error handler calls this with
+ * deadlists it may not have loaded yet.  The snapshot deadlist
+ * (gen -1) is never cached and is ignored as well.
+ */
+void
+putdl(Dlist *dl)
+{
+	Dlist *dt;
+
+	if(dl == nil || dl->gen == -1)
+		return;
+	/* make sure dl is off the list so it can't be evicted below */
+	dlcachedel(dl, 0);
+	while(fs->dltail != nil && fs->dlcount >= fs->dlcmax){
+		dt = fs->dltail;
+		dlflush(dt);
+		dlcachedel(dt, 1);
+		dropblk(dt->ins);
+		free(dt);
+	}
+
+	dl->cprev = nil;
+	dl->cnext = fs->dlhead;
+	if(fs->dltail == nil)
+		fs->dltail = dl;
+	if(fs->dlhead != nil)
+		fs->dlhead->cprev = dl;
+	fs->dlhead = dl;
+}
+
+/*
+ * Deletes the deadlist dl: removes its record from the snapshot tree
+ * (unless it is the snapshot deadlist, gen -1) and queues every block
+ * in its chain to be freed.  When docontents is nonzero, the block
+ * addresses recorded inside each deadlist block are queued to be
+ * freed too.
+ */
+void
+freedl(Dlist *dl, int docontents)
+{
+ char buf[Kvmax];
+ Arena *a;
+ Qent qe;
+ Bptr bp;
+ Msg m;
+ Blk *b;
+ char *p;
+
+ bp = dl->hd;
+ if(dl->gen != -1){
+ m.op = Odelete;
+ dlist2kv(dl, &m, buf, sizeof(buf));
+ btupsert(&fs->snap, &m, 1);
+ }
+ /* follow the chain from the head through each block's logp */
+ while(bp.addr != -1){
+ b = getblk(bp, 0);
+ /*
+ * Because these deadlists are dead-dead at this point,
+ * they'll never be read from again; we can avoid worrying
+ * about deferred reclamation, and queue them up to be freed
+ * directly, which means we don't need to worry about waiting
+ * for a quiescent state, and the associated out-of-block
+ * deadlocks that come with it.
+ */
+ if(docontents){
+ /* entries are 8-byte addresses; only the address is known */
+ for(p = b->data; p != b->data+b->logsz; p += 8){
+ qe.op = Qfree;
+ qe.bp.addr = UNPACK64(p);
+ qe.bp.hash = -1;
+ qe.bp.gen = -1;
+ qe.b = nil;
+ a = getarena(qe.bp.addr);
+ qput(a->sync, qe);
+ traceb("dlclear", qe.bp);
+ }
+ }
+ /* save the link, then free the deadlist block itself */
+ bp = b->logp;
+ qe.op = Qfree;
+ qe.bp = b->bp;
+ qe.b = nil;
+ a = getarena(qe.bp.addr);
+ qput(a->sync, qe);
+ traceb("dlfreeb", qe.bp);
+ dropblk(b);
+ }
+}
+
+/*
+ * Moves the contents of the deadlist (gen, bgen) into the deadlist
+ * (merge, bgen), then deletes the source record and rewrites the
+ * destination record in the snapshot tree.
+ */
+static void
+mergedl(vlong merge, vlong gen, vlong bgen)
+{
+ char buf[2][Kvmax];
+ Dlist *d, *m;
+ Msg msg[2];
+ Blk *b;
+
+ d = nil;
+ m = nil;
+ /*
+ * NOTE(review): d and/or m are still nil here if getdl raises,
+ * and putdl reads its argument's gen -- confirm putdl
+ * tolerates nil.
+ */
+ if(waserror()){
+ putdl(m);
+ putdl(d);
+ nexterror();
+ }
+ d = getdl(merge, bgen);
+ m = getdl(gen, bgen);
+ assert(d != m);
+ /*
+ * If the dest dlist didn't exist,
+ * just move the merge dlist over
+ * and be done with it, otherwise
+ * chain onto the existing dlist
+ * tail.
+ */
+ if(d->hd.addr == -1){
+ assert(d->ins == nil);
+ d->hd = m->hd;
+ d->tl = m->tl;
+ d->ins = m->ins;
+ if(d->ins != nil)
+ holdblk(d->ins);
+ }else{
+ /* get the source's pending block queued before linking to it */
+ if(m->ins != nil){
+ enqueue(m->ins);
+ dropblk(m->ins);
+ m->ins = nil;
+ }
+ /*
+ * Point the destination's tail block at the source's head.
+ * NOTE(review): d->tl is not advanced to m->tl here --
+ * confirm this is intended.
+ */
+ b = getblk(d->tl, 0);
+ b->logp = m->hd;
+ assert(d->hd.addr != m->hd.addr);
+ finalize(b);
+ syncblk(b);
+ dropblk(b);
+ }
+ msg[0].op = Odelete;
+ dlist2kv(m, &msg[0], buf[0], sizeof(buf[0]));
+ msg[1].op = Oinsert;
+ dlist2kv(d, &msg[1], buf[1], sizeof(buf[1]));
+ btupsert(&fs->snap, msg, 2);
+ putdl(m);
+ putdl(d);
+ poperror();
+}
+
+/*
+ * Disposes of the deadlists of the snapshot gen, which is being
+ * deleted.  succ is the generation passed in as its successor (-1
+ * for none) and prev the generation of its predecessor.
+ *
+ * Each deadlist of gen whose bgen is at or below prev is merged into
+ * succ's deadlist (or prev's, when there is no successor); the others
+ * are freed along with the blocks they list.  When there is a
+ * successor, its deadlists with bgen above prev are then freed the
+ * same way.
+ */
+static void
+reclaimblocks(vlong gen, vlong succ, vlong prev)
+{
+ char pfx[9];
+ Dlist dl;
+ Scan s;
+
+ /* scan every deadlist record whose key starts with Kdlist, gen */
+ pfx[0] = Kdlist;
+ PACK64(pfx+1, gen);
+ btnewscan(&s, pfx, sizeof(pfx));
+ btenter(&fs->snap, &s);
+ while(1){
+ if(!btnext(&s, &s.kv))
+ break;
+ kv2dlist(&s.kv, &dl);
+
+ if(succ != -1 && dl.bgen <= prev)
+ mergedl(succ, dl.gen, dl.bgen);
+ else if(dl.bgen <= prev)
+ mergedl(prev, dl.gen, dl.bgen);
+ else
+ freedl(&dl, 1);
+ }
+ btexit(&s);
+ if(succ != -1){
+ /* same scan over the successor's deadlists */
+ pfx[0] = Kdlist;
+ PACK64(pfx+1, succ);
+ btnewscan(&s, pfx, sizeof(pfx));
+ btenter(&fs->snap, &s);
+ while(1){
+ if(!btnext(&s, &s.kv))
+ break;
+ kv2dlist(&s.kv, &dl);
+ if(dl.bgen > prev)
+ freedl(&dl, 1);
+ }
+ btexit(&s);
+ }
+}
+
+/*
+ * Removes a label from a snapshot, allowing
+ * it to be reclaimed if it is not a direct
+ * predecessor of more than one other snapshot.
+ *
+ * If it has one successor and no label, then
+ * it will be merged with that successor.
+ */
+void
+delsnap(Tree *t, vlong succ, char *name)
+{
+ char *p, buf[4][Kvmax];
+ int nm, deltree;
+ Mount *mnt;
+ Msg m[4];
+
+ nm = 0;
+ deltree = 0;
+ if(name != nil){
+ /* these labels may not be removed */
+ if(strcmp(name, "dump") == 0
+ || strcmp(name, "empty") == 0
+ || strcmp(name, "adm") == 0)
+ error(Ename);
+
+ /* queue deletion of the label record */
+ m[nm].op = Odelete;
+ m[nm].k = buf[nm];
+ p = packlbl(buf[nm], sizeof(buf[nm]), name);
+ m[nm].nk = p - m[nm].k;
+ m[nm].v = nil;
+ m[nm].nv = 0;
+ t->nlbl--;
+ nm++;
+ }
+
+ if(t->nlbl == 0 && t->nref <= 1){
+ deltree = 1;
+ /* relink the predecessor to succ */
+ m[nm].op = Orelink;
+ retag2kv(t->pred, succ, 0, 0, &m[nm], buf[nm], sizeof(buf[nm]));
+ nm++;
+ /* and point the successor's pred at our predecessor */
+ if(t->succ != -1){
+ m[nm].op = Oreprev;
+ retag2kv(t->succ, t->pred, 0, 0, &m[nm], buf[nm], sizeof(buf[nm]));
+ nm++;
+ }
+ /* queue deletion of the snapshot record itself */
+ m[nm].op = Odelete;
+ m[nm].k = buf[nm];
+ p = packsnap(buf[nm], sizeof(buf[nm]), t->gen);
+ m[nm].nk = p - m[nm].k;
+ m[nm].v = nil;
+ m[nm].nv = 0;
+ nm++;
+ }
+ /* at most one label delete plus three messages for the tree */
+ assert(nm <= nelem(m));
+ dlsync();
+ btupsert(&fs->snap, m, nm);
+ if(deltree){
+ reclaimblocks(t->gen, succ, t->pred);
+ /* mirror the relinking in the trees of the current mounts */
+ for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
+ if(mnt->root->gen == t->succ)
+ mnt->root->pred = t->pred;
+ if(mnt->root->gen == t->pred)
+ mnt->root->succ = t->succ;
+ }
+ }
+}
+
+/*
+ * Attaches a label to a tree, incrementing
+ * its reference count. This labelled snapshot
+ * will show up in the dump.
+ */
+void
+tagsnap(Tree *t, char *name, int flg)
+{
+ char buf[3][Kvmax];
+ Msg m[3];
+ Tree *n;
+ int i;
+
+ /* these labels may not be created */
+ if(strcmp(name, "dump") == 0
+ || strcmp(name, "empty") == 0
+ || strcmp(name, "adm") == 0)
+ error(Ename);
+
+ i = 0;
+ n = nil;
+ if(flg & Lmut){
+ /*
+ * Mutable label: build a new tree record that shares t's
+ * root and has t as its predecessor and base, then label
+ * that instead of t.
+ */
+ n = emalloc(sizeof(Tree), 1);
+ if(waserror()){
+ free(n);
+ nexterror();
+ }
+ n->memref = 1;
+ n->dirty = 0;
+ n->nlbl = 1;
+ n->nref = 0;
+ n->ht = t->ht;
+ n->bp = t->bp;
+ n->succ = -1;
+ n->pred = t->gen;
+ n->base = t->gen;
+ n->gen = aincv(&fs->nextgen, 1);
+ n->memgen = aincv(&fs->nextgen, 1);
+
+ /* t gains a reference from the new tree */
+ t->nref++;
+ m[i].op = Orelink;
+ retag2kv(t->gen, t->succ, 0, 1, &m[i], buf[i], sizeof(buf[i]));
+ i++;
+ m[i].op = Oinsert;
+ lbl2kv(name, n->gen, flg, &m[i], buf[i], sizeof(buf[i]));
+ i++;
+ m[i].op = Oinsert;
+ tree2kv(n, &m[i], buf[i], sizeof(buf[i]));
+ i++;
+ poperror();
+ }else{
+ /* immutable label: attach it directly to t */
+ t->nlbl++;
+ m[i].op = Orelink;
+ retag2kv(t->gen, t->succ, 1, 0, &m[i], buf[i], sizeof(buf[i]));
+ i++;
+
+ /*
+ * NOTE(review): t->nlbl is bumped a second time here while
+ * the Orelink message above carries a label delta of 1, and
+ * t->pred is set to t's own gen -- this looks unintended;
+ * confirm.
+ */
+ m[i].op = Oinsert;
+ t->pred = t->gen;
+ t->nlbl++;
+ lbl2kv(name, t->gen, flg, &m[i], buf[i], sizeof(buf[i]));
+ i++;
+ }
+ btupsert(&fs->snap, m, i);
+ /* n was only needed to build its record; nil in the immutable case */
+ free(n);
+}
+
+/*
+ * Updates a snapshot; keeps the generation the same if possible,
+ * otherwise moves to a new generation. A snapshot may only stay
+ * at the same generation as long as it is at the tip of a snapshot
+ * list; once it's observable by a derived snapshot it must be
+ * immutable.
+ */
+void
+updatesnap(Tree **r, Tree *o, char *lbl, int flg)
+{
+ char buf[4][Kvmax];
+ Msg m[4];
+ Tree *t;
+ int i;
+
+ /* nothing to record if the tree hasn't changed */
+ if(!o->dirty)
+ return;
+
+ traceb("updatesnap", o->bp);
+ /* update the old kvp */
+ o->nlbl--;
+ o->nref++;
+
+ /* create the new one */
+
+ t = emalloc(sizeof(Tree), 1);
+ if(waserror()){
+ free(t);
+ nexterror();
+ }
+ t->memref = 1;
+ t->dirty = 0;
+
+ t->nlbl = 1;
+ t->nref = 0;
+ t->ht = o->ht;
+ t->bp = o->bp;
+ t->succ = -1;
+ t->base = o->base;
+ /* the new record takes over the generation o has been using in memory */
+ t->gen = o->memgen;
+ t->memgen = aincv(&fs->nextgen, 1);
+
+ i = 0;
+ m[i].op = Orelink;
+ if(o->nlbl == 0 && o->nref == 1){
+ /* o is about to be deleted below: link past it to its predecessor */
+ t->pred = o->pred;
+ retag2kv(t->pred, t->gen, 0, 0, &m[i], buf[i], sizeof(buf[i]));
+ }else{
+ /* o survives: it loses the label and gains t's reference */
+ t->pred = o->gen;
+ retag2kv(t->pred, t->gen, -1, 1, &m[i], buf[i], sizeof(buf[i]));
+ }
+ i++;
+
+ m[i].op = Oinsert;
+ tree2kv(t, &m[i], buf[i], sizeof(buf[i]));
+ i++;
+ m[i].op = Oinsert;
+ lbl2kv(lbl, t->gen, flg, &m[i], buf[i], sizeof(buf[i]));
+ i++;
+ btupsert(&fs->snap, m, i);
+
+ /* only update the dirty status after we sync */
+ o->dirty = 0;
+
+ /* this was the last ref to the snap */
+ if(o->nlbl == 0 && o->nref == 1)
+ delsnap(o, t->gen, nil);
+ closesnap(o);
+ /* publish the new tree through r */
+ asetp(r, t);
+ poperror();
+}
+
+/*
+ * Opens the snapshot that label names.
+ *
+ * The label record is looked up first to get the snapshot's
+ * generation and flags, then the snapshot record for that generation
+ * is unpacked into a newly allocated Tree with one in-memory
+ * reference and a fresh memgen.  If flg is not nil, the label's
+ * flags are stored through it.
+ *
+ * Returns nil if the label cannot be packed or does not exist.
+ * Raises an error if memory runs out or if the label points at a
+ * snapshot record that is missing.
+ */
+Tree*
+opensnap(char *label, int *flg)
+{
+	char *p, buf[Kvmax];
+	Tree *t;
+	vlong gen;
+	Kvp kv;
+	Key k;
+
+	/* Klabel{"name"} => Ksnap{id} */
+	if((p = packlbl(buf, sizeof(buf), label)) == nil)
+		return nil;
+	k.k = buf;
+	k.nk = p - buf;
+	if(!btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)))
+		return nil;
+	/* label value: one byte, then the 8-byte gen and 4-byte flags */
+	assert(kv.nv == 1+8+4);
+	gen = UNPACK64(kv.v + 1);
+	if(flg != nil)
+		*flg = UNPACK32(kv.v + 1+8);
+
+	/* emalloc raises on failure rather than handing back nil */
+	t = emalloc(sizeof(Tree), 1);
+	if(waserror()){
+		free(t);
+		nexterror();
+	}
+	p = packsnap(buf, sizeof(buf), gen);
+	k.k = buf;
+	k.nk = p - buf;
+	if(!btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)))
+		broke(Efs);
+	unpacktree(t, kv.v, kv.nv);
+	t->memref = 1;
+	t->memgen = aincv(&fs->nextgen, 1);
+	poperror();
+	return t;
+}
+
+/*
+ * Drops one in-memory reference to t.  When the count reaches zero
+ * the tree is handed to limbo as a DFtree.  A nil t is ignored.
+ */
+void
+closesnap(Tree *t)
+{
+	if(t == nil)
+		return;
+	if(adec(&t->memref) == 0)
+		limbo(DFtree, t);
+}
+
+/*
+ * Flushes the snapshot deadlist and then every deadlist on the
+ * cache's linked list, starting from its head.
+ */
+void
+dlsync(void)
+{
+	Dlist *dl, *n;
+
+	tracem("dlsync");
+	dlflush(&fs->snapdl);
+	dl = fs->dlhead;
+	while(dl != nil){
+		/* grab the link before flushing this entry */
+		n = dl->cnext;
+		dlflush(dl);
+		dl = n;
+	}
+}
+
+/*
+ * Marks a block as killed by the tree
+ * t, which means that it will be free
+ * for use after t is reclaimed.
+ *
+ * t must be an active snapshot with
+ * no successors.
+ */
+void
+killblk(Tree *t, Bptr bp)
+{
+ Dlist *dl;
+ Blk *b;
+ char *p;
+
+ /*
+ * When we have a forked snap, blocks allocated before the fork
+ * are the responsibility of the other chain; in this chain, we
+ * leak it and let the last reference in the other chain clean up
+ */
+ if(t == &fs->snap)
+ dl = &fs->snapdl;
+ else if(bp.gen > t->base)
+ dl = getdl(t->memgen, bp.gen);
+ else
+ return;
+ if(waserror()){
+ putdl(dl);
+ nexterror();
+ }
+ /* start a new deadlist block when there is none or it is nearly full */
+ if(dl->ins == nil || Logspc - dl->ins->logsz < Logslop){
+ b = newblk(&fs->snap, Tdlist);
+ if(dl->ins != nil){
+ enqueue(dl->ins);
+ dropblk(dl->ins);
+ }
+ /* the first block of a deadlist is also its tail */
+ if(dl->tl.addr == -1)
+ dl->tl = b->bp;
+ /* the new block links to the old head and becomes the head */
+ b->logp = dl->hd;
+ dl->hd = b->bp;
+ dl->ins = b;
+ cacheins(b);
+ }
+ /* append the 8-byte address of the killed block */
+ p = dl->ins->data + dl->ins->logsz;
+ dl->ins->logsz += 8;
+ setflag(dl->ins, Bdirty, 0);
+ PACK64(p, bp.addr);
+ poperror();
+ putdl(dl);
+}
--- /dev/null
+++ b/tree.c
@@ -1,0 +1,1543 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+typedef struct Path Path;
+
+/*
+ * State for one block along the path that an update touches.  The
+ * first group of fields is filled in on the way down, the second on
+ * the way back up; updateleaf, updatepiv and copyup read and write
+ * them.
+ */
+struct Path {
+ /* Flowing down for flush */
+ Msg *ins; /* inserted values, bounded by lo..hi */
+ Blk *b; /* to shadow */
+ int idx; /* insert at */
+ int lo; /* key range */
+ int hi; /* key range */
+ int sz; /* size of range */
+
+ /* Flowing up from flush */
+ int op; /* change done along path */
+ Blk *m; /* node merged against, for post-update free */
+ Blk *nl; /* new left */
+ Blk *nr; /* new right, if we split or rotated */
+ int midx; /* modification index */
+ int npull; /* number of messages successfully pulled */
+ int pullsz; /* size of pulled messages */
+};
+
+/* Calls freeblk(t, b) unless b is nil.  Note that b is evaluated twice. */
+#define efreeblk(t, b) do { \
+ if(b != nil) \
+ freeblk(t, b); \
+ } while(0)
+
+/*
+ * Sorts the nm messages in m by key with an insertion sort.  An
+ * element only moves past neighbours whose keys compare strictly
+ * greater, so messages with equal keys keep their relative order.
+ */
+static void
+stablesort(Msg *m, int nm)
+{
+	int cur, pos;
+	Msg tmp;
+
+	for(cur = 1; cur < nm; cur++){
+		pos = cur;
+		while(pos > 0 && keycmp(&m[pos-1], &m[pos]) > 0){
+			tmp = m[pos];
+			m[pos] = m[pos-1];
+			m[pos-1] = tmp;
+			pos--;
+		}
+	}
+}
+
+/*
+ * Copies src's key bytes into buf and makes dst refer to the copy.
+ * buf must hold at least src->nk bytes; nbuf is its size.
+ */
+void
+cpkey(Key *dst, Key *src, char *buf, int nbuf)
+{
+	int n;
+
+	n = src->nk;
+	assert(n <= nbuf);
+	memmove(buf, src->k, n);
+	dst->nk = n;
+	dst->k = buf;
+}
+
+/*
+ * Copies src's key and then its value, back to back, into buf and
+ * makes dst refer to the copies.  buf must hold at least
+ * src->nk + src->nv bytes; nbuf is its size.  dst and src may be the
+ * same object.
+ */
+void
+cpkvp(Kvp *dst, Kvp *src, char *buf, int nbuf)
+{
+	char *val;
+
+	assert(src->nk+src->nv <= nbuf);
+	val = buf + src->nk;
+	/* key first, then value: the same order the data lands in buf */
+	memmove(buf, src->k, src->nk);
+	memmove(val, src->v, src->nv);
+	dst->nk = src->nk;
+	dst->nv = src->nv;
+	dst->k = buf;
+	dst->v = val;
+}
+
+/*
+ * Orders two keys bytewise over their common length; when that part
+ * is equal the shorter key sorts first.  Returns exactly -1, 0 or 1.
+ */
+int
+keycmp(Key *a, Key *b)
+{
+	int c, n;
+
+	n = a->nk;
+	if(b->nk < n)
+		n = b->nk;
+	c = memcmp(a->k, b->k, n);
+	if(c < 0)
+		return -1;
+	if(c > 0)
+		return 1;
+	if(a->nk == b->nk)
+		return 0;
+	return (a->nk < b->nk) ? -1 : 1;
+}
+
+/*
+ * Bytes a message occupies in a buffer: a 2-byte offset slot, the op
+ * byte, and the key and value each preceded by a 2-byte length.
+ */
+static int
+msgsz(Msg *m)
+{
+	int hdr;
+
+	/* disp + op + klen + vlen */
+	hdr = 2 + 1 + 2 + 2;
+	return hdr + m->nk + m->nv;
+}
+
+/*
+ * Bytes a key/value pair occupies in a block: a 2-byte offset slot,
+ * and the key and value each preceded by a 2-byte length.
+ */
+static int
+valsz(Kvp *kv)
+{
+	int hdr;
+
+	hdr = 2 + 2 + 2;
+	return hdr + kv->nk + kv->nv;
+}
+
+/*
+ * Points kv at the i'th key/value pair of b.  Nothing is copied: the
+ * key and value pointers refer to b's data.  i must be in
+ * [0, b->nval).
+ */
+void
+getval(Blk *b, int i, Kvp *kv)
+{
+	char *ent;
+	int off;
+
+	assert(i >= 0 && i < b->nval);
+	/* slot i of the offset table says where the entry starts */
+	off = UNPACK16(b->data + 2*i);
+	ent = b->data + off;
+	kv->nk = UNPACK16(ent);
+	kv->k = ent + 2;
+	ent = kv->k + kv->nk;
+	kv->nv = UNPACK16(ent);
+	kv->v = ent + 2;
+}
+
+/*
+ * Decodes the block pointer stored as kv's value and stores the
+ * 16-bit fill count that follows it through fill.
+ *
+ * NOTE(review): the assert also admits values of exactly Ptrsz
+ * bytes, in which case the fill is read from the two bytes past the
+ * end of the value -- confirm callers only use fill for Ptrsz+2
+ * values.
+ */
+Bptr
+getptr(Kvp *kv, int *fill)
+{
+ assert(kv->nv == Ptrsz || kv->nv == Ptrsz+2);
+ *fill = UNPACK16(kv->v + Ptrsz);
+ return unpackbp(kv->v, kv->nv);
+}
+
+/* Exported for reaming */
+/*
+ * Appends kv as the next key/value pair of b.  The 2-byte offset
+ * slots grow up from the start of the data area while the entries
+ * themselves are packed down from its end.  Asserts that the two do
+ * not meet.
+ */
+void
+setval(Blk *b, Kvp *kv)
+{
+ int off, spc;
+ char *p;
+
+ spc = (b->type == Tleaf) ? Leafspc : Pivspc;
+ /* account for the entry first; its offset follows from the new total */
+ b->valsz += 2 + kv->nk + 2 + kv->nv;
+ off = spc - b->valsz;
+
+ assert(2*(b->nval+1) + b->valsz <= spc);
+ assert(2*(b->nval+1) <= off);
+
+ /* record the entry's offset in the next free slot */
+ p = b->data + 2*b->nval;
+ PACK16(p, off);
+
+ /* entry layout: key length, key, value length, value */
+ p = b->data + off;
+ PACK16(p, kv->nk); p += 2;
+ memmove(p, kv->k, kv->nk); p += kv->nk;
+ PACK16(p, kv->nv); p += 2;
+ memmove(p, kv->v, kv->nv);
+
+ b->nval++;
+}
+
+/*
+ * Appends a pointer entry to b: key k maps to the packed block
+ * pointer bp followed by the 16-bit fill count.
+ */
+static void
+setptr(Blk *b, Key *k, Bptr bp, int fill)
+{
+	char val[Ptrsz+2], *tail;
+	Kvp ent;
+
+	tail = packbp(val, sizeof(val), &bp);
+	PACK16(tail, fill);
+	ent.k = k->k;
+	ent.nk = k->nk;
+	ent.v = val;
+	ent.nv = sizeof(val);
+	setval(b, &ent);
+}
+
+/*
+ * Appends m as the next buffered message of the pivot block b.  The
+ * buffer area starts Pivspc bytes into the data; like the value
+ * area, its 2-byte offset slots grow up from the start and the
+ * messages are packed down from the end.
+ */
+static void
+setmsg(Blk *b, Msg *m)
+{
+ char *p;
+ int o;
+
+ assert(b->type == Tpivot);
+ /* msgsz counts the offset slot, which bufsz does not */
+ b->bufsz += msgsz(m)-2;
+
+ /* record the message's offset in the next free slot */
+ p = b->data + Pivspc + 2*b->nbuf;
+ o = Bufspc - b->bufsz;
+ PACK16(p, o);
+
+ /* message layout: op, key length, key, value length, value */
+ p = b->data + Pivspc + o;
+ *p = m->op; p += 1;
+ PACK16(p, m->nk); p += 2;
+ memmove(p, m->k, m->nk); p += m->nk;
+ PACK16(p, m->nv); p += 2;
+ memmove(p, m->v, m->nv);
+
+ b->nbuf++;
+}
+
+/*
+ * Points m at the i'th buffered message of the pivot block b.
+ * Nothing is copied: the key and value pointers refer to b's data.
+ * i must be in [0, b->nbuf).
+ */
+void
+getmsg(Blk *b, int i, Msg *m)
+{
+	char *base, *p;
+	int off;
+
+	assert(b->type == Tpivot);
+	assert(i >= 0 && i < b->nbuf);
+	/* the buffer area sits Pivspc bytes into the block's data */
+	base = b->data + Pivspc;
+	off = UNPACK16(base + 2*i);
+	p = base + off;
+	m->op = *p;
+	p += 1;
+	m->nk = UNPACK16(p);
+	m->k = p + 2;
+	p = m->k + m->nk;
+	m->nv = UNPACK16(p);
+	m->v = p + 2;
+}
+
+/*
+ * Binary-searches the buffered messages of b for key k.
+ *
+ * If any message has key k, returns the index of the first such
+ * message and sets *same to 1.  Otherwise sets *same to 0 and
+ * returns the index of the last message whose key is less than k, or
+ * -1 when there is none.  When m is not nil and the result is not
+ * -1, m is pointed at the message at the returned index.
+ */
+static int
+bufsearch(Blk *b, Key *k, Msg *m, int *same)
+{
+	int lo, hi, mid, hit, r;
+	Msg cmp;
+
+	hit = -1;
+	lo = 0;
+	hi = b->nbuf-1;
+	while(lo <= hi){
+		mid = (hi + lo) / 2;
+		getmsg(b, mid, &cmp);
+		r = keycmp(k, &cmp);
+		if(r > 0)
+			lo = mid+1;
+		else{
+			/*
+			 * there can be duplicate messages; remember
+			 * the match but keep looking to its left so
+			 * we end up on the first of them.
+			 */
+			if(r == 0)
+				hit = mid;
+			hi = mid-1;
+		}
+	}
+	if(hit != -1)
+		*same = 1;
+	else{
+		*same = 0;
+		hit = lo-1;
+	}
+	if(m != nil && hit >= 0)
+		getmsg(b, hit, m);
+	return hit;
+}
+
+/*
+ * Binary-searches the key/value pairs of b for key k.
+ *
+ * If any pair has key k, returns the index of the first such pair
+ * and sets *same to 1.  Otherwise sets *same to 0 and returns the
+ * index of the last pair whose key is less than k, or -1 when there
+ * is none.  When the result is not -1, rp is pointed at the pair at
+ * the returned index.
+ */
+static int
+blksearch(Blk *b, Key *k, Kvp *rp, int *same)
+{
+	int lo, hi, mid, hit, r;
+	Kvp cmp;
+
+	hit = -1;
+	lo = 0;
+	hi = b->nval-1;
+	while(lo <= hi){
+		mid = (hi + lo) / 2;
+		getval(b, mid, &cmp);
+		r = keycmp(k, &cmp);
+		if(r > 0)
+			lo = mid+1;
+		else{
+			/* on a match keep narrowing leftwards */
+			if(r == 0)
+				hit = mid;
+			hi = mid-1;
+		}
+	}
+	if(hit != -1)
+		*same = 1;
+	else{
+		*same = 0;
+		hit = lo-1;
+	}
+	if(hit >= 0)
+		getval(b, hit, rp);
+	return hit;
+}
+
+/*
+ * Bytes of the pivot block b's buffer area in use: one 2-byte offset
+ * slot per message plus the message bytes.
+ */
+static int
+buffill(Blk *b)
+{
+	int slots;
+
+	assert(b->type == Tpivot);
+	slots = 2*b->nbuf;
+	return slots + b->bufsz;
+}
+
+/*
+ * Reports whether adding nmsg more messages totalling needed bytes
+ * would overflow the pivot block b's buffer area.
+ */
+static int
+filledbuf(Blk *b, int nmsg, int needed)
+{
+	int slots;
+
+	assert(b->type == Tpivot);
+	slots = 2*(b->nbuf+nmsg);
+	return slots + b->bufsz + needed > Bufspc;
+}
+
+/*
+ * Reports whether adding one more entry of needed bytes would
+ * overflow the leaf block b.
+ */
+static int
+filledleaf(Blk *b, int needed)
+{
+	int slots;
+
+	assert(b->type == Tleaf);
+	slots = 2*(b->nval+1);
+	return slots + b->valsz + needed > Leafspc;
+}
+
+/*
+ * Reports whether the pivot block b lacks room for one more offset
+ * slot plus reserve entries of the maximum size Kpmax.
+ */
+static int
+filledpiv(Blk *b, int reserve)
+{
+	int slots;
+
+	/*
+	 * A pivot always has to keep room for one more
+	 * message, so that a split anywhere below has
+	 * somewhere to go as it propagates up the path.
+	 */
+	assert(b->type == Tpivot);
+	slots = 2*(b->nval+1);
+	return slots + b->valsz + reserve*Kpmax > Pivspc;
+}
+
+/*
+ * Appends to n the pointer entry for child block c.  The entry is
+ * keyed by the smaller of c's first value key and, if c has any
+ * buffered messages, its first message key.  c must have at least
+ * one value.  When nbytes is not nil the entry's size is added to
+ * it.
+ */
+static void
+copychild(Blk *n, Blk *c, int *nbytes)
+{
+	Kvp lo;
+	Msg first;
+
+	getval(c, 0, &lo);
+	if(c->nbuf > 0){
+		getmsg(c, 0, &first);
+		if(keycmp(&lo, &first) > 0)
+			lo.Key = first.Key;
+	}
+	setptr(n, &lo, c->bp, blkfill(c));
+	if(nbytes != nil)
+		*nbytes += valsz(&lo);
+}
+
+/*
+ * Appends to n the pointer entries for the new block(s) recorded in
+ * pp: the new left block, then the new right block if there is one.
+ */
+static void
+copyup(Blk *n, Path *pp, int *nbytes)
+{
+	/*
+	 * A large run of delete messages may have emptied
+	 * the node below entirely; a block with no values
+	 * left gets no entry.
+	 */
+	if(pp->nl->nval > 0)
+		copychild(n, pp->nl, nbytes);
+	if(pp->nr != nil && pp->nr->nval > 0)
+		copychild(n, pp->nr, nbytes);
+}
+
+/*
+ * Applies the wstat message m to the directory entry stored in kv,
+ * in place.  The message value is an op byte of Ow* flags followed
+ * by one field per set flag, in the order tested below.  The qid
+ * version is always incremented.  Dies if the message length does
+ * not match its flags or if the entry cannot be repacked.
+ */
+static void
+statupdate(Kvp *kv, Msg *m)
+{
+ int op;
+ char *p;
+ Xdir d;
+
+ p = m->v;
+ op = *p++;
+ kv2dir(kv, &d);
+ /* bump version */
+ d.qid.vers++;
+ if(op & Owsize){
+ d.length = UNPACK64(p);
+ p += 8;
+ }
+ if(op & Owmode){
+ d.mode = UNPACK32(p);
+ /* the qid type is kept in sync with the top byte of the mode */
+ d.qid.type = d.mode>>24;
+ p += 4;
+ }
+ if(op & Owmtime){
+ d.mtime = UNPACK64(p);
+ p += 8;
+ }
+ if(op & Owatime){
+ d.atime = UNPACK64(p);
+ p += 8;
+ }
+ if(op & Owuid){
+ d.uid = UNPACK32(p);
+ p += 4;
+ }
+ if(op & Owgid){
+ d.gid = UNPACK32(p);
+ p += 4;
+ }
+ if(op & Owmuid){
+ d.muid = UNPACK32(p);
+ p += 4;
+ }
+ /* every byte of the message must have been consumed */
+ if(p != m->v + m->nv)
+ fatal("malformed stat: kv=%P, m=%M\n", kv, m);
+ if(packdval(kv->v, kv->nv, &d) == nil)
+ fatal("repacking dir failed\n");
+}
+
+/*
+ * Applies message m to the key/value pair kv.  Returns 1 if kv holds
+ * a value afterwards and 0 if the pair is gone.  buf (nbuf bytes) is
+ * the storage an inserted pair is copied into; the other ops that
+ * keep a value update it in place.
+ */
+static int
+apply(Kvp *kv, Msg *m, char *buf, int nbuf)
+{
+ vlong *pv;
+ char *p;
+ Tree t;
+
+ switch(m->op){
+ case Odelete:
+ assert(keycmp(kv, m) == 0);
+ return 0;
+ case Oclearb:
+ case Oclobber:
+ return 0;
+ case Oinsert:
+ cpkvp(kv, m, buf, nbuf);
+ return 1;
+ case Owstat:
+ assert(keycmp(kv, m) == 0);
+ statupdate(kv, m);
+ return 1;
+ case Orelink:
+ case Oreprev:
+ /*
+ * message value: new succ (Orelink) or pred (Oreprev),
+ * then one-byte deltas for nlbl and nref
+ */
+ unpacktree(&t, kv->v, kv->nv);
+ p = m->v;
+ pv = (m->op == Orelink) ? &t.succ : &t.pred;
+ *pv = UNPACK64(p); p += 8;
+ t.nlbl += *p; p++;
+ t.nref += *p; p++;
+ assert(t.nlbl >= 0 && t.nref >= 0);
+ assert(p == m->v + m->nv);
+ packtree(kv->v, kv->nv, &t);
+ return 1;
+ default:
+ fatal("invalid op %d\n", m->op);
+ }
+ return 0;
+}
+
+/*
+ * Fetches message i of path element p into m, from p->ins when that
+ * is set and from block p->b otherwise, provided it fits in spc
+ * bytes.
+ *
+ * Returns -1 without touching m when i is outside [0, p->hi) or
+ * *full is already set.  If the message does not fit, sets *full and
+ * returns -1.  Otherwise returns 0 when v is nil, or the result of
+ * comparing v's key with the message's key.
+ */
+static int
+pullmsg(Path *p, int i, Kvp *v, Msg *m, int *full, int spc)
+{
+ if(i < 0 || i >= p->hi || *full)
+ return -1;
+
+ if(p->ins != nil)
+ *m = p->ins[i];
+ else
+ getmsg(p->b, i, m);
+ if(msgsz(m) <= spc)
+ return (v == nil) ? 0 : keycmp(v, m);
+ *full = 1;
+ return -1;
+}
+
+/*
+ * Creates a new block with the contents of the old
+ * block. When copying the contents, it repacks them
+ * to minimize the space uses, and applies the changes
+ * pending from the downpath blocks.
+ *
+ * The old leaf is p->b; the pending messages are those of up in the
+ * range up->lo..up->hi.  The new leaf is left in p->nl, and p->npull
+ * is set to the number of messages consumed.
+ */
+static void
+updateleaf(Tree *t, Path *up, Path *p)
+{
+ char buf[Msgmax];
+ int i, j, c, ok, full, spc;
+ Blk *b, *n;
+ Bptr bp;
+ Msg m;
+ Kvp v;
+
+ /* i walks the old leaf's values, j walks the pending messages */
+ i = 0;
+ j = up->lo;
+ b = p->b;
+ /*
+ * spc is the amount of room we have
+ * to copy data down from the parent; it's
+ * necessarily a bit conservative, because
+ * deletion messages don't take space -- but
+ * we don't know how what the types of all
+ * messages are.
+ */
+ full = 0;
+ spc = Leafspc - blkfill(b);
+ n = newblk(t, b->type);
+ assert(i >= 0 && j >= 0);
+ while(i < b->nval || j < up->hi){
+ /*
+ * c orders the next value against the next message: 1 when
+ * the values have run out, otherwise -1 unless there is a
+ * message that fits, in which case the keys decide.
+ */
+ if(i >= b->nval)
+ c = 1;
+ else{
+ c = -1;
+ getval(p->b, i, &v);
+ if(j < up->hi){
+ if(up->ins != nil)
+ m = up->ins[j];
+ else
+ getmsg(up->b, j, &m);
+ if(msgsz(&m) <= spc)
+ c = keycmp(&v, &m);
+ else
+ full = 1;
+ }
+ }
+ switch(c){
+ /* Value before message: just copy value */
+ case -1:
+ i++;
+ setval(n, &v);
+ break;
+ /* Value merges with message sequence */
+ case 0:
+ i++;
+ j++;
+ /* copy the value out of the old block before modifying it */
+ cpkvp(&v, &v, buf, sizeof(buf));
+ /* a data pointer about to be replaced or removed: free its block */
+ if(v.nk > 0 && v.k[0] == Kdat)
+ if(m.op == Oclearb
+ || m.op == Oinsert
+ || m.op == Odelete){
+ bp = unpackbp(v.v, v.nv);
+ freebp(t, bp);
+ }
+ ok = apply(&v, &m, buf, sizeof(buf));
+ goto Copyloop;
+ /* Message before value: Insert message sequence */
+ case 1:
+ j++;
+ cpkvp(&v, &m, buf, sizeof(buf));
+ ok = 0;
+ if(m.op != Oclearb && m.op != Oclobber){
+ spc -= valsz(&m);
+ p->pullsz += msgsz(&m);
+ ok = 1;
+ }
+ goto Copyloop;
+ Copyloop:
+ /* fold in every following message that has the same key */
+ while(j < up->hi){
+ if(pullmsg(up, j, &v, &m, &full, spc) != 0)
+ break;
+ if(ok && v.nk > 0 && v.k[0] == Kdat)
+ if(m.op == Oclearb
+ || m.op == Oinsert
+ || m.op == Odelete){
+ bp = unpackbp(v.v, v.nv);
+ freebp(t, bp);
+ }
+ p->pullsz += msgsz(&m);
+ ok = apply(&v, &m, buf, sizeof(buf));
+ j++;
+ }
+ /* ok says whether a value survived the message sequence */
+ if(ok)
+ setval(n, &v);
+ break;
+ }
+ }
+ p->npull = (j - up->lo);
+ p->nl = n;
+}
+
+/*
+ * Creates a new block with the contents of the old
+ * block. When copying the contents, it repacks them
+ * to minimize the space used, and applies the changes
+ * pending from the downpath blocks.
+ *
+ * Values are copied from p->b, except that when pp is
+ * non-nil the entry at p->midx is produced by copyup()
+ * from pp instead. The buffered messages of p->b are
+ * then copied while pulling messages from up, starting
+ * at up->lo. The new block is left in p->nl and p->npull
+ * records how many messages were taken from up.
+ */
+static void
+updatepiv(Tree *t, Path *up, Path *p, Path *pp)
+{
+ char buf[Msgmax];
+ int i, j, sz, full, spc;
+ Blk *b, *n;
+ Msg m, u;
+
+ b = p->b;
+ n = newblk(t, b->type);
+ for(i = 0; i < b->nval; i++){
+ if(pp != nil && i == p->midx){
+ copyup(n, pp, nil);
+ /* rot and merge also skip the entry after midx */
+ if(pp->op == POrot || pp->op == POmerge)
+ i++;
+ }else{
+ getval(b, i, &m);
+ setval(n, &m);
+ }
+ }
+ i = 0;
+ j = up->lo;
+ sz = 0;
+ full = 0;
+ spc = Bufspc - buffill(b);
+ /* messages already pulled by the child are skipped below */
+ if(pp != nil)
+ spc += pp->pullsz;
+ while(i < b->nbuf){
+ /*
+ * NOTE(review): pp is dereferenced without a nil check;
+ * this appears to rely on p->lo never matching when pp
+ * is nil (btupsert sets lo to -1 on the bottom path
+ * element) -- confirm.
+ */
+ if(i == p->lo)
+ i += pp->npull;
+ if(i == b->nbuf)
+ break;
+ getmsg(b, i, &m);
+ switch(pullmsg(up, j, &m, &u, &full, spc - sz)){
+ case -1:
+ case 0:
+ setmsg(n, &m);
+ i++;
+ break;
+ case 1:
+ cpkvp(&m, &u, buf, sizeof(buf));
+ while(pullmsg(up, j, &m, &u, &full, spc) == 0){
+ setmsg(n, &u);
+ sz = msgsz(&u);
+ p->pullsz += sz;
+ spc -= sz;
+ j++;
+ }
+ }
+ }
+ /* old buffer exhausted: append what still fits from up */
+ while(j < up->hi){
+ pullmsg(up, j, nil, &u, &full, spc);
+ if(full)
+ break;
+ setmsg(n, &u);
+ sz = msgsz(&u);
+ p->pullsz += sz;
+ spc -= sz;
+ j++;
+ }
+ p->npull = (j - up->lo);
+ p->nl = n;
+}
+
+/*
+ * Splits a leaf into two new blocks while merging in
+ * the messages pending in up. The halves are left in
+ * p->nl and p->nr, p->op is set to POsplit, and mid is
+ * set to the value of the old leaf at which copying
+ * switched to the right half. Split must never
+ * grow the total height of the tree by more than 1.
+ */
+static void
+splitleaf(Tree *t, Path *up, Path *p, Kvp *mid)
+{
+ char buf[Msgmax];
+ Blk *b, *d, *l, *r;
+ int full, copied, spc, ok, halfsz;
+ int i, j, c;
+ Bptr bp;
+ Msg m;
+ Kvp v;
+
+ /*
+ * If the block one entry up the
+ * p is nil, we're at the root,
+ * so we want to make a new block.
+ */
+ b = p->b;
+ l = nil;
+ r = nil;
+ /* on error, release whichever halves were allocated */
+ if(waserror()){
+ efreeblk(t, l);
+ efreeblk(t, r);
+ nexterror();
+ }
+ l = newblk(t, b->type);
+ r = newblk(t, b->type);
+
+ d = l;
+ i = 0;
+ j = up->lo;
+ full = 0;
+ copied = 0;
+ /* target size for the left half, capped at half a leaf */
+ halfsz = (2*b->nval + b->valsz + up->sz) / 2;
+ if(halfsz > Leafspc/2)
+ halfsz = Leafspc/2;
+ spc = Leafspc - (halfsz + Msgmax);
+ assert(b->nval >= 4);
+ while(i < b->nval){
+ /*
+ * We're trying to balance size,
+ * but we need at least 2 nodes
+ * in each half of the split if
+ * we want a valid tree.
+ */
+ if(d == l)
+ if((i == b->nval-2) || (i >= 2 && copied >= halfsz)){
+ d = r;
+ spc = Leafspc - (halfsz + Msgmax);
+ getval(b, i, mid);
+ }
+ getval(b, i, &v);
+ c = pullmsg(up, j, &v, &m, &full, spc);
+ switch(c){
+ /* value sorts before the next message: copy it as is */
+ case -1:
+ i++;
+ setval(d, &v);
+ copied += valsz(&v);
+ break;
+ /* message applies to this value */
+ case 0:
+ i++;
+ j++;
+ cpkvp(&v, &v, buf, sizeof(buf));
+ copied += valsz(&v);
+ if(v.nk > 0 && v.k[0] == Kdat)
+ if(m.op == Oclearb
+ || m.op == Oinsert
+ || m.op == Odelete){
+ bp = unpackbp(v.v, v.nv);
+ freebp(t, bp);
+ }
+ ok = apply(&v, &m, buf, sizeof(buf));
+ goto Copyloop;
+ /* message sorts before this value: it starts a new entry */
+ case 1:
+ j++;
+ cpkvp(&v, &m, buf, sizeof(buf));
+ copied += valsz(&v);
+ ok = 0;
+ if(m.op != Oclearb && m.op != Oclobber){
+ spc -= valsz(&m);
+ p->pullsz += msgsz(&m);
+ ok = 1;
+ }
+ goto Copyloop;
+ Copyloop:
+ /* fold every following message for the same key into v */
+ while(j < up->hi){
+ if(pullmsg(up, j, &v, &m, &full, spc) != 0)
+ break;
+ if(ok && v.nk > 0 && v.k[0] == Kdat)
+ if(m.op == Oclearb
+ || m.op == Oinsert
+ || m.op == Odelete){
+ bp = unpackbp(v.v, v.nv);
+ freebp(t, bp);
+ }
+ p->pullsz += msgsz(&m);
+ ok = apply(&v, &m, buf, sizeof(buf));
+ j++;
+ }
+ if(ok)
+ setval(d, &v);
+ break;
+ }
+ }
+ p->npull = (j - up->lo);
+ p->op = POsplit;
+ p->nl = l;
+ p->nr = r;
+ poperror();
+}
+
+/*
+ * Splits a pivot node into two new blocks, left in
+ * p->nl and p->nr with p->op set to POsplit; mid is
+ * set to the value of the old node at which copying
+ * switched to the right half. The entry at p->idx is
+ * replaced by copyup() from pp. Split must never
+ * grow the total height of the tree by more
+ * than one.
+ */
+static void
+splitpiv(Tree *t, Path *, Path *p, Path *pp, Kvp *mid)
+{
+ int i, copied, halfsz;
+ Blk *b, *d, *l, *r;
+ Kvp tk;
+ Msg m;
+
+ /*
+ * If the block one entry up the
+ * p is nil, we're at the root,
+ * so we want to make a new block.
+ */
+ b = p->b;
+ l = nil;
+ r = nil;
+ /* on error, release whichever halves were allocated */
+ if(waserror()){
+ efreeblk(t, l);
+ efreeblk(t, r);
+ nexterror();
+ }
+ l = newblk(t, b->type);
+ r = newblk(t, b->type);
+ d = l;
+ copied = 0;
+ halfsz = (2*b->nval + b->valsz)/2;
+ assert(b->nval >= 4);
+ for(i = 0; i < b->nval; i++){
+ /*
+ * We're trying to balance size,
+ * but we need at least 2 nodes
+ * in each half of the split if
+ * we want a valid tree.
+ */
+ if(d == l)
+ if((i == b->nval-2) || (i >= 2 && copied >= halfsz)){
+ d = r;
+ getval(b, i, mid);
+ }
+ /* the slot of the flushed child is rebuilt from pp */
+ if(i == p->idx){
+ copyup(d, pp, &copied);
+ continue;
+ }
+ getval(b, i, &tk);
+ setval(d, &tk);
+ copied += valsz(&tk);
+ }
+ /* distribute the buffered messages around mid */
+ d = l;
+ for(i = 0; i < b->nbuf; i++){
+ /* skip the messages the child already pulled */
+ if(i == p->lo)
+ i += pp->npull;
+ if(i == b->nbuf)
+ break;
+ getmsg(b, i, &m);
+ if(d == l && keycmp(&m, mid) >= 0)
+ d = r;
+ setmsg(d, &m);
+ }
+ p->op = POsplit;
+ p->nl = l;
+ p->nr = r;
+ poperror();
+}
+
+/*
+ * Builds one new block holding the values of a followed
+ * by the values of b and, for pivots, their buffered
+ * messages in the same order. The block is enqueued and
+ * recorded in pp as a POmerge result; p->midx is set to idx.
+ */
+static void
+merge(Tree *t, Path *p, Path *pp, int idx, Blk *a, Blk *b)
+{
+	Blk *src[2], *dst;
+	Msg m;
+	int s, k;
+
+	src[0] = a;
+	src[1] = b;
+	dst = newblk(t, a->type);
+	for(s = 0; s < 2; s++){
+		for(k = 0; k < src[s]->nval; k++){
+			getval(src[s], k, &m);
+			setval(dst, &m);
+		}
+	}
+	if(a->type == Tpivot){
+		for(s = 0; s < 2; s++){
+			for(k = 0; k < src[s]->nbuf; k++){
+				getmsg(src[s], k, &m);
+				setmsg(dst, &m);
+			}
+		}
+	}
+	enqueue(dst);
+	p->midx = idx;
+	pp->op = POmerge;
+	pp->nl = dst;
+	pp->nr = nil;
+}
+
+/*
+ * Walks the buffer of b from *idx, adding message sizes
+ * on top of what d's buffer already holds. Stops at the
+ * first message whose key is not below m and stores that
+ * position plus o in *idx, returning 0. Returns 1 as soon
+ * as the running total exceeds Bufspc. If the buffer runs
+ * out first, *idx is set to b->nbuf and 0 is returned.
+ */
+static int
+spillscan(Blk *d, Blk *b, Msg *m, int *idx, int o)
+{
+	int pos, fill;
+	Msg cur;
+
+	fill = d->bufsz + 2*d->nbuf;
+	pos = *idx;
+	while(pos < b->nbuf){
+		getmsg(b, pos, &cur);
+		if(keycmp(m, &cur) <= 0){
+			*idx = pos + o;
+			return 0;
+		}
+		fill += msgsz(&cur);
+		if(fill > Bufspc)
+			return 1;
+		pos++;
+	}
+	*idx = b->nbuf;
+	return 0;
+}
+
+/*
+ * Reports whether the messages between *idx and m,
+ * scanned first in l and then in r, would overflow
+ * the buffer of d. Leaves never spill.
+ */
+static int
+spillsbuf(Blk *d, Blk *l, Blk *r, Msg *m, int *idx)
+{
+	int spills;
+
+	if(l->type == Tleaf)
+		return 0;
+
+	spills = 0;
+	if(*idx < l->nbuf)
+		spills = spillscan(d, l, m, idx, 0);
+	/* the first scan may have advanced *idx to the end of l */
+	if(!spills && *idx >= l->nbuf)
+		spills = spillscan(d, r, m, idx, l->nbuf);
+	return spills;
+}
+
+/*
+ * Redistributes the values of a and b (and, for pivots,
+ * their buffered messages) across two new blocks. Values
+ * go to the left block until cp reaches halfpiv or
+ * spillsbuf() reports that the left buffer would overflow.
+ * Both blocks are enqueued and recorded in pp as a POrot
+ * result; p->midx is set to midx.
+ */
+static void
+rotate(Tree *t, Path *p, Path *pp, int midx, Blk *a, Blk *b, int halfpiv)
+{
+ int i, o, cp, sp, idx;
+ Blk *d, *l, *r;
+ Msg m;
+
+ l = nil;
+ r = nil;
+ /* on error, release whichever blocks were allocated */
+ if(waserror()){
+ efreeblk(t, l);
+ efreeblk(t, r);
+ nexterror();
+ }
+ l = newblk(t, a->type);
+ r = newblk(t, a->type);
+ d = l;
+ cp = 0;
+ /* sp: number of messages that go left; -1 means never switch */
+ sp = -1;
+ idx = 0;
+ for(i = 0; i < a->nval; i++){
+ getval(a, i, &m);
+ if(d == l && (cp >= halfpiv || spillsbuf(d, a, b, &m, &idx))){
+ sp = idx;
+ d = r;
+ }
+ setval(d, &m);
+ cp += valsz(&m);
+ }
+ for(i = 0; i < b->nval; i++){
+ getval(b, i, &m);
+ if(d == l && (cp >= halfpiv || spillsbuf(d, a, b, &m, &idx))){
+ sp = idx;
+ d = r;
+ }
+ setval(d, &m);
+ cp += valsz(&m);
+ }
+ if(a->type == Tpivot){
+ d = l;
+ o = 0;
+ /* o counts messages copied since the last switch */
+ for(i = 0; i < a->nbuf; i++){
+ if(o == sp){
+ d = r;
+ o = 0;
+ }
+ getmsg(a, i, &m);
+ setmsg(d, &m);
+ o++;
+ }
+ for(i = 0; i < b->nbuf; i++){
+ if(o == sp){
+ d = r;
+ o = 0;
+ }
+ getmsg(b, i, &m);
+ setmsg(d, &m);
+ o++;
+ }
+ }
+ enqueue(l);
+ enqueue(r);
+ p->midx = midx;
+ pp->op = POrot;
+ pp->nl = l;
+ pp->nr = r;
+ poperror();
+}
+
+/*
+ * Decides how to rebalance the adjacent blocks a and b:
+ * merge them when their combined values and buffers leave
+ * enough slack, otherwise rotate when their value sizes
+ * differ by more than 4*Msgmax. Does nothing otherwise.
+ */
+static void
+rotmerge(Tree *t, Path *p, Path *pp, int idx, Blk *a, Blk *b)
+{
+	int vala, valb, bufa, bufb, diff;
+
+	assert(a->type == b->type);
+
+	vala = a->valsz + 2*a->nval;
+	valb = b->valsz + 2*b->nval;
+	/* leaves carry no buffer, so they always pass the buffer test */
+	bufa = 0;
+	bufb = 0;
+	if(a->type != Tleaf){
+		bufa = a->bufsz + 2*a->nbuf;
+		bufb = b->bufsz + 2*b->nbuf;
+	}
+	diff = vala - valb;
+	if(diff < 0)
+		diff = -diff;
+	if(vala + valb < (Pivspc - 4*Msgmax) && bufa + bufb < Bufspc){
+		merge(t, p, pp, idx, a, b);
+		return;
+	}
+	if(diff > 4*Msgmax)
+		rotate(t, p, pp, idx, a, b, (vala + valb)/2);
+}
+
+/*
+ * Tries to rebalance the freshly written child pp->nl
+ * against one of its siblings in p->b: the left sibling
+ * at idx-1 first, then the right one at idx+1. A sibling
+ * is only loaded when its recorded fill plus the fill of
+ * the child stays below the space of one block; rotmerge()
+ * then decides between merging and rotating.
+ */
+static void
+trybalance(Tree *t, Path *p, Path *pp, int idx)
+{
+ Blk *l, *m, *r;
+ Kvp kl, kr;
+ int spc, fill;
+ Bptr bp;
+
+ if(p->idx == -1 || pp == nil || pp->nl == nil)
+ return;
+ /*
+ * NOTE(review): this condition is true for every value
+ * of pp->op (it cannot equal both POmod and POmerge), so
+ * the function always returns here and the code below
+ * never runs. '&&' looks like what was intended, but
+ * changing it would enable the merge/rotate paths, which
+ * are currently never exercised -- confirm before fixing.
+ */
+ if(pp->op != POmod || pp->op != POmerge)
+ return;
+
+ l = nil;
+ r = nil;
+ m = holdblk(pp->nl);
+ if(waserror()){
+ dropblk(m);
+ dropblk(l);
+ dropblk(r);
+ nexterror();
+ }
+ spc = (m->type == Tleaf) ? Leafspc : Pivspc;
+ if(idx-1 >= 0){
+ getval(p->b, idx-1, &kl);
+ bp = getptr(&kl, &fill);
+ if(fill + blkfill(m) < spc){
+ l = getblk(bp, 0);
+ rotmerge(t, p, pp, idx-1, l, m);
+ goto Done;
+ }
+ }
+ if(idx+1 < p->b->nval){
+ getval(p->b, idx+1, &kr);
+ bp = getptr(&kr, &fill);
+ if(fill + blkfill(m) < spc){
+ r = getblk(bp, 0);
+ rotmerge(t, p, pp, idx, m, r);
+ goto Done;
+ }
+ }
+Done:
+ dropblk(m);
+ dropblk(l);
+ dropblk(r);
+ poperror();
+}
+
+/*
+ * Rewrites the blocks along path from the bottom up.
+ * The bottom element is updated or split first (as a
+ * leaf if it is one); every pivot above it is then
+ * updated or split in turn, with pp pointing at the
+ * element just processed below it. If the topmost
+ * processed element was split, a new pivot root is built
+ * in path[0]. Returns the path element whose nl is the
+ * new root.
+ */
+static Path*
+flush(Tree *t, Path *path, int npath)
+{
+
+ Path *up, *p, *pp, *rp;
+ Kvp mid;
+
+ /*
+ * The path must contain at minimum two elements:
+ * we must have 1 node we're inserting into, and
+ * an empty element at the top of the path that
+ * we put the new root into if the root gets split.
+ */
+ assert(npath >= 2);
+ rp = nil;
+ pp = nil;
+ p = &path[npath - 1];
+ up = &path[npath - 2];
+ if(p->b->type == Tleaf){
+ if(!filledleaf(p->b, up->sz)){
+ /* p-1 is the same element as up here */
+ updateleaf(t, p-1, p);
+ enqueue(p->nl);
+ rp = p;
+ }else{
+ splitleaf(t, up, p, &mid);
+ enqueue(p->nl);
+ enqueue(p->nr);
+ }
+ p->midx = -1;
+ pp = p;
+ up--;
+ p--;
+ }
+ /* path[0] is the placeholder, so stop before reaching it */
+ while(p != path){
+ if(!filledpiv(p->b, 1)){
+ trybalance(t, p, pp, p->idx);
+ /* If we merged the root node, break out. */
+ if(up == path && pp != nil && pp->op == POmerge && p->b->nval == 2){
+ rp = pp;
+ goto Out;
+ }
+ updatepiv(t, up, p, pp);
+ enqueue(p->nl);
+ rp = p;
+ }else{
+ splitpiv(t, up, p, pp, &mid);
+ enqueue(p->nl);
+ enqueue(p->nr);
+ }
+ pp = p;
+ up--;
+ p--;
+ }
+ /* the old root split: build a new root above its two halves */
+ if(pp->nl != nil && pp->nr != nil){
+ rp = &path[0];
+ rp->nl = newblk(t, Tpivot);
+ rp->npull = pp->npull;
+ rp->pullsz = pp->pullsz;
+ copyup(rp->nl, pp, nil);
+ enqueue(rp->nl);
+ }
+Out:
+ return rp;
+}
+
+/*
+ * Releases everything a path holds: the old block b and
+ * the block m of every element are freed in the tree,
+ * the references to b, nl and nr are dropped, and the
+ * path array itself is freed. A nil path with npath of
+ * zero is accepted.
+ */
+static void
+freepath(Tree *t, Path *path, int npath)
+{
+	Path *p;
+
+	for(p = path; p != path + npath; p++){
+		if(p->b != nil)
+			freeblk(t, p->b);
+		/* free m itself; freeing b here would free it twice */
+		if(p->m != nil)
+			freeblk(t, p->m);
+		dropblk(p->b);
+		dropblk(p->nl);
+		dropblk(p->nr);
+	}
+	free(path);
+}
+
+/*
+ * Select the child node with the largest message
+ * segment in the current node's buffer, and record
+ * the block, the segment bounds and its size in p.
+ */
+static void
+victim(Blk *b, Path *p)
+{
+ int i, j, lo, maxsz, cursz;
+ Kvp kv;
+ Msg m;
+
+ j = 0;
+ maxsz = 0;
+ p->b = b;
+ /*
+ * Start at the second pivot: all values <= this
+ * go to the first node. Stop *after* the last entry,
+ * because entries >= the last entry all go into it.
+ */
+ for(i = 1; i <= b->nval; i++){
+ if(i < b->nval)
+ getval(b, i, &kv);
+ cursz = 0;
+ lo = j;
+ /* j carries over, so each message is visited once */
+ for(; j < b->nbuf; j++){
+ getmsg(b, j, &m);
+ if(i < b->nval && keycmp(&m, &kv) >= 0)
+ break;
+ /* 2 bytes for offset, plus message size in buffer */
+ cursz += msgsz(&m);
+ }
+ /* strictly larger: ties keep the earlier child */
+ if(cursz > maxsz){
+ maxsz = cursz;
+ p->op = POmod;
+ p->lo = lo;
+ p->hi = j;
+ p->sz = maxsz;
+ p->idx = i - 1;
+ p->midx = i - 1;
+ p->npull = 0;
+ p->pullsz = 0;
+ }
+ }
+}
+
+/*
+ * Inserts msg[0..nmsg) straight into the buffer of a
+ * copy of the root block b, without descending the tree.
+ * The messages are appended first, then the 2-byte entry
+ * of each one in the table at data+Pivspc is moved to its
+ * sorted position. The copy becomes the new root of t and
+ * the old block is freed.
+ */
+static void
+fastupsert(Tree *t, Blk *b, Msg *msg, int nmsg)
+{
+ int i, c, o, ri, lo, hi, mid, nbuf;
+ Msg cmp;
+ char *p;
+ Blk *r;
+
+ if((r = dupblk(t, b)) == nil)
+ error(Enomem);
+
+ nbuf = r->nbuf;
+ for(i = 0; i < nmsg; i++)
+ setmsg(r, &msg[i]);
+
+ for(i = 0; i < nmsg; i++){
+ /* binary search over the entries sorted so far */
+ ri = -1;
+ lo = 0;
+ hi = nbuf+i-1;
+ while(lo <= hi){
+ mid = (hi + lo) / 2;
+ getmsg(r, mid, &cmp);
+ c = keycmp(&msg[i], &cmp);
+ switch(c){
+ case -1:
+ hi = mid-1;
+ break;
+ case 0:
+ /* keep looking right: insert after equal keys */
+ ri = mid+1;
+ lo = mid+1;
+ break;
+ case 1:
+ lo = mid+1;
+ break;
+ }
+ }
+ if(ri == -1)
+ ri = hi+1;
+ /* move entry nbuf+i down to slot ri, shifting the rest up */
+ p = r->data + Pivspc + 2*(nbuf+i);
+ o = UNPACK16(p);
+ p = r->data + Pivspc + 2*ri;
+ memmove(p+2, p, 2*(nbuf+i-ri));
+ PACK16(p, o);
+ }
+ enqueue(r);
+
+ lock(&t->lk);
+ t->bp = r->bp;
+ t->dirty = 1;
+ unlock(&t->lk);
+
+ freeblk(t, b);
+ dropblk(b);
+ dropblk(r);
+}
+
+
+/*
+ * Applies nmsg messages to the tree t. The messages are
+ * sorted in place first. If the root is a pivot with room
+ * in its buffer they are inserted directly; otherwise a
+ * path is built downwards by repeatedly picking the child
+ * with the largest pending segment, and flushed. When a
+ * flush pulls fewer than nmsg messages the whole procedure
+ * is repeated for the remainder.
+ */
+void
+btupsert(Tree *t, Msg *msg, int nmsg)
+{
+	int i, npath, npull, dh, sz, height;
+	Path *path, *rp;
+	Blk *b, *rb;
+	Kvp sep;
+	Bptr bp;
+
+	assert(!canqlock(&fs->mutlk));
+	sz = 0;
+	stablesort(msg, nmsg);
+	for(i = 0; i < nmsg; i++)
+		sz += msgsz(&msg[i]);
+	npull = 0;
+	path = nil;
+	npath = 0;
+
+Again:
+	if(waserror()){
+		freepath(t, path, npath);
+		nexterror();
+	}
+
+	b = getroot(t, &height);
+	if(npull == 0 && b->type == Tpivot && !filledbuf(b, nmsg, sz)){
+		fastupsert(t, b, msg, nmsg);
+		poperror();
+		return;
+	}
+	/*
+	 * The tree can grow in height by 1 when we
+	 * split, so we allocate room for one extra
+	 * node in the path.
+	 */
+	npath = 0;
+	if((path = calloc((height + 2), sizeof(Path))) == nil)
+		error(Enomem);
+	path[npath].b = nil;
+	path[npath].idx = -1;
+	path[npath].midx = -1;
+	npath++;
+
+	/* path[0] stands for the caller's messages not yet pulled */
+	path[0].sz = sz;
+	path[0].ins = msg;
+	path[0].lo = npull;
+	path[0].hi = nmsg;
+	while(b->type == Tpivot){
+		if(!filledbuf(b, nmsg, path[npath - 1].sz))
+			break;
+		victim(b, &path[npath]);
+		getval(b, path[npath].idx, &sep);
+		bp = unpackbp(sep.v, sep.nv);
+		b = getblk(bp, 0);
+		npath++;
+	}
+	path[npath].b = b;
+	path[npath].idx = -1;
+	path[npath].midx = -1;
+	path[npath].lo = -1;
+	path[npath].hi = -1;
+	path[npath].npull = 0;
+	path[npath].pullsz = 0;
+	npath++;
+
+	rp = flush(t, path, npath);
+	rb = rp->nl;
+
+	/* the topmost rewritten element tells how the height changed */
+	if(path[0].nl != nil)
+		dh = 1;
+	else if(path[1].nl != nil)
+		dh = 0;
+	else if(npath >2 && path[2].nl != nil)
+		dh = -1;
+	else
+		fatal("broken path change");
+
+	assert(rb->bp.addr != 0);
+
+	lock(&t->lk);
+	traceb("setroot", rb->bp);
+	t->ht += dh;
+	t->bp = rb->bp;
+	t->dirty = 1;
+	unlock(&t->lk);
+
+	npull += rp->npull;
+	freepath(t, path, npath);
+	/*
+	 * Forget the freed path: the handler installed at
+	 * Again would otherwise free it a second time if
+	 * the retry fails before a new path is allocated.
+	 */
+	path = nil;
+	npath = 0;
+	poperror();
+
+	if(npull != nmsg){
+		tracem("short pull");
+		goto Again;
+	}
+}
+
+/*
+ * Takes a snapshot of the root pointer (and, when h is
+ * not nil, the height) of t under the tree lock, then
+ * loads and returns the root block.
+ */
+Blk*
+getroot(Tree *t, int *h)
+{
+	Bptr root;
+	int ht;
+
+	lock(&t->lk);
+	root = t->bp;
+	ht = t->ht;
+	unlock(&t->lk);
+
+	if(h != nil)
+		*h = ht;
+	return getblk(root, 0);
+}
+
+/*
+ * Looks up key k in tree t. The blocks from the root
+ * down are collected in p[], the stored value is searched
+ * for in the lowest one, and the messages buffered for k
+ * are then applied from the lowest pivot up to the root.
+ * The result is placed in r, backed by buf (nbuf bytes).
+ * Returns nonzero if the key exists after all messages
+ * have been applied.
+ */
+int
+btlookup(Tree *t, Key *k, Kvp *r, char *buf, int nbuf)
+{
+ int i, j, h, ok, same;
+ Blk *b, **p;
+ Bptr bp;
+ Msg m;
+
+ b = getroot(t, &h);
+ if((p = calloc(h, sizeof(Blk*))) == nil){
+ dropblk(b);
+ error(Enomem);
+ }
+ ok = 0;
+ /* extra hold on the root; released by dropblk(b) at the end */
+ p[0] = holdblk(b);
+ for(i = 1; i < h; i++){
+ if(blksearch(p[i-1], k, r, &same) == -1)
+ break;
+ bp = unpackbp(r->v, r->nv);
+ p[i] = getblk(bp, 0);
+ }
+ /* p[h-1] stays nil if the descent stopped early */
+ if(p[h-1] != nil)
+ blksearch(p[h-1], k, r, &ok);
+ if(ok)
+ cpkvp(r, r, buf, nbuf);
+ /* apply buffered messages, lowest pivot first */
+ for(i = h-2; i >= 0; i--){
+ if(p[i] == nil)
+ continue;
+ j = bufsearch(p[i], k, &m, &same);
+ if(j < 0 || !same)
+ continue;
+ if(ok || m.op == Oinsert)
+ ok = apply(r, &m, buf, nbuf);
+ else if(m.op != Oclearb && m.op != Oclobber)
+ fatal("lookup %K << %M missing insert\n", k, &m);
+ /* remaining messages for the same key */
+ for(j++; j < p[i]->nbuf; j++){
+ getmsg(p[i], j, &m);
+ if(keycmp(k, &m) != 0)
+ break;
+ ok = apply(r, &m, buf, nbuf);
+ }
+ }
+ for(i = 0; i < h; i++)
+ if(p[i] != nil)
+ dropblk(p[i]);
+ dropblk(b);
+ free(p);
+ return ok;
+}
+
+/*
+ * Resets s for a scan over all keys starting with the
+ * npfx bytes at pfx. The prefix is copied into the scan,
+ * and the current key is initialised to the prefix with
+ * an empty value.
+ */
+void
+btnewscan(Scan *s, char *pfx, int npfx)
+{
+	memset(s, 0, sizeof(*s));
+	memmove(s->pfxbuf, pfx, npfx);
+	s->pfx.nk = npfx;
+	s->pfx.k = s->pfxbuf;
+	s->offset = 0;
+	s->donescan = 0;
+	s->first = 1;
+
+	s->kv.nv = 0;
+	s->kv.v = s->kvbuf + npfx;
+	cpkey(&s->kv, &s->pfx, s->kvbuf, sizeof(s->kvbuf));
+}
+
+/*
+ * Positions the scan s in tree t at its current key
+ * s->kv. Allocates s->path with one entry per level and
+ * loads the blocks from the root down, recording in each
+ * entry the value index (vi) and buffer index (bi) where
+ * the scan continues. Does nothing if the scan is
+ * already finished.
+ */
+void
+btenter(Tree *t, Scan *s)
+{
+ int i, same;
+ Scanp *p;
+ Msg m, c;
+ Bptr bp;
+ Blk *b;
+ Kvp v;
+
+ if(s->donescan)
+ return;
+ b = getroot(t, &s->ht);
+ if((s->path = calloc(s->ht, sizeof(Scanp))) == nil){
+ dropblk(b);
+ error(Enomem);
+ }
+ p = s->path;
+ p[0].b = b;
+ for(i = 0; i < s->ht; i++){
+ p[i].vi = blksearch(b, &s->kv, &v, &same);
+ if(b->type == Tpivot){
+ /* key sorts before every entry: use the first child */
+ if(p[i].vi == -1)
+ getval(b, ++p[i].vi, &v);
+ p[i].bi = bufsearch(b, &s->kv, &m, &same);
+ if(p[i].bi == -1){
+ p[i].bi++;
+ }else if(!same || !s->first){
+ /* scan past repeated messages */
+ while(p[i].bi < p[i].b->nbuf){
+ getmsg(p[i].b, p[i].bi, &c);
+ if(keycmp(&m, &c) != 0)
+ break;
+ p[i].bi++;
+ }
+ }
+ bp = unpackbp(v.v, v.nv);
+ b = getblk(bp, 0);
+ /* assumes a pivot is never at the last level -- TODO confirm */
+ p[i+1].b = b;
+ }else if(p[i].vi == -1 || !same || !s->first)
+ p[i].vi++;
+ }
+ s->first = 0;
+}
+
+/*
+ * Produces the next entry of the scan s in r. The
+ * smallest pending key is chosen among the current leaf
+ * value and the current message of every pivot buffer on
+ * the path; all buffered messages for that key are then
+ * applied to it. Entries that do not survive their
+ * messages are skipped. Returns 1 when r holds an entry,
+ * 0 when the scan is finished or the key no longer has
+ * the scan prefix.
+ */
+int
+btnext(Scan *s, Kvp *r)
+{
+ int i, j, h, ok, start, bufsrc;
+ Scanp *p;
+ Msg m, n;
+ Bptr bp;
+ Kvp kv;
+
+Again:
+ p = s->path;
+ h = s->ht;
+ start = h;
+ bufsrc = -1;
+ if(s->donescan)
+ return 0;
+ if(waserror()){
+ btexit(s);
+ nexterror();
+ }
+ /* load up the correct blocks for the scan */
+ for(i = h-1; i >= 0; i--){
+ if(p[i].b != nil
+ &&(p[i].vi < p[i].b->nval || p[i].bi < p[i].b->nbuf))
+ break;
+ if(i == 0){
+ s->donescan = 1;
+ poperror();
+ return 0;
+ }
+ /* level exhausted: drop it and advance the parent */
+ if(p[i].b != nil)
+ dropblk(p[i].b);
+ p[i].b = nil;
+ p[i].vi = 0;
+ p[i].bi = 0;
+ p[i-1].vi++;
+ start = i;
+ }
+
+ if(p[start-1].vi < p[start-1].b->nval){
+ /* reload the dropped levels below start-1 */
+ for(i = start; i < h; i++){
+ getval(p[i-1].b, p[i-1].vi, &kv);
+ bp = unpackbp(kv.v, kv.nv);
+ p[i].b = getblk(bp, 0);
+ }
+
+ /* find the minimum key along the path up */
+ m.op = Oinsert;
+ getval(p[h-1].b, p[h-1].vi, &m);
+ }else{
+ /* no values left at this level: continue from its buffer */
+ getmsg(p[start-1].b, p[start-1].bi, &m);
+ assert(m.op == Oinsert);
+ bufsrc = start-1;
+ }
+
+ for(i = h-2; i >= 0; i--){
+ if(p[i].b == nil || p[i].bi == p[i].b->nbuf)
+ continue;
+ getmsg(p[i].b, p[i].bi, &n);
+ if(keycmp(&n, &m) < 0){
+ bufsrc = i;
+ m = n;
+ }
+ }
+ /* left the prefix range: the scan is over */
+ if(m.nk < s->pfx.nk || memcmp(m.k, s->pfx.k, s->pfx.nk) != 0){
+ s->donescan = 1;
+ poperror();
+ return 0;
+ }
+
+ /* scan all messages applying to the message */
+ ok = 1;
+ cpkvp(r, &m, s->kvbuf, sizeof(s->kvbuf));
+ /* step past the entry we just took */
+ if(bufsrc == -1)
+ p[h-1].vi++;
+ else
+ p[bufsrc].bi++;
+ for(i = h-2; i >= 0; i--){
+ for(j = p[i].bi; p[i].b != nil && j < p[i].b->nbuf; j++){
+ getmsg(p[i].b, j, &m);
+ if(keycmp(r, &m) != 0)
+ break;
+ ok = apply(r, &m, s->kvbuf, sizeof(s->kvbuf));
+ p[i].bi++;
+ }
+ }
+ poperror();
+ /* the entry did not survive its messages: try the next one */
+ if(!ok)
+ goto Again;
+ return 1;
+}
+
+/*
+ * Releases the blocks held by the scan s and frees its
+ * path. Safe to call when no path was allocated, and safe
+ * to call more than once: btnext() already calls it from
+ * its error handler, and btenter() can return or fail
+ * before s->path is set.
+ */
+void
+btexit(Scan *s)
+{
+	int i;
+
+	if(s->path == nil)
+		return;
+	for(i = 0; i < s->ht; i++)
+		dropblk(s->path[i].b);
+	free(s->path);
+	s->path = nil;
+}
--- /dev/null
+++ b/user.c
@@ -1,0 +1,260 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Reads the first len bytes of the file identified by
+ * path in tree t into a freshly allocated, NUL-terminated
+ * buffer. The data blocks are looked up one by one at
+ * offsets that are multiples of Blksz. The caller frees
+ * the result. Raises Enomem if the buffer cannot be
+ * allocated and Esrch if a data block is missing; the
+ * buffer is released on any error.
+ */
+static char*
+slurp(Tree *t, vlong path, vlong len)
+{
+	char *ret, buf[Offksz], kvbuf[Offksz + Ptrsz];
+	vlong o, n;
+	Blk *b;
+	Bptr bp;
+	Key k;
+	Kvp kv;
+
+	if((ret = malloc(len + 1)) == nil)
+		error(Enomem);
+	/* don't leak the buffer if a lookup or block read fails */
+	if(waserror()){
+		free(ret);
+		nexterror();
+	}
+	k.k = buf;
+	k.nk = Offksz;
+	for(o = 0; o < len; o += Blksz){
+		k.k[0] = Kdat;
+		PACK64(k.k+1, path);
+		PACK64(k.k+9, o);
+		if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf)))
+			error(Esrch);
+		bp = unpackbp(kv.v, kv.nv);
+		b = getblk(bp, GBraw);
+		/* the last block may be only partly used */
+		n = len - o;
+		if(n > Blksz)
+			n = Blksz;
+		memcpy(ret + o, b->buf, n);
+		/* release the reference taken by getblk */
+		dropblk(b);
+	}
+	poperror();
+	ret[len] = 0;
+	return ret;
+}
+
+/*
+ * Copies the next newline-terminated line at *p into buf
+ * without the newline, truncating it to fit nbuf, and
+ * advances *p past the newline. Returns buf, or nil when
+ * no newline remains (a final unterminated line is not
+ * returned).
+ */
+static char*
+readline(char **p, char *buf, int nbuf)
+{
+	char *start, *nl;
+	int len;
+
+	start = *p;
+	nl = strchr(start, '\n');
+	if(nl == nil)
+		return nil;
+	/* one past the line length: strecpy stops before its end pointer */
+	len = (nl - start) + 1;
+	if(len > nbuf - 1)
+		len = nbuf - 1;
+	strecpy(buf, buf + len, start);
+	*p = nl + 1;
+	return buf;
+}
+
+/*
+ * Splits the next delim-separated field off the string
+ * at *p. The delimiter is overwritten with a NUL and *p
+ * is advanced past it; if no delimiter is left, *p
+ * becomes nil. Returns the start of the field, or nil
+ * when *p was already nil.
+ */
+static char*
+getfield(char **p, char delim)
+{
+	char *field, *sep;
+
+	field = *p;
+	if(field == nil)
+		return nil;
+	sep = strchr(field, delim);
+	if(sep == nil){
+		*p = nil;
+		return field;
+	}
+	*sep = '\0';
+	*p = sep + 1;
+	return field;
+}
+
+/*
+ * Returns the first entry of the user table whose name
+ * equals name, or nil if there is none.
+ */
+User*
+name2user(char *name)
+{
+	User *u, *end;
+
+	end = fs->users + fs->nusers;
+	for(u = fs->users; u < end; u++){
+		if(strcmp(u->name, name) == 0)
+			return u;
+	}
+	return nil;
+}
+
+/*
+ * Returns the first entry of the user table whose id
+ * equals id, or nil if there is none.
+ */
+User*
+uid2user(int id)
+{
+	User *u, *end;
+
+	end = fs->users + fs->nusers;
+	for(u = fs->users; u < end; u++){
+		if(u->id == id)
+			return u;
+	}
+	return nil;
+}
+
+/*
+ * Parses the users file text in udata (lines of the form
+ * id:name:leader:members) and installs the result as the
+ * file system's user table. Lines that are empty or begin
+ * with '#' are skipped. The text is read in two passes:
+ * the first collects ids and names, the second resolves
+ * leader and member names against them. udata itself is
+ * not modified; each line is copied into a local buffer.
+ * Diagnostics are printed to fd. Returns nil on success,
+ * or an error string, in which case the installed table
+ * is left untouched.
+ */
+static char*
+parseusers(int fd, char *udata)
+{
+	char *pu, *p, *f, *m, *err, buf[8192];
+	int i, j, lnum, ngrp, nusers, usersz;
+	User *u, *n, *users;
+	int *g, *grp;
+
+	i = 0;
+	err = nil;
+	nusers = 0;
+	usersz = 8;
+	if((users = calloc(usersz, sizeof(User))) == nil)
+		return Enomem;
+	/* pass 1: ids and names */
+	pu = udata;
+	lnum = 0;
+	while((p = readline(&pu, buf, sizeof(buf))) != nil){
+		lnum++;
+		if(p[0] == '#' || p[0] == 0)
+			continue;
+		if(i == usersz){
+			usersz *= 2;
+			n = realloc(users, usersz*sizeof(User));
+			if(n == nil){
+				free(users);
+				return Enomem;
+			}
+			users = n;
+			/*
+			 * realloc does not clear the new entries; zero
+			 * them so that fields pass 2 may leave unset
+			 * (lead, when no leader is named) start out the
+			 * same as in the calloc'd part of the table.
+			 */
+			memset(users + i, 0, (usersz - i)*sizeof(User));
+		}
+		if((f = getfield(&p, ':')) == nil){
+			fprint(fd, "/adm/users:%d: missing ':' after id\n", lnum);
+			err = Esyntax;
+			goto Error;
+		}
+		u = &users[i];
+		u->id = atol(f);
+		if((f = getfield(&p, ':')) == nil){
+			fprint(fd, "/adm/users:%d: missing ':' after name\n", lnum);
+			err = Esyntax;
+			goto Error;
+		}
+		snprint(u->name, sizeof(u->name), "%s", f);
+		u->memb = nil;
+		u->nmemb = 0;
+		i++;
+	}
+	nusers = i;
+
+
+	/* pass 2: leaders and members, now that every name is known */
+	i = 0;
+	pu = udata;
+	lnum = 0;
+	while((p = readline(&pu, buf, sizeof(buf))) != nil){
+		lnum++;
+		if(buf[0] == '#' || buf[0] == 0)
+			continue;
+		getfield(&p, ':');	/* skip id */
+		getfield(&p, ':');	/* skip name */
+		if((f = getfield(&p, ':')) == nil){
+			fprint(fd, "/adm/users:%d: missing ':' after name\n", lnum);
+			err = Esyntax;
+			goto Error;
+		}
+		if(f[0] != '\0'){
+			u = nil;
+			for(j = 0; j < nusers; j++)
+				if(strcmp(users[j].name, f) == 0)
+					u = &users[j];
+			if(u == nil){
+				fprint(fd, "/adm/users:%d: leader %s does not exist\n", lnum, f);
+				err = Enouser;
+				goto Error;
+			}
+			users[i].lead = u->id;
+		}
+		if((f = getfield(&p, ':')) == nil){
+			err = Esyntax;
+			goto Error;
+		}
+		grp = nil;
+		ngrp = 0;
+		while((m = getfield(&f, ',')) != nil){
+			if(m[0] == '\0')
+				continue;
+			u = nil;
+			for(j = 0; j < nusers; j++)
+				if(strcmp(users[j].name, m) == 0)
+					u = &users[j];
+			if(u == nil){
+				fprint(fd, "/adm/users:%d: user %s does not exist\n", lnum, m);
+				free(grp);
+				err = Enouser;
+				goto Error;
+			}
+			if((g = realloc(grp, (ngrp+1)*sizeof(int))) == nil){
+				free(grp);
+				err = Enomem;
+				goto Error;
+			}
+			grp = g;
+			grp[ngrp++] = u->id;
+		}
+		users[i].memb = grp;
+		users[i].nmemb = ngrp;
+		i++;
+	}
+
+	/* install the new table; the old one is freed below */
+	wlock(&fs->userlk);
+	n = fs->users;
+	i = fs->nusers;
+	fs->users = users;
+	fs->nusers = nusers;
+	wunlock(&fs->userlk);
+	users = n;
+	nusers = i;
+
+Error:
+	/* frees the old table on success, the partial new one on error */
+	if(users != nil)
+		for(i = 0; i < nusers; i++)
+			free(users[i].memb);
+	free(users);
+
+	return err;
+
+}
+
+/*
+ * Loads the user table from the file "users" found by
+ * walking from the root of tree t. The file must not be
+ * a directory and must be smaller than 1 MiB. If parsing
+ * fails, the old table is kept when there is one;
+ * otherwise the built-in default table is installed, but
+ * only in permissive mode. In the other failure cases the
+ * parse error is raised. Finally the ids of "none", "adm"
+ * and "nogroup" are cached. Diagnostics from parsing are
+ * printed to fd.
+ */
+void
+loadusers(int fd, Tree *t)
+{
+	char *s, *e;
+	vlong len;
+	Qid q;
+	User *u;
+
+	if(walk1(t, -1, "", &q, &len) == -1)
+		error(Efs);
+	if(walk1(t, q.path, "users", &q, &len) == -1)
+		error(Esrch);
+	if(q.type & QTDIR)
+		error(Etype);
+	if(len >= 1*MiB)
+		error(Efsize);
+	s = slurp(t, q.path, len);
+	e = parseusers(fd, s);
+	/*
+	 * The parser keeps no pointers into the text, so free
+	 * it now: the error() calls below do not return here.
+	 */
+	free(s);
+	if(e != nil){
+		if(fs->users != nil){
+			fprint(2, "load users: %s\n", e);
+			fprint(2, "keeping old table\n");
+			error(e);
+		}
+		if(!permissive){
+			fprint(2, "user table broken: %s\n", e);
+			fprint(2, "\tnot permissive: bailing\n");
+			error(e);
+		}
+		fprint(2, "user table broken: %s\n", e);
+		fprint(2, "\tfalling back to default\n");
+		parseusers(fd, "-1:adm::\n0:none::\n");
+	}
+	if((u = name2user("none")) != nil)
+		noneid = u->id;
+	if((u = name2user("adm")) != nil)
+		admid = u->id;
+	if((u = name2user("nogroup")) != nil)
+		nogroupid = u->id;
+}