shithub: riscv

Download patch

ref: da1daf301206b27d0385c0d4276c18eaa22e8e25
parent: 9fdbc87b8edfb64f28c6a985f123cf514b8827df
author: Ori Bernstein <ori@eigenstate.org>
date: Thu May 16 16:47:43 EDT 2024

gefs: initial import

diff: cannot open b/sys/src/cmd/gefs//null: file does not exist: 'b/sys/src/cmd/gefs//null'
--- a/rc/bin/fshalt
+++ b/rc/bin/fshalt
@@ -31,6 +31,7 @@
 c=`{ls /srv/cwfs*cmd >[2]/dev/null}
 h=`{ls /srv/hjfs*cmd >[2]/dev/null}
 e=`{ls /srv/ext4*cmd >[2]/dev/null}
+g=`{ls /srv/gefs*cmd >[2]/dev/null}
 s=`{awk '/^sd./ {print substr($1,3,1)}' '#S/sdctl' >[2]/dev/null}
 
 # for scram, don't scram other systems
@@ -66,9 +67,9 @@
 fn x {
 	echo
 	echo -n halting...
-	for(i in $c $h $e)
+	for(i in $c $h $e $g)
 		echo halt >>$i
-	for(i in $c $h $e){
+	for(i in $c $h $e $g){
 		echo -n $i...
 		while(test -e $i)
 			sleep 1
--- /dev/null
+++ b/sys/doc/gefs.ms
@@ -1,0 +1,1179 @@
+.am DS
+.ft I
+..
+.ta 1i 2.3i 4.5i  (optional to set tabs)
+.TL
+GEFS, A Good Enough File System
+.AU
+Ori Bernstein
+ori@eigenstate.org
+.AB
+GEFS is a new file system built for Plan 9.
+It aims to be a crash-safe, corruption-detecting, simple, and fast snapshotting file system, in that order.
+GEFS achieves these goals by building a traditional 9p file system interface on top of a forest of copy-on-write Bε trees.
+It doesn't try to be optimal on all axes, but good enough for daily use.
+.AE
+.NH 1
+The Current Situation
+.PP
+Plan 9 has several general purpose disk file systems available.
+While they have served us well, all of them leave much to be desired.
+On power loss, the file systems may get corrupted.
+Partial disk failure is not caught by the file system, and reads may silently return incorrect data.
+They tend to require a large, unshrinkable disk for archival dumps, and behave poorly when the disk fills.
+Additionally, all of them perform O(n) scans to look up files in directories when walking to a file.
+This causes poor performance in large directories.
+.PP
+CWFS, the default file system on 9front, has proven to be performant and reliable, but is not crash safe.
+While the root file system can be recovered from the dump, this is inconvenient and can lead to a large amount of lost data.
+It has no way to reclaim space from the dump.
+In addition, due to its age, it has a lot of historical baggage and complexity.
+.PP
+HJFS, a new experimental system in 9front, is extremely simple, with fewer lines of code than any of the other on-disk storage options.
+It has dumps, but does not separate dump storage from cache storage, allowing full use of small disks.
+However, it is extremely slow, not crash safe, and lacks consistency check and recovery mechanisms.
+.PP
+Finally, fossil, the default file system on 9legacy, is large and complicated.
+It uses soft-updates for crash safety[7], an approach that has worked poorly in practice for the BSD filesystems[8].
+While the bugs can be fixed as they're found, simplicity requires a rethink of the on disk data structures.
+And even after adding all this complexity, the fossil+venti system provides no way to recover space when the disk fills.
+.NH 1
+Why GEFS Is Good Enough
+.PP
+GEFS aims to solve these problems with the above file systems.
+The data and metadata is copied on write, with atomic commits.
+This happens by construction, with fewer subtle ordering requirements than soft updates.
+If the file server crashes before the superblocks are updated,
+then the next mount will see the last commit that was synced to disk.
+Some data may be lost, but no corruption will occur.
+Furthermore, because of the use of an indexed data structure, directories do not suffer from O(n) lookups,
+solving a long standing performance issue with large directories.
+.PP
+The file system is based around a relatively novel data structure: the Bε tree [1].
+The Bε tree is a write optimized variant of a B+ tree.
+In addition to good overall performance, it plays particularly nicely with copy on write semantics.
+This allows GEFS to greatly reduce write amplification seen with traditional copy on write B-trees.
+The reduced write amplification allows GEFS to get away with a nearly trivial implementation of snapshotting.
+.PP
+As a result of the choice of data structure, archival dumps are replaced with snapshots.
+Snapshots may be deleted at any time, allowing data within a snapshot to be reclaimed for reuse.
+To enable this, each block pointer contains a birth generation.
+Blocks are reclaimed using a deadlist algorithm inspired by ZFS.
+This algorithm is described later in the paper.
+.PP
+While snapshot consistency is useful to keep data consistent, disks often fail over time.
+In order to detect corruption, block pointers contain a hash of the data that they point at.
+If corrupted data is returned by the underlying storage medium, this is detected via the block hashes.
+And if a programmer error causes the file system to write garbage to disk, this can often be caught early.
+The corruption is reported, and the damaged data may then be recovered from backups, RAID restoration, or some other means.
+.PP
+By selecting a suitable data structure, a large amount of complexity elsewhere in the file system falls away.
+The complexity of the core data structure pays dividends.
+Being able to atomically update multiple attributes in the Bε tree,
+making the core data structure safely traversable without locks,
+and having a simple, unified set of operations makes everything else simpler.
+.NH 1
+Bε Trees: A Short Summary
+.PP
+The core data structure used in GEFS is a Bε tree.
+A Bε tree is a modification of a B+ tree, which optimizes writes
+by adding a write buffer to the pivot nodes.
+Like B-trees, Bε trees consist of leaf nodes, which contain keys and values, and pivot nodes.
+Like B-trees, the pivot nodes contain pointers to their children, which are either pivot nodes or leaf nodes.
+Unlike B-trees, the pivot nodes also contain a write buffer.
+.PP
+The Bε tree implements a simple key-value API, with point queries and range scans.
+It diverges from a traditional B-tree key value store by the addition of an upsert operation.
+Upsert operations are operations that insert a modification message into the tree.
+These modifications are addressed to a key.
+.PP
+To insert to the tree, the root node is copied, and the new message is
+inserted into its write buffer.
+When the write buffer is full, it is inspected, and the number of messages directed
+to each child is counted up.
+The child with the largest number of pending writes is picked as the victim.
+The root's write buffer is flushed into the selected victim.
+This proceeds recursively down the tree, until either an intermediate node has
+sufficient space in its write buffer, or the messages reach a leaf node, at which
+point the value in the leaf is updated.
+.PP
+In order to query a value, the tree is walked as normal, however the path to the
+leaf node is recorded.
+When a value is found, the write buffers along the path to the root are inspected,
+and any messages that have not yet reached the leaves are applied to the final
+value read back.
+.PP
+Because mutations to the leaf nodes are messages that describe a mutation, updates to
+data may be performed without inspecting the data at all.
+For example, when writing to a file, the modification time and QID version of the file
+may be incremented without inspecting the current QID; a 'new version' message may
+be upserted instead.
+This allows skipping read-modify-write cycles that access distant regions of the tree,
+in favor of a simple insertion into the root node's write buffer.
+Additionally, because all upserts go into the root node, a number of operations may
+be upserted in a single update. As long as we ensure that there is sufficient space
+in the root node's write buffer, the batch insert is atomic.
+Inserts and deletions are upserts, but so are mutations to existing data.
+.PS
+.ps 6
+.vs 4
+boxht=0.2
+down
+
+R: [
+	right
+R0:	box "k0" wid 0.2
+	box "k16" wid 0.2
+	box "k32" wid 0.2
+R1:	box "k48" wid 0.2
+	box "m0" wid 0.2 fill
+	box "m1" wid 0.2 fill
+	box  wid 0.6 fill
+]
+move down 0.5
+P: [
+	right
+P0:	box "k0" wid 0.2
+P1:	box "k4" wid 0.2
+	box "k8" wid 0.2
+	box "k12" wid 0.2
+	box "m0" wid 0.2 fill
+	box "m1" wid 0.2 fill
+	box  wid 0.6 fill
+	
+	box invis wid 1 "..."
+	
+P2:	box "k48" wid 0.2
+	box "k56" wid 0.2
+	box "k60" wid 0.2
+	box "k64" wid 0.2
+	box "m0" wid 0.2 fill
+	box "m1" wid 0.2 fill
+	box  wid 0.6 fill
+]
+move down 0.5
+
+L: [
+	right
+L0:	box "k0" wid 0.2
+	box "v0" wid 0.2
+	box "..." wid 0.2
+	box "k3" wid 0.2
+	box "v3" wid 0.2
+
+	box invis wid 1
+
+L1:	box "k4" wid 0.2
+	box "v4" wid 0.2
+	box "..." wid 0.2
+	box "k7" wid 0.2
+	box "v7" wid 0.2
+
+B0:	box invis wid 1 "..."
+
+L2:	box "k48" wid 0.2
+	box "v49" wid 0.2
+	box "..." wid 0.2
+	box "k54" wid 0.2
+	box "v55" wid 0.2
+]
+
+arrow from R.R0.s to P.P0.n
+arrow from R.R1.s to P.P2.n
+
+arrow from P.P0.s to L.L0.n
+arrow from P.P1.s to L.L1.n
+arrow from P.P2.s to L.L2.n
+.PE
+.PP
+For the sake of simplicity, GEFS makes all blocks the same size.
+This implies that the Bε tree blocks are smaller than optimal,
+and the disk blocks are larger than optimal.
+The simplifications this allows in the block layer appear to be worthwhile.
+.PP
+Within a single block, the pivot keys are stored as offsets to variable width data.
+The data itself is unsorted, but the offsets pointing to it are sorted.
+This allows O(1) access to the keys and values given an index, or O(log(n))
+access while searching, while allowing variable size keys and values.
+.PS
+.ps 6
+.vs 4
+boxht=0.3
+box "o0" wid 0.2
+box "o1" wid 0.2
+box "o2" wid 0.2
+box  "unused" wid 0.8 fill
+box "k2" wid 0.2
+box "v2" wid 0.7
+box "k0" wid 0.2
+box "v0" wid 0.3
+box "k1" wid 0.4
+box "v1" wid 0.2
+.PE
+.PP
+In order to allow for efficient copy on write operation, the Bε tree in GEFS relaxes several
+of the balance properties of B-trees [5].
+It allows for a smaller amount of fill than would normally be required, and merges nodes with
+their siblings opportunistically.
+In order to prevent sideways pointers between sibling nodes that would need copy on write updates,
+the fill levels are stored in the parent blocks, and updated when updating the child pointers.
+.NH 1
+Mapping Files to Bε Operations
+.PP
+With a description of the core data structure completed, we now need
+to describe how a file system is mapped on to Bε trees.
+.PP
+A GEFS file system consists of a snapshot tree, which points to a number of file system trees.
+The snapshot tree exists to track snapshots, and will be covered later.
+Each snapshot points to a single GEFS metadata tree, which contains all file system state for
+a single version of the file system.
+GEFS is somewhat unique in that all file system data is recorded within a single flat key value
+store.
+There are no directory structures, no indirect blocks, and no other traditional structures.
+Instead, GEFS has the following key-value pairs:
+.LP
+.CW "Kdat(qid, offset) → (ptr)"
+.IP
+Data keys store pointers to data blocks.
+The key is the file qid, concatenated to the block-aligned file offset.
+The value is the pointer to the data block that is being looked up.
+.LP
+.CW "Kent(pqid, name) → (stat)"
+.IP
+Entry keys contain file metadata.
+The key is the qid of the containing directory, concatenated to the name of the file within the directory.
+The value is a stat struct, containing the file metadata, including the qid of the directory entry.
+.LP
+.CW "Kup(qid) → Kent(pqid, name)"
+.IP
+Up keys are maintained so that '..' walks can find their parent directory.
+The key is the qid of the directory.
+The value is the key for the parent directory.
+.PP
+Walking a path is done by starting at the root, which has a parent qid of ~0, and a name of "/".
+The QID of the root is looked up, and the key for the next step on the walk is constructed
+by concatenating the walk element with the root qid.
+This produces the key for the next walk element, which is then looked up, and the next key
+for the walk path is constructed. This continues until the full walk has completed.
+If one of the path elements is '..' instead of a name, then the up key is inspected
+instead to find the parent link of the directory.
+.PP
+If we had a file hierarchy containing the paths 'foo/bar', 'foo/baz/meh', 'quux', 'blorp',
+with 'blorp' containing the text 'hello world', this file system may be represented
+with the following set of keys and values:
+.P1
+Kdat(qid=3, off=0) → Bptr(off=0x712000, hash=04a73, gen=712)
+Kent(pqid=1, name='blorp') → Dir(qid=3, mode=0644, ...)
+Kent(pqid=1, name='foo') → Dir(qid=2, mode=DMDIR|0755, ...)
+Kent(pqid=1, name='quux') → Dir(qid=4, mode=0644, ...)
+Kent(pqid=2, name='bar') → Dir(qid=6, mode=DMDIR|0755, ...)
+Kent(pqid=2, name='baz') → Dir(qid=5, mode=DMDIR|0755, ...)
+Kent(pqid=5, name='meh') → Dir(qid=7, mode=0600, ...)
+Kent(pqid=-1, name='') → Dir(qid=1, mode=DMDIR|0755, ...)
+Kup(qid=2) → Kent(pqid=-1, name='')
+Kup(qid=5) → Kent(pqid=2, name='foo')
+.P2
+Note that all of the keys for a single directory are grouped because they sort together,
+and that if we were to read a file sequentially, all of the data keys for the file would
+be similarly grouped.
+.PP
+If we were to walk 
+.CW "foo/bar"
+then we would begin by constructing the key
+.CW "Kent(-1, '')"
+to get the root directory entry.
+The directory entry contains the qid.
+For this example, let's assume that the root qid is 123.
+The key for
+.CW foo
+is then constructed by concatenating the root qid to the first walk name, giving the key
+.CW "Kent(123, foo)"
+This is then looked up, giving the directory entry for 
+.CW foo .
+If the directory entry contains the qid 234, then the key
+.CW "Kent(234, bar)"
+is then constructed and looked up.
+The walk is then done.
+.PP
+Because a Bε tree is a sorted data structure, range scans are efficient.
+As a result, listing a directory is done by doing a range scan of all keys
+that start with the qid of the directory entry.
+.PP
+Reading from a file proceeds in a similar way, though with less iteration: When
+reading from a file, the qid is known, so the block key is created by
+concatenating the file qid with the read offset.
+This is then looked up, and the address of the block containing the data is found.
+The block is then read, and the data is returned.
+.PP
+Writing proceeds in a similar manner to reading, and in the general case begins by
+looking up the existing block containing the data so that it can be modified and
+updated.
+If a write happens to fully cover a data block, then a blind upsert of the data
+is done instead.
+Atomically along with the upsert of the new data, a blind write of the version number increment,
+mtime, and muid is performed.
+.PP
+Stats and wstat operations both construct and look up the keys for the directory entries,
+either upserting modifications or reading the data back directly.
+.NH 1
+Snapshots
+.PP
+Snapshots are an important feature of GEFS.
+Each GEFS snapshot is referred to by a unique integer id, and is fully immutable once it is taken.
+Snapshots are labelled with a human readable string.
+When marked mutable, the labels move to new snapshots as the file system is written to and synced.
+A snapshot may only be referred to by 0 or 1 mutable labels, along with as many immutable labels as desired.
+.PP
+If there was no space reclamation in gefs, then snapshots would be trivial.
+The tree is copy on write.
+Therefore, as long as blocks are never reclaimed, it would be sufficient to save the current root of the tree
+once all blocks in it were synced to disk.
+However, because snapshots are taken every 5 seconds, disk space would get used uncomfortably quickly.
+.PS
+.ps 6
+.vs 4
+boxht=0.2
+down
+
+R: [
+	right
+R0:	box  "piv" wid 0.4
+	box  "buf" wid 0.2 fill
+	box  wid 0.2 fill 0.75
+	move right 0.5
+R1:	box  "piv" wid 0.4
+	box  "buf" wid 0.3 fill
+	box  wid 0.1 fill 0.75
+]
+move down 0.5
+P: [
+	right
+P0:	box "piv" wid 0.4
+	box "buf" wid 0.4 fill
+	
+	box invis wid 1 "..."
+	
+P1:	box "piv" wid 0.4
+	box "buf" wid 0.4 fill
+]
+move down 0.5
+L: [
+	right
+L0:	box "vals" wid 1
+	box invis wid 1
+L1:	box "vals" wid 1
+	box invis wid 1 "..."
+L2:	box "vals" wid 1
+]
+
+arrow from R.R0.sw to P.P0.n
+arrow from R.R0.se to P.P1.n
+arrow from R.R1.sw to P.P0.n
+arrow from R.R1.se to P.P1.n
+arrow from P.P0.sw to L.L0.n
+arrow from P.P0.se to L.L1.n
+arrow from P.P1.s to L.L2.n
+.PE
+.PP
+There are a number of options for space reclamation.
+Some that were considered when implementing GEFS included garbage collection, in the style of HAMMER [3],
+or optimized reference counting in the style of BTRFS [4], but both of these options have significant downsides.
+Garbage collection requires that the entire disk get scanned to find unreferenced blocks.
+This means that there are scheduled performance degradations, and in the limit of throughput, the bandwidth spent scanning
+must approach the bandwidth spent on metadata updates, as each block must be scanned and then reclaimed.
+Reference counting implies a large number of scattered writes to maintain the reference counts of blocks.
+.PP
+As a result, the algorithm for space reclamation is borrowed from ZFS [6].
+It is based on the idea of using deadlists to track blocks that became free within a snapshot.
+If snapshots are immutable, then a block may not be freed as long as a snapshot exists.
+This implies that block lifetimes are contiguous.
+A block may not exist in a snapshot and be available for reallocation.
+Thus, when freeing a block, there are 2 cases: Either a block was born within the pending snapshot, and died within it,
+or it was born in a previous snapshot and was killed by the pending snapshot.
+.PP
+To build intuition, let's start by imagining the crudest possible implementation of snapshot space reclamation.
+Assuming that block pointers contain their birth generation, we can walk the entire tree.
+When a block's birth time is <= the previous snapshot, it is referred to by an older snapshot.
+We may not reclaim it.
+If the subsequent snapshot refers to this block, then it was born in this snapshot but is still in use.
+We may not reclaim it.
+Otherwise, the block is free, and we can reclaim it.
+.PP
+Obviously, this is slow: It involves full tree walks of multiple snapshots.
+It may walk large numbers of blocks that are not freed.
+.PP
+So, in order to do better, we can keep track of blocks that we want to delete from this snapshot as we delete them,
+instead of trying to reconstruct the list when we delete the snapshot.
+When we attempt to delete a block, there are two cases:
+First, the block's birth time may be newer than the previous snapshot, in which case it may be freed immediately.
+And second, the block may have been born in the previous snapshot or earlier, in which case we need to put it on the current
+snapshot's deadlist.
+When the current snapshot is deleted, the current snapshot's deadlist is merged with the next snapshot's deadlist.
+All blocks on the deadlist that were born after the previous snapshot are freed.
+.PS
+.ps 6
+.vs 4
+down
+H: [
+	P:[
+		move right 0
+		line <-
+		box invis "prev" wid 0.35
+	]
+	D: [
+		move right 0.5
+		line <-
+	D:	box invis "del" wid 0.35
+	] with .w at P.w - (0, P.ht)
+	N: [
+		move right 1
+		line <-
+	N:	box invis "next" wid 0.35
+	] with .w at D.w - (0, D.ht)
+S:	spline -> from D.D.e right 0.2 then to N.N.n
+	"merge" at S.nw + (0.1, 0.1)
+]
+S:[
+	right
+	line with .nw at H.sw + (0, 0.2)
+P:	[circle fill wid 0.1]
+	line
+D:	[circle below wid 0.1]
+	line
+N:	[circle fill wid 0.1]
+	"prev" at P.s + (0, - 0.1)
+	"del" at D.s + (0, -0.1)
+	"next" at N.s + (0, -0.1)
+]
+.PE
+.PP
+There's one further optimization we can do on top of this to make deletions extremely fast.
+The deadlists may be sharded by birth generation.
+When a snapshot is deleted, all deadlists within the snapshot are appended to the descendant
+snapshot, and any deadlists with a birth time after the deleted snapshot in the descendant
+may be reclaimed.
+With this approach, the only lists that need to be scanned are the ones consisting wholly of blocks that must be freed.
+.PP
+All of this assumes that there is a single, linear history of snapshots.
+However, GEFS allows users to take mutable snapshots off of any label, which breaks the assumption.
+If the assumption is broken, two different mutable labels may kill the same block,
+which would lead to double frees.
+GEFS handles this by adding the concept of a
+.I base
+to each snapshot.
+This base id is the first snapshot in a snapshot timeline.
+Any blocks born before the base are not considered owned by the snapshot,
+and no record of their demise will be made in that snapshot.
+The cleanup is left to the snapshot that was used as the base.
+.PS
+.ps 6
+.vs 4
+down
+H: [
+	P:[
+		move right 0
+	L0:	line <-
+	T:	box invis "b0" wid 0.35
+	L1:	line <- with .w at L0.w - (0, 0.15)
+		box invis "b1" wid 0.35
+	L2:	line <- with .w at L1.w - (0, 0.15)
+		box invis "b2" wid 0.35
+	]
+	box invis "prev (gen = 2)" with .w at P.e
+	D: [
+		move right 0.5
+	L0:	line <-
+		box invis "b0" wid 0.35
+	L1:	line <- at L0.w - (0, 0.15)
+	T:	box invis "b1" wid 0.35
+	L1:	line <- with .w at L1.w - (0, 0.15)
+		box invis "b2" wid 0.35
+	] with .w at P.w - (0, P.ht) fill
+	box invis "del (gen = 7)" with .w at D.e + (0.5, 0)
+	N: [
+		move right 1
+	L0:	line <-
+	T:	box invis "b0" wid 0.35
+	L1:	line <- with .w at L0.w - (0, 0.15)
+		box invis "b1" wid 0.35
+	L2:	line <- with .w at L1.w - (0, 0.15)
+		box invis "b7" wid 0.35
+		"(free)"
+	] with .w at D.w - (0, D.ht)
+	box invis "next (gen = 9)" with .w at N.e
+S:	spline -> from D.T.e right 0.2 then to N.T.n
+	"merge" at S.sw + (0.15, 0.15)
+	
+]
+S:[
+	right
+	line with .nw at H.sw + (0, 0.2)
+P:	[circle fill wid 0.1]
+	line
+D:	[circle below wid 0.1]
+	line
+N:	[circle fill wid 0.1]
+	"prev" at P.s + (0, - 0.1)
+	"del" at D.s + (0, -0.1)
+	"next" at N.s + (0, -0.1)
+]
+.PE
+.PP
+The disadvantage of this approach is that appending to the deadlists may need more random writes.
+This is because, in the worst case, blocks deleted may be scattered across a large number of generations.
+It seems likely that in practice, most bulk deletions will touch files that were written in a small number of generations,
+and not scattered across the whole history of the disk.
+.PP
+The information about the snapshots, deadlists, and labels are stored in a separate
+snapshot tree. The snapshot tree, of course, can never be snapshotted itself.
+However, it's also a copy on write Bε tree where blocks are reclaimed immediately.
+It's kept consistent by syncing both the root of the snapshot tree and the freelists at the same time.
+If any blocks in the snapshot tree are freed, this freeing is only reflected after the snapshot tree is synced to disk fully.
+.PP
+The key-value pairs in the snapshot tree are stored as follows
+.LP
+.CW "Ksnap(id) → (tree)"
+.IP
+Snapshot keys take a unique numeric snapshot id.
+The value contains the tree root.
+This includes the block pointer for the tree, the snapshot generation of the tree, the previous snapshot of the tree,
+its reference count, and its height.
+.LP
+.CW "Klabel(name) → (snapid)"
+.IP
+Label keys contain a human-readable string identifying a snapshot.
+The value is a snapshot id.
+Labels regularly move between snapshots.
+When mounting a mutable snapshot, the label is updated to point at the latest snapshot every time the tree is synced to disk.
+.LP
+.CW "Kslink(snap, next) → ()"
+.IP
+A snap link key contains a snapshot id, and the id of one of its successors.
+Ideally, the successor would be a value, but our Bε tree requires unique keys, so we hack around it by putting both values
+into the key.
+When we have exactly one next link, and no labels that point at this snapshot, we merge with our successor.
+.LP
+.CW "Kdead(snap, gen) → (headptr, tailptr)"
+.IP
+A dead key contains a pair of snapshot id and deadlist generation.
+The value contains a head and tail pointer for a deadlist.
+These are used to quickly look up and merge deadlists, as described earlier in this paper.
+.NH 1
+Block Allocation
+.PP
+In GEFS, blocks are allocated from arenas.
+Within an arena, allocations are stored in a linked list of blocks, which is read at file system initialization.
+The blocks contain a journal of free or allocate operations, which free or allocate regions of disk.
+When the file system starts, it replays this log of allocations and frees, storing the available regions of blocks in an in-memory AVL tree.
+As the file system runs, it appends to the free space log, and occasionally compresses this log,
+collapsing adjacent free or used blocks into larger regions.
+.PP
+Because of the copy on write structure, it's fairly common for metadata blocks to get allocated and deallocated rapidly.
+Drives (even solid state drives) care a lot about sequential access, so it's beneficial to make a best effort attempt at keeping
+data sequential.
+As a result, GEFS selects the arena to allocate from via round robin, offsetting by the type of block.
+If the round robin counter is 10, and we have 7 arenas, then data blocks (type 0) are allocated from arena 3 ((10+0)%7),
+pivot blocks (type 1) are allocated from arena 4 ((10+1)%7), and leaf blocks (type 2) are allocated from arena 5 ((10+2)%7).
+The round robin counter is incremented after every few thousand block writes, in order to balance writes across arenas.
+Since all arenas are the same, if an arena is full, we simply advance to the next arena.
+.NH 1
+Process Structure
+.PP
+GEFS is implemented in a multiprocess manner.
+There are six types of proc that GEFS uses for its operation:
+The
+.I console ,
+.I dispatch ,
+.I mutator ,
+.I sweeper ,
+.I reader ,
+and
+.I syncer .
+Most of these processes can be replicated,
+however, there may only be one
+.IR mutator ,
+.IR sweeper ,
+or
+.I console
+at a time.
+Protocol parsing is handled by one of several dispatch procs.
+There is one of these per posted service or listener.
+Each dispatches 9p messages to the appropriate worker, depending on the 9p message type.
+Read-only messages get dispatched to one of multiple reader procs.
+Write messages get dispatched to the mutator proc, which modifies the in-memory representation of the file system.
+The mutator proc generates dirty blocks purely in memory, and sends them to the syncer procs.
+The job of the syncer proc is simply to write blocks back to disk asynchronously.
+There are also some tasks that may take a long time, and can be done in the background.
+These are sent to the sweeper proc.
+Because the tree is a shared data structure, the sweeper and mutator do not work in parallel.
+Instead, they must hold the mutator lock to accomplish anything.
+Finally, the task proc schedules periodic maintenance operations.
+These include syncing the file system and taking automatic snapshots.
+.PP
+The work of the sweeper could be done by the mutator,
+and in early versions of the file system, it was.
+However, some operations such as removing very large files
+can involve a lot of messages being inserted into the tree,
+which may block other writers.
+As a result, the long running operations are better deferred to a
+background process, which splits them into small chunks, allowing
+the mutator to make progress between them.
+.PP
+Data flow through these processes is unidirectional,
+and any block that has made it out of the mutating processes is immutable.
+This makes it reasonably easy to reason about consistency.
+.PS
+.ps 6
+.vs 4
+R: [
+	down
+C:	box "cons"	wid 0.7
+	move 0.5
+T:	box "task"	wid 0.7
+	move 0.5
+P0:	box "srv"	wid 0.7
+]
+move 0.5
+S: [
+	down
+S0:	box "sweeper"	wid 0.7
+	move 0.5
+M0:	box "mutator"	wid 0.7
+	move 0.5
+R0:	box "reader0"	wid 0.7
+	move 0.5
+R1:	box "reader1"	wid 0.7
+]
+move 0.5
+F: [
+	down
+S0:	box "syncer0"	wid 0.7
+	move 0.5
+S1:	box "syncer1"	wid 0.7
+	move 0.5
+S2:	box "syncer2"	wid 0.7
+]
+arrow from R.C.e to S.M0.w
+arrow from R.T.e to S.M0.w
+arrow from R.P0.e to S.M0.w
+arrow from R.P0.e to S.R0.w
+arrow from R.P0.e to S.R1.w
+arrow from S.M0.e to F.S0.w
+arrow from S.M0.e to F.S1.w
+arrow from S.M0.e to F.S2.w
+arrow from S.S0.e to F.S0.w
+arrow from S.S0.e to F.S1.w
+arrow from S.S0.e to F.S2.w
+arrow from S.M0.n to S.S0.s
+.PE
+.PP
+Because the file system is copy on write,
+as long as the blocks aren't reclaimed while a reader is accessing the tree, writes need not block reads.
+However, if a block is freed within the same snapshot,
+a naive implementation would allow the reader to observe a corrupt block.
+As a result, some additional cleverness is needed:
+block reclamation needs to be deferred until all readers are done reading a block.
+The algorithm selected for this is epoch based reclamation.
+.PP
+When a proc starts to operate on the tree, it enters an epoch.
+This is done by atomically taking the current global epoch,
+and setting the proc's local epoch to that,
+with an additional bit set to indicate that the proc is active:
+.P1
+	epoch[pid] = atomic_load(globalepoch) | Active
+.P2
+As the mutator frees blocks, instead of immediately making them reusable,
+it puts the blocks on the limbo list for its current generation:
+.P1
+	limbo[gen] = append(limbo[gen], b)
+.P2
+When the proc finishes operating on the tree, it leaves the epoch by clearing the active bit.
+When the mutator leaves the current epoch, it also attempts to advance the global epoch.
+This is done by looping over all worker epochs, and checking if any of them are active in an old epoch.
+If the old epoch is empty, then it's safe to advance the current epoch and clear the old epoch's limbo list.
+.P1
+ge = atomic_load(globalepoch);
+for(w in workers){
+	e = atomic_load(epoch[w]);
+	if((e & Active) && e != (ge | Active))
+		return;
+}
+globalepoch = globalepoch+1
+freeblks(limbo[globalepoch - 2])
+.P2
+.PP
+If the old epoch is not empty, then the blocks are not freed, and the cleanup is deferred.
+If a reader stalls out for a very long time, this can lead to a large accumulation of garbage,
+and as a result, GEFS starts to apply backpressure to the writers if the limbo list begins
+to get too large.
+.PP
+This epoch based approach allows GEFS to avoid contention between writes and reads.
+A writer may freely mutate the tree as multiple readers traverse it, with no locking between the processes,
+beyond what is required for the 9p implementation.
+There is still contention on the FID table, the block cache,
+and a number of other in-memory data structures.
+.NH 1
+Appendix A: Data Formats
+.PP
+The formats used for GEFS on-disk storage are described below.
+There are several data structures that are described:
+Superblocks, arena headers, tree nodes, and tree values.
+.PP
+All blocks except raw data blocks begin with a 2 byte header.
+The superblock header is chosen such that it coincides with
+the ascii representation of 'ge'.
+.PP
+All numbers in GEFS are big-endian integers, byte packed.
+.PP
+The headers are listed below:
+.TS
+allbox center;
+c c
+c l.
+Value	Description
+0	Unused
+1	Pivot node
+2	Leaf node
+3	Allocation log
+4	Deadlist log
+5	Arena Header
+0x6765	Superblock header
+.TE
+.NH 2
+Superblocks
+.PP
+Superblocks are the root of the file system,
+containing all information needed to load it.
+There is one superblock at offset 0,
+and one superblock at the last block of the file system.
+These two superblocks are duplicates,
+and only one intact superblock is needed to successfully load GEFS.
+Because the superblock fits into a single block,
+all the arenas must also fit into it.
+This imposes an upper bound on the arena count.
+With 16k blocks, this natural limit is approximately 1000 arenas.
+Gefs imposes a smaller limit internally, limiting to 256 arenas by default.
+.IP
+.I header[8]
+= "gefs9.00"
+.br
+.I blksz[4] ": the block size for this file system"
+.br
+.I bufspc[4] ": the buffer space for this file system"
+.br
+.I snap.ht[4] ": the height of the snapshot tree"
+.br
+.I snap.addr[8] ": the root block of the snapshot tree"
+.br
+.I snap.hash[8] ": the hash of the snapshot tree root"
+.br
+.I snapdl.hd.addr ": the address of the snap deadlist head"
+.br
+.I snapdl.hd.hash ": the hash of the snap deadlist head"
+.br
+.I snapdl.tl.addr ": the address of the snap deadlist tail"
+.br
+.I snapdl.tl.hash  ": the hash of the snap deadlist tail"
+.br
+.I narena[4] ": the number of arenas"
+.br
+.I flags[8] ": flags for future expansion"
+.br
+.I nextqid[8] ": the next qid that will be allocated"
+.br
+.I nextgen[8] ": the next generation number that will be written"
+.br
+.I qgen[8] ": the last queue generation synced to disk"
+.br
+.I "arena0.addr[8], arena0.hash[8]" ": the location of the 0th arena"
+.br
+.I "arena1.addr[8], arena1.hash[8]" ": the location of the 1st arena"
+.br
+.I ...
+.br
+.I "arenaN.addr[8], arenaN.hash[8]" ": the location of the N'th arena"
+.br
+.I sbhash[8] ": hash of superblock contents up to the last arena"
+.NH 2
+Arenas
+.PP
+An arena header contains the freelist, the arena size,
+and (for debugging) the amount of space used within the arena.
+.IP
+.I type[2]
+= Tarena
+.br
+.I free.addr[8] ": the address of the start of the freelist"
+.br
+.I free.hash[8] ": the hash of the start of the freelist"
+.br
+.I size[8] ": the size of the arena"
+.br
+.I used[8] ": the amount of used space in the arena"
+.NH 2
+Logs
+.PP
+Logs are used to track allocations. They are the only structure that is
+mutated in place, and therefore is not fully merkelized.
+There are two types of log in gefs: Allocation logs and deadlists.
+They share a common structure, but contain slightly different data.
+.PP
+All logs share a common header:
+.IP
+.I type[2]
+= Tlog or Tdlist
+.br
+.I logsz[2] ": the amount of log space used"
+.br
+.I loghash[8] ": the hash of all data after the log header"
+.br
+.I chainp[24] ": the block pointer this log block chains to"
+.NH 3
+Allocation Logs
+.PP
+When the type of a log block is Tlog,
+the contents of the block are formatted as an allocation log.
+In an allocation log, each entry is either a single u64int,
+recording an allocation or free of a single block,
+or a pair of u64ints, representing an operation on a range of blocks.
+.PP
+The operations are listed below:
+.LP
+.TS
+allbox center;
+c c
+c l.
+Value	Description
+1	Allocate 1 block
+2	Free 1 block
+3	Sync barrier
+4	Alloc block range
+5	Free block range
+.TE
+Operations are packed with the operation in the low order byte.
+The rest of the value is packed in the upper bits.
+For multi-block operations, the range length is packed in the second byte.
+.IP
+.P1
+PACK64(logent, addr|op);
+if(op == 4 || op == 5)
+	PACK64(logent+8, len);
+.P2
+.NH 3
+Deadlist Logs
+.PP
+Deadlist logs are simpler than allocation logs.
+They only contain a flat list of blocks that have been killed.
+.NH 2
+The Tree
+.PP
+The tree is composed of two types of block:
+Pivot blocks, and leaf blocks.
+The block types are described below.
+.NH 3
+Pivot Blocks
+.PP
+Pivot blocks contain the inner nodes of the tree.
+They have the following header. The layout is as
+described earlier in the paper.
+.IP
+.I type[2] " = Tpivot"
+.br
+.I nval[2] ": the count of values"
+.br
+.I valsz[2] ": the number of bytes of value data"
+.br
+.I nbuf[2] ": the count of buffered messages"
+.br
+.I bufsz[2] ": the number of bytes of buffered messages"
+.PP
+.NH 3
+Pivot leaves
+.PP
+Within the block, the first half of the space after
+the header contains a key/pointer set. The head of
+the space contains an array of 2-byte offsets to keys,
+and the tail of the space contains a packed set of keys
+and block pointers.
+.PP
+The offset table is simple:
+.IP
+.I off[2*nval] ": the offset table"
+.PP
+The keys/pointers are slightly more complicated.
+They contain a length prefixed key, and a pointer
+to the child block for that key.
+.IP
+.I nkey[2] ": the length of the key"
+.br
+.I key[nkey] ": the key data"
+.br
+.I addr ": the address of the pointed to block"
+.br
+.I hash ": the hash of the pointed to block"
+.br
+.I gen ": the generation number of the pointed to block"
+.PP
+The second half of the space consists of messages
+directed to a value in the leaf. This is formatted
+similarly to the key/pointer set, but instead of
+offsets to key/pointer pairs, the offsets point
+to messages.
+.PP
+The array of offsets grows towards the end of the block,
+and the array of values or messages grows towards the start of the block.
+.PP
+The offset table is the same, however, instead of
+having
+.I nval
+entries, it has
+.I nbuf
+entries.
+.IP
+.I off[2*nbuf]
+.PP
+The messages contain a single byte opcode,
+a key, and a message that contains an incremental
+update to the value.
+.IP
+.I op[1] ": the message operation"
+.br
+.I nkey[2] ": the length of the target key"
+.br
+.I key[nkey] ": the contents of the target key"
+.br
+.I nval[2] ": the length of the message"
+.br
+.I val[nval] ": the contents of the message"
+.NH 3
+Leaf Blocks
+.PP
+Leaf blocks contain the leaf nodes of the tree.
+They have the following header. The layout is as
+described earlier in the paper.
+.IP
+.I type[2] " = Tleaf"
+.br
+.I nval[2] ": the number of key value pairs"
+.br
+.I valsz[2] ": the size of the key value pairs"
+.PP
+Within a leaf, the layout is very similar to a pivot.
+There is a table of key-value offsets,
+and an array of packed messages.
+As before,
+the array of offsets grows towards the end of the block,
+and the array of values grows towards the start of the block.
+.IP
+.I off[2*nval] ": the offset table"
+.PP
+Each key value pair is encoded as below:
+.IP
+.I nkey[2] ": the length of the key"
+.br
+.I key[nkey] ": the contents of the key"
+.br
+.I nval[2] ": the length of the value"
+.br
+.I val[nval] ": the contents of the value"
+.NH 2
+Keys and Values.
+.PP
+In GEFS, keys begin with a single type byte,
+and are followed by a set of data in a known format.
+Here are the types of known keys:
+.PP
+.I "Kdat qid[8] off[8]"
+describes pointer to a data block.
+The value for this data key must be a block pointer.
+Block pointers are encoded as
+.I "addr[8] hash[8] gen[8]" .
+This entry is only valid in file system trees.
+.PP
+.I  "Kent pqid[8] name[n]"
+describes a pointer to a file entry (stat structure).
+The value must be the body of a dir structure.
+This entry is only valid in file system trees.
+The dir structure is structured as below:
+.IP
+.I flag[8] ": flags for future expansion"
+.br
+.I qid.path[8] ": the qid path"
+.br
+.I qid.vers[4] ": the qid version"
+.br
+.I qid.type[1] ": the qid type"
+.br
+.I mode[4] ": the permission bits"
+.br
+.I atime[8] ": the access time"
+.br
+.I mtime[8] ": the modification time"
+.br
+.I length[8] ": the file size"
+.br
+.I uid[4] ": the owning user id"
+.br
+.I gid[4] ": the owning group id"
+.br
+.I muid[4] ": the last user that modified the file"
+.PP
+.I  "Kup qid[8]"
+describes a pointer to a parent directory.
+The value is the
+.I Kent
+formatted key.
+This key is the entry of the containing directory.
+It's only present for directories.
+This entry is only valid in file system trees.
+.PP
+.I "Klabel name[]"
+describes a label for a snapshot.
+The value is a
+.I snapid[8] ,
+referring to a snapid indexed by Ksnap.
+This key is only valid in snapshot trees.
+.PP
+.I "Ksnap snapid[8]"
+describes a key referring to a snapshot tree.
+The value is a tree entry.
+The tree is formatted as:
+.IP
+.br
+.I nref[4] ": the number of references from other trees"
+.br
+.I nlbl[4] ": the number of references from labels"
+.br
+.I ht[4] ": the height of the tree"
+.br
+.I flag[4] ": flags for future expansion"
+.br
+.I gen[8] ": the tree generation number"
+.br
+.I pred[8] ": the predecessor snapshot"
+.br
+.I succ[8] ": the successor snapshot"
+.br
+.I base[8] ": the base snapshot"
+.br
+.I bp.addr[8] ": the address of the root block"
+.br
+.I bp.hash[8] ": the hash of the root block"
+.br
+.I bp.gen[8] ": the generation of the root block"
+.PP
+.I "Kdlist snap[8] gen[8]"
+describes a key referring to a deadlist.
+The
+.I snap
+field refers to the snapshot that the deadlist belongs to,
+and the
+.I gen
+field refers to the birth generation of the blocks on the deadlist.
+The value of the deadlist entry is a pair of block pointers,
+pointing to the head and tail of the block list.
+.NH 2
+Messages
+.PP
+.I Oinsert
+and
+.I Odelete
+can have any key/value pair as an operand.
+They replace or remove a key/value pair respectively.
+.PP
+.I Oclearb
+inserts a deferred free of a block,
+without reading it first.
+It has no value, but the key must be a
+.I Kdat
+key.
+.PP
+.I Oclobber
+is similar to
+.I Oclearb ,
+but its operand must be a
+.I Kent
+key.
+.I Owstat
+updates an existing file entry.
+The key of an
+.I Owstat
+message must be a
+.I Kent ,
+and the value is a bit field of fields to update,
+along with the new values.
+The first byte is a set of wstat flags, and the
+remaining data is the packed value associated with each flag.
+It can contain the following updates:
+.IP
+.I "Owsize fsize[8]" ": update file size"
+.br
+.I "Owmode mode[4]" ": update file mode"
+.br
+.I "Owmtime mtime[8]" ": update mtime, in nsec"
+.br
+.I "Owatime atime[8]" ": update atime, in nsec"
+.br
+.I "Owuid uid[4]" ": set uid"
+.br
+.I "Owgid uid[4]" ": set gid"
+.br
+.I "Omuid uid[4]" ": set muid"
+.PP
+.I Orelink
+and
+.I Oreprev
+rechain snapshots.
+The key of either of these messages is a
+.I Ksnap ,
+and the operand is the ID of a new
+predecessor or successor snap.
+.NH 1
+References
+.LP
+[1] Michael A. Bender, Martin Farach-Colton, William Jannen, Rob Johnson,
+Bradley C. Kuszmaul, Donald E. Porter, Jun Yuan, and Yang Zhan,
+.LP
+``An Introduction to Bε Trees and Write-Optimization,''
+.I ";login:" ,
+October 2015, Vol. 40, No. 5,
+.LP
+[2] William Jannen, Jun Yuan, Yang Zhan, Amogh Akshintala, John Esmet, Yizheng Jiao,
+Ankur Mittal, Prashant Pandey, Phaneendra Reddy, Leif Walsh, Michael Bender,
+Martin Farach-Colton, Rob Johnson, Bradley C. Kuszmaul, and Donald E. Porter,
+``BetrFS: A Right-Optimized Write-Optimized File System,''
+.I "Proceedings of the 13th USENIX Conference on File and Storage Technologies,"
+2015
+.LP
+[3] Matthew Dillon, "The HAMMER Filesystem,"
+June 2008.
+.LP
+[4] Ohad Rodeh, Josef Bacik, Chris Mason, "BTRFS: The Linux B-Tree Filesystem"
+.I "ACM Transactions on Storage, Volume 9, Issue 3, Article No 9, pp 1-32,"
+August 2013 
+.LP
+[5] Ohad Rodeh, "B-trees, Shadowing, and Clones",
+.LP
+.I H-0245 (H0611-006)
+November 12, 2006
+.LP
+[6] Matt Ahrens, `` How ZFS Snapshots Really Work,''
+.I BSDCan,
+2019
+.LP
+[7] Gregory R. Ganger, Marshall Kirk McKusick, Craig A. N. Soules,
+and Yale N. Patt.
+``Soft Updates: A Solution to the Metadata Update Problem
+in File Systems,''
+.I "ACM Transactions on Computer Systems" ,
+Vol 18., No. 2, May 2000, pp. 127\-153.
+.LP
+[8] Valerie Aurora,
+``Soft updates, hard problems''
+.I "Linux Weekly News",
+July 1, 2009,
+https://lwn.net/Articles/339337/
+.LP
+[9] kvik,
+.I "Clone",
+https://shithub.us/kvik/clone/HEAD/info.html
--- /dev/null
+++ b/sys/man/4/gefs
@@ -1,0 +1,161 @@
+.TH GEFS 4
+.SH NAME
+gefs \- file server
+.SH SYNOPSIS
+.B gefs
+[
+.B -A
+]
+[
+.B -r
+.I user
+]
+[
+.B -f
+.I file
+]
+[
+.B -m
+.I mem
+]
+[
+.B -n
+.I name
+]
+[
+.B -a
+.I ann
+] ...
+[
+.B -S
+]
+[
+.B -s
+]
+.SH DESCRIPTION
+.PP
+.I Gefs
+is an experimental file server.
+It attempts to be crash safe, snapshotting, and corruption-detecting,
+without giving up too much performance.
+.PP
+Gefs allows multiple snapshots to be mounted and maintained concurrently.
+These snapshots all share the same storage pool, but can be written to,
+snapshotted, and rolled back independently.
+.PP
+The snapshot to mount is selected by using the attach specifier when
+mounting. If the attach specifier begins with a
+.I %
+sigil, then the snapshot is mounted in permissive mode.
+In permissive mode, permissions are not checked, and
+.IR wstat (5)
+may change any attributes of any file including the owner.
+Unless the file system is started with the permissive flag,
+only users in the
+.I adm
+group may mount snapshots permissively.
+.PP
+Gefs accepts the following options:
+.TP
+.B -A
+Disable auth. Permissions are still checked, but anyone will be able
+to attach as any user.
+.TP
+.BI "-a " ann
+Announce and listen on the specified network address.
+.TP
+.BI "-f " file
+Use
+.I file
+as the disk.
+.TP
+.B -g
+Grow the file system to fill the current partition.
+.TP
+.BI "-m " mem
+Specify the amount of memory to use as cache.
+The
+.I mem
+parameter recognizes
+.IR M ,
+.IR G ,
+and
+.I %
+as suffixes.
+If left unspecified, it defaults to 25% of installed RAM.
+.TP
+.BI "-n " name
+Use
+.I name
+as the name of the service.
+If unspecified, the default service name is
+.IR gefs .
+.TP
+.BI "-r " user
+Ream the file system, erasing all of the old data.
+Create a user named
+.I user
+in the
+.I adm
+group.
+After reaming,
+.I gefs
+will exit.
+.TP
+.B -S
+Allow permissive mounts for all users.
+Additionally, if the user file is unreadable, fall back to the default user table.
+Without god, all things are permitted.
+.TP
+.B -s
+Read and write protocol messages on standard file descriptors zero and one.
+.TP
+.B -t
+Set the size of the trace buffer in megabytes.
+If set to 0, no debug traces are recorded.
+By default, 16 megabytes of trace buffer are kept.
+.SH EXAMPLES
+.PP
+Mount snapshots
+.I gefs
+from the partition
+.I /dev/sdE0/fs
+onto a few different mountpoints.
+The
+.I main
+snapshot is mounted to
+.IR /n/gefs .
+The
+.I sys
+snapshot is mounted to
+.IR /n/gefs/sys .
+And finally, the
+.I adm
+snapshot is mounted in permissive mode to
+.IR  /n/adm .
+.IP
+.EX
+gefs -f /dev/sdE0/fs
+mount /srv/gefs /n/gefs
+mount /srv/gefs /n/gefs/sys sys
+mount /srv/gefs /n/adm %adm
+.EE
+.PP
+Initialize a new file system on a device.
+Note, this assumes the disk has already been prepared with
+.IR prep (8),
+and a
+.I fs
+partition has been created.
+.IP
+.EX
+gefs -r $user -f /dev/sdE0/fs
+.EE
+.SH SEE ALSO
+.IR cwfs (4),
+.IR hjfs (4),
+.IR gefs (8),
+.IR prep (8),
+.IR sd (3)
+.SH BUGS
+Yes
--- /dev/null
+++ b/sys/man/8/gefs
@@ -1,0 +1,199 @@
+.TH GEFS 8
+.SH NAME
+gefs \- file server maintenance
+.SH SYNOPSIS
+.PD 0
+.PP
+.B check
+.PP
+.B df
+.PP
+.B halt
+.PP
+.B help
+.PP
+.B permit
+[
+.B on
+|
+.BR off
+]
+.PP
+.B save trace
+.I filename
+.PP
+.B snap
+[
+-Smdl
+]
+[
+.I old 
+[
+.I new
+]
+]
+.PP
+.B sync
+.PP
+.B users
+.SH DESCRIPTION
+.IR Gefs (4)
+provides an administration console on
+.IR /srv/gefs.cmd .
+By default, this console is only readable
+and writable by the owner of the file system.
+.SH CONSOLE
+.PP
+The console handles the following commands:
+.PP
+.I Check
+applies basic consistency checks to the file system,
+reporting invalid blocks, broken metadata, and other
+similar structural issues.
+.PP
+.I Df
+prints the amount of used space and total space in megabytes,
+as well as the percentage of space occupied.
+.PP
+.I Halt
+syncs all IO to disk and exits the file system.
+While the syncing occurs, the file system does not
+allow new writes.
+.PP
+.I Help
+prints a summary of the available commands.
+This table includes additional debug commands that are
+subject to change, and are intentionally undocumented.
+.PP
+.I Permit
+[
+.B on
+|
+.B off
+]
+has two effects.
+First, if the user table is broken, it allows a fallback to a default user list.
+This allows the system administrator to recover if they reboot with a broken user file.
+Second, it allows mounts to occur in permissive mode by any user.
+Permissive mounts are designated by prefixing the attach spec with a
+.I %
+sigil.
+Permissive disables permissions checks when accessing files, and allows
+.IR wstat (5)
+to modify the owner of the file.
+This may be useful during file system initialization.
+.PP
+.B Snap
+manages snapshots.
+It can be invoked as
+.I snap
+.BR -l ,
+.I snap
+.B -d
+.IR snap ,
+or
+.I snap
+[
+.B -flags
+]
+.IR "old new" ,
+which will list, delete, or create new snapshots respectively.
+It accepts the following options:
+.TP
+.B -l
+Lists snapshots and their attributes.
+.TP
+.BI "-d " snap
+Deletes a snapshot, reclaiming whatever space
+is not shared with other snapshots.
+.TP
+.B -m
+Flags that the newly created snapshot should be mutable.
+.TP
+.B -S
+Disables automatic snapshots on the snapshot created from
+.I old
+and named
+.IR new .
+.PP
+.I Sync
+writes dirty blocks in memory to the disk.
+.PP
+.B Users
+attempts to reload the user table from
+.IR /adm/users .
+.PP
+.I save trace
+saves a trace of recent operations to a file.
+If a file is not specified, it prints to the console.
+.SH ADM FILES
+.PP
+Gefs supports independent snapshots in the same file system.
+As a result, global configuration needs to be separated from snapshots.
+The global configuration resides in a well known snapshot called
+.IR adm .
+.PP
+The adm snapshot would conventionally be mounted in
+.IR /adm .
+It contains the
+.IR users (6)
+file.
+.PP
+The
+.I users
+file is read at file system startup, or when the
+.I users
+command is run on the console.
+If the users file is malformed at file system start, then the file system will refuse to initialize.
+.I Permissive
+mode will allow the file system to fall back to a default users table.
+It will also allow any user to mount the
+.I adm
+snapshot: this can help recover from disasters.
+.PP
+The
+.B default
+table looks like this:
+.IP
+.EX
+-1:adm:adm:
+0:none::
+1:$user:$user:
+.EE
+.PP
+Where
+.I $user
+is specified at the time that the file system is reamed.
+.SH EXAMPLES
+.PP
+To show current disk usage, the following may be written on the console:
+.IP
+.EX
+df
+.EE
+To create a new snapshot:
+.IP
+.EX
+snap main myimmutable
+.EE
+.PP
+To create a new mutable snapshot that does not take automatic
+checkpoints:
+.IP
+.EX
+snap -Sm main mymutable
+.EE
+.PP
+To delete a snapshot:
+.IP
+.EX
+snap -d mysnap
+.EE
+.SH BUGS
+.PP
+Currently, it's not possible to change the mutability of a snapshot.
+Instead, a new label needs to be created.
+.PP
+.SH SEE ALSO
+.IR gefs (4)
+
--- a/sys/src/9/boot/bootfs.proto
+++ b/sys/src/9/boot/bootfs.proto
@@ -20,6 +20,7 @@
 		dossrv
 		echo
 		cwfs64x
+		gefs
 		grep
 		ip
 			ipconfig
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-386.s
@@ -1,0 +1,109 @@
+#define CMPXCHG	/* (CX) */\
+	BYTE $0x0F; BYTE $0xB1; BYTE $0x11
+#define CMPXCHG64 /* (DI) */\
+	BYTE $0x0F; BYTE $0xC7; BYTE $0x0F
+#define XADDL /* BX, (AX) */ \
+	BYTE $0x0F; BYTE $0xC1; BYTE $0x03
+#define XADDLSP /* AX, (SP) */ \
+	BYTE $0x0F; BYTE $0xC1; BYTE $0x04; BYTE $0x24
+
+/*  get variants */
+TEXT ageti+0(SB),1,$0
+TEXT agetl+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+	MOVL	p+0(FP), AX
+	MOVL	0(AX), AX
+	RET
+
+TEXT agetv+0(SB),1,$0
+	MOVL	r+0(FP), AX
+	MOVL	p+4(FP), BX
+	FMOVD	(BX), F0
+	FMOVDP	F0, (AX)
+	RET
+
+/*  set variants */
+TEXT aseti+0(SB),1,$0
+TEXT asetl+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+	MOVL		p+0(FP), BX
+	MOVL		v+4(FP), AX
+	LOCK; XCHGL	(BX), AX
+	RET
+
+TEXT asetv+0(SB),1,$0
+	MOVL	p+4(FP), DI
+	MOVL	nv+8(FP), BX
+	MOVL	nv+12(FP), CX
+	MOVL	0(DI), AX
+	MOVL	4(DI), DX
+loop:
+	LOCK;	CMPXCHG64
+        JNE     loop
+	MOVL	p+0(FP),DI
+	MOVL	AX, 0(DI)
+	MOVL	DX, 4(DI)
+	RET
+
+/*  inc variants */
+TEXT ainci+0(SB),1,$0
+TEXT aincl+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+	MOVL	p+0(FP), BX
+	MOVL	v+4(FP), CX
+	MOVL	CX, AX
+	LOCK; XADDL
+	ADDL	CX, AX
+	RET
+
+TEXT aincv+0(SB),1,$0
+	MOVL	p+4(FP), DI
+retry:
+	MOVL	0(DI), AX
+	MOVL	4(DI), DX
+	MOVL 	AX, BX
+	MOVL	DX, CX
+	ADDL	v+8(FP), BX
+	ADCL	v+12(FP), CX
+	LOCK; CMPXCHG64
+	JNE	retry
+	MOVL	r+0(FP), DI
+	MOVL	BX, 0x0(DI)
+	MOVL	CX, 0x4(DI)
+	RET
+
+/*  cas variants */
+TEXT acasi+0(SB),1,$0
+TEXT acasl+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+	MOVL	p+0(FP), CX
+	MOVL	ov+4(FP), AX
+	MOVL	nv+8(FP), DX
+	LOCK; CMPXCHG
+	JNE	fail32
+	MOVL	$1,AX
+	RET
+fail32:
+	MOVL	$0,AX
+	RET
+
+TEXT acasv+0(SB),1,$0
+	MOVL	p+0(FP), DI
+	MOVL	ov+4(FP), AX
+	MOVL	ov+8(FP), DX
+	MOVL	nv+12(FP), BX
+	MOVL	nv+16(FP), CX
+	LOCK; CMPXCHG64
+	JNE	fail64
+	MOVL	$1,AX
+	RET
+fail64:
+	MOVL	$0,AX
+	RET
+
+/* barriers (do we want to distinguish types?) */
+TEXT coherence+0(SB),1,$0
+	/* this is essentially mfence but that requires sse2 */
+	XORL	AX, AX
+	LOCK; XADDLSP
+	RET
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-amd64.s
@@ -1,0 +1,59 @@
+/*  get variants */
+TEXT agetl+0(SB),1,$0
+	MOVL	(RARG), AX
+	RET
+TEXT agetv+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+	MOVQ	(RARG), AX
+	RET
+
+/*  set variants */
+TEXT asetl+0(SB),1,$0
+	MOVL		v+8(FP), AX
+	LOCK; XCHGL	(RARG), AX
+	RET
+
+TEXT asetv+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+	MOVQ		v+8(FP), AX
+	LOCK; XCHGQ	(RARG), AX
+	RET
+
+/*  inc variants */
+TEXT aincl+0(SB),1,$0
+	MOVQ		v+8(FP), BX
+	MOVQ		BX, AX
+	LOCK; XADDL	AX, (RARG)
+	ADDQ		BX, AX
+	RET
+
+TEXT aincv+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+	MOVQ		v+8(FP), BX
+	MOVQ		BX, AX
+	LOCK; XADDQ	AX, (RARG)
+	ADDQ		BX, AX
+	RET
+
+/*  cas variants */
+TEXT acasl+0(SB),1,$0
+	MOVL	c+8(FP), AX
+	MOVL	v+16(FP), BX
+	LOCK; CMPXCHGL	BX, (RARG)
+	SETEQ	AX
+	MOVBLZX	AX, AX
+	RET
+
+TEXT acasv+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+	MOVQ	c+8(FP), AX
+	MOVQ	v+16(FP), BX
+	LOCK; CMPXCHGQ BX, (RARG)
+	SETEQ	AX
+	MOVBLZX	AX, AX
+	RET
+
+/* barriers (do we want to distinguish types?) */
+TEXT coherence+0(SB),1,$0
+	MFENCE
+	RET
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-arm.c
@@ -1,0 +1,95 @@
+#include <u.h>
+#include <libc.h>
+
+#include "atomic.h"
+
+static Lock locktab[128];
+
+static u32int
+ihash(void *p)
+{
+	uintptr x = (uintptr)p;
+
+	/* constants from splitmix32 rng */
+	x = (x ^ (x >> 16)) * 0x85ebca6b;
+	x = (x ^ (x >> 13)) * 0xc2b2ae35;
+	x = (x ^ (x >> 16));
+	return x & (nelem(locktab)-1);
+}
+
+#define GET(T, n) \
+	T n(T *p)			\
+	{				\
+		uintptr h;		\
+		T r;			\
+					\
+		h = ihash(p);		\
+		lock(&locktab[h]);	\
+		r = *p;			\
+		unlock(&locktab[h]);	\
+		return r;		\
+	}
+
+#define SET(T, n) \
+	T n(T *p, T v)			\
+	{				\
+		uintptr h;		\
+		T r;			\
+					\
+		h = ihash(p);		\
+		lock(&locktab[h]);	\
+		r = *p;			\
+		*p = v;			\
+		unlock(&locktab[h]);	\
+		return r;		\
+	}
+
+#define INC(T, n) \
+	T n(T *p, T dv)			\
+	{				\
+		uintptr h;		\
+		T r;			\
+					\
+		h = ihash(p);		\
+		lock(&locktab[h]);	\
+		*p += dv;		\
+		r = *p;			\
+		unlock(&locktab[h]);	\
+		return r;		\
+	}
+
+#define CAS(T, n) \
+	int n(T *p, T ov, T nv)		\
+	{				\
+		uintptr h;		\
+		int r;			\
+					\
+		h = ihash(p);		\
+		lock(&locktab[h]);	\
+		if(*p == ov){		\
+			*p = nv;	\
+			r = 1;		\
+		}else			\
+			r = 0;		\
+		unlock(&locktab[h]);	\
+		return r;		\
+	}
+
+GET(int, ageti)
+GET(long, agetl)
+GET(vlong, agetv)
+GET(void*, agetp)
+
+SET(int, aseti)
+SET(long, asetl)
+SET(vlong, asetv)
+SET(void*, asetp)
+
+INC(int, ainci)
+INC(long, aincl)
+INC(vlong, aincv)
+
+CAS(int, acasi)
+CAS(long, acasl)
+CAS(vlong, acasv)
+CAS(void*, acasp)
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-arm64.s
@@ -1,0 +1,79 @@
+/*  get variants */
+TEXT agetl+0(SB),1,$0
+	MOVW	(R0), R0
+	RETURN
+TEXT agetv+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+	MOV	(R0), R0
+	RETURN
+
+/*  set variants */
+TEXT asetl+0(SB),1,$0
+	MOV	0x08(FP), R1
+	MOV	R0, R2
+_setl:
+	LDAXRW	(R2), R0
+	STLXRW	R1, (R2), R3
+	CBNZW	R3, _setl
+	RETURN
+TEXT asetv+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+	MOV	0x08(FP), R1
+	MOV	R0, R2
+_setp:
+	LDAXR	(R2), R0
+	STLXR	R1, (R2), R3
+	CBNZW	R3, _setp
+	RETURN
+
+/*  inc variants */
+TEXT aincl+0(SB),1,$0
+	MOV	0x08(FP), R1
+	MOV	R0, R2
+_incl:
+	LDAXRW	(R2), R0
+	ADDW	R1, R0, R3
+	STLXRW	R3, (R2), R4
+	CBNZW	R4, _incl
+	RETURN
+TEXT aincv+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+	MOV	0x08(FP), R1
+	MOV	R0, R2
+_incp:
+	LDAXR	(R2), R0
+	ADD	R1, R0, R3
+	STLXR	R3, (R2), R4
+	CBNZW	R4, _incp
+	RETURN
+
+/*  cas variants */
+TEXT acasl+0(SB),1,$0
+	MOV	0x08(FP), R1
+	MOV	0x10(FP), R2
+	LDAXRW	(R0), R3
+	CMPW	R1, R3
+	BNE	_casl
+	STLXRW	R2, (R0), R4
+	CMPW	$0, R4
+_casl:
+	CSETW	EQ, R0
+	RETURN
+TEXT acasv+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+	MOV	0x08(FP), R1
+	MOV	0x10(FP), R2
+	LDAXR	(R0), R3
+	CMP	R1, R3
+	BNE	_casp
+	STLXR	R2, (R0), R4
+	CMPW	$0, R4
+_casp:
+	CSETW	EQ, R0
+	RETURN
+
+/* barriers */
+#define ISH	(2<<2 | 3)
+TEXT coherence+0(SB),1,$0
+	DMB	$ISH
+	RETURN
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-mips.c
@@ -1,0 +1,95 @@
+#include <u.h>
+#include <libc.h>
+
+#include "atomic.h"
+
+static Lock locktab[128];
+
+static u32int
+ihash(void *p)
+{
+	uintptr x = (uintptr)p;
+
+	/* constants from splitmix32 rng */
+	x = (x ^ (x >> 16)) * 0x85ebca6b;
+	x = (x ^ (x >> 13)) * 0xc2b2ae35;
+	x = (x ^ (x >> 16));
+	return x & (nelem(locktab)-1);
+}
+
+#define GET(T, n) \
+	T n(T *p)			\
+	{				\
+		uintptr h;		\
+		T r;			\
+					\
+		h = ihash(p);		\
+		lock(&locktab[h]);	\
+		r = *p;			\
+		unlock(&locktab[h]);	\
+		return r;		\
+	}
+
+#define SET(T, n) \
+	T n(T *p, T v)			\
+	{				\
+		uintptr h;		\
+		T r;			\
+					\
+		h = ihash(p);		\
+		lock(&locktab[h]);	\
+		r = *p;			\
+		*p = v;			\
+		unlock(&locktab[h]);	\
+		return r;		\
+	}
+
+#define INC(T, n) \
+	T n(T *p, T dv)			\
+	{				\
+		uintptr h;		\
+		T r;			\
+					\
+		h = ihash(p);		\
+		lock(&locktab[h]);	\
+		*p += dv;		\
+		r = *p;			\
+		unlock(&locktab[h]);	\
+		return r;		\
+	}
+
+#define CAS(T, n) \
+	int n(T *p, T ov, T nv)		\
+	{				\
+		uintptr h;		\
+		int r;			\
+					\
+		h = ihash(p);		\
+		lock(&locktab[h]);	\
+		if(*p == ov){		\
+			*p = nv;	\
+			r = 1;		\
+		}else			\
+			r = 0;		\
+		unlock(&locktab[h]);	\
+		return r;		\
+	}
+
+GET(int, ageti)
+GET(long, agetl)
+GET(vlong, agetv)
+GET(void*, agetp)
+
+SET(int, aseti)
+SET(long, asetl)
+SET(vlong, asetv)
+SET(void*, asetp)
+
+INC(int, ainci)
+INC(long, aincl)
+INC(vlong, aincv)
+
+CAS(int, acasi)
+CAS(long, acasl)
+CAS(vlong, acasv)
+CAS(void*, acasp)
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-power64.s
@@ -1,0 +1,106 @@
+/*  get variants */
+TEXT agetl+0(SB),1,$0
+	SYNC
+	// See ISA 3.0B section B.2.3, "Safe Fetch"
+	MOVWZ	0(R3), R3
+	CMPW	R3, R3, CR7
+	BC	4, 30, 1(PC) // bne- cr7,0x4
+	ISYNC
+	MOVW	R3, ret+8(FP)
+	RETURN
+
+TEXT agetv+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+	SYNC
+	// See ISA 3.0B section B.2.3, "Safe Fetch"
+	MOVD	0(R3), R3
+	CMP	R3, R3, CR7
+	BC	4, 30, 1(PC) // bne- cr7,0x4
+	ISYNC
+	MOVD	R3, ret+8(FP)
+	RETURN
+
+/*  set variants */
+TEXT asetl+0(SB),1,$0
+	MOVW	val+8(FP), R4
+	SYNC
+	MOVW	R4, 0(R3)
+	RETURN
+
+TEXT asetv+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+	MOVD	val+8(FP), R4
+	SYNC
+	MOVD	R4, 0(R3)
+	RETURN
+
+/*  inc variants */
+TEXT aincl+0(SB),1,$0
+	MOVD	R3, R4
+	MOVW	delta+8(FP), R5
+	LWSYNC
+	LWAR	(R4), R3
+	ADD	R5, R3
+	STWCCC	R3, (R4)
+	BNE	-3(PC)
+	MOVW	R3, ret+16(FP)
+	RETURN
+
+TEXT aincv+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+	MOVD	delta+8(FP), R5
+	LWSYNC
+	LDAR	(R3), R4
+	ADD	R5, R4
+	STDCCC	R4, (R3)
+	BNE	-3(PC)
+	MOVD	R4, ret+16(FP)
+	RETURN
+
+/*  cas variants */
+TEXT acasl+0(SB),1,$0
+	MOVWZ	old+8(FP), R4
+	MOVWZ	new+12(FP), R5
+	LWSYNC
+casagain:
+	LWAR	(R3), R6
+	CMPW	R6, R4
+	BNE	casfail
+	STWCCC	R5, (R3)
+	BNE	casagain
+	MOVD	$1, R3
+	LWSYNC
+	MOVB	R3, ret+16(FP)
+	RETURN
+casfail:
+	LWSYNC
+	MOVB	R0, ret+16(FP)
+	RETURN
+
+TEXT acasv+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+	MOVD	old+8(FP), R4
+	MOVD	new+16(FP), R5
+	LWSYNC
+cas64again:
+	LDAR	(R3), R6
+	CMP	R6, R4
+	BNE	cas64fail
+	STDCCC	R5, (R3)
+	BNE	cas64again
+	MOVD	$1, R3
+	LWSYNC
+	MOVB	R3, ret+24(FP)
+	RETURN
+cas64fail:
+	LWSYNC
+	MOVB	R0, ret+24(FP)
+	RETURN
+
+/* barriers */
+TEXT coherence+0(SB),1,$0
+	// LWSYNC is the "export" barrier recommended by Power ISA
+	// v2.07 book II, appendix B.2.2.2.
+	// LWSYNC is a load/load, load/store, and store/store barrier.
+	LWSYNC
+	RETURN
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-spim.c
@@ -1,0 +1,95 @@
+#include <u.h>
+#include <libc.h>
+
+#include "atomic.h"
+
+static Lock locktab[128];
+
+static u32int
+ihash(void *p)
+{
+	uintptr x = (uintptr)p;
+
+	/* constants from splitmix32 rng */
+	x = (x ^ (x >> 16)) * 0x85ebca6b;
+	x = (x ^ (x >> 13)) * 0xc2b2ae35;
+	x = (x ^ (x >> 16));
+	return x & (nelem(locktab)-1);
+}
+
+#define GET(T, n) \
+	T n(T *p)			\
+	{				\
+		uintptr h;		\
+		T r;			\
+					\
+		h = ihash(p);		\
+		lock(&locktab[h]);	\
+		r = *p;			\
+		unlock(&locktab[h]);	\
+		return r;		\
+	}
+
+#define SET(T, n) \
+	T n(T *p, T v)			\
+	{				\
+		uintptr h;		\
+		T r;			\
+					\
+		h = ihash(p);		\
+		lock(&locktab[h]);	\
+		r = *p;			\
+		*p = v;			\
+		unlock(&locktab[h]);	\
+		return r;		\
+	}
+
+#define INC(T, n) \
+	T n(T *p, T dv)			\
+	{				\
+		uintptr h;		\
+		T r;			\
+					\
+		h = ihash(p);		\
+		lock(&locktab[h]);	\
+		*p += dv;		\
+		r = *p;			\
+		unlock(&locktab[h]);	\
+		return r;		\
+	}
+
+#define CAS(T, n) \
+	int n(T *p, T ov, T nv)		\
+	{				\
+		uintptr h;		\
+		int r;			\
+					\
+		h = ihash(p);		\
+		lock(&locktab[h]);	\
+		if(*p == ov){		\
+			*p = nv;	\
+			r = 1;		\
+		}else			\
+			r = 0;		\
+		unlock(&locktab[h]);	\
+		return r;		\
+	}
+
+GET(int, ageti)
+GET(long, agetl)
+GET(vlong, agetv)
+GET(void*, agetp)
+
+SET(int, aseti)
+SET(long, asetl)
+SET(vlong, asetv)
+SET(void*, asetp)
+
+INC(int, ainci)
+INC(long, aincl)
+INC(vlong, aincv)
+
+CAS(int, acasi)
+CAS(long, acasl)
+CAS(vlong, acasv)
+CAS(void*, acasp)
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic.h
@@ -1,0 +1,16 @@
+long	agetl(long*);
+vlong	agetv(vlong*);
+void*	agetp(void**);
+
+long	asetl(long*, long);
+vlong	asetv(vlong*, vlong);
+void*	asetp(void**, void*);
+
+long	aincl(long*, long);
+vlong	aincv(vlong*, vlong);
+
+int	acasl(long*, long, long);
+int	acasv(vlong*, vlong, vlong);
+int	acasp(void**, void*, void*);
+
+void	coherence(void);
--- /dev/null
+++ b/sys/src/cmd/gefs/blk.c
@@ -1,0 +1,1095 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+#include "atomic.h"
+
+static vlong	blkalloc_lk(Arena*);
+static vlong	blkalloc(int, uint);
+static void	blkdealloc_lk(Arena*, vlong);
+static Blk*	initblk(Blk*, vlong, vlong, int);
+
+/* Report whether every flag bit in f is set on b (atomic load). */
+int
+checkflag(Blk *b, int f)
+{
+	return (agetl(&b->flag) & f) == f;
+}
+
+/* Atomically set the flag bits in f on b, retrying until the CAS lands. */
+void
+setflag(Blk *b, int f)
+{
+	long o;
+
+	do
+		o = agetl(&b->flag);
+	while(!acasl(&b->flag, o, o | f));
+}
+
+/* Atomically clear the flag bits in f on b, retrying until the CAS lands. */
+void
+clrflag(Blk *b, int f)
+{
+	long o;
+
+	do
+		o = agetl(&b->flag);
+	while(!acasl(&b->flag, o, o & ~f));
+}
+
+/*
+ * Writes a finalized block to disk at its assigned
+ * address and clears its dirty flag; wedges the fs
+ * (broke) if the write fails.
+ */
+void
+syncblk(Blk *b)
+{
+	assert(checkflag(b, Bfinal));
+	assert(b->bp.addr >= 0);
+	clrflag(b, Bdirty);
+	if(pwrite(fs->fd, b->buf, Blksz, b->bp.addr) == -1)
+		broke("%B %s: %r", b->bp, Eio);
+}
+
+/*
+ * Reads the Blksz-byte block at disk offset bp into a
+ * freshly plucked cache block and unpacks the per-type
+ * header fields.  With GBraw the contents are treated
+ * as raw data (Tdat).  Raises an error on I/O failure.
+ */
+static Blk*
+readblk(vlong bp, int flg)
+{
+	vlong off, rem, n;
+	char *p;
+	Blk *b;
+
+	assert(bp != -1);
+	b = cachepluck();
+	b->alloced = getcallerpc(&bp);
+	off = bp;
+	rem = Blksz;
+	while(rem != 0){
+		/*
+		 * advance into the buffer on short reads;
+		 * always reading to the start of b->buf
+		 * would clobber previously read data.
+		 */
+		n = pread(fs->fd, b->buf + (Blksz - rem), rem, off);
+		if(n <= 0)
+			error("%s: %r", Eio);
+		off += n;
+		rem -= n;
+	}
+	b->cnext = nil;
+	b->cprev = nil;
+	b->hnext = nil;
+	b->flag = 0;
+
+	b->bp.addr = bp;
+	b->bp.hash = -1;
+	b->bp.gen = -1;
+	b->fnext = nil;
+
+	b->nval = 0;
+	b->valsz = 0;
+	b->nbuf = 0;
+	b->bufsz = 0;
+	b->logsz = 0;
+
+	/* the first two bytes hold the block type, except for raw data */
+	p = b->buf + 2;
+	b->type = (flg&GBraw) ? Tdat : UNPACK16(b->buf+0);
+	switch(b->type){
+	default:
+		broke("invalid block type %d @%llx", b->type, bp);
+		break;
+	case Tdat:
+	case Tsuper:
+		b->data = b->buf;
+		break;
+	case Tarena:
+		b->data = p;
+		break;
+	case Tdlist:
+	case Tlog:
+		b->logsz = UNPACK16(p);		p += 2;
+		b->logh = UNPACK64(p);		p += 8;
+		b->logp = unpackbp(p, Ptrsz);	p += Ptrsz;
+		assert(p - b->buf == Loghdsz);
+		b->data = p;
+		break;
+	case Tpivot:
+		b->nval = UNPACK16(p);		p += 2;
+		b->valsz = UNPACK16(p);		p += 2;
+		b->nbuf = UNPACK16(p);		p += 2;
+		b->bufsz = UNPACK16(p);		p += 2;
+		assert(p - b->buf == Pivhdsz);
+		b->data = p;
+		break;
+	case Tleaf:
+		b->nval = UNPACK16(p);		p += 2;
+		b->valsz = UNPACK16(p);		p += 2;
+		assert(p - b->buf == Leafhdsz);
+		b->data = p;
+		break;
+	}
+	assert(b->magic == Magic);
+	return b;
+}
+
+/*
+ * Picks an arena to allocate from, spreading load
+ * round-robin across arenas; tries lets the caller
+ * move on from busy or full arenas.  Data blocks
+ * (Tdat) are offset so they land in different arenas
+ * than metadata with the same hint.
+ */
+static Arena*
+pickarena(uint ty, uint hint, int tries)
+{
+	uint n;
+
+	n = hint + tries + ainc(&fs->roundrobin)/1024;
+	if(ty == Tdat)
+		n++;
+	if(hint % fs->narena == 0)
+		n++;
+	return &fs->arenas[n%fs->narena];
+}
+
+/*
+ * Binary search for the arena containing block address b.
+ * Assumes b falls within some arena's range; the loop
+ * does not terminate otherwise.
+ */
+Arena*
+getarena(vlong b)
+{
+	int hi, lo, mid;
+	vlong alo, ahi;
+	Arena *a;
+
+	lo = 0;
+	hi = fs->narena;
+	if(b == 0)
+		return &fs->arenas[0];
+	while(1){
+		mid = (hi + lo)/2;
+		a = &fs->arenas[mid];
+		alo = a->h0->bp.addr;
+		/* an arena spans its data plus two header blocks */
+		ahi = alo + a->size + 2*Blksz;
+		if(b < alo)
+			hi = mid-1;
+		else if(b > ahi)
+			lo = mid+1;
+		else
+			return a;
+	}
+}
+
+
+/*
+ * Inserts the range [off, off+len) into the free tree,
+ * coalescing with adjacent ranges on both sides.
+ */
+static void
+freerange(Avltree *t, vlong off, vlong len)
+{
+	Arange *r, *s;
+
+	assert(len % Blksz == 0);
+	if((r = calloc(1, sizeof(Arange))) == nil)
+		error(Enomem);
+	r->off = off;
+	r->len = len;
+	assert(avllookup(t, r, 0) == nil);
+	avlinsert(t, r);
+
+Again:
+	/* merge with the preceding range if contiguous */
+	s = (Arange*)avlprev(r);
+	if(s != nil && s->off+s->len == r->off){
+		avldelete(t, r);
+		s->len = s->len + r->len;
+		free(r);
+		r = s;
+		goto Again;
+	}
+	/* merge with the following range if contiguous */
+	s = (Arange*)avlnext(r);
+	if(s != nil && r->off+r->len == s->off){
+		avldelete(t, r);
+		s->off = r->off;
+		s->len = s->len + r->len;
+		free(r);
+		r = s;
+		goto Again;
+	}
+}
+
+/*
+ * Removes [off, off+len) from the free tree, trimming
+ * or splitting the range that covers it; aborts if the
+ * requested range is not entirely free.
+ */
+static void
+grabrange(Avltree *t, vlong off, vlong len)
+{
+	Arange *r, *s, q;
+	vlong l;
+
+	assert(len % Blksz == 0);
+	q.off = off;
+	q.len = len;
+	r = (Arange*)avllookup(t, &q.Avl, -1);
+	if(r == nil || off + len > r->off + r->len)
+		abort();
+
+	if(off == r->off){
+		/* trim from the front */
+		r->off += len;
+		r->len -= len;
+	}else if(off + len == r->off + r->len){
+		/* trim from the back */
+		r->len -= len;
+	}else if(off > r->off && off+len < r->off + r->len){
+		/* split the range in two around the hole */
+		s = emalloc(sizeof(Arange), 0);
+		l = r->len;
+		s->off = off + len;
+		r->len = off - r->off;
+		s->len = l - r->len - len;
+		avlinsert(t, s);
+	}else
+		abort();
+
+	if(r->len == 0){
+		avldelete(t, r);
+		free(r);
+	}
+}
+
+/*
+ * Initializes a fresh log block at offset o, reusing a
+ * slot from the arena's static log buffer ring; the
+ * slot's previous identity is evicted from the cache.
+ */
+static Blk*
+mklogblk(Arena *a, vlong o)
+{
+	Blk *lb;
+
+	lb = a->logbuf[a->lbidx++ % nelem(a->logbuf)];
+	if(lb->bp.addr != -1)
+		cachedel(lb->bp.addr);
+	initblk(lb, o, -1, Tlog);
+	finalize(lb);
+	syncblk(lb);
+	traceb("logblk" , lb->bp);
+	return lb;
+}
+
+/*
+ * Logs an allocation. Must be called
+ * with arena lock held. Duplicates some
+ * of the work in allocblk to prevent
+ * recursion.
+ */
+static void
+logappend(Arena *a, vlong off, vlong len, int op)
+{
+	vlong o, start, end;
+	Blk *nl, *lb;
+	char *p, *name;
+
+	lb = a->logtl;
+	assert((off & 0xff) == 0);
+	assert(op == LogAlloc || op == LogFree || op == LogSync);
+	if(op != LogSync){
+		start = a->h0->bp.addr;
+		end = start + a->size + 2*Blksz;
+		assert(lb == nil || lb->type == Tlog);
+		assert(off >= start);
+		assert(off <= end);
+	}
+	switch(op){
+	case LogAlloc:	name = "alloc";	break;
+	case LogFree:	name = "free";	break;
+	case LogSync:	name = "sync";	break;
+	default:	name = "???";	break;
+	}
+	assert(lb == nil || lb->logsz >= 0);
+	dprint("logop %llx+%llx@%x: %s\n", off, len, lb?lb->logsz:-1, name);
+	/*
+	 * move to the next block when we have
+	 * too little room in the log:
+	 * We're appending up to 16 bytes as
+	 * part of the operation, followed by
+	 * 16 bytes of new log entry allocation
+	 * and chaining.
+	 */
+	if(lb == nil || lb->logsz >= Logspc - Logslop){
+		o = blkalloc_lk(a);
+		if(o == -1)
+			error(Efull);
+		nl = mklogblk(a, o);
+		/*
+		 * chain from the old tail only when one exists;
+		 * when the log is empty (lb == nil) there is
+		 * nothing to append the chain entry to, and
+		 * dereferencing lb would crash.
+		 */
+		if(lb != nil){
+			p = lb->data + lb->logsz;
+			PACK64(p, o|LogAlloc1);
+			lb->logsz += 8;
+			lb->logp = nl->bp;
+			finalize(lb);
+			syncblk(lb);
+		}
+		a->logtl = nl;
+		a->nlog++;
+		lb = nl;
+	}
+
+	setflag(lb, Bdirty);
+	/* single-block ops get the compact one-word encoding */
+	if(len == Blksz){
+		if(op == LogAlloc)
+			op = LogAlloc1;
+		else if(op == LogFree)
+			op = LogFree1;
+	}
+	off |= op;
+	p = lb->data + lb->logsz;
+	PACK64(p, off);
+	lb->logsz += 8;
+	if(op >= Log2wide){
+		PACK64(p+8, len);
+		lb->logsz += 8;
+	}
+}
+
+/*
+ * Replays an arena's allocation log starting at bp,
+ * applying alloc/free records to the in-memory free
+ * tree.  Stops at a sync record whose generation is
+ * current, or at the end of the chain, leaving
+ * a->logtl pointing at the tail block for appends.
+ */
+void
+loadlog(Arena *a, Bptr bp)
+{
+	vlong ent, off, len, gen;
+	int op, i, n;
+	char *d;
+	Blk *b;
+
+
+	dprint("loadlog %B\n", bp);
+	traceb("loadlog", bp);
+	while(1){
+		b = getblk(bp, 0);
+		dprint("\tload %B chain %B\n", bp, b->logp);
+		/* the hash covers the log and offset */
+		for(i = 0; i < b->logsz; i += n){
+			d = b->data + i;
+			ent = UNPACK64(d);
+			/* low byte is the op; the rest is offset or generation */
+			op = ent & 0xff;
+			off = ent & ~0xff;
+			n = (op >= Log2wide) ? 16 : 8;
+			switch(op){
+			case LogSync:
+				gen = ent >> 8;
+				dprint("\tlog@%x: sync %lld\n", i, gen);
+				if(gen >= fs->qgen){
+					if(a->logtl == nil){
+						b->logsz = i;
+						a->logtl = holdblk(b);
+						return;
+					}
+					dropblk(b);
+					return;
+				}
+				break;
+	
+			case LogAlloc:
+			case LogAlloc1:
+				len = (op >= Log2wide) ? UNPACK64(d+8) : Blksz;
+				dprint("\tlog@%x alloc: %llx+%llx\n", i, off, len);
+				grabrange(a->free, off & ~0xff, len);
+				a->used += len;
+				break;
+			case LogFree:
+			case LogFree1:
+				len = (op >= Log2wide) ? UNPACK64(d+8) : Blksz;
+				dprint("\tlog@%x free: %llx+%llx\n", i, off, len);
+				freerange(a->free, off & ~0xff, len);
+				a->used -= len;
+				break;
+			default:
+				dprint("\tlog@%x: log op %d\n", i, op);
+				abort();
+				break;
+			}
+		}
+		if(b->logp.addr == -1){
+			a->logtl = b;
+			return;
+		}
+		bp = b->logp;
+		dropblk(b);
+	}
+}
+
+/*
+ * Rewrites an arena's allocation log as a compact list
+ * of free ranges taken from the in-memory free tree,
+ * replacing the accumulated alloc/free history.  Must
+ * be called with the arena lock held (callers hold it;
+ * blkalloc_lk/blkdealloc_lk require it).
+ */
+void
+compresslog(Arena *a)
+{
+
+	int i, nr, nblks;
+	vlong sz, *blks;
+	Blk *b, *nb;
+	Arange *r;
+	Bptr hd;
+	char *p;
+
+	tracem("compresslog");
+	if(a->logtl != nil){
+		finalize(a->logtl);
+		syncblk(a->logtl);
+	}
+	/*
+	 * Prepare what we're writing back.
+	 * Arenas must be sized so that we can
+	 * keep the merged log in memory for
+	 * a rewrite.
+	 */
+	sz = 0;
+	nr = 0;
+	a->nlog = 0;
+	for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
+		sz += 16;
+		nr++;
+	}
+
+	/*
+	 * Make a pessimistic estimate of the number of blocks
+	 * needed to store the ranges, as well as the blocks
+	 * used to store the range allocations.
+	 *
+	 * This does modify the tree, but it's safe because
+	 * we can only be removing entries from the tree, not
+	 * splitting or inserting new ones.
+	 */
+	nblks = (sz+Logspc)/(Logspc - Logslop) + 16*nr/(Logspc-Logslop) + 1;
+	if((blks = calloc(nblks, sizeof(vlong))) == nil)
+		error(Enomem);
+	if(waserror()){
+		free(blks);
+		nexterror();
+	}
+	for(i = 0; i < nblks; i++){
+		blks[i] = blkalloc_lk(a);
+		if(blks[i] == -1)
+			error(Efull);
+	}
+	/* fill up the log with the ranges from the tree */
+	i = 0;
+	hd = (Bptr){blks[0], -1, -1};
+	b = a->logbuf[a->lbidx++ % nelem(a->logbuf)];
+	a->logbuf[a->lbidx % nelem(a->logbuf)]->bp = Zb;
+	if(b->bp.addr != -1)
+		cachedel(b->bp.addr);
+	initblk(b, blks[i++], -1, Tlog);
+	finalize(b);
+	for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
+		if(b->logsz >= Logspc - Logslop){
+			/* chain a fresh log block and flush the full one */
+			a->nlog++;
+			nb = a->logbuf[a->lbidx++ % nelem(a->logbuf)];
+			if(nb->bp.addr != -1)
+				cachedel(nb->bp.addr);
+			initblk(nb, blks[i++], -1, Tlog);
+			b->logp = nb->bp;
+			setflag(b, Bdirty);
+			finalize(b);
+			syncblk(b);
+			b = nb;
+		}
+		p = b->data + b->logsz;
+		PACK64(p+0, r->off|LogFree);
+		PACK64(p+8, r->len);
+		b->logsz += 16;
+	}
+	finalize(b);
+	syncblk(b);
+
+	/*
+	 * now we have a valid freelist, and we can start
+	 * appending stuff to it. Clean up the eagerly
+	 * allocated extra blocks.
+	 */
+	a->loghd = hd;
+	a->logtl = b;
+	for(; i < nblks; i++){
+		/*
+		 * NOTE(review): cachedel is passed b->bp.addr (the log
+		 * tail, repeatedly) rather than blks[i] — confirm this
+		 * is intended; the unused blocks were never cached.
+		 */
+		cachedel(b->bp.addr);
+		blkdealloc_lk(a, blks[i]);
+	}
+	poperror();
+	free(blks);
+}
+
+/*
+ * Appends a sync record carrying gen to the arena's
+ * log, establishing a replay barrier for loadlog;
+ * initializes the log head if this is the first entry.
+ */
+int
+logbarrier(Arena *a, vlong gen)
+{
+	logappend(a, gen<<8, 0, LogSync);
+	if(a->loghd.addr == -1)
+		a->loghd = a->logtl->bp;
+	return 0;
+}
+
+/*
+ * Allocate from an arena, with lock
+ * held. May be called multiple times
+ * per operation, to alloc space for
+ * the alloc log.
+ *
+ * Returns the block's disk offset, or -1 when the
+ * arena is down to its reserve and reserve use is
+ * disabled.  Does NOT log the allocation; callers do.
+ */
+static vlong
+blkalloc_lk(Arena *a)
+{
+	Avltree *t;
+	Arange *r;
+	vlong b;
+
+	t = a->free;
+	r = (Arange*)t->root;
+	if(!usereserve && a->size - a->used <= a->reserve)
+		return -1;
+	if(r == nil)
+		broke(Estuffed);
+
+	/*
+	 * A bit of sleight of hand here:
+	 * we change the sorting key in place, but that
+	 * can't change the sort order, because the tree
+	 * covers disjoint ranges.
+	 */
+	b = r->off;
+	r->len -= Blksz;
+	r->off += Blksz;
+	if(r->len == 0){
+		avldelete(t, r);
+		free(r);
+	}
+	a->used += Blksz;
+	return b;
+}
+
+/*
+ * Frees block b back to its arena: logs the free and
+ * returns the range to the free tree.  Arena lock must
+ * be held.
+ */
+static void
+blkdealloc_lk(Arena *a, vlong b)
+{
+	logappend(a, b, Blksz, LogFree);
+	if(a->loghd.addr == -1)
+		a->loghd = a->logtl->bp;
+	freerange(a->free, b, Blksz);
+	a->used -= Blksz;
+}
+
+/*
+ * Locked wrapper around blkdealloc_lk: frees block b
+ * back to the arena that contains it.
+ */
+void
+blkdealloc(vlong b)
+{
+	Arena *a;
+
+	a = getarena(b);
+ 	qlock(a);
+	blkdealloc_lk(a, b);
+	qunlock(a);
+}
+
+/*
+ * Allocates one block, trying arenas round-robin and
+ * logging the allocation.  Raises Efull when every
+ * arena has been tried twice without success.
+ */
+static vlong
+blkalloc(int ty, uint hint)
+{
+	Arena *a;
+	vlong b;
+	int tries;
+
+	tries = 0;
+Again:
+	a = pickarena(ty, hint, tries);
+	/*
+	 * Loop through the arena up to 2 times.
+	 * The first pass tries to find an arena
+	 * that has space and is not in use, the
+	 * second waits until an arena is free.
+	 */
+	if(tries == 2*fs->narena)
+		error(Efull);
+	tries++;
+	if(tries < fs->narena){
+		if(canqlock(a) == 0)
+			goto Again;
+	}else
+		qlock(a);
+	if(waserror()){
+		qunlock(a);
+		nexterror();
+	}
+	b = blkalloc_lk(a);
+	if(b == -1){
+		/* this arena is out of space; move to the next */
+		qunlock(a);
+		poperror();
+		goto Again;
+	}
+	logappend(a, b, Blksz, LogAlloc);
+	if(a->loghd.addr == -1)
+		a->loghd = a->logtl->bp;
+	qunlock(a);
+	poperror();
+	return b;
+}
+
+/*
+ * Initializes block b to a freshly allocated block of
+ * type ty at disk offset bp with generation gen: sets
+ * up the data pointer past the per-type header, clears
+ * the counts, and marks it dirty.  A cache hit on bp
+ * means a double allocation, which wedges the fs.
+ */
+static Blk*
+initblk(Blk *b, vlong bp, vlong gen, int ty)
+{
+	Blk *ob;
+
+	ob = cacheget(bp);
+	if(ob != nil)
+		fatal("double alloc: %#p %B %#p %B", b, b->bp, ob, ob->bp);
+	b->type = ty;
+	b->bp.addr = bp;
+	b->bp.hash = -1;
+	b->bp.gen = gen;
+	switch(ty){
+	case Tdat:
+		b->data = b->buf;
+		break;
+	case Tarena:
+		b->data = b->buf+2;
+		break;
+	case Tdlist:
+	case Tlog:
+		b->logsz = 0;
+		b->logp = (Bptr){-1, -1, -1};
+		b->data = b->buf + Loghdsz;
+		break;
+	case Tpivot:
+		b->data = b->buf + Pivhdsz;
+		break;
+	case Tleaf:
+		b->data = b->buf + Leafhdsz;
+		break;
+	}
+	b->fnext = nil;
+
+	setflag(b, Bdirty);
+	b->nval = 0;
+	b->valsz = 0;
+	b->nbuf = 0;
+	b->bufsz = 0;
+	b->logsz = 0;
+	b->alloced = getcallerpc(&b);
+
+	return b;
+}
+
+/*
+ * Allocates and initializes a new block of type ty for
+ * tree t, tagged with the tree's in-memory generation.
+ */
+Blk*
+newblk(Tree *t, int ty, vlong hint)
+{
+	vlong bp;
+	Blk *b;
+
+	bp = blkalloc(ty, hint);
+	b = cachepluck();
+	initblk(b, bp, t->memgen, ty);
+	b->alloced = getcallerpc(&t);
+	tracex("newblk" , b->bp, ty, -1);
+	return b;
+}
+
+/*
+ * Copy-on-write duplication: allocates a new block for
+ * tree t and copies b's contents and counts into it.
+ */
+Blk*
+dupblk(Tree *t, Blk *b)
+{
+	Blk *r;
+
+	if((r = newblk(t, b->type, 0)) == nil)
+		return nil;
+
+	tracex("dup" , b->bp, b->type, t->gen);
+	setflag(r, Bdirty);
+	r->bp.hash = -1;
+	r->nval = b->nval;
+	r->valsz = b->valsz;
+	r->nbuf = b->nbuf;
+	r->bufsz = b->bufsz;
+	r->logsz = b->logsz;
+	r->alloced = getcallerpc(&t);
+	memcpy(r->buf, b->buf, sizeof(r->buf));
+	return r;
+}
+
+/*
+ * Packs a block's header fields into its on-disk buffer,
+ * computes its content hash, marks it final, and inserts
+ * it into the block cache.  Must be called before the
+ * block can be written out (syncblk asserts Bfinal).
+ */
+void
+finalize(Blk *b)
+{
+	if(b->type != Tdat)
+		PACK16(b->buf, b->type);
+
+	switch(b->type){
+	default:
+		abort();
+		break;
+	case Tpivot:
+		PACK16(b->buf+2, b->nval);
+		PACK16(b->buf+4, b->valsz);
+		PACK16(b->buf+6, b->nbuf);
+		PACK16(b->buf+8, b->bufsz);
+		break;
+	case Tleaf:
+		PACK16(b->buf+2, b->nval);
+		PACK16(b->buf+4, b->valsz);
+		break;
+	case Tdlist:
+	case Tlog:
+		/* log blocks carry their own content hash in the header */
+		b->logh = bufhash(b->data, b->logsz);
+		PACK16(b->buf+2, b->logsz);
+		PACK64(b->buf+4, b->logh);
+		packbp(b->buf+12, Ptrsz, &b->logp);
+		break;
+	case Tdat:
+	case Tarena:
+	case Tsuper:
+		break;
+	}
+
+	b->bp.hash = blkhash(b);
+	setflag(b, Bfinal);
+	cacheins(b);
+	b->cached = getcallerpc(&b);
+}
+
+/*
+ * Looks up the block at bp, reading and verifying it
+ * from disk on a cache miss.  GBnochk skips the hash
+ * check; GBsoftchk reports a mismatch as a recoverable
+ * error instead of wedging the fs.
+ */
+Blk*
+getblk(Bptr bp, int flg)
+{
+	uvlong xh, ck;
+	Blk *b;
+	int i;
+
+	i = ihash(bp.addr) % nelem(fs->blklk);
+	tracex("get" , bp, getcallerpc(&bp), -1);
+	qlock(&fs->blklk[i]);
+	if(waserror()){
+		qunlock(&fs->blklk[i]);
+		nexterror();
+	}
+	if((b = cacheget(bp.addr)) != nil){
+		b->lasthold = getcallerpc(&bp);
+		qunlock(&fs->blklk[i]);
+		poperror();
+		return b;
+	}
+	b = readblk(bp.addr, flg);
+	b->alloced = getcallerpc(&bp);
+	b->bp.hash = blkhash(b);
+	if((flg&GBnochk) == 0){
+		if(b->type == Tlog || b->type == Tdlist){
+			/* log blocks are verified by their own header hash */
+			xh = b->logh;
+			ck = bufhash(b->data, b->logsz);
+		}else{
+			xh = bp.hash;
+			ck = b->bp.hash;
+		}
+		if(ck != xh){
+			if(flg & GBsoftchk){
+				/* terminate the diagnostic so messages don't run together */
+				fprint(2, "%s: %ullx %llux != %llux\n", Ecorrupt, bp.addr, xh, ck);
+				error(Ecorrupt);
+			}else{
+				broke("%s: %ullx %llux != %llux", Ecorrupt, bp.addr, xh, ck);
+			}
+		}
+	}
+	b->bp.gen = bp.gen;
+	b->lasthold = getcallerpc(&bp);
+	cacheins(b);
+	qunlock(&fs->blklk[i]);
+	poperror();
+
+	return b;
+}
+
+
+/*
+ * Takes a reference on b; returns b for chaining.
+ */
+Blk*
+holdblk(Blk *b)
+{
+	ainc(&b->ref);
+	b->lasthold = getcallerpc(&b);
+	return b;
+}
+
+/*
+ * Drops a reference on b; when the last reference goes
+ * away, the block is returned to the LRU for reuse.
+ * Accepts nil for convenience.
+ */
+void
+dropblk(Blk *b)
+{
+	assert(b == nil || b->ref > 0);
+	if(b == nil || adec(&b->ref) != 0)
+		return;
+	b->lastdrop = getcallerpc(&b);
+	/*
+	 * freed blocks go to the LRU bottom
+	 * for early reuse.
+	 */
+	if(checkflag(b, Bfreed))
+		lrubot(b);
+	else
+		lrutop(b);
+}
+
+/*
+ * Returns the number of used bytes in a tree block:
+ * key/value headers and payload, plus the message
+ * buffer for pivots.  Aborts on non-tree blocks.
+ */
+ushort
+blkfill(Blk *b)
+{
+	if(b->type == Tleaf)
+		return 2*b->nval + b->valsz;
+	if(b->type == Tpivot)
+		return 2*b->nbuf + b->bufsz + 2*b->nval + b->valsz;
+	fprint(2, "invalid block @%lld\n", b->bp.addr);
+	abort();
+	return 0;
+}
+
+/*
+ * Pushes f onto the current epoch's limbo list with a
+ * lock-free CAS loop; items are reclaimed by
+ * epochclean once no worker can still see them.
+ */
+void
+limbo(Bfree *f)
+{
+	Bfree *p;
+	ulong ge;
+
+	while(1){
+		ge = agetl(&fs->epoch);
+		p = agetp(&fs->limbo[ge]);
+		f->next = p;
+		if(acasp(&fs->limbo[ge], p, f)){
+			aincl(&fs->nlimbo, 1);
+			break;
+		}
+	}
+}
+
+/*
+ * Schedules block bp for freeing.  Blocks born in the
+ * current mutation (bp.gen >= t->memgen fails) or owned
+ * by the snap tree are killed directly; others are
+ * deferred through the epoch limbo list so concurrent
+ * readers can finish with them first.
+ */
+void
+freeblk(Tree *t, Blk *b, Bptr bp)
+{
+	Bfree *f;
+
+	if(t == &fs->snap || (t != nil && bp.gen < t->memgen)){
+		tracex("killb", bp, getcallerpc(&t), -1);
+		killblk(t, bp);
+		return;
+	}
+
+	tracex("freeb", bp, getcallerpc(&t), -1);
+	f = emalloc(sizeof(Bfree), 0);
+	f->op = DFblk;
+	f->bp = bp;
+	f->b = nil;
+	if(b != nil){
+		setflag(b, Blimbo);
+		b->freed = getcallerpc(&t);
+		f->b = holdblk(b);
+	}
+	limbo(f);
+}
+
+/* Enters the current global epoch for worker tid, marking it active. */
+void
+epochstart(int tid)
+{
+	asetl(&fs->lepoch[tid], agetl(&fs->epoch) | Eactive);
+}
+
+/* Leaves the epoch for worker tid by clearing its active bit. */
+void
+epochend(int tid)
+{
+	ulong e;
+
+	e = agetl(&fs->lepoch[tid]);
+	asetl(&fs->lepoch[tid], e & ~Eactive);
+}
+
+/*
+ * Spins (with increasing backoff) until every active
+ * worker has advanced to the current global epoch,
+ * complaining about stalled workers after ~100 retries.
+ */
+void
+epochwait(void)
+{
+	int i, delay;
+	ulong e, ge;
+
+	delay = 0;
+Again:
+	ge = agetl(&fs->epoch);
+	for(i = 0; i < fs->nworker; i++){
+		e = agetl(&fs->lepoch[i]);
+		if((e & Eactive) && e != (ge | Eactive)){
+			if(delay < 100)
+				delay++;
+			else
+				fprint(2, "stalled epoch %lx [worker %d]\n", e, i);
+			sleep(delay);
+			goto Again;
+		}
+	}
+}
+
+/*
+ * Advances the global epoch and reclaims the limbo list
+ * two epochs back: frees deferred trees and mounts, and
+ * queues deferred blocks for deallocation on their
+ * arena's sync queue.  Skips the (blocking) epoch wait
+ * while the limbo backlog is small and a worker lags.
+ */
+void
+epochclean(void)
+{
+	ulong c, e, ge;
+	Bfree *p, *n;
+	Arena *a;
+	Qent qe;
+	int i;
+
+	c = agetl(&fs->nlimbo);
+	ge = agetl(&fs->epoch);
+	for(i = 0; i < fs->nworker; i++){
+		e = agetl(&fs->lepoch[i]);
+		if((e & Eactive) && e != (ge | Eactive)){
+			/* not urgent yet; don't stall on the laggard */
+			if(c < fs->cmax/4)
+				return;
+			epochwait();
+		}
+	}
+	epochwait();
+	p = asetp(&fs->limbo[(ge+1)%3], nil);
+	asetl(&fs->epoch, (ge+1)%3);
+
+	for(; p != nil; p = n){
+		n = p->next;
+		switch(p->op){
+		case DFtree:
+			free(p->t);
+			break;
+		case DFmnt:
+			free(p->m);
+			break;
+		case DFblk:
+			a = getarena(p->bp.addr);
+			qe.op = Qfree;
+			qe.bp = p->bp;
+			qe.b = nil;
+			qput(a->sync, qe);
+			if(p->b != nil){
+				clrflag(p->b, Blimbo);
+				setflag(p->b, Bfreed);
+				dropblk(p->b);
+			}
+			break;
+		default:
+			abort();
+		}
+		aincl(&fs->nlimbo, -1);
+		free(p);
+	}
+}
+
+/*
+ * Finalizes a dirty block and queues it for writing on
+ * its arena's sync queue, holding a reference for the
+ * writer.
+ */
+void
+enqueue(Blk *b)
+{
+	Arena *a;
+	Qent qe;
+
+	assert(checkflag(b, Bdirty));
+	assert(b->bp.addr >= 0);
+
+	b->enqueued = getcallerpc(&b);
+	a = getarena(b->bp.addr);
+	holdblk(b);
+	finalize(b);
+	traceb("queueb", b->bp);
+	setflag(b, Bqueued);
+	b->queued = getcallerpc(&b);
+	qe.op = Qwrite;
+	qe.bp = b->bp;
+	qe.b = b;
+	qput(a->sync, qe);
+}
+
+/*
+ * Initializes a sync queue: a binary heap of pending
+ * operations sized to the cache, with rendezvous points
+ * for the full and empty conditions.
+ */
+void
+qinit(Syncq *q)
+{
+	q->fullrz.l = &q->lk;
+	q->emptyrz.l = &q->lk;
+	q->nheap = 0;
+	q->heapsz = fs->cmax;
+	q->heap = emalloc(q->heapsz*sizeof(Qent), 1);
+
+}
+
+/*
+ * Orders queue entries by generation, then op,
+ * then block address.  Returns -1, 0, or 1.
+ */
+int
+qcmp(Qent *a, Qent *b)
+{
+	if(a->qgen < b->qgen)
+		return -1;
+	if(a->qgen > b->qgen)
+		return 1;
+	if(a->op < b->op)
+		return -1;
+	if(a->op > b->op)
+		return 1;
+	if(a->bp.addr < b->bp.addr)
+		return -1;
+	if(a->bp.addr > b->bp.addr)
+		return 1;
+	return 0;
+}
+
+/*
+ * Inserts qe into the sync queue's min-heap, stamped
+ * with the current queue generation; blocks while the
+ * heap is full.
+ */
+void
+qput(Syncq *q, Qent qe)
+{
+	int i;
+
+	if(qe.op == Qfree || qe.op == Qwrite)
+		assert(qe.bp.addr != 0 && (qe.bp.addr & (Blksz-1)) == 0);
+	else if(qe.op == Qfence)
+		assert(fs->syncing > 0);
+	else
+		abort();
+	qlock(&q->lk);
+	qe.qgen = agetv(&fs->qgen);
+	while(q->nheap == q->heapsz)
+		rsleep(&q->fullrz);
+	/* sift up from the bottom of the heap */
+	for(i = q->nheap; i > 0; i = (i-1)/2){
+		if(qcmp(&qe, &q->heap[(i-1)/2]) == 1)
+			break;
+		q->heap[i] = q->heap[(i-1)/2];
+	}
+	q->heap[i] = qe;
+	q->nheap++;
+	rwakeup(&q->emptyrz);
+	qunlock(&q->lk);
+}
+
+/*
+ * Removes and returns the minimum entry from the sync
+ * queue's heap, blocking while the queue is empty;
+ * clears the popped block's queued state.
+ */
+static Qent
+qpop(Syncq *q)
+{
+	int i, l, r, m;
+	Qent e, t;
+
+	qlock(&q->lk);
+	while(q->nheap == 0)
+		rsleep(&q->emptyrz);
+	e = q->heap[0];
+	if(--q->nheap == 0)
+		goto Out;
+
+	/* move the last entry to the root and sift down */
+	i = 0;
+	q->heap[0] = q->heap[q->nheap];
+	while(1){
+		m = i;
+		l = 2*i+1;
+		r = 2*i+2;
+		if(l < q->nheap && qcmp(&q->heap[m], &q->heap[l]) == 1)
+			m = l;
+		if(r < q->nheap && qcmp(&q->heap[m], &q->heap[r]) == 1)
+			m = r;
+		if(m == i)
+			break;
+		t = q->heap[m];
+		q->heap[m] = q->heap[i];
+		q->heap[i] = t;
+		i = m;
+	}
+Out:
+	rwakeup(&q->fullrz);
+	qunlock(&q->lk);
+	if(e.b != nil){
+		clrflag(e.b, Bqueued);
+		e.b->queued = 0;
+	}
+	return e;
+}
+
+/*
+ * Sync worker loop: drains a sync queue, performing
+ * deferred frees, fence completions, and block writes.
+ * An error marks the fs read-only and exits the worker.
+ */
+void
+runsync(int, void *p)
+{
+	Arena *a;
+	Syncq *q;
+	Qent qe;
+
+	q = p;
+	if(waserror()){
+		aincl(&fs->rdonly, 1);
+		fprint(2, "error syncing: %s\n", errmsg());
+		return;
+	}
+	while(1){
+		qe = qpop(q);
+		switch(qe.op){
+		case Qfree:
+			tracex("qfreeb", qe.bp, qe.qgen, -1);
+			a = getarena(qe.bp.addr);
+			qlock(a);
+			cachedel(qe.bp.addr);
+			blkdealloc_lk(a, qe.bp.addr);
+			if(qe.b != nil)
+				dropblk(qe.b);
+			qunlock(a);
+			break;
+		case Qfence:
+			/* last fence completes the fs-wide sync */
+			tracev("qfence", qe.qgen);
+			qlock(&fs->synclk);
+			if(--fs->syncing == 0)
+				rwakeupall(&fs->syncrz);
+			qunlock(&fs->synclk);
+			break;
+		case Qwrite:
+			tracex("qsyncb", qe.bp, qe.qgen, -1);
+			if(checkflag(qe.b, Bfreed) == 0)
+				syncblk(qe.b);
+			dropblk(qe.b);
+			break;
+		default:
+			abort();
+		}
+		assert(estacksz() == 1);
+	}
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/cache.c
@@ -1,0 +1,194 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Unlinks b from the LRU list, fixing up the head and
+ * tail pointers; caller holds fs->lrulk.
+ */
+static void
+lrudel(Blk *b)
+{
+	if(b == fs->chead)
+		fs->chead = b->cnext;
+	if(b == fs->ctail)
+		fs->ctail = b->cprev;
+	if(b->cnext != nil)
+		b->cnext->cprev = b->cprev;
+	if(b->cprev != nil)
+		b->cprev->cnext = b->cnext;
+	b->cnext = nil;
+	b->cprev = nil;		
+}
+
+/*
+ * Moves an unreferenced block to the most-recently-used
+ * end of the LRU and wakes anyone waiting for a block
+ * to reuse; a block that regained a reference is left
+ * alone.
+ */
+void
+lrutop(Blk *b)
+{
+	qlock(&fs->lrulk);
+	/*
+	 * Someone got in first and did a
+	 * cache lookup; we no longer want
+	 * to put this into the LRU, because
+	 * its now in use.
+	 */
+	assert(b->magic == Magic);
+	if(b->ref != 0){
+		qunlock(&fs->lrulk);
+		return;
+	}
+	lrudel(b);
+	if(fs->chead != nil)
+		fs->chead->cprev = b;
+	if(fs->ctail == nil)
+		fs->ctail = b;
+	b->cnext = fs->chead;
+	fs->chead = b;
+	rwakeup(&fs->lrurz);
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Moves an unreferenced block to the least-recently-used
+ * end of the LRU for early reuse, and wakes anyone
+ * waiting for a block; a block that regained a
+ * reference is left alone.
+ */
+void
+lrubot(Blk *b)
+{
+	qlock(&fs->lrulk);
+	/*
+	 * Someone got in first and did a
+	 * cache lookup; we no longer want
+	 * to put this into the LRU, because
+	 * its now in use.
+	 */
+	assert(b->magic == Magic);
+	if(b->ref != 0){
+		qunlock(&fs->lrulk);
+		return;
+	}
+	lrudel(b);
+	if(fs->ctail != nil)
+		fs->ctail->cnext = b;
+	if(fs->chead == nil)
+		fs->chead = b;
+	b->cprev = fs->ctail;
+	fs->ctail = b;
+	rwakeup(&fs->lrurz);
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Inserts b into its hash bucket in the block cache;
+ * a no-op if the block is already cached.  Takes lrulk
+ * then the bucket spinlock, matching cacheget.
+ */
+void
+cacheins(Blk *b)
+{
+	Bucket *bkt;
+	u32int h;
+
+	assert(b->magic == Magic);
+	h = ihash(b->bp.addr);
+	bkt = &fs->bcache[h % fs->cmax];
+	qlock(&fs->lrulk);
+	traceb("cache", b->bp);
+	lock(bkt);
+	if(checkflag(b, Bcached)){
+		unlock(bkt);
+		qunlock(&fs->lrulk);
+		return;
+	}
+	assert(b->hnext == nil);
+	/* sanity: b must not already be chained in this bucket */
+	for(Blk *bb = bkt->b; bb != nil; bb = bb->hnext)
+		assert(b != bb);
+	setflag(b, Bcached);
+	b->cached = getcallerpc(&b);
+	b->hnext = bkt->b;
+	bkt->b = b;
+	unlock(bkt);
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Removes the block at addr from its cache bucket, if
+ * present.  Caller holds fs->lrulk; -1 addresses are
+ * ignored.
+ */
+void
+cachedel_lk(vlong addr)
+{
+	Bucket *bkt;
+	Blk *b, **p;
+	u32int h;
+
+	if(addr == -1)
+		return;
+
+	tracex("uncache", Zb, addr, getcallerpc(&addr));
+	h = ihash(addr);
+	bkt = &fs->bcache[h % fs->cmax];
+	lock(bkt);
+	p = &bkt->b;
+	for(b = bkt->b; b != nil; b = b->hnext){
+		if(b->bp.addr == addr){
+			*p = b->hnext;
+			clrflag(b, Bcached);
+			b->uncached = getcallerpc(&addr);
+			b->hnext = nil;
+			break;
+		}
+		p = &b->hnext;
+	}
+	unlock(bkt);
+}
+/* Locked wrapper around cachedel_lk. */
+void
+cachedel(vlong addr)
+{
+	qlock(&fs->lrulk);
+	tracex("uncachelk", Zb, addr, getcallerpc(&addr));
+	cachedel_lk(addr);
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Looks up the block at addr in the cache.  On a hit,
+ * takes a reference and removes the block from the LRU
+ * so it can't be reclaimed; returns nil on a miss.
+ */
+Blk*
+cacheget(vlong addr)
+{
+	Bucket *bkt;
+	u32int h;
+	Blk *b;
+
+	h = ihash(addr);
+	bkt = &fs->bcache[h % fs->cmax];
+	qlock(&fs->lrulk);
+	lock(bkt);
+	for(b = bkt->b; b != nil; b = b->hnext){
+		if(b->bp.addr == addr){
+			holdblk(b);
+			lrudel(b);
+			b->lasthold = getcallerpc(&addr);
+			break;
+		}
+	}
+	unlock(bkt);
+	qunlock(&fs->lrulk);
+
+	return b;
+}
+
+/*
+ * Pulls the block from the bottom of the LRU for reuse.
+ * Blocks until one is available, evicts its old cache
+ * identity, resets its bookkeeping, and returns it with
+ * one reference held.
+ */
+Blk*
+cachepluck(void)
+{
+	Blk *b;
+
+	qlock(&fs->lrulk);
+	while(fs->ctail == nil)
+		rsleep(&fs->lrurz);
+
+	b = fs->ctail;
+	assert(b->magic == Magic);
+	assert(b->ref == 0);
+	if(checkflag(b, Bcached))
+		cachedel_lk(b->bp.addr);
+	/* the flag should be clear now; if not, report the stale entry */
+	if(checkflag(b, Bcached))
+		fprint(2, "%B cached %#p freed %#p\n", b->bp, b->cached, b->freed);
+	lrudel(b);
+	assert(!checkflag(b, Bcached));
+	b->flag = 0;
+	b->lasthold = 0;
+	b->lastdrop = 0;
+	b->freed = 0;
+	b->hnext = nil;
+	qunlock(&fs->lrulk);
+
+	return  holdblk(b);
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/check.c
@@ -1,0 +1,305 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+#include <atomic.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Reports whether block address bp lies inside a free
+ * range of its arena's free tree.
+ */
+static int
+isfree(vlong bp)
+{
+	Arange *r, q;
+	Arena *a;
+
+	q.off = bp;
+	q.len = Blksz;
+
+	a = getarena(bp);
+	r = (Arange*)avllookup(a->free, &q, -1);
+	if(r == nil)
+		return 0;
+	return bp < (r->off + r->len);
+}
+
+/*
+ * Recursively validates the tree rooted at b: balance
+ * (h counts remaining levels), key ordering within the
+ * range [lo, hi), child fill counts, freelist overlap,
+ * and message-buffer ordering in pivots.  Returns the
+ * number of failures found.
+ */
+static int
+checktree(int fd, Blk *b, int h, Kvp *lo, Kvp *hi)
+{
+	Kvp x, y;
+	Msg mx, my;
+	int i, r, fill;
+	Blk *c;
+	int fail;
+	Bptr bp;
+
+	fail = 0;
+	if(h < 0){
+		fprint(fd, "node too deep (loop?)\n");
+		fail++;
+		return fail;
+	}
+	if(b->type == Tleaf){
+		if(h != 0){
+			fprint(fd, "unbalanced leaf\n");
+			fail++;
+		}
+		/*
+		 * NOTE(review): the h != 0 guard makes this fire only
+		 * for unbalanced leaves — confirm h == 0 wasn't meant.
+		 */
+		if(h != 0 && b->nval < 2){
+			fprint(fd, "warning: underfilled leaf %B\n", b->bp);
+			fail++;
+		}
+	}
+	if(b->type == Tpivot && b->nval < 2)
+		fprint(fd, "warning: underfilled pivot %B\n", b->bp);
+	getval(b, 0, &x);
+	if(lo && keycmp(lo, &x) > 0){
+		fprint(fd, "out of range keys %P != %P\n", lo, &x);
+		showblk(fd, b, "out of range", 1);
+		fail++;
+	}
+	for(i = 1; i < b->nval; i++){
+		getval(b, i, &y);
+		if(hi && keycmp(&y, hi) >= 0){
+			fprint(fd, "out of range keys %P >= %P\n", &y, hi);
+			fail++;
+		}
+		if(b->type == Tpivot){
+			bp = getptr(&x, &fill);
+			if(isfree(bp.addr)){
+				fprint(fd, "freed block in use: %llx\n", bp.addr);
+				fail++;
+			}
+			if((c = getblk(bp, 0)) == nil){
+				fprint(fd, "corrupt block: %B\n", bp);
+				fail++;
+				continue;
+			}
+			if(blkfill(c) != fill){
+				fprint(fd, "mismatched block fill\n");
+				fail++;
+			}
+			if(checktree(fd, c, h - 1, &x, &y))
+				fail++;
+			dropblk(c);
+		}
+		r = keycmp(&x, &y);
+		switch(r){
+		case -1:
+			break;
+		case 0:
+			fprint(fd, "duplicate keys %P, %P\n", &x, &y);
+			fail++;
+			break;
+		case 1:
+			fprint(fd, "misordered keys %P, %P\n", &x, &y);
+			fail++;
+			break;
+		}
+		x = y;
+	}
+	if(b->type == Tpivot){
+		/* the last value's child covers [y, hi) */
+		getval(b, b->nval-1, &y);
+		bp = getptr(&x, &fill);
+		if((c = getblk(bp, 0)) == nil){
+			fprint(fd, "corrupt block: %B\n", bp);
+			fail++;
+		}
+		if(c != nil && checktree(fd, c, h - 1, &y, nil))
+			fail++;
+		dropblk(c);
+		if(b->nbuf > 0){
+			getmsg(b, 0, &mx);
+			if(hi && keycmp(&mx, hi) >= 0){
+				fprint(fd, "out of range messages %P != %M\n", hi, &mx);
+				fail++;
+			}
+		}
+		for(i = 1; i < b->nbuf; i++){
+			getmsg(b, i, &my);
+			switch(my.op){
+			case Owstat:		/* kvp dirent */
+				if((my.v[0] & ~(Owsize|Owmode|Owmtime|Owatime|Owuid|Owgid|Owmuid)) != 0){
+					fprint(fd, "invalid stat op %x\n", my.v[0]);
+					fail++;
+				}
+				break;
+			default:
+				if(my.op <= 0 || my.op >= Nmsgtype){
+					fprint(fd, "invalid message op %d\n", my.op);
+					fail++;
+				}
+				break;
+			}
+			if(hi && keycmp(&y, hi) > 0){
+				fprint(fd, "out of range keys %P >= %P\n", &y, hi);
+				fail++;
+			}
+			if(keycmp(&mx, &my) == 1){
+				/*
+				 * NOTE(review): prints the last values, not the
+				 * offending messages mx/my — confirm intent.
+				 */
+				fprint(fd, "misordered keys %P, %P\n", &x, &y);
+				fail++;
+				break;
+			}
+			mx = my;
+		}
+
+	}
+	return fail;
+}
+
+/*
+ * Walks a log chain starting at hd, verifying that
+ * every block in it can be loaded.  Returns 1 on
+ * success, 0 if a block fails to load.
+ */
+static int
+checklog(int fd, Bptr hd)
+{
+	Bptr bp, nb;
+	Blk *b;
+
+	/* (the old dead initialization of bp was dropped; the loop sets it) */
+	for(bp = hd; bp.addr != -1; bp = nb){
+		if(waserror()){
+			fprint(fd, "error loading %B\n", bp);
+			return 0;
+		}
+		b = getblk(bp, 0);
+		nb = b->logp;
+		dropblk(b);
+		poperror();
+	}
+	return 1;
+}
+
+/*
+ * Validates every arena's free tree (ranges sorted and
+ * disjoint) and walks its free list log.  Returns the
+ * number of ordering/overlap failures.
+ * NOTE(review): avlmin may return nil for an empty tree,
+ * which would be passed to avlnext — confirm arenas
+ * always have at least one free range.
+ */
+static int
+checkfree(int fd)
+{
+	Arena *a;
+	Arange *r, *n;
+	int i, fail;
+
+	fail = 0;
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		qlock(a);
+		r = (Arange*)avlmin(a->free);
+		for(n = (Arange*)avlnext(r); n != nil; n = (Arange*)avlnext(n)){
+			if(r->off >= n->off){
+				fprint(2, "misordered length %llx >= %llx\n", r->off, n->off);
+				fail++;
+			}
+			if(r->off+r->len >= n->off){
+				fprint(2, "overlaping range %llx+%llx >= %llx\n", r->off, r->len, n->off);
+				fail++;
+			}
+			r = n;
+		}
+		if(!checklog(fd, a->loghd))
+			fprint(fd, "arena %d: broken freelist\n", i);
+		qunlock(a);
+	}
+	return fail;
+}
+
+/*
+ * Walks the snapshot deadlist log and every per-snap
+ * deadlist found by scanning Kdlist keys in the snap
+ * tree, reporting any chain that fails to load.
+ */
+static int
+checkdlist(int fd)
+{
+	char pfx[1];
+	Dlist dl;
+	Scan s;
+
+	checklog(fd, fs->snapdl.hd);
+	pfx[0] = Kdlist;
+	btnewscan(&s, pfx, 1);
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		kv2dlist(&s.kv, &dl);
+		if(!checklog(fd, dl.hd))
+			print("bad dlist %P: %s\n", &s.kv, errmsg());
+	}
+	btexit(&s);
+	return 0;
+}
+
+/*
+ * Scans tree t's Klabel entries, verifying that each
+ * referenced data block is not on the free list and
+ * can be read raw.  Raises an error on a free block
+ * in use.
+ */
+static int
+checkdata(int, Tree *t)
+{
+	char pfx[1];
+	Bptr bp;
+	Scan s;
+	Blk *b;
+
+	pfx[0] = Klabel;
+	btnewscan(&s, pfx, 1);
+	btenter(t, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		bp = unpackbp(s.kv.v, s.kv.nv);
+		if(isfree(bp.addr)){
+			fprint(2, "free block in use: %B\n", bp);
+			error("free block in use");
+		}
+		b = getblk(bp, GBraw);
+		dropblk(b);
+	}
+	btexit(&s);
+	return 0;
+}
+
+/*
+ * Full consistency check: freelist, deadlists, the snap
+ * tree, and every labeled snapshot's tree and data.
+ * Marks the fs read-only for the duration; returns 1 if
+ * everything checked out, 0 otherwise.
+ */
+int
+checkfs(int fd)
+{
+	int ok, height;
+	char pfx[1], name[Keymax+1];
+	Tree *t;
+	Scan s;
+	Blk *b;
+
+	ok = 1;
+	aincl(&fs->rdonly, 1);
+	epochwait();
+	/*
+	 * NOTE(review): this early return leaves fs->rdonly
+	 * elevated (no matching aincl(-1)) — confirm that is
+	 * the intended behavior on a failed check.
+	 */
+	if(waserror()){
+		fprint(fd, "error checking %s\n", errmsg());
+		return 0;
+	}
+	fprint(fd, "checking freelist\n");
+	if(checkfree(fd))
+		ok = 0;
+	fprint(fd, "checking deadlist\n");
+	if(checkdlist(fd))
+		ok = 0;
+	fprint(fd, "checking snap tree: %B\n", fs->snap.bp);
+	if((b = getroot(&fs->snap, &height)) != nil){
+		if(checktree(fd, b, height-1, nil, 0))
+			ok = 0;
+		dropblk(b);
+	}
+	pfx[0] = Klabel;
+	btnewscan(&s, pfx, 1);
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		if(waserror()){
+			fprint(fd, "moving on: %s\n", errmsg());
+			continue;
+		}
+		memcpy(name, s.kv.k+1, s.kv.nk-1);
+		name[s.kv.nk-1] = 0;
+		if((t = opensnap(name, nil)) == nil){
+			fprint(2, "invalid snap label %s\n", name);
+			ok = 0;
+			break;
+		}
+		fprint(fd, "checking snap %s: %B\n", name, t->bp);
+		b = getroot(t, &height);
+		if(checktree(fd, b, height-1, nil, 0))
+			ok = 0;
+		if(checkdata(fd, t))
+			ok = 0;
+		dropblk(b);
+		poperror();
+	}
+	btexit(&s);
+	aincl(&fs->rdonly, -1);
+	poperror();
+	return ok;
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/cons.c
@@ -1,0 +1,439 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+#include <bio.h>
+
+#include "dat.h"
+#include "fns.h"
+
+typedef struct Cmd	Cmd;
+
+struct Cmd {
+	char	*name;
+	char	*sub;
+	int	minarg;
+	int	maxarg;
+	void	(*fn)(int, char**, int);
+};
+
+static void
+setdbg(int fd, char **ap, int na)
+{
+	/* "debug n" sets the debug level explicitly; bare "debug" toggles it */
+	if(na == 1)
+		debug = atoi(ap[0]);
+	else
+		debug = !debug;
+	fprint(fd, "debug → %d\n", debug);
+}
+
+static void
+sendsync(int fd, int halt)
+{
+	Amsg *a;
+
+	/*
+	 * Queue a sync request to the admin worker; halt asks the
+	 * worker to shut the fs down after the sync completes.
+	 * Completion is reported on fd by the worker.
+	 */
+	if((a = mallocz(sizeof(Amsg), 1)) == nil){
+		fprint(fd, "alloc sync msg: %r\n");
+		return;
+	}
+	a->op = AOsync;
+	a->halt = halt;
+	a->fd = fd;
+	chsend(fs->admchan, a);
+}
+
+static void
+syncfs(int fd, char **, int)
+{
+	/* console "sync": flush pending writes, then acknowledge */
+	sendsync(fd, 0);
+	fprint(fd, "synced\n");
+}
+
+static void
+haltfs(int fd, char **, int)
+{
+	/* console "halt": sync with the halt flag set, then announce shutdown */
+	sendsync(fd, 1);
+	fprint(fd, "gefs: ending...\n");
+}
+
+static void
+listsnap(int fd)
+{
+	char pfx[Snapsz];
+	Scan s;
+	uint flg;
+	int sz;
+
+	/*
+	 * Print every snapshot label in the snap tree, one per line,
+	 * with any flags (mutable/auto/tsnap) shown in brackets.
+	 */
+	pfx[0] = Klabel;
+	sz = 1;
+	btnewscan(&s, pfx, sz);
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		/* flag bits sit after a tag byte and an 8-byte snap id in
+		 * the label value -- TODO confirm against the packing code */
+		flg = UNPACK32(s.kv.v+1+8);
+		fprint(fd, "snap %.*s", s.kv.nk-1, s.kv.k+1);
+		if(flg != 0)
+			fprint(fd, " [");
+		if(flg & Lmut)
+			fprint(fd, " mutable");
+		if(flg & Lauto)
+			fprint(fd, " auto");
+		if(flg & Ltsnap)
+			fprint(fd, " tsnap");
+		if(flg != 0)
+			fprint(fd, " ]");
+		fprint(fd, "\n");
+	}
+	btexit(&s);
+}
+
+static void
+snapfs(int fd, char **ap, int na)
+{
+	Amsg *a;
+	int i;
+
+	/*
+	 * Console "snap" command: list (-l), create, rename, or
+	 * delete (-d) snapshot labels.  The work itself is queued to
+	 * the admin worker; this only parses flags and fills in the
+	 * request.
+	 */
+	if((a = mallocz(sizeof(Amsg), 1)) == nil){
+		fprint(fd, "alloc sync msg: %r\n");
+		return;
+	}
+	a->op = AOsnap;
+	a->fd = fd;
+	a->flag = Ltsnap;
+	/*
+	 * Bound the flag loop by na: a flags-only command line used to
+	 * walk past the tokenized slots of f[] and read an
+	 * uninitialized argv pointer.
+	 */
+	while(na > 0 && ap[0][0] == '-'){
+		for(i = 1; ap[0][i]; i++){
+			switch(ap[0][i]){
+			case 'S':	a->flag &= ~Ltsnap;	break;
+			case 'm':	a->flag |= Lmut;	break;
+			case 'd':	a->delete++;		break;
+			case 'l':
+				listsnap(fd);
+				free(a);
+				return;
+			default:
+				fprint(fd, "usage: snap -[Smdl] [old [new]]\n");
+				free(a);
+				return;
+			}
+		}
+		na--;
+		ap++;
+	}
+	/* delete takes exactly one label; create/rename take old and new */
+	if(a->delete && na != 1 || !a->delete && na != 2){
+		fprint(fd, "usage: snap -[md] old [new]\n");
+		free(a);
+		return;
+	}
+	if(na >= 1)
+		strecpy(a->old, a->old+sizeof(a->old), ap[0]);
+	if(na >= 2)
+		strecpy(a->new, a->new+sizeof(a->new), ap[1]);
+	sendsync(fd, 0);
+	chsend(fs->admchan, a);
+}
+
+static void
+fsckfs(int fd, char**, int)
+{
+	/* console "check": run the full consistency check and report */
+	if(checkfs(fd))
+		fprint(fd, "ok\n");
+	else
+		fprint(fd, "broken\n");
+}
+
+static void
+refreshusers(int fd, char **, int)
+{
+	Mount *mnt;
+
+	/* reload the user table from the adm snapshot */
+	if((mnt = getmount("adm")) == nil){
+		fprint(fd, "load users: missing 'adm'\n");
+		return;
+	}
+	if(waserror()){
+		fprint(fd, "load users: %s\n", errmsg());
+		clunkmount(mnt);
+		return;
+	}
+	loadusers(fd, mnt->root);
+	fprint(fd, "refreshed users\n");
+	clunkmount(mnt);
+	/* balance the waserror() above; returning without this leaked an error label */
+	poperror();
+}
+
+static void
+showbstate(int fd, char**, int)
+{
+	char *p, fbuf[8];
+	Blk *b;
+
+	/*
+	 * Dump the state of every block in the block array blkbuf
+	 * (defined elsewhere; fs->cmax entries).  Flags are rendered
+	 * as a short string: d=dirty f=final F=freed c=cached
+	 * q=queued L=limbo.  The pointer fields printed at the end
+	 * are debug breadcrumbs recorded by the block's last users.
+	 */
+	for(b = blkbuf; b != blkbuf+fs->cmax; b++){
+		p = fbuf;
+		if(b->flag & Bdirty)	*p++ = 'd';
+		if(b->flag & Bfinal)	*p++ = 'f';
+		if(b->flag & Bfreed)	*p++ = 'F';
+		if(b->flag & Bcached)	*p++ = 'c';
+		if(b->flag & Bqueued)	*p++ = 'q';
+		if(b->flag & Blimbo)	*p++ = 'L';
+		*p = 0;
+		fprint(fd, "blk %#p type %d flag %s bp %B ref %ld alloc %#p queued %#p, hold %#p drop %#p cached %#p\n",
+			b, b->type, fbuf, b->bp, b->ref, b->alloced, b->queued, b->lasthold, b->lastdrop, b->cached);
+	}
+}
+
+static void
+showusers(int fd, char**, int)
+{
+	User *u, *v;
+	int i, j;
+	char *sep;
+
+	/*
+	 * Print the in-memory user table, one line per user, in
+	 * "id:name:leader:member,member,..." form.  Unknown uids
+	 * referenced as leader or member are shown as "???".
+	 */
+	rlock(&fs->userlk);
+	for(i = 0; i < fs->nusers; i++){
+		u = &fs->users[i];
+		fprint(fd, "%d:%s:", u->id, u->name);
+		if((v = uid2user(u->lead)) == nil)
+			fprint(fd, "???:");
+		else
+			fprint(fd, "%s:", v->name);
+		sep = "";
+		for(j = 0; j < u->nmemb; j++){
+			if((v = uid2user(u->memb[j])) == nil)
+				fprint(fd, "%s???", sep);
+			else
+				fprint(fd, "%s%s", sep, v->name);
+			sep = ",";
+		}
+		fprint(fd, "\n");
+	}
+	runlock(&fs->userlk);
+}
+
+static void
+showdf(int fd, char**, int)
+{
+	char *units[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", nil};
+	vlong size, used, free;
+	double hsize, hused, hfree;
+	double pct;
+	Arena *a;
+	int i, us, uu, uf;
+
+	/*
+	 * Print per-arena and total usage, with totals also shown in
+	 * human-readable units (scaled until < 500 of the next unit).
+	 */
+	size = 0;
+	used = 0;
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		qlock(a);
+		size += a->size;
+		used += a->used;
+		qunlock(a);
+		/* NOTE(review): a->used/a->size are re-read after qunlock
+		 * here, so the printed per-arena numbers can race -- confirm */
+		fprint(fd, "arena %d: %llx/%llx (%.2f%%)\n", i, a->used, a->size, 100*(double)a->used/(double)a->size);
+	}
+	free = size - used;
+	hsize = size;
+	hused = used;
+	hfree = free;
+	for(us = 0; us < nelem(units)-1 && hsize >= 500 ; us++)
+		hsize /= 1024;
+	for(uu = 0; uu < nelem(units)-1 && hused >= 500 ; uu++)
+		hused /= 1024;
+	for(uf = 0; uf < nelem(units)-1 && hfree >= 500 ; uf++)
+		hfree /= 1024;
+	pct = 100.0*(double)used/(double)size;
+	fprint(fd, "fill:\t%.2f%%\n", pct);
+	fprint(fd, "used:\t%lld (%.2f %s)\n", used, hused, units[uu]);
+	fprint(fd, "size:\t%lld (%.2f %s)\n", size, hsize, units[us]);
+	fprint(fd, "free:\t%lld (%.2f %s)\n", free, hfree, units[uf]);
+}
+
+void
+showfid(int fd, char**, int)
+{
+	int i;
+	Fid *f;
+	Conn *c;
+
+	/*
+	 * Dump every fid of every connection: hash slot, allocation
+	 * site, fid number, dent refcount, key, and qid.  Each hash
+	 * chain is walked under its own lock.
+	 */
+	for(c = fs->conns; c != nil; c = c->next){
+		fprint(fd, "fids:\n");
+		for(i = 0; i < Nfidtab; i++){
+			lock(&c->fidtablk[i]);
+			for(f = c->fidtab[i]; f != nil; f = f->next){
+				rlock(f->dent);
+				fprint(fd, "\tfid[%d] from %#zx: %d [refs=%ld, k=%K, qid=%Q]\n",
+					i, getmalloctag(f), f->fid, f->dent->ref, &f->dent->Key, f->dent->qid);
+				runlock(f->dent);
+			}
+			unlock(&c->fidtablk[i]);
+		}
+	}
+}
+
+void
+showtree(int fd, char **ap, int na)
+{
+	char *name;
+	Tree *t;
+	Blk *b;
+	int h;
+
+	/*
+	 * Debug dump of a tree: "show tree" dumps the main tree,
+	 * "show tree snap" the snapshot tree, and "show tree <name>"
+	 * the tree of the named snapshot.
+	 */
+	name = "main";
+	/* was memset(&t, 0, sizeof(t)): that only zeroed the pointer itself */
+	t = nil;
+	if(na == 1)
+		name = ap[0];
+	if(strcmp(name, "snap") == 0)
+		t = &fs->snap;
+	else if((t = opensnap(name, nil)) == nil){
+		fprint(fd, "open %s: %r\n", name);
+		return;
+	}
+	b = getroot(t, &h);
+	fprint(fd, "=== [%s] %B @%d\n", name, t->bp, t->ht);
+	showblk(fd, b, "contents", 1);
+	dropblk(b);
+	if(t != &fs->snap)
+		closesnap(t);
+}
+
+static void
+permflip(int fd, char **ap, int)
+{
+	/* console "permit": switch permissive mode on or off */
+	if(strcmp(ap[0], "on") == 0)
+		permissive = 1;
+	else if(strcmp(ap[0], "off") == 0)
+		permissive = 0;
+	else{
+		fprint(2, "unknown permissive %s\n", ap[0]);
+		/* don't print a bogus transition when nothing changed */
+		return;
+	}
+	fprint(fd, "permissive: %d → %d\n", !permissive, permissive);
+}
+
+static void
+savetrace(int fd, char **ap, int na)
+{
+	Biobuf *bfd;
+	Trace *t;
+	int i;
+
+	/*
+	 * Dump the in-memory trace ring to the console (no args) or
+	 * to the named file.  Entries print oldest-first starting at
+	 * traceidx; slots with an empty msg were never written.
+	 */
+	if(na == 0)
+		bfd = Bfdopen(dup(fd, -1), OWRITE);
+	else
+		bfd = Bopen(ap[0], OWRITE);
+	if(bfd == nil){
+		/* NOTE(review): if Bfdopen fails, the dup'd fd leaks -- confirm */
+		fprint(fd, "error opening output");
+		return;
+	}
+	for(i = 0; i < fs->ntrace; i++){
+		t = &fs->trace[(fs->traceidx + i) % fs->ntrace];
+		if(t->msg[0] == 0)
+			continue;
+		Bprint(bfd, "[%d@%d] %s", t->tid, t->qgen, t->msg);
+		/* -1 marks an unused bp/value slot (see struct Trace usage) */
+		if(t->bp.addr != -1)
+			Bprint(bfd, " %B", t->bp);
+		if(t->v0 != -1)
+			Bprint(bfd, " %llx", t->v0);
+		if(t->v1 != -1)
+			Bprint(bfd, " %llx", t->v1);
+		Bprint(bfd, "\n");
+	}
+	Bterm(bfd);
+	fprint(fd, "saved\n");
+}
+
+static void
+unreserve(int fd, char **ap, int)
+{
+	/*
+	 * Toggle use of the block reserve.  NOTE(review): "on" clears
+	 * usereserve and "off" sets it, matching this function's
+	 * "unreserve" sense, but cmdtab registers it as "reserve" and
+	 * help says "enable block reserves" -- confirm the polarity.
+	 */
+	if(strcmp(ap[0], "on") == 0)
+		usereserve = 0;
+	else if(strcmp(ap[0], "off") == 0)
+		usereserve = 1;
+	else{
+		fprint(2, "unknown reserve %s\n", ap[0]);
+		return;
+	}
+	/* report the variable actually changed (was printing permissive) */
+	fprint(fd, "reserve: %d → %d\n", !usereserve, usereserve);
+}
+
+static void
+help(int fd, char**, int)
+{
+	/* console command summary; keep in sync with cmdtab */
+	char *msg =
+		"help -- show this help\n"
+		"check -- check for consistency\n"
+		"df -- show disk usage\n"
+		"halt -- stop all writers, sync, and go read-only\n"
+		"permit [on|off] -- switch to/from permissive mode\n"
+		"reserve [on|off] -- enable block reserves\n"
+		"snap -[Smdl] [old [new]] -- manage snapshots\n"
+		"sync -- flush all pending writes to disk\n"
+		"users -- reload user table from adm snapshot\n"
+		"save trace [name] -- save a trace of recent activity\n"
+		"show -- debug dumps\n"
+		"	tree [name]\n"
+		"	fid\n"
+		"	users\n";
+	fprint(fd, "%s", msg);
+}
+
+/* console command dispatch table; terminated by a nil name */
+Cmd cmdtab[] = {
+	/* admin */
+	{.name="check",		.sub=nil,	.minarg=0, .maxarg=0, .fn=fsckfs},
+	{.name="df",		.sub=nil, 	.minarg=0, .maxarg=0, .fn=showdf},
+	{.name="halt",		.sub=nil,	.minarg=0, .maxarg=0, .fn=haltfs},
+	{.name="help",		.sub=nil,	.minarg=0, .maxarg=0, .fn=help},
+	{.name="permit",	.sub=nil,	.minarg=1, .maxarg=1, .fn=permflip},
+	{.name="snap",		.sub=nil,	.minarg=1, .maxarg=3, .fn=snapfs},
+	{.name="sync",		.sub=nil,	.minarg=0, .maxarg=0, .fn=syncfs},
+	/* unreserve() always reads ap[0], so one argument is mandatory */
+	{.name="reserve",	.sub=nil,	.minarg=1, .maxarg=1, .fn=unreserve},
+	{.name="users",		.sub=nil,	.minarg=0, .maxarg=1, .fn=refreshusers},
+
+	/* debugging */
+	{.name="show",		.sub="fid",	.minarg=0, .maxarg=0, .fn=showfid},
+	{.name="show",		.sub="tree",	.minarg=0, .maxarg=1, .fn=showtree},
+	{.name="show",		.sub="users",	.minarg=0, .maxarg=0, .fn=showusers},
+	{.name="show",		.sub="bstate",	.minarg=0, .maxarg=0, .fn=showbstate},
+	{.name="debug",		.sub=nil,	.minarg=0, .maxarg=1, .fn=setdbg},
+	{.name="save",		.sub="trace",	.minarg=0, .maxarg=1, .fn=savetrace},
+	{.name=nil, .sub=nil},
+};
+
+void
+runcons(int tid, void *pfd)
+{
+	char buf[256], *f[4], **ap;
+	int i, n, nf, na, fd;
+	Cmd *c;
+
+	/*
+	 * Console repl: read a line, split it into at most 4 tokens,
+	 * and dispatch to the first cmdtab entry whose name (and
+	 * subcommand, if any) and argument count match.
+	 */
+	fd = (uintptr)pfd;
+	while(1){
+		fprint(fd, "gefs# ");
+		if((n = read(fd, buf, sizeof(buf)-1)) == -1)
+			break;
+		/* commands run inside an epoch so blocks they touch stay live */
+		epochstart(tid);
+		buf[n] = 0;
+		nf = tokenize(buf, f, nelem(f));
+		if(nf == 0 || strlen(f[0]) == 0)
+			goto Next;
+		for(c = cmdtab; c->name != nil; c++){
+			ap = f;
+			na = nf;
+			if(strcmp(c->name, *ap) != 0)
+				continue;
+			ap++;
+			na--;
+			if(c->sub != nil){
+				if(na == 0 || strcmp(c->sub, *ap) != 0)
+					continue;
+				ap++;
+				na--;
+			}
+			if(na < c->minarg || na > c->maxarg)
+				continue;
+			c->fn(fd, ap, na);
+			break;
+		}
+		/* fell off the table: no entry matched name/sub/arity */
+		if(c->name == nil){
+			fprint(fd, "unknown command '%s", f[0]);
+			for(i = 1; i < nf; i++)
+				fprint(fd, " %s", f[i]);
+			fprint(fd, "'\n");
+		}
+Next:
+		epochend(tid);
+	}
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/dat.h
@@ -1,0 +1,763 @@
+typedef struct Blk	Blk;
+typedef struct Amsg	Amsg;
+typedef struct Gefs	Gefs;
+typedef struct Errctx	Errctx;
+typedef struct Fmsg	Fmsg;
+typedef struct Fid	Fid;
+typedef struct Msg	Msg;
+typedef struct Key	Key;
+typedef struct Val	Val;
+typedef struct Kvp	Kvp;
+typedef struct Xdir	Xdir;
+typedef struct Bptr	Bptr;
+typedef struct Bfree	Bfree;
+typedef struct Scan	Scan;
+typedef struct Dent	Dent;
+typedef struct Scanp	Scanp;
+typedef struct Arena	Arena;
+typedef struct Arange	Arange;
+typedef struct Bucket	Bucket;
+typedef struct Chan	Chan;
+typedef struct Syncq	Syncq;
+typedef struct Qent	Qent;
+typedef struct Trace	Trace;
+typedef struct Tree	Tree;
+typedef struct Dlist	Dlist;
+typedef struct Mount	Mount;
+typedef struct User	User;
+typedef struct Conn	Conn;
+
+enum {
+	KiB	= 1024ULL,
+	MiB	= 1024ULL*KiB,
+	GiB	= 1024ULL*MiB,
+	TiB	= 1024ULL*GiB,
+
+	Lgblk	= 14,
+	Blksz	= (1ULL<<Lgblk),
+
+	Nrefbuf	= 1024,			/* number of ref incs before syncing */
+	Nfidtab	= 1024,			/* number of fid hash entries */
+	Nflushtab = 1024,		/* flush table size */
+	Ndtab	= 1024,			/* number of dir tab entries */
+	Max9p	= 32*KiB,		/* biggest message size we're willing to negotiate */
+	Nsec	= 1000LL*1000*1000,	/* nanoseconds to the second */
+	Maxname	= 256,			/* maximum size of a name element */
+	Maxent	= 9+Maxname+1,		/* maximum size of ent key, with terminator */
+	Maxtag	= 1<<16,		/* maximum tag in 9p */
+
+	/*
+	 * Kpmax must be no more than 1/4 of pivspc, or
+	 * there is no way to get a valid split of a
+	 * maximally filled tree.
+	 */
+	Keymax	= 128,			/* key data limit */
+	Inlmax	= 512,			/* inline data limit */
+	Ptrsz	= 24,			/* off, hash, gen */
+	Pptrsz	= 26,			/* off, hash, gen, fill */
+	Fillsz	= 2,			/* block fill count */
+	Offksz	= 17,			/* type, qid, off */
+	Snapsz	= 9,			/* tag, snapid */
+	Dpfxsz	= 9,			/* directory prefix */
+	Upksz	= 9,			/* up key size: tag, qid */
+	Dlksz	= 1+8+8,		/* tag, death, birth */
+	Dlvsz	= Ptrsz+Ptrsz,		/* hd,tl of deadlist */
+	Dlkvpsz	= Dlksz+Dlvsz,		/* full size of dlist kvp */
+	Treesz	= 4+4+4+4		/* ref, ht, flg, gen, pred, succ, base, root */
+		  +8+8+8+8+Ptrsz,
+	Kvmax	= Keymax + Inlmax,	/* Key and value */
+	Kpmax	= Keymax + Ptrsz,	/* Key and pointer */
+	Wstatmax = 4+8+8+8,		/* mode, size, atime, mtime */
+	Arenasz	= 8+8+8+8,		/* loghd, loghash, size, used */
+	
+	Pivhdsz		= 10,
+	Leafhdsz	= 6,
+	Loghdsz		= 2+2+8+Ptrsz,			/* type, len, hash, chain */
+	Rootsz		= 4+Ptrsz,			/* root pointer */
+	Pivsz		= Blksz - Pivhdsz,
+	Bufspc		= (Blksz - Pivhdsz)/2,		/* pivot room */
+	Pivspc		= Blksz - Pivhdsz - Bufspc,
+	Logspc		= Blksz - Loghdsz,
+	Logslop		= 16+16+8,			/* val, nextb, chain */
+	Leafspc 	= Blksz - Leafhdsz,
+	Msgmax  	= 1 + (Kvmax > Kpmax ? Kvmax : Kpmax),
+	Estacksz	= 64,
+};
+
+enum {
+	Eactive	= 1UL<<30,	/* epoch active flag */
+};
+
+enum {
+	/*
+	 * dent: pqid[8] qid[8] -- a directory entry key.
+	 * ptr:  off[8] hash[8] gen[8] -- a key for an Dir block.
+	 * dir:  serialized Xdir
+	 */
+
+	/* fs keys */
+	Kdat,	/* qid[8] off[8] => ptr:		pointer to data page */
+	Kent,	/* pqid[8] name[n] => dir[n]:		serialized Dir */
+	Kup,	/* qid[8] => Kent:			parent dir */
+
+	/* snapshot keys */
+	Klabel,	/* name[] => snapid[]:			snapshot label */
+	Ksnap,	/* sid[8] => ref[8], tree[52]:		snapshot root */
+	Kdlist,	/* snap[8] gen[8] => hd[ptr],tl[ptr]	deadlist  */
+};
+
+enum {
+	Bdirty	= 1 << 0,
+	Bfinal	= 1 << 1,
+	Bfreed	= 1 << 2,
+	Bcached	= 1 << 3,
+	Bqueued	= 1 << 4,
+	Blimbo	= 1 << 5,
+};
+
+enum {
+	Lmut	= 1 << 0,	/* can we modify snaps via this label */
+	Lauto	= 1 << 1,	/* was this label generated automatically */
+	Ltsnap	= 1 << 2,	/* should we skip the timed snapshots */
+};
+
+enum {
+	Qdump = 1ULL << 63,
+};
+
+#define Zb (Bptr){-1, -1, -1}
+
+/* internal errors */
+#define Efs	(abort(), "fs broke")
+extern char Ecorrupt[];
+extern char Efsvers[];
+extern char Eimpl[];
+extern char Ebotch[];
+extern char Eio[];
+extern char Enofid[];
+extern char Efid[];
+extern char Etype[];
+extern char Edscan[];
+extern char Esrch[];
+extern char Eexist[];
+extern char Emode[];
+extern char Efull[];
+extern char Estuffed[];
+extern char Eauth[];
+extern char Elength[];
+extern char Eperm[];
+extern char Einuse[];
+extern char Ebadf[];
+extern char Ename[];
+extern char Enomem[];
+extern char Eattach[];
+extern char Enosnap[];
+extern char Esnap[];
+extern char Edir[];
+extern char Esyntax[];
+extern char Enouser[];
+extern char Enogrp[];
+extern char Efsize[];
+extern char Ebadu[];
+extern char Erdonly[];
+extern char Elocked[];
+extern char Eauthp[];
+extern char Eauthd[];
+extern char Eauthph[];
+extern char Ephase[];
+extern char Enone[];
+extern char Enoauth[];
+
+extern char Ewstatb[];
+extern char Ewstatd[];
+extern char Ewstatg[];
+extern char Ewstatl[];
+extern char Ewstatm[];
+extern char Ewstato[];
+extern char Ewstatp[];
+extern char Ewstatq[];
+extern char Ewstatu[];
+extern char Ewstatv[];
+extern char Enempty[];
+
+/*
+ * All metadata blocks share a common header:
+ * 
+ *	type[2]
+ *
+ * The None type is reserved for file data blocks
+ * and refcount blocks.
+ *
+ * The superblock has this layout:
+ *	version[8]	always "gefsNNNNN"
+ *	blksz[4]	block size in bytes
+ *	bufsz[4]	portion of leaf nodes
+ *			allocated to buffers,
+ *			in bytes
+ *	height[4]	tree height of root node
+ *	rootb[8]	address of root in last
+ *			snapshot.
+ *	rooth[8]	hash of root node
+ *	narena[4]	number of arenas in tree
+ *	flag[8]	feature flag
+ *	gen[8]		The flush generation
+ *
+ * The arena zone blocks have this layout, and
+ * are overwritten in place:
+ *
+ *	log[8]		The head of the alloc log
+ *	logh[8]		The hash of the alloc log
+ *
+ * The log blocks have this layout, and are one of
+ * two types of blocks that get overwritten in place:
+ *
+ *	hash[8]		The hash of the previous log block
+ *
+ *	The remainder of the block is filled with log
+ *	entries. Each log entry has at least 8 bytes
+ *	of entry. Some are longer. The opcode is or'ed
+ *	into the low order bits of the first vlong.
+ *	These ops take the following form:
+ *
+ *	Alloc, Free:
+ *		off[8] len[8]
+ *	Alloc1, Free1:
+ *		off[8]
+ *	Ref:
+ *		off[8]
+ *	Flush:	
+ *		gen[8]
+ *
+ * Pivots have the following layout:
+ *
+ *	nval[2]
+ *	valsz[2]
+ *	nbuf[2]
+ *	bufsz[2]
+ *
+ * Leaves have the following layout:
+ *
+ *	nval[2]
+ *	valsz[2]
+ *	pad[4]
+ *
+ * Within these nodes, pointers have the following
+ * layout:
+ *
+ *	off[8] hash[8] fill[2]
+ */
+enum {
+	Tdat,
+	Tpivot,
+	Tleaf,
+	Tlog,
+	Tdlist,
+	Tarena,
+	Tsuper = 0x6765,	/* 'ge' bigendian */
+};
+
+enum {
+	Vinl,	/* Inline value */
+	Vref,	/* Block pointer */
+};
+
+enum {
+	GBraw	= 1<<0,
+	GBwrite	= 1<<1,
+	GBnochk	= 1<<2,
+	GBsoftchk = 1<<3,
+};
+
+enum {
+	Onop,		/* nothing */
+	Oinsert,	/* new kvp */
+	Odelete,	/* delete kvp */
+	Oclearb,	/* free block ptr if exists */
+	Oclobber,	/* remove file if it exists */
+	Owstat,		/* update kvp dirent */
+	Orelink,	/* rechain forwards */
+	Oreprev,	/* rechain backwards */
+	Nmsgtype,	/* maximum message type */
+};
+
+enum {
+	Magic = 0x979b929e98969c8c,
+};
+
+/*
+ * Wstat ops come with associated data, in the order
+ * of the bit flag.
+ */
+enum{
+	/* wstat flag */
+	Owsize	= 1<<0,	/* [8]fsize: update file size */
+	Owmode	= 1<<1,	/* [4]mode: update file mode */
+	Owmtime	= 1<<2, /* [8]mtime: update mtime, in nsec */
+	Owatime	= 1<<3, /* [8]atime: update atime, in nsec */
+	Owuid	= 1<<4,	/* [4]uid: set uid */
+	Owgid	= 1<<5,	/* [4]uid: set gid */
+	Owmuid	= 1<<6,	/* [4]uid: set muid */
+};
+
+/*
+ * Operations for the allocation log.
+ */
+enum {
+	LogNop,		/* unused */
+	/* 1-wide entries */
+	LogAlloc1,	/* alloc a block */
+	LogFree1,	/* free a block */
+	LogSync,	/* sync barrier for replay */
+
+	/* 2-wide entries */
+#define	Log2wide	LogAlloc
+	LogAlloc,	/* alloc a range */
+	LogFree,	/* free a range */
+};
+
+enum {
+	AOnone,
+	AOsnap,
+	AOsync,
+	AOclear,
+	AOrclose,
+};
+
+struct Bptr {
+	vlong	addr;
+	uvlong	hash;
+	vlong	gen;
+};
+
+struct Key{
+	char	*k;
+	int	nk;
+};
+
+struct Val {
+	short	nv;
+	char	*v;
+};
+
+struct Kvp {
+	Key;
+	Val;
+};
+
+struct Msg {
+	char	op;
+	Kvp;
+};
+
+struct Dlist {
+	Dlist	*cnext;	/* cache next entry */
+	Dlist	*cprev;	/* cache prev entry */
+	Dlist	*chain;	/* hash table chain */
+	Blk	*ins;	/* loaded head */
+
+	vlong	gen;	/* deadlist gen */
+	vlong	bgen;	/* birth gen */
+	Bptr	hd;	/* deadlist head */
+	Bptr	tl;	/* deadlist tail */
+};
+
+/* per-thread error context backing waserror()/poperror() */
+struct Errctx {
+	long	tid;			/* owning thread id */
+	char	err[128];		/* current error string -- presumably read by errmsg(); confirm */
+	jmp_buf	errlab[Estacksz];	/* stack of waserror() labels */
+	int	nerrlab;		/* number of labels pushed */
+};
+
+/* a free extent; arenas keep these in an avl tree (Arena.free) */
+struct Arange {
+	Avl;
+	vlong	off;	/* start offset of the range */
+	vlong	len;	/* length of the range in bytes */
+};
+
+/* one slot of the block cache hash table (Gefs.bcache) */
+struct Bucket {
+	Lock;
+	Blk	*b;	/* head of the chain -- presumably linked via Blk.hnext; confirm */
+};
+
+/* request queued to the admin worker (fs->admchan); see cons.c */
+struct Amsg {
+	int	op;	/* AOnone, AOsnap, AOsync, AOclear, AOrclose */
+	int	fd;	/* console/client fd to report back on */
+	union {
+		struct {	/* AOsnap */
+			char	old[128];	/* existing label to act on */
+			char	new[128];	/* new label name */
+			int	flag;		/* Lmut/Lauto/Ltsnap bits */
+			char	delete;		/* nonzero: delete 'old' instead */
+
+		};
+		struct {	/* AOsync */
+			int	halt;	/* nonzero: stop the fs after syncing */
+		};
+		struct {	/* AOclear, AOrclose */
+			Mount	*mnt;
+			Dent	*dent;
+			vlong	qpath;
+			vlong	off;
+			vlong	end;
+		};
+	};
+};
+
+struct Fmsg {
+	Fcall;
+	Conn	*conn;
+	int	sz;	/* the size of the message buf */
+	uchar	buf[];
+};
+
+struct Tree {
+	/* in-memory */
+	Lock	lk;
+	long	memref;	/* number of in-memory references to this */
+	vlong	memgen;	/* wip next generation */
+	int	dirty;
+
+	/* on-disk */
+	int	nref;	/* number snapshots forked/after us */
+	int	nlbl;	/* number of labels referring to us */
+	int	ht;	/* height of the tree */
+	uint	flag;	/* flag set */
+	Bptr	bp;	/* block pointer of root */
+	vlong	gen;	/* generation */
+	vlong	pred;	/* previous snapshot */
+	vlong	succ;	/* next snapshot */
+	vlong	base;	/* base snapshot */
+};
+
+enum {
+	DFblk,
+	DFmnt,
+	DFtree,
+};
+
+/* deferred-release entry; op selects which member to release (DFblk/DFmnt/DFtree) */
+struct Bfree {
+	Bfree	*next;	/* list link (see Gefs.limbo) */
+	int	op;	/* DFblk, DFmnt, or DFtree */
+	Mount	*m;	/* mount to release (DFmnt) */
+	Tree	*t;	/* tree to release (DFtree) */
+	Blk	*b;	/* block to release (DFblk) */
+	Bptr	bp;	/* address of the block */
+};
+
+/* one entry of the user/group table (see showusers in cons.c) */
+struct User {
+	int	id;	/* numeric uid */
+	int	lead;	/* uid of the group leader */
+	int	*memb;	/* member uids */
+	int	nmemb;	/* number of entries in memb */
+	char	name[128];	/* user/group name */
+};
+
+enum {
+	/* in priority order */
+	Qnone,
+	Qfence,
+	Qwrite,
+	Qfree,
+};
+
+/* one pending write-queue entry, heap-ordered in a Syncq */
+struct Qent {
+	vlong	qgen;	/* queue generation -- presumably orders entries; confirm */
+	Bptr	bp;	/* address being written or freed */
+	Blk	*b;	/* block to write, if any */
+	int	op;	/* Qnone/Qfence/Qwrite/Qfree (in priority order) */
+};
+
+/* bounded priority queue feeding a disk-sync worker */
+struct Syncq {
+	QLock	lk;		/* guards the heap */
+	Rendez	fullrz;		/* waited on when the queue is full -- confirm */
+	Rendez	emptyrz;	/* waited on when the queue is empty -- confirm */
+	Qent	*heap;		/* heap of pending entries */
+	int	nheap;		/* entries in use */
+	int	heapsz;		/* heap capacity */
+};
+
+/* one slot of the debug trace ring (dumped by savetrace in cons.c) */
+struct Trace {
+	int	tid;	/* thread that logged the event */
+	int	qgen;	/* queue generation at log time */
+	char	msg[16];	/* short event tag; msg[0]==0 means slot unused */
+	Bptr	bp;	/* associated block pointer; addr==-1 if none */
+	vlong	v0;	/* optional value; -1 if unused */
+	vlong	v1;	/* optional value; -1 if unused */
+};
+
+/*
+ * Overall state of the file system.
+ * Shadows the superblock contents.
+ */
+struct Gefs {
+	int	blksz;
+	int	bufspc;
+	Tree	snap;
+	Dlist	snapdl;
+	int	narena;
+	vlong	flag;
+	vlong	nextqid;
+	vlong	nextgen;
+	vlong	qgen;
+	Bptr	*arenabp;
+
+	/* superblocks */
+	Blk	*sb0;	/* primary */
+	Blk	*sb1;	/* backup */
+
+	/* arena allocation */
+	Arena	*arenas;
+	long	roundrobin;
+	long	syncing;
+	long	nsyncers;
+	long	nreaders;
+
+	QLock	synclk;
+	Rendez	syncrz;
+
+	Mount	*mounts;
+	Mount	*snapmnt;
+	Lock	connlk;
+	Conn	*conns;
+
+	Chan	*wrchan;
+	Chan	*admchan;
+	Chan	**rdchan;
+
+	QLock	mutlk;
+	long	nworker;
+	long	epoch;
+	long	lepoch[32];
+	Bfree	*limbo[3];
+	long	nlimbo;
+
+	Syncq	syncq[32];
+
+
+	int	fd;
+	long	rdonly;
+	int	noauth;
+
+	/* user list */
+	RWLock	userlk;
+	User	*users;
+	int	nusers;
+
+	/* open directory entries */
+	Lock	dtablk;
+	Dent	*dtab[Ndtab];
+
+	/* slow block io */
+	QLock	blklk[32];
+	
+	/* deadlist cache */
+	Dlist	**dlcache;
+	Dlist	*dlhead;
+	Dlist	*dltail;
+	int	dlcount;
+	int	dlcmax;
+
+	/* block lru */
+	QLock	lrulk;
+	Rendez	lrurz;
+	Bucket	*bcache;
+	Blk	*chead;
+	Blk	*ctail;
+	usize	ccount;
+	usize	cmax;
+
+	RWLock	flushq[Nflushtab];
+	int	flushop[Nflushtab];
+
+	Trace	*trace;
+	long	traceidx;
+	long	ntrace;
+};
+
+struct Arena {
+	QLock;
+	Avltree *free;
+	Blk	**queue;
+	int	nqueue;
+	int	lbidx;
+	Blk	*logbuf[2];	/* preallocated log pages */
+	Blk	*h0;		/* arena header */
+	Blk	*h1;		/* arena footer */
+	Blk	**q;		/* write queue */
+	vlong	nq;
+	vlong	size;
+	vlong	used;
+	vlong	reserve;
+	/* allocation log */
+	vlong	nlog;		/* logged since last compression */
+	Bptr	loghd;		/* allocation log */
+	Blk	*logtl;		/* end of the log, open for writing */
+	Syncq	*sync;
+};
+
+struct Xdir {
+	/* file data */
+	uvlong	flag;	/* storage flag */
+	Qid	qid;	/* unique id from server */
+	ulong	mode;	/* permissions */
+	vlong	atime;	/* last read time: nsec */
+	vlong	mtime;	/* last write time: nsec */
+	uvlong	length;	/* file length */
+	int	uid;	/* owner name */
+	int	gid;	/* group name */
+	int	muid;	/* last modifier name */
+	char	*name;	/* last element of path */
+};
+
+struct Dent {
+	RWLock;
+	Key;
+	Xdir;
+	Dent	*next;
+	QLock	trunclk;
+	Rendez	truncrz;
+	vlong	up;
+	long	ref;
+	char	gone;
+	char	trunc;
+
+	char	buf[Maxent];
+};
+
+struct Mount {
+	Lock;
+	Mount	*next;
+	long	ref;
+	vlong	gen;
+	char	name[64];
+	Tree	*root;	/* EBR protected */
+
+	int	flag;
+
+	/* snapshot history */
+	char	minutely[60][128];
+	char	hourly[24][128];
+};
+
+struct Conn {
+	Conn	*next;
+	QLock	wrlk;
+	int	rfd;
+	int	wfd;
+	int	iounit;
+	int	versioned;
+
+	/* fid hash table */
+	Lock	fidtablk[Nfidtab];
+	Fid	*fidtab[Nfidtab];
+};
+
+struct Fid {
+	Lock;
+	Fid	*next;
+	/*
+	 * if opened with OEXEC, we want to use a snapshot,
+	 * instead of the most recent root, to prevent
+	 * paging in the wrong executable.
+	 */
+	Mount	*mnt;
+	Scan	*scan;	/* in progres scan */
+	Dent	*dent;	/* (pqid, name) ref, modified on rename */	
+	void	*auth;
+
+	u32int	fid;
+	vlong	qpath;
+	vlong	pqpath;
+	long	ref;
+	int	mode;
+	int	iounit;
+
+	int	uid;
+	int	duid;
+	int	dgid;
+	int	dmode;
+
+	char	permit;
+	char	rclose;
+};
+
+enum {
+	POmod,
+	POrot,
+	POsplit,
+	POmerge,
+};
+
+struct Scanp {
+	int	bi;
+	int	vi;
+	Blk	*b;
+};
+
+struct Scan {
+	vlong	offset;	/* last read offset */
+	char	first;
+	char	donescan;
+	char	overflow;
+	char	present;
+	int	ht;
+	Kvp	kv;
+	Key	pfx;
+	char	kvbuf[Kvmax];
+	char	pfxbuf[Keymax];
+	Scanp	*path;
+};
+
+struct Blk {
+	/* cache entry */
+	Blk	*cnext;
+	Blk	*cprev;
+	Blk	*hnext;
+
+	/* Freelist entry */
+	Blk	*fnext;
+
+	long	flag;
+
+	/* serialized to disk in header */
+	short	type;	/* @0, for all */
+	union {
+		struct {
+			short	nval;	/* @2, for Leaf, Pivot: data[0:2] */
+			short	valsz;	/* @4, for Leaf, Pivot: data[2:4] */
+			short   nbuf;	/* @6, for Pivot */
+			short   bufsz;	/* @8, for Pivot */
+		};
+		struct {
+			int	logsz;	/* @2 for allocation log */
+			uvlong	logh;	/* @4 for log body hash */
+			Bptr	logp;	/* @12 next deadlist chain */
+		};
+	};
+
+	/* debug */
+	uintptr queued;
+	uintptr lasthold;
+	uintptr lastdrop;
+	uintptr	enqueued;
+	uintptr cached;
+	uintptr uncached;
+	uintptr	alloced;
+	uintptr	freed;
+
+	Bptr	bp;
+	long	ref;
+	char	*data;
+	char	buf[Blksz];
+	vlong	magic;
+};
+
+struct Chan {
+	int	size;	/* size of queue */
+	long	count;	/* how many in queue (semaphore) */
+	long	avail;	/* how many available to send (semaphore) */
+	Lock	rl, wl;	/* circular pointers */
+	void	**rp;
+	void	**wp;
+	void*	args[];	/* list of saved pointers, [->size] */
+};
--- /dev/null
+++ b/sys/src/cmd/gefs/dump.c
@@ -1,0 +1,365 @@
+#include <u.h>
+#include <libc.h>
+#include <avl.h>
+#include <fcall.h>
+#include <ctype.h>
+
+#include "dat.h"
+#include "fns.h"
+
+char	spc[128];
+
+static int
+showkey(Fmt *fmt, Key *k)
+{
+	int n;
+
+	/*
+	 * Decode a btree key by its tag byte and print it in a
+	 * human-readable form; returns the number of characters
+	 * emitted (as fmtprint does).
+	 *
+	 * dent: pqid[8] qid[8] -- a directory entry key.
+	 * ptr:  off[8] hash[8] -- a key for a Dir block.
+	 * dir:  fixed statbuf header, user ids
+	 */
+	if(k->nk == 0)
+		return fmtprint(fmt, "\"\"");
+	switch(k->k[0]){
+	case Kdat:	/* qid[8] off[8] => ptr[16]:	pointer to data page */
+		n = fmtprint(fmt, "dat qid:%llx off:%llx",
+			UNPACK64(k->k+1), UNPACK64(k->k+9));
+		break;
+	case Kent:	/* pqid[8] name[n] => dir[n]:	serialized Dir */
+		/* NOTE(review): name is taken at offset 11, i.e. 2 bytes past
+		 * tag+pqid -- verify against the key packing code */
+		n = fmtprint(fmt, "ent dir:%llx, name:\"%.*s\"",
+			UNPACK64(k->k+1), k->nk-11, k->k+11);
+		break;
+	case Klabel:	/* name[n] => tree[24]:	snapshot ref */
+		n = fmtprint(fmt, "label name:\"%.*s\"", k->nk-1, k->k+1);
+		break;
+	case Ksnap:	/* sid[8] => tree:	snapshot root */
+		n = fmtprint(fmt, "snap id:%lld", UNPACK64(k->k+1));
+		break;
+	case Kup:	/* qid[8] => pqid[8]:		parent dir */
+		n = fmtprint(fmt, "up dir:%llx", UNPACK64(k->k+1));
+		break;
+	case Kdlist:
+		n = fmtprint(fmt, "dlist gen:%lld, bgen:%lld",
+			UNPACK64(k->k+1), UNPACK64(k->k+9));
+		break;
+	default:
+		n = fmtprint(fmt, "??? %.*H", k->nk, k->k);
+		break;
+	}
+	return n;
+}
+
+static int
+showval(Fmt *fmt, Kvp *v, int op, int flg)
+{
+	int n, ws;
+	char *p;
+	Tree t;
+	Xdir d;
+
+	/*
+	 * Decode and print a value, interpreted according to its
+	 * key's tag byte and the message op that carries it.  When
+	 * flg is set (the %#P form), the value is a pivot child
+	 * pointer and is printed as (bp,fill) instead.
+	 */
+	n = 0;
+	if(flg){
+		assert(v->nv == Ptrsz+2);
+		n = fmtprint(fmt, "(%B,%d)", unpackbp(v->v, v->nv), UNPACK16(v->v+Ptrsz));
+		return n;
+	}
+	if(op == Odelete || op == Oclearb){
+		n = fmtprint(fmt, "delete");
+		return n;
+	}
+	switch(v->k[0]){
+	case Kdat:	/* qid[8] off[8] => ptr[16]:	pointer to data page */
+		switch(op){
+		case Odelete:
+		case Oclearb:
+			n = 0;
+			break;
+		case Onop:
+		case Oinsert:
+			if(v->nv == Ptrsz)
+				n = fmtprint(fmt, "ptr:%B", unpackbp(v->v, v->nv));
+			else
+				n = fmtprint(fmt, "BROKEN ptr %.*H", v->nk, v->k);
+			break;
+		}
+		break;
+	case Kent:	/* pqid[8] name[n] => dir[n]:	serialized Dir */
+		switch(op){
+		case Onop:
+		case Oinsert:
+			kv2dir(v, &d);
+			n = fmtprint(fmt, "[qid=(%llux,%lud,%d), %luo, t=%lld,%lld, l=%lld]",
+				d.qid.path, d.qid.vers, d.qid.type,
+				d.mode, d.atime, d.mtime, d.length);
+			break;
+		case Odelete:
+			n = fmtprint(fmt, "delete");
+			break;
+		case Owstat:
+			/* wstat value: flag byte followed by the fields named
+			 * by the Ow* bits, in bit order */
+			p = v->v;
+			ws = *p++;
+			if(ws & Owsize){
+				n += fmtprint(fmt, "size:%llx ", UNPACK64(p));
+				p += 8;
+			}
+			if(ws & Owmode){
+				n += fmtprint(fmt, "mode:%uo ", UNPACK32(p));
+				p += 4;
+			}
+			if(ws & Owmtime){
+				n += fmtprint(fmt, "mtime:%llx ", UNPACK64(p));
+				p += 8;
+			}
+			if(ws & Owatime){
+				/* was mislabelled "mtime:" -- this is the atime field */
+				n += fmtprint(fmt, "atime:%llx ", UNPACK64(p));
+				p += 8;
+			}
+			if(ws & Owuid){
+				n += fmtprint(fmt, "uid:%d ", UNPACK32(p));
+				p += 4;
+			}
+			if(ws & Owgid){
+				n += fmtprint(fmt, "gid:%d ", UNPACK32(p));
+				p += 4;
+			}
+			if(ws & Owmuid){
+				n += fmtprint(fmt, "muid:%d ", UNPACK32(p));
+				p += 4;
+			}
+			if(p != v->v + v->nv){
+				fprint(2, "v->nv: %d, sz=%d\n", v->nv, (int)(p - v->v));
+				abort();
+			}
+			break;
+		}
+		break;
+	case Ksnap:	/* sid[8] => tree:	snapshot root */
+		switch(op){
+		case Orelink:
+		case Oreprev:
+			n = fmtprint(fmt, "gen: %lld, dlbl: %d, dref: %d",
+				UNPACK64(v->v), v->v[8], v->v[9]);
+			break;
+		case Onop:
+		case Oinsert:
+			if(unpacktree(&t, v->v, v->nv) == nil)
+				n = fmtprint(fmt, "corrupt tree");
+			else
+				n = fmtprint(fmt, "<tree %B [pred=%lld, succ=%lld, nref=%d, nlbl=%d]>",
+					t.bp, t.pred, t.succ, t.nref, t.nlbl);
+			break;
+		default:
+			n = fmtprint(fmt, "?? unknown op %d", op);
+		}
+		break;
+	case Klabel:
+		n = fmtprint(fmt, "snap id:%lld", UNPACK64(v->v+1));
+		break;
+	case Kup:	/* qid[8] => pqid[8]:		parent dir */
+		n = fmtprint(fmt, "super dir:%llx, name:\"%.*s\")",
+			UNPACK64(v->v+1), v->nv-11, v->v+11);
+		break;
+	case Kdlist:
+		n = fmtprint(fmt, "hd:%B, tl:%B",
+			unpackbp(v->v, v->nv),
+			unpackbp(v->v+Ptrsz, v->nv-Ptrsz));
+		break;
+	default:
+		n = fmtprint(fmt, "??? %.*H", v->nk, v->k);
+		break;
+	}
+	return n;
+}
+
+/* %B: format a block pointer as (addr,hash,gen) */
+int
+Bconv(Fmt *fmt)
+{
+	Bptr bp;
+
+	bp = va_arg(fmt->args, Bptr);
+	return fmtprint(fmt, "(%llx,%.16llux,%llx)", bp.addr, bp.hash, bp.gen);
+}
+
+/* %M: format a tree message as Msg(op, key) => (value); %#M decodes pivot pointers */
+int
+Mconv(Fmt *fmt)
+{
+	char *opname[Nmsgtype] = {
+	[Oinsert]	"Oinsert",
+	[Odelete]	"Odelete",
+	[Oclearb]	"Oclearb",
+	[Oclobber]	"Oclobber",
+	[Owstat]	"Owstat",
+	[Orelink]	"Orelink",
+	[Oreprev]	"Oreprev",
+	};
+	Msg *m;
+	int f, n;
+
+	/* NOTE(review): opname[Onop] is nil; a nop message would hand
+	 * a nil string to %s -- confirm that can't happen here */
+	f = (fmt->flags & FmtSharp) != 0;
+	m = va_arg(fmt->args, Msg*);
+	if(m == nil)
+		return fmtprint(fmt, "Msg{nil}");
+	n = fmtprint(fmt, "Msg(%s, ", opname[m->op]);
+	n += showkey(fmt, m);
+	n += fmtprint(fmt, ") => (");
+	n += showval(fmt, m, m->op, f);
+	n += fmtprint(fmt, ")");
+	return n;
+}
+
+/* %P: format a key-value pair; %#P decodes the value as a pivot child pointer */
+int
+Pconv(Fmt *fmt)
+{
+	Kvp *kv;
+	int f, n;
+
+	f = (fmt->flags & FmtSharp) != 0;
+	kv = va_arg(fmt->args, Kvp*);
+	if(kv == nil)
+		return fmtprint(fmt, "Kvp{nil}");
+	n = fmtprint(fmt, "Kvp(");
+	n += showkey(fmt, kv);
+	n += fmtprint(fmt, ") => (");
+	n += showval(fmt, kv, Onop, f);
+	n += fmtprint(fmt, ")");
+	return n;
+}
+
+/* %K: format a btree key, decoded by its tag byte */
+int
+Kconv(Fmt *fmt)
+{
+	Key *key;
+	int len;
+
+	key = va_arg(fmt->args, Key*);
+	if(key == nil)
+		return fmtprint(fmt, "Key{nil}");
+	len = fmtprint(fmt, "Key(");
+	len += showkey(fmt, key);
+	len += fmtprint(fmt, ")");
+	return len;
+}
+
+/* %R: format an allocator free range as Arange(off+len) */
+int
+Rconv(Fmt *fmt)
+{
+	Arange *range;
+
+	range = va_arg(fmt->args, Arange*);
+	if(range == nil)
+		return fmtprint(fmt, "<Arange:nil>");
+	return fmtprint(fmt, "Arange(%lld+%lld)", range->off, range->len);
+}
+
+/* %Q: format a 9p qid as (path vers type) */
+int
+Qconv(Fmt *fmt)
+{
+	Qid q;
+
+	q = va_arg(fmt->args, Qid);
+	return fmtprint(fmt, "(%llx %ld %d)", q.path, q.vers, q.type);
+}
+
+static void
+rshowblk(int fd, Blk *b, int indent, int recurse)
+{
+	Blk *c;
+	int i;
+	Bptr bp;
+	Kvp kv;
+	Msg m;
+
+	/*
+	 * Recursive block dump.  Pivots print their buffered messages
+	 * and then fall through to print their values; when recurse
+	 * is set, pivot values (child pointers) are loaded and dumped
+	 * one level deeper.  Other block types get a hex dump of
+	 * their first 32 bytes.
+	 */
+	if(indent > sizeof(spc)/4)
+		indent = sizeof(spc)/4;	/* clamp so 4*indent stays within spc */
+	if(b == nil){
+		fprint(fd, "NIL\n");
+		return;
+	}
+	fprint(fd, "%.*s[BLK]|{%B}\n", 4*indent, spc, b->bp);
+	switch(b->type){
+	case Tpivot:
+		for(i = 0; i < b->nbuf; i++){
+			getmsg(b, i, &m);
+			fprint(fd, "%.*s[%03d]|%M\n", 4*indent, spc, i, &m);
+		}
+		/* wet floor */
+	case Tleaf:
+		for(i = 0; i < b->nval; i++){
+			getval(b, i, &kv);
+			if(b->type == Tpivot){
+				fprint(fd, "%.*s[%03d]|%#P\n", 4*indent, spc, i, &kv);
+				bp = unpackbp(kv.v, kv.nv);
+				c = getblk(bp, 0);
+				if(recurse)
+					rshowblk(fd, c, indent + 1, 1);
+				dropblk(c);
+			}else{
+				fprint(fd, "%.*s[%03d]|%P\n", 4*indent, spc, i, &kv);
+			}
+		}
+		break;
+	case Tarena:
+		fprint(fd, "arena -- ");
+		goto Show;
+	case Tlog:
+		fprint(fd, "log -- ");
+		goto Show;
+	case Tdlist:
+		fprint(fd, "dlist -- ");
+		goto Show;
+	case Tdat:
+		fprint(fd, "dat -- ");
+	Show:
+		for(i = 0; i < 32; i++){
+			fprint(fd, "%x", b->buf[i] & 0xff);
+			if(i % 4 == 3)
+				fprint(fd, " ");
+		}
+		fprint(fd, "\n");
+		break;
+	}
+}
+
+/* labelled block dump; recurse descends into the children of pivots */
+void
+showblk(int fd, Blk *b, char *m, int recurse)
+{
+	fprint(fd, "=== %s\n", m);
+	rshowblk(fd, b, 0, recurse);
+}
+
+void
+showbp(int fd, Bptr bp, int recurse)
+{
+	Blk *b;
+
+	/* GBnochk skips the hash check so damaged blocks can still be dumped */
+	b = getblk(bp, GBnochk);
+	rshowblk(fd, b, 0, recurse);
+	dropblk(b);
+}
+
+/* print the on-disk fields of a tree root, one per line */
+void
+showtreeroot(int fd, Tree *t)
+{
+	fprint(fd, "\tflag\t0x%x\n", t->flag);
+	fprint(fd, "\tgen:\t%lld\n", t->gen);
+	fprint(fd, "\tbase\t%lld\n", t->base);
+	fprint(fd, "\tpred:\t%lld\n", t->pred);
+	fprint(fd, "\tsucc:\t%lld\n", t->succ);
+	fprint(fd, "\tnref:\t%d\n", t->nref);
+	fprint(fd, "\tnlbl:\t%d\n", t->nlbl);
+	fprint(fd, "\tht:\t%d\n", t->ht);
+	fprint(fd, "\tbp:\t%B\n", t->bp);
+}
+
+void
+initshow(void)
+{
+	int i;
+
+	/* build the indent ruler used by rshowblk: a '|' every 4 columns */
+	for(i = 0; i < sizeof(spc); i++)
+		spc[i] = (i % 4 == 0) ? '|' : ' ';
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/error.c
@@ -1,0 +1,77 @@
+#include <u.h>
+#include <libc.h>
+#include <avl.h>
+#include <fcall.h>
+#include "dat.h"
+
/*
 * Error strings raised with error() and sent to 9p clients
 * via rerror(); they are referenced by name throughout fs.c.
 */
char Ecorrupt[] = "block contents corrupted";
char Efsvers[]	= "unknown fs version";
char Eimpl[]	= "not implemented";
char Ebotch[]	= "protocol botch";
char Eio[]	= "i/o error";
char Enofid[]	= "unknown fid";
char Efid[]	= "fid in use";
char Etype[]	= "invalid fid type";
char Edscan[]	= "invalid dir scan offset";
char Esrch[]	= "directory entry not found";
char Eexist[]	= "create/wstat -- file exists";
char Emode[]	= "open/create -- unknown mode";
char Efull[]	= "file system full";
char Estuffed[]	= "emergency blocks exhausted";
char Eauth[]	= "authentication failed";
char Elength[]	= "name too long";
char Eperm[]	= "permission denied";
char Einuse[]	= "resource in use";
char Ebadf[]	= "invalid file";
char Ename[]	= "create/wstat -- bad character in file name";
char Enomem[]	= "out of memory";
char Eattach[]	= "attach required";
char Enosnap[]	= "attach -- bad specifier";
char Edir[]	= "invalid directory";
char Esyntax[]	= "syntax error";
char Enouser[]	= "user does not exist";
char Enogrp[]	= "group does not exist";
char Efsize[]	= "file too big";
char Ebadu[]	= "attach -- unknown user or failed authentication";
char Erdonly[]	= "file system read only";
char Elocked[]	= "open/create -- file is locked";
char Eauthp[]	= "authread -- auth protocol not finished";
char Eauthd[]	= "authread -- not enough data";
char Eauthph[]	= "auth phase error";
char Enone[]	= "auth -- user 'none' requires no authentication";
char Enoauth[]	= "auth -- authentication disabled";
char Ephase[]	= "phase error -- use after remove";

/* wstat-specific errors */
char Ewstatb[]	= "wstat -- unknown bits in qid.type/mode";
char Ewstatd[]	= "wstat -- attempt to change directory";
char Ewstatg[]	= "wstat -- not in group";
char Ewstatl[]	= "wstat -- attempt to make length negative";
char Ewstatm[]	= "wstat -- attempt to change muid";
char Ewstato[]	= "wstat -- not owner or group leader";
char Ewstatp[]	= "wstat -- attempt to change qid.path";
char Ewstatq[]	= "wstat -- qid.type/dir.mode mismatch";
char Ewstatu[]	= "wstat -- not owner";
char Ewstatv[]	= "wstat -- attempt to change qid.vers";
char Enempty[]	= "directory is not empty";

/* legacy strings kept for reference; not compiled in */
//char Echar[]		= "bad character in directory name";
//char Eopen[]		= "read/write -- on non open fid";
//char Ecount[]		= "read/write -- count too big";
//char Ealloc[]		= "phase error -- directory entry not allocated";
//char Eqid[]		= "phase error -- qid does not match";
//char Eaccess[]	= "access permission denied";
//char Eentry[]		= "directory entry not found";
//char Edir1[]		= "walk -- in a non-directory";
//char Edir2[]		= "create -- in a non-directory";
//char Edot[]		= "create/wstat -- . and .. illegal names";
//char Ewalk[]		= "walk -- too many (system wide)";
//char Eoffset[]	= "read/write -- offset negative";
//char Ebroken[]	= "read/write -- lock is broken";
//char Eauth[]		= "attach -- authentication failed";
//char Eauth2[]		= "read/write -- authentication unimplemented";
//char Etoolong[]	= "name too long";
//char Efidinuse[]	= "fid in use";
//char Eversion[]	= "version conversion";
//char Eauthnone[]	= "auth -- user 'none' requires no authentication";
//char Eauthdisabled[]	= "auth -- authentication disabled";	/* development */
//char Eauthfile[]	= "auth -- out of auth files";
--- /dev/null
+++ b/sys/src/cmd/gefs/fns.h
@@ -1,0 +1,211 @@
+#pragma varargck type "M"	Msg*
+#pragma varargck type "P"	Kvp*
+#pragma varargck type "K"	Key*
+#pragma varargck type "V"	Val*
+#pragma varargck type "B"	Bptr
+#pragma varargck type "R"	Arange*
+#pragma varargck type "X"	char*
+#pragma varargck type "Q"	Qid
+
+extern Gefs*	fs;
+extern int	debug;
+extern int	permissive;
+extern int	usereserve;
+extern char*	reamuser;
+extern Errctx**	errctx;
+extern Blk*	blkbuf;
+extern int	noneid;
+extern int	nogroupid;
+extern int	admid;
+
/*
 * Big-endian (un)packing of integers to/from byte buffers.
 * Bytes must be widened to an unsigned type before shifting:
 * without the casts, ((uchar*)(p))[0] promotes to signed int,
 * so shifting a byte >= 0x80 into bit 31 is undefined, and the
 * (u64int) conversion in UNPACK64's low half sign-extended it,
 * smearing 1-bits over the high word of the result.
 */
#define	UNPACK8(p)	(((uchar*)(p))[0])
#define	UNPACK16(p)	((((uchar*)(p))[0]<<8)|(((uchar*)(p))[1]))
#define	UNPACK32(p)	(((u32int)((uchar*)(p))[0]<<24)|((u32int)((uchar*)(p))[1]<<16)|\
				((u32int)((uchar*)(p))[2]<<8)|((u32int)((uchar*)(p))[3]))
#define	UNPACK64(p)	(((u64int)UNPACK32(p))<<32 | (u64int)UNPACK32((uchar*)(p)+4))

#define	PACK8(p,v)	do{(p)[0]=(v);}while(0)
#define	PACK16(p,v)	do{(p)[0]=(v)>>8;(p)[1]=(v);}while(0)
#define	PACK32(p,v)	do{(p)[0]=(v)>>24;(p)[1]=(v)>>16;(p)[2]=(v)>>8;(p)[3]=(v);}while(0)
#define	PACK64(p,v)	do{(p)[0]=(v)>>56;(p)[1]=(v)>>48;(p)[2]=(v)>>40;(p)[3]=(v)>>32;\
			   (p)[4]=(v)>>24;(p)[5]=(v)>>16;(p)[6]=(v)>>8;(p)[7]=(v);}while(0)
+
+void*	emalloc(usize, int);
+
+Blk*	newblk(Tree *, int, vlong);
+Blk*	dupblk(Tree *, Blk*);
+Blk*	getroot(Tree*, int*);
+Blk*	getblk(Bptr, int);
+Blk*	holdblk(Blk*);
+void	dropblk(Blk*);
+
+void	lrutop(Blk*);
+void	lrubot(Blk*);
+void	cacheins(Blk*);
+void	cachedel(vlong);
+Blk*	cacheget(vlong);
+Blk*	cachepluck(void);
+
+void	qinit(Syncq*);
+void	qput(Syncq*, Qent);
+
+Arena*	getarena(vlong);
+void	syncblk(Blk*);
+void	enqueue(Blk*);
+void	epochstart(int);
+void	epochend(int);
+void	epochwait(void);
+void	epochclean(void);
+void	limbo(Bfree*);
+void	freeblk(Tree*, Blk*, Bptr);
+int	logbarrier(Arena *, vlong);
+void	dlappend(Dlist *dl, Bptr);
+void	killblk(Tree*, Bptr);
+void	blkdealloc(vlong);
+ushort	blkfill(Blk*);
+uvlong	blkhash(Blk*);
+uvlong	bufhash(void*, usize);
+u32int	ihash(uvlong);
+void	finalize(Blk*);
+
+Mount*	getmount(char*);
+void	clunkmount(Mount*);
+
+void	updatesnap(Tree**, Tree*, char*, int);
+void	tagsnap(Tree*, char*, int);
+void	delsnap(Tree*, vlong, char*);
+void	freedl(Dlist*, int);
+Tree*	opensnap(char*, int*);
+
+void	closesnap(Tree*);
+void	reamfs(char*);
+void	growfs(char*);
+void	loadarena(Arena*, Bptr);
+void	loadfs(char*);
+void	loadlog(Arena*, Bptr);
+int	scandead(Dlist*, int, void(*)(Bptr, void*), void*);
+int	endfs(void);
+void	compresslog(Arena*);
+void	dlsync(void);
+void	setval(Blk*, Kvp*);
+
+Conn*	newconn(int, int);
+
+int	walk1(Tree*, vlong, char*, Qid*, vlong*);
+void	loadusers(int, Tree*);
+User*	uid2user(int);
+User*	name2user(char*);
+
+void	btupsert(Tree*, Msg*, int);
+int	btlookup(Tree*, Key*, Kvp*, char*, int);
+void	btnewscan(Scan*, char*, int);
+void	btenter(Tree*, Scan*);
+int	btnext(Scan*, Kvp*);
+void	btexit(Scan*);
+
+int	checkflag(Blk *b, int);
+void	setflag(Blk *b, int);
+void	clrflag(Blk *b, int);
+
+char*	estrdup(char*);
+
+int	keycmp(Key *, Key *);
+void	cpkey(Key*, Key*, char*, int);
+void	cpkvp(Kvp*, Kvp*, char*, int);
+
+/* for dumping */
+void	getval(Blk*, int, Kvp*);
+void	getmsg(Blk*, int, Msg*);
+Bptr	getptr(Kvp*, int*);
+
+void	initshow(void);
+void	showblk(int, Blk*, char*, int);
+void	showbp(int, Bptr, int);
+void	showtreeroot(int, Tree*);
+int	checkfs(int);
+
+#define dprint(...) \
+	do{ \
+		if(debug) fprint(2, __VA_ARGS__); \
+	}while(0)
+
+#define fatal(...) \
+	do{ \
+		fprint(2, __VA_ARGS__); \
+		abort(); \
+	}while(0)
+
+#define tracex(msg, bp, v0, v1) \
+	do{ \
+		if(fs->trace != nil) \
+			_trace(msg, bp, v0, v1); \
+	} while(0)
+
+#define traceb(msg, bp)	tracex(msg, bp, -1, -1)
+#define tracev(msg, v0)	tracex(msg, Zb, v0, -1)
+#define tracem(msg)	tracex(msg, Zb, -1, -1)
+
+jmp_buf*	_waserror(void);
+_Noreturn void	error(char*, ...);
+_Noreturn void	broke(char*, ...);
+_Noreturn void	nexterror(void);
+#define waserror()	(setjmp(*_waserror()))
+#define errmsg()	((*errctx)->err)
+#define	poperror()	assert((*errctx)->nerrlab-- > 0)
+#define estacksz()	((*errctx)->nerrlab)
+void	_trace(char*, Bptr, vlong, vlong);
+char*	packstr(char*, char*, char*);
+
+void	dir2kv(vlong, Xdir*, Kvp*, char*, int);
+int	dir2statbuf(Xdir*, char*, int);
+void	dlist2kv(Dlist*, Kvp*, char*, int);
+void	lbl2kv(char*, vlong, uint, Kvp*, char*, int);
+void	link2kv(vlong, vlong, Kvp*, char*, int);
+void	retag2kv(vlong, vlong, int, int, Kvp*, char*, int);
+void	tree2kv(Tree*, Kvp*, char*, int);
+
+void	kv2dir(Kvp*, Xdir*);
+void	kv2dlist(Kvp*, Dlist*);
+void	kv2link(Kvp*, vlong*, vlong*);
+void	kv2qid(Kvp*, Qid*);
+int	kv2statbuf(Kvp*, char*, int);
+
+char*	packarena(char*, int, Arena*);
+char*	packbp(char*, int, Bptr*);
+char*	packdkey(char*, int, vlong, char*);
+char*	packdval(char*, int, Xdir*);
+char*	packlbl(char*, int, char*);
+char*	packsnap(char*, int, vlong);
+char*	packsuper(char*, int, vlong);
+char*	packtree(char*, int, Tree*);
+char*	packsb(char*, int, Gefs*);
+
+char*	unpackarena(Arena*, char*, int);
+Bptr	unpackbp(char*, int);
+char*	unpackdkey(char*, int, vlong*);
+Tree*	unpacktree(Tree*, char*, int);
+char*	unpacksb(Gefs*, char*, int);
+char*	unpackstr(char*, char*, char**);
+
+/* fmt */
+int	Bconv(Fmt*);
+int	Mconv(Fmt*);
+int	Pconv(Fmt*);
+int	Rconv(Fmt*);
+int	Kconv(Fmt*);
+int	Qconv(Fmt*);
+
+Chan*	mkchan(int);
+void*	chrecv(Chan*);
+void	chsend(Chan*, void*);
+void	runfs(int, void*);
+void	runmutate(int, void*);
+void	runread(int, void*);
+void	runcons(int, void*);
+void	runtasks(int, void*);
+void	runsync(int, void*);
+void	runsweep(int, void*);
+void	runsweep(int, void*);
--- /dev/null
+++ b/sys/src/cmd/gefs/fs.c
@@ -1,0 +1,2657 @@
+#include <u.h>
+#include <libc.h>
+#include <auth.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+#include "atomic.h"
+
+static void	respond(Fmsg*, Fcall*);
+static void	rerror(Fmsg*, char*, ...);
+static void	clunkfid(Conn*, Fid*, Amsg**);
+
+int
+walk1(Tree *t, vlong up, char *name, Qid *qid, vlong *len)
+{
+	char *p, kbuf[Keymax], rbuf[Kvmax];
+	int err;
+	Xdir d;
+	Kvp kv;
+	Key k;
+
+	err = 0;
+	p = packdkey(kbuf, sizeof(kbuf), up, name);
+	k.k = kbuf;
+	k.nk = p - kbuf;
+	if(err)
+		return -1;
+	if(!btlookup(t, &k, &kv, rbuf, sizeof(rbuf)))
+		return -1;
+	kv2dir(&kv, &d);
+	*qid = d.qid;
+	*len = d.length;
+	return 0;
+}
+
/*
 * Write barrier: bump the queue generation, push a fence entry
 * onto every syncer's queue, and sleep until all syncers have
 * drained past the fence (fs->syncing counts down to 0; the
 * syncers presumably rwakeup fs->syncrz — confirm in qput's
 * consumer).  On return, everything enqueued before the call
 * has been handed to the disk.
 */
static void
wrbarrier(void)
{
	Qent qe;
	int i;
	
	aincv(&fs->qgen, 1);
	tracev("barrier", fs->qgen);
	fs->syncing = fs->nsyncers;
	for(i = 0; i < fs->nsyncers; i++){
		/* a fence entry carries no block, only a sentinel bp */
		qe.op = Qfence;
		qe.bp.addr = 0;
		qe.bp.hash = -1;
		qe.bp.gen = -1;
		qe.b = nil;
		qput(&fs->syncq[i], qe);
	}
	aincv(&fs->qgen, 1);
	while(fs->syncing != 0)
		rsleep(&fs->syncrz);
	tracev("flushed", fs->qgen);
}
+
/*
 * Write a consistent on-disk snapshot of the whole file system.
 * The ordering of the passes below is what makes a crash at any
 * point recoverable: data first, then arena/log headers, then
 * superblocks, then the footer copies, and only then is the old
 * snapshot's deadlist freed for reuse.
 */
static void
sync(void)
{
	Mount *mnt;
	Arena *a;
	Dlist dl;
	int i;


	qlock(&fs->synclk);
	if(waserror()){
		fprint(2, "failed to sync: %s\n", errmsg());
		qunlock(&fs->synclk);
		nexterror();
	}

	/* 
	 * Wait for data that we're syncing to hit disk
	 */
	tracem("flush1");
	wrbarrier();
	/*
	 * pass 0: Update all open snapshots, and
	 *  pack the blocks we want to sync. Snap
	 *  while holding the write lock, and then
	 *  wait until all the blocks they point at
	 *  have hit disk; once they're on disk, we
	 *  can take a consistent snapshot.
         */
	qlock(&fs->mutlk);
	tracem("packb");
	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next)
		updatesnap(&mnt->root, mnt->root, mnt->name, mnt->flag);
	/*
	 * Now that we've updated the snaps, we can sync the
	 * dlist; the snap tree will not change from here.
	 */
	dlsync();
	/* steal the snap deadlist; it is freed in pass 4 below */
	dl = fs->snapdl;
	fs->snapdl.hd = Zb;
	fs->snapdl.tl = Zb;
	fs->snapdl.ins = nil;
	traceb("syncdl.dl", dl.hd);
	traceb("syncdl.rb", fs->snap.bp);
	for(i = 0; i < fs->narena; i++){
		a = &fs->arenas[i];
		qlock(a);
		/*
		 * because the log uses preallocated
		 * blocks, we need to write the log
		 * block out synchronously, or it may
		 * get reused.
		 */
		logbarrier(a, fs->qgen);
		finalize(a->logtl);
		syncblk(a->logtl);

		packarena(a->h0->data, Blksz, a);
		packarena(a->h1->data, Blksz, a);
		finalize(a->h0);
		finalize(a->h1);
		setflag(a->h0, Bdirty);
		setflag(a->h1, Bdirty);
		fs->arenabp[i] = a->h0->bp;
		qunlock(a);
	}
	assert(fs->snapdl.hd.addr == -1);
	traceb("packsb.rb", fs->snap.bp);
	packsb(fs->sb0->buf, Blksz, fs);
	packsb(fs->sb1->buf, Blksz, fs);
	finalize(fs->sb0);
	finalize(fs->sb1);
	fs->snap.dirty = 0;
	qunlock(&fs->mutlk);

	/*
	 * pass 1: sync block headers; if we crash here,
	 *  the block footers are consistent, and we can
	 *  use them.
	 */
	tracem("arenas0");
	for(i = 0; i < fs->narena; i++)
		enqueue(fs->arenas[i].h0);
	wrbarrier();

	/*
	 * pass 2: sync superblock; we have a consistent
	 * set of block headers, so if we crash, we can
	 * use the loaded block headers; the footers will
	 * get synced after so that we can use them next
	 * time around.
         */
	qlock(&fs->mutlk);
	tracem("supers");
	syncblk(fs->sb0);
	syncblk(fs->sb1);

	/*
	 * pass 3: sync block footers; if we crash here,
	 *  the block headers are consistent, and we can
	 *  use them.
         */
	tracem("arenas1");
	for(i = 0; i < fs->narena; i++)
		enqueue(fs->arenas[i].h1);

	/*
	 * Pass 4: clean up the old snap tree's deadlist
	 */
	tracem("snapdl");
	wrbarrier();
	qunlock(&fs->mutlk);
	freedl(&dl, 1);
	qunlock(&fs->synclk);
	tracem("synced");
	poperror();
}
+
/*
 * Handle a snapshot admin request: delete the snapshot named
 * a->old, or label/fork it as a->new.  When deleting the last
 * reference to an unmounted snap, an extra memref is taken and
 * the tree is handed back via *tp for the caller to reclaim;
 * otherwise *tp is nil.  Progress is reported on a->fd when it
 * is not -1.
 */
static void
snapfs(Amsg *a, Tree **tp)
{
	Tree *t, *s;
	Mount *mnt;

	if(waserror()){
		*tp = nil;
		nexterror();
	}
	t = nil;
	*tp = nil;
	/* if the snap is mounted, flush it and use the mounted root */
	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
		if(strcmp(a->old, mnt->name) == 0){
			updatesnap(&mnt->root, mnt->root, mnt->name, mnt->flag);
			t = agetp(&mnt->root);
			ainc(&t->memref);
			break;
		}
	}
	if(t == nil && (t = opensnap(a->old, nil)) == nil){
		if(a->fd != -1)
			fprint(a->fd, "snap: open '%s': does not exist\n", a->old);
		poperror();
		return;
	}
	if(a->delete){
		/* mnt != nil only if the loop above matched a->old */
		if(mnt != nil) {
			/* NOTE(review): returns without closesnap(t) — possible ref leak; confirm */
			if(a->fd != -1)
				fprint(a->fd, "snap: snap is mounted: '%s'\n", a->old);
			poperror();
			return;
		}
		if(t->nlbl == 1 && t->nref <= 1 && t->succ == -1){
			aincl(&t->memref, 1);
			*tp = t;
		}
		delsnap(t, t->succ, a->old);
	}else{
		if((s = opensnap(a->new, nil)) != nil){
			/* NOTE(review): returns without closesnap(t) — possible ref leak; confirm */
			if(a->fd != -1)
				fprint(a->fd, "snap: already exists '%s'\n", a->new);
			closesnap(s);
			poperror();
			return;
		}
		tagsnap(t, a->new, a->flag);
	}
	closesnap(t);
	poperror();
	if(a->fd != -1){
		if(a->delete)
			fprint(a->fd, "deleted: %s\n", a->old);
		else if(a->flag & Lmut)
			fprint(a->fd, "forked: %s from %s\n", a->new, a->old);
		else
			fprint(a->fd, "labeled: %s from %s\n", a->new, a->old);
	}
}
+
+static void
+filldumpdir(Xdir *d)
+{
+	memset(d, 0, sizeof(Xdir));
+	d->name = "/";
+	d->qid.path = Qdump;
+	d->qid.vers = fs->nextgen;
+	d->qid.type = QTDIR;
+	d->mode = 0555;
+	d->atime = 0;
+	d->mtime = 0;
+	d->length = 0;
+	d->uid = -1;
+	d->gid = -1;
+	d->muid = -1;
+}
+
+static int
+okname(char *name)
+{
+	int i;
+
+	if(name[0] == 0)
+		return -1;
+	if(strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
+		return -1;
+	for(i = 0; i < Maxname; i++){
+		if(name[i] == 0)
+			return 0;
+		if((name[i]&0xff) < 0x20 || name[i] == '/')
+			return -1;
+	}
+	return -1;
+}
+
+Chan*
+mkchan(int size)
+{
+	Chan *c;
+
+	if((c = mallocz(sizeof(Chan) + size*sizeof(void*), 1)) == nil)
+		sysfatal("create channel");
+	c->size = size;
+	c->avail = size;
+	c->count = 0;
+	c->rp = c->args;
+	c->wp = c->args;
+	return c;
+
+}
+
/*
 * Receive a message from the channel, blocking while empty.
 * Fast path: claim a filled slot by CASing c->count down,
 * avoiding the semaphore syscall; on an empty channel or a
 * lost race, fall back to semacquire.  The read pointer is
 * advanced under c->rl, and a free slot is released to senders.
 */
void*
chrecv(Chan *c)
{
	void *a;
	long v;

	v = agetl(&c->count);
	if(v == 0 || !acasl(&c->count, v, v-1))
		semacquire(&c->count, 1);
	lock(&c->rl);
	a = *c->rp;
	if(++c->rp >= &c->args[c->size])
		c->rp = c->args;
	unlock(&c->rl);
	semrelease(&c->avail, 1);
	return a;
}
+
/*
 * Send a message on the channel, blocking while full.
 * Mirror image of chrecv: claim a free slot by CASing
 * c->avail down (falling back to semacquire), write under
 * c->wl, then release a filled slot to receivers.
 */
void
chsend(Chan *c, void *m)
{
	long v;

	v = agetl(&c->avail);
	if(v == 0 || !acasl(&c->avail, v, v-1))
		semacquire(&c->avail, 1);
	lock(&c->wl);
	*c->wp = m;
	if(++c->wp >= &c->args[c->size])
		c->wp = c->args;
	unlock(&c->wl);
	semrelease(&c->count, 1);
}
+
+static void
+fshangup(Conn *c, char *fmt, ...)
+{
+	char buf[ERRMAX];
+	va_list ap;
+	Amsg *a;
+	Fid *f;
+	int i;
+
+	va_start(ap, fmt);
+	vsnprint(buf, sizeof(buf), fmt, ap);
+	va_end(ap);
+	fprint(2, "hangup: %s\n", buf);
+	close(c->rfd);
+	close(c->wfd);
+	for(i = 0; i < Nfidtab; i++){
+		lock(&c->fidtablk[i]);
+		for(f = c->fidtab[i]; f != nil; f = f->next){
+			lock(f);
+			if(waserror()){
+				unlock(f);
+				continue;
+			}
+			a = nil;
+			clunkfid(c, f, &a);
+			unlock(f);
+			if(a != nil)
+				chsend(fs->admchan, a);
+			nexterror();
+		}
+		unlock(&c->fidtablk[i]);
+	}
+}
+
/*
 * Marshal and write the reply r for message m, serializing
 * writers on the connection's write lock, and release the
 * per-tag flush lock taken when m was dispatched: a flush
 * holds its victim's lock for writing (keyed by oldtag),
 * everything else holds its own tag's lock for reading —
 * presumably acquired by the dispatch loop; confirm there.
 * Frees m.  A short write hangs up the connection.
 */
static void
respond(Fmsg *m, Fcall *r)
{
	RWLock *lk;
	uchar buf[Max9p+IOHDRSZ];
	int w, n;

	r->tag = m->tag;
	dprint("→ %F\n", r);
	assert(m->type+1 == r->type || r->type == Rerror);
	if((n = convS2M(r, buf, sizeof(buf))) == 0)
		abort();
	qlock(&m->conn->wrlk);
	w = write(m->conn->wfd, buf, n);
	qunlock(&m->conn->wrlk);
	if(w != n)
		fshangup(m->conn, Eio);
	if(m->type == Tflush){
		lk = &fs->flushq[ihash(m->oldtag) % Nflushtab];
		wunlock(lk);
	}else{
		lk = &fs->flushq[ihash(m->tag) % Nflushtab];
		runlock(lk);
	}
	free(m);
}
+
/*
 * Format an error string and send it as an Rerror reply for m.
 * The buffer lives on the stack; respond() marshals it before
 * returning, so the pointer in r.ename does not escape.
 */
static void
rerror(Fmsg *m, char *fmt, ...)
{
	char buf[128];
	va_list ap;
	Fcall r;

	va_start(ap, fmt);
	vsnprint(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	r.type = Rerror;
	r.ename = buf;
	respond(m, &r);
}
+
+
/*
 * Apply the messages m[0..nm) to the mounted tree.  If the
 * current root is shared (extra labels or references), refresh
 * the mount's mutable snapshot first so the upsert lands in a
 * private tree.  Raises Erdonly for non-mutable mounts.
 */
static void
upsert(Mount *mnt, Msg *m, int nm)
{
	if(!(mnt->flag & Lmut))
		error(Erdonly);
	if(mnt->root->nlbl != 1 || mnt->root->nref != 0)
		updatesnap(&mnt->root, mnt->root, mnt->name, mnt->flag);
	btupsert(mnt->root, m, nm);
}
+
/*
 * When truncating a file, mutations need
 * to wait for the sweeper to finish; this
 * means the mutator needs to release the
 * mutation lock, exit the epoch, and
 * allow the sweeper to finish its job
 * before resuming.
 */
static void
truncwait(Dent *de, int id)
{
	/* leave the epoch and drop mutlk so the sweeper can run */
	epochend(id);
	qunlock(&fs->mutlk);
	qlock(&de->trunclk);
	while(de->trunc)
		rsleep(&de->truncrz);
	qunlock(&de->trunclk);
	/* reacquire in the same order the caller held them */
	qlock(&fs->mutlk);
	epochstart(id);
}
+
/*
 * Read up to n bytes at offset o of the file f->qpath from
 * tree t, where sz is the file's length.  At most one block
 * is read per call.  A missing data key is a hole and reads
 * back as zeros.  Returns the number of bytes produced.
 */
static int
readb(Tree *t, Fid *f, char *d, vlong o, vlong n, vlong sz)
{
	char buf[17], kvbuf[17+32];
	vlong fb, fo;
	Bptr bp;
	Blk *b;
	Key k;
	Kvp kv;

	if(o >= sz)
		return 0;

	/* block-aligned base and offset within the block */
	fb = o & ~(Blksz-1);
	fo = o & (Blksz-1);
	if(fo+n > Blksz)
		n = Blksz-fo;

	/* data key: Kdat byte, 8-byte qid path, 8-byte block offset */
	k.k = buf;
	k.nk = sizeof(buf);
	k.k[0] = Kdat;
	PACK64(k.k+1, f->qpath);
	PACK64(k.k+9, fb);

	if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf))){
		memset(d, 0, n);
		return n;
	}

	bp = unpackbp(kv.v, kv.nv);
	b = getblk(bp, GBraw);
	memcpy(d, b->buf+fo, n);
	dropblk(b);
	return n;
}
+
/*
 * Write n bytes at offset o into the file f->qpath (sz is the
 * current file size): copy-on-write of a single data block.
 * Fills in m's key with the data key and m's value with the
 * new block pointer (also returned via *ret) so the caller can
 * upsert it.  Partial overwrites of an existing in-range block
 * read-modify-write the old contents; otherwise the unwritten
 * parts of the new block are zeroed.  Returns bytes written,
 * clamped to one block.
 */
static int
writeb(Fid *f, Msg *m, Bptr *ret, char *s, vlong o, vlong n, vlong sz)
{
	char buf[Kvmax];
	vlong fb, fo;
	Blk *b, *t;
	Tree *r;
	Bptr bp;
	Kvp kv;

	fb = o & ~(Blksz-1);
	fo = o & (Blksz-1);

	/* data key: Kdat byte, 8-byte qid path, 8-byte block offset */
	m->k[0] = Kdat;
	PACK64(m->k+1, f->qpath);
	PACK64(m->k+9, fb);

	b = newblk(f->mnt->root, Tdat, f->qpath);
	t = nil;
	r = f->mnt->root;
	if(btlookup(r, m, &kv, buf, sizeof(buf))){
		bp = unpackbp(kv.v, kv.nv);
		/* partial overwrite of a live block: preserve old bytes */
		if(fb < sz && (fo != 0 || n != Blksz)){
			t = getblk(bp, GBraw);
			memcpy(b->buf, t->buf, Blksz);
			dropblk(t);
		}
	}
	if(fo+n > Blksz)
		n = Blksz-fo;
	memcpy(b->buf+fo, s, n);
	if(t == nil){
		/* fresh block: zero everything we did not write */
		if(fo > 0)
			memset(b->buf, 0, fo);
		if(fo+n < Blksz)
			memset(b->buf+fo+n, 0, Blksz-fo-n);
	}
	enqueue(b);

	packbp(m->v, m->nv, &b->bp);
	*ret = b->bp;
	dropblk(b);
	return n;
}
+
/*
 * Get (or create) the shared in-memory directory entry for d,
 * keyed by qid path in the global dent hash table; pqid is the
 * parent's qid path.  Takes a reference — release it with
 * clunkdent().  Returns nil when the entry key does not fit
 * (name too long).
 */
static Dent*
getdent(vlong pqid, Xdir *d)
{
	Dent *de;
	char *e;
	u32int h;

	h = ihash(d->qid.path) % Ndtab;
	lock(&fs->dtablk);
	for(de = fs->dtab[h]; de != nil; de = de->next){
		if(de->qid.path == d->qid.path){
			ainc(&de->ref);
			goto Out;
		}
	}

	de = emalloc(sizeof(Dent), 1);
	de->Xdir = *d;
	de->ref = 1;
	de->up = pqid;
	de->qid = d->qid;
	de->length = d->length;
	de->truncrz.l = &de->trunclk;

	if((e = packdkey(de->buf, sizeof(de->buf), pqid, d->name)) == nil){
		free(de);
		de = nil;
		goto Out;
	}
	de->k = de->buf;
	de->nk = e - de->buf;
	/* name starts after the Kent byte, parent path, and length field */
	de->name = de->buf + 11;
	de->next = fs->dtab[h];
	fs->dtab[h] = de;

Out:
	unlock(&fs->dtablk);
	return de;
}
+
/*
 * Preload the mount's rings of automatic snapshot names by
 * scanning the snap tree's labels for "name@minute." and
 * "name@hour." prefixes carrying the Lauto flag.  The rings
 * rotate modulo 60 and 24, so only the last entries seen in
 * scan order survive — assumes scan order matches label age;
 * TODO confirm against the snapshot-rotation code.
 */
static void
loadautos(Mount *mnt)
{
	char pfx[128];
	int m, h, ns;
	uint flg;
	Scan s;

	m = 0;
	h = 0;
	pfx[0] = Klabel;
	ns = snprint(pfx+1, sizeof(pfx)-1, "%s@minute.", mnt->name);
	btnewscan(&s, pfx, ns+1);
	btenter(&fs->snap, &s);
	while(1){
		if(!btnext(&s, &s.kv))
			break;
		/* label value: tree pointer then 32-bit flags */
		flg = UNPACK32(s.kv.v+1+8);
		if(flg & Lauto){
			memcpy(mnt->minutely[m], s.kv.k+1, s.kv.nk-1);
			mnt->minutely[m][s.kv.nk-1] = 0;
			m = (m+1)%60;
			continue;
		}
	}
	btexit(&s);

	pfx[0] = Klabel;
	ns = snprint(pfx+1, sizeof(pfx)-1, "%s@hour.", mnt->name);
	btnewscan(&s, pfx, ns+1);
	btenter(&fs->snap, &s);
	while(1){
		if(!btnext(&s, &s.kv))
			break;
		flg = UNPACK32(s.kv.v+1+8);
		if(flg & Lauto){
			memcpy(mnt->hourly[h], s.kv.k+1, s.kv.nk-1);
			mnt->hourly[h][s.kv.nk-1] = 0;
			h = (h+1)%24;
			continue;
		}
	}
	btexit(&s);
}
+
/*
 * Look up (or create) the mount for snapshot 'name', taking a
 * reference; release it with clunkmount().  "dump" resolves to
 * the synthetic snapshot mount.  Raises Enomem/Enosnap on
 * failure.  New mounts are pushed onto fs->mounts with an
 * atomic store so lock-free readers see a consistent list.
 */
Mount *
getmount(char *name)
{
	Mount *mnt;
	Tree *t;
	int flg;

	if(strcmp(name, "dump") == 0){
		ainc(&fs->snapmnt->ref);
		return fs->snapmnt;
	}

	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
		if(strcmp(name, mnt->name) == 0){
			ainc(&mnt->ref);
			goto Out;
		}
	}

	if((mnt = mallocz(sizeof(*mnt), 1)) == nil)
		error(Enomem);
	if(waserror()){
		free(mnt);
		nexterror();
	}
	mnt->ref = 1;
	snprint(mnt->name, sizeof(mnt->name), "%s", name);
	if((t = opensnap(name, &flg)) == nil)
		error(Enosnap);
	loadautos(mnt);
	mnt->flag = flg;
	mnt->root = t;
	mnt->next = fs->mounts;
	asetp(&fs->mounts, mnt);
	poperror();

Out:
	return mnt;
}
+
/*
 * Drop a reference to a mount.  When the last reference goes,
 * the mount is unlinked from fs->mounts and handed to limbo()
 * rather than freed directly, so lock-free walkers still in
 * the current epoch can finish traversing it.
 */
void
clunkmount(Mount *mnt)
{
	Mount *me, **p;
	Bfree *f;

	if(mnt == nil)
		return;
	if(adec(&mnt->ref) == 0){
		for(p = &fs->mounts; (me = *p) != nil; p = &me->next){
			if(me == mnt)
				break;
		}
		assert(me != nil);
		f = emalloc(sizeof(Bfree), 0);
		f->op = DFmnt;
		f->m = mnt;
		*p = me->next;
		limbo(f);
	}
}
+
/*
 * Drop a reference to a directory entry.  QTAUTH dents are
 * never entered in the hash table (see fsauth), so they are
 * freed directly; everything else is unlinked from the table
 * under fs->dtablk when the last reference goes.
 */
static void
clunkdent(Dent *de)
{
	Dent *e, **pe;
	u32int h;

	if(de == nil)
		return;
	if(de->qid.type & QTAUTH && adec(&de->ref) == 0){
		free(de);
		return;
	}
	lock(&fs->dtablk);
	if(adec(&de->ref) != 0)
		goto Out;
	h = ihash(de->qid.path) % Ndtab;
	pe = &fs->dtab[h];
	for(e = fs->dtab[h]; e != nil; e = e->next){
		if(e == de)
			break;
		pe = &e->next;
	}
	assert(e != nil);
	*pe = e->next;
	free(de);
Out:
	unlock(&fs->dtablk);
}
+
+static Fid*
+getfid(Conn *c, u32int fid)
+{
+	u32int h;
+	Fid *f;
+
+	h = ihash(fid) % Nfidtab;
+	lock(&c->fidtablk[h]);
+	for(f = c->fidtab[h]; f != nil; f = f->next)
+		if(f->fid == fid){
+			ainc(&f->ref);
+			break;
+		}
+	unlock(&c->fidtablk[h]);
+	return f;
+}
+
+static void
+putfid(Fid *f)
+{
+	if(adec(&f->ref) != 0)
+		return;
+	clunkmount(f->mnt);
+	clunkdent(f->dent);
+	free(f);
+}
+
/*
 * Clone fid f into the table under number 'new'.  The clone
 * starts unopened (mode -1) and carries two references: one
 * for the caller, one owned by the table until clunk.  Takes
 * fresh references on the mount and dent.  Returns nil if
 * allocation fails or 'new' is already in use.
 */
static Fid*
dupfid(Conn *c, u32int new, Fid *f)
{
	Fid *n, *o;
	u32int h;

	h = ihash(new) % Nfidtab;
	if((n = malloc(sizeof(Fid))) == nil)
		return nil;

	*n = *f;
	n->fid = new;
	n->ref = 2; /* one for dup, one for clunk */
	n->mode = -1;
	n->next = nil;

	lock(&c->fidtablk[h]);
	for(o = c->fidtab[h]; o != nil; o = o->next)
		if(o->fid == new)
			break;
	if(o == nil){
		n->next = c->fidtab[h];
		c->fidtab[h] = n;
	}
	unlock(&c->fidtablk[h]);

	if(o != nil){
		fprint(2, "fid in use: %d == %d\n", o->fid, new);
		free(n);
		return nil;
	}
	if(n->mnt != nil)
		ainc(&n->mnt->ref);
	ainc(&n->dent->ref);
	setmalloctag(n, getcallerpc(&c));
	return n;
}
+
/*
 * Remove fid from the connection's table, dropping the table's
 * reference (the caller is expected to hold fid locked — see
 * fshangup and fswalk).  If the file was opened ORCLOSE, mark
 * the dent gone and hand back an AOrclose admin message in *ao
 * for deferred removal; extra dent/mount references are taken
 * on its behalf.
 */
static void
clunkfid(Conn *c, Fid *fid, Amsg **ao)
{
	Fid *f, **pf;
	u32int h;

	h = ihash(fid->fid) % Nfidtab;
	lock(&c->fidtablk[h]);
	pf = &c->fidtab[h];
	for(f = c->fidtab[h]; f != nil; f = f->next){
		if(f == fid){
			/* the table's ref must not have been the last */
			assert(adec(&f->ref) != 0);
			*pf = f->next;
			break;
		}
		pf = &f->next;
	}
	assert(f != nil);
	if(f->scan != nil){
		free(f->scan);
		f->scan = nil;
	}
	if(f->rclose){
		qlock(&f->dent->trunclk);
		f->dent->trunc = 1;
		qunlock(&f->dent->trunclk);
		wlock(f->dent);
		f->dent->gone = 1;
		wunlock(f->dent);
		*ao = emalloc(sizeof(Amsg), 1);
		aincl(&f->dent->ref, 1);
		aincl(&f->mnt->ref, 1);
		(*ao)->op = AOrclose;
		(*ao)->mnt = f->mnt;
		(*ao)->qpath = f->qpath;
		(*ao)->off = 0;
		(*ao)->end = f->dent->length;
		(*ao)->dent = f->dent;
	}
	unlock(&c->fidtablk[h]);
}
+
+static int
+readmsg(Conn *c, Fmsg **pm)
+{
+	char szbuf[4];
+	int sz, n;
+	Fmsg *m;
+
+	n = readn(c->rfd, szbuf, 4);
+	if(n <= 0){
+		*pm = nil;
+		return n;
+	}
+	if(n != 4){
+		werrstr("short read: %r");
+		return -1;
+	}
+	sz = GBIT32(szbuf);
+	if(sz > c->iounit){
+		werrstr("message size too large");
+		return -1;
+	}
+	if((m = malloc(sizeof(Fmsg)+sz)) == nil)
+		return -1;
+	if(readn(c->rfd, m->buf+4, sz-4) != sz-4){
+		werrstr("short read: %r");
+		free(m);
+		return -1;
+	}
+	m->conn = c;
+	m->sz = sz;
+	PBIT32(m->buf, sz);
+	*pm = m;
+	return 0;
+}
+
/*
 * Tversion: negotiate the protocol version and message size.
 * A dotted suffix (e.g. "9P2000.u") is stripped before the
 * comparison; anything other than plain 9P2000 is answered
 * with "unknown" and leaves the connection unversioned.
 */
static void
fsversion(Fmsg *m)
{
	Fcall r;
	char *p;

	memset(&r, 0, sizeof(Fcall));
	p = strchr(m->version, '.');
	if(p != nil)
		*p = '\0';
	r.type = Rversion;
	r.msize = Max9p + IOHDRSZ;
	if(strcmp(m->version, "9P2000") == 0){
		/* never advertise more than the client offered */
		if(m->msize < r.msize)
			r.msize = m->msize;
		r.version = "9P2000";
		m->conn->versioned = 1;
		m->conn->iounit = r.msize;
	}else{
		r.version = "unknown";
		m->conn->versioned = 0;
	}
	respond(m, &r);
}
+
+void
+authfree(AuthRpc *auth)
+{
+	AuthRpc *rpc;
+
+	if(rpc = auth){
+		close(rpc->afd);
+		auth_freerpc(rpc);
+	}
+}
+
/*
 * Open a factotum rpc channel for serving p9any authentication,
 * mounting /srv/factotum onto /mnt if it is not already there.
 * Returns the started rpc, or nil on any failure.
 */
AuthRpc*
authnew(void)
{
	static char *keyspec = "proto=p9any role=server";
	AuthRpc *rpc;
	int fd;

	if(access("/mnt/factotum", 0) < 0)
		if((fd = open("/srv/factotum", ORDWR)) >= 0)
			mount(fd, -1, "/mnt", MBEFORE, "");
	if((fd = open("/mnt/factotum/rpc", ORDWR)) < 0)
		return nil;
	if((rpc = auth_allocrpc(fd)) == nil){
		close(fd);
		return nil;
	}
	if(auth_rpc(rpc, "start", keyspec, strlen(keyspec)) != ARok){
		authfree(rpc);
		return nil;
	}
	return rpc;
}
+
/*
 * Serve a read on an auth fid: pump the factotum rpc.  While
 * the protocol is running, copy factotum's output into data
 * and set r->count.  Once it completes (ARdone), resolve the
 * authenticated user and record it in f->uid.  Raises on
 * protocol errors, short buffers, and unknown users.
 */
static void
authread(Fid *f, Fcall *r, void *data, vlong count)
{
	AuthInfo *ai;
	AuthRpc *rpc;
	User *u;

	if((rpc = f->auth) == nil)
		error(Etype);

	switch(auth_rpc(rpc, "read", nil, 0)){
	default:
		error(Eauthp);
	case ARdone:
		if((ai = auth_getinfo(rpc)) == nil)
			goto Phase;
		rlock(&fs->userlk);
		u = name2user(ai->cuid);
		auth_freeAI(ai);
		if(u == nil){
			runlock(&fs->userlk);
			error(Enouser);
		}
		f->uid = u->id;
		runlock(&fs->userlk);
		return;
	case ARok:
		if(count < rpc->narg)
			error(Eauthd);
		memmove(data, rpc->arg, rpc->narg);
		r->count = rpc->narg;
		return;
	case ARphase:
	Phase:
		error(Eauthph);
	}
}
+
+static void
+authwrite(Fid *f, Fcall *r, void *data, vlong count)
+{
+	AuthRpc *rpc;
+
+	if((rpc = f->auth) == nil)
+		error(Etype);
+	if(auth_rpc(rpc, "write", data, count) != ARok)
+		error(Ebotch);
+	r->type = Rwrite;
+	r->count = count;
+
+}
+
+static void
+fsauth(Fmsg *m)
+{
+	Dent *de;
+	Fcall r;
+	Fid f;
+
+	if(fs->noauth){
+		rerror(m, Eauth);
+		return;
+	}
+	if(strcmp(m->uname, "none") == 0){
+		rerror(m, Enone);
+		return;
+	}
+	if((de = mallocz(sizeof(Dent), 1)) == nil){
+		rerror(m, Enomem);
+		return;
+	}
+	memset(de, 0, sizeof(Dent));
+	de->ref = 0;
+	de->qid.type = QTAUTH;
+	de->qid.path = aincv(&fs->nextqid, 1);
+	de->qid.vers = 0;
+	de->length = 0;
+	de->k = nil;
+	de->nk = 0;
+
+	memset(&f, 0, sizeof(Fid));
+	f.fid = NOFID;
+	f.mnt = nil;
+	f.qpath = de->qid.path;
+	f.pqpath = de->qid.path;
+	f.mode = -1;
+	f.iounit = m->conn->iounit;
+	f.dent = de;
+	f.uid = -1;
+	f.duid = -1;
+	f.dgid = -1;
+	f.dmode = 0600;
+	f.auth = authnew();
+	if(dupfid(m->conn, m->afid, &f) == nil){
+		rerror(m, Efid);
+		free(de);
+		return;
+	}
+	r.type = Rauth;
+	r.aqid = de->qid;
+	respond(m, &r);
+}
+
+static int
+ingroup(int uid, int gid)
+{
+	User *u, *g;
+	int i, in;
+
+	rlock(&fs->userlk);
+	in = 0;
+	u = uid2user(uid);
+	g = uid2user(gid);
+	if(u != nil && g != nil)
+		if(u->id == g->id)
+			in = 1;
+		else for(i = 0; i < g->nmemb; i++)
+			if(u->id == g->memb[i])
+				in = 1;
+	runlock(&fs->userlk);
+	return in;
+}
+
+static int
+groupleader(int uid, int gid)
+{
+	User *g;
+	int i, lead;
+
+	lead = 0;
+	rlock(&fs->userlk);
+	g = uid2user(gid);
+	if(g != nil){
+		if(g->lead == 0){
+			for(i = 0; i < g->nmemb; i++)
+				if(g->memb[i] == uid){
+					lead = 1;
+					break;
+				}
+		}else if(uid == g->lead)
+			lead = 1;
+	}
+	runlock(&fs->userlk);
+	return lead;
+
+}
+
+static int
+mode2bits(int req)
+{
+	int m;
+
+	m = 0;
+	switch(req&0xf){
+	case OREAD:	m = DMREAD;		break;
+	case OWRITE:	m = DMWRITE;		break;
+	case ORDWR:	m = DMREAD|DMWRITE;	break;
+	case OEXEC:	m = DMREAD|DMEXEC;	break;
+	}
+	if(req&OTRUNC)
+		m |= DMWRITE;
+	return m;
+}
+
/*
 * May fid f perform access m (DMREAD/DMWRITE/DMEXEC bits) on an
 * object with mode fmode, owner fuid, group fgid?  Returns 0 if
 * permitted, -1 otherwise.  A fid attached with the '%' prefix
 * (f->permit) bypasses all checks.  User "none" skips the owner
 * and group clauses and is judged on the other bits, with two
 * exceptions in the final clause: directory search (exec) is
 * always allowed when the other bits grant it, and membership
 * in "nogroup" withholds the other-permission grant.
 */
static int
fsaccess(Fid *f, ulong fmode, int fuid, int fgid, int m)
{
	/* uid none gets only other permissions */
	if(f->permit)
		return 0;
	if(f->uid != noneid) {
		if(f->uid == fuid)
			if((m & (fmode>>6)) == m)
				return 0;
		if(ingroup(f->uid, fgid))
			if((m & (fmode>>3)) == m)
				return 0;
	}
	if(m & fmode) {
		if((fmode & DMDIR) && (m == DMEXEC))
			return 0;
		if(!ingroup(f->uid, nogroupid))
			return 0;
	}
	return -1;
}
+
+static void
+fsattach(Fmsg *m)
+{
+	char dbuf[Kvmax], kvbuf[Kvmax];
+	char *p, *n, *aname;
+	Mount *mnt;
+	Dent *de;
+	Tree *t;
+	User *u;
+	Fcall r;
+	Xdir d;
+	Kvp kv;
+	Key dk;
+	Fid f, *af;
+	int uid;
+
+	de = nil;
+	mnt = nil;
+	if(waserror()){
+		rerror(m, errmsg());
+		goto Err;
+	}
+	aname = m->aname;
+	if(aname[0] == '%')
+		aname++;
+	if(aname[0] == '\0')
+		aname = "main";
+	if((mnt = getmount(aname)) == nil)
+		error(Enosnap);
+
+	rlock(&fs->userlk);
+	n = m->uname;
+	/*
+	 * to allow people to add themselves to the user file,
+	 * we need to force the user id to one that exists.
+	 */
+	if(permissive && strcmp(aname, "adm") == 0)
+		n = "adm";
+	if((u = name2user(n)) == nil){
+		runlock(&fs->userlk);
+		error(Enouser);
+	}
+	uid = u->id;
+	runlock(&fs->userlk);
+
+	if(m->afid != NOFID){
+		r.data = nil;
+		r.count = 0;
+		if((af = getfid(m->conn, m->afid)) == nil)
+			error(Enofid);
+		authread(af, &r, nil, 0);
+		putfid(af);
+		if(af->uid != uid)
+			error(Ebadu);
+	}else if(!fs->noauth && strcmp(m->uname, "none") != 0)
+		error(Ebadu);
+
+	if(strcmp(m->aname, "dump") == 0){
+		memset(&d, 0, sizeof(d));
+		filldumpdir(&d);
+	}else{
+		if((p = packdkey(dbuf, sizeof(dbuf), -1ULL, "")) == nil)
+			error(Elength);
+		dk.k = dbuf;
+		dk.nk = p - dbuf;
+		t = agetp(&mnt->root);
+		if(!btlookup(t, &dk, &kv, kvbuf, sizeof(kvbuf)))
+			error(Enosnap);
+		kv2dir(&kv, &d);
+	}
+	de = getdent(-1, &d);
+	memset(&f, 0, sizeof(Fid));
+	f.fid = NOFID;
+	f.mnt = mnt;
+	f.qpath = d.qid.path;
+	f.pqpath = d.qid.path;
+	f.mode = -1;
+	f.iounit = m->conn->iounit;
+	f.dent = de;
+	f.uid = uid;
+	f.duid = d.uid;
+	f.dgid = d.gid;
+	f.dmode = d.mode;
+	if(m->aname[0] == '%'){
+		if(!permissive && !ingroup(uid, admid))
+			error(Eperm);
+		f.permit = 1;
+	}
+	if(dupfid(m->conn, m->fid, &f) == nil)
+		error(Efid);
+
+	r.type = Rattach;
+	r.qid = d.qid;
+	respond(m, &r);
+	poperror();
+
+
+Err:	clunkdent(de);
+	clunkmount(mnt);
+}
+
+static int
+findparent(Tree *t, Fid *f, vlong *qpath, char **name, char *buf, int nbuf)
+{
+	char *p, kbuf[Keymax];
+	Kvp kv;
+	Key k;
+
+	p = packsuper(kbuf, sizeof(kbuf), f->pqpath);
+	k.k = kbuf;
+	k.nk = p - kbuf;
+	if(!btlookup(t, &k, &kv, buf, nbuf))
+		return 0;
+	*name = unpackdkey(kv.v, kv.nv, qpath);
+	return 1;
+}
+
/*
 * Twalk: walk m->fid through the elements of m->wname, cloning
 * into m->newfid when the two differ.  Two special cases thread
 * through the loop: children of the dump root are snapshot
 * mounts (resolved with getmount), and ".." at a snapshot's
 * root climbs back into the synthetic dump directory.  Partial
 * walks return the qids matched so far without moving the fid;
 * only a full walk updates (or clones) the fid's mount, dent,
 * and parent bookkeeping.
 */
static void
fswalk(Fmsg *m)
{
	char *p, *name, kbuf[Maxent], kvbuf[Kvmax];
	int duid, dgid, dmode;
	vlong up, prev;
	Fid *o, *f;
	Dent *dent;
	Mount *mnt;
	Tree *t;
	Fcall r;
	Xdir d;
	Kvp kv;
	Key k;
	int i;

	if((o = getfid(m->conn, m->fid)) == nil){
		rerror(m, Enofid);
		return;
	}
	if(waserror()){
		rerror(m, errmsg());
		putfid(o);
		return;
	}
	/* walking an open fid is not allowed */
	if(o->mode != -1)
		error(Einuse);
	t = o->mnt->root;
	mnt = o->mnt;
	up = o->qpath;
	prev = o->qpath;
	rlock(o->dent);
	d = *o->dent;
	runlock(o->dent);
	duid = d.uid;
	dgid = d.gid;
	dmode = d.mode;
	r.type = Rwalk;
	for(i = 0; i < m->nwname; i++){
		/* need search permission in each directory we pass */
		if(fsaccess(o, d.mode, d.uid, d.gid, DMEXEC) != 0)
			error(Eperm);
		name = m->wname[i];
		if(d.qid.path == Qdump){
			/* children of the dump dir are snapshot roots */
			if((mnt = getmount(m->wname[i])) == nil)
				error(Esrch);
			if(waserror()){
				clunkmount(mnt);
				nexterror();
			}
			t = mnt->root;
			p = packdkey(kbuf, sizeof(kbuf), -1ULL, "");
			poperror();
		}else{
			if(strcmp(m->wname[i], "..") == 0){
				if(o->pqpath == Qdump){
					/* ".." out of a snap root: back to the dump dir */
					mnt = fs->snapmnt;
					filldumpdir(&d);
					duid = d.uid;
					dgid = d.gid;
					dmode = d.mode;
					goto Found;
				}
				if(!findparent(t, o, &prev, &name, kbuf, sizeof(kbuf)))
					error(Esrch);
			}
			p = packdkey(kbuf, sizeof(kbuf), prev, name);
		}
		duid = d.uid;
		dgid = d.gid;
		dmode = d.mode;
		k.k = kbuf;
		k.nk = p - kbuf;
		if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf)))
			break;
		kv2dir(&kv, &d);
Found:
		up = prev;
		prev = d.qid.path;
		r.wqid[i] = d.qid;
	}
	r.nwqid = i;
	if(i == 0 && m->nwname != 0)
		error(Esrch);
	f = o;
	/* clone only on a fully successful walk to a new fid */
	if(m->fid != m->newfid && i == m->nwname){
		if((f = dupfid(m->conn, m->newfid, o)) == nil)
			error(Efid);
		putfid(o);
	}
	if(i > 0 && i == m->nwname){
		lock(f);
		if(waserror()){
			if(f != o)
				clunkfid(m->conn, f, nil);
			unlock(f);
			nexterror();
		}
		if(up == Qdump)
			dent = getdent(-1ULL, &d);
		else
			dent = getdent(up, &d);
		if(mnt != f->mnt){
			clunkmount(f->mnt);
			ainc(&mnt->ref);
			f->mnt = mnt;
		}
		clunkdent(f->dent);
		f->qpath = r.wqid[i-1].path;
		f->pqpath = up;
		f->dent = dent;
		f->duid = duid;
		f->dgid = dgid;
		f->dmode = dmode;
		poperror();
		unlock(f);
	}
	respond(m, &r);
	poperror();
	putfid(f);
}
+
+/*
+ * Tstat: packs the fid's directory entry into a stat buffer
+ * and responds with it.  The dent is read-locked only around
+ * the conversion.
+ */
+static void
+fsstat(Fmsg *m)
+{
+	char buf[STATMAX];
+	Fcall r;
+	Fid *f;
+	int n;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	if(waserror()){
+		rerror(m, errmsg());
+		putfid(f);
+		return;
+	}
+	rlock(f->dent);
+	n = dir2statbuf(f->dent, buf, sizeof(buf));
+	runlock(f->dent);
+	/*
+	 * raise only after unlocking: error() unwinds to the
+	 * waserror handler above, which would otherwise leave
+	 * the dent read-locked forever.
+	 */
+	if(n == -1)
+		error(Efs);
+	r.type = Rstat;
+	r.stat = (uchar*)buf;
+	r.nstat = n;
+	respond(m, &r);
+	poperror();
+	putfid(f);
+}
+
+/*
+ * Twstat: validates the requested changes (name, length, mode,
+ * mtime, uid, gid), checks permissions, and applies them.
+ * A rename becomes a clobber+insert of the dent key (plus the
+ * super key for directories); all other changes are packed into
+ * a single Owstat message.  Shrinking a file also queues an
+ * AOclear admin message in *ao so the sweeper frees the
+ * now-unreachable data blocks in the background.
+ */
+static void
+fswstat(Fmsg *m, int id, Amsg **ao)
+{
+	char rnbuf[Kvmax], opbuf[Kvmax], upbuf[Upksz];
+	char *p, strs[65535];
+	int op, nm, rename;
+	vlong oldlen;
+	Qid old;
+	Fcall r;
+	Dent *de;
+	Msg mb[3];
+	Xdir n;
+	Dir d;
+	Tree *t;
+	Fid *f;
+	Key k;
+	User *u;
+
+	*ao = nil;
+	rename = 0;
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	de = f->dent;
+	truncwait(de, id);
+	wlock(de);
+	if(waserror()){
+		rerror(m, errmsg());
+		free(*ao);
+		*ao = nil;
+		goto Err;
+	}
+	if(de->gone)
+		error(Ephase);
+	if((de->qid.type & QTAUTH) || (de->qid.path & Qdump))
+		error(Emode);
+	if(convM2D(m->stat, m->nstat, &d, strs) <= BIT16SZ)
+		error(Edir);
+
+	t = agetp(&f->mnt->root);
+	n = de->Xdir;
+	n.qid.vers++;
+	p = opbuf+1;	/* opbuf[0] holds the op bits; packed fields follow */
+	op = 0;
+
+	/* check validity of updated fields and construct Owstat message */
+	if(d.qid.path != ~0 || d.qid.vers != ~0){
+		if(d.qid.path != de->qid.path)
+			error(Ewstatp);
+		if(d.qid.vers != de->qid.vers)
+			error(Ewstatv);
+	}
+	if(*d.name != '\0'){
+		if(strcmp(d.name, de->name) != 0){
+			rename = 1;
+			if(okname(d.name) == -1)
+				error(Ename);
+			/* refuse to rename over an existing sibling */
+			if(walk1(t, f->dent->up, d.name, &old, &oldlen) == 0)
+				error(Eexist);
+			n.name = d.name;
+		}
+	}
+	if(d.length != ~0){
+		if(d.length < 0)
+			error(Ewstatl);
+		if(d.length != de->length){
+			if(d.length < de->length){
+				/* shrinking: queue a background clear of the tail */
+				if((*ao = malloc(sizeof(Amsg))) == nil)
+					error(Enomem);
+				qlock(&de->trunclk);
+				de->trunc = 1;
+				qunlock(&de->trunclk);
+				aincl(&de->ref, 1);
+				aincl(&f->mnt->ref, 1);
+				(*ao)->op = AOclear;
+				(*ao)->mnt = f->mnt;
+				(*ao)->qpath = f->qpath;
+				(*ao)->off = d.length;
+				(*ao)->end = f->dent->length;
+				(*ao)->dent = de;
+			}
+			de->length = d.length;
+			n.length = d.length;
+			op |= Owsize;
+			PACK64(p, n.length);
+			p += 8;
+		}
+	}
+	if(d.mode != ~0){
+		if((d.mode^de->mode) & DMDIR)
+			error(Ewstatd);
+		if(d.mode & ~(DMDIR|DMAPPEND|DMEXCL|DMTMP|0777))
+			error(Ewstatb);
+		if(d.mode != de->mode){
+			n.mode = d.mode;
+			n.qid.type = d.mode>>24;
+			op |= Owmode;
+			PACK32(p, n.mode);
+			p += 4;
+		}
+	}
+	if(d.mtime != ~0){
+		n.mtime = d.mtime*Nsec;
+		if(n.mtime != de->mtime){
+			op |= Owmtime;
+			PACK64(p, n.mtime);
+			p += 8;
+		}
+	}
+	if(*d.uid != '\0'){
+		rlock(&fs->userlk);
+		u = name2user(d.uid);
+		if(u == nil){
+			runlock(&fs->userlk);
+			error(Enouser);
+		}
+		n.uid = u->id;
+		runlock(&fs->userlk);
+		if(n.uid != de->uid){
+			op |= Owuid;
+			PACK32(p, n.uid);
+			p += 4;
+		}
+	}
+	if(*d.gid != '\0'){
+		rlock(&fs->userlk);
+		u = name2user(d.gid);
+		if(u == nil){
+			runlock(&fs->userlk);
+			error(Enogrp);
+		}
+		n.gid = u->id;
+		runlock(&fs->userlk);
+		if(n.gid != de->gid){
+			op |= Owgid;
+			PACK32(p, n.gid);
+			p += 4;
+		}
+	}
+	/* the muid is always updated to the requesting user */
+	op |= Owmuid;
+	n.muid = f->uid;
+	PACK32(p, n.muid);
+	p += 4;
+
+	/* check permissions */
+	if(rename)
+		if(fsaccess(f, f->dmode, f->duid, f->dgid, DMWRITE) == -1)
+			error(Eperm);
+	if(op & Owsize)
+		if(fsaccess(f, de->mode, de->uid, de->gid, DMWRITE) == -1)
+			error(Eperm);
+	if(op & (Owmode|Owmtime))
+		if(!f->permit && f->uid != de->uid && !groupleader(f->uid, de->gid))
+			error(Ewstato);
+	if(op & Owuid)
+		if(!f->permit)
+			error(Ewstatu);
+	if(op & Owgid)
+		if(!f->permit
+		&& !(f->uid == de->uid && ingroup(f->uid, n.gid))
+		&& !(groupleader(f->uid, de->gid) && groupleader(f->uid, n.gid)))
+			error(Ewstatg);
+
+	/* update directory entry */
+	nm = 0;
+	if(rename && !de->gone){
+		/* rename: remove the old dent key, insert the new one */
+		mb[nm].op = Oclobber;
+		mb[nm].Key = de->Key;
+		mb[nm].v = nil;
+		mb[nm].nv = 0;
+		nm++;
+	
+		mb[nm].op = Oinsert;
+		dir2kv(f->pqpath, &n, &mb[nm], rnbuf, sizeof(rnbuf));
+		k = mb[nm].Key;
+		nm++;
+
+		if(de->qid.type & QTDIR){
+			/* keep the parent (super) key pointing at the new name */
+			packsuper(upbuf, sizeof(upbuf), f->qpath);
+			mb[nm].op = Oinsert;
+			mb[nm].k = upbuf;
+			mb[nm].nk = Upksz;
+			mb[nm].v = mb[nm-1].k;
+			mb[nm].nv = mb[nm-1].nk;
+			nm++;
+		}
+	}else{
+		opbuf[0] = op;
+		mb[nm].op = Owstat;
+		mb[nm].Key = de->Key;
+		mb[nm].v = opbuf;
+		mb[nm].nv = p - opbuf;
+		nm++;
+	}
+	assert(nm <= nelem(mb));
+	upsert(f->mnt, mb, nm);
+
+	de->Xdir = n;
+	if(rename)
+		cpkey(de, &k, de->buf, sizeof(de->buf));
+
+	r.type = Rwstat;
+	respond(m, &r);
+	poperror();
+
+Err:	wunlock(de);
+	putfid(f);
+}
+
+
+/*
+ * Tclunk: drops the fid.  If clunking triggers deferred work
+ * (e.g. removal of an ORCLOSE file), clunkfid hands back an
+ * admin message through *ao for the caller to dispatch.
+ */
+static void
+fsclunk(Fmsg *m, Amsg **ao)
+{
+	Fcall r;
+	Fid *fid;
+
+	if((fid = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	r.type = Rclunk;
+	lock(fid);
+	clunkfid(m->conn, fid, ao);
+	unlock(fid);
+	respond(m, &r);
+	putfid(fid);
+}
+
+/*
+ * Tcreate: creates a file or directory under the fid's current
+ * directory and leaves the fid open on the new entry.  Inserts
+ * the dent key (and, for directories, a super key mapping the
+ * new qid back to its parent entry) in one upsert batch.
+ */
+static void
+fscreate(Fmsg *m)
+{
+	char *p, buf[Kvmax], upkbuf[Keymax], upvbuf[Inlmax];
+	Dent *de;
+	vlong oldlen;
+	Qid old;
+	Fcall r;
+	Msg mb[2];
+	Fid *f;
+	Xdir d;
+	int nm;
+
+	if(okname(m->name) == -1){
+		rerror(m, Ename);
+		return;
+	}
+	if(m->perm & (DMMOUNT|DMAUTH)){
+		rerror(m, Ebotch);
+		return;
+	}
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	lock(f);
+
+	if(waserror()){
+		rerror(m, errmsg());
+		goto Err;
+
+	}
+	if(f->mode != -1){
+		rerror(m, Einuse);
+		goto Out;
+	}
+	de = f->dent;
+	/* the name must not already exist in this directory */
+	if(walk1(f->mnt->root, f->qpath, m->name, &old, &oldlen) == 0){
+		rerror(m, Eexist);
+		goto Out;
+	}
+
+	rlock(de);
+	if(fsaccess(f, de->mode, de->uid, de->gid, DMWRITE) == -1){
+		rerror(m, Eperm);
+		runlock(de);
+		goto Out;
+	}
+
+	/* new entries inherit the parent directory's group */
+	d.gid = de->gid;
+	runlock(de);
+
+	nm = 0;
+	d.qid.type = 0;
+	if(m->perm & DMDIR)
+		d.qid.type |= QTDIR;
+	if(m->perm & DMAPPEND)
+		d.qid.type |= QTAPPEND;
+	if(m->perm & DMEXCL)
+		d.qid.type |= QTEXCL;
+	if(m->perm & DMTMP)
+		d.qid.type |= QTTMP;
+	d.qid.path = aincv(&fs->nextqid, 1);
+	d.qid.vers = 0;
+	d.mode = m->perm;
+	/* mask permissions against the parent's, per intro(5) */
+	if(m->perm & DMDIR)
+		d.mode &= ~0777 | de->mode & 0777;
+	else
+		d.mode &= ~0666 | de->mode & 0666;
+	d.name = m->name;
+	d.atime = nsec();
+	d.mtime = d.atime;
+	d.length = 0;
+	d.uid = f->uid;
+	d.muid = f->uid;
+
+	mb[nm].op = Oinsert;
+	dir2kv(f->qpath, &d, &mb[nm], buf, sizeof(buf));
+	nm++;
+
+	if(m->perm & DMDIR){
+		/* directories get a super key so ".." can be resolved */
+		mb[nm].op = Oinsert;
+		if((p = packsuper(upkbuf, sizeof(upkbuf), d.qid.path)) == nil)
+			sysfatal("ream: pack super");
+		mb[nm].k = upkbuf;
+		mb[nm].nk = p - upkbuf;
+		if((p = packdkey(upvbuf, sizeof(upvbuf), f->qpath, d.name)) == nil)
+			sysfatal("ream: pack super");
+		mb[nm].v = upvbuf;
+		mb[nm].nv = p - upvbuf;
+		nm++;
+	}
+	upsert(f->mnt, mb, nm);
+
+	/* move the fid onto the newly created entry, opened */
+	de = getdent(f->qpath, &d);
+	clunkdent(f->dent);
+	f->mode = mode2bits(m->mode);
+	f->pqpath = f->qpath;
+	f->qpath = d.qid.path;
+	f->dent = de;
+	if(m->mode & ORCLOSE)
+		f->rclose = 1;
+
+	r.type = Rcreate;
+	r.qid = d.qid;
+	r.iounit = f->iounit;
+	respond(m, &r);
+Out:	poperror();
+Err:	unlock(f);
+	putfid(f);
+	return;
+}
+
+/*
+ * Checks whether the fid's entry may be deleted: files always
+ * may; a directory may only be deleted when a prefix scan of
+ * its dent keys finds no children.  Returns nil when deletion
+ * is allowed, or an error string (Enempty) otherwise.
+ */
+static char*
+candelete(Fid *f)
+{
+	char *err, pfx[Dpfxsz];
+	Tree *tree;
+	Scan sc;
+
+	if(!(f->dent->qid.type & QTDIR))
+		return nil;
+
+	tree = agetp(&f->mnt->root);
+	packdkey(pfx, sizeof(pfx), f->qpath, nil);
+	btnewscan(&sc, pfx, sizeof(pfx));
+	btenter(tree, &sc);
+	err = btnext(&sc, &sc.kv) ? Enempty : nil;
+	btexit(&sc);
+	return err;
+}
+
+/*
+ * Tremove: clunks the fid and deletes its entry.  Directories
+ * must be empty; their super key is clobbered alongside the
+ * dent key.  For files, an AOclear admin message is queued in
+ * *ao so the data blocks are freed in the background.
+ */
+static void
+fsremove(Fmsg *m, int id, Amsg **ao)
+{
+	char *e, buf[Kvmax];
+	Fcall r;
+	Msg mb[2];
+	Tree *t;
+	Kvp kv;
+	Fid *f;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	t = f->mnt->root;
+	/* remove always clunks the fid, even if deletion fails below */
+	clunkfid(m->conn, f, nil);
+
+	truncwait(f->dent, id);
+	wlock(f->dent);
+	*ao = nil;
+	if(waserror()){
+		rerror(m, errmsg());
+		free(*ao);
+		*ao = nil;
+		goto Err;
+	}
+	if(f->dent->gone)
+		error(Ephase);
+	/*
+	 * we need a double check that the file is in the tree
+	 * here, because the walk to the fid is done in a reader
+	 * proc that can look it up in a stale version of the
+	 * tree, while we clunk the dent in the mutator proc.
+	 *
+	 * this means we can theoretically get some deletions
+	 * of files that are already gone.
+	 */
+	if(!btlookup(t, &f->dent->Key, &kv, buf, sizeof(buf)))
+		error(Ephase);
+	if((e = candelete(f)) != nil)
+		error(e);
+	/* deleting requires write permission on the parent directory */
+	if(fsaccess(f, f->dmode, f->duid, f->dgid, DMWRITE) == -1)
+		error(Eperm);
+	mb[0].op = Odelete;
+	mb[0].k = f->dent->k;
+	mb[0].nk = f->dent->nk;
+	mb[0].nv = 0;
+
+	if(f->dent->qid.type & QTDIR){
+		/* also drop the super key that maps qid -> parent entry */
+		packsuper(buf, sizeof(buf), f->qpath);
+		mb[1].op = Oclobber;
+		mb[1].k = buf;
+		mb[1].nk = Upksz;
+		mb[1].nv = 0;
+		upsert(f->mnt, mb, 2);
+	}else{
+		/* free the file's data blocks in the background */
+		*ao = emalloc(sizeof(Amsg), 1);
+		aincl(&f->mnt->ref, 1);
+		(*ao)->op = AOclear;
+		(*ao)->mnt = f->mnt;
+		(*ao)->qpath = f->qpath;
+		(*ao)->off = 0;
+		(*ao)->end = f->dent->length;
+		(*ao)->dent = nil;
+		upsert(f->mnt, mb, 1);
+	}
+	f->dent->gone = 1;
+	r.type = Rremove;
+	respond(m, &r);
+	poperror();
+Err:
+	wunlock(f->dent);
+	putfid(f);
+	return;
+}
+
+/*
+ * Topen: opens the fid's entry after checking permissions,
+ * QTEXCL exclusivity, and (for ORCLOSE) deletability.  An
+ * OTRUNC open truncates the file in place via an Owstat
+ * message and queues an AOclear in *ao for background block
+ * freeing.  ao may be nil only when OTRUNC/ORCLOSE are absent:
+ * runfs routes those opens to the mutator, which passes a
+ * valid ao.
+ */
+static void
+fsopen(Fmsg *m, int id, Amsg **ao)
+{
+	char *p, *e, buf[Kvmax];
+	int mbits;
+	Tree *t;
+	Fcall r;
+	Xdir d;
+	Fid *f;
+	Kvp kv;
+	Msg mb;
+
+	mbits = mode2bits(m->mode);
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	if(waserror()){
+		rerror(m, errmsg());
+		putfid(f);
+		return;
+	}
+	if(m->mode & OTRUNC)
+		truncwait(f->dent, id);
+	t = agetp(&f->mnt->root);
+	if((f->qpath & Qdump) != 0){
+		filldumpdir(&d);
+	}else{
+		/* refresh the entry from the tree; it may have changed */
+		if(!btlookup(t, f->dent, &kv, buf, sizeof(buf)))
+			error(Esrch);
+		kv2dir(&kv, &d);
+	}
+	wlock(f->dent);
+	if(waserror()){
+		wunlock(f->dent);
+		nexterror();
+	}
+	if(f->dent->gone)
+		error(Ephase);
+	/* QTEXCL: only one open reference allowed */
+	if(f->dent->qid.type & QTEXCL)
+	if(f->dent->ref != 1)
+		error(Elocked);
+	if(m->mode & ORCLOSE)
+		if((e = candelete(f)) != nil)
+			error(e);
+	if(fsaccess(f, d.mode, d.uid, d.gid, mbits) == -1)
+		error(Eperm);
+	f->dent->length = d.length;
+	poperror();
+	wunlock(f->dent);
+	r.type = Ropen;
+	r.qid = d.qid;
+	r.iounit = f->iounit;
+
+	lock(f);
+	if(f->mode != -1){
+		unlock(f);
+		error(Einuse);
+	}
+	if((m->mode & OTRUNC) && !(f->dent->mode & DMAPPEND)){
+		/* truncate: zero the length now, clear blocks in background */
+		wlock(f->dent);
+
+		if(waserror()){
+			wunlock(f->dent);
+			free(*ao);
+			*ao = nil;
+			nexterror();
+		}
+		*ao = emalloc(sizeof(Amsg), 1);
+		qlock(&f->dent->trunclk);
+		f->dent->trunc = 1;
+		qunlock(&f->dent->trunclk);
+		aincl(&f->dent->ref, 1);
+		aincl(&f->mnt->ref, 1);
+		(*ao)->op = AOclear;
+		(*ao)->mnt = f->mnt;
+		(*ao)->qpath = f->qpath;
+		(*ao)->off = 0;
+		(*ao)->end = f->dent->length;
+		(*ao)->dent = f->dent;
+
+		f->dent->muid = f->uid;
+		f->dent->qid.vers++;
+		f->dent->length = 0;
+
+		/* persist the truncation: size 0, muid = opener */
+		mb.op = Owstat;
+		p = buf;
+		p[0] = Owsize|Owmuid;	p += 1;
+		PACK64(p, 0);		p += 8;
+		PACK32(p, f->uid);	p += 4;
+		mb.k = f->dent->k;
+		mb.nk = f->dent->nk;
+		mb.v = buf;
+		mb.nv = p - buf;
+
+		upsert(f->mnt, &mb, 1);
+		wunlock(f->dent);
+		poperror();
+	}
+	f->mode = mode2bits(m->mode);
+	if(m->mode & ORCLOSE)
+		f->rclose = 1;
+	unlock(f);
+	poperror();
+	respond(m, &r);
+	putfid(f);
+}
+
+/*
+ * Reads the synthetic dump directory: enumerates snapshot
+ * labels (Klabel keys in fs->snap) and converts each into a
+ * stat entry.  The per-fid Scan keeps position across reads;
+ * an entry that did not fit last time is carried over via
+ * s->overflow.  Only sequential reads are supported.
+ */
+static void
+readsnap(Fmsg *m, Fid *f, Fcall *r)
+{
+	char pfx[1], *p;
+	int n, ns;
+	Scan *s;
+	Xdir d;
+
+	s = f->scan;
+	/* directory reads must be sequential or restart at offset 0 */
+	if(s != nil && s->offset != 0 && s->offset != m->offset)
+		error(Edscan);
+	if(s == nil || m->offset == 0){
+		s = emalloc(sizeof(Scan), 1);
+		pfx[0] = Klabel;
+		btnewscan(s, pfx, 1);
+		lock(f);
+		if(f->scan != nil){
+			free(f->scan);
+		}
+		f->scan = s;
+		unlock(f);
+	}
+	if(s->donescan){
+		r->count = 0;
+		return;
+	}
+	p = r->data;
+	n = m->count;
+	d = f->dent->Xdir;
+	if(s->overflow){
+		/* emit the entry held over from the previous read */
+		memcpy(d.name, s->kv.k+1, s->kv.nk-1);
+		d.name[s->kv.nk-1] = 0;
+		d.qid.path = UNPACK64(s->kv.v + 1);
+		if((ns = dir2statbuf(&d, p, n)) == -1){
+			r->count = 0;
+			return;
+		}
+		s->overflow = 0;
+		p += ns;
+		n -= ns;
+	}
+	btenter(&fs->snap, s);
+	while(1){
+		if(!btnext(s, &s->kv))
+			break;
+		memcpy(d.name, s->kv.k+1, s->kv.nk-1);
+		d.name[s->kv.nk-1] = 0;
+		d.qid.path = UNPACK64(s->kv.v + 1);
+		if((ns = dir2statbuf(&d, p, n)) == -1){
+			/* no room: remember this entry for the next read */
+			s->overflow = 1;
+			break;
+		}
+		p += ns;
+		n -= ns;
+	}
+	btexit(s);
+	r->count = p - r->data;
+	return;
+}
+
+/*
+ * Reads a regular directory: scans the tree for dent keys
+ * prefixed with this directory's qid, packing each into a stat
+ * entry.  The per-fid Scan keeps position across reads, with
+ * s->overflow carrying an entry that did not fit last time.
+ * Only sequential reads are supported.
+ */
+static void
+readdir(Fmsg *m, Fid *f, Fcall *r)
+{
+	char pfx[Dpfxsz], *p;
+	int n, ns;
+	Tree *t;
+	Scan *s;
+
+	s = f->scan;
+	t = agetp(&f->mnt->root);
+	/* directory reads must be sequential or restart at offset 0 */
+	if(s != nil && s->offset != 0 && s->offset != m->offset)
+		error(Edscan);
+	if(s == nil || m->offset == 0){
+		s = emalloc(sizeof(Scan), 1);
+		packdkey(pfx, sizeof(pfx), f->qpath, nil);
+		btnewscan(s, pfx, sizeof(pfx));
+		lock(f);
+		if(f->scan != nil)
+			free(f->scan);
+		f->scan = s;
+		unlock(f);
+	}
+	if(s->donescan){
+		r->count = 0;
+		return;
+	}
+	p = r->data;
+	n = m->count;
+	if(s->overflow){
+		/* emit the entry held over from the previous read */
+		if((ns = kv2statbuf(&s->kv, p, n)) == -1){
+			r->count = 0;
+			return;
+		}
+		s->overflow = 0;
+		p += ns;
+		n -= ns;
+	}
+	btenter(t, s);
+	while(1){
+		if(!btnext(s, &s->kv))
+			break;
+		if((ns = kv2statbuf(&s->kv, p, n)) == -1){
+			/* no room: remember this entry for the next read */
+			s->overflow = 1;
+			break;
+		}
+		p += ns;
+		n -= ns;
+	}
+	btexit(s);
+	r->count = p - r->data;
+}
+
+/*
+ * Reads file content into r->data, clamping the request to the
+ * current file length, and accumulates the bytes delivered in
+ * r->count.  Reads proceed block by block through readb until
+ * the request is satisfied or readb returns 0.
+ */
+static void
+readfile(Fmsg *m, Fid *f, Fcall *r)
+{
+	vlong got, left, off;
+	char *dst;
+	Dent *dent;
+	Tree *tree;
+
+	dent = f->dent;
+	rlock(dent);
+	if(m->offset > dent->length){
+		runlock(dent);
+		return;
+	}
+	dst = r->data;
+	left = m->count;
+	off = m->offset;
+	tree = agetp(&f->mnt->root);
+	if(m->offset + m->count > dent->length)
+		left = dent->length - m->offset;
+	while(left != 0){
+		got = readb(tree, f, dst, off, left, dent->length);
+		r->count += got;
+		if(got == 0)
+			break;
+		dst += got;
+		off += got;
+		left -= got;
+	}
+	runlock(dent);
+}
+
+/*
+ * Tread: dispatches to the right reader for the fid's kind:
+ * auth fids, the dump directory, regular directories, or
+ * plain files.
+ */
+static void
+fsread(Fmsg *m)
+{
+	Fcall r;
+	Fid *f;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	r.type = Rread;
+	r.count = 0;
+	/* r.data must be nil before waserror: the handler frees it */
+	r.data = nil;
+	if(waserror()){
+		rerror(m, errmsg());
+		free(r.data);
+		putfid(f);
+		return;
+	}	
+	r.data = emalloc(m->count, 0);
+	if(f->dent->qid.type & QTAUTH)
+		authread(f, &r, r.data, m->count);
+	else if(f->dent->qid.path == Qdump)
+		readsnap(m, f, &r);
+	else if(f->dent->qid.type & QTDIR)
+		readdir(m, f, &r);
+	else
+		readfile(m, f, &r);
+	respond(m, &r);
+	free(r.data);
+	poperror();
+	putfid(f);
+}
+
+/*
+ * Twrite: writes data to a file or auth fid.  The data is
+ * split into per-block Oinsert messages (data block pointers)
+ * plus one trailing Owstat message updating size, mtime, and
+ * muid; the whole batch is applied in a single upsert.  For
+ * DMAPPEND files the write offset is forced to the current
+ * end of file.
+ */
+static void
+fswrite(Fmsg *m, int id)
+{
+	char sbuf[Wstatmax], kbuf[Max9p/Blksz+2][Offksz], vbuf[Max9p/Blksz+2][Ptrsz];
+	Bptr bp[Max9p/Blksz + 2];
+	Msg kv[Max9p/Blksz + 2];
+	vlong n, o, c, w;
+	int i, j;
+	char *p;
+	Fcall r;
+	Tree *t;
+	Fid *f;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	if(!(f->mode & DMWRITE)){
+		rerror(m, Einuse);
+		putfid(f);
+		return;
+	}
+	truncwait(f->dent, id);
+	wlock(f->dent);
+	if(waserror()){
+		rerror(m, errmsg());
+		wunlock(f->dent);
+		putfid(f);
+		return;
+	}
+	if(f->dent->gone)
+		error(Ephase);
+	if(f->dent->qid.type & QTAUTH){
+		authwrite(f, &r, m->data, m->count);
+		goto Out;
+	}
+
+	w = 0;
+	p = m->data;
+	o = m->offset;
+	c = m->count;
+	if(f->dent->mode & DMAPPEND)
+		o = f->dent->length;
+	t = agetp(&f->mnt->root);
+	for(i = 0; i < nelem(kv)-1 && c != 0; i++){
+		assert(i == 0 || o%Blksz == 0);
+		kv[i].op = Oinsert;
+		kv[i].k = kbuf[i];
+		kv[i].nk = sizeof(kbuf[i]);
+		kv[i].v = vbuf[i];
+		kv[i].nv = sizeof(vbuf[i]);
+		if(waserror()){
+			/* unwind: release the blocks staged so far */
+			if(!fs->rdonly)
+				for(j = 0; j < i; j++)
+					freeblk(t, nil, bp[j]);
+			nexterror();
+		}
+		n = writeb(f, &kv[i], &bp[i], p, o, c, f->dent->length);
+		poperror();
+		w += n;
+		p += n;
+		o += n;
+		c -= n;
+	}
+
+	p = sbuf;
+	kv[i].op = Owstat;
+	kv[i].k = f->dent->k;
+	kv[i].nk = f->dent->nk;
+	*p++ = 0;
+	if(o > f->dent->length){ 
+		sbuf[0] |= Owsize;
+		PACK64(p, o);
+		p += 8;
+		/*
+		 * use the actual end of the write (o), which is what
+		 * we pack on disk above: for DMAPPEND files m->offset
+		 * is ignored, so m->offset+m->count would leave the
+		 * in-memory length out of sync with the on-disk size.
+		 */
+		f->dent->length = o;
+	}
+	sbuf[0] |= Owmtime;
+	f->dent->mtime = nsec();
+	PACK64(p, f->dent->mtime);
+	p += 8;
+	sbuf[0] |= Owmuid;
+	PACK32(p, f->uid);
+	p += 4;
+
+	kv[i].v = sbuf;
+	kv[i].nv = p - sbuf;
+	upsert(f->mnt, kv, i+1);
+
+	r.type = Rwrite;
+	r.count = w;
+Out:
+	poperror();
+	respond(m, &r);
+	wunlock(f->dent);
+	putfid(f);	
+}
+
+/*
+ * Tflush: flush serialization is handled by the per-tag locks
+ * in runfs, so by the time this runs there is nothing left to
+ * cancel; just acknowledge.
+ */
+void
+fsflush(Fmsg *m)
+{
+	Fcall resp;
+
+	resp.type = Rflush;
+	respond(m, &resp);
+}
+
+/*
+ * Allocates a new connection for the given read/write fds and
+ * links it onto the global connection list.  Returns nil when
+ * allocation fails.
+ */
+Conn *
+newconn(int rfd, int wfd)
+{
+	Conn *c;
+
+	if((c = mallocz(sizeof(*c), 1)) == nil)
+		return nil;
+	c->rfd = rfd;
+	c->wfd = wfd;
+	c->iounit = Max9p;
+	lock(&fs->connlk);
+	/*
+	 * the read of fs->conns must happen under connlk: reading
+	 * it before taking the lock races with a concurrent
+	 * newconn and can drop a connection from the list.
+	 */
+	c->next = fs->conns;
+	fs->conns = c;
+	unlock(&fs->connlk);
+	return c;
+}
+
+/*
+ * Per-connection dispatch loop: reads 9p messages and routes
+ * them.  Setup messages are handled inline; mutating requests
+ * go to the single mutator channel; reads are fanned out to
+ * reader procs by fid hash so requests on the same fid stay
+ * ordered.  Topen goes to the mutator only when it truncates
+ * or arms ORCLOSE.  The per-tag flush locks serialize Tflush
+ * against the request it is flushing.
+ */
+void
+runfs(int, void *pc)
+{
+	char err[128];
+	RWLock *lk;
+	Amsg *a;
+	Conn *c;
+	Fcall r;
+	Fmsg *m;
+	u32int h;
+
+	c = pc;
+	while(1){
+		if(readmsg(c, &m) < 0){
+			fshangup(c, "read message: %r");
+			return;
+		}
+		if(m == nil)
+			break;
+		if(convM2S(m->buf, m->sz, m) == 0){
+			fshangup(c, "invalid message: %r");
+			return;
+		}
+		if(m->type != Tversion && !c->versioned){
+			fshangup(c, "version required");
+			return;
+		}
+		dprint("← %F\n", &m->Fcall);
+
+		/*
+		 * a flush takes the write side of its target tag's
+		 * lock; every other request holds the read side, so
+		 * the flush waits for in-flight work on that tag.
+		 */
+		if(m->type == Tflush){
+			lk = &fs->flushq[ihash(m->oldtag) % Nflushtab];
+			wlock(lk);
+		}else{
+			lk = &fs->flushq[ihash(m->tag) % Nflushtab];
+			rlock(lk);
+		}
+
+		a = nil;
+		h = ihash(m->fid) % fs->nreaders;
+		switch(m->type){
+		/* sync setup, must not access tree */
+		case Tversion:	fsversion(m);	break;
+		case Tauth:	fsauth(m);	break;
+		case Tflush:	fsflush(m);	break;
+		case Tclunk:	fsclunk(m, &a);	break;
+
+		/* mutators */
+		case Tcreate:	chsend(fs->wrchan, m);	break;
+		case Twrite:	chsend(fs->wrchan, m);	break;
+		case Twstat:	chsend(fs->wrchan, m);	break;
+		case Tremove:	chsend(fs->wrchan, m);	break;
+
+		/* reads */
+		case Tattach:	chsend(fs->rdchan[h], m);	break;
+		case Twalk:	chsend(fs->rdchan[h], m);	break;
+		case Tread:	chsend(fs->rdchan[h], m);	break;
+		case Tstat:	chsend(fs->rdchan[h], m);	break;
+
+		/* both */
+		case Topen:
+			if((m->mode & OTRUNC) || (m->mode & ORCLOSE) != 0)
+				chsend(fs->wrchan, m);
+			else
+				chsend(fs->rdchan[h], m);
+			break;
+
+		default:
+			fprint(2, "unknown message %F\n", &m->Fcall);
+			snprint(err, sizeof(err), "unknown message: %F", &m->Fcall);
+			r.type = Rerror;
+			r.ename = err;
+			respond(m, &r);
+			break;
+		}
+		assert(estacksz() == 0);
+		if(a != nil)
+			chsend(fs->admchan, a);
+	}
+}
+
+/*
+ * The single mutator proc: serializes all tree-modifying
+ * requests under mutlk, bracketing each with an epoch so
+ * concurrent readers see a consistent tree.  Admin messages
+ * produced by a request (truncation, removal cleanup) are
+ * forwarded to the sweeper after the mutation completes.
+ */
+void
+runmutate(int id, void *)
+{
+	Fmsg *m;
+	Amsg *a;
+	Fid *f;
+
+	while(1){
+		a = nil;
+		m = chrecv(fs->wrchan);
+		if(fs->rdonly){
+			/*
+			 * special case: even if Tremove fails, we need
+			 * to clunk the fid.
+			 */
+			if(m->type == Tremove){
+				if((f = getfid(m->conn, m->fid)) == nil){
+					rerror(m, Enofid);
+					continue;
+				}
+				clunkfid(m->conn, f, nil);
+				putfid(f);
+			}
+			rerror(m, Erdonly);
+			continue;
+ 		}
+
+		qlock(&fs->mutlk);
+		epochstart(id);
+		/* any mutation dirties the snapshot tree until the next sync */
+		fs->snap.dirty = 1;
+		switch(m->type){
+		case Tcreate:	fscreate(m);		break;
+		case Twrite:	fswrite(m, id);		break;
+		case Twstat:	fswstat(m, id, &a);	break;
+		case Tremove:	fsremove(m, id, &a);	break;
+		case Topen:	fsopen(m, id, &a);	break;
+		default:	abort();		break;
+		}
+		assert(estacksz() == 0);
+		epochend(id);
+		epochclean();
+		qunlock(&fs->mutlk);
+
+		if(a != nil)
+			chsend(fs->admchan, a);
+	}
+}
+
+/*
+ * A reader proc: services read-only requests from its channel,
+ * holding an epoch across each request so the tree blocks it
+ * touches are not reclaimed underneath it.
+ */
+void
+runread(int id, void *ch)
+{
+	Fmsg *msg;
+
+	for(;;){
+		msg = chrecv(ch);
+		epochstart(id);
+		switch(msg->type){
+		case Tattach:	fsattach(msg);		break;
+		case Twalk:	fswalk(msg);		break;
+		case Tread:	fsread(msg);		break;
+		case Tstat:	fsstat(msg);		break;
+		case Topen:	fsopen(msg, id, nil);	break;
+		}
+		assert(estacksz() == 0);
+		epochend(id);
+	}
+}
+
+/*
+ * Recursively frees the tree rooted at rb, releasing only
+ * blocks newer than pred: blocks with gen <= pred are still
+ * shared with the predecessor snapshot and must survive.
+ * The epoch limbo list is drained after each child to keep
+ * it from growing unboundedly.
+ */
+void
+freetree(Bptr rb, vlong pred)
+{
+	Bptr bp;
+	Blk *b;
+	Kvp kv;
+	int i;
+
+	b = getblk(rb, 0);
+	if(b->type == Tpivot){
+		for(i = 0; i < b->nval; i++){
+			getval(b, i, &kv);
+			bp = unpackbp(kv.v, kv.nv);
+			freetree(bp, pred);
+			/* epochclean requires mutlk; see comment below */
+			qlock(&fs->mutlk);
+			epochclean();
+			qunlock(&fs->mutlk);
+		}
+	}
+	if(rb.gen > pred)
+		freeblk(nil, nil, rb);
+	dropblk(b);
+}
+
+/*
+ * Here, we clean epochs frequently, but we run outside of
+ * an epoch; this is because the caller of this function
+ * has already waited for an epoch to tick over, there's
+ * nobody that can be accessing the tree other than us,
+ * and we just need to keep the limbo list short.
+ *
+ * Because this is the last reference to the tree, we don't
+ * need to hold the mutlk, other than when we free or kill
+ * blocks via epochclean.
+ */
+/*
+ * Frees a dead snapshot's tree: first walks all data (Kdat)
+ * entries and frees data blocks newer than the predecessor
+ * generation, then frees the tree structure itself via
+ * freetree.  See the comment above for the locking rationale.
+ */
+void
+sweeptree(Tree *t)
+{
+	char pfx[1];
+	Scan s;
+	Bptr bp;
+	pfx[0] = Kdat;
+	btnewscan(&s, pfx, 1);
+	btenter(t, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		bp = unpackbp(s.kv.v, s.kv.nv);
+		/* blocks at or below t->pred are shared with the predecessor */
+		if(bp.gen > t->pred)
+			freeblk(nil, nil, bp);
+		qlock(&fs->mutlk);
+		epochclean();
+		qunlock(&fs->mutlk);
+	}
+	btexit(&s);
+	freetree(t->bp, t->pred);
+}
+
+/*
+ * The background admin proc: services AOsync (log compression
+ * and sync, optionally halting), AOsnap (snapshot create or
+ * delete, then sweeping the dead tree), AOrclose (deferred
+ * removal of ORCLOSE files), and AOclear (freeing a file's
+ * data blocks after truncation or removal).  Any failure
+ * flips the fs read-only rather than risking corruption.
+ */
+void
+runsweep(int id, void*)
+{
+	char buf[Kvmax];
+	Bptr bp, nb, *oldhd;
+	vlong off;
+	Tree *t;
+	Arena *a;
+	Amsg *am;
+	Blk *b;
+	Msg m, mb[2];
+	int i, nm;
+
+	if((oldhd = calloc(fs->narena, sizeof(Bptr))) == nil)
+		sysfatal("malloc log heads");
+	while(1){
+		am = chrecv(fs->admchan);
+		if(agetl(&fs->rdonly)){
+			fprint(2, "spurious adm message\n");
+			break;
+		}
+		switch(am->op){
+		case AOsync:
+			tracem("syncreq");
+			if(!fs->snap.dirty && !am->halt)
+				continue;
+			if(agetl(&fs->rdonly))
+				goto Justhalt;
+			if(waserror()){
+				fprint(2, "sync error: %s\n", errmsg());
+				ainc(&fs->rdonly);
+				break;
+			}
+
+			if(am->halt)
+				ainc(&fs->rdonly);
+			qlock(&fs->mutlk);
+			/* compress each arena's free-list log if it has grown */
+			for(i = 0; i < fs->narena; i++){
+				a = &fs->arenas[i];
+				qlock(a);
+				if(a->nlog < a->reserve/(10*Blksz)){
+					/* log still small: mark as not compressed */
+					oldhd[i].addr = -1;
+					oldhd[i].hash = -1;
+					oldhd[i].gen = -1;
+					qunlock(a);
+					continue;
+				}
+				if(waserror()){
+					qunlock(&fs->mutlk);
+					qunlock(a);
+					nexterror();
+				}
+				oldhd[i] = a->loghd;
+				epochstart(id);
+				compresslog(a);
+				qunlock(a);
+				epochend(id);
+				epochclean();
+				poperror();
+			}
+			qunlock(&fs->mutlk);
+			sync();
+
+			/* the old logs are now unreferenced on disk; free them */
+			for(i = 0; i < fs->narena; i++){
+				for(bp = oldhd[i]; bp.addr != -1; bp = nb){
+					qlock(&fs->mutlk);
+					epochstart(id);
+					b = getblk(bp, 0);
+					nb = b->logp;
+					freeblk(nil, b, b->bp);
+					dropblk(b);
+					epochend(id);
+					epochclean();
+					qunlock(&fs->mutlk);
+				}
+			}
+
+Justhalt:
+			if(am->halt){
+				assert(fs->snapdl.hd.addr == -1);
+				assert(fs->snapdl.tl.addr == -1);
+				postnote(PNGROUP, getpid(), "halted");
+				exits(nil);
+			}
+			poperror();
+			break;
+
+		case AOsnap:
+			tracem("snapreq");
+			if(agetl(&fs->rdonly)){
+				fprint(2, "read only fs");
+				continue;
+			}
+			if(waserror()){
+				fprint(2, "taking snap: %s\n", errmsg());
+				ainc(&fs->rdonly);
+				break;
+			}
+
+			qlock(&fs->mutlk);
+			if(waserror()){
+				qunlock(&fs->mutlk);
+				nexterror();
+			}
+			epochstart(id);
+			snapfs(am, &t);
+			epochend(id);
+			poperror();
+			qunlock(&fs->mutlk);
+
+			sync();
+
+			/* snapfs hands back a dead tree to sweep, if any */
+			if(t != nil){
+				epochwait();
+				sweeptree(t);
+				closesnap(t);
+			}
+			poperror();
+			break;
+
+		case AOrclose:
+			/* deferred removal of an ORCLOSE file's entry */
+			nm = 0;
+			mb[nm].op = Odelete;
+			mb[nm].k = am->dent->k;
+			mb[nm].nk = am->dent->nk;
+			mb[nm].nv = 0;
+			nm++;
+			if(am->dent->qid.type & QTDIR){
+				packsuper(buf, sizeof(buf), am->qpath);
+				mb[nm].op = Oclobber;
+				mb[nm].k = buf;
+				mb[nm].nk = Upksz;
+				mb[nm].nv = 0;
+				nm++;
+			}
+			upsert(am->mnt, mb, nm);
+			/* fallthrough */
+		case AOclear:
+			tracem("bgclear");
+			if(waserror()){
+				fprint(2, "clear file %llx: %s\n", am->qpath, errmsg());
+				ainc(&fs->rdonly);
+				break;
+			}
+			if(am->dent != nil)
+				qlock(&am->dent->trunclk);
+			fs->snap.dirty = 1;
+			/* drop the file's data keys one block at a time */
+			for(off = am->off; off < am->end; off += Blksz){
+				qlock(&fs->mutlk);
+				if(waserror()){
+					qunlock(&fs->mutlk);
+					nexterror();
+				}
+				epochstart(id);
+				m.k = buf;
+				m.nk = sizeof(buf);
+				m.op = Oclearb;
+				m.k[0] = Kdat;
+				PACK64(m.k+1, am->qpath);
+				PACK64(m.k+9, off);
+				m.v = nil;
+				m.nv = 0;
+				upsert(am->mnt, &m, 1);
+				epochend(id);
+				epochclean();
+				qunlock(&fs->mutlk);
+				poperror();
+			}
+			if(am->dent != nil){
+				/* wake writers blocked in truncwait */
+				am->dent->trunc = 0;
+				rwakeup(&am->dent->truncrz);
+				qunlock(&am->dent->trunclk);
+				clunkdent(am->dent);
+			}
+			clunkmount(am->mnt);
+			poperror();
+			break;
+		}
+		assert(estacksz() == 0);
+		free(am);
+	}
+}
+
+/*
+ * Queues an AOsnap admin message: snapshot `old' under the
+ * name `new', or delete `old' when new is nil.  flg carries
+ * the snapshot flags (e.g. Lauto for automatic snapshots).
+ */
+void
+snapmsg(char *old, char *new, int flg)
+{
+	Amsg *am;
+
+	am = emalloc(sizeof(Amsg), 1);
+	am->op = AOsnap;
+	am->fd = -1;
+	am->flag = flg;
+	strecpy(am->old, am->old+sizeof(am->old), old);
+	if(new != nil)
+		strecpy(am->new, am->new+sizeof(am->new), new);
+	else
+		am->delete = 1;
+	chsend(fs->admchan, am);
+}
+
+/*
+ * Periodic housekeeping proc: every 5 seconds requests a sync,
+ * and on day/hour/minute boundaries takes automatic snapshots
+ * for mounts flagged Ltsnap, expiring the previous hourly and
+ * minutely snapshots via their rotating name slots.
+ *
+ * NOTE(review): the snapshots are always taken from "main"
+ * even though the names are built from mnt->name; for mounts
+ * other than "main" this looks suspicious -- confirm intended.
+ */
+void
+runtasks(int, void *)
+{
+	char buf[128];
+	Tm now, then;
+	Mount *mnt;
+	int m, h;
+	Amsg *a;
+
+	m = 0;
+	h = 0;
+	tmnow(&then, nil);
+	tmnow(&now, nil);
+	while(1){
+		sleep(5000);
+		if(fs->rdonly)
+			continue;
+		if(waserror()){
+			fprint(2, "task error: %s\n", errmsg());
+			continue;
+		}
+		a = emalloc(sizeof(Amsg), 1);
+		a->op = AOsync;
+		a->halt = 0;
+		a->fd = -1;
+		chsend(fs->admchan, a);
+
+		tmnow(&now, nil);
+		for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
+			if(!(mnt->flag & Ltsnap))
+				continue;
+			if(now.yday != then.yday){
+				/* daily snapshots are kept; no rotation slot */
+				snprint(buf, sizeof(buf),
+					"%s@day.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
+				snapmsg("main", buf, Lauto);
+			}
+			if(now.hour != then.hour){
+				/* delete the hourly snapshot taken 24 slots ago */
+				if(mnt->hourly[h][0] != 0)
+					snapmsg(mnt->hourly[h], nil, 0);
+				snprint(mnt->hourly[h], sizeof(mnt->hourly[h]),
+					"%s@hour.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
+				snapmsg("main", mnt->hourly[h], Lauto);
+			}
+			if(now.min != then.min){
+				/* delete the minutely snapshot taken 60 slots ago */
+				if(mnt->minutely[m][0] != 0)
+					snapmsg(mnt->minutely[m], nil, 0);
+				snprint(mnt->minutely[m], sizeof(mnt->minutely[m]),
+					"%s@minute.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
+				snapmsg("main", mnt->minutely[m], Lauto);
+			}
+		}
+		if(now.hour != then.hour)
+			h = (h+1)%24;
+		if(now.min != then.min)
+			m = (m+1)%60;
+		then = now;
+		poperror();
+	}
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/hash.c
@@ -1,0 +1,153 @@
+// metrohash64.cpp
+//
+// The MIT License (MIT)
+//
+// Copyright (c) 2015 J. Andrew Rogers
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+#define _le64toh(x) \
+	GBIT64((char*)&x)
+
+
+/*
+ * NOTE(review): ROTATE, HALF_ROUND, and DOUBLE_ROUND look like
+ * leftovers from a SipHash implementation; nothing visible in
+ * this file uses them -- confirm and consider removing.
+ */
+#define ROTATE(x, b) (u64int)( ((x) << (b)) | ( (x) >> (64 - (b))) )
+
+#define HALF_ROUND(a,b,c,d,s,t)			\
+	a += b; c += d;				\
+	b = ROTATE(b, s) ^ a;			\
+	d = ROTATE(d, t) ^ c;			\
+	a = ROTATE(a, 32);
+
+#define DOUBLE_ROUND(v0,v1,v2,v3)		\
+	HALF_ROUND(v0,v1,v2,v3,13,16);		\
+	HALF_ROUND(v2,v1,v0,v3,17,21);		\
+	HALF_ROUND(v0,v1,v2,v3,13,16);		\
+	HALF_ROUND(v2,v1,v0,v3,17,21);
+
+#define rotate_right(v, k)\
+	((v >> k) | (v << (64 - k)))
+/*
+ * direct word loads used by metrohash64_1 below; assumes the
+ * target tolerates unaligned access -- TODO confirm for all ports
+ */
+#define read_u64(ptr) \
+	(*(u64int*)ptr)
+#define read_u32(ptr) \
+	(*(u32int*)ptr)
+#define read_u16(ptr) \
+	(*(u16int*)ptr)
+#define read_u8(ptr) \
+	(*(u8int*)ptr)
+
+/*
+ * MetroHash64 variant 1 (J. Andrew Rogers, 2015; see the
+ * license at the top of this file).  Hashes len bytes of key
+ * with the given seed: 32-byte chunks are mixed through four
+ * lanes, then the 16/8/4/2/1-byte tail is folded in, followed
+ * by a final avalanche.
+ */
+uvlong
+metrohash64_1(void * key, u64int len, u32int seed)
+{
+	static const u64int k0 = 0xC83A91E1;
+	static const u64int k1 = 0x8648DBDB;
+	static const u64int k2 = 0x7BDEC03B;
+	static const u64int k3 = 0x2F5870A5;
+
+	const uchar * ptr = key;
+	const uchar * const end = ptr + len;
+	
+	u64int hash = ((((u64int) seed) + k2) * k0) + len;
+	
+	/* bulk phase: four interleaved lanes over 32-byte chunks */
+	if(len >= 32){
+		u64int v[4];
+		v[0] = hash;
+		v[1] = hash;
+		v[2] = hash;
+		v[3] = hash;
+		
+		do{
+			v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2];
+			v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3];
+			v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0];
+			v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1];
+		}
+		while(ptr <= (end - 32));
+
+		/* cross-mix the lanes back into the running hash */
+		v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1;
+		v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0;
+		v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1;
+		v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0;
+		hash += v[0] ^ v[1];
+	}
+	
+	/* tail phase: fold in the remaining 16/8/4/2/1 bytes */
+	if((end - ptr) >= 16){
+		u64int v0 = hash + (read_u64(ptr) * k0); ptr += 8; v0 = rotate_right(v0,33) * k1;
+		u64int v1 = hash + (read_u64(ptr) * k1); ptr += 8; v1 = rotate_right(v1,33) * k2;
+		v0 ^= rotate_right(v0 * k0, 35) + v1;
+		v1 ^= rotate_right(v1 * k3, 35) + v0;
+		hash += v1;
+	}
+	
+	if((end - ptr) >= 8){
+		hash += read_u64(ptr) * k3; ptr += 8;
+		hash ^= rotate_right(hash, 33) * k1;
+		
+	}
+	
+	if((end - ptr) >= 4){
+		hash += read_u32(ptr) * k3; ptr += 4;
+		hash ^= rotate_right(hash, 15) * k1;
+	}
+	
+	if((end - ptr) >= 2){
+		hash += read_u16(ptr) * k3; ptr += 2;
+		hash ^= rotate_right(hash, 13) * k1;
+	}
+	
+	if((end - ptr) >= 1){
+		hash += read_u8 (ptr) * k3;
+		hash ^= rotate_right(hash, 25) * k1;
+	}
+	
+	/* final avalanche */
+	hash ^= rotate_right(hash, 33);
+	hash *= k0;
+	hash ^= rotate_right(hash, 33);
+
+	return hash;
+}
+
+/* Hashes an arbitrary buffer with the file system's fixed seed. */
+uvlong
+bufhash(void *src, usize len)
+{
+	uvlong h;
+
+	h = metrohash64_1(src, len, 0x6765);
+	return h;
+}
+
+/* Hashes a block's full Blksz buffer with the file system's fixed seed. */
+uvlong
+blkhash(Blk *b)
+{
+	uvlong h;
+
+	h = metrohash64_1(b->buf, Blksz, 0x6765);
+	return h;
+}
+
+/*
+ * Mixes a 64-bit integer into a well-distributed hash
+ * (splitmix64-style finalizer), truncated to 32 bits on
+ * return.  Used for fid and tag bucket selection.
+ */
+u32int
+ihash(uvlong x)
+{
+	uvlong v;
+
+	v = x;
+	v ^= v >> 30;
+	v *= 0xbf58476d1ce4e5b9ULL;
+	v ^= v >> 27;
+	v *= 0x94d049bb133111ebULL;
+	v ^= v >> 31;
+	return v;
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/load.c
@@ -1,0 +1,142 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * AVL ordering for free-extent ranges: compare by start
+ * offset only; lengths do not participate in the ordering.
+ */
+static int
+rangecmp(Avl *a, Avl *b)
+{
+	if(((Arange*)a)->off < ((Arange*)b)->off)
+		return -1;
+	if(((Arange*)a)->off > ((Arange*)b)->off)
+		return 1;
+	return 0;
+}
+
+/*
+ * Load an arena's header from disk.  Each arena keeps two copies
+ * of its header in adjacent blocks (hd and hd+Blksz); whichever
+ * copy passes its soft checksum is used, and we fail only when
+ * both are corrupt.  The free tree starts empty here: loadlog()
+ * replays the allocation log afterwards (see loadfs), so a->used
+ * is pessimistically set to the full arena size for now.
+ */
+void
+loadarena(Arena *a, Bptr hd)
+{
+	Blk *h0, *h1, *b;
+	Bptr bp;
+
+	/* try to load block pointers with consistency check */
+	bp = hd;
+	h0 = nil;
+	h1 = nil;
+	if(!waserror()){
+		h0 = getblk(bp, GBsoftchk);
+		poperror();
+	}else
+		print("loading arena primary header: %s\n", errmsg());
+	bp.addr += Blksz;
+	if(!waserror()){
+		h1 = getblk(bp, GBsoftchk);
+		poperror();
+	}else
+		print("loading arena backup header: %s\n", errmsg());
+
+	/* if neither head nor tail is consistent, we're hosed */
+	b = (h0 != nil) ? h0 : h1;
+	if(b == nil)
+		error(Efs);
+
+	/* otherwise, we could have crashed mid-pass, just load the blocks */
+	bp = hd;
+	if(h0 == nil)
+		h0 = getblk(bp, GBnochk);
+	bp.addr += Blksz;
+	if(h1 == nil)
+		h1 = getblk(bp, GBnochk);
+
+	unpackarena(a, b->data, Arenasz);
+	if((a->free = avlcreate(rangecmp)) == nil)
+		error(Enomem);
+	a->h0 = h0;
+	a->h1 = h1;
+	a->used = a->size;
+}
+
+/*
+ * Load the file system from the block device: the superblock
+ * (primary copy in block 0, backup in the last full block),
+ * then each arena header and its free-space log, and finally
+ * the user table from the "adm" snapshot.  Any failure is fatal.
+ */
+void
+loadfs(char *dev)
+{
+	Bptr bhd, btl;
+	Mount *dump;
+	Arena *a;
+	Tree *t;
+	Dir *d;
+	int i;
+	vlong eb;
+
+	if((dump = mallocz(sizeof(*dump), 1)) == nil)
+		sysfatal("malloc: %r");
+	if(waserror())
+		sysfatal("load fs: %s", errmsg());
+	snprint(dump->name, sizeof(dump->name), "dump");
+	dump->ref = 1;
+	dump->gen = -1;
+	dump->root = &fs->snap;
+
+	fs->snapmnt = dump;
+	fs->narena = 1;
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("stat %s: %r", dev);
+	eb = d->length;
+	eb = eb - (eb%Blksz) - Blksz;
+	free(d);	/* bug fix: Dir from dirfstat was leaked */
+	bhd = (Bptr){0, -1, -1};
+	btl = (Bptr){eb, -1, -1};
+	fs->sb0 = getblk(bhd, GBnochk);
+	fs->sb1 = getblk(btl, GBnochk);
+	if(!waserror()){
+		unpacksb(fs, fs->sb0->buf, Blksz);
+		poperror();
+	}else{
+		fprint(2, "unable to load primary superblock: %s\n", errmsg());
+		if(waserror()){
+			/* bug fix: message previously said "primary" for the backup copy */
+			fprint(2, "unable to load backup superblock: %s\n", errmsg());
+			exits("corrupt");
+		}
+		unpacksb(fs, fs->sb1->buf, Blksz);
+		poperror();
+	}
+
+	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
+		sysfatal("malloc: %r");
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadarena(a, fs->arenabp[i]);
+		/* reserve ~0.1% of each arena, clamped to [512KiB, 8MiB] */
+		a->reserve = a->size / 1024;
+		if(a->reserve < 512*KiB)
+			a->reserve = 512*KiB;
+		if(a->reserve > 8*MiB)
+			a->reserve = 8*MiB;
+	}
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		a->logbuf[0] = cachepluck();
+		a->logbuf[1] = cachepluck();
+		a->logbuf[0]->bp = (Bptr){-1, -1, -1};
+		a->logbuf[1]->bp = (Bptr){-1, -1, -1};
+		loadlog(a, a->loghd);
+	}
+
+	if((t = opensnap("adm", nil)) == nil)
+		sysfatal("load users: no adm label");
+	loadusers(2, t);
+	poperror();
+
+	fprint(2, "load %s:\n", dev);
+	fprint(2, "\tsnaptree:\t%B\n", fs->snap.bp);
+	fprint(2, "\tnarenas:\t%d\n", fs->narena);
+	fprint(2, "\tfeatures:\t%lld\n", fs->flag);
+	fprint(2, "\tnextqid:\t%lld\n", fs->nextqid);
+	fprint(2, "\tlastqgen:\t%lld\n", fs->qgen);
+	fprint(2, "\tnextgen:\t%lld\n", fs->nextgen);
+	fprint(2, "\tblocksize:\t%lld\n", Blksz);
+	fprint(2, "\tcachesz:\t%lld MiB\n", fs->cmax*Blksz/MiB);
+	closesnap(t);
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/main.c
@@ -1,0 +1,435 @@
+#include <u.h>
+#include <libc.h>
+#include <avl.h>
+#include <fcall.h>
+#include <bio.h>
+
+#include "dat.h"
+#include "fns.h"
+#include "atomic.h"
+
+Gefs *fs;
+
+int	ream;
+int	grow;
+int	debug;
+int	stdio;
+int	noauth;
+int	nproc;
+int	permissive;
+int	usereserve;
+int	checkonly;
+char	*reamuser;
+char	*dev;
+vlong	tracesz		= 16*MiB;
+vlong	cachesz 	= 512*MiB;
+char	*srvname 	= "gefs";
+int	noneid		= 0;
+int	nogroupid	= 9999;
+int	admid		= -1;
+Blk	*blkbuf;
+Errctx	**errctx;
+
+/*
+ * Append an entry to the global in-memory trace ring.
+ * aincl hands each caller a unique slot index, so concurrent
+ * writers never share an entry; the ring wraps and old
+ * entries are silently overwritten.
+ */
+void
+_trace(char *msg, Bptr bp, vlong v0, vlong v1)
+{
+	Trace *t;
+	ulong idx;
+
+	idx = aincl(&fs->traceidx, 1);
+	t = &fs->trace[(idx-1) % fs->ntrace];
+	strecpy(t->msg, t->msg+sizeof(t->msg), msg);
+	t->tid = (*errctx)->tid;
+	t->qgen = agetv(&fs->qgen);
+	t->bp = bp;
+	t->v0 = v0;
+	t->v1 = v1;
+}
+
+/*
+ * Write "noswap" to this process's ctl file.  On 9front,
+ * noswap processes are exempted from the out-of-memory
+ * killer (hence the name) — TODO confirm against proc(3).
+ * Best-effort: failures are logged, never fatal.
+ */
+static void
+nokill(void)
+{
+	char buf[128];
+	int fd;
+
+	snprint(buf, sizeof(buf), "/proc/%d/ctl", getpid());
+	if((fd = open(buf, OWRITE)) == -1){
+		fprint(2, "nokill: open %s: %r\n", buf);
+		return;
+	}
+	if(fprint(fd, "noswap\n") == -1)
+		fprint(2, "nokill: write %s: %r\n", buf);
+	/* bug fix: fd was leaked on both paths */
+	close(fd);
+}
+
+/*
+ * Best-effort physical memory size in bytes, parsed from the
+ * "N memory" line of /dev/swap.  Falls back to 512MiB if the
+ * file is missing or unparseable; used to size the block cache.
+ */
+static uvlong
+memsize(void)
+{
+	char *ln, *f[2];
+	vlong mem;
+	Biobuf *bp;
+
+	mem = 512*MiB;
+	if((bp = Bopen("/dev/swap", OREAD)) == nil)
+		return mem;
+	while((ln = Brdstr(bp, '\n', 1)) != nil){
+		if(tokenize(ln, f, nelem(f)) != 2)
+			continue;
+		if(strcmp(f[1], "memory") == 0){
+			mem = strtoll(f[0], 0, 0);
+			free(ln);
+			break;
+		}
+		free(ln);
+	}
+	Bterm(bp);
+	return mem;
+}
+
+/*
+ * Push a fresh error label onto this process's error stack and
+ * return it for the caller's setjmp (via the waserror() macro);
+ * paired with poperror()/error()/nexterror().
+ */
+jmp_buf*
+_waserror(void)
+{
+	Errctx *c;
+
+	c = *errctx;
+	c->nerrlab++;
+	assert(c->nerrlab > 0 && c->nerrlab < Estacksz);
+	return c->errlab + (c->nerrlab-1);
+}
+
+/*
+ * Format the error message into the per-process error context
+ * and unwind to the nearest waserror().  If broke is set the fs
+ * is considered wedged beyond recovery: print and abort instead
+ * of unwinding.
+ */
+_Noreturn static void
+errorv(char *fmt, va_list ap, int broke)
+{
+	Errctx *c;
+
+	c = *errctx;
+	vsnprint(c->err, sizeof(c->err), fmt, ap);
+	if(broke){
+		fprint(2, "%s\n", c->err);
+		abort();
+	}
+	assert(c->nerrlab > 0 && c->nerrlab < Estacksz);
+	longjmp(c->errlab[--c->nerrlab], -1);
+}
+
+_Noreturn void
+broke(char *fmt, ...)
+{
+	va_list ap;
+
+	aincl(&fs->rdonly, 1);
+	va_start(ap, fmt);
+	errorv(fmt, ap, 1);
+}
+
+_Noreturn void
+error(char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	errorv(fmt, ap, 0);
+}
+
+_Noreturn void
+nexterror(void)
+{
+	Errctx *c;
+
+	c = *errctx;
+	assert(c->nerrlab > 0 && c->nerrlab < Estacksz);
+	longjmp(c->errlab[--c->nerrlab], -1);
+}
+
+/*
+ * Allocate sz bytes (zeroed when zero != 0), raising Enomem
+ * through the error stack rather than returning nil.
+ */
+void*
+emalloc(usize sz, int zero)
+{
+	void *p;
+
+	if((p = mallocz(sz, zero)) == nil)
+		error(Enomem);
+	setmalloctag(p, getcallerpc(&sz));
+	return p;
+}
+
+static void
+initfs(vlong cachesz)
+{
+	Blk *b;
+
+	if((fs = mallocz(sizeof(Gefs), 1)) == nil)
+		sysfatal("malloc: %r");
+
+	if(tracesz != 0){
+		fs->trace = emalloc(tracesz, 1);
+		fs->ntrace = tracesz/sizeof(Trace);
+	}
+	fs->lrurz.l = &fs->lrulk;
+	fs->syncrz.l = &fs->synclk;
+	fs->noauth = noauth;
+	fs->cmax = cachesz/Blksz;
+	if(fs->cmax > (1<<30))
+		sysfatal("cache too big");
+	if((fs->bcache = mallocz(fs->cmax*sizeof(Bucket), 1)) == nil)
+		sysfatal("malloc: %r");
+	fs->dlcmax = fs->cmax/10;
+	if(fs->dlcmax < 4)
+		fs->dlcmax = 4;
+	if(fs->dlcmax > 512)
+		fs->dlcmax = 512;
+	if((fs->dlcache = mallocz(fs->dlcmax*sizeof(Dlist*), 1)) == nil)
+		sysfatal("malloc: %r");
+
+	blkbuf = sbrk(fs->cmax * sizeof(Blk));
+	if(blkbuf == (void*)-1)
+		sysfatal("sbrk: %r");
+	for(b = blkbuf; b != blkbuf+fs->cmax; b++){
+		b->bp.addr = -1;
+		b->bp.hash = -1;
+		b->magic = Magic;
+		lrutop(b);
+	}
+}
+
+/*
+ * Fork a worker process sharing our address space and run f in it.
+ * Each worker gets a unique thread id and its own per-process
+ * error context via the errctx private; a healthy worker never
+ * returns from f.
+ */
+static void
+launch(void (*f)(int, void *), void *arg, char *text)
+{
+	long pid, id;
+
+	assert(fs->nworker < nelem(fs->lepoch));
+	pid = rfork(RFPROC|RFMEM|RFNOWAIT);
+	if (pid < 0)
+		sysfatal("can't fork: %r");
+	if (pid == 0) {
+		/* child: set up identity and error context, then run the worker loop */
+		nokill();
+		id = aincl(&fs->nworker, 1);
+		if((*errctx = mallocz(sizeof(Errctx), 1)) == nil)
+			sysfatal("malloc: %r");
+		(*errctx)->tid = id;
+		procsetname("%s.%ld", text, id);
+		(*f)(id, arg);
+		exits("child returned");
+	}
+}
+
+/*
+ * Post one end of a fresh pipe into /srv/<name><suff> and return
+ * the other end for us to serve on.  cfd is deliberately never
+ * closed: the srv file is created ORCLOSE, so closing it would
+ * remove the posted service.
+ */
+static int
+postfd(char *name, char *suff, int mode)
+{
+	char buf[80];
+	int fd[2];
+	int cfd;
+
+	if(pipe(fd) < 0)
+		sysfatal("can't make a pipe");
+	snprint(buf, sizeof buf, "/srv/%s%s", name, suff);
+	if((cfd = create(buf, OWRITE|ORCLOSE|OCEXEC, mode)) == -1)
+		sysfatal("create %s: %r", buf);
+	if(fprint(cfd, "%d", fd[0]) == -1)
+		sysfatal("write %s: %r", buf);
+	close(fd[0]);
+	return fd[1];
+}
+
+static void
+runannounce(int, void *arg)
+{
+	char *ann, adir[40], ldir[40];
+	int actl, lctl, fd;
+	Conn *c;
+
+	ann = arg;
+	if((actl = announce(ann, adir)) < 0)
+		sysfatal("announce %s: %r", ann);
+	while(1){
+		if((lctl = listen(adir, ldir)) < 0){
+			fprint(2, "listen %s: %r", adir);
+			break;
+		}
+		fd = accept(lctl, ldir);
+		close(lctl);
+		if(fd < 0){
+			fprint(2, "accept %s: %r", ldir);
+			continue;
+		}
+		if(!(c = newconn(fd, fd))){
+			close(fd);
+			fprint(2, "%r");
+			continue;
+		}
+
+		launch(runfs, c, "netio");
+	}
+	close(actl);
+}
+
+static void
+usage(void)
+{
+	fprint(2, "usage: %s [-SA] [-r user] [-m mem] [-n srv] [-a net]... -f dev\n", argv0);
+	exits("usage");
+}
+
+void
+main(int argc, char **argv)
+{
+	int i, srvfd, ctlfd, nann;
+	char *s, *e, *ann[16];
+	vlong v, memsz;
+	Conn *c;
+
+	nann = 0;
+	memsz = memsize();
+	cachesz = 25*memsz/100;
+	ARGBEGIN{
+	case 'a':
+		if(nann == nelem(ann))
+			sysfatal("too many announces");
+		ann[nann++] = EARGF(usage());
+		break;
+	case 'r':
+		ream = 1;
+		reamuser = EARGF(usage());
+		break;
+	case 'c':
+		checkonly = 1;
+		break;
+	case 'g':
+		grow = 1;
+		break;
+	case 't':
+		tracesz = strtoll(EARGF(usage()), &e, 0);
+		tracesz *= MiB;
+		break;
+	case 'm':
+		v = strtoll(EARGF(usage()), &e, 0);
+		switch(*e){
+		case 'M': case 'm': case 0:
+			cachesz = v*MiB;
+			break;
+		case 'G': case 'g':
+			cachesz = v*GiB;
+			break;
+		case '%':
+			cachesz = v*memsz/100;
+			break;
+		default:
+			sysfatal("unknown suffix %s", e);
+		}
+		break;
+	case 'd':
+		debug++;
+		break;
+	case 'n':
+		srvname = EARGF(usage());
+		break;
+	case 's':
+		stdio = 1;
+		break;
+	case 'A':
+		noauth = 1;
+		break;
+	case 'S':
+		permissive = 1;
+		break;
+	case 'f':
+		dev = EARGF(usage());
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND;
+	if(dev == nil)
+		usage();
+
+	/*
+	 * sanity checks -- I've tuned these to stupid
+	 * values in the past.
+	 */
+	assert(4*Kpmax < Pivspc);
+	assert(2*Msgmax < Bufspc);
+	assert(Treesz < Inlmax);
+
+	initfs(cachesz);
+	initshow();
+	errctx = privalloc();
+	if((*errctx = mallocz(sizeof(Errctx), 1)) == nil)
+		sysfatal("malloc: %r");
+	tmfmtinstall();
+	fmtinstall('H', encodefmt);
+	fmtinstall('B', Bconv);
+	fmtinstall('M', Mconv);
+	fmtinstall('P', Pconv);
+	fmtinstall('K', Kconv);
+	fmtinstall('R', Rconv);
+	fmtinstall('F', fcallfmt);
+	fmtinstall('Q', Qconv);
+
+	if((s = getenv("NPROC")) != nil)
+		nproc = atoi(s);
+	free(s);
+
+	/*
+	 * too few procs, we can't parallelize io,
+	 * too many, we suffer lock contention
+	 */
+	if(nproc < 2)
+		nproc = 2;
+	if(nproc > 8)
+		nproc = 8;
+	if(ream){
+		reamfs(dev);
+		exits(nil);
+	}
+	if(grow){
+		growfs(dev);
+		exits(nil);
+	}
+	if(checkonly){
+		loadfs(dev);
+		if(!checkfs(2))
+			sysfatal("broken fs: %r");
+		exits(nil);
+	}
+
+	rfork(RFNOTEG);
+	nokill();
+	loadfs(dev);
+	fs->wrchan = mkchan(32);
+	fs->admchan = mkchan(32);
+	fs->nsyncers = nproc/2;
+	fs->nreaders = nproc/2;
+	if(fs->nsyncers > fs->narena)
+		fs->nsyncers = fs->narena;
+	for(i = 0; i < fs->nsyncers; i++)
+		qinit(&fs->syncq[i]);
+	if((fs->rdchan = malloc(fs->nreaders*sizeof(Chan*))) == nil)
+		sysfatal("malloc: %r");
+	for(i = 0; i < fs->nreaders; i++)
+		fs->rdchan[i] = mkchan(32);
+	for(i = 0; i < fs->narena; i++)
+		fs->arenas[i].sync = &fs->syncq[i%fs->nsyncers];
+	srvfd = postfd(srvname, "", 0666);
+	ctlfd = postfd(srvname, ".cmd", 0600);
+	launch(runcons, (void*)ctlfd, "ctl");
+	launch(runmutate, nil, "mutate");
+	launch(runsweep, nil, "sweep");
+	launch(runtasks, nil, "tasks");
+	for(i = 0; i < fs->nreaders; i++)
+		launch(runread, fs->rdchan[i], "readio");
+	for(i = 0; i < fs->nsyncers; i++)
+		launch(runsync, &fs->syncq[i], "syncio");
+	for(i = 0; i < nann; i++)
+		launch(runannounce, ann[i], "announce");
+	if(srvfd != -1){
+		if((c = newconn(srvfd, srvfd)) == nil)
+			sysfatal("%r");
+		launch(runfs, c, "srvio");
+	}
+	if(stdio){
+		if((c = newconn(0, 1)) == nil)
+			sysfatal("%r");
+		launch(runfs, c, "stdio");
+	}
+	exits(nil);
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/mkfile
@@ -1,0 +1,39 @@
+</$objtype/mkfile
+
+TARG=gefs
+BIN=/$objtype/bin
+OFILES=\
+	blk.$O\
+	cache.$O\
+	check.$O\
+	cons.$O\
+	dump.$O\
+	error.$O\
+	fs.$O\
+	hash.$O\
+	load.$O\
+	main.$O\
+	pack.$O\
+	ream.$O\
+	snap.$O\
+	tree.$O\
+	user.$O\
+	\
+	atomic-$objtype.$O
+
+HFILES=\
+	dat.h\
+	fns.h\
+	atomic.h
+
+</sys/src/cmd/mkone
+</sys/doc/fonts
+
+%.ps: %.ms
+	{ echo $FONTS; cat $stem.ms } | pic | tbl | eqn | troff -ms | lp -dstdout > $target
+%.pdf: %.ps
+	ps2pdf $stem.ps $stem.pdf
+
+man.install: gefs.4.man gefs.8.man
+	cp gefs.4.man /sys/man/4/gefs
+	cp gefs.8.man /sys/man/8/gefs
--- /dev/null
+++ b/sys/src/cmd/gefs/pack.c
@@ -1,0 +1,512 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Unpack a length-prefixed string: u16 length, bytes, NUL.
+ * Strings are NUL-terminated on disk so we can use them directly
+ * in C; a missing terminator indicates on-disk corruption and
+ * marks the fs broken.  *s points into the buffer (no copy);
+ * returns the first byte past the terminator.
+ */
+char*
+unpackstr(char *p, char *e, char **s)
+{
+	int n;
+
+	if (e - p < 3)
+		error(Elength);
+	n = UNPACK16(p);
+	if(e - p < n + 3 || p[n+2] != 0)
+		broke(Efs);
+	*s = p+2;
+	return p+3+n;
+}
+
+/*
+ * Pack a string as u16 length + bytes + NUL terminator (the
+ * terminator lets readers use the bytes directly as a C string).
+ * Errors with Elength if the destination cannot hold it;
+ * returns the first free byte after the terminator.
+ */
+char*
+packstr(char *p, char *e, char *s)
+{
+	int n;
+
+	n = strlen(s);
+	if (e - p < n+3)
+		error(Elength);
+	PACK16(p, n);		p += 2;
+	memmove(p, s, n);	p += n;
+	*p = 0;			p += 1;
+	return p;
+}
+		
+void
+dir2kv(vlong up, Xdir *d, Kvp *kv, char *buf, int nbuf)
+{
+	char *ek, *ev, *eb;
+
+	ek = packdkey(buf, nbuf, up, d->name);
+	kv->k = buf;
+	kv->nk = ek - buf;
+	eb = buf + nbuf;
+	ev = packdval(ek, eb - ek, d);
+	kv->v = ek;
+	kv->nv = ev - ek;
+}
+
+char*
+packdkey(char *p, int sz, vlong up, char *name)
+{
+	char *ep;
+
+	ep = p + sz;
+	PACK8(p, Kent);	p += 1;
+	PACK64(p, up);	p += 8;
+	if(name != nil)
+		p = packstr(p, ep, name);
+	return p;
+}
+
+/*
+ * Unpack a directory-entry key (Kent tag, parent dir id, name).
+ * Returns a pointer to the name (NUL-terminated in place, no
+ * copy) and stores the parent directory id through *up.
+ */
+char*
+unpackdkey(char *p, int sz, vlong *up)
+{
+	char key, *ep, *name;
+
+	ep = p + sz;
+	assert(sz > 9);
+	key = UNPACK8(p);	p += 1;
+	*up = UNPACK64(p);	p += 8;
+	assert(key == Kent);
+	p = unpackstr(p, ep, &name);
+	assert(p <= ep);
+	return name;
+}
+
+char*
+packsuper(char *p, int sz, vlong up)
+{
+	char *ep;
+
+	ep = p+sz;
+	PACK8(p, Kup);	p += 1;
+	PACK64(p, up);	p += 8;
+	assert(p <= ep);
+	return p;
+}
+
+char*
+packdval(char *p, int sz, Xdir *d)
+{
+	char *e;
+
+	e = p + sz;
+	PACK64(p, d->flag);	p += 8;
+	PACK64(p, d->qid.path);	p += 8;
+	PACK32(p, d->qid.vers);	p += 4;
+	PACK8(p, d->qid.type);	p += 1;
+	PACK32(p, d->mode);	p += 4;
+	PACK64(p, d->atime);	p += 8;
+	PACK64(p, d->mtime);	p += 8;
+	PACK64(p, d->length);	p += 8;
+	PACK32(p, d->uid);	p += 4;
+	PACK32(p, d->gid);	p += 4;
+	PACK32(p, d->muid);	p += 4;
+	assert(p <= e);
+	return p;
+}
+
+void
+kv2dir(Kvp *kv, Xdir *d)
+{
+	char *k, *ek, *v, *ev;
+
+	memset(d, 0, sizeof(Xdir));
+	k = kv->k + 9;
+	ek = kv->k + kv->nk;
+	k = unpackstr(k, ek, &d->name);
+
+	v = kv->v;
+	ev = v + kv->nv;
+	d->flag 	= UNPACK64(v);	v += 8;
+	d->qid.path	= UNPACK64(v);	v += 8;
+	d->qid.vers	= UNPACK32(v);	v += 4;
+	d->qid.type	= UNPACK8(v);	v += 1;
+	d->mode		= UNPACK32(v);	v += 4;
+	d->atime	= UNPACK64(v);	v += 8;
+	d->mtime	= UNPACK64(v);	v += 8;
+	d->length	= UNPACK64(v);	v += 8;
+	d->uid		= UNPACK32(v);	v += 4;
+	d->gid		= UNPACK32(v);	v += 4;
+	d->muid		= UNPACK32(v);	v += 4;
+	assert(v <= ev);
+	if(k != ek)
+		broke(Efs);
+	if(v != ev)
+		broke(Efs);
+}
+
+/*
+ * Marshal an Xdir into a 9p stat buffer.  Unknown uids and muids
+ * fall back to "none"; unknown gids fall back to the nogroup id.
+ * Timestamps are kept in nanoseconds internally and rounded to
+ * seconds for the wire format.  Returns the packed size, or -1
+ * if the entry does not fit in nbuf.
+ */
+int
+dir2statbuf(Xdir *d, char *buf, int nbuf)
+{
+	int sz, nn, nu, ng, nm;
+	vlong atime, mtime;
+	User *u, *g, *m;
+	char *p;
+
+	rlock(&fs->userlk);
+	if((u = uid2user(d->uid)) == nil)
+		u = uid2user(noneid);
+	/* bug fix: the fallback previously assigned to u, leaving g nil */
+	if((g = uid2user(d->gid)) == nil)
+		g = uid2user(nogroupid);
+	if((m = uid2user(d->muid)) == nil)
+		m = uid2user(noneid);
+	if(u == nil || g == nil || m == nil)
+		error(Eperm);
+
+	p = buf;
+	nn = strlen(d->name);
+	nu = strlen(u->name);
+	ng = strlen(g->name);
+	nm = strlen(m->name);
+	atime = (d->atime+Nsec/2)/Nsec;
+	mtime = (d->mtime+Nsec/2)/Nsec;
+	sz = STATFIXLEN + nn + nu + ng + nm;
+	if(sz > nbuf){
+		runlock(&fs->userlk);
+		return -1;
+	}
+	
+	PBIT16(p, sz-2);		p += 2;
+	PBIT16(p, -1 /*type*/);		p += 2;
+	PBIT32(p, -1 /*dev*/);		p += 4;
+	PBIT8(p, d->qid.type);		p += 1;
+	PBIT32(p, d->qid.vers);		p += 4;
+	PBIT64(p, d->qid.path);		p += 8;
+	PBIT32(p, d->mode);		p += 4;
+	PBIT32(p, atime);		p += 4;
+	PBIT32(p, mtime);		p += 4;
+	PBIT64(p, d->length);		p += 8;
+
+	PBIT16(p, nn);			p += 2;
+	memcpy(p, d->name, nn);		p += nn;
+	PBIT16(p, nu);			p += 2;
+	memcpy(p, u->name, nu);		p += nu;
+	PBIT16(p, ng);			p += 2;
+	memcpy(p, g->name, ng);		p += ng;
+	PBIT16(p, nm);			p += 2;
+	memcpy(p, m->name, nm);		p += nm;
+	assert(p - buf == sz);
+	runlock(&fs->userlk);
+	return sz;
+}
+
+int
+kv2statbuf(Kvp *kv, char *buf, int nbuf)
+{
+	Xdir d;
+
+	kv2dir(kv, &d);
+	return dir2statbuf(&d, buf, nbuf);
+}
+
+void
+kv2qid(Kvp *kv, Qid *q)
+{
+	char *v, *e;
+
+	v = kv->v;
+	e = v + kv->nv;
+	q->path = UNPACK64(v);	v += 8;
+	q->vers = UNPACK64(v);	v += 8;
+	assert(v <= e);
+}
+
+void
+kv2dlist(Kvp *kv, Dlist *dl)
+{
+	char *p, *e;
+
+	p = kv->k;
+	e = p + kv->nk;
+	p++;
+	dl->gen = UNPACK64(p);	p += 8;
+	dl->bgen = UNPACK64(p);	p += 8;
+	assert(p <= e);
+	
+	p = kv->v;
+	e = p + kv->nv;
+	dl->hd = unpackbp(p, e-p);	p += Ptrsz;
+	dl->tl = unpackbp(p, e-p);	p += Ptrsz;
+	assert(p <= e);
+}
+
+void
+dlist2kv(Dlist *dl, Kvp *kv, char *buf, int nbuf)
+{
+	char *p, *e;
+
+	assert(nbuf >= Dlkvpsz);
+	p = buf;
+	e = buf+nbuf;
+
+	kv->k = p;
+	*p++ = Kdlist;
+	PACK64(p, dl->gen);	p += 8;
+	PACK64(p, dl->bgen);	p += 8;
+	kv->nk = (p - kv->k);
+	
+	kv->v = p;
+	p = packbp(p, e-p, &dl->hd);
+	p = packbp(p, e-p, &dl->tl);
+	kv->nv = (p - kv->v);
+}
+
+void
+tree2kv(Tree *t, Kvp *kv, char *buf, int nbuf)
+{
+	char *p, *e;
+
+	p = buf;
+	e = buf+nbuf;
+
+	kv->k = p;
+	if((p = packsnap(p, e-p, t->gen)) == nil)
+		abort();
+	kv->nk = p - kv->k;
+
+	kv->v = p;
+	if((p = packtree(p, e-p, t)) == nil)
+		abort();
+	kv->nv = p - kv->v;
+}
+
+void
+retag2kv(vlong gen, vlong link, int dlbl, int dref, Kvp *kv, char *buf, int nbuf)
+{
+	char *p;
+
+	assert(nbuf >= 8+1+1);
+	kv->k = buf;
+	if((p = packsnap(buf, nbuf, gen)) == nil)
+		abort();
+	kv->nk = p - buf;
+
+	kv->v = p;
+	PACK64(p, link);	p += 8;
+	*p = dlbl;		p += 1;
+	*p = dref;		p += 1;
+	kv->nv = p - kv->v;
+}
+
+void
+lbl2kv(char *lbl, vlong gen, uint flg, Kvp *kv, char *buf, int nbuf)
+{
+	char *p;
+	int n;
+
+	n = strlen(lbl);
+	assert(nbuf >= 1+n + 1+8+4);
+
+	p = buf;
+	kv->k = p;
+	p[0] = Klabel;		p += 1;
+	memcpy(p, lbl, n);	p += n;
+	kv->nk = p - kv->k;
+
+	kv->v = p;
+	p[0] = Ksnap;		p += 1;
+	PACK64(p, gen);		p += 8;
+	PACK32(p, flg);		p += 4;
+	kv->nv = p - kv->v;
+}
+
+char*
+packlbl(char *p, int sz, char *name)
+{
+	int n;
+
+	n = strlen(name);
+	assert(sz >= n+1);
+	p[0] = Klabel;		p += 1;
+	memcpy(p, name, n);	p += n;
+	return p;
+}
+
+char*
+packsnap(char *p, int sz, vlong id)
+{
+	assert(sz >= Snapsz);
+	p[0] = Ksnap;		p += 1;
+	PACK64(p, id);		p += 8;
+	return p;
+}
+
+char*
+packbp(char *p, int sz, Bptr *bp)
+{
+	assert(sz >= Ptrsz);
+	PACK64(p, bp->addr);	p += 8;
+	PACK64(p, bp->hash);	p += 8;
+	PACK64(p, bp->gen);	p += 8;
+	return p;
+}
+
+Bptr
+unpackbp(char *p, int sz)
+{
+	Bptr bp;
+
+	assert(sz >= Ptrsz);
+	bp.addr = UNPACK64(p);	p += 8;
+	bp.hash = UNPACK64(p);	p += 8;
+	bp.gen = UNPACK64(p);
+	return bp;
+}
+
+Tree*
+unpacktree(Tree *t, char *p, int sz)
+{
+	assert(sz >= Treesz);
+	memset(t, 0, sizeof(Tree));
+	t->nref = UNPACK32(p);		p += 4;
+	t->nlbl = UNPACK32(p);		p += 4;
+	t->ht = UNPACK32(p);		p += 4;
+	t->flag = UNPACK32(p);		p += 4;
+	t->gen = UNPACK64(p);		p += 8;
+	t->pred = UNPACK64(p);		p += 8;
+	t->succ = UNPACK64(p);		p += 8;
+	t->base = UNPACK64(p);		p += 8;
+	t->bp.addr = UNPACK64(p);	p += 8;
+	t->bp.hash = UNPACK64(p);	p += 8;
+	t->bp.gen = UNPACK64(p);	//p += 8;
+
+	return t;
+}
+
+char*
+packtree(char *p, int sz, Tree *t)
+{
+	assert(sz >= Treesz);
+	PACK32(p, t->nref);	p += 4;
+	PACK32(p, t->nlbl);	p += 4;
+	PACK32(p, t->ht);	p += 4;
+	PACK32(p, t->flag);	p += 4;
+	PACK64(p, t->gen);	p += 8;
+	PACK64(p, t->pred);	p += 8;
+	PACK64(p, t->succ);	p += 8;
+	PACK64(p, t->base);	p += 8;
+	PACK64(p, t->bp.addr);	p += 8;
+	PACK64(p, t->bp.hash);	p += 8;
+	PACK64(p, t->bp.gen);	p += 8;
+	return p;
+}
+
+char*
+packarena(char *p, int sz, Arena *a)
+{
+	char *e;
+
+	assert(sz >= Arenasz);
+	e = p + Arenasz;
+	PACK64(p, a->loghd.addr);	p += 8;	/* freelist addr */
+	PACK64(p, a->loghd.hash);	p += 8;	/* freelist hash */
+	PACK64(p, a->size);		p += 8;	/* arena size */
+	PACK64(p, a->used);		p += 8;	/* arena used */
+	assert(p <= e);
+	return p;
+}
+
+char*
+unpackarena(Arena *a, char *p, int sz)
+{
+	char *e;
+
+	assert(sz >= Arenasz);
+	memset(a, 0, sizeof(*a));
+
+	e = p + Arenasz;
+	a->loghd.addr = UNPACK64(p);	p += 8;
+	a->loghd.hash = UNPACK64(p);	p += 8;
+	a->loghd.gen = -1;		p += 0;
+	a->size = UNPACK64(p);		p += 8;
+	a->used = UNPACK64(p);		p += 8;
+	a->logtl = nil;
+
+	assert(p <= e);
+	return p;
+}
+
+char*
+packsb(char *p0, int sz, Gefs *fi)
+{
+	uvlong h;
+	char *p;
+	int i;
+
+	assert(sz == Blksz);
+	assert(fi->narena < 512);
+	p = p0;
+	memcpy(p, "gefs9.00", 8);	p += 8;
+	PACK32(p, Blksz);		p += 4;
+	PACK32(p, Bufspc);		p += 4;
+	PACK32(p, fi->narena);		p += 4;
+	PACK32(p, fi->snap.ht);		p += 4;
+	PACK64(p, fi->snap.bp.addr);	p += 8;
+	PACK64(p, fi->snap.bp.hash);	p += 8;
+	PACK64(p, fi->snapdl.hd.addr);	p += 8;
+	PACK64(p, fi->snapdl.hd.hash);	p += 8;
+	PACK64(p, fi->snapdl.tl.addr);	p += 8;
+	PACK64(p, fi->snapdl.tl.hash);	p += 8;
+	PACK64(p, fi->flag);		p += 8;
+	PACK64(p, fi->nextqid);		p += 8;
+	PACK64(p, fi->nextgen);		p += 8;
+	PACK64(p, fi->qgen);		p += 8;
+	for(i = 0; i < fi->narena; i++){
+		PACK64(p, fi->arenabp[i].addr);	p += 8;
+		PACK64(p, fi->arenabp[i].hash);	p += 8;
+	}
+	h = bufhash(p0, p - p0);
+	PACK64(p, h);			p += 8;
+	return p;
+}
+
+/*
+ * Unpack and verify a superblock: magic/version string, fixed
+ * fields, per-arena header pointers, then a trailing hash over
+ * everything that precedes it.  Raises an error on a version
+ * mismatch or checksum failure.  Gens are not stored on disk
+ * for these pointers, so they are reset to -1 here.
+ */
+char*
+unpacksb(Gefs *fi, char *p0, int sz)
+{
+	uvlong dh, xh;
+	char *p;
+	int i;
+
+	assert(sz == Blksz);
+	p = p0;
+	if(memcmp(p, "gefs9.00", 8) != 0)
+		error("%s %.8s", Efsvers, p);
+	p += 8;
+	fi->blksz = UNPACK32(p);		p += 4;
+	fi->bufspc = UNPACK32(p);		p += 4;
+	fi->narena = UNPACK32(p);		p += 4;
+	fi->snap.ht = UNPACK32(p);		p += 4;
+	fi->snap.bp.addr = UNPACK64(p);		p += 8;
+	fi->snap.bp.hash = UNPACK64(p);		p += 8;
+	fi->snap.bp.gen = -1;			p += 0;
+	fi->snapdl.hd.addr = UNPACK64(p);	p += 8;
+	fi->snapdl.hd.hash = UNPACK64(p);	p += 8;
+	fi->snapdl.hd.gen = -1;			p += 0;
+	fi->snapdl.gen = -1;			p += 0;
+	fi->snapdl.tl.addr = UNPACK64(p);	p += 8;
+	fi->snapdl.tl.hash = UNPACK64(p);	p += 8;
+	/* bug fix: this previously re-set hd.gen, leaving tl.gen unset */
+	fi->snapdl.tl.gen = -1;			p += 0;
+	fi->flag = UNPACK64(p);			p += 8;
+	fi->nextqid = UNPACK64(p);		p += 8;
+	fi->nextgen = UNPACK64(p);		p += 8;
+	fi->qgen = UNPACK64(p);	p += 8;
+	fi->arenabp = emalloc(fi->narena * sizeof(Bptr), 0);
+	for(i = 0; i < fi->narena; i++){
+		fi->arenabp[i].addr = UNPACK64(p);	p += 8;
+		fi->arenabp[i].hash = UNPACK64(p);	p += 8;
+		fi->arenabp[i].gen = -1;
+	}
+	xh = bufhash(p0, p - p0);
+	dh = UNPACK64(p);			p += 8;
+	if(dh != xh)
+		error("corrupt superblock: %llx != %llx", dh, xh);
+	assert(fi->narena < 256);	/* should be more than anyone needs */
+	return p;
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/ream.c
@@ -1,0 +1,462 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+enum {
+	Qmainroot,
+	Qadmroot,
+	Qadmuser,
+	Nreamqid,
+};
+
+static void
+fillxdir(Xdir *d, vlong qid, char *name, int type, int mode)
+{
+	memset(d, 0, sizeof(Xdir));
+	d->qid = (Qid){qid, 0, type};
+	d->mode = mode;
+	d->atime = 0;
+	d->mtime = 0;
+	d->length = 0;
+	d->name = name;
+	d->uid = -1;
+	d->gid = -1;
+	d->muid = 0;
+}
+
+static void
+initadm(Blk *r, Blk *u, int nu)
+{
+	char *p, kbuf[Keymax], vbuf[Inlmax];
+	Kvp kv;
+	Xdir d;
+
+	/* nb: values must be inserted in key order */
+	kv.k = kbuf;
+	kv.nk = Offksz;
+	kv.v = vbuf;
+	kv.nv = Ptrsz;
+	kbuf[0] = Kdat;
+	PACK64(kbuf+1, (uvlong)Qadmuser);
+	PACK64(kbuf+9, 0ULL);
+	packbp(kv.v, kv.nv, &u->bp);
+	setval(r, &kv);
+
+	fillxdir(&d, Qadmuser, "users", QTFILE, 0664);
+	d.length = nu;
+	dir2kv(Qadmroot, &d, &kv, vbuf, sizeof(vbuf));
+	setval(r, &kv);
+	fillxdir(&d, Qadmroot, "", QTDIR, DMDIR|0775);
+	dir2kv(-1, &d, &kv, vbuf, sizeof(vbuf));
+	setval(r, &kv);
+
+	p = packsuper(kbuf, sizeof(kbuf), 0);
+	kv.k = kbuf;
+	kv.nk = p - kbuf;
+	p = packdkey(vbuf, sizeof(vbuf), -1, "");
+	kv.v = vbuf;
+	kv.nv = p - vbuf;
+	setval(r, &kv);
+}
+
+static void
+initroot(Blk *r)
+{
+	char *p, kbuf[Keymax], vbuf[Inlmax];
+	Kvp kv;
+	Xdir d;
+
+	/* nb: values must be inserted in key order */
+	fillxdir(&d, Qmainroot, "", QTDIR, DMDIR|0775);
+	dir2kv(-1, &d, &kv, vbuf, sizeof(vbuf));
+	setval(r, &kv);
+
+	p = packsuper(kbuf, sizeof(kbuf), 0);
+	kv.k = kbuf;
+	kv.nk = p - kbuf;
+	p = packdkey(vbuf, sizeof(vbuf), -1, "");
+	kv.v = vbuf;
+	kv.nv = p - vbuf;
+	setval(r, &kv);
+}
+
+static void
+initsnap(Blk *s, Blk *r, Blk *a)
+{
+	char *p, *e, buf[Kvmax];
+	Tree t;
+	Kvp kv;
+
+	lbl2kv("adm", 1, Lmut|Ltsnap, &kv, buf, sizeof(buf));
+	setval(s, &kv);
+	lbl2kv("empty", 0, 0, &kv, buf, sizeof(buf));
+	setval(s, &kv);
+	lbl2kv("main", 2, Lmut|Ltsnap, &kv, buf, sizeof(buf));
+	setval(s, &kv);
+
+	p = buf;
+	e = p + sizeof(buf);
+
+	/* empty */
+	kv.k = p;
+	p = packsnap(buf, e - p, 0);
+	kv.nk = p - kv.k;
+	kv.v = p;
+	memset(&t, 0, sizeof(Tree));
+	t.flag = 0;
+	t.nref = 2;
+	t.nlbl = 1;
+	t.ht = 1;
+	t.gen = fs->nextgen++;
+	t.pred = 0;
+	t.succ = 2;
+	t.bp = r->bp;
+	p = packtree(p, e - p, &t);
+	kv.nv = p - kv.v;
+	setval(s, &kv);
+
+	p = buf;
+	e = p + sizeof(buf);
+
+	/* adm */
+	kv.k = p;
+	p = packsnap(p, e - p, 1);
+	kv.nk = p - kv.k;
+	kv.v = p;
+	memset(&t, 0, sizeof(Tree));
+	t.nref = 0;
+	t.nlbl = 1;
+	t.ht = 1;
+	t.gen = fs->nextgen++;
+	t.pred = 0;
+	t.succ = -1;
+	t.bp = a->bp;
+	p = packtree(p, e - p, &t);
+	kv.nv = p - kv.v;
+	setval(s, &kv);
+
+	p = buf;
+	e = p + sizeof(buf);
+
+	/* main */
+	kv.k = p;
+	p = packsnap(buf, e - p, 2);
+	kv.nk = p - kv.k;
+	kv.v = p;
+	memset(&t, 0, sizeof(Tree));
+	t.nref = 0;
+	t.nlbl = 1;
+	t.ht = 1;
+	t.gen = fs->nextgen++;
+	t.pred = 0;
+	t.succ = -1;
+	t.bp = r->bp;
+	p = packtree(p, e - p, &t);
+	kv.nv = p - kv.v;
+	setval(s, &kv);
+}
+
+/*
+ * Format a fresh arena at hdaddr: two redundant header blocks,
+ * followed by an initial allocation log that marks the entire
+ * arena free except the log block itself.  Everything is
+ * finalized and synced to disk before returning.
+ * NOTE(review): b->bp is read after dropblk(b) — assumes dropblk
+ * only releases a reference and the Blk stays valid; confirm.
+ */
+static void
+initarena(Arena *a, uvlong hdaddr, vlong asz)
+{
+	Blk *b, *h0, *h1;
+	uvlong addr;
+	char *p;
+
+	b = cachepluck();
+	addr = hdaddr+2*Blksz;	/* leave room for arena hdr */
+
+	a->loghd.addr = -1;
+	a->loghd.hash = -1;
+	a->loghd.gen = -1;
+
+	memset(b->buf, 0, sizeof(b->buf));
+	b->type = Tlog;
+	b->bp.addr = addr;
+	b->logsz = 0;
+	b->logp = (Bptr){-1, -1, -1};
+	b->data = b->buf + Loghdsz;
+	setflag(b, Bdirty);
+
+	/* initial log: whole arena free, log block itself allocated */
+	p = b->buf + Loghdsz;
+	b->logp = (Bptr){-1, -1, -1};
+	PACK64(p, addr|LogFree);	p += 8;	/* addr */
+	PACK64(p, asz-2*Blksz);		p += 8;	/* len */
+	PACK64(p, b->bp.addr|LogAlloc);	p += 8;	/* addr */
+	PACK64(p, Blksz);		p += 8;	/* len */
+	PACK64(p, (uvlong)LogSync);	p += 8;	/* barrier */
+	b->logsz = p - b->data;
+	finalize(b);
+	syncblk(b);
+	dropblk(b);
+
+	a->loghd = b->bp;
+	a->loghd.gen = -1;
+	a->size = asz;
+	a->used = Blksz;
+
+	h0 = cachepluck();
+	h1 = cachepluck();
+
+	memset(h0->buf, 0, sizeof(h0->buf));
+	h0->type = Tarena;
+	h0->bp.addr = hdaddr;
+	h0->data = h0->buf+2;
+	finalize(h0);
+
+	memset(h1->buf, 0, sizeof(h1->buf));
+	h1->type = Tarena;
+	h1->bp.addr = hdaddr+Blksz;
+	h1->data = h1->buf+2;
+	finalize(h1);
+
+	packarena(h0->data, Arenasz, a);
+	packarena(h1->data, Arenasz, a);
+	finalize(h0);
+	finalize(h1);
+	syncblk(h0);
+	syncblk(h1);
+	a->h0 = h0;
+	a->h1 = h1;
+}
+
+void
+reamfs(char *dev)
+{
+	Blk *sb0, *sb1, *tb, *mb, *ab, *ub;
+	vlong sz, asz, off;
+	Mount *mnt, *adm;
+	Arena *a;
+	char *utab;
+	Dir *d;
+	int i;
+
+	if(waserror())
+		sysfatal("ream %s: %s\n", dev, errmsg());
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("ream: %r");
+	sz = d->length;
+	free(d);
+
+	print("reaming %s\n", dev);
+	if(sz < 128*MiB+Blksz)
+		sysfatal("ream: disk too small");
+	mnt = emalloc(sizeof(Mount), 1);
+	mnt->root = mallocz(sizeof(Tree), 1);
+	adm = mallocz(sizeof(Mount), 1);
+	adm->root = mallocz(sizeof(Tree), 1);
+
+	sz = sz - sz%Blksz - 2*Blksz;
+	fs->narena = (sz + 4096ULL*GiB - 1) / (4096ULL*GiB);
+	if(fs->narena < 8)
+		fs->narena = 8;
+	if(fs->narena >= 32)
+		fs->narena = 32;
+	fs->arenas = emalloc(fs->narena*sizeof(Arena), 1);
+
+
+	off = Blksz;
+	asz = sz/fs->narena;
+	asz = asz - (asz % Blksz) - 2*Blksz;
+
+	sb0 = cachepluck();
+	sb1 = cachepluck();
+	sb0->bp = (Bptr){0, -1, -1};
+	sb1->bp = (Bptr){sz+Blksz, -1, -1};
+
+	fs->arenabp = emalloc(fs->narena * sizeof(Bptr), 1);
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		print("\tarena %d: %lld blocks at %llx\n", i, asz/Blksz, off);
+		initarena(a, off, asz);
+		fs->arenabp[i] = a->h0->bp;
+		off += asz+2*Blksz;
+
+	}
+	
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadarena(a, a->h0->bp);
+		loadlog(a, a->loghd);
+	}
+
+	if((mb = newblk(mnt->root, Tleaf, 0)) == nil)
+		sysfatal("ream: allocate root: %r");
+	holdblk(mb);
+	initroot(mb);
+	finalize(mb);
+	syncblk(mb);
+
+	mnt->root->ht = 1;
+	mnt->root->bp = mb->bp;
+
+	if((ab = newblk(adm->root, Tleaf, 0)) == nil)
+		sysfatal("ream: allocate root: %r");
+	if((ub = newblk(adm->root, Tdat, 0)) == nil)
+		sysfatal("ream: allocate root: %r");
+	holdblk(ab);
+	holdblk(ub);
+	utab = smprint(
+		"-1:adm::%s\n"
+		"0:none::\n"
+		"1:%s:%s:\n",
+		reamuser, reamuser, reamuser);
+	memcpy(ub->data, utab, strlen(utab));
+	finalize(ub);
+	syncblk(ub);
+	initadm(ab, ub, strlen(utab));
+	finalize(ab);
+	syncblk(ab);
+
+	adm->root->ht = 1;
+	adm->root->bp = ab->bp;
+
+	/*
+	 * Now that we have a completely empty fs, give it
+	 * a single snap block that the tree will insert
+	 * into, and take a snapshot as the initial state.
+	 */
+	if((tb = newblk(mnt->root, Tleaf, 0)) == nil)
+		sysfatal("ream: allocate snaps: %r");
+	holdblk(tb);
+	initsnap(tb, mb, ab);
+	finalize(tb);
+	syncblk(tb);
+
+	fs->snap.bp = tb->bp;
+	fs->snap.ht = 1;
+	fs->snapdl.hd.addr = -1;
+	fs->snapdl.hd.hash = -1;
+	fs->snapdl.tl.addr = -1;
+	fs->snapdl.tl.hash = -1;
+	fs->nextqid = Nreamqid;
+
+	dropblk(mb);
+	dropblk(ab);
+	dropblk(ub);
+	dropblk(tb);
+	fs->nextqid = Nreamqid;
+
+	/*
+	 * We need to write back all of the arenas
+	 * with the updated free lists
+	 */
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		finalize(a->logtl);
+		syncblk(a->logtl);
+		packarena(a->h0->data, Blksz, a);
+		finalize(a->h0);
+		syncblk(a->h0);
+		packarena(a->h1->data, Blksz, a);
+		finalize(a->h1);
+		syncblk(a->h1);
+		fs->arenabp[i] = a->h0->bp;
+		dropblk(a->h0);
+		dropblk(a->h1);
+	}
+
+	dropblk(mb);
+	dropblk(ab);
+	dropblk(ub);
+	dropblk(tb);
+
+	/*
+	 * Finally, write back the superblock and backup
+	 * superblock.
+	 */
+	packsb(sb0->buf, Blksz, fs);
+	packsb(sb1->buf, Blksz, fs);
+	finalize(sb0);
+	finalize(sb1);
+	syncblk(sb0);
+	syncblk(sb1);
+	dropblk(sb0);
+	dropblk(sb1);
+	free(mnt);
+	poperror();
+}
+
+void
+growfs(char *dev)
+{
+	vlong oldsz, newsz, asz, off, eb;
+	int i, narena;
+	Arena *a;
+	Bptr bp;
+	Dir *d;
+
+	if(waserror())
+		sysfatal("grow %s: %s\n", dev, errmsg());
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("ream: %r");
+
+	bp = (Bptr){0, -1, -1};
+	fs->sb0 = getblk(bp, GBnochk);
+	unpacksb(fs, fs->sb0->buf, Blksz);
+	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
+		sysfatal("malloc: %r");
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadarena(a, fs->arenabp[i]);
+		fs->arenabp[i] = a->h0->bp;
+	}
+	a = &fs->arenas[fs->narena-1];
+	oldsz = a->h0->bp.addr + a->size + 2*Blksz;
+	newsz = d->length - d->length%Blksz - 2*Blksz;
+	if(newsz - oldsz < 64*MiB)
+		sysfatal("new arenas too small (%lld < %lld), not growing", newsz - oldsz, 64*MiB);
+	asz = (newsz - oldsz)/4;
+	asz = asz - asz % Blksz - 2*Blksz;
+	narena = fs->narena + 4;
+	assert(oldsz % Blksz == 0);
+	if((fs->arenas = realloc(fs->arenas, narena*sizeof(Arena))) == nil)
+		error(Enomem);
+	if((fs->arenabp = realloc(fs->arenabp, narena*sizeof(Bptr))) == nil)
+		error(Enomem);
+
+	off = oldsz;
+	for(i = fs->narena; i < narena; i++){
+		a = &fs->arenas[i];
+		print("\tnew arena %d: adding %lld blocks at %llx\n", i, asz/Blksz, off);
+		initarena(&fs->arenas[i], off, asz);
+		loadarena(a, a->h0->bp);
+		loadlog(a, a->loghd);
+		a = &fs->arenas[i];
+		packarena(a->h0->data, Blksz, a);
+		packarena(a->h1->data, Blksz, a);
+		finalize(a->h0);
+		finalize(a->h1);
+		syncblk(a->h0);
+		syncblk(a->h1);
+
+		fs->arenabp[i] = a->h0->bp;
+		off += asz+2*Blksz;
+	}
+	fs->narena = narena;
+	packsb(fs->sb0->buf, Blksz, fs);
+	finalize(fs->sb0);
+	syncblk(fs->sb0);
+	/*
+	 * We're being a bit tricksy here: because we're on a bigger
+	 * partition, we don't know where the end is; just load the
+	 * first block, and patch the address in to the right place
+	 * when we write it back.
+	 */
+	eb = d->length;
+	eb = eb - (eb%Blksz) - Blksz;
+	fs->sb0->bp = (Bptr){eb, -1, -1};
+	packsb(fs->sb0->buf, Blksz, fs);
+	finalize(fs->sb0);
+	syncblk(fs->sb0);
+	free(d);
+	poperror();
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/snap.c
@@ -1,0 +1,617 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "atomic.h"
+#include "dat.h"
+#include "fns.h"
+
+static void
+dlflush(Dlist *dl)
+{
+	char kvbuf[512];
+	Msg m;
+
+	if(dl->ins == nil)
+		return;
+	traceb("dlflush", dl->ins->bp);
+	enqueue(dl->ins);
+	dropblk(dl->ins);
+	dl->hd = dl->ins->bp;
+	if(dl->tl.addr == dl->hd.addr)
+		dl->tl = dl->hd;
+	dl->ins = nil;
+	/* special case: the snap dlist has gen -1, skip it */
+	if(dl->gen != -1){
+		m.op = Oinsert;
+		dlist2kv(dl, &m, kvbuf, sizeof(kvbuf));
+		btupsert(&fs->snap, &m, 1);
+	}
+}
+
+static void
+dlcachedel(Dlist *dl, int hdel)
+{
+	uint h;
+	Dlist *d, **p;
+
+	h = ihash(dl->gen) ^ ihash(dl->bgen);
+	if(hdel){
+		p = &fs->dlcache[h % fs->dlcmax];
+		for(d = *p; d != nil; d = d->chain){
+			if(d->gen == dl->gen && d->bgen == dl->bgen)
+				break;
+			p = &d->chain;
+		}
+		if(d != nil)
+			*p  = d->chain;
+	}
+	if(dl == fs->dlhead)
+		fs->dlhead = dl->cnext;
+	if(dl == fs->dltail)
+		fs->dltail = dl->cprev;
+	if(dl->cnext != nil)
+		dl->cnext->cprev = dl->cprev;
+	if(dl->cprev != nil)
+		dl->cprev->cnext = dl->cnext;
+	dl->cnext = nil;
+	dl->cprev = nil;
+}
+
+static Dlist*
+dlcacheget(vlong gen, vlong bgen)
+{
+	Dlist *dl;
+	uint h;
+
+	h = ihash(gen) ^ ihash(bgen);
+	for(dl = fs->dlcache[h % fs->dlcmax]; dl != nil; dl = dl->chain)
+		if(dl->gen == gen && dl->bgen == bgen)
+			break;
+	if(dl != nil)
+		dlcachedel(dl, 0);
+	return dl;
+}
+
+static Dlist*
+getdl(vlong gen, vlong bgen)
+{
+	char kbuf[Dlksz], kvbuf[Dlkvpsz];
+	Dlist *dl, **p;
+	uint h;
+	Msg m;
+	Kvp kv;
+	Key k;
+
+	if((dl = dlcacheget(gen, bgen)) != nil)
+		return dl;
+	dl = emalloc(sizeof(Dlist), 1);
+	if(waserror()){
+		free(dl);
+		nexterror();
+	}
+	kbuf[0] = Kdlist;
+	PACK64(kbuf+1, gen);
+	PACK64(kbuf+9, bgen);
+	k.k = kbuf;
+	k.nk = sizeof(kbuf);
+
+	/* load up existing dlist */
+	if(btlookup(&fs->snap, &k, &kv, kvbuf, sizeof(kvbuf))){
+		kv2dlist(&kv, dl);
+		goto Found;
+	}
+
+	/* create a new one if it didn't exist */
+	dl->gen = gen;
+	dl->bgen = bgen;
+	dl->hd.addr = -1;
+	dl->tl.addr = -1;
+	dl->ins = nil;
+
+	m.op = Oinsert;
+	dlist2kv(dl, &m, kvbuf, sizeof(kvbuf));
+	btupsert(&fs->snap, &m, 1);
+Found:
+	poperror();
+	h = ihash(gen) ^ ihash(bgen);
+	p = &fs->dlcache[h % fs->dlcmax];
+	dl->chain = *p;
+	*p = dl;
+	return dl;
+}
+
+void
+putdl(Dlist *dl)
+{
+	Dlist *dt;
+
+	if(dl->gen == -1)
+		return;
+	dlcachedel(dl, 0);
+	while(fs->dltail != nil && fs->dlcount >= fs->dlcmax){
+		dt = fs->dltail;
+		dlflush(dt);
+		dlcachedel(dt, 1);
+		dropblk(dt->ins);
+		free(dt);
+	}
+
+	dl->cprev = nil;
+	dl->cnext = fs->dlhead;
+	if(fs->dltail == nil)
+		fs->dltail = dl;
+	if(fs->dlhead != nil)
+		fs->dlhead->cprev = dl;
+	fs->dlhead = dl;
+}
+
+void
+freedl(Dlist *dl, int docontents)
+{
+	char buf[Kvmax];
+	Arena *a;
+	Qent qe;
+	Bptr bp;
+	Msg m;
+	Blk *b;
+	char *p;
+
+	bp = dl->hd;
+	if(dl->gen != -1){
+		m.op = Odelete;
+		dlist2kv(dl, &m, buf, sizeof(buf));
+		btupsert(&fs->snap, &m, 1);
+	}
+	while(bp.addr != -1){
+		b = getblk(bp, 0);
+		/*
+		 * Because these deadlists are dead-dead at this point,
+		 * they'll never be read from again; we can avoid worrying
+		 * about deferred reclamation, and queue them up to be freed
+		 * directly, which means we don't need to worry about waiting
+		 * for a quiescent state, and the associated out-of-block
+		 * deadlocks that come with it.
+		 */
+		if(docontents){
+			for(p = b->data; p != b->data+b->logsz; p += 8){
+				qe.op = Qfree;
+				qe.bp.addr = UNPACK64(p);
+				qe.bp.hash = -1;
+				qe.bp.gen = -1;
+				qe.b = nil;
+				a = getarena(qe.bp.addr);
+				qput(a->sync, qe);
+				traceb("dlclear", qe.bp);
+			}
+		}
+		bp = b->logp;
+		qe.op = Qfree;
+		qe.bp = b->bp;
+		qe.b = b;
+		a = getarena(qe.bp.addr);
+		qput(a->sync, qe);
+		traceb("dlfreeb", qe.bp);
+	}
+}
+
+static void
+mergedl(vlong merge, vlong gen, vlong bgen)
+{
+	char buf[2][Kvmax];
+	Dlist *d, *m;
+	Msg msg[2];
+	Blk *b;
+
+	d = nil;
+	m = nil;
+	if(waserror()){
+		putdl(m);
+		putdl(d);
+		nexterror();
+	}
+	d = getdl(merge, bgen);
+	m = getdl(gen, bgen);
+	assert(d != m);
+	/*
+	 * If the dest dlist didn't exist,
+	 * just move the merge dlist over
+	 * and be done with it, otherwise
+	 * chain onto the existing dlist
+	 * tail.
+	 */
+	if(d->hd.addr == -1){
+		assert(d->ins == nil);
+		d->hd = m->hd;
+		d->tl = m->tl;
+		d->ins = m->ins;
+		if(d->ins != nil)
+			holdblk(d->ins);
+	}else{
+		if(m->ins != nil){
+			enqueue(m->ins);
+			dropblk(m->ins);
+			m->ins = nil;
+		}
+		b = getblk(d->tl, 0);
+		b->logp = m->hd;
+		assert(d->hd.addr != m->hd.addr);
+		finalize(b);
+		syncblk(b);
+		dropblk(b);
+	}
+	msg[0].op = Odelete;
+	dlist2kv(m, &msg[0], buf[0], sizeof(buf[0]));
+	msg[1].op = Oinsert;
+	dlist2kv(d, &msg[1], buf[1], sizeof(buf[1]));
+	btupsert(&fs->snap, msg, 2);
+	putdl(m);
+	putdl(d);
+	poperror();
+}
+
+static void
+reclaimblocks(vlong gen, vlong succ, vlong prev)
+{
+	char pfx[9];
+	Dlist dl;
+	Scan s;
+
+	pfx[0] = Kdlist;
+	PACK64(pfx+1, gen);
+	btnewscan(&s, pfx, sizeof(pfx));
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		kv2dlist(&s.kv, &dl);
+
+		if(succ != -1 && dl.bgen <= prev)
+			mergedl(succ, dl.gen, dl.bgen);
+		else if(dl.bgen <= prev)
+			mergedl(prev, dl.gen, dl.bgen);
+		else
+			freedl(&dl, 1);
+	}
+	btexit(&s);
+	if(succ != -1){
+		pfx[0] = Kdlist;
+		PACK64(pfx+1, succ);
+		btnewscan(&s, pfx, sizeof(pfx));
+		btenter(&fs->snap, &s);
+		while(1){
+			if(!btnext(&s, &s.kv))
+				break;
+			kv2dlist(&s.kv, &dl);
+			if(dl.bgen > prev)
+				freedl(&dl, 1);
+		}
+		btexit(&s);
+	}
+}
+
+/*
+ * Removes a label from a snapshot, allowing
+ * it to be reclaimed if it is not a direct
+ * predecessor of more than one other snapshot.
+ *
+ * If it has one successor and no label, then
+ * it will be merged with that successor.
+ */
+void
+delsnap(Tree *t, vlong succ, char *name)
+{
+	char *p, buf[4][Kvmax];
+	int nm, deltree;
+	Mount *mnt;
+	Msg m[4];
+
+	nm = 0;
+	deltree = 0;
+	if(name != nil){
+		if(strcmp(name, "dump") == 0
+		|| strcmp(name, "empty") == 0
+		|| strcmp(name, "adm") == 0)
+			error(Ename);
+
+		m[nm].op = Odelete;
+		m[nm].k = buf[nm];
+		p = packlbl(buf[nm], sizeof(buf[nm]), name);
+		m[nm].nk = p - m[nm].k;
+		m[nm].v = nil;
+		m[nm].nv = 0;
+		t->nlbl--;
+		nm++;
+	}
+ 
+	if(t->nlbl == 0 && t->nref <= 1){
+		deltree = 1;
+		m[nm].op = Orelink;
+		retag2kv(t->pred, succ, 0, 0, &m[nm], buf[nm], sizeof(buf[nm]));
+		nm++;
+		if(t->succ != -1){
+			m[nm].op = Oreprev;
+			retag2kv(t->succ, t->pred, 0, 0, &m[nm], buf[nm], sizeof(buf[nm]));
+			nm++;
+		}
+		m[nm].op = Odelete;
+		m[nm].k = buf[nm];
+		p = packsnap(buf[nm], sizeof(buf[nm]), t->gen);
+		m[nm].nk = p - m[nm].k;
+		m[nm].v = nil;
+		m[nm].nv = 0;
+		nm++;
+	}
+	assert(nm <= nelem(m));
+	dlsync();
+	btupsert(&fs->snap, m, nm);
+	if(deltree){
+		reclaimblocks(t->gen, succ, t->pred);
+		for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
+			if(mnt->root->gen == t->succ)
+				mnt->root->pred = t->pred;
+			if(mnt->root->gen == t->pred)
+				mnt->root->succ = t->succ;
+		}
+	}
+}
+
+/*
+ * Attaches a label to a tree, incrementing
+ * its reference count. This labelled snapshot
+ * will show up in the dump.
+ */
+void
+tagsnap(Tree *t, char *name, int flg)
+{
+	char buf[3][Kvmax];
+	Msg m[3];
+	Tree *n;
+	int i;
+
+	if(strcmp(name, "dump") == 0
+	|| strcmp(name, "empty") == 0
+	|| strcmp(name, "adm") == 0)
+		error(Ename);
+
+	i = 0;
+	n = nil;
+	if(waserror()){
+		free(n);
+		nexterror();
+	}
+	if(flg & Lmut){
+		n = emalloc(sizeof(Tree), 1);
+		n->memref = 1;
+		n->dirty = 0;
+		n->nlbl = 1;
+		n->nref = 0;
+		n->ht = t->ht;
+		n->bp = t->bp;
+		n->succ = -1;
+		n->pred = t->gen;
+		n->base = t->gen;
+		n->gen = aincv(&fs->nextgen, 1);
+		n->memgen = aincv(&fs->nextgen, 1);
+
+		t->nref++;
+		m[i].op = Orelink;
+		retag2kv(t->gen, t->succ, 0, 1, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+		m[i].op = Oinsert;
+		lbl2kv(name, n->gen, flg, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+		m[i].op = Oinsert;
+		tree2kv(n, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+	}else{
+		t->nlbl++;
+		m[i].op = Orelink;
+		retag2kv(t->gen, t->succ, 1, 0, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+
+		m[i].op = Oinsert;
+		t->pred = t->gen;
+		t->nlbl++;
+		lbl2kv(name, t->gen, flg, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+	}
+	btupsert(&fs->snap, m, i);
+	poperror();
+	free(n);
+}
+
+/*
+ * Updates a snapshot; keeps the generation the same if possible,
+ * otherwise moves to a new generation. A snapshot may only stay
+ * at the same generation as long as it is at the tip of a snapshot
+ * list; once it's observable by a derived snapshot it must be
+ * immutable.
+ */
+void
+updatesnap(Tree **r, Tree *o, char *lbl, int flg)
+{
+	char buf[4][Kvmax];
+	Msg m[4];
+	Tree *t;
+	int i;
+
+	if(!o->dirty)
+		return;
+
+	traceb("updatesnap", o->bp);
+	/* update the old kvp */
+	o->nlbl--;
+	o->nref++;
+
+	/* create the new one */
+
+	t = emalloc(sizeof(Tree), 1);
+	if(waserror()){
+		free(t);
+		nexterror();
+	}
+	t->memref = 1;
+	t->dirty = 0;
+
+	t->nlbl = 1;
+	t->nref = 0;
+	t->ht = o->ht;
+	t->bp = o->bp;
+	t->succ = -1;
+	t->base = o->base;
+	t->gen = o->memgen;
+	t->memgen = aincv(&fs->nextgen, 1);
+
+	i = 0;
+	m[i].op = Orelink;
+	if(o->nlbl == 0 && o->nref == 1){
+		t->pred = o->pred;
+		retag2kv(t->pred, t->gen, 0, 0, &m[i], buf[i], sizeof(buf[i]));
+	}else{
+		t->pred = o->gen;
+		retag2kv(t->pred, t->gen, -1, 1, &m[i], buf[i], sizeof(buf[i]));
+	}
+	i++;
+
+	m[i].op = Oinsert;
+	tree2kv(t, &m[i], buf[i], sizeof(buf[i]));
+	i++;
+	m[i].op = Oinsert;
+	lbl2kv(lbl, t->gen, flg, &m[i], buf[i], sizeof(buf[i]));
+	i++;
+	btupsert(&fs->snap, m, i);
+
+	/* only update the dirty status after we sync */
+	o->dirty = 0;
+
+	/* this was the last ref to the snap */
+	if(o->nlbl == 0 && o->nref == 1)
+		delsnap(o, t->gen, nil);
+	closesnap(o);
+	asetp(r, t);
+	poperror();
+}
+
+/*
+ * open snapshot by label, returning a tree.
+ */
+Tree*
+opensnap(char *label, int *flg)
+{
+	char *p, buf[Kvmax];
+	Tree *t;
+	vlong gen;
+	Kvp kv;
+	Key k;
+
+	/* Klabel{"name"} => Ksnap{id} */
+	if((p = packlbl(buf, sizeof(buf), label)) == nil)
+		return nil;
+	k.k = buf;
+	k.nk = p - buf;
+	if(!btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)))
+		return nil;
+	assert(kv.nv == 1+8+4);
+	gen = UNPACK64(kv.v + 1);
+	if(flg != nil)
+		*flg = UNPACK32(kv.v + 1+8);
+
+	t = mallocz(sizeof(Tree), 1);
+	if(waserror()){
+		free(t);
+		nexterror();
+	}
+	p = packsnap(buf, sizeof(buf), gen);
+	k.k = buf;
+	k.nk = p - buf;
+	if(!btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)))
+		broke(Efs);
+	unpacktree(t, kv.v, kv.nv);
+	t->memref = 1;
+	t->memgen = aincv(&fs->nextgen, 1);
+	poperror();
+	return t;
+}
+
+/*
+ * close snapshot, flushing and freeing in-memory
+ * representation.
+ */
+void
+closesnap(Tree *t)
+{
+	Bfree *f;
+
+	if(t == nil || adec(&t->memref) != 0)
+		return;
+	f = malloc(sizeof(Bfree));
+	f->op = DFtree;
+	f->t = t;
+	limbo(f);
+}
+
+void
+dlsync(void)
+{
+	Dlist *dl, *n;
+
+	tracem("dlsync");
+	dlflush(&fs->snapdl);
+	for(dl = fs->dlhead; dl != nil; dl = n){
+		n = dl->cnext;
+		dlflush(dl);
+	}
+}
+
+/*
+ * Marks a block as killed by the tree
+ * t, which means that it will be free
+ * for use after t is reclaimed.
+ *
+ * t must be an active snapshot with
+ * no successors.
+ */
+void
+killblk(Tree *t, Bptr bp)
+{
+	Dlist *dl;
+	Blk *b;
+	char *p;
+
+	/* 
+	 * When we have a forked snap, blocks allocated before the fork
+	 * are the responsibility of the other chain; in this chain, we
+	 * leak it and let the last reference in the other chain clean up
+	 */
+	if(t == &fs->snap)
+		dl = &fs->snapdl;
+	else if(bp.gen > t->base)
+		dl = getdl(t->memgen, bp.gen);
+	else
+		return;
+	if(waserror()){
+		putdl(dl);
+		nexterror();
+	}
+	if(dl->ins == nil || Logspc - dl->ins->logsz < Logslop){
+		b = newblk(&fs->snap, Tdlist, 0);
+		if(dl->ins != nil){
+			enqueue(dl->ins);
+			dropblk(dl->ins);
+		}
+		if(dl->tl.addr == -1)
+			dl->tl = b->bp;
+		b->logp = dl->hd;
+		dl->hd = b->bp;
+		dl->ins = b;
+		cacheins(b);
+	}
+	p = dl->ins->data + dl->ins->logsz;
+	dl->ins->logsz += 8;
+	setflag(dl->ins, Bdirty);
+	PACK64(p, bp.addr);
+	poperror();
+	putdl(dl);
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/tree.c
@@ -1,0 +1,1540 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+typedef struct Path	Path;
+
+struct Path {
+	/* Flowing down for flush */
+	Msg	*ins;	/* inserted values, bounded by lo..hi */
+	Blk	*b;	/* to shadow */
+	int	idx;	/* insert at */
+	int	lo;	/* key range */
+	int	hi;	/* key range */
+	int	sz;	/* size of range */
+
+	/* Flowing up from flush */
+	int	op;	/* change done along path */
+	Blk	*m;	/* node merged against, for post-update free */
+	Blk	*nl;	/* new left */
+	Blk	*nr;	/* new right, if we split or rotated */
+	int	midx;	/* modification index */
+	int	npull;	/* number of messages successfully pulled */
+	int	pullsz;	/* size of pulled messages */
+};
+
+#define efreeblk(t, b) do { \
+	if(b != nil) \
+		freeblk(t, b, b->bp); \
+	} while(0)
+
+static void
+stablesort(Msg *m, int nm)
+{
+	int i, j;
+	Msg t;
+
+	for(i = 1; i < nm; i++){
+		for(j = i; j > 0; j--){
+			if(keycmp(&m[j-1], &m[j]) <= 0)
+				break;
+			t = m[j-1];
+			m[j-1] = m[j];
+			m[j] = t;
+		}
+	}
+}
+
+void
+cpkey(Key *dst, Key *src, char *buf, int nbuf)
+{
+	assert(src->nk <= nbuf);
+	memmove(buf, src->k, src->nk);
+	dst->k = buf;
+	dst->nk = src->nk;
+}
+
+void
+cpkvp(Kvp *dst, Kvp *src, char *buf, int nbuf)
+{
+	assert(src->nk+src->nv <= nbuf);
+	memmove(buf, src->k, src->nk);
+	memmove(buf+ src->nk, src->v, src->nv);
+	dst->k = buf;
+	dst->nk = src->nk;
+	dst->v = buf+src->nk;
+	dst->nv = src->nv;
+}
+
+int
+keycmp(Key *a, Key *b)
+{
+	int c, n;
+
+	n = (a->nk < b->nk) ? a->nk : b->nk;
+	if((c = memcmp(a->k, b->k, n)) != 0)
+		return c < 0 ? -1 : 1;
+	if(a->nk < b->nk)
+		return -1;
+	else if(a->nk > b->nk)
+		return 1;
+	else
+		return 0;
+}
+
+static int
+msgsz(Msg *m)
+{
+	/* disp + op + klen + key + vlen + v */
+	return 2+1+2+m->nk +2+ m->nv;
+}
+
+static int
+valsz(Kvp *kv)
+{
+	return 2 + 2+kv->nk + 2+kv->nv;
+}
+
+void
+getval(Blk *b, int i, Kvp *kv)
+{
+	char *p;
+	int o;
+
+	assert(i >= 0 && i < b->nval);
+	p = b->data + 2*i;
+	o = UNPACK16(p);	p = b->data + o;
+	kv->nk = UNPACK16(p);	p += 2;
+	kv->k = p;		p += kv->nk;
+	kv->nv = UNPACK16(p);	p += 2;
+	kv->v = p;
+}
+
+Bptr
+getptr(Kvp *kv, int *fill)
+{
+	assert(kv->nv == Ptrsz || kv->nv == Ptrsz+2);
+	*fill = UNPACK16(kv->v + Ptrsz);
+	return unpackbp(kv->v, kv->nv);
+}
+
+/* Exported for reaming */
+void
+setval(Blk *b, Kvp *kv)
+{
+	int off, spc;
+	char *p;
+
+	spc = (b->type == Tleaf) ? Leafspc : Pivspc;
+	b->valsz += 2 + kv->nk + 2 + kv->nv;
+	off = spc - b->valsz;
+
+	assert(2*(b->nval+1) + b->valsz <= spc);
+	assert(2*(b->nval+1) <= off);
+
+	p = b->data + 2*b->nval;
+	PACK16(p, off);
+
+	p = b->data + off;
+	PACK16(p, kv->nk);		p += 2;
+	memmove(p, kv->k, kv->nk);	p += kv->nk;
+	PACK16(p, kv->nv);		p += 2;
+	memmove(p, kv->v, kv->nv);
+
+	b->nval++;
+}
+
+static void
+setptr(Blk *b, Key *k, Bptr bp, int fill)
+{
+	char *p, buf[Ptrsz+2];
+	Kvp kv;
+
+	kv.k = k->k;
+	kv.nk = k->nk;
+	kv.v = buf;
+	kv.nv = sizeof(buf);
+	p = packbp(buf, sizeof(buf), &bp);
+	PACK16(p, fill);
+	setval(b, &kv);
+}
+
+static void
+setmsg(Blk *b, Msg *m)
+{
+	char *p;
+	int o;
+
+	assert(b->type == Tpivot);
+	b->bufsz += msgsz(m)-2;
+
+	p = b->data + Pivspc + 2*b->nbuf;
+	o = Bufspc - b->bufsz;
+	PACK16(p, o);
+
+	p = b->data + Pivspc + o;
+	*p = m->op;			p += 1;
+	PACK16(p, m->nk);		p += 2;
+	memmove(p, m->k, m->nk);	p += m->nk;
+	PACK16(p, m->nv);		p += 2;
+	memmove(p, m->v, m->nv);
+
+	b->nbuf++;
+}
+
+void
+getmsg(Blk *b, int i, Msg *m)
+{
+	char *p;
+	int o;
+
+	assert(b->type == Tpivot);
+	assert(i >= 0 && i < b->nbuf);
+	p = b->data + Pivspc + 2*i;
+	o = UNPACK16(p);
+	p = b->data + Pivspc + o;
+	m->op = *p;		p += 1;
+	m->nk = UNPACK16(p);	p += 2;
+	m->k = p;		p += m->nk;
+	m->nv = UNPACK16(p);	p += 2;
+	m->v = p;
+}
+
+static int
+bufsearch(Blk *b, Key *k, Msg *m, int *same)
+{
+	int lo, hi, ri, mid, r;
+	Msg cmp;
+
+	ri = -1;
+	lo = 0;
+	hi = b->nbuf-1;
+	while(lo <= hi){
+		mid = (hi + lo) / 2;
+		getmsg(b, mid, &cmp);
+		r = keycmp(k, &cmp);
+		switch(r){
+		case -1:
+			hi = mid-1;
+			break;
+		case 0:
+			ri = mid;
+			hi = mid-1;
+			break;
+		case 1:
+			lo = mid+1;
+			break;
+		}
+	}
+	/*
+	 * we can have duplicate messages, and we
+	 * want to point to the first of them:
+	 * scan backwards.
+	 */
+	*same = 0;
+	if(ri == -1)
+		ri = lo-1;
+	else
+		*same = 1;
+	if(m != nil && ri >= 0)
+		getmsg(b, ri, m);
+	return ri;
+}
+
+static int
+blksearch(Blk *b, Key *k, Kvp *rp, int *same)
+{
+	int lo, hi, ri, mid, r;
+	Kvp cmp;
+
+	ri = -1;
+	lo = 0;
+	hi = b->nval-1;
+	while(lo <= hi){
+		mid = (hi + lo) / 2;
+		getval(b, mid, &cmp);
+		r = keycmp(k, &cmp);
+		switch(r){
+		case -1:
+			hi = mid-1;
+			break;
+		case 0:
+			ri = mid;
+			hi = mid-1;
+			break;
+		case 1:
+			lo = mid+1;
+			break;
+		}
+	}
+	*same = 0;
+	if(ri == -1)
+		ri = lo-1;
+	else
+		*same = 1;
+	if(ri >= 0)
+		getval(b, ri, rp);
+	return ri;
+}
+
+static int
+buffill(Blk *b)
+{
+	assert(b->type == Tpivot);
+	return 2*b->nbuf + b->bufsz;
+}
+
+static int
+filledbuf(Blk *b, int nmsg, int needed)
+{
+	assert(b->type == Tpivot);
+	return 2*(b->nbuf+nmsg) + b->bufsz + needed > Bufspc;
+}
+
+static int
+filledleaf(Blk *b, int needed)
+{
+	assert(b->type == Tleaf);
+	return 2*(b->nval+1) + b->valsz + needed > Leafspc;
+}
+
+static int
+filledpiv(Blk *b, int reserve)
+{
+	/* 
+	 * We need to guarantee there's room for one message
+	 * at all times, so that splits along the whole path
+	 * have somewhere to go as they propagate up.
+	 */
+	assert(b->type == Tpivot);
+	return 2*(b->nval+1) + b->valsz + reserve*Kpmax > Pivspc;
+}
+
+static void
+copyup(Blk *n, Path *pp, int *nbytes)
+{
+	Kvp kv;
+	Msg m;
+
+	/*
+	 * It's possible for the previous node to have
+	 * been fully cleared out by a large number of
+	 * delete messages, so we need to check if
+	 * there's anything in it to copy up.
+	 */
+	if(pp->nl->nval > 0){
+		getval(pp->nl, 0, &kv);
+		if(pp->nl->nbuf > 0){
+			getmsg(pp->nl, 0, &m);
+			if(keycmp(&kv, &m) > 0)
+				kv.Key = m.Key;
+		}
+		setptr(n, &kv, pp->nl->bp, blkfill(pp->nl));
+		if(nbytes != nil)
+			*nbytes += valsz(&kv);
+	}
+	if(pp->nr != nil && pp->nr->nval > 0){
+		getval(pp->nr, 0, &kv);
+		if(pp->nr->nbuf > 0){
+			getmsg(pp->nr, 0, &m);
+			if(keycmp(&kv, &m) > 0)
+				kv.Key = m.Key;
+		}
+		setptr(n, &kv, pp->nr->bp, blkfill(pp->nr));
+		if(nbytes != nil)
+			*nbytes += valsz(&kv);
+	}
+}
+
+static void
+statupdate(Kvp *kv, Msg *m)
+{
+	int op;
+	char *p;
+	Xdir d;
+
+	p = m->v;
+	op = *p++;
+	kv2dir(kv, &d);
+	/* bump version */
+	d.qid.vers++;
+	if(op & Owsize){
+		d.length = UNPACK64(p);
+		p += 8;
+	}
+	if(op & Owmode){
+		d.mode = UNPACK32(p);
+		d.qid.type = d.mode>>24;
+		p += 4;
+	}
+	if(op & Owmtime){
+		d.mtime = UNPACK64(p);
+		p += 8;
+	}
+	if(op & Owatime){
+		d.atime = UNPACK64(p);
+		p += 8;
+	}
+	if(op & Owuid){
+		d.uid = UNPACK32(p);
+		p += 4;
+	}
+	if(op & Owgid){
+		d.gid = UNPACK32(p);
+		p += 4;
+	}
+	if(op & Owmuid){
+		d.muid = UNPACK32(p);
+		p += 4;
+	}
+	if(p != m->v + m->nv)
+		fatal("malformed stat: kv=%P, m=%M\n", kv, m);
+	if(packdval(kv->v, kv->nv, &d) == nil)
+		fatal("repacking dir failed\n");
+}
+
+static int
+apply(Kvp *kv, Msg *m, char *buf, int nbuf)
+{
+	vlong *pv;
+	char *p;
+	Tree t;
+
+	switch(m->op){
+	case Oclearb:
+	case Odelete:
+	case Oclobber:
+		assert(keycmp(kv, m) == 0);
+		return 0;
+	case Oinsert:
+		cpkvp(kv, m, buf, nbuf);
+		return 1;
+	case Owstat:
+		assert(keycmp(kv, m) == 0);
+		statupdate(kv, m);
+		return 1;
+	case Orelink:
+	case Oreprev:
+		unpacktree(&t, kv->v, kv->nv);
+		p = m->v;
+		pv = (m->op == Orelink) ? &t.succ : &t.pred;
+		*pv = UNPACK64(p);	p += 8;
+		t.nlbl += *p;		p++;
+		t.nref += *p;		p++;
+		assert(t.nlbl >= 0 && t.nref >= 0);
+		assert(p == m->v + m->nv);
+		packtree(kv->v, kv->nv, &t);
+		return 1;
+	default:
+		fatal("invalid op %d\n", m->op);
+	}
+	return 0;
+}
+
+static int
+pullmsg(Path *p, int i, Kvp *v, Msg *m, int *full, int spc)
+{
+	if(i < 0 || i >= p->hi || *full)
+		return -1;
+
+	if(p->ins != nil)
+		*m = p->ins[i];
+	else
+		getmsg(p->b, i, m);
+	if(msgsz(m) <= spc)
+		return (v == nil) ? 0 : keycmp(v, m);
+	*full = 1;
+	return -1;
+}
+
+/*
+ * Creates a new block with the contents of the old
+ * block. When copying the contents, it repacks them
+ * to minimize the space used, and applies the changes
+ * pending from the downpath blocks.
+ *
+ * When pidx != -1, 
+ */
+static void
+updateleaf(Tree *t, Path *up, Path *p)
+{
+	char buf[Msgmax];
+	int i, j, c, ok, full, spc;
+	Blk *b, *n;
+	Bptr bp;
+	Msg m;
+	Kvp v;
+
+	i = 0;
+	j = up->lo;
+	b = p->b;
+	/*
+	 * spc is the amount of room we have
+	 * to copy data down from the parent; it's
+	 * necessarily a bit conservative, because
+	 * deletion messages don't take space -- but
+	 * we don't know how what the types of all
+	 * messages are.
+	 */
+	full = 0;
+	spc = Leafspc - blkfill(b);
+	n = newblk(t, b->type, 0);
+	assert(i >= 0 && j >= 0);
+	while(i < b->nval || j < up->hi){
+		if(i >= b->nval)
+			c = 1;
+		else{
+			c = -1;
+			getval(p->b, i, &v);
+			if(j < up->hi){
+				if(up->ins != nil)
+					m = up->ins[j];
+				else
+					getmsg(up->b, j, &m);
+				if(msgsz(&m) <= spc)
+					c = keycmp(&v, &m);
+				else
+					full = 1;
+			}
+		}
+		switch(c){
+		/* Value before message: just copy value */
+		case -1:
+			i++;
+			setval(n, &v);
+			break;
+		/* Value merges with message sequence */
+		case 0:
+			i++;
+			j++;
+			cpkvp(&v, &v, buf, sizeof(buf));
+			if(v.nk > 0 && v.k[0] == Kdat)
+			if(m.op == Oclearb
+			|| m.op == Oinsert
+			|| m.op == Odelete){
+				bp = unpackbp(v.v, v.nv);
+				freeblk(t, nil, bp);
+			}
+			ok = apply(&v, &m, buf, sizeof(buf));
+			goto Copyloop;
+		/* Message before value: Insert message sequence */
+		case 1:
+			j++;
+			cpkvp(&v, &m, buf, sizeof(buf));
+			ok = 0;
+			if(m.op != Oclearb && m.op != Oclobber){
+				spc -= valsz(&m);
+				p->pullsz += msgsz(&m);
+				ok = 1;
+			}
+			goto Copyloop;
+		Copyloop:
+			while(j < up->hi){
+				if(pullmsg(up, j, &v, &m, &full, spc) != 0)
+					break;
+				if(ok && v.nk > 0 && v.k[0] == Kdat)
+				if(m.op == Oclearb
+				|| m.op == Oinsert
+				|| m.op == Odelete){
+					bp = unpackbp(v.v, v.nv);
+					freeblk(t, nil, bp);
+				}
+				p->pullsz += msgsz(&m);
+				ok = apply(&v, &m, buf, sizeof(buf));
+				j++;
+			}
+			if(ok)
+				setval(n, &v);
+			break;
+		}
+	}
+	p->npull = (j - up->lo);
+	p->nl = n;
+}
+
+/*
+ * Creates a new block with the contents of the old
+ * block. When copying the contents, it repacks them
+ * to minimize the space used, and applies the changes
+ * pending from the downpath blocks.
+ *
+ * When pidx != -1, 
+ */
+static void
+updatepiv(Tree *t, Path *up, Path *p, Path *pp)
+{
+	char buf[Msgmax];
+	int i, j, sz, full, spc;
+	Blk *b, *n;
+	Msg m, u;
+
+	b = p->b;
+	n = newblk(t, b->type, 0);
+	for(i = 0; i < b->nval; i++){
+		if(pp != nil && i == p->midx){
+			copyup(n, pp, nil);
+			if(pp->op == POrot || pp->op == POmerge)
+				i++;
+		}else{
+			getval(b, i, &m);
+			setval(n, &m);
+		}
+	}
+	i = 0;
+	j = up->lo;
+	sz = 0;
+	full = 0;
+	spc = Bufspc - buffill(b);
+	if(pp != nil)
+		spc += pp->pullsz;
+	while(i < b->nbuf){
+		if(i == p->lo)
+			i += pp->npull;
+		if(i == b->nbuf)
+			break;
+		getmsg(b, i, &m);
+		switch(pullmsg(up, j, &m, &u, &full, spc - sz)){
+		case -1:
+		case 0:
+			setmsg(n, &m);
+			i++;
+			break;
+		case 1:
+			cpkvp(&m, &u, buf, sizeof(buf));
+			while(pullmsg(up, j, &m, &u, &full, spc) == 0){
+				setmsg(n, &u);
+				sz = msgsz(&u);
+				p->pullsz += sz;
+				spc -= sz;
+				j++;
+			}
+		}
+	}
+	while(j < up->hi){
+		pullmsg(up, j, nil, &u, &full, spc);
+		if(full)
+			break;
+		setmsg(n, &u);
+		sz = msgsz(&u);
+		p->pullsz += sz;
+		spc -= sz;
+		j++;
+	}
+	p->npull = (j - up->lo);
+	p->nl = n;
+}
+
+/*
+ * Splits a node, returning the block that msg
+ * would be inserted into. Split must never
+ * grow the total height of the tree by more than 1.
+ */
+static void
+splitleaf(Tree *t, Path *up, Path *p, Kvp *mid)
+{
+	char buf[Msgmax];
+	Blk *b, *d, *l, *r;
+	int full, copied, spc, ok, halfsz;
+	int i, j, c;
+	Bptr bp;
+	Msg m;
+	Kvp v;
+
+	/*
+	 * If the block one entry up the
+	 * p is nil, we're at the root,
+	 * so we want to make a new block.
+	 */
+	b = p->b;
+	l = nil;
+	r = nil;
+	if(waserror()){
+		efreeblk(t, l);
+		efreeblk(t, r);
+		nexterror();
+	}
+	l = newblk(t, b->type, 0);
+	r = newblk(t, b->type, 0);
+
+	d = l;
+	i = 0;
+	j = up->lo;
+	full = 0;
+	copied = 0;
+	halfsz = (2*b->nval + b->valsz + up->sz) / 2;
+	if(halfsz > Leafspc/2)
+		halfsz = Leafspc/2;
+	spc = Leafspc - (halfsz + Msgmax);
+	assert(b->nval >= 4);
+	while(i < b->nval){
+		/*
+		 * We're trying to balance size,
+		 * but we need at least 2 nodes
+		 * in each half of the split if
+		 * we want a valid tree.
+		 */
+		if(d == l)
+		if((i == b->nval-2) || (i >= 2 && copied >= halfsz)){
+			d = r;
+			spc = Leafspc - (halfsz + Msgmax);
+			getval(b, i, mid);
+		}
+		getval(b, i, &v);
+ 		c = pullmsg(up, j, &v, &m, &full, spc);
+		switch(c){
+		case -1:
+			i++;
+			setval(d, &v);
+			copied += valsz(&v);
+			break;
+		case 0:
+			i++;
+			j++;
+			cpkvp(&v, &v, buf, sizeof(buf));
+			copied += valsz(&v);
+			if(v.nk > 0 && v.k[0] == Kdat)
+			if(m.op == Oclearb
+			|| m.op == Oinsert
+			|| m.op == Odelete){
+				bp = unpackbp(v.v, v.nv);
+				freeblk(t, nil, bp);
+			}
+			ok = apply(&v, &m, buf, sizeof(buf));
+			goto Copyloop;
+		case 1:
+			j++;
+			cpkvp(&v, &m, buf, sizeof(buf));
+			copied += valsz(&v);
+			ok = 0;
+			if(m.op != Oclearb && m.op != Oclobber){
+				spc -= valsz(&m);
+				p->pullsz += msgsz(&m);
+				ok = 1;
+			}
+			goto Copyloop;
+		Copyloop:
+			while(j < up->hi){
+				if(pullmsg(up, j, &v, &m, &full, spc) != 0)
+					break;
+				if(ok && v.nk > 0 && v.k[0] == Kdat)
+				if(m.op == Oclearb
+				|| m.op == Oinsert
+				|| m.op == Odelete){
+					bp = unpackbp(v.v, v.nv);
+					freeblk(t, nil, bp);
+				}
+				p->pullsz += msgsz(&m);
+				ok = apply(&v, &m, buf, sizeof(buf));
+				j++;
+			}
+			if(ok)
+				setval(d, &v);
+			break;
+		}
+	}
+	p->npull = (j - up->lo);
+	p->op = POsplit;
+	p->nl = l;
+	p->nr = r;
+	poperror();
+}
+
+/*
+ * Splits a node, returning the block that msg
+ * would be inserted into. Split must never
+ * grow the total height of the tree by more
+ * than one.
+ */
+static void
+splitpiv(Tree *t, Path *, Path *p, Path *pp, Kvp *mid)
+{
+	int i, copied, halfsz;
+	Blk *b, *d, *l, *r;
+	Kvp tk;
+	Msg m;
+
+	/*
+	 * If the bp->lock one entry up the
+	 * p is nil, we're at the root,
+	 * so we want to make a new bp->lock.
+	 */
+	b = p->b;
+	l = nil;
+	r = nil;
+	if(waserror()){
+		efreeblk(t, l);
+		efreeblk(t, r);
+		nexterror();
+	}
+	l = newblk(t, b->type, 0);
+	r = newblk(t, b->type, 0);
+	d = l;
+	copied = 0;
+	halfsz = (2*b->nval + b->valsz)/2;
+	assert(b->nval >= 4);
+	for(i = 0; i < b->nval; i++){
+		/*
+		 * We're trying to balance size,
+		 * but we need at least 2 nodes
+		 * in each half of the split if
+		 * we want a valid tree.
+		 */
+		if(d == l)
+		if((i == b->nval-2) || (i >= 2 && copied >= halfsz)){
+			d = r;
+			getval(b, i, mid);
+		}
+		if(i == p->idx){
+			copyup(d, pp, &copied);
+			continue;
+		}
+		getval(b, i, &tk);
+		setval(d, &tk);
+		copied += valsz(&tk);
+	}
+	d = l;
+	for(i = 0; i < b->nbuf; i++){
+		if(i == p->lo)
+			i += pp->npull;
+		if(i == b->nbuf)
+			break;
+		getmsg(b, i, &m);
+		if(d == l && keycmp(&m, mid) >= 0)
+			d = r;
+		setmsg(d, &m);
+	}
+	p->op = POsplit;
+	p->nl = l;
+	p->nr = r;
+	poperror();
+}
+
+static void
+merge(Tree *t, Path *p, Path *pp, int idx, Blk *a, Blk *b)
+{
+	Blk *d;
+	Msg m;
+	int i;
+
+	d = newblk(t, a->type, 0);
+	for(i = 0; i < a->nval; i++){
+		getval(a, i, &m);
+		setval(d, &m);
+	}
+	for(i = 0; i < b->nval; i++){
+		getval(b, i, &m);
+		setval(d, &m);
+	}
+	if(a->type == Tpivot){
+		for(i = 0; i < a->nbuf; i++){
+			getmsg(a, i, &m);
+			setmsg(d, &m);
+		}
+		for(i = 0; i < b->nbuf; i++){
+			getmsg(b, i, &m);
+			setmsg(d, &m);
+		}
+	}
+	enqueue(d);
+	p->midx = idx;
+	pp->nl = d;
+	pp->op = POmerge;
+	pp->nr = nil;
+}
+
+/*
+ * Scan a single block for the split offset;
+ * returns 1 if we'd spill out of the buffer,
+ * updates *idx and returns 0 otherwise.
+ */
+static int
+spillscan(Blk *d, Blk *b, Msg *m, int *idx, int o)
+{
+	int i, used;
+	Msg n;
+
+	used = 2*d->nbuf + d->bufsz;
+	for(i = *idx; i < b->nbuf; i++){
+		getmsg(b, i, &n);
+		if(keycmp(m, &n) <= 0){
+			*idx = i + o;
+			return 0;
+		}
+		used += msgsz(&n);
+		if(used > Bufspc)
+			return 1;
+	}
+	*idx = b->nbuf;
+	return 0;
+}
+
+/*
+ * Returns whether the keys in b between
+ * idx and m would spill out of the buffer
+ * of d.
+ */
+static int
+spillsbuf(Blk *d, Blk *l, Blk *r, Msg *m, int *idx)
+{
+	if(l->type == Tleaf)
+		return 0;
+
+	if(*idx < l->nbuf && spillscan(d, l, m, idx, 0))
+		return 1;
+	if(*idx >= l->nbuf && spillscan(d, r, m, idx, l->nbuf))
+		return 1;
+	return 0;
+}
+
+static void
+rotate(Tree *t, Path *p, Path *pp, int midx, Blk *a, Blk *b, int halfpiv)
+{
+	int i, o, cp, sp, idx;
+	Blk *d, *l, *r;
+	Msg m;
+
+	l = nil;
+	r = nil;
+	if(waserror()){
+		efreeblk(t, l);
+		efreeblk(t, r);
+		nexterror();
+	}
+	l = newblk(t, a->type, 0);
+	r = newblk(t, a->type, 0);
+	d = l;
+	cp = 0;
+	sp = -1;
+	idx = 0;
+	for(i = 0; i < a->nval; i++){
+		getval(a, i, &m);
+		if(d == l && (cp >= halfpiv || spillsbuf(d, a, b, &m, &idx))){
+			sp = idx;
+			d = r;
+		}
+		setval(d, &m);
+		cp += valsz(&m);
+	}
+	for(i = 0; i < b->nval; i++){
+		getval(b, i, &m);
+		if(d == l && (cp >= halfpiv || spillsbuf(d, a, b, &m, &idx))){
+			sp = idx;
+			d = r;
+		}
+		setval(d, &m);
+		cp += valsz(&m);
+	}
+	if(a->type == Tpivot){
+		d = l;
+		o = 0;
+		for(i = 0; i < a->nbuf; i++){
+			if(o == sp){
+				d = r;
+				o = 0;
+			}
+			getmsg(a, i, &m);
+			setmsg(d, &m);
+			o++;
+		}
+		for(i = 0; i < b->nbuf; i++){
+			if(o == sp){
+				d = r;
+				o = 0;
+			}
+			getmsg(b, i, &m);
+			setmsg(d, &m);
+			o++;
+		}
+	}
+	enqueue(l);
+	enqueue(r);
+	p->midx = midx;
+	pp->op = POrot;
+	pp->nl = l;
+	pp->nr = r;
+	poperror();
+}
+
+static void
+rotmerge(Tree *t, Path *p, Path *pp, int idx, Blk *a, Blk *b)
+{
+	int na, nb, ma, mb, imbalance;
+
+	assert(a->type == b->type);
+
+	na = 2*a->nval + a->valsz;
+	nb = 2*b->nval + b->valsz;
+	if(a->type == Tleaf){
+		ma = 0;
+		mb = 0;
+	}else{
+		ma = 2*a->nbuf + a->bufsz;
+		mb = 2*b->nbuf + b->bufsz;
+	}
+	imbalance = na - nb;
+	if(imbalance < 0)
+		imbalance *= -1;
+	/* works for leaf, because 0 always < Bufspc */
+	if(na + nb < (Pivspc - 4*Msgmax) && ma + mb < Bufspc)
+		merge(t, p, pp, idx, a, b);
+	else if(imbalance > 4*Msgmax)
+		rotate(t, p, pp, idx, a, b, (na + nb)/2);
+}
+
+static void
+trybalance(Tree *t, Path *p, Path *pp, int idx)
+{
+	Blk *l, *m, *r;
+	Kvp kl, kr;
+	int spc, fill;
+	Bptr bp;
+
+	if(p->idx == -1 || pp == nil || pp->nl == nil)
+		return;
+	if(pp->op != POmod || pp->op != POmerge)
+		return;
+
+	l = nil;
+	r = nil;
+	m = holdblk(pp->nl);
+	if(waserror()){
+		dropblk(m);
+		dropblk(l);
+		dropblk(r);
+		nexterror();
+	}
+	spc = (m->type == Tleaf) ? Leafspc : Pivspc;
+	if(idx-1 >= 0){
+		getval(p->b, idx-1, &kl);
+		bp = getptr(&kl, &fill);
+		if(fill + blkfill(m) < spc){
+			l = getblk(bp, 0);
+			rotmerge(t, p, pp, idx-1, l, m);
+			goto Done;
+		}
+	}
+	if(idx+1 < p->b->nval){
+		getval(p->b, idx+1, &kr);
+		bp = getptr(&kr, &fill);
+		if(fill + blkfill(m) < spc){
+			r = getblk(bp, 0);
+			rotmerge(t, p, pp, idx, m, r);
+			goto Done;
+		}
+	}
+Done:
+	dropblk(m);
+	dropblk(l);
+	dropblk(r);
+	poperror();
+}
+
+/*
+ * Write out a modified path bottom-up: update or split the
+ * leaf, then fold the changes into each pivot above it,
+ * finally growing a new root if the old root split.
+ * Returns the path element that holds the new root block.
+ */
+static Path*
+flush(Tree *t, Path *path, int npath)
+{
+
+	Path *up, *p, *pp, *rp;
+	Kvp mid;
+
+	/*
+	 * The path must contain at minimum two elements:
+	 * we must have 1 node we're inserting into, and
+	 * an empty element at the top of the path that
+	 * we put the new root into if the root gets split.
+	 */
+	assert(npath >= 2);
+	rp = nil;
+	pp = nil;
+	p = &path[npath - 1];
+	up = &path[npath - 2];
+	/* leaf level: apply the insert, splitting if it won't fit */
+	if(p->b->type == Tleaf){
+		if(!filledleaf(p->b, up->sz)){
+			updateleaf(t, p-1, p);
+			enqueue(p->nl);
+			rp = p;
+		}else{
+			splitleaf(t, up, p, &mid);
+			enqueue(p->nl);
+			enqueue(p->nr);
+		}
+		p->midx = -1;
+		pp = p;
+		up--;
+		p--;
+	}
+	/* pivot levels: absorb the changes from the level below */
+	while(p != path){
+		if(!filledpiv(p->b, 1)){
+			trybalance(t, p, pp, p->idx);
+			/* If we merged the root node, break out. */
+			if(up == path && pp != nil && pp->op == POmerge && p->b->nval == 2){
+				rp = pp;
+				goto Out;
+			}
+			updatepiv(t, up, p, pp);
+			enqueue(p->nl);
+			rp = p;
+		}else{
+			splitpiv(t, up, p, pp, &mid);
+			enqueue(p->nl);
+			enqueue(p->nr);
+		}
+		pp = p;
+		up--;
+		p--;
+	}
+	/* the old root split: grow the tree with a fresh root */
+	if(pp->nl != nil && pp->nr != nil){
+		rp = &path[0];
+		rp->nl = newblk(t, Tpivot, 0);
+		rp->npull = pp->npull;
+		rp->pullsz = pp->pullsz;
+		copyup(rp->nl, pp, nil);
+		enqueue(rp->nl);
+	}
+Out:
+	return rp;
+}
+
+/*
+ * Release a flush path: on-disk blocks that were replaced
+ * along the way (p->b, and the merged-away sibling p->m)
+ * are marked free, then all in-memory references dropped.
+ */
+static void
+freepath(Tree *t, Path *path, int npath)
+{
+	Path *p;
+
+	for(p = path; p != path + npath; p++){
+		if(p->b != nil)
+			freeblk(t, p->b, p->b->bp);
+		/*
+		 * fix: was freeblk(t, p->b, p->m->bp), pairing p->b
+		 * with p->m's pointer; every other call site pairs a
+		 * block with its own bp.
+		 */
+		if(p->m != nil)
+			freeblk(t, p->m, p->m->bp);
+		dropblk(p->b);
+		dropblk(p->nl);
+		dropblk(p->nr);
+	}
+	free(path);
+}
+
+/*
+ * Select the child node with the largest contiguous message
+ * segment in the current node's buffer: flushing into that
+ * child frees the most buffer space.  Fills in *p with the
+ * chosen child index and the message range [lo, hi) to move.
+ */
+static void
+victim(Blk *b, Path *p)
+{
+	int i, j, lo, maxsz, cursz;
+	Kvp kv;
+	Msg m;
+
+	j = 0;
+	maxsz = 0;
+	p->b = b;
+	/* 
+	 * Start at the second pivot: all values <= this
+	 * go to the first node. Stop *after* the last entry,
+	 * because entries >= the last entry all go into it.
+	 */
+	for(i = 1; i <= b->nval; i++){
+		if(i < b->nval)
+			getval(b, i, &kv);
+		cursz = 0;
+		lo = j;
+		/* sum the sizes of the messages destined for child i-1 */
+		for(; j < b->nbuf; j++){
+			getmsg(b, j, &m);
+			if(i < b->nval && keycmp(&m, &kv) >= 0)
+				break;
+			/* 2 bytes for offset, plus message size in buffer */
+			cursz += msgsz(&m);
+		}
+		/* keep the child with the biggest segment seen so far */
+		if(cursz > maxsz){
+			maxsz = cursz;
+			p->op = POmod;
+			p->lo = lo;
+			p->hi = j;
+			p->sz = maxsz;
+			p->idx = i - 1;
+			p->midx = i - 1;
+			p->npull = 0;
+			p->pullsz = 0;
+		}
+	}
+}
+
+/*
+ * Fast-path insert: when the root pivot's buffer has room for
+ * the whole batch, copy-on-write the root block, append the
+ * pre-sorted messages to its buffer, and fix up the buffer's
+ * offset table so it stays key-sorted.  Messages with keys
+ * equal to existing entries are placed after them, preserving
+ * application order.
+ */
+static void
+fastupsert(Tree *t, Blk *b, Msg *msg, int nmsg)
+{
+	int i, c, o, ri, lo, hi, mid, nbuf;
+	Msg cmp;
+	char *p;
+	Blk *r;
+
+	if((r = dupblk(t, b)) == nil)
+		error(Enomem);
+
+	nbuf = r->nbuf;
+	/*
+	 * NOTE(review): the fixup below relies on setmsg placing
+	 * each message at the end of the buffer, in slots
+	 * nbuf..nbuf+nmsg-1 — confirm against setmsg.
+	 */
+	for(i = 0; i < nmsg; i++)
+		setmsg(r, &msg[i]);
+
+	/* insertion-sort the appended offsets into place */
+	for(i = 0; i < nmsg; i++){
+		ri = -1;
+		lo = 0;
+		hi = nbuf+i-1;
+		/* binary search; keycmp is assumed to return -1/0/1 */
+		while(lo <= hi){
+			mid = (hi + lo) / 2;
+			getmsg(r, mid, &cmp);
+			c = keycmp(&msg[i], &cmp);
+			switch(c){
+			case -1:
+				hi = mid-1;
+				break;
+			case 0:
+				/* equal keys: insert after the match */
+				ri = mid+1;
+				lo = mid+1;
+				break;
+			case 1:
+				lo = mid+1;
+				break;
+			}
+		}
+		if(ri == -1)
+			ri = hi+1;
+		/* move the appended entry's offset down to slot ri */
+		p = r->data + Pivspc + 2*(nbuf+i);
+		o = UNPACK16(p);
+		p = r->data + Pivspc + 2*ri;
+		memmove(p+2, p, 2*(nbuf+i-ri));
+		PACK16(p, o);
+	}
+	enqueue(r);
+
+	/* install the new root and retire the old block */
+	lock(&t->lk);
+	t->bp = r->bp;
+	t->dirty = 1;
+	unlock(&t->lk);
+
+	freeblk(t, b, b->bp);
+	dropblk(b);
+	dropblk(r);
+}
+	
+
+/*
+ * Insert a batch of messages into tree t.  The batch is
+ * stably sorted, then either appended to the root's buffer
+ * (fast path) or pushed down a path of full buffers chosen
+ * by victim().  One pass may not pull every message down,
+ * so the walk repeats until all nmsg messages are flushed.
+ */
+void
+btupsert(Tree *t, Msg *msg, int nmsg)
+{
+	int i, npath, npull, dh, sz, height;
+	Path *path, *rp;
+	Blk *b, *rb;
+	Kvp sep;
+	Bptr bp;
+
+	sz = 0;
+	stablesort(msg, nmsg);
+	for(i = 0; i < nmsg; i++)
+		sz += msgsz(&msg[i]);
+	npull = 0;
+	path = nil;
+	npath = 0;
+
+Again:
+	if(waserror()){
+		freepath(t, path, npath);
+		nexterror();
+	}
+
+	b = getroot(t, &height);
+	if(npull == 0 && b->type == Tpivot && !filledbuf(b, nmsg, sz)){
+		fastupsert(t, b, msg, nmsg);
+		poperror();
+		return;
+	}
+	/*
+	 * The tree can grow in height by 1 when we
+	 * split, so we allocate room for one extra
+	 * node in the path.
+	 */
+	npath = 0;
+	if((path = calloc((height + 2), sizeof(Path))) == nil)
+		error(Enomem);
+	path[npath].b = nil;
+	path[npath].idx = -1;
+	path[npath].midx = -1;
+	npath++;
+
+	path[0].sz = sz;
+	path[0].ins = msg;
+	path[0].lo = npull;
+	path[0].hi = nmsg;
+	/* descend while the buffers are too full to absorb the batch */
+	while(b->type == Tpivot){
+		if(!filledbuf(b, nmsg, path[npath - 1].sz))
+			break;
+		victim(b, &path[npath]);
+		getval(b, path[npath].idx, &sep);
+		bp = unpackbp(sep.v, sep.nv);
+		b = getblk(bp, 0);
+		npath++;
+	}
+	path[npath].b = b;
+	path[npath].idx = -1;
+	path[npath].midx = -1;
+	path[npath].lo = -1;
+	path[npath].hi = -1;
+	path[npath].npull = 0;
+	path[npath].pullsz = 0;
+	npath++;
+
+	rp = flush(t, path, npath);
+	rb = rp->nl;
+
+	/* work out how the tree height changed: grew, same, or shrank */
+	if(path[0].nl != nil)
+		dh = 1;
+	else if(path[1].nl != nil)
+		dh = 0;
+	else if(npath > 2 && path[2].nl != nil)
+		dh = -1;
+	else
+		fatal("broken path change");
+
+	assert(rb->bp.addr != 0);	/* fix: assert was duplicated */
+
+	lock(&t->lk);
+	traceb("setroot", rb->bp);
+	t->ht += dh;
+	t->bp = rb->bp;
+	t->dirty = 1;
+	unlock(&t->lk);
+
+	npull += rp->npull;
+	freepath(t, path, npath);
+	poperror();
+
+	if(npull != nmsg){
+		tracem("short pull");
+		goto Again;
+	}
+}
+
+/*
+ * Snapshot the tree's root pointer (and, optionally, its
+ * height) under the tree lock, then load the root block.
+ */
+Blk*
+getroot(Tree *t, int *h)
+{
+	Bptr rootbp;
+	int ht;
+
+	lock(&t->lk);
+	rootbp = t->bp;
+	ht = t->ht;
+	unlock(&t->lk);
+	if(h != nil)
+		*h = ht;
+	return getblk(rootbp, 0);
+}
+
+/*
+ * Look up key k in tree t, filling r (backed by buf/nbuf).
+ * Descends the pivots to the leaf, takes the base value from
+ * the leaf, then applies any buffered messages for the key
+ * from the bottom-most pivot up to the root, so older
+ * messages are applied before newer ones.  Returns nonzero
+ * iff the key ends up with a live value.
+ */
+int
+btlookup(Tree *t, Key *k, Kvp *r, char *buf, int nbuf)
+{
+	int i, j, h, ok, same;
+	Blk *b, **p;
+	Bptr bp;
+	Msg m;
+
+	b = getroot(t, &h);
+	if((p = calloc(h, sizeof(Blk*))) == nil){
+		dropblk(b);
+		error(Enomem);
+	}
+	ok = 0;
+	p[0] = holdblk(b);
+	/* walk the pivots, recording every block on the path */
+	for(i = 1; i < h; i++){
+		if(blksearch(p[i-1], k, r, &same) == -1)
+			break;
+		bp = unpackbp(r->v, r->nv);
+		p[i] = getblk(bp, 0);
+	}
+	if(p[h-1] != nil)
+		blksearch(p[h-1], k, r, &ok);
+	if(ok)
+		cpkvp(r, r, buf, nbuf);
+	/* apply buffered messages, deepest (oldest) level first */
+	for(i = h-2; i >= 0; i--){
+		if(p[i] == nil)
+			continue;
+		j = bufsearch(p[i], k, &m, &same);
+		if(j < 0 || !same)
+			continue;
+		/* an amendment with no base value means the tree is corrupt */
+		if(!(ok || m.op == Oinsert || m.op == Oclearb))
+			fatal("lookup %K << %M missing insert\n", k, &m);
+		ok = apply(r, &m, buf, nbuf);
+		/* consume any further messages for the same key */
+		for(j++; j < p[i]->nbuf; j++){
+			getmsg(p[i], j, &m);
+			if(keycmp(k, &m) != 0)
+				break;
+			ok = apply(r, &m, buf, nbuf);
+		}
+	}
+	for(i = 0; i < h; i++)
+		if(p[i] != nil)
+			dropblk(p[i]);
+	dropblk(b);
+	free(p);
+	return ok;
+}
+
+/*
+ * Initialize a scan over all keys beginning with the npfx-byte
+ * prefix pfx.  The prefix is copied into the scan's private
+ * buffers, so the caller's storage need not outlive the scan.
+ */
+void
+btnewscan(Scan *s, char *pfx, int npfx)
+{
+	memset(s, 0, sizeof(*s));
+	memmove(s->pfxbuf, pfx, npfx);
+	s->pfx.k = s->pfxbuf;
+	s->pfx.nk = npfx;
+	s->first = 1;
+	s->donescan = 0;
+	s->offset = 0;
+	s->kv.nv = 0;
+	s->kv.v = s->kvbuf+npfx;
+	cpkey(&s->kv, &s->pfx, s->kvbuf, sizeof(s->kvbuf));
+}
+
+/*
+ * Begin (or resume) a scan: walk from the root down to the
+ * leaf containing the scan's current key, recording at each
+ * level the value index (vi) and, for pivots, the buffered
+ * message index (bi) where iteration resumes.  On resume
+ * (s->first == 0) the indexes are advanced past the entry
+ * that was already returned.
+ */
+void
+btenter(Tree *t, Scan *s)
+{
+	int i, same;
+	Scanp *p;
+	Msg m, c;
+	Bptr bp;
+	Blk *b;
+	Kvp v;
+
+	if(s->donescan)
+		return;
+	b = getroot(t, &s->ht);
+	if((s->path = calloc(s->ht, sizeof(Scanp))) == nil){
+		dropblk(b);
+		error(Enomem);
+	}
+	p = s->path;
+	p[0].b = b;
+	for(i = 0; i < s->ht; i++){
+		p[i].vi = blksearch(b, &s->kv, &v, &same);
+		if(b->type == Tpivot){
+			/* key sorts before the first pivot: take child 0 */
+			if(p[i].vi == -1)
+				getval(b, ++p[i].vi, &v);
+			p[i].bi = bufsearch(b, &s->kv, &m, &same);
+			if(p[i].bi == -1){
+				p[i].bi++;
+			}else if(!same || !s->first){
+				/* scan past repeated messages */
+				while(p[i].bi < p[i].b->nbuf){
+					getmsg(p[i].b, p[i].bi, &c);
+					if(keycmp(&m, &c) != 0)
+						break;
+					p[i].bi++;
+				}
+			}
+			bp = unpackbp(v.v, v.nv);
+			b = getblk(bp, 0);
+			p[i+1].b = b;
+		}else if(p[i].vi == -1 || !same || !s->first)
+			p[i].vi++;
+	}
+	s->first = 0;
+}
+
+/*
+ * Produce the next key/value of the scan in r.  The candidate
+ * is the smallest key among the current leaf entry and the
+ * pending buffered messages along the path; all messages
+ * amending that key are then applied.  Returns 1 with the
+ * result in r, or 0 when the prefix range is exhausted.
+ */
+int
+btnext(Scan *s, Kvp *r)
+{
+	int i, j, h, ok, start, bufsrc;
+	Scanp *p;
+	Msg m, n;
+	Bptr bp;
+	Kvp kv;
+
+Again:
+	p = s->path;
+	h = s->ht;
+	start = h;
+	bufsrc = -1;
+	if(s->donescan)
+		return 0;
+	if(waserror()){
+		btexit(s);
+		nexterror();
+	}
+	/* load up the correct blocks for the scan */
+	for(i = h-1; i >= 0; i--){
+		if(p[i].b != nil
+		&&(p[i].vi < p[i].b->nval || p[i].bi < p[i].b->nbuf))
+			break;
+		/* the root is exhausted: the scan is complete */
+		if(i == 0){
+			s->donescan = 1;
+			poperror();
+			return 0;
+		}
+		/* this level is spent; pop it and step the parent */
+		if(p[i].b != nil)
+			dropblk(p[i].b);
+		p[i].b = nil;
+		p[i].vi = 0;
+		p[i].bi = 0;
+		p[i-1].vi++;
+		start = i;
+	}
+
+	if(p[start-1].vi < p[start-1].b->nval){
+		/* refill the path below the first live level */
+		for(i = start; i < h; i++){
+			getval(p[i-1].b, p[i-1].vi, &kv);
+			bp = unpackbp(kv.v, kv.nv);
+			p[i].b = getblk(bp, 0);
+		}
+	
+		/* find the minimum key along the path up */
+		m.op = Oinsert;
+		getval(p[h-1].b, p[h-1].vi, &m);
+	}else{
+		/* only buffered messages remain at this level */
+		getmsg(p[start-1].b, p[start-1].bi, &m);
+		assert(m.op == Oinsert);
+		bufsrc = start-1;
+	}
+
+	/* a buffered message with a smaller key wins over the leaf entry */
+	for(i = h-2; i >= 0; i--){
+		if(p[i].b == nil || p[i].bi == p[i].b->nbuf)
+			continue;
+		getmsg(p[i].b, p[i].bi, &n);
+		if(keycmp(&n, &m) < 0){
+			bufsrc = i;
+			m = n;
+		}
+	}
+	/* stop once the key falls outside the scan prefix */
+	if(m.nk < s->pfx.nk || memcmp(m.k, s->pfx.k, s->pfx.nk) != 0){
+		s->donescan = 1;
+		poperror();
+		return 0;
+	}
+
+	/* scan all messages applying to the message */
+	ok = 1;
+	cpkvp(r, &m, s->kvbuf, sizeof(s->kvbuf));
+	if(bufsrc == -1)
+		p[h-1].vi++;
+	else
+		p[bufsrc].bi++;
+	for(i = h-2; i >= 0; i--){
+		for(j = p[i].bi; p[i].b != nil && j < p[i].b->nbuf; j++){
+			getmsg(p[i].b, j, &m);
+			if(keycmp(r, &m) != 0)
+				break;
+			ok = apply(r, &m, s->kvbuf, sizeof(s->kvbuf));
+			p[i].bi++;
+		}
+	}
+	poperror();
+	/* the entry was deleted by a message: move on to the next one */
+	if(!ok)
+		goto Again;
+	return 1;
+}
+
+/*
+ * Tear down a scan: drop every block still held along the
+ * path, then free the path array itself.
+ */
+void
+btexit(Scan *s)
+{
+	Scanp *sp, *se;
+
+	se = s->path + s->ht;
+	for(sp = s->path; sp != se; sp++)
+		dropblk(sp->b);
+	free(s->path);
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/user.c
@@ -1,0 +1,260 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Read the first len bytes of the file at the given path out
+ * of tree t, returning a freshly malloced, NUL-terminated
+ * buffer.  Raises an error on allocation or lookup failure;
+ * the caller owns (and frees) the returned buffer.
+ */
+static char*
+slurp(Tree *t, vlong path, vlong len)
+{
+	char *ret, buf[Offksz], kvbuf[Offksz + Ptrsz];
+	vlong o;
+	Blk *b;
+	Bptr bp;
+	Key k;
+	Kvp kv;
+
+	if((ret = malloc(len + 1)) == nil)
+		error(Enomem);
+	/* fix: don't leak ret if a lookup below raises */
+	if(waserror()){
+		free(ret);
+		nexterror();
+	}
+	k.k = buf;
+	k.nk = Offksz;
+	for(o = 0; o < len; o += Blksz){
+		k.k[0] = Kdat;
+		PACK64(k.k+1, path);
+		PACK64(k.k+9, o);
+		if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf)))
+			error(Esrch);
+		bp = unpackbp(kv.v, kv.nv);
+		b = getblk(bp, GBraw);
+		if(len - o >= Blksz)
+			memcpy(ret + o, b->buf, Blksz);
+		else
+			memcpy(ret + o, b->buf, len - o);
+		/* fix: getblk returns a held block; release the reference */
+		dropblk(b);
+	}
+	poperror();
+	ret[len] = 0;
+	return ret;
+}
+
+/*
+ * Copy the next newline-terminated line from *p into buf
+ * (truncated to nbuf-1 bytes, always NUL-terminated), advance
+ * *p past the newline, and return buf.  Returns nil when no
+ * complete line remains.
+ */
+static char*
+readline(char **p, char *buf, int nbuf)
+{
+	int len;
+	char *nl;
+
+	nl = strchr(*p, '\n');
+	if(nl == nil)
+		return nil;
+	len = nl - *p + 1;
+	if(len >= nbuf)
+		len = nbuf - 1;
+	strecpy(buf, buf + len, *p);
+	*p = nl + 1;
+	return buf;
+}
+
+/*
+ * Split the next delim-separated field off *p in place: NUL
+ * out the delimiter, advance *p past it (or to nil at end of
+ * string), and return the start of the field.  Returns nil
+ * once the input is exhausted.
+ */
+static char*
+getfield(char **p, char delim)
+{
+	char *field, *sep;
+
+	field = *p;
+	if(field == nil)
+		return nil;
+	sep = strchr(field, delim);
+	if(sep != nil){
+		*sep++ = '\0';
+		*p = sep;
+	}else
+		*p = nil;
+	return field;
+}
+
+/*
+ * Find a user table entry by name; nil if no such user.
+ */
+User*
+name2user(char *name)
+{
+	User *u, *e;
+
+	e = fs->users + fs->nusers;
+	for(u = fs->users; u != e; u++)
+		if(strcmp(u->name, name) == 0)
+			return u;
+	return nil;
+}
+
+/*
+ * Find a user table entry by numeric id; nil if no such user.
+ */
+User*
+uid2user(int id)
+{
+	User *u, *e;
+
+	e = fs->users + fs->nusers;
+	for(u = fs->users; u != e; u++)
+		if(u->id == id)
+			return u;
+	return nil;
+}
+
+/*
+ * Parse a users file of the form
+ *	id:name:leader:member,member,...
+ * into a freshly allocated table, and install it in fs on
+ * success.  Diagnostics go to fd.  Returns nil on success,
+ * or a static error string on failure.
+ *
+ * Two passes are made over the data: the first collects ids
+ * and names, the second resolves leader and member names
+ * against the complete table so forward references work.
+ */
+static char*
+parseusers(int fd, char *udata)
+{
+	char *pu, *p, *f, *m, *err, buf[8192];
+	int i, j, lnum, ngrp, nusers, usersz;
+	User *u, *n, *users;
+	int *g, *grp;
+
+	i = 0;
+	err = nil;
+	nusers = 0;
+	usersz = 8;
+	if((users = calloc(usersz, sizeof(User))) == nil)
+		return Enomem;
+	pu = udata;
+	lnum = 0;
+	/* pass 1: collect ids and names */
+	while((p = readline(&pu, buf, sizeof(buf))) != nil){
+		lnum++;
+		if(p[0] == '#' || p[0] == 0)
+			continue;
+		if(i == usersz){
+			usersz *= 2;
+			n = realloc(users, usersz*sizeof(User));
+			if(n == nil){
+				free(users);
+				return Enomem;
+			}
+			users = n;
+		}
+		if((f = getfield(&p, ':')) == nil){
+			fprint(fd, "/adm/users:%d: missing ':' after id\n", lnum);
+			err = Esyntax;
+			goto Error;
+		}
+		u = &users[i];
+		u->id = atol(f);
+		if((f = getfield(&p, ':')) == nil){
+			fprint(fd, "/adm/users:%d: missing ':' after name\n", lnum);
+			err = Esyntax;
+			goto Error;
+		}
+		snprint(u->name, sizeof(u->name), "%s", f);
+		/*
+		 * fix: entries past the initial calloc come from realloc
+		 * and are not zeroed; clear lead explicitly so users with
+		 * an empty leader field don't inherit garbage.
+		 */
+		u->lead = 0;
+		u->memb = nil;
+		u->nmemb = 0;
+		i++;
+	}
+	nusers = i;
+
+	/* pass 2: resolve leaders and group members by name */
+	i = 0;
+	pu = udata;
+	lnum = 0;
+	while((p = readline(&pu, buf, sizeof(buf))) != nil){
+		lnum++;
+		if(buf[0] == '#' || buf[0] == 0)
+			continue;
+		getfield(&p, ':');	/* skip id */
+		getfield(&p, ':');	/* skip name */
+		if((f = getfield(&p, ':')) == nil){
+			fprint(fd, "/adm/users:%d: missing ':' after name\n", lnum);
+			err = Esyntax;
+			goto Error;
+		}
+		if(f[0] != '\0'){
+			u = nil;
+			for(j = 0; j < nusers; j++)
+				if(strcmp(users[j].name, f) == 0)
+					u = &users[j];
+			if(u == nil){
+				fprint(fd, "/adm/users:%d: leader %s does not exist\n", lnum, f);
+				err = Enouser;
+				goto Error;
+			}
+			users[i].lead = u->id;
+		}
+		if((f = getfield(&p, ':')) == nil){
+			err = Esyntax;
+			goto Error;
+		}
+		grp = nil;
+		ngrp = 0;
+		while((m = getfield(&f, ',')) != nil){
+			if(m[0] == '\0')
+				continue;
+			u = nil;
+			for(j = 0; j < nusers; j++)
+				if(strcmp(users[j].name, m) == 0)
+					u = &users[j];
+			if(u == nil){
+				fprint(fd, "/adm/users:%d: user %s does not exist\n", lnum, m);
+				free(grp);
+				err = Enouser;
+				goto Error;
+			}
+			if((g = realloc(grp, (ngrp+1)*sizeof(int))) == nil){
+				free(grp);
+				err = Enomem;
+				goto Error;
+			}
+			grp = g;
+			grp[ngrp++] = u->id;
+		}
+		users[i].memb = grp;
+		users[i].nmemb = ngrp;
+		i++;
+	}
+
+	/* swap in the new table; the old one becomes ours to free */
+	wlock(&fs->userlk);
+	n = fs->users;
+	i = fs->nusers;
+	fs->users = users;
+	fs->nusers = nusers;
+	wunlock(&fs->userlk);
+	users = n;
+	nusers = i;
+
+Error:
+	/* on success this frees the old table, on failure the new one */
+	if(users != nil)
+		for(i = 0; i < nusers; i++)
+			free(users[i].memb);
+	free(users);
+	return err;
+}
+
+/*
+ * Load /adm/users from tree t and install the parsed table,
+ * logging diagnostics to fd.  If parsing fails and there is
+ * no previous table, fall back to a minimal default (when
+ * running permissive) or raise an error.  Caches the ids of
+ * the well-known users none, adm, and nogroup.
+ */
+void
+loadusers(int fd, Tree *t)
+{
+	char *s, *e;
+	vlong len;
+	Qid q;
+	User *u;
+
+	if(walk1(t, -1, "", &q, &len) == -1)
+		error(Efs);
+	if(walk1(t, q.path, "users", &q, &len) == -1)
+		error(Esrch);
+	if(q.type & QTDIR)
+		error(Etype);
+	if(len >= 1*MiB)
+		error(Efsize);
+	s = slurp(t, q.path, len);
+	e = parseusers(fd, s);
+	if(e != nil){
+		if(fs->users != nil){
+			fprint(2, "load users: %s\n", e);
+			fprint(2, "keeping old table\n");
+			free(s);	/* fix: don't leak the file data when erroring out */
+			error(e);
+		}
+		if(!permissive){
+			fprint(2, "user table broken: %s\n", e);
+			fprint(2, "\tnot permissive: bailing\n");
+			free(s);	/* fix: don't leak the file data when erroring out */
+			error(e);
+		}
+		fprint(2, "user table broken: %s\n", e);
+		fprint(2, "\tfalling back to default\n");
+		parseusers(fd, "-1:adm::\n0:none::\n");
+	}
+	if((u = name2user("none")) != nil)
+		noneid = u->id;
+	if((u = name2user("adm")) != nil)
+		admid = u->id;
+	if((u = name2user("nogroup")) != nil)
+		nogroupid = u->id;
+	free(s);
+}