ref: 45f7b30244297a8bf87789793eec875246ed063c
dir: /sys/src/cmd/venti/srv/dcache.c/
/* * Disk cache. * * Caches raw disk blocks. Getdblock() gets a block, putdblock puts it back. * Getdblock has a mode parameter that determines i/o and access to a block: * if mode is OREAD or ORDWR, it is read from disk if not already in memory. * If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned. * It is *not* marked dirty -- once changes have been made, they should be noted * by using dirtydblock() before putdblock(). * * There is a global cache lock as well as a lock on each block. * Within a thread, the cache lock can be acquired while holding a block lock, * but not vice versa; and a block cannot be locked if you already hold the lock * on another block. * * The flush proc writes out dirty blocks in batches, one batch per dirty tag. * For example, the DirtyArena blocks are all written to disk before any of the * DirtyArenaCib blocks. * * This code used to be in charge of flushing the dirty index blocks out to * disk, but updating the index turned out to benefit from extra care. * Now cached index blocks are never marked dirty. The index.c code takes * care of updating them behind our back, and uses _getdblock to update any * cached copies of the blocks as it changes them on disk. */ #include "stdinc.h" #include "dat.h" #include "fns.h" typedef struct DCache DCache; enum { HashLog = 9, HashSize = 1<<HashLog, HashMask = HashSize - 1, }; struct DCache { QLock lock; RWLock dirtylock; /* must be held to inspect or set b->dirty */ Rendez full; Round round; DBlock *free; /* list of available lumps */ u32int now; /* ticks for usage timestamps */ int size; /* max. size of any block; allocated to each block */ DBlock **heads; /* hash table for finding address */ int nheap; /* number of available victims */ DBlock **heap; /* heap for locating victims */ int nblocks; /* number of blocks allocated */ DBlock *blocks; /* array of block descriptors */ DBlock **write; /* array of block pointers to be written */ u8int *mem; /* memory for all block descriptors */ int ndirty; /* number of dirty blocks */ int maxdirty; /* max. number of dirty blocks */ }; typedef struct Ra Ra; struct Ra { Part *part; u64int addr; }; static DCache dcache; static int downheap(int i, DBlock *b); static int upheap(int i, DBlock *b); static DBlock *bumpdblock(void); static void delheap(DBlock *db); static void fixheap(int i, DBlock *b); static void flushproc(void*); static void writeproc(void*); void initdcache(u32int mem) { DBlock *b, *last; u32int nblocks, blocksize; int i; u8int *p; if(maxblocksize == 0) sysfatal("no max. block size given for disk cache"); if(mem < maxblocksize * 2) sysfatal("need at least 2 max-size blocks (%d bytes) for the disk cache", maxblocksize * 2); blocksize = maxblocksize; nblocks = mem / blocksize; dcache.full.l = &dcache.lock; dcache.nblocks = nblocks; dcache.maxdirty = (nblocks * 2) / 3; trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n", nblocks, blocksize, dcache.maxdirty); dcache.size = blocksize; dcache.heads = vtbrk(sizeof(DBlock*) * HashSize); dcache.heap = vtbrk(sizeof(DBlock*) * nblocks); dcache.blocks = vtbrk(sizeof(DBlock) * nblocks); dcache.write = vtbrk(sizeof(DBlock*) * nblocks); dcache.mem = vtbrk((nblocks+1+128) * blocksize); last = nil; p = (u8int*)(((uintptr)dcache.mem+blocksize-1)&~(uintptr)(blocksize-1)); for(i = 0; i < nblocks; i++){ b = &dcache.blocks[i]; b->data = &p[i * blocksize]; b->heap = TWID32; b->writedonechan = chancreate(sizeof(void*), 1); b->next = last; last = b; } dcache.free = last; dcache.nheap = 0; setstat(StatDcacheSize, nblocks); initround(&dcache.round, "dcache", 120*1000); vtproc(flushproc, nil); vtproc(delaykickroundproc, &dcache.round); } static u32int pbhash(u64int addr) { u32int h; #define hashit(c) ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask) h = (addr >> 32) ^ addr; return hashit(h); } DBlock* getdblock(Part *part, u64int addr, int mode) { DBlock *b; b = _getdblock(part, addr, mode, 1); if(mode == OREAD || mode == ORDWR) addstat(StatDcacheRead, 1); if(mode == OWRITE || mode == ORDWR) addstat(StatDcacheWrite, 1); return b; } DBlock* _getdblock(Part *part, u64int addr, int mode, int load) { DBlock *b; u32int h, size, ms; ms = 0; trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr); size = part->blocksize; if(size > dcache.size){ seterr(EAdmin, "block size %d too big for cache with size %d", size, dcache.size); if(load) addstat(StatDcacheLookup, 1); return nil; } h = pbhash(addr); /* * look for the block in the cache */ qlock(&dcache.lock); again: for(b = dcache.heads[h]; b != nil; b = b->next){ if(b->part == part && b->addr == addr){ if(load) addstat2(StatDcacheHit, 1, StatDcacheLookup, 1); goto found; } } /* * missed: locate the block with the oldest second to last use. * remove it from the heap, and fix up the heap. */ if(!load){ qunlock(&dcache.lock); return nil; } /* * Only start timer here, on cache miss - calling msec() on plain cache hits * makes cache hits system-call bound. */ ms = msec(); addstat2(StatDcacheLookup, 1, StatDcacheMiss, 1); b = bumpdblock(); if(b == nil){ trace(TraceBlock, "all disk cache blocks in use"); addstat(StatDcacheStall, 1); rsleep(&dcache.full); addstat(StatDcacheStall, -1); goto again; } assert(!b->dirty); /* * the new block has no last use, so assume it happens sometime in the middle ZZZ this is not reasonable */ b->used = (b->used2 + dcache.now) / 2; /* * rechain the block on the correct hash chain */ b->next = dcache.heads[h]; dcache.heads[h] = b; if(b->next != nil) b->next->prev = b; b->prev = nil; b->addr = addr; b->part = part; b->size = 0; found: b->ref++; b->used2 = b->used; b->used = dcache.now++; if(b->heap != TWID32) fixheap(b->heap, b); if((mode == ORDWR || mode == OWRITE) && part->writechan == nil){ trace(TraceBlock, "getdblock allocwriteproc %s", part->name); part->writechan = chancreate(sizeof(DBlock*), dcache.nblocks); vtproc(writeproc, part); } qunlock(&dcache.lock); trace(TraceBlock, "getdblock lock"); addstat(StatDblockStall, 1); if(mode == OREAD) rlock(&b->lock); else wlock(&b->lock); addstat(StatDblockStall, -1); trace(TraceBlock, "getdblock locked"); if(b->size != size){ if(mode == OREAD){ addstat(StatDblockStall, 1); runlock(&b->lock); wlock(&b->lock); addstat(StatDblockStall, -1); } if(b->size < size){ if(mode == OWRITE) memset(&b->data[b->size], 0, size - b->size); else{ trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr); diskaccess(0); if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){ b->mode = ORDWR; /* so putdblock wunlocks */ putdblock(b); return nil; } trace(TraceBlock, "getdblock readpartdone"); addstat(StatApartRead, 1); addstat(StatApartReadBytes, size-b->size); } } b->size = size; if(mode == OREAD){ addstat(StatDblockStall, 1); wunlock(&b->lock); rlock(&b->lock); addstat(StatDblockStall, -1); } } b->mode = mode; trace(TraceBlock, "getdblock exit"); if(ms) addstat(StatDcacheLookupTime, msec() - ms); return b; } void putdblock(DBlock *b) { if(b == nil) return; trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr); if(b->mode == OREAD) runlock(&b->lock); else wunlock(&b->lock); qlock(&dcache.lock); if(--b->ref == 0 && !b->dirty){ if(b->heap == TWID32) upheap(dcache.nheap++, b); rwakeupall(&dcache.full); } qunlock(&dcache.lock); } void dirtydblock(DBlock *b, int dirty) { int odirty; trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux", b->part->name, b->addr, dirty, getcallerpc(&b)); assert(b->ref != 0); assert(b->mode==ORDWR || b->mode==OWRITE); odirty = b->dirty; if(b->dirty) assert(b->dirty == dirty); else b->dirty = dirty; qlock(&dcache.lock); if(!odirty){ dcache.ndirty++; setstat(StatDcacheDirty, dcache.ndirty); if(dcache.ndirty >= dcache.maxdirty) kickround(&dcache.round, 0); else delaykickround(&dcache.round); } qunlock(&dcache.lock); } static void unchain(DBlock *b) { ulong h; /* * unchain the block */ if(b->prev == nil){ h = pbhash(b->addr); if(dcache.heads[h] != b) sysfatal("bad hash chains in disk cache"); dcache.heads[h] = b->next; }else b->prev->next = b->next; if(b->next != nil) b->next->prev = b->prev; } /* * remove some block from use and update the free list and counters */ static DBlock* bumpdblock(void) { DBlock *b; trace(TraceBlock, "bumpdblock enter"); b = dcache.free; if(b != nil){ dcache.free = b->next; return b; } if(dcache.ndirty >= dcache.maxdirty) kickdcache(); /* * remove blocks until we find one that is unused * referenced blocks are left in the heap even though * they can't be scavenged; this is simple a speed optimization */ for(;;){ if(dcache.nheap == 0){ kickdcache(); trace(TraceBlock, "bumpdblock gotnothing"); return nil; } b = dcache.heap[0]; delheap(b); if(!b->ref && !b->dirty) break; } trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr); unchain(b); return b; } void emptydcache(void) { DBlock *b; qlock(&dcache.lock); while(dcache.nheap > 0){ b = dcache.heap[0]; delheap(b); if(!b->ref && !b->dirty){ unchain(b); b->next = dcache.free; dcache.free = b; } } qunlock(&dcache.lock); } /* * delete an arbitrary block from the heap */ static void delheap(DBlock *db) { if(db->heap == TWID32) return; fixheap(db->heap, dcache.heap[--dcache.nheap]); db->heap = TWID32; } /* * push an element up or down to it's correct new location */ static void fixheap(int i, DBlock *b) { if(upheap(i, b) == i) downheap(i, b); } static int upheap(int i, DBlock *b) { DBlock *bb; u32int now; int p; now = dcache.now; for(; i != 0; i = p){ p = (i - 1) >> 1; bb = dcache.heap[p]; if(b->used2 - now >= bb->used2 - now) break; dcache.heap[i] = bb; bb->heap = i; } dcache.heap[i] = b; b->heap = i; return i; } static int downheap(int i, DBlock *b) { DBlock *bb; u32int now; int k; now = dcache.now; for(; ; i = k){ k = (i << 1) + 1; if(k >= dcache.nheap) break; if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now) k++; bb = dcache.heap[k]; if(b->used2 - now <= bb->used2 - now) break; dcache.heap[i] = bb; bb->heap = i; } dcache.heap[i] = b; b->heap = i; return i; } static void findblock(DBlock *bb) { DBlock *b, *last; int h; last = nil; h = pbhash(bb->addr); for(b = dcache.heads[h]; b != nil; b = b->next){ if(last != b->prev) sysfatal("bad prev link"); if(b == bb) return; last = b; } sysfatal("block missing from hash table"); } void checkdcache(void) { DBlock *b; u32int size, now; int i, k, refed, nfree; qlock(&dcache.lock); size = dcache.size; now = dcache.now; for(i = 0; i < dcache.nheap; i++){ if(dcache.heap[i]->heap != i) sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap); if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now) sysfatal("dc: bad heap ordering"); k = (i << 1) + 1; if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now) sysfatal("dc: bad heap ordering"); k++; if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now) sysfatal("dc: bad heap ordering"); } refed = 0; for(i = 0; i < dcache.nblocks; i++){ b = &dcache.blocks[i]; if(b->data != &dcache.mem[i * size]) sysfatal("dc: mis-blocked at %d", i); if(b->ref && b->heap == TWID32) refed++; if(b->addr) findblock(b); if(b->heap != TWID32 && dcache.heap[b->heap] != b) sysfatal("dc: spurious heap value"); } nfree = 0; for(b = dcache.free; b != nil; b = b->next){ if(b->addr != 0 || b->heap != TWID32) sysfatal("dc: bad free list"); nfree++; } if(dcache.nheap + nfree + refed != dcache.nblocks) sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks); qunlock(&dcache.lock); } void flushdcache(void) { trace(TraceProc, "flushdcache enter"); kickround(&dcache.round, 1); trace(TraceProc, "flushdcache exit"); } void kickdcache(void) { kickround(&dcache.round, 0); } static int parallelwrites(DBlock **b, DBlock **eb, int dirty) { DBlock **p, **q; Part *part; for(p=b; p<eb && (*p)->dirty == dirty; p++){ assert(b<=p && p<eb); sendp((*p)->part->writechan, *p); } q = p; for(p=b; p<q; p++){ assert(b<=p && p<eb); recvp((*p)->writedonechan); } /* * Flush the partitions that have been written to. */ part = nil; for(p=b; p<q; p++){ if(part != (*p)->part){ part = (*p)->part; flushpart(part); /* what if it fails? */ } } return p-b; } /* * Sort first by dirty flag, then by partition, then by address in partition. */ static int writeblockcmp(const void *va, const void *vb) { DBlock *a, *b; a = *(DBlock**)va; b = *(DBlock**)vb; if(a->dirty != b->dirty) return a->dirty - b->dirty; if(a->part != b->part){ if(a->part < b->part) return -1; if(a->part > b->part) return 1; } if(a->addr < b->addr) return -1; return 1; } static void flushproc(void *v) { int i, j, n; ulong t0; DBlock *b, **write; USED(v); threadsetname("flushproc"); for(;;){ waitforkick(&dcache.round); trace(TraceWork, "start"); t0 = nsec()/1000; trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0); write = dcache.write; n = 0; for(i=0; i<dcache.nblocks; i++){ b = &dcache.blocks[i]; if(b->dirty) write[n++] = b; } qsort(write, n, sizeof(write[0]), writeblockcmp); /* Write each stage of blocks out. */ trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0); i = 0; for(j=1; j<DirtyMax; j++){ trace(TraceProc, "writeblocks.%d t=%lud", j, (ulong)(nsec()/1000)-t0); i += parallelwrites(write+i, write+n, j); } if(i != n){ fprint(2, "in flushproc i=%d n=%d\n", i, n); for(i=0; i<n; i++) fprint(2, "\tblock %d: dirty=%d\n", i, write[i]->dirty); abort(); } /* * b->dirty is protected by b->lock while ndirty is protected * by dcache.lock, so the --ndirty below is the delayed one * from clearing b->dirty in the write proc. It may happen * that some other proc has come along and redirtied b since * the write. That's okay, it just means that ndirty may be * one too high until we catch up and do the decrement. */ trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0); qlock(&dcache.lock); for(i=0; i<n; i++){ b = write[i]; --dcache.ndirty; if(b->ref == 0 && b->heap == TWID32){ upheap(dcache.nheap++, b); rwakeupall(&dcache.full); } } setstat(StatDcacheDirty, dcache.ndirty); qunlock(&dcache.lock); addstat(StatDcacheFlush, 1); trace(TraceWork, "finish"); } } static void writeproc(void *v) { DBlock *b; Part *p; p = v; threadsetname("writeproc:%s", p->name); for(;;){ b = recvp(p->writechan); trace(TraceWork, "start"); assert(b->part == p); trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr); wlock(&b->lock); trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr); diskaccess(0); if(writepart(p, b->addr, b->data, b->size) < 0) fprint(2, "%s: writeproc: part %s addr 0x%llux: write error: %r\n", argv0, p->name, b->addr); addstat(StatApartWrite, 1); addstat(StatApartWriteBytes, b->size); b->dirty = 0; wunlock(&b->lock); trace(TraceProc, "finish %s 0x%llux", p->name, b->addr); trace(TraceWork, "finish"); sendp(b->writedonechan, b); } }