ref: 57f8b3d1acf273a88568c49d0b83b6863485b9e0
dir: /sys/src/cmd/disk/qcowfs.c/
/* Adapted from OpenBSD's src/usr.sbin/vmd/vioqcow2.c */

#include <u.h>
#include <libc.h>
#include <fcall.h>
#include <thread.h>
#include <9p.h>

typedef struct Header Header;
typedef struct Disk Disk;

/*
 * In-memory copy of the image header.  writehdr and readhdr
 * convert between this struct and the packed, big-endian
 * HDRSZ-byte form stored at the start of the file.
 */
struct Header {
	char magic[4];		/* not NUL terminated */
	u32int version;
	u64int backingoff;
	u32int backingsz;
	u32int clustershift;
	u64int disksz;
	u32int cryptmethod;
	u32int l1sz;
	u64int l1off;
	u64int refoff;
	u32int refsz;
	u32int snapcount;
	u64int snapsz;
	/* v3 additions */
	u64int incompatfeatures;
	u64int compatfeatures;
	u64int autoclearfeatures;
	u32int reforder; /* Bits = 1 << reforder */
	u32int headersz;
};

#define QCOW2_COMPRESSED	0x4000000000000000ull
#define QCOW2_INPLACE		0x8000000000000000ull

char *MAGIC_QCOW = "QFI\xfb";

enum{
	QCOW2_DIRTY		= 1 << 0,
	QCOW2_CORRUPT		= 1 << 1,

	ICFEATURE_DIRTY		= 1 << 0,
	ICFEATURE_CORRUPT	= 1 << 1,

	ACFEATURE_BITEXT	= 1 << 0,

	/* packed size of every Header field, in declaration order */
	HDRSZ = 4 + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 8 + 8 + 4 + 4 + 8 + 8 + 8 + 8 + 4 + 4,
	/* a version 2 header stops before the "v3 additions" */
	V2HDRSZ = 4 + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 8 + 8 + 4 + 4 + 8,

	/* accepted range of Header.clustershift */
	MINCLUSTERSHIFT	= 9,
	MAXCLUSTERSHIFT	= 21,
};

/* State of one open image. */
struct Disk {
	RWLock lock;
	Disk *base;
	Header h;

	int fd;
	u64int *l1;	/* L1 table, converted to host byte order */
	s64int end;	/* file length; new clusters are allocated here */
	s64int clustersz;
	s64int disksz; /* In bytes */
	u32int cryptmethod;

	u32int l1sz;
	s64int l1off;

	s64int refoff;
	s64int refsz;

	u32int nsnap;
	s64int snapoff;

	/* v3 features */
	u64int incompatfeatures;
	u64int autoclearfeatures;
	u32int refssz;
	u32int headersz;
};

/*
 * Big-endian load/store helpers.  Each expansion is fully
 * parenthesized so it can be used inside a larger expression
 * (e.g. GET2(p) + 1) without being torn apart by precedence.
 */
#define PUT2(p, u) ((p)[0] = (u)>>8, (p)[1] = (u))
#define GET2(p) ((u16int)(p)[1] | (u16int)(p)[0]<<8)
#define PUT4(p, u) ((p)[0] = (u)>>24, (p)[1] = (u)>>16, (p)[2] = (u)>>8, (p)[3] = (u))
#define GET4(p)	((u32int)(p)[3] | (u32int)(p)[2]<<8 | (u32int)(p)[1]<<16 | (u32int)(p)[0]<<24)

#define PUT8(p, u) ((p)[0] = (u)>>56, (p)[1] = (u)>>48, (p)[2] = (u)>>40, (p)[3] = (u)>>32, \
	(p)[4] = (u)>>24, (p)[5] = (u)>>16, (p)[6] = (u)>>8, (p)[7] = (u))

#define GET8(p)	((u64int)(p)[7] | (u64int)(p)[6]<<8 | (u64int)(p)[5]<<16 | (u64int)(p)[4]<<24 | \
	(u64int)(p)[3]<<32 | (u64int)(p)[2]<<40 | (u64int)(p)[1]<<48 | (u64int)(p)[0]<<56)

/*
 * Set the length of the file open on fd by writing a Dir in
 * which only the length is specified.
 * Returns 0 on success, -1 if length is negative or the
 * wstat fails.
 */
int
ftruncate(int fd, s64int length)
{
	Dir d;

	if(length < 0)
		return -1;
	nulldir(&d);
	d.length = length;
	if(dirfwstat(fd, &d) < 0)
		return -1;
	return 0;
}

/*
 * Serialize src into its HDRSZ-byte big-endian form and write
 * it at the current offset of fd.  Exits on a failed or short
 * write.
 */
static void
writehdr(Header *src, int fd)
{
	uchar store[HDRSZ];
	uchar *buf = store;

	/* magic is a fixed 4-byte field without a terminator: never strlen it */
	memcpy(buf, src->magic, sizeof src->magic);
	buf += 4;
	PUT4(buf, src->version);
	buf += 4;
	PUT8(buf, src->backingoff);
	buf += 8;
	PUT4(buf, src->backingsz);
	buf += 4;
	PUT4(buf, src->clustershift);
	buf += 4;
	PUT8(buf, src->disksz);
	buf += 8;
	PUT4(buf, src->cryptmethod);
	buf += 4;
	PUT4(buf, src->l1sz);
	buf += 4;
	PUT8(buf, src->l1off);
	buf += 8;
	PUT8(buf, src->refoff);
	buf += 8;
	PUT4(buf, src->refsz);
	buf += 4;
	PUT4(buf, src->snapcount);
	buf += 4;
	PUT8(buf, src->snapsz);
	buf += 8;
	PUT8(buf, src->incompatfeatures);
	buf += 8;
	PUT8(buf, src->compatfeatures);
	buf += 8;
	PUT8(buf, src->autoclearfeatures);
	buf += 8;
	PUT4(buf, src->reforder);
	buf += 4;
	PUT4(buf, src->headersz);

	if(write(fd, store, sizeof store) != sizeof store)
		sysfatal("writehdr: %r");
}

/*
 * Read HDRSZ bytes from the current offset of fd and decode
 * them into dst.  Exits on a short read, a bad magic, or a
 * version other than 2 or 3.
 */
static void
readhdr(Header *dst, int fd)
{
	uchar store[HDRSZ];
	uchar *buf = store;

	if(readn(fd, store, sizeof store) != sizeof store)
		sysfatal("short read on header: %r");
	if(memcmp(MAGIC_QCOW, buf, strlen(MAGIC_QCOW)) != 0)
		sysfatal("invalid magic");
	memcpy(dst->magic, buf, sizeof dst->magic);
	buf += 4;
	dst->version = GET4(buf);
	if(dst->version != 2 && dst->version != 3)
		sysfatal("unsupported version: %d", dst->version);
	buf += 4;
	dst->backingoff = GET8(buf);
	buf += 8;
	dst->backingsz = GET4(buf);
	buf += 4;
	dst->clustershift = GET4(buf);
	buf += 4;
	dst->disksz = GET8(buf);
	buf += 8;
	dst->cryptmethod = GET4(buf);
	buf += 4;
	dst->l1sz = GET4(buf);
	buf += 4;
	dst->l1off = GET8(buf);
	buf += 8;
	dst->refoff = GET8(buf);
	buf += 8;
	dst->refsz = GET4(buf);
	buf += 4;
	dst->snapcount = GET4(buf);
	buf += 4;
	dst->snapsz = GET8(buf);
	buf += 8;

	if(dst->version == 2){
		/*
		 * A version 2 header ends here; the bytes that follow
		 * are not header fields.  Use the values the format
		 * defines for v2: no feature bits, 16-bit refcounts.
		 */
		dst->incompatfeatures = 0;
		dst->compatfeatures = 0;
		dst->autoclearfeatures = 0;
		dst->reforder = 4;
		dst->headersz = V2HDRSZ;
		return;
	}

	dst->incompatfeatures = GET8(buf);
	buf += 8;
	dst->compatfeatures = GET8(buf);
	buf += 8;
	dst->autoclearfeatures = GET8(buf);
	buf += 8;
	dst->reforder = GET4(buf);
	buf += 4;
	dst->headersz = GET4(buf);
}

/* Round sz up to a multiple of align; align must be a power of two. */
#define ALIGNSZ(sz, align) (((sz) + (align) - 1) & ~((align) - 1))

/*
 * Write an empty version 3 image with 64K clusters and a
 * virtual size of disksz bytes to fd.
 *
 * File layout, each part starting on a cluster boundary:
 *	header | L1 table | refcount table | first refcount block
 * The refcount block is placed at initsz and gives every
 * cluster written so far (including itself) a count of one.
 */
static void
qc2create(int fd, u64int disksz)
{
	Header hdr;
	s64int base_len;
	u64int l1sz, refsz, initsz, clustersz;
	u64int l1off, refoff, i, l1entrysz, refentrysz;
	uchar v[8], v2[2];

	clustersz = 1<<16;
	l1off = ALIGNSZ(HDRSZ, clustersz);

	/* bytes of disk mapped by one L1 entry: one cluster of 8-byte L2 entries */
	l1entrysz = clustersz * clustersz / 8;
	l1sz = (disksz + l1entrysz - 1) / l1entrysz;

	refoff = ALIGNSZ(l1off + 8*l1sz, clustersz);
	refentrysz = clustersz * clustersz * clustersz / 2;
	refsz = (disksz + refentrysz - 1) / refentrysz;

	initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz);
	base_len = 0;

	memset(&hdr, 0, sizeof hdr);
	memcpy(hdr.magic, MAGIC_QCOW, sizeof hdr.magic);
	hdr.version = 3;
	hdr.backingoff = 0;
	hdr.backingsz = base_len;
	hdr.clustershift = 16;
	hdr.disksz = disksz;
	hdr.cryptmethod = 0;
	hdr.l1sz = l1sz;
	hdr.l1off = l1off;
	hdr.refoff = refoff;
	hdr.refsz = refsz;
	hdr.snapcount = 0;
	hdr.snapsz = 0;
	hdr.incompatfeatures = 0;
	hdr.compatfeatures = 0;
	hdr.autoclearfeatures = 0;
	hdr.reforder = 4;
	hdr.headersz = HDRSZ;

	writehdr(&hdr, fd);

	/* one extra cluster past initsz holds the first refcount block */
	if(ftruncate(fd, (s64int)initsz + clustersz) == -1)
		sysfatal("ftruncate: %r");

	/* the initial clusters must all be countable from a single refcount block */
	assert(initsz/clustersz < clustersz/2);

	/* first refcount table entry points at the refcount block */
	PUT8(v, initsz);
	if(pwrite(fd, v, sizeof v, refoff) != sizeof v)
		sysfatal("q2create: pwrite: %r");

	/* 2-byte refcount of one for each cluster up to and including the block */
	for(i = 0; i < initsz/clustersz + 1; i++){
		PUT2(v2, 1);
		if(pwrite(fd, v2, sizeof v2, initsz + 2*i) != sizeof v2)
			sysfatal("q2create: pwrite: %r");
	}
}

/*
 * Fill in disk from the image open on fd: read and validate
 * the header, load the L1 table into memory in host byte
 * order, and record the current file length as the allocation
 * point.  Exits on any failure.
 *
 * NOTE(review): cryptmethod, backingoff and unknown
 * incompatfeatures bits are recorded but not acted upon.
 */
static void
qc2open(Disk *disk, int fd)
{
	u32int i;
	long l1bytes;
	Dir *d;
	uchar buf[8];

	disk->fd = fd;
	disk->base = nil;
	disk->l1 = nil;
	readhdr(&disk->h, disk->fd);

	/* reject shifts that would make the cluster size meaningless or overflow */
	if(disk->h.clustershift < MINCLUSTERSHIFT || disk->h.clustershift > MAXCLUSTERSHIFT)
		sysfatal("unsupported cluster shift %ud", disk->h.clustershift);

	disk->clustersz = 1ull << disk->h.clustershift;
	disk->disksz = disk->h.disksz;
	disk->cryptmethod = disk->h.cryptmethod;
	disk->l1sz = disk->h.l1sz;
	disk->l1off = disk->h.l1off;
	disk->refsz = disk->h.refsz;
	disk->refoff = disk->h.refoff;
	disk->nsnap = disk->h.snapcount;
	disk->snapoff = disk->h.snapsz;

	disk->incompatfeatures = disk->h.incompatfeatures;
	disk->autoclearfeatures = disk->h.autoclearfeatures;
	disk->refssz = disk->h.refsz;
	disk->headersz = disk->h.headersz;

	/* only 16-bit refcounts (1 << 4 bits) are implemented */
	if(disk->h.reforder != 4)
		sysfatal("unsupported refcount size %d", disk->h.reforder);

	/* the byte count must fit the long taken by pread */
	if(disk->l1sz > 0x7fffffff / 8)
		sysfatal("qc2open: l1 table too large");
	l1bytes = disk->l1sz * 8;
	disk->l1 = mallocz(l1bytes, 1);
	if(disk->l1 == nil && l1bytes != 0)
		sysfatal("qc2open: out of memory");
	if(pread(disk->fd, disk->l1, l1bytes, disk->l1off) != l1bytes)
		sysfatal("qc2open: could not read l1 table: %r");

	/* entries are big-endian on disk; convert in place */
	for(i = 0; i < disk->l1sz; i++){
		memcpy(buf, disk->l1 + i, sizeof buf);
		disk->l1[i] = GET8(buf);
	}

	d = dirfstat(fd);
	if(d == nil)
		sysfatal("dirfstat: %r");
	disk->end = d->length;
	free(d);
}

/*
 * Translate the virtual disk offset off into an offset in the
 * image file by walking the L1 and L2 tables.
 *
 * Returns the file offset, 0 if nothing is mapped there, or -1
 * if off is negative, lies outside the L1 table, or the L2
 * entry cannot be read.  If inplace is not nil, *inplace is
 * set to whether the L2 entry carries QCOW2_INPLACE.
 */
static s64int
xlate(Disk *disk, s64int off, int *inplace)
{
	s64int l2sz, l1off, l2tab, l2off, cluster, clusteroff;
	uchar buf[8];

	/*
	 * Clear out inplace flag -- xlate misses should not
	 * be flagged as updatable in place. We will still
	 * return 0 from them, but this leaves less surprises
	 * in the API.
	 */
	if(inplace)
		*inplace = 0;
	rlock(&disk->lock);
	if(off < 0)
		goto err;

	l2sz = disk->clustersz / 8;
	l1off = (off / disk->clustersz) / l2sz;
	if(l1off >= disk->l1sz)
		goto err;

	l2tab = disk->l1[l1off];
	l2tab &= ~QCOW2_INPLACE;
	if(l2tab == 0){
		runlock(&disk->lock);
		return 0;
	}
	l2off = (off / disk->clustersz) % l2sz;
	if(pread(disk->fd, buf, sizeof buf, l2tab + l2off * 8) != sizeof buf)
		goto err;
	cluster = GET8(buf);
	/*
	 * cluster may be 0, but all future operations don't affect
	 * the return value.
	 */
	if(inplace)
		*inplace = !!(cluster & QCOW2_INPLACE);
	if(cluster & QCOW2_COMPRESSED)
		sysfatal("xlate: compressed clusters unsupported");
	runlock(&disk->lock);
	clusteroff = 0;
	cluster &= ~QCOW2_INPLACE;
	if(cluster)
		clusteroff = off % disk->clustersz;
	return cluster + clusteroff;
err:
	runlock(&disk->lock);
	return -1;
}

/*
 * Record a reference to the cluster at file offset off.
 * The refcount block covering it is allocated at the end of
 * the file if the refcount table has no entry for it yet.
 * With newcluster set the count is written as 1; otherwise
 * the stored count is read and incremented.  Exits on I/O
 * failure.
 */
static void
inc_refs(Disk *disk, s64int off, int newcluster)
{
	s64int l1off, l1idx, l2idx, l2cluster;
	u64int nper;
	u16int refs;
	uchar buf[8], buf2[2];

	off &= ~QCOW2_INPLACE;
	nper = disk->clustersz / 2;	/* 2-byte counts per refcount block */
	l1idx = (off / disk->clustersz) / nper;
	l2idx = (off / disk->clustersz) % nper;
	l1off = disk->refoff + 8 * l1idx;
	if(pread(disk->fd, buf, sizeof(buf), l1off) != 8)
		sysfatal("could not read refs");

	l2cluster = GET8(buf);
	if(l2cluster == 0){
		l2cluster = disk->end;
		disk->end += disk->clustersz;
		if(ftruncate(disk->fd, disk->end) < 0)
			sysfatal("inc_refs: failed to allocate ref block");
		PUT8(buf, l2cluster);
		if(pwrite(disk->fd, buf, sizeof(buf), l1off) != 8)
			sysfatal("inc_refs: failed to write ref block");
	}

	refs = 1;
	if(!newcluster){
		if(pread(disk->fd, buf2, sizeof buf2, l2cluster + 2 * l2idx) != 2)
			sysfatal("could not read ref cluster");
		refs = GET2(buf2) + 1;
	}
	PUT2(buf2, refs);
	if(pwrite(disk->fd, buf2, sizeof buf2, l2cluster + 2 * l2idx) != 2)
		sysfatal("inc_refs: could not write ref block");
}

/*
 * Copy the cluster containing file offset src in base to the
 * cluster containing file offset dst in disk.  Both offsets
 * are rounded down to a cluster boundary.
 */
static void
copy_cluster(Disk *disk, Disk *base, u64int dst, u64int src)
{
	char *scratch;

	/* zeroed, so a short read cannot leak heap contents into the image */
	scratch = mallocz(disk->clustersz, 1);
	if(!scratch)
		sysfatal("out of memory");
	src &= ~(disk->clustersz - 1);
	dst &= ~(disk->clustersz - 1);
	if(pread(base->fd, scratch, disk->clustersz, src) == -1)
		sysfatal("copy_cluster: could not read cluster");
	if(pwrite(disk->fd, scratch, disk->clustersz, dst) == -1)
		sysfatal("copy_cluster: could not write cluster");
	free(scratch);
}

/*
 * Allocates a new cluster on disk, creating a new L2 table
 * if needed. The cluster starts off with a refs of one,
 * and the writable bit set.  If src_phys is positive, the
 * cluster containing that file offset in base is copied into
 * the new cluster first.
 *
 * Returns the physical address within the new cluster of the
 * write offset.  Every failure in here is fatal, so no error
 * value is actually returned.
 */
static s64int
mkcluster(Disk *disk, Disk *base, s64int off, s64int src_phys)
{
	s64int l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig;
	uchar buf[8];

	wlock(&disk->lock);

	/* L1 entries always exist */
	l2sz = disk->clustersz / 8;
	l1off = off / (disk->clustersz * l2sz);
	if(l1off >= disk->l1sz)
		sysfatal("l1 offset outside disk");

	/* allocate on a cluster boundary */
	disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1);

	l2tab = disk->l1[l1off];
	l2off = (off / disk->clustersz) % l2sz;
	/* We may need to create or clone an L2 entry to map the block */
	if(l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0){
		orig = l2tab & ~QCOW2_INPLACE;
		l2tab = disk->end;
		disk->end += disk->clustersz;
		if(ftruncate(disk->fd, disk->end) == -1)
			sysfatal("mkcluster: ftruncate failed");

		/*
		 * If we translated, found a L2 entry, but it needed to
		 * be copied, copy it.
		 */
		if(orig != 0)
			copy_cluster(disk, disk, l2tab, orig);
		/* Update l1 -- we flush it later */
		disk->l1[l1off] = l2tab | QCOW2_INPLACE;
		inc_refs(disk, l2tab, 1);
	}
	l2tab &= ~QCOW2_INPLACE;

	/* Grow the disk */
	if(ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
		sysfatal("mkcluster: could not grow disk");
	if(src_phys > 0)
		copy_cluster(disk, base, disk->end, src_phys);
	cluster = disk->end;
	disk->end += disk->clustersz;
	PUT8(buf, cluster | QCOW2_INPLACE);
	if(pwrite(disk->fd, buf, sizeof(buf), l2tab + l2off * 8) != 8)
		sysfatal("mkcluster: could not write cluster");

	PUT8(buf, disk->l1[l1off]);
	if(pwrite(disk->fd, buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
		sysfatal("mkcluster: could not write l1");
	inc_refs(disk, cluster, 1);

	wunlock(&disk->lock);

	/* the first cluster of the file holds the header */
	clusteroff = off % disk->clustersz;
	if(cluster + clusteroff < disk->clustersz)
		sysfatal("write would clobber header");

	return cluster + clusteroff;
}

/*
 * 9P read handler for the data file.  The request is clamped
 * to the virtual disk size and served one cluster at a time;
 * unmapped clusters read as zeros.
 */
static void
fsread(Req *r)
{
	char *buf;
	Disk *d;
	s64int off, phys_off, cluster_off;
	u64int len, sz, rem;
	long n;

	off = r->ifcall.offset;
	buf = r->ofcall.data;
	len = r->ifcall.count;
	d = r->fid->file->aux;

	if(off < 0){
		respond(r, "bad offset");
		return;
	}
	/* at or past the end: nothing to read, and nothing to subtract from len */
	if(off >= d->disksz){
		r->ofcall.count = 0;
		respond(r, nil);
		return;
	}
	if(len > d->disksz - off)
		len = d->disksz - off;

	rem = len;
	while(rem != 0){
		cluster_off = off % d->clustersz;
		sz = d->clustersz - cluster_off;
		if(sz > rem)
			sz = rem;

		/* each cluster is looked up on its own; a hole must not affect the next one */
		phys_off = xlate(d, off, nil);
		if(phys_off < 0){
			respond(r, "xlate error");
			return;
		}
		if(phys_off == 0)
			memset(buf, 0, sz);
		else{
			n = pread(d->fd, buf, sz, phys_off);
			if(n < 0){
				responderror(r);
				return;
			}
			if(n == 0){
				/* mapped cluster lies beyond the end of the file */
				respond(r, "short read");
				return;
			}
			sz = n;
		}
		off += sz;
		buf += sz;
		rem -= sz;
	}
	r->ofcall.count = len;
	respond(r, nil);
}

/*
 * 9P write handler for the data file.  Writes must lie wholly
 * inside the virtual disk.  Each cluster touched is written
 * in place only when its L2 entry carries QCOW2_INPLACE;
 * otherwise (unmapped, or mapped without the flag) a fresh
 * cluster is allocated with mkcluster, which copies the old
 * contents when there are any.
 */
static void
fswrite(Req *r)
{
	char *buf;
	Disk *d;
	s64int off, phys_off, end, cluster_off;
	u64int len, sz, rem;
	int inplace;

	off = r->ifcall.offset;
	buf = r->ifcall.data;
	len = r->ifcall.count;
	d = r->fid->file->aux;

	if(off < 0){
		respond(r, "bad offset");
		return;
	}
	/* compare without forming off+len first, which could overflow */
	if(off > d->disksz || len > d->disksz - off){
		respond(r, "end of device");
		return;
	}
	end = off + len;

	rem = len;
	while(off != end){
		cluster_off = off % d->clustersz;
		sz = d->clustersz - cluster_off;
		if(sz > rem)
			sz = rem;

		inplace = 0;
		phys_off = xlate(d, off, &inplace);
		if(phys_off == -1){
			respond(r, "xlate error");
			return;
		}

		if(!inplace || phys_off == 0)
			phys_off = mkcluster(d, d, off, phys_off);
		if(phys_off == -1){
			respond(r, "mkcluster error");
			return;
		}
		if(phys_off < d->clustersz)
			sysfatal("fswrite: writing reserved cluster");
		if(pwrite(d->fd, buf, sz, phys_off) != sz){
			respond(r, "phase error");
			return;
		}
		off += sz;
		buf += sz;
		rem -= sz;
	}
	r->ofcall.count = len;
	respond(r, nil);
}

Srv fs = {
	.read = fsread,
	.write = fswrite,
};

static void
usage(void)
{
	fprint(2, "usage: %s [-s srv] [-m mntpt ] [-n size] file\n", argv0);
	exits("usage");
}

/*
 * Serve the contents of a qcow2 image as a single file named
 * "data".  With -n size the image file is created (replacing
 * any existing file) before being served.
 */
void
main(int argc, char **argv)
{
	int fd;
	char *uid;
	File *f;
	Disk *d;
	uvlong size;
	int nflag;
	char *mntpt = "/mnt/qcow";
	char *srvname = nil;

	size = 0;
	nflag = 0;
	ARGBEGIN{
	case 'm':
		mntpt = EARGF(usage());
		break;
	case 'n':
		size = strtoull(EARGF(usage()), nil, 0);
		nflag++;
		break;
	case 's':
		srvname = EARGF(usage());
		break;
	default:
		usage();
		break;
	}ARGEND
	if(argc < 1)
		usage();

	if(nflag){
		/* a zero size (or unparsable number) would produce an image with no refcount table */
		if(size == 0)
			sysfatal("invalid size");
		if((fd = create(argv[0], ORDWR, 0666)) < 0)
			sysfatal("create: %r");
		qc2create(fd, size);
		/* qc2open reads the header from the current offset */
		seek(fd, 0, 0);
	}
	else if((fd = open(argv[0], ORDWR)) < 0)
		sysfatal("open: %r");

	uid = getuser();
	fs.tree = alloctree(uid, uid, 0755, nil);
	if(fs.tree == nil)
		sysfatal("alloctree: %r");

	f = createfile(fs.tree->root, "data", uid, 0666, nil);
	if(f == nil)
		sysfatal("createfile: %r");
	d = mallocz(sizeof(Disk), 1);
	if(d == nil)
		sysfatal("out of memory");
	qc2open(d, fd);
	f->aux = d;
	f->length = d->disksz;
	postmountsrv(&fs, srvname, mntpt, MREPL);
	exits(nil);
}