ref: f58d99aa7a97ba5f79af89f38b78d5924d4e35a2
parent: c3589ef3cf33189d342a3ab638b558fd9249b220
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Sun Jul 11 07:24:13 EDT 2021
virtio: add non-legacy virtio 1.0 drivers for disk and ethernet The new interface uses pci capability structures to locate the registers in a rather fine-grained way, making it more complicated as they can be located anywhere in any pci bar at any offset. As far as I can see, qemu (6.0.50) never uses i/o bars in non-legacy mode, so only mmio is implemented for now. The previous virtio drivers implemented only the legacy interface, which uses i/o ports for all register accesses. This is still the preferred method (and also the qemu default) as it is easier to emulate and most likely faster. However, some vps providers like vultr force the legacy interface to be disabled with the qemu -device option "disable-legacy=on", resulting in a system without a disk and ethernet.
--- a/sys/src/9/pc/ethervirtio.c
+++ b/sys/src/9/pc/ethervirtio.c
@@ -1,3 +1,7 @@
+/*
+ * virtio ethernet driver implementing the legacy interface:
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ */
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
@@ -9,11 +13,6 @@
#include "../port/netif.h"
#include "../port/etherif.h"
-/*
- * virtio ethernet driver
- * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
- */
-
typedef struct Vring Vring;
typedef struct Vdesc Vdesc;
typedef struct Vused Vused;
@@ -555,13 +554,14 @@
h = t = nil;
/* §4.1.2 PCI Device Discovery */
- for(p = nil; p = pcimatch(p, 0, 0);){
- if(p->vid != 0x1AF4)
- continue;
+ for(p = nil; p = pcimatch(p, 0x1AF4, 0);){
/* the two possible DIDs for virtio-net */
if(p->did != 0x1000 && p->did != 0x1041)
continue;
- /* non-transitional devices will have a revision > 0 */
+ /*
+ * non-transitional devices will have a revision > 0,
+ * these are handled by ethervirtio10 driver.
+ */
if(p->rid != 0)
continue;
/* first membar needs to be I/O */
@@ -588,6 +588,8 @@
/* §3.1.2 Legacy Device Initialization */
outb(c->port+Qstatus, 0);
+ while(inb(c->port+Qstatus) != 0)
+ delay(1);
outb(c->port+Qstatus, Sacknowledge|Sdriver);
/* negotiate feature bits */
--- /dev/null
+++ b/sys/src/9/pc/ethervirtio10.c
@@ -1,0 +1,790 @@
+/*
+ * virtio 1.0 ethernet driver
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ *
+ * In contrast to ethervirtio.c, this driver handles the non-legacy
+ * interface for virtio ethernet which uses mmio for all register accesses
+ * and requires an elaborate pci capability structure dance to get working.
+ *
+ * It is kind of pointless as it is most likely slower than
+ * port i/o (harder to emulate on the pc platform).
+ *
+ * The reason why this driver is needed is that vultr sets the
+ * disable-legacy=on option in the -device parameter for qemu
+ * on their hypervisor.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/pci.h"
+#include "../port/error.h"
+#include "../port/netif.h"
+#include "../port/etherif.h"
+
+typedef struct Vconfig Vconfig;
+typedef struct Vnetcfg Vnetcfg;
+
+typedef struct Vring Vring;
+typedef struct Vdesc Vdesc;
+typedef struct Vused Vused;
+typedef struct Vheader Vheader;
+typedef struct Vqueue Vqueue;
+
+typedef struct Ctlr Ctlr;
+
+enum {
+ /* §2.1 Device Status Field */
+ Sacknowledge = 1,
+ Sdriver = 2,
+ Sdriverok = 4,
+ Sfeatureok = 8,
+ Sfailed = 128,
+
+ /* flags in Qnetstatus; NOTE(review): presumably the Vnetcfg status field in this driver -- confirm */
+ Nlinkup = (1<<0),
+ Nannounce = (1<<1),
+
+ /* feat[0] bits */
+ Fmac = 1<<5,
+ Fstatus = 1<<16,
+ Fctrlvq = 1<<17,
+ Fctrlrx = 1<<18,
+
+ /* feat[1] bits */
+ Fversion1 = 1<<(32-32), /* feature bit 32, i.e. bit 0 of feat[1] */
+
+ /* vring used flags */
+ Unonotify = 1, /* when set in used->flags, vqnotify() skips the notify write */
+ /* vring avail flags */
+ Rnointerrupt = 1,
+
+ /* descriptor flags */
+ Dnext = 1, /* chain continues at desc[next] */
+ Dwrite = 2, /* set on the receive buffers and on the ctl ack byte */
+ Dindirect = 4,
+
+ /* struct sizes */
+ VringSize = 4,
+ VdescSize = 16,
+ VusedSize = 8,
+ VheaderSize = 12, /* NOTE(review): the declared Vheader fields total 10 bytes; presumably the extra 2 cover the virtio 1.0 num_buffers field -- confirm */
+
+ Vrxq = 0, /* indices into Ctlr.queue[] */
+ Vtxq = 1,
+ Vctlq = 2,
+
+ /* class/cmd for Vctlq */
+ CtrlRx = 0x00,
+ CmdPromisc = 0x00,
+ CmdAllmulti = 0x01,
+ CtrlMac = 0x01,
+ CmdMacTableSet = 0x00,
+ CtrlVlan= 0x02,
+ CmdVlanAdd = 0x00,
+ CmdVlanDel = 0x01,
+};
+
+struct Vconfig { /* register block mapped from the type 1 virtio pci capability (see pciprobe) */
+ u32int devfeatsel; /* selects which 32-bit feature word devfeat returns */
+ u32int devfeat;
+ u32int drvfeatsel; /* selects which 32-bit feature word drvfeat writes */
+ u32int drvfeat;
+
+ u16int msixcfg;
+ u16int nqueues;
+
+ u8int status; /* device status bits (S* constants); pciprobe writes 0 to reset */
+ u8int cfggen;
+ u16int queuesel; /* selects the queue the queue* fields below refer to */
+
+ u16int queuesize;
+ u16int queuemsixvect;
+
+ u16int queueenable;
+ u16int queuenotifyoff; /* multiplied by notifyoffmult to locate the queue's notify register */
+
+ u64int queuedesc; /* loaded with PADDR() of the descriptor table */
+ u64int queueavail; /* loaded with PADDR() of the avail ring */
+ u64int queueused; /* loaded with PADDR() of the used ring */
+};
+
+struct Vnetcfg /* device specific registers mapped from the type 4 capability */
+{
+ u16int mac0; /* reset() accesses the first Eaddrlen bytes of this struct as the mac address */
+ u16int mac1;
+ u16int mac2;
+ u16int status;
+ u16int maxqueuepairs;
+ u16int mtu;
+};
+
+struct Vring /* common head of the avail and used rings */
+{
+ u16int flags;
+ u16int idx; /* free running index, masked with qmask before indexing ring entries */
+};
+
+struct Vdesc /* one descriptor table entry */
+{
+ u64int addr; /* filled with PADDR() of the buffer */
+ u32int len;
+ u16int flags; /* D* constants */
+ u16int next; /* index of the following descriptor when Dnext is set */
+};
+
+struct Vused /* one used ring entry */
+{
+ u32int id; /* head descriptor index of the completed chain */
+ u32int len; /* rxproc uses this as header plus payload length */
+};
+
+struct Vheader /* per packet header placed in front of each frame */
+{
+ u8int flags;
+ u8int segtype;
+ u16int hlen;
+ u16int seglen;
+ u16int csumstart;
+ u16int csumend;
+};
+
+struct Vqueue /* driver side state of one virtqueue */
+{
+ Rendez; /* sleep(q, ...) and wakeup(q) rendezvous on the queue itself */
+
+ uint qsize; /* number of descriptors, a power of two (checked in pciprobe) */
+ uint qmask; /* qsize-1 */
+
+ Vdesc *desc;
+
+ Vring *avail;
+ u16int *availent; /* qsize entries following the avail header */
+ u16int *availevent;
+
+ Vring *used;
+ Vused *usedent; /* qsize entries following the used header */
+ u16int *usedevent;
+ u16int lastused; /* used->idx value up to which completions were retired */
+
+ uint nintr; /* wakeups issued by interrupt() */
+ uint nnote; /* notifications written by vqnotify() */
+
+ /* notify register */
+ void *notify;
+};
+
+struct Ctlr { /* per device state, linked into the ctlrhead list by pciprobe() */
+ Lock; /* taken with ilock() in attach() */
+
+ QLock ctllock; /* serializes vctlcmd() */
+
+ int attached; /* set once by attach() */
+
+ /* registers */
+ Vconfig *cfg;
+ Vnetcfg *dev;
+ u8int *isr; /* bit 0 is tested in interrupt() */
+ u8int *notify;
+ u32int notifyoffmult;
+
+ uvlong port; /* base of the bar holding cfg, matched against edev->port in reset() */
+ Pcidev *pcidev;
+ Ctlr *next;
+ int active; /* claimed by an Ether in reset() */
+ ulong feat[2]; /* feature words as offered by the device */
+ int nqueue; /* number of initialized entries in queue[] */
+
+ /* virtioether has 3 queues: rx, tx and ctl */
+ Vqueue queue[3];
+};
+
+static Ctlr *ctlrhead; /* filled by the first reset() call */
+
+/*
+ * Sleep condition for a virtqueue: non-zero when the used ring index
+ * has moved away from the last index the driver retired, i.e. there
+ * are completions waiting to be processed.
+ */
+static int
+vhasroom(void *v)
+{
+	Vqueue *vq;
+
+	vq = v;
+	if(vq->used->idx == vq->lastused)
+		return 0;
+	return 1;
+}
+
+static void /* tell the device that queue x has new avail entries */
+vqnotify(Ctlr *ctlr, int x)
+{
+ Vqueue *q;
+
+ coherence(); /* issued before used->flags is read and the notify register is written */
+ q = &ctlr->queue[x];
+ if(q->used->flags & Unonotify) /* device asked not to be notified */
+ return;
+ q->nnote++;
+ *((u16int*)q->notify) = x; /* write the queue index to the queue's notify register */
+}
+
+static void /* transmit kproc: moves blocks from edev->oq onto the tx virtqueue */
+txproc(void *v)
+{
+ Vheader *header;
+ Block **blocks;
+ Ether *edev;
+ Ctlr *ctlr;
+ Vqueue *q;
+ Vused *u;
+ Block *b;
+ int i, j;
+
+ edev = v;
+ ctlr = edev->ctlr;
+ q = &ctlr->queue[Vtxq];
+
+ header = smalloc(VheaderSize); /* one header buffer shared by every tx chain */
+ blocks = smalloc(sizeof(Block*) * (q->qsize/2)); /* block in flight per slot; a slot is a pair of descriptors */
+
+ for(i = 0; i < q->qsize/2; i++){
+ j = i << 1; /* even descriptor carries the header and chains to j|1 */
+ q->desc[j].addr = PADDR(header);
+ q->desc[j].len = VheaderSize;
+ q->desc[j].next = j | 1;
+ q->desc[j].flags = Dnext;
+
+ q->availent[i] = q->availent[i + q->qsize/2] = j; /* avail entries i and i+qsize/2 both name chain head j */
+
+ j |= 1; /* odd descriptor carries the packet data, filled in per packet below */
+ q->desc[j].next = 0;
+ q->desc[j].flags = 0;
+ }
+
+ q->avail->flags &= ~Rnointerrupt;
+
+ while(waserror()) /* NOTE(review): presumably re-arms the error label after every error so the proc keeps running -- confirm */
+ ;
+
+ while((b = qbread(edev->oq, 1000000)) != nil){
+ for(;;){
+ /* retire completed packets */
+ while((i = q->lastused) != q->used->idx){
+ u = &q->usedent[i & q->qmask];
+ i = (u->id & q->qmask) >> 1; /* chain head index to slot index */
+ if(blocks[i] == nil)
+ break;
+ freeb(blocks[i]);
+ blocks[i] = nil;
+ q->lastused++;
+ }
+
+ /* have free slot? */
+ i = q->avail->idx & (q->qmask >> 1);
+ if(blocks[i] == nil)
+ break;
+
+ /* ring full, wait and retry */
+ if(!vhasroom(q))
+ sleep(q, vhasroom, q);
+ }
+
+ /* slot is free, fill in descriptor */
+ blocks[i] = b;
+ j = (i << 1) | 1;
+ q->desc[j].addr = PADDR(b->rp);
+ q->desc[j].len = BLEN(b);
+ coherence(); /* descriptor contents are written before the index is advanced */
+ q->avail->idx++;
+ vqnotify(ctlr, Vtxq);
+ }
+
+ pexit("ether out queue closed", 1);
+}
+
+static void /* receive kproc: keeps the rx virtqueue stocked and passes completed blocks to etheriq() */
+rxproc(void *v)
+{
+ Vheader *header;
+ Block **blocks;
+ Ether *edev;
+ Ctlr *ctlr;
+ Vqueue *q;
+ Vused *u;
+ Block *b;
+ int i, j;
+
+ edev = v;
+ ctlr = edev->ctlr;
+ q = &ctlr->queue[Vrxq];
+
+ header = smalloc(VheaderSize); /* one header buffer shared by every rx chain */
+ blocks = smalloc(sizeof(Block*) * (q->qsize/2)); /* block posted per slot; a slot is a pair of descriptors */
+
+ for(i = 0; i < q->qsize/2; i++){
+ j = i << 1; /* even descriptor receives the header and chains to j|1 */
+ q->desc[j].addr = PADDR(header);
+ q->desc[j].len = VheaderSize;
+ q->desc[j].next = j | 1;
+ q->desc[j].flags = Dwrite|Dnext;
+
+ q->availent[i] = q->availent[i + q->qsize/2] = j; /* avail entries i and i+qsize/2 both name chain head j */
+
+ j |= 1; /* odd descriptor receives the packet data */
+ q->desc[j].next = 0;
+ q->desc[j].flags = Dwrite;
+ }
+
+ q->avail->flags &= ~Rnointerrupt;
+
+ while(waserror()) /* NOTE(review): presumably re-arms the error label after every error so the proc keeps running -- confirm */
+ ;
+
+ for(;;){
+ /* replenish receive ring */
+ do {
+ i = q->avail->idx & (q->qmask >> 1);
+ if(blocks[i] != nil) /* slot still owned by the device */
+ break;
+ if((b = iallocb(ETHERMAXTU)) == nil)
+ break;
+ blocks[i] = b;
+ j = (i << 1) | 1;
+ q->desc[j].addr = PADDR(b->rp);
+ q->desc[j].len = BALLOC(b);
+ coherence(); /* descriptor contents are written before the index is advanced */
+ q->avail->idx++;
+ } while(q->avail->idx != q->used->idx);
+ vqnotify(ctlr, Vrxq);
+
+ /* wait for any packets to complete */
+ if(!vhasroom(q))
+ sleep(q, vhasroom, q);
+
+ /* retire completed packets */
+ while((i = q->lastused) != q->used->idx) {
+ u = &q->usedent[i & q->qmask];
+ i = (u->id & q->qmask) >> 1; /* chain head index to slot index */
+ if((b = blocks[i]) == nil)
+ break;
+
+ blocks[i] = nil;
+ b->wp = b->rp + u->len - VheaderSize; /* used length minus the header size is taken as the payload length */
+ etheriq(edev, b);
+ q->lastused++;
+ }
+ }
+}
+
+static int /* send one class/cmd with ndata bytes on the ctl queue; returns the ack byte, or -1 if the queue has fewer than 3 descriptors */
+vctlcmd(Ether *edev, uchar class, uchar cmd, uchar *data, int ndata)
+{
+ uchar hdr[2], ack[1];
+ Ctlr *ctlr;
+ Vqueue *q;
+ Vdesc *d;
+ int i;
+
+ ctlr = edev->ctlr;
+ q = &ctlr->queue[Vctlq];
+ if(q->qsize < 3) /* also true when the ctl queue was never initialized */
+ return -1;
+
+ qlock(&ctlr->ctllock);
+ while(waserror()) /* NOTE(review): presumably keeps going after an interrupted sleep so the lock is always released below -- confirm */
+ ;
+
+ ack[0] = 0x55; /* any non-zero value; printed as a failure below unless it reads back as 0 */
+ hdr[0] = class;
+ hdr[1] = cmd;
+
+ d = &q->desc[0]; /* descriptors 0..2 form the chain: header, data, ack */
+ d->addr = PADDR(hdr);
+ d->len = sizeof(hdr);
+ d->next = 1;
+ d->flags = Dnext;
+ d++;
+ d->addr = PADDR(data);
+ d->len = ndata;
+ d->next = 2;
+ d->flags = Dnext;
+ d++;
+ d->addr = PADDR(ack);
+ d->len = sizeof(ack);
+ d->next = 0;
+ d->flags = Dwrite;
+
+ i = q->avail->idx & q->qmask;
+ q->availent[i] = 0; /* chain head is descriptor 0 */
+ coherence();
+
+ q->avail->flags &= ~Rnointerrupt; /* interrupts wanted only while a command is outstanding */
+ q->avail->idx++;
+ vqnotify(ctlr, Vctlq);
+ while(!vhasroom(q))
+ sleep(q, vhasroom, q);
+ q->lastused = q->used->idx;
+ q->avail->flags |= Rnointerrupt;
+
+ qunlock(&ctlr->ctllock);
+ poperror();
+
+ if(ack[0] != 0)
+ print("#l%d: vctlcmd: %ux.%ux -> %ux\n", edev->ctlrno, class, cmd, ack[0]);
+
+ return ack[0];
+}
+
+static void /* interrupt handler: wakes every queue that has unretired used entries */
+interrupt(Ureg*, void* arg)
+{
+ Ether *edev;
+ Ctlr *ctlr;
+ Vqueue *q;
+ int i;
+
+ edev = arg;
+ ctlr = edev->ctlr;
+ if(*ctlr->isr & 1){ /* bit 0 of the isr register; NOTE(review): presumably the read also acknowledges the interrupt -- confirm */
+ for(i = 0; i < ctlr->nqueue; i++){
+ q = &ctlr->queue[i];
+ if(vhasroom(q)){
+ q->nintr++;
+ wakeup(q);
+ }
+ }
+ }
+}
+
+/*
+ * Ether attach hook.  The first call enables every initialized
+ * virtqueue, marks the driver as ready and starts the rx and tx
+ * kprocs; later calls return immediately.
+ */
+static void
+attach(Ether* edev)
+{
+	char name[KNAMELEN];
+	Ctlr* ctlr;
+	int i;
+
+	ctlr = edev->ctlr;
+	ilock(ctlr);
+	if(ctlr->attached){
+		iunlock(ctlr);
+		return;
+	}
+	ctlr->attached = 1;
+
+	/*
+	 * enable the queues first: §3.1.1 requires device setup
+	 * (including the virtqueues) to be complete before
+	 * DRIVER_OK is set and the device goes live.
+	 */
+	for(i = 0; i < ctlr->nqueue; i++){
+		ctlr->cfg->queuesel = i;
+		ctlr->cfg->queueenable = 1;
+	}
+
+	/* driver is ready */
+	ctlr->cfg->status |= Sdriverok;
+	iunlock(ctlr);
+
+	/* start kprocs */
+	snprint(name, sizeof name, "#l%drx", edev->ctlrno);
+	kproc(name, rxproc, edev);
+	snprint(name, sizeof name, "#l%dtx", edev->ctlrno);
+	kproc(name, txproc, edev);
+}
+
+static long /* ifstat read hook: formats feature words, device status and per queue counters */
+ifstat(Ether *edev, void *a, long n, ulong offset)
+{
+ int i, l;
+ char *p;
+ Ctlr *ctlr;
+ Vqueue *q;
+
+ ctlr = edev->ctlr;
+
+ p = smalloc(READSTR); /* scratch buffer, freed before returning */
+
+ l = snprint(p, READSTR, "devfeat %32.32lub %32.32lub\n", ctlr->feat[1], ctlr->feat[0]);
+ l += snprint(p+l, READSTR-l, "devstatus %8.8ub\n", ctlr->cfg->status); /* reads the live status register */
+
+ for(i = 0; i < ctlr->nqueue; i++){
+ q = &ctlr->queue[i];
+ l += snprint(p+l, READSTR-l,
+ "vq%d %#p size %d avail->idx %d used->idx %d lastused %hud nintr %ud nnote %ud\n",
+ i, q, q->qsize, q->avail->idx, q->used->idx, q->lastused, q->nintr, q->nnote);
+ }
+
+ n = readstr(offset, a, n, p);
+ free(p);
+
+ return n;
+}
+
+static void /* shutdown hook: clears the device status and disables pci bus mastering */
+shutdown(Ether* edev)
+{
+ Ctlr *ctlr = edev->ctlr;
+
+ coherence();
+ ctlr->cfg->status = 0; /* same write pciprobe() uses as "device reset" */
+ coherence();
+
+ pciclrbme(ctlr->pcidev);
+}
+
+static void /* promiscuous hook: sends CtrlRx/CmdPromisc with a single on/off byte */
+promiscuous(void *arg, int on)
+{
+ Ether *edev = arg;
+ uchar b[1];
+
+ b[0] = on != 0; /* normalize to 0 or 1 */
+ vctlcmd(edev, CtrlRx, CmdPromisc, b, sizeof(b)); /* result is ignored; vctlcmd prints failures */
+}
+
+static void /* multicast hook: no per address filtering, only all-multicast on or off */
+multicast(void *arg, uchar*, int)
+{
+ Ether *edev = arg;
+ uchar b[1];
+
+ b[0] = edev->nmaddr > 0; /* on while at least one multicast address is registered */
+ vctlcmd(edev, CtrlRx, CmdAllmulti, b, sizeof(b)); /* result is ignored; vctlcmd prints failures */
+}
+
+static int /* allocate descriptor table, avail ring and used ring for size entries; returns 0, or -1 with nothing left allocated */
+initqueue(Vqueue *q, int size)
+{
+ uchar *p;
+
+ q->desc = mallocalign(VdescSize*size, 16, 0, 0);
+ if(q->desc == nil)
+ return -1;
+ p = mallocalign(VringSize + 2*size + 2, 2, 0, 0); /* header + size u16int entries + trailing event word */
+ if(p == nil){
+FreeDesc:
+ free(q->desc);
+ q->desc = nil;
+ return -1;
+ }
+ q->avail = (void*)p;
+ p += VringSize;
+ q->availent = (void*)p;
+ p += sizeof(u16int)*size;
+ q->availevent = (void*)p;
+ p = mallocalign(VringSize + VusedSize*size + 2, 4, 0, 0); /* header + size Vused entries + trailing event word */
+ if(p == nil){
+ free(q->avail);
+ q->avail = nil;
+ goto FreeDesc;
+ }
+ q->used = (void*)p;
+ p += VringSize;
+ q->usedent = (void*)p;
+ p += VusedSize*size;
+ q->usedevent = (void*)p;
+
+ q->qsize = size;
+ q->qmask = q->qsize - 1; /* valid as a mask because the caller only passes powers of two */
+
+ q->lastused = q->avail->idx = q->used->idx = 0;
+
+ q->avail->flags |= Rnointerrupt; /* starts with interrupts suppressed; the kprocs and vctlcmd clear this */
+
+ return 0;
+}
+
+/*
+ * pcienumcaps() callback: accepts a capability with id 9 whose
+ * cfg type byte (offset 3) equals typ and whose bar (offset 4)
+ * is a valid, non-empty memory bar.
+ * Returns 0 when the capability matches and 1 otherwise.
+ */
+static int
+matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)
+{
+	int bar;
+
+	if(cap != 9)
+		return 1;
+	if(pcicfgr8(p, off+3) != typ)
+		return 1;
+
+	/* skip invalid or non memory bars */
+	bar = pcicfgr8(p, off+4);
+	if(bar < 0 || bar >= nelem(p->mem))
+		return 1;
+	if(p->mem[bar].size == 0)
+		return 1;
+	if((p->mem[bar].bar & 3) != 0)
+		return 1;
+
+	return 0;
+}
+
+static int /* config space offset of the first usable virtio capability of cfg type typ; callers treat a negative result as not found */
+virtiocap(Pcidev *p, int typ)
+{
+ return pcienumcaps(p, matchvirtiocfgcap, typ);
+}
+
+static void* /* map the register window described by the capability at config offset cap; returns nil on failure */
+virtiomapregs(Pcidev *p, int cap, int size)
+{
+ int bar, len;
+ uvlong addr;
+
+ if(cap < 0) /* lets callers pass the result of virtiocap() unchecked */
+ return nil;
+ bar = pcicfgr8(p, cap+4) % nelem(p->mem);
+ addr = pcicfgr32(p, cap+8); /* offset of the window within the bar */
+ len = pcicfgr32(p, cap+12); /* length of the window */
+ if(size <= 0) /* size <= 0 means: map the whole window */
+ size = len;
+ else if(len < size) /* window too small for the requested structure */
+ return nil;
+ if(addr+len > p->mem[bar].size) /* window must lie within the bar */
+ return nil;
+ addr += p->mem[bar].bar & ~0xFULL; /* strip the low flag bits of the bar to get its base address */
+ return vmap(addr, size);
+}
+
+/*
+ * Scan the pci bus for non-legacy virtio ethernet devices
+ * (vendor 0x1AF4, device 0x1041, revision > 0), map their register
+ * windows, reset them, negotiate features and set up the virtqueues.
+ * Returns the list of usable controllers linked through Ctlr.next,
+ * or nil when none was found.  The queues are not enabled and
+ * DRIVER_OK is not set here; that is left to attach().
+ */
+static Ctlr*
+pciprobe(void)
+{
+	Ctlr *c, *h, *t;
+	Pcidev *p;
+	Vconfig *cfg;
+	int bar, cap, n, i;
+
+	h = t = nil;
+
+	/* §4.1.2 PCI Device Discovery */
+	for(p = nil; p = pcimatch(p, 0x1AF4, 0x1041);){
+		/* non-transitional devices will have a revision > 0 */
+		if(p->rid == 0)
+			continue;
+		/* common configuration registers (cfg type 1) */
+		if((cap = virtiocap(p, 1)) < 0)
+			continue;
+		bar = pcicfgr8(p, cap+4) % nelem(p->mem);
+		cfg = virtiomapregs(p, cap, sizeof(Vconfig));
+		if(cfg == nil)
+			continue;
+		if((c = mallocz(sizeof(Ctlr), 1)) == nil){
+			print("ethervirtio: no memory for Ctlr\n");
+			break;
+		}
+		c->cfg = cfg;
+		c->pcidev = p;
+		c->port = p->mem[bar].bar & ~0xFULL;
+
+		pcienable(p);
+		/* device specific (4), isr (3) and notify (2) registers */
+		c->dev = virtiomapregs(p, virtiocap(p, 4), sizeof(Vnetcfg));
+		if(c->dev == nil)
+			goto Baddev;
+		c->isr = virtiomapregs(p, virtiocap(p, 3), 0);
+		if(c->isr == nil)
+			goto Baddev;
+		cap = virtiocap(p, 2);
+		c->notify = virtiomapregs(p, cap, 0);
+		if(c->notify == nil)
+			goto Baddev;
+		c->notifyoffmult = pcicfgr32(p, cap+16);
+
+		/* device reset */
+		coherence();
+		cfg->status = 0;
+		while(cfg->status != 0)
+			delay(1);
+		cfg->status = Sacknowledge|Sdriver;
+
+		/* negotiate feature bits */
+		cfg->devfeatsel = 1;
+		c->feat[1] = cfg->devfeat;
+
+		cfg->devfeatsel = 0;
+		c->feat[0] = cfg->devfeat;
+
+		cfg->drvfeatsel = 1;
+		cfg->drvfeat = c->feat[1] & Fversion1;
+
+		cfg->drvfeatsel = 0;
+		cfg->drvfeat = c->feat[0] & (Fmac|Fctrlvq|Fctrlrx);
+
+		/*
+		 * §3.1.1: after writing the feature subset the driver
+		 * must set FEATURES_OK and read the status back; the
+		 * device clears the bit when it cannot work with the
+		 * subset, in which case it is unusable.
+		 */
+		cfg->status |= Sfeatureok;
+		if((cfg->status & Sfeatureok) == 0){
+			print("ethervirtio: device rejected feature set\n");
+			cfg->status |= Sfailed;
+			goto Baddev;
+		}
+
+		for(i=0; i<nelem(c->queue); i++){
+			cfg->queuesel = i;
+			n = cfg->queuesize;
+			/* queue sizes must be a non-zero power of two */
+			if(n == 0 || (n & (n-1)) != 0){
+				/* rx and tx are mandatory, the ctl queue is optional */
+				if(i < 2)
+					print("ethervirtio: queue %d has invalid size %d\n", i, n);
+				break;
+			}
+			if(initqueue(&c->queue[i], n) < 0)
+				break;
+			c->queue[i].notify = c->notify + c->notifyoffmult * cfg->queuenotifyoff;
+			coherence();
+			cfg->queuedesc = PADDR(c->queue[i].desc);
+			cfg->queueavail = PADDR(c->queue[i].avail);
+			cfg->queueused = PADDR(c->queue[i].used);
+		}
+		if(i < 2){
+			print("ethervirtio: no queues\n");
+Baddev:
+			pcidisable(p);
+			/* TODO, vunmap */
+			free(c);
+			continue;
+		}
+		c->nqueue = i;
+
+		if(h == nil)
+			h = c;
+		else
+			t->next = c;
+		t = c;
+	}
+
+	return h;
+}
+
+
+static int /* addethercard() probe hook: binds edev to the next unclaimed controller; returns 0 on success, -1 if none matches */
+reset(Ether* edev)
+{
+ static uchar zeros[Eaddrlen];
+ Ctlr *ctlr;
+ int i;
+
+ if(ctlrhead == nil) /* probe the bus on first use; repeated while the list stays empty */
+ ctlrhead = pciprobe();
+
+ for(ctlr = ctlrhead; ctlr != nil; ctlr = ctlr->next){
+ if(ctlr->active)
+ continue;
+ if(edev->port == 0 || edev->port == ctlr->port){ /* port 0 matches any controller */
+ ctlr->active = 1;
+ break;
+ }
+ }
+
+ if(ctlr == nil)
+ return -1;
+
+ edev->ctlr = ctlr;
+ edev->port = ctlr->port;
+ edev->irq = ctlr->pcidev->intl;
+ edev->tbdf = ctlr->pcidev->tbdf;
+ edev->mbps = 1000; /* fixed values; no link state is read from the device here */
+ edev->link = 1;
+
+ if((ctlr->feat[0] & Fmac) != 0 && memcmp(edev->ea, zeros, Eaddrlen) == 0){
+ for(i = 0; i < Eaddrlen; i++) /* no address configured: take the one the device provides */
+ edev->ea[i] = ((uchar*)ctlr->dev)[i];
+ } else {
+ for(i = 0; i < Eaddrlen; i++) /* otherwise write the configured address into the device config */
+ ((uchar*)ctlr->dev)[i] = edev->ea[i];
+ }
+
+ edev->arg = edev;
+
+ edev->attach = attach;
+ edev->shutdown = shutdown;
+ edev->ifstat = ifstat;
+
+ if((ctlr->feat[0] & (Fctrlvq|Fctrlrx)) == (Fctrlvq|Fctrlrx)){ /* both hooks go through vctlcmd() on the ctl queue */
+ edev->multicast = multicast;
+ edev->promiscuous = promiscuous;
+ }
+
+ pcisetbme(ctlr->pcidev);
+ intrenable(edev->irq, interrupt, edev, edev->tbdf, edev->name);
+
+ return 0;
+}
+
+void /* link hook: registers reset() as the probe routine for ether type "virtio10" */
+ethervirtio10link(void)
+{
+ addethercard("virtio10", reset);
+}
--- a/sys/src/9/pc/pc
+++ b/sys/src/9/pc/pc
@@ -80,6 +80,7 @@
etherwpi pci wifi
etherrt2860 pci wifi
ethervirtio pci
+ ethervirtio10 pci
ethermedium
pcmciamodem
netdevmedium
@@ -108,6 +109,7 @@
sdiahci pci sdscsi led
sdodin pci sdscsi led
sdvirtio pci sdscsi
+ sdvirtio10 pci sdscsi
sdmmc pci pmmc
sdnvme pci
sdloop
--- a/sys/src/9/pc/sdvirtio.c
+++ b/sys/src/9/pc/sdvirtio.c
@@ -1,3 +1,7 @@
+/*
+ * virtio disk driver implementing the legacy interface:
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ */
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
--- /dev/null
+++ b/sys/src/9/pc/sdvirtio10.c
@@ -1,0 +1,808 @@
+/*
+ * virtio 1.0 disk driver
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ *
+ * In contrast to sdvirtio.c, this driver handles the non-legacy
+ * interface for virtio disk which uses mmio for all register accesses
+ * and requires an elaborate pci capability structure dance to get working.
+ *
+ * It is kind of pointless as it is most likely slower than
+ * port i/o (harder to emulate on the pc platform).
+ *
+ * The reason why this driver is needed is that vultr sets the
+ * disable-legacy=on option in the -device parameter for qemu
+ * on their hypervisor.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/pci.h"
+#include "ureg.h"
+#include "../port/error.h"
+
+#include "../port/sd.h"
+
+typedef struct Vscsidev Vscsidev;
+typedef struct Vblkdev Vblkdev;
+
+typedef struct Vconfig Vconfig;
+typedef struct Vring Vring;
+typedef struct Vdesc Vdesc;
+typedef struct Vused Vused;
+typedef struct Vqueue Vqueue;
+typedef struct Vdev Vdev;
+
+
+/* device types; viopnpdevs() matches pci device id 0x1040+type */
+enum {
+ TypBlk = 2,
+ TypSCSI = 8,
+};
+
+/* status flags */
+enum {
+ Acknowledge = 1,
+ Driver = 2,
+ DriverOk = 4,
+ Failed = 0x80,
+};
+
+/* descriptor flags */
+enum {
+ Next = 1, /* chain continues at desc[next] */
+ Write = 2, /* set on buffers the request expects to be filled in (status, response, read data) */
+ Indirect = 4,
+};
+
+/* struct sizes */
+enum {
+ VringSize = 4,
+};
+
+enum {
+ CDBSIZE = 32, /* upper bounds for the sizes reported in Vscsidev, checked in viopnp() */
+ SENSESIZE = 96,
+};
+
+
+struct Vscsidev /* device specific registers of a scsi host, mapped in viopnp() */
+{
+ u32int num_queues;
+ u32int seg_max;
+ u32int max_sectors;
+ u32int cmd_per_lun;
+ u32int event_info_size;
+ u32int sense_size; /* sense bytes in a response; must not exceed SENSESIZE */
+ u32int cdb_size; /* cdb bytes in a request; must not exceed CDBSIZE */
+ u16int max_channel;
+ u16int max_target; /* used as the number of units; 0 makes viopnp() skip the device */
+ u32int max_lun;
+};
+
+struct Vblkdev /* device specific registers of a block device, mapped in viopnp() */
+{
+ u64int capacity; /* vioonline() uses this as the sector count with 512 byte sectors */
+};
+
+struct Vconfig { /* register block mapped from the type 1 virtio pci capability (see viopnpdevs) */
+ u32int devfeatsel; /* selects which 32-bit feature word devfeat returns */
+ u32int devfeat;
+ u32int drvfeatsel; /* selects which 32-bit feature word drvfeat writes */
+ u32int drvfeat;
+
+ u16int msixcfg;
+ u16int nqueues;
+
+ u8int status; /* device status flags; viopnpdevs writes 0 to reset */
+ u8int cfggen;
+ u16int queuesel; /* selects the queue the queue* fields below refer to */
+
+ u16int queuesize;
+ u16int queuemsixvect;
+
+ u16int queueenable;
+ u16int queuenotifyoff; /* multiplied by notifyoffmult to locate the queue's notify register */
+
+ u64int queuedesc; /* loaded with PADDR() of the descriptor table */
+ u64int queueavail; /* loaded with PADDR() of the avail ring */
+ u64int queueused; /* loaded with PADDR() of the used ring */
+};
+
+struct Vring /* common head of the avail and used rings */
+{
+ u16int flags;
+ u16int idx; /* free running index, masked with size-1 before indexing ring entries */
+};
+
+struct Vdesc /* one descriptor table entry */
+{
+ u64int addr; /* filled with PADDR() of the buffer */
+ u32int len;
+ u16int flags; /* Next, Write, Indirect */
+ u16int next; /* next descriptor in a chain; doubles as the free list link */
+};
+
+struct Vused /* one used ring entry */
+{
+ u32int id; /* head descriptor index of the completed chain */
+ u32int len;
+};
+
+struct Vqueue /* driver side state of one virtqueue, allocated by mkvqueue() */
+{
+ Lock; /* ilock() protects the free list and ring indices */
+
+ Vdev *dev;
+ void *notify; /* notify register of this queue */
+ int idx; /* queue number, the value written to the notify register */
+
+ int size; /* number of descriptors, a power of two (checked in viopnpdevs) */
+
+ int free; /* head of the free descriptor list, linked through desc[].next */
+ int nfree; /* number of descriptors on the free list */
+
+ Vdesc *desc;
+
+ Vring *avail;
+ u16int *availent;
+ u16int *availevent;
+
+ Vring *used;
+ Vused *usedent;
+ u16int *usedevent;
+ u16int lastused; /* used->idx value up to which completions were processed */
+
+ void *rock[]; /* per head descriptor pointer to the waiter's struct Rock; size entries */
+};
+
+struct Vdev /* per device state, linked by viopnpdevs() through next */
+{
+ int typ; /* TypBlk or TypSCSI */
+
+ Pcidev *pci;
+
+ uvlong port; /* base of the bar holding cfg */
+ ulong feat[2]; /* feature words as offered by the device */
+
+ int nqueue; /* number of valid entries in queue[] */
+ Vqueue *queue[16];
+
+ void *dev; /* device specific config (for scsi) */
+
+ /* registers */
+ Vconfig *cfg;
+ u8int *isr; /* bit 0 is tested in viointerrupt() */
+ u8int *notify;
+ u32int notifyoffmult;
+
+ Vdev *next;
+};
+
+static Vqueue* /* allocate a Vqueue with size descriptors and its rings; returns nil when out of memory */
+mkvqueue(int size)
+{
+ Vqueue *q;
+ uchar *p;
+ int i;
+
+ q = malloc(sizeof(*q) + sizeof(void*)*size); /* struct plus the trailing rock[size] array */
+ p = mallocalign( /* one page aligned area: descriptors + avail ring, then the used ring on a fresh page */
+ PGROUND(sizeof(Vdesc)*size +
+ VringSize +
+ sizeof(u16int)*size +
+ sizeof(u16int)) +
+ PGROUND(VringSize +
+ sizeof(Vused)*size +
+ sizeof(u16int)),
+ BY2PG, 0, 0);
+ if(p == nil || q == nil){
+ print("virtio: no memory for Vqueue\n");
+ free(p);
+ free(q);
+ return nil;
+ }
+
+ q->desc = (void*)p;
+ p += sizeof(Vdesc)*size;
+ q->avail = (void*)p;
+ p += VringSize;
+ q->availent = (void*)p;
+ p += sizeof(u16int)*size;
+ q->availevent = (void*)p;
+ p += sizeof(u16int);
+
+ p = (uchar*)PGROUND((uintptr)p); /* used ring starts on the next page boundary */
+ q->used = (void*)p;
+ p += VringSize;
+ q->usedent = (void*)p;
+ p += sizeof(Vused)*size;
+ q->usedevent = (void*)p;
+
+ q->free = -1; /* -1 terminates the free list */
+ q->nfree = q->size = size;
+ for(i=0; i<size; i++){ /* push every descriptor onto the free list */
+ q->desc[i].next = q->free;
+ q->free = i;
+ }
+
+ return q;
+}
+
+/*
+ * pcienumcaps() callback: accepts a capability with id 9 whose
+ * cfg type byte (offset 3) equals typ and whose bar (offset 4)
+ * is a valid, non-empty memory bar.
+ * Returns 0 when the capability matches and 1 otherwise.
+ */
+static int
+matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)
+{
+	int bar;
+
+	if(cap != 9)
+		return 1;
+	if(pcicfgr8(p, off+3) != typ)
+		return 1;
+
+	/* skip invalid or non memory bars */
+	bar = pcicfgr8(p, off+4);
+	if(bar < 0 || bar >= nelem(p->mem))
+		return 1;
+	if(p->mem[bar].size == 0)
+		return 1;
+	if((p->mem[bar].bar & 3) != 0)
+		return 1;
+
+	return 0;
+}
+
+static int /* config space offset of the first usable virtio capability of cfg type typ; callers treat a negative result as not found */
+virtiocap(Pcidev *p, int typ)
+{
+ return pcienumcaps(p, matchvirtiocfgcap, typ);
+}
+
+static void* /* map the register window described by the capability at config offset cap; returns nil on failure */
+virtiomapregs(Pcidev *p, int cap, int size)
+{
+ int bar, len;
+ uvlong addr;
+
+ if(cap < 0) /* lets callers pass the result of virtiocap() unchecked */
+ return nil;
+ bar = pcicfgr8(p, cap+4) % nelem(p->mem);
+ addr = pcicfgr32(p, cap+8); /* offset of the window within the bar */
+ len = pcicfgr32(p, cap+12); /* length of the window */
+ if(size <= 0) /* size <= 0 means: map the whole window */
+ size = len;
+ else if(len < size) /* window too small for the requested structure */
+ return nil;
+ if(addr+len > p->mem[bar].size) /* window must lie within the bar */
+ return nil;
+ addr += p->mem[bar].bar & ~0xFULL; /* strip the low flag bits of the bar to get its base address */
+ return vmap(addr, size);
+}
+
+/* FEATURES_OK bit of the device status field (§2.1) */
+enum {
+	FeaturesOk = 8,
+};
+
+/*
+ * Scan the pci bus for non-legacy virtio devices of the given type
+ * (vendor 0x1AF4, device 0x1040+typ, revision > 0), map their
+ * register windows, reset them, negotiate features and allocate the
+ * virtqueues.  Returns the list of devices linked through Vdev.next,
+ * or nil when none was found.  The device specific config is not
+ * mapped here, the queues are not enabled and DRIVER_OK is not set;
+ * that is left to viopnp() and vioenable().
+ */
+static Vdev*
+viopnpdevs(int typ)
+{
+	Vdev *vd, *h, *t;
+	Vconfig *cfg;
+	Vqueue *q;
+	Pcidev *p;
+	int cap, bar;
+	int n, i;
+
+	h = t = nil;
+	for(p = nil; p = pcimatch(p, 0x1AF4, 0x1040+typ);){
+		/* revision 0 devices are left to the legacy driver */
+		if(p->rid == 0)
+			continue;
+		/* common configuration registers (cfg type 1) */
+		if((cap = virtiocap(p, 1)) < 0)
+			continue;
+		bar = pcicfgr8(p, cap+4) % nelem(p->mem);
+		cfg = virtiomapregs(p, cap, sizeof(Vconfig));
+		if(cfg == nil)
+			continue;
+		if((vd = malloc(sizeof(*vd))) == nil){
+			print("virtio: no memory for Vdev\n");
+			break;
+		}
+		vd->port = p->mem[bar].bar & ~0xFULL;
+		vd->typ = typ;
+		vd->pci = p;
+		vd->cfg = cfg;
+		pcienable(p);
+
+		/* isr (3) and notify (2) registers */
+		vd->isr = virtiomapregs(p, virtiocap(p, 3), 0);
+		if(vd->isr == nil){
+Baddev:
+			pcidisable(p);
+			/* TODO: vunmap */
+			free(vd);
+			continue;
+		}
+		cap = virtiocap(p, 2);
+		vd->notify = virtiomapregs(p, cap, 0);
+		if(vd->notify == nil)
+			goto Baddev;
+		vd->notifyoffmult = pcicfgr32(p, cap+16);
+
+		/* reset */
+		cfg->status = 0;
+		while(cfg->status != 0)
+			delay(1);
+		cfg->status = Acknowledge|Driver;
+
+		/* negotiate feature bits: only bit 0 of the high word is accepted */
+		cfg->devfeatsel = 1;
+		vd->feat[1] = cfg->devfeat;
+		cfg->devfeatsel = 0;
+		vd->feat[0] = cfg->devfeat;
+		cfg->drvfeatsel = 1;
+		cfg->drvfeat = vd->feat[1] & 1;
+		cfg->drvfeatsel = 0;
+		cfg->drvfeat = 0;
+
+		/*
+		 * §3.1.1: after writing the feature subset the driver
+		 * must set FEATURES_OK and read the status back; the
+		 * device clears the bit when it cannot work with the
+		 * subset, in which case it is unusable.
+		 */
+		cfg->status |= FeaturesOk;
+		if((cfg->status & FeaturesOk) == 0){
+			print("virtio: device rejected feature set\n");
+			cfg->status |= Failed;
+			goto Baddev;
+		}
+
+		for(i=0; i<nelem(vd->queue); i++){
+			cfg->queuesel = i;
+			n = cfg->queuesize;
+			/* stop at the first queue without a power of two size */
+			if(n == 0 || (n & (n-1)) != 0)
+				break;
+			if((q = mkvqueue(n)) == nil)
+				break;
+			q->notify = vd->notify + vd->notifyoffmult * cfg->queuenotifyoff;
+			q->dev = vd;
+			q->idx = i;
+			vd->queue[i] = q;
+			coherence();
+			cfg->queuedesc = PADDR(q->desc);
+			cfg->queueavail = PADDR(q->avail);
+			cfg->queueused = PADDR(q->used);
+		}
+		vd->nqueue = i;
+
+		if(h == nil)
+			h = vd;
+		else
+			t->next = vd;
+		t = vd;
+	}
+
+	return h;
+}
+
+struct Rock { /* completion record of one outstanding request, kept on the waiter's stack in vqio() */
+ int done; /* set by vqinterrupt() once the request completed */
+ Rendez *sleep; /* where the waiter sleeps */
+};
+
+static void /* process new used ring entries: wake the waiters and return the descriptor chains to the free list */
+vqinterrupt(Vqueue *q)
+{
+ int id, free, m;
+ struct Rock *r;
+ Rendez *z;
+
+ m = q->size-1;
+
+ ilock(q);
+ while((q->lastused ^ q->used->idx) & m){ /* indices differ within the ring mask */
+ id = q->usedent[q->lastused++ & m].id;
+ if(r = q->rock[id]){
+ q->rock[id] = nil;
+ z = r->sleep; /* read before done is set; r is not touched afterwards */
+ r->done = 1; /* hands off */
+ if(z != nil)
+ wakeup(z);
+ }
+ do { /* walk the chain, pushing each descriptor onto the free list */
+ free = id;
+ id = q->desc[free].next;
+ q->desc[free].next = q->free;
+ q->free = free;
+ q->nfree++;
+ } while(q->desc[free].flags & Next);
+ }
+ iunlock(q);
+}
+
+static void /* interrupt handler: services the request queue when isr bit 0 is set */
+viointerrupt(Ureg *, void *arg)
+{
+ Vdev *vd = arg;
+
+ if(vd->isr[0] & 1)
+ vqinterrupt(vd->queue[vd->typ == TypSCSI ? 2 : 0]); /* scsi requests use queue 2, block requests queue 0 */
+}
+
+static int /* tsleep() condition: non-zero once the request's Rock is marked done */
+viodone(void *arg)
+{
+ return ((struct Rock*)arg)->done;
+}
+
+static void /* submit the chain starting at head and wait for completion; called with q locked, returns with q unlocked */
+vqio(Vqueue *q, int head)
+{
+ struct Rock rock;
+
+ rock.done = 0;
+ rock.sleep = &up->sleep;
+ q->rock[head] = &rock; /* lets vqinterrupt() find the waiter by head descriptor */
+ q->availent[q->avail->idx & (q->size-1)] = head;
+ coherence(); /* ring entry is written before the index is advanced */
+ q->avail->idx++;
+ iunlock(q);
+ if((q->used->flags & 1) == 0) /* skip the notify when the device set bit 0 of the used flags */
+ *((u16int*)q->notify) = q->idx;
+ while(!rock.done){
+ while(waserror()) /* NOTE(review): presumably retries after an interrupted sleep, since rock must not go out of scope while queued -- confirm */
+ ;
+ tsleep(rock.sleep, viodone, &rock, 1000);
+ poperror();
+
+ if(!rock.done) /* timed out: poll the used ring directly */
+ vqinterrupt(q);
+ }
+}
+
+/*
+ * Issue one virtio block request of type typ on queue 0 and wait
+ * for it to complete.  a is the data buffer of count*secsize bytes
+ * (nil for requests without data), lba the starting sector.
+ * Returns the status byte written by the device; it is preset to
+ * 0xFF so a request the device never answered reads as failed.
+ */
+static int
+vioblkreq(Vdev *vd, int typ, void *a, long count, long secsize, uvlong lba)
+{
+	int need, free, head;
+	Vqueue *q;
+	Vdesc *d;
+
+	u8int status;
+	struct Vioblkreqhdr {
+		u32int	typ;
+		u32int	prio;
+		u64int	lba;
+	} req;
+
+	/* header + status, plus one descriptor for the data buffer */
+	need = 2;
+	if(a != nil)
+		need = 3;
+
+	status = -1;
+	req.typ = typ;
+	req.prio = 0;
+	req.lba = lba;
+
+	q = vd->queue[0];
+	ilock(q);
+	while(q->nfree < need){
+		iunlock(q);
+
+		/*
+		 * wait a bit for descriptors to be freed.  poperror()
+		 * must only run on the normal path: when waserror()
+		 * returns through an error the label has already
+		 * been popped.
+		 */
+		if(!waserror()){
+			tsleep(&up->sleep, return0, 0, 500);
+			poperror();
+		}
+
+		ilock(q);
+	}
+
+	/* take the descriptors off the free list */
+	head = free = q->free;
+
+	d = &q->desc[free]; free = d->next;
+	d->addr = PADDR(&req);
+	d->len = sizeof(req);
+	d->flags = Next;
+
+	if(a != nil){
+		d = &q->desc[free]; free = d->next;
+		d->addr = PADDR(a);
+		d->len = secsize*count;
+		/* typ 0 marks the buffer Write, any other type does not */
+		d->flags = typ ? Next : (Write|Next);
+	}
+
+	d = &q->desc[free]; free = d->next;
+	d->addr = PADDR(&status);
+	d->len = sizeof(status);
+	d->flags = Write;
+
+	q->free = free;
+	q->nfree -= need;
+
+	/* queue io, unlock and wait for completion */
+	vqio(q, head);
+
+	return status;
+}
+
+/*
+ * Issue the scsi command described by r on queue 2 of a virtio
+ * scsi host and wait for it to complete.  Fills in r->status,
+ * r->sense (with SDvalidsense when sense data came back) and
+ * r->rlen, and returns r->status.
+ */
+static int
+vioscsireq(SDreq *r)
+{
+	u8int resp[4+4+2+2+SENSESIZE];
+	u8int req[8+8+3+CDBSIZE];
+	int free, head;
+	u32int len;
+	Vqueue *q;
+	Vdesc *d;
+	Vdev *vd;
+	SDunit *u;
+	Vscsidev *scsi;
+
+	u = r->unit;
+	vd = u->dev->ctlr;
+	scsi = vd->dev;
+
+	/* request header: 8 byte lun, 8 byte tag, 3 attribute bytes, then the cdb */
+	memset(resp, 0, sizeof(resp));
+	memset(req, 0, sizeof(req));
+	req[0] = 1;
+	req[1] = u->subno;
+	req[2] = r->lun>>8;
+	req[3] = r->lun&0xFF;
+	*(u64int*)(&req[8]) = (uintptr)r;
+
+	memmove(&req[8+8+3], r->cmd, r->clen);
+
+	q = vd->queue[2];
+	ilock(q);
+	while(q->nfree < 3){
+		iunlock(q);
+
+		/*
+		 * wait a bit for descriptors to be freed.  poperror()
+		 * must only run on the normal path: when waserror()
+		 * returns through an error the label has already
+		 * been popped.
+		 */
+		if(!waserror()){
+			tsleep(&up->sleep, return0, 0, 500);
+			poperror();
+		}
+
+		ilock(q);
+	}
+
+	/* take the descriptors off the free list */
+	head = free = q->free;
+
+	d = &q->desc[free]; free = d->next;
+	d->addr = PADDR(req);
+	d->len = 8+8+3+scsi->cdb_size;
+	d->flags = Next;
+
+	/* data going to the device sits between request and response */
+	if(r->write && r->dlen > 0){
+		d = &q->desc[free]; free = d->next;
+		d->addr = PADDR(r->data);
+		d->len = r->dlen;
+		d->flags = Next;
+	}
+
+	d = &q->desc[free]; free = d->next;
+	d->addr = PADDR(resp);
+	d->len = 4+4+2+2+scsi->sense_size;
+	d->flags = Write;
+
+	/* data coming from the device follows the response */
+	if(!r->write && r->dlen > 0){
+		d->flags |= Next;
+
+		d = &q->desc[free]; free = d->next;
+		d->addr = PADDR(r->data);
+		d->len = r->dlen;
+		d->flags = Write;
+	}
+
+	q->free = free;
+	q->nfree -= 2 + (r->dlen > 0);
+
+	/* queue io, unlock and wait for completion */
+	vqio(q, head);
+
+	/* response+status */
+	r->status = resp[10];
+	if(resp[11] != 0)
+		r->status = SDcheck;
+
+	/* sense_len */
+	len = *((u32int*)&resp[0]);
+	if(len > 0){
+		if(len > sizeof(r->sense))
+			len = sizeof(r->sense);
+		memmove(r->sense, &resp[4+4+2+2], len);
+		r->flags |= SDvalidsense;
+	}
+
+	/* data residue */
+	len = *((u32int*)&resp[4]);
+	if(len > r->dlen)
+		r->rlen = 0;
+	else
+		r->rlen = r->dlen - len;
+
+	return r->status;
+
+}
+
+static long /* bio hook: transfer count sectors starting at lba; returns the number of bytes transferred, raises Eio on failure */
+viobio(SDunit *u, int lun, int write, void *a, long count, uvlong lba)
+{
+ long ss, cc, max, ret;
+ Vdev *vd;
+
+ vd = u->dev->ctlr;
+ if(vd->typ == TypSCSI) /* scsi hosts go through the generic scsi path */
+ return scsibio(u, lun, write, a, count, lba);
+
+ max = 32; /* sectors per request */
+ ss = u->secsize;
+ ret = 0;
+ while(count > 0){
+ if((cc = count) > max)
+ cc = max;
+ if(vioblkreq(vd, write != 0, (uchar*)a + ret, cc, ss, lba) != 0) /* request type 0 for reads, 1 for writes */
+ error(Eio);
+ ret += cc*ss;
+ count -= cc;
+ lba += cc;
+ }
+ return ret;
+}
+
+static int /* rio hook: run the scsi request r, emulating scsi on top of block devices */
+viorio(SDreq *r)
+{
+ int i, count, rw;
+ uvlong lba;
+ SDunit *u;
+ Vdev *vd;
+
+ u = r->unit;
+ vd = u->dev->ctlr;
+ if(vd->typ == TypSCSI)
+ return vioscsireq(r);
+ if(r->cmd[0] == 0x35 || r->cmd[0] == 0x91){ /* NOTE(review): presumably the scsi synchronize cache opcodes, mapped to block request type 4 -- confirm */
+ if(vioblkreq(vd, 4, nil, 0, 0, 0) != 0)
+ return sdsetsense(r, SDcheck, 3, 0xc, 2);
+ return sdsetsense(r, SDok, 0, 0, 0);
+ }
+ if((i = sdfakescsi(r)) != SDnostatus)
+ return r->status = i;
+ if((i = sdfakescsirw(r, &lba, &count, &rw)) != SDnostatus)
+ return i;
+ r->rlen = viobio(u, r->lun, rw == SDwrite, r->data, count, lba);
+ return r->status = SDok;
+}
+
+static int /* online hook: returns 2 when the unit's sector count was updated, 1 when it is unchanged */
+vioonline(SDunit *u)
+{
+ Vdev *vd;
+ Vblkdev *blk;
+ uvlong cap;
+
+ vd = u->dev->ctlr;
+ if(vd->typ == TypSCSI)
+ return scsionline(u);
+
+ blk = vd->dev;
+ cap = blk->capacity; /* read from the device specific registers on every call */
+ if(u->sectors != cap){
+ u->sectors = cap;
+ u->secsize = 512; /* fixed sector size for block devices */
+ return 2;
+ }
+ return 1;
+}
+
+static int /* verify hook: block devices always verify; scsi hosts defer to scsiverify() */
+vioverify(SDunit *u)
+{
+ Vdev *vd;
+
+ vd = u->dev->ctlr;
+ if(vd->typ == TypSCSI)
+ return scsiverify(u);
+
+ return 1;
+}
+
+SDifc sdvirtio10ifc; /* forward declaration; viopnp() stores its address before the definition below */
+
+/*
+ * Enable hook: turns on bus mastering, installs the interrupt
+ * handler, enables every allocated virtqueue and finally marks
+ * the driver as ready.  Always returns 1.
+ */
+static int
+vioenable(SDev *sd)
+{
+	char name[32];
+	Vdev *vd;
+	int i;
+
+	vd = sd->ctlr;
+	pcisetbme(vd->pci);
+	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
+	intrenable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
+	coherence();
+
+	/*
+	 * enable the queues first: §3.1.1 requires device setup
+	 * (including the virtqueues) to be complete before
+	 * DRIVER_OK is set and the device goes live.
+	 */
+	for(i = 0; i < vd->nqueue; i++){
+		vd->cfg->queuesel = i;
+		vd->cfg->queueenable = 1;
+	}
+	vd->cfg->status |= DriverOk;
+
+	return 1;
+}
+
+static int /* disable hook: removes the interrupt handler and turns off bus mastering; the device status is left untouched */
+viodisable(SDev *sd)
+{
+ char name[32];
+ Vdev *vd;
+
+ vd = sd->ctlr;
+ snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name); /* same name vioenable() passed to intrenable() */
+ intrdisable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
+ pciclrbme(vd->pci);
+ return 1;
+}
+
+static SDev* /* pnp hook: returns the list of SDevs for all usable virtio 1.0 block devices and scsi hosts */
+viopnp(void)
+{
+ SDev *s, *h, *t;
+ Vdev *vd;
+ int id;
+
+ h = t = nil;
+
+ id = 'F'; /* block devices are numbered from 'F' */
+ for(vd = viopnpdevs(TypBlk); vd; vd = vd->next){
+ if(vd->nqueue == 0) /* block requests need queue 0 */
+ continue;
+
+ if((vd->dev = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vblkdev))) == nil)
+ break;
+ if((s = malloc(sizeof(*s))) == nil)
+ break;
+ s->ctlr = vd;
+ s->idno = id++;
+ s->ifc = &sdvirtio10ifc;
+ s->nunit = 1;
+ if(h)
+ t->next = s;
+ else
+ h = s;
+ t = s;
+ }
+
+ id = '0'; /* scsi hosts are numbered from '0' */
+ for(vd = viopnpdevs(TypSCSI); vd; vd = vd->next){
+ Vscsidev *scsi;
+
+ if(vd->nqueue < 3) /* scsi requests use queue 2 */
+ continue;
+
+ if((scsi = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vscsidev))) == nil)
+ break;
+ if(scsi->max_target == 0){ /* no units to offer */
+ vunmap(scsi, sizeof(Vscsidev));
+ continue;
+ }
+ if((scsi->cdb_size > CDBSIZE) || (scsi->sense_size > SENSESIZE)){ /* would overflow the buffers in vioscsireq() */
+ print("sdvirtio: cdb %ud or sense size %ud too big\n",
+ scsi->cdb_size, scsi->sense_size);
+ vunmap(scsi, sizeof(Vscsidev));
+ continue;
+ }
+ vd->dev = scsi;
+
+ if((s = malloc(sizeof(*s))) == nil)
+ break;
+ s->ctlr = vd;
+ s->idno = id++;
+ s->ifc = &sdvirtio10ifc;
+ s->nunit = scsi->max_target;
+
+ if(h)
+ t->next = s;
+ else
+ h = s;
+ t = s;
+ }
+ return h;
+}
+
+SDifc sdvirtio10ifc = { /* sd interface method table; viopnp() stores its address in every SDev it creates */
+ "virtio10", /* name */
+
+ viopnp, /* pnp */
+ nil, /* legacy */
+ vioenable, /* enable */
+ viodisable, /* disable */
+
+ vioverify, /* verify */
+ vioonline, /* online */
+ viorio, /* rio */
+ nil, /* rctl */
+ nil, /* wctl */
+
+ viobio, /* bio */
+ nil, /* probe */
+ nil, /* clear */
+ nil, /* rtopctl */
+ nil, /* wtopctl */
+};
--- a/sys/src/9/pc64/pc64
+++ b/sys/src/9/pc64/pc64
@@ -78,6 +78,7 @@
etherwpi pci wifi
etherrt2860 pci wifi
ethervirtio pci
+ ethervirtio10 pci
ethermedium
# pcmciamodem
netdevmedium
@@ -105,6 +106,7 @@
sdiahci pci sdscsi led
# sdodin pci sdscsi led
sdvirtio pci sdscsi
+ sdvirtio10 pci sdscsi
sdmmc pci pmmc
sdnvme pci
sdloop