ref: 39c021fbc13b51b35b271fa0364332a08f15a68c
dir: /sys/src/9/port/ethervirtio10.c/
/*
* virtio 1.0 ethernet driver
* http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
*
* In contrast to ethervirtio.c, this driver handles the non-legacy
* interface for virtio ethernet which uses mmio for all register accesses
* and requires an elaborate pci capability structure dance to get working.
*
* It is kind of pointless as it is most likely slower than
* port i/o (harder to emulate on the pc platform).
*
* The reason this driver is needed is that vultr sets the
* disable-legacy=on option in the -device parameter for qemu
* on their hypervisor.
*/
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "../port/pci.h"
#include "../port/error.h"
#include "../port/netif.h"
#include "../port/etherif.h"
/* forward declarations for the structures defined further down in this file */
typedef struct Vconfig Vconfig;
typedef struct Vnetcfg Vnetcfg;
typedef struct Vring Vring;
typedef struct Vdesc Vdesc;
typedef struct Vused Vused;
typedef struct Vheader Vheader;
typedef struct Vqueue Vqueue;
typedef struct Ctlr Ctlr;
/*
 * Constants shared by the whole driver: device status bits, feature
 * bits, ring/descriptor flags, on-the-wire structure sizes, queue
 * numbers and control queue commands.
 */
enum {
	/* §2.1 Device Status Field (written to Vconfig.status) */
	Sacknowledge	= 1<<0,
	Sdriver		= 1<<1,
	Sdriverok	= 1<<2,
	Sfeaturesok	= 1<<3,
	Sfailed		= 1<<7,

	/* flags in Qnetstatus */
	Nlinkup		= 1<<0,
	Nannounce	= 1<<1,

	/* feat[0] bits: feature bits 0..31 */
	Fmac		= 1<<5,
	Fstatus		= 1<<16,
	Fctrlvq		= 1<<17,
	Fctrlrx		= 1<<18,

	/* feat[1] bits: feature bits 32..63, so bit 32 is bit 0 here */
	Fversion1	= 1<<0,

	/* vring used flags */
	Unonotify	= 1<<0,

	/* vring avail flags */
	Rnointerrupt	= 1<<0,

	/* descriptor flags */
	Dnext		= 1<<0,
	Dwrite		= 1<<1,
	Dindirect	= 1<<2,

	/* struct sizes in bytes, as laid out in memory shared with the device */
	VringSize	= 4,
	VdescSize	= 16,
	VusedSize	= 8,
	VheaderSize	= 12,

	/* indices into Ctlr.queue[] */
	Vrxq		= 0,
	Vtxq		= 1,
	Vctlq		= 2,

	/* class/cmd for Vctlq */
	CtrlRx		= 0,
	CmdPromisc	= 0,
	CmdAllmulti	= 1,

	CtrlMac		= 1,
	CmdMacTableSet	= 0,

	CtrlVlan	= 2,
	CmdVlanAdd	= 0,
	CmdVlanDel	= 1,
};
/*
 * Register block reached through Ctlr.cfg.  pciprobe maps it from the
 * virtio pci capability of type 1.  These are device registers, so the
 * order and width of the fields must not be changed.
 */
struct Vconfig {
/*
 * feature negotiation: pciprobe writes a word index to a *sel
 * register and then reads or writes the selected 32-bit feature word
 */
u32int devfeatsel;
u32int devfeat;
u32int drvfeatsel;
u32int drvfeat;
/* not accessed by this driver */
u16int msixcfg;
u16int nqueues;
/* device status, a combination of the S* bits; writing 0 resets the device */
u8int status;
/* not accessed by this driver */
u8int cfggen;
/* the queue* registers below refer to the queue last written to queuesel */
u16int queuesel;
u16int queuesize;
u16int queuemsixvect;
u16int queueenable;
/* multiplied by Ctlr.notifyoffmult to locate the queue's notify register */
u16int queuenotifyoff;
/* physical addresses of the descriptor table, avail ring and used ring */
u64int queuedesc;
u64int queueavail;
u64int queueused;
};
/*
 * Network device specific registers reached through Ctlr.dev.
 * pciprobe maps them from the virtio pci capability of type 4.
 */
struct Vnetcfg
{
/* the first six bytes hold the mac address; reset() accesses them bytewise */
u16int mac0;
u16int mac1;
u16int mac2;
/* not read by this driver; presumably carries the N* flags -- TODO confirm */
u16int status;
/* not used by this driver */
u16int maxqueuepairs;
u16int mtu;
};
/*
 * Header found at the start of both the avail and the used ring
 * (see Vqueue.avail and Vqueue.used); the ring entries follow it
 * directly in memory, as laid out by initqueue.
 */
struct Vring
{
/* Rnointerrupt for the avail ring, Unonotify for the used ring */
u16int flags;
/* free running index; users mask it with Vqueue.qmask to get a ring slot */
u16int idx;
};
/*
 * One entry of a queue's descriptor table (Vqueue.desc).
 */
struct Vdesc
{
/* physical address of the buffer (the driver stores PADDR() values) */
u64int addr;
/* length of the buffer in bytes */
u32int len;
/* combination of the D* flags */
u16int flags;
/* index of the following descriptor in the chain, used together with Dnext */
u16int next;
};
/*
 * One entry of a used ring (Vqueue.usedent), filled in by the device.
 */
struct Vused
{
/* index of the first descriptor of the completed chain */
u32int id;
/* byte count reported by the device; rxproc subtracts VheaderSize to get the packet length */
u32int len;
};
/*
 * Header that precedes every packet exchanged with the device.
 *
 * The driver allocates and describes this header to the device as
 * VheaderSize (12) bytes and never touches its fields.  The nbuf
 * field is declared so that the structure covers all twelve bytes:
 * the virtio 1.0 network header always ends in a 16-bit buffer count.
 */
struct Vheader
{
	u8int	flags;
	u8int	segtype;
	u16int	hlen;
	u16int	seglen;
	u16int	csumstart;
	u16int	csumend;
	u16int	nbuf;
};
/*
 * Driver side state of one virtqueue.  The anonymous Rendez is what
 * the kprocs sleep on and what interrupt() wakes up.
 */
struct Vqueue
{
Rendez;
/* number of descriptors (pciprobe only accepts powers of two) and qsize-1 */
uint qsize;
uint qmask;
/* descriptor table */
Vdesc *desc;
/* avail ring: header, qsize entries of 16 bits, one trailing 16-bit word */
Vring *avail;
u16int *availent;
u16int *availevent;
/* used ring: header, qsize Vused entries, one trailing 16-bit word */
Vring *used;
Vused *usedent;
u16int *usedevent;
/* value of used->idx up to which the driver has consumed completions */
u16int lastused;
/* statistics shown by ifstat: wakeups done by interrupt() and notifications written */
uint nintr;
uint nnote;
/* notify register */
void *notify;
};
/*
 * Per device state.  The anonymous Lock is taken with ilock() in
 * attach(); ctllock serializes commands on the control queue.
 */
struct Ctlr {
Lock;
QLock ctllock;
/* set by the first attach() so that queues and kprocs are started once */
int attached;
/* registers */
Vconfig *cfg;
Vnetcfg *dev;
u8int *isr;
u8int *notify;
/* read from the notify capability; scales Vconfig.queuenotifyoff */
u32int notifyoffmult;
/* base address of the bar holding cfg; reset() matches it against edev->port */
uvlong port;
Pcidev *pcidev;
/* next controller in the list built by pciprobe */
Ctlr *next;
/* set once reset() has handed this controller to an Ether */
int active;
/* feature words offered by the device: feat[0] is bits 0..31, feat[1] bits 32..63 */
ulong feat[2];
/* number of entries of queue[] that pciprobe managed to set up */
int nqueue;
/* virtioether has 3 queues: rx, tx and ctl */
Vqueue queue[3];
};
/* controllers found by pciprobe; filled in by reset() while it is still nil */
static Ctlr *ctlrhead;
/*
 * Sleep condition for a queue: returns 1 when the device has put
 * entries on the used ring that the driver has not consumed yet,
 * 0 otherwise.  v is the Vqueue.
 */
static int
vhasroom(void *v)
{
	Vqueue *vq;

	vq = v;
	if(vq->lastused == vq->used->idx)
		return 0;
	return 1;
}
/*
 * Tell the device that queue number x has new avail entries by
 * writing the queue number to the queue's notify register, unless
 * the device has set Unonotify in the used ring flags.
 * The barrier comes first so that earlier ring updates are visible
 * before the flags are examined and the register is written.
 */
static void
vqnotify(Ctlr *ctlr, int x)
{
	Vqueue *vq;
	u16int *reg;

	coherence();
	vq = &ctlr->queue[x];
	if((vq->used->flags & Unonotify) == 0){
		vq->nnote++;
		reg = vq->notify;
		*reg = x;
	}
}
/*
 * Transmit kproc, started by attach() with the Ether as argument.
 *
 * Descriptors are used in pairs: the even descriptor 2*i points at
 * one header buffer shared by all pairs and chains to the odd
 * descriptor 2*i+1, which is pointed at the data of blocks[i] when a
 * packet is queued.  The avail ring entries are written once during
 * setup, so queueing a packet only means filling in the odd
 * descriptor and advancing avail->idx.
 */
static void
txproc(void *v)
{
Vheader *header;
Block **blocks;
Ether *edev;
Ctlr *ctlr;
Vqueue *q;
Vused *u;
Block *b;
int i, j;
edev = v;
ctlr = edev->ctlr;
q = &ctlr->queue[Vtxq];
header = smalloc(VheaderSize);
/* blocks[i] is the packet currently owned by descriptor pair i, or nil */
blocks = smalloc(sizeof(Block*) * (q->qsize/2));
for(i = 0; i < q->qsize/2; i++){
j = i << 1;
q->desc[j].addr = PADDR(header);
q->desc[j].len = VheaderSize;
q->desc[j].next = j | 1;
q->desc[j].flags = Dnext;
/* each pair head appears twice so that all qsize avail entries are valid */
q->availent[i] = q->availent[i + q->qsize/2] = j;
j |= 1;
q->desc[j].next = 0;
q->desc[j].flags = 0;
}
/* clear the flag set by initqueue so that completions raise interrupts */
q->avail->flags &= ~Rnointerrupt;
/* an error raised further down lands here; the handler is re-armed and the proc carries on */
while(waserror())
;
while((b = qbread(edev->oq, 1000000)) != nil){
for(;;){
/* retire completed packets */
while((i = q->lastused) != q->used->idx){
u = &q->usedent[i & q->qmask];
/* used id is the even head descriptor; halve it to get the pair index */
i = (u->id & q->qmask) >> 1;
if(blocks[i] == nil)
break;
freeb(blocks[i]);
blocks[i] = nil;
q->lastused++;
}
/* have free slot? */
i = q->avail->idx & (q->qmask >> 1);
if(blocks[i] == nil)
break;
/* ring full, wait and retry */
if(!vhasroom(q))
sleep(q, vhasroom, q);
}
/* slot is free, fill in descriptor */
blocks[i] = b;
j = (i << 1) | 1;
q->desc[j].addr = PADDR(b->rp);
q->desc[j].len = BLEN(b);
/* descriptor must be written before the new avail index becomes visible */
coherence();
q->avail->idx++;
vqnotify(ctlr, Vtxq);
}
pexit("ether out queue closed", 1);
}
/*
 * Receive kproc, started by attach() with the Ether as argument.
 *
 * Uses the same descriptor pairing as txproc: even descriptor 2*i
 * receives the header into one buffer shared by all pairs, odd
 * descriptor 2*i+1 receives the packet data into blocks[i].  Both
 * descriptors carry Dwrite.  The proc loops forever: it refills the
 * ring with fresh blocks, waits for completions and passes completed
 * blocks to etheriq().
 */
static void
rxproc(void *v)
{
Vheader *header;
Block **blocks;
Ether *edev;
Ctlr *ctlr;
Vqueue *q;
Vused *u;
Block *b;
int i, j;
edev = v;
ctlr = edev->ctlr;
q = &ctlr->queue[Vrxq];
header = smalloc(VheaderSize);
/* blocks[i] is the receive buffer currently owned by descriptor pair i, or nil */
blocks = smalloc(sizeof(Block*) * (q->qsize/2));
for(i = 0; i < q->qsize/2; i++){
j = i << 1;
q->desc[j].addr = PADDR(header);
q->desc[j].len = VheaderSize;
q->desc[j].next = j | 1;
q->desc[j].flags = Dwrite|Dnext;
/* each pair head appears twice so that all qsize avail entries are valid */
q->availent[i] = q->availent[i + q->qsize/2] = j;
j |= 1;
q->desc[j].next = 0;
q->desc[j].flags = Dwrite;
}
/* clear the flag set by initqueue so that completions raise interrupts */
q->avail->flags &= ~Rnointerrupt;
/* an error raised further down lands here; the handler is re-armed and the proc carries on */
while(waserror())
;
for(;;){
/* replenish receive ring */
do {
i = q->avail->idx & (q->qmask >> 1);
/* stop when the next slot is still in use or no block can be allocated */
if(blocks[i] != nil)
break;
if((b = iallocb(ETHERMAXTU)) == nil)
break;
blocks[i] = b;
j = (i << 1) | 1;
q->desc[j].addr = PADDR(b->rp);
q->desc[j].len = BALLOC(b);
/* descriptor must be written before the new avail index becomes visible */
coherence();
q->avail->idx++;
} while(q->avail->idx != q->used->idx);
vqnotify(ctlr, Vrxq);
/* wait for any packets to complete */
if(!vhasroom(q))
sleep(q, vhasroom, q);
/* retire completed packets */
while((i = q->lastused) != q->used->idx) {
u = &q->usedent[i & q->qmask];
/* used id is the even head descriptor; halve it to get the pair index */
i = (u->id & q->qmask) >> 1;
if((b = blocks[i]) == nil)
break;
blocks[i] = nil;
/* the reported length covers the header descriptor too, so take it off */
b->wp = b->rp + u->len - VheaderSize;
etheriq(edev, b);
q->lastused++;
}
}
}
/*
 * Send one command on the control queue and wait for its completion.
 *
 * The command is a chain of three descriptors (always descriptors
 * 0, 1 and 2): a two byte header holding class and cmd, the ndata
 * bytes at data, and a one byte ack buffer that the device writes.
 * ctlr->ctllock is held for the duration, so only one command is
 * outstanding at a time.
 *
 * Returns -1 without doing anything when the control queue has fewer
 * than three descriptors (which includes a queue that pciprobe did
 * not set up, as its qsize is then still zero).  Otherwise returns
 * the ack byte; a nonzero ack is also reported with print().
 */
static int
vctlcmd(Ether *edev, uchar class, uchar cmd, uchar *data, int ndata)
{
uchar hdr[2], ack[1];
Ctlr *ctlr;
Vqueue *q;
Vdesc *d;
int i;
ctlr = edev->ctlr;
q = &ctlr->queue[Vctlq];
if(q->qsize < 3)
return -1;
qlock(&ctlr->ctllock);
/* an error raised further down lands here; the handler is re-armed and the command is set up again */
while(waserror())
;
/* preset the ack to a nonzero value; the device overwrites it on completion */
ack[0] = 0x55;
hdr[0] = class;
hdr[1] = cmd;
d = &q->desc[0];
d->addr = PADDR(hdr);
d->len = sizeof(hdr);
d->next = 1;
d->flags = Dnext;
d++;
d->addr = PADDR(data);
d->len = ndata;
d->next = 2;
d->flags = Dnext;
d++;
/* last descriptor is the only one the device writes to */
d->addr = PADDR(ack);
d->len = sizeof(ack);
d->next = 0;
d->flags = Dwrite;
/* publish descriptor 0 as the head of the chain in the next avail slot */
i = q->avail->idx & q->qmask;
q->availent[i] = 0;
coherence();
/* interrupts for this queue are only wanted while a command is pending */
q->avail->flags &= ~Rnointerrupt;
q->avail->idx++;
vqnotify(ctlr, Vctlq);
while(!vhasroom(q))
sleep(q, vhasroom, q);
/* consume the completion */
q->lastused = q->used->idx;
q->avail->flags |= Rnointerrupt;
qunlock(&ctlr->ctllock);
poperror();
if(ack[0] != 0)
print("#l%d: vctlcmd: %ux.%ux -> %ux\n", edev->ctlrno, class, cmd, ack[0]);
return ack[0];
}
/*
 * Interrupt handler; arg is the Ether.  The isr register is read
 * exactly once.  If its bit 0 is set, every configured queue that
 * has unconsumed used ring entries gets its interrupt counter
 * bumped and its sleeper woken up.
 */
static void
interrupt(Ureg*, void* arg)
{
	Ether *edev;
	Ctlr *ctlr;
	Vqueue *vq;
	int qi;

	edev = arg;
	ctlr = edev->ctlr;
	if((*ctlr->isr & 1) == 0)
		return;
	for(qi = 0; qi < ctlr->nqueue; qi++){
		vq = &ctlr->queue[qi];
		if(!vhasroom(vq))
			continue;
		vq->nintr++;
		wakeup(vq);
	}
}
/*
 * Ether attach hook.  The first call enables all configured queues,
 * sets Sdriverok in the device status and starts the receive and
 * transmit kprocs; later calls return without doing anything.
 */
static void
attach(Ether* edev)
{
	char kname[KNAMELEN];
	Ctlr *ctlr;
	int qi, first;

	ctlr = edev->ctlr;

	ilock(ctlr);
	first = !ctlr->attached;
	if(first){
		ctlr->attached = 1;

		/* enable the queues */
		qi = 0;
		while(qi < ctlr->nqueue){
			ctlr->cfg->queuesel = qi;
			ctlr->cfg->queueenable = 1;
			qi++;
		}

		/* driver is ready */
		ctlr->cfg->status |= Sdriverok;
	}
	iunlock(ctlr);
	if(!first)
		return;

	/* start kprocs */
	snprint(kname, sizeof kname, "#l%drx", edev->ctlrno);
	kproc(kname, rxproc, edev);

	snprint(kname, sizeof kname, "#l%dtx", edev->ctlrno);
	kproc(kname, txproc, edev);
}
/*
 * Ether ifstat hook: formats the offered feature words, the device
 * status and one line of counters per configured queue into a
 * READSTR sized buffer and returns the part selected by offset and
 * n through readstr().
 */
static long
ifstat(Ether *edev, void *a, long n, ulong offset)
{
	Ctlr *ctlr;
	Vqueue *vq;
	char *buf;
	int len, qi;

	ctlr = edev->ctlr;
	buf = smalloc(READSTR);

	len = snprint(buf, READSTR, "devfeat %32.32lub %32.32lub\n", ctlr->feat[1], ctlr->feat[0]);
	len += snprint(buf+len, READSTR-len, "devstatus %8.8ub\n", ctlr->cfg->status);

	qi = 0;
	while(qi < ctlr->nqueue){
		vq = &ctlr->queue[qi];
		len += snprint(buf+len, READSTR-len,
			"vq%d %#p size %d avail->idx %d used->idx %d lastused %hud nintr %ud nnote %ud\n",
			qi, vq, vq->qsize, vq->avail->idx, vq->used->idx, vq->lastused, vq->nintr, vq->nnote);
		qi++;
	}

	n = readstr(offset, a, n, buf);
	free(buf);

	return n;
}
/*
 * Ether shutdown hook: writes 0 to the device status register,
 * bracketed by barriers, and then calls pciclrbme() on the device.
 */
static void
shutdown(Ether* edev)
{
	Ctlr *ctlr;
	Vconfig *cfg;

	ctlr = edev->ctlr;
	cfg = ctlr->cfg;

	coherence();
	cfg->status = 0;
	coherence();

	pciclrbme(ctlr->pcidev);
}
static void
promiscuous(void *arg, int on)
{
Ether *edev = arg;
Ctlr *ctlr = edev->ctlr;
uchar b[1];
if((ctlr->feat[0] & (Fctrlvq|Fctrlrx)) != (Fctrlvq|Fctrlrx))
return;
b[0] = on != 0;
vctlcmd(edev, CtrlRx, CmdPromisc, b, sizeof(b));
}
/*
 * Ether multicast hook; arg is the Ether, the address and the
 * add/remove arguments are ignored.  Sends CtrlRx/CmdAllmulti with a
 * one byte argument that is 1 while edev->nmaddr is positive and 0
 * otherwise.  Does nothing unless the device offered both Fctrlvq
 * and Fctrlrx.
 */
static void
multicast(void *arg, uchar*, int)
{
	Ether *edev;
	Ctlr *ctlr;
	uchar flag[1];
	ulong need;

	edev = arg;
	ctlr = edev->ctlr;
	need = Fctrlvq|Fctrlrx;
	if((ctlr->feat[0] & need) == need){
		flag[0] = edev->nmaddr > 0 ? 1 : 0;
		vctlcmd(edev, CtrlRx, CmdAllmulti, flag, sizeof(flag));
	}
}
/*
 * Allocate the three memory areas of a virtqueue with size entries
 * and initialize the driver state in q:
 *	descriptor table, aligned to 16 bytes;
 *	avail ring (header, size 16-bit entries, one 16-bit word), aligned to 2;
 *	used ring (header, size Vused entries, one 16-bit word), aligned to 4.
 * All indices start at zero and Rnointerrupt is set in the avail flags.
 *
 * Returns 0 on success.  Returns -1 when an allocation fails; what
 * was allocated so far is freed and q->desc (and q->avail, if it was
 * set) is reset to nil.
 */
static int
initqueue(Vqueue *q, int size)
{
	uchar *ring;

	q->desc = mallocalign(VdescSize*size, 16, 0, 0);
	if(q->desc == nil)
		return -1;

	ring = mallocalign(VringSize + 2*size + 2, 2, 0, 0);
	if(ring == nil)
		goto Nodesc;
	q->avail = (void*)ring;
	ring += VringSize;
	q->availent = (void*)ring;
	ring += sizeof(u16int)*size;
	q->availevent = (void*)ring;

	ring = mallocalign(VringSize + VusedSize*size + 2, 4, 0, 0);
	if(ring == nil)
		goto Noavail;
	q->used = (void*)ring;
	ring += VringSize;
	q->usedent = (void*)ring;
	ring += VusedSize*size;
	q->usedevent = (void*)ring;

	q->qsize = size;
	q->qmask = q->qsize - 1;

	q->lastused = q->avail->idx = q->used->idx = 0;

	/* interrupts stay off until a user of the queue asks for them */
	q->avail->flags |= Rnointerrupt;

	return 0;

Noavail:
	free(q->avail);
	q->avail = nil;
Nodesc:
	free(q->desc);
	q->desc = nil;
	return -1;
}
/*
 * Callback handed to pcienumcaps() by virtiocap().  Returns 0 for a
 * capability with id 9 whose byte at offset 3 equals typ and whose
 * bar index (byte at offset 4) names a memory bar of nonzero size;
 * returns 1 for everything else.
 */
static int
matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)
{
	int bar;

	if(cap != 9)
		return 1;
	if(pcicfgr8(p, off+3) != typ)
		return 1;

	/* skip invalid or non memory bars */
	bar = pcicfgr8(p, off+4);
	if(bar < 0 || bar >= nelem(p->mem))
		return 1;
	if(p->mem[bar].size == 0)
		return 1;
	if((p->mem[bar].bar & 3) != 0)
		return 1;

	return 0;
}
/*
 * Look for the virtio capability of the given type on device p using
 * pcienumcaps() with matchvirtiocfgcap as the filter.  The result of
 * pcienumcaps() is returned unchanged; callers treat a negative
 * value as "not found" and anything else as the config space offset
 * of the capability.
 */
static int
virtiocap(Pcidev *p, int typ)
{
	int off;

	off = pcienumcaps(p, matchvirtiocfgcap, typ);
	return off;
}
/*
 * Map the register window described by the virtio pci capability at
 * config space offset cap.
 *
 * The capability supplies the bar index (offset 4), the offset of the
 * window inside that bar (offset 8) and its length (offset 12).
 * size is the number of bytes the caller needs; when size <= 0 the
 * whole window is mapped.
 *
 * Returns the vmap()ed address, or nil when
 *	cap is negative (capability not found),
 *	the window length is not positive,
 *	the window is shorter than size, or
 *	the window does not fit inside the bar.
 */
static void*
virtiomapregs(Pcidev *p, int cap, int size)
{
	int bar, len;
	uvlong addr;

	if(cap < 0)
		return nil;
	bar = pcicfgr8(p, cap+4) % nelem(p->mem);

	/* the offset is an unsigned 32-bit quantity; do not let it sign extend */
	addr = (u32int)pcicfgr32(p, cap+8);
	len = pcicfgr32(p, cap+12);

	/* an empty window, or one too large for an int, cannot be mapped */
	if(len <= 0)
		return nil;
	if(size <= 0)
		size = len;
	else if(len < size)
		return nil;
	if(addr+len > p->mem[bar].size)
		return nil;
	addr += p->mem[bar].bar & ~0xFULL;
	return vmap(addr, size);
}
static Ctlr*
pciprobe(void)
{
Ctlr *c, *h, *t;
Pcidev *p;
Vconfig *cfg;
int bar, cap, n, i;
h = t = nil;
/* §4.1.2 PCI Device Discovery */
for(p = nil; p = pcimatch(p, 0x1AF4, 0x1041);){
/* non-transitional devices will have a revision > 0 */
if(p->rid == 0)
continue;
if((cap = virtiocap(p, 1)) < 0)
continue;
bar = pcicfgr8(p, cap+4) % nelem(p->mem);
cfg = virtiomapregs(p, cap, sizeof(Vconfig));
if(cfg == nil)
continue;
if((c = mallocz(sizeof(Ctlr), 1)) == nil){
print("ethervirtio: no memory for Ctlr\n");
break;
}
c->cfg = cfg;
c->pcidev = p;
c->port = p->mem[bar].bar & ~0xFULL;
pcienable(p);
c->dev = virtiomapregs(p, virtiocap(p, 4), sizeof(Vnetcfg));
if(c->dev == nil)
goto Baddev;
c->isr = virtiomapregs(p, virtiocap(p, 3), 0);
if(c->isr == nil)
goto Baddev;
cap = virtiocap(p, 2);
c->notify = virtiomapregs(p, cap, 0);
if(c->notify == nil)
goto Baddev;
c->notifyoffmult = pcicfgr32(p, cap+16);
/* device reset */
coherence();
cfg->status = 0;
while(cfg->status != 0)
delay(1);
cfg->status = Sacknowledge|Sdriver;
/* negotiate feature bits */
cfg->devfeatsel = 1;
c->feat[1] = cfg->devfeat;
cfg->devfeatsel = 0;
c->feat[0] = cfg->devfeat;
cfg->drvfeatsel = 1;
cfg->drvfeat = c->feat[1] & Fversion1;
cfg->drvfeatsel = 0;
cfg->drvfeat = c->feat[0] & (Fmac|Fctrlvq|Fctrlrx);
cfg->status |= Sfeaturesok;
for(i=0; i<nelem(c->queue); i++){
cfg->queuesel = i;
n = cfg->queuesize;
if(n == 0 || (n & (n-1)) != 0){
if(i < 2)
print("ethervirtio: queue %d has invalid size %d\n", i, n);
break;
}
if(initqueue(&c->queue[i], n) < 0)
break;
c->queue[i].notify = c->notify + c->notifyoffmult * cfg->queuenotifyoff;
coherence();
cfg->queuedesc = PADDR(c->queue[i].desc);
cfg->queueavail = PADDR(c->queue[i].avail);
cfg->queueused = PADDR(c->queue[i].used);
}
if(i < 2){
print("ethervirtio: no queues\n");
Baddev:
pcidisable(p);
/* TODO, vunmap */
free(c);
continue;
}
c->nqueue = i;
if(h == nil)
h = c;
else
t->next = c;
t = c;
}
return h;
}
/*
 * Probe routine registered with addethercard().  Runs pciprobe() as
 * long as the controller list is empty, then claims the first
 * controller that is not yet active and whose port matches
 * edev->port (a port of 0 matches any controller).
 *
 * When the device offered Fmac and edev->ea is all zeros, the mac
 * address is read from the device registers; otherwise edev->ea is
 * written to them.  The copy is done byte by byte.
 *
 * Returns 0 after filling in edev and enabling the interrupt, -1 when
 * no controller is available.
 */
static int
reset(Ether* edev)
{
	static uchar zeros[Eaddrlen];
	Ctlr *ctlr;
	uchar *devmac;
	int i;

	if(ctlrhead == nil)
		ctlrhead = pciprobe();

	ctlr = ctlrhead;
	while(ctlr != nil){
		if(!ctlr->active)
		if(edev->port == 0 || edev->port == ctlr->port)
			break;
		ctlr = ctlr->next;
	}
	if(ctlr == nil)
		return -1;
	ctlr->active = 1;

	edev->ctlr = ctlr;
	edev->port = ctlr->port;
	edev->irq = ctlr->pcidev->intl;
	edev->tbdf = ctlr->pcidev->tbdf;
	edev->mbps = 1000;
	edev->link = 1;

	devmac = (uchar*)ctlr->dev;
	if((ctlr->feat[0] & Fmac) == 0 || memcmp(edev->ea, zeros, Eaddrlen) != 0){
		/* no device address on offer, or one was configured: push ours */
		for(i = 0; i < Eaddrlen; i++)
			devmac[i] = edev->ea[i];
	} else {
		/* take the address the device provides */
		for(i = 0; i < Eaddrlen; i++)
			edev->ea[i] = devmac[i];
	}

	edev->arg = edev;

	edev->attach = attach;
	edev->shutdown = shutdown;
	edev->ifstat = ifstat;
	edev->multicast = multicast;
	edev->promiscuous = promiscuous;

	pcisetbme(ctlr->pcidev);
	intrenable(edev->irq, interrupt, edev, edev->tbdf, edev->name);

	return 0;
}
/*
 * Link routine: registers reset() as the probe function for the
 * ethernet card type "virtio10".
 */
void
ethervirtio10link(void)
{
addethercard("virtio10", reset);
}