shithub: riscv

ref: c2762df06f75459cb52510d21db6424ed9b1184a
dir: /sys/src/9/pc/ethervirtio10.c/

View raw version
/*
 * virtio 1.0 ethernet driver
 * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
 *
 * In contrast to ethervirtio.c, this driver handles the non-legacy
 * interface for virtio ethernet which uses mmio for all register accesses
 * and requires a laborate pci capability structure dance to get working.
 *
 * It is kind of pointless as it is most likely slower than
 * port i/o (harder to emulate on the pc platform).
 * 
 * The reason why this driver is needed it is that vultr set the
 * disable-legacy=on option in the -device parameter for qemu
 * on their hypervisor.
 */
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "../port/pci.h"
#include "../port/error.h"
#include "../port/netif.h"
#include "../port/etherif.h"

typedef struct Vconfig Vconfig;
typedef struct Vnetcfg Vnetcfg;

typedef struct Vring Vring;
typedef struct Vdesc Vdesc;
typedef struct Vused Vused;
typedef struct Vheader Vheader;
typedef struct Vqueue Vqueue;

typedef struct Ctlr Ctlr;

enum {
	/* §2.1 Device Status Field */
	Sacknowledge = 1,
	Sdriver = 2,
	Sdriverok = 4,
	Sfeaturesok = 8,
	Sfailed = 128,

	/* flags in Qnetstatus */
	Nlinkup = (1<<0),
	Nannounce = (1<<1),

	/* feat[0] bits */
	Fmac = 1<<5,
	Fstatus = 1<<16,
	Fctrlvq = 1<<17,
	Fctrlrx = 1<<18,

	/* feat[1] bits */
	Fversion1 = 1<<(32-32),

	/* vring used flags */
	Unonotify = 1,
	/* vring avail flags */
	Rnointerrupt = 1,

	/* descriptor flags */
	Dnext = 1,
	Dwrite = 2,
	Dindirect = 4,

	/* struct sizes */
	VringSize = 4,
	VdescSize = 16,
	VusedSize = 8,
	VheaderSize = 12,

	Vrxq	= 0,
	Vtxq	= 1,
	Vctlq	= 2,

	/* class/cmd for Vctlq */
	CtrlRx	= 0x00,
		CmdPromisc	= 0x00,
		CmdAllmulti	= 0x01,
	CtrlMac	= 0x01,
		CmdMacTableSet	= 0x00,
	CtrlVlan= 0x02,
		CmdVlanAdd	= 0x00,
		CmdVlanDel	= 0x01,
};

struct Vconfig {
	u32int	devfeatsel;
	u32int	devfeat;
	u32int	drvfeatsel;
	u32int	drvfeat;

	u16int	msixcfg;
	u16int	nqueues;

	u8int	status;
	u8int	cfggen;
	u16int	queuesel;

	u16int	queuesize;
	u16int	queuemsixvect;

	u16int	queueenable;
	u16int	queuenotifyoff;

	u64int	queuedesc;
	u64int	queueavail;
	u64int	queueused;
};

struct Vnetcfg
{
	u16int	mac0;
	u16int	mac1;
	u16int	mac2;
	u16int	status;
	u16int	maxqueuepairs;
	u16int	mtu;
};

struct Vring
{
	u16int	flags;
	u16int	idx;
};

struct Vdesc
{
	u64int	addr;
	u32int	len;
	u16int	flags;
	u16int	next;
};

struct Vused
{
	u32int	id;
	u32int	len;
};

struct Vheader
{
	u8int	flags;
	u8int	segtype;
	u16int	hlen;
	u16int	seglen;
	u16int	csumstart;
	u16int	csumend;
};

struct Vqueue
{
	Rendez;

	uint	qsize;
	uint	qmask;

	Vdesc	*desc;

	Vring	*avail;
	u16int	*availent;
	u16int	*availevent;

	Vring	*used;
	Vused	*usedent;
	u16int	*usedevent;
	u16int	lastused;

	uint	nintr;
	uint	nnote;

	/* notify register */
	void	*notify;
};

struct Ctlr {
	Lock;

	QLock	ctllock;

	int	attached;

	/* registers */
	Vconfig	*cfg;
	Vnetcfg *dev;
	u8int	*isr;
	u8int	*notify;
	u32int	notifyoffmult;

	uvlong	port;
	Pcidev	*pcidev;
	Ctlr	*next;
	int	active;
	ulong	feat[2];
	int	nqueue;

	/* virtioether has 3 queues: rx, tx and ctl */
	Vqueue	queue[3];
};

static Ctlr *ctlrhead;

static int
vhasroom(void *v)
{
	Vqueue *q = v;
	return q->lastused != q->used->idx;
}

static void
vqnotify(Ctlr *ctlr, int x)
{
	Vqueue *q;

	coherence();
	q = &ctlr->queue[x];
	if(q->used->flags & Unonotify)
		return;
	q->nnote++;
	*((u16int*)q->notify) = x;
}

static void
txproc(void *v)
{
	Vheader *header;
	Block **blocks;
	Ether *edev;
	Ctlr *ctlr;
	Vqueue *q;
	Vused *u;
	Block *b;
	int i, j;

	edev = v;
	ctlr = edev->ctlr;
	q = &ctlr->queue[Vtxq];

	header = smalloc(VheaderSize);
	blocks = smalloc(sizeof(Block*) * (q->qsize/2));

	for(i = 0; i < q->qsize/2; i++){
		j = i << 1;
		q->desc[j].addr = PADDR(header);
		q->desc[j].len = VheaderSize;
		q->desc[j].next = j | 1;
		q->desc[j].flags = Dnext;

		q->availent[i] = q->availent[i + q->qsize/2] = j;

		j |= 1;
		q->desc[j].next = 0;
		q->desc[j].flags = 0;
	}

	q->avail->flags &= ~Rnointerrupt;

	while(waserror())
		;

	while((b = qbread(edev->oq, 1000000)) != nil){
		for(;;){
			/* retire completed packets */
			while((i = q->lastused) != q->used->idx){
				u = &q->usedent[i & q->qmask];
				i = (u->id & q->qmask) >> 1;
				if(blocks[i] == nil)
					break;
				freeb(blocks[i]);
				blocks[i] = nil;
				q->lastused++;
			}

			/* have free slot? */
			i = q->avail->idx & (q->qmask >> 1);
			if(blocks[i] == nil)
				break;

			/* ring full, wait and retry */
			if(!vhasroom(q))
				sleep(q, vhasroom, q);
		}

		/* slot is free, fill in descriptor */
		blocks[i] = b;
		j = (i << 1) | 1;
		q->desc[j].addr = PADDR(b->rp);
		q->desc[j].len = BLEN(b);
		coherence();
		q->avail->idx++;
		vqnotify(ctlr, Vtxq);
	}

	pexit("ether out queue closed", 1);
}

static void
rxproc(void *v)
{
	Vheader *header;
	Block **blocks;
	Ether *edev;
	Ctlr *ctlr;
	Vqueue *q;
	Vused *u;
	Block *b;
	int i, j;

	edev = v;
	ctlr = edev->ctlr;
	q = &ctlr->queue[Vrxq];

	header = smalloc(VheaderSize);
	blocks = smalloc(sizeof(Block*) * (q->qsize/2));

	for(i = 0; i < q->qsize/2; i++){
		j = i << 1;
		q->desc[j].addr = PADDR(header);
		q->desc[j].len = VheaderSize;
		q->desc[j].next = j | 1;
		q->desc[j].flags = Dwrite|Dnext;

		q->availent[i] = q->availent[i + q->qsize/2] = j;

		j |= 1;
		q->desc[j].next = 0;
		q->desc[j].flags = Dwrite;
	}

	q->avail->flags &= ~Rnointerrupt;

	while(waserror())
		;

	for(;;){
		/* replenish receive ring */
		do {
			i = q->avail->idx & (q->qmask >> 1);
			if(blocks[i] != nil)
				break;
			if((b = iallocb(ETHERMAXTU)) == nil)
				break;
			blocks[i] = b;
			j = (i << 1) | 1;
			q->desc[j].addr = PADDR(b->rp);
			q->desc[j].len = BALLOC(b);
			coherence();
			q->avail->idx++;
		} while(q->avail->idx != q->used->idx);
		vqnotify(ctlr, Vrxq);

		/* wait for any packets to complete */
		if(!vhasroom(q))
			sleep(q, vhasroom, q);

		/* retire completed packets */
		while((i = q->lastused) != q->used->idx) {
			u = &q->usedent[i & q->qmask];
			i = (u->id & q->qmask) >> 1;
			if((b = blocks[i]) == nil)
				break;

			blocks[i] = nil;
			b->wp = b->rp + u->len - VheaderSize;
			etheriq(edev, b);
			q->lastused++;
		}
	}
}

static int
vctlcmd(Ether *edev, uchar class, uchar cmd, uchar *data, int ndata)
{
	uchar hdr[2], ack[1];
	Ctlr *ctlr;
	Vqueue *q;
	Vdesc *d;
	int i;

	ctlr = edev->ctlr;
	q = &ctlr->queue[Vctlq];
	if(q->qsize < 3)
		return -1;

	qlock(&ctlr->ctllock);
	while(waserror())
		;

	ack[0] = 0x55;
	hdr[0] = class;
	hdr[1] = cmd;

	d = &q->desc[0];
	d->addr = PADDR(hdr);
	d->len = sizeof(hdr);
	d->next = 1;
	d->flags = Dnext;
	d++;
	d->addr = PADDR(data);
	d->len = ndata;
	d->next = 2;
	d->flags = Dnext;
	d++;
	d->addr = PADDR(ack);
	d->len = sizeof(ack);
	d->next = 0;
	d->flags = Dwrite;

	i = q->avail->idx & q->qmask;
	q->availent[i] = 0;
	coherence();

	q->avail->flags &= ~Rnointerrupt;
	q->avail->idx++;
	vqnotify(ctlr, Vctlq);
	while(!vhasroom(q))
		sleep(q, vhasroom, q);
	q->lastused = q->used->idx;
	q->avail->flags |= Rnointerrupt;

	qunlock(&ctlr->ctllock);
	poperror();

	if(ack[0] != 0)
		print("#l%d: vctlcmd: %ux.%ux -> %ux\n", edev->ctlrno, class, cmd, ack[0]);

	return ack[0];
}

static void
interrupt(Ureg*, void* arg)
{
	Ether *edev;
	Ctlr *ctlr;
	Vqueue *q;
	int i;

	edev = arg;
	ctlr = edev->ctlr;
	if(*ctlr->isr & 1){
		for(i = 0; i < ctlr->nqueue; i++){
			q = &ctlr->queue[i];
			if(vhasroom(q)){
				q->nintr++;
				wakeup(q);
			}
		}
	}
}

static void
attach(Ether* edev)
{
	char name[KNAMELEN];
	Ctlr* ctlr;
	int i;

	ctlr = edev->ctlr;
	ilock(ctlr);
	if(ctlr->attached){
		iunlock(ctlr);
		return;
	}
	ctlr->attached = 1;

	/* enable the queues */
	for(i = 0; i < ctlr->nqueue; i++){
		ctlr->cfg->queuesel = i;
		ctlr->cfg->queueenable = 1;
	}

	/* driver is ready */
	ctlr->cfg->status |= Sdriverok;

	iunlock(ctlr);

	/* start kprocs */
	snprint(name, sizeof name, "#l%drx", edev->ctlrno);
	kproc(name, rxproc, edev);
	snprint(name, sizeof name, "#l%dtx", edev->ctlrno);
	kproc(name, txproc, edev);
}

static long
ifstat(Ether *edev, void *a, long n, ulong offset)
{
	int i, l;
	char *p;
	Ctlr *ctlr;
	Vqueue *q;

	ctlr = edev->ctlr;

	p = smalloc(READSTR);

	l = snprint(p, READSTR, "devfeat %32.32lub %32.32lub\n", ctlr->feat[1], ctlr->feat[0]);
	l += snprint(p+l, READSTR-l, "devstatus %8.8ub\n", ctlr->cfg->status);

	for(i = 0; i < ctlr->nqueue; i++){
		q = &ctlr->queue[i];
		l += snprint(p+l, READSTR-l,
			"vq%d %#p size %d avail->idx %d used->idx %d lastused %hud nintr %ud nnote %ud\n",
			i, q, q->qsize, q->avail->idx, q->used->idx, q->lastused, q->nintr, q->nnote);
	}

	n = readstr(offset, a, n, p);
	free(p);

	return n;
}

static void
shutdown(Ether* edev)
{
	Ctlr *ctlr = edev->ctlr;

	coherence();
	ctlr->cfg->status = 0;
	coherence();

	pciclrbme(ctlr->pcidev);
}

static void
promiscuous(void *arg, int on)
{
	Ether *edev = arg;
	uchar b[1];

	b[0] = on != 0;
	vctlcmd(edev, CtrlRx, CmdPromisc, b, sizeof(b));
}

static void
multicast(void *arg, uchar*, int)
{
	Ether *edev = arg;
	uchar b[1];

	b[0] = edev->nmaddr > 0;
	vctlcmd(edev, CtrlRx, CmdAllmulti, b, sizeof(b));
}

static int
initqueue(Vqueue *q, int size)
{
	uchar *p;

	q->desc = mallocalign(VdescSize*size, 16, 0, 0);
	if(q->desc == nil)
		return -1;
	p = mallocalign(VringSize + 2*size + 2, 2, 0, 0);
	if(p == nil){
FreeDesc:
		free(q->desc);
		q->desc = nil;
		return -1;
	}
	q->avail = (void*)p;
	p += VringSize;
	q->availent = (void*)p;
	p += sizeof(u16int)*size;
	q->availevent = (void*)p;
	p = mallocalign(VringSize + VusedSize*size + 2, 4, 0, 0);
	if(p == nil){
		free(q->avail);
		q->avail = nil;
		goto FreeDesc;
	}
	q->used = (void*)p;
	p += VringSize;
	q->usedent = (void*)p;
	p += VusedSize*size;
	q->usedevent = (void*)p;

	q->qsize = size;
	q->qmask = q->qsize - 1;

	q->lastused = q->avail->idx = q->used->idx = 0;

	q->avail->flags |= Rnointerrupt;

	return 0;
}

static int
matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)
{
	int bar;

	if(cap != 9 || pcicfgr8(p, off+3) != typ)
		return 1;

	/* skip invalid or non memory bars */
	bar = pcicfgr8(p, off+4);
	if(bar < 0 || bar >= nelem(p->mem) 
	|| p->mem[bar].size == 0
	|| (p->mem[bar].bar & 3) != 0)
		return 1;

	return 0;
}

static int
virtiocap(Pcidev *p, int typ)
{
	return pcienumcaps(p, matchvirtiocfgcap, typ);
}

static void*
virtiomapregs(Pcidev *p, int cap, int size)
{
	int bar, len;
	uvlong addr;

	if(cap < 0)
		return nil;
	bar = pcicfgr8(p, cap+4) % nelem(p->mem);
	addr = pcicfgr32(p, cap+8);
	len = pcicfgr32(p, cap+12);
	if(size <= 0)
		size = len;
	else if(len < size)
		return nil;
	if(addr+len > p->mem[bar].size)
		return nil;
	addr += p->mem[bar].bar & ~0xFULL;
	return vmap(addr, size);
}

static Ctlr*
pciprobe(void)
{
	Ctlr *c, *h, *t;
	Pcidev *p;
	Vconfig *cfg;
	int bar, cap, n, i;

	h = t = nil;

	/* §4.1.2 PCI Device Discovery */
	for(p = nil; p = pcimatch(p, 0x1AF4, 0x1041);){
		/* non-transitional devices will have a revision > 0 */
		if(p->rid == 0)
			continue;
		if((cap = virtiocap(p, 1)) < 0)
			continue;
		bar = pcicfgr8(p, cap+4) % nelem(p->mem);
		cfg = virtiomapregs(p, cap, sizeof(Vconfig));
		if(cfg == nil)
			continue;
		if((c = mallocz(sizeof(Ctlr), 1)) == nil){
			print("ethervirtio: no memory for Ctlr\n");
			break;
		}
		c->cfg = cfg;
		c->pcidev = p;
		c->port = p->mem[bar].bar & ~0xFULL;

		pcienable(p);
		c->dev = virtiomapregs(p, virtiocap(p, 4), sizeof(Vnetcfg));
		if(c->dev == nil)
			goto Baddev;
		c->isr = virtiomapregs(p, virtiocap(p, 3), 0);
		if(c->isr == nil)
			goto Baddev;
		cap = virtiocap(p, 2);
		c->notify = virtiomapregs(p, cap, 0);
		if(c->notify == nil)
			goto Baddev;
		c->notifyoffmult = pcicfgr32(p, cap+16);

		/* device reset */
		coherence();
		cfg->status = 0;
		while(cfg->status != 0)
			delay(1);
		cfg->status = Sacknowledge|Sdriver;

		/* negotiate feature bits */
		cfg->devfeatsel = 1;
		c->feat[1] = cfg->devfeat;

		cfg->devfeatsel = 0;
		c->feat[0] = cfg->devfeat;

		cfg->drvfeatsel = 1;
		cfg->drvfeat = c->feat[1] & Fversion1;

		cfg->drvfeatsel = 0;
		cfg->drvfeat = c->feat[0] & (Fmac|Fctrlvq|Fctrlrx);

		cfg->status |= Sfeaturesok;

		for(i=0; i<nelem(c->queue); i++){
			cfg->queuesel = i;
			n = cfg->queuesize;
			if(n == 0 || (n & (n-1)) != 0){
				if(i < 2)
					print("ethervirtio: queue %d has invalid size %d\n", i, n);
				break;
			}
			if(initqueue(&c->queue[i], n) < 0)
				break;
			c->queue[i].notify = c->notify + c->notifyoffmult * cfg->queuenotifyoff;
			coherence();
			cfg->queuedesc = PADDR(c->queue[i].desc);
			cfg->queueavail = PADDR(c->queue[i].avail);
			cfg->queueused = PADDR(c->queue[i].used);
		}
		if(i < 2){
			print("ethervirtio: no queues\n");
Baddev:
			pcidisable(p);
			/* TODO, vunmap */
			free(c);
			continue;
		}
		c->nqueue = i;		

		if(h == nil)
			h = c;
		else
			t->next = c;
		t = c;
	}

	return h;
}


static int
reset(Ether* edev)
{
	static uchar zeros[Eaddrlen];
	Ctlr *ctlr;
	int i;

	if(ctlrhead == nil)
		ctlrhead = pciprobe();

	for(ctlr = ctlrhead; ctlr != nil; ctlr = ctlr->next){
		if(ctlr->active)
			continue;
		if(edev->port == 0 || edev->port == ctlr->port){
			ctlr->active = 1;
			break;
		}
	}

	if(ctlr == nil)
		return -1;

	edev->ctlr = ctlr;
	edev->port = ctlr->port;
	edev->irq = ctlr->pcidev->intl;
	edev->tbdf = ctlr->pcidev->tbdf;
	edev->mbps = 1000;
	edev->link = 1;

	if((ctlr->feat[0] & Fmac) != 0 && memcmp(edev->ea, zeros, Eaddrlen) == 0){
		for(i = 0; i < Eaddrlen; i++)
			edev->ea[i] = ((uchar*)ctlr->dev)[i];
	} else {
		for(i = 0; i < Eaddrlen; i++)
			((uchar*)ctlr->dev)[i] = edev->ea[i];
	}

	edev->arg = edev;

	edev->attach = attach;
	edev->shutdown = shutdown;
	edev->ifstat = ifstat;

	if((ctlr->feat[0] & (Fctrlvq|Fctrlrx)) == (Fctrlvq|Fctrlrx)){
		edev->multicast = multicast;
		edev->promiscuous = promiscuous;
	}

	pcisetbme(ctlr->pcidev);
	intrenable(edev->irq, interrupt, edev, edev->tbdf, edev->name);

	return 0;
}

void
ethervirtio10link(void)
{
	addethercard("virtio10", reset);
}