ref: e39d9249076e9a95e97b33313be1ab2e23095f1d
parent: c2c397422f472e4733d02eb03b86a71a6ca9508c
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Sun Jul 3 07:35:20 EDT 2022
sdnvme: add dmaflush() instructions, move to port/
--- a/sys/src/9/pc/sdnvme.c
+++ /dev/null
@@ -1,685 +1,0 @@
-#include "u.h"
-#include "../port/lib.h"
-#include "mem.h"
-#include "dat.h"
-#include "fns.h"
-#include "io.h"
-#include "../port/pci.h"
-#include "ureg.h"
-#include "../port/error.h"
-
-#include "../port/sd.h"
-
-typedef struct WS WS;
-typedef struct CQ CQ;
-typedef struct SQ SQ;
-typedef struct Ctlr Ctlr;
-
-struct WS
-{
- u32int cdw0;
- ushort status;
- Rendez *sleep;
- WS **link;
- SQ *queue;
-};
-
-struct CQ
-{
- u32int head;
- u32int mask;
- u32int shift;
- u32int *base;
- Ctlr *ctlr;
-};
-
-struct SQ
-{
- u32int tail;
- u32int mask;
- u32int shift;
- u32int *base;
- WS **wait;
- Ctlr *ctlr;
- Lock;
-};
-
-struct Ctlr
-{
- QLock;
-
- Lock intr;
- u32int ints;
- u32int irqc[2];
-
- Pcidev *pci;
- u32int *reg;
-
- u64int cap;
- uchar *ident;
- u32int *nsid;
- int nnsid;
-
- u32int mps; /* mps = 1<<mpsshift */
- u32int mpsshift;
- u32int dstrd;
-
- u32int nsq;
-
- CQ cq[1+1];
- SQ sq[1+MAXMACH];
-
- Ctlr *next;
-};
-
-/* controller registers */
-enum {
- Cap0,
- Cap1,
- Ver,
- IntMs,
- IntMc,
- CCfg,
-
- CSts = 0x1C/4,
- Nssr,
- AQAttr,
- ASQBase0,
- ASQBase1,
- ACQBase0,
- ACQBase1,
-
- DBell = 0x1000/4,
-};
-
-static u32int*
-qcmd(WS *ws, Ctlr *ctlr, int adm, u32int opc, u32int nsid, void *mptr, void *data, ulong len)
-{
- u32int cid, *e;
- u64int pa;
- SQ *sq;
-
- if(!adm){
- Retry:
- splhi();
- sq = &ctlr->sq[1+(m->machno % ctlr->nsq)];
- if(conf.nmach > ctlr->nsq)
- lock(sq);
- } else {
- qlock(ctlr);
- sq = &ctlr->sq[0];
- }
- ws->sleep = &up->sleep;
- ws->queue = sq;
- ws->link = &sq->wait[sq->tail & sq->mask];
- while(*ws->link != nil){
- sched();
- if(!adm){
- /* should be very rare */
- goto Retry;
- }
- }
- *ws->link = ws;
-
- e = &sq->base[((cid = sq->tail++) & sq->mask)<<4];
- e[0] = opc | cid<<16;
- e[1] = nsid;
- e[2] = 0;
- e[3] = 0;
- if(mptr != nil){
- pa = PCIWADDR(mptr);
- e[4] = pa;
- e[5] = pa>>32;
- } else {
- e[4] = 0;
- e[5] = 0;
- }
- if(len > 0){
- pa = PCIWADDR(data);
- e[6] = pa;
- e[7] = pa>>32;
- if(len > ctlr->mps - (pa & ctlr->mps-1))
- pa += ctlr->mps - (pa & ctlr->mps-1);
- else
- pa = 0;
- } else {
- e[6] = 0;
- e[7] = 0;
- pa = 0;
- }
- e[8] = pa;
- e[9] = pa>>32;
- return e;
-}
-
-static void
-nvmeintr(Ureg *, void *arg)
-{
- u32int phaseshift, *e;
- WS *ws, **wp;
- Ctlr *ctlr;
- SQ *sq;
- CQ *cq;
-
- ctlr = arg;
- if(ctlr->ints == 0)
- return;
-
- ilock(&ctlr->intr);
- ctlr->reg[IntMs] = ctlr->ints;
- for(cq = &ctlr->cq[nelem(ctlr->cq)-1]; cq >= ctlr->cq; cq--){
- if(cq->base == nil)
- continue;
- phaseshift = 16 - cq->shift;
- for(;;){
- e = &cq->base[(cq->head & cq->mask)<<2];
- if(((e[3] ^ (cq->head << phaseshift)) & 0x10000) == 0)
- break;
-
- if(0) iprint("nvmeintr: cq%d [%.4ux] %.8ux %.8ux %.8ux %.8ux\n",
- (int)(cq - ctlr->cq), cq->head & cq->mask,
- e[0], e[1], e[2], e[3]);
-
- sq = &ctlr->sq[e[2] >> 16];
- wp = &sq->wait[e[3] & sq->mask];
- if((ws = *wp) != nil && ws->link == wp){
- Rendez *z = ws->sleep;
- ws->cdw0 = e[0];
- ws->status = e[3]>>17;
- *wp = nil;
- wakeup(z);
- }
- ctlr->reg[DBell + ((cq-ctlr->cq)*2+1 << ctlr->dstrd)] = ++cq->head & cq->mask;
- }
- }
- ctlr->reg[IntMc] = ctlr->ints;
- iunlock(&ctlr->intr);
-}
-
-static int
-wdone(void *arg)
-{
- WS *ws = arg;
- return *ws->link != ws;
-}
-
-static u32int
-wcmd(WS *ws)
-{
- SQ *sq = ws->queue;
- Ctlr *ctlr = sq->ctlr;
-
- coherence();
- ctlr->reg[DBell + ((sq-ctlr->sq)*2+0 << ctlr->dstrd)] = sq->tail & sq->mask;
- if(sq > ctlr->sq) {
- assert(sq == &ctlr->sq[1+(m->machno % ctlr->nsq)]);
- if(conf.nmach > ctlr->nsq)
- unlock(sq);
- spllo();
- } else
- qunlock(sq->ctlr);
- while(waserror())
- ;
- tsleep(ws->sleep, wdone, ws, 5);
- while(!wdone(ws)){
- nvmeintr(nil, ctlr);
- tsleep(ws->sleep, wdone, ws, 10);
- }
- poperror();
- return ws->status;
-}
-
-void
-checkstatus(u32int status, char *info)
-{
- if(status == 0)
- return;
- snprint(up->genbuf, sizeof(up->genbuf), "%s: status %ux", info, status);
- error(up->genbuf);
-}
-
-static long
-nvmebio(SDunit *u, int lun, int write, void *a, long count, uvlong lba)
-{
- u32int nsid, s, n, m, *e;
- Ctlr *ctlr;
- uchar *p;
- WS ws;
-
- USED(lun);
-
- ctlr = u->dev->ctlr;
- nsid = ctlr->nsid[u->subno];
- s = u->secsize;
- p = a;
- while(count > 0){
- m = (2*ctlr->mps - ((uintptr)p & ctlr->mps-1)) / s;
- if((n = count) > m)
- n = m;
- e = qcmd(&ws, ctlr, 0, write ? 0x01 : 0x02, nsid, nil, p, n*s);
- e[10] = lba;
- e[11] = lba>>32;
- e[12] = n-1;
- e[13] = (count>n)<<6; /* sequential request */
- e[14] = 0;
- e[15] = 0;
- checkstatus(wcmd(&ws), write ? "write" : "read");
- p += n*s;
- count -= n;
- lba += n;
- }
- return p - (uchar*)a;
-}
-
-static int
-nvmerio(SDreq *r)
-{
- int i, count, rw;
- uvlong lba;
- SDunit *u;
-
- u = r->unit;
- if(r->cmd[0] == 0x35 || r->cmd[0] == 0x91)
- return sdsetsense(r, SDok, 0, 0, 0);
- if((i = sdfakescsi(r)) != SDnostatus)
- return r->status = i;
- if((i = sdfakescsirw(r, &lba, &count, &rw)) != SDnostatus)
- return i;
- r->rlen = nvmebio(u, r->lun, rw == SDwrite, r->data, count, lba);
- return r->status = SDok;
-}
-
-static int
-nvmeverify(SDunit *u)
-{
- Ctlr *ctlr = u->dev->ctlr;
- return u->subno < ctlr->nnsid;
-}
-
-static int
-nvmeonline(SDunit *u)
-{
- u32int *e, lbaf;
- uchar *info, *p;
- Ctlr *ctlr;
- WS ws;
-
- if(u->sectors != 0)
- return 1;
-
- ctlr = u->dev->ctlr;
- if((info = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil)
- return 0;
-
- e = qcmd(&ws, ctlr, 1, 0x06, ctlr->nsid[u->subno], nil, info, 0x1000);
- e[10] = 0; // identify namespace
- if(wcmd(&ws) != 0){
- free(info);
- return 0;
- }
- p = info;
- u->sectors = p[0] | p[1]<<8 | p[2]<<16 | p[3]<<24
- | (u64int)p[4]<<32
- | (u64int)p[5]<<40
- | (u64int)p[6]<<48
- | (u64int)p[7]<<56;
- p = &info[128 + 4*(info[26]&15)];
- lbaf = p[0] | p[1]<<8 | p[2]<<16 | p[3]<<24;
- u->secsize = 1<<((lbaf>>16)&0xFF);
- free(info);
-
- memset(u->inquiry, 0, sizeof u->inquiry);
- u->inquiry[2] = 2;
- u->inquiry[3] = 2;
- u->inquiry[4] = sizeof u->inquiry - 4;
- memmove(u->inquiry+8, ctlr->ident+24, 20);
-
- return 2;
-}
-
-static int
-nvmerctl(SDunit *u, char *p, int l)
-{
- Ctlr *ctlr;
- char *e, *s;
-
- if((ctlr = u->dev->ctlr) == nil || ctlr->ident == nil)
- return 0;
-
- e = p+l;
- s = p;
-
- p = seprint(p, e, "model\t%.20s\n", (char*)ctlr->ident+24);
- p = seprint(p, e, "serial\t%.10s\n", (char*)ctlr->ident+4);
- p = seprint(p, e, "firm\t%.6s\n", (char*)ctlr->ident+64);
- p = seprint(p, e, "geometry %llud %lud\n", u->sectors, u->secsize);
-
- return p-s;
-}
-
-static void*
-cqalloc(Ctlr *ctlr, CQ *cq, u32int lgsize)
-{
- cq->ctlr = ctlr;
- cq->head = 0;
- cq->shift = lgsize-4;
- cq->mask = (1<<cq->shift)-1;
- if((cq->base = mallocalign(1<<lgsize, ctlr->mps, 0, 0)) == nil)
- error(Enomem);
- memset(cq->base, 0, 1<<lgsize);
- return cq->base;
-}
-
-static void*
-sqalloc(Ctlr *ctlr, SQ *sq, u32int lgsize)
-{
- sq->ctlr = ctlr;
- sq->tail = 0;
- sq->shift = lgsize-6;
- sq->mask = (1<<sq->shift)-1;
- if((sq->base = mallocalign(1<<lgsize, ctlr->mps, 0, 0)) == nil)
- error(Enomem);
- if((sq->wait = mallocz(sizeof(WS*)*(sq->mask+1), 1)) == nil)
- error(Enomem);
- memset(sq->base, 0, 1<<lgsize);
- return sq->base;
-}
-
-static void
-setupqueues(Ctlr *ctlr)
-{
- u32int lgsize, st, *e;
- CQ *cq;
- SQ *sq;
- WS ws;
- int i;
-
- /* Overkill */
- lgsize = 12-6+4;
- while(lgsize < 16+4 && lgsize < ctlr->mpsshift && 1<<lgsize < conf.nmach<<12-6+4)
- lgsize++;
-
- /* CQID1: shared completion queue */
- cq = &ctlr->cq[1];
- cqalloc(ctlr, cq, lgsize);
- e = qcmd(&ws, ctlr, 1, 0x05, 0, nil, cq->base, 1<<lgsize);
- e[10] = (cq - ctlr->cq) | cq->mask<<16;
- e[11] = 3; /* IEN | PC */
- checkstatus(wcmd(&ws), "create completion queue");
-
- st = 0;
-
- /* SQID[1..nmach]: submission queue per cpu */
- for(i=1; i<=conf.nmach; i++){
- sq = &ctlr->sq[i];
- sqalloc(ctlr, sq, 12);
- e = qcmd(&ws, ctlr, 1, 0x01, 0, nil, sq->base, 0x1000);
- e[10] = i | sq->mask<<16;
- e[11] = (cq - ctlr->cq)<<16 | 1; /* CQID<<16 | PC */
-
- st = wcmd(&ws);
- if(st != 0){
- free(sq->base);
- free(sq->wait);
- memset(sq, 0, sizeof(*sq));
- break;
- }
- }
-
- ctlr->nsq = i - 1;
- if(ctlr->nsq < 1)
- checkstatus(st, "create submission queues");
-
- ilock(&ctlr->intr);
- ctlr->ints |= 1<<(cq - ctlr->cq);
- ctlr->reg[IntMc] = ctlr->ints;
- iunlock(&ctlr->intr);
-}
-
-static void
-identify(Ctlr *ctlr)
-{
- u32int *e;
- WS ws;
-
- if(ctlr->ident == nil)
- if((ctlr->ident = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil)
- error(Enomem);
- if(ctlr->nsid == nil)
- if((ctlr->nsid = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil)
- error(Enomem);
-
- e = qcmd(&ws, ctlr, 1, 0x06, 0, nil, ctlr->ident, 0x1000);
- e[10] = 1; // identify controller
- checkstatus(wcmd(&ws), "identify controller");
-
- e = qcmd(&ws, ctlr, 1, 0x06, 0, nil, ctlr->nsid, 0x1000);
- e[10] = 2; // namespace list
- if(wcmd(&ws) != 0)
- ctlr->nsid[0] = 1; /* assume namespace #1 */
-
- ctlr->nnsid = 0;
- while(ctlr->nnsid < 1024 && ctlr->nsid[ctlr->nnsid] != 0)
- ctlr->nnsid++;
-}
-
-static int
-nvmedisable(SDev *sd)
-{
- char name[32];
- Ctlr *ctlr;
- int i;
-
- ctlr = sd->ctlr;
-
- /* mask interrupts */
- ilock(&ctlr->intr);
- ctlr->ints = 0;
- ctlr->reg[IntMs] = ~ctlr->ints;
- iunlock(&ctlr->intr);
-
- /* disable controller */
- ctlr->reg[CCfg] = 0;
-
- for(i = 0; i < 10; i++){
- if((ctlr->reg[CSts] & 1) == 0)
- break;
- tsleep(&up->sleep, return0, nil, 100);
- }
-
- snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
- intrdisable(ctlr->pci->intl, nvmeintr, ctlr, ctlr->pci->tbdf, name);
-
- pciclrbme(ctlr->pci); /* dma disable */
-
- for(i=0; i<nelem(ctlr->sq); i++){
- free(ctlr->sq[i].base);
- free(ctlr->sq[i].wait);
- }
- for(i=0; i<nelem(ctlr->cq); i++)
- free(ctlr->cq[i].base);
-
- memset(ctlr->sq, 0, sizeof(ctlr->sq));
- memset(ctlr->cq, 0, sizeof(ctlr->cq));
-
- free(ctlr->ident);
- ctlr->ident = nil;
- free(ctlr->nsid);
- ctlr->nsid = nil;
- ctlr->nnsid = 0;
-
- return 1;
-}
-
-static int
-nvmeenable(SDev *sd)
-{
- char name[32];
- Ctlr *ctlr;
- u64int pa;
- int to;
-
- ctlr = sd->ctlr;
-
- snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
- intrenable(ctlr->pci->intl, nvmeintr, ctlr, ctlr->pci->tbdf, name);
-
- if(waserror()){
- print("%s: %s\n", name, up->errstr);
- nvmedisable(sd);
- sd->nunit = 0; /* hack: prevent further probing */
- return 0;
- }
-
- pa = PCIWADDR(cqalloc(ctlr, &ctlr->cq[0], ctlr->mpsshift));
- ctlr->reg[ACQBase0] = pa;
- ctlr->reg[ACQBase1] = pa>>32;
-
- pa = PCIWADDR(sqalloc(ctlr, &ctlr->sq[0], ctlr->mpsshift));
- ctlr->reg[ASQBase0] = pa;
- ctlr->reg[ASQBase1] = pa>>32;
-
- ctlr->reg[AQAttr] = ctlr->sq[0].mask | ctlr->cq[0].mask<<16;
-
- /* dma enable */
- pcisetbme(ctlr->pci);
-
- /* enable interrupt */
- ilock(&ctlr->intr);
- ctlr->ints = 1;
- ctlr->reg[IntMc] = ctlr->ints;
- iunlock(&ctlr->intr);
-
- /* enable controller */
- ctlr->reg[CCfg] = 1 | (ctlr->mpsshift-12)<<7 | 6<<16 | 4<<20;
-
- for(to = (ctlr->cap>>24) & 255; to >= 0; to--){
- tsleep(&up->sleep, return0, nil, 500);
- if((ctlr->reg[CSts] & 3) == 1)
- goto Ready;
- }
- if(ctlr->reg[CSts] & 2)
- error("fatal controller status during initialization");
- error("controller initialization timeout");
-Ready:
- identify(ctlr);
- setupqueues(ctlr);
- print("%s: using %d submit queues\n", name, ctlr->nsq);
- poperror();
-
- return 1;
-}
-
-static Ctlr*
-nvmepnpctlrs(void)
-{
- Ctlr *ctlr, *h, *t;
- Pcidev *p;
- int i;
-
- h = t = nil;
- for(p = nil; p = pcimatch(p, 0, 0);){
- if(p->ccrb != 1 || p->ccru != 8 || p->ccrp != 2)
- continue;
- if(p->mem[0].size == 0 || (p->mem[0].bar & 1) != 0)
- continue;
- if((ctlr = malloc(sizeof(*ctlr))) == nil){
- print("nvme: no memory for Ctlr\n");
- break;
- }
- pcienable(p);
- ctlr->pci = p;
- ctlr->reg = vmap(p->mem[0].bar & ~0xF, p->mem[0].size);
- if(ctlr->reg == nil){
- print("nvme: can't vmap bar0\n");
- Bad:
- if(ctlr->reg != nil)
- vunmap(ctlr->reg, p->mem[0].size);
- pcidisable(p);
- free(ctlr);
- continue;
- }
- ctlr->cap = ctlr->reg[Cap0];
- ctlr->cap |= (u64int)ctlr->reg[Cap1]<<32;
-
- /* mask interrupts */
- ctlr->ints = 0;
- ctlr->reg[IntMs] = ~ctlr->ints;
-
- /* disable controller */
- ctlr->reg[CCfg] = 0;
-
- if((ctlr->cap&(1ULL<<37)) == 0){
- print("nvme: doesnt support NVM commactlr set: %ux\n",
- (u32int)(ctlr->cap>>37) & 0xFF);
- goto Bad;
- }
-
- /* use 64K page size when possible */
- ctlr->dstrd = (ctlr->cap >> 32) & 15;
- for(i = (ctlr->cap >> 48) & 15; i < ((ctlr->cap >> 52) & 15); i++){
- if(i >= 16-12) /* 64K */
- break;
- }
- ctlr->mpsshift = i+12;
- ctlr->mps = 1 << ctlr->mpsshift;
-
- if(h == nil)
- h = ctlr;
- else
- t->next = ctlr;
- t = ctlr;
- }
-
- return h;
-}
-
-SDifc sdnvmeifc;
-
-static SDev*
-nvmepnp(void)
-{
- SDev *s, *h, *t;
- Ctlr *ctlr;
- int id;
-
- h = t = nil;
-
- id = 'N';
- for(ctlr = nvmepnpctlrs(); ctlr != nil; ctlr = ctlr->next){
- if((s = malloc(sizeof(*s))) == nil)
- break;
- s->ctlr = ctlr;
- s->idno = id++;
- s->ifc = &sdnvmeifc;
- s->nunit = 1024;
- if(h)
- t->next = s;
- else
- h = s;
- t = s;
- }
-
- return h;
-}
-
-SDifc sdnvmeifc = {
- "nvme", /* name */
-
- nvmepnp, /* pnp */
- nil, /* legacy */
- nvmeenable, /* enable */
- nvmedisable, /* disable */
-
- nvmeverify, /* verify */
- nvmeonline, /* online */
- nvmerio, /* rio */
- nvmerctl, /* rctl */
- nil, /* wctl */
-
- nvmebio, /* bio */
- nil, /* probe */
- nil, /* clear */
- nil, /* rtopctl */
- nil, /* wtopctl */
-};
--- /dev/null
+++ b/sys/src/9/port/sdnvme.c
@@ -1,0 +1,694 @@
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/pci.h"
+#include "ureg.h"
+#include "../port/error.h"
+
+#include "../port/sd.h"
+
+typedef struct WS WS;
+typedef struct CQ CQ;
+typedef struct SQ SQ;
+typedef struct Ctlr Ctlr;
+
+struct WS /* wait state of one submitted command; every caller keeps it on its stack */
+{
+ u32int cdw0; /* word 0 of the completion entry, stored by nvmeintr() */
+ ushort status; /* completion word 3 >> 17; 0 is the only value checkstatus() accepts */
+ Rendez *sleep; /* rendezvous the submitter sleeps on; qcmd() sets it to &up->sleep */
+ WS **link; /* slot in queue->wait[] that points back at this WS while pending */
+ SQ *queue; /* submission queue chosen by qcmd() */
+};
+
+struct CQ /* completion queue state; initialised by cqalloc() */
+{
+ u32int head; /* free-running index of the next entry to inspect; its wrap bit is the expected phase */
+ u32int mask; /* (1<<shift)-1, turns head into an entry index */
+ u32int shift; /* lgsize-4: log2 of the entry count for 4-word entries */
+ u32int *base; /* entry array, 4 u32int words per entry */
+ Ctlr *ctlr; /* owning controller */
+};
+
+struct SQ /* submission queue state; initialised by sqalloc() */
+{
+ u32int tail; /* free-running index of the next entry to fill; advanced by qcmd() */
+ u32int mask; /* (1<<shift)-1, turns tail into an entry index */
+ u32int shift; /* lgsize-6: log2 of the entry count for 16-word entries */
+ u32int *base; /* entry array, 16 u32int words per entry */
+ WS **wait; /* one slot per entry: the WS waiting on that entry, or nil */
+ Ctlr *ctlr; /* owning controller */
+ Lock; /* taken in qcmd() and dropped in wcmd() when conf.nmach > ctlr->nsq */
+};
+
+struct Ctlr /* per-controller state; allocated by nvmepnpctlrs() */
+{
+ QLock; /* held from qcmd() to wcmd() around admin-queue submissions */
+
+ Lock intr; /* ilock'd in nvmeintr() and whenever ints or the mask registers change */
+ u32int ints; /* bits written to IntMs/IntMc; 0 makes nvmeintr() return at once */
+ u32int irqc[2]; /* not referenced in this file */
+
+ Pcidev *pci;
+ u32int *reg; /* vmap() of BAR0, indexed with the register enum */
+
+ u64int cap; /* Cap0 | Cap1<<32 as read at probe time */
+ uchar *ident; /* 0x1000-byte reply to the identify command issued with e[10]=1 */
+ u32int *nsid; /* namespace id list, indexed by SDunit.subno */
+ int nnsid; /* number of leading non-zero entries in nsid, at most 1024 */
+
+ u32int mps; /* mps = 1<<mpsshift */
+ u32int mpsshift; /* 12 + page size index chosen in nvmepnpctlrs() */
+ u32int dstrd; /* (cap>>32)&15: shift applied to doorbell indices */
+
+ u32int nsq; /* i/o submission queues actually created by setupqueues() */
+
+ CQ cq[1+1]; /* cq[0] admin, cq[1] shared by all i/o queues */
+ SQ sq[1+MAXMACH]; /* sq[0] admin, sq[1..nsq] i/o */
+
+ Ctlr *next; /* list link set by nvmepnpctlrs() */
+};
+
+/* controller registers, expressed as u32int indices into Ctlr.reg */
+enum {
+ Cap0, /* low 32 bits of the capability word kept in Ctlr.cap */
+ Cap1, /* high 32 bits of the capability word */
+ Ver, /* not referenced in this file */
+ IntMs, /* written to mask interrupts */
+ IntMc, /* written to unmask interrupts */
+ CCfg, /* controller configuration; 0 disables, bit 0 enables */
+
+ CSts = 0x1C/4, /* controller status; bit 0 polled for ready, bit 1 treated as fatal */
+ Nssr, /* not referenced in this file */
+ AQAttr, /* admin queue sizes: sq[0].mask | cq[0].mask<<16 */
+ ASQBase0, /* admin submission queue address, low word */
+ ASQBase1, /* admin submission queue address, high word */
+ ACQBase0, /* admin completion queue address, low word */
+ ACQBase1, /* admin completion queue address, high word */
+
+ DBell = 0x1000/4, /* first doorbell; offset is (2*queue + 1 for completion) << dstrd */
+};
+
+static u32int* /* reserve and pre-fill the next entry of a submission queue; returns the 16-word entry */
+qcmd(WS *ws, Ctlr *ctlr, int adm, u32int opc, u32int nsid, void *mptr, void *data, ulong len)
+{
+ u32int cid, *e;
+ u64int pa;
+ SQ *sq;
+
+ if(!adm){ /* i/o command: use the queue assigned to this cpu */
+ Retry:
+ splhi(); /* stays in effect until wcmd() calls spllo() */
+ sq = &ctlr->sq[1+(m->machno % ctlr->nsq)];
+ if(conf.nmach > ctlr->nsq) /* fewer queues than cpus: queues are shared */
+ lock(sq);
+ } else { /* admin command: sq[0], serialised by the controller QLock until wcmd() */
+ qlock(ctlr);
+ sq = &ctlr->sq[0];
+ }
+ ws->sleep = &up->sleep;
+ ws->queue = sq;
+ ws->link = &sq->wait[sq->tail & sq->mask];
+ while(*ws->link != nil){ /* slot still owned by an uncompleted command; should be very rare */
+ if(!adm && conf.nmach > ctlr->nsq)
+ unlock(sq); /* Retry locks again: never reschedule or re-lock while still holding it */
+ sched();
+ if(!adm)
+ goto Retry; /* cpu may have changed, so pick the queue afresh */
+ }
+ *ws->link = ws; /* claim the slot; nvmeintr() clears it on completion */
+
+ e = &sq->base[((cid = sq->tail++) & sq->mask)<<4]; /* 16 words per entry; cid is the pre-increment tail */
+ e[0] = opc | cid<<16;
+ e[1] = nsid;
+ e[2] = 0;
+ e[3] = 0;
+ if(mptr != nil){ /* e[4..5]: bus address of mptr, or 0 */
+ pa = PCIWADDR(mptr);
+ e[4] = pa;
+ e[5] = pa>>32;
+ } else {
+ e[4] = 0;
+ e[5] = 0;
+ }
+ if(len > 0){
+ dmaflush(1, data, len); /* presumably writes cached data back before the device touches the buffer */
+ pa = PCIWADDR(data);
+ e[6] = pa; /* e[6..7]: bus address of data */
+ e[7] = pa>>32;
+ if(len > ctlr->mps - (pa & ctlr->mps-1)) /* transfer runs past the first mps page */
+ pa += ctlr->mps - (pa & ctlr->mps-1); /* start of the following page */
+ else
+ pa = 0;
+ } else {
+ e[6] = 0;
+ e[7] = 0;
+ pa = 0;
+ }
+ e[8] = pa; /* e[8..9]: address of the second page, or 0 */
+ e[9] = pa>>32;
+ return e; /* caller sets the words from e[10] on, then calls wcmd() */
+}
+
+static void /* interrupt handler, also polled from wcmd(): reap completion entries and wake the submitters */
+nvmeintr(Ureg *, void *arg)
+{
+ u32int phaseshift, *e;
+ WS *ws, **wp;
+ Ctlr *ctlr;
+ SQ *sq;
+ CQ *cq;
+
+ ctlr = arg;
+ if(ctlr->ints == 0) /* nothing enabled, as left by nvmedisable() and nvmepnpctlrs() */
+ return;
+
+ ilock(&ctlr->intr);
+ ctlr->reg[IntMs] = ctlr->ints; /* mask while the queues are drained */
+ for(cq = &ctlr->cq[nelem(ctlr->cq)-1]; cq >= ctlr->cq; cq--){ /* i/o queue cq[1] first, then admin cq[0] */
+ if(cq->base == nil)
+ continue;
+ phaseshift = 16 - cq->shift; /* moves the wrap bit of head (bit number shift) up to bit 16 */
+ for(;;){
+ e = &cq->base[(cq->head & cq->mask)<<2];
+ dmaflush(0, e, 32); /* presumably drops stale cache contents; NOTE(review): 32 bytes spans two 4-word entries */
+ if(((e[3] ^ (cq->head << phaseshift)) & 0x10000) == 0) /* bit 16 of e[3] equals the wrap bit: entry not written yet */
+ break;
+
+ if(0) iprint("nvmeintr: cq%d [%.4ux] %.8ux %.8ux %.8ux %.8ux\n",
+ (int)(cq - ctlr->cq), cq->head & cq->mask,
+ e[0], e[1], e[2], e[3]);
+
+ sq = &ctlr->sq[e[2] >> 16]; /* upper half of e[2] selects the submission queue */
+ wp = &sq->wait[e[3] & sq->mask]; /* low bits of e[3] are the cid stored by qcmd(), i.e. the slot index */
+ if((ws = *wp) != nil && ws->link == wp){
+ Rendez *z = ws->sleep; /* fetched first; presumably because ws may vanish once the slot is cleared */
+ ws->cdw0 = e[0];
+ ws->status = e[3]>>17;
+ *wp = nil; /* makes wdone() true for this WS */
+ wakeup(z);
+ }
+ ctlr->reg[DBell + ((cq-ctlr->cq)*2+1 << ctlr->dstrd)] = ++cq->head & cq->mask; /* advance head and post it to this queue's completion doorbell */
+ }
+ }
+ ctlr->reg[IntMc] = ctlr->ints; /* unmask again */
+ iunlock(&ctlr->intr);
+}
+
+static int /* sleep condition: nonzero once the wait slot no longer points at this WS */
+wdone(void *arg)
+{
+ WS *w = arg;
+ return w != *w->link;
+}
+
+static u32int /* post the entry prepared by qcmd(), release what qcmd() locked, wait for completion; returns ws->status */
+wcmd(WS *ws, u32int *e)
+{
+ SQ *sq = ws->queue;
+ Ctlr *ctlr = sq->ctlr;
+
+ if(e != nil) dmaflush(1, e, 64); /* the 16-word entry; presumably written back so the device reads its final contents */
+ coherence();
+ ctlr->reg[DBell + ((sq-ctlr->sq)*2+0 << ctlr->dstrd)] = sq->tail & sq->mask; /* submission doorbell of this queue gets the new tail */
+ if(sq > ctlr->sq) { /* i/o queue: undo the lock and splhi() taken in qcmd() */
+ assert(sq == &ctlr->sq[1+(m->machno % ctlr->nsq)]);
+ if(conf.nmach > ctlr->nsq)
+ unlock(sq);
+ spllo();
+ } else /* admin queue: drop the QLock taken in qcmd() */
+ qunlock(sq->ctlr);
+ while(waserror()) /* presumably restarts the wait when an error is raised while sleeping, as ws must outlive the command */
+ ;
+ tsleep(ws->sleep, wdone, ws, 5);
+ while(!wdone(ws)){ /* not completed yet: poll the queues by hand, then sleep again */
+ nvmeintr(nil, ctlr);
+ tsleep(ws->sleep, wdone, ws, 10);
+ }
+ poperror();
+ return ws->status;
+}
+
+void
+checkstatus(u32int status, char *info)
+{
+ if(status == 0)
+ return;
+ snprint(up->genbuf, sizeof(up->genbuf), "%s: status %ux", info, status);
+ error(up->genbuf);
+}
+
+static long /* transfer count sectors starting at lba between a and the unit; returns bytes moved, raises error on a bad status */
+nvmebio(SDunit *u, int lun, int write, void *a, long count, uvlong lba)
+{
+ u32int nsid, s, n, m, *e;
+ Ctlr *ctlr;
+ uchar *p;
+ WS ws;
+
+ USED(lun);
+
+ ctlr = u->dev->ctlr;
+ nsid = ctlr->nsid[u->subno];
+ s = u->secsize;
+ p = a;
+ while(count > 0){ /* one command per chunk */
+ m = (2*ctlr->mps - ((uintptr)p & ctlr->mps-1)) / s; /* sectors that fit in the two pages qcmd() can describe */
+ if((n = count) > m)
+ n = m;
+ e = qcmd(&ws, ctlr, 0, write ? 0x01 : 0x02, nsid, nil, p, n*s); /* opcode 0x01 for write, 0x02 for read */
+ e[10] = lba;
+ e[11] = lba>>32;
+ e[12] = n-1; /* sector count minus one */
+ e[13] = (count>n)<<6; /* sequential request */
+ e[14] = 0;
+ e[15] = 0;
+ checkstatus(wcmd(&ws, e), write ? "write" : "read");
+ p += n*s;
+ count -= n;
+ lba += n;
+ }
+ if(!write) dmaflush(0, a, p - (uchar*)a); /* presumably invalidates cached copies of the data just read */
+ return p - (uchar*)a;
+}
+
+static int /* rio entry of sdnvmeifc: answer faked SCSI requests, passing reads and writes on to nvmebio() */
+nvmerio(SDreq *r)
+{
+ int i, count, rw;
+ uvlong lba;
+ SDunit *u;
+
+ u = r->unit;
+ if(r->cmd[0] == 0x35 || r->cmd[0] == 0x91) /* reported as ok without any work; presumably the SCSI cache-flush opcodes -- TODO confirm */
+ return sdsetsense(r, SDok, 0, 0, 0);
+ if((i = sdfakescsi(r)) != SDnostatus) /* request fully handled by the generic emulation */
+ return r->status = i;
+ if((i = sdfakescsirw(r, &lba, &count, &rw)) != SDnostatus) /* otherwise decode lba, count and direction */
+ return i;
+ r->rlen = nvmebio(u, r->lun, rw == SDwrite, r->data, count, lba);
+ return r->status = SDok;
+}
+
+static int /* nonzero iff the unit's subno is below the number of namespaces found by identify() */
+nvmeverify(SDunit *u)
+{
+ Ctlr *c = u->dev->ctlr;
+ return c->nnsid > u->subno;
+}
+
+static int /* read size and sector size of the unit's namespace; 1 if already known, 0 on failure, 2 when freshly set */
+nvmeonline(SDunit *u)
+{
+ u32int *e, lbaf;
+ uchar *info, *p;
+ Ctlr *ctlr;
+ WS ws;
+
+ if(u->sectors != 0)
+ return 1;
+
+ ctlr = u->dev->ctlr;
+ if((info = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil)
+ return 0;
+
+ e = qcmd(&ws, ctlr, 1, 0x06, ctlr->nsid[u->subno], nil, info, 0x1000);
+ e[10] = 0; // identify namespace
+ if(wcmd(&ws, e) != 0){
+ free(info);
+ return 0;
+ }
+ dmaflush(0, info, 0x1000); /* presumably invalidates cached copies before the reply is parsed */
+ p = info;
+ u->sectors = p[0] | p[1]<<8 | p[2]<<16 | (u64int)p[3]<<24 /* widen first: an int p[3]<<24 would sign-extend bit 31 into the top half */
+ | (u64int)p[4]<<32
+ | (u64int)p[5]<<40
+ | (u64int)p[6]<<48
+ | (u64int)p[7]<<56;
+ p = &info[128 + 4*(info[26]&15)]; /* 4-byte format entry selected by the low nibble of byte 26 */
+ lbaf = p[0] | p[1]<<8 | p[2]<<16 | p[3]<<24;
+ u->secsize = 1<<((lbaf>>16)&0xFF); /* bits 16..23 hold log2 of the sector size */
+ free(info);
+
+ memset(u->inquiry, 0, sizeof u->inquiry);
+ u->inquiry[2] = 2;
+ u->inquiry[3] = 2;
+ u->inquiry[4] = sizeof u->inquiry - 4;
+ memmove(u->inquiry+8, ctlr->ident+24, 20); /* same 20 bytes nvmerctl() prints as the model */
+
+ return 2;
+}
+
+static int /* print model, serial, firmware and geometry into p[0..l); returns p advanced minus its start */
+nvmerctl(SDunit *u, char *p, int l)
+{
+ char *start, *end, *id;
+ Ctlr *c;
+
+ c = u->dev->ctlr;
+ if(c == nil || c->ident == nil)
+ return 0; /* no controller or no identify data to report */
+ id = (char*)c->ident;
+ start = p;
+ end = p + l;
+ p = seprint(p, end, "model\t%.20s\n", id+24);
+ p = seprint(p, end, "serial\t%.10s\n", id+4);
+ p = seprint(p, end, "firm\t%.6s\n", id+64);
+ p = seprint(p, end, "geometry %llud %lud\n", u->sectors, u->secsize);
+
+ return p - start;
+}
+
+static void* /* initialise cq with 1<<lgsize bytes of zeroed, mps-aligned entries; returns the entry array, raises Enomem */
+cqalloc(Ctlr *ctlr, CQ *cq, u32int lgsize)
+{
+ cq->shift = lgsize-4;
+ cq->mask = (1<<cq->shift)-1;
+ cq->head = 0;
+ cq->ctlr = ctlr;
+ cq->base = mallocalign(1<<lgsize, ctlr->mps, 0, 0);
+ if(cq->base == nil)
+ error(Enomem);
+ return memset(cq->base, 0, 1<<lgsize); /* memset hands back cq->base */
+}
+
+static void* /* initialise sq with 1<<lgsize bytes of zeroed entries plus one wait slot per entry; returns the entry array, raises Enomem */
+sqalloc(Ctlr *ctlr, SQ *sq, u32int lgsize)
+{
+ sq->shift = lgsize-6;
+ sq->mask = (1<<sq->shift)-1;
+ sq->tail = 0;
+ sq->ctlr = ctlr;
+ sq->base = mallocalign(1<<lgsize, ctlr->mps, 0, 0);
+ if(sq->base == nil)
+ error(Enomem);
+ if((sq->wait = mallocz((sq->mask+1)*sizeof(WS*), 1)) == nil)
+ error(Enomem);
+ return memset(sq->base, 0, 1<<lgsize); /* memset hands back sq->base */
+}
+
+static void /* create the shared i/o completion queue and up to conf.nmach i/o submission queues; raises error on failure */
+setupqueues(Ctlr *ctlr)
+{
+ u32int lgsize, st, *e;
+ CQ *cq;
+ SQ *sq;
+ WS ws;
+ int i;
+
+ /* Overkill */
+ lgsize = 12-6+4; /* log2 of the completion queue size in bytes; starts at 1<<10 */
+ while(lgsize < 16+4 && lgsize < ctlr->mpsshift && 1<<lgsize < conf.nmach<<12-6+4) /* grow towards 1<<10 bytes per cpu, capped by mpsshift and 1<<20 */
+ lgsize++;
+
+ /* CQID1: shared completion queue */
+ cq = &ctlr->cq[1];
+ cqalloc(ctlr, cq, lgsize);
+ e = qcmd(&ws, ctlr, 1, 0x05, 0, nil, cq->base, 1<<lgsize);
+ e[10] = (cq - ctlr->cq) | cq->mask<<16; /* queue id | index of the last entry */
+ e[11] = 3; /* IEN | PC */
+ checkstatus(wcmd(&ws, e), "create completion queue");
+
+ st = 0;
+
+ /* SQID[1..nmach]: submission queue per cpu */
+ for(i=1; i<=conf.nmach; i++){
+ sq = &ctlr->sq[i];
+ sqalloc(ctlr, sq, 12); /* 1<<12 bytes, i.e. 64 entries of 16 words */
+ e = qcmd(&ws, ctlr, 1, 0x01, 0, nil, sq->base, 0x1000);
+ e[10] = i | sq->mask<<16;
+ e[11] = (cq - ctlr->cq)<<16 | 1; /* CQID<<16 | PC */
+ st = wcmd(&ws, e);
+ if(st != 0){ /* creation failed: release this queue and keep the ones made so far */
+ free(sq->base);
+ free(sq->wait);
+ memset(sq, 0, sizeof(*sq));
+ break;
+ }
+ }
+
+ ctlr->nsq = i - 1;
+ if(ctlr->nsq < 1) /* not even one queue: report the failing status */
+ checkstatus(st, "create submission queues");
+
+ ilock(&ctlr->intr);
+ ctlr->ints |= 1<<(cq - ctlr->cq); /* add the bit numbered like the completion queue */
+ ctlr->reg[IntMc] = ctlr->ints;
+ iunlock(&ctlr->intr);
+}
+
+static void /* fill ctlr->ident and ctlr->nsid via admin commands and count the namespaces; raises error on failure */
+identify(Ctlr *ctlr)
+{
+ u32int *e;
+ WS ws;
+
+ if(ctlr->ident == nil) /* buffers are kept until nvmedisable() frees them */
+ if((ctlr->ident = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil)
+ error(Enomem);
+ if(ctlr->nsid == nil)
+ if((ctlr->nsid = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil)
+ error(Enomem);
+
+ e = qcmd(&ws, ctlr, 1, 0x06, 0, nil, ctlr->ident, 0x1000);
+ e[10] = 1; // identify controller
+ checkstatus(wcmd(&ws, e), "identify controller");
+ dmaflush(0, ctlr->ident, 0x1000); /* presumably invalidates cached copies of the reply */
+
+ e = qcmd(&ws, ctlr, 1, 0x06, 0, nil, ctlr->nsid, 0x1000);
+ e[10] = 2; // namespace list
+ if(wcmd(&ws, e) == 0)
+ dmaflush(0, ctlr->nsid, 0x1000);
+ else /* list command failed: fall back to a single namespace */
+ ctlr->nsid[0] = 1; /* assume namespace #1 */
+
+ ctlr->nnsid = 0;
+ while(ctlr->nnsid < 1024 && ctlr->nsid[ctlr->nnsid] != 0) /* list ends at the first zero id; 1024 u32int fill the 0x1000 buffer */
+ ctlr->nnsid++;
+}
+
+static int /* disable entry of sdnvmeifc: quiesce the controller and free queue and identify memory; always returns 1 */
+nvmedisable(SDev *sd)
+{
+ char name[32];
+ Ctlr *ctlr;
+ int i;
+
+ ctlr = sd->ctlr;
+
+ /* mask interrupts */
+ ilock(&ctlr->intr);
+ ctlr->ints = 0; /* also turns nvmeintr() into a no-op */
+ ctlr->reg[IntMs] = ~ctlr->ints; /* all ones */
+ iunlock(&ctlr->intr);
+
+ /* disable controller */
+ ctlr->reg[CCfg] = 0;
+
+ for(i = 0; i < 10; i++){ /* give status bit 0 up to ten sleeps of 100 to clear */
+ if((ctlr->reg[CSts] & 1) == 0)
+ break;
+ tsleep(&up->sleep, return0, nil, 100);
+ }
+
+ snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name); /* same name nvmeenable() registered with */
+ intrdisable(ctlr->pci->intl, nvmeintr, ctlr, ctlr->pci->tbdf, name);
+
+ pciclrbme(ctlr->pci); /* dma disable */
+
+ for(i=0; i<nelem(ctlr->sq); i++){ /* never-created queues hold nil pointers */
+ free(ctlr->sq[i].base);
+ free(ctlr->sq[i].wait);
+ }
+ for(i=0; i<nelem(ctlr->cq); i++)
+ free(ctlr->cq[i].base);
+
+ memset(ctlr->sq, 0, sizeof(ctlr->sq)); /* nil bases make nvmeintr() skip the queues */
+ memset(ctlr->cq, 0, sizeof(ctlr->cq));
+
+ free(ctlr->ident);
+ ctlr->ident = nil; /* identify() reallocates on the next enable */
+ free(ctlr->nsid);
+ ctlr->nsid = nil;
+ ctlr->nnsid = 0;
+
+ return 1;
+}
+
+static int /* enable entry of sdnvmeifc: set up admin queues, start the controller, identify it, create i/o queues; 1 on success, 0 on failure */
+nvmeenable(SDev *sd)
+{
+ char name[32];
+ Ctlr *ctlr;
+ u64int pa;
+ int to;
+
+ ctlr = sd->ctlr;
+
+ snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
+ intrenable(ctlr->pci->intl, nvmeintr, ctlr, ctlr->pci->tbdf, name);
+
+ if(waserror()){ /* any error() raised below lands here: undo everything and give up */
+ print("%s: %s\n", name, up->errstr);
+ nvmedisable(sd);
+ sd->nunit = 0; /* hack: prevent further probing */
+ return 0;
+ }
+
+ pa = PCIWADDR(cqalloc(ctlr, &ctlr->cq[0], ctlr->mpsshift)); /* admin completion queue, one mps page */
+ dmaflush(1, ctlr->cq[0].base, 1<<ctlr->mpsshift); /* presumably pushes the zeroed queue out of the cache */
+ ctlr->reg[ACQBase0] = pa;
+ ctlr->reg[ACQBase1] = pa>>32;
+
+ pa = PCIWADDR(sqalloc(ctlr, &ctlr->sq[0], ctlr->mpsshift)); /* admin submission queue, one mps page */
+ dmaflush(1, ctlr->sq[0].base, 1<<ctlr->mpsshift);
+ ctlr->reg[ASQBase0] = pa;
+ ctlr->reg[ASQBase1] = pa>>32;
+
+ ctlr->reg[AQAttr] = ctlr->sq[0].mask | ctlr->cq[0].mask<<16; /* last entry index of each admin queue */
+
+ /* dma enable */
+ pcisetbme(ctlr->pci);
+
+ /* enable interrupt */
+ ilock(&ctlr->intr);
+ ctlr->ints = 1; /* bit 0 only, numbered like cq[0]; setupqueues() adds the bit for cq[1] */
+ ctlr->reg[IntMc] = ctlr->ints;
+ iunlock(&ctlr->intr);
+
+ /* enable controller */
+ ctlr->reg[CCfg] = 1 | (ctlr->mpsshift-12)<<7 | 6<<16 | 4<<20; /* enable bit, page size index, and 6/4 matching the lgsize-6 / lgsize-4 shifts of sqalloc()/cqalloc() */
+
+ for(to = (ctlr->cap>>24) & 255; to >= 0; to--){ /* retry count from cap bits 24..31, one tsleep of 500 per try */
+ tsleep(&up->sleep, return0, nil, 500);
+ if((ctlr->reg[CSts] & 3) == 1) /* ready bit set, fatal bit clear */
+ goto Ready;
+ }
+ if(ctlr->reg[CSts] & 2)
+ error("fatal controller status during initialization");
+ error("controller initialization timeout");
+Ready:
+ identify(ctlr);
+ setupqueues(ctlr);
+ print("%s: using %d submit queues\n", name, ctlr->nsq);
+ poperror();
+
+ return 1;
+}
+
+static Ctlr* /* scan PCI for NVMe controllers; returns a next-linked list of disabled, masked Ctlrs, or nil */
+nvmepnpctlrs(void)
+{
+ Ctlr *ctlr, *h, *t;
+ Pcidev *p;
+ int i;
+
+ h = t = nil;
+ for(p = nil; p = pcimatch(p, 0, 0);){
+ if(p->ccrb != 1 || p->ccru != 8 || p->ccrp != 2) /* only class code 1/8/2 (ccrb/ccru/ccrp) */
+ continue;
+ if(p->mem[0].size == 0 || (p->mem[0].bar & 1) != 0) /* BAR0 must exist; bit 0 set presumably marks an i/o-port BAR */
+ continue;
+ if((ctlr = malloc(sizeof(*ctlr))) == nil){
+ print("nvme: no memory for Ctlr\n");
+ break;
+ }
+ pcienable(p);
+ ctlr->pci = p;
+ ctlr->reg = vmap(p->mem[0].bar & ~0xF, p->mem[0].size);
+ if(ctlr->reg == nil){
+ print("nvme: can't vmap bar0\n");
+ Bad: /* undo what was done for this device and try the next one */
+ if(ctlr->reg != nil)
+ vunmap(ctlr->reg, p->mem[0].size);
+ pcidisable(p);
+ free(ctlr);
+ continue;
+ }
+ ctlr->cap = ctlr->reg[Cap0];
+ ctlr->cap |= (u64int)ctlr->reg[Cap1]<<32;
+
+ /* mask interrupts */
+ ctlr->ints = 0;
+ ctlr->reg[IntMs] = ~ctlr->ints;
+
+ /* disable controller */
+ ctlr->reg[CCfg] = 0;
+
+ if((ctlr->cap&(1ULL<<37)) == 0){ /* capability bit 37 is required; the message prints the 8 bits starting there */
+ print("nvme: doesn't support NVM command set: %ux\n",
+ (u32int)(ctlr->cap>>37) & 0xFF);
+ goto Bad;
+ }
+
+ /* use 64K page size when possible */
+ ctlr->dstrd = (ctlr->cap >> 32) & 15;
+ for(i = (ctlr->cap >> 48) & 15; i < ((ctlr->cap >> 52) & 15); i++){ /* from the minimum (bits 48..51) towards the maximum (bits 52..55) */
+ if(i >= 16-12) /* 64K */
+ break;
+ }
+ ctlr->mpsshift = i+12;
+ ctlr->mps = 1 << ctlr->mpsshift;
+
+ if(h == nil) /* append to the list; next of the last element is never written here */
+ h = ctlr;
+ else
+ t->next = ctlr;
+ t = ctlr;
+ }
+
+ return h;
+}
+
+SDifc sdnvmeifc;
+
+static SDev* /* pnp entry of sdnvmeifc: wrap each probed controller in an SDev with ids 'N', 'O', ...; returns the list head */
+nvmepnp(void)
+{
+ SDev *head, **tailp, *sd;
+ Ctlr *c;
+ int id;
+
+ head = nil;
+ tailp = &head; /* where the next SDev gets linked in */
+
+ id = 'N';
+ for(c = nvmepnpctlrs(); c != nil; c = c->next){
+ sd = malloc(sizeof(*sd));
+ if(sd == nil)
+ break;
+ sd->ctlr = c;
+ sd->idno = id++;
+ sd->ifc = &sdnvmeifc;
+ sd->nunit = 1024;
+
+ *tailp = sd;
+ tailp = &sd->next;
+ }
+
+ return head;
+}
+
+SDifc sdnvmeifc = { /* entry points of this driver, in SDifc field order (type from ../port/sd.h) */
+ "nvme", /* name */
+
+ nvmepnp, /* pnp */
+ nil, /* legacy */
+ nvmeenable, /* enable */
+ nvmedisable, /* disable */
+
+ nvmeverify, /* verify */
+ nvmeonline, /* online */
+ nvmerio, /* rio */
+ nvmerctl, /* rctl */
+ nil, /* wctl */
+
+ nvmebio, /* bio */
+ nil, /* probe */
+ nil, /* clear */
+ nil, /* rtopctl */
+ nil, /* wtopctl */
+};