ref: a5db044fd3ccb8f021d4deb07b3dc732ed41d699
dir: /sys/src/9/pc/devvmx.c/
#include "u.h" #include "../port/lib.h" #include "mem.h" #include "dat.h" #include "fns.h" #include "../port/error.h" #include "ureg.h" extern int vmxon(u64int); extern int vmxoff(void); extern int vmclear(u64int); extern int vmptrld(u64int); extern int vmlaunch(Ureg *, int); extern int vmread(u32int, uintptr *); extern int vmwrite(u32int, uintptr); extern int invept(u32int, uvlong, uvlong); extern int invvpid(u32int, uvlong, uvlong); static int gotvmx; static vlong procb_ctls, pinb_ctls; enum { VMX_BASIC_MSR = 0x480, VMX_PINB_CTLS_MSR = 0x481, VMX_PROCB_CTLS_MSR = 0x482, VMX_VMEXIT_CTLS_MSR = 0x483, VMX_VMENTRY_CTLS_MSR = 0x484, VMX_MISC_MSR = 0x485, VMX_CR0_FIXED0 = 0x486, VMX_CR0_FIXED1 = 0x487, VMX_CR4_FIXED0 = 0x488, VMX_CR4_FIXED1 = 0x489, VMX_VMCS_ENUM = 0x48A, VMX_PROCB_CTLS2_MSR = 0x48B, VMX_TRUE_PINB_CTLS_MSR = 0x48D, VMX_TRUE_PROCB_CTLS_MSR = 0x48E, VMX_TRUE_EXIT_CTLS_MSR = 0x48F, VMX_TRUE_ENTRY_CTLS_MSR = 0x490, VMX_VMFUNC_MSR = 0x491, PINB_CTLS = 0x4000, PINB_EXITIRQ = 1<<0, PINB_EXITNMI = 1<<3, PROCB_CTLS = 0x4002, PROCB_IRQWIN = 1<<2, PROCB_TSCOFFSET = 1<<3, PROCB_EXITHLT = 1<<7, PROCB_EXITINVLPG = 1<<9, PROCB_EXITMWAIT = 1<<10, PROCB_EXITRDPMC = 1<<11, PROCB_EXITRDTSC = 1<<12, PROCB_EXITCR3LD = 1<<15, PROCB_EXITCR3ST = 1<<16, PROCB_EXITCR8LD = 1<<19, PROCB_EXITCR8ST = 1<<20, PROCB_EXITMOVDR = 1<<23, PROCB_EXITIO = 1<<24, PROCB_MONTRAP = 1<<27, PROCB_MSRBITMAP = 1<<28, PROCB_EXITMONITOR = 1<<29, PROCB_EXITPAUSE = 1<<30, PROCB_USECTLS2 = 1<<31, PROCB_CTLS2 = 0x401E, PROCB_EPT = 1<<1, PROCB_EXITGDT = 1<<2, PROCB_VPID = 1<<5, PROCB_UNRESTR = 1<<7, EXC_BITMAP = 0x4004, PFAULT_MASK = 0x4006, PFAULT_MATCH = 0x4008, CR3_TARGCNT = 0x400a, MSR_BITMAP = 0x2004, VMEXIT_CTLS = 0x400c, VMEXIT_ST_DEBUG = 1<<2, VMEXIT_HOST64 = 1<<9, VMEXIT_LD_IA32_PERF_GLOBAL_CTRL = 1<<12, VMEXIT_ST_IA32_PAT = 1<<18, VMEXIT_LD_IA32_PAT = 1<<19, VMEXIT_ST_IA32_EFER = 1<<20, VMEXIT_LD_IA32_EFER = 1<<21, VMEXIT_MSRSTCNT = 0x400e, VMEXIT_MSRLDCNT = 0x4010, VMEXIT_MSRSTADDR = 0x2006, VMEXIT_MSRLDADDR = 0x2008, VMENTRY_MSRLDADDR = 0x200A, VMENTRY_CTLS = 0x4012, VMENTRY_LD_DEBUG = 1<<2, VMENTRY_GUEST64 = 1<<9, VMENTRY_LD_IA32_PERF_GLOBAL_CTRL = 1<<13, VMENTRY_LD_IA32_PAT = 1<<14, VMENTRY_LD_IA32_EFER = 1<<15, VMENTRY_MSRLDCNT = 0x4014, VMENTRY_INTRINFO = 0x4016, VMENTRY_INTRCODE = 0x4018, VMENTRY_INTRILEN = 0x401a, VMCS_TSC_OFFSET = 0x2010, VMCS_LINK = 0x2800, GUEST_ES = 0x800, GUEST_CS = 0x802, GUEST_SS = 0x804, GUEST_DS = 0x806, GUEST_FS = 0x808, GUEST_GS = 0x80A, GUEST_LDTR = 0x80C, GUEST_TR = 0x80E, GUEST_CR0 = 0x6800, GUEST_CR3 = 0x6802, GUEST_CR4 = 0x6804, GUEST_ESLIMIT = 0x4800, GUEST_CSLIMIT = 0x4802, GUEST_SSLIMIT = 0x4804, GUEST_DSLIMIT = 0x4806, GUEST_FSLIMIT = 0x4808, GUEST_GSLIMIT = 0x480A, GUEST_LDTRLIMIT = 0x480C, GUEST_TRLIMIT = 0x480E, GUEST_GDTRLIMIT = 0x4810, GUEST_IDTRLIMIT = 0x4812, GUEST_ESPERM = 0x4814, GUEST_CSPERM = 0x4816, GUEST_SSPERM = 0x4818, GUEST_DSPERM = 0x481A, GUEST_FSPERM = 0x481C, GUEST_GSPERM = 0x481E, GUEST_LDTRPERM = 0x4820, GUEST_TRPERM = 0x4822, GUEST_CR0MASK = 0x6000, GUEST_CR4MASK = 0x6002, GUEST_CR0SHADOW = 0x6004, GUEST_CR4SHADOW = 0x6006, GUEST_ESBASE = 0x6806, GUEST_CSBASE = 0x6808, GUEST_SSBASE = 0x680A, GUEST_DSBASE = 0x680C, GUEST_FSBASE = 0x680E, GUEST_GSBASE = 0x6810, GUEST_LDTRBASE = 0x6812, GUEST_TRBASE = 0x6814, GUEST_GDTRBASE = 0x6816, GUEST_IDTRBASE = 0x6818, GUEST_DR7 = 0x681A, GUEST_RSP = 0x681C, GUEST_RIP = 0x681E, GUEST_RFLAGS = 0x6820, GUEST_IA32_DEBUGCTL = 0x2802, GUEST_IA32_PAT = 0x2804, GUEST_IA32_EFER = 0x2806, GUEST_IA32_PERF_GLOBAL_CTRL = 0x2808, HOST_ES = 0xC00, HOST_CS = 0xC02, HOST_SS = 0xC04, HOST_DS = 0xC06, HOST_FS = 0xC08, HOST_GS = 0xC0A, HOST_TR = 0xC0C, HOST_CR0 = 0x6C00, HOST_CR3 = 0x6C02, HOST_CR4 = 0x6C04, HOST_FSBASE = 0x6C06, HOST_GSBASE = 0x6C08, HOST_TRBASE = 0x6C0A, HOST_GDTR = 0x6C0C, HOST_IDTR = 0x6C0E, HOST_RSP = 0x6C14, HOST_RIP = 0x6C16, HOST_IA32_PAT = 0x2C00, HOST_IA32_EFER = 0x2C02, HOST_IA32_PERF_GLOBAL_CTRL = 0x2C04, GUEST_CANINTR = 0x4824, VM_INSTRERR = 0x4400, VM_EXREASON = 0x4402, VM_EXINTRINFO = 0x4404, VM_EXINTRCODE = 0x4406, VM_IDTVECINFO = 0x4408, VM_IDTVECCODE = 0x440A, VM_EXINSTRLEN = 0x440C, VM_EXINSTRINFO = 0x440E, VM_EXQUALIF = 0x6400, VM_IORCX = 0x6402, VM_IORSI = 0x6404, VM_IORDI = 0x6406, VM_IORIP = 0x6408, VM_GUESTVA = 0x640A, VM_GUESTPA = 0x2400, VM_VPID = 0x000, VM_EPTPIDX = 0x0004, VM_EPTP = 0x201A, VM_EPTPLA = 0x2024, INVLOCAL = 1, }; enum { CR0RSVD = 0x1ffaffc0, CR4RSVD = 0xff889000, CR4MCE = 1<<6, CR4VMXE = 1<<13, CR4SMXE = 1<<14, CR4PKE = 1<<22, CR0KERNEL = CR0RSVD | 0x30 | (uintptr)0xFFFFFFFF00000000ULL, CR4KERNEL = CR4RSVD | CR4VMXE | CR4SMXE | CR4MCE | CR4PKE | (uintptr)0xFFFFFFFF00000000ULL }; enum { MAXMSR = 512, }; typedef struct VmxMach VmxMach; typedef struct Vmx Vmx; typedef struct VmCmd VmCmd; typedef struct VmMem VmMem; typedef struct VmIntr VmIntr; struct VmMem { uvlong addr; Segment *seg; uintptr off; char *name; VmMem *next, *prev; u16int attr; }; struct VmxMach { char vmxon[4096]; /* has to be at the start for alignment */ QLock; int vms; Vmx *active; }; #define vmxmach ((VmxMach*)((m)->vmx)) #define vmxmachp(n) ((VmxMach*)(MACHP(n)->vmx)) struct VmIntr { u32int info, code, ilen; }; struct Vmx { uchar vmcs[4096]; /* page aligned */ uvlong pml4[512]; /* page aligned */ u32int msrbits[1024]; /* page aligned */ FPsave fp; /* page aligned */ u64int msrhost[MAXMSR*2]; /* 16 byte aligned */ u64int msrguest[MAXMSR*2]; /* 16 byte aligned */ enum { VMXINIT, VMXREADY, VMXRUNNING, VMXDEAD, VMXENDING, } state; int index, machno; char errstr[ERRMAX]; Ureg ureg; uvlong tscoffset; u32int procbctls; uintptr cr2; uintptr xcr0; uintptr dr[8]; /* DR7 is also kept in VMCS */ u8int launched; u8int vpid; enum { FLUSHVPID = 1, FLUSHEPT = 2, STEP = 4, POSTEX = 8, POSTIRQ = 16, } onentry; Rendez cmdwait; Lock cmdlock; VmCmd *firstcmd, **lastcmd; VmCmd *postponed; VmMem mem; enum { GOTEXIT = 1, GOTIRQACK = 2, } got; VmIntr exc, irq, irqack; int nmsr; }; struct VmCmd { enum { CMDFDONE = 1, CMDFFAIL = 2, CMDFPOSTP = 4, } flags; u8int scratched; Rendez; Lock; int (*cmd)(VmCmd *, va_list); int retval; char *errstr; va_list va; VmCmd *next; Vmx *vmx; }; static char Equit[] = "vmx: ending"; static char *statenames[] = { [VMXINIT] "init", [VMXREADY] "ready", [VMXRUNNING] "running", [VMXDEAD] "dead", [VMXENDING]"ending" }; static Vmx *moribund; static QLock vmxtablock; static Vmx **vmxtab; static int nvmxtab; void vmxprocrestore(Proc *p) { int s; Vmx *vmx; s = splhi(); vmx = p->vmx; if(vmxmach->active != vmx){ if(vmx != nil && vmptrld(PADDR(vmx->vmcs)) < 0) panic("VMPTRLD(%p) failed", vmx->vmcs); vmxmach->active = vmx; } splx(s); } static u64int vmcsread(u32int addr) { int rc; u64int val; val = 0; rc = vmread(addr, (uintptr *) &val); if(rc >= 0 && sizeof(uintptr) == 4 && (addr & 0x6000) == 0x2000) rc = vmread(addr | 1, (uintptr *) &val + 1); if(rc < 0){ char errbuf[128]; snprint(errbuf, sizeof(errbuf), "vmcsread failed (%#.4ux)", addr); error(errbuf); } return val; } static void vmcswrite(u32int addr, u64int val) { int rc; rc = vmwrite(addr, val); if(rc >= 0 && sizeof(uintptr) == 4 && (addr & 0x6000) == 0x2000) rc = vmwrite(addr | 1, val >> 32); if(rc < 0){ char errbuf[128]; snprint(errbuf, sizeof(errbuf), "vmcswrite failed (%#.4ux = %#.16ullx)", addr, val); error(errbuf); } } static uvlong parseval(char *s) { uvlong v; char *p; v = strtoull(s, &p, 0); if(p == s || *p != 0) error("invalid value"); return v; } static char * cr0fakeread(Vmx *, char *p, char *e) { uvlong guest, mask, shadow; guest = vmcsread(GUEST_CR0); mask = vmcsread(GUEST_CR0MASK); shadow = vmcsread(GUEST_CR0SHADOW); return seprint(p, e, "%#.*ullx", sizeof(uintptr) * 2, guest & ~mask | shadow & mask); } static char * cr4fakeread(Vmx *, char *p, char *e) { uvlong guest, mask, shadow; guest = vmcsread(GUEST_CR4); mask = vmcsread(GUEST_CR4MASK); shadow = vmcsread(GUEST_CR4SHADOW); return seprint(p, e, "%#.*ullx", sizeof(uintptr) * 2, guest & ~mask | shadow & mask); } static void updatelma(void) { uvlong cr0, efer, nefer, ectrl; if(sizeof(uintptr) != 8) return; cr0 = vmcsread(GUEST_CR0); efer = vmcsread(GUEST_IA32_EFER); nefer = efer & ~0x400 | efer << 2 & cr0 >> 21 & 0x400; if(efer == nefer) return; vmcswrite(GUEST_IA32_EFER, nefer); ectrl = vmcsread(VMENTRY_CTLS); ectrl = ectrl & ~0x200 | nefer >> 1 & 0x200; vmcswrite(VMENTRY_CTLS, ectrl); } static int cr0realwrite(Vmx *, char *s) { uvlong v; v = parseval(s); vmcswrite(GUEST_CR0, vmcsread(GUEST_CR0) & CR0KERNEL | v & ~CR0KERNEL); updatelma(); return 0; } static int cr0maskwrite(Vmx *, char *s) { uvlong v; v = parseval(s); vmcswrite(GUEST_CR0MASK, v | CR0KERNEL); return 0; } static int eferwrite(Vmx *, char *s) { uvlong v; v = parseval(s); vmcswrite(GUEST_IA32_EFER, v); updatelma(); return 0; } static int cr4realwrite(Vmx *, char *s) { uvlong v; v = parseval(s); vmcswrite(GUEST_CR4, vmcsread(GUEST_CR4) & CR4KERNEL | v & ~CR4KERNEL); return 0; } static int cr4maskwrite(Vmx *, char *s) { uvlong v; v = parseval(s); vmcswrite(GUEST_CR4MASK, v | CR4KERNEL); return 0; } static int dr7write(Vmx *vmx, char *s) { uvlong v; v = (u32int) parseval(s); vmcswrite(GUEST_DR7, vmx->dr[7] = (u32int) v); return 0; } static int xcr0write(Vmx *vmx, char *s) { vmx->xcr0 = (parseval(s) | 1) & m->xcr0; return 0; } static int readonly(Vmx *, char *) { return -1; } static int dr6write(Vmx *vmx, char *s) { uvlong v; v = parseval(s); vmx->dr[6] = (u32int) v; return 0; } typedef struct GuestReg GuestReg; struct GuestReg { int offset; u8int size; /* in bytes; 0 means == uintptr */ char *name; char *(*read)(Vmx *, char *, char *); int (*write)(Vmx *, char *); }; #define VMXVAR(x) ~(ulong)&(((Vmx*)0)->x) #define UREG(x) VMXVAR(ureg.x) static GuestReg guestregs[] = { {GUEST_RIP, 0, "pc"}, {GUEST_RSP, 0, "sp"}, {GUEST_RFLAGS, 0, "flags"}, {UREG(ax), 0, "ax"}, {UREG(bx), 0, "bx"}, {UREG(cx), 0, "cx"}, {UREG(dx), 0, "dx"}, {UREG(bp), 0, "bp"}, {UREG(si), 0, "si"}, {UREG(di), 0, "di"}, #ifdef RMACH {UREG(r8), 0, "r8"}, {UREG(r9), 0, "r9"}, {UREG(r10), 0, "r10"}, {UREG(r11), 0, "r11"}, {UREG(r12), 0, "r12"}, {UREG(r13), 0, "r13"}, {UREG(r14), 0, "r14"}, {UREG(r15), 0, "r15"}, #endif {GUEST_GDTRBASE, 0, "gdtrbase"}, {GUEST_GDTRLIMIT, 4, "gdtrlimit"}, {GUEST_IDTRBASE, 0, "idtrbase"}, {GUEST_IDTRLIMIT, 4, "idtrlimit"}, {GUEST_CS, 2, "cs"}, {GUEST_CSBASE, 0, "csbase"}, {GUEST_CSLIMIT, 4, "cslimit"}, {GUEST_CSPERM, 4, "csperm"}, {GUEST_DS, 2, "ds"}, {GUEST_DSBASE, 0, "dsbase"}, {GUEST_DSLIMIT, 4, "dslimit"}, {GUEST_DSPERM, 4, "dsperm"}, {GUEST_ES, 2, "es"}, {GUEST_ESBASE, 0, "esbase"}, {GUEST_ESLIMIT, 4, "eslimit"}, {GUEST_ESPERM, 4, "esperm"}, {GUEST_FS, 2, "fs"}, {GUEST_FSBASE, 0, "fsbase"}, {GUEST_FSLIMIT, 4, "fslimit"}, {GUEST_FSPERM, 4, "fsperm"}, {GUEST_GS, 2, "gs"}, {GUEST_GSBASE, 0, "gsbase"}, {GUEST_GSLIMIT, 4, "gslimit"}, {GUEST_GSPERM, 4, "gsperm"}, {GUEST_SS, 2, "ss"}, {GUEST_SSBASE, 0, "ssbase"}, {GUEST_SSLIMIT, 4, "sslimit"}, {GUEST_SSPERM, 4, "ssperm"}, {GUEST_TR, 2, "tr"}, {GUEST_TRBASE, 0, "trbase"}, {GUEST_TRLIMIT, 4, "trlimit"}, {GUEST_TRPERM, 4, "trperm"}, {GUEST_LDTR, 2, "ldtr"}, {GUEST_LDTRBASE, 0, "ldtrbase"}, {GUEST_LDTRLIMIT, 4, "ldtrlimit"}, {GUEST_LDTRPERM, 4, "ldtrperm"}, {GUEST_CR0, 0, "cr0real", nil, cr0realwrite}, {GUEST_CR0SHADOW, 0, "cr0fake", cr0fakeread}, {GUEST_CR0MASK, 0, "cr0mask", nil, cr0maskwrite}, {VMXVAR(cr2), 0, "cr2"}, {GUEST_CR3, 0, "cr3"}, {GUEST_CR4, 0, "cr4real", nil, cr4realwrite}, {GUEST_CR4SHADOW, 0, "cr4fake", cr4fakeread}, {GUEST_CR4MASK, 0, "cr4mask", nil, cr4maskwrite}, {GUEST_IA32_PAT, 8, "pat"}, {GUEST_IA32_EFER, 8, "efer", nil, eferwrite}, {VMXVAR(dr[0]), 0, "dr0"}, {VMXVAR(dr[1]), 0, "dr1"}, {VMXVAR(dr[2]), 0, "dr2"}, {VMXVAR(dr[3]), 0, "dr3"}, {VMXVAR(dr[6]), 0, "dr6", nil, dr6write}, {VMXVAR(xcr0), 0, "xcr0", nil, xcr0write}, {GUEST_DR7, 0, "dr7", nil, dr7write}, {VM_INSTRERR, 4, "instructionerror", nil, readonly}, {VM_EXREASON, 4, "exitreason", nil, readonly}, {VM_EXQUALIF, 0, "exitqualification", nil, readonly}, {VM_EXINTRINFO, 4, "exitinterruptinfo", nil, readonly}, {VM_EXINTRCODE, 4, "exitinterruptcode", nil, readonly}, {VM_EXINSTRLEN, 4, "exitinstructionlen", nil, readonly}, {VM_EXINSTRINFO, 4, "exitinstructioninfo", nil, readonly}, {VM_GUESTVA, 0, "exitva", nil, readonly}, {VM_GUESTPA, 0, "exitpa", nil, readonly}, {VM_IDTVECINFO, 4, "idtinterruptinfo", nil, readonly}, {VM_IDTVECCODE, 4, "idtinterruptcode", nil, readonly}, }; static int vmokpage(u64int addr) { return (addr & 0xfff) == 0 && addr >> 48 == 0; } static uvlong * eptwalk(Vmx *vmx, uvlong addr) { uvlong *tab, *nt; uvlong v; int i; tab = vmx->pml4; if(tab == nil) error(Egreg); for(i = 3; i >= 1; i--){ tab += addr >> 12 + 9 * i & 0x1ff; v = *tab; if((v & 3) == 0){ nt = mallocalign(BY2PG, BY2PG, 0, 0); if(nt == nil) error(Enomem); memset(nt, 0, BY2PG); v = PADDR(nt) | 0x407; *tab = v; } tab = KADDR(v & ~0xfff); } return tab + (addr >> 12 & 0x1ff); } static void eptfree(uvlong *tab, int level) { int i; uvlong v, *t; if(tab == nil) error(Egreg); if(level < 3){ for(i = 0; i < 512; i++){ v = tab[i]; if((v & 3) == 0) continue; t = KADDR(v & ~0xfff); eptfree(t, level + 1); tab[i] = 0; } } if(level > 0) free(tab); } static void epttranslate(Vmx *vmx, VmMem *mp, uvlong end) { uvlong p, v; if((mp->addr & 0xfff) != 0 || (end & 0xfff) != 0 || (uint)mp->attr >= 0x1000) error(Egreg); if(mp->seg != nil){ switch(mp->seg->type & SG_TYPE){ default: error(Egreg); case SG_FIXED: case SG_STICKY: break; } if(mp->seg->base + mp->off + (end - mp->addr) > mp->seg->top) error(Egreg); for(p = mp->addr, v = mp->off; p != end; p += BY2PG, v += BY2PG) *eptwalk(vmx, p) = mp->seg->map[v/PTEMAPMEM]->pages[(v & PTEMAPMEM-1)/BY2PG]->pa | mp->attr; }else { for(p = mp->addr; p != end; p += BY2PG) *eptwalk(vmx, p) = 0; } vmx->onentry |= FLUSHEPT; } static char *mtype[] = {"uc", "wc", "02", "03", "wt", "wp", "wb", "07"}; static int cmdgetmeminfo(VmCmd *cmd, va_list va) { VmMem *mp; char *p0, *e, *p; char attr[4]; char mt[4]; p0 = va_arg(va, char *); e = va_arg(va, char *); p = p0; if(p < e) *p = 0; for(mp = cmd->vmx->mem.next; mp != &cmd->vmx->mem; mp = mp->next){ if(mp->seg == nil) continue; attr[0] = (mp->attr & 1) != 0 ? 'r' : '-'; attr[1] = (mp->attr & 2) != 0 ? 'w' : '-'; attr[2] = (mp->attr & 4) != 0 ? 'x' : '-'; attr[3] = 0; *(ushort*)mt = *(u16int*)mtype[mp->attr >> 3 & 7]; mt[2] = (mp->attr & 0x40) != 0 ? '!' : 0; mt[3] = 0; p = seprint(p, e, "%s %s %#llux %#llux %s %#llux\n", attr, mt, mp->addr, mp->next->addr, mp->name, (uvlong)mp->off); } return p - p0; } static void vmmeminsert(VmMem *l, VmMem *p) { p->prev = l->prev; p->next = l; p->prev->next = p; p->next->prev = p; } static VmMem * vmmemremove(VmMem *p) { VmMem *r; r = p->next; p->next->prev = p->prev; p->prev->next = p->next; free(p->name); putseg(p->seg); free(p); return r; } static int cmdclearmeminfo(VmCmd *cmd, va_list) { VmMem *mp; Vmx *vmx; vmx = cmd->vmx; eptfree(cmd->vmx->pml4, 0); for(mp = vmx->mem.next; mp != &vmx->mem; ) mp = vmmemremove(mp); vmx->mem.prev = &vmx->mem; vmx->mem.next = &vmx->mem; vmx->onentry |= FLUSHEPT; return 0; } static void vmmemupdate(Vmx *vmx, VmMem *mp, uvlong end) { VmMem *p, *q; for(p = vmx->mem.prev; p != &vmx->mem; p = p->prev) if(p->addr <= end || end == 0) break; if(p == &vmx->mem || p->addr < mp->addr){ q = smalloc(sizeof(VmMem)); *q = *p; if(p->seg != nil){ incref(q->seg); kstrdup(&q->name, p->name); } vmmeminsert(p->next, q); }else q = p; if(q->seg != nil) q->off += end - q->addr; q->addr = end; for(p = vmx->mem.next; p != &vmx->mem; p = p->next) if(p->addr >= mp->addr) break; vmmeminsert(p, mp); while(p != q) p = vmmemremove(p); for(p = vmx->mem.next; p != &vmx->mem; ) if(p->seg == p->prev->seg && (p->seg == nil || p->addr - p->prev->addr == p->off - p->prev->off)) p = vmmemremove(p); else p = p->next; } extern Segment* (*_globalsegattach)(char*); static int cmdsetmeminfo(VmCmd *cmd, va_list va) { char *p0, *p, *q, *r; int j; char *f[10]; VmMem *mp; int rc; uvlong end; p0 = va_arg(va, char *); p = p0; mp = nil; for(;;){ q = strchr(p, '\n'); if(q == 0) break; *q = 0; if(mp == nil){ mp = malloc(sizeof(VmMem)); if(mp == nil) error(Enomem); } memset(mp, 0, sizeof(VmMem)); if(waserror()){ putseg(mp->seg); free(mp->name); free(mp); nexterror(); } rc = tokenize(p, f, nelem(f)); p = q + 1; if(rc == 0){ poperror(); continue; } if(rc != 4 && rc != 6) error("number of fields wrong"); for(q = f[0]; *q != 0; q++) switch(*q){ case 'r': if((mp->attr & 1) != 0) goto tinval; mp->attr |= 1; break; case 'w': if((mp->attr & 2) != 0) goto tinval; mp->attr |= 2; break; case 'x': if((mp->attr & 4) != 0) goto tinval; mp->attr |= 0x404; break; case '-': break; default: tinval: error("invalid access field"); } for(j = 0; j < 8; j++) if(strncmp(mtype[j], f[1], 2) == 0){ mp->attr |= j << 3; break; } if(j == 8 || strlen(f[1]) > 3) error("invalid memory type"); if(f[1][2] == '!') mp->attr |= 0x40; else if(f[1][2] != 0) error("invalid memory type"); mp->addr = strtoull(f[2], &r, 0); if(*r != 0 || !vmokpage(mp->addr)) error("invalid low guest physical address"); end = strtoull(f[3], &r, 0); if(*r != 0 || !vmokpage(end) || end <= mp->addr) error("invalid high guest physical address"); if((mp->attr & 7) != 0){ if(rc != 6) error("number of fields wrong"); mp->seg = _globalsegattach(f[4]); if(mp->seg == nil) error("no such segment"); if(mp->seg->base + mp->off + (end - mp->addr) > mp->seg->top) error("out of bounds"); kstrdup(&mp->name, f[4]); mp->off = strtoull(f[5], &r, 0); if(*r != 0 || !vmokpage(mp->off)) error("invalid offset"); } poperror(); epttranslate(cmd->vmx, mp, end); vmmemupdate(cmd->vmx, mp, end); mp = nil; } free(mp); return p - p0; } static void vmxreset(void) { ulong regs[4]; vlong msr; int i; cpuid(1, 0, regs); if((regs[2] & 1<<5) == 0) return; /* check if disabled by BIOS */ if(rdmsr(0x3a, &msr) < 0) return; if((msr & 5) != 5){ if((msr & 1) == 0){ /* msr still unlocked */ wrmsr(0x3a, msr | 5); if(rdmsr(0x3a, &msr) < 0) return; } if((msr & 5) != 5) return; } if(rdmsr(VMX_PROCB_CTLS_MSR, &msr) < 0) return; if((vlong)msr >= 0) return; if(rdmsr(VMX_PROCB_CTLS2_MSR, &msr) < 0) return; if((msr >> 32 & PROCB_EPT) == 0 || (msr >> 32 & PROCB_VPID) == 0) return; for(i = 0; i < conf.nmach; i++){ MACHP(i)->vmx = mallocalign(sizeof(VmxMach), 4096, 0, 0); if(vmxmachp(i) == nil) error(Enomem); } gotvmx = 1; } static void vmxaddmsr(Vmx *vmx, u32int msr, u64int gval) { int i; if(vmx->nmsr >= MAXMSR) error("too many MSRs"); i = 2 * vmx->nmsr++; vmx->msrhost[i] = msr; rdmsr(msr, (vlong *) &vmx->msrhost[i+1]); vmx->msrguest[i] = msr; vmx->msrguest[i+1] = gval; vmcswrite(VMENTRY_MSRLDCNT, vmx->nmsr); vmcswrite(VMEXIT_MSRSTCNT, vmx->nmsr); vmcswrite(VMEXIT_MSRLDCNT, vmx->nmsr); } static void vmxtrapmsr(Vmx *vmx, u32int msr, enum { TRAPRD = 1, TRAPWR = 2 } state) { u32int m; if(msr >= 0x2000 && (u32int)(msr - 0xc0000000) >= 0x2000) return; msr = msr & 0x1fff | msr >> 18 & 0x2000; m = 1<<(msr & 31); if((state & TRAPRD) != 0) vmx->msrbits[msr / 32] |= m; else vmx->msrbits[msr / 32] &= ~m; if((state & TRAPWR) != 0) vmx->msrbits[msr / 32 + 512] |= m; else vmx->msrbits[msr / 32 + 512] &= ~m; } static void vmcsinit(Vmx *vmx) { vlong msr; u32int x; int s; memset(&vmx->ureg, 0, sizeof(vmx->ureg)); vmx->launched = 0; vmx->onentry = 0; if(rdmsr(VMX_BASIC_MSR, &msr) < 0) error("rdmsr(VMX_BASIC_MSR) failed"); if((msr & 1ULL<<55) != 0){ if(rdmsr(VMX_TRUE_PROCB_CTLS_MSR, &procb_ctls) < 0) error("rdmsr(VMX_TRUE_PROCB_CTLS_MSR) failed"); if(rdmsr(VMX_TRUE_PINB_CTLS_MSR, &pinb_ctls) < 0) error("rdmsr(VMX_TRUE_PINB_CTLS_MSR) failed"); }else{ if(rdmsr(VMX_PROCB_CTLS_MSR, &procb_ctls) < 0) error("rdmsr(VMX_PROCB_CTLS_MSR) failed"); if(rdmsr(VMX_PINB_CTLS_MSR, &pinb_ctls) < 0) error("rdmsr(VMX_PINB_CTLS_MSR) failed"); } if(rdmsr(VMX_PINB_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_PINB_CTLS_MSR failed"); x = (u32int)pinb_ctls | 1<<1 | 1<<2 | 1<<4; /* currently reserved default1 bits */ x |= PINB_EXITIRQ | PINB_EXITNMI; x &= pinb_ctls >> 32; vmcswrite(PINB_CTLS, x); if(rdmsr(VMX_PROCB_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_PROCB_CTLS_MSR failed"); x = (u32int)procb_ctls | 1<<1 | 7<<4 | 1<<8 | 1<<13 | 1<<14 | 1<<26; /* currently reserved default1 bits */ x |= PROCB_TSCOFFSET | PROCB_EXITMWAIT | PROCB_EXITMONITOR | PROCB_EXITHLT; x |= PROCB_EXITMOVDR | PROCB_EXITIO | PROCB_MSRBITMAP; x |= PROCB_USECTLS2; x &= msr >> 32; vmcswrite(PROCB_CTLS, x); vmx->procbctls = x; if(rdmsr(VMX_PROCB_CTLS2_MSR, &msr) < 0) error("rdmsr(VMX_PROCB_CTLS2_MSR failed"); x = PROCB_EPT | PROCB_VPID | PROCB_UNRESTR; x &= msr >> 32; vmcswrite(PROCB_CTLS2, x); if(rdmsr(VMX_VMEXIT_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_VMEXIT_CTLS_MSR failed"); x = (u32int)msr; if(sizeof(uintptr) == 8) x |= VMEXIT_HOST64; x |= VMEXIT_LD_IA32_PAT | VMEXIT_LD_IA32_EFER | VMEXIT_ST_DEBUG | VMEXIT_ST_IA32_EFER; x &= msr >> 32; vmcswrite(VMEXIT_CTLS, x); if(rdmsr(VMX_VMENTRY_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_VMENTRY_CTLS_MSR failed"); x = (u32int)msr; x |= VMENTRY_LD_IA32_PAT | VMENTRY_LD_IA32_EFER | VMENTRY_LD_DEBUG; x &= msr >> 32; vmcswrite(VMENTRY_CTLS, x); vmcswrite(CR3_TARGCNT, 0); vmcswrite(VMENTRY_INTRINFO, 0); vmcswrite(VMCS_LINK, -1); vmcswrite(HOST_CS, KESEL); vmcswrite(HOST_DS, KDSEL); vmcswrite(HOST_ES, KDSEL); vmcswrite(HOST_FS, KDSEL); vmcswrite(HOST_GS, KDSEL); vmcswrite(HOST_SS, KDSEL); vmcswrite(HOST_TR, TSSSEL); vmcswrite(HOST_CR0, getcr0() & ~0xe); vmcswrite(HOST_CR3, getcr3()); vmcswrite(HOST_CR4, getcr4()); rdmsr(FSbase, &msr); vmcswrite(HOST_FSBASE, msr); rdmsr(GSbase, &msr); vmcswrite(HOST_GSBASE, msr); vmcswrite(HOST_TRBASE, (uintptr) m->tss); vmcswrite(HOST_GDTR, (uintptr) m->gdt); vmcswrite(HOST_IDTR, IDTADDR); if(rdmsr(0x277, &msr) < 0) error("rdmsr(IA32_PAT) failed"); vmcswrite(HOST_IA32_PAT, msr); if(rdmsr(Efer, &msr) < 0) error("rdmsr(IA32_EFER) failed"); vmcswrite(HOST_IA32_EFER, msr); vmcswrite(EXC_BITMAP, 1<<18|1<<1); vmcswrite(PFAULT_MASK, 0); vmcswrite(PFAULT_MATCH, 0); vmcswrite(GUEST_CSBASE, 0); vmcswrite(GUEST_DSBASE, 0); vmcswrite(GUEST_ESBASE, 0); vmcswrite(GUEST_FSBASE, 0); vmcswrite(GUEST_GSBASE, 0); vmcswrite(GUEST_SSBASE, 0); vmcswrite(GUEST_CSLIMIT, -1); vmcswrite(GUEST_DSLIMIT, -1); vmcswrite(GUEST_ESLIMIT, -1); vmcswrite(GUEST_FSLIMIT, -1); vmcswrite(GUEST_GSLIMIT, -1); vmcswrite(GUEST_SSLIMIT, -1); vmcswrite(GUEST_CSPERM, (SEGG|SEGD|SEGP|SEGPL(0)|SEGEXEC|SEGR) >> 8 | 1); vmcswrite(GUEST_DSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1); vmcswrite(GUEST_ESPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1); vmcswrite(GUEST_FSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1); vmcswrite(GUEST_GSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1); vmcswrite(GUEST_SSPERM, (SEGG|SEGB|SEGP|SEGPL(0)|SEGDATA|SEGW) >> 8 | 1); vmcswrite(GUEST_LDTRPERM, 1<<16); vmcswrite(GUEST_CR0MASK, CR0KERNEL); vmcswrite(GUEST_CR4MASK, CR4KERNEL); vmcswrite(GUEST_CR0, getcr0() & CR0KERNEL | 0x31); vmcswrite(GUEST_CR3, 0); vmcswrite(GUEST_CR4, getcr4() & CR4KERNEL); vmcswrite(GUEST_CR0SHADOW, getcr0() & CR0KERNEL | 0x31); vmcswrite(GUEST_CR4SHADOW, getcr4() & ~CR4VMXE & CR4KERNEL); vmcswrite(GUEST_IA32_PAT, 0x0007040600070406ULL); vmcswrite(GUEST_IA32_EFER, 0); vmcswrite(GUEST_TRBASE, 0); vmcswrite(GUEST_TRLIMIT, 0xffff); vmcswrite(GUEST_TRPERM, (SEGTSS|SEGPL(0)|SEGP) >> 8 | 2); vmcswrite(VM_EPTP, PADDR(vmx->pml4) | 3<<3); vmx->vpid = 1; vmcswrite(VM_VPID, vmx->vpid); vmcswrite(GUEST_RFLAGS, 2); vmx->onentry = FLUSHVPID | FLUSHEPT; s = splhi(); #ifdef KFPSTATE fpukexit(nil, nil); #endif fpinit(); fpsave(&vmx->fp); vmx->xcr0 = m->xcr0 & 1; /* x87 only */ splx(s); memset(vmx->msrbits, -1, 4096); vmxtrapmsr(vmx, Efer, 0); vmcswrite(VMENTRY_MSRLDADDR, PADDR(vmx->msrguest)); vmcswrite(VMEXIT_MSRSTADDR, PADDR(vmx->msrguest)); vmcswrite(VMEXIT_MSRLDADDR, PADDR(vmx->msrhost)); vmcswrite(MSR_BITMAP, PADDR(vmx->msrbits)); if(vmx->procbctls & PROCB_TSCOFFSET){ cycles(&vmx->tscoffset); vmcswrite(VMCS_TSC_OFFSET, vmx->tscoffset); } if(sizeof(uintptr) == 8){ vmxaddmsr(vmx, Star, 0); vmxaddmsr(vmx, Lstar, 0); vmxaddmsr(vmx, Cstar, 0); vmxaddmsr(vmx, Sfmask, 0); vmxaddmsr(vmx, KernelGSbase, 0); vmxtrapmsr(vmx, Star, 0); vmxtrapmsr(vmx, Lstar, 0); vmxtrapmsr(vmx, Cstar, 0); vmxtrapmsr(vmx, Sfmask, 0); vmxtrapmsr(vmx, FSbase, 0); vmxtrapmsr(vmx, GSbase, 0); vmxtrapmsr(vmx, KernelGSbase, 0); } } static void vmxstart(Vmx *vmx) { vlong msr, msr2; uintptr cr; vlong x; putcr4(getcr4() | CR4VMXE); putcr0(getcr0() | 0x20); /* set NE */ cr = getcr0(); if(rdmsr(VMX_CR0_FIXED0, &msr) < 0) error("rdmsr(VMX_CR0_FIXED0) failed"); if(rdmsr(VMX_CR0_FIXED1, &msr2) < 0) error("rdmsr(VMX_CR0_FIXED1) failed"); if((cr & ~msr & ~msr2 | ~cr & msr & msr2) != 0) error("invalid CR0 value"); cr = getcr4(); if(rdmsr(VMX_CR4_FIXED0, &msr) < 0) error("rdmsr(VMX_CR4_FIXED0) failed"); if(rdmsr(VMX_CR4_FIXED1, &msr2) < 0) error("rdmsr(VMX_CR4_FIXED1) failed"); if((cr & ~msr & ~msr2 | ~cr & msr & msr2) != 0) error("invalid CR4 value"); rdmsr(VMX_BASIC_MSR, &x); qlock(vmxmach); if(waserror()){ qunlock(vmxmach); nexterror(); } if(vmxmach->vms == 0){ memset(vmxmach->vmxon, 0, sizeof(vmxmach->vmxon)); *(ulong*)vmxmach->vmxon = x; if(vmxon(PADDR(vmxmach->vmxon)) < 0) error("vmxon failed"); } vmxmach->vms++; qunlock(vmxmach); poperror(); memset(vmx->vmcs, 0, sizeof(vmx->vmcs)); *(ulong*)vmx->vmcs = x; if(vmclear(PADDR(vmx->vmcs)) < 0) error("vmclear failed"); up->vmx = vmx; vmxprocrestore(up); vmcsinit(vmx); } static void cmdrelease(VmCmd *p, int f) { lock(p); p->flags |= CMDFDONE | f; wakeup(p); unlock(p); } static void killcmds(Vmx *vmx, VmCmd *notme) { VmCmd *p, *pn; for(p = vmx->postponed; p != nil; p = pn){ pn = p->next; p->next = nil; if(p == notme) continue; kstrcpy(p->errstr, Equit, ERRMAX); cmdrelease(p, CMDFFAIL); } vmx->postponed = nil; ilock(&vmx->cmdlock); for(p = vmx->firstcmd; p != nil; p = pn){ pn = p->next; p->next = nil; if(p == notme) continue; kstrcpy(p->errstr, Equit, ERRMAX); cmdrelease(p, CMDFFAIL); } vmx->firstcmd = nil; vmx->lastcmd = &vmx->firstcmd; iunlock(&vmx->cmdlock); } static int cmdquit(VmCmd *p, va_list va) { Vmx *vmx; vmx = p->vmx; vmx->state = VMXENDING; killcmds(vmx, p); cmdclearmeminfo(p, va); up->vmx = nil; vmxprocrestore(up); vmclear(PADDR(vmx->vmcs)); qlock(vmxmach); if(--vmxmach->vms == 0) vmxoff(); qunlock(vmxmach); qlock(&vmxtablock); if(moribund == vmx) moribund = nil; vmxtab[vmx->index] = nil; qunlock(&vmxtablock); free(vmx); cmdrelease(p, 0); pexit(Equit, 1); return 0; } static void processexit(Vmx *vmx) { u32int reason; reason = vmcsread(VM_EXREASON); if((reason & 1<<31) == 0) switch(reason & 0xffff){ case 1: /* external interrupt */ case 3: /* INIT */ case 4: /* SIPI */ case 5: /* IO SMI */ case 6: /* SMI */ case 7: /* IRQ window */ case 8: /* NMI window */ return; } vmx->state = VMXREADY; vmx->got |= GOTEXIT; vmx->onentry &= ~STEP; } static int cmdgetregs(VmCmd *cmd, va_list va) { char *p0, *e; GuestReg *r; uvlong val; int s; char *p; p0 = va_arg(va, char *); e = va_arg(va, char *); p = p0; for(r = guestregs; r < guestregs + nelem(guestregs); r++) if(r->read != nil){ p = seprint(p, e, "%s ", r->name); p = r->read(cmd->vmx, p, e); p = strecpy(p, e, "\n"); }else{ if(r->offset >= 0) val = vmcsread(r->offset); else val = *(uintptr*)((uchar*)cmd->vmx + ~r->offset); s = r->size; if(s == 0) s = sizeof(uintptr); p = seprint(p, e, "%s %#.*llux\n", r->name, s * 2, val); } return p - p0; } static int setregs(Vmx *vmx, char *p0, char rs, char *fs) { char *p, *q, *rp; char *f[10]; GuestReg *r; uvlong val; int sz; int rc; p = p0; for(;;){ q = strchr(p, rs); if(q == 0) break; *q = 0; rc = getfields(p, f, nelem(f), 1, fs); p = q + 1; if(rc == 0) continue; if(rc != 2) error("number of fields wrong"); for(r = guestregs; r < guestregs + nelem(guestregs); r++) if(strcmp(r->name, f[0]) == 0) break; if(r == guestregs + nelem(guestregs)) error("unknown register"); if(r->write != nil){ r->write(vmx, f[1]); continue; } val = strtoull(f[1], &rp, 0); sz = r->size; if(sz == 0) sz = sizeof(uintptr); if(rp == f[1] || *rp != 0) error("invalid value"); if(r->offset >= 0) vmcswrite(r->offset, val); else{ assert((u32int)~r->offset + sz <= sizeof(Vmx)); switch(sz){ case 1: *(u8int*)((u8int*)vmx + (u32int)~r->offset) = val; break; case 2: *(u16int*)((u8int*)vmx + (u32int)~r->offset) = val; break; case 4: *(u32int*)((u8int*)vmx + (u32int)~r->offset) = val; break; case 8: *(u64int*)((u8int*)vmx + (u32int)~r->offset) = val; break; default: error(Egreg); } } } return p - p0; } static int cmdsetregs(VmCmd *cmd, va_list va) { return setregs(cmd->vmx, va_arg(va, char *), '\n', " \t"); } static int cmdgetfpregs(VmCmd *cmd, va_list va) { uchar *p; p = va_arg(va, uchar *); memmove(p, &cmd->vmx->fp, sizeof(FPsave)); return sizeof(FPsave); } static int cmdsetfpregs(VmCmd *cmd, va_list va) { uchar *p; ulong n; vlong off; p = va_arg(va, uchar *); n = va_arg(va, ulong); off = va_arg(va, vlong); if(off < 0 || off >= sizeof(FPsave)) n = 0; else if(off + n > sizeof(FPsave)) n = sizeof(FPsave) - n; memmove((uchar*)&cmd->vmx->fp + off, p, n); return n; } static int cmdgo(VmCmd *cmd, va_list va) { int step; char *r; Vmx *vmx; vmx = cmd->vmx; if(vmx->state != VMXREADY) error("VM not ready"); step = va_arg(va, int); r = va_arg(va, char *); if(r != nil) setregs(vmx, r, ';', "="); if(step) vmx->onentry |= STEP; vmx->state = VMXRUNNING; return 0; } static int cmdstop(VmCmd *cmd, va_list) { Vmx *vmx; vmx = cmd->vmx; if(vmx->state != VMXREADY && vmx->state != VMXRUNNING) error("VM not ready or running"); vmx->state = VMXREADY; return 0; } static int cmdstatus(VmCmd *cmd, va_list va) { kstrcpy(va_arg(va, char *), cmd->vmx->errstr, ERRMAX); return cmd->vmx->state; } static char *exitreasons[] = { [0] "exc", [1] "extirq", [2] "triplef", [3] "initsig", [4] "sipi", [5] "smiio", [6] "smiother", [7] "irqwin", [8] "nmiwin", [9] "taskswitch", [10] ".cpuid", [11] ".getsec", [12] ".hlt", [13] ".invd", [14] ".invlpg", [15] ".rdpmc", [16] ".rdtsc", [17] ".rsm", [18] ".vmcall", [19] ".vmclear", [20] ".vmlaunch", [21] ".vmptrld", [22] ".vmptrst", [23] ".vmread", [24] ".vmresume", [25] ".vmwrite", [26] ".vmxoff", [27] ".vmxon", [28] "movcr", [29] ".movdr", [30] "io", [31] ".rdmsr", [32] ".wrmsr", [33] "entrystate", [34] "entrymsr", [36] ".mwait", [37] "monitortrap", [39] ".monitor", [40] ".pause", [41] "mcheck", [43] "tpr", [44] "apicacc", [45] "eoi", [46] "gdtr_idtr", [47] "ldtr_tr", [48] "eptfault", [49] "eptinval", [50] ".invept", [51] ".rdtscp", [52] "preempt", [53] ".invvpid", [54] ".wbinvd", [55] ".xsetbv", [56] "apicwrite", [57] ".rdrand", [58] ".invpcid", [59] ".vmfunc", [60] ".encls", [61] ".rdseed", [62] "pmlfull", [63] ".xsaves", [64] ".xrstors", }; static char *except[] = { [0] "#de", [1] "#db", [3] "#bp", [4] "#of", [5] "#br", [6] "#ud", [7] "#nm", [8] "#df", [10] "#ts", [11] "#np", [12] "#ss", [13] "#gp", [14] "#pf", [16] "#mf", [17] "#ac", [18] "#mc", [19] "#xm", [20] "#ve", }; static int cmdwait(VmCmd *cp, va_list va) { char *p, *p0, *e; u32int reason, intr; uvlong qual; u16int rno; Vmx *vmx; if(cp->scratched) error(Eintr); vmx = cp->vmx; p0 = p = va_arg(va, char *); e = va_arg(va, char *); if((vmx->got & GOTIRQACK) != 0){ p = seprint(p, e, "*ack %d\n", vmx->irqack.info & 0xff); vmx->got &= ~GOTIRQACK; return p - p0; } if((vmx->got & GOTEXIT) == 0){ cp->flags |= CMDFPOSTP; return -1; } vmx->got &= ~GOTEXIT; reason = vmcsread(VM_EXREASON); qual = vmcsread(VM_EXQUALIF); rno = reason; intr = vmcsread(VM_EXINTRINFO); if((reason & 1<<31) != 0) p = seprint(p, e, "!"); if(rno == 0 && (intr & 1<<31) != 0){ if((intr & 0xff) >= nelem(except) || except[intr & 0xff] == nil) p = seprint(p, e, "#%d ", intr & 0xff); else p = seprint(p, e, "%s ", except[intr & 0xff]); }else if(rno >= nelem(exitreasons) || exitreasons[rno] == nil) p = seprint(p, e, "?%d ", rno); else p = seprint(p, e, "%s ", exitreasons[rno]); p = seprint(p, e, "%#ullx pc %#ullx sp %#ullx ilen %#ullx iinfo %#ullx", qual, vmcsread(GUEST_RIP), vmcsread(GUEST_RSP), vmcsread(VM_EXINSTRLEN), vmcsread(VM_EXINSTRINFO)); if((intr & 1<<11) != 0) p = seprint(p, e, " excode %#ullx", vmcsread(VM_EXINTRCODE)); if(rno == 48 && (qual & 0x80) != 0) p = seprint(p, e, " va %#ullx", vmcsread(VM_GUESTVA)); if(rno == 48 || rno == 49) p = seprint(p, e, " pa %#ullx", vmcsread(VM_GUESTPA)); if(rno == 30) p = seprint(p, e, " ax %#ullx", (uvlong)vmx->ureg.ax); p = seprint(p, e, "\n"); return p - p0; } static void eventparse(char *p, VmIntr *vi) { char *q, *r; int i; memset(vi, 0, sizeof(VmIntr)); q = nil; kstrdup(&q, p); if(waserror()){ free(q); memset(vi, 0, sizeof(VmIntr)); nexterror(); } vi->info = 1<<31; r = strchr(q, ','); if(r != nil) *r++ = 0; for(i = 0; i < nelem(except); i++) if(except[i] != nil && strcmp(except[i], q) == 0) break; if(*q == '#'){ q++; vi->info |= 3 << 8; } if(i == nelem(except)){ i = strtoul(q, &q, 10); if(*q != 0 || i > 255) error(Ebadctl); } vi->info |= i; if((vi->info & 0x7ff) == 3 || (vi->info & 0x7ff) == 4) vi->info += 3 << 8; if(r == nil) goto out; if(*r != ','){ vi->code = strtoul(r, &r, 0); vi->info |= 1<<11; }else r++; if(*r == ',') vi->ilen = strtoul(r + 1, &r, 0); if(*r != 0) error(Ebadctl); out: poperror(); free(q); } static int cmdexcept(VmCmd *cp, va_list va) { Vmx *vmx; vmx = cp->vmx; if(cp->scratched) error(Eintr); if((vmx->onentry & POSTEX) != 0){ cp->flags |= CMDFPOSTP; return 0; } eventparse(va_arg(va, char *), &vmx->exc); vmx->onentry |= POSTEX; return 0; } static int cmdirq(VmCmd *cmd, va_list va) { char *p; VmIntr vi; Vmx *vmx; vmx = cmd->vmx; p = va_arg(va, char *); if(p == nil) vmx->onentry &= ~POSTIRQ; else{ eventparse(p, &vi); vmx->irq = vi; vmx->onentry |= POSTIRQ; } return 0; } static int cmdextrap(VmCmd *, va_list va) { char *p, *q; u32int v; p = va_arg(va, char *); v = strtoul(p, &q, 0); if(q == p || *q != 0) error(Ebadarg); vmcswrite(EXC_BITMAP, v); return 0; } static int gotcmd(void *vmxp) { Vmx *vmx = vmxp; return vmx->firstcmd != nil; } static void markcmddone(VmCmd *p, VmCmd ***pp) { if((p->flags & (CMDFFAIL|CMDFPOSTP)) == CMDFPOSTP){ **pp = p; *pp = &p->next; }else{ p->flags = p->flags & ~CMDFPOSTP; cmdrelease(p, 0); } } static VmCmd ** markppcmddone(VmCmd **pp) { VmCmd *p; p = *pp; if((p->flags & (CMDFFAIL|CMDFPOSTP)) == CMDFPOSTP) return &p->next; *pp = p->next; p->next = nil; p->flags = p->flags & ~CMDFPOSTP; cmdrelease(p, 0); return pp; } static void runcmd(Vmx *vmx) { VmCmd *p, **pp; for(pp = &vmx->postponed; p = *pp, p != nil; ){ if(waserror()){ kstrcpy(p->errstr, up->errstr, ERRMAX); p->flags |= CMDFFAIL; pp = markppcmddone(pp); continue; } p->flags &= ~CMDFPOSTP; p->retval = p->cmd(p, p->va); poperror(); pp = markppcmddone(pp); } for(;;){ ilock(&vmx->cmdlock); p = vmx->firstcmd; if(p == nil){ iunlock(&vmx->cmdlock); break; } vmx->firstcmd = p->next; if(vmx->lastcmd == &p->next) vmx->lastcmd = &vmx->firstcmd; iunlock(&vmx->cmdlock); p->next = nil; if(waserror()){ kstrcpy(p->errstr, up->errstr, ERRMAX); p->flags |= CMDFFAIL; markcmddone(p, &pp); continue; } if(p->scratched) error(Eintr); p->retval = p->cmd(p, p->va); poperror(); markcmddone(p, &pp); } } static void vmxproc(void *vmxp) { int init, rc, x, useend; u32int procbctls, defprocbctls; u64int start, end, adj; vlong v; Vmx *vmx; vmx = vmxp; procwired(up, vmx->machno); sched(); init = 0; useend = 0; adj = 0; defprocbctls = 0; while(waserror()){ kstrcpy(vmx->errstr, up->errstr, ERRMAX); vmx->state = VMXDEAD; } for(;;){ if(!init){ init = 1; vmxstart(vmx); vmx->state = VMXREADY; defprocbctls = vmcsread(PROCB_CTLS); } runcmd(vmx); if(vmx->state == VMXRUNNING){ procbctls = defprocbctls; if((vmx->onentry & STEP) != 0) defprocbctls |= PROCB_MONTRAP; if((vmx->onentry & POSTEX) != 0){ vmcswrite(VMENTRY_INTRINFO, vmx->exc.info); vmcswrite(VMENTRY_INTRCODE, vmx->exc.code); vmcswrite(VMENTRY_INTRILEN, vmx->exc.ilen); vmx->onentry &= ~POSTEX; } if((vmx->onentry & POSTIRQ) != 0 && (vmx->onentry & STEP) == 0){ if((vmx->onentry & POSTEX) == 0 && (vmcsread(GUEST_RFLAGS) & 1<<9) != 0 && (vmcsread(GUEST_CANINTR) & 3) == 0){ vmcswrite(VMENTRY_INTRINFO, vmx->irq.info); vmcswrite(VMENTRY_INTRCODE, vmx->irq.code); vmcswrite(VMENTRY_INTRILEN, vmx->irq.ilen); vmx->onentry &= ~POSTIRQ; vmx->got |= GOTIRQACK; vmx->irqack = vmx->irq; }else procbctls |= PROCB_IRQWIN; } if((vmx->onentry & FLUSHVPID) != 0){ if(invvpid(INVLOCAL, vmx->vpid, 0) < 0) error("invvpid failed"); vmx->onentry &= ~FLUSHVPID; } if((vmx->onentry & FLUSHEPT) != 0){ if(invept(INVLOCAL, PADDR(vmx->pml4) | 3<<3, 0) < 0) error("invept failed"); vmx->onentry &= ~FLUSHEPT; } vmcswrite(PROCB_CTLS, procbctls); vmx->got &= ~GOTEXIT; x = splhi(); if(sizeof(uintptr) == 8){ rdmsr(FSbase, &v); vmwrite(HOST_FSBASE, v); } if((vmx->dr[7] & ~0xd400) != 0) putdr01236(vmx->dr); #ifdef KFPSTATE fpukexit(nil, nil); #endif fprestore(&vmx->fp); if(vmx->xcr0 != m->xcr0) putxcr0(vmx->xcr0); if(vmx->cr2 != getcr2()) putcr2(vmx->cr2); cycles(&start); if(useend){ vmx->tscoffset -= end - start + adj; vmcswrite(VMCS_TSC_OFFSET, vmx->tscoffset); } if(adj == 0){ cycles(&adj); adj -= start; } rc = vmlaunch(&vmx->ureg, vmx->launched); cycles(&end); useend = vmx->procbctls & PROCB_TSCOFFSET; vmx->cr2 = getcr2(); if(vmx->xcr0 != m->xcr0) putxcr0(m->xcr0); fpsave(&vmx->fp); splx(x); if(rc < 0) error("vmlaunch failed"); vmx->launched = 1; processexit(vmx); }else{ up->psstate = "Idle"; sleep(&vmx->cmdwait, gotcmd, vmx); up->psstate = nil; } } } enum { /* Qdir */ Qclone = 1, }; enum { Qdir, Qctl, Qregs, Qstatus, Qmap, Qwait, Qfpregs, }; static Dirtab vmxdir[] = { "ctl", { Qctl, 0, 0 }, 0, 0660, "regs", { Qregs, 0, 0 }, 0, 0660, "status", { Qstatus, 0, 0 }, 0, 0440, "map", { Qmap, 0, 0 }, 0, 0660, "wait", { Qwait, 0, 0 }, 0, 0440, "fpregs", { Qfpregs, 0, 0 }, 0, 0660, }; enum { CMquit, CMgo, CMstop, CMstep, CMexc, CMirq, CMextrap, }; static Cmdtab vmxctlmsg[] = { CMquit, "quit", 1, CMgo, "go", 0, CMstop, "stop", 1, CMstep, "step", 0, CMexc, "exc", 2, CMirq, "irq", 0, CMextrap, "extrap", 2, }; enum { AUXSIZE = 4096 }; static Vmx * vmxlook(vlong n) { if(n < 0) return nil; if(n >= nvmxtab) return nil; return vmxtab[n]; } #define QIDPATH(q,e) ((q) + 1 << 8 | (e)) #define SLOT(q) ((vlong)((q).path >> 8) - 1) #define FILE(q) ((int)(q).path & 0xff) static Vmx * vmxent(Qid q) { Vmx *vmx; eqlock(&vmxtablock); if(waserror()){ qunlock(&vmxtablock); nexterror(); } vmx = vmxlook(SLOT(q)); qunlock(&vmxtablock); poperror(); return vmx; } static int iscmddone(void *cp) { return (((VmCmd*)cp)->flags & CMDFDONE) != 0; } static int vmxcmd(Vmx *vmx, int (*f)(VmCmd *, va_list), ...) { VmCmd cmd; if(vmx->state == VMXENDING) ending: error(Equit); memset(&cmd, 0, sizeof(VmCmd)); cmd.vmx = vmx; cmd.errstr = up->errstr; cmd.cmd = f; va_start(cmd.va, f); ilock(&vmx->cmdlock); if(vmx->state == VMXENDING){ iunlock(&vmx->cmdlock); goto ending; } *vmx->lastcmd = &cmd; vmx->lastcmd = &cmd.next; iunlock(&vmx->cmdlock); while(waserror()) cmd.scratched = 1; wakeup(&vmx->cmdwait); do sleep(&cmd, iscmddone, &cmd); while(!iscmddone(&cmd)); poperror(); lock(&cmd); unlock(&cmd); if((cmd.flags & CMDFFAIL) != 0) error(up->errstr); return cmd.retval; } static Vmx * vmxnew(void) { Vmx *vmx; Vmx **newtab; int i, mi, mv; vmx = mallocalign(sizeof(Vmx), 4096, 0, 0); if(waserror()){ free(vmx); nexterror(); } memset(vmx, 0, sizeof(Vmx)); vmx->state = VMXINIT; vmx->lastcmd = &vmx->firstcmd; vmx->mem.next = &vmx->mem; vmx->mem.prev = &vmx->mem; vmx->index = -1; eqlock(&vmxtablock); if(waserror()){ if(vmx->index >= 0) vmxtab[vmx->index] = 0; qunlock(&vmxtablock); nexterror(); } for(i = 0; i < nvmxtab; i++) if(vmxtab[i] == nil){ vmxtab[i] = vmx; vmx->index = i; break; } if(i == nvmxtab){ newtab = realloc(vmxtab, (nvmxtab + 1) * sizeof(Vmx *)); if(newtab == nil) error(Enomem); vmxtab = newtab; vmxtab[nvmxtab] = vmx; vmx->index = nvmxtab++; } kproc("kvmx", vmxproc, vmx); qunlock(&vmxtablock); poperror(); poperror(); mi = 0; mv = 0x7fffffff; for(i = 0; i < conf.nmach; i++) if(vmxmachp(i)->vms < mv){ mi = i; mv = vmxmachp(i)->vms; } vmx->machno = mi; if(vmxcmd(vmx, cmdstatus, up->errstr) == VMXDEAD) error(up->errstr); return vmx; } void vmxshutdown(void) { int i; for(i = 0; i < nvmxtab; i++) if(vmxtab[i] != nil) vmxcmd(vmxtab[i], cmdquit); } static Chan * vmxattach(char *spec) { if(!gotvmx) error(Enodev); return devattach('X', spec); } static int vmxgen(Chan *c, char *, Dirtab *, int, int s, Dir *dp) { Dirtab *tab; int path; if(s == DEVDOTDOT){ devdir(c, (Qid){Qdir, 0, QTDIR}, "#X", 0, eve, 0555, dp); return 1; } if(c->qid.path == Qdir){ if(s-- == 0) goto clone; if(s >= nvmxtab) return -1; if(vmxlook(s) == nil) return 0; sprint(up->genbuf, "%d", s); devdir(c, (Qid){QIDPATH(s, Qdir), 0, QTDIR}, up->genbuf, 0, eve, DMDIR|0555, dp); return 1; } if(c->qid.path == Qclone){ clone: strcpy(up->genbuf, "clone"); devdir(c, (Qid){Qclone, 0, QTFILE}, up->genbuf, 0, eve, 0444, dp); return 1; } if(s >= nelem(vmxdir)) return -1; tab = &vmxdir[s]; path = QIDPATH(SLOT(c->qid), 0); devdir(c, (Qid){tab->qid.path|path, tab->qid.vers, tab->qid.type}, tab->name, tab->length, eve, tab->perm, dp); return 1; } static Walkqid* vmxwalk(Chan *c, Chan *nc, char **name, int nname) { Walkqid *rc; eqlock(&vmxtablock); if(waserror()){ qunlock(&vmxtablock); nexterror(); } rc = devwalk(c, nc, name, nname, nil, 0, vmxgen); qunlock(&vmxtablock); poperror(); return rc; } static int vmxstat(Chan *c, uchar *dp, int n) { int rc; eqlock(&vmxtablock); if(waserror()){ qunlock(&vmxtablock); nexterror(); } rc = devstat(c, dp, n, nil, 0, vmxgen); qunlock(&vmxtablock); poperror(); return rc; } static Chan* vmxopen(Chan* c, int omode) { Chan *ch; Vmx *vmx; if(c->qid.path == Qclone){ if(!iseve()) error(Eperm); vmx = vmxnew(); c->qid.path = QIDPATH(vmx->index, Qctl); } eqlock(&vmxtablock); if(waserror()){ qunlock(&vmxtablock); nexterror(); } vmx = vmxlook(SLOT(c->qid)); if(SLOT(c->qid) >= 0 && vmx == nil) error(Enonexist); if(FILE(c->qid) != Qdir && !iseve()) error(Eperm); ch = devopen(c, omode, nil, 0, vmxgen); qunlock(&vmxtablock); poperror(); ch->aux = smalloc(AUXSIZE); if(SLOT(ch->qid) >= 0 && FILE(ch->qid) == Qmap){ if((omode & OTRUNC) != 0) vmxcmd(vmx, cmdclearmeminfo); } return ch; } static void vmxclunk(Chan *ch) { free(ch->aux); ch->aux = nil; } static void vmxremove(Chan *ch) { Vmx *vmx, *old; vmxclunk(ch); if(SLOT(ch->qid) == -1 || FILE(ch->qid) != Qctl) error(Eperm); vmx = vmxent(ch->qid); if(vmx == nil) error(Enonexist); qlock(&vmxtablock); old = moribund; moribund = vmx; qunlock(&vmxtablock); if(old != nil) vmxcmd(old, cmdquit); } static void vmxclose(Chan *ch) { if((ch->flag & CRCLOSE) != 0) vmxremove(ch); else vmxclunk(ch); } static long vmxread(Chan* c, void* a, long n, vlong off) { int rc; Vmx *vmx; if(SLOT(c->qid) == -1){ switch((int)c->qid.path){ case Qdir: dirread: eqlock(&vmxtablock); if(waserror()){ qunlock(&vmxtablock); nexterror(); } rc = devdirread(c, a, n, nil, 0, vmxgen); qunlock(&vmxtablock); poperror(); return rc; default: error(Egreg); } } vmx = vmxent(c->qid); if(vmx == nil) error(Enonexist); switch(FILE(c->qid)){ case Qdir: goto dirread; case Qctl: { char buf[20]; sprint(buf, "%d", vmx->index); return readstr(off, a, n, buf); } case Qregs: if(off == 0) vmxcmd(vmx, cmdgetregs, c->aux, (char *) c->aux + AUXSIZE); return readstr(off, a, n, c->aux); case Qmap: if(off == 0) vmxcmd(vmx, cmdgetmeminfo, c->aux, (char *) c->aux + AUXSIZE); return readstr(off, a, n, c->aux); case Qstatus: { char buf[ERRMAX+128]; char errbuf[ERRMAX]; int status; status = vmx->state; if(status == VMXDEAD){ vmxcmd(vmx, cmdstatus, errbuf); snprint(buf, sizeof(buf), "%s %#q\n", statenames[status], errbuf); }else if(status >= 0 && status < nelem(statenames)) snprint(buf, sizeof(buf), "%s\n", statenames[status]); else snprint(buf, sizeof(buf), "%d\n", status); return readstr(off, a, n, buf); } case Qwait: { char buf[512]; rc = vmxcmd(vmx, cmdwait, buf, buf + sizeof(buf)); if(rc > n) rc = n; if(rc > 0) memmove(a, buf, rc); return rc; } case Qfpregs: { char buf[sizeof(FPsave)]; vmxcmd(vmx, cmdgetfpregs, buf); if(n < 0 || off < 0 || off >= sizeof(buf)) n = 0; else if(off + n > sizeof(buf)) n = sizeof(buf) - off; if(n != 0) memmove(a, buf + off, n); return n; } default: error(Egreg); break; } return 0; } static long vmxwrite(Chan* c, void* a, long n, vlong off) { Cmdbuf *cb; Cmdtab *ct; char *s; int rc; Vmx *vmx; if(SLOT(c->qid) == -1){ switch((int)c->qid.path){ case Qdir: error(Eperm); default: error(Egreg); } } vmx = vmxent(c->qid); if(vmx == nil) error(Enonexist); switch(FILE(c->qid)){ case Qdir: error(Eperm); case Qctl: cb = parsecmd(a, n); if(waserror()){ free(cb); nexterror(); } ct = lookupcmd(cb, vmxctlmsg, nelem(vmxctlmsg)); switch(ct->index){ case CMquit: vmxcmd(vmx, cmdquit); break; case CMgo: case CMstep: s = nil; if(cb->nf == 2) kstrdup(&s, cb->f[1]); else if(cb->nf != 1) error(Ebadarg); if(waserror()){ free(s); nexterror(); } vmxcmd(vmx, cmdgo, ct->index == CMstep, s); poperror(); free(s); break; case CMstop: vmxcmd(vmx, cmdstop); break; case CMexc: s = nil; kstrdup(&s, cb->f[1]); if(waserror()){ free(s); nexterror(); } vmxcmd(vmx, cmdexcept, s); poperror(); free(s); break; case CMirq: s = nil; if(cb->nf == 2) kstrdup(&s, cb->f[1]); if(waserror()){ free(s); nexterror(); } vmxcmd(vmx, cmdirq, s); poperror(); free(s); break; case CMextrap: s = nil; kstrdup(&s, cb->f[1]); if(waserror()){ free(s); nexterror(); } vmxcmd(vmx, cmdextrap, s); poperror(); free(s); break; default: error(Egreg); } poperror(); free(cb); break; case Qmap: case Qregs: s = malloc(n+1); if(s == nil) error(Enomem); if(waserror()){ free(s); nexterror(); } memmove(s, a, n); s[n] = 0; rc = vmxcmd(vmx, FILE(c->qid) == Qregs ? cmdsetregs : cmdsetmeminfo, s); poperror(); free(s); return rc; case Qfpregs: { char buf[sizeof(FPsave)]; if(n > sizeof(FPsave)) n = sizeof(FPsave); memmove(buf, a, n); return vmxcmd(vmx, cmdsetfpregs, buf, n, off); } default: error(Egreg); break; } return n; } Dev vmxdevtab = { 'X', "vmx", vmxreset, devinit, devshutdown, vmxattach, vmxwalk, vmxstat, vmxopen, devcreate, vmxclose, vmxread, devbread, vmxwrite, devbwrite, vmxremove, devwstat, };