13275 bhyve needs richer INIT/SIPI support
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>
@@ -107,21 +107,19 @@
* (i) initialized when vcpu is created and when it is reinitialized
* (o) initialized the first time the vcpu is created
* (x) initialized before use
*/
struct vcpu {
- struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */
+ /* (o) protects state, run_state, hostcpu, sipi_vector */
+ struct mtx mtx;
+
enum vcpu_state state; /* (o) vcpu state */
-#ifndef __FreeBSD__
+ enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */
kcondvar_t vcpu_cv; /* (o) cpu waiter cv */
kcondvar_t state_cv; /* (o) IDLE-transition cv */
-#endif /* __FreeBSD__ */
int hostcpu; /* (o) vcpu's current host cpu */
-#ifndef __FreeBSD__
int lastloccpu; /* (o) last host cpu localized to */
-#endif
- uint_t runblock; /* (i) block vcpu from run state */
int reqidle; /* (i) request vcpu to idle */
struct vlapic *vlapic; /* (i) APIC device model */
enum x2apic_state x2apic_state; /* (i) APIC mode */
uint64_t exitintinfo; /* (i) events pending at VM exit */
int nmi_pending; /* (i) NMI pending */
@@ -128,10 +126,11 @@
int extint_pending; /* (i) INTR pending */
int exception_pending; /* (i) exception pending */
int exc_vector; /* (x) exception collateral */
int exc_errcode_valid;
uint32_t exc_errcode;
+ uint8_t sipi_vector; /* (i) SIPI vector */
struct savefpu *guestfpu; /* (a,i) guest fpu state */
uint64_t guest_xcr0; /* (i) guest %xcr0 register */
void *stats; /* (a,i) statistics */
struct vm_exit exitinfo; /* (x) exit reason and collateral */
uint64_t nextrip; /* (x) next instruction to execute */
@@ -198,19 +197,10 @@
uint16_t cores; /* (o) num of cores/socket */
uint16_t threads; /* (o) num of threads/core */
uint16_t maxcpus; /* (o) max pluggable cpus */
struct ioport_config ioports; /* (o) ioport handling */
-
- bool sipi_req; /* (i) SIPI requested */
- int sipi_req_vcpu; /* (i) SIPI destination */
- uint64_t sipi_req_rip; /* (i) SIPI start %rip */
-
- /* Miscellaneous VM-wide statistics and counters */
- struct vm_wide_stats {
- uint64_t sipi_supersede;
- } stats;
};
static int vmm_initialized;
@@ -247,12 +237,12 @@
#define VMM_INIT(num) ((*ops->init)(num))
#define VMM_CLEANUP() ((*ops->cleanup)())
#define VMM_RESUME() ((*ops->resume)())
#define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap))
-#define VMRUN(vmi, vcpu, rip, pmap, evinfo) \
- ((*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo))
+#define VMRUN(vmi, vcpu, rip, pmap) \
+ ((*ops->vmrun)(vmi, vcpu, rip, pmap))
#define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi))
#define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max))
#define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace))
#define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv))
@@ -290,10 +280,12 @@
static int trace_guest_exceptions;
static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
+static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
+static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
#ifndef __FreeBSD__
static void vm_clear_memseg(struct vm *, int);
/* Flags for vtc_status */
@@ -368,13 +360,13 @@
} else {
vie_reset(vcpu->vie_ctx);
bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
}
+ vcpu->run_state = VRS_HALT;
vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
- vcpu->runblock = 0;
vcpu->reqidle = 0;
vcpu->exitintinfo = 0;
vcpu->nmi_pending = 0;
vcpu->extint_pending = 0;
vcpu->exception_pending = 0;
@@ -1231,11 +1223,11 @@
return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}
int
-vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
+vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
{
if (vcpu < 0 || vcpu >= vm->maxcpus)
return (EINVAL);
if (!is_segment_register(reg) && !is_descriptor_table(reg))
@@ -1242,10 +1234,53 @@
return (EINVAL);
return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}
+int
+vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
+ return (EINVAL);
+ }
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ *state = vcpu->run_state;
+ *sipi_vec = vcpu->sipi_vector;
+ vcpu_unlock(vcpu);
+
+ return (0);
+}
+
+int
+vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
+ return (EINVAL);
+ }
+ if (!VRS_IS_VALID(state)) {
+ return (EINVAL);
+ }
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ vcpu->run_state = state;
+ vcpu->sipi_vector = sipi_vec;
+ vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
+ vcpu_unlock(vcpu);
+
+ return (0);
+}
+
+
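For reference while reading the new run-state accessors above: the VRS_* constants and the VRS_IS_VALID() check are defined in the vmm headers, not in this hunk. A minimal sketch of the shape this code assumes they take (the names match the usage above; the exact bit values are illustrative, not authoritative):

enum vcpu_run_state {
	VRS_HALT	= 0,		/* powered on, waiting for INIT */
	VRS_INIT	= (1 << 0),	/* INIT processed, waiting for SIPI */
	VRS_RUN		= (1 << 1),	/* runnable (SIPI processed, or BSP) */

	VRS_PEND_INIT	= (1 << 14),	/* INIT requested, not yet handled */
	VRS_PEND_SIPI	= (1 << 15),	/* SIPI requested, not yet handled */
};

/* A caller-supplied state is valid only if it uses the bits defined above. */
#define	VRS_IS_VALID(s)	\
	(((s) & ~(VRS_INIT | VRS_RUN | VRS_PEND_INIT | VRS_PEND_SIPI)) == 0)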
static void
restore_guest_fpustate(struct vcpu *vcpu)
{
/* flush host state to the pcb */
@@ -1352,20 +1387,10 @@
default:
error = 1;
break;
}
- if (newstate == VCPU_RUNNING) {
- while (vcpu->runblock != 0) {
-#ifdef __FreeBSD__
- msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
-#else
- cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
-#endif
- }
- }
-
if (error)
return (EBUSY);
VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
@@ -1374,12 +1399,11 @@
if (newstate == VCPU_RUNNING)
vcpu->hostcpu = curcpu;
else
vcpu->hostcpu = NOCPU;
- if (newstate == VCPU_IDLE ||
- (newstate == VCPU_FROZEN && vcpu->runblock != 0)) {
+ if (newstate == VCPU_IDLE) {
#ifdef __FreeBSD__
wakeup(&vcpu->state);
#else
cv_broadcast(&vcpu->state_cv);
#endif
@@ -1411,16 +1435,12 @@
*/
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
{
struct vcpu *vcpu;
-#ifdef __FreeBSD__
- const char *wmesg;
-#else
- const char *wmesg __unused;
-#endif
int t, vcpu_halted, vm_halted;
+ bool userspace_exit = false;
KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
vcpu = &vm->vcpu[vcpuid];
vcpu_halted = 0;
@@ -1427,72 +1447,54 @@
vm_halted = 0;
vcpu_lock(vcpu);
while (1) {
/*
- * Do a final check for pending NMI or interrupts before
- * really putting this thread to sleep. Also check for
- * software events that would cause this vcpu to wakeup.
- *
- * These interrupts/events could have happened after the
- * vcpu returned from VMRUN() and before it acquired the
- * vcpu lock above.
+ * Do a final check for pending interrupts (including NMI and
+ * INIT) before putting this thread to sleep.
*/
- if (vm->suspend || vcpu->reqidle)
- break;
if (vm_nmi_pending(vm, vcpuid))
break;
+ if (vcpu_run_state_pending(vm, vcpuid))
+ break;
if (!intr_disabled) {
if (vm_extint_pending(vm, vcpuid) ||
vlapic_pending_intr(vcpu->vlapic, NULL)) {
break;
}
}
- /* Don't go to sleep if the vcpu thread needs to yield */
- if (vcpu_should_yield(vm, vcpuid))
+ /*
+ * Also check for software events which would cause a wake-up.
+ * This will set the appropriate exitcode directly, rather than
+ * requiring a trip through VM_RUN().
+ */
+ if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
+ userspace_exit = true;
break;
+ }
- if (vcpu_debugged(vm, vcpuid))
- break;
-
/*
* Some Linux guests implement "halt" by having all vcpus
* execute HLT with interrupts disabled. 'halted_cpus' keeps
* track of the vcpus that have entered this state. When all
* vcpus enter the halted state the virtual machine is halted.
*/
if (intr_disabled) {
- wmesg = "vmhalt";
- VCPU_CTR0(vm, vcpuid, "Halted");
if (!vcpu_halted && halt_detection_enabled) {
vcpu_halted = 1;
CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
}
if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
vm_halted = 1;
break;
}
- } else {
- wmesg = "vmidle";
}
t = ticks;
vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
-#ifdef __FreeBSD__
- /*
- * XXX msleep_spin() cannot be interrupted by signals so
- * wake up periodically to check pending signals.
- */
- msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
-#else
- /*
- * Fortunately, cv_wait_sig can be interrupted by signals, so
- * there is no need to periodically wake up.
- */
(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
-#endif
vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
if (vcpu_halted)
@@ -1501,11 +1503,11 @@
vcpu_unlock(vcpu);
if (vm_halted)
vm_suspend(vm, VM_SUSPEND_HALT);
- return (0);
+ return (userspace_exit ? -1 : 0);
}
static int
vm_handle_paging(struct vm *vm, int vcpuid)
{
@@ -1830,10 +1832,66 @@
vcpu->reqidle = 0;
vcpu_unlock(vcpu);
return (-1);
}
+static int
+vm_handle_run_state(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu = &vm->vcpu[vcpuid];
+ bool handled = false;
+
+ vcpu_lock(vcpu);
+ while (1) {
+ if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
+ vcpu_unlock(vcpu);
+ VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
+ vcpu_lock(vcpu);
+
+ vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
+ vcpu->run_state |= VRS_INIT;
+ }
+
+ if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
+ (VRS_INIT | VRS_PEND_SIPI)) {
+ const uint8_t vector = vcpu->sipi_vector;
+
+ vcpu_unlock(vcpu);
+ VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
+ vcpu_lock(vcpu);
+
+ vcpu->run_state &= ~VRS_PEND_SIPI;
+ vcpu->run_state |= VRS_RUN;
+ }
+
+ /*
+ * If the vCPU is now in the running state, there is no need to
+ * wait for anything prior to re-entry.
+ */
+ if ((vcpu->run_state & VRS_RUN) != 0) {
+ handled = true;
+ break;
+ }
+
+ /*
+ * Also check for software events which would cause a wake-up.
+ * This will set the appropriate exitcode directly, rather than
+ * requiring a trip through VM_RUN().
+ */
+ if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
+ break;
+ }
+
+ vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
+ (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
+ vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
+ }
+ vcpu_unlock(vcpu);
+
+ return (handled ? 0 : -1);
+}
+
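To make the wait loop above concrete, here is a hedged walk-through of an INIT-SIPI bring-up of an application processor, expressed as the kernel-side calls involved. The wrapper function name and the vector value 0x9a are purely illustrative; vm_inject_init(), vm_inject_sipi(), and the run_state transitions are the ones introduced elsewhere in this change.

/* Illustrative sequence only; not part of the patch. */
static void
example_ap_startup(struct vm *vm, int ap)
{
	/* After vcpu init the AP idles in VRS_HALT. */
	VERIFY0(vm_inject_init(vm, ap));	/* run_state |= VRS_PEND_INIT */
	/* vm_handle_run_state(): arch reset, run_state becomes VRS_INIT. */
	VERIFY0(vm_inject_sipi(vm, ap, 0x9a));	/* run_state |= VRS_PEND_SIPI */
	/*
	 * vm_handle_run_state(): vcpu_vector_sipi() points CS at 0x9a000,
	 * run_state becomes VRS_INIT | VRS_RUN, and the AP enters the guest.
	 */
}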
#ifndef __FreeBSD__
static int
vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
struct vcpu *cpu = &vm->vcpu[vcpuid];
@@ -1848,22 +1906,10 @@
return (-1);
}
#endif /* __FreeBSD__ */
-void
-vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip)
-{
- if (vm->sipi_req) {
- /* This should never occur if userspace is doing its job. */
- vm->stats.sipi_supersede++;
- }
- vm->sipi_req = true;
- vm->sipi_req_vcpu = req_vcpuid;
- vm->sipi_req_rip = req_rip;
-}
-
int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
int i;
@@ -1888,71 +1934,22 @@
return (0);
}
void
-vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
+vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
{
struct vm_exit *vmexit;
- KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
- ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
-
vmexit = vm_exitinfo(vm, vcpuid);
vmexit->rip = rip;
vmexit->inst_length = 0;
- vmexit->exitcode = VM_EXITCODE_SUSPENDED;
- vmexit->u.suspended.how = vm->suspend;
+ vmexit->exitcode = VM_EXITCODE_RUN_STATE;
+ vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
}
-void
-vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
-{
- struct vm_exit *vmexit;
- vmexit = vm_exitinfo(vm, vcpuid);
- vmexit->rip = rip;
- vmexit->inst_length = 0;
- vmexit->exitcode = VM_EXITCODE_DEBUG;
-}
-
-void
-vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip)
-{
- struct vm_exit *vmexit;
-
- vmexit = vm_exitinfo(vm, vcpuid);
- vmexit->rip = rip;
- vmexit->inst_length = 0;
- vmexit->exitcode = VM_EXITCODE_RUNBLOCK;
- vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1);
-}
-
-void
-vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
-{
- struct vm_exit *vmexit;
-
- vmexit = vm_exitinfo(vm, vcpuid);
- vmexit->rip = rip;
- vmexit->inst_length = 0;
- vmexit->exitcode = VM_EXITCODE_REQIDLE;
- vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
-}
-
-void
-vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
-{
- struct vm_exit *vmexit;
-
- vmexit = vm_exitinfo(vm, vcpuid);
- vmexit->rip = rip;
- vmexit->inst_length = 0;
- vmexit->exitcode = VM_EXITCODE_BOGUS;
- vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
-}
-
#ifndef __FreeBSD__
/*
* Some vmm resources, such as the lapic, may have CPU-specific resources
* allocated to them which would benefit from migration onto the host CPU which
* is processing the vcpu state.
@@ -2070,11 +2067,11 @@
case VEC_DEFAULT:
return (0);
case VEC_DISCARD_INSTR:
vie_reset(vie);
return (0);
- case VEC_COMPLETE_MMIO:
+ case VEC_FULFILL_MMIO:
err = vie_fulfill_mmio(vie, &entry->u.mmio);
if (err == 0) {
err = vie_emulate_mmio(vie, vm, vcpuid);
if (err == 0) {
vie_advance_pc(vie, &vcpu->nextrip);
@@ -2089,11 +2086,11 @@
vie_reset(vie);
err = 0;
}
}
break;
- case VEC_COMPLETE_INOUT:
+ case VEC_FULFILL_INOUT:
err = vie_fulfill_inout(vie, &entry->u.inout);
if (err == 0) {
err = vie_emulate_inout(vie, vm, vcpuid);
if (err == 0) {
vie_advance_pc(vie, &vcpu->nextrip);
@@ -2130,29 +2127,16 @@
*/
vie_exitinfo(vie, vme);
return (-1);
}
- if (vcpuid == 0 && vm->sipi_req) {
- /* The boot vCPU has sent a SIPI to one of the other CPUs */
- vme->exitcode = VM_EXITCODE_SPINUP_AP;
- vme->u.spinup_ap.vcpu = vm->sipi_req_vcpu;
- vme->u.spinup_ap.rip = vm->sipi_req_rip;
-
- vm->sipi_req = false;
- vm->sipi_req_vcpu = 0;
- vm->sipi_req_rip = 0;
- return (-1);
- }
-
return (0);
}
int
vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
{
- struct vm_eventinfo evinfo;
int error;
struct vcpu *vcpu;
#ifdef __FreeBSD__
struct pcb *pcb;
#endif
@@ -2175,13 +2159,10 @@
return (EINVAL);
pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
- evinfo.rptr = &vcpu->runblock;
- evinfo.sptr = &vm->suspend;
- evinfo.iptr = &vcpu->reqidle;
#ifndef __FreeBSD__
vtc.vtc_vm = vm;
vtc.vtc_vcpuid = vcpuid;
vtc.vtc_status = 0;
@@ -2240,11 +2221,11 @@
}
vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
#endif
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
- error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
+ error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
#ifdef __FreeBSD__
save_guest_fpustate(vcpu);
#else
@@ -2271,19 +2252,20 @@
vcpu->nextrip = vme->rip + vme->inst_length;
switch (vme->exitcode) {
case VM_EXITCODE_REQIDLE:
error = vm_handle_reqidle(vm, vcpuid);
break;
+ case VM_EXITCODE_RUN_STATE:
+ error = vm_handle_run_state(vm, vcpuid);
+ break;
case VM_EXITCODE_SUSPENDED:
error = vm_handle_suspend(vm, vcpuid);
break;
case VM_EXITCODE_IOAPIC_EOI:
vioapic_process_eoi(vm, vcpuid,
vme->u.ioapic_eoi.vector);
break;
- case VM_EXITCODE_RUNBLOCK:
- break;
case VM_EXITCODE_HLT:
intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
error = vm_handle_hlt(vm, vcpuid, intr_disabled);
break;
case VM_EXITCODE_PAGING:
@@ -2790,10 +2772,200 @@
vcpu->extint_pending = 0;
vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
}
int
+vm_inject_init(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ vcpu_lock(vcpu);
+ vcpu->run_state |= VRS_PEND_INIT;
+ vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
+ vcpu_unlock(vcpu);
+ return (0);
+}
+
+int
+vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ vcpu_lock(vcpu);
+ vcpu->run_state |= VRS_PEND_SIPI;
+ vcpu->sipi_vector = vector;
+ /* SIPI is only actionable if the CPU is waiting in INIT state */
+ if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
+ vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
+ }
+ vcpu_unlock(vcpu);
+ return (0);
+}
+
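vm_inject_init() and vm_inject_sipi() are the kernel entry points; the wiring from guest LAPIC ICR writes lives in vlapic.c and is not shown in this hunk. A sketch of how an ICR-low write handler could dispatch to them, assuming the usual apicreg.h delivery-mode constants (the helper name is hypothetical):

/* Hypothetical dispatch helper; the real routing lives in vlapic.c. */
static void
icr_dispatch_ipi(struct vm *vm, int dest_vcpuid, uint32_t icr_low)
{
	switch (icr_low & APIC_DELMODE_MASK) {
	case APIC_DELMODE_INIT:
		(void) vm_inject_init(vm, dest_vcpuid);
		break;
	case APIC_DELMODE_STARTUP:
		/* The SIPI vector is carried in the low byte of the ICR. */
		(void) vm_inject_sipi(vm, dest_vcpuid, icr_low & 0xff);
		break;
	default:
		break;
	}
}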
+bool
+vcpu_run_state_pending(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
+ vcpu = &vm->vcpu[vcpuid];
+
+ /* Of interest: vCPU not in running state or with pending INIT */
+ return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
+}
+
+int
+vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
+{
+ struct seg_desc desc;
+ const enum vm_reg_name clear_regs[] = {
+ VM_REG_GUEST_CR2,
+ VM_REG_GUEST_CR3,
+ VM_REG_GUEST_CR4,
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15,
+ VM_REG_GUEST_DR0,
+ VM_REG_GUEST_DR1,
+ VM_REG_GUEST_DR2,
+ VM_REG_GUEST_DR3,
+ VM_REG_GUEST_EFER,
+ };
+ const enum vm_reg_name data_segs[] = {
+ VM_REG_GUEST_SS,
+ VM_REG_GUEST_DS,
+ VM_REG_GUEST_ES,
+ VM_REG_GUEST_FS,
+ VM_REG_GUEST_GS,
+ };
+ struct vcpu *vcpu = &vm->vcpu[vcpuid];
+
+ if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+ return (EINVAL);
+
+ for (uint_t i = 0; i < nitems(clear_regs); i++) {
+ VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
+ }
+
+ VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
+ VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
+ VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
+
+ /*
+ * The prescribed contents of %rdx differ slightly between the Intel and
+ * AMD architectural definitions. The former expects the Extended Model
+ * in bits 16-19, whereas the latter expects the entire Family, Model, and
+ * Stepping to be there. Common boot ROMs appear to disregard this
+ * anyway, so we stick with a compromise value similar to what is
+ * spelled out in the Intel SDM.
+ */
+ VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
+
+ VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
+ VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
+
+ /* CS: Present, R/W, Accessed */
+ desc.access = 0x0093;
+ desc.base = 0xffff0000;
+ desc.limit = 0xffff;
+ VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
+ VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
+
+ /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
+ desc.access = 0x0093;
+ desc.base = 0;
+ desc.limit = 0xffff;
+ for (uint_t i = 0; i < nitems(data_segs); i++) {
+ VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
+ VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
+ }
+
+ /* GDTR, IDTR */
+ desc.base = 0;
+ desc.limit = 0xffff;
+ VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
+ VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
+
+ /* LDTR: Present, LDT */
+ desc.access = 0x0082;
+ desc.base = 0;
+ desc.limit = 0xffff;
+ VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
+ VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
+
+ /* TR: Present, 32-bit TSS */
+ desc.access = 0x008b;
+ desc.base = 0;
+ desc.limit = 0xffff;
+ VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
+ VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
+
+ vlapic_reset(vm_lapic(vm, vcpuid));
+
+ VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
+
+ vcpu->exitintinfo = 0;
+ vcpu->exception_pending = 0;
+ vcpu->nmi_pending = 0;
+ vcpu->extint_pending = 0;
+
+ /*
+ * A CPU reset caused by power-on or system reset clears more state than
+ * one which is triggered by an INIT IPI.
+ */
+ if (!init_only) {
+ vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
+ fpu_save_area_reset(vcpu->guestfpu);
+
+ /* XXX: clear MSRs and other pieces */
+ }
+
+ return (0);
+}
+
+static int
+vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
+{
+ struct seg_desc desc;
+
+ if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+ return (EINVAL);
+
+ /* CS: Present, R/W, Accessed */
+ desc.access = 0x0093;
+ desc.base = (uint64_t)vector << 12;
+ desc.limit = 0xffff;
+ VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
+ VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
+ (uint64_t)vector << 8));
+
+ VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
+
+ return (0);
+}
+
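As a quick check of the arithmetic in vcpu_vector_sipi(): the SIPI vector selects a 4 KiB-aligned real-mode entry point, consistent with the usual CS.base == selector << 4 relationship. For example:

/*
 * vector = 0x9a
 *	CS.base	= 0x9a << 12 = 0x0009a000
 *	CS.sel	= 0x9a << 8  = 0x9a00	(0x9a00 << 4 == 0x9a000)
 *	%rip	= 0
 * so the AP fetches its first instruction from guest-physical 0x9a000.
 */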
+int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
if (vcpu < 0 || vcpu >= vm->maxcpus)
return (EINVAL);
@@ -2892,11 +3064,11 @@
{
int error;
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
- panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
+ panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
@@ -2910,11 +3082,11 @@
{
struct vcpu *vcpu;
enum vcpu_state state;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
- panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
+ panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
state = vcpu->state;
@@ -2923,58 +3095,10 @@
vcpu_unlock(vcpu);
return (state);
}
-void
-vcpu_block_run(struct vm *vm, int vcpuid)
-{
- struct vcpu *vcpu;
-
- if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
- panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
-
- vcpu = &vm->vcpu[vcpuid];
-
- vcpu_lock(vcpu);
- vcpu->runblock++;
- if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) {
- vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
- }
- while (vcpu->state == VCPU_RUNNING) {
-#ifdef __FreeBSD__
- msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
-#else
- cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
-#endif
- }
- vcpu_unlock(vcpu);
-}
-
-void
-vcpu_unblock_run(struct vm *vm, int vcpuid)
-{
- struct vcpu *vcpu;
-
- if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
- panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
-
- vcpu = &vm->vcpu[vcpuid];
-
- vcpu_lock(vcpu);
- KASSERT(vcpu->runblock != 0, ("expected non-zero runblock"));
- vcpu->runblock--;
- if (vcpu->runblock == 0) {
-#ifdef __FreeBSD__
- wakeup(&vcpu->state);
-#else
- cv_broadcast(&vcpu->state_cv);
-#endif
- }
- vcpu_unlock(vcpu);
-}
-
#ifndef __FreeBSD__
uint64_t
vcpu_tsc_offset(struct vm *vm, int vcpuid)
{
return (vm->vcpu[vcpuid].tsc_offset);
@@ -3036,17 +3160,99 @@
CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
}
return (0);
}
-int
-vcpu_debugged(struct vm *vm, int vcpuid)
+static bool
+vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
+ uint64_t entry_rip)
{
+ struct vcpu *vcpu = &vm->vcpu[vcpuid];
+ struct vm_exit *vme = &vcpu->exitinfo;
+ bool bail = false;
- return (CPU_ISSET(vcpuid, &vm->debug_cpus));
+ ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
+
+ if (vm->suspend) {
+ if (on_entry) {
+ VERIFY(vm->suspend > VM_SUSPEND_NONE &&
+ vm->suspend < VM_SUSPEND_LAST);
+
+ vme->exitcode = VM_EXITCODE_SUSPENDED;
+ vme->u.suspended.how = vm->suspend;
+ } else {
+ /*
+ * Handling VM suspend is complicated, so if that
+ * condition is detected outside of VM-entry itself,
+ * just emit a BOGUS exitcode so we take a lap to pick
+ * up the event during an entry and are directed into
+ * the vm_handle_suspend() logic.
+ */
+ vme->exitcode = VM_EXITCODE_BOGUS;
+ }
+ bail = true;
+ }
+ if (vcpu->reqidle) {
+ vme->exitcode = VM_EXITCODE_REQIDLE;
+ vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
+
+ if (!on_entry) {
+ /*
+ * A reqidle request detected outside of VM-entry can be
+ * handled directly by clearing the request (and taking
+ * a lap to userspace).
+ */
+ vcpu_assert_locked(vcpu);
+ vcpu->reqidle = 0;
+ }
+ bail = true;
+ }
+ if (vcpu_should_yield(vm, vcpuid)) {
+ vme->exitcode = VM_EXITCODE_BOGUS;
+ vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
+ bail = true;
+ }
+ if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
+ vme->exitcode = VM_EXITCODE_DEBUG;
+ bail = true;
+ }
+
+ if (bail) {
+ if (on_entry) {
+ /*
+ * If bailing out during VM-entry, the current %rip must
+ * be recorded in the exitinfo.
+ */
+ vme->rip = entry_rip;
+ }
+ vme->inst_length = 0;
+ }
+ return (bail);
}
+static bool
+vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
+{
+ /*
+ * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
+ * wait-for-SIPI) expect that %rip is already populated in the vm_exit
+ * structure, so only the exitcode needs to be modified here.
+ */
+ return (vcpu_bailout_checks(vm, vcpuid, false, 0));
+}
+
+bool
+vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
+{
+ /*
+ * Bail-out checks done as part of VM entry require an updated %rip to
+ * populate the vm_exit struct if any of the conditions of interest are
+ * matched in the check.
+ */
+ return (vcpu_bailout_checks(vm, vcpuid, true, rip));
+}
+
cpuset_t
vm_active_cpus(struct vm *vm)
{
return (vm->active_cpus);