13275 bhyve needs richer INIT/SIPI support
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>

*** 107,127 ****
   * (i) initialized when vcpu is created and when it is reinitialized
   * (o) initialized the first time the vcpu is created
   * (x) initialized before use
   */
  struct vcpu {
! 	struct mtx	mtx;		/* (o) protects 'state' and 'hostcpu' */
  	enum vcpu_state	state;		/* (o) vcpu state */
! #ifndef __FreeBSD__
  	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
  	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
- #endif /* __FreeBSD__ */
  	int		hostcpu;	/* (o) vcpu's current host cpu */
- #ifndef __FreeBSD__
  	int		lastloccpu;	/* (o) last host cpu localized to */
- #endif
- 	uint_t		runblock;	/* (i) block vcpu from run state */
  	int		reqidle;	/* (i) request vcpu to idle */
  	struct vlapic	*vlapic;	/* (i) APIC device model */
  	enum x2apic_state x2apic_state;	/* (i) APIC mode */
  	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
  	int		nmi_pending;	/* (i) NMI pending */
--- 107,125 ----
   * (i) initialized when vcpu is created and when it is reinitialized
   * (o) initialized the first time the vcpu is created
   * (x) initialized before use
   */
  struct vcpu {
! 	/* (o) protects state, run_state, hostcpu, sipi_vector */
! 	struct mtx	mtx;
! 	enum vcpu_state	state;		/* (o) vcpu state */
! 	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
  	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
  	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
  	int		hostcpu;	/* (o) vcpu's current host cpu */
  	int		lastloccpu;	/* (o) last host cpu localized to */
  	int		reqidle;	/* (i) request vcpu to idle */
  	struct vlapic	*vlapic;	/* (i) APIC device model */
  	enum x2apic_state x2apic_state;	/* (i) APIC mode */
  	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
  	int		nmi_pending;	/* (i) NMI pending */
*** 128,137 ****
--- 126,136 ----
  	int		extint_pending;	/* (i) INTR pending */
  	int		exception_pending; /* (i) exception pending */
  	int		exc_vector;	/* (x) exception collateral */
  	int		exc_errcode_valid;
  	uint32_t	exc_errcode;
+ 	uint8_t		sipi_vector;	/* (i) SIPI vector */
  	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
  	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
  	void		*stats;		/* (a,i) statistics */
  	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
  	uint64_t	nextrip;	/* (x) next instruction to execute */
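
For orientation: the new run_state field is driven by VRS_* flags that are defined in a header outside this webrev. A plausible sketch of their shape is below; the names and the VRS_IS_VALID() check appear in the code in this diff, but the exact bit values here are assumptions.

	/* Sketch of the run_state flag set; bit values are assumed. */
	enum vcpu_run_state {
		VRS_HALT	= 0,		/* powered off: waiting for INIT */
		VRS_INIT	= (1 << 0),	/* INIT taken: waiting for SIPI */
		VRS_RUN		= (1 << 1),	/* running guest instructions */

		VRS_PEND_INIT	= (1 << 14),	/* INIT latched, not yet processed */
		VRS_PEND_SIPI	= (1 << 15),	/* SIPI latched, not yet processed */
	};
	#define	VRS_MASK_VALID(v) \
		((v) & (VRS_INIT | VRS_RUN | VRS_PEND_INIT | VRS_PEND_SIPI))
	#define	VRS_IS_VALID(v)	((v) == VRS_MASK_VALID(v))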
*** 198,216 ****
  	uint16_t	cores;		/* (o) num of cores/socket */
  	uint16_t	threads;	/* (o) num of threads/core */
  	uint16_t	maxcpus;	/* (o) max pluggable cpus */
  	struct ioport_config ioports;	/* (o) ioport handling */
- 
- 	bool		sipi_req;	/* (i) SIPI requested */
- 	int		sipi_req_vcpu;	/* (i) SIPI destination */
- 	uint64_t	sipi_req_rip;	/* (i) SIPI start %rip */
- 
- 	/* Miscellaneous VM-wide statistics and counters */
- 	struct vm_wide_stats {
- 		uint64_t sipi_supersede;
- 	} stats;
  };
  
  static int vmm_initialized;
--- 197,206 ----
*** 247,258 ****
  #define	VMM_INIT(num)		((*ops->init)(num))
  #define	VMM_CLEANUP()		((*ops->cleanup)())
  #define	VMM_RESUME()		((*ops->resume)())
  
  #define	VMINIT(vm, pmap)	((*ops->vminit)(vm, pmap))
! #define	VMRUN(vmi, vcpu, rip, pmap, evinfo) \
! 	((*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo))
  #define	VMCLEANUP(vmi)		((*ops->vmcleanup)(vmi))
  
  #define	VMSPACE_ALLOC(min, max)	((*ops->vmspace_alloc)(min, max))
  #define	VMSPACE_FREE(vmspace)	((*ops->vmspace_free)(vmspace))
  #define	VMGETREG(vmi, vcpu, num, rv)	((*ops->vmgetreg)(vmi, vcpu, num, rv))
--- 237,248 ----
  #define	VMM_INIT(num)		((*ops->init)(num))
  #define	VMM_CLEANUP()		((*ops->cleanup)())
  #define	VMM_RESUME()		((*ops->resume)())
  
  #define	VMINIT(vm, pmap)	((*ops->vminit)(vm, pmap))
! #define	VMRUN(vmi, vcpu, rip, pmap) \
! 	((*ops->vmrun)(vmi, vcpu, rip, pmap))
  #define	VMCLEANUP(vmi)		((*ops->vmcleanup)(vmi))
  
  #define	VMSPACE_ALLOC(min, max)	((*ops->vmspace_alloc)(min, max))
  #define	VMSPACE_FREE(vmspace)	((*ops->vmspace_free)(vmspace))
  #define	VMGETREG(vmi, vcpu, num, rv)	((*ops->vmgetreg)(vmi, vcpu, num, rv))
*** 290,299 ****
--- 280,291 ----
  static int trace_guest_exceptions;
  
  static void vm_free_memmap(struct vm *vm, int ident);
  static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
  static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
+ static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
+ static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
  
  #ifndef __FreeBSD__
  static void vm_clear_memseg(struct vm *, int);
  
  /* Flags for vtc_status */
*** 368,380 ****
  	} else {
  		vie_reset(vcpu->vie_ctx);
  		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
  	}
  
  	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
  	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
- 	vcpu->runblock = 0;
  	vcpu->reqidle = 0;
  	vcpu->exitintinfo = 0;
  	vcpu->nmi_pending = 0;
  	vcpu->extint_pending = 0;
  	vcpu->exception_pending = 0;
--- 360,372 ----
  	} else {
  		vie_reset(vcpu->vie_ctx);
  		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
  	}
  
+ 	vcpu->run_state = VRS_HALT;
  	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
  	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
  	vcpu->reqidle = 0;
  	vcpu->exitintinfo = 0;
  	vcpu->nmi_pending = 0;
  	vcpu->extint_pending = 0;
  	vcpu->exception_pending = 0;
*** 1231,1241 ****
  	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
  }
  
  int
! vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
  {
  	if (vcpu < 0 || vcpu >= vm->maxcpus)
  		return (EINVAL);
  
  	if (!is_segment_register(reg) && !is_descriptor_table(reg))
--- 1223,1233 ----
  	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
  }
  
  int
! vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
  {
  	if (vcpu < 0 || vcpu >= vm->maxcpus)
  		return (EINVAL);
  
  	if (!is_segment_register(reg) && !is_descriptor_table(reg))
*** 1242,1251 ****
--- 1234,1286 ----
  		return (EINVAL);
  
  	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
  }
  
+ int
+ vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
+ {
+ 	struct vcpu *vcpu;
+ 
+ 	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
+ 		return (EINVAL);
+ 	}
+ 
+ 	vcpu = &vm->vcpu[vcpuid];
+ 
+ 	vcpu_lock(vcpu);
+ 	*state = vcpu->run_state;
+ 	*sipi_vec = vcpu->sipi_vector;
+ 	vcpu_unlock(vcpu);
+ 
+ 	return (0);
+ }
+ 
+ int
+ vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
+ {
+ 	struct vcpu *vcpu;
+ 
+ 	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
+ 		return (EINVAL);
+ 	}
+ 	if (!VRS_IS_VALID(state)) {
+ 		return (EINVAL);
+ 	}
+ 
+ 	vcpu = &vm->vcpu[vcpuid];
+ 
+ 	vcpu_lock(vcpu);
+ 	vcpu->run_state = state;
+ 	vcpu->sipi_vector = sipi_vec;
+ 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
+ 	vcpu_unlock(vcpu);
+ 
+ 	return (0);
+ }
+ 
  static void
  restore_guest_fpustate(struct vcpu *vcpu)
  {
  	/* flush host state to the pcb */
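
This accessor pair lets the kernel (and, via ioctl plumbing not shown in this hunk, userspace) read and force a vCPU's startup state. A minimal sketch of a hypothetical in-kernel caller, assuming the VRS_* values sketched earlier:

	uint32_t state;
	uint8_t vec;

	/* Park vCPU 1 in wait-for-SIPI, as though it had just taken an INIT. */
	VERIFY0(vm_set_run_state(vm, 1, VRS_INIT, 0));

	/* Read it back; the SIPI vector only matters once a SIPI is latched. */
	VERIFY0(vm_get_run_state(vm, 1, &state, &vec));
	ASSERT3U(state, ==, VRS_INIT);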
*** 1352,1371 ****
  	default:
  		error = 1;
  		break;
  	}
  
- 	if (newstate == VCPU_RUNNING) {
- 		while (vcpu->runblock != 0) {
- #ifdef __FreeBSD__
- 			msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
- #else
- 			cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
- #endif
- 		}
- 	}
- 
  	if (error)
  		return (EBUSY);
  
  	VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
  	    vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
--- 1387,1396 ----
*** 1374,1385 ****
  	if (newstate == VCPU_RUNNING)
  		vcpu->hostcpu = curcpu;
  	else
  		vcpu->hostcpu = NOCPU;
  
! 	if (newstate == VCPU_IDLE ||
! 	    (newstate == VCPU_FROZEN && vcpu->runblock != 0)) {
  #ifdef __FreeBSD__
  		wakeup(&vcpu->state);
  #else
  		cv_broadcast(&vcpu->state_cv);
  #endif
--- 1399,1409 ----
  	if (newstate == VCPU_RUNNING)
  		vcpu->hostcpu = curcpu;
  	else
  		vcpu->hostcpu = NOCPU;
  
! 	if (newstate == VCPU_IDLE) {
  #ifdef __FreeBSD__
  		wakeup(&vcpu->state);
  #else
  		cv_broadcast(&vcpu->state_cv);
  #endif
*** 1411,1426 ****
   */
  static int
  vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
  {
  	struct vcpu *vcpu;
- #ifdef __FreeBSD__
- 	const char *wmesg;
- #else
- 	const char *wmesg __unused;
- #endif
  	int t, vcpu_halted, vm_halted;
  
  	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
  
  	vcpu = &vm->vcpu[vcpuid];
  	vcpu_halted = 0;
--- 1435,1446 ----
   */
  static int
  vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
  {
  	struct vcpu *vcpu;
  	int t, vcpu_halted, vm_halted;
+ 	bool userspace_exit = false;
  
  	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
  
  	vcpu = &vm->vcpu[vcpuid];
  	vcpu_halted = 0;
*** 1427,1498 ****
  	vm_halted = 0;
  
  	vcpu_lock(vcpu);
  	while (1) {
  		/*
! 		 * Do a final check for pending NMI or interrupts before
! 		 * really putting this thread to sleep. Also check for
! 		 * software events that would cause this vcpu to wakeup.
! 		 *
! 		 * These interrupts/events could have happened after the
! 		 * vcpu returned from VMRUN() and before it acquired the
! 		 * vcpu lock above.
  		 */
- 		if (vm->suspend || vcpu->reqidle)
- 			break;
  		if (vm_nmi_pending(vm, vcpuid))
  			break;
  		if (!intr_disabled) {
  			if (vm_extint_pending(vm, vcpuid) ||
  			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
  				break;
  			}
  		}
  
! 		/* Don't go to sleep if the vcpu thread needs to yield */
! 		if (vcpu_should_yield(vm, vcpuid))
  			break;
  
- 		if (vcpu_debugged(vm, vcpuid))
- 			break;
- 
  		/*
  		 * Some Linux guests implement "halt" by having all vcpus
  		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
  		 * track of the vcpus that have entered this state. When all
  		 * vcpus enter the halted state the virtual machine is halted.
  		 */
  		if (intr_disabled) {
- 			wmesg = "vmhalt";
- 			VCPU_CTR0(vm, vcpuid, "Halted");
  			if (!vcpu_halted && halt_detection_enabled) {
  				vcpu_halted = 1;
  				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
  			}
  			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
  				vm_halted = 1;
  				break;
  			}
- 		} else {
- 			wmesg = "vmidle";
  		}
  
  		t = ticks;
  		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
- #ifdef __FreeBSD__
- 		/*
- 		 * XXX msleep_spin() cannot be interrupted by signals so
- 		 * wake up periodically to check pending signals.
- 		 */
- 		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
- #else
- 		/*
- 		 * Fortunately, cv_wait_sig can be interrupted by signals, so
- 		 * there is no need to periodically wake up.
- 		 */
  		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
- #endif
  		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
  		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
  	}
  
  	if (vcpu_halted)
--- 1447,1500 ----
  	vm_halted = 0;
  
  	vcpu_lock(vcpu);
  	while (1) {
  		/*
! 		 * Do a final check for pending interrupts (including NMI and
! 		 * INIT) before putting this thread to sleep.
  		 */
  		if (vm_nmi_pending(vm, vcpuid))
  			break;
+ 		if (vcpu_run_state_pending(vm, vcpuid))
+ 			break;
  		if (!intr_disabled) {
  			if (vm_extint_pending(vm, vcpuid) ||
  			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
  				break;
  			}
  		}
  
! 		/*
! 		 * Also check for software events which would cause a wake-up.
! 		 * This will set the appropriate exitcode directly, rather than
! 		 * requiring a trip through VM_RUN().
! 		 */
! 		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
! 			userspace_exit = true;
  			break;
+ 		}
  
  		/*
  		 * Some Linux guests implement "halt" by having all vcpus
  		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
  		 * track of the vcpus that have entered this state. When all
  		 * vcpus enter the halted state the virtual machine is halted.
  		 */
  		if (intr_disabled) {
  			if (!vcpu_halted && halt_detection_enabled) {
  				vcpu_halted = 1;
  				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
  			}
  			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
  				vm_halted = 1;
  				break;
  			}
  		}
  
  		t = ticks;
  		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
  		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
  		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
  		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
  	}
  
  	if (vcpu_halted)
*** 1501,1511 ****
  	vcpu_unlock(vcpu);
  
  	if (vm_halted)
  		vm_suspend(vm, VM_SUSPEND_HALT);
  
! 	return (0);
  }
  
  static int
  vm_handle_paging(struct vm *vm, int vcpuid)
  {
--- 1503,1513 ----
  	vcpu_unlock(vcpu);
  
  	if (vm_halted)
  		vm_suspend(vm, VM_SUSPEND_HALT);
  
! 	return (userspace_exit ? -1 : 0);
  }
  
  static int
  vm_handle_paging(struct vm *vm, int vcpuid)
  {
*** 1830,1839 ****
--- 1832,1897 ----
  	vcpu->reqidle = 0;
  	vcpu_unlock(vcpu);
  	return (-1);
  }
  
+ static int
+ vm_handle_run_state(struct vm *vm, int vcpuid)
+ {
+ 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
+ 	bool handled = false;
+ 
+ 	vcpu_lock(vcpu);
+ 	while (1) {
+ 		if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
+ 			vcpu_unlock(vcpu);
+ 			VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
+ 			vcpu_lock(vcpu);
+ 
+ 			vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
+ 			vcpu->run_state |= VRS_INIT;
+ 		}
+ 
+ 		if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
+ 		    (VRS_INIT | VRS_PEND_SIPI)) {
+ 			const uint8_t vector = vcpu->sipi_vector;
+ 
+ 			vcpu_unlock(vcpu);
+ 			VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
+ 			vcpu_lock(vcpu);
+ 
+ 			vcpu->run_state &= ~VRS_PEND_SIPI;
+ 			vcpu->run_state |= VRS_RUN;
+ 		}
+ 
+ 		/*
+ 		 * If the vCPU is now in the running state, there is no need to
+ 		 * wait for anything prior to re-entry.
+ 		 */
+ 		if ((vcpu->run_state & VRS_RUN) != 0) {
+ 			handled = true;
+ 			break;
+ 		}
+ 
+ 		/*
+ 		 * Also check for software events which would cause a wake-up.
+ 		 * This will set the appropriate exitcode directly, rather than
+ 		 * requiring a trip through VM_RUN().
+ 		 */
+ 		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
+ 			break;
+ 		}
+ 
+ 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
+ 		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
+ 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
+ 	}
+ 	vcpu_unlock(vcpu);
+ 
+ 	return (handled ? 0 : -1);
+ }
+ 
  #ifndef __FreeBSD__
  static int
  vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
  {
  	struct vcpu *cpu = &vm->vcpu[vcpuid];
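
Together with vm_inject_init() and vm_inject_sipi() (added further down), this loop walks run_state through the classic INIT-SIPI startup handshake. A sketch of the progression for one application processor:

	/*
	 * run_state progression for a typical AP startup (sketch):
	 *
	 *   VRS_HALT                       parked at vCPU creation
	 *   VRS_HALT | VRS_PEND_INIT       vm_inject_init() latches the INIT
	 *   VRS_INIT                       loop above: vcpu_arch_reset()
	 *   VRS_INIT | VRS_PEND_SIPI       vm_inject_sipi() latches the vector
	 *   VRS_INIT | VRS_RUN             loop above: vcpu_vector_sipi()
	 *
	 * Once VRS_RUN is set, the handler returns 0 and the vCPU re-enters
	 * the guest; until then it sleeps on vcpu_cv between events.
	 */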
*** 1848,1869 ****
  	return (-1);
  }
  #endif /* __FreeBSD__ */
  
- void
- vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip)
- {
- 	if (vm->sipi_req) {
- 		/* This should never occur if userspace is doing its job. */
- 		vm->stats.sipi_supersede++;
- 	}
- 	vm->sipi_req = true;
- 	vm->sipi_req_vcpu = req_vcpuid;
- 	vm->sipi_req_rip = req_rip;
- }
- 
  int
  vm_suspend(struct vm *vm, enum vm_suspend_how how)
  {
  	int i;
--- 1906,1915 ----
*** 1888,1958 ****
  	return (0);
  }
  
  void
! vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
  {
  	struct vm_exit *vmexit;
  
- 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
- 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
- 
  	vmexit = vm_exitinfo(vm, vcpuid);
  	vmexit->rip = rip;
  	vmexit->inst_length = 0;
! 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
! 	vmexit->u.suspended.how = vm->suspend;
  }
  
- void
- vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
- {
- 	struct vm_exit *vmexit;
- 
- 	vmexit = vm_exitinfo(vm, vcpuid);
- 	vmexit->rip = rip;
- 	vmexit->inst_length = 0;
- 	vmexit->exitcode = VM_EXITCODE_DEBUG;
- }
- 
- void
- vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip)
- {
- 	struct vm_exit *vmexit;
- 
- 	vmexit = vm_exitinfo(vm, vcpuid);
- 	vmexit->rip = rip;
- 	vmexit->inst_length = 0;
- 	vmexit->exitcode = VM_EXITCODE_RUNBLOCK;
- 	vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1);
- }
- 
- void
- vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
- {
- 	struct vm_exit *vmexit;
- 
- 	vmexit = vm_exitinfo(vm, vcpuid);
- 	vmexit->rip = rip;
- 	vmexit->inst_length = 0;
- 	vmexit->exitcode = VM_EXITCODE_REQIDLE;
- 	vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
- }
- 
- void
- vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
- {
- 	struct vm_exit *vmexit;
- 
- 	vmexit = vm_exitinfo(vm, vcpuid);
- 	vmexit->rip = rip;
- 	vmexit->inst_length = 0;
- 	vmexit->exitcode = VM_EXITCODE_BOGUS;
- 	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
- }
- 
  #ifndef __FreeBSD__
  /*
   * Some vmm resources, such as the lapic, may have CPU-specific resources
   * allocated to them which would benefit from migration onto the host CPU which
   * is processing the vcpu state.
--- 1934,1955 ----
  	return (0);
  }
  
  void
! vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
  {
  	struct vm_exit *vmexit;
  
  	vmexit = vm_exitinfo(vm, vcpuid);
  	vmexit->rip = rip;
  	vmexit->inst_length = 0;
! 	vmexit->exitcode = VM_EXITCODE_RUN_STATE;
! 	vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
  }
  
  #ifndef __FreeBSD__
  /*
   * Some vmm resources, such as the lapic, may have CPU-specific resources
   * allocated to them which would benefit from migration onto the host CPU which
   * is processing the vcpu state.
*** 2070,2080 ****
  	case VEC_DEFAULT:
  		return (0);
  	case VEC_DISCARD_INSTR:
  		vie_reset(vie);
  		return (0);
! 	case VEC_COMPLETE_MMIO:
  		err = vie_fulfill_mmio(vie, &entry->u.mmio);
  		if (err == 0) {
  			err = vie_emulate_mmio(vie, vm, vcpuid);
  			if (err == 0) {
  				vie_advance_pc(vie, &vcpu->nextrip);
--- 2067,2077 ----
  	case VEC_DEFAULT:
  		return (0);
  	case VEC_DISCARD_INSTR:
  		vie_reset(vie);
  		return (0);
! 	case VEC_FULFILL_MMIO:
  		err = vie_fulfill_mmio(vie, &entry->u.mmio);
  		if (err == 0) {
  			err = vie_emulate_mmio(vie, vm, vcpuid);
  			if (err == 0) {
  				vie_advance_pc(vie, &vcpu->nextrip);
*** 2089,2099 ****
  				vie_reset(vie);
  				err = 0;
  			}
  		}
  		break;
! 	case VEC_COMPLETE_INOUT:
  		err = vie_fulfill_inout(vie, &entry->u.inout);
  		if (err == 0) {
  			err = vie_emulate_inout(vie, vm, vcpuid);
  			if (err == 0) {
  				vie_advance_pc(vie, &vcpu->nextrip);
--- 2086,2096 ----
  				vie_reset(vie);
  				err = 0;
  			}
  		}
  		break;
! 	case VEC_FULFILL_INOUT:
  		err = vie_fulfill_inout(vie, &entry->u.inout);
  		if (err == 0) {
  			err = vie_emulate_inout(vie, vm, vcpuid);
  			if (err == 0) {
  				vie_advance_pc(vie, &vcpu->nextrip);
*** 2130,2158 ****
  		 */
  		vie_exitinfo(vie, vme);
  		return (-1);
  	}
  
- 	if (vcpuid == 0 && vm->sipi_req) {
- 		/* The boot vCPU has sent a SIPI to one of the other CPUs */
- 		vme->exitcode = VM_EXITCODE_SPINUP_AP;
- 		vme->u.spinup_ap.vcpu = vm->sipi_req_vcpu;
- 		vme->u.spinup_ap.rip = vm->sipi_req_rip;
- 
- 		vm->sipi_req = false;
- 		vm->sipi_req_vcpu = 0;
- 		vm->sipi_req_rip = 0;
- 		return (-1);
- 	}
- 
  	return (0);
  }
  
  int
  vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
  {
- 	struct vm_eventinfo evinfo;
  	int error;
  	struct vcpu *vcpu;
  #ifdef __FreeBSD__
  	struct pcb *pcb;
  #endif
--- 2127,2142 ----
*** 2175,2187 ****
  		return (EINVAL);
  
  	pmap = vmspace_pmap(vm->vmspace);
  	vcpu = &vm->vcpu[vcpuid];
  	vme = &vcpu->exitinfo;
- 	evinfo.rptr = &vcpu->runblock;
- 	evinfo.sptr = &vm->suspend;
- 	evinfo.iptr = &vcpu->reqidle;
  
  #ifndef __FreeBSD__
  	vtc.vtc_vm = vm;
  	vtc.vtc_vcpuid = vcpuid;
  	vtc.vtc_status = 0;
--- 2159,2168 ----
*** 2240,2250 ****
  	}
  	vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
  #endif
  
  	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
! 	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
  	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
  
  #ifdef __FreeBSD__
  	save_guest_fpustate(vcpu);
  #else
--- 2221,2231 ----
  	}
  	vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
  #endif
  
  	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
! 	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
  	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
  
  #ifdef __FreeBSD__
  	save_guest_fpustate(vcpu);
  #else
*** 2271,2289 ****
  	vcpu->nextrip = vme->rip + vme->inst_length;
  
  	switch (vme->exitcode) {
  	case VM_EXITCODE_REQIDLE:
  		error = vm_handle_reqidle(vm, vcpuid);
  		break;
  	case VM_EXITCODE_SUSPENDED:
  		error = vm_handle_suspend(vm, vcpuid);
  		break;
  	case VM_EXITCODE_IOAPIC_EOI:
  		vioapic_process_eoi(vm, vcpuid, vme->u.ioapic_eoi.vector);
  		break;
- 	case VM_EXITCODE_RUNBLOCK:
- 		break;
  	case VM_EXITCODE_HLT:
  		intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
  		error = vm_handle_hlt(vm, vcpuid, intr_disabled);
  		break;
  	case VM_EXITCODE_PAGING:
--- 2252,2271 ----
  	vcpu->nextrip = vme->rip + vme->inst_length;
  
  	switch (vme->exitcode) {
  	case VM_EXITCODE_REQIDLE:
  		error = vm_handle_reqidle(vm, vcpuid);
  		break;
+ 	case VM_EXITCODE_RUN_STATE:
+ 		error = vm_handle_run_state(vm, vcpuid);
+ 		break;
  	case VM_EXITCODE_SUSPENDED:
  		error = vm_handle_suspend(vm, vcpuid);
  		break;
  	case VM_EXITCODE_IOAPIC_EOI:
  		vioapic_process_eoi(vm, vcpuid, vme->u.ioapic_eoi.vector);
  		break;
  	case VM_EXITCODE_HLT:
  		intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
  		error = vm_handle_hlt(vm, vcpuid, intr_disabled);
  		break;
  	case VM_EXITCODE_PAGING:
*** 2790,2799 ****
--- 2772,2971 ----
  	vcpu->extint_pending = 0;
  	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
  }
  
  int
+ vm_inject_init(struct vm *vm, int vcpuid)
+ {
+ 	struct vcpu *vcpu;
+ 
+ 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+ 		return (EINVAL);
+ 
+ 	vcpu = &vm->vcpu[vcpuid];
+ 	vcpu_lock(vcpu);
+ 	vcpu->run_state |= VRS_PEND_INIT;
+ 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
+ 	vcpu_unlock(vcpu);
+ 	return (0);
+ }
+ 
+ int
+ vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
+ {
+ 	struct vcpu *vcpu;
+ 
+ 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+ 		return (EINVAL);
+ 
+ 	vcpu = &vm->vcpu[vcpuid];
+ 	vcpu_lock(vcpu);
+ 	vcpu->run_state |= VRS_PEND_SIPI;
+ 	vcpu->sipi_vector = vector;
+ 	/* SIPI is only actionable if the CPU is waiting in INIT state */
+ 	if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
+ 		vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
+ 	}
+ 	vcpu_unlock(vcpu);
+ 	return (0);
+ }
+ 
+ bool
+ vcpu_run_state_pending(struct vm *vm, int vcpuid)
+ {
+ 	struct vcpu *vcpu;
+ 
+ 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
+ 	vcpu = &vm->vcpu[vcpuid];
+ 
+ 	/* Of interest: vCPU not in running state or with pending INIT */
+ 	return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
+ }
+ 
+ int
+ vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
+ {
+ 	struct seg_desc desc;
+ 	const enum vm_reg_name clear_regs[] = {
+ 		VM_REG_GUEST_CR2,
+ 		VM_REG_GUEST_CR3,
+ 		VM_REG_GUEST_CR4,
+ 		VM_REG_GUEST_RAX,
+ 		VM_REG_GUEST_RBX,
+ 		VM_REG_GUEST_RCX,
+ 		VM_REG_GUEST_RSI,
+ 		VM_REG_GUEST_RDI,
+ 		VM_REG_GUEST_RBP,
+ 		VM_REG_GUEST_RSP,
+ 		VM_REG_GUEST_R8,
+ 		VM_REG_GUEST_R9,
+ 		VM_REG_GUEST_R10,
+ 		VM_REG_GUEST_R11,
+ 		VM_REG_GUEST_R12,
+ 		VM_REG_GUEST_R13,
+ 		VM_REG_GUEST_R14,
+ 		VM_REG_GUEST_R15,
+ 		VM_REG_GUEST_DR0,
+ 		VM_REG_GUEST_DR1,
+ 		VM_REG_GUEST_DR2,
+ 		VM_REG_GUEST_DR3,
+ 		VM_REG_GUEST_EFER,
+ 	};
+ 	const enum vm_reg_name data_segs[] = {
+ 		VM_REG_GUEST_SS,
+ 		VM_REG_GUEST_DS,
+ 		VM_REG_GUEST_ES,
+ 		VM_REG_GUEST_FS,
+ 		VM_REG_GUEST_GS,
+ 	};
+ 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
+ 
+ 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+ 		return (EINVAL);
+ 
+ 	for (uint_t i = 0; i < nitems(clear_regs); i++) {
+ 		VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
+ 	}
+ 
+ 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
+ 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
+ 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
+ 
+ 	/*
+ 	 * The prescribed contents of %rdx differ slightly between the Intel
+ 	 * and AMD architectural definitions.  The former expects the Extended
+ 	 * Model in bits 16-19 where the latter expects all of the Family,
+ 	 * Model, and Stepping to be there.  Common boot ROMs appear to
+ 	 * disregard this anyways, so we stick with a compromise value similar
+ 	 * to what is spelled out in the Intel SDM.
+ 	 */
+ 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
+ 
+ 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
+ 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
+ 
+ 	/* CS: Present, R/W, Accessed */
+ 	desc.access = 0x0093;
+ 	desc.base = 0xffff0000;
+ 	desc.limit = 0xffff;
+ 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
+ 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
+ 
+ 	/* SS, DS, ES, FS, GS: Present, R/W, Accessed */
+ 	desc.access = 0x0093;
+ 	desc.base = 0;
+ 	desc.limit = 0xffff;
+ 	for (uint_t i = 0; i < nitems(data_segs); i++) {
+ 		VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
+ 		VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
+ 	}
+ 
+ 	/* GDTR, IDTR */
+ 	desc.base = 0;
+ 	desc.limit = 0xffff;
+ 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
+ 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
+ 
+ 	/* LDTR: Present, LDT */
+ 	desc.access = 0x0082;
+ 	desc.base = 0;
+ 	desc.limit = 0xffff;
+ 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
+ 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
+ 
+ 	/* TR: Present, 32-bit TSS */
+ 	desc.access = 0x008b;
+ 	desc.base = 0;
+ 	desc.limit = 0xffff;
+ 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
+ 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
+ 
+ 	vlapic_reset(vm_lapic(vm, vcpuid));
+ 
+ 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
+ 
+ 	vcpu->exitintinfo = 0;
+ 	vcpu->exception_pending = 0;
+ 	vcpu->nmi_pending = 0;
+ 	vcpu->extint_pending = 0;
+ 
+ 	/*
+ 	 * A CPU reset caused by power-on or system reset clears more state
+ 	 * than one which is triggered from an INIT IPI.
+ 	 */
+ 	if (!init_only) {
+ 		vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
+ 		fpu_save_area_reset(vcpu->guestfpu);
+ 
+ 		/* XXX: clear MSRs and other pieces */
+ 	}
+ 
+ 	return (0);
+ }
+ 
+ static int
+ vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
+ {
+ 	struct seg_desc desc;
+ 
+ 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+ 		return (EINVAL);
+ 
+ 	/* CS: Present, R/W, Accessed */
+ 	desc.access = 0x0093;
+ 	desc.base = (uint64_t)vector << 12;
+ 	desc.limit = 0xffff;
+ 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
+ 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
+ 	    (uint64_t)vector << 8));
+ 
+ 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
+ 
+ 	return (0);
+ }
+ 
+ int
  vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
  {
  	if (vcpu < 0 || vcpu >= vm->maxcpus)
  		return (EINVAL);
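
As a worked example of the arithmetic in vcpu_vector_sipi(): the architectural SIPI start address is vector * 4 KiB, which the code expresses through real-mode segmentation (base == selector * 16). For a hypothetical vector of 0x9f:

	uint8_t vector = 0x9f;
	uint64_t cs_base = (uint64_t)vector << 12;	/* 0x9f000 */
	uint64_t cs_sel = (uint64_t)vector << 8;	/* 0x9f00; base == sel * 16 */
	uint64_t rip = 0;

	/* First fetch is at CS.base + %rip == 0x9f000 == vector * 4096. */
	ASSERT3U(cs_base + rip, ==, (uint64_t)vector * 4096);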
*** 2892,2902 ****
  {
  	int error;
  	struct vcpu *vcpu;
  
  	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
! 		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
  
  	vcpu = &vm->vcpu[vcpuid];
  
  	vcpu_lock(vcpu);
  	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
--- 3064,3074 ----
  {
  	int error;
  	struct vcpu *vcpu;
  
  	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
! 		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
  
  	vcpu = &vm->vcpu[vcpuid];
  
  	vcpu_lock(vcpu);
  	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
*** 2910,2920 ****
  {
  	struct vcpu *vcpu;
  	enum vcpu_state state;
  
  	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
! 		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
  
  	vcpu = &vm->vcpu[vcpuid];
  
  	vcpu_lock(vcpu);
  	state = vcpu->state;
--- 3082,3092 ----
  {
  	struct vcpu *vcpu;
  	enum vcpu_state state;
  
  	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
! 		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
  
  	vcpu = &vm->vcpu[vcpuid];
  
  	vcpu_lock(vcpu);
  	state = vcpu->state;
*** 2923,2980 ****
  	vcpu_unlock(vcpu);
  
  	return (state);
  }
  
- void
- vcpu_block_run(struct vm *vm, int vcpuid)
- {
- 	struct vcpu *vcpu;
- 
- 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
- 		panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
- 
- 	vcpu = &vm->vcpu[vcpuid];
- 
- 	vcpu_lock(vcpu);
- 	vcpu->runblock++;
- 	if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) {
- 		vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
- 	}
- 	while (vcpu->state == VCPU_RUNNING) {
- #ifdef __FreeBSD__
- 		msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
- #else
- 		cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
- #endif
- 	}
- 	vcpu_unlock(vcpu);
- }
- 
- void
- vcpu_unblock_run(struct vm *vm, int vcpuid)
- {
- 	struct vcpu *vcpu;
- 
- 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
- 		panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
- 
- 	vcpu = &vm->vcpu[vcpuid];
- 
- 	vcpu_lock(vcpu);
- 	KASSERT(vcpu->runblock != 0, ("expected non-zero runblock"));
- 	vcpu->runblock--;
- 	if (vcpu->runblock == 0) {
- #ifdef __FreeBSD__
- 		wakeup(&vcpu->state);
- #else
- 		cv_broadcast(&vcpu->state_cv);
- #endif
- 	}
- 	vcpu_unlock(vcpu);
- }
- 
  #ifndef __FreeBSD__
  uint64_t
  vcpu_tsc_offset(struct vm *vm, int vcpuid)
  {
  	return (vm->vcpu[vcpuid].tsc_offset);
--- 3095,3104 ----
*** 3036,3052 ****
  		CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
  	}
  
  	return (0);
  }
  
! int
! vcpu_debugged(struct vm *vm, int vcpuid)
  {
! 	return (CPU_ISSET(vcpuid, &vm->debug_cpus));
  }
  
  cpuset_t
  vm_active_cpus(struct vm *vm)
  {
  	return (vm->active_cpus);
--- 3160,3258 ----
  		CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
  	}
  
  	return (0);
  }
  
! static bool
! vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
!     uint64_t entry_rip)
  {
+ 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
+ 	struct vm_exit *vme = &vcpu->exitinfo;
+ 	bool bail = false;
  
! 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
! 
! 	if (vm->suspend) {
! 		if (on_entry) {
! 			VERIFY(vm->suspend > VM_SUSPEND_NONE &&
! 			    vm->suspend < VM_SUSPEND_LAST);
! 
! 			vme->exitcode = VM_EXITCODE_SUSPENDED;
! 			vme->u.suspended.how = vm->suspend;
! 		} else {
! 			/*
! 			 * Handling VM suspend is complicated, so if that
! 			 * condition is detected outside of VM-entry itself,
! 			 * just emit a BOGUS exitcode so we take a lap to pick
! 			 * up the event during an entry and are directed into
! 			 * the vm_handle_suspend() logic.
! 			 */
! 			vme->exitcode = VM_EXITCODE_BOGUS;
! 		}
! 		bail = true;
! 	}
! 	if (vcpu->reqidle) {
! 		vme->exitcode = VM_EXITCODE_REQIDLE;
! 		vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
! 
! 		if (!on_entry) {
! 			/*
! 			 * A reqidle request detected outside of VM-entry can
! 			 * be handled directly by clearing the request (and
! 			 * taking a lap to userspace).
! 			 */
! 			vcpu_assert_locked(vcpu);
! 			vcpu->reqidle = 0;
! 		}
! 		bail = true;
! 	}
! 	if (vcpu_should_yield(vm, vcpuid)) {
! 		vme->exitcode = VM_EXITCODE_BOGUS;
! 		vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
! 		bail = true;
! 	}
! 	if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
! 		vme->exitcode = VM_EXITCODE_DEBUG;
! 		bail = true;
! 	}
! 
! 	if (bail) {
! 		if (on_entry) {
! 			/*
! 			 * If bailing out during VM-entry, the current %rip
! 			 * must be recorded in the exitinfo.
! 			 */
! 			vme->rip = entry_rip;
! 		}
! 		vme->inst_length = 0;
! 	}
! 	return (bail);
  }
  
+ static bool
+ vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
+ {
+ 	/*
+ 	 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT
+ 	 * or wait-for-SIPI) expect that %rip is already populated in the
+ 	 * vm_exit structure, so we would only modify the exitcode.
+ 	 */
+ 	return (vcpu_bailout_checks(vm, vcpuid, false, 0));
+ }
+ 
+ bool
+ vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
+ {
+ 	/*
+ 	 * Bail-out checks done as part of VM entry require an updated %rip to
+ 	 * populate the vm_exit struct if any of the conditions of interest
+ 	 * are matched in the check.
+ 	 */
+ 	return (vcpu_bailout_checks(vm, vcpuid, true, rip));
+ }
+ 
  cpuset_t
  vm_active_cpus(struct vm *vm)
  {
  	return (vm->active_cpus);