13275 bhyve needs richer INIT/SIPI support
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>

@@ -107,21 +107,19 @@
  * (i) initialized when vcpu is created and when it is reinitialized
  * (o) initialized the first time the vcpu is created
  * (x) initialized before use
  */
 struct vcpu {
-        struct mtx      mtx;            /* (o) protects 'state' and 'hostcpu' */
+        /* (o) protects state, run_state, hostcpu, sipi_vector */
+        struct mtx      mtx;
+
         enum vcpu_state state;          /* (o) vcpu state */
-#ifndef __FreeBSD__
+        enum vcpu_run_state run_state;  /* (i) vcpu init/sipi/run state */
         kcondvar_t      vcpu_cv;        /* (o) cpu waiter cv */
         kcondvar_t      state_cv;       /* (o) IDLE-transition cv */
-#endif /* __FreeBSD__ */
         int             hostcpu;        /* (o) vcpu's current host cpu */
-#ifndef __FreeBSD__
         int             lastloccpu;     /* (o) last host cpu localized to */
-#endif
-        uint_t          runblock;       /* (i) block vcpu from run state */
         int             reqidle;        /* (i) request vcpu to idle */
         struct vlapic   *vlapic;        /* (i) APIC device model */
         enum x2apic_state x2apic_state; /* (i) APIC mode */
         uint64_t        exitintinfo;    /* (i) events pending at VM exit */
         int             nmi_pending;    /* (i) NMI pending */

@@ -128,10 +126,11 @@
         int             extint_pending; /* (i) INTR pending */
         int     exception_pending;      /* (i) exception pending */
         int     exc_vector;             /* (x) exception collateral */
         int     exc_errcode_valid;
         uint32_t exc_errcode;
+        uint8_t         sipi_vector;    /* (i) SIPI vector */
         struct savefpu  *guestfpu;      /* (a,i) guest fpu state */
         uint64_t        guest_xcr0;     /* (i) guest %xcr0 register */
         void            *stats;         /* (a,i) statistics */
         struct vm_exit  exitinfo;       /* (x) exit reason and collateral */
         uint64_t        nextrip;        /* (x) next instruction to execute */

@@ -198,19 +197,10 @@
         uint16_t        cores;                  /* (o) num of cores/socket */
         uint16_t        threads;                /* (o) num of threads/core */
         uint16_t        maxcpus;                /* (o) max pluggable cpus */
 
         struct ioport_config ioports;           /* (o) ioport handling */
-
-        bool            sipi_req;               /* (i) SIPI requested */
-        int             sipi_req_vcpu;          /* (i) SIPI destination */
-        uint64_t        sipi_req_rip;           /* (i) SIPI start %rip */
-
-        /* Miscellaneous VM-wide statistics and counters */
-        struct vm_wide_stats {
-                uint64_t sipi_supersede;
-        } stats;
 };
 
 static int vmm_initialized;
 
 

@@ -247,12 +237,12 @@
 #define VMM_INIT(num)                   ((*ops->init)(num))
 #define VMM_CLEANUP()                   ((*ops->cleanup)())
 #define VMM_RESUME()                    ((*ops->resume)())
 
 #define VMINIT(vm, pmap)                ((*ops->vminit)(vm, pmap))
-#define VMRUN(vmi, vcpu, rip, pmap, evinfo) \
-        ((*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo))
+#define VMRUN(vmi, vcpu, rip, pmap) \
+        ((*ops->vmrun)(vmi, vcpu, rip, pmap))
 #define VMCLEANUP(vmi)                  ((*ops->vmcleanup)(vmi))
 #define VMSPACE_ALLOC(min, max)         ((*ops->vmspace_alloc)(min, max))
 #define VMSPACE_FREE(vmspace)           ((*ops->vmspace_free)(vmspace))
 
 #define VMGETREG(vmi, vcpu, num, rv)    ((*ops->vmgetreg)(vmi, vcpu, num, rv))

@@ -290,10 +280,12 @@
 static int trace_guest_exceptions;
 
 static void vm_free_memmap(struct vm *vm, int ident);
 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
+static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
+static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
 
 #ifndef __FreeBSD__
 static void vm_clear_memseg(struct vm *, int);
 
 /* Flags for vtc_status */

@@ -368,13 +360,13 @@
         } else {
                 vie_reset(vcpu->vie_ctx);
                 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
         }
 
+        vcpu->run_state = VRS_HALT;
         vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
         vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
-        vcpu->runblock = 0;
         vcpu->reqidle = 0;
         vcpu->exitintinfo = 0;
         vcpu->nmi_pending = 0;
         vcpu->extint_pending = 0;
         vcpu->exception_pending = 0;

@@ -1231,11 +1223,11 @@
 
         return (VMGETDESC(vm->cookie, vcpu, reg, desc));
 }
 
 int
-vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
+vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
 {
         if (vcpu < 0 || vcpu >= vm->maxcpus)
                 return (EINVAL);
 
         if (!is_segment_register(reg) && !is_descriptor_table(reg))

@@ -1242,10 +1234,53 @@
                 return (EINVAL);
 
         return (VMSETDESC(vm->cookie, vcpu, reg, desc));
 }
 
+int
+vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
+{
+        struct vcpu *vcpu;
+
+        if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
+                return (EINVAL);
+        }
+
+        vcpu = &vm->vcpu[vcpuid];
+
+        vcpu_lock(vcpu);
+        *state = vcpu->run_state;
+        *sipi_vec = vcpu->sipi_vector;
+        vcpu_unlock(vcpu);
+
+        return (0);
+}
+
+int
+vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
+{
+        struct vcpu *vcpu;
+
+        if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
+                return (EINVAL);
+        }
+        if (!VRS_IS_VALID(state)) {
+                return (EINVAL);
+        }
+
+        vcpu = &vm->vcpu[vcpuid];
+
+        vcpu_lock(vcpu);
+        vcpu->run_state = state;
+        vcpu->sipi_vector = sipi_vec;
+        vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
+        vcpu_unlock(vcpu);
+
+        return (0);
+}
+
+
 static void
 restore_guest_fpustate(struct vcpu *vcpu)
 {
 
         /* flush host state to the pcb */
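
The vm_get_run_state()/vm_set_run_state() pair added above is the coarse interface for inspecting or forcing a vCPU's INIT/SIPI progress from outside the vCPU thread (for instance an ioctl handler saving or restoring guest state). A minimal sketch of a caller follows; it assumes the VRS_* flags from the companion header change, and the helper name is purely illustrative.

/*
 * Illustrative only -- not part of this diff.  Park an AP back into the
 * wait-for-SIPI state and read the result back via the interfaces added
 * above.  VRS_INIT is defined in the companion header change.
 */
static int
example_park_ap(struct vm *vm, int vcpuid)
{
        uint32_t state;
        uint8_t vec;
        int error;

        /* INIT has been serviced, no SIPI accepted yet. */
        error = vm_set_run_state(vm, vcpuid, VRS_INIT, 0);
        if (error != 0)
                return (error);

        /* 'vec' is only meaningful once a SIPI has been latched. */
        return (vm_get_run_state(vm, vcpuid, &state, &vec));
}

Because vm_set_run_state() posts VCPU_NOTIFY_EXIT, a vCPU currently in the guest will exit and pick up the new state before re-entry.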

@@ -1352,20 +1387,10 @@
         default:
                 error = 1;
                 break;
         }
 
-        if (newstate == VCPU_RUNNING) {
-                while (vcpu->runblock != 0) {
-#ifdef __FreeBSD__
-                        msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
-#else
-                        cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
-#endif
-                }
-        }
-
         if (error)
                 return (EBUSY);
 
         VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
             vcpu_state2str(vcpu->state), vcpu_state2str(newstate));

@@ -1374,12 +1399,11 @@
         if (newstate == VCPU_RUNNING)
                 vcpu->hostcpu = curcpu;
         else
                 vcpu->hostcpu = NOCPU;
 
-        if (newstate == VCPU_IDLE ||
-            (newstate == VCPU_FROZEN && vcpu->runblock != 0)) {
+        if (newstate == VCPU_IDLE) {
 #ifdef __FreeBSD__
                 wakeup(&vcpu->state);
 #else
                 cv_broadcast(&vcpu->state_cv);
 #endif

@@ -1411,16 +1435,12 @@
  */
 static int
 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
 {
         struct vcpu *vcpu;
-#ifdef __FreeBSD__
-        const char *wmesg;
-#else
-        const char *wmesg __unused;
-#endif
         int t, vcpu_halted, vm_halted;
+        bool userspace_exit = false;
 
         KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
 
         vcpu = &vm->vcpu[vcpuid];
         vcpu_halted = 0;

@@ -1427,72 +1447,54 @@
         vm_halted = 0;
 
         vcpu_lock(vcpu);
         while (1) {
                 /*
-                 * Do a final check for pending NMI or interrupts before
-                 * really putting this thread to sleep. Also check for
-                 * software events that would cause this vcpu to wakeup.
-                 *
-                 * These interrupts/events could have happened after the
-                 * vcpu returned from VMRUN() and before it acquired the
-                 * vcpu lock above.
+                 * Do a final check for pending interrupts (including NMI and
+                 * INIT) before putting this thread to sleep.
                  */
-                if (vm->suspend || vcpu->reqidle)
-                        break;
                 if (vm_nmi_pending(vm, vcpuid))
                         break;
+                if (vcpu_run_state_pending(vm, vcpuid))
+                        break;
                 if (!intr_disabled) {
                         if (vm_extint_pending(vm, vcpuid) ||
                             vlapic_pending_intr(vcpu->vlapic, NULL)) {
                                 break;
                         }
                 }
 
-                /* Don't go to sleep if the vcpu thread needs to yield */
-                if (vcpu_should_yield(vm, vcpuid))
+                /*
+                 * Also check for software events which would cause a wake-up.
+                 * This will set the appropriate exitcode directly, rather than
+                 * requiring a trip through VM_RUN().
+                 */
+                if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
+                        userspace_exit = true;
                         break;
+                }
 
-                if (vcpu_debugged(vm, vcpuid))
-                        break;
-
                 /*
                  * Some Linux guests implement "halt" by having all vcpus
                  * execute HLT with interrupts disabled. 'halted_cpus' keeps
                  * track of the vcpus that have entered this state. When all
                  * vcpus enter the halted state the virtual machine is halted.
                  */
                 if (intr_disabled) {
-                        wmesg = "vmhalt";
-                        VCPU_CTR0(vm, vcpuid, "Halted");
                         if (!vcpu_halted && halt_detection_enabled) {
                                 vcpu_halted = 1;
                                 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
                         }
                         if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
                                 vm_halted = 1;
                                 break;
                         }
-                } else {
-                        wmesg = "vmidle";
                 }
 
                 t = ticks;
                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
-#ifdef __FreeBSD__
-                /*
-                 * XXX msleep_spin() cannot be interrupted by signals so
-                 * wake up periodically to check pending signals.
-                 */
-                msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
-#else
-                /*
-                 * Fortunately, cv_wait_sig can be interrupted by signals, so
-                 * there is no need to periodically wake up.
-                 */
                 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
-#endif
                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
                 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
         }
 
         if (vcpu_halted)

@@ -1501,11 +1503,11 @@
         vcpu_unlock(vcpu);
 
         if (vm_halted)
                 vm_suspend(vm, VM_SUSPEND_HALT);
 
-        return (0);
+        return (userspace_exit ? -1 : 0);
 }
 
 static int
 vm_handle_paging(struct vm *vm, int vcpuid)
 {

@@ -1830,10 +1832,66 @@
         vcpu->reqidle = 0;
         vcpu_unlock(vcpu);
         return (-1);
 }
 
+static int
+vm_handle_run_state(struct vm *vm, int vcpuid)
+{
+        struct vcpu *vcpu = &vm->vcpu[vcpuid];
+        bool handled = false;
+
+        vcpu_lock(vcpu);
+        while (1) {
+                if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
+                        vcpu_unlock(vcpu);
+                        VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
+                        vcpu_lock(vcpu);
+
+                        vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
+                        vcpu->run_state |= VRS_INIT;
+                }
+
+                if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
+                    (VRS_INIT | VRS_PEND_SIPI)) {
+                        const uint8_t vector = vcpu->sipi_vector;
+
+                        vcpu_unlock(vcpu);
+                        VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
+                        vcpu_lock(vcpu);
+
+                        vcpu->run_state &= ~VRS_PEND_SIPI;
+                        vcpu->run_state |= VRS_RUN;
+                }
+
+                /*
+                 * If the vCPU is now in the running state, there is no need to
+                 * wait for anything prior to re-entry.
+                 */
+                if ((vcpu->run_state & VRS_RUN) != 0) {
+                        handled = true;
+                        break;
+                }
+
+                /*
+                 * Also check for software events which would cause a wake-up.
+                 * This will set the appropriate exitcode directly, rather than
+                 * requiring a trip through VM_RUN().
+                 */
+                if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
+                        break;
+                }
+
+                vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
+                (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
+                vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
+        }
+        vcpu_unlock(vcpu);
+
+        return (handled ? 0 : -1);
+}
+
 #ifndef __FreeBSD__
 static int
 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
 {
         struct vcpu *cpu = &vm->vcpu[vcpuid];
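
vm_handle_run_state() above is where a parked vCPU consumes latched INIT and SIPI events. As a reading aid, here is the expected sequence from the sending side; vm_inject_init() and vm_inject_sipi() are added later in this diff, while the wrapper function itself is illustrative only.

/*
 * Illustrative only -- not part of this diff.  Expected run_state
 * progression when an AP is started:
 *
 *   after vcpu_init():         VRS_HALT
 *   vm_inject_init():          VRS_PEND_INIT latched
 *   vm_handle_run_state():     VRS_INIT            (after vcpu_arch_reset())
 *   vm_inject_sipi():          VRS_PEND_SIPI latched, sipi_vector recorded
 *   vm_handle_run_state():     VRS_INIT | VRS_RUN  (after vcpu_vector_sipi())
 */
static void
example_start_ap(struct vm *vm, int ap_vcpuid, uint8_t vector)
{
        /*
         * In practice these would be called from the interrupt-delivery
         * path (e.g. ICR emulation); they are called directly here only
         * for illustration.
         */
        (void) vm_inject_init(vm, ap_vcpuid);
        (void) vm_inject_sipi(vm, ap_vcpuid, vector);
}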

@@ -1848,22 +1906,10 @@
 
         return (-1);
 }
 #endif /* __FreeBSD__ */
 
-void
-vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip)
-{
-        if (vm->sipi_req) {
-                /* This should never occur if userspace is doing its job. */
-                vm->stats.sipi_supersede++;
-        }
-        vm->sipi_req = true;
-        vm->sipi_req_vcpu = req_vcpuid;
-        vm->sipi_req_rip = req_rip;
-}
-
 int
 vm_suspend(struct vm *vm, enum vm_suspend_how how)
 {
         int i;
 

@@ -1888,71 +1934,22 @@
 
         return (0);
 }
 
 void
-vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
+vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
 {
         struct vm_exit *vmexit;
 
-        KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
-            ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
-
         vmexit = vm_exitinfo(vm, vcpuid);
         vmexit->rip = rip;
         vmexit->inst_length = 0;
-        vmexit->exitcode = VM_EXITCODE_SUSPENDED;
-        vmexit->u.suspended.how = vm->suspend;
+        vmexit->exitcode = VM_EXITCODE_RUN_STATE;
+        vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
 }
 
-void
-vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
-{
-        struct vm_exit *vmexit;
 
-        vmexit = vm_exitinfo(vm, vcpuid);
-        vmexit->rip = rip;
-        vmexit->inst_length = 0;
-        vmexit->exitcode = VM_EXITCODE_DEBUG;
-}
-
-void
-vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip)
-{
-        struct vm_exit *vmexit;
-
-        vmexit = vm_exitinfo(vm, vcpuid);
-        vmexit->rip = rip;
-        vmexit->inst_length = 0;
-        vmexit->exitcode = VM_EXITCODE_RUNBLOCK;
-        vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1);
-}
-
-void
-vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
-{
-        struct vm_exit *vmexit;
-
-        vmexit = vm_exitinfo(vm, vcpuid);
-        vmexit->rip = rip;
-        vmexit->inst_length = 0;
-        vmexit->exitcode = VM_EXITCODE_REQIDLE;
-        vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
-}
-
-void
-vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
-{
-        struct vm_exit *vmexit;
-
-        vmexit = vm_exitinfo(vm, vcpuid);
-        vmexit->rip = rip;
-        vmexit->inst_length = 0;
-        vmexit->exitcode = VM_EXITCODE_BOGUS;
-        vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
-}
-
 #ifndef __FreeBSD__
 /*
  * Some vmm resources, such as the lapic, may have CPU-specific resources
  * allocated to them which would benefit from migration onto the host CPU which
  * is processing the vcpu state.

@@ -2070,11 +2067,11 @@
         case VEC_DEFAULT:
                 return (0);
         case VEC_DISCARD_INSTR:
                 vie_reset(vie);
                 return (0);
-        case VEC_COMPLETE_MMIO:
+        case VEC_FULFILL_MMIO:
                 err = vie_fulfill_mmio(vie, &entry->u.mmio);
                 if (err == 0) {
                         err = vie_emulate_mmio(vie, vm, vcpuid);
                         if (err == 0) {
                                 vie_advance_pc(vie, &vcpu->nextrip);

@@ -2089,11 +2086,11 @@
                                 vie_reset(vie);
                                 err = 0;
                         }
                 }
                 break;
-        case VEC_COMPLETE_INOUT:
+        case VEC_FULFILL_INOUT:
                 err = vie_fulfill_inout(vie, &entry->u.inout);
                 if (err == 0) {
                         err = vie_emulate_inout(vie, vm, vcpuid);
                         if (err == 0) {
                                 vie_advance_pc(vie, &vcpu->nextrip);

@@ -2130,29 +2127,16 @@
                  */
                 vie_exitinfo(vie, vme);
                 return (-1);
         }
 
-        if (vcpuid == 0 && vm->sipi_req) {
-                /* The boot vCPU has sent a SIPI to one of the other CPUs */
-                vme->exitcode = VM_EXITCODE_SPINUP_AP;
-                vme->u.spinup_ap.vcpu = vm->sipi_req_vcpu;
-                vme->u.spinup_ap.rip = vm->sipi_req_rip;
-
-                vm->sipi_req = false;
-                vm->sipi_req_vcpu = 0;
-                vm->sipi_req_rip = 0;
-                return (-1);
-        }
-
         return (0);
 }
 
 int
 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
 {
-        struct vm_eventinfo evinfo;
         int error;
         struct vcpu *vcpu;
 #ifdef  __FreeBSD__
         struct pcb *pcb;
 #endif

@@ -2175,13 +2159,10 @@
                 return (EINVAL);
 
         pmap = vmspace_pmap(vm->vmspace);
         vcpu = &vm->vcpu[vcpuid];
         vme = &vcpu->exitinfo;
-        evinfo.rptr = &vcpu->runblock;
-        evinfo.sptr = &vm->suspend;
-        evinfo.iptr = &vcpu->reqidle;
 
 #ifndef __FreeBSD__
         vtc.vtc_vm = vm;
         vtc.vtc_vcpuid = vcpuid;
         vtc.vtc_status = 0;

@@ -2240,11 +2221,11 @@
         }
         vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
 #endif
 
         vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
-        error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
+        error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
         vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
 
 #ifdef  __FreeBSD__
         save_guest_fpustate(vcpu);
 #else

@@ -2271,19 +2252,20 @@
         vcpu->nextrip = vme->rip + vme->inst_length;
         switch (vme->exitcode) {
         case VM_EXITCODE_REQIDLE:
                 error = vm_handle_reqidle(vm, vcpuid);
                 break;
+        case VM_EXITCODE_RUN_STATE:
+                error = vm_handle_run_state(vm, vcpuid);
+                break;
         case VM_EXITCODE_SUSPENDED:
                 error = vm_handle_suspend(vm, vcpuid);
                 break;
         case VM_EXITCODE_IOAPIC_EOI:
                 vioapic_process_eoi(vm, vcpuid,
                     vme->u.ioapic_eoi.vector);
                 break;
-        case VM_EXITCODE_RUNBLOCK:
-                break;
         case VM_EXITCODE_HLT:
                 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
                 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
                 break;
         case VM_EXITCODE_PAGING:

@@ -2790,10 +2772,200 @@
         vcpu->extint_pending = 0;
         vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
 }
 
 int
+vm_inject_init(struct vm *vm, int vcpuid)
+{
+        struct vcpu *vcpu;
+
+        if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+                return (EINVAL);
+
+        vcpu = &vm->vcpu[vcpuid];
+        vcpu_lock(vcpu);
+        vcpu->run_state |= VRS_PEND_INIT;
+        vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
+        vcpu_unlock(vcpu);
+        return (0);
+}
+
+int
+vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
+{
+        struct vcpu *vcpu;
+
+        if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+                return (EINVAL);
+
+        vcpu = &vm->vcpu[vcpuid];
+        vcpu_lock(vcpu);
+        vcpu->run_state |= VRS_PEND_SIPI;
+        vcpu->sipi_vector = vector;
+        /* SIPI is only actionable if the CPU is waiting in INIT state */
+        if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
+                vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
+        }
+        vcpu_unlock(vcpu);
+        return (0);
+}
+
+bool
+vcpu_run_state_pending(struct vm *vm, int vcpuid)
+{
+        struct vcpu *vcpu;
+
+        ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
+        vcpu = &vm->vcpu[vcpuid];
+
+        /* Of interest: vCPU not in running state or with pending INIT */
+        return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
+}
+
+int
+vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
+{
+        struct seg_desc desc;
+        const enum vm_reg_name clear_regs[] = {
+                VM_REG_GUEST_CR2,
+                VM_REG_GUEST_CR3,
+                VM_REG_GUEST_CR4,
+                VM_REG_GUEST_RAX,
+                VM_REG_GUEST_RBX,
+                VM_REG_GUEST_RCX,
+                VM_REG_GUEST_RSI,
+                VM_REG_GUEST_RDI,
+                VM_REG_GUEST_RBP,
+                VM_REG_GUEST_RSP,
+                VM_REG_GUEST_R8,
+                VM_REG_GUEST_R9,
+                VM_REG_GUEST_R10,
+                VM_REG_GUEST_R11,
+                VM_REG_GUEST_R12,
+                VM_REG_GUEST_R13,
+                VM_REG_GUEST_R14,
+                VM_REG_GUEST_R15,
+                VM_REG_GUEST_DR0,
+                VM_REG_GUEST_DR1,
+                VM_REG_GUEST_DR2,
+                VM_REG_GUEST_DR3,
+                VM_REG_GUEST_EFER,
+        };
+        const enum vm_reg_name data_segs[] = {
+                VM_REG_GUEST_SS,
+                VM_REG_GUEST_DS,
+                VM_REG_GUEST_ES,
+                VM_REG_GUEST_FS,
+                VM_REG_GUEST_GS,
+        };
+        struct vcpu *vcpu = &vm->vcpu[vcpuid];
+
+        if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+                return (EINVAL);
+
+        for (uint_t i = 0; i < nitems(clear_regs); i++) {
+                VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
+        }
+
+        VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
+        VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
+        VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
+
+        /*
+         * The prescribed contents of %rdx differ slightly between the Intel and
+         * AMD architectural definitions.  The former expects the Extended Model
+         * in bits 16-19 where the latter expects all the Family, Model, and
+         * Stepping to be there.  Common boot ROMs appear to disregard this
+         * anyway, so we stick with a compromise value similar to what is
+         * spelled out in the Intel SDM.
+         */
+        VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
+
+        VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
+        VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
+
+        /* CS: Present, R/W, Accessed */
+        desc.access = 0x0093;
+        desc.base = 0xffff0000;
+        desc.limit = 0xffff;
+        VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
+        VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
+
+        /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
+        desc.access = 0x0093;
+        desc.base = 0;
+        desc.limit = 0xffff;
+        for (uint_t i = 0; i < nitems(data_segs); i++) {
+                VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
+                VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
+        }
+
+        /* GDTR, IDTR */
+        desc.base = 0;
+        desc.limit = 0xffff;
+        VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
+        VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
+
+        /* LDTR: Present, LDT */
+        desc.access = 0x0082;
+        desc.base = 0;
+        desc.limit = 0xffff;
+        VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
+        VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
+
+        /* TR: Present, 32-bit TSS */
+        desc.access = 0x008b;
+        desc.base = 0;
+        desc.limit = 0xffff;
+        VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
+        VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
+
+        vlapic_reset(vm_lapic(vm, vcpuid));
+
+        VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
+
+        vcpu->exitintinfo = 0;
+        vcpu->exception_pending = 0;
+        vcpu->nmi_pending = 0;
+        vcpu->extint_pending = 0;
+
+        /*
+         * A CPU reset caused by power-on or system reset clears more state than
+         * one which is triggered from an INIT IPI.
+         */
+        if (!init_only) {
+                vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
+                fpu_save_area_reset(vcpu->guestfpu);
+
+                /* XXX: clear MSRs and other pieces */
+        }
+
+        return (0);
+}
+
+static int
+vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
+{
+        struct seg_desc desc;
+
+        if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+                return (EINVAL);
+
+        /* CS: Present, R/W, Accessed */
+        desc.access = 0x0093;
+        desc.base = (uint64_t)vector << 12;
+        desc.limit = 0xffff;
+        VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
+        VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
+            (uint64_t)vector << 8));
+
+        VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
+
+        return (0);
+}
+
+int
 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
 {
         if (vcpu < 0 || vcpu >= vm->maxcpus)
                 return (EINVAL);
 
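
For reference, vcpu_vector_sipi() above implements the standard real-mode start-up convention: CS.base = vector << 12, the CS selector = vector << 8, and %rip = 0. The helper below is illustrative only and just restates that arithmetic.

/*
 * Illustrative only -- not part of this diff.  Physical address at which
 * an AP begins executing after a SIPI, per the CS/%rip values programmed
 * by vcpu_vector_sipi().
 */
static uint64_t
example_sipi_start_pa(uint8_t vector)
{
        const uint64_t cs_base = (uint64_t)vector << 12;
        const uint64_t rip = 0;

        /* e.g. vector 0x9a starts the AP at 0x9a000 */
        return (cs_base + rip);
}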

@@ -2892,11 +3064,11 @@
 {
         int error;
         struct vcpu *vcpu;
 
         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
-                panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
+                panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
 
         vcpu = &vm->vcpu[vcpuid];
 
         vcpu_lock(vcpu);
         error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);

@@ -2910,11 +3082,11 @@
 {
         struct vcpu *vcpu;
         enum vcpu_state state;
 
         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
-                panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
+                panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
 
         vcpu = &vm->vcpu[vcpuid];
 
         vcpu_lock(vcpu);
         state = vcpu->state;

@@ -2923,58 +3095,10 @@
         vcpu_unlock(vcpu);
 
         return (state);
 }
 
-void
-vcpu_block_run(struct vm *vm, int vcpuid)
-{
-        struct vcpu *vcpu;
-
-        if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
-                panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
-
-        vcpu = &vm->vcpu[vcpuid];
-
-        vcpu_lock(vcpu);
-        vcpu->runblock++;
-        if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) {
-                vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
-        }
-        while (vcpu->state == VCPU_RUNNING) {
-#ifdef __FreeBSD__
-                msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
-#else
-                cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
-#endif
-        }
-        vcpu_unlock(vcpu);
-}
-
-void
-vcpu_unblock_run(struct vm *vm, int vcpuid)
-{
-        struct vcpu *vcpu;
-
-        if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
-                panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
-
-        vcpu = &vm->vcpu[vcpuid];
-
-        vcpu_lock(vcpu);
-        KASSERT(vcpu->runblock != 0, ("expected non-zero runblock"));
-        vcpu->runblock--;
-        if (vcpu->runblock == 0) {
-#ifdef __FreeBSD__
-                wakeup(&vcpu->state);
-#else
-                cv_broadcast(&vcpu->state_cv);
-#endif
-        }
-        vcpu_unlock(vcpu);
-}
-
 #ifndef __FreeBSD__
 uint64_t
 vcpu_tsc_offset(struct vm *vm, int vcpuid)
 {
         return (vm->vcpu[vcpuid].tsc_offset);

@@ -3036,17 +3160,99 @@
                 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
         }
         return (0);
 }
 
-int
-vcpu_debugged(struct vm *vm, int vcpuid)
+static bool
+vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
+    uint64_t entry_rip)
 {
+        struct vcpu *vcpu = &vm->vcpu[vcpuid];
+        struct vm_exit *vme = &vcpu->exitinfo;
+        bool bail = false;
 
-        return (CPU_ISSET(vcpuid, &vm->debug_cpus));
+        ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
+
+        if (vm->suspend) {
+                if (on_entry) {
+                        VERIFY(vm->suspend > VM_SUSPEND_NONE &&
+                            vm->suspend < VM_SUSPEND_LAST);
+
+                        vme->exitcode = VM_EXITCODE_SUSPENDED;
+                        vme->u.suspended.how = vm->suspend;
+                } else {
+                        /*
+                         * Handling VM suspend is complicated, so if that
+                         * condition is detected outside of VM-entry itself,
+                         * just emit a BOGUS exitcode so we take a lap to pick
+                         * up the event during an entry and are directed into
+                         * the vm_handle_suspend() logic.
+                         */
+                        vme->exitcode = VM_EXITCODE_BOGUS;
+                }
+                bail = true;
+        }
+        if (vcpu->reqidle) {
+                vme->exitcode = VM_EXITCODE_REQIDLE;
+                vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
+
+                if (!on_entry) {
+                        /*
+                         * A reqidle request detected outside of VM-entry can be
+                         * handled directly by clearing the request (and taking
+                         * a lap to userspace).
+                         */
+                        vcpu_assert_locked(vcpu);
+                        vcpu->reqidle = 0;
+                }
+                bail = true;
+        }
+        if (vcpu_should_yield(vm, vcpuid)) {
+                vme->exitcode = VM_EXITCODE_BOGUS;
+                vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
+                bail = true;
+        }
+        if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
+                vme->exitcode = VM_EXITCODE_DEBUG;
+                bail = true;
+        }
+
+        if (bail) {
+                if (on_entry) {
+                        /*
+                         * If bailing out during VM-entry, the current %rip must
+                         * be recorded in the exitinfo.
+                         */
+                        vme->rip = entry_rip;
+                }
+                vme->inst_length = 0;
+        }
+        return (bail);
 }
 
+static bool
+vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
+{
+        /*
+         * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
+         * wait-for-SIPI) expect that %rip is already populated in the vm_exit
+         * structure, and we would only modify the exitcode.
+         */
+        return (vcpu_bailout_checks(vm, vcpuid, false, 0));
+}
+
+bool
+vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
+{
+        /*
+         * Bail-out checks done as part of VM entry require an updated %rip to
+         * populate the vm_exit struct if any of the conditions of interest are
+         * matched in the check.
+         */
+        return (vcpu_bailout_checks(vm, vcpuid, true, rip));
+}
+
 cpuset_t
 vm_active_cpus(struct vm *vm)
 {
 
         return (vm->active_cpus);
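
With the struct vm_eventinfo pointers dropped from VMRUN(), the suspend, reqidle, yield, and debug checks are consolidated in vcpu_bailout_checks() above, with vcpu_entry_bailout_checks() exported for the entry path. The loop below is only a sketch of the shape a backend entry loop takes after this change (the real vmx/svm loops differ); it shows where the exported check is assumed to slot in.

/*
 * Illustrative only -- not part of this diff.  Sketch of a VMRUN backend
 * loop using the exported entry-time check: the check populates the
 * vm_exit record itself, so the loop simply returns and lets vm_run()
 * hand the exit to userspace.
 */
static int
example_vmrun_loop(struct vm *vm, int vcpuid, uint64_t rip)
{
        for (;;) {
                if (vcpu_entry_bailout_checks(vm, vcpuid, rip)) {
                        /* exitinfo already filled in by the check */
                        return (-1);
                }

                /* ... hardware VM entry and guest-exit handling here ... */
        }
}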